网络爬虫:将网站转换为AI就绪的Markdown格式并存储到Google表格

高级

这是一个包含 22 个节点的自动化工作流,主要使用 Set、Html、Filter、Switch、Markdown 等节点,用于将网站内容转换为 AI 就绪的 Markdown 格式并存储到 Google 表格。

前置要求
  • 可能需要目标 API 的认证凭证
  • Google Sheets API 凭证

分类

未分类
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "3d7eb9567ae690bf8c9bba1cb43396e6e40c18e15eb5889cf9673ed1713da6db",
    "templateCredsSetupCompleted": true
  },
  "nodes": [
    {
      "id": "349e50cf-75b8-432c-818e-63f1ff3ead34",
      "name": "概述笔记",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1696,
        3104
      ],
      "parameters": {
        "color": 4,
        "width": 600,
        "height": 1112,
        "content": "# 用于 AI 知识库的自动化网站爬虫"
      },
      "typeVersion": 1
    },
    {
      "id": "eb43d67c-01fc-4d83-bb2c-099938a57468",
      "name": "注意:触发器和设置",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2512,
        3072
      ],
      "parameters": {
        "color": 6,
        "width": 556,
        "height": 176,
        "content": "## 🖱️ 触发器与设置节点"
      },
      "typeVersion": 1
    },
    {
      "id": "3c8581cb-46cd-4f25-af5a-c52bc2f463c6",
      "name": "设置网站",
      "type": "n8n-nodes-base.set",
      "position": [
        2688,
        3296
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "a652f57e-210e-421e-b20b-781d6f4dc240",
              "name": "website_url",
              "type": "string",
              "value": "https://example.com"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "18201858-7764-4a14-9f6b-12e36eaf158b",
      "name": "手动触发器",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        2496,
        3296
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "b7435481-bed3-439f-933c-1c5e0142ad5c",
      "name": "抓取首页",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueRegularOutput",
      "position": [
        2880,
        3296
      ],
      "parameters": {
        "url": "={{ $json.website_url }}",
        "options": {
          "redirect": {
            "redirect": {}
          },
          "allowUnauthorizedCerts": false
        }
      },
      "executeOnce": false,
      "typeVersion": 4.2,
      "alwaysOutputData": false
    },
    {
      "id": "ce13710d-24ca-47d4-a25c-8890c1592947",
      "name": "注意:首页抓取",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        3168,
        3488
      ],
      "parameters": {
        "color": 5,
        "width": 396,
        "height": 192,
        "content": "## 🌐 首页抓取节点"
      },
      "typeVersion": 1
    },
    {
      "id": "61a60f2c-f032-4b46-83ba-405df0ce05df",
      "name": "从HTML提取链接",
      "type": "n8n-nodes-base.html",
      "position": [
        3088,
        3296
      ],
      "parameters": {
        "options": {
          "trimValues": true,
          "cleanUpText": true
        },
        "operation": "extractHtmlContent",
        "extractionValues": {
          "values": [
            {
              "key": "links",
              "attribute": "href",
              "cssSelector": "a",
              "returnArray": true,
              "returnValue": "attribute"
            }
          ]
        }
      },
      "typeVersion": 1.2
    },
    {
      "id": "582eeae0-fec0-4548-9c78-7c05ac5aaebc",
      "name": "拆分链接",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        3296,
        3296
      ],
      "parameters": {
        "options": {},
        "fieldToSplitOut": "links"
      },
      "typeVersion": 1
    },
    {
      "id": "17d59531-4d51-4494-8ae9-e91b81851a0b",
      "name": "移除重复链接",
      "type": "n8n-nodes-base.removeDuplicates",
      "position": [
        3520,
        3296
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 2
    },
    {
      "id": "d50fa2a9-1a58-4dad-8bd0-cfbd31aeae91",
      "name": "过滤真实超链接",
      "type": "n8n-nodes-base.filter",
      "position": [
        3696,
        3296
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "bd6c6da6-8af7-4809-b6cd-01a38d71953b",
              "operator": {
                "type": "string",
                "operation": "startsWith"
              },
              "leftValue": "={{ $json.links }}",
              "rightValue": "https://"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "cb121b70-a14a-4cbd-a54c-e55c6fc235b7",
      "name": "注意:链接处理",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        3216,
        3056
      ],
      "parameters": {
        "color": 2,
        "width": 556,
        "height": 224,
        "content": "## 🔄 链接处理节点"
      },
      "typeVersion": 1
    },
    {
      "id": "d69c0dc2-2c4c-474b-ba11-3d79e1390b12",
      "name": "分离图片和链接",
      "type": "n8n-nodes-base.switch",
      "position": [
        2480,
        3680
      ],
      "parameters": {
        "rules": {
          "values": [
            {
              "outputKey": "Images",
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "16724958-4eea-489d-b494-3d76a3ba2562",
                    "operator": {
                      "type": "string",
                      "operation": "regex"
                    },
                    "leftValue": "={{ $json.links }}",
                    "rightValue": "=^https?:\\/\\/.*\\.(?:png|jpe?g|gif|webp|bmp|svg|ico)(?:\\?.*)?$"
                  }
                ]
              },
              "renameOutput": true
            },
            {
              "outputKey": "Links",
              "conditions": {
                "options": {
                  "version": 2,
                  "leftValue": "",
                  "caseSensitive": true,
                  "typeValidation": "strict"
                },
                "combinator": "and",
                "conditions": [
                  {
                    "id": "816392f0-96db-4134-8bee-4b74688ff929",
                    "operator": {
                      "type": "string",
                      "operation": "notRegex"
                    },
                    "leftValue": "={{ $json.links }}",
                    "rightValue": "=^https?:\\/\\/.*\\.(?:png|jpe?g|gif|webp|bmp|svg|ico)(?:\\?.*)?$"
                  }
                ]
              },
              "renameOutput": true
            }
          ]
        },
        "options": {}
      },
      "typeVersion": 3.2
    },
    {
      "id": "23896343-575e-4956-8e95-3b5e6e4c8ae7",
      "name": "聚合图片",
      "type": "n8n-nodes-base.aggregate",
      "position": [
        2736,
        3504
      ],
      "parameters": {
        "options": {},
        "fieldsToAggregate": {
          "fieldToAggregate": [
            {
              "fieldToAggregate": "links"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "fcad347b-60d7-4fa2-9b02-e96c2f27116d",
      "name": "聚合链接",
      "type": "n8n-nodes-base.aggregate",
      "position": [
        2736,
        3696
      ],
      "parameters": {
        "options": {},
        "fieldsToAggregate": {
          "fieldToAggregate": [
            {
              "fieldToAggregate": "links"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "fc5d6ce1-1765-4768-a9c7-de3677e8109d",
      "name": "抓取内容链接",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        2736,
        3872
      ],
      "parameters": {
        "url": "={{ $json.links }}",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "0d4b6a4e-b6cb-4e6c-9a22-bd0dc6a72027",
      "name": "注意:内容抓取",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        2320,
        3984
      ],
      "parameters": {
        "color": 5,
        "width": 428,
        "height": 224,
        "content": "## 📄 内容抓取与聚合节点"
      },
      "typeVersion": 1
    },
    {
      "id": "349e5f7c-c81b-467b-a59b-ea40a47226f0",
      "name": "转换为 Markdown",
      "type": "n8n-nodes-base.markdown",
      "position": [
        2944,
        3872
      ],
      "parameters": {
        "html": "={{ $json.data }}",
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "24f22a31-03a3-4faf-81f4-3c38c0956ee4",
      "name": "聚合抓取内容",
      "type": "n8n-nodes-base.aggregate",
      "position": [
        3136,
        3872
      ],
      "parameters": {
        "options": {},
        "fieldsToAggregate": {
          "fieldToAggregate": [
            {
              "fieldToAggregate": "data"
            }
          ]
        }
      },
      "typeVersion": 1
    },
    {
      "id": "a4d34aab-1af2-4196-85f5-1a2d832969dd",
      "name": "添加图片到表格",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        2944,
        3504
      ],
      "parameters": {
        "columns": {
          "value": {
            "Images": "={{ $json.links.join('\\n\\n') }}",
            "Website": "={{ $('Set Website').item.json.website_url }}"
          },
          "schema": [
            {
              "id": "Website",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Website",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Links",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Links",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Scraped Content",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Scraped Content",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Images",
              "type": "string",
              "display": true,
              "required": false,
              "displayName": "Images",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "Website"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "appendOrUpdate",
        "sheetName": "your-sheet-name",
        "documentId": "your-document-id"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "ZVbWK0SlohYDlZYO",
          "name": "Ewere"
        }
      },
      "typeVersion": 4.7
    },
    {
      "id": "6afbfad8-b80f-4a0d-81b4-9138cc2af46a",
      "name": "添加链接到表格",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        2944,
        3696
      ],
      "parameters": {
        "columns": {
          "value": {
            "Links": "={{ $json.links.join('\\n\\n') }}",
            "Website": "={{ $('Set Website').item.json.website_url }}"
          },
          "schema": [
            {
              "id": "Website",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Website",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Links",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Links",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Scraped Content",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Scraped Content",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Images",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Images",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "Website"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "appendOrUpdate",
        "sheetName": "your-sheet-name",
        "documentId": "your-document-id"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "ZVbWK0SlohYDlZYO",
          "name": "Ewere"
        }
      },
      "typeVersion": 4.7
    },
    {
      "id": "35ae2c30-a93a-4fd2-82b6-07d2f4c56c88",
      "name": "添加抓取内容到表格",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        3344,
        3872
      ],
      "parameters": {
        "columns": {
          "value": {
            "Website": "={{ $('Set Website').item.json.website_url }}",
            "Scraped Content": "={{ $json.data.join('\\n\\n').slice(0, 50000) }}"
          },
          "schema": [
            {
              "id": "Website",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Website",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Links",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Links",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Scraped Content",
              "type": "string",
              "display": true,
              "removed": false,
              "required": false,
              "displayName": "Scraped Content",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            },
            {
              "id": "Images",
              "type": "string",
              "display": true,
              "removed": true,
              "required": false,
              "displayName": "Images",
              "defaultMatch": false,
              "canBeUsedToMatch": true
            }
          ],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "Website"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "appendOrUpdate",
        "sheetName": "your-sheet-name",
        "documentId": "your-document-id"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "ZVbWK0SlohYDlZYO",
          "name": "Ewere"
        }
      },
      "typeVersion": 4.7
    },
    {
      "id": "c3f7b022-db11-400c-baaa-77392acfb991",
      "name": "注意:表格集成",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        3232,
        4048
      ],
      "parameters": {
        "color": 3,
        "width": 444,
        "height": 176,
        "content": "## 📊 表格集成节点"
      },
      "typeVersion": 1
    }
  ],
  "pinData": {},
  "connections": {
    "Set Website": {
      "main": [
        [
          {
            "node": "Scrape Homepage",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Links": {
      "main": [
        [
          {
            "node": "Remove Duplicate Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Manual Trigger": {
      "main": [
        [
          {
            "node": "Set Website",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Aggregate Links": {
      "main": [
        [
          {
            "node": "Add Links to Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape Homepage": {
      "main": [
        [
          {
            "node": "Extract Links from HTML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Aggregate Images": {
      "main": [
        [
          {
            "node": "Add Images to Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Convert to Markdown": {
      "main": [
        [
          {
            "node": "Aggregate Scraped Content",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Scrape Content Links": {
      "main": [
        [
          {
            "node": "Convert to Markdown",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Filter Real Hyperlinks": {
      "main": [
        [
          {
            "node": "Separate Images and Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Remove Duplicate Links": {
      "main": [
        [
          {
            "node": "Filter Real Hyperlinks",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Links from HTML": {
      "main": [
        [
          {
            "node": "Split Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Aggregate Scraped Content": {
      "main": [
        [
          {
            "node": "Add Scraped Content to Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Separate Images and Links": {
      "main": [
        [
          {
            "node": "Aggregate Images",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Aggregate Links",
            "type": "main",
            "index": 0
          },
          {
            "node": "Scrape Content Links",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

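复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」。粘贴配置后,请将「设置网站」节点中的 website_url 改为要抓取的网站地址,将各 Google Sheets 节点中的 documentId(your-document-id)和 sheetName(your-sheet-name)占位符替换为您自己的表格信息,并配置 Google Sheets 凭证即可。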

这个工作流适合什么场景?

这是一个高级难度的通用自动化工作流,属于包含 16 个以上节点的复杂工作流,适合高级用户使用。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 Google Sheets API)需要您自行准备账号与凭证,并可能受其配额或收费政策限制。

工作流信息
难度等级
高级
节点数量:22
分类:未分类
节点类型:12
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Daniel Nkencho

Daniel Nkencho

@daniel-automates

AI Automation Consultant | Helping Business Owners Implement AI Systems for Growth and Lead Gen

外部链接
在 n8n.io 上查看 →

分享此工作流