网站地图页面提取器

高级

这是一个 Market Research、Multimodal AI 领域的自动化工作流,包含 19 个节点,主要使用 If、Set、Code、Filter、FormTrigger 等节点。网站地图页面提取器:发现、清理网站 URL,并将其保存至 Google Sheets。

前置要求
  • 可能需要目标 API 的认证凭证
  • Google Sheets API 凭证
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "id": "ytgCDUiHYhFkJqlY",
  "meta": {
    "instanceId": "bc8ca75c203589705ae2e446cad7181d6f2a7cc1766f958ef9f34810e53b8cb2",
    "templateCredsSetupCompleted": true
  },
  "name": "网站地图页面提取器",
  "tags": [],
  "nodes": [
    {
      "id": "2464b9f1-f0fe-41df-9941-acad5d5dedb9",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -256,
        928
      ],
      "parameters": {
        "color": 2,
        "height": 336,
        "content": "## 构建网站地图 URL:"
      },
      "typeVersion": 1
    },
    {
      "id": "1b1b00a9-6825-4bae-b74a-6c86d8972299",
      "name": "便签 1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        304,
        560
      ],
      "parameters": {
        "color": 2,
        "width": 400,
        "height": 432,
        "content": "## 筛选和提取网站地图文件"
      },
      "typeVersion": 1
    },
    {
      "id": "e0c1c670-5e7f-4146-a605-e5972d786cf8",
      "name": "便签 2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        720,
        560
      ],
      "parameters": {
        "color": 2,
        "width": 432,
        "height": 432,
        "content": "## 获取网站地图 XML 并提取页面 URL"
      },
      "typeVersion": 1
    },
    {
      "id": "c95c2d0b-2a6b-471a-b168-1f33d4dd6cb4",
      "name": "便签 3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1488,
        688
      ],
      "parameters": {
        "color": 2,
        "width": 208,
        "height": 304,
        "content": "将每个爬取的页面 URL 附加到 List_Of_All_URLs 工作表中,通过自动匹配现有条目避免重复。"
      },
      "typeVersion": 1
    },
    {
      "id": "1b2c2bde-4edc-4dee-97c1-28f1b1ee8b94",
      "name": "便签 5",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        288,
        1152
      ],
      "parameters": {
        "color": 2,
        "width": 256,
        "height": 304,
        "content": "向每个生成的网站地图 URL 发送 HTTP 请求并获取原始响应,以检查网站地图是否存在且包含有效数据。"
      },
      "typeVersion": 1
    },
    {
      "id": "c9f75b84-334a-4926-bdc4-62e59e605b97",
      "name": "便签6",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        1184,
        640
      ],
      "parameters": {
        "color": 2,
        "width": 272,
        "height": 352,
        "content": "## 排除网站地图 URL:"
      },
      "typeVersion": 1
    },
    {
      "id": "5ae6ec48-76f1-44ca-8387-ce5f9a84ac9d",
      "name": "便签7",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -1248,
        784
      ],
      "parameters": {
        "width": 464,
        "height": 640,
        "content": "## 📌 自动化摘要:"
      },
      "typeVersion": 1
    },
    {
      "id": "342fb312-ba7a-427e-907e-b57a206b6252",
      "name": "便签 4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -672,
        928
      ],
      "parameters": {
        "color": 2,
        "width": 400,
        "height": 336,
        "content": "## 表单输入与 URL 准备"
      },
      "typeVersion": 1
    },
    {
      "id": "e69d955f-2212-4e5b-afa8-998cd18e11f1",
      "name": "输入网站 URL",
      "type": "n8n-nodes-base.formTrigger",
      "position": [
        -640,
        1088
      ],
      "webhookId": "8a207eb4-74f2-4577-9fe6-98e87f2e9a0d",
      "parameters": {
        "options": {},
        "formTitle": "Sitemap Page Extractor",
        "formFields": {
          "values": [
            {
              "fieldLabel": "Website URL"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "4d33e736-fb1d-462f-bad2-46a68bc1b837",
      "name": "准备网站 URL",
      "type": "n8n-nodes-base.set",
      "position": [
        -432,
        1088
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "2a310b45-ec77-41dd-9436-f3b58b7df477",
              "name": "url",
              "type": "string",
              "value": "={{ $json[\"Website URL\"] }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "604a9837-436b-4701-9d17-cb59cb2a2099",
      "name": "构建网站地图 URL",
      "type": "n8n-nodes-base.code",
      "position": [
        -192,
        1088
      ],
      "parameters": {
        "jsCode": "// Reduce the submitted URL to scheme + host, then list the usual sitemap locations.\nconst inputData = $input.first().json;\nlet baseUrl = String(inputData.url || inputData.website_url || '').trim();\n\nif (!baseUrl) {\n  throw new Error(\"No URL provided\");\n}\n\n// Default to https when no scheme was entered, so the split below always sees scheme://host.\nif (!baseUrl.includes('://')) {\n  baseUrl = 'https://' + baseUrl;\n}\n\n// parts[0] is the scheme (e.g. 'https:'), parts[2] the host; path, query and fragment are dropped.\nconst parts = baseUrl.split('/');\nconst domain = parts[0] + '//' + parts[2].split(/[?#]/)[0];\n\nconst urls = [\n  `${domain}/robots.txt`,\n  `${domain}/sitemap.xml`,\n  `${domain}/sitemap_index.xml`,\n  `${domain}/sitemap-index.xml`,\n  `${domain}/sitemap1.xml`,\n  `${domain}/sitemap/sitemap.xml`,\n  `${domain}/sitemaps/sitemap.xml`\n];\n\nreturn urls.map(url => ({ json: { sitemap_url: url } }));"
      },
      "typeVersion": 2
    },
    {
      "id": "ad781031-ef3e-4378-92d0-837d3e8fbaf2",
      "name": "网站地图 URL 检查",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        80,
        1088
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "8ca63a55-44bc-4e72-906b-a6e2ed5f9c31",
      "name": "获取网站地图数据",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueRegularOutput",
      "position": [
        368,
        1280
      ],
      "parameters": {
        "url": "={{ $json.sitemap_url }}",
        "options": {
          "response": {
            "response": {
              "responseFormat": "text"
            }
          }
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "02698f76-9606-426d-b953-3f2adfd80a6f",
      "name": "筛选非空网站地图响应",
      "type": "n8n-nodes-base.if",
      "position": [
        336,
        832
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "bf0c94e8-a8c6-4419-9b18-cf5d1a01e577",
              "operator": {
                "type": "string",
                "operation": "notEmpty",
                "singleValue": true
              },
              "leftValue": "={{ $json.data }}",
              "rightValue": ""
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "2554e396-8d0b-44c5-ab1e-91a192751acd",
      "name": "提取网站地图 URL",
      "type": "n8n-nodes-base.code",
      "position": [
        560,
        816
      ],
      "parameters": {
        "jsCode": "const allUrls = [];\n\n$input.all().forEach(item => {\n  const content = item.json.data || '';\n\n  const robotMatches = [...content.matchAll(/Sitemap:\\s*(\\S+)/gi)];\n  robotMatches.forEach(match => {\n    allUrls.push(match[1]);\n  });\n\n  const locMatches = [...content.matchAll(/<loc>\\s*(.*?)\\s*<\\/loc>/gi)];\n  locMatches.forEach(match => {\n    allUrls.push(match[1]);\n  });\n});\n\nconst uniqueUrls = [...new Set(allUrls)];\n\nreturn uniqueUrls.map(url => ({ json: { sitemap_url: url }}));"
      },
      "typeVersion": 2
    },
    {
      "id": "95f24edc-bef3-481d-954c-f4245af73faa",
      "name": "获取网站地图页面 XML",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueRegularOutput",
      "position": [
        784,
        816
      ],
      "parameters": {
        "url": "={{ $json.sitemap_url }}",
        "method": "=GET",
        "options": {
          "response": {
            "response": {
              "responseFormat": "text"
            }
          }
        }
      },
      "typeVersion": 4.2
    },
    {
      "id": "134f2ded-0200-44a7-8df1-343d76cf16d0",
      "name": "从网站地图提取页面 URL",
      "type": "n8n-nodes-base.code",
      "position": [
        1008,
        816
      ],
      "parameters": {
        "jsCode": "const urls = new Set();\n\n$input.all().forEach(item => {\n  const content = item.json.data || '';\n\n  const xmlMatches = [...content.matchAll(/<loc>(.*?)<\\/loc>/gi)];\n  xmlMatches.forEach(match => {\n    const url = match[1].trim();\n    urls.add(url);\n  });\n\n  const htmlMatches = [...content.matchAll(/<a\\s[^>]*href=[\"']([^\"']+)[\"']/gi)];\n  htmlMatches.forEach(match => {\n    const url = match[1].trim();\n    if (url.startsWith('/') || url.startsWith('http')) {\n      urls.add(url);\n    }\n  });\n});\n\nif (urls.size === 0) {\n  return [{ json: { message: \"No URLs found in XML or HTML content\" }}];\n}\n\nreturn Array.from(urls).map(url => ({\n  json: { page_url: url }\n}));"
      },
      "typeVersion": 2
    },
    {
      "id": "9708b71a-34c3-42fd-9a3c-15ad7d4cd7c3",
      "name": "排除网站地图 URL",
      "type": "n8n-nodes-base.filter",
      "position": [
        1296,
        816
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "303d0665-ac2d-452b-8678-a05e53a7372b",
              "operator": {
                "type": "string",
                "operation": "notContains"
              },
              "leftValue": "={{ $json.page_url }}",
              "rightValue": "sitemap"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "535579fc-dcad-4386-920c-6f8857a0bd70",
      "name": "将页面 URL 保存到工作表",
      "type": "n8n-nodes-base.googleSheets",
      "position": [
        1568,
        816
      ],
      "parameters": {
        "columns": {
          "value": {
            "List URLs": "={{ $json.page_url }}"
          },
          "schema": [],
          "mappingMode": "defineBelow",
          "matchingColumns": [
            "List URLs"
          ],
          "attemptToConvertTypes": false,
          "convertFieldsToString": false
        },
        "options": {},
        "operation": "appendOrUpdate",
        "sheetName": "List_Of_All_URLs",
        "documentId": "YOUR_GOOGLE_SHEET_URL"
      },
      "credentials": {
        "googleSheetsOAuth2Api": {
          "id": "Aam5AW9oxNhvIGd2",
          "name": "Shiv@incrementors.com - Google Sheets"
        }
      },
      "typeVersion": 4.6
    }
  ],
  "active": false,
  "pinData": {},
  "settings": {
    "callerPolicy": "any",
    "executionOrder": "v1"
  },
  "versionId": "ff2c3009-f913-4cd2-9042-a36d056b6f90",
  "connections": {
    "Input Website URL": {
      "main": [
        [
          {
            "node": "Prepare website URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Sitemap URL Check": {
      "main": [
        [
          {
            "node": "Filter Non-Empty Sitemap Responses",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Fetch Sitemap Data",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Build sitemap URLs": {
      "main": [
        [
          {
            "node": "Sitemap URL Check",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Fetch Sitemap Data": {
      "main": [
        [
          {
            "node": "Sitemap URL Check",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Prepare website URL": {
      "main": [
        [
          {
            "node": "Build sitemap URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Sitemap URLs": {
      "main": [
        [
          {
            "node": "Fetch Sitemap Pages XML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Fetch Sitemap Pages XML": {
      "main": [
        [
          {
            "node": "Extract Page URLs from Sitemap",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Exclude the Sitemap URLs": {
      "main": [
        [
          {
            "node": "Save Page URLs to Sheet",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Extract Page URLs from Sitemap": {
      "main": [
        [
          {
            "node": "Exclude the Sitemap URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Filter Non-Empty Sitemap Responses": {
      "main": [
        [
          {
            "node": "Extract Sitemap URLs",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

这是一个高级难度的工作流,适用于 Market Research、Multimodal AI 等场景。适合高级用户,属于包含 16 个以上节点的复杂工作流。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 Google Sheets API)需要您自行配置凭证,部分服务可能需要您自行付费。

工作流信息
难度等级
高级
节点数量:19
分类:2
节点类型:9
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

外部链接
在 n8n.io 上查看 →

分享此工作流