从站点地图爬取到向量存储:创建高效的RAG工作流

高级

这是一个属于 Content Creation、Multimodal AI 领域的自动化工作流,共包含 40 个节点,主要使用 If、Set、XML、Code、Wait 等节点,实现从站点地图爬取网页到写入向量存储的完整流程,用于创建高效的 RAG 工作流。

前置要求
  • PostgreSQL 数据库连接信息
  • Supabase URL 和 API Key
  • Crawl4AI 服务的 HTTP Header Auth 认证凭证
  • OpenAI API Key
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
  "meta": {
    "instanceId": "0862f70dc42e115052f6a2d4c2b6537665b4361a614cec7cd17d1c45c8868621",
    "templateCredsSetupCompleted": true
  },
  "nodes": [
    {
      "id": "ab180eb3-c086-4f9f-b9d0-f3f56056a416",
      "name": "当点击\"测试工作流\"时",
      "type": "n8n-nodes-base.manualTrigger",
      "position": [
        -6816,
        -304
      ],
      "parameters": {},
      "typeVersion": 1
    },
    {
      "id": "20e77374-c3ce-457f-945c-d6f6dc928de1",
      "name": "HTTP请求",
      "type": "n8n-nodes-base.httpRequest",
      "position": [
        -6624,
        -304
      ],
      "parameters": {
        "url": "https://www.kiekens.com/sitemap.xml",
        "options": {}
      },
      "typeVersion": 4.2
    },
    {
      "id": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
      "name": "XML",
      "type": "n8n-nodes-base.xml",
      "position": [
        -6432,
        -304
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 1
    },
    {
      "id": "4715b380-f386-4926-892e-2c133a1155c1",
      "name": "拆分输出",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        -6224,
        -304
      ],
      "parameters": {
        "options": {},
        "fieldToSplitOut": "urlset.url"
      },
      "typeVersion": 1
    },
    {
      "id": "56181432-63f2-4d93-be6d-6f1489e04ca9",
      "name": "遍历项目",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        -5152,
        -592
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
      "name": "Wait",
      "type": "n8n-nodes-base.wait",
      "position": [
        -4192,
        -608
      ],
      "webhookId": "9af87c5e-b07f-48dc-9ca8-61b471a24cad",
      "parameters": {
        "amount": 30
      },
      "typeVersion": 1.1
    },
    {
      "id": "961143cf-c387-4e2d-a477-0988c0b0f512",
      "name": "If",
      "type": "n8n-nodes-base.if",
      "position": [
        -3728,
        -608
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "9d90c1ce-590e-40a5-ae8c-d92326032975",
              "operator": {
                "type": "string",
                "operation": "equals"
              },
              "leftValue": "={{ $json.status }}",
              "rightValue": "completed"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "991580c5-10ed-4bab-811e-2ec50d4050fd",
      "name": "默认数据加载器",
      "type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
      "position": [
        -2384,
        -496
      ],
      "parameters": {
        "options": {
          "metadata": {
            "metadataValues": [
              {
                "name": "page",
                "value": "={{ $json.url }}"
              }
            ]
          }
        },
        "jsonData": "={{ $json.cleanedText }}",
        "jsonMode": "expressionData"
      },
      "typeVersion": 1
    },
    {
      "id": "0fc79f0d-8ebd-4d61-ac29-7ba65284af52",
      "name": "字符文本分割器",
      "type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
      "position": [
        -2368,
        -352
      ],
      "parameters": {
        "chunkSize": 5000
      },
      "typeVersion": 1
    },
    {
      "id": "bc5aac68-bb66-4c9c-abd7-9a913b0a56fa",
      "name": "OpenAI 嵌入",
      "type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
      "position": [
        -2528,
        -464
      ],
      "parameters": {
        "model": "text-embedding-ada-002",
        "options": {}
      },
      "credentials": {
        "openAiApi": {
          "id": "OwpPpcltPaXyVklS",
          "name": "OpenAi_Mariela.b.d."
        }
      },
      "typeVersion": 1.1
    },
    {
      "id": "e3b525eb-7a3f-456d-a476-b013293c85e0",
      "name": "Edit Fields",
      "type": "n8n-nodes-base.set",
      "position": [
        -4064,
        -288
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "f2bcdb54-e1fe-4670-99aa-6eec973bf5f1",
              "name": "task_id",
              "type": "string",
              "value": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
      "name": "Crawl4AI_Task Status",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueErrorOutput",
      "position": [
        -3968,
        -608
      ],
      "parameters": {
        "url": "=https://crawl4ai-app-nrcsv.ondigitalocean.app/task/{{ $json.task_id }}",
        "options": {
          "timeout": 5000
        },
        "authentication": "genericCredentialType",
        "genericAuthType": "httpHeaderAuth"
      },
      "credentials": {
        "httpHeaderAuth": {
          "id": "De808MMiUFOFLbNm",
          "name": "Crawl4ai_marinextai"
        }
      },
      "retryOnFail": true,
      "typeVersion": 4.2,
      "waitBetweenTries": 5000
    },
    {
      "id": "f0da6b36-885a-4e86-b044-f3b490bf3829",
      "name": "Loop Over Items1",
      "type": "n8n-nodes-base.splitInBatches",
      "position": [
        -5824,
        144
      ],
      "parameters": {
        "options": {}
      },
      "typeVersion": 3
    },
    {
      "id": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
      "name": "If2",
      "type": "n8n-nodes-base.if",
      "position": [
        -4736,
        -592
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "or",
          "conditions": [
            {
              "id": "fbc89427-990b-45d0-8538-e403c1b18ddd",
              "operator": {
                "type": "string",
                "operation": "contains"
              },
              "leftValue": "={{ $json.status }}",
              "rightValue": "pending"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
      "name": "拆分输出1",
      "type": "n8n-nodes-base.splitOut",
      "position": [
        -5392,
        -384
      ],
      "parameters": {
        "include": "selectedOtherFields",
        "options": {},
        "fieldToSplitOut": "url",
        "fieldsToInclude": "status"
      },
      "typeVersion": 1
    },
    {
      "id": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
      "name": "Format the URL",
      "type": "n8n-nodes-base.set",
      "position": [
        -5648,
        160
      ],
      "parameters": {
        "options": {},
        "assignments": {
          "assignments": [
            {
              "id": "9038a5b3-6985-4edc-bdd1-8dc5a3e8877c",
              "name": "loc",
              "type": "string",
              "value": "={{ $json.loc.trim().toLowerCase() }}"
            }
          ]
        }
      },
      "typeVersion": 3.4
    },
    {
      "id": "805f1fea-841b-40aa-a055-de7ddbbb306f",
      "name": "检查 URL 是否在 Supabase 表中",
      "type": "n8n-nodes-base.supabase",
      "onError": "continueErrorOutput",
      "position": [
        -5456,
        160
      ],
      "parameters": {
        "filters": {
          "conditions": [
            {
              "keyName": "url",
              "keyValue": "={{ $json.loc }}",
              "condition": "eq"
            }
          ]
        },
        "tableId": "scrape_queue",
        "operation": "getAll",
        "returnAll": true
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "retryOnFail": true,
      "typeVersion": 1,
      "alwaysOutputData": true,
      "waitBetweenTries": 5000
    },
    {
      "id": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
      "name": "格式化来自 Supabase 节点的输出",
      "type": "n8n-nodes-base.code",
      "position": [
        -5184,
        160
      ],
      "parameters": {
        "jsCode": "const supabaseResult = $json;\n\n// Get the clean URL from the Set node (Edit Fields1)\nconst originalLoc = $('Format the URL').item.json.loc;\nconst cleanUrl = typeof originalLoc === 'string' ? originalLoc.trim().toLowerCase() : '';\n\n// Check if URL already exists\n// Empty object {} means URL doesn't exist, so we should insert\nconst shouldInsert = Object.keys(supabaseResult).length === 0;\n\nreturn [\n  {\n    json: {\n      url: cleanUrl,\n      shouldInsert,\n    }\n  }\n];"
      },
      "typeVersion": 2
    },
    {
      "id": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
      "name": "如果 \"shouldInsert\" 为 true",
      "type": "n8n-nodes-base.if",
      "position": [
        -4992,
        160
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "f3a00d98-73af-4d35-b4e5-5158c120753f",
              "operator": {
                "type": "boolean",
                "operation": "true",
                "singleValue": true
              },
              "leftValue": "={{ $json.shouldInsert }}",
              "rightValue": "true"
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
      "name": "新行中的 URL",
      "type": "n8n-nodes-base.supabase",
      "position": [
        -4752,
        272
      ],
      "parameters": {
        "tableId": "scrape_queue",
        "fieldsUi": {
          "fieldValues": [
            {
              "fieldId": "url",
              "fieldValue": "={{ $json.url }}"
            }
          ]
        }
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "632752e1-138e-481f-92ad-2ac14c245c45",
      "name": "便签1",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -5888,
        64
      ],
      "parameters": {
        "width": 1280,
        "height": 500,
        "content": "## 将所有网站的 URL 放入 Supabase 表 - scrape_queue"
      },
      "typeVersion": 1
    },
    {
      "id": "5fc57e6f-771c-4eaa-ba8e-8e233dc2a343",
      "name": "在 Supabase 中创建表 scrape_queue",
      "type": "n8n-nodes-base.postgres",
      "position": [
        -6816,
        -688
      ],
      "parameters": {
        "query": "CREATE TABLE scrape_queue (\n  id uuid DEFAULT gen_random_uuid() PRIMARY KEY,\n  url text NOT NULL UNIQUE,\n  status text NOT NULL DEFAULT 'pending', -- 'pending', 'completed', 'error'\n  task_id text,\n  result text,\n  created_at timestamp with time zone DEFAULT now(),\n  updated_at timestamp with time zone DEFAULT now()\n);\n\n-- Optional: Auto-update updated_at on row change\nCREATE OR REPLACE FUNCTION update_updated_at_column()\nRETURNS TRIGGER AS $$\nBEGIN\n   NEW.updated_at = now();\n   RETURN NEW;\nEND;\n$$ language 'plpgsql';\n\nCREATE TRIGGER update_scrape_queue_updated_at\nBEFORE UPDATE ON scrape_queue\nFOR EACH ROW\nEXECUTE PROCEDURE update_updated_at_column();",
        "options": {},
        "operation": "executeQuery"
      },
      "credentials": {
        "postgres": {
          "id": "k1GeBv6AjFuwp2B1",
          "name": "Postgres_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 2.6
    },
    {
      "id": "8b2666b7-0eb6-42df-9ae2-e204516dd3d1",
      "name": "在 Supabase 中创建表 scrape_queue1",
      "type": "n8n-nodes-base.postgres",
      "position": [
        -6608,
        -688
      ],
      "parameters": {
        "query": "CREATE TABLE documents (\n  id SERIAL PRIMARY KEY,\n  content TEXT,\n  metadata JSONB,\n  embedding VECTOR(1536) -- Adjust the dimension size based on your OpenAI model (e.g. ada-002 returns 1536)\n);",
        "options": {},
        "operation": "executeQuery"
      },
      "credentials": {
        "postgres": {
          "id": "k1GeBv6AjFuwp2B1",
          "name": "Postgres_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 2.6
    },
    {
      "id": "7c7b8f66-00f6-48db-af03-fba30dc5e6b1",
      "name": "便签2",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -6848,
        -768
      ],
      "parameters": {
        "color": 3,
        "width": 500,
        "height": 280,
        "content": "## 执行一次"
      },
      "typeVersion": 1
    },
    {
      "id": "82279582-c71b-43aa-8e60-6b8af7ce866c",
      "name": "便签",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -4992,
        -736
      ],
      "parameters": {
        "color": 4,
        "width": 460,
        "height": 360,
        "content": "## 从 Supabase 获取 URL 并检查是否完成"
      },
      "typeVersion": 1
    },
    {
      "id": "8b2245b2-cdc2-408a-879b-260335a10bcb",
      "name": "便签3",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -4448,
        -736
      ],
      "parameters": {
        "color": 5,
        "width": 640,
        "height": 360,
        "content": "## Crawl4AI URL 抓取"
      },
      "typeVersion": 1
    },
    {
      "id": "b42143d2-1e13-4031-996a-26af2dc26632",
      "name": "Crawl4ai Web Page Scrape",
      "type": "n8n-nodes-base.httpRequest",
      "onError": "continueErrorOutput",
      "position": [
        -4384,
        -608
      ],
      "parameters": {
        "url": "https://crawl4ai-app-nrcsv.ondigitalocean.app/crawl",
        "method": "POST",
        "options": {},
        "sendBody": true,
        "authentication": "genericCredentialType",
        "bodyParameters": {
          "parameters": [
            {
              "name": "urls",
              "value": "={{ $json.url }}"
            },
            {
              "name": "priority",
              "value": "10"
            }
          ]
        },
        "genericAuthType": "httpHeaderAuth"
      },
      "credentials": {
        "httpHeaderAuth": {
          "id": "De808MMiUFOFLbNm",
          "name": "Crawl4ai_marinextai"
        }
      },
      "retryOnFail": true,
      "typeVersion": 4.2,
      "waitBetweenTries": 5000
    },
    {
      "id": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
      "name": "Remove redundant data from the scraping",
      "type": "n8n-nodes-base.code",
      "onError": "continueRegularOutput",
      "position": [
        -3488,
        -768
      ],
      "parameters": {
        "jsCode": "return items.map(item => {\n  // Handle both data structures\n  const raw = item.json.result?.markdown || item.json.cleanedText || item.json.html || '';\n  \n  // Add a safety check for null/undefined\n  if (!raw) {\n    return {\n      json: {\n        url: item.json.result?.url || item.json.url || '',\n        cleanedText: '',\n        error: 'No content found to process'\n      }\n    };\n  }\n  \n  let cleaned = raw\n    // Remove headers but keep the content structure\n    .replace(/^#{1,6}\\s+(.+)$/gm, '$1') // Convert headers to plain text\n    \n    // Remove markdown links but keep the text\n    .replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1') // Keep link text, remove URL\n    \n    // Remove code blocks completely\n    .replace(/```[\\s\\S]*?```/g, '') \n    .replace(/`([^`]+)`/g, '$1') // Remove inline code backticks but keep content\n    \n    // Remove markdown formatting\n    .replace(/\\*\\*([^*]+)\\*\\*/g, '$1') // Remove bold formatting\n    .replace(/\\*([^*]+)\\*/g, '$1') // Remove italic formatting\n    .replace(/_{2,}([^_]+)_{2,}/g, '$1') // Remove underline formatting\n    .replace(/~~([^~]+)~~/g, '$1') // Remove strikethrough\n    \n    // Remove lists formatting but keep content\n    .replace(/^\\s*[-*+]\\s+/gm, '') // Remove bullet points\n    .replace(/^\\s*\\d+\\.\\s+/gm, '') // Remove numbered lists\n    \n    // Remove HTML remnants\n    .replace(/<[^>]*>/g, '') // Remove any remaining HTML tags\n    .replace(/&[a-zA-Z0-9#]+;/g, '') // Remove HTML entities\n    \n    // Remove navigation and common web elements\n    .replace(/\\b(Home|About|Contact|Privacy|Terms|Login|Register|Menu|Navigation|Footer|Header|Sidebar)\\b/gi, '')\n    .replace(/\\b(Click here|Read more|Learn more|Show more|View all|See all)\\b/gi, '')\n    .replace(/\\b(Previous|Next|Page \\d+|Back to top)\\b/gi, '')\n    \n    // Remove social media and sharing text\n    .replace(/\\b(Share|Tweet|Facebook|LinkedIn|Instagram|Follow us|Subscribe)\\b/gi, 
'')\n    \n    // Remove common website noise\n    .replace(/\\b(Cookie|Cookies|GDPR|Accept|Decline|Consent)\\b/gi, '')\n    .replace(/\\b(Advertisement|Ad|Sponsored|Promotion)\\b/gi, '')\n    \n    // Remove excessive punctuation and symbols\n    .replace(/[^\\w\\s.,!?;:()\\-\"']/g, '') // Keep only essential punctuation\n    .replace(/\\.{2,}/g, '.') // Replace multiple dots with single dot\n    .replace(/\\?{2,}/g, '?') // Replace multiple question marks\n    .replace(/!{2,}/g, '!') // Replace multiple exclamation marks\n    \n    // Clean up whitespace and line breaks\n    .replace(/\\n{3,}/g, '\\n\\n') // Replace multiple line breaks with double\n    .replace(/\\s+/g, ' ') // Normalize whitespace\n    .replace(/\\s*\\n\\s*/g, '\\n') // Clean line breaks\n    \n    // Remove lines that are too short (likely noise)\n    .split('\\n')\n    .filter(line => line.trim().length > 10) // Remove very short lines\n    .join('\\n')\n    \n    .trim();\n  \n  // Additional quality checks\n  const wordCount = cleaned.split(/\\s+/).length;\n  const hasMinimumContent = wordCount >= 50; // Minimum 50 words\n  \n  // Check if content is mostly meaningful (not just numbers/symbols)\n  const meaningfulContent = cleaned.replace(/[^\\w\\s]/g, '').length > cleaned.length * 0.7;\n  \n  // Extract additional metadata for better context\n  const extractedTitle = raw.match(/^#{1,3}\\s+(.+)$/m)?.[1] || '';\n  const domain = (item.json.result?.url || item.json.url || '').replace(/^https?:\\/\\//, '').split('/')[0];\n  \n  return {\n    json: {\n      url: item.json.result?.url || item.json.url || '',\n      cleanedText: cleaned,\n      wordCount: wordCount,\n      hasMinimumContent: hasMinimumContent,\n      meaningfulContent: meaningfulContent,\n      extractedTitle: extractedTitle,\n      domain: domain,\n      contentLength: cleaned.length,\n      // Quality score for filtering\n      qualityScore: (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0)\n    }\n  };\n});"
      },
      "typeVersion": 2
    },
    {
      "id": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
      "name": "Supabase Vector Store_documents",
      "type": "@n8n/n8n-nodes-langchain.vectorStoreSupabase",
      "position": [
        -2544,
        -672
      ],
      "parameters": {
        "mode": "insert",
        "options": {
          "queryName": "match_documents"
        },
        "tableName": {
          "__rl": true,
          "mode": "list",
          "value": "documents",
          "cachedResultName": "documents"
        }
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "edb03374-1674-4070-b8a6-7afff6118f9a",
      "name": "Get a row - scrape_queue Table",
      "type": "n8n-nodes-base.supabase",
      "position": [
        -4912,
        -592
      ],
      "parameters": {
        "filters": {
          "conditions": [
            {
              "keyName": "url",
              "keyValue": "={{ $json.url }}"
            }
          ]
        },
        "tableId": "scrape_queue",
        "operation": "get"
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "57358b66-0d48-4d53-a188-c5c550e46a9e",
      "name": "更新 scrape_queue 表中的一行",
      "type": "n8n-nodes-base.supabase",
      "position": [
        -2224,
        -992
      ],
      "parameters": {
        "filters": {
          "conditions": [
            {
              "keyName": "url",
              "keyValue": "={{ $('Get a row - scrape_queue Table').item.json.url }}",
              "condition": "eq"
            }
          ]
        },
        "tableId": "scrape_queue",
        "fieldsUi": {
          "fieldValues": [
            {
              "fieldId": "status",
              "fieldValue": "={{ $('Crawl4AI_Task Status').item.json.status }}"
            },
            {
              "fieldId": "task_id",
              "fieldValue": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
            }
          ]
        },
        "operation": "update"
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "3291a358-282c-4cc2-a869-c9b4651e157e",
      "name": "更新 scrape_queue 表中的一行1",
      "type": "n8n-nodes-base.supabase",
      "position": [
        -3984,
        -1072
      ],
      "parameters": {
        "filters": {
          "conditions": [
            {
              "keyName": "url",
              "keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
              "condition": "eq"
            }
          ]
        },
        "tableId": "scrape_queue",
        "fieldsUi": {
          "fieldValues": [
            {
              "fieldId": "task_id",
              "fieldValue": "={{ $json.task_id }}"
            },
            {
              "fieldId": "status",
              "fieldValue": "={{ $json.error.status }}"
            }
          ]
        },
        "operation": "update"
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
      "name": "等待1",
      "type": "n8n-nodes-base.wait",
      "position": [
        -4352,
        -208
      ],
      "webhookId": "32f2ac99-68dc-4afc-8ebb-f64625cc96ef",
      "parameters": {
        "unit": "minutes"
      },
      "typeVersion": 1.1
    },
    {
      "id": "10aecbd3-6fd8-420f-b997-34d68eecde54",
      "name": "质量筛选节点",
      "type": "n8n-nodes-base.code",
      "onError": "continueRegularOutput",
      "position": [
        -3264,
        -768
      ],
      "parameters": {
        "jsCode": "// Filter out low-quality content\nreturn items.filter(item => {\n  const quality = item.json.qualityScore || 0;\n  const minWords = item.json.wordCount >= 50;\n  const hasContent = item.json.cleanedText.length > 200;\n  \n  return quality >= 0.5 && minWords && hasContent;\n});"
      },
      "typeVersion": 2
    },
    {
      "id": "9473c86c-7525-41f6-a2be-f7750d930317",
      "name": "内容类型检测",
      "type": "n8n-nodes-base.code",
      "onError": "continueRegularOutput",
      "position": [
        -3008,
        -768
      ],
      "parameters": {
        "jsCode": "// Content Type Detection - Fixed Version\nreturn items.map(item => {\n  const text = item.json.cleanedText || '';\n  \n  // Content type detection function\n  const detectContentType = (text) => {\n    if (!text || text.length < 10) {\n      return 'unknown';\n    }\n    \n    const lowerText = text.toLowerCase();\n    \n    // Check for code content\n    if (lowerText.includes('function') || lowerText.includes('class') || \n        lowerText.includes('import') || lowerText.includes('def ') ||\n        lowerText.includes('var ') || lowerText.includes('const ')) {\n      return 'code';\n    }\n    \n    // Check for tutorial content\n    if (lowerText.includes('step 1') || lowerText.includes('tutorial') || \n        lowerText.includes('how to') || lowerText.includes('guide') ||\n        lowerText.includes('walkthrough')) {\n      return 'tutorial';\n    }\n    \n    // Check for FAQ content\n    if (lowerText.includes('faq') || lowerText.includes('q:') || \n        lowerText.includes('a:') || lowerText.includes('question') ||\n        lowerText.includes('frequently asked')) {\n      return 'faq';\n    }\n    \n    // Check for documentation\n    if (lowerText.includes('api') || lowerText.includes('documentation') ||\n        lowerText.includes('reference') || lowerText.includes('manual')) {\n      return 'documentation';\n    }\n    \n    // Check for news/blog content\n    if (lowerText.includes('published') || lowerText.includes('author') ||\n        lowerText.includes('posted') || lowerText.includes('blog')) {\n      return 'blog';\n    }\n    \n    // Check for product/service pages\n    if (lowerText.includes('price') || lowerText.includes('buy') ||\n        lowerText.includes('purchase') || lowerText.includes('product')) {\n      return 'product';\n    }\n    \n    // Default to article\n    return 'article';\n  };\n  \n  // Detect content type\n  const contentType = detectContentType(text);\n  \n  // Return the item with added content 
type\n  return {\n    json: {\n      ...item.json, // Keep all existing data\n      contentType: contentType\n    }\n  };\n});"
      },
      "typeVersion": 2
    },
    {
      "id": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
      "name": "更好的元数据提取",
      "type": "n8n-nodes-base.code",
      "onError": "continueRegularOutput",
      "position": [
        -2784,
        -768
      ],
      "parameters": {
        "jsCode": "// Enhanced metadata extraction - Fixed Version\nreturn items.map(item => {\n  const cleaned = item.json.cleanedText || '';\n  const url = item.json.url || '';\n  const contentType = item.json.contentType || 'article';\n  \n  // Extract title from the cleaned text (look for first meaningful line)\n  const extractTitle = (text) => {\n    if (!text) return '';\n    \n    const lines = text.split('\\n').filter(line => line.trim().length > 0);\n    if (lines.length === 0) return '';\n    \n    // Find the first substantial line (likely the title)\n    const titleLine = lines.find(line => \n      line.trim().length > 10 && \n      line.trim().length < 200 &&\n      !line.includes('http') &&\n      !line.includes('www.')\n    );\n    \n    return titleLine ? titleLine.trim() : lines[0].trim();\n  };\n  \n  // Extract domain from URL\n  const extractDomain = (url) => {\n    if (!url) return '';\n    try {\n      return url.replace(/^https?:\\/\\//, '').split('/')[0];\n    } catch (e) {\n      return '';\n    }\n  };\n  \n  // Count words in the text\n  const countWords = (text) => {\n    if (!text) return 0;\n    return text.trim().split(/\\s+/).filter(word => word.length > 0).length;\n  };\n  \n  // Calculate quality score\n  const calculateQualityScore = (text, wordCount) => {\n    if (!text || wordCount < 50) return 0;\n    \n    const meaningfulContent = text.replace(/[^\\w\\s]/g, '').length > text.length * 0.7;\n    const hasMinimumContent = wordCount >= 50;\n    \n    return (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 
0.5 : 0);\n  };\n  \n  // Simple language detection (basic version)\n  const detectLanguage = (text) => {\n    if (!text) return 'unknown';\n    \n    // Simple heuristic - could be improved with a proper language detection library\n    const commonEnglishWords = ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with'];\n    const commonDutchWords = ['de', 'het', 'en', 'van', 'een', 'in', 'op', 'te', 'aan', 'met'];\n    \n    const lowerText = text.toLowerCase();\n    const englishCount = commonEnglishWords.filter(word => lowerText.includes(` ${word} `)).length;\n    const dutchCount = commonDutchWords.filter(word => lowerText.includes(` ${word} `)).length;\n    \n    if (englishCount > dutchCount) return 'en';\n    if (dutchCount > englishCount) return 'nl';\n    return 'unknown';\n  };\n  \n  // Extract all metadata\n  const extractedTitle = extractTitle(cleaned);\n  const domain = extractDomain(url);\n  const wordCount = countWords(cleaned);\n  const qualityScore = calculateQualityScore(cleaned, wordCount);\n  const detectedLanguage = detectLanguage(cleaned);\n  \n  // Enhanced metadata object\n  const metadata = {\n    page: url,\n    title: extractedTitle,\n    domain: domain,\n    contentType: contentType,\n    wordCount: wordCount,\n    scrapedDate: new Date().toISOString(),\n    language: detectedLanguage,\n    qualityScore: qualityScore,\n    contentLength: cleaned.length\n  };\n  \n  return {\n    json: {\n      ...item.json, // Keep all existing data\n      metadata: metadata,\n      // Also keep individual fields for easier access\n      extractedTitle: extractedTitle,\n      domain: domain,\n      wordCount: wordCount,\n      qualityScore: qualityScore,\n      detectedLanguage: detectedLanguage\n    }\n  };\n});"
      },
      "typeVersion": 2
    },
    {
      "id": "f2d3d6a3-b48e-4b08-bf8e-f8fff06d3494",
      "name": "便签4",
      "type": "n8n-nodes-base.stickyNote",
      "position": [
        -3536,
        -912
      ],
      "parameters": {
        "color": 6,
        "width": 900,
        "height": 340,
        "content": "## 清理 HTML 代码"
      },
      "typeVersion": 1
    },
    {
      "id": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
      "name": "If1",
      "type": "n8n-nodes-base.if",
      "position": [
        -3632,
        -288
      ],
      "parameters": {
        "options": {},
        "conditions": {
          "options": {
            "version": 2,
            "leftValue": "",
            "caseSensitive": true,
            "typeValidation": "strict"
          },
          "combinator": "and",
          "conditions": [
            {
              "id": "3e84e5d8-e49c-4a7b-98c3-9e115f592c10",
              "operator": {
                "type": "string",
                "operation": "exists",
                "singleValue": true
              },
              "leftValue": "={{ $json.task_id }}",
              "rightValue": ""
            },
            {
              "id": "c6a0525f-3224-4ad5-8d0a-e0a7a27fb5d1",
              "operator": {
                "type": "number",
                "operation": "gte"
              },
              "leftValue": "={{ $json.attempt_count }}",
              "rightValue": 10
            }
          ]
        }
      },
      "typeVersion": 2.2
    },
    {
      "id": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
      "name": "Update a row in scrape_queue Table2",
      "type": "n8n-nodes-base.supabase",
      "position": [
        -3376,
        -176
      ],
      "parameters": {
        "filters": {
          "conditions": [
            {
              "keyName": "url",
              "keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
              "condition": "eq"
            }
          ]
        },
        "tableId": "scrape_queue",
        "fieldsUi": {
          "fieldValues": [
            {
              "fieldId": "task_id",
              "fieldValue": "={{ $json.task_id }}"
            },
            {
              "fieldId": "status",
              "fieldValue": "=error"
            }
          ]
        },
        "operation": "update"
      },
      "credentials": {
        "supabaseApi": {
          "id": "CYPZsYCPJqrO9xBO",
          "name": "Supabase_N8N AI Agent Assistant_marinextai"
        }
      },
      "typeVersion": 1
    },
    {
      "id": "44c7fe75-0e88-4114-b506-6e7850c2a038",
      "name": "Task_id Counter",
      "type": "n8n-nodes-base.code",
      "position": [
        -3856,
        -288
      ],
      "parameters": {
        "jsCode": "// Simple counter that resets for each new task ID\nif (typeof globalThis.currentTaskId === 'undefined') {\n  globalThis.currentTaskId = null;\n  globalThis.currentCounter = 0;\n}\n\nreturn items.map(item => {\n  const taskId = item.json.task_id;\n  \n  // Check if this is a new task ID\n  if (globalThis.currentTaskId !== taskId) {\n    // New task ID detected - reset counter\n    globalThis.currentTaskId = taskId;\n    globalThis.currentCounter = 1;\n  } else {\n    // Same task ID - increment counter\n    globalThis.currentCounter++;\n  }\n  \n  return {\n    json: {\n      ...item.json,\n      attempt_count: globalThis.currentCounter\n    }\n  };\n});"
      },
      "typeVersion": 2
    }
  ],
  "pinData": {},
  "connections": {
    "If": {
      "main": [
        [
          {
            "node": "Remove redundant data from the scraping",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Edit Fields",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If1": {
      "main": [
        [
          {
            "node": "Update a row in scrape_queue Table2",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Wait",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If2": {
      "main": [
        [
          {
            "node": "Crawl4ai Web Page Scrape",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "XML": {
      "main": [
        [
          {
            "node": "Split Out",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait": {
      "main": [
        [
          {
            "node": "Crawl4AI_Task Status",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Wait1": {
      "main": [
        [
          {
            "node": "Crawl4ai Web Page Scrape",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Out": {
      "main": [
        [
          {
            "node": "Loop Over Items1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Split Out1": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Edit Fields": {
      "main": [
        [
          {
            "node": "Task_id Counter",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "HTTP Request": {
      "main": [
        [
          {
            "node": "XML",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Format the URL": {
      "main": [
        [
          {
            "node": "Check if the URL is in the Supabase Table",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Loop Over Items": {
      "main": [
        [],
        [
          {
            "node": "Get a row - scrape_queue Table",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Task_id Counter": {
      "main": [
        [
          {
            "node": "If1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Loop Over Items1": {
      "main": [
        [
          {
            "node": "Split Out1",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Format the URL",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "URL in a new row": {
      "main": [
        [
          {
            "node": "Loop Over Items1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Embeddings OpenAI": {
      "ai_embedding": [
        [
          {
            "node": "Supabase Vector Store_documents",
            "type": "ai_embedding",
            "index": 0
          }
        ]
      ]
    },
    "Default Data Loader": {
      "ai_document": [
        [
          {
            "node": "Supabase Vector Store_documents",
            "type": "ai_document",
            "index": 0
          }
        ]
      ]
    },
    "Quality Filter Node": {
      "main": [
        [
          {
            "node": "Content Type Detection",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Crawl4AI_Task Status": {
      "main": [
        [
          {
            "node": "If",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Update a row in scrape_queue Table1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Content Type Detection": {
      "main": [
        [
          {
            "node": "Better Metadata Extraction",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Character Text Splitter": {
      "ai_textSplitter": [
        [
          {
            "node": "Default Data Loader",
            "type": "ai_textSplitter",
            "index": 0
          }
        ]
      ]
    },
    "Crawl4ai Web Page Scrape": {
      "main": [
        [
          {
            "node": "Wait",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Wait1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "If \"shouldInsert\" is true": {
      "main": [
        [
          {
            "node": "URL in a new row",
            "type": "main",
            "index": 0
          }
        ],
        [
          {
            "node": "Loop Over Items1",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Better Metadata Extraction": {
      "main": [
        [
          {
            "node": "Supabase Vector Store_documents",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Get a row - scrape_queue Table": {
      "main": [
        [
          {
            "node": "If2",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Supabase Vector Store_documents": {
      "main": [
        [
          {
            "node": "Update a row in scrape_queue Table",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "When clicking ‘Test workflow’": {
      "main": [
        [
          {
            "node": "HTTP Request",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Update a row in scrape_queue Table": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Update a row in scrape_queue Table1": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Update a row in scrape_queue Table2": {
      "main": [
        [
          {
            "node": "Loop Over Items",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Remove redundant data from the scraping": {
      "main": [
        [
          {
            "node": "Quality Filter Node",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Format the Output from the Supabase node": {
      "main": [
        [
          {
            "node": "If \"shouldInsert\" is true",
            "type": "main",
            "index": 0
          }
        ]
      ]
    },
    "Check if the URL is in the Supabase Table": {
      "main": [
        [
          {
            "node": "Format the Output from the Supabase node",
            "type": "main",
            "index": 0
          }
        ]
      ]
    }
  }
}
常见问题

如何使用这个工作流?

复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。

这个工作流适合什么场景?

这是一个高级难度的工作流,适用于 Content Creation、Multimodal AI 等场景。适合高级用户,属于包含 16+ 个节点的复杂工作流(本工作流共 40 个节点)。

需要付费吗?

本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。

工作流信息
难度等级
高级
节点数量:40
分类:2
节点类型:16
难度说明

适合高级用户,包含 16+ 个节点的复杂工作流

作者
Mariela Slavenova

Mariela Slavenova

@marielabg

🚀 Fractional Head of AI Ops | COO | CTO | I diagnose, fix & ship automations that pay for themselves | The Harden Method™ - Discover→Design→Build→Break→Harden→Launch→Monitor | Founder @ MarinextAI

外部链接
在 n8n.io 上查看 →

分享此工作流