从站点地图爬取到向量存储:创建高效的RAG工作流
高级
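这是一个面向 Content Creation、Multimodal AI 领域的自动化工作流,共包含 40 个节点,主要使用 If、Set、XML、Code、Wait 等节点。工作流从站点地图(sitemap.xml)读取 URL 并写入 Supabase 的 scrape_queue 队列表,通过 Crawl4AI 抓取页面、清洗文本后,使用 OpenAI 嵌入写入 Supabase 向量存储,用于构建高效的 RAG 工作流。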
前置要求
- PostgreSQL 数据库连接信息(用于一次性建表;documents 表使用 VECTOR 列,需启用 pgvector 扩展)
- Supabase URL 和 API Key
- Crawl4AI 服务的 HTTP Header 认证凭证
- OpenAI API Key(用于 text-embedding-ada-002 嵌入)
使用的节点 (40 个)
工作流预览
可视化展示节点连接关系,支持缩放和平移
导出工作流
复制以下 JSON 配置到 n8n 导入,即可使用此工作流
{
"meta": {
"instanceId": "0862f70dc42e115052f6a2d4c2b6537665b4361a614cec7cd17d1c45c8868621",
"templateCredsSetupCompleted": true
},
"nodes": [
{
"id": "ab180eb3-c086-4f9f-b9d0-f3f56056a416",
"name": "When clicking ‘Test workflow’",
"type": "n8n-nodes-base.manualTrigger",
"position": [
-6816,
-304
],
"parameters": {},
"typeVersion": 1
},
{
"id": "20e77374-c3ce-457f-945c-d6f6dc928de1",
"name": "HTTP Request",
"type": "n8n-nodes-base.httpRequest",
"position": [
-6624,
-304
],
"parameters": {
"url": "https://www.kiekens.com/sitemap.xml",
"options": {}
},
"typeVersion": 4.2
},
{
"id": "b23dd724-1bd7-4eef-9e22-8bef987b2128",
"name": "XML",
"type": "n8n-nodes-base.xml",
"position": [
-6432,
-304
],
"parameters": {
"options": {}
},
"typeVersion": 1
},
{
"id": "4715b380-f386-4926-892e-2c133a1155c1",
"name": "Split Out",
"type": "n8n-nodes-base.splitOut",
"position": [
-6224,
-304
],
"parameters": {
"options": {},
"fieldToSplitOut": "urlset.url"
},
"typeVersion": 1
},
{
"id": "56181432-63f2-4d93-be6d-6f1489e04ca9",
"name": "Loop Over Items",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5152,
-592
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f52b3e19-7d64-4f3d-848d-81cf2b65bb15",
"name": "Wait",
"type": "n8n-nodes-base.wait",
"position": [
-4192,
-608
],
"webhookId": "9af87c5e-b07f-48dc-9ca8-61b471a24cad",
"parameters": {
"amount": 30
},
"typeVersion": 1.1
},
{
"id": "961143cf-c387-4e2d-a477-0988c0b0f512",
"name": "If",
"type": "n8n-nodes-base.if",
"position": [
-3728,
-608
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "9d90c1ce-590e-40a5-ae8c-d92326032975",
"operator": {
"type": "string",
"operation": "equals"
},
"leftValue": "={{ $json.status }}",
"rightValue": "completed"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "991580c5-10ed-4bab-811e-2ec50d4050fd",
"name": "Default Data Loader",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"position": [
-2384,
-496
],
"parameters": {
"options": {
"metadata": {
"metadataValues": [
{
"name": "page",
"value": "={{ $json.result.url }}"
}
]
}
},
"jsonData": "={{ $json.cleanedText }}",
"jsonMode": "expressionData"
},
"typeVersion": 1
},
{
"id": "0fc79f0d-8ebd-4d61-ac29-7ba65284af52",
"name": "Character Text Splitter",
"type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
"position": [
-2368,
-352
],
"parameters": {
"chunkSize": 5000
},
"typeVersion": 1
},
{
"id": "bc5aac68-bb66-4c9c-abd7-9a913b0a56fa",
"name": "Embeddings OpenAI",
"type": "@n8n/n8n-nodes-langchain.embeddingsOpenAi",
"position": [
-2528,
-464
],
"parameters": {
"model": "text-embedding-ada-002",
"options": {}
},
"credentials": {
"openAiApi": {
"id": "OwpPpcltPaXyVklS",
"name": "OpenAi_Mariela.b.d."
}
},
"typeVersion": 1.1
},
{
"id": "e3b525eb-7a3f-456d-a476-b013293c85e0",
"name": "Edit Fields",
"type": "n8n-nodes-base.set",
"position": [
-4064,
-288
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "f2bcdb54-e1fe-4670-99aa-6eec973bf5f1",
"name": "task_id",
"type": "string",
"value": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "bdbed5ea-d1a1-4922-a7b7-759466709fcb",
"name": "Crawl4AI_Task Status",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-3968,
-608
],
"parameters": {
"url": "=https://crawl4ai-app-nrcsv.ondigitalocean.app/task/{{ $json.task_id }}",
"options": {
"timeout": 5000
},
"authentication": "genericCredentialType",
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "f0da6b36-885a-4e86-b044-f3b490bf3829",
"name": "Loop Over Items1",
"type": "n8n-nodes-base.splitInBatches",
"position": [
-5824,
144
],
"parameters": {
"options": {}
},
"typeVersion": 3
},
{
"id": "f78a39bd-183c-4985-b1b1-f3142dfe31f3",
"name": "If2",
"type": "n8n-nodes-base.if",
"position": [
-4736,
-592
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "or",
"conditions": [
{
"id": "fbc89427-990b-45d0-8538-e403c1b18ddd",
"operator": {
"type": "string",
"operation": "contains"
},
"leftValue": "={{ $json.status }}",
"rightValue": "pending"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "b6dfe888-4e2e-4c74-8a66-c3db28604514",
"name": "Split Out1",
"type": "n8n-nodes-base.splitOut",
"position": [
-5392,
-384
],
"parameters": {
"include": "selectedOtherFields",
"options": {},
"fieldToSplitOut": "url",
"fieldsToInclude": "status"
},
"typeVersion": 1
},
{
"id": "78f05cb5-8b9c-4f51-b252-4ca2195b52ad",
"name": "Format the URL",
"type": "n8n-nodes-base.set",
"position": [
-5648,
160
],
"parameters": {
"options": {},
"assignments": {
"assignments": [
{
"id": "9038a5b3-6985-4edc-bdd1-8dc5a3e8877c",
"name": "loc",
"type": "string",
"value": "={{ $json.loc.trim().toLowerCase() }}"
}
]
}
},
"typeVersion": 3.4
},
{
"id": "805f1fea-841b-40aa-a055-de7ddbbb306f",
"name": "Check if the URL is in the Supabase Table",
"type": "n8n-nodes-base.supabase",
"onError": "continueErrorOutput",
"position": [
-5456,
160
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.loc }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"operation": "getAll",
"returnAll": true
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 1,
"alwaysOutputData": true,
"waitBetweenTries": 5000
},
{
"id": "4f6e6ccb-7757-4e9f-b50c-9acb2fe99009",
"name": "Format the Output from the Supabase node",
"type": "n8n-nodes-base.code",
"position": [
-5184,
160
],
"parameters": {
"jsCode": "const supabaseResult = $json;\n\n// Get the clean URL from the Set node ('Format the URL')\nconst originalLoc = $('Format the URL').item.json.loc;\nconst cleanUrl = typeof originalLoc === 'string' ? originalLoc.trim().toLowerCase() : '';\n\n// Check if URL already exists\n// Empty object {} means URL doesn't exist, so we should insert\nconst shouldInsert = Object.keys(supabaseResult).length === 0;\n\nreturn [\n {\n json: {\n url: cleanUrl,\n shouldInsert,\n }\n }\n];"
},
"typeVersion": 2
},
{
"id": "54ed36e4-e675-4bd2-a74e-aeadbe7f486c",
"name": "If \"shouldInsert\" is true",
"type": "n8n-nodes-base.if",
"position": [
-4992,
160
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "f3a00d98-73af-4d35-b4e5-5158c120753f",
"operator": {
"type": "boolean",
"operation": "true",
"singleValue": true
},
"leftValue": "={{ $json.shouldInsert }}",
"rightValue": "true"
}
]
}
},
"typeVersion": 2.2
},
{
"id": "483dc0c7-da52-423a-a3bb-cc9ef6d6f1df",
"name": "URL in a new row",
"type": "n8n-nodes-base.supabase",
"position": [
-4752,
272
],
"parameters": {
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "url",
"fieldValue": "={{ $json.url }}"
}
]
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "632752e1-138e-481f-92ad-2ac14c245c45",
"name": "便签1",
"type": "n8n-nodes-base.stickyNote",
"position": [
-5888,
64
],
"parameters": {
"width": 1280,
"height": 500,
"content": "## 将所有网站的 URL 放入 Supabase 表 - scrape_queue"
},
"typeVersion": 1
},
{
"id": "5fc57e6f-771c-4eaa-ba8e-8e233dc2a343",
"name": "在 Supabase 中创建表 scrape_queue",
"type": "n8n-nodes-base.postgres",
"position": [
-6816,
-688
],
"parameters": {
"query": "CREATE TABLE scrape_queue (\n id uuid DEFAULT gen_random_uuid() PRIMARY KEY,\n url text NOT NULL UNIQUE,\n status text NOT NULL DEFAULT 'pending', -- 'pending', 'completed', 'error'\n task_id text,\n result text,\n created_at timestamp with time zone DEFAULT now(),\n updated_at timestamp with time zone DEFAULT now()\n);\n\n-- Optional: Auto-update updated_at on row change\nCREATE OR REPLACE FUNCTION update_updated_at_column()\nRETURNS TRIGGER AS $$\nBEGIN\n NEW.updated_at = now();\n RETURN NEW;\nEND;\n$$ language 'plpgsql';\n\nCREATE TRIGGER update_scrape_queue_updated_at\nBEFORE UPDATE ON scrape_queue\nFOR EACH ROW\nEXECUTE PROCEDURE update_updated_at_column();",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "8b2666b7-0eb6-42df-9ae2-e204516dd3d1",
"name": "在 Supabase 中创建表 documents",
"type": "n8n-nodes-base.postgres",
"position": [
-6608,
-688
],
"parameters": {
"query": "CREATE TABLE documents (\n id SERIAL PRIMARY KEY,\n content TEXT,\n metadata JSONB,\n embedding VECTOR(1536) -- Adjust the dimension size based on your OpenAI model (e.g. ada-002 returns 1536)\n);",
"options": {},
"operation": "executeQuery"
},
"credentials": {
"postgres": {
"id": "k1GeBv6AjFuwp2B1",
"name": "Postgres_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 2.6
},
{
"id": "7c7b8f66-00f6-48db-af03-fba30dc5e6b1",
"name": "便签2",
"type": "n8n-nodes-base.stickyNote",
"position": [
-6848,
-768
],
"parameters": {
"color": 3,
"width": 500,
"height": 280,
"content": "## 执行一次"
},
"typeVersion": 1
},
{
"id": "82279582-c71b-43aa-8e60-6b8af7ce866c",
"name": "便签",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4992,
-736
],
"parameters": {
"color": 4,
"width": 460,
"height": 360,
"content": "## 从 Supabase 获取 URL 并检查是否完成"
},
"typeVersion": 1
},
{
"id": "8b2245b2-cdc2-408a-879b-260335a10bcb",
"name": "便签3",
"type": "n8n-nodes-base.stickyNote",
"position": [
-4448,
-736
],
"parameters": {
"color": 5,
"width": 640,
"height": 360,
"content": "## Crawl4AI URL 抓取"
},
"typeVersion": 1
},
{
"id": "b42143d2-1e13-4031-996a-26af2dc26632",
"name": "Crawl4ai Web Page Scrape",
"type": "n8n-nodes-base.httpRequest",
"onError": "continueErrorOutput",
"position": [
-4384,
-608
],
"parameters": {
"url": "https://crawl4ai-app-nrcsv.ondigitalocean.app/crawl",
"method": "POST",
"options": {},
"sendBody": true,
"authentication": "genericCredentialType",
"bodyParameters": {
"parameters": [
{
"name": "urls",
"value": "={{ $json.url }}"
},
{
"name": "priority",
"value": "10"
}
]
},
"genericAuthType": "httpHeaderAuth"
},
"credentials": {
"httpHeaderAuth": {
"id": "De808MMiUFOFLbNm",
"name": "Crawl4ai_marinextai"
}
},
"retryOnFail": true,
"typeVersion": 4.2,
"waitBetweenTries": 5000
},
{
"id": "6ac1fda6-8363-4cff-8810-7cb2ffa63b67",
"name": "Remove redundant data from the scraping",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3488,
-768
],
"parameters": {
"jsCode": "return items.map(item => {\n // Handle both data structures\n const raw = item.json.result?.markdown || item.json.cleanedText || item.json.html || '';\n \n // Add a safety check for null/undefined\n if (!raw) {\n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: '',\n error: 'No content found to process'\n }\n };\n }\n \n let cleaned = raw\n // Remove headers but keep the content structure\n .replace(/^#{1,6}\\s+(.+)$/gm, '$1') // Convert headers to plain text\n \n // Remove markdown links but keep the text\n .replace(/\\[([^\\]]+)\\]\\([^)]+\\)/g, '$1') // Keep link text, remove URL\n \n // Remove code blocks completely\n .replace(/```[\\s\\S]*?```/g, '') \n .replace(/`([^`]+)`/g, '$1') // Remove inline code backticks but keep content\n \n // Remove markdown formatting\n .replace(/\\*\\*([^*]+)\\*\\*/g, '$1') // Remove bold formatting\n .replace(/\\*([^*]+)\\*/g, '$1') // Remove italic formatting\n .replace(/_{2,}([^_]+)_{2,}/g, '$1') // Remove underline formatting\n .replace(/~~([^~]+)~~/g, '$1') // Remove strikethrough\n \n // Remove lists formatting but keep content\n .replace(/^\\s*[-*+]\\s+/gm, '') // Remove bullet points\n .replace(/^\\s*\\d+\\.\\s+/gm, '') // Remove numbered lists\n \n // Remove HTML remnants\n .replace(/<[^>]*>/g, '') // Remove any remaining HTML tags\n .replace(/&[a-zA-Z0-9#]+;/g, '') // Remove HTML entities\n \n // Remove navigation and common web elements\n .replace(/\\b(Home|About|Contact|Privacy|Terms|Login|Register|Menu|Navigation|Footer|Header|Sidebar)\\b/gi, '')\n .replace(/\\b(Click here|Read more|Learn more|Show more|View all|See all)\\b/gi, '')\n .replace(/\\b(Previous|Next|Page \\d+|Back to top)\\b/gi, '')\n \n // Remove social media and sharing text\n .replace(/\\b(Share|Tweet|Facebook|LinkedIn|Instagram|Follow us|Subscribe)\\b/gi, '')\n \n // Remove common website noise\n .replace(/\\b(Cookie|Cookies|GDPR|Accept|Decline|Consent)\\b/gi, '')\n .replace(/\\b(Advertisement|Ad|Sponsored|Promotion)\\b/gi, '')\n \n // Remove excessive punctuation and symbols\n .replace(/[^\\w\\s.,!?;:()\\-\"']/g, '') // Keep only essential punctuation\n .replace(/\\.{2,}/g, '.') // Replace multiple dots with single dot\n .replace(/\\?{2,}/g, '?') // Replace multiple question marks\n .replace(/!{2,}/g, '!') // Replace multiple exclamation marks\n \n // Clean up whitespace and line breaks\n .replace(/\\n{3,}/g, '\\n\\n') // Replace multiple line breaks with double\n .replace(/\\s+/g, ' ') // Normalize whitespace\n .replace(/\\s*\\n\\s*/g, '\\n') // Clean line breaks\n \n // Remove lines that are too short (likely noise)\n .split('\\n')\n .filter(line => line.trim().length > 10) // Remove very short lines\n .join('\\n')\n \n .trim();\n \n // Additional quality checks\n const wordCount = cleaned.split(/\\s+/).length;\n const hasMinimumContent = wordCount >= 50; // Minimum 50 words\n \n // Check if content is mostly meaningful (not just numbers/symbols)\n const meaningfulContent = cleaned.replace(/[^\\w\\s]/g, '').length > cleaned.length * 0.7;\n \n // Extract additional metadata for better context\n const extractedTitle = raw.match(/^#{1,3}\\s+(.+)$/m)?.[1] || '';\n const domain = (item.json.result?.url || item.json.url || '').replace(/^https?:\\/\\//, '').split('/')[0];\n \n return {\n json: {\n url: item.json.result?.url || item.json.url || '',\n cleanedText: cleaned,\n wordCount: wordCount,\n hasMinimumContent: hasMinimumContent,\n meaningfulContent: meaningfulContent,\n extractedTitle: extractedTitle,\n domain: domain,\n contentLength: cleaned.length,\n // Quality score for filtering\n qualityScore: (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0)\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "520a512f-2da8-4cb7-b834-fe6fbfa2ad02",
"name": "Supabase Vector Store_documents",
"type": "@n8n/n8n-nodes-langchain.vectorStoreSupabase",
"position": [
-2544,
-672
],
"parameters": {
"mode": "insert",
"options": {
"queryName": "match_documents"
},
"tableName": {
"__rl": true,
"mode": "list",
"value": "documents",
"cachedResultName": "documents"
}
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "edb03374-1674-4070-b8a6-7afff6118f9a",
"name": "Get a row - scrape_queue Table",
"type": "n8n-nodes-base.supabase",
"position": [
-4912,
-592
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $json.url }}"
}
]
},
"tableId": "scrape_queue",
"operation": "get"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "57358b66-0d48-4d53-a188-c5c550e46a9e",
"name": "Update a row in scrape_queue Table",
"type": "n8n-nodes-base.supabase",
"position": [
-2224,
-992
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').item.json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "status",
"fieldValue": "={{ $('Crawl4AI_Task Status').item.json.status }}"
},
{
"fieldId": "task_id",
"fieldValue": "={{ $('Crawl4ai Web Page Scrape').item.json.task_id }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "3291a358-282c-4cc2-a869-c9b4651e157e",
"name": "Update a row in scrape_queue Table1",
"type": "n8n-nodes-base.supabase",
"position": [
-3984,
-1072
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "={{ $json.error.status }}"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "f801de82-dbe9-44c1-a6c3-ac2847e93060",
"name": "Wait1",
"type": "n8n-nodes-base.wait",
"position": [
-4352,
-208
],
"webhookId": "32f2ac99-68dc-4afc-8ebb-f64625cc96ef",
"parameters": {
"unit": "minutes"
},
"typeVersion": 1.1
},
{
"id": "10aecbd3-6fd8-420f-b997-34d68eecde54",
"name": "Quality Filter Node",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3264,
-768
],
"parameters": {
"jsCode": "// Filter out low-quality content\nreturn items.filter(item => {\n const quality = item.json.qualityScore || 0;\n const minWords = item.json.wordCount >= 50;\n const hasContent = item.json.cleanedText.length > 200;\n \n return quality >= 0.5 && minWords && hasContent;\n});"
},
"typeVersion": 2
},
{
"id": "9473c86c-7525-41f6-a2be-f7750d930317",
"name": "Content Type Detection",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-3008,
-768
],
"parameters": {
"jsCode": "// Content Type Detection - Fixed Version\nreturn items.map(item => {\n const text = item.json.cleanedText || '';\n \n // Content type detection function\n const detectContentType = (text) => {\n if (!text || text.length < 10) {\n return 'unknown';\n }\n \n const lowerText = text.toLowerCase();\n \n // Check for code content\n if (lowerText.includes('function') || lowerText.includes('class') || \n lowerText.includes('import') || lowerText.includes('def ') ||\n lowerText.includes('var ') || lowerText.includes('const ')) {\n return 'code';\n }\n \n // Check for tutorial content\n if (lowerText.includes('step 1') || lowerText.includes('tutorial') || \n lowerText.includes('how to') || lowerText.includes('guide') ||\n lowerText.includes('walkthrough')) {\n return 'tutorial';\n }\n \n // Check for FAQ content\n if (lowerText.includes('faq') || lowerText.includes('q:') || \n lowerText.includes('a:') || lowerText.includes('question') ||\n lowerText.includes('frequently asked')) {\n return 'faq';\n }\n \n // Check for documentation\n if (lowerText.includes('api') || lowerText.includes('documentation') ||\n lowerText.includes('reference') || lowerText.includes('manual')) {\n return 'documentation';\n }\n \n // Check for news/blog content\n if (lowerText.includes('published') || lowerText.includes('author') ||\n lowerText.includes('posted') || lowerText.includes('blog')) {\n return 'blog';\n }\n \n // Check for product/service pages\n if (lowerText.includes('price') || lowerText.includes('buy') ||\n lowerText.includes('purchase') || lowerText.includes('product')) {\n return 'product';\n }\n \n // Default to article\n return 'article';\n };\n \n // Detect content type\n const contentType = detectContentType(text);\n \n // Return the item with added content type\n return {\n json: {\n ...item.json, // Keep all existing data\n contentType: contentType\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "54873bf5-ecb2-44e3-9dfb-e0e6ace02917",
"name": "Better Metadata Extraction",
"type": "n8n-nodes-base.code",
"onError": "continueRegularOutput",
"position": [
-2784,
-768
],
"parameters": {
"jsCode": "// Enhanced metadata extraction - Fixed Version\nreturn items.map(item => {\n const cleaned = item.json.cleanedText || '';\n const url = item.json.url || '';\n const contentType = item.json.contentType || 'article';\n \n // Extract title from the cleaned text (look for first meaningful line)\n const extractTitle = (text) => {\n if (!text) return '';\n \n const lines = text.split('\\n').filter(line => line.trim().length > 0);\n if (lines.length === 0) return '';\n \n // Find the first substantial line (likely the title)\n const titleLine = lines.find(line => \n line.trim().length > 10 && \n line.trim().length < 200 &&\n !line.includes('http') &&\n !line.includes('www.')\n );\n \n return titleLine ? titleLine.trim() : lines[0].trim();\n };\n \n // Extract domain from URL\n const extractDomain = (url) => {\n if (!url) return '';\n try {\n return url.replace(/^https?:\\/\\//, '').split('/')[0];\n } catch (e) {\n return '';\n }\n };\n \n // Count words in the text\n const countWords = (text) => {\n if (!text) return 0;\n return text.trim().split(/\\s+/).filter(word => word.length > 0).length;\n };\n \n // Calculate quality score\n const calculateQualityScore = (text, wordCount) => {\n if (!text || wordCount < 50) return 0;\n \n const meaningfulContent = text.replace(/[^\\w\\s]/g, '').length > text.length * 0.7;\n const hasMinimumContent = wordCount >= 50;\n \n return (hasMinimumContent ? 0.5 : 0) + (meaningfulContent ? 0.5 : 0);\n };\n \n // Simple language detection (basic version)\n const detectLanguage = (text) => {\n if (!text) return 'unknown';\n \n // Simple heuristic - could be improved with a proper language detection library\n const commonEnglishWords = ['the', 'and', 'is', 'in', 'to', 'of', 'a', 'for', 'on', 'with'];\n const commonDutchWords = ['de', 'het', 'en', 'van', 'een', 'in', 'op', 'te', 'aan', 'met'];\n \n const lowerText = text.toLowerCase();\n const englishCount = commonEnglishWords.filter(word => lowerText.includes(` ${word} `)).length;\n const dutchCount = commonDutchWords.filter(word => lowerText.includes(` ${word} `)).length;\n \n if (englishCount > dutchCount) return 'en';\n if (dutchCount > englishCount) return 'nl';\n return 'unknown';\n };\n \n // Extract all metadata\n const extractedTitle = extractTitle(cleaned);\n const domain = extractDomain(url);\n const wordCount = countWords(cleaned);\n const qualityScore = calculateQualityScore(cleaned, wordCount);\n const detectedLanguage = detectLanguage(cleaned);\n \n // Enhanced metadata object\n const metadata = {\n page: url,\n title: extractedTitle,\n domain: domain,\n contentType: contentType,\n wordCount: wordCount,\n scrapedDate: new Date().toISOString(),\n language: detectedLanguage,\n qualityScore: qualityScore,\n contentLength: cleaned.length\n };\n \n return {\n json: {\n ...item.json, // Keep all existing data\n metadata: metadata,\n // Also keep individual fields for easier access\n extractedTitle: extractedTitle,\n domain: domain,\n wordCount: wordCount,\n qualityScore: qualityScore,\n detectedLanguage: detectedLanguage\n }\n };\n});"
},
"typeVersion": 2
},
{
"id": "f2d3d6a3-b48e-4b08-bf8e-f8fff06d3494",
"name": "便签4",
"type": "n8n-nodes-base.stickyNote",
"position": [
-3536,
-912
],
"parameters": {
"color": 6,
"width": 900,
"height": 340,
"content": "## 清理 HTML 代码"
},
"typeVersion": 1
},
{
"id": "6ddcf33d-84cb-4ee7-bf62-cb2747aff406",
"name": "If1",
"type": "n8n-nodes-base.if",
"position": [
-3632,
-288
],
"parameters": {
"options": {},
"conditions": {
"options": {
"version": 2,
"leftValue": "",
"caseSensitive": true,
"typeValidation": "strict"
},
"combinator": "and",
"conditions": [
{
"id": "3e84e5d8-e49c-4a7b-98c3-9e115f592c10",
"operator": {
"type": "string",
"operation": "exists",
"singleValue": true
},
"leftValue": "={{ $json.task_id }}",
"rightValue": ""
},
{
"id": "c6a0525f-3224-4ad5-8d0a-e0a7a27fb5d1",
"operator": {
"type": "number",
"operation": "gte"
},
"leftValue": "={{ $json.attempt_count }}",
"rightValue": 10
}
]
}
},
"typeVersion": 2.2
},
{
"id": "ffb7b9cb-a4fb-4db2-833c-331672de42bd",
"name": "Update a row in scrape_queue Table2",
"type": "n8n-nodes-base.supabase",
"position": [
-3376,
-176
],
"parameters": {
"filters": {
"conditions": [
{
"keyName": "url",
"keyValue": "={{ $('Get a row - scrape_queue Table').first().json.url }}",
"condition": "eq"
}
]
},
"tableId": "scrape_queue",
"fieldsUi": {
"fieldValues": [
{
"fieldId": "task_id",
"fieldValue": "={{ $json.task_id }}"
},
{
"fieldId": "status",
"fieldValue": "=error"
}
]
},
"operation": "update"
},
"credentials": {
"supabaseApi": {
"id": "CYPZsYCPJqrO9xBO",
"name": "Supabase_N8N AI Agent Assistant_marinextai"
}
},
"typeVersion": 1
},
{
"id": "44c7fe75-0e88-4114-b506-6e7850c2a038",
"name": "Task_id Counter",
"type": "n8n-nodes-base.code",
"position": [
-3856,
-288
],
"parameters": {
"jsCode": "// Simple counter that resets for each new task ID\nif (typeof globalThis.currentTaskId === 'undefined') {\n globalThis.currentTaskId = null;\n globalThis.currentCounter = 0;\n}\n\nreturn items.map(item => {\n const taskId = item.json.task_id;\n \n // Check if this is a new task ID\n if (globalThis.currentTaskId !== taskId) {\n // New task ID detected - reset counter\n globalThis.currentTaskId = taskId;\n globalThis.currentCounter = 1;\n } else {\n // Same task ID - increment counter\n globalThis.currentCounter++;\n }\n \n return {\n json: {\n ...item.json,\n attempt_count: globalThis.currentCounter\n }\n };\n});"
},
"typeVersion": 2
}
],
"pinData": {},
"connections": {
"If": {
"main": [
[
{
"node": "Remove redundant data from the scraping",
"type": "main",
"index": 0
}
],
[
{
"node": "Edit Fields",
"type": "main",
"index": 0
}
]
]
},
"If1": {
"main": [
[
{
"node": "Update a row in scrape_queue Table2",
"type": "main",
"index": 0
}
],
[
{
"node": "Wait",
"type": "main",
"index": 0
}
]
]
},
"If2": {
"main": [
[
{
"node": "Crawl4ai Web Page Scrape",
"type": "main",
"index": 0
}
],
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"XML": {
"main": [
[
{
"node": "Split Out",
"type": "main",
"index": 0
}
]
]
},
"Wait": {
"main": [
[
{
"node": "Crawl4AI_Task Status",
"type": "main",
"index": 0
}
]
]
},
"Wait1": {
"main": [
[
{
"node": "Crawl4ai Web Page Scrape",
"type": "main",
"index": 0
}
]
]
},
"Split Out": {
"main": [
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Split Out1": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Edit Fields": {
"main": [
[
{
"node": "Task_id Counter",
"type": "main",
"index": 0
}
]
]
},
"HTTP Request": {
"main": [
[
{
"node": "XML",
"type": "main",
"index": 0
}
]
]
},
"Format the URL": {
"main": [
[
{
"node": "Check if the URL is in the Supabase Table",
"type": "main",
"index": 0
}
]
]
},
"Loop Over Items": {
"main": [
[],
[
{
"node": "Get a row - scrape_queue Table",
"type": "main",
"index": 0
}
]
]
},
"Task_id Counter": {
"main": [
[
{
"node": "If1",
"type": "main",
"index": 0
}
]
]
},
"Loop Over Items1": {
"main": [
[
{
"node": "Split Out1",
"type": "main",
"index": 0
}
],
[
{
"node": "Format the URL",
"type": "main",
"index": 0
}
]
]
},
"URL in a new row": {
"main": [
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Embeddings OpenAI": {
"ai_embedding": [
[
{
"node": "Supabase Vector Store_documents",
"type": "ai_embedding",
"index": 0
}
]
]
},
"Default Data Loader": {
"ai_document": [
[
{
"node": "Supabase Vector Store_documents",
"type": "ai_document",
"index": 0
}
]
]
},
"Quality Filter Node": {
"main": [
[
{
"node": "Content Type Detection",
"type": "main",
"index": 0
}
]
]
},
"Crawl4AI_Task Status": {
"main": [
[
{
"node": "If",
"type": "main",
"index": 0
}
],
[
{
"node": "Update a row in scrape_queue Table1",
"type": "main",
"index": 0
}
]
]
},
"Content Type Detection": {
"main": [
[
{
"node": "Better Metadata Extraction",
"type": "main",
"index": 0
}
]
]
},
"Character Text Splitter": {
"ai_textSplitter": [
[
{
"node": "Default Data Loader",
"type": "ai_textSplitter",
"index": 0
}
]
]
},
"Crawl4ai Web Page Scrape": {
"main": [
[
{
"node": "Wait",
"type": "main",
"index": 0
}
],
[
{
"node": "Wait1",
"type": "main",
"index": 0
}
]
]
},
"If \"shouldInsert\" is true": {
"main": [
[
{
"node": "URL in a new row",
"type": "main",
"index": 0
}
],
[
{
"node": "Loop Over Items1",
"type": "main",
"index": 0
}
]
]
},
"Better Metadata Extraction": {
"main": [
[
{
"node": "Supabase Vector Store_documents",
"type": "main",
"index": 0
}
]
]
},
"Get a row - scrape_queue Table": {
"main": [
[
{
"node": "If2",
"type": "main",
"index": 0
}
]
]
},
"Supabase Vector Store_documents": {
"main": [
[
{
"node": "Update a row in scrape_queue Table",
"type": "main",
"index": 0
}
]
]
},
"When clicking ‘Test workflow’": {
"main": [
[
{
"node": "HTTP Request",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table1": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Update a row in scrape_queue Table2": {
"main": [
[
{
"node": "Loop Over Items",
"type": "main",
"index": 0
}
]
]
},
"Remove redundant data from the scraping": {
"main": [
[
{
"node": "Quality Filter Node",
"type": "main",
"index": 0
}
]
]
},
"Format the Output from the Supabase node": {
"main": [
[
{
"node": "If \"shouldInsert\" is true",
"type": "main",
"index": 0
}
]
]
},
"Check if the URL is in the Supabase Table": {
"main": [
[
{
"node": "Format the Output from the Supabase node",
"type": "main",
"index": 0
}
]
]
}
}
}

常见问题
如何使用这个工作流?
复制上方的 JSON 配置代码,在您的 n8n 实例中创建新工作流并选择「从 JSON 导入」,粘贴配置后根据需要修改凭证设置即可。
这个工作流适合什么场景?
这是一个高级难度的工作流,适用于 Content Creation、Multimodal AI 等场景。适合高级用户,是一个包含 40 个节点(16 种节点类型)的复杂工作流。
需要付费吗?
本工作流完全免费,您可以直接导入使用。但请注意,工作流中使用的第三方服务(如 OpenAI API)可能需要您自行付费。
相关工作流推荐
宠物店 4
🐶 宠物店预约 AI 代理
If
Set
Code
+41
187 节点Bruno Dias
AI
交付汉堡店MVP
🤖 餐厅与配送自动化的 AI 驱动 WhatsApp 助手
If
Set
Code
+37
152 节点Bruno Dias
上下文混合RAG AI文案
Google Drive到Supabase上下文向量数据库同步用于RAG应用
If
Set
Code
+25
76 节点Michael Taleb
AI RAG
内容生成器 v3
AI驱动博客自动化:使用GPT-4生成并发布SEO文章至WordPress和Twitter
If
Set
Code
+25
144 节点Jay Emp0
Content Creation
我的智能体竞技场社区竞赛
使用Qdrant、Mistral OCR和GPT-4构建基于RAG的问答系统
Set
Code
Wait
+19
41 节点Davide
Content Creation
创建自更新的RAG聊天机器人(Google Drive、Gemini和Supabase)
使用Google Drive、Gemini和Supabase创建自更新的RAG聊天机器人
Set
Code
Merge
+19
45 节点Anirudh Aeran
Content Creation
工作流信息
难度等级
高级
节点数量:40
分类:2
节点类型:16
作者
Mariela Slavenova
@marielabg🚀 Fractional Head of AI Ops | COO | CTO | I diagnose, fix & ship automations that pay for themselves | The Harden Method™ - Discover→Design→Build→Break→Harden→Launch→Monitor | Founder @ MarinextAI
外部链接
在 n8n.io 上查看 →
分享此工作流