/**
 * scripts/generate-llms-full.mjs
 *
 * Builds `dist/llms-full.txt`: the body text of every MDX page under
 * `src/content/docs`, concatenated into a single plain-text file intended for
 * AI (LLM) consumption. Each page becomes one section headed by its title and
 * public URL.
 */
import { mkdir, readFile, readdir, writeFile } from "node:fs/promises";
import { dirname, extname, join, relative } from "node:path";
import { fileURLToPath } from "node:url";

// `import.meta.dirname` is missing on older Node releases; derive the script
// directory from the module URL there so the path constants still resolve.
const SCRIPT_DIR = import.meta.dirname ?? dirname(fileURLToPath(import.meta.url));

const DOCS_DIR = join(SCRIPT_DIR, "..", "src", "content", "docs");
const OUTPUT_FILE = join(SCRIPT_DIR, "..", "dist", "llms-full.txt");
const SITE_URL = "https://yasutakeyohei.com";

/**
 * Frontmatter block at the very start of a file. Capture group 1 is the text
 * between the opening and closing `---` lines. Shared by `stripFrontmatter`
 * and `extractTitle` so both agree on what counts as frontmatter.
 */
const FRONTMATTER_RE = /^---\r?\n([\s\S]*?)\r?\n---/;

/**
 * Recursively collect the paths of all `.mdx` files below `dir`.
 *
 * Any entry (file or directory) whose name starts with "_" is skipped, which
 * excludes partials and everything nested under an underscore directory.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Paths of the `.mdx` files found, each joined onto `dir`.
 */
async function collectMdxFiles(dir) {
  const entries = await readdir(dir, { withFileTypes: true });
  const files = [];

  for (const entry of entries) {
    if (entry.name.startsWith("_")) {
      continue;
    }
    const fullPath = join(dir, entry.name);
    if (entry.isDirectory()) {
      files.push(...(await collectMdxFiles(fullPath)));
    } else if (extname(entry.name) === ".mdx") {
      files.push(fullPath);
    }
  }

  return files;
}

/**
 * Derive the public page URL for a docs file.
 *
 * The path relative to `DOCS_DIR` (with "/" separators and without the
 * `.mdx` extension) becomes the URL path. `index.mdx` maps to its directory
 * URL (trailing slash) and the root `index.mdx` maps to the site root.
 *
 * @param {string} filePath - Path of an `.mdx` file inside `DOCS_DIR`.
 * @returns {string} Absolute URL under `SITE_URL`.
 */
function pathToUrl(filePath) {
  // Normalise Windows separators so the result is always a URL path.
  const relPath = relative(DOCS_DIR, filePath).replace(/\\/g, "/");
  let urlPath = relPath.replace(/\.mdx$/, "");

  if (urlPath === "index") {
    // Root index -> "/"
    urlPath = "";
  } else if (urlPath.endsWith("/index")) {
    // "section/index" -> "section/" (keep the slash, drop the file name)
    urlPath = urlPath.slice(0, -"index".length);
  }

  return `${SITE_URL}/${urlPath}`;
}

/**
 * Remove the leading frontmatter block (the text between the first pair of
 * `---` delimiters at the start of the file).
 *
 * @param {string} content - Raw file content.
 * @returns {string} The content after the frontmatter with leading whitespace
 *   removed, or `content` unchanged when there is no frontmatter.
 */
function stripFrontmatter(content) {
  const match = FRONTMATTER_RE.exec(content);
  return match ? content.slice(match[0].length).trimStart() : content;
}

/**
 * Remove ESM `import` and `export` statements from an MDX body.
 *
 * Despite the name, JSX/component tags themselves are left in place; only
 * lines beginning with `import ...` or
 * `export const|let|var|function|default|async ...` are removed.
 *
 * NOTE(review): the patterns are applied to the whole text with no awareness
 * of fenced code blocks, so matching lines inside code samples are removed
 * too. An `export` statement is removed only up to the end of its first line.
 *
 * @param {string} content - MDX body (frontmatter already removed).
 * @returns {string} The body without the matched import/export statements.
 */
function stripJsxComponents(content) {
  return content
    .replace(
      /^import\s+[\s\S]*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])\s*;?\s*$/gm,
      "",
    )
    .replace(
      /^export\s+(?:const|let|var|function|default|async)\s+[\s\S]*?$/gm,
      "",
    );
}

/**
 * Remove one pair of matching quotes surrounding a YAML scalar and undo the
 * escapes that can only occur inside such quotes (`''` in single-quoted text,
 * `\"` and `\\` in double-quoted text). Unquoted values are returned as-is so
 * apostrophes and quotation marks inside a title survive.
 *
 * @param {string} value - Trimmed scalar text.
 * @returns {string} The unquoted value.
 */
function unquoteYamlScalar(value) {
  const quote = value[0];
  const isQuoted =
    value.length >= 2 &&
    (quote === '"' || quote === "'") &&
    value.endsWith(quote);
  if (!isQuoted) {
    return value;
  }

  const inner = value.slice(1, -1);
  return quote === "'"
    ? inner.replaceAll("''", "'")
    : inner.replace(/\\(["\\])/g, "$1");
}

/**
 * Extract the `title` value from the frontmatter of an MDX file.
 *
 * @param {string} content - Raw file content.
 * @returns {string | null} The title with surrounding quotes removed, or
 *   `null` when the file has no frontmatter or no `title:` line.
 */
function extractTitle(content) {
  const frontmatter = FRONTMATTER_RE.exec(content)?.[1];
  if (frontmatter === undefined) {
    return null;
  }

  const rawTitle = /^title:\s*(.+)$/m.exec(frontmatter)?.[1];
  return rawTitle === undefined ? null : unquoteYamlScalar(rawTitle.trim());
}

/**
 * Sort comparator for docs file paths: shallower paths (fewer directory
 * levels below `DOCS_DIR`) come first, ties are broken by relative path.
 *
 * @param {string} a - First file path.
 * @param {string} b - Second file path.
 * @returns {number} Negative, zero or positive, as expected by `Array.prototype.sort`.
 */
function compareDocPaths(a, b) {
  const aRel = relative(DOCS_DIR, a).replace(/\\/g, "/");
  const bRel = relative(DOCS_DIR, b).replace(/\\/g, "/");
  const aDepth = aRel.split("/").length;
  const bDepth = bRel.split("/").length;
  return aDepth !== bDepth ? aDepth - bDepth : aRel.localeCompare(bRel);
}

/**
 * Render one page as a section of the output file.
 *
 * @param {string} file - Path of the page's `.mdx` file.
 * @param {string} rawContent - Raw content of that file.
 * @returns {string | null} The section text, or `null` when the page body is
 *   shorter than 10 characters after stripping and should be skipped.
 */
function renderSection(file, rawContent) {
  const url = pathToUrl(file);
  // `||` on purpose: an empty title (e.g. `title: ""`) also falls back to the URL.
  const title = extractTitle(rawContent) || url;
  const body = stripJsxComponents(stripFrontmatter(rawContent)).trim();

  if (body.length < 10) {
    return null;
  }
  return `# ${title}\n> ${url}\n\n${body}\n\n---\n`;
}

/**
 * Entry point: collect the MDX pages, render each into a section and write
 * the combined document to `OUTPUT_FILE`.
 *
 * @returns {Promise<void>} Rejects if reading the docs tree or writing the
 *   output file fails.
 */
async function main() {
  console.log("Collecting MDX files...");
  const files = await collectMdxFiles(DOCS_DIR);

  // Top-level files first, then by path.
  files.sort(compareDocPaths);

  console.log(`Found ${files.length} MDX files.`);

  const sections = [];

  for (const file of files) {
    const section = renderSection(file, await readFile(file, "utf-8"));
    if (section === null) {
      console.log(`  Skipping (too short): ${relative(DOCS_DIR, file)}`);
      continue;
    }
    sections.push(section);
  }

  const fullContent = [
    "# 安竹洋平 公式サイト - 全コンテンツ\n",
    `> このファイルは AI(LLM)による学習・参照用に自動生成されています。\n`,
    `> 生成元: ${SITE_URL}\n`,
    // toISOString() is UTC, so this is the UTC calendar date of the build.
    `> 更新日: ${new Date().toISOString().split("T")[0]}\n\n`,
    ...sections,
  ].join("");

  // Create the output directory if it is missing so the write cannot fail
  // with ENOENT; this is a no-op when the directory already exists.
  await mkdir(dirname(OUTPUT_FILE), { recursive: true });
  await writeFile(OUTPUT_FILE, fullContent, "utf-8");
  console.log(
    `Generated: ${OUTPUT_FILE} (${fullContent.length.toLocaleString()} chars)`,
  );
}

main().catch((err) => {
  console.error("Error:", err);
  process.exit(1);
});