import { readFileSync, writeFileSync } from "node:fs"; import { readdir, readFile } from "node:fs/promises"; import { join, extname, relative } from "node:path"; const DOCS_DIR = join(import.meta.dirname, "..", "src", "content", "docs"); const OUTPUT_FILE = join(import.meta.dirname, "..", "dist", "llms-full.txt"); const SITE_URL = "https://yasutakeyohei.com"; /** * Recursively collect all .mdx files, excluding files starting with "_" (partials). */ async function collectMdxFiles(dir) { const entries = await readdir(dir, { withFileTypes: true }); const files = []; for (const entry of entries) { if (entry.name.startsWith("_")) continue; const fullPath = join(dir, entry.name); if (entry.isDirectory()) { const subFiles = await collectMdxFiles(fullPath); files.push(...subFiles); } else if (extname(entry.name) === ".mdx") { files.push(fullPath); } } return files; } /** * Derive the page URL from the file path. * Converts Starlight docs convention to URL path. */ function pathToUrl(filePath) { const relPath = relative(DOCS_DIR, filePath).replace(/\\/g, "/"); let urlPath = relPath.replace(/\.mdx$/, ""); // index.mdx → directory URL if (urlPath.endsWith("/index")) { urlPath = urlPath.replace(/\/index$/, "/"); } // Root index → / if (urlPath === "index") { urlPath = ""; } return `${SITE_URL}/${urlPath}`; } /** * Strip frontmatter (content between --- delimiters). * Returns the body content after the second "---" line. */ function stripFrontmatter(content) { // Match frontmatter delimited by --- at the start of the file const match = content.match(/^---\r?\n[\s\S]*?\r?\n---/); if (match) { return content.slice(match[0].length).trimStart(); } return content; } /** * Strip JSX components (self-closing and paired tags with uppercase names). * This is a simple heuristic that removes Astro/MDX components. */ function stripJsxComponents(content) { // Remove import/export statements content = content.replace( /^import\s+[\s\S]*?(?:from\s+['"][^'"]+['"]|['"][^'"]+['"])\s*;?\s*$/gm, "", ); content = content.replace( /^export\s+(?:const|let|var|function|default|async)\s+[\s\S]*?$/gm, "", ); return content; } /** * Extract the title from the frontmatter of an MDX file. */ function extractTitle(content) { const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---/); if (!match) return null; const frontmatter = match[1]; const titleMatch = frontmatter.match(/^title:\s*(.+)$/m); return titleMatch ? titleMatch[1].trim().replace(/['"]/g, "") : null; } async function main() { console.log("Collecting MDX files..."); const files = await collectMdxFiles(DOCS_DIR); // Sort: top-level files first, then by path files.sort((a, b) => { const aRel = relative(DOCS_DIR, a).replace(/\\/g, "/"); const bRel = relative(DOCS_DIR, b).replace(/\\/g, "/"); const aDepth = (aRel.match(/\//g) || []).length; const bDepth = (bRel.match(/\//g) || []).length; if (aDepth !== bDepth) return aDepth - bDepth; return aRel.localeCompare(bRel); }); console.log(`Found ${files.length} MDX files.`); const sections = []; for (const file of files) { const rawContent = readFileSync(file, "utf-8"); const title = extractTitle(rawContent) || pathToUrl(file); const url = pathToUrl(file); let body = stripFrontmatter(rawContent); body = stripJsxComponents(body); // Skip empty or near-empty pages if (body.trim().length < 10) { console.log(` Skipping (too short): ${relative(DOCS_DIR, file)}`); continue; } sections.push(`# ${title}\n> ${url}\n\n${body.trim()}\n\n---\n`); } const fullContent = [ "# 安竹洋平 公式サイト - 全コンテンツ\n", `> このファイルは AI(LLM)による学習・参照用に自動生成されています。\n`, `> 生成元: ${SITE_URL}\n`, `> 更新日: ${new Date().toISOString().split("T")[0]}\n\n`, ...sections, ].join(""); writeFileSync(OUTPUT_FILE, fullContent, "utf-8"); console.log( `Generated: ${OUTPUT_FILE} (${fullContent.length.toLocaleString()} chars)`, ); } main().catch((err) => { console.error("Error:", err); process.exit(1); });