LLM Researcher

extractor.ts • 5.31 kB

import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import createDOMPurify from 'dompurify';
import TurndownService from 'turndown';
import { config } from './config.js';
import type { ExtractedContent } from './types.js';

/**
 * Loads a web page in headless Chromium, isolates its main article with
 * Readability, sanitizes the resulting HTML and converts it to Markdown.
 *
 * A single browser instance is launched lazily on the first `extract()` call
 * and reused afterwards; call `close()` to release it.
 */
export class ContentExtractor {
  /** Glob of static assets that are aborted instead of downloaded. */
  private static readonly BLOCKED_RESOURCES =
    '**/*.{png,jpg,jpeg,gif,webp,svg,css,woff,woff2,ttf,eot}';

  /** Tags that are rendered as Markdown formatting. */
  private static readonly FORMATTING_TAGS: readonly string[] = [
    'h1',
    'h2',
    'h3',
    'strong',
    'em',
    'a',
  ];

  /**
   * Tags that carry no formatting of their own but separate text. They are
   * left to Turndown's built-in handling so that paragraph and line breaks
   * survive the conversion instead of gluing adjacent text together.
   */
  private static readonly STRUCTURAL_TAGS: readonly string[] = ['p', 'div', 'br'];

  private browser: Browser | null = null;
  private turndown!: TurndownService;

  constructor() {
    this.setupTurndown();
  }

  /** Returns a printable message for a value caught in a `catch` clause. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Creates the Turndown instance and registers the custom conversion rules. */
  private setupTurndown(): void {
    this.turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
    });

    // Flatten every element that is neither a formatting tag nor a
    // structural separator down to its text content.
    this.turndown.addRule('allowedTags', {
      filter: (node: Node): boolean => {
        const tag = node.nodeName.toLowerCase();
        return (
          !ContentExtractor.FORMATTING_TAGS.includes(tag) &&
          !ContentExtractor.STRUCTURAL_TAGS.includes(tag)
        );
      },
      replacement: (content: string): string => content,
    });

    // Emit inline Markdown links; anchors without a usable target are
    // reduced to their text.
    this.turndown.addRule('links', {
      filter: 'a',
      replacement: (content: string, node: Node): string => {
        const href = (node as Element).getAttribute('href')?.trim();
        if (
          !href ||
          href.startsWith('#') ||
          // Scheme comparison is case-insensitive ("JavaScript:" is equivalent).
          href.toLowerCase().startsWith('javascript:')
        ) {
          return content;
        }
        return `[${content}](${href})`;
      },
    });
  }

  /**
   * Launches the shared browser on first use.
   *
   * @returns The running browser instance.
   */
  private async initBrowser(): Promise<Browser> {
    if (!this.browser) {
      config.log('Launching browser...');
      this.browser = await chromium.launch({
        headless: true,
        args: ['--no-sandbox', '--disable-dev-shm-usage'],
      });
    }
    return this.browser;
  }

  /**
   * Navigates `page` to `url`, waiting for the network to go idle. If that
   * attempt fails, retries once waiting only for `domcontentloaded`, with the
   * timeout capped at 15 seconds.
   */
  private async navigate(page: Page, url: string): Promise<void> {
    try {
      await page.goto(url, { waitUntil: 'networkidle', timeout: config.timeout });
    } catch (error) {
      // Log the real cause: the first attempt may fail for reasons other than a timeout.
      config.log(
        `networkidle navigation failed (${ContentExtractor.describeError(error)}), ` +
          'falling back to domcontentloaded...'
      );
      await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: Math.min(config.timeout, 15000),
      });
    }
  }

  /**
   * Fetches `url` in a fresh browser context and returns its main content as
   * Markdown.
   *
   * @param url - Address of the page to extract.
   * @returns The extracted title, source URL, Markdown content and timestamp.
   * @throws Error prefixed with "Failed to extract content from <url>" when
   *   page setup, navigation or content processing fails.
   */
  async extract(url: string): Promise<ExtractedContent> {
    config.log(`Extracting content from: ${url}`);

    const browser = await this.initBrowser();
    const context: BrowserContext = await browser.newContext({
      userAgent: config.userAgent,
    });

    try {
      const page: Page = await context.newPage();

      // Skip images, stylesheets and fonts: they are not needed for text extraction.
      await page.route(ContentExtractor.BLOCKED_RESOURCES, async (route) => {
        config.log(`Blocking resource: ${route.request().url()}`);
        try {
          await route.abort();
        } catch (abortError) {
          // Aborting can fail if the page is already gone; never let that
          // surface as an unhandled rejection.
          config.log('Failed to abort request:', ContentExtractor.describeError(abortError));
        }
      });

      config.log('Navigating to page...');
      // Time the navigation only, so browser start-up does not inflate the figure.
      const navigationStart = Date.now();
      await this.navigate(page, url);
      config.log(`Page loaded in ${Date.now() - navigationStart}ms`);

      const html = await page.content();
      const title = await page.title();

      return this.processContent(html, title, url);
    } catch (error) {
      throw new Error(
        `Failed to extract content from ${url}: ${ContentExtractor.describeError(error)}`
      );
    } finally {
      // Always release the context; a failure here must not mask the
      // extraction result or the original error.
      try {
        await context.close();
      } catch (closeError) {
        config.log(
          'Failed to close browser context:',
          ContentExtractor.describeError(closeError)
        );
      }
    }
  }

  /**
   * Turns raw page HTML into sanitized Markdown.
   *
   * Readability is tried first. If it throws or yields no content, the inner
   * HTML of `<main>` (or, failing that, `<body>`) is used instead.
   *
   * @param html - Serialized page HTML.
   * @param title - Page title, used when Readability provides none.
   * @param url - Page URL, given to JSDOM as the document URL.
   * @throws Error when neither Readability nor the fallback finds any content.
   */
  private processContent(html: string, title: string, url: string): ExtractedContent {
    config.log('Processing content with Readability...');

    const dom = new JSDOM(html, { url });
    // The JSDOM window type does not line up with DOMPurify's expected window
    // type, hence the cast.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const DOMPurify = createDOMPurify(dom.window as any);

    let article: { content: string; title: string } | null = null;
    try {
      const reader = new Readability(dom.window.document, { debug: config.verbose });
      article = reader.parse();
    } catch (error) {
      config.log('Readability failed:', ContentExtractor.describeError(error));
    }

    let content = '';
    let extractedTitle = title;

    if (article && article.content) {
      config.log('Readability extraction successful');
      content = article.content;
      // `||` on purpose: an empty Readability title also falls back to the page title.
      extractedTitle = article.title || title;
    } else {
      config.log('Readability failed, using fallback extraction');

      // Readability.parse() modifies the document it is given, so the
      // fallback reads from a freshly parsed copy of the original HTML.
      const pristine = new JSDOM(html, { url }).window.document;
      const container = pristine.querySelector('main') ?? pristine.querySelector('body');
      if (!container) {
        throw new Error('No content could be extracted');
      }
      content = container.innerHTML;
    }

    // Keep the formatting tags (h1-h3, strong, em, a) plus the structural
    // p/div/span/br wrappers. Other tags are dropped and, with KEEP_CONTENT,
    // their content is kept.
    const cleanContent = DOMPurify.sanitize(content, {
      ALLOWED_TAGS: ['h1', 'h2', 'h3', 'strong', 'em', 'a', 'p', 'div', 'span', 'br'],
      ALLOWED_ATTR: ['href'],
      KEEP_CONTENT: true,
    });

    const markdown = this.turndown.turndown(cleanContent);

    return {
      title: extractedTitle,
      url,
      content: markdown.trim(),
      extractedAt: new Date().toISOString(),
    };
  }

  /** Shuts down the shared browser, if one was launched. */
  async close(): Promise<void> {
    const browser = this.browser;
    if (!browser) {
      return;
    }

    // Drop the reference first so a failed close cannot leave a dead browser
    // cached for the next extract() call.
    this.browser = null;
    config.log('Closing browser...');
    await browser.close();
  }
}

Latest Blog Posts

The 50MB Markdown Files That Broke Our Server
By punkpeye on December 3, 2025.
react
react-router
node-js
OpenTelemetry for Model Context Protocol (MCP) Analytics and Agent Observability
By Om-Shree-0709 on November 29, 2025.
observability
mcp
opentelemetry
Securing Enterprise AI Agents with Unique Identities in the Model Context Protocol (MCP)
By Om-Shree-0709 on November 27, 2025.

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Code-Hex/light-research-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.