import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import createDOMPurify from 'dompurify';
import TurndownService from 'turndown';
import { config } from './config.js';
import type { ExtractedContent } from './types.js';
/**
 * Loads a web page in headless Chromium and converts its main content to Markdown.
 *
 * The browser is launched lazily on the first {@link ContentExtractor.extract} call
 * and reused until {@link ContentExtractor.close} is called. Every extraction runs
 * in its own browser context, which is always closed before `extract` settles.
 */
export class ContentExtractor {
  private browser: Browser | null = null;
  private turndown!: TurndownService;

  constructor() {
    this.setupTurndown();
  }

  /** Returns a printable message for any thrown value, not only `Error` instances. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Builds the Turndown converter and registers the custom rules that restrict
   * the Markdown output to headings (h1-h3), strong, em and links.
   */
  private setupTurndown(): void {
    this.turndown = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced'
    });

    // Unwrap every element that is not explicitly supported, keeping only its text.
    this.turndown.addRule('allowedTags', {
      filter: function (node: Node): boolean {
        // h1-h3, strong and em are converted by Turndown's built-in rules and `a`
        // by the `links` rule below. p, div and br are deliberately left to
        // Turndown's default handling as well: unwrapping them here would glue
        // adjacent paragraphs and lines together ("<p>a</p><p>b</p>" -> "ab").
        const passthroughTags = ['h1', 'h2', 'h3', 'strong', 'em', 'a', 'p', 'div', 'br'];
        return !passthroughTags.includes(node.nodeName.toLowerCase());
      },
      replacement: function (content: string): string {
        // For non-allowed tags, just return the text content
        return content;
      }
    });

    // Emit inline Markdown links, dropping targets that are useless outside the page.
    this.turndown.addRule('links', {
      filter: 'a',
      replacement: function (content: string, node: Node): string {
        const element = node as Element;
        const href = element.getAttribute('href');
        // Skip in-page anchors and script URLs (matched case-insensitively and
        // ignoring leading whitespace), and links without visible text, which
        // would otherwise render as an empty "[](...)".
        if (
          !href ||
          content.trim() === '' ||
          href.startsWith('#') ||
          /^\s*javascript:/i.test(href)
        ) {
          return content;
        }
        return `[${content}](${href})`;
      }
    });
  }

  /**
   * Launches the shared headless browser if it is not running yet.
   *
   * @returns The running browser instance.
   */
  private async initBrowser(): Promise<Browser> {
    if (!this.browser) {
      config.log('Launching browser...');
      this.browser = await chromium.launch({
        headless: true,
        args: ['--no-sandbox', '--disable-dev-shm-usage']
      });
    }
    return this.browser;
  }

  /**
   * Navigates to `url` and converts the page's main content to Markdown.
   *
   * @param url - Address of the page to load.
   * @returns The extracted title, the source URL, the Markdown content and an
   *          ISO timestamp of the extraction.
   * @throws Error prefixed with "Failed to extract content from" when the page
   *         cannot be loaded or no content can be extracted.
   */
  async extract(url: string): Promise<ExtractedContent> {
    const startTime = Date.now();
    config.log(`Extracting content from: ${url}`);
    const browser = await this.initBrowser();

    try {
      const { html, title } = await this.loadPage(browser, url, startTime);
      return this.processContent(html, title, url);
    } catch (error) {
      throw new Error(
        `Failed to extract content from ${url}: ${ContentExtractor.describeError(error)}`
      );
    }
  }

  /**
   * Opens `url` in a fresh browser context and returns the rendered HTML and title.
   * The context is closed on every path, including failures during page setup.
   */
  private async loadPage(
    browser: Browser,
    url: string,
    startTime: number
  ): Promise<{ html: string; title: string }> {
    const context: BrowserContext = await browser.newContext({
      userAgent: config.userAgent
    });

    try {
      const page: Page = await context.newPage();

      // Block unnecessary resources for speed optimization
      await page.route('**/*.{png,jpg,jpeg,gif,webp,svg,css,woff,woff2,ttf,eot}', async route => {
        config.log(`Blocking resource: ${route.request().url()}`);
        try {
          await route.abort();
        } catch (abortError) {
          // The request may already be gone (for example the page was closed);
          // without this catch the rejection would be unhandled.
          config.log('Failed to abort request:', ContentExtractor.describeError(abortError));
        }
      });

      config.log('Navigating to page...');
      // Prefer networkidle; if that navigation fails for any reason, retry once
      // waiting only for domcontentloaded as a fallback strategy for slow sites.
      try {
        await page.goto(url, {
          waitUntil: 'networkidle',
          timeout: config.timeout
        });
      } catch {
        config.log('networkidle timeout, falling back to domcontentloaded...');
        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: Math.min(config.timeout, 15000) // Shorter timeout for fallback
        });
      }

      const responseTime = Date.now() - startTime;
      config.log(`Page loaded in ${responseTime}ms`);

      const html = await page.content();
      const title = await page.title();
      return { html, title };
    } finally {
      // A failure while closing must not replace the error (or result) of the
      // extraction itself, so it is only logged.
      await context.close().catch((closeError: unknown) => {
        config.log('Failed to close browser context:', ContentExtractor.describeError(closeError));
      });
    }
  }

  /**
   * Extracts the main article from `html`, sanitizes it and converts it to Markdown.
   *
   * Readability is tried first; when it throws or yields no content, the inner
   * HTML of `<main>` (or, failing that, `<body>`) is used instead.
   *
   * @param html - Serialized page markup.
   * @param title - Page title, used when Readability does not provide one.
   * @param url - Page URL, passed to JSDOM as the document URL.
   * @throws Error when neither Readability nor the fallback finds any content.
   */
  private processContent(html: string, title: string, url: string): ExtractedContent {
    config.log('Processing content with Readability...');

    const dom = new JSDOM(html, { url });
    try {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- JSDOM's window type does not match DOMPurify's expected window type
      const DOMPurify = createDOMPurify(dom.window as any);

      let article: { content: string; title: string } | null = null;
      try {
        // Readability rewrites the document it parses, so give it a clone and
        // keep the original intact for the fallback extraction below.
        const reader = new Readability(dom.window.document.cloneNode(true) as Document, {
          debug: config.verbose
        });
        article = reader.parse();
      } catch (error) {
        config.log('Readability failed:', ContentExtractor.describeError(error));
      }

      let content = '';
      let extractedTitle = title;

      if (article && article.content) {
        config.log('Readability extraction successful');
        content = article.content;
        extractedTitle = article.title || title;
      } else {
        config.log('Readability failed, using fallback extraction');
        // Fallback to main/body content
        const main = dom.window.document.querySelector('main');
        const body = dom.window.document.querySelector('body');
        if (main) {
          content = main.innerHTML;
        } else if (body) {
          content = body.innerHTML;
        } else {
          throw new Error('No content could be extracted');
        }
      }

      // Sanitize: keep the supported formatting tags plus the structural tags
      // (p, div, span, br); text inside removed elements is kept (KEEP_CONTENT).
      const cleanContent = DOMPurify.sanitize(content, {
        ALLOWED_TAGS: ['h1', 'h2', 'h3', 'strong', 'em', 'a', 'p', 'div', 'span', 'br'],
        ALLOWED_ATTR: ['href'],
        KEEP_CONTENT: true
      });

      // Convert to Markdown
      const markdown = this.turndown.turndown(cleanContent);

      return {
        title: extractedTitle,
        url,
        content: markdown.trim(),
        extractedAt: new Date().toISOString()
      };
    } finally {
      // Release the resources held by the JSDOM window once processing is done.
      dom.window.close();
    }
  }

  /** Closes the shared browser, if one is running. Safe to call repeatedly. */
  async close(): Promise<void> {
    if (this.browser) {
      config.log('Closing browser...');
      await this.browser.close();
      this.browser = null;
    }
  }
}