Skip to main content
Glama
test-pdf-parsing.jsโ€ข6.67 kB
#!/usr/bin/env node /** * Test script for PDF parsing functionality using @opendocsg/pdf2md (v3) * Verifies parsing of sample PDFs and URL */ import { parsePdfToMarkdown } from '../dist/tools/pdf/index.js'; import path from 'path'; import { fileURLToPath } from 'url'; import fs from 'fs/promises'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); const SAMPLES_DIR = path.join(__dirname, 'samples'); const SAMPLES = [ '01_sample_simple.pdf', '02_sample_invoice.pdf', '03_sample_compex.pdf', // 'gpc-genai-ocsummaryv2-content.pdf', // '2025-Wharton-GBK-AI-Adoption-Report_Full-Report.pdf' // 'statement.pdf' ]; const URL_SAMPLE = 'https://pdfobject.com/pdf/sample.pdf'; async function testSample(name, source) { console.log(`\n================================================================================`); console.log(`Processing: ${name}`); console.log(`Source: ${source}`); console.log(`--------------------------------------------------------------------------------`); try { const startTime = Date.now(); // Content (Markdown) console.log('\n๐Ÿ“ CONTENT PREVIEW (Markdown):'); const result = await parsePdfToMarkdown(source); // Verify new structure if (result.pages) { console.log(`\n๐Ÿ“„ Pages Found: ${result.pages.length}`); result.pages.forEach((p, i) => { console.log(` Page ${p.pageNumber}: ${p.text.length} chars, ${p.images.length} images`); }); } const markdown = result.pages.map(p => p.text).join(''); const images = result.pages.flatMap(p => p.images); const processingTime = Date.now() - startTime; console.log('\n--- Full Text Preview ---'); console.log(markdown.substring(0, 200) + '...'); // Save extracted images to disk if (images && images.length > 0) { console.log(`\n๐Ÿ–ผ๏ธ EXTRACTED IMAGES (${images.length}):`); // Create images directory for this PDF const imagesDir = path.join(SAMPLES_DIR, `${name}_images`); await fs.mkdir(imagesDir, { recursive: true }); for (let i = 0; i < images.length; i++) { const img = images[i]; // Determine file extension from MIME type const ext = img.mimeType.split('/')[1] || 'png'; const filename = `page_${img.page}_img_${i + 1}.${ext}`; const filepath = path.join(imagesDir, filename); // Decode base64 and save to file const buffer = Buffer.from(img.data, 'base64'); await fs.writeFile(filepath, buffer); console.log(` - Saved: ${filename} (${img.width}x${img.height}, ${img.mimeType})`); } console.log(`\n Images saved to: ${imagesDir}`); } // save to markdown file const markdownPath = path.join(SAMPLES_DIR, `${name}.md`); await fs.writeFile(markdownPath, markdown); const preview = markdown.substring(0, 500).replace(/\n/g, '\n '); console.log(` ${preview}...`); console.log(`\n [Total Length: ${markdown.length} chars]`); console.log(` Processing Time: ${processingTime}ms`); } catch (error) { console.error(`โŒ Error processing ${name}:`, error); } } async function testPageFiltering() { console.log(`\n================================================================================`); console.log(`๐Ÿงช Testing Page Filtering`); console.log(`--------------------------------------------------------------------------------`); const sampleName = '03_sample_compex.pdf'; const samplePath = path.join(SAMPLES_DIR, sampleName); try { // Case 1: Specific Pages (Array) console.log('\nCase 1: Specific Pages [1]'); const result1 = await parsePdfToMarkdown(samplePath, [1]); console.log(`Pages returned: ${result1.pages.length}`); console.log(`Page numbers: ${result1.pages.map(p => p.pageNumber).join(', ')}`); // Case 2: Page Range (Positive Offset) - First Page console.log('\nCase 2: Page Range { offset: 0, length: 1 } (First Page)'); const result2 = await parsePdfToMarkdown(samplePath, { offset: 0, length: 1 }); console.log(`Pages returned: ${result2.pages.length}`); console.log(`Page numbers: ${result2.pages.map(p => p.pageNumber).join(', ')}`); // Case 3: Page Range (Negative Offset) - Last Page console.log('\nCase 3: Page Range { offset: -2, length: 2 } (Last Page)'); const result3 = await parsePdfToMarkdown(samplePath, { offset: -2, length: 2 }); console.log(`Pages returned: ${result3.pages.length}`); console.log(`Page numbers: ${result3.pages.map(p => p.pageNumber).join(', ')}`); // Case 4: Page Range (Large Length) - All Pages from start console.log('\nCase 4: Page Range { offset: 0, length: 100 } (All Pages)'); const result4 = await parsePdfToMarkdown(samplePath, { offset: 0, length: 100 }); console.log(`Pages returned: ${result4.pages.length}`); console.log(`Page numbers: ${result4.pages.map(p => p.pageNumber).join(', ')}`); // Case 5: Page Range (Large Negative Offset) - All Pages from end console.log('\nCase 5: Page Range { offset: -100, length: 100 } (All Pages from end)'); const result5 = await parsePdfToMarkdown(samplePath, { offset: -100, length: 100 }); console.log(`Pages returned: ${result5.pages.length}`); console.log(`Page numbers: ${result5.pages.map(p => p.pageNumber).join(', ')}`); // Case 6: Page Range (Large Length) - All Pages console.log('\nCase 6: Page Range [1,5,14] (All Pages)'); const result6 = await parsePdfToMarkdown(samplePath, [1, 5, 14]); console.log(`Pages returned: ${result6.pages.length}`); console.log(`Page numbers: ${result6.pages.map(p => p.pageNumber).join(', ')}`); } catch (error) { console.error('โŒ Error in page filtering test:', error); } } async function main() { console.log('๐Ÿงช PDF v3 Sample Test Suite (@opendocsg/pdf2md)'); // Test Page Filtering await testPageFiltering(); // Test Local Samples for (const sample of SAMPLES) { const samplePath = path.join(SAMPLES_DIR, sample); await testSample(sample, samplePath); } // Test URL await testSample('URL Sample', URL_SAMPLE); } if (import.meta.url === `file://${process.argv[1]}`) { main().catch(console.error); }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wonderwhy-er/DesktopCommanderMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server