
PocketMCP

ingest-enhanced.test.ts
import { describe, it, expect, beforeEach, afterEach } from '@jest/globals';
import { existsSync, unlinkSync, mkdirSync } from 'fs';
import { resolve } from 'path';
import { DatabaseManager } from '../src/db.js';
import { EmbeddingManager } from '../src/embeddings.js';
import { TextChunker } from '../src/chunker.js';
import { IngestManager } from '../src/ingest.js';
import { FileIngestManager } from '../src/file-ingest.js';
import {
  PdfExtractor,
  DocxExtractor,
  createExtractor,
  getContentType,
  ParseError,
  TooLargeError,
  EncryptedPdfError
} from '../src/extractors.js';

describe('Enhanced Ingestion Pipeline', () => {
  let db: DatabaseManager;
  let embeddings: EmbeddingManager;
  let chunker: TextChunker;
  let ingestManager: IngestManager;
  let fileIngestManager: FileIngestManager;

  const testDbPath = './test-enhanced.db';
  const fixturesDir = './fixtures/ingest';

  beforeEach(async () => {
    // Clean up any existing test database
    if (existsSync(testDbPath)) {
      unlinkSync(testDbPath);
    }

    // Create test database and managers
    db = new DatabaseManager(testDbPath);
    embeddings = new EmbeddingManager('Xenova/all-MiniLM-L6-v2');
    chunker = new TextChunker({ chunkSize: 500, chunkOverlap: 50 });
    ingestManager = new IngestManager(db, embeddings, chunker);
    fileIngestManager = new FileIngestManager(ingestManager, {
      supportedExtensions: ['.md', '.txt', '.pdf', '.docx'],
      pdfMaxPages: 10,
      pdfMinTextChars: 100,
      docMaxBytes: 1000000,
      docxSplitOnHeadings: false
    });

    // Initialize embeddings
    await embeddings.initialize();
  });

  afterEach(() => {
    // Clean up
    db.close();
    if (existsSync(testDbPath)) {
      unlinkSync(testDbPath);
    }
  });

  describe('Database Schema', () => {
    it('should create schema with documents, segments, and chunks tables', () => {
      expect(db.isHealthy()).toBe(true);

      // Test that we can create a document with new fields
      const doc = db.upsertDocument({
        doc_id: 'test_doc_1',
        external_id: 'test_external_1',
        source: 'file',
        uri: 'file:///test.pdf',
        title: 'Test Document',
        content_type: 'application/pdf',
        size_bytes: 1024,
        content_sha256: 'test_hash',
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      expect(doc.content_type).toBe('application/pdf');
      expect(doc.ingest_status).toBe('ok');
    });

    it('should handle segments and chunks', () => {
      // Create a document
      const doc = db.upsertDocument({
        doc_id: 'test_doc_2',
        external_id: 'test_external_2',
        source: 'file',
        uri: 'file:///test.pdf',
        title: 'Test Document',
        content_type: 'application/pdf',
        size_bytes: 1024,
        content_sha256: 'test_hash',
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      // Create segments
      const segments = [
        {
          segment_id: 'test_doc_2_page_1',
          doc_id: 'test_doc_2',
          kind: 'page' as const,
          page: 1,
          text: 'This is page 1 content'
        },
        {
          segment_id: 'test_doc_2_page_2',
          doc_id: 'test_doc_2',
          kind: 'page' as const,
          page: 2,
          text: 'This is page 2 content'
        }
      ];

      db.replaceDocumentSegments('test_doc_2', segments);

      const retrievedSegments = db.getSegmentsByDocId('test_doc_2');
      expect(retrievedSegments).toHaveLength(2);
      expect(retrievedSegments[0].kind).toBe('page');
      expect(retrievedSegments[0].page).toBe(1);
    });
  });

  describe('Content Type Detection', () => {
    it('should correctly detect content types', () => {
      expect(getContentType('test.pdf')).toBe('application/pdf');
      expect(getContentType('test.docx')).toBe('application/vnd.openxmlformats-officedocument.wordprocessingml.document');
      expect(getContentType('test.md')).toBe('text/markdown');
      expect(getContentType('test.txt')).toBe('text/plain');
    });
  });

  describe('Extractor Factory', () => {
    it('should create appropriate extractors', () => {
      const pdfExtractor = createExtractor('test.pdf');
      const docxExtractor = createExtractor('test.docx');
      const noExtractor = createExtractor('test.xyz');

      expect(pdfExtractor).toBeInstanceOf(PdfExtractor);
      expect(docxExtractor).toBeInstanceOf(DocxExtractor);
      expect(noExtractor).toBeNull();
    });
  });

  describe('Segment-based Ingestion', () => {
    it('should handle documents with pre-segmented content', async () => {
      const segments = [
        {
          segment_id: 'doc1_page_1',
          doc_id: 'doc1',
          kind: 'page' as const,
          page: 1,
          text: 'This is the first page of the document. It contains important information about the topic.'
        },
        {
          segment_id: 'doc1_page_2',
          doc_id: 'doc1',
          kind: 'page' as const,
          page: 2,
          text: 'This is the second page with additional details and conclusions.'
        }
      ];

      const result = await ingestManager.ingestSingle({
        segments,
        external_id: 'test_segmented_doc',
        title: 'Test Segmented Document',
        source: 'file',
        uri: 'file:///test.pdf',
        content_type: 'application/pdf',
        size_bytes: 2048,
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      expect(result.status).toBe('inserted');
      expect(result.chunks).toBeGreaterThan(0);

      // Verify segments were stored
      const storedSegments = db.getSegmentsByDocId(result.doc_id);
      expect(storedSegments).toHaveLength(2);
      expect(storedSegments[0].kind).toBe('page');
      expect(storedSegments[0].page).toBe(1);
    });

    it('should handle documents with different ingest statuses', async () => {
      const testCases = [
        { status: 'too_large', notes: 'File exceeds size limit' },
        { status: 'needs_ocr', notes: 'PDF requires OCR processing' },
        { status: 'error', notes: 'Failed to parse document' },
        { status: 'skipped', notes: 'Encrypted file' }
      ];

      for (const testCase of testCases) {
        const result = await ingestManager.ingestSingle({
          text: 'Sample text',
          external_id: `test_${testCase.status}`,
          title: `Test ${testCase.status}`,
          source: 'file',
          uri: 'file:///test.pdf',
          content_type: 'application/pdf',
          size_bytes: 1024,
          mtime: new Date().toISOString(),
          ingest_status: testCase.status as any,
          notes: testCase.notes
        });

        expect(result.status).toBe('inserted');

        const doc = db.getDocument(result.doc_id);
        expect(doc?.ingest_status).toBe(testCase.status);
        expect(doc?.notes).toBe(testCase.notes);
      }
    });
  });

  describe('Search with Source Badges', () => {
    it('should return search results with appropriate source badges', async () => {
      // Create a PDF document with page segments
      const pdfSegments = [
        {
          segment_id: 'pdf_doc_page_1',
          doc_id: 'pdf_doc',
          kind: 'page' as const,
          page: 1,
          text: 'Machine learning algorithms are powerful tools for data analysis.'
        },
        {
          segment_id: 'pdf_doc_page_2',
          doc_id: 'pdf_doc',
          kind: 'page' as const,
          page: 2,
          text: 'Deep learning networks can process complex patterns in data.'
        }
      ];

      await ingestManager.ingestSingle({
        segments: pdfSegments,
        external_id: 'test_pdf',
        title: 'ML Guide.pdf',
        source: 'file',
        uri: 'file:///ml-guide.pdf',
        content_type: 'application/pdf',
        size_bytes: 2048,
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      // Create a DOCX document with section segments
      const docxSegments = [
        {
          segment_id: 'docx_doc_section_1',
          doc_id: 'docx_doc',
          kind: 'section' as const,
          meta: { heading: 'Introduction', level: 1 },
          text: 'This document covers machine learning fundamentals and applications.'
        }
      ];

      await ingestManager.ingestSingle({
        segments: docxSegments,
        external_id: 'test_docx',
        title: 'ML Basics.docx',
        source: 'file',
        uri: 'file:///ml-basics.docx',
        content_type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        size_bytes: 1024,
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      // Search for machine learning content
      const results = await ingestManager.search('machine learning', 5);
      expect(results.length).toBeGreaterThan(0);

      // Check that results have appropriate resource URIs
      for (const result of results) {
        expect(result.resource).toMatch(/^mcp\+doc:\/\/.+#.+$/);
      }
    });
  });

  describe('File Ingestion with New Extensions', () => {
    it('should support .pdf and .docx extensions', () => {
      const supportedExtensions = fileIngestManager.getSupportedExtensions();
      expect(supportedExtensions).toContain('.pdf');
      expect(supportedExtensions).toContain('.docx');
      expect(supportedExtensions).toContain('.md');
      expect(supportedExtensions).toContain('.txt');
    });

    it('should handle extraction errors gracefully', async () => {
      // Test with a non-existent file to simulate extraction errors
      const result = await fileIngestManager.ingestFile('./non-existent.pdf');
      expect(result.status).toBe('skipped');
      expect(result.error).toBeDefined();
    });
  });

  describe('Environment Configuration', () => {
    it('should use environment variables for limits', () => {
      const originalEnv = process.env;

      // Set test environment variables
      process.env.PDF_MAX_PAGES = '50';
      process.env.PDF_MIN_TEXT_CHARS = '200';
      process.env.DOC_MAX_BYTES = '5000000';
      process.env.DOCX_SPLIT_ON_HEADINGS = 'true';

      const testFileManager = new FileIngestManager(ingestManager);

      // The constructor should pick up the environment variables.
      // We can't directly test private options, but we can verify the manager was created.
      expect(testFileManager).toBeDefined();

      // Restore environment
      process.env = originalEnv;
    });
  });

  describe('Chunk Boundary Respect', () => {
    it('should not create chunks that cross segment boundaries', async () => {
      const segments = [
        {
          segment_id: 'doc_seg_1',
          doc_id: 'boundary_test_doc',
          kind: 'section' as const,
          text: 'A'.repeat(300) // Short segment
        },
        {
          segment_id: 'doc_seg_2',
          doc_id: 'boundary_test_doc',
          kind: 'section' as const,
          text: 'B'.repeat(800) // Longer segment that will be chunked
        }
      ];

      const result = await ingestManager.ingestSingle({
        segments,
        external_id: 'boundary_test',
        title: 'Boundary Test',
        source: 'raw',
        uri: 'test://boundary',
        content_type: 'text/plain',
        size_bytes: 1100,
        mtime: new Date().toISOString(),
        ingest_status: 'ok'
      });

      expect(result.status).toBe('inserted');
      expect(result.chunks).toBeGreaterThan(1);

      // Verify that all chunks belong to their respective segments.
      // This is implicitly tested by the database foreign key constraints.
      const storedSegments = db.getSegmentsByDocId(result.doc_id);
      expect(storedSegments).toHaveLength(2);
    });
  });
});

describe('Error Handling', () => {
  describe('PDF Extraction Errors', () => {
    it('should handle TooLargeError', () => {
      const error = new TooLargeError('PDF has too many pages');
      expect(error.name).toBe('TooLargeError');
      expect(error.message).toContain('too many pages');
    });

    it('should handle EncryptedPdfError', () => {
      const error = new EncryptedPdfError();
      expect(error.name).toBe('EncryptedPdfError');
      expect(error.message).toContain('encrypted');
    });

    it('should handle ParseError', () => {
      const originalError = new Error('Invalid PDF');
      const error = new ParseError('Failed to parse', originalError);
      expect(error.name).toBe('ParseError');
      expect(error.originalError).toBe(originalError);
    });
  });
});
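
For orientation, here is a minimal sketch of how the pipeline exercised by this suite might be wired up outside of Jest. The constructors, option names, and method calls mirror those used in the tests above; the database path, document path, and query string are hypothetical placeholders, and the logged fields are assumptions based on what the tests assert.

// Usage sketch, not part of the test suite. Signatures follow the tests
// above; './pocket.db' and './docs/guide.pdf' are placeholder paths.
import { DatabaseManager } from '../src/db.js';
import { EmbeddingManager } from '../src/embeddings.js';
import { TextChunker } from '../src/chunker.js';
import { IngestManager } from '../src/ingest.js';
import { FileIngestManager } from '../src/file-ingest.js';

async function main() {
  // Same wiring as the beforeEach hook in the suite.
  const db = new DatabaseManager('./pocket.db');
  const embeddings = new EmbeddingManager('Xenova/all-MiniLM-L6-v2');
  const chunker = new TextChunker({ chunkSize: 500, chunkOverlap: 50 });
  const ingest = new IngestManager(db, embeddings, chunker);
  const files = new FileIngestManager(ingest, {
    supportedExtensions: ['.md', '.txt', '.pdf', '.docx']
  });

  await embeddings.initialize();

  // Ingest one file; returns a status such as 'inserted' or 'skipped'.
  const result = await files.ingestFile('./docs/guide.pdf');
  console.log(`ingest: ${result.status}`);

  // Query the index; per the tests, each hit carries an mcp+doc:// resource URI.
  const hits = await ingest.search('machine learning', 5);
  for (const hit of hits) {
    console.log(hit.resource);
  }

  db.close();
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});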
