"""Documentation indexing and search for PrestaShop MCP server - Version 2.
This version supports indexing ALL documentation types from the prestashop-docs folder.
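
Typical usage (illustrative sketch; the relative import mirrors this module's
own imports, adjust it to the actual package layout):

    from .ingest_v2 import index_documentation_v2

    total = index_documentation_v2(force=True)

The module can also be run as a script; see the ``__main__`` block at the bottom.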
"""
import json
import logging
import sqlite3
from pathlib import Path
from typing import Dict
from .config import CATEGORIES, DB_PATH, DOCS_PATH, INDEXING_CONFIG, DOC_TYPES
from .parsers.base_parser import ParserRegistry
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def create_database_v2():
"""Create SQLite database with enhanced schema for all doc types."""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# Main documentation table with new columns
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS prestashop_docs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
title TEXT NOT NULL,
category TEXT NOT NULL,
subcategory TEXT,
doc_type TEXT NOT NULL,
path TEXT NOT NULL,
origin TEXT,
location TEXT,
content TEXT NOT NULL,
metadata TEXT,
version TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# FTS5 full-text search table with doc_type
cursor.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS prestashop_docs_fts USING fts5(
name, title, category, subcategory, doc_type, origin, location, content,
content='prestashop_docs',
content_rowid='id',
tokenize='porter unicode61'
)
"""
)
# Specialized hooks table (backward compatibility)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS hooks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT UNIQUE NOT NULL,
type TEXT NOT NULL,
origin TEXT NOT NULL,
locations TEXT NOT NULL,
description TEXT,
aliases TEXT,
github_refs TEXT,
code_examples TEXT,
doc_id INTEGER,
FOREIGN KEY(doc_id) REFERENCES prestashop_docs(id)
)
"""
)
# New: Domain commands/queries table
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS domain_references (
id INTEGER PRIMARY KEY AUTOINCREMENT,
entity TEXT NOT NULL,
name TEXT NOT NULL,
type TEXT NOT NULL,
description TEXT,
parameters TEXT,
code_examples TEXT,
doc_id INTEGER,
FOREIGN KEY(doc_id) REFERENCES prestashop_docs(id)
)
"""
)
# New: Components table (forms, grids, etc.)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS components (
id INTEGER PRIMARY KEY AUTOINCREMENT,
component_type TEXT NOT NULL,
name TEXT NOT NULL,
category TEXT,
description TEXT,
options TEXT,
code_examples TEXT,
doc_id INTEGER,
FOREIGN KEY(doc_id) REFERENCES prestashop_docs(id)
)
"""
)
# New: API resources table
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS api_resources (
id INTEGER PRIMARY KEY AUTOINCREMENT,
api_type TEXT NOT NULL,
resource_name TEXT NOT NULL,
methods TEXT,
schema TEXT,
authentication TEXT,
doc_id INTEGER,
FOREIGN KEY(doc_id) REFERENCES prestashop_docs(id)
)
"""
)
# Create indexes
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_category ON prestashop_docs(category)"
)
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_subcategory ON prestashop_docs(subcategory)"
)
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_doc_type ON prestashop_docs(doc_type)"
)
cursor.execute("CREATE INDEX IF NOT EXISTS idx_origin ON prestashop_docs(origin)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_version ON prestashop_docs(version)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_hook_type ON hooks(type)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_hook_origin ON hooks(origin)")
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_domain_entity ON domain_references(entity)"
)
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_component_type ON components(component_type)"
)
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_api_resource ON api_resources(resource_name)"
)
conn.commit()
conn.close()
logger.info(f"Database created at {DB_PATH}")
def migrate_database():
"""Migrate from old schema to new schema if needed."""
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
try:
# Check if doc_type column exists
cursor.execute("PRAGMA table_info(prestashop_docs)")
columns = [row[1] for row in cursor.fetchall()]
if "doc_type" not in columns:
logger.info("Migrating database schema...")
# Add missing columns
cursor.execute("ALTER TABLE prestashop_docs ADD COLUMN doc_type TEXT")
cursor.execute("ALTER TABLE prestashop_docs ADD COLUMN version TEXT")
cursor.execute(
"ALTER TABLE prestashop_docs ADD COLUMN updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP"
)
# Update existing records
cursor.execute("UPDATE prestashop_docs SET doc_type = 'hook'")
# Recreate FTS5 table with new schema
cursor.execute("DROP TABLE IF EXISTS prestashop_docs_fts")
cursor.execute(
"""
CREATE VIRTUAL TABLE prestashop_docs_fts USING fts5(
name, title, category, subcategory, doc_type, origin, location, content,
content='prestashop_docs',
content_rowid='id',
tokenize='porter unicode61'
)
"""
)
# Repopulate FTS5
cursor.execute(
"""
INSERT INTO prestashop_docs_fts (rowid, name, title, category, subcategory, doc_type, origin, location, content)
SELECT id, name, title, category, subcategory, doc_type, origin, location, content
FROM prestashop_docs
"""
)
conn.commit()
logger.info("Database migration completed")
except sqlite3.Error as e:
logger.error(f"Migration error: {e}")
conn.rollback()
finally:
conn.close()
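
# Note: an equivalent one-statement alternative to the manual repopulation above
# is FTS5's 'rebuild' command, which re-reads everything from the content table:
#
#     INSERT INTO prestashop_docs_fts (prestashop_docs_fts) VALUES ('rebuild')
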
def clear_database_v2():
    """Clear all data from database tables."""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    # prestashop_docs_fts is an external-content FTS5 table: a plain DELETE needs
    # the content rows to still be present to remove index entries correctly, so
    # wipe the index with the dedicated 'delete-all' command instead.
    try:
        cursor.execute(
            "INSERT INTO prestashop_docs_fts (prestashop_docs_fts) VALUES ('delete-all')"
        )
    except sqlite3.Error:
        pass  # Table might not exist yet
    # Clear the regular tables
    tables = [
        "api_resources",
        "components",
        "domain_references",
        "hooks",
        "prestashop_docs",
    ]
    for table in tables:
        try:
            cursor.execute(f"DELETE FROM {table}")
        except sqlite3.Error:
            pass  # Table might not exist yet
    conn.commit()
    conn.close()
    logger.info("Database cleared")
def should_skip_file(file_path: Path) -> bool:
"""Check if file should be skipped.
Args:
file_path: Path to check
Returns:
True if file should be skipped
"""
path_str = str(file_path)
# Check skip patterns from config
for pattern in INDEXING_CONFIG["skip_patterns"]:
# Convert glob pattern to simple check
pattern_simple = pattern.replace("**/", "").replace("/**", "")
if pattern_simple in path_str:
return True
# Skip non-markdown
if file_path.suffix != ".md":
return True
# Skip _index.md files (they're usually just navigation)
if file_path.name == "_index.md":
return True
return False
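
# Illustrative only: the glob-to-substring simplification above turns a pattern
# such as "**/node_modules/**" (a hypothetical INDEXING_CONFIG["skip_patterns"]
# entry, not necessarily a real one) into the plain substring "node_modules",
# so any path containing it is skipped. Patterns that rely on real glob
# semantics (e.g. "*.draft.md") will only ever match as literal substrings.
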
def index_all_documentation(
parser_registry: ParserRegistry, force: bool = False
) -> Dict[str, int]:
"""Index all PrestaShop documentation from all categories.
Args:
parser_registry: Parser registry with registered parsers
        force: Force re-indexing (currently unused inside this function; the
            caller clears the database before re-indexing)
Returns:
Dictionary with counts per category
"""
if not DOCS_PATH.exists():
logger.error(f"Documentation path not found: {DOCS_PATH}")
return {}
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
category_counts = {}
total_indexed = 0
total_errors = 0
for category_name, category_path in CATEGORIES.items():
full_path = DOCS_PATH / category_path
if not full_path.exists():
logger.warning(f"Category path not found: {full_path}")
continue
logger.info(f"Indexing category: {category_name}")
md_files = list(full_path.rglob("*.md"))
logger.info(f"Found {len(md_files)} markdown files in {category_name}")
indexed_count = 0
for i, md_file in enumerate(md_files, 1):
# Skip certain files
if should_skip_file(md_file):
continue
# Parse file
try:
parsed_data = parser_registry.parse_file(md_file)
if not parsed_data:
continue
# Insert into main docs table
cursor.execute(
"""
INSERT INTO prestashop_docs (
name, title, category, subcategory, doc_type, path,
origin, location, content, metadata, version
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
parsed_data.get("name", ""),
parsed_data.get("title", ""),
parsed_data.get("category", category_name),
parsed_data.get("subcategory", ""),
parsed_data.get("doc_type", DOC_TYPES["general"]),
parsed_data.get("path", ""),
parsed_data.get("origin", ""),
parsed_data.get("location", ""),
parsed_data.get("content", ""),
json.dumps(parsed_data.get("metadata", {})),
parsed_data.get("version", ""),
),
)
doc_id = cursor.lastrowid
# Insert into FTS5
cursor.execute(
"""
INSERT INTO prestashop_docs_fts (
rowid, name, title, category, subcategory, doc_type,
origin, location, content
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
(
doc_id,
parsed_data.get("name", ""),
parsed_data.get("title", ""),
parsed_data.get("category", category_name),
parsed_data.get("subcategory", ""),
parsed_data.get("doc_type", DOC_TYPES["general"]),
parsed_data.get("origin", ""),
parsed_data.get("location", ""),
parsed_data.get("content", ""),
),
)
# Insert into specialized tables based on doc_type
doc_type = parsed_data.get("doc_type", "")
metadata = parsed_data.get("metadata", {})
# Populate domain_references table for CQRS references
if doc_type == DOC_TYPES["reference"] and "references" in metadata:
entity = metadata.get("entity", "")
references = metadata.get("references", [])
for ref in references:
try:
cursor.execute(
"""
INSERT INTO domain_references (
entity, name, type, description, parameters,
code_examples, doc_id
) VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
entity,
ref.get("name", ""),
ref.get("type", "command"),
ref.get("description", ""),
json.dumps(ref.get("parameters", [])),
json.dumps([]), # Code examples per reference not extracted yet
doc_id,
),
)
except Exception as e:
logger.error(f"Error inserting domain reference {ref.get('name', '')}: {e}")
# Populate components table for form types, grid columns, etc.
if doc_type == DOC_TYPES["component"]:
component_type = metadata.get("component_type", "component")
options = metadata.get("options", [])
try:
cursor.execute(
"""
INSERT INTO components (
component_type, name, category, description,
options, code_examples, doc_id
) VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
component_type,
parsed_data.get("name", ""),
parsed_data.get("category", ""),
metadata.get("description", ""),
json.dumps(options),
json.dumps(metadata.get("code_examples", [])),
doc_id,
),
)
except Exception as e:
logger.error(f"Error inserting component {parsed_data.get('name', '')}: {e}")
indexed_count += 1
total_indexed += 1
# Progress logging
if i % 50 == 0:
logger.info(f" Progress: {i}/{len(md_files)} files...")
conn.commit() # Commit in batches
except Exception as e:
logger.error(f"Error indexing {md_file}: {e}")
total_errors += 1
continue
category_counts[category_name] = indexed_count
logger.info(f"Indexed {indexed_count} files from {category_name}")
# Commit after each category
conn.commit()
conn.close()
logger.info(f"Total indexed: {total_indexed} files")
logger.info(f"Total errors: {total_errors} files")
return category_counts
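
# For reference, the INSERT statements above read these keys from each parser's
# output, so a parsed_data dict is assumed to look roughly like the sketch below
# (field values are purely illustrative, derived from the reads above rather
# than from the parser implementations):
#
#     {
#         "name": "displayHeader",
#         "title": "displayHeader hook",
#         "category": "themes",
#         "subcategory": "hooks",
#         "doc_type": "hook",
#         "path": "themes/reference/hooks/displayHeader.md",
#         "origin": "core",
#         "location": "front office",
#         "content": "...full markdown body...",
#         "metadata": {"references": [], "options": [], "code_examples": []},
#         "version": "",
#     }
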
def index_documentation_v2(force: bool = False) -> int:
"""Index all PrestaShop documentation - new version.
Args:
force: Force re-indexing
Returns:
Total number of documents indexed
"""
# Create/migrate database
create_database_v2()
migrate_database()
# Clear if force re-index
if force:
logger.info("Clearing existing documentation...")
clear_database_v2()
# Check if already indexed
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM prestashop_docs")
count = cursor.fetchone()[0]
conn.close()
if count > 0 and not force:
logger.info(f"Documentation already indexed ({count} documents)")
return count
# Create parser registry and register parsers
# Order matters: more specific parsers first, general parser last
from .parsers.hooks import HookParser
from .parsers.domain_reference_parser import DomainReferenceParser
from .parsers.component_parser import ComponentParser
from .parsers.guide_parser import GuideParser
from .parsers.general_parser import GeneralParser
parser_registry = ParserRegistry()
parser_registry.register(HookParser()) # Most specific
parser_registry.register(DomainReferenceParser()) # CQRS commands/queries
parser_registry.register(ComponentParser()) # Form types, grid columns
parser_registry.register(GuideParser()) # Installation guides, tutorials
parser_registry.register(GeneralParser()) # Fallback for everything else
# Index all documentation
category_counts = index_all_documentation(parser_registry, force=force)
# Calculate total
total = sum(category_counts.values())
logger.info(f"\n=== Indexing Complete ===")
logger.info(f"Total documents indexed: {total}")
for category, count in sorted(category_counts.items()):
logger.info(f" {category}: {count}")
return total
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Index PrestaShop documentation (Version 2)"
)
parser.add_argument(
"--force", action="store_true", help="Force re-indexing of all documentation"
)
args = parser.parse_args()
count = index_documentation_v2(force=args.force)
print(f"\nIndexed {count} documents")