Skip to main content
Glama
download_test_data.py12 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Download test data from zim-testing-suite repository.

This script downloads essential ZIM test files from the official zim-testing-suite
repository for comprehensive testing of OpenZIM MCP functionality.
"""

import argparse
import hashlib
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.request import urlopen, urlretrieve
from urllib.error import URLError

# Ensure UTF-8 encoding for Windows compatibility
if sys.platform == "win32":
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Base URL for zim-testing-suite repository
BASE_URL = "https://raw.githubusercontent.com/openzim/zim-testing-suite/main/data"

# Essential test files to download by category.
# Layout: {category: {path relative to BASE_URL: {"description", "size_mb", "priority"}}}
# "priority" is 1 (highest) to 3 (lowest); "size_mb" is an approximate size used
# only for the summaries printed/logged by this script.
ESSENTIAL_FILES: Dict[str, Dict[str, Dict[str, Any]]] = {
    "basic": {
        "withns/small.zim": {
            "description": "Small ZIM file with namespaces for basic testing",
            "size_mb": 0.08,
            "priority": 1
        },
        "nons/small.zim": {
            "description": "Small ZIM file without namespaces for basic testing",
            "size_mb": 0.04,
            "priority": 1
        }
    },
    "real_content": {
        "withns/wikibooks_be_all_nopic_2017-02.zim": {
            "description": "Real Wikibooks content for integration testing",
            "size_mb": 0.15,
            "priority": 2
        },
        "withns/wikipedia_en_climate_change_mini_2024-06.zim": {
            "description": "Wikipedia climate change mini for comprehensive testing",
            "size_mb": 13.6,
            "priority": 3
        }
    },
    "invalid_files": {
        "withns/invalid.smaller_than_header.zim": {
            "description": "Invalid ZIM file smaller than header",
            "size_mb": 0.00004,
            "priority": 2
        },
        "withns/invalid.bad_mimetype_in_dirent.zim": {
            "description": "Invalid ZIM file with bad MIME type in dirent",
            "size_mb": 0.08,
            "priority": 2
        },
        "withns/invalid.outofbounds_clusterptrpos.zim": {
            "description": "Invalid ZIM file with out-of-bounds cluster pointer",
            "size_mb": 0.08,
            "priority": 2
        }
    },
    "special_cases": {
        "withns/small.zim.embedded": {
            "description": "ZIM file with embedded content",
            "size_mb": 0.08,
            "priority": 3
        },
        "withns/wikibooks_be_all_nopic_2017-02_splitted.zimaa": {
            "description": "Split ZIM file part A",
            "size_mb": 0.05,
            "priority": 3
        },
        "withns/wikibooks_be_all_nopic_2017-02_splitted.zimab": {
            "description": "Split ZIM file part B",
            "size_mb": 0.05,
            "priority": 3
        },
        "withns/wikibooks_be_all_nopic_2017-02_splitted.zimac": {
            "description": "Split ZIM file part C",
            "size_mb": 0.05,
            "priority": 3
        }
    }
}


def get_file_hash(file_path: Path) -> str:
    """Calculate SHA256 hash of a file.

    The file is read in 4096-byte chunks so large files are never loaded
    into memory at once.

    Args:
        file_path: Path of the file to hash

    Returns:
        Hex-encoded SHA256 digest of the file contents
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()


def _log_progress(block_num: int, block_size: int, total_size: int) -> None:
    """Report hook for urlretrieve: log download progress at DEBUG level."""
    if total_size > 0:
        percent = min(100, (block_num * block_size * 100) // total_size)
        # Update every 100 blocks or at completion
        if block_num % 100 == 0 or percent >= 100:
            logger.debug("Progress: %s%%", percent)


def _remove_quietly(path: Path) -> None:
    """Delete *path* if it exists, ignoring filesystem errors (best-effort cleanup)."""
    try:
        path.unlink()
    except OSError:
        pass


def download_file(url: str, dest_path: Path, description: str) -> bool:
    """
    Download a file from URL to destination path.

    The data is first written to a sibling "<name>.part" file and only moved
    onto dest_path once the transfer has finished. A failed or interrupted
    download therefore never leaves a truncated file at dest_path (which a
    later run would skip as "already downloaded") and never destroys an
    existing good copy.

    Args:
        url: Source URL
        dest_path: Destination file path
        description: File description for logging

    Returns:
        True if successful, False otherwise
    """
    partial_path = dest_path.with_name(dest_path.name + ".part")
    try:
        logger.info("Downloading %s...", description)
        logger.debug("URL: %s", url)
        logger.debug("Destination: %s", dest_path)

        # Create parent directory if it doesn't exist
        dest_path.parent.mkdir(parents=True, exist_ok=True)

        # Download with progress indication for large files
        urlretrieve(url, partial_path, reporthook=_log_progress)

        # os.replace overwrites an existing destination on every platform.
        os.replace(partial_path, dest_path)

        logger.info("[OK] Downloaded: %s", dest_path.name)
        return True

    except URLError as e:
        _remove_quietly(partial_path)
        logger.error("[FAIL] Failed to download %s: %s", dest_path.name, e)
        return False
    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        _remove_quietly(partial_path)
        logger.error(
            "[FAIL] Unexpected error downloading %s: %s", dest_path.name, e
        )
        return False


def list_available_files() -> None:
    """List all available test files by category.

    Prints every entry of ESSENTIAL_FILES grouped by category, followed by
    the total file count, the total approximate size and a priority legend.
    """
    print("\nAvailable test files by category:\n")

    total_size = 0
    total_files = 0

    for category, files in ESSENTIAL_FILES.items():
        print(f"[DIR] {category.upper().replace('_', ' ')}")
        print("=" * 50)

        for file_path, info in files.items():
            size_mb = info["size_mb"]
            priority = info["priority"]
            description = info["description"]

            priority_str = "[HIGH]" if priority == 1 else "[MED]" if priority == 2 else "[LOW]"

            print(f" {priority_str} {file_path}")
            print(f" {description}")
            print(f" Size: {size_mb:.2f} MB")
            print()

            total_size += size_mb
            total_files += 1

    print(f"Total: {total_files} files, {total_size:.2f} MB")
    print("\nPriority levels:")
    print("[HIGH] Priority 1: Essential for basic testing")
    print("[MED] Priority 2: Important for comprehensive testing")
    print("[LOW] Priority 3: Advanced testing scenarios")


def download_files(
    output_dir: Path,
    categories: Optional[List[str]] = None,
    max_priority: int = 3,
    force: bool = False
) -> bool:
    """
    Download test files based on criteria.

    Files that already exist under output_dir are counted as successful and
    are not fetched again unless force is set.

    Args:
        output_dir: Output directory for downloaded files
        categories: List of categories to download (None for all)
        max_priority: Maximum priority level to download (1-3)
        force: Force re-download even if file exists

    Returns:
        True if all downloads successful, False otherwise (including when
        no file matches the criteria)
    """
    success_count = 0
    total_size = 0

    # Filter files based on criteria
    files_to_download = {}
    for category, files in ESSENTIAL_FILES.items():
        if categories and category not in categories:
            continue

        for file_path, info in files.items():
            if info["priority"] <= max_priority:
                files_to_download[file_path] = info
                total_size += info["size_mb"]

    total_count = len(files_to_download)

    if total_count == 0:
        logger.warning("No files match the specified criteria")
        return False

    logger.info(f"Downloading {total_count} files ({total_size:.2f} MB total)")

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Download files
    for file_path, info in files_to_download.items():
        dest_path = output_dir / file_path

        # Skip if file exists and not forcing
        if dest_path.exists() and not force:
            logger.info(f"[SKIP] Skipping existing file: {dest_path.name}")
            success_count += 1
            continue

        url = f"{BASE_URL}/{file_path}"
        if download_file(url, dest_path, info["description"]):
            success_count += 1

    # Summary
    logger.info(f"\nDownload complete: {success_count}/{total_count} files successful")

    if success_count == total_count:
        logger.info("[OK] All downloads completed successfully!")
        return True

    logger.warning(f"[FAIL] {total_count - success_count} downloads failed")
    return False


def create_manifest(output_dir: Path) -> None:
    """Create a manifest file with downloaded file information.

    Writes "<output_dir>/manifest.json" containing a "created" UTC ISO-8601
    timestamp and, for every ESSENTIAL_FILES entry present on disk, its
    category, description, priority, size in bytes and SHA256 digest.

    Args:
        output_dir: Directory that holds the downloaded files
    """
    manifest_path = output_dir / "manifest.json"

    manifest: Dict[str, Any] = {
        # Creation time of the manifest (timezone-aware, UTC).
        "created": datetime.now(timezone.utc).isoformat(),
        "files": {}
    }

    for category, files in ESSENTIAL_FILES.items():
        for file_path, info in files.items():
            dest_path = output_dir / file_path
            # Only files that are actually on disk are recorded.
            if dest_path.exists():
                manifest["files"][file_path] = {
                    "category": category,
                    "description": info["description"],
                    "priority": info["priority"],
                    "size_bytes": dest_path.stat().st_size,
                    "sha256": get_file_hash(dest_path)
                }

    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    logger.info(f"Created manifest: {manifest_path}")


def main(argv: Optional[List[str]] = None) -> int:
    """Main entry point.

    Args:
        argv: Command-line arguments without the program name; None (the
            default) makes argparse read them from sys.argv

    Returns:
        Process exit code: 0 on success (or after --list), 1 if any
        download failed or no file matched the criteria
    """
    parser = argparse.ArgumentParser(
        description="Download test data from zim-testing-suite repository",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --list                        # List available files
  %(prog)s                               # Download priority 1 files
  %(prog)s --priority 2                  # Download priority 1-2 files
  %(prog)s -c basic -c invalid_files     # Download specific categories
  %(prog)s --all                         # Download all files
  %(prog)s --force                       # Force re-download existing files
        """
    )

    parser.add_argument(
        "--output-dir", "-o",
        type=Path,
        default=Path("test_data/zim-testing-suite"),
        help="Output directory for downloaded files (default: test_data/zim-testing-suite)"
    )

    parser.add_argument(
        "--list", "-l",
        action="store_true",
        help="List available files and exit"
    )

    parser.add_argument(
        "--category", "-c",
        action="append",
        choices=list(ESSENTIAL_FILES.keys()),
        help="Download specific categories (can be used multiple times)"
    )

    parser.add_argument(
        "--priority", "-p",
        type=int,
        choices=[1, 2, 3],
        default=1,
        help="Maximum priority level to download (default: 1)"
    )

    parser.add_argument(
        "--all", "-a",
        action="store_true",
        help="Download all files (equivalent to --priority 3)"
    )

    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Force re-download even if files exist"
    )

    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args(argv)

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if args.list:
        list_available_files()
        return 0

    # Determine priority level
    max_priority = 3 if args.all else args.priority

    # Download files
    success = download_files(
        output_dir=args.output_dir,
        categories=args.category,
        max_priority=max_priority,
        force=args.force
    )

    if not success:
        return 1

    create_manifest(args.output_dir)

    # Print usage instructions
    print("\n[INFO] Usage Instructions:")
    print(f"Set environment variable: export ZIM_TEST_DATA_DIR={args.output_dir.absolute()}")
    print(f"Or use in tests: ZIM_TEST_DATA_DIR={args.output_dir.absolute()} make test")
    return 0


if __name__ == "__main__":
    sys.exit(main())

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cameronrye/openzim-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.