file_comparison.py (6.13 kB)
""" 文件差异和比较工具 """ import hashlib from pathlib import Path from typing import List, Dict, Any, Optional from difflib import unified_diff, context_diff # 使用内置异常 from .content_processor import read_file_safe from .file_utils import get_file_info def get_file_hash( file_path: str | Path, algorithm: str = "md5", ) -> str: """ 计算文件哈希值 Args: file_path: 文件路径 algorithm: 哈希算法("md5", "sha1", "sha256") Returns: 哈希值字符串 Raises: FileNotFoundError: 文件不存在 FileOperationError: 哈希计算失败 """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"文件不存在: {path}") if not path.is_file(): raise FileOperationError(f"路径不是文件: {path}") hash_obj = hashlib.new(algorithm) try: with open(path, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): hash_obj.update(chunk) return hash_obj.hexdigest() except Exception as e: raise FileOperationError(f"计算文件哈希失败: {e}") def compare_files( file1_path: str | Path, file2_path: str | Path, diff_format: str = "unified", context_lines: int = 3, ) -> Dict[str, Any]: """ 比较两个文件 Args: file1_path: 第一个文件路径 file2_path: 第二个文件路径 diff_format: 差异格式("unified", "context") context_lines: 上下文行数 Returns: 比较结果字典,包含: - are_identical: 是否完全相同 - hash1: 文件1的哈希值 - hash2: 文件2的哈希值 - diff: 差异内容 - diff_lines: 差异行数统计 """ path1 = Path(file1_path) path2 = Path(file2_path) if not path1.exists(): raise FileNotFoundError(f"文件不存在: {path1}") if not path2.exists(): raise FileNotFoundError(f"文件不存在: {path2}") # 计算哈希 hash1 = get_file_hash(path1) hash2 = get_file_hash(path2) are_identical = hash1 == hash2 if are_identical: return { "are_identical": True, "hash1": hash1, "hash2": hash2, "diff": "", "diff_lines": {"added": 0, "removed": 0, "modified": 0}, } # 读取文件内容 content1 = read_file_safe(path1) content2 = read_file_safe(path2) lines1 = content1.splitlines(keepends=True) lines2 = content2.splitlines(keepends=True) # 生成差异 if diff_format == "context": diff_lines = list(context_diff( lines1, lines2, fromfile=str(path1), tofile=str(path2), lineterm="", n=context_lines, )) else: diff_lines = list(unified_diff( lines1, lines2, fromfile=str(path1), tofile=str(path2), lineterm="", n=context_lines, )) diff_text = "".join(diff_lines) # 统计差异行数 added = sum(1 for line in diff_lines if line.startswith("+") and not line.startswith("+++")) removed = sum(1 for line in diff_lines if line.startswith("-") and not line.startswith("---")) modified = min(added, removed) return { "are_identical": False, "hash1": hash1, "hash2": hash2, "diff": diff_text, "diff_lines": { "added": added - modified, "removed": removed - modified, "modified": modified, }, } def compare_directories( dir1_path: str | Path, dir2_path: str | Path, ignore_patterns: Optional[List[str]] = None, ) -> Dict[str, Any]: """ 比较两个目录 Args: dir1_path: 第一个目录路径 dir2_path: 第二个目录路径 ignore_patterns: 要忽略的文件/目录模式列表 Returns: 比较结果字典,包含: - only_in_dir1: 只在目录1中的文件 - only_in_dir2: 只在目录2中的文件 - different_files: 内容不同的文件 - identical_files: 内容相同的文件 """ dir1 = Path(dir1_path) dir2 = Path(dir2_path) if not dir1.exists() or not dir1.is_dir(): raise FileNotFoundError(f"目录不存在: {dir1}") if not dir2.exists() or not dir2.is_dir(): raise FileNotFoundError(f"目录不存在: {dir2}") ignore_patterns = ignore_patterns or [] def should_ignore(path: Path) -> bool: """检查是否应该忽略""" for pattern in ignore_patterns: if pattern in str(path): return True return False # 收集所有文件 files1 = {f.relative_to(dir1): f for f in dir1.rglob("*") if f.is_file() and not should_ignore(f)} files2 = {f.relative_to(dir2): f for f in dir2.rglob("*") if f.is_file() and not should_ignore(f)} all_files = set(files1.keys()) | 
set(files2.keys()) only_in_dir1 = [] only_in_dir2 = [] different_files = [] identical_files = [] for rel_path in all_files: if rel_path not in files1: only_in_dir2.append(str(rel_path)) elif rel_path not in files2: only_in_dir1.append(str(rel_path)) else: # 比较文件内容 try: hash1 = get_file_hash(files1[rel_path]) hash2 = get_file_hash(files2[rel_path]) if hash1 == hash2: identical_files.append(str(rel_path)) else: different_files.append(str(rel_path)) except Exception: # 如果无法比较,标记为不同 different_files.append(str(rel_path)) return { "only_in_dir1": only_in_dir1, "only_in_dir2": only_in_dir2, "different_files": different_files, "identical_files": identical_files, }
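A minimal usage sketch of the three public helpers. The import path and every file and directory name below are hypothetical (the module uses relative imports, so in practice it is imported through its parent package); adjust them to the actual layout.

# Usage sketch: import path and all paths below are hypothetical.
from file_comparison import compare_directories, compare_files, get_file_hash

# Hash a single file with a stronger algorithm than the md5 default.
print(get_file_hash("a.txt", algorithm="sha256"))

# Diff two text files with one line of context around each change.
result = compare_files("a.txt", "b.txt", diff_format="unified", context_lines=1)
if not result["are_identical"]:
    print(result["diff"])
    print(result["diff_lines"])

# Recursively compare two directory trees, skipping VCS metadata.
report = compare_directories("old", "new", ignore_patterns=[".git"])
for name in report["different_files"]:
    print(f"changed: {name}")

Because compare_files short-circuits on matching hashes, identical files are hashed but never read as text, and compare_directories classifies files purely by hash without producing diffs.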
