file_validator.py
""" 文件验证和检查工具 提供文件编码检测、格式验证、文件查找等功能。 """ import chardet from pathlib import Path from typing import List, Optional, Dict, Any from collections import defaultdict # 使用内置异常 from .file_utils import get_file_info def detect_encoding( file_path: str | Path, sample_size: int = 10000, ) -> Dict[str, Any]: """ 检测文件编码 Args: file_path: 文件路径 sample_size: 采样大小(字节) Returns: 编码信息字典,包含: - encoding: 检测到的编码 - confidence: 置信度(0-1) - language: 语言(如果可检测) """ path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"文件不存在: {path}") try: with open(path, "rb") as f: sample = f.read(sample_size) result = chardet.detect(sample) return { "encoding": result.get("encoding", "unknown"), "confidence": result.get("confidence", 0.0), "language": result.get("language", None), } except Exception as e: return { "encoding": "unknown", "confidence": 0.0, "language": None, "error": str(e), } def convert_encoding( file_path: str | Path, target_encoding: str = "utf-8", source_encoding: Optional[str] = None, backup: bool = True, ) -> Path: """ 转换文件编码 Args: file_path: 文件路径 target_encoding: 目标编码 source_encoding: 源编码(如果为 None 则自动检测) backup: 是否备份原文件 Returns: 文件路径 Raises: FileNotFoundError: 文件不存在 EncodingError: 编码转换失败 """ from .exceptions import EncodingError from .content_processor import read_file_safe from .safe_writer import SafeFileWriter path = Path(file_path) if not path.exists(): raise FileNotFoundError(f"文件不存在: {path}") # 检测源编码 if source_encoding is None: encoding_info = detect_encoding(path) source_encoding = encoding_info.get("encoding", "utf-8") # 读取文件 try: content = read_file_safe(path, encoding=source_encoding) except Exception as e: raise EncodingError(f"读取文件失败: {e}") # 写入新编码 writer = SafeFileWriter(path, encoding=target_encoding, backup=backup) writer.write(content) return path def find_large_files( directory: str | Path, min_size: int, recursive: bool = True, ) -> List[Dict[str, Any]]: """ 查找大文件 Args: directory: 搜索目录 min_size: 最小文件大小(字节) recursive: 是否递归搜索 Returns: 大文件列表,每个文件包含: - path: 文件路径 - size: 文件大小(字节) - size_mb: 文件大小(MB) """ dir_path = Path(directory) if not dir_path.exists() or not dir_path.is_dir(): return [] large_files = [] if recursive: search_paths = dir_path.rglob("*") else: search_paths = dir_path.glob("*") for path in search_paths: if not path.is_file(): continue try: size = path.stat().st_size if size >= min_size: large_files.append({ "path": str(path), "size": size, "size_mb": round(size / (1024 * 1024), 2), }) except Exception: continue # 按大小排序 large_files.sort(key=lambda x: x["size"], reverse=True) return large_files def find_empty_files( directory: str | Path, recursive: bool = True, ) -> List[Path]: """ 查找空文件 Args: directory: 搜索目录 recursive: 是否递归搜索 Returns: 空文件路径列表 """ dir_path = Path(directory) if not dir_path.exists() or not dir_path.is_dir(): return [] empty_files = [] if recursive: search_paths = dir_path.rglob("*") else: search_paths = dir_path.glob("*") for path in search_paths: if not path.is_file(): continue try: if path.stat().st_size == 0: empty_files.append(path) except Exception: continue return empty_files def find_duplicate_files( directory: str | Path, recursive: bool = True, ) -> List[List[Path]]: """ 查找重复文件(基于内容哈希) Args: directory: 搜索目录 recursive: 是否递归搜索 Returns: 重复文件组列表,每个组包含内容相同的文件路径列表 """ from .file_comparison import get_file_hash dir_path = Path(directory) if not dir_path.exists() or not dir_path.is_dir(): return [] # 收集所有文件及其哈希 file_hashes = defaultdict(list) if recursive: search_paths = dir_path.rglob("*") else: search_paths = dir_path.glob("*") for path in 

def find_duplicate_files(
    directory: str | Path,
    recursive: bool = True,
) -> List[List[Path]]:
    """Find duplicate files (by content hash).

    Args:
        directory: Directory to search.
        recursive: Whether to search recursively.

    Returns:
        A list of duplicate groups; each group is a list of paths whose
        file contents are identical.
    """
    from .file_comparison import get_file_hash

    dir_path = Path(directory)
    if not dir_path.exists() or not dir_path.is_dir():
        return []

    # Collect every file, keyed by its content hash
    file_hashes = defaultdict(list)
    if recursive:
        search_paths = dir_path.rglob("*")
    else:
        search_paths = dir_path.glob("*")

    for path in search_paths:
        if not path.is_file():
            continue
        try:
            file_hash = get_file_hash(path)
            file_hashes[file_hash].append(path)
        except Exception:
            continue

    # Duplicate groups are the hash buckets holding more than one file
    duplicate_groups = [files for files in file_hashes.values() if len(files) > 1]
    return duplicate_groups

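
# A minimal sketch of how the duplicate groups might be consumed
# (illustrative only: the default "data" directory is hypothetical, and
# get_file_hash from .file_comparison is assumed to hash full file
# contents, so two files share a group only when their bytes match exactly).
def _example_duplicate_report(directory: str = "data") -> None:
    """Sketch: print each set of byte-identical files, keeping the first."""
    for group in find_duplicate_files(directory, recursive=True):
        keep, *duplicates = group
        print(f"keep {keep}")
        for dup in duplicates:
            print(f"  duplicate of {keep.name}: {dup}")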
