Skip to main content
Glama
extractor.py (27.7 kB)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Markdown TOC core module.

Focuses on three core features:
1. TOC extraction - extract heading information (with line numbers) from
   Markdown content
2. Numbering analysis - detect duplicated and discontinuous heading numbers
3. TOC generation - produce formatted TOC content ready to insert in a document

Design principles:
- Explicit inputs and outputs, convenient for an MCP agent to call
- Modular, single-responsibility functions
- No file I/O dependency; pure content processing
"""

import html
import re
from typing import List, Dict, Any, Optional


class MarkdownTOCExtractor:
    """Core Markdown TOC functionality: extract, analyze and generate."""

    # Opening/closing marker of a fenced code block: 3+ backticks or tildes.
    _FENCE_RE = re.compile(r'^(`{3,}|~{3,})')
    # ATX heading: 1-6 '#' characters, whitespace, then the heading text.
    _HEADER_RE = re.compile(r'^(#{1,6})\s+(.+)$')
    # Character class body shared by every Chinese-numeral pattern.
    _CN_NUMERALS = '一二三四五六七八九十壹贰叁肆伍陆柒捌玖拾〇零'

    def __init__(self):
        """Initialize the extractor (it holds no per-instance state)."""
        pass

    def extract_toc(self,
                    content: str,
                    min_depth: int = 1,
                    max_depth: int = 6,
                    include_line_numbers: bool = True) -> List[Dict[str, Any]]:
        """
        Extract TOC information from Markdown content.

        Args:
            content: Markdown document content.
            min_depth: Minimum heading depth (1-6), default 1.
            max_depth: Maximum heading depth (1-6), default 6.
            include_line_numbers: Whether to keep the ``line_number`` key,
                default True.

        Returns:
            A list of heading dicts, each containing:
            - level: heading level (1-6)
            - title: heading text with inline Markdown formatting removed
            - line_number: 1-based line number (only when
              ``include_line_numbers`` is True)
            - raw_line: the original line, stripped

        Raises:
            ValueError: If a depth is outside 1-6 or ``min_depth`` exceeds
                ``max_depth``.

        Example:
            >>> extractor = MarkdownTOCExtractor()
            >>> content = "# 标题1\\n## 标题2\\n### 标题3"
            >>> result = extractor.extract_toc(content, min_depth=2)
            >>> print(result[0])
            {'level': 2, 'title': '标题2', 'line_number': 2, 'raw_line': '## 标题2'}
        """
        # Argument validation
        if not (1 <= min_depth <= 6):
            raise ValueError("min_depth 必须在 1-6 之间")
        if not (1 <= max_depth <= 6):
            raise ValueError("max_depth 必须在 1-6 之间")
        if min_depth > max_depth:
            raise ValueError("min_depth 不能大于 max_depth")

        # Extract headings directly; no separate code-block cleaning pass.
        headers = self._extract_headers(content)

        # Filter by depth
        filtered_headers = [
            h for h in headers
            if min_depth <= h['level'] <= max_depth
        ]

        # Drop line numbers when the caller does not want them
        if not include_line_numbers:
            for header in filtered_headers:
                header.pop('line_number', None)

        return filtered_headers

    def analyze_numbering_issues(self,
                                 headers: List[Dict[str, Any]],
                                 check_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Analyze heading numbering problems.

        Headings are grouped by level and each level is analyzed on its own.
        Headers without a ``line_number`` key (as produced by
        ``extract_toc(..., include_line_numbers=False)``) are accepted; the
        reported ``line_number`` is then ``None``.

        Args:
            headers: Heading list as returned by ``extract_toc``.
            check_types: Checks to run. Accepted values:
                - 'duplicates': detect duplicated numbers
                - 'discontinuous': detect gaps in the numbering
                - 'formats': accepted but performs no check in this module
                - 'missing': accepted but performs no check in this module
                Defaults to ['duplicates', 'discontinuous'].

        Returns:
            A dict with:
            - has_issues: whether any problem was found
            - duplicate_numbers: duplicate records (empty unless
              'duplicates' is requested)
            - discontinuous_numbers: gap records (empty unless
              'discontinuous' is requested)
            - statistics: total_headers, numbered_headers, levels_with_issues

        Raises:
            ValueError: If ``check_types`` contains an unsupported value.

        Example:
            >>> headers = [
            ...     {'level': 1, 'title': '1. 介绍', 'line_number': 1},
            ...     {'level': 1, 'title': '1. 重复', 'line_number': 3},
            ...     {'level': 1, 'title': '3. 跳跃', 'line_number': 5}
            ... ]
            >>> result = extractor.analyze_numbering_issues(headers, ['duplicates'])
            >>> result['has_issues']
            True
        """
        # Default check types
        if check_types is None:
            check_types = ['duplicates', 'discontinuous']

        # Validate check types
        valid_types = ['duplicates', 'discontinuous', 'formats', 'missing']
        for check_type in check_types:
            if check_type not in valid_types:
                raise ValueError(f"不支持的检查类型: {check_type}。支持的类型: {valid_types}")

        # Group headings by level, preserving first-seen level order
        level_groups: Dict[int, List[Dict[str, Any]]] = {}
        for header in headers:
            level_groups.setdefault(header['level'], []).append(header)

        duplicate_numbers: List[Dict[str, Any]] = []
        discontinuous_numbers: List[Dict[str, Any]] = []
        numbered_headers_count = 0

        for level, level_headers in level_groups.items():
            numbered_headers = []
            for header in level_headers:
                extracted_number = self._extract_number_from_title(header['title'], level)
                if extracted_number is not None:
                    numbered_headers.append({
                        'number': extracted_number,
                        'title': header['title'],
                        # .get(): line numbers are optional in extract_toc output
                        'line_number': header.get('line_number'),
                    })

            # Counted here so the statistics need no second extraction pass
            numbered_headers_count += len(numbered_headers)
            if not numbered_headers:
                continue

            if 'duplicates' in check_types:
                duplicate_numbers.extend(
                    self._find_duplicate_numbers(level, numbered_headers))
            if 'discontinuous' in check_types:
                discontinuous_numbers.extend(
                    self._find_discontinuous_numbers(level, numbered_headers))

        levels_with_issues = (
            {item['level'] for item in duplicate_numbers}
            | {item['level'] for item in discontinuous_numbers}
        )
        return {
            'has_issues': bool(duplicate_numbers or discontinuous_numbers),
            'duplicate_numbers': duplicate_numbers,
            'discontinuous_numbers': discontinuous_numbers,
            'statistics': {
                'total_headers': len(headers),
                'numbered_headers': numbered_headers_count,
                'levels_with_issues': len(levels_with_issues),
            }
        }

    def _find_duplicate_numbers(self,
                                level: int,
                                numbered_headers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Return one record per number used more than once within a level."""
        first_seen: Dict[str, Dict[str, Any]] = {}
        duplicates: Dict[str, Dict[str, Any]] = {}
        for item in numbered_headers:
            number = item['number']
            if number not in first_seen:
                first_seen[number] = item
            elif number in duplicates:
                duplicates[number]['occurrences'].append(item)
            else:
                duplicates[number] = {
                    'number': number,
                    'level': level,
                    'occurrences': [first_seen[number], item],
                }
        return list(duplicates.values())

    def _find_discontinuous_numbers(self,
                                    level: int,
                                    numbered_headers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Return gap records for simple ("3") and two-part ("4.2") numbers.

        Two-part numbers are checked per prefix ("4.1", "4.2" form one
        sequence). Numbers with more than two parts are not checked. Only
        the first heading carrying a given number is considered, so a
        duplicated number is not additionally reported as a gap.
        """
        simple_numbers: Dict[int, Dict[str, Any]] = {}
        multilevel_groups: Dict[str, Dict[int, Dict[str, Any]]] = {}

        for item in numbered_headers:
            number_str = item['number']
            if '.' in number_str:
                parts = number_str.split('.')
                if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
                    group = multilevel_groups.setdefault(parts[0], {})
                    group.setdefault(int(parts[1]), item)
            elif number_str.isdigit():
                simple_numbers.setdefault(int(number_str), item)

        issues = self._sequence_gaps(level, simple_numbers)
        for group in multilevel_groups.values():
            issues.extend(self._sequence_gaps(level, group))
        return issues

    def _sequence_gaps(self,
                       level: int,
                       first_by_number: Dict[int, Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Report every number that does not follow its predecessor by one.

        The sequence is expected to start at 1.
        """
        gaps = []
        expected = 1
        for number in sorted(first_by_number):
            if number != expected:
                item = first_by_number[number]
                gaps.append({
                    'level': level,
                    'expected': expected,
                    'actual': number,
                    'title': item['title'],
                    'line_number': item['line_number'],
                })
            # Resynchronize after a gap so each gap is reported only once
            expected = number + 1
        return gaps

    def generate_toc(self,
                     headers: List[Dict[str, Any]],
                     format_type: str = 'markdown',
                     include_links: bool = False,
                     max_level: Optional[int] = 6) -> Dict[str, Any]:
        """
        Generate formatted TOC content.

        Args:
            headers: Heading list as returned by ``extract_toc``.
            format_type: Output format ('markdown', 'html', 'text').
            include_links: Whether to emit anchor links (markdown only).
            max_level: Deepest heading level to include. ``None`` (or 0)
                disables the filter.

        Returns:
            A dict with:
            - content: the formatted TOC
            - format: the format used
            - total_items: number of entries included
            - levels_included: sorted list of the levels present

        Raises:
            ValueError: If ``format_type`` is not supported (only checked
                when at least one heading remains after filtering).

        Example:
            >>> headers = [
            ...     {'level': 1, 'title': '介绍', 'line_number': 1},
            ...     {'level': 2, 'title': '概述', 'line_number': 3}
            ... ]
            >>> result = extractor.generate_toc(headers, 'markdown', include_links=True)
            >>> print(result['content'])
            - [介绍](#介绍)
              - [概述](#概述)
        """
        # Filter by level
        filtered_headers = headers
        if max_level:
            filtered_headers = [h for h in headers if h['level'] <= max_level]

        if not filtered_headers:
            return {
                'content': '',
                'format': format_type,
                'total_items': 0,
                'levels_included': []
            }

        # Generate content
        if format_type == 'markdown':
            content = self._generate_markdown_toc(filtered_headers, include_links)
        elif format_type == 'html':
            content = self._generate_html_toc(filtered_headers)
        elif format_type == 'text':
            content = self._generate_text_toc(filtered_headers)
        else:
            raise ValueError(f"不支持的格式类型: {format_type}")

        # Statistics
        levels_included = sorted({h['level'] for h in filtered_headers})

        return {
            'content': content,
            'format': format_type,
            'total_items': len(filtered_headers),
            'levels_included': levels_included
        }

    def _remove_code_blocks(self, content: str) -> str:
        """Blank out code-block content while keeping heading lines intact.

        Every removed line is replaced by an empty line so that line numbers
        in the result still match the input. Not called by the other methods
        in this module.
        """
        lines = content.split('\n')
        result_lines = []
        in_code_block = False

        for line in lines:
            stripped_line = line.strip()

            # Code-block boundaries: triple-backtick fences (```language)
            if stripped_line.startswith('```'):
                in_code_block = not in_code_block
                result_lines.append('')  # empty line keeps numbering aligned
                continue
            # A bare language name on its own line also toggles the state
            elif stripped_line in ['text', 'scala', 'python', 'java', 'javascript', 'sql']:
                in_code_block = not in_code_block
                result_lines.append('')  # empty line keeps numbering aligned
                continue

            # Inside a code block: drop the content
            if in_code_block:
                result_lines.append('')  # empty line keeps numbering aligned
                continue

            # Heading lines are kept verbatim
            if re.match(r'^#{1,6}\s+', stripped_line):
                result_lines.append(line)
            else:
                # Inline code is stripped from non-heading lines only
                line_cleaned = re.sub(r'`[^`\n]*`', '', line)
                result_lines.append(line_cleaned)

        return '\n'.join(result_lines)

    def _extract_headers(self, content: str) -> List[Dict[str, Any]]:
        """Extract ATX headings, skipping everything inside fenced code blocks.

        Both backtick and tilde fences are recognized. A fence is closed only
        by a marker of the same character that is at least as long as the
        opening marker, so a ``` line inside a ```` block stays code.
        """
        headers = []
        open_fence: Optional[str] = None  # marker that opened the current block

        for i, line in enumerate(content.split('\n'), 1):
            stripped = line.strip()

            fence_match = self._FENCE_RE.match(stripped)
            if fence_match:
                marker = fence_match.group(1)
                if open_fence is None:
                    # Block start, whatever the language tag is
                    open_fence = marker
                elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                    # Block end
                    open_fence = None
                continue

            # Inside a code block: '#' lines are not headings
            if open_fence is not None:
                continue

            header_match = self._HEADER_RE.match(stripped)
            if header_match:
                title = self._clean_title_text(header_match.group(2).strip())
                headers.append({
                    'level': len(header_match.group(1)),
                    'title': title,
                    'line_number': i,
                    'raw_line': stripped
                })

        return headers

    def _chinese_number_to_int(self, chinese_num: str) -> Optional[int]:
        """Convert a Chinese numeral (0-99) to an int.

        Supports everyday (一二三…十) and financial (壹贰叁…拾) characters in
        the forms "X", "十", "十Y", "X十" and "X十Y".

        Args:
            chinese_num: Chinese numeral string.

        Returns:
            The integer value, or None when the string cannot be converted.
        """
        digits = {
            '零': 0, '〇': 0,
            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
            '六': 6, '七': 7, '八': 8, '九': 9,
            '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
            '陆': 6, '柒': 7, '捌': 8, '玖': 9,
        }

        # The financial ten behaves exactly like the everyday one
        text = chinese_num.replace('拾', '十')

        ten_count = text.count('十')
        if ten_count == 0:
            # A single digit character; longer digit runs are not supported
            return digits.get(text) if len(text) == 1 else None
        if ten_count != 1:
            return None

        head, tail = text.split('十')

        # Tens part: empty means a bare "十" (1 x 10); otherwise one digit 1-9
        if head == '':
            tens = 1
        elif digits.get(head, 0) > 0:
            tens = digits[head]
        else:
            return None

        # Units part: empty means 0; otherwise exactly one digit
        if tail == '':
            ones = 0
        elif tail in digits:
            ones = digits[tail]
        else:
            return None

        return tens * 10 + ones

    def _extract_number_from_title(self, title: str, level: int) -> Optional[str]:
        """Extract the heading number from a title, supporting several formats.

        Args:
            title: Heading text.
            level: Heading level; a dotted number is accepted only when it has
                at most this many parts.

        Returns:
            The number as a string (Chinese numerals are converted to Arabic
            digits), or None when no number is found.
        """
        # 1. Standard dotted number (1.2.3)
        number_match = re.match(r'^(\d+(?:\.\d+)*)\.?\s*(.+)', title)
        if number_match:
            full_number = number_match.group(1)
            # Lenient rule: the number may have fewer parts than the level
            if len(full_number.split('.')) <= level:
                return full_number

        # 2. Chinese chapter form with Arabic digits (第 X 章/节/小节/部分)
        chapter_match = re.match(r'^第\s*(\d+)\s*(章|节|小节|部分)\s*(.+)', title)
        if chapter_match:
            return chapter_match.group(1)

        numerals = self._CN_NUMERALS
        chinese_patterns = (
            # 3. Chinese chapter form with Chinese numerals (第 X 章/节/小节/部分)
            r'^第\s*([' + numerals + r']+)\s*(章|节|小节|部分)\s*(.+)',
            # 4. Chinese numeral followed by an enumeration comma (一、二、三、)
            r'^([' + numerals + r']+)、\s*(.+)',
            # 5. Chinese numeral in ASCII or full-width parentheses ((一) / （一）)
            r'^[\(（]([' + numerals + r']+)[\)）]\s*(.+)',
            # 6. Bare Chinese numeral followed by whitespace (一 二 三)
            r'^([' + numerals + r']+)\s+(.+)',
        )
        for pattern in chinese_patterns:
            match = re.match(pattern, title)
            if match:
                converted_num = self._chinese_number_to_int(match.group(1))
                if converted_num is not None:
                    return str(converted_num)
                # Unconvertible numeral: fall through to the next pattern

        # 7. Plain leading integer (e.g. "1 介绍")
        simple_number_match = re.match(r'^(\d+)\s+(.+)', title)
        if simple_number_match:
            return simple_number_match.group(1)

        return None

    def _clean_title_text(self, title: str) -> str:
        """Strip inline Markdown formatting from a heading text."""
        # Remove a leading heading marker
        title = re.sub(r'^#{1,6}\s+', '', title)

        # Remove bold
        title = re.sub(r'\*\*(.*?)\*\*', r'\1', title)

        # Remove italics
        title = re.sub(r'\*(.*?)\*', r'\1', title)

        # Remove inline code markers
        title = re.sub(r'`(.*?)`', r'\1', title)

        # Replace links by their text
        title = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', title)

        return title.strip()

    def _generate_markdown_toc(self, headers: List[Dict[str, Any]], include_links: bool) -> str:
        """Generate a Markdown bullet-list TOC.

        Items are indented two spaces per level relative to the shallowest
        level present, so nested items sit under their parent's text and the
        shallowest items start at column 0.
        """
        if not headers:
            return ''

        base_level = min(header['level'] for header in headers)
        lines = []
        for header in headers:
            title = header['title']
            indent = '  ' * (header['level'] - base_level)

            if include_links:
                anchor = self._generate_anchor(title)
                lines.append(f"{indent}- [{title}](#{anchor})")
            else:
                lines.append(f"{indent}- {title}")

        return '\n'.join(lines)

    def _generate_html_toc(self, headers: List[Dict[str, Any]]) -> str:
        """Generate an HTML ``<ul>`` TOC; titles are HTML-escaped."""
        if not headers:
            return ''

        lines = ['<ul>']
        current_level = 0

        for header in headers:
            level = header['level']
            # Titles come from document text, so escape markup characters
            title = html.escape(header['title'], quote=False)

            # Open or close nested lists when the level changes
            if level > current_level:
                for _ in range(level - current_level):
                    if current_level > 0:
                        lines.append(' ' * current_level + '<li><ul>')
                    current_level += 1
            elif level < current_level:
                for _ in range(current_level - level):
                    lines.append(' ' * current_level + '</ul></li>')
                    current_level -= 1

            # Add the current entry
            indent = ' ' * level
            lines.append(f"{indent}<li>{title}</li>")

        # Close every list that is still open
        while current_level > 0:
            lines.append(' ' * current_level + '</ul>')
            if current_level > 1:
                lines.append(' ' * (current_level - 1) + '</li>')
            current_level -= 1

        return '\n'.join(lines)

    def _generate_text_toc(self, headers: List[Dict[str, Any]]) -> str:
        """Generate a plain-text TOC.

        The line-number suffix is omitted for headers without a
        ``line_number`` key.
        """
        lines = []

        for header in headers:
            title = header['title']
            line_number = header.get('line_number')
            indent = ' ' * (header['level'] - 1)

            if line_number is None:
                lines.append(f"{indent}{title}")
            else:
                lines.append(f"{indent}{title} (第 {line_number} 行)")

        return '\n'.join(lines)

    def _generate_anchor(self, title: str) -> str:
        """Build an anchor slug for a heading title."""
        # Lowercase
        anchor = title.lower()

        # Replace everything except word characters, CJK and hyphens
        anchor = re.sub(r'[^\w\u4e00-\u9fff\-]', '-', anchor)

        # Collapse repeated hyphens and trim them from both ends
        anchor = re.sub(r'-+', '-', anchor)
        anchor = anchor.strip('-')

        return anchor


# Convenience functions for direct external use

def extract_toc_from_content(content: str) -> List[Dict[str, Any]]:
    """
    Convenience function: extract the TOC from Markdown content.

    Args:
        content: Markdown content.

    Returns:
        Heading list.
    """
    extractor = MarkdownTOCExtractor()
    return extractor.extract_toc(content)


def analyze_numbering_issues_from_headers(headers: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Convenience function: analyze heading numbering problems.

    Args:
        headers: Heading list.

    Returns:
        Numbering analysis result.
    """
    extractor = MarkdownTOCExtractor()
    return extractor.analyze_numbering_issues(headers)


def generate_toc_from_headers(headers: List[Dict[str, Any]],
                              format_type: str = 'markdown',
                              include_links: bool = False,
                              max_level: Optional[int] = None) -> Dict[str, Any]:
    """
    Convenience function: generate TOC content.

    Args:
        headers: Heading list.
        format_type: Output format.
        include_links: Whether to include links.
        max_level: Deepest level to include; None includes every level.

    Returns:
        Generated TOC information.
    """
    extractor = MarkdownTOCExtractor()
    return extractor.generate_toc(headers, format_type, include_links, max_level)


def _run_demo() -> None:
    """Print a small end-to-end demonstration of the three core features."""
    test_content = """# 1. 介绍

这是介绍部分。

## 1.1 概述

概述内容。

## 1.2 目标

目标内容。

# 2. 方法

方法部分。

```python
# 这是代码块中的注释
def example():
    pass
```

### 2.1.1 详细步骤

详细步骤。

# 1. 重复标题

这会产生重复编号问题。
"""

    print("=== Markdown TOC 核心功能测试 ===")

    # TOC extraction
    extractor = MarkdownTOCExtractor()
    headers = extractor.extract_toc(test_content)
    print(f"\n1. TOC 提取结果 ({len(headers)} 个标题):")
    for header in headers:
        print(f" L{header['level']}: {header['title']} (第 {header['line_number']} 行)")

    # Numbering analysis
    issues = extractor.analyze_numbering_issues(headers)
    print("\n2. 编号问题分析:")
    print(f" 存在问题: {issues['has_issues']}")
    print(f" 重复编号: {len(issues['duplicate_numbers'])} 个")
    print(f" 不连续编号: {len(issues['discontinuous_numbers'])} 个")

    # TOC generation
    toc_result = extractor.generate_toc(headers, 'markdown', include_links=True)
    print("\n3. TOC 生成结果:")
    print(f" 格式: {toc_result['format']}")
    print(f" 条目数: {toc_result['total_items']}")
    print(f" 内容:\n{toc_result['content']}")


if __name__ == "__main__":
    _run_demo()

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ForceInjection/markdown-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.