Source code for markdown_diagrams.extractors

#!/usr/bin/env python3
"""
Extractor functions for parsing content from Markdown files.
"""

import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Set up logging
logger = logging.getLogger(__name__)

# Code-fence info strings recognised as diagrams, in extraction order.
_DIAGRAM_FENCE_TYPES = (
    "mermaid",
    "flowchart",
    "sequence",
    "graph",
    "gantt",
    "pie",
    "toc",
    "mindmap",
    "quadrantDiagram",
    "er",
    "class",
    "state",
    "journey",
    "active",
    "component",
    "gitGraph",
    "userJourney",
    "requirement",
    "c4",
    "entityRelationship",
)

# Supported diagram types with their regex patterns.
#
# Each pattern matches an opening fence tagged with the type name and lazily
# captures everything up to AND INCLUDING the first closing ``` in group 1.
#
# The negative lookahead after the name stops a type from matching a longer
# info string that merely starts with it: without it, ```graphql would be
# picked up as a "graph" diagram and ```erlang / ```erb as an "er" diagram.
#
# Because the match is lazy, a diagram body that itself contains a ``` run
# is cut short at that point, so be careful with nested code blocks.
DIAGRAM_PATTERNS = {
    fence_type: rf"```(?:{re.escape(fence_type)})(?![\w-])(.*?```)"
    for fence_type in _DIAGRAM_FENCE_TYPES
}

# For each supported code-fence type: the Mermaid diagram declaration to put
# on the first line of the extracted source, or None when nothing is added
# (for example ```mermaid blocks, whose body already starts with its own
# declaration).
DIAGRAM_TYPE_PREFIX = dict(
    [
        ("mermaid", None),
        ("flowchart", "flowchart"),
        ("sequence", "sequenceDiagram"),
        ("graph", "graph"),
        ("gantt", "gantt"),
        ("pie", "pie"),
        ("toc", None),
        ("mindmap", "mindmap"),
        ("quadrantDiagram", "quadrantChart"),
        ("er", "erDiagram"),
        ("class", "classDiagram"),
        ("state", "stateDiagram-v2"),
        ("journey", "journey"),
        ("active", None),
        ("component", None),
        ("gitGraph", "gitGraph"),
        ("userJourney", "journey"),
        ("requirement", "requirementDiagram"),
        ("c4", "C4Context"),
        ("entityRelationship", "erDiagram"),
    ]
)


def _find_preceding_heading(content: str, position: int) -> Optional[str]:
    """Find the nearest markdown heading before the given position.

    Args:
        content: The full markdown file content.
        position: The character offset of the diagram code fence.

    Returns:
        The heading text (without ``#`` markers), or None if no heading is found.
    """
    heading_pattern = r"^(#{1,6})\s+(.+)$"
    last_heading = None
    for match in re.finditer(heading_pattern, content[:position], re.MULTILINE):
        last_heading = match.group(2).strip()
    return last_heading


def _find_preceding_label(
    content: str, position: int, max_lines_back: int = 5
) -> Optional[str]:
    """Find a descriptive bold-text label immediately before a diagram fence.

    Many Markdown documents place a descriptive title in bold text
    (``**Title**``) on the line(s) just above a code fence, while the
    nearest ``#``-heading is a generic section name like "Diagrams".
    This function returns that bold label when present so it can be
    used for a more descriptive filename.

    Args:
        content: The full markdown file content.
        position: The character offset of the diagram code fence.
        max_lines_back: How many lines before the fence to search.

    Returns:
        The bold label text (without ``**`` markers), or None.
    """
    # Grab the text just before the code fence and split into lines.
    preceding = content[:position].rstrip()
    lines = preceding.split("\n")
    search_lines = lines[-max_lines_back:] if len(lines) >= max_lines_back else lines

    # Walk backwards to find the closest bold-text label.
    bold_pattern = re.compile(r"^\s*\*\*(.+?)\*\*\s*$")
    for line in reversed(search_lines):
        stripped = line.strip()
        if not stripped:
            continue
        m = bold_pattern.match(stripped)
        if m:
            return m.group(1).strip()
        # Stop searching once we hit non-empty, non-bold content.
        break
    return None


def heading_to_filename(heading: str) -> str:
    """Convert a markdown heading to a filesystem-safe filename stem.

    A leading section number is dropped, the text is lowercased, every run
    of characters outside ``a-z0-9`` becomes a single underscore, and the
    result is capped at 60 characters.

    Args:
        heading: Raw heading text (e.g. ``3.1 System Architecture``).

    Returns:
        A lowercase, underscore-separated filename stem
        (e.g. ``system_architecture``), or ``"diagram"`` when nothing
        usable is left.
    """
    # Strip leading section numbers like "3.1", "6.2." or "1." — but only
    # when the digits really are a standalone number (followed by
    # whitespace, a trailing dot, or the end of the string). Digits that
    # are part of a word, as in "3D Rendering" or "1.5x Speedup", are kept.
    name = re.sub(r"^\d+(?:\.\d+)*(?:\.(?!\d)\s*|\s+|$)", "", heading)
    name = name.lower()
    # Replace non-alphanumeric runs with a single underscore
    name = re.sub(r"[^a-z0-9]+", "_", name)
    name = name.strip("_")
    # Truncate overly long names; the cut may land on a separator, so trim
    # any underscore left dangling at the end.
    if len(name) > 60:
        name = name[:60].rstrip("_")
    return name or "diagram"
def extract_diagrams(
    markdown_file: Path, diagram_types: Optional[List[str]] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """Extract diagrams from a Markdown file.

    This function does not raise for unreadable files or per-type
    extraction failures: the problem is logged and the affected entries are
    left as empty lists.

    Args:
        markdown_file: Path to the Markdown file.
        diagram_types: List of diagram types to extract. If None, extract all
            supported types. Unsupported names are logged and ignored.

    Returns:
        Dictionary with one key per supported diagram type (requested or
        not), each mapping to a list of dicts with:

        - ``content``: The diagram source text, without the code fences and
          without surrounding whitespace. For fence types that have an
          entry in ``DIAGRAM_TYPE_PREFIX``, that declaration is prepended
          on its own line.
        - ``heading``: The best available label for the diagram: a bold-text
          title immediately above the fence if present, otherwise the
          nearest preceding markdown heading, or None.

        Fenced blocks whose body is empty are skipped.
    """
    # Every supported type gets a key up front so callers can index the
    # result without checking which types were requested.
    result: Dict[str, List[Dict[str, Any]]] = {
        dtype: [] for dtype in DIAGRAM_PATTERNS
    }

    if diagram_types is None:
        requested = list(DIAGRAM_PATTERNS)
    else:
        # Materialise once so a one-shot iterable is not exhausted by the
        # validation pass before the extraction loop runs.
        candidates = list(diagram_types)
        unsupported_types = [
            dtype for dtype in candidates if dtype not in DIAGRAM_PATTERNS
        ]
        if unsupported_types:
            logger.warning(f"Unsupported diagram types requested: {unsupported_types}")
        requested = [dtype for dtype in candidates if dtype in DIAGRAM_PATTERNS]

    # Reading is best-effort: any failure yields the all-empty result.
    try:
        content = markdown_file.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        logger.error(f"Failed to read file {markdown_file} - encoding error")
        return result
    except Exception as e:
        logger.error(f"Failed to read file {markdown_file}: {str(e)}")
        return result

    for diagram_type in requested:
        # A failure in one type must not discard what other types produced.
        try:
            pattern = DIAGRAM_PATTERNS[diagram_type]
            prefix = DIAGRAM_TYPE_PREFIX.get(diagram_type)
            diagrams: List[Dict[str, Any]] = []
            for match in re.finditer(pattern, content, re.DOTALL | re.IGNORECASE):
                # Group 1 still ends with the closing fence. Strip it, then
                # strip again to drop the newline that preceded the fence,
                # matching what extract_diagrams_with_position() returns.
                diagram_content = match.group(1).strip().rstrip("`").strip()
                if not diagram_content:
                    continue
                if prefix:
                    diagram_content = f"{prefix}\n{diagram_content}"
                fence_start = match.start()
                label = _find_preceding_label(content, fence_start)
                heading = label or _find_preceding_heading(content, fence_start)
                diagrams.append({"content": diagram_content, "heading": heading})
            result[diagram_type] = diagrams
        except Exception as e:
            logger.error(
                f"Error extracting {diagram_type} diagrams "
                f"from {markdown_file}: {e}"
            )
            result[diagram_type] = []

    return result
def extract_mermaid_diagrams(markdown_file: Path) -> List[str]:
    """Extract Mermaid diagrams from a Markdown file (legacy function).

    Thin wrapper around :func:`extract_diagrams` that keeps only the source
    text of each ``mermaid`` block and drops the heading information.

    Args:
        markdown_file: Path to the Markdown file.

    Returns:
        List of Mermaid diagram content strings; an empty list if anything
        goes wrong (the error is logged).
    """
    try:
        by_type = extract_diagrams(markdown_file, ["mermaid"])
        sources = []
        for entry in by_type["mermaid"]:
            sources.append(entry["content"])
        return sources
    except Exception as e:
        logger.error(f"Error extracting Mermaid diagrams: {str(e)}")
        return []
def extract_diagrams_with_position(
    markdown_file: Path, diagram_type: str = "mermaid"
) -> List[Tuple[str, int, int]]:
    """Extract diagrams of one type together with where they sit in the file.

    Unlike :func:`extract_diagrams`, no diagram-type declaration is
    prepended and blocks with an empty body are still reported.

    Args:
        markdown_file: Path to the Markdown file
        diagram_type: Type of diagram to extract positions for

    Returns:
        List of tuples containing (diagram_content, start_position,
        end_position). The positions are offsets into the decoded file
        text: the start of the opening fence and the end of the closing
        fence. Any failure, including an unknown ``diagram_type``, is
        logged and yields an empty list.
    """
    try:
        text = markdown_file.read_text(encoding="utf-8")
        fence_pattern = DIAGRAM_PATTERNS[diagram_type]
        flags = re.DOTALL | re.IGNORECASE
        # Group 1 still carries the closing ``` fence; strip it and the
        # newline in front of it so only the diagram source remains.
        return [
            (hit.group(1).strip().rstrip("`").strip(), hit.start(), hit.end())
            for hit in re.finditer(fence_pattern, text, flags)
        ]
    except Exception as e:
        logger.error(
            f"Error extracting {diagram_type} diagrams with position: {str(e)}"
        )
        return []