Source code for markdown_diagrams.extractors
#!/usr/bin/env python3
"""
Extractor functions for parsing content from Markdown files.
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Set up logging
logger = logging.getLogger(__name__)
# Supported diagram types mapped to the regex that locates their fenced blocks.
#
# * The patterns rely on ``re.DOTALL`` so that ``.`` spans lines; the
#   extractors in this module additionally pass ``re.IGNORECASE``.
# * ``\b`` right after the fence name keeps a short name from matching a longer
#   info string.  Without it ```` ```er ```` also matched ```` ```erlang ````
#   and ```` ```class ```` matched ```` ```classDiagram ````.
# * Group 1 captures everything after the fence name up to AND including the
#   closing backticks; consumers strip that trailing fence themselves.
# * The lazy ``.*?`` stops at the first closing fence, so code blocks nested
#   inside a diagram block are not supported.
DIAGRAM_PATTERNS = {
    fence_name: rf"```(?:{fence_name})\b(.*?```)"
    for fence_name in (
        "mermaid",
        "flowchart",
        "sequence",
        "graph",
        "gantt",
        "pie",
        "toc",
        "mindmap",
        "quadrantDiagram",
        "er",
        "class",
        "state",
        "journey",
        "active",
        "component",
        "gitGraph",
        "userJourney",
        "requirement",
        "c4",
        "entityRelationship",
    )
}
# Diagram-type declaration to put on the first line of extracted content,
# keyed by code-fence name.  A value of None means nothing is prepended
# (e.g. a ```mermaid block already carries its own declaration).
DIAGRAM_TYPE_PREFIX = {
    # Generic fence: the body already states its diagram type.
    "mermaid": None,
    # Fence name doubles as the declaration, or maps onto the longer
    # declaration keyword.
    "flowchart": "flowchart",
    "sequence": "sequenceDiagram",
    "graph": "graph",
    "gantt": "gantt",
    "pie": "pie",
    # No declaration is prepended for this fence.
    "toc": None,
    "mindmap": "mindmap",
    "quadrantDiagram": "quadrantChart",
    "er": "erDiagram",
    "class": "classDiagram",
    "state": "stateDiagram-v2",
    "journey": "journey",
    # No declaration is prepended for these fences.
    "active": None,
    "component": None,
    "gitGraph": "gitGraph",
    # Alias fence names that share a declaration with another entry.
    "userJourney": "journey",
    "requirement": "requirementDiagram",
    "c4": "C4Context",
    "entityRelationship": "erDiagram",
}
def _find_preceding_heading(content: str, position: int) -> Optional[str]:
    """Find the nearest markdown heading before the given position.

    Only ATX-style headings (one to six ``#`` characters followed by
    spaces or tabs and some text on the same line) are recognised.

    Args:
        content: The full markdown file content.
        position: The character offset of the diagram code fence.

    Returns:
        The heading text (without ``#`` markers, surrounding whitespace
        stripped), or None if no heading is found.
    """
    # ``[ \t]+`` instead of ``\s+``: ``\s`` also matches newlines, which let a
    # bare "#" line swallow the *next* line as its heading text.
    heading_pattern = r"^#{1,6}[ \t]+(.+)$"
    last_heading = None
    # Matches arrive in document order, so the last one seen is the nearest.
    for match in re.finditer(heading_pattern, content[:position], re.MULTILINE):
        last_heading = match.group(1).strip()
    return last_heading
def _find_preceding_label(
    content: str, position: int, max_lines_back: int = 5
) -> Optional[str]:
    """Return the bold-text label sitting directly above a diagram fence.

    Documents often put a descriptive ``**Title**`` line right above a
    code fence while the nearest ``#``-heading is something generic such
    as "Diagrams"; that bold title makes a better filename.

    Trailing whitespace (including blank lines) before the fence is
    discarded first, then the closest non-blank line is inspected.  The
    label is returned only if that whole line is a single ``**...**``
    span; any other content on that line ends the search.

    Args:
        content: The full markdown file content.
        position: The character offset of the diagram code fence.
        max_lines_back: Size of the window of trailing lines considered.

    Returns:
        The bold label text (without ``**`` markers), or None.
    """
    # A negative-start slice already clamps to the list length, so no explicit
    # length check is needed to take "at most the last N lines".
    window = content[:position].rstrip().split("\n")[-max_lines_back:]

    # Closest non-blank line above the fence, if any.
    nearest = next(
        (candidate.strip() for candidate in reversed(window) if candidate.strip()),
        None,
    )
    if nearest is None:
        return None

    hit = re.match(r"^\s*\*\*(.+?)\*\*\s*$", nearest)
    return hit.group(1).strip() if hit else None
[docs]
def heading_to_filename(heading: str) -> str:
    """Convert a markdown heading to a filesystem-safe filename stem.

    Args:
        heading: Raw heading text (e.g. ``3.1 System Architecture``).

    Returns:
        A lowercase, underscore-separated filename stem
        (e.g. ``system_architecture``), at most 60 characters long.
        Falls back to ``"diagram"`` when nothing usable remains.
    """
    # Strip a leading section number ("3", "3.1", "6.2.") only when it is a
    # standalone token, i.e. followed by whitespace or the end of the heading.
    # Digits that are part of a word ("3D Model", "1.5x Speedup") are kept.
    name = re.sub(r"^\d+(?:\.\d+)*\.?(?:\s+|$)", "", heading.strip())
    name = name.lower()
    # Replace non-alphanumeric runs with a single underscore
    name = re.sub(r"[^a-z0-9]+", "_", name)
    name = name.strip("_")
    # Truncate overly long names, without leaving a dangling underscore
    if len(name) > 60:
        name = name[:60].rstrip("_")
    return name or "diagram"
[docs]
def extract_diagrams(
    markdown_file: Path, diagram_types: Optional[List[str]] = None
) -> Dict[str, List[Dict[str, Any]]]:
    """Extract diagrams from a Markdown file.

    This function is best-effort: a file that cannot be read, or a
    diagram type whose extraction fails, is logged and yields empty
    lists instead of raising.

    Args:
        markdown_file: Path to the Markdown file.
        diagram_types: List of diagram types to extract.
            If None, extract all supported types.  Unsupported names
            are logged with a warning and ignored.

    Returns:
        Dictionary with one key per supported diagram type (requested or
        not), each mapping to a list of dicts with:

        - ``content``: The diagram source text, prefixed with the
          diagram-type declaration when one is configured for the fence.
          Leading whitespace is stripped; whitespace in front of the
          closing fence is kept.
        - ``heading``: The best available label for the diagram: a
          bold-text title immediately above the fence if present,
          otherwise the nearest preceding markdown heading, or None.
    """
    # Every supported type gets a key so callers can index without checking.
    result: Dict[str, List[Dict[str, Any]]] = {
        dtype: [] for dtype in DIAGRAM_PATTERNS
    }

    # If no specific types requested, extract all
    if diagram_types is None:
        diagram_types = list(DIAGRAM_PATTERNS)

    # Drop (and report) requested types that have no pattern.
    unsupported_types = [
        dtype for dtype in diagram_types if dtype not in DIAGRAM_PATTERNS
    ]
    if unsupported_types:
        logger.warning("Unsupported diagram types requested: %s", unsupported_types)
        diagram_types = [dtype for dtype in diagram_types if dtype in DIAGRAM_PATTERNS]

    try:
        content = markdown_file.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        logger.error("Failed to read file %s - encoding error", markdown_file)
        return result
    except Exception as e:
        # Deliberately broad: any read failure results in "no diagrams".
        logger.error("Failed to read file %s: %s", markdown_file, e)
        return result

    flags = re.DOTALL | re.IGNORECASE
    for diagram_type in diagram_types:
        pattern = DIAGRAM_PATTERNS[diagram_type]
        prefix = DIAGRAM_TYPE_PREFIX.get(diagram_type)
        try:
            diagrams: List[Dict[str, Any]] = []
            for match in re.finditer(pattern, content, flags):
                # Group 1 still ends with the closing fence; drop it.
                diagram_content = match.group(1).strip().rstrip("`")
                if not diagram_content:
                    continue
                if prefix:
                    diagram_content = f"{prefix}\n{diagram_content}"
                # Prefer a bold title right above the fence over the
                # (often generic) section heading.
                label = _find_preceding_label(content, match.start())
                heading = label or _find_preceding_heading(content, match.start())
                diagrams.append({"content": diagram_content, "heading": heading})
            result[diagram_type] = diagrams
        except Exception as e:
            # One failing type must not discard what the others produced.
            logger.error(
                "Error extracting %s diagrams from %s: %s",
                diagram_type,
                markdown_file,
                e,
            )
            result[diagram_type] = []
    return result
[docs]
def extract_mermaid_diagrams(markdown_file: Path) -> List[str]:
    """Return the source of every Mermaid diagram in a Markdown file.

    Legacy convenience wrapper around :func:`extract_diagrams` that
    keeps only the ``content`` of each ``mermaid`` entry.

    Args:
        markdown_file: Path to the Markdown file.

    Returns:
        List of Mermaid diagram content strings; an empty list if
        anything goes wrong (the error is logged).
    """
    sources: List[str] = []
    try:
        for entry in extract_diagrams(markdown_file, ["mermaid"])["mermaid"]:
            sources.append(entry["content"])
    except Exception as exc:
        logger.error(f"Error extracting Mermaid diagrams: {str(exc)}")
        return []
    return sources
[docs]
def extract_diagrams_with_position(
    markdown_file: Path, diagram_type: str = "mermaid"
) -> List[Tuple[str, int, int]]:
    """Extract diagrams of one type together with their location in the file.

    Args:
        markdown_file: Path to the Markdown file
        diagram_type: Type of diagram to extract positions for

    Returns:
        List of ``(diagram_content, start_position, end_position)``
        tuples.  The positions are the regex match's start and end
        offsets in the decoded file text, so they cover the whole fenced
        block; the content has the fences and surrounding whitespace
        removed.  Any failure (unreadable file, unknown diagram type)
        is logged and yields an empty list.
    """
    try:
        text = markdown_file.read_text(encoding="utf-8")
        fence_regex = DIAGRAM_PATTERNS[diagram_type]
        return [
            # Group 1 ends with the closing fence: strip it, then the
            # newline that preceded it.
            (hit.group(1).strip().rstrip("`").strip(), hit.start(), hit.end())
            for hit in re.finditer(fence_regex, text, re.DOTALL | re.IGNORECASE)
        ]
    except Exception as exc:
        logger.error(
            f"Error extracting {diagram_type} diagrams with position: {str(exc)}"
        )
        return []