# Source code for pytexmd.filter.file_maker

# Names exported by a star-import of this module. Functions defined in this
# module but not listed here are left out of a star-import.
__all__ = ["string_to_tree",
           "process_string",
           "element_to_file_whole",
           "element_to_file_only_begin",
           "split_document_to_files",
           "split_by_sections",
           "verify_content_integrity",
           "string_to_filename",
           ]

from . import preprocessor,enumitem,equations,antibugs,core,splitting, text

from typing import List
from pathlib import Path
import os
import re

# Running count of calls to element_to_file_whole / element_to_file_only_begin;
# each of those functions increments it once per call, before writing.
NUM_FILES = 0

# Section hierarchy mapping for splitting.
# Maps a LaTeX sectioning command to its nesting level (smaller = higher up).
# The starred variant of a command shares the level of the unstarred one.
# split_by_sections looks commands up here and falls back to level 999 for
# anything that is not listed.
SECTION_HIERARCHY = {
    "\\part": 0,
    "\\chapter": 1,
    "\\section": 2,
    "\\subsection": 3,
    "\\subsubsection": 4,
    "\\paragraph": 5,
    "\\subparagraph": 6,
    "\\part*": 0,
    "\\chapter*": 1,
    "\\section*": 2,
    "\\subsection*": 3,
    "\\subsubsection*": 4,
    "\\paragraph*": 5,
    "\\subparagraph*": 6,
}


[docs]
def string_to_tree(string:str)->core.Document:
    """
    Build a document tree from a LaTeX source string.

    The input is cleaned up, run through the preprocessor, split into a
    document object and then expanded pass by pass with the searcher/filter
    groups assembled below. The order of the passes is significant and is
    kept exactly as listed.

    Args:
        string (str): The LaTeX source to convert.

    Returns:
        Document: The expanded document, after its ``_finish_up`` step.

    Example:
        ```python
        latex = r\"""\\section{Intro}\\begin{equation}E=mc^2\\end{equation}\"""
        doc = string_to_tree(latex)
        print(doc.to_string())
        ```
    """
    cleaned = antibugs.no_more_bugs_begin(string)
    source = preprocessor.run_preprocessor(cleaned)

    # Commands that are dropped from the output without replacement.
    junk_commands = ["\\sffamily","\\itshape","\\nonumber","\\noindent","\\indent","\\newpage"]

    # One entry per expansion pass; the list literal is evaluated top to
    # bottom, so the helper calls happen in the order written here.
    expansion_passes = [
        *core.get_section_like_filters_top_lvl(),
        text.get_theoremSearchers(source),
        [text.Proof],
        enumitem.get_all_filters(),
        equations.get_all_filters(),
        text.get_all_filters(),
        [core.OneArgumentJunkSearcher(r"\hspace")],
        [core.JunkSearcher(command) for command in junk_commands],
        [text.EqRef, text.Ref, text.Cite],
    ]

    # The result is not used here; the call itself is kept as-is.
    _ = text.get_number_within_equation(source)

    # Only the middle part of the three-way split is processed further.
    _, document, _ = text.Document.split_and_create(source, None)

    for searchers in expansion_passes:
        document.expand(searchers)

    # Final clean-up passes: stray braces first, then escaped spaces.
    brace_searchers = [
        core.JunkSearcher("{", save_split=False),
        core.JunkSearcher("}", save_split=False),
    ]
    document.expand(brace_searchers)
    document.expand([core.JunkSearcher("\\ ", save_split=False)])

    print("processing finished! now the final file will be created.")
    document._finish_up()

    return document




[docs]
def element_to_file_whole(element:core.SectionLike,output_folder:str,file_name:str,output_suffix:str=".md"):
    """
    Write the complete rendering of an element to a single file.

    The target path is ``output_folder + "/" + file_name + output_suffix``.
    The module-level ``NUM_FILES`` counter is incremented once per call,
    before the file is written.

    Args:
        element (SectionLike): Object whose ``to_string()`` result is written.
        output_folder (str): Folder the file is written into. It is not
            created here.
        file_name (str): File name without suffix.
        output_suffix (str, optional): The file suffix. Defaults to ".md".

    Returns:
        list[str]: One-element list holding the written path without its
        suffix.

    Example:
        ```python
        # Save the entire document as 'output/index.md'
        doc = string_to_tree(r"\\section{Intro}")
        element_to_file_whole(doc, "output", "index")  # -> ["output/index"]
        ```
    """
    global NUM_FILES
    NUM_FILES += 1

    # Keep the suffix-less path instead of deriving it afterwards with
    # str.replace, which would also strip the suffix text from the folder or
    # file name if it happens to occur there.
    base_path = output_folder + "/" + file_name
    target_path = base_path + output_suffix

    with open(target_path, "w", encoding="utf-8") as f:
        f.write(element.to_string())
    print(f"File {target_path} created.")
    return [base_path]



[docs]
def element_to_file_only_begin(element:core.SectionLike,output_folder:str,file_name:str,file_names:List[str],output_suffix:str=".md"):
    """
    Write the leading part of an element to a file, followed by a toctree.

    Children of ``element`` are rendered in order until the first child that
    is a ``core.SectionLike``; that child and everything after it are left
    out. A ``toctree`` block listing ``file_names`` is appended. The
    module-level ``NUM_FILES`` counter is incremented once per call, before
    the file is written.

    Args:
        element (SectionLike): Element whose leading children are written.
        output_folder (str): Folder the file is written into. It is not
            created here.
        file_name (str): File name without suffix.
        file_names (List[str]): Entries to list in the toctree, one per line.
        output_suffix (str, optional): The file suffix. Defaults to ".md".

    Returns:
        list[str]: One-element list holding the written path without its
        suffix.

    Example:
        ```python
        # Save only the introduction and generate a toctree for subsections
        doc = string_to_tree(r"\\section{Intro}\\section{Background}")
        element_to_file_only_begin(doc, "output", "index", ["intro", "background"])
        ```
    """
    global NUM_FILES
    NUM_FILES += 1

    # Keep the suffix-less path instead of deriving it afterwards with
    # str.replace, which would also strip the suffix text from the folder or
    # file name if it happens to occur there.
    base_path = output_folder + "/" + file_name
    target_path = base_path + output_suffix

    parts = []
    for child in element.children:
        if isinstance(child, core.SectionLike):
            break
        parts.append(child.to_string())

    parts.append("\n\n")
    parts.append("\n```{toctree}\n")
    # Each entry gets its own line so entries do not run together and the
    # closing fence starts on a fresh line.
    # NOTE(review): assumes core.TAB is plain indentation without a newline.
    parts.extend(core.TAB + f"{child_file_name}" + "\n" for child_file_name in file_names)
    parts.append("```\n")

    with open(target_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    print(f"File {target_path} created.")
    return [base_path]




[docs]
def string_to_filename(name):
    """Turn a section name into a string usable as a file name.

    The name is lower-cased, every character that is not a word character,
    whitespace or a hyphen is dropped, each run of hyphens/whitespace becomes
    a single underscore, and leading/trailing underscores are removed.

    Args:
        name (str): Section name to convert

    Returns:
        str: Sanitized filename, or ``'section'`` if nothing is left
    """
    lowered = name.lower()
    # Drop everything except word characters, whitespace and hyphens.
    kept = re.sub(r'[^\w\s-]', '', lowered)
    # Collapse separator runs into one underscore, then trim the edges.
    joined = re.sub(r'[-\s]+', '_', kept).strip('_')
    if not joined:
        return 'section'
    return joined



[docs]
def split_by_sections(content_string, max_depth=2):
    """
    Parse a rendered document string into a tree of sections.

    Sections are located through three HTML-comment markers built from
    constants in ``core``: a definition marker carrying the command and the
    name, and a begin and an end marker for that command/name pair. A
    definition whose begin or end marker cannot be found after it is skipped.
    Nesting is derived from positions: a section becomes a child of the most
    recent section whose end lies at or after its own end.

    Args:
        content_string (str): The full document string with markers.
        max_depth (int): Accepted for interface compatibility; it is not used
            by the parsing done here.

    Returns:
        dict: The root node (``command`` ``'document'``, ``name`` ``'index'``,
        ``level`` ``-1``). Every section node has the keys ``command``,
        ``name``, ``level``, ``content`` (text from its definition marker
        through its end marker), ``start_pos``, ``begin_pos``, ``end_pos``,
        ``children`` and ``child_files``.

        If no section is found, the root's ``content`` is the whole input.
        Otherwise the root's ``content`` stays empty and the root gets
        ``content_chunks``: an ordered list of ``('preamble', str)``,
        ``('section', node)``, ``('inter', str)`` and ``('epilogue', str)``
        tuples covering the top-level sections and the text around them.
    """
    # Escape the splitter so that any regex metacharacters in it are matched
    # literally instead of being interpreted as pattern syntax.
    splitter = re.escape(core.SEC_DEF_SPLITTER)
    def_pattern = f"<!-- {splitter}(.*?){splitter}(.*?){splitter} -->"

    sections = []

    for match in re.finditer(def_pattern, content_string):
        command = match.group(1).strip()
        name = match.group(2).strip()
        level = SECTION_HIERARCHY.get(command, 999)

        # Debug: warn if command not found in hierarchy
        if level == 999:
            print(f"Warning: Unknown section command '{command}' (repr: {repr(command)}) - treating as level 999")
            print(f"  Available commands: {list(SECTION_HIERARCHY.keys())}")

        begin_marker = f"<!-- {core.SEC_PREFIX_BEGIN}{command}{name} -->"
        end_marker = f"<!-- {core.SEC_PREFIX_END}{command}{name} -->"

        # Both markers are searched forward from the definition marker; a
        # section is only recorded when both are present.
        begin_pos = content_string.find(begin_marker, match.start())
        if begin_pos == -1:
            continue
        end_pos = content_string.find(end_marker, begin_pos)
        if end_pos == -1:
            continue

        section_end = end_pos + len(end_marker)
        sections.append({
            'command': command,
            'name': name,
            'level': level,
            # Full text from the definition marker through the end marker.
            'content': content_string[match.start():section_end],
            'start_pos': match.start(),
            'begin_pos': begin_pos,
            'end_pos': section_end,
            'children': [],
            'child_files': []  # Filled in when files are written
        })

    root = {'command': 'document', 'name': 'index', 'level': -1, 'content': '', 'children': [], 'child_files': []}

    if not sections:
        root['content'] = content_string
        return root

    # Build the hierarchy with a stack of currently open ancestors. The root
    # has no 'end_pos' and is never popped.
    stack = [root]
    for section in sections:
        while len(stack) > 1 and section['end_pos'] > stack[-1]['end_pos']:
            stack.pop()
        stack[-1]['children'].append(section)
        stack.append(section)

    # The first section always attaches to the root, so there is at least one
    # top-level section from here on.
    top_level_sections = root['children']
    content_chunks = []

    # Text before the first top-level section.
    first_start = top_level_sections[0]['start_pos']
    if first_start > 0:
        content_chunks.append(('preamble', content_string[:first_start]))

    # Top-level sections, interleaved with any text between neighbours.
    last_index = len(top_level_sections) - 1
    for i, section in enumerate(top_level_sections):
        content_chunks.append(('section', section))
        if i < last_index:
            inter_content = content_string[section['end_pos']:top_level_sections[i + 1]['start_pos']]
            if inter_content:
                content_chunks.append(('inter', inter_content))

    # Text after the last top-level section.
    epilogue = content_string[top_level_sections[-1]['end_pos']:]
    if epilogue:
        content_chunks.append(('epilogue', epilogue))

    root['content_chunks'] = content_chunks
    return root


def extract_section_content(section):
    """Split a section's text at the second begin-marker prefix it contains.

    The first occurrence of the begin-marker prefix is taken to be the
    section's own marker and the next one to be that of its first child.
    Everything before the second occurrence is the section's own content.

    Args:
        section (dict): Section structure with ``content`` and ``children``

    Returns:
        tuple: (own_content, remaining_content). ``remaining_content`` is
        ``''`` and ``own_content`` is the full content when the section has
        no children or fewer than two marker prefixes are found.
    """
    whole = section['content']

    if section['children']:
        marker_prefix = f"<!-- {core.SEC_PREFIX_BEGIN}"
        own_marker_at = whole.find(marker_prefix)
        if own_marker_at != -1:
            # Continue searching just past the section's own marker prefix.
            child_marker_at = whole.find(marker_prefix, own_marker_at + len(marker_prefix))
            if child_marker_at != -1:
                return whole[:child_marker_at], whole[child_marker_at:]

    return whole, ''

def write_section_files(section, output_folder, max_depth, current_depth=0, output_suffix=".md", used_filenames=None):
    """
    Recursively write a section and its children to files.

    A section is split when ``current_depth < max_depth`` and it has more
    than one child: its file then holds only its own content plus a toctree
    listing the files of its children, which are written recursively.
    Otherwise its full content, nested sections included, goes into one file.

    The document root produced by ``split_by_sections`` keeps its text in
    ``content_chunks`` rather than ``content``, so it is handled as follows:
    unsplit, the reconstructed full text is written; split, the preamble is
    written before the toctree and any other non-section text after it.

    Args:
        section (dict): Section structure
        output_folder (str): Output directory (created if missing)
        max_depth (int): Maximum splitting depth
        current_depth (int): Current recursion depth
        output_suffix (str): File extension
        used_filenames (set, optional): File names already handed out during
            this run. Callers normally leave it unset; it is shared across
            the recursion so that sections whose names sanitize to the same
            file name get distinct files instead of overwriting each other.

    Returns:
        str: Filename of created file (without extension)
    """
    os.makedirs(output_folder, exist_ok=True)

    if used_filenames is None:
        used_filenames = set()

    # Reserve a unique name before recursing, so a child with the same title
    # cannot claim the parent's file. Repeats get a numeric suffix.
    base_name = string_to_filename(section['name'])
    filename = base_name
    counter = 2
    while filename in used_filenames:
        filename = f"{base_name}_{counter}"
        counter += 1
    used_filenames.add(filename)

    filepath = os.path.join(output_folder, filename + output_suffix)

    children = section['children']
    should_split = current_depth < max_depth and len(children) > 1

    if should_split:
        chunks = section.get('content_chunks')
        if chunks is not None:
            # Document root: its own text lives in the non-section chunks.
            own_content = "".join(data for kind, data in chunks if kind == 'preamble')
            trailing = "".join(
                data for kind, data in chunks if kind not in ('preamble', 'section')
            )
        else:
            own_content, _ = extract_section_content(section)
            trailing = ''

        # Children are written before this file is opened, so no handle
        # stays open across the recursion.
        child_files = [
            write_section_files(
                child,
                output_folder,
                max_depth,
                current_depth + 1,
                output_suffix,
                used_filenames,
            )
            for child in children
        ]

        # Store child filenames in section structure
        section['child_files'] = child_files

        pieces = [own_content.strip() + "\n\n", "```{toctree}\n", ":maxdepth: 2\n\n"]
        pieces.extend(f"{child_file}\n" for child_file in child_files)
        pieces.append("```\n")
        if trailing.strip():
            pieces.append("\n" + trailing.strip() + "\n")
        file_text = "".join(pieces)
    else:
        # Full content; for the document root this is rebuilt from its chunks.
        file_text = reconstruct_content_from_structure(section).strip() + "\n"

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(file_text)

    print(f"Created: {filepath}")
    return filename

def reconstruct_content_from_structure(section):
    """
    Rebuild the text represented by a section structure.

    For a node whose ``command`` is ``'document'`` and which carries
    ``content_chunks``, the chunks are concatenated in order: a ``'section'``
    chunk contributes the ``content`` of its node, any other chunk
    contributes its string as-is. Every other node yields its own
    ``content``.

    Args:
        section (dict): Section structure with content and children

    Returns:
        str: Reconstructed content (``''`` if the node has no ``content``)
    """
    is_document = section.get('command') == 'document'
    if not (is_document and 'content_chunks' in section):
        return section.get('content', '')

    return ''.join(
        payload['content'] if kind == 'section' else payload
        for kind, payload in section['content_chunks']
    )


[docs]
def verify_content_integrity(original_content, structure):
    """
    Verify that the split structure contains all original content.

    The structure is turned back into text with
    ``reconstruct_content_from_structure`` and compared with the original.
    On a mismatch the message reports the first position where the two texts
    diverge, with up to 50 characters of context on either side. If one text
    is a strict prefix of the other, that position is the length of the
    shorter text.

    Args:
        original_content (str): Original document string
        structure (dict): Parsed section structure

    Returns:
        tuple: (is_valid, message, stats) where ``stats`` has the keys
        ``original_length``, ``reconstructed_length``, ``difference``
        (original minus reconstructed length) and ``match``.
    """
    reconstructed = reconstruct_content_from_structure(structure)

    stats = {
        'original_length': len(original_content),
        'reconstructed_length': len(reconstructed),
        'difference': len(original_content) - len(reconstructed),
        'match': original_content == reconstructed
    }

    if stats['match']:
        return True, "✓ Content integrity verified: All content preserved!", stats

    message = f"✗ Content mismatch: {stats['difference']} character difference"

    # First index at which the two texts hold different characters.
    first_diff = next(
        (i for i, (c1, c2) in enumerate(zip(original_content, reconstructed)) if c1 != c2),
        None,
    )
    if first_diff is None:
        # No differing character in the common part: one text is a prefix of
        # the other, so they diverge where the shorter one ends.
        first_diff = min(len(original_content), len(reconstructed))

    start = max(0, first_diff - 50)
    end = first_diff + 50  # slicing clamps to each string's own length
    message += f"\n  First difference at position {first_diff}:"
    message += f"\n  Original: ...{original_content[start:end]}..."
    message += f"\n  Reconstructed: ...{reconstructed[start:end]}..."

    return False, message, stats



[docs]
def split_document_to_files(document_md, output_folder, depth=2, output_suffix=".md", verify=True):
    """
    Render a document tree and write it out as a hierarchy of MyST files.

    The document is rendered with ``to_string()``, parsed into a section
    structure, optionally checked for content integrity (a mismatch is only
    reported, files are written regardless) and then written with
    ``write_section_files`` starting at recursion depth 0.

    Args:
        document_md: Document tree object (from string_to_tree)
        output_folder (str): Output directory path
        depth (int): Maximum splitting depth passed on to the parser and the
            file writer
        output_suffix (str): File extension
        verify (bool): Verify content integrity after parsing

    Returns:
        dict: Root structure; sections that were split carry their children's
        file names in ``child_files``

    Example:
        ```python
        # Convert and split a document
        doc = string_to_tree(latex_string)
        structure = split_document_to_files(doc, "./output", depth=2, verify=True)
        ```
    """
    rendered = document_md.to_string()
    structure = split_by_sections(rendered, depth)

    if verify:
        intact, report, numbers = verify_content_integrity(rendered, structure)
        print(
            f"\n{report}",
            f"  Original: {numbers['original_length']:,} chars",
            f"  Reconstructed: {numbers['reconstructed_length']:,} chars",
            sep="\n",
        )
        if not intact:
            print("\n⚠ Warning: Proceeding with file creation despite content mismatch")

    # Always start writing at the root with recursion depth 0.
    write_section_files(structure, output_folder, depth, 0, output_suffix)

    print(f"\n✓ Document split into files in: {output_folder}")
    return structure



[docs]
def process_string(output_folder:str, string:str, depth=2, output_suffix:str=".md", verify=True):
    """
    Convert a LaTeX string and write the result as hierarchical MyST files.

    The arguments are validated first, in the order ``depth``,
    ``output_folder``, ``string``. The string is then turned into a document
    tree with ``string_to_tree`` and handed to ``split_document_to_files``.

    Args:
        output_folder (str): The output folder path.
        string (str): The input LaTeX string.
        depth (int, optional): Maximum splitting depth. Defaults to 2.
        output_suffix (str, optional): The file suffix. Defaults to ".md".
        verify (bool, optional): Verify content integrity after parsing. Defaults to True.

    Returns:
        dict: Root structure as returned by ``split_document_to_files``

    Raises:
        ValueError: If ``depth`` is not a non-negative integer, or if
            ``output_folder`` or ``string`` is not a ``str``.

    Example:
        ```python
        # Process a LaTeX string and split into hierarchical files
        latex = r\"""\\chapter{Intro}\\section{Background}\\subsection{Details}\"""
        structure = process_string("output", latex, depth=2)
        # The root file is written as output/index.md
        ```
    """
    depth_is_valid = isinstance(depth, int) and depth >= 0
    if not depth_is_valid:
        raise ValueError("depth must be a non-negative integer")

    # Both remaining arguments must be plain strings; checked in this order.
    string_checks = (
        (output_folder, "output_folder must be a string"),
        (string, "string must be a string"),
    )
    for value, complaint in string_checks:
        if not isinstance(value, str):
            raise ValueError(complaint)

    tree = string_to_tree(string)
    return split_document_to_files(
        tree,
        output_folder,
        depth=depth,
        output_suffix=output_suffix,
        verify=verify,
    )