# Names exported by ``from <this module> import *``.
__all__ = [
    "string_to_tree",
    "process_string",
    "element_to_file_whole",
    "element_to_file_only_begin",
    "split_document_to_files",
    "split_by_sections",
    "verify_content_integrity",
    "string_to_filename",
]
from . import preprocessor,enumitem,equations,antibugs,core,splitting, text
from typing import List
from pathlib import Path
import os
import re
# Running count of files written by the element_to_file_* helpers.
NUM_FILES = 0

# Nesting level of each LaTeX sectioning command (0 = outermost).
# The starred variant of a command shares the level of the plain one.
# Key order: all plain commands first, then all starred commands.
SECTION_HIERARCHY = {
    command + star: level
    for star in ("", "*")
    for level, command in enumerate((
        "\\part",
        "\\chapter",
        "\\section",
        "\\subsection",
        "\\subsubsection",
        "\\paragraph",
        "\\subparagraph",
    ))
}
[docs]
def string_to_tree(string:str)->core.Document:
    """
    Build a document tree from a LaTeX source string.

    The source is first passed through ``antibugs.no_more_bugs_begin`` and
    ``preprocessor.run_preprocessor``. The cleaned text is turned into a
    document via ``text.Document.split_and_create`` and then expanded with
    one group of searchers after another, in a fixed order.

    Args:
        string (str): The input string to process.

    Returns:
        Document: The expanded document (the middle element returned by
        ``text.Document.split_and_create``) after ``_finish_up()`` ran.

    Example:
        ```python
        latex = r"\\section{Intro}\\begin{equation}E=mc^2\\end{equation}"
        doc = string_to_tree(latex)
        print(doc.to_string())
        ```
    """
    source = antibugs.no_more_bugs_begin(string)
    source = preprocessor.run_preprocessor(source)

    junk_commands = ("\\sffamily", "\\itshape", "\\nonumber", "\\noindent", "\\indent", "\\newpage")

    # Every entry is one group of searchers handed to a single expand() call.
    # The groups are applied in list order, so the order matters.
    expansion_passes = [
        *core.get_section_like_filters_top_lvl(),
        text.get_theoremSearchers(source),
        [text.Proof],
        enumitem.get_all_filters(),
        equations.get_all_filters(),
        text.get_all_filters(),
        [core.OneArgumentJunkSearcher(r"\hspace")],
        [core.JunkSearcher(command) for command in junk_commands],
        [text.EqRef, text.Ref, text.Cite],
    ]

    # The result is not used in this function; the call itself is kept
    # unchanged. NOTE(review): confirm whether it is needed for side effects.
    text.get_number_within_equation(source)

    # Only the middle part of the split is processed further.
    _, document, _ = text.Document.split_and_create(source, None)

    for searcher_group in expansion_passes:
        document.expand(searcher_group)

    # Final passes: bare braces first, then the "\ " command.
    document.expand([core.JunkSearcher(brace, save_split=False) for brace in ("{", "}")])
    document.expand([core.JunkSearcher("\\ ", save_split=False)])

    print("processing finished! now the final file will be created.")
    document._finish_up()
    return document
[docs]
def element_to_file_whole(element:core.SectionLike,output_folder:str,file_name:str,output_suffix:str=".md"):
    """
    Write the complete rendering of ``element`` to a single file.

    The target path is ``output_folder + "/" + file_name + output_suffix``.
    The module-level ``NUM_FILES`` counter is incremented on every call.

    Args:
        element (SectionLike): The element to write; its ``to_string()``
            result becomes the file content.
        output_folder (str): The output folder path (must already exist).
        file_name (str): The file name without suffix.
        output_suffix (str, optional): The file suffix. Defaults to ".md".

    Returns:
        list: One-element list containing the written path without the
        suffix, i.e. ``output_folder + "/" + file_name``.

    Example:
        ```python
        # Save the entire document as 'output/index.md'
        doc = string_to_tree(r"\\section{Intro}")
        element_to_file_whole(doc, "output", "index")
        ```
    """
    global NUM_FILES
    NUM_FILES += 1
    # Keep the suffix-less path around instead of stripping the suffix with
    # str.replace() afterwards: replace() removes *every* occurrence, which
    # corrupts paths such as "notes.md/index.md" or "a.md.md".
    base_path = output_folder+"/"+file_name
    target_path = base_path+output_suffix
    with open(target_path,"w",encoding="utf-8") as f:
        f.write(element.to_string())
    print(f"File {target_path} created.")
    return [base_path]
[docs]
def element_to_file_only_begin(element:core.SectionLike,output_folder:str,file_name:str,file_names:List[str],output_suffix:str=".md"):
    """
    Write only the leading part of ``element`` to a file, followed by a toctree.

    The leading part consists of all children of ``element`` that come before
    its first ``core.SectionLike`` child. After it, a ``{toctree}`` block is
    written that lists ``file_names``, one entry per line. The module-level
    ``NUM_FILES`` counter is incremented on every call.

    Args:
        element (SectionLike): The element whose leading children are written.
        output_folder (str): The output folder path (must already exist).
        file_name (str): The file name without suffix.
        file_names (List[str]): Entries to list inside the toctree.
        output_suffix (str, optional): The file suffix. Defaults to ".md".

    Returns:
        list: One-element list containing the written path without the
        suffix, i.e. ``output_folder + "/" + file_name``.

    Example:
        ```python
        # Save only the introduction and generate a toctree for subsections
        doc = string_to_tree(r"\\section{Intro}\\section{Background}")
        element_to_file_only_begin(doc, "output", "index", ["intro", "background"])
        ```
    """
    global NUM_FILES
    NUM_FILES += 1
    # Build the suffix-less path first; stripping the suffix afterwards with
    # str.replace() would also remove it from the folder or file name.
    base_path = output_folder+"/"+file_name
    target_path = base_path+output_suffix

    parts = []
    for child in element.children:
        # Everything from the first section-like child on belongs to other files.
        if isinstance(child,core.SectionLike):
            break
        parts.append(child.to_string())
    parts.append("\n\n")
    parts.append("\n```{toctree}\n")
    # One entry per line. Without the trailing newline all entries, and the
    # closing fence, would end up on a single line and break the directive.
    parts.extend(core.TAB+f"{child_file_name}\n" for child_file_name in file_names)
    parts.append("```\n")

    with open(target_path,"w",encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"File {target_path} created.")
    return [base_path]
[docs]
def string_to_filename(name):
    """Turn a section name into a string usable as a file name.

    The name is lower-cased, every character that is not a word character,
    whitespace or a hyphen is dropped, and each run of hyphens/whitespace is
    collapsed into one underscore. Leading and trailing underscores are
    removed.

    Args:
        name (str): Section name to convert

    Returns:
        str: Sanitized filename, or ``'section'`` if nothing is left
    """
    lowered = name.lower()
    # Step 1: drop everything except word characters, whitespace and hyphens.
    kept = re.sub(r'[^\w\s-]', '', lowered)
    # Step 2: hyphen/whitespace runs become a single underscore.
    joined = re.sub(r'[-\s]+', '_', kept).strip('_')
    if not joined:
        return 'section'
    return joined
[docs]
def split_by_sections(content_string, max_depth=2):
    """
    Split a document string into a hierarchy of sections using comment markers.

    Every ``<!-- {SEC_DEF_SPLITTER}command{SEC_DEF_SPLITTER}name{SEC_DEF_SPLITTER} -->``
    definition marker is looked up together with its matching
    ``SEC_PREFIX_BEGIN`` / ``SEC_PREFIX_END`` markers. A definition whose
    begin or end marker cannot be found is ignored. Sections are nested by
    position: a section becomes a child of the closest preceding section
    whose end lies at or after its own end.

    Args:
        content_string (str): The full document string with MyST markers
        max_depth (int): Accepted for API compatibility; it is not used
            while parsing (the depth limit is applied when files are written).

    Returns:
        dict: Root node with keys ``command`` ('document'), ``name``
        ('index'), ``level`` (-1), ``content``, ``children`` and
        ``child_files``. Each section node additionally has ``start_pos``,
        ``begin_pos`` and ``end_pos``. If no section was found, the root's
        ``content`` is the whole input. Otherwise ``content`` stays empty and
        ``content_chunks`` holds ``(kind, data)`` tuples in document order,
        where kind is 'preamble', 'section', 'inter' or 'epilogue' and data
        is the section dict for 'section' and a string otherwise.
    """
    # Escape the splitter so it is matched literally even if it contains
    # characters that are special in regular expressions.
    splitter = re.escape(core.SEC_DEF_SPLITTER)
    def_pattern = re.compile(f"<!-- {splitter}(.*?){splitter}(.*?){splitter} -->")

    sections = []
    for match in def_pattern.finditer(content_string):
        command = match.group(1).strip()
        name = match.group(2).strip()
        level = SECTION_HIERARCHY.get(command, 999)
        # Debug aid: warn if the command is not part of the known hierarchy.
        if level == 999:
            print(f"Warning: Unknown section command '{command}' (repr: {repr(command)}) - treating as level 999")
            print(f" Available commands: {list(SECTION_HIERARCHY.keys())}")

        begin_marker = f"<!-- {core.SEC_PREFIX_BEGIN}{command}{name} -->"
        end_marker = f"<!-- {core.SEC_PREFIX_END}{command}{name} -->"

        # Both markers are searched forward from the definition marker;
        # a definition without a complete begin/end pair is skipped.
        begin_pos = content_string.find(begin_marker, match.start())
        if begin_pos == -1:
            continue
        end_pos = content_string.find(end_marker, begin_pos)
        if end_pos == -1:
            continue

        section_end = end_pos + len(end_marker)
        sections.append({
            'command': command,
            'name': name,
            'level': level,
            # Full text from the definition marker through the end marker.
            'content': content_string[match.start():section_end],
            'start_pos': match.start(),
            'begin_pos': begin_pos,
            'end_pos': section_end,
            'children': [],
            'child_files': []  # Filled in when the files are written
        })

    root = {'command': 'document', 'name': 'index', 'level': -1, 'content': '', 'children': [], 'child_files': []}
    if not sections:
        root['content'] = content_string
        return root

    # Build the hierarchy with a stack of currently "open" sections.
    stack = [root]
    for section in sections:
        # Close every open section that ends before this one does; what
        # remains on top is the parent (the root is never popped).
        while len(stack) > 1 and section['end_pos'] > stack[-1]['end_pos']:
            stack.pop()
        stack[-1]['children'].append(section)
        stack.append(section)

    # The first section is always attached to the root, so this is non-empty.
    top_level_sections = root['children']

    # Describe the whole document as top-level sections plus the loose text
    # before, between and after them.
    content_chunks = []
    first_start = top_level_sections[0]['start_pos']
    if first_start > 0:
        content_chunks.append(('preamble', content_string[:first_start]))

    last_index = len(top_level_sections) - 1
    for index, section in enumerate(top_level_sections):
        content_chunks.append(('section', section))
        if index < last_index:
            inter_content = content_string[section['end_pos']:top_level_sections[index + 1]['start_pos']]
            if inter_content:
                content_chunks.append(('inter', inter_content))

    epilogue = content_string[top_level_sections[-1]['end_pos']:]
    if epilogue:
        content_chunks.append(('epilogue', epilogue))

    root['content_chunks'] = content_chunks
    return root
def extract_section_content(section):
    """Separate a section's leading text from the part that starts its children.

    The cut is made at the second occurrence of the generic
    ``<!-- {core.SEC_PREFIX_BEGIN}`` marker prefix inside the section's
    content; the first occurrence is taken to be the section's own begin
    marker.

    Args:
        section (dict): Section structure with 'content' and 'children'

    Returns:
        tuple: (own_content, remaining_content). If the section has no
        children, or fewer than two marker occurrences are found, this is
        ``(content, '')``.
    """
    text = section['content']
    if section['children']:
        marker = f"<!-- {core.SEC_PREFIX_BEGIN}"
        own_begin = text.find(marker)
        if own_begin != -1:
            # Look for the next begin marker after the section's own one.
            next_begin = text.find(marker, own_begin + len(marker))
            if next_begin != -1:
                return text[:next_begin], text[next_begin:]
    return text, ''
def _flatten_content_chunks(content_chunks):
    """Return (leading_text, trailing_text, full_text) for a root's content chunks.

    ``leading_text`` is the loose text before the first 'section' chunk,
    ``trailing_text`` is all loose text after it (between and after
    sections), and ``full_text`` is every chunk concatenated in order.
    """
    leading, trailing, everything = [], [], []
    seen_section = False
    for chunk_type, chunk_data in content_chunks:
        if chunk_type == 'section':
            seen_section = True
            everything.append(chunk_data['content'])
            continue
        everything.append(chunk_data)
        (trailing if seen_section else leading).append(chunk_data)
    return ''.join(leading), ''.join(trailing), ''.join(everything)


def write_section_files(section, output_folder, max_depth, current_depth=0, output_suffix=".md"):
    """
    Recursively write a section and its children to files.

    A section is split (own text + toctree, one file per child) when
    ``current_depth < max_depth`` and it has more than one child; otherwise
    its complete content goes into a single file. The file name is derived
    from the section name with ``string_to_filename``.

    The document root keeps its text in 'content_chunks' while its 'content'
    is empty, so the root is written from the chunks. When split, the text
    before the first section precedes the toctree and any loose text after it
    follows the toctree; when not split, the full text is written.

    Args:
        section (dict): Section structure
        output_folder (str): Output directory (created if missing)
        max_depth (int): Maximum splitting depth
        current_depth (int): Current recursion depth
        output_suffix (str): File extension

    Returns:
        str: Filename of created file (without extension)
    """
    os.makedirs(output_folder, exist_ok=True)
    filename = string_to_filename(section['name'])
    filepath = os.path.join(output_folder, filename + output_suffix)
    should_split = current_depth < max_depth and len(section['children']) > 1

    # Same test that reconstruct_content_from_structure uses for the root.
    is_chunked_root = section.get('command') == 'document' and 'content_chunks' in section
    if is_chunked_root:
        leading, trailing, full_content = _flatten_content_chunks(section['content_chunks'])
    else:
        leading, trailing, full_content = None, '', section['content']

    with open(filepath, 'w', encoding='utf-8') as f:
        if should_split:
            if is_chunked_root:
                own_content = leading
            else:
                # Only this section's text before its first child.
                own_content, _ = extract_section_content(section)
            f.write(own_content.strip() + "\n\n")

            # Toctree listing the child files.
            f.write("```{toctree}\n")
            f.write(":maxdepth: 2\n\n")
            child_files = [
                write_section_files(child, output_folder, max_depth, current_depth + 1, output_suffix)
                for child in section['children']
            ]
            # Remember the child file names on the structure for callers.
            section['child_files'] = child_files
            for child_file in child_files:
                f.write(f"{child_file}\n")
            f.write("```\n")

            # Loose root text between/after sections would otherwise be lost.
            trailing = trailing.strip()
            if trailing:
                f.write("\n" + trailing + "\n")
        else:
            # Not split: the file holds the complete content.
            f.write(full_content.strip() + "\n")

    print(f"Created: {filepath}")
    return filename
def reconstruct_content_from_structure(section):
    """
    Rebuild the text represented by a section structure.

    For the document root (``command == 'document'``) that carries
    'content_chunks', the chunks are concatenated in order: 'section' chunks
    contribute their dict's 'content', every other chunk is used as text.
    Any other structure simply yields its 'content'. Children are not
    visited.

    Args:
        section (dict): Section structure with content and children

    Returns:
        str: Reconstructed content ('' if there is no 'content' key)
    """
    is_chunked_root = section.get('command') == 'document' and 'content_chunks' in section
    if not is_chunked_root:
        return section.get('content', '')

    pieces = []
    for kind, payload in section['content_chunks']:
        pieces.append(payload['content'] if kind == 'section' else payload)
    return ''.join(pieces)
[docs]
def verify_content_integrity(original_content, structure):
    """
    Verify that the split structure contains all original content.

    The structure is turned back into text with
    ``reconstruct_content_from_structure`` and compared with the original.
    On a mismatch the message includes an excerpt around the first position
    where the two texts diverge. This also covers the case where one text is
    a strict prefix of the other; the divergence is then reported at the end
    of the shorter text.

    Args:
        original_content (str): Original document string
        structure (dict): Parsed section structure

    Returns:
        tuple: (is_valid, message, stats) where stats is a dict with the keys
        'original_length', 'reconstructed_length', 'difference'
        (original minus reconstructed length) and 'match'.
    """
    reconstructed = reconstruct_content_from_structure(structure)
    is_valid = original_content == reconstructed
    stats = {
        'original_length': len(original_content),
        'reconstructed_length': len(reconstructed),
        'difference': len(original_content) - len(reconstructed),
        'match': is_valid
    }
    if is_valid:
        return True, "✓ Content integrity verified: All content preserved!", stats

    message = f"✗ Content mismatch: {stats['difference']} character difference"

    # First index with differing characters. If the common prefix is
    # identical, the texts diverge where the shorter one ends.
    shared_length = min(len(original_content), len(reconstructed))
    first_diff = next(
        (i for i, (c1, c2) in enumerate(zip(original_content, reconstructed)) if c1 != c2),
        shared_length,
    )
    start = max(0, first_diff - 50)
    end = min(len(original_content), first_diff + 50)
    message += f"\n First difference at position {first_diff}:"
    message += f"\n Original: ...{original_content[start:end]}..."
    message += f"\n Reconstructed: ...{reconstructed[start:end]}..."
    return False, message, stats
[docs]
def split_document_to_files(document_md, output_folder, depth=2, output_suffix=".md", verify=True):
    """
    Render a document tree and write it as a hierarchy of MyST files.

    The document is rendered with ``to_string()``, parsed into a section
    structure by ``split_by_sections`` and written out by
    ``write_section_files``. With ``verify`` enabled the structure is checked
    against the rendered text first; a mismatch is only reported, the files
    are written regardless.

    Args:
        document_md: Document tree object (from string_to_tree)
        output_folder (str): Output directory path
        depth (int): Splitting depth (0=no split, 1=chapter, 2=section, etc.)
        output_suffix (str): File extension
        verify (bool): Verify content integrity after parsing

    Returns:
        dict: Root structure with child_files tracking for all sections

    Example:
        ```python
        # Convert and split a document
        doc = string_to_tree(latex_string)
        structure = split_document_to_files(doc, "./output", depth=2, verify=True)
        # Each section in structure has 'child_files' list
        ```
    """
    rendered = document_md.to_string()
    structure = split_by_sections(rendered, depth)

    if verify:
        intact, report, counts = verify_content_integrity(rendered, structure)
        print(f"\n{report}")
        print(f" Original: {counts['original_length']:,} chars")
        print(f" Reconstructed: {counts['reconstructed_length']:,} chars")
        if not intact:
            # A failed check is reported but does not stop the export.
            print("\n⚠ Warning: Proceeding with file creation despite content mismatch")

    # Recursion starts at depth 0 with the root structure.
    write_section_files(structure, output_folder, depth, current_depth=0, output_suffix=output_suffix)
    print(f"\n✓ Document split into files in: {output_folder}")
    return structure
[docs]
def process_string(output_folder:str, string:str, depth=2, output_suffix:str=".md", verify=True):
    """
    Convert a LaTeX string and write the result as hierarchical MyST files.

    After validating the arguments, the string is converted with
    ``string_to_tree`` and the resulting document is handed to
    ``split_document_to_files``.

    Args:
        output_folder (str): The output folder path.
        string (str): The input LaTeX string.
        depth (int, optional): Splitting depth (0=no split, 1=chapter, 2=section, etc.). Defaults to 2.
        output_suffix (str, optional): The file suffix. Defaults to ".md".
        verify (bool, optional): Verify content integrity after parsing. Defaults to True.

    Returns:
        dict: Root structure with child_files tracking for all sections

    Raises:
        ValueError: If ``depth`` is not a non-negative integer, or if
            ``output_folder`` or ``string`` is not a ``str``. The arguments
            are checked in that order.

    Example:
        ```python
        # Process a LaTeX string and split into hierarchical files
        latex = r"\\chapter{Intro}\\section{Background}\\subsection{Details}"
        structure = process_string("output", latex, depth=2)
        # Writes output/index.md; sections that are split get their own files
        ```
    """
    depth_is_valid = isinstance(depth, int) and depth >= 0
    if not depth_is_valid:
        raise ValueError("depth must be a non-negative integer")
    if not isinstance(output_folder, str):
        raise ValueError("output_folder must be a string")
    if not isinstance(string, str):
        raise ValueError("string must be a string")

    # LaTeX -> document tree -> files on disk.
    tree = string_to_tree(string)
    return split_document_to_files(
        tree,
        output_folder,
        depth=depth,
        output_suffix=output_suffix,
        verify=verify,
    )