-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPythonToLatexMainFile.py
94 lines (74 loc) · 3.73 KB
/
PythonToLatexMainFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import re
from docx import Document
from pylatexenc.latex2text import LatexNodes2Text
import logging
# Set up logging for debugging: records at DEBUG level and above are sent to
# the log file named below, each formatted as "<timestamp> <level> <message>".
# NOTE(review): the path is a raw string that also doubles its backslashes, so
# it contains literal "\\" separators -- presumably tolerated by Windows; confirm.
logging.basicConfig(filename=r'C:\\Users\\latou\\Desktop\\LatexToWordProject\\latex_processing.log', level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
# Unicode stand-ins for common LaTeX math commands. \frac is deliberately
# absent so that fractions reach the output untouched.
_LATEX_REPLACEMENTS = {
    r'\cdot': '·',    # Multiplication dot
    r'\times': '×',   # Multiplication cross
    r'\div': '÷',     # Division symbol
    r'\pm': '±',      # Plus-minus
    r'\approx': '≈',  # Approximately equal
    r'\leq': '≤',     # Less than or equal to
    r'\geq': '≥',     # Greater than or equal to
    r'\sqrt': '√',    # Square root
}

# Matches one of the commands above only when it is a complete control word,
# i.e. not followed by another letter. A plain substring replace would corrupt
# longer commands that merely start the same way (\cdots, \pmod, \leqslant...).
_LATEX_SYMBOL_RE = re.compile(
    '(?:'
    + '|'.join(re.escape(command) for command in _LATEX_REPLACEMENTS)
    + r')(?![A-Za-z])'
)


def clean_latex_text(latex_string):
    """Replace common LaTeX math commands in *latex_string* with Unicode symbols.

    Only the commands listed in ``_LATEX_REPLACEMENTS`` are substituted, and
    only when they appear as whole commands; everything else -- including
    ``\\frac{numerator}{denominator}`` and longer commands sharing a prefix
    with a listed one -- is returned unchanged. No further LaTeX processing
    is applied.

    Args:
        latex_string: The LaTeX source of a single equation or text fragment.

    Returns:
        The input with the known commands replaced by their Unicode symbols.
    """
    logging.debug("Original LaTeX string: %s", latex_string)
    latex_text = _LATEX_SYMBOL_RE.sub(
        lambda match: _LATEX_REPLACEMENTS[match.group(0)],
        latex_string,
    )
    logging.debug("Cleaned LaTeX string: %s", latex_text)
    return latex_text
# Function to clean up hidden characters
def clean_latex_input(latex_string):
    """Normalise whitespace and strip invisible characters from *latex_string*.

    Non-breaking spaces, newlines and tabs each become a single regular space;
    zero-width characters (U+200B through U+200D, and U+FEFF) are removed.
    The cleaned string is logged at DEBUG level and returned.
    """
    # Characters that should each collapse to one ordinary space.
    for spacing_char in (chr(160), "\n", "\t"):
        latex_string = latex_string.replace(spacing_char, " ")

    # Invisible characters are dropped without leaving anything behind.
    latex_string = re.sub(r'[\u200B-\u200D\uFEFF]', '', latex_string)

    logging.debug(f"Cleaned hidden characters from LaTeX input: {latex_string}")
    return latex_string
# Function to split the continuous LaTeX string into individual equations
def split_latex_equations(latex_string):
    """Split *latex_string* on unescaped ``$`` delimiters.

    A ``$`` preceded by a backslash is not treated as a delimiter. The raw
    split result is logged at DEBUG level; the returned list holds each piece
    with surrounding whitespace stripped, omitting pieces that are empty
    after stripping.
    """
    equations = re.split(r'(?<!\\)\$', latex_string)
    logging.debug(f"Split equations: {equations}")
    return [piece for piece in map(str.strip, equations) if piece]
# Driver: read LaTeX text from the Word document, convert it, and write the
# result to a text file.
# Input document and output text file locations.
doc_path = r'C:\\Users\\latou\\Desktop\\LatexToWordProject\\LatexTestWord.docx'
output_file_path = r'C:\\Users\\latou\\Desktop\\LatexToWordProject\\latex_output.txt'  # Corrected output path

if not os.path.exists(doc_path):
    logging.error(f"Document not found: {doc_path}")
    # Terminate with a non-zero status so the calling process can tell that
    # nothing was produced. SystemExit is raised directly because the exit()
    # helper is installed by the site module for interactive use and is not
    # guaranteed to exist in every interpreter configuration.
    raise SystemExit(1)

doc = Document(doc_path)

# Extract text from the Word document, one line per paragraph. A single join
# avoids rebuilding the string on every paragraph.
latex_string = "".join(para.text + "\n" for para in doc.paragraphs)
logging.debug(f"Extracted LaTeX from Word document: {latex_string}")

# Clean the LaTeX input to remove hidden characters
latex_string = clean_latex_input(latex_string)

# Split the continuous LaTeX input into separate equations
equations = split_latex_equations(latex_string)

# Process each equation separately
results = [clean_latex_text(eq) for eq in equations]

# Join the results with newline characters and prepare the output string
final_output = '\n'.join(results)

# Output the final cleaned LaTeX directly (for VBA to capture)
try:
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(final_output)
except Exception as e:
    # Best-effort write: the failure is recorded (with traceback) in the log
    # rather than propagated.
    logging.exception("Error writing to file: %s", e)
else:
    logging.debug(f"Final LaTeX/MathML results written to {output_file_path}")

logging.debug("Python script finished.")