-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmd2respec.py
243 lines (188 loc) · 7.64 KB
/
md2respec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import argparse
import frontmatter
import os
import latex2mathml.converter
import markdown
import re
from string import Template
from pathlib import Path
# Path of the HTML template that html_to_respec() opens and fills in.
# It is a relative path, so it is resolved against the current working
# directory at the time the template is opened.
template_path = "template.html"
def convert_markdown_to_html(markdown_file, config):
    """
    Converts a markdown file to HTML, incorporating front matter and custom configuration.

    Args:
        markdown_file: Path to the markdown file.
        config: A dictionary of configuration options. Only the
            "section_headers" key is read; when truthy, headers are wrapped
            in nested <section> tags.

    Returns:
        A (metadata, html) tuple. metadata is a dict with the front-matter
        fields "title", "abstract", "sotd", "shortName" and "editor"; a field
        missing from the front matter is set to "<field> not present". html
        is the converted document body as a string.

    Raises:
        ValueError: If markdown_file does not exist.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(markdown_file, "r", encoding="utf-8") as f:
            post = frontmatter.load(f)
    except FileNotFoundError as err:
        raise ValueError("Input file not found") from err

    # Extract front matter
    metadata = {
        field: post.get(field, f"{field} not present")
        for field in ("title", "abstract", "sotd", "shortName", "editor")
    }

    # LaTeX math is converted to MathML before the Markdown pass.
    html = Latex().run(post.content)
    html = markdown.markdown(html, extensions=["fenced_code", "tables"])

    # Apply custom configuration
    if config.get("section_headers", False):
        html = apply_section_headers(html)

    # Apply base URL from environment variable if present
    base_url = os.environ.get("BASE_URL", "")
    if base_url:
        html = apply_base_url(html, base_url)

    return metadata, html
def apply_base_url(html, base_url):
    """
    Prepends the base URL to root-relative links in the HTML.

    Only double-quoted ``href``/``src`` values that start with a single "/"
    are rewritten (including the bare root link "/"). Relative links, full
    URLs and protocol-relative URLs ("//host/path") are left untouched.

    Args:
        html: The HTML string to process.
        base_url: The base URL to prepend. It is inserted literally.

    Returns:
        The modified HTML string with updated links.
    """
    # This uses a simple regex to find the links.
    # A more robust solution might use an HTML parser.
    # The (?!/) lookahead skips protocol-relative URLs such as "//cdn/x.js",
    # which point at another host and must not be rebased.
    root_relative_link = re.compile(r'(href|src)="(/(?!/)[^"]*)"')

    def _rebase(match):
        # Built with an f-string instead of a replacement template so that
        # backslashes or "\g<...>" sequences in base_url are not interpreted
        # as regex escapes / group references.
        return f'{match.group(1)}="{base_url}{match.group(2)}"'

    return root_relative_link.sub(_rebase, html)
def apply_section_headers(html):
    """
    Wraps headers in <section> tags to create nested sections.

    Every line that begins with an attribute-less header tag (``<h2>``,
    ``<h3>``, ...) opens a new <section>. Before it opens, any still-open
    section of the same or a deeper level is closed. Sections left open at
    the end of the document are closed after the last line.

    Args:
        html: The HTML string to process.

    Returns:
        The modified HTML string with section tags.

    Raises:
        AssertionError: If the first header in the document is not an h2.
    """
    # Simple line-based implementation.
    # This could be more robust with a proper HTML parser.
    header_re = re.compile(r"<h(\d)>")
    new_lines = []
    # Levels of the sections that are currently open, outermost first.
    # Tracking the actual stack (rather than computing a level difference)
    # keeps the tags balanced even when levels are skipped, e.g. h2 -> h4.
    open_levels = []

    for line in html.splitlines():
        match = header_re.match(line)
        if match:
            level = int(match.group(1))  # Extract header level (h1, h2, etc.)
            # The stack is only empty before the first header, because every
            # header pushes its own level below.
            if not open_levels and level != 2:
                # Raised explicitly (not via `assert`) so the check still
                # runs under `python -O`.
                raise AssertionError("First section must be an h2!")
            while open_levels and open_levels[-1] >= level:
                open_levels.pop()
                new_lines.append("</section>")
            open_levels.append(level)
            new_lines.append("<section>")
        new_lines.append(line)

    # Close remaining sections
    new_lines.extend(["</section>"] * len(open_levels))
    return "\n".join(new_lines)
def html_to_respec(metadata, html_content):
    """
    Fills the HTML template with the document metadata and body.

    The file at ``template_path`` is read as a ``string.Template``. Its
    placeholders are filled from metadata, with the extra placeholder
    ``spec`` bound to html_content.

    Args:
        metadata: A mapping of placeholder names to values. It is not modified.
        html_content: The HTML body substituted for the ``spec`` placeholder.

    Returns:
        The template text with all placeholders substituted.

    Raises:
        KeyError: If the template uses a placeholder that is neither "spec"
            nor a key of metadata.
        ValueError: If the template contains an invalid placeholder.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(template_path, "r", encoding="utf-8") as f:
        template = Template(f.read())
    # The body is passed as a keyword argument (which takes precedence over
    # the mapping) instead of being written into the caller's dict.
    return template.substitute(metadata, spec=html_content)
class Latex:
    """
    Converts ``$...$`` and ``$$...$$`` LaTeX math in a text to MathML.

    Text inside ``<pre>`` blocks, triple-backtick code blocks and
    single-backtick code spans is shielded from the conversion: it is swapped
    for placeholder comments before the math pass and swapped back afterwards.
    """

    _single_dollar_re = re.compile(r"(?<!\$)\$(?!\$)(.*?)\$")
    _double_dollar_re = re.compile(r"\$\$(.*?)\$\$", re.DOTALL)

    # Ways to escape
    _pre_code_block_re = re.compile(r"<pre>(.*?)</pre>", re.DOTALL)  # Wrapped in <pre>
    _triple_re = re.compile(r"```(.*?)```", re.DOTALL)  # Wrapped in a code block ```
    _single_re = re.compile(r"(?<!`)(`)(.*?)(?<!`)\1(?!`)")  # Wrapped in a single `

    # Set to the latex2mathml.converter module by run().
    converter = None

    def __init__(self):
        # Placeholder -> original code text. Kept per instance: a class-level
        # dict would be shared by every Latex object and grow across runs.
        self.code_blocks = {}

    def _convert_single_match(self, match):
        """Returns the MathML for one inline ``$...$`` match."""
        return self.converter.convert(match.group(1))

    def _convert_double_match(self, match):
        """Returns the block-display MathML for one ``$$...$$`` match."""
        # Join a multi-line formula with spaces. Real newline characters are
        # replaced (not the two-character sequence backslash + "n", which
        # would corrupt commands such as \nabla or \neq), and a space is used
        # so that "\alpha" followed by a line break does not fuse with the
        # next token.
        return self.converter.convert(
            match.group(1).replace("\n", " "), display="block"
        )

    def code_placeholder(self, match):
        """Stores the matched code text and returns the placeholder for it."""
        placeholder = f"<!--CODE_BLOCK_{len(self.code_blocks)}-->"
        self.code_blocks[placeholder] = match.group(0)
        return placeholder

    def _restore_code_blocks(self, text):
        """Returns text with every placeholder replaced by its stored code."""
        # Newest first: a later placeholder may hold text that already
        # contains earlier placeholders (e.g. inline code inside a fenced
        # block), so it has to be expanded before the ones nested in it.
        for placeholder, code_block in reversed(list(self.code_blocks.items())):
            text = text.replace(placeholder, code_block)
        return text

    def run(self, text):
        """
        Converts the LaTeX math in text to MathML, leaving code untouched.

        Args:
            text: The source text to process.

        Returns:
            The text with math replaced by MathML markup.

        Raises:
            ImportError: If the "latex2mathml" package is not installed.
        """
        try:
            import latex2mathml.converter
        except ImportError as err:
            raise ImportError(
                'The "latex" extra requires the "latex2mathml" package to be installed.'
            ) from err
        self.converter = latex2mathml.converter

        # Start every run with an empty placeholder table.
        self.code_blocks = {}

        # Escape by replacing with a code block
        text = self._pre_code_block_re.sub(self.code_placeholder, text)
        text = self._single_re.sub(self.code_placeholder, text)
        text = self._triple_re.sub(self.code_placeholder, text)

        text = self._single_dollar_re.sub(self._convert_single_match, text)
        text = self._double_dollar_re.sub(self._convert_double_match, text)

        # Convert placeholder tag back to original code
        return self._restore_code_blocks(text)
def recursive_folder_conversion(input_dir, output_dir):
    """
    Converts every ``*.md`` file below input_dir into an HTML file in output_dir.

    The directory layout is preserved: ``<input_dir>/a/b.md`` is written to
    ``<output_dir>/a/b.html``. Missing sub-directories of output_dir are
    created as needed.

    Args:
        input_dir: Directory that is searched recursively for markdown files.
        output_dir: Existing directory that receives the HTML files.

    Raises:
        ValueError: If input_dir or output_dir is not an existing directory.
    """
    if not os.path.isdir(input_dir):
        raise ValueError(
            "Input path must be a directory when using the --recursive option"
        )
    if not os.path.isdir(output_dir):
        raise ValueError(
            "Output path must be a directory when using the --recursive option"
        )

    config = {"section_headers": True}  # Enable section headers
    source_root = Path(input_dir)
    target_root = Path(output_dir)

    for md_file in source_root.rglob("*.md"):
        # preserve path
        relative_html = md_file.relative_to(source_root).with_suffix(".html")
        output_path = target_root / relative_html

        # convert
        metadata, html_output = convert_markdown_to_html(str(md_file), config)
        html_output = html_to_respec(metadata, html_output)

        # A markdown file in a sub-folder needs the matching output folder;
        # without this the write fails for every nested file.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(html_output, encoding="utf-8")
def main():
    """
    Command-line entry point: parses the arguments and runs the conversion.

    With --recursive, every markdown file below the input directory is
    converted into the output directory. Otherwise a single markdown file is
    converted and the result is printed to standard output, as the bare HTML
    body with --pure-html or wrapped in the template without it.

    Raises:
        ValueError: If --pure-html is combined with --recursive, or if (in
            single-file mode) the input path does not end in ".md" or the
            output path does not end in ".html".
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input_path", help="markdown input file")
    cli.add_argument("--output-path", default="spec.html", help="HTML output file")
    cli.add_argument("--pure-html", action="store_true", help="Output pure HTML")
    cli.add_argument(
        "--recursive",
        action="store_true",
        help="Convert all markdown files in a folder",
    )
    options = cli.parse_args()
    source, target = options.input_path, options.output_path

    # Folder mode: hand over to the recursive converter and stop.
    if options.recursive:
        if options.pure_html:
            raise ValueError(
                "Cannot use the --pure-html option with the --recursive option"
            )
        recursive_folder_conversion(source, target)
        return

    # Single-file mode: check both file extensions before doing any work.
    if not source.endswith(".md"):
        raise ValueError("Input file must be a markdown file")
    if not target.endswith(".html"):
        raise ValueError("Output file must be an HTML file")

    # Section headers are always enabled.
    metadata, body = convert_markdown_to_html(source, {"section_headers": True})

    # NOTE(review): the output path is validated above but nothing is written
    # to it in this mode; the result only goes to stdout. Confirm whether
    # that is intended.
    print(body if options.pure_html else html_to_respec(metadata, body))


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()