diff --git a/src/modm_data/html/document.py b/src/modm_data/html/document.py
index 3371819..4335fb5 100644
--- a/src/modm_data/html/document.py
+++ b/src/modm_data/html/document.py
@@ -1,7 +1,7 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0
-import re
+import re, os
import logging
from pathlib import Path
from functools import cached_property
@@ -13,7 +13,7 @@
class Document:
def __init__(self, path: str):
self.path = Path(path)
- self.relpath = self.path.relative_to(Path().cwd())
+ self.relpath = os.path.relpath(self.path, Path().cwd())
self.fullname = self.path.stem
self.name = self.fullname.split("-")[0]
self.version = self.fullname.split("-")[1]
@@ -41,6 +41,8 @@ def chapter(self, pattern: str) -> Chapter:
LOGGER.error(f"Cannot find chapter with pattern '{pattern}'!")
if len(chapters) > 1:
LOGGER.error(f"Found multiple chapters with pattern '{pattern}'!")
+ for chapter in chapters:
+ LOGGER.error(f" - {chapter.name}")
assert len(chapters) == 1
return chapters[0]
diff --git a/src/modm_data/html/stmicro/__init__.py b/src/modm_data/html/stmicro/__init__.py
index b70414f..bca0b82 100644
--- a/src/modm_data/html/stmicro/__init__.py
+++ b/src/modm_data/html/stmicro/__init__.py
@@ -1,7 +1,8 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0
-from .datasheet import DatasheetMicro, DatasheetSensor
+from .datasheet_sensor import DatasheetSensor
+from .datasheet_stm32 import DatasheetStm32
from .reference import ReferenceManual
from .document import load_documents, load_document_devices
from .document import datasheet_for_device, reference_manual_for_device
diff --git a/src/modm_data/html/stmicro/datasheet_sensor.py b/src/modm_data/html/stmicro/datasheet_sensor.py
new file mode 100644
index 0000000..adbb57e
--- /dev/null
+++ b/src/modm_data/html/stmicro/datasheet_sensor.py
@@ -0,0 +1,25 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+import re
+import itertools
+from pathlib import Path
+from functools import cached_property, cache
+from collections import defaultdict
+
+from .helper import split_device_filter, split_package
+from ...html.text import ReDict
+
+import modm_data.html as html
+
+
+class DatasheetSensor(html.Document):
+    def __init__(self, path: str):
+        super().__init__(path)
+
+    def __repr__(self) -> str:
+        return f"DSsensor({self.fullname})"
+
+    @cache  # NOTE(review): functools.cache on a method keys on self and keeps every instance alive
+    def register_map(self, assert_table=True):
+        """Placeholder: register map extraction is not implemented yet, so this returns None."""
diff --git a/src/modm_data/html/stmicro/datasheet.py b/src/modm_data/html/stmicro/datasheet_stm32.py
similarity index 97%
rename from src/modm_data/html/stmicro/datasheet.py
rename to src/modm_data/html/stmicro/datasheet_stm32.py
index 5fe4147..c77f16d 100644
--- a/src/modm_data/html/stmicro/datasheet.py
+++ b/src/modm_data/html/stmicro/datasheet_stm32.py
@@ -14,14 +14,14 @@
import modm_data.html as html
-class DatasheetMicro(html.Document):
+class DatasheetStm32(html.Document):
def __init__(self, path: str):
super().__init__(path)
self._id = {}
self._devices = {}
def __repr__(self) -> str:
- return f"DSµC({self.fullname})"
+ return f"DSstm32({self.fullname})"
@cached_property
def device_family(self) -> str:
@@ -247,11 +247,3 @@ def packages_pins(self):
data_pin["alternate"][af].extend(signals)
return data_packages, data_pins
-
-
-class DatasheetSensor(html.Document):
- def __init__(self, path: str):
- super().__init__(path)
-
- def __repr__(self) -> str:
- return f"DSsens({self.fullname})"
diff --git a/src/modm_data/html/stmicro/document.py b/src/modm_data/html/stmicro/document.py
index ade7d16..3fa7649 100644
--- a/src/modm_data/html/stmicro/document.py
+++ b/src/modm_data/html/stmicro/document.py
@@ -5,7 +5,8 @@
from collections import defaultdict
from ...html import Document
from ...utils import cache_path, ext_path
-from .datasheet import DatasheetMicro, DatasheetSensor
+from .datasheet_stm32 import DatasheetStm32
+from .datasheet_sensor import DatasheetSensor
from .reference import ReferenceManual
from ...owl import DeviceIdentifier
from ...owl.stmicro import did_from_string
@@ -27,7 +28,7 @@ def load_documents() -> list:
# FIXME: Better detection that DS13252 is a STM32WB55 module, not a chip!
if any("STM32" in h.html for h in chap[0].headings()) and \
"DS13252" not in doc.name and "DS14096" not in doc.name:
- documents[doc.name][doc.version] = DatasheetMicro(path)
+ documents[doc.name][doc.version] = DatasheetStm32(path)
else:
documents[doc.name][doc.version] = DatasheetSensor(path)
elif "RM" in doc.name:
@@ -35,7 +36,7 @@ def load_documents() -> list:
return documents
-def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, DatasheetMicro],
+def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, DatasheetStm32],
dict[DeviceIdentifier, ReferenceManual]]:
global DOCUMENT_CACHE
if DOCUMENT_CACHE is not None:
@@ -48,7 +49,7 @@ def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, Datas
docs = {}
for path in set(json_data["ds"].values()):
- docs[path] = DatasheetMicro(path)
+ docs[path] = DatasheetStm32(path)
for path in set(json_data["rm"].values()):
docs[path] = ReferenceManual(path)
datasheets = {did_from_string(did): docs[path]
@@ -63,7 +64,7 @@ def load_document_devices(use_cached=True) -> tuple[dict[DeviceIdentifier, Datas
doc = list(versions.values())[-1]
# print(doc.path_pdf.relative_to(Path().cwd()), doc.path.relative_to(Path().cwd()))
# print(doc.devices)
- if isinstance(doc, DatasheetMicro):
+ if isinstance(doc, DatasheetStm32):
if not doc.devices:
raise ValueError(f"{doc} has no associated devices!")
for dev in doc.devices:
@@ -120,7 +121,7 @@ def _document_for_device(did: DeviceIdentifier, documents):
return None
-def datasheet_for_device(did: DeviceIdentifier) -> DatasheetMicro:
+def datasheet_for_device(did: DeviceIdentifier) -> DatasheetStm32:
datasheets, _ = load_document_devices()
return _document_for_device(did, datasheets)
diff --git a/src/modm_data/html2owl/stmicro/__main__.py b/src/modm_data/html2owl/stmicro/__main__.py
index bea1104..0d11c32 100644
--- a/src/modm_data/html2owl/stmicro/__main__.py
+++ b/src/modm_data/html2owl/stmicro/__main__.py
@@ -9,7 +9,7 @@
from collections import defaultdict
from multiprocessing.pool import ThreadPool
-from modm_data.html.stmicro import DatasheetMicro, ReferenceManual, load_documents
+from modm_data.html.stmicro import DatasheetStm32, ReferenceManual, load_documents
from modm_data.owl import Store
from modm_data.py2owl.stmicro import owl_from_doc
@@ -25,7 +25,7 @@ def main():
for name, versions in load_documents().items():
# always use latest version for now
doc = list(versions.values())[-1]
- if isinstance(doc, DatasheetMicro):
+ if isinstance(doc, DatasheetStm32):
docs.append(doc)
elif isinstance(doc, ReferenceManual):
docs.append(doc)
@@ -40,7 +40,7 @@ def main():
path = Path(args.document).absolute()
if path.stem.startswith("DS"):
- doc = DatasheetMicro(path)
+ doc = DatasheetStm32(path)
elif path.stem.startswith("RM"):
doc = ReferenceManual(path)
diff --git a/src/modm_data/html2svd/stmicro/__init__.py b/src/modm_data/html2svd/stmicro/__init__.py
index 22e4fbf..ee08e88 100644
--- a/src/modm_data/html2svd/stmicro/__init__.py
+++ b/src/modm_data/html2svd/stmicro/__init__.py
@@ -2,3 +2,4 @@
# SPDX-License-Identifier: MPL-2.0
from .reference import memory_map_from_reference_manual
+from .datasheet import memory_map_from_datasheet
diff --git a/src/modm_data/html2svd/stmicro/__main__.py b/src/modm_data/html2svd/stmicro/__main__.py
index 80347d1..9f2270c 100644
--- a/src/modm_data/html2svd/stmicro/__main__.py
+++ b/src/modm_data/html2svd/stmicro/__main__.py
@@ -8,8 +8,8 @@
from pathlib import Path
from multiprocessing.pool import ThreadPool
-from modm_data.html.stmicro import ReferenceManual, load_documents
-from modm_data.html2svd.stmicro import memory_map_from_reference_manual
+from modm_data.html.stmicro import ReferenceManual, DatasheetSensor, load_documents
+from modm_data.html2svd.stmicro import memory_map_from_reference_manual, memory_map_from_datasheet
from modm_data.svd import format_svd, write_svd
from modm_data.utils import ext_path
from anytree import RenderTree
@@ -17,7 +17,8 @@
def main():
parser = argparse.ArgumentParser()
- parser.add_argument("--document", type=str, default="")
+ parser.add_argument("--stm32", type=Path)
+ parser.add_argument("--sensor", type=Path)
parser.add_argument("--all", action="store_true", default=False)
args = parser.parse_args()
@@ -30,7 +31,7 @@ def main():
docs.append(doc)
Path("log/stmicro/svd").mkdir(exist_ok=True, parents=True)
- calls = [f"python3 -m modm_data.html2svd.stmicro --document {doc.path} "
+ calls = [f"python3 -m modm_data.html2svd.stmicro --stm32 {doc.path} "
f"> log/stmicro/svd/html_{doc.name}.txt 2>&1" for doc in docs]
with ThreadPool() as pool:
retvals = list(tqdm.tqdm(pool.imap(lambda c: subprocess.run(c, shell=True), calls), total=len(calls)))
@@ -38,12 +39,17 @@ def main():
if retval.returncode != 0: print(call)
return all(r.returncode == 0 for r in retvals)
- path = Path(args.document).absolute()
- doc = ReferenceManual(path)
+ if args.stm32:
+ doc = ReferenceManual(args.stm32.absolute())
+ elif args.sensor:
+ doc = DatasheetSensor(args.sensor.absolute())
print(doc.path_pdf.relative_to(Path().cwd()),
doc.path.relative_to(Path().cwd()))
- mmaptrees = memory_map_from_reference_manual(doc)
+ if args.stm32:
+ mmaptrees = memory_map_from_reference_manual(doc)
+ elif args.sensor:
+ mmaptrees = memory_map_from_datasheet(doc)
for mmaptree in mmaptrees:
print(RenderTree(mmaptree, maxlevel=2))
svd = format_svd(mmaptree)
diff --git a/src/modm_data/html2svd/stmicro/datasheet.py b/src/modm_data/html2svd/stmicro/datasheet.py
new file mode 100644
index 0000000..35e93f4
--- /dev/null
+++ b/src/modm_data/html2svd/stmicro/datasheet.py
@@ -0,0 +1,379 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+import re
+from functools import cached_property
+from collections import defaultdict
+from anytree import RenderTree
+
+from ...html.stmicro.helper import split_device_filter
+from ...svd import *
+from ...header2svd.stmicro.tree import _normalize_order
+from ...cubemx import cubemx_device_list
+from ...html import replace as html_replace
+
+
+def _deduplicate_bit_fields(bit_fields):
+    """Merge equally named bit fields into one field spanning their lowest to highest position."""
+    spans = defaultdict(set)
+    # Visit fields by ascending position so the result keeps first-occurrence order.
+    for bit in sorted(bit_fields, key=lambda b: b.position):
+        spans[bit.name].add(bit.position)
+
+    def _merged(name, positions):
+        lowest = min(positions)
+        return BitField(name, lowest, max(positions) + 1 - lowest)
+
+    return [_merged(name, positions) for name, positions in spans.items()]
+
+
+def _peripheral_map_to_tree(chapter, peripheral_maps):
+    """Convert the captioned register maps of one chapter into a list of `PeripheralType` trees."""
+    cap_replace = {"STM32F415/417xx": "STM32F415/417"}
+
+    peripheral_trees = []
+    for caption, (heading, register_map) in peripheral_maps.items():
+        print(caption)
+        if re.search(r"OTG_[FH]S", caption):
+            replace_name = peripheral_name = "OTG"
+        elif re.search(r"JPEG", caption):
+            replace_name = peripheral_name = "JPEG"
+        elif re.search(r"CCU ", caption):
+            peripheral_name, replace_name = "CANCCU", "FDCAN_CCU"
+        else:
+            peripheral_names = {n.split("_")[0] for n in register_map.keys()}
+            replace_name = peripheral_name = list(sorted(peripheral_names))[-1]
+            if all(p.startswith("COMP") for p in peripheral_names):
+                peripheral_name, replace_name = "COMP", ""
+            elif all(p.startswith("OPAMP") for p in peripheral_names):
+                peripheral_name, replace_name = "OPAMP", ""
+            elif len(peripheral_names) > 1:
+                print(f"Multiple peripheral names detected: {peripheral_names}")
+
+        if peripheral_name == "M7": continue
+        # Some chapters have multiple tables for multiple instances
+        filters = defaultdict(set)
+        instances = set()
+        if peripheral_name.startswith("LPTIM"):
+            replace_name = peripheral_name = "LPTIM"
+        elif peripheral_name.startswith("DLYB"):
+            instances.add("DLYB")
+        elif peripheral_name.startswith("TIM"):
+            peripheral_name = "TIM"
+            if match := re.search(r"TIM(\d+) +to +TIM(\d+)", caption):
+                irange = list(sorted([int(match.group(1)), int(match.group(2))]))
+                irange = range(irange[0], irange[1] + 1)
+                instances.add(f"TIM({'|'.join(map(str, irange))})")
+            for pfilter in re.findall(r"TIM\d+(?:/\d+)*", caption):
+                if "/" in pfilter:
+                    pfilter = f"TIM({pfilter[3:].replace('/', '|')})"
+                instances.add(f"^{pfilter}$")
+        elif "GPIOx" in peripheral_name:
+            peripheral_name = "GPIO"
+            # Matches a single port ("GPIOA") as well as slash-separated groups ("GPIOA/B/C").
+            for pfilter in re.findall(r"GPIO[A-Z](?:[/A-Z]+)?", caption):
+                if "/" in pfilter:
+                    pfilter = f"GPIO({pfilter[4:].replace('/', '|')})"
+                instances.add(pfilter)
+        if instances:
+            filters["instances"].update(instances)
+
+        devices = set()
+        for pfilter in re.findall(r"STM32[\w/]+", html_replace(caption, **cap_replace)):
+            devices.update(split_device_filter(pfilter) if "/" in pfilter else [pfilter])
+        if devices:
+            filters["devices"].update(d.replace("x", ".") for d in devices)
+
+        if "connectivity line" in chapter.name:
+            filters["devices"].add("STM32F10[57]")
+        elif "low medium high and xl density" in chapter.name:
+            filters["devices"].add("STM32F10[123]")
+
+        peripheral_type = PeripheralType(peripheral_name, _chapter=chapter,
+                                         filters=dict(filters), section=heading)
+        for rname, (offset, bitfields) in register_map.items():
+            filters = {}
+            if replace_name:
+                if replace_name == "OTG" and (match := re.match("^OTG_[FH]S", rname)):
+                    filters["instances"] = {match.group(0)}
+                    nrname = rname.replace(match.group(0) + "_", "")
+                else:
+                    nrname = rname.replace(replace_name + "_", "")
+                    if len(rname) == len(nrname) and "_" in rname:
+                        instance = rname.split("_")[0]
+                        filters["instances"] = {instance+"$"}
+                        nrname = rname.replace(instance + "_", "")
+                        print(instance, nrname)
+                rname = nrname
+            if match := re.match("(.*?)connectivitylinedevices", rname):
+                rname = match.group(1)
+                filters["devices"] = {r"STM32F10[57]"}
+            elif match := re.match("(.*?)low,medium,highandXLdensitydevices", rname):
+                rname = match.group(1)
+                filters["devices"] = {r"STM32F10[123]"}
+            # Offsets that are not plain hex text are kept unchanged instead of being converted.
+            try: offset = int(offset, 16)
+            except (TypeError, ValueError): pass
+            register_type = Register(rname, offset, filters=filters, parent=peripheral_type)
+            fields = [BitField(field, bit) for bit, field in bitfields.items()]
+            register_type.children = _deduplicate_bit_fields(fields)
+
+        peripheral_trees.append(peripheral_type)
+
+    return peripheral_trees
+
+
+def _expand_register_offsets(peripheral_trees):  # expands symbolic offsets in place; False on an unknown format
+    for peripheral in peripheral_trees:
+        unexpanded = defaultdict(list)
+        for register in peripheral.children:
+            if (isinstance(register.offset, str) or
+                ("CAN" in peripheral.name and "F1R2" in register.name) or
+                ("GFXMMU" in peripheral.name and "LUT0L" in register.name) or
+                ("GFXMMU" in peripheral.name and "LUT0H" in register.name) or
+                ("HSEM" in peripheral.name and "R1" in register.name)):
+                unexpanded[str(register.offset)].append(register)  # group registers by their textual offset
+        for offsets, registers in unexpanded.items():
+            print(offsets, registers)
+
+            conv = lambda i: int(i, 16)
+            # if match := re.search(r"x=([\d,]+)", registers[0].name):
+            #     offsets = [offsets] * len(match.group(1).split(","))
+            if any(pat in offsets for pat in ["x=", "channelnumber"]):
+                if matches := re.findall(r"(0x[\dA-Fa-f]+)\(x=\w+\)", offsets):
+                    orange = enumerate(map(conv, matches))
+                    formula = "x"
+                elif "channelnumber" in offsets:
+                    orange = enumerate(range(0, 16))
+                    formula = offsets.replace("channelnumber", "x")
+                elif "moni-ringunitnumber" in offsets:
+                    orange = [(i, i) for i in range(1, 6)]
+                    formula = offsets.split("(x=")[0]
+                else:
+                    match = re.search(r"\(x=(\d+)(?:-\.?|\.\.)(\d+)", offsets)
+                    orange = [(i, i) for i in range(int(match.group(1)), int(match.group(2)) + 1)]
+                    formula = re.split(r"\(x=|,", offsets)[0]
+                offsets = [(ii, eval(formula, None, {"x": x})) for ii, x in orange]  # NOTE(review): eval() of document text; only safe for trusted input
+                print(formula, offsets, orange)
+            elif "-" in offsets:
+                omin, omax = list(map(conv, offsets.split("-")))
+                offsets = enumerate(range(omin, omax+1, 4))  # inclusive range in steps of 4
+            elif "or" in offsets:
+                offsets = enumerate(list(map(conv, offsets.split("or"))))
+            elif "F1R2" in registers[0].name:
+                offsets = enumerate(range(int(offsets), int(offsets)+4*25*2+1, 4))
+            elif "LUT0" in registers[0].name:
+                offsets = enumerate(range(int(offsets), int(offsets)+4*2044+1, 8))
+            elif "HSEM" in peripheral.name:
+                print(offsets)
+                offsets = enumerate(range(int(offsets), int(offsets)+4*29+1, 4))
+            else:
+                print(f"Unknown expansion format for {offsets}!")
+                return False  # aborts the whole expansion, not just this peripheral
+
+            fields = registers[0].children  # bit fields of the first register are copied to every expansion
+            if all(re.match(r"BKP\d+R", r.name) for r in registers):
+                name_template = lambda i: f"BKP{i}R"
+            elif "SAI" in peripheral.name:
+                name_template = lambda i: f"{registers[0].name[1:]}{chr(i+ord('A'))}"
+            elif "HRTIM" in peripheral.name:
+                name_template = lambda i: registers[0].name.replace("x", chr(i+ord('A')))
+            elif "CAN" in peripheral.name:
+                name_template = lambda i: f"F{(i+3)//2}R{(i+1)%2+1}"
+            elif "GFXMMU" in peripheral.name:
+                name_template = lambda i: f"LUT{i}{registers[0].name[-1]}"
+            elif "HSEM" in peripheral.name:
+                name_template = lambda i: f"{registers[0].name[:-1]}{i+1}"
+            elif len(registers) == 1:
+                # if "x=" in registers[0].name:
+                #     name_template = lambda i: f"{registers[0].name.split('x=')[0]}.{i}"
+                if "x" in registers[0].name:
+                    name_template = lambda i: registers[0].name.replace("x", str(i))
+                else:
+                    name_template = lambda i: f"{registers[0].name}.{i}"
+            else:
+                print(f"Unknown expansion pattern for {registers}!")
+                return False
+
+            for ii, offset in offsets:
+                nreg = Register(name_template(ii), offset, filters=registers[0].filters, parent=peripheral)
+                nreg.children = [BitField(f.name, f.position, f.width) for f in fields]
+            for register in registers:
+                register.parent = None  # detach the unexpanded originals from the peripheral
+
+    return True
+
+
+def _link_instance_to_type(ds, peripheral_types, instance_offsets):  # returns a set of Peripheral, one per instance name in ds.peripherals
+    cap_replace = {}
+    peripherals = set()
+    for caption, locations in ds.peripherals.items():
+        filters = defaultdict(set)
+        devices = set()
+        for pfilter in re.findall(r"STM32[\w/]+", html_replace(caption, **cap_replace)):
+            devices.update(split_device_filter(pfilter) if "/" in pfilter else [pfilter])
+        if "Low and medium-density device" in caption:
+            devices.add("STM32F10..[468B]")
+        elif "High-density device" in caption:
+            devices.add("STM32F10..[CDE]")
+        if devices:
+            filters["devices"].update(d.replace("x", ".") for d in devices)  # "x" placeholders become regex wildcards
+
+        for (names, amin, amax, bus, sections) in locations:
+            for name in names:
+                ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == name]  # 1st try: exact type name
+                if not ptypes:
+                    ptypes = [t for tname, types in peripheral_types.items() for t in types if tname in name]  # 2nd: type name is a substring
+                if not ptypes:
+                    ptypes = [t for tname, types in peripheral_types.items()
+                              for t in types if t.section in sections]  # 3rd: same section
+                if not ptypes and name.startswith("UART"):
+                    ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == "USART"]
+                if not ptypes and "BKP" == name:
+                    ptypes = [t for tname, types in peripheral_types.items() for t in types if tname == "RTC"]
+                if not ptypes:
+                    print(f"Cannot find peripheral type for instance {name} in section {sections}!")
+                    nsections = list(sorted({t.section for types in peripheral_types.values() for t in types}))
+                    print(f"Available sections are {nsections}.")
+                    exit(1)  # NOTE(review): terminates the whole process from inside a helper
+                offsets = [v for k, v in instance_offsets.items() if re.search(k, name)]
+                if offsets: amin += offsets[0]  # NOTE(review): amin keeps the added offset for later names of the same location; confirm intended
+                p = Peripheral(name, ptypes, amin, filters=dict(filters), sections=sections)
+                peripherals.add(p)
+    return peripherals
+
+
+def _resolve_filters(filters, **kw):
+    """Return True if a keyword value matches one of its patterns, or if no keyword has patterns."""
+    constrained = False
+    for key, value in kw.items():
+        if not (patterns := filters.get(key)): continue
+        constrained = True
+        if any(re.search(p, value, flags=re.IGNORECASE) for p in patterns): return True
+    return not constrained
+
+
+def _normalize_instances(memtree, peripherals, device):  # adds resolved peripherals, registers and bit fields below memtree
+    for peripheral in peripherals:
+        if not _resolve_filters(peripheral.filters, devices=device.string):
+            continue  # instance does not apply to this device
+        ptypes = peripheral.type
+        if len(ptypes) > 1:
+            ptypes = [ptype for ptype in sorted(peripheral.type, key=lambda p: -len(p.filters))  # most filter keys first
+                      if _resolve_filters(ptype.filters, instances=peripheral.name, devices=device.string)]
+        if len(ptypes) > 1 and any(p.filters for p in ptypes):
+            ptypes = [p for p in ptypes if p.filters]  # prefer filtered candidates over unfiltered ones
+        if len(ptypes) > 1:
+            nptypes = [p for p in ptypes if any(p.section.startswith(per) or per.startswith(p.section)
+                                                for per in peripheral.sections)]
+            if nptypes: ptypes = nptypes
+        for pname in ["DMAMUX", "BDMA", "OCTOSPI"]:
+            if len(ptypes) > 1 and pname in peripheral.name:
+                ptypes = [p for p in ptypes if pname in p.name]
+
+        if len(ptypes) != 1:
+            print(f"Unknown peripheral type {device} {peripheral} {ptypes}!")
+            continue  # no unique type left: the instance is skipped
+        ptype = ptypes[0]
+
+        nper = Peripheral(peripheral.name, ptype, peripheral.address,
+                          filters=peripheral.filters, parent=memtree)
+        rmap = defaultdict(list)
+        for treg in ptype.children:
+            rmap[treg.name].append(treg)  # group the type's registers by name
+
+        for name, tregs in rmap.items():
+            regs = [reg for reg in sorted(tregs, key=lambda p: -len(p.filters))
+                    if _resolve_filters(reg.filters, instances=peripheral.name, devices=device.string)]
+            if len(regs) > 1 and any(r.filters for r in regs):
+                regs = [r for r in regs if r.filters]
+            if len(regs) != 1:
+                if len(regs) > 1:
+                    print(f"Unsuccessful register filtering {peripheral.name} {device}: {tregs}!")
+                continue  # zero or several candidates: register is left out
+            treg = regs[0]
+            if _resolve_filters(treg.filters, devices=device.string, instances=nper.name):  # NOTE(review): appears to repeat the check applied when building regs
+                preg = Register(treg.name, offset=treg.offset, width=treg.width,
+                                filters=treg.filters, parent=nper)
+                for tbit in treg.children:
+                    BitField(tbit.name, tbit.position, tbit.width, parent=preg)
+
+
+def _build_device_trees(ds, peripheral_types, instance_offsets):
+    """Build one `Device` tree for every device returned by `ds.filter_devices(...)`."""
+    # NOTE(review): modm_device_list is not explicitly imported here; presumably the star import provides it.
+    trees = []
+    for device in ds.filter_devices(modm_device_list()):
+        tree = Device(device)
+        linked = _link_instance_to_type(ds, peripheral_types, instance_offsets)
+        _normalize_instances(tree, linked, device)
+        trees.append(tree)
+    return trees
+
+
+def _compactify_device_trees(memtrees):
+    """Collapse device trees with equal hashes into the first tree of each group."""
+    buckets = defaultdict(list)
+    for tree in memtrees:
+        buckets[hash(tree)].append(tree)
+
+    def _merge(group):
+        # The first tree of a group absorbs the compatible devices of all the others.
+        keeper, *duplicates = group
+        for duplicate in duplicates:
+            keeper.compatible.extend(duplicate.compatible)
+        keeper.compatible.sort(key=lambda d: d.string)
+        keeper.name = keeper.compatible[0]
+        return keeper
+    return [_merge(group) for group in buckets.values()]
+
+
+def memory_map_from_datasheet(ds):  # NOTE(review): work in progress; currently prints the register table rows and exits
+    register = ds.chapter(r"chapter +\d+ +register +mapping")
+    table = register.tables("register")[0]  # only the first matching table is read
+    print(table)
+    registers = {}
+    for row in table.cell_rows():
+        cname = row.match_value("name")[0].text()
+        ctype = row.match_value("type")[0].text()
+        caddr = row.match_value(r"address.*?hex")[0].text()
+        cvalue = row.match_value(r"default")[0].text()
+        ccomment = row.match_value(r"comment")[0].text()
+        if not ctype: continue  # rows with an empty type cell are skipped
+        cvalue = int(cvalue, 2) if cvalue.isdigit() else None  # NOTE(review): isdigit() also accepts digits 2-9, which int(..., 2) rejects
+        print(cname, ctype, int(caddr, 16), cvalue, ccomment)
+
+
+
+
+
+    exit(1)  # NOTE(review): unconditional exit; the code below is unreachable as written
+
+    peripheral_types = defaultdict(set)
+    instance_offsets = {}
+    for chapter in all_chapters:  # NOTE(review): all_chapters and type_chapters are not defined in this function
+        print()
+        peripheral_maps, peripheral_offsets = ds.peripheral_maps(chapter, assert_table=chapter in type_chapters)
+        instance_offsets.update(peripheral_offsets)
+        peripheral_maps = _peripheral_map_to_tree(chapter, peripheral_maps)
+        if not _expand_register_offsets(peripheral_maps):
+            exit(1)
+        for pmap in peripheral_maps:
+            print(pmap)
+            # print(RenderTree(pmap, maxlevel=2))
+            peripheral_types[pmap.name].add(pmap)
+
+    for name, pmaps in peripheral_types.items():
+        print(name)
+        for pmap in pmaps:
+            print(pmap.section, pmap._chapter._relpath)
+            print(RenderTree(pmap, maxlevel=2))
+
+
+    memtrees = _build_device_trees(ds, peripheral_types, instance_offsets)
+    # for tree in memtrees:
+    #     print(RenderTree(tree, maxlevel=2))
+    # exit(1)
+    memtrees = _compactify_device_trees(memtrees)
+    memtrees = [_normalize_order(memtree) for memtree in memtrees]
+    return memtrees
diff --git a/src/modm_data/pdf/__init__.py b/src/modm_data/pdf/__init__.py
index ed2f441..aa8d6b4 100644
--- a/src/modm_data/pdf/__init__.py
+++ b/src/modm_data/pdf/__init__.py
@@ -16,5 +16,6 @@
from .page import Page
from .character import Character
from .link import ObjLink, WebLink
-from .graphics import Path, Image
+from .path import Path
+from .image import Image
from .render import render_page_pdf
diff --git a/src/modm_data/pdf/document.py b/src/modm_data/pdf/document.py
index 00c7c4d..58917af 100644
--- a/src/modm_data/pdf/document.py
+++ b/src/modm_data/pdf/document.py
@@ -21,7 +21,7 @@
from collections import defaultdict
from .page import Page
-LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger(__name__)
# We cannot monkey patch this class, since it's a named tuple. :-(
@@ -48,11 +48,11 @@ def __init__(self, path: Path, autoclose: bool = False):
"""
path = Path(path)
self.name: str = path.stem
- super().__init__(path, autoclose=autoclose)
"""Stem of the document file name"""
+ super().__init__(path, autoclose=autoclose)
self._path = path
self._bbox_cache = defaultdict(dict)
- LOGGER.debug(f"Loading: {path}")
+ _LOGGER.debug(f"Loading: {path}")
@cached_property
def metadata(self) -> dict[str, str]:
@@ -84,7 +84,7 @@ def toc(self) -> list[pp.PdfOutlineItem]:
outline = _OutlineItem(toc.level, toc.title, toc.is_closed,
toc.n_kids, toc.page_index or last_page_index,
toc.view_mode, toc.view_pos)
- last_page_index = toc.page_index
+ last_page_index = toc.page_index or last_page_index
tocs.add(outline)
return list(sorted(list(tocs), key=lambda o: (o.page_index, o.level, o.title)))
diff --git a/src/modm_data/pdf/image.py b/src/modm_data/pdf/image.py
new file mode 100644
index 0000000..24a4041
--- /dev/null
+++ b/src/modm_data/pdf/image.py
@@ -0,0 +1,86 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+"""
+# PDF Images
+
+Images support bitmap data.
+"""
+
+from functools import cached_property
+import pypdfium2 as pp
+from ..utils import Point, Rectangle, Line
+
+
+class Image(pp.PdfImage):
+    """
+    This class extends `pypdfium2.PdfImage` to align it with the interface of
+    the `Path` class so that it can be used in the same
+    algorithms without filtering.
+
+    You must construct the images by calling `modm_data.pdf.page.Page.images`.
+
+    .. note:: Images are currently ignored.
+    """
+    # Overwrite the PdfPageObject.__new__ function
+    def __new__(cls, *args, **kwargs):
+        return object.__new__(cls)  # plain allocation; the inherited __new__ is not called
+
+    def __init__(self, obj):
+        """
+        :param obj: Page object of the image.
+        """
+        super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
+        assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE  # NOTE(review): assert is skipped under -O
+        self.type = pp.raw.FPDF_PAGEOBJ_IMAGE
+
+        self.count: int = 4
+        """Number of segments. Always 4 due to rectangular image form.
+        (For compatibility with `Path.count`.)"""
+        self.stroke: int = 0
+        """The border stroke color. Always 0.
+        (For compatibility with `Path.stroke`.)"""
+        self.fill: int = 0
+        """The image fill color. Always 0.
+        (For compatibility with `Path.fill`.)"""
+        self.width: float = 0
+        """The border line width. Always 0.
+        (For compatibility with `Path.width`.)"""
+
+    @cached_property
+    def matrix(self) -> pp.PdfMatrix:
+        """The transformation matrix."""
+        return self.get_matrix()
+
+    @cached_property
+    def bbox(self) -> Rectangle:
+        """The bounding box of the image."""
+        bbox = Rectangle(*self.get_pos())
+        if self.page.rotation:  # any non-zero rotation takes this branch
+            bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x,
+                             bbox.p1.y, self.page.height - bbox.p0.x)
+        return bbox
+
+    @cached_property
+    def points(self) -> list[Point]:
+        """
+        The 4 points of the bounding box.
+        (For compatibility with `Path.points`.)
+        """
+        points = self.bbox.points
+        if self.page.rotation:  # NOTE(review): bbox is already adjusted for rotation; confirm this second transform is intended
+            points = [Point(p.y, self.page.height - p.x, p.type) for p in points]
+        return points
+
+    @cached_property
+    def lines(self) -> list[Line]:
+        """
+        The 4 lines of the bounding box.
+        (For compatibility with `Path.lines`.)
+        """
+        p = self.points
+        return [Line(p[0], p[1], p[1].type, 0), Line(p[1], p[2], p[2].type, 0),  # closed outline: p0-p1, p1-p2, p2-p3, p3-p0
+                Line(p[2], p[3], p[3].type, 0), Line(p[3], p[0], p[0].type, 0)]
+
+    def __repr__(self) -> str:
+        return f"I{self.bbox}"
diff --git a/src/modm_data/pdf/page.py b/src/modm_data/pdf/page.py
index 3d86f1d..2beb50c 100644
--- a/src/modm_data/pdf/page.py
+++ b/src/modm_data/pdf/page.py
@@ -19,10 +19,11 @@
from ..utils import Rectangle, Region
from .character import Character
from .link import ObjLink, WebLink
-from .graphics import Path, Image
+from .path import Path
+from .image import Image
from .structure import Structure
-LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger(__name__)
class Page(pp.PdfPage):
@@ -46,7 +47,7 @@ def __init__(self, document: "modm_data.pdf.Document", index: int):
self._weblinks = None
self._linked = False
- LOGGER.debug(f"Loading: {index}")
+ _LOGGER.debug(f"Loading: {index}")
self._text = self.get_textpage()
self._linkpage = pp.raw.FPDFLink_LoadWebLinks(self._text)
@@ -177,9 +178,8 @@ def images(self) -> list[Image]:
"""All images."""
return [Image(o) for o in self.get_objects([pp.raw.FPDF_PAGEOBJ_IMAGE])]
- def graphic_clusters(self, predicate: Callable[[Path|Image], bool] = None,
- absolute_tolerance: float = None) -> \
- list[tuple[Rectangle, list[Path]]]:
+ def graphic_clusters(self, predicate: Callable[[Path | Image], bool] = None,
+ absolute_tolerance: float = None) -> list[tuple[Rectangle, list[Path]]]:
if absolute_tolerance is None:
absolute_tolerance = min(self.width, self.height) * 0.01
@@ -287,4 +287,4 @@ def _key(char):
bbox = bbox.rotated(-self.rotation - char._rotation).translated(char.origin)
char._bbox = bbox
elif char.unicode not in {0x20, 0xa, 0xd}:
- LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
+ _LOGGER.debug(f"Unable to fix bbox for {char.descr()}!")
diff --git a/src/modm_data/pdf/graphics.py b/src/modm_data/pdf/path.py
similarity index 66%
rename from src/modm_data/pdf/graphics.py
rename to src/modm_data/pdf/path.py
index aca3f32..bf59f28 100644
--- a/src/modm_data/pdf/graphics.py
+++ b/src/modm_data/pdf/path.py
@@ -7,8 +7,6 @@
PDF uses a subset of the PostScript graphics language, which draws vector paths
with various rendering options. We are only interested in the basic properties,
in particular, for recognizing table cell borders.
-
-In addition, images support bitmap data.
"""
import ctypes
@@ -148,77 +146,3 @@ def lines(self) -> list[Line]:
def __repr__(self) -> str:
points = ",".join(repr(p) for p in self.points)
return f"P{self.count}={points}"
-
-
-class Image(pp.PdfImage):
- """
- This class extends `pypdfium2.PdfImage` to align it with the interface of
- the `Path` class so that it can be used in the same
- algorithms without filtering.
-
- You must construct the images by calling `modm_data.pdf.page.Page.images`.
-
- .. note:: Images are currently ignored.
- """
- # Overwrite the PdfPageObject.__new__ function
- def __new__(cls, *args, **kwargs):
- return object.__new__(cls)
-
- def __init__(self, obj):
- """
- :param obj: Page object of the image.
- """
- super().__init__(obj.raw, obj.page, obj.pdf, obj.level)
- assert pp.raw.FPDFPageObj_GetType(obj.raw) == pp.raw.FPDF_PAGEOBJ_IMAGE
- self.type = pp.raw.FPDF_PAGEOBJ_IMAGE
-
- self.count: int = 4
- """Number of segments. Always 4 due to rectangular image form.
- (For compatibility with `Path.count`.)"""
- self.stroke: int = 0
- """The border stroke color. Always 0.
- (For compatibility with `Path.stroke`.)"""
- self.fill: int = 0
- """The image fill color. Always 0.
- (For compatibility with `Path.fill`.)"""
- self.width: float = 0
- """The border line width. Always 0.
- (For compatibility with `Path.width`.)"""
-
- @cached_property
- def matrix(self) -> pp.PdfMatrix:
- """The transformation matrix."""
- return self.get_matrix()
-
- @cached_property
- def bbox(self) -> Rectangle:
- """The bounding box of the image."""
- bbox = Rectangle(*self.get_pos())
- if self.page.rotation:
- bbox = Rectangle(bbox.p0.y, self.page.height - bbox.p1.x,
- bbox.p1.y, self.page.height - bbox.p0.x)
- return bbox
-
- @cached_property
- def points(self) -> list[Point]:
- """
- The 4 points of the bounding box.
- (For compatibility with `Path.points`.)
- """
- points = self.bbox.points
- if self.page.rotation:
- points = [Point(p.y, self.page.height - p.x, p.type) for p in points]
- return points
-
- @cached_property
- def lines(self) -> list[Line]:
- """
- The 4 lines of the bounding box.
- (For compatibility with `Path.lines`.)
- """
- p = self.points
- return [Line(p[0], p[1], p[1].type, 0), Line(p[1], p[2], p[2].type, 0),
- Line(p[2], p[3], p[3].type, 0), Line(p[3], p[0], p[0].type, 0)]
-
- def __repr__(self) -> str:
- return f"I{self.bbox}"
diff --git a/src/modm_data/pdf2html/__init__.py b/src/modm_data/pdf2html/__init__.py
index bf28123..c272980 100644
--- a/src/modm_data/pdf2html/__init__.py
+++ b/src/modm_data/pdf2html/__init__.py
@@ -7,5 +7,5 @@
from . import stmicro
from .render import render_page_pdf
-from .line import CharCluster, CharLine
-from .figure import Figure
+from .convert import convert, patch
+from .html import format_document, write_html
diff --git a/src/modm_data/pdf2html/stmicro/ast.py b/src/modm_data/pdf2html/ast.py
similarity index 51%
rename from src/modm_data/pdf2html/stmicro/ast.py
rename to src/modm_data/pdf2html/ast.py
index 226c0c9..ee252c4 100644
--- a/src/modm_data/pdf2html/stmicro/ast.py
+++ b/src/modm_data/pdf2html/ast.py
@@ -2,17 +2,16 @@
# SPDX-License-Identifier: MPL-2.0
import logging
-from lxml import etree
import anytree
-from anytree import RenderTree
+from anytree import RenderTree, Node
from collections import defaultdict
-from ...utils import list_strip, Rectangle, ReversePreOrderIter
+from ..utils import Rectangle, ReversePreOrderIter
from .table import VirtualTable, TableCell
-LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger(__name__)
-def _normalize_area(area):
+def _normalize_area(area: Node) -> Node:
for child in ReversePreOrderIter(area):
if child.name.startswith("list"):
# We need to normalize the xpos back to the first character
@@ -24,13 +23,13 @@ def _normalize_area(area):
return area
-def merge_area(document, area, debug=False):
+def merge_area(document: Node, area: Node, debug: bool = False) -> Node:
if document is None:
- document = anytree.Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
+ document = Node("document", xpos=0, _page=area.page, _doc=area.page.pdf, _end=None)
document._end = document
if not area.children:
return document
- if debug: print()
+ if debug: _LOGGER.debug("")
def _find_end(node):
# Find the last leaf node but skip lines, paragraphs, captions/tables/figures
@@ -43,7 +42,7 @@ def _find_ancestor(filter_):
if filter_(c)), document.root)
area = _normalize_area(area)
- if debug: print(RenderTree(area))
+ if debug: _LOGGER.debug(RenderTree(area))
children = area.children
# All area nodes up to the next top-level element must now be
# xpos-aligned with the previous area's last leaf node
@@ -51,7 +50,7 @@ def _find_ancestor(filter_):
if c.name.startswith("head")), len(children))
x_em = area.page._spacing["x_em"]
- if debug: print("area=", area, "connect_index=", connect_index)
+ if debug: _LOGGER.debug("area= %s connect_index= %s", area, connect_index)
# Align these children with the last leaf node xpos
for child in children[:connect_index]:
if any(child.name.startswith(name) for name in {"list"}):
@@ -68,10 +67,10 @@ def _find_ancestor(filter_):
child.parent = host
document._end = _find_end(document)
if debug:
- print("child=", child)
- print("host=", host)
- print("end=", document._end)
- print()
+ _LOGGER.debug(f"{child=}")
+ _LOGGER.debug(f"{host=}")
+ _LOGGER.debug(f"end={document._end}")
+ _LOGGER.debug("")
# Add the remaining top-level children to connect index node
if connect_index < len(children):
@@ -82,19 +81,19 @@ def _find_ancestor(filter_):
document._end = _find_end(document)
if debug:
- print()
- print()
+ _LOGGER.debug("")
+ _LOGGER.debug("")
return document
-def _normalize_lists(node):
+def normalize_lists(node: Node) -> Node:
lists = []
current = []
current_name = None
for child in node.children:
# Normalize the lists from the leaves up
- _normalize_lists(child)
+ normalize_lists(child)
# then split the children based on their names
if current_name is None or child.name == current_name:
current.append(child)
@@ -110,7 +109,7 @@ def _normalize_lists(node):
for llist in lists:
# Insert a new list group node and redirect all children to it
if llist[0].name.startswith("list"):
- nlist = anytree.Node(llist[0].name, obj=llist[0].obj,
+ nlist = Node(llist[0].name, obj=llist[0].obj,
start=llist[0].value, xpos=llist[0].xpos)
for lnode in llist:
lnode.name = "element"
@@ -125,7 +124,7 @@ def _normalize_lists(node):
return node
-def _normalize_paragraphs(document):
+def normalize_paragraphs(document: Node) -> Node:
paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
parents = set(p.parent for p in paras if p.parent.name in {"element", "caption", "document", "cell"})
for parent in parents:
@@ -144,17 +143,17 @@ def _normalize_paragraphs(document):
return document
-def _normalize_lines(document):
+def normalize_lines(document: Node) -> Node:
paras = anytree.search.findall(document, filter_=lambda n: n.name == "para")
for para in paras:
- text = anytree.Node("text")
+ text = Node("text")
for line in para.children:
line.parent = text
para.children = [text]
return document
-def _normalize_captions(document):
+def normalize_captions(document: Node) -> Node:
captions = anytree.search.findall(document, filter_=lambda n: n.name == "caption")
for caption in captions:
cindex = caption.parent.children.index(caption)
@@ -165,12 +164,12 @@ def _normalize_captions(document):
sibling.number = caption.number
break
else:
- LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}")
+ _LOGGER.error(f"Discarding caption {caption}!\n{RenderTree(caption)}")
caption.parent = None
return document
-def _normalize_headings(document):
+def normalize_headings(document: Node) -> Node:
headings = anytree.search.findall(document, filter_=lambda n: n.name.startswith("head"))
for heading in headings:
para = heading.children[0]
@@ -185,7 +184,7 @@ def _normalize_headings(document):
return document
-def _normalize_registers(document):
+def normalize_registers(document: Node) -> Node:
bits_list = []
sections = anytree.search.findall(document, filter_=lambda n: n.name == "section")
for section in (sections + (document,)):
@@ -195,7 +194,7 @@ def _normalize_registers(document):
if child.name == "bit":
# Insert a new bits group node and redirect all children to it
if bits is None or bits._page != child._page:
- bits = anytree.Node("table", xpos=child.xpos, obj=None,
+ bits = Node("table", xpos=child.xpos, obj=None,
_type="bits", _width=1, _page=child._page)
new_children.append(bits)
bits_list.append(bits)
@@ -229,7 +228,7 @@ def _normalize_registers(document):
return document
-def _normalize_tables(document):
+def normalize_tables(document: Node) -> Node:
content_tables = defaultdict(list)
register_tables = []
bits_tables = []
@@ -298,7 +297,7 @@ def _push():
return document
-def _normalize_chapters(document) -> list:
+def normalize_chapters(document: Node) -> Node:
headings = anytree.search.findall(document, filter_=lambda n: n.name in ["head1", "head2"], maxlevel=3)
idxs = [document.children.index(h.parent) for h in headings] + [len(document.children)]
if idxs[0] != 0:
@@ -321,300 +320,8 @@ def _normalize_chapters(document) -> list:
chapters.append( (chapter_name, filename, document.children[idx0:idx1 + 1]) )
for title, filename, nodes in chapters:
- chapter = anytree.Node("chapter", title=title, _filename=filename, parent=document)
+ chapter = Node("chapter", title=title, _filename=filename, parent=document)
for node in nodes:
node.parent = chapter
return document
-
-
-def normalize_document(document):
- def _debug(func, indata, debug=0):
- print(func.__name__[1:])
- if debug == -1:
- print(RenderTree(indata))
- print()
- outdata = func(indata)
- if debug == 1:
- print(RenderTree(outdata))
- print()
- return outdata
-
- document = _debug(_normalize_lines, document)
- document = _debug(_normalize_captions, document)
- document = _debug(_normalize_lists, document)
- document = _debug(_normalize_paragraphs, document)
- document = _debug(_normalize_headings, document)
- document = _debug(_normalize_registers, document)
- document = _debug(_normalize_tables, document)
- # document = _debug(_normalize_chapters, document)
- return document
-
-
-def _format_html_figure(xmlnode, figurenode):
- tnode = etree.Element("table")
- tnode.set("width", f"{int(figurenode._width * 50)}%")
- xmlnode.append(tnode)
-
- captionnode = next((c for c in figurenode.children if c.name == "caption"), None)
- if captionnode is not None:
- tnode.set("id", f"figure{captionnode.number}")
- caption = etree.Element("caption")
- tnode.append(caption)
- _format_html(caption, captionnode, with_newlines=True)
-
- ynode = etree.Element("tr")
- tnode.append(ynode)
-
- xynode = etree.Element("td")
- ynode.append(xynode)
- xynode.text = "(omitted)"
-
-
-def _format_html_table(xmlnode, tablenode):
- tnode = etree.Element("table")
- xmlnode.append(tnode)
- # Format the caption
- captionnode = next((c for c in tablenode.children if c.name == "caption"), None)
- if captionnode is not None:
- tnode.set("id", f"table{captionnode.number}")
- caption = etree.Element("caption")
- tnode.append(caption)
- _format_html(caption, captionnode, with_newlines=True)
- if tablenode.obj._type == "register":
- tnode.set("class", "rt")
- if tablenode.obj._type == "bitfield":
- tnode.set("class", "bt")
-
- # Cells are ordered (y, x) positions
- ypos = -1
- ynode = None
- header_rows = tablenode.obj.header_rows
- for cell in tablenode.obj.cells:
- # Add another row to the table
- if ypos != cell.y or ynode is None:
- ypos = cell.y
- ynode = etree.Element("tr")
- tnode.append(ynode)
-
- # Add the right cell with spans and style
- xynodespan = xynode = etree.Element("th" if cell.is_header else "td")
- ynode.append(xynode)
- if cell.xspan > 1:
- xynode.set("colspan", str(cell.xspan))
- if cell.yspan > 1:
- xynode.set("rowspan", str(cell.yspan))
- if not cell.rotation and tablenode.obj._type != "register" and cell.left_aligned:
- xynode.set("class", "tl")
- if cell.rotation:
- xynodespan = etree.Element("span")
- xynodespan.set("class", "tv")
- xynode.append(xynodespan)
- if (cell.y + cell.yspan) == header_rows:
- if cl := xynode.get("class"):
- xynode.set("class", "thb " + cl)
- else:
- xynode.set("class", "thb")
-
- if cell._is_simple:
- xynodespan.text = cell.content.strip()
- else:
- cell_doc = anytree.Node("document", _page=cell.ast.page)
- cell.ast.parent = cell_doc
- cell_doc = _normalize_lines(cell_doc)
- cell_doc = _normalize_lists(cell_doc)
- cell_doc = _normalize_paragraphs(cell_doc)
- # print(RenderTree(cell_doc))
- _format_html(xynodespan, cell_doc, with_newlines=True,
- ignore_formatting={"bold"} if cell.is_header else None)
-
-
-def _format_char(node, state, chars, ignore):
- NOFMT = {
- "superscript": False,
- "subscript": False,
- "italic": False,
- "bold": False,
- "underline": False,
- }
- if state is None: state = NOFMT
- char = chars[0]
- if char["char"] in {'\r'}:
- return (True, node, state)
-
- # print(node, state, char["char"])
- diffs = {}
- for key in NOFMT:
- if state[key] != char[key] and key not in ignore:
- diffs[key] = char[key]
- # if diffs: print(diffs)
- if not diffs:
- prev_name = node.children[-1].name if node.children else None
- # print(node)
- if prev_name != "newline" and char["char"] == '\n':
- # if not (prev_name == "chars" and node.children[-1].chars[-1] == " "):
- anytree.Node("newline", parent=node)
- elif prev_name != "chars":
- anytree.Node("chars", parent=node, chars=char["char"])
- else:
- node.children[-1].chars += char["char"]
- return (True, node, state)
- else:
- disable = [key for key, value in diffs.items() if not value]
- if disable:
- state[node.name] = False
- return (False, node.parent, state)
- else:
- enable = [key for key, value in diffs.items() if value][0]
- fmtnode = anytree.Node(enable, parent=node)
- state[enable] = True
- return (False, fmtnode, state)
-
-
-def _format_lines(textnode, ignore, with_newlines, with_start):
- char_props = textnode.root._page._char_properties
- formatn = anytree.Node("format")
- chars = []
- for line in textnode.children:
- if line.name == "line":
- for char in line.obj.chars[0 if with_start else line.start:]:
- if not with_newlines and char.unicode in {0xa, 0xd}:
- continue
- chars.append(char_props(line.obj, char))
- if with_newlines and chars[-1]["char"] not in {'\n'}:
- char = char_props(line.obj, line.obj.chars[-1])
- char["char"] = '\n'
- chars.append(char)
-
- chars = list_strip(chars, lambda c: c["char"] in {' ', '\n'})
- state = None
- node = formatn
- while chars:
- popchar, node, state = _format_char(node, state, chars, ignore)
- if popchar: chars.pop(0)
- return formatn
-
-
-def _format_html_fmt(xmlnode, treenode, tail=False):
- CONV = {
- "superscript": "sup",
- "subscript": "sub",
- "italic": "i",
- "bold": "b",
- "underline": "u",
- "newline": "br",
- }
- # print(xmlnode, treenode)
- if treenode.name == "chars":
- # print(f"{'tail' if tail else 'text'} char={treenode.chars}")
- if tail:
- xmlnode.tail = (xmlnode.tail or "") + treenode.chars
- else:
- xmlnode.text = (xmlnode.text or "") + treenode.chars
- return (tail, xmlnode)
- else:
- # print(f"sub {treenode.name}")
- if tail: xmlnode = xmlnode.getparent()
- subnode = etree.SubElement(xmlnode, CONV[treenode.name])
- tail = False
- iternode = subnode
- for child in treenode.children:
- tail, iternode = _format_html_fmt(iternode, child, tail)
- return (True, subnode)
-
-
-def _format_html_text(xmlnode, treenode, ignore=None, with_newlines=False, with_start=True):
- fmttree = _format_lines(treenode, ignore or set(), with_newlines, with_start)
- tail = False
- fmtnode = xmlnode
- for child in fmttree.children:
- tail, fmtnode = _format_html_fmt(fmtnode, child, tail)
-
- # print(RenderTree(fmttree))
- # print(etree.tostring(xmlnode, pretty_print=True).decode("utf-8"))
-
-
-def _format_html(xmlnode, treenode, ignore_formatting=None,
- with_newlines=False, with_start=True):
- if ignore_formatting is None:
- ignore_formatting = set()
- # print(xmlnode, treenode.name)
- current = xmlnode
- if treenode.name.startswith("head"):
- current = etree.Element(f"h{treenode.name[4]}")
- if treenode.marker:
- current.set("id", f"section{treenode.marker}")
- xmlnode.append(current)
- ignore_formatting = ignore_formatting | {"bold", "italic", "underline"}
-
- elif treenode.name in {"para"}:
- current = etree.Element("p")
- xmlnode.append(current)
-
- elif treenode.name in {"note"}:
- current = etree.Element("div")
- current.set("class", "nt")
- xmlnode.append(current)
-
- elif treenode.name == "text":
- _format_html_text(xmlnode, treenode, ignore_formatting, with_newlines, with_start)
-
- elif treenode.name == "page":
- if not current.get("id"):
- current.set("id", f"page{treenode.number}")
- print(f"{treenode.number}.", end="", flush=True)
- return
-
- elif treenode.name == "table":
- _format_html_table(xmlnode, treenode)
- return
-
- elif treenode.name == "figure":
- _format_html_figure(xmlnode, treenode)
- return
-
- elif treenode.name == "bits":
- _format_html_bits(xmlnode, treenode)
- return
-
- elif treenode.name.startswith("list"):
- if treenode.name[4] in {"b", "s"}:
- current = etree.Element("ul")
- else:
- current = etree.Element("ol")
- xmlnode.append(current)
-
- elif treenode.name == "element":
- current = etree.Element("li")
- if xmlnode.tag == "ol":
- current.set("value", str(treenode.value))
- xmlnode.append(current)
- with_start = False
-
- for child in treenode.children:
- _format_html(current, child, ignore_formatting, with_newlines, with_start)
-
-
-def format_document(document):
- html = etree.Element("html")
-
- head = etree.Element("head")
- html.append(head)
-
- link = etree.Element("link")
- link.set("rel", "stylesheet")
- link.set("href", "../style.css")
- head.append(link)
-
- body = etree.Element("body")
- html.append(body)
-
- _format_html(body, document, with_newlines=True)
-
- html = etree.ElementTree(html)
- return html
-
-
-def write_html(html, path, pretty=True):
- with open(path, "wb") as f:
- html.write(f, pretty_print=pretty, doctype="")
diff --git a/src/modm_data/pdf2html/cell.py b/src/modm_data/pdf2html/cell.py
new file mode 100644
index 0000000..2c051eb
--- /dev/null
+++ b/src/modm_data/pdf2html/cell.py
@@ -0,0 +1,125 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+from functools import cached_property
+from anytree import Node
+from ..utils import Rectangle
+from .line import CharLine
+
+
+class TableCell:
+ class Borders:
+ """The four borders of a Cell"""
+ def __init__(self, l, b, r, t):
+ self.l = l
+ self.b = b
+ self.r = r
+ self.t = t
+
+ def __init__(self, table, position, bbox, borders, is_simple=False):
+ self._table = table
+ self._bboxes = [bbox]
+ self.b = borders
+ """Borders of the cell"""
+ self.positions = [position]
+ """Index positions of the cell"""
+ self.is_header = False
+ """Is this cell a header?"""
+ self._is_simple = is_simple
+
+ def _merge(self, other):
+ self.positions.extend(other.positions)
+ self.positions.sort()
+ self._bboxes.append(other.bbox)
+ self._invalidate()
+
+ def _move(self, x, y):
+ self.positions = [(py + y, px + x) for (py, px) in self.positions]
+ self.positions.sort()
+ self._invalidate()
+
+ def _expand(self, dx, dy):
+ ymax, xmax = self.positions[-1]
+ for yi in range(ymax, ymax + dy + 1):
+ for xi in range(xmax, xmax + dx + 1):
+ self.positions.append((yi, xi))
+ self.positions.sort()
+ self._invalidate()
+
+ def _invalidate(self):
+ for key, value in self.__class__.__dict__.items():
+ if isinstance(value, cached_property):
+ self.__dict__.pop(key, None)
+
+ @cached_property
+ def x(self) -> int:
+ """The horizontal position of the cell."""
+ return self.positions[0][1]
+
+ @cached_property
+ def y(self) -> int:
+ """The vertical position of the cell."""
+ return self.positions[0][0]
+
+ @cached_property
+ def xspan(self) -> int:
+ """The horizontal span of the cell."""
+ return self.positions[-1][1] - self.positions[0][1] + 1
+
+ @cached_property
+ def yspan(self) -> int:
+ """The vertical span of the cell."""
+ return self.positions[-1][0] - self.positions[0][0] + 1
+
+ @cached_property
+ def rotation(self) -> int:
+ """The rotation of the cell text."""
+ if not self.lines: return 0
+ return self.lines[0].rotation
+
+ @cached_property
+ def bbox(self) -> Rectangle:
+ """The tight bounding box of this cell."""
+ return Rectangle(min(bbox.left for bbox in self._bboxes),
+ min(bbox.bottom for bbox in self._bboxes),
+ max(bbox.right for bbox in self._bboxes),
+ max(bbox.top for bbox in self._bboxes))
+
+ @cached_property
+ def lines(self) -> list[CharLine]:
+ """The character lines in this cell."""
+ return self._table._page.charlines_in_area(self.bbox)
+
+ @cached_property
+ def content(self):
+ """The concatenated text content of the table cell."""
+ return "".join(c.char for line in self.lines for c in line.chars)
+
+ @cached_property
+ def is_left_aligned(self) -> bool:
+ """Is the text in the cell left aligned?"""
+ x_em = self._table._page._spacing["x_em"]
+ for line in self.lines:
+ if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
+ return True
+ return False
+
+ @cached_property
+ def ast(self) -> Node:
+ """The abstract syntax tree of the cell without graphics."""
+ ast = self._table._page.ast_in_area(self.bbox, with_graphics=False,
+ ignore_xpos=not self.is_left_aligned,
+ with_bits=False, with_notes=False)
+ ast.name = "cell"
+ return ast
+
+ def __repr__(self) -> str:
+ positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
+ borders = ""
+ if self.b.l: borders += "["
+ if self.b.b: borders += "_"
+ if self.b.t: borders += "^"
+ if self.b.r: borders += "]"
+ start = "CellH" if self.is_header else "Cell"
+ return start + f"[{positions}] {borders}"
+
diff --git a/src/modm_data/pdf2html/stmicro/convert.py b/src/modm_data/pdf2html/convert.py
similarity index 81%
rename from src/modm_data/pdf2html/stmicro/convert.py
rename to src/modm_data/pdf2html/convert.py
index 1f5ed3b..62504f7 100644
--- a/src/modm_data/pdf2html/stmicro/convert.py
+++ b/src/modm_data/pdf2html/convert.py
@@ -3,10 +3,11 @@
from anytree import RenderTree
-from .ast import merge_area, normalize_document
-from .ast import format_document, write_html
-from ..render import render_page_pdf
-from ...utils import pkg_apply_patch, pkg_file_exists
+from .html import format_document, write_html
+from .render import render_page_pdf
+from ..utils import pkg_apply_patch, pkg_file_exists
+from .ast import merge_area
+from pathlib import Path
import pypdfium2 as pp
import subprocess
@@ -19,7 +20,7 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True,
debug_doc = None
debug_index = 0
for page in doc.pages(page_range):
- if not render_all and any(c in page.top for c in {"Contents", "List of ", "Index"}):
+ if not render_all and not page.is_relevant:
continue
print(f"\n\n=== {page.top} #{page.number} ===\n")
@@ -50,7 +51,7 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True,
print("No pages parsed, empty document!")
return True
- document = normalize_document(document)
+ document = doc._normalize(document)
if show_tree:
print(RenderTree(document))
@@ -72,15 +73,14 @@ def convert(doc, page_range, output_path, format_chapters=False, pretty=True,
return True
-def patch(doc, output_path, patch_file=None) -> bool:
+def patch(doc, data_module, output_path: Path, patch_file: Path = None) -> bool:
if patch_file is None:
- from . import data
# First try the patch file for the specific version
patch_file = f"{doc.name}.patch"
- if not pkg_file_exists(data, patch_file):
+ if not pkg_file_exists(data_module, patch_file):
# Then try the patch file shared between versions
patch_file = f"{doc.name.split('-')[0]}.patch"
- if not pkg_file_exists(data, patch_file):
+ if not pkg_file_exists(data_module, patch_file):
return True
- return pkg_apply_patch(data, patch_file, output_path)
+ return pkg_apply_patch(data_module, patch_file, output_path)
return apply_patch(patch_file, output_path)
diff --git a/src/modm_data/pdf2html/html.py b/src/modm_data/pdf2html/html.py
new file mode 100644
index 0000000..8db89a8
--- /dev/null
+++ b/src/modm_data/pdf2html/html.py
@@ -0,0 +1,279 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+import logging
+from lxml import etree
+import anytree
+from anytree import RenderTree
+from ..utils import list_strip
+from .ast import normalize_lines, normalize_lists, normalize_paragraphs
+
+_LOGGER = logging.getLogger(__name__)
+
+def _format_html_figure(xmlnode, figurenode):
+ tnode = etree.Element("table")
+ tnode.set("width", f"{int(figurenode._width * 50)}%")
+ xmlnode.append(tnode)
+
+ captionnode = next((c for c in figurenode.children if c.name == "caption"), None)
+ if captionnode is not None:
+ tnode.set("id", f"figure{captionnode.number}")
+ caption = etree.Element("caption")
+ tnode.append(caption)
+ _format_html(caption, captionnode, with_newlines=True)
+
+ ynode = etree.Element("tr")
+ tnode.append(ynode)
+
+ xynode = etree.Element("td")
+ ynode.append(xynode)
+ xynode.text = "(omitted)"
+
+
+def _format_html_table(xmlnode, tablenode):
+ tnode = etree.Element("table")
+ xmlnode.append(tnode)
+ # Format the caption
+ captionnode = next((c for c in tablenode.children if c.name == "caption"), None)
+ if captionnode is not None:
+ tnode.set("id", f"table{captionnode.number}")
+ caption = etree.Element("caption")
+ tnode.append(caption)
+ _format_html(caption, captionnode, with_newlines=True)
+ if tablenode.obj._type == "register":
+ tnode.set("class", "rt")
+ if tablenode.obj._type == "bitfield":
+ tnode.set("class", "bt")
+
+ # Cells are ordered (y, x) positions
+ ypos = -1
+ ynode = None
+ header_rows = tablenode.obj.header_rows
+ for cell in tablenode.obj.cells:
+ # Add another row to the table
+ if ypos != cell.y or ynode is None:
+ ypos = cell.y
+ ynode = etree.Element("tr")
+ tnode.append(ynode)
+
+ # Add the right cell with spans and style
+ xynodespan = xynode = etree.Element("th" if cell.is_header else "td")
+ ynode.append(xynode)
+ if cell.xspan > 1:
+ xynode.set("colspan", str(cell.xspan))
+ if cell.yspan > 1:
+ xynode.set("rowspan", str(cell.yspan))
+ if not cell.rotation and tablenode.obj._type != "register" and cell.is_left_aligned:
+ xynode.set("class", "tl")
+ if cell.rotation:
+ xynodespan = etree.Element("span")
+ xynodespan.set("class", "tv")
+ xynode.append(xynodespan)
+ if (cell.y + cell.yspan) == header_rows:
+ if cl := xynode.get("class"):
+ xynode.set("class", "thb " + cl)
+ else:
+ xynode.set("class", "thb")
+
+ if cell._is_simple:
+ xynodespan.text = cell.content.strip()
+ else:
+ cell_doc = anytree.Node("document", _page=cell.ast.page)
+ cell.ast.parent = cell_doc
+ cell_doc = normalize_lines(cell_doc)
+ cell_doc = normalize_lists(cell_doc)
+ cell_doc = normalize_paragraphs(cell_doc)
+ # _LOGGER.debug(RenderTree(cell_doc))
+ _format_html(xynodespan, cell_doc, with_newlines=True,
+ ignore_formatting={"bold"} if cell.is_header else None)
+
+
+def _format_char(node, state, chars, ignore):
+ NOFMT = {
+ "superscript": False,
+ "subscript": False,
+ "italic": False,
+ "bold": False,
+ "underline": False,
+ }
+ if state is None: state = NOFMT
+ char = chars[0]
+ if char["char"] in {'\r'}:
+ return (True, node, state)
+
+ # print(node, state, char["char"])
+ diffs = {}
+ for key in NOFMT:
+ if state[key] != char[key] and key not in ignore:
+ diffs[key] = char[key]
+ # if diffs: print(diffs)
+ if not diffs:
+ prev_name = node.children[-1].name if node.children else None
+ # print(node)
+ if prev_name != "newline" and char["char"] == '\n':
+ # if not (prev_name == "chars" and node.children[-1].chars[-1] == " "):
+ anytree.Node("newline", parent=node)
+ elif prev_name != "chars":
+ anytree.Node("chars", parent=node, chars=char["char"])
+ else:
+ node.children[-1].chars += char["char"]
+ return (True, node, state)
+ else:
+ disable = [key for key, value in diffs.items() if not value]
+ if disable:
+ state[node.name] = False
+ return (False, node.parent, state)
+ else:
+ enable = [key for key, value in diffs.items() if value][0]
+ fmtnode = anytree.Node(enable, parent=node)
+ state[enable] = True
+ return (False, fmtnode, state)
+
+
+def _format_lines(textnode, ignore, with_newlines, with_start):
+ char_props = textnode.root._page._char_properties
+ formatn = anytree.Node("format")
+ chars = []
+ for line in textnode.children:
+ if line.name == "line":
+ for char in line.obj.chars[0 if with_start else line.start:]:
+ if not with_newlines and char.unicode in {0xa, 0xd}:
+ continue
+ chars.append(char_props(line.obj, char))
+ if with_newlines and chars[-1]["char"] not in {'\n'}:
+ char = char_props(line.obj, line.obj.chars[-1])
+ char["char"] = '\n'
+ chars.append(char)
+
+ chars = list_strip(chars, lambda c: c["char"] in {' ', '\n'})
+ state = None
+ node = formatn
+ while chars:
+ popchar, node, state = _format_char(node, state, chars, ignore)
+ if popchar: chars.pop(0)
+ return formatn
+
+
+def _format_html_fmt(xmlnode, treenode, tail=False):
+ CONV = {
+ "superscript": "sup",
+ "subscript": "sub",
+ "italic": "i",
+ "bold": "b",
+ "underline": "u",
+ "newline": "br",
+ }
+ # print(xmlnode, treenode)
+ if treenode.name == "chars":
+ # print(f"{'tail' if tail else 'text'} char={treenode.chars}")
+ if tail:
+ xmlnode.tail = (xmlnode.tail or "") + treenode.chars
+ else:
+ xmlnode.text = (xmlnode.text or "") + treenode.chars
+ return (tail, xmlnode)
+ else:
+ # print(f"sub {treenode.name}")
+ if tail: xmlnode = xmlnode.getparent()
+ subnode = etree.SubElement(xmlnode, CONV[treenode.name])
+ tail = False
+ iternode = subnode
+ for child in treenode.children:
+ tail, iternode = _format_html_fmt(iternode, child, tail)
+ return (True, subnode)
+
+
+def _format_html_text(xmlnode, treenode, ignore=None, with_newlines=False, with_start=True):
+ fmttree = _format_lines(treenode, ignore or set(), with_newlines, with_start)
+ tail = False
+ fmtnode = xmlnode
+ for child in fmttree.children:
+ tail, fmtnode = _format_html_fmt(fmtnode, child, tail)
+
+ # print(RenderTree(fmttree))
+ # print(etree.tostring(xmlnode, pretty_print=True).decode("utf-8"))
+
+
+def _format_html(xmlnode, treenode, ignore_formatting=None,
+ with_newlines=False, with_start=True):
+ if ignore_formatting is None:
+ ignore_formatting = set()
+ # print(xmlnode, treenode.name)
+ current = xmlnode
+ if treenode.name.startswith("head"):
+ current = etree.Element(f"h{treenode.name[4]}")
+ if treenode.marker:
+ current.set("id", f"section{treenode.marker}")
+ xmlnode.append(current)
+ ignore_formatting = ignore_formatting | {"bold", "italic", "underline"}
+
+ elif treenode.name in {"para"}:
+ current = etree.Element("p")
+ xmlnode.append(current)
+
+ elif treenode.name in {"note"}:
+ current = etree.Element("div")
+ current.set("class", "nt")
+ xmlnode.append(current)
+
+ elif treenode.name == "text":
+ _format_html_text(xmlnode, treenode, ignore_formatting, with_newlines, with_start)
+
+ elif treenode.name == "page":
+ if not current.get("id"):
+ current.set("id", f"page{treenode.number}")
+ print(f"{treenode.number}.", end="", flush=True)
+ return
+
+ elif treenode.name == "table":
+ _format_html_table(xmlnode, treenode)
+ return
+
+ elif treenode.name == "figure":
+ _format_html_figure(xmlnode, treenode)
+ return
+
+ elif treenode.name == "bits":
+ _format_html_bits(xmlnode, treenode)
+ return
+
+ elif treenode.name.startswith("list"):
+ if treenode.name[4] in {"b", "s"}:
+ current = etree.Element("ul")
+ else:
+ current = etree.Element("ol")
+ xmlnode.append(current)
+
+ elif treenode.name == "element":
+ current = etree.Element("li")
+ if xmlnode.tag == "ol":
+ current.set("value", str(treenode.value))
+ xmlnode.append(current)
+ with_start = False
+
+ for child in treenode.children:
+ _format_html(current, child, ignore_formatting, with_newlines, with_start)
+
+
+def format_document(document):
+ html = etree.Element("html")
+
+ head = etree.Element("head")
+ html.append(head)
+
+ link = etree.Element("link")
+ link.set("rel", "stylesheet")
+ link.set("href", "../style.css")
+ head.append(link)
+
+ body = etree.Element("body")
+ html.append(body)
+
+ _format_html(body, document, with_newlines=True)
+
+ html = etree.ElementTree(html)
+ return html
+
+
+def write_html(html, path, pretty=True):
+ with open(path, "wb") as f:
+ html.write(f, pretty_print=pretty, doctype="")
diff --git a/src/modm_data/pdf2html/line.py b/src/modm_data/pdf2html/line.py
index 5b0eb88..31d6e0e 100644
--- a/src/modm_data/pdf2html/line.py
+++ b/src/modm_data/pdf2html/line.py
@@ -3,6 +3,7 @@
from functools import cached_property
from ..utils import Rectangle
+from ..pdf import Character
class CharCluster:
@@ -12,7 +13,7 @@ class CharCluster:
character stream of the PDF page.
"""
- def __init__(self, line, chars: list):
+ def __init__(self, line: "CharLine", chars: list[Character]):
self._line = line
self.chars = chars
@@ -49,16 +50,19 @@ def __init__(self, page, chars: list, bottom: float,
@cached_property
def bbox(self) -> Rectangle:
+ """Bounding box of the character line"""
return Rectangle(min(c.bbox.left for c in self.chars),
min(c.bbox.bottom for c in self.chars),
max(c.bbox.right for c in self.chars),
max(c.bbox.top for c in self.chars))
@cached_property
- def fonts(self) -> set:
+ def fonts(self) -> set[str]:
+ """All font names in this character line"""
return set(c.font for c in self.chars if c.font)
- def contains_font(self, *fragments) -> bool:
+ def contains_font(self, *fragments: str) -> bool:
+ """:return: True if any fragment is part of the font names"""
for fragment in fragments:
if any(fragment in font for font in self.fonts):
return True
@@ -66,22 +70,23 @@ def contains_font(self, *fragments) -> bool:
@cached_property
def content(self) -> str:
+ """Text contained in the character line"""
return "".join(c.char for c in self.chars)
- def clusters(self, atol: float = None) -> list[CharCluster]:
- # Find clusters of characters in a line incl. whitespace chars
+ def clusters(self, absolute_tolerance: float = None) -> list[CharCluster]:
+ """Find clusters of characters in a line separated by `absolute_tolerance`."""
def _cluster(clusters, chars):
if chars:
clusters.append(CharCluster(self, chars))
# We want to group the chars if the space between them is > 1em
- if atol is None:
- atol = self._page._spacing["x_em"] * 1
+ if absolute_tolerance is None:
+ absolute_tolerance = self._page._spacing["x_em"] * 1
clusters = []
current_chars = [self.chars[0]]
last_char = current_chars[0]
for next_char in self.chars[1:]:
- if next_char.bbox.left - last_char.bbox.right < atol:
+ if next_char.bbox.left - last_char.bbox.right < absolute_tolerance:
# Keep this char in the current cluster
current_chars.append(next_char)
if next_char.unicode not in {0x20, 0xa, 0xd}:
diff --git a/src/modm_data/pdf2html/page.py b/src/modm_data/pdf2html/page.py
new file mode 100644
index 0000000..33f687a
--- /dev/null
+++ b/src/modm_data/pdf2html/page.py
@@ -0,0 +1,380 @@
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
+
+import re
+import math
+import logging
+import textwrap
+import statistics
+from typing import Callable
+from functools import cached_property, cache, reduce
+from collections import defaultdict
+from .table import Table
+from .figure import Figure
+from .line import CharLine
+from ..utils import HLine, VLine, Rectangle, Region
+from ..pdf import Path, Image, Page as PdfPage, Character
+from anytree import Node
+
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class Page(PdfPage):
+    def __init__(self, document, index: int):
+        super().__init__(document, index)
+        self._template = "default"
+    is_relevant: bool = True  # class-level default: set per instance it would shadow subclass (cached_)properties
+    """Is this page relevant for the conversion?"""
+
+    def _unicode_filter(self, code: int) -> int | None:
+        return code  # keep every code point; charlines_in_area() skips the character if this is None
+
+ @cached_property
+ def _spacing(self) -> dict[str, float]:
+ content = 0.1
+ return {
+ # Horizontal spacing: left->right
+ "x_em": 0.01 * self.width,
+ "x_left": content * self.width,
+ "x_right": (1 - content) * self.width,
+ "x_content": 0.2 * self.width,
+ # Vertical spacing: bottom->top
+ "y_em": 0.01 * self.height,
+ # Max table line thickness
+ "y_tline": 0.005 * self.height,
+ # Max line height distance to detect paragraphs
+ "lh": 0.9,
+ # Max line height distance to detect super-/subscript
+ "sc": 0.3,
+ # Table header cell bold text threshold
+ "th": 0.3,
+ }
+
+    def _line_size(self, line: CharLine) -> str:
+        """Map the line height onto a size class: "h1" to "h4", "n" or "fn"."""
+        # Minimum line height of each size class, checked from largest to smallest
+        thresholds = ((17.5, "h1"), (15.5, "h2"), (13.5, "h3"), (11.4, "h4"), (8.5, "n"))
+        for minimum, name in thresholds:
+            if line.height >= minimum:
+                return name
+        return "fn"
+
+    def _colors(self, color: int) -> str:
+        """Name the color value; only 0xff and 0xffffffff are known here."""
+        known = {0xff: "black", 0xffffffff: "white"}
+        return known.get(color, "unknown")
+
+ @cached_property
+ def _areas(self) -> dict[str, list[Rectangle] | Rectangle]:
+ content = Rectangle(0.1, 0.1, 0.9, 0.9)
+ areas = {"content": [content]}
+ scaled_areas = {}
+ def _s(r):
+ return Rectangle(r.left * self.width, r.bottom * self.height,
+ r.right * self.width, r.top * self.height)
+ for name, area in areas.items():
+ scaled_areas[name] = [_s(r) for r in area] if isinstance(area, list) else _s(area)
+ return scaled_areas
+
+ def _char_properties(self, line, char):
+ cp = {
+ "superscript": False,
+ "subscript": False,
+ "bold": any(frag in char.font for frag in {"Bold"}),
+ "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
+ "underline": (char.objlink or char.weblink) is not None,
+ "size": round(line.height),
+ "relsize": self._line_size(line),
+ "char": chr(char.unicode),
+ }
+ if line.rotation:
+ if char.origin.x < (line.origin - 0.25 * line.height):
+ cp["superscript"] = True
+ elif char.origin.x > (line.origin + 0.15 * line.height):
+ cp["subscript"] = True
+ elif char.origin.y > (line.origin + 0.25 * line.height):
+ cp["superscript"] = True
+ elif char.origin.y < (line.origin - 0.15 * line.height):
+ cp["subscript"] = True
+ return cp
+
+    def text_in_named_area(self, name: str, check_length: bool = True) -> str | None:
+        """
+        Concatenate the text of every rectangle stored under the area name.
+
+        :param name: key of the area(s) in `self._areas`.
+        :param check_length: if set, assert that the resulting text is not empty.
+        :return: the joined text of the area(s), or `None` if the name is unknown.
+        """
+        if name not in self._areas:
+            return None
+        found = self._areas[name]
+        rectangles = found if isinstance(found, list) else [found]
+        text = "".join(self.text_in_area(rect) for rect in rectangles)
+        if check_length: assert text
+        return text
+
+    def charlines_in_area(self, area: Rectangle,
+                          predicate: Callable[[Character], bool] | None = None,
+                          rtol: float | None = None) -> list[CharLine]:
+        """
+        Coalesce the characters in the area and predicate into lines.
+
+        1. Every character in the area is filtered by the `predicate`.
+        2. Character orientation is split into horizontal (left->right) and
+           vertical (bottom->top) character lines sorted by x or y position.
+           Lines containing only whitespace are discarded.
+        3. Overlapping character lines are merged into sub- and superscript
+           using `rtol * max(current_line.height, next_line.height)` as the
+           tolerance for checking if the lines overlap.
+        4. The characters in the merged lines are re-sorted by origin.
+
+        :param area: Area to search for characters.
+        :param predicate: Function to discard characters in the area or include all by default.
+        :param rtol: Relative tolerance to separate lines vertically or use `sc` spacing by default.
+        :return: A list of character lines sorted by x or y position.
+        """
+        if rtol is None: rtol = self._spacing["sc"]
+        # Split all chars into lines based on rounded origin
+        origin_lines_y = defaultdict(list)
+        origin_lines_x = defaultdict(list)
+        for char in self.chars_in_area(area):
+            # Ignore all characters we don't want
+            if predicate is not None and not predicate(char):
+                continue
+            cunicode = self._unicode_filter(char.unicode)
+            if cunicode is None: continue
+            char.unicode = cunicode
+            if char.unicode < 32 and char.unicode not in {0xa}:
+                continue
+            # Report (but keep) characters without width that are not spaces
+            if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
+                _LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
+            # Split up the chars depending on the orientation
+            if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
+                origin_lines_x[round(char.origin.x, 1)].append(char)
+            elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
+                origin_lines_y[round(char.origin.y, 1)].append(char)
+            else:
+                _LOGGER.error(f"Unknown char rotation: {char} {char.rotation}")
+
+        # Convert characters into lines
+        bbox_lines_y = []
+        for chars in origin_lines_y.values():
+            # Remove lines with whitespace only
+            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
+                continue
+            origin = statistics.fmean(c.origin.y for c in chars)
+            line = CharLine(self, chars,
+                            min(c.bbox.bottom for c in chars),
+                            origin,
+                            max(c.bbox.top for c in chars),
+                            max(c.height for c in chars),
+                            sort_origin=self.height - origin)
+            bbox_lines_y.append(line)
+            # print(line, line.top, line.origin, line.bottom, line.height)
+        bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)
+
+        bbox_lines_x = []
+        for chars in origin_lines_x.values():
+            # Remove lines with whitespace only
+            if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
+                continue
+            line = CharLine(self, chars,
+                            min(c.bbox.left for c in chars),
+                            statistics.fmean(c.origin.x for c in chars),
+                            max(c.bbox.right for c in chars),
+                            max(c.width for c in chars),
+                            270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
+            bbox_lines_x.append(line)
+        bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)
+
+        if not bbox_lines:
+            return []
+
+        # Merge lines that have overlapping bbox_lines
+        # FIXME: This merges lines that "collide" vertically like in formulas
+        merged_lines = []
+        current_line = bbox_lines[0]
+        for next_line in bbox_lines[1:]:
+            height = max(current_line.height, next_line.height)
+            # Calculate overlap via normalize origin (increasing with line index)
+            if ((current_line._sort_origin + rtol * height) >
+                    (next_line._sort_origin - rtol * height)):
+                # if line.rotation or self.rotation:
+                #     # The next line overlaps this one, we merge the shorter line
+                #     # (typically super- and subscript) into taller line
+                #     use_current = len(current_line.chars) >= len(next_line.chars)
+                # else:
+                use_current = current_line.height >= next_line.height
+                line = current_line if use_current else next_line
+                current_line = CharLine(self, current_line.chars + next_line.chars,
+                                        line.bottom, line.origin, line.top,
+                                        height, line.rotation,
+                                        sort_origin=line._sort_origin)
+            else:
+                # The next line does not overlap the current line
+                merged_lines.append(current_line)
+                current_line = next_line
+        # append last line
+        merged_lines.append(current_line)
+
+        # Sort all lines horizontally based on character origin
+        sorted_lines = []
+        for line in merged_lines:
+            if line.rotation == 90:
+                def sort_key(char):
+                    if char.unicode in {0xa, 0xd}:
+                        return char.tbbox.midpoint.y - 1e9
+                    return char.tbbox.midpoint.y
+            elif line.rotation == 270:
+                def sort_key(char):
+                    if char.unicode in {0xa, 0xd}:
+                        return -char.tbbox.midpoint.y + 1e9
+                    return -char.tbbox.midpoint.y
+            else:
+                def sort_key(char):
+                    if char.unicode in {0xa, 0xd}:
+                        return char.origin.x + 1e9
+                    return char.origin.x
+            sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
+                                         line.bottom, line.origin,
+                                         line.top, line.height,
+                                         line.rotation, area.left,
+                                         sort_origin=line._sort_origin))
+
+        return sorted_lines
+
+ def graphic_bboxes_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[tuple[Rectangle, Table | Figure | None]]:
+ """
+ Coalesce the graphics in the area into full width bounding boxes.
+
+ 1. Group vertically overlapping graphics.
+ 2. Widen the overlapped graphics bounding boxes to the edges of the area.
+
+ :param area: area to search for content.
+ :param with_graphics: search for graphics in the area.
+ :return: list of tuples (bounding box, graphic objects or `None`).
+ """
+ if with_graphics:
+ graphics = self.graphics_in_area(area)
+ regions = []
+ # Check if graphics bounding boxes overlap vertically and group them
+ for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
+ gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
+ for reg in regions:
+ if reg.overlaps(gbbox.bottom, gbbox.top):
+ # They overlap, so merge them
+ reg.v0 = min(reg.v0, gbbox.bottom)
+ reg.v1 = max(reg.v1, gbbox.top)
+ reg.objs.append(graphic)
+ break
+ else:
+ regions.append(Region(gbbox.bottom, gbbox.top, graphic))
+
+ # print(regions)
+ # Coalesce all overlapped graphics objects into full width areas
+ areas = []
+ ypos = area.top
+ for reg in regions:
+ if ypos - reg.v1 > self._spacing["y_em"]:
+ areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
+ for obj in reg.objs:
+ oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
+ areas.append((oarea, obj))
+ ypos = reg.v0
+ areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
+ else:
+ areas = [(area, None)]
+ return areas
+
+ def objects_in_area(self, area: Rectangle, with_graphics: bool = True) -> list[CharLine | Table | Figure]:
+ """
+ Find all content objects in this area.
+
+ :param area: area to search for content.
+ :param with_graphics: search for graphics in the area.
+ :return: list of content objects sorted top to bottom.
+ """
+ self._link_characters()
+ areas = self.graphic_bboxes_in_area(area, with_graphics)
+ objects = []
+ for narea, obj in areas:
+ if obj is None:
+ objects += self.charlines_in_area(narea)
+ else:
+ oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
+ predicate = lambda c: not obj.bbox.contains(c.origin)
+ lines = self.charlines_in_area(oarea, predicate)
+ # print(obj, oarea, lines, [line.content for line in lines])
+ objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
+ return objects
+
+ def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
+ """
+ Find all tables and figures in this area.
+
+ :param area: area to search for graphics.
+ :return: list of tables and figures.
+ """
+ return []
+
+ def ast_in_area(self, area: Rectangle, with_graphics: bool = True) -> Node:
+ """
+ Convert the area content into an abstract syntax tree.
+
+ :param area: area to search for content.
+ :param with_graphics: including graphics in the area.
+ :return: An abstract syntax tree including the content formatting.
+ """
+ return Node("area", obj=area, xpos=int(area.left), page=self)
+
+ @property
+ def content_ast(self) -> list[Node]:
+ """The abstract syntax trees in the content area."""
+ ast = []
+ with_graphics = True
+ for area in self._areas["content"]:
+ ast.append(self.ast_in_area(area, with_graphics=with_graphics))
+ # Add a page node to the first leaf to keep track of where a page starts
+ first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
+ Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
+ return ast
+
+    @property
+    def content_objects(self) -> list[CharLine | Table | Figure]:
+        """Objects of all content areas, concatenated in area order."""
+        return [
+            obj for area in self._areas["content"]
+            for obj in self.objects_in_area(area)
+        ]
+
+    @property
+    def content_graphics(self) -> list[Table | Figure]:
+        """Graphics of all content areas, concatenated in area order."""
+        return [
+            graphic for area in self._areas["content"]
+            for graphic in self.graphics_in_area(area)
+        ]
+
+    @property
+    def content_lines(self) -> list[CharLine]:
+        """Character lines of all content areas, concatenated in area order."""
+        return [
+            line for area in self._areas["content"]
+            for line in self.charlines_in_area(area)
+        ]
+
+ @property
+ def content_tables(self) -> list[Table]:
+ """All tables in the content areas."""
+ return [o for o in self.content_graphics if isinstance(o, Table)]
+
+ @property
+ def content_figures(self) -> list[Figure]:
+ """All figures in the content areas."""
+ return [o for o in self.content_graphics if isinstance(o, Figure)]
+
+ def __repr__(self) -> str:
+ return f"Page({self.number})"
diff --git a/src/modm_data/pdf2html/render.py b/src/modm_data/pdf2html/render.py
index 0717bd4..526eb61 100644
--- a/src/modm_data/pdf2html/render.py
+++ b/src/modm_data/pdf2html/render.py
@@ -11,7 +11,7 @@
def render_page_pdf(doc, page, new_doc = None, index = 0):
"""
- Test doc string
+
:param doc: PDF document
:param page: PDF page
diff --git a/src/modm_data/pdf2html/stmicro/__init__.py b/src/modm_data/pdf2html/stmicro/__init__.py
index 4adcde6..fd9ce58 100644
--- a/src/modm_data/pdf2html/stmicro/__init__.py
+++ b/src/modm_data/pdf2html/stmicro/__init__.py
@@ -1,7 +1,5 @@
# Copyright 2022, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0
-from .page import Page, is_compatible
-from .ast import normalize_document, merge_area, format_document, write_html
-from .convert import convert, patch
+
from .document import Document
diff --git a/src/modm_data/pdf2html/stmicro/__main__.py b/src/modm_data/pdf2html/stmicro/__main__.py
index 40d2ef0..208e2f6 100644
--- a/src/modm_data/pdf2html/stmicro/__main__.py
+++ b/src/modm_data/pdf2html/stmicro/__main__.py
@@ -3,15 +3,16 @@
import re
import tqdm
+import logging
import argparse
import subprocess
from pathlib import Path
from multiprocessing.pool import ThreadPool
-import modm_data
-from . import convert, patch
+from .. import convert, patch
def main():
+ import modm_data
parser = argparse.ArgumentParser()
parser.add_argument("--document", type=Path)
parser.add_argument("--output", type=str, default="")
@@ -25,12 +26,14 @@ def main():
parser.add_argument("--chapters", action="store_true")
parser.add_argument("--tags", action="store_true")
parser.add_argument("--all", action="store_true")
+ parser.add_argument("-v", dest="verbose", action="count", default=0)
args = parser.parse_args()
+ logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
doc = modm_data.pdf2html.stmicro.Document(args.document)
- # if doc.page_count == 0 or not doc.page(1).width:
- # print("Corrupt PDF!")
- # exit(1)
+ if doc.page_count == 0 or not doc.page(1).width:
+ print("Corrupt PDF!")
+ exit(1)
if args.page or args.range:
page_range = list(map(lambda p: p - 1, args.page or []))
@@ -79,7 +82,8 @@ def main():
for retval, call in zip(retvals, calls):
if retval.returncode != 0: print(call)
if all(r.returncode == 0 for r in retvals):
- return patch(doc, output_dir)
+ from . import data
+ return patch(doc, data, output_dir)
return False
return convert(doc, page_range, output_path, format_chapters=args.chapters,
diff --git a/src/modm_data/pdf2html/stmicro/document.py b/src/modm_data/pdf2html/stmicro/document.py
index fdecf9b..3931033 100644
--- a/src/modm_data/pdf2html/stmicro/document.py
+++ b/src/modm_data/pdf2html/stmicro/document.py
@@ -1,13 +1,44 @@
# Copyright 2023, Niklas Hauser
# SPDX-License-Identifier: MPL-2.0
+import logging
from .page import Page as StmPage
from ...pdf import Document as PdfDocument
+from ..ast import normalize_lines, normalize_captions, normalize_lists
+from ..ast import normalize_paragraphs, normalize_headings, normalize_registers
+from ..ast import normalize_tables, normalize_chapters
+
+_LOGGER = logging.getLogger(__name__)
+
+def _debug(func, indata, debug=0):
+    """Apply `func` to `indata`; log the rendered tree before (-1) or after (1) the call."""
+    from anytree import RenderTree  # local import: this module does not import it
+    _LOGGER.debug(func.__name__)
+    if debug == -1:
+        _LOGGER.debug("%s\n", RenderTree(indata))
+    outdata = func(indata)
+    if debug == 1:
+        _LOGGER.debug("%s\n", RenderTree(outdata))
+    return outdata
+
+
+def _normalize_document(document):
+    """Run the AST normalization passes over the document, one after the other."""
+    passes = (
+        normalize_lines, normalize_captions, normalize_lists,
+        normalize_paragraphs, normalize_headings, normalize_registers,
+        normalize_tables,  # normalize_chapters is currently left out
+    )
+    for normalize in passes:
+        document = _debug(normalize, document)
+    return document
+
class Document(PdfDocument):
def __init__(self, path: str):
super().__init__(path)
+ self._normalize = _normalize_document
def page(self, index: int) -> StmPage:
assert index < self.page_count
diff --git a/src/modm_data/pdf2html/stmicro/page.py b/src/modm_data/pdf2html/stmicro/page.py
index b56b56e..68cec3c 100644
--- a/src/modm_data/pdf2html/stmicro/page.py
+++ b/src/modm_data/pdf2html/stmicro/page.py
@@ -8,15 +8,17 @@
import statistics
from functools import cached_property, cache, reduce
from collections import defaultdict
-from .table import Table
+from ..table import Table
from ..figure import Figure
from ..line import CharLine
from ...utils import HLine, VLine, Rectangle, Region
from ...pdf import Path, Image, Page as PdfPage
+from ..page import Page as BasePage
from anytree import Node
-LOGGER = logging.getLogger(__name__)
+_LOGGER = logging.getLogger(__name__)
+
def is_compatible(document) -> bool:
if "stmicro" in document.metadata.get("Author", "").lower():
@@ -24,7 +26,7 @@ def is_compatible(document) -> bool:
return False
-def areas_black_white(page) -> dict:
+def _areas_black_white(page) -> dict:
def _scale(r):
if page.rotation:
return Rectangle(r.bottom * page.width, (1 - r.right) * page.height,
@@ -94,7 +96,7 @@ def _scale(r):
return scaled_areas
-def areas_blue_gray(page) -> dict:
+def _areas_blue_gray(page) -> dict:
def _scale(r):
return Rectangle(r.left * page.width, r.bottom * page.height,
r.right * page.width, r.top * page.height)
@@ -146,7 +148,7 @@ def _scale(r):
return scaled_areas
-def spacing_black_white(page) -> dict:
+def _spacing_black_white(page) -> dict:
content = 0.1125
spacing = {
# Horizontal spacing: left->right
@@ -177,10 +179,10 @@ def spacing_black_white(page) -> dict:
"lh": 1.2,
"sc": 0.4,
})
- return spacing
+ return spacing | _spacing_special(page)
-def spacing_blue_gray(page) -> dict:
+def _spacing_blue_gray(page) -> dict:
content = 0.07
spacing = {
# Horizontal spacing: left->right
@@ -210,10 +212,25 @@ def spacing_blue_gray(page) -> dict:
"lh": 1.6,
"sc": 0.2,
})
- return spacing
+ return spacing | _spacing_special(page)
+
+
+def _spacing_special(page) -> dict:
+ # Patches to detect the header cells correctly
+ if ((page.pdf.name == "DS12930-v1" and page.index in range(90, 106)) or
+ (page.pdf.name == "DS12931-v1" and page.index in range(89, 105))):
+ return {"th": 0.1}
+ if ((page.pdf.name == "RM0453-v2" and page.index in [1354]) or
+ (page.pdf.name == "RM0456-v2" and page.index in [2881]) or
+ (page.pdf.name == "RM0456-v3" and page.index in [2880]) or
+ (page.pdf.name == "RM0461-v4" and page.index in [1246])):
+ return {"th": 0.5}
+ if ((page.pdf.name == "RM0456-v2" and page.index in [3005])):
+ return {"th": 0.52}
+ return {}
-def linesize_black_white(line: float) -> str:
+def _linesize_black_white(line: CharLine) -> str:
rsize = line.height
if rsize >= 17.5: return "h1"
elif rsize >= 15.5: return "h2"
@@ -223,7 +240,7 @@ def linesize_black_white(line: float) -> str:
else: return "fn"
-def linesize_blue_gray(line: float) -> str:
+def _linesize_blue_gray(line: CharLine) -> str:
rsize = round(line.height)
if rsize >= 16: return "h1"
elif rsize >= 14: return "h2"
@@ -233,7 +250,7 @@ def linesize_blue_gray(line: float) -> str:
else: return "fn"
-def colors_black_white(color: int) -> str:
+def _colors_black_white(color: int) -> str:
if 0xff <= color <= 0xff:
return "black"
if 0xffffffff <= color <= 0xffffffff:
@@ -241,7 +258,7 @@ def colors_black_white(color: int) -> str:
return "unknown"
-def colors_blue_gray(color: int) -> str:
+def _colors_blue_gray(color: int) -> str:
if 0xff <= color <= 0xff:
return "black"
if 0xffffffff <= color <= 0xffffffff:
@@ -257,230 +274,53 @@ def colors_blue_gray(color: int) -> str:
return "unknown"
-class Page(PdfPage):
-
+class Page(BasePage):
def __init__(self, document, index: int):
super().__init__(document, index)
- self._template = "black_white"
producer = self.pdf.metadata.get("Producer", "").lower()
- if "acrobat" in producer:
- pass # default
+ self._template = "black_white"
+ if "acrobat" in producer or "adobe" in producer:
+ pass
elif "antenna" in producer:
self._template = "blue_gray"
else:
- LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
+ _LOGGER.error(f"Unknown page template! Defaulting to Black/White template. '{producer}'")
if "blue_gray" in self._template:
- self._areas = areas_blue_gray(self)
- self._spacing = spacing_blue_gray(self)
- self._colors = colors_blue_gray
- self._line_size = linesize_blue_gray
+ self._areas = _areas_blue_gray(self)
+ self._spacing = _spacing_blue_gray(self)
+ self._colors = _colors_blue_gray
+ self._line_size = _linesize_blue_gray
elif "black_white" in self._template:
- self._areas = areas_black_white(self)
- self._spacing = spacing_black_white(self)
- self._colors = colors_black_white
- self._line_size = linesize_black_white
-
- # Patches to detect the header cells correctly
- if ((self.pdf.name == "DS12930-v1" and self.index in range(90, 106)) or
- (self.pdf.name == "DS12931-v1" and self.index in range(89, 105))):
- self._spacing["th"] = 0.1
- if ((self.pdf.name == "RM0453-v2" and self.index in [1354]) or
- (self.pdf.name == "RM0456-v2" and self.index in [2881]) or
- (self.pdf.name == "RM0456-v3" and self.index in [2880]) or
- (self.pdf.name == "RM0461-v4" and self.index in [1246])):
- self._spacing["th"] = 0.5
- if ((self.pdf.name == "RM0456-v2" and self.index in [3005])):
- self._spacing["th"] = 0.52
-
- def _text_in_area(self, name, check_length=True) -> str:
- if name not in self._areas: return ""
- text = ""
- areas = self._areas[name]
- if not isinstance(areas, list): areas = [areas]
- for area in areas:
- text += self.text_in_area(area)
- if check_length: assert text
- return text
+ self._areas = _areas_black_white(self)
+ self._spacing = _spacing_black_white(self)
+ self._colors = _colors_black_white
+ self._line_size = _linesize_black_white
+
+    def _unicode_filter(self, code: int) -> int | None:
+        """Return the code point to use for `code`, or None to drop the character."""
+        # Ignore Carriage Return characters and ® (superscript issues)
+        if code in {0xd, ord("®")}: return None
+        # Correct some weird unicode stuffing choices
+        replacements = {2: ord("-"), 61623: ord("•"), 61664: ord("•")}
+        return replacements.get(code, code)
@cached_property
def identifier(self) -> str:
- return self._text_in_area("id", check_length=False)
+ return self.text_in_named_area("id", check_length=False)
@cached_property
def top(self) -> str:
if self.index == 0:
return "Cover"
- return self._text_in_area("top", check_length=False)
+ return self.text_in_named_area("top", check_length=False)
+ @cached_property
def is_relevant(self) -> bool:
if any(c in self.top for c in {"Contents", "List of ", "Index"}):
return False
return True
- def _charlines_filtered(self, area, predicate = None, rtol = None) -> list[CharLine]:
- if rtol is None: rtol = self._spacing["sc"]
- # Split all chars into lines based on rounded origin
- origin_lines_y = defaultdict(list)
- origin_lines_x = defaultdict(list)
- for char in self.chars_in_area(area):
- # Ignore all characters we don't want
- if predicate is not None and not predicate(char):
- continue
- # Ignore Carriage Return characters and ® (superscript issues)
- if char.unicode in {0xd, ord("®")}:
- continue
- # Correct some weird unicode stuffing choices
- if char.unicode in {2}:
- char.unicode = ord("-")
- if char.unicode in {61623, 61664}:
- char.unicode = ord("•")
- if char.unicode < 32 and char.unicode not in {0xa}:
- continue
- # Ignore characters without width that are not spaces
- if not char.width and char.unicode not in {0xa, 0xd, 0x20}:
- LOGGER.error(f"Unknown char width for {char}: {char.bbox}")
- # Split up the chars depending on the orientation
- if 45 < char.rotation <= 135 or 225 < char.rotation <= 315:
- origin_lines_x[round(char.origin.x, 1)].append(char)
- elif char.rotation <= 45 or 135 < char.rotation <= 225 or 315 < char.rotation:
- origin_lines_y[round(char.origin.y, 1)].append(char)
- else:
- LOGGER.error("Unknown char rotation:", char, char.rotation)
-
- # Convert characters into lines
- bbox_lines_y = []
- for chars in origin_lines_y.values():
- # Remove lines with whitespace only
- if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
- continue
- origin = statistics.fmean(c.origin.y for c in chars)
- line = CharLine(self, chars,
- min(c.bbox.bottom for c in chars),
- origin,
- max(c.bbox.top for c in chars),
- max(c.height for c in chars),
- sort_origin=self.height - origin)
- bbox_lines_y.append(line)
- # print(line, line.top, line.origin, line.bottom, line.height)
- bbox_lines = sorted(bbox_lines_y, key=lambda l: l._sort_origin)
-
- bbox_lines_x = []
- for chars in origin_lines_x.values():
- # Remove lines with whitespace only
- if all(c.unicode in {0xa, 0xd, 0x20} for c in chars):
- continue
- line = CharLine(self, chars,
- min(c.bbox.left for c in chars),
- statistics.fmean(c.origin.x for c in chars),
- max(c.bbox.right for c in chars),
- max(c.width for c in chars),
- 270 if sum(c.rotation for c in chars) <= 135 * len(chars) else 90)
- bbox_lines_x.append(line)
- bbox_lines += sorted(bbox_lines_x, key=lambda l: l._sort_origin)
-
- if not bbox_lines:
- return []
-
- # Merge lines that have overlapping bbox_lines
- # FIXME: This merges lines that "collide" vertically like in formulas
- merged_lines = []
- current_line = bbox_lines[0]
- for next_line in bbox_lines[1:]:
- height = max(current_line.height, next_line.height)
- # Calculate overlap via normalize origin (increasing with line index)
- if ((current_line._sort_origin + rtol * height) >
- (next_line._sort_origin - rtol * height)):
- # if line.rotation or self.rotation:
- # # The next line overlaps this one, we merge the shorter line
- # # (typically super- and subscript) into taller line
- # use_current = len(current_line.chars) >= len(next_line.chars)
- # else:
- use_current = current_line.height >= next_line.height
- line = current_line if use_current else next_line
- current_line = CharLine(self, current_line.chars + next_line.chars,
- line.bottom, line.origin, line.top,
- height, line.rotation,
- sort_origin=line._sort_origin)
- else:
- # The next line does not overlap the current line
- merged_lines.append(current_line)
- current_line = next_line
- # append last line
- merged_lines.append(current_line)
-
- # Sort all lines horizontally based on character origin
- sorted_lines = []
- for line in merged_lines:
- if line.rotation == 90:
- def sort_key(char):
- if char.unicode in {0xa, 0xd}:
- return char.tbbox.midpoint.y - 1e9
- return char.tbbox.midpoint.y
- elif line.rotation == 270:
- def sort_key(char):
- if char.unicode in {0xa, 0xd}:
- return -char.tbbox.midpoint.y + 1e9
- return -char.tbbox.midpoint.y
- else:
- def sort_key(char):
- if char.unicode in {0xa, 0xd}:
- return char.origin.x + 1e9
- return char.origin.x
- sorted_lines.append(CharLine(self, sorted(line.chars, key=sort_key),
- line.bottom, line.origin,
- line.top, line.height,
- line.rotation, area.left,
- sort_origin=line._sort_origin))
-
- return sorted_lines
-
- def _content_areas(self, area: Rectangle, with_graphics: bool = True) -> list:
- if with_graphics:
- graphics = self._graphics_filtered(area)
- regions = []
- for graphic in sorted(graphics, key=lambda g: (-g.bbox.top, g.bbox.x)):
- gbbox = graphic.bbox.joined(graphic.cbbox) if graphic.cbbox else graphic.bbox
- for reg in regions:
- if reg.overlaps(gbbox.bottom, gbbox.top):
- # They overlap, so merge them
- reg.v0 = min(reg.v0, gbbox.bottom)
- reg.v1 = max(reg.v1, gbbox.top)
- reg.objs.append(graphic)
- break
- else:
- regions.append(Region(gbbox.bottom, gbbox.top, graphic))
-
- # print(regions)
- areas = []
- ypos = area.top
- for reg in regions:
- if ypos - reg.v1 > self._spacing["y_em"]:
- areas.append((Rectangle(area.left, reg.v1, area.right, ypos), None))
- for obj in reg.objs:
- oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
- areas.append((oarea, obj))
- ypos = reg.v0
- areas.append((Rectangle(area.left, area.bottom, area.right, ypos), None))
- else:
- areas = [(area, None)]
- return areas
-
- def _objects_filtered(self, area: Rectangle, with_graphics: bool = True) -> list:
- self._link_characters()
- areas = self._content_areas(area, with_graphics)
- objects = []
- for narea, obj in areas:
- if obj is None:
- objects += self._charlines_filtered(narea)
- else:
- oarea = obj.bbox.joined(obj.cbbox) if obj.cbbox else obj.bbox
- predicate = lambda c: not obj.bbox.contains(c.origin)
- lines = self._charlines_filtered(oarea, predicate)
- # print(obj, oarea, lines, [line.content for line in lines])
- objects += list(sorted(lines + [obj], key=lambda o: (-o.bbox.y, o.bbox.x)))
- return objects
-
@property
def content_ast(self) -> list:
ast = []
@@ -492,13 +332,13 @@ def content_ast(self) -> list:
re.search("ordering +information|part +numbering", item.title, re.IGNORECASE)), -1)
with_graphics = (order_page != self.index)
for area in self._areas["content"]:
- ast.append(self._ast_filtered(area, with_graphics=with_graphics))
+ ast.append(self.ast_in_area(area, with_graphics=with_graphics))
# Add a page node to the first leaf to keep track of where a page starts
first_leaf = next((n for n in iter(ast[0].descendants) if n.is_leaf), ast[0])
Node("page", parent=first_leaf, xpos=first_leaf.xpos, number=self.number)
return ast
- def _graphics_filtered(self, area) -> list:
+ def graphics_in_area(self, area: Rectangle) -> list[Table | Figure]:
# Find all graphic clusters in this area
em = self._spacing["y_em"]
large_area = area.offset_x(em/2)
@@ -511,7 +351,7 @@ def _graphics_filtered(self, area) -> list:
# Find the captions and group them by y origin to catch side-by-side figures
ycaptions = defaultdict(list)
- for line in self._charlines_filtered(area, lambda c: "Bold" in c.font):
+ for line in self.charlines_in_area(area, lambda c: "Bold" in c.font):
for cluster in line.clusters():
for phrase in [r"Figure \d+\.", r"Table \d+\."]:
if re.match(phrase, cluster.content):
@@ -531,7 +371,7 @@ def _graphics_filtered(self, area) -> list:
if b.bottom <= bottom and
left <= b.left and b.right <= right), None)
if graphic is None:
- LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
+ _LOGGER.error(f"Graphic cluster not found for caption {''.join(c.char for c in chars)}")
continue
if self._template == "blue_gray":
@@ -545,7 +385,7 @@ def _graphics_filtered(self, area) -> list:
break
cbbox = nbbox
cchars = nchars
- elif self._template == "black_white":
+ else:
cbbox = Rectangle(left, min(graphic[0].top, bottom), right, top)
otype = phrase.split(" ")[0].lower()
@@ -583,6 +423,7 @@ def _graphics_filtered(self, area) -> list:
for gbbox, paths in graphic_clusters:
if gbbox.width < self._spacing["x_em"] or gbbox.height < self._spacing["y_em"]:
continue
+ category = ""
if any(isinstance(p, Image) for p in paths):
category = "figure"
elif self._template == "blue_gray":
@@ -643,9 +484,9 @@ def _graphics_filtered(self, area) -> list:
elif line.direction == line.Direction.HORIZONTAL:
ylines.append(line.specialize())
else:
- LOGGER.warn(f"Line not vertical or horizontal: {line}")
+ _LOGGER.warn(f"Line not vertical or horizontal: {line}")
else:
- LOGGER.warn(f"Path too long: {path}")
+ _LOGGER.warn(f"Path too long: {path}")
elif self._colors(path.fill) == "darkblue":
# Add the bottom line of the dark blue header box as a very thick line
line = HLine(path.bbox.bottom, path.bbox.left, path.bbox.right, 5)
@@ -681,58 +522,9 @@ def _graphics_filtered(self, area) -> list:
return objects
- @property
- def content_objects(self) -> list:
- objs = []
- for area in self._areas["content"]:
- objs.extend(self._objects_filtered(area))
- return objs
-
- @property
- def content_graphics(self) -> list:
- objs = []
- for area in self._areas["content"]:
- objs.extend(self._graphics_filtered(area))
- return objs
-
- @property
- def content_lines(self) -> list:
- return [o for o in self.content_objects if isinstance(o, CharLine)]
-
- @property
- def content_tables(self) -> list:
- return [o for o in self.content_graphics if isinstance(o, Table)]
-
- @property
- def content_figures(self) -> list:
- return [o for o in self.content_graphics if isinstance(o, Figure)]
-
- def _char_properties(self, line, char):
- cp = {
- "superscript": False,
- "subscript": False,
- "bold": any(frag in char.font for frag in {"Bold"}),
- "italic": any(frag in char.font for frag in {"Italic", "Oblique"}),
- "underline": (char.objlink or char.weblink) is not None,
- "size": round(line.height),
- "relsize": self._line_size(line),
- "char": chr(char.unicode),
- }
-
- if line.rotation:
- if char.origin.x < (line.origin - 0.25 * line.height):
- cp["superscript"] = True
- elif char.origin.x > (line.origin + 0.15 * line.height):
- cp["subscript"] = True
- elif char.origin.y > (line.origin + 0.25 * line.height):
- cp["superscript"] = True
- elif char.origin.y < (line.origin - 0.15 * line.height):
- cp["subscript"] = True
-
- return cp
-
- def _ast_filtered(self, area: Rectangle, with_graphics=True,
- ignore_xpos=False, with_bits=True, with_notes=True) -> list:
+ def ast_in_area(self, area: Rectangle, with_graphics: bool = True,
+ ignore_xpos: bool = False, with_bits: bool = True,
+ with_notes: bool = True) -> Node:
x_em = self._spacing["x_em"]
spacing_content = self._spacing["x_content"]
lh_factor = self._spacing["lh"]
@@ -753,8 +545,9 @@ def parent_name(current):
current = root
ypos = area.top
- for obj in self._objects_filtered(area, with_graphics):
+ for obj in self.objects_in_area(area, with_graphics):
xpos = round(obj.bbox.left)
+
# Tables should remain in their current hierarchy regardless of indentation
if isinstance(obj, (Table, Figure)):
current = next((c for c in current.iter_path_reverse()
@@ -763,6 +556,7 @@ def parent_name(current):
Node(name, parent=current, obj=obj, xpos=xpos, number=-1,
_width=obj.bbox.width / area.width, _type=obj._type)
ypos = obj.bbox.bottom
+
# Lines of text need to be carefully checked for indentation
elif isinstance(obj, CharLine):
newlines = round((ypos - obj.origin) / (lh_factor * obj.height))
@@ -783,6 +577,7 @@ def parent_name(current):
current = current.parent.parent
# print(obj.fonts, ypos, xpos, current.xpos, f"{obj.height:.2f}", content)
+
# Check if line is a heading, which may be multi-line, so we must
# be careful not to nest them, but group them properly
# Headings are always inserted into the root note!
@@ -853,15 +648,15 @@ def parent_name(current):
else:
# Default back to the regex
if "Reserved" not in content:
- LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
+ _LOGGER.warning(f"Fallback to Regex length for Bit pattern '{content}'!\nFonts: {obj.fonts}")
content_start = re.match(r" *([Bb]ytes? *.+? *)?(B[uio]t)( *\d+:?|s *(\d+ *([:-] *\d+ *)? *,? *)+) *", content)
if content_start is None:
- LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
+ _LOGGER.error(f"Unable to match Bit regex at all! '{content}'!")
content_start = 0
else:
content_start = len(content_start.group(0))
if not content_start:
- LOGGER.error(f"Missing content start (=0)! '{content}'!")
+ _LOGGER.error(f"Missing content start (=0)! '{content}'!")
content_start = min(content_start, len(obj.chars) - 1)
current = next((c for c in current.iter_path_reverse()
@@ -895,4 +690,4 @@ def parent_name(current):
return root
def __repr__(self) -> str:
- return f"StPage({self.number})"
+ return f"StmPage({self.number})"
diff --git a/src/modm_data/pdf2html/stmicro/table.py b/src/modm_data/pdf2html/table.py
similarity index 82%
rename from src/modm_data/pdf2html/stmicro/table.py
rename to src/modm_data/pdf2html/table.py
index e0744b5..6aa0995 100644
--- a/src/modm_data/pdf2html/stmicro/table.py
+++ b/src/modm_data/pdf2html/table.py
@@ -5,112 +5,10 @@
import statistics
from functools import cached_property
from collections import defaultdict
-from ...utils import HLine, VLine, Rectangle
-
-LOGGER = logging.getLogger(__name__)
-
-
-class TableCell:
- class Borders:
- def __init__(self, l, b, r, t):
- self.l = l
- self.b = b
- self.r = r
- self.t = t
-
- def __init__(self, table, position, bbox, borders, is_simple=False):
- self._table = table
- self._bboxes = [bbox]
- self.b = borders
- self.positions = [position]
- self.is_header = False
- self._is_simple = is_simple
- self._bbox = None
- self._lines = None
-
- def _merge(self, other):
- self.positions.extend(other.positions)
- self.positions.sort()
- self._bboxes.append(other.bbox)
- self._bbox = None
- self._lines = None
-
- def _move(self, x, y):
- self.positions = [(py + y, px + x) for (py, px) in self.positions]
- self.positions.sort()
-
- def _expand(self, dx, dy):
- ymax, xmax = self.positions[-1]
- for yi in range(ymax, ymax + dy + 1):
- for xi in range(xmax, xmax + dx + 1):
- self.positions.append((yi, xi))
- self.positions.sort()
+from ..utils import HLine, VLine, Rectangle
+from .cell import TableCell
- @property
- def x(self) -> int:
- return self.positions[0][1]
-
- @property
- def y(self) -> int:
- return self.positions[0][0]
-
- @property
- def xspan(self) -> int:
- return self.positions[-1][1] - self.positions[0][1] + 1
-
- @property
- def yspan(self) -> int:
- return self.positions[-1][0] - self.positions[0][0] + 1
-
- @property
- def rotation(self) -> int:
- if not self.lines: return 0
- return self.lines[0].rotation
-
- @property
- def bbox(self) -> Rectangle:
- if self._bbox is None:
- self._bbox = Rectangle(min(bbox.left for bbox in self._bboxes),
- min(bbox.bottom for bbox in self._bboxes),
- max(bbox.right for bbox in self._bboxes),
- max(bbox.top for bbox in self._bboxes))
- return self._bbox
-
- @property
- def lines(self):
- if self._lines is None:
- self._lines = self._table._page._charlines_filtered(self.bbox)
- return self._lines
-
- @property
- def content(self):
- return "".join(c.char for line in self.lines for c in line.chars)
-
- @property
- def left_aligned(self):
- x_em = self._table._page._spacing["x_em"]
- for line in self.lines:
- if (line.bbox.left - self.bbox.left + x_em) < (self.bbox.right - line.bbox.right):
- return True
- return False
-
- @property
- def ast(self):
- ast = self._table._page._ast_filtered(self.bbox, with_graphics=False,
- ignore_xpos=not self.left_aligned,
- with_bits=False, with_notes=False)
- ast.name = "cell"
- return ast
-
- def __repr__(self) -> str:
- positions = ",".join(f"({p[1]},{p[0]})" for p in self.positions)
- borders = ""
- if self.b.l: borders += "["
- if self.b.b: borders += "_"
- if self.b.t: borders += "^"
- if self.b.r: borders += "]"
- start = "CellH" if self.is_header else "Cell"
- return start + f"[{positions}] {borders}"
+_LOGGER = logging.getLogger(__name__)
class Table:
@@ -143,26 +41,26 @@ def _cluster(lines, key):
# Find the positions of the top numbers
clusters = []
- if lines := self._page._charlines_filtered(cbbox):
+ if lines := self._page.charlines_in_area(cbbox):
if len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)):
clusters.append((cluster, cbbox))
else:
self.grid = (0, 0)
- LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")
+ _LOGGER.error(f"Cannot find any bit position clusters! {self} ({self._page})")
# Find the positions of the second row of numbers
if len(ygrid) > 2:
for yi, (ypos0, ypos1) in enumerate(zip(sorted(ygrid), sorted(ygrid)[1:])):
nbbox = Rectangle(self.bbox.left, ygrid[ypos0][0].p0.y,
self.bbox.right, ygrid[ypos1][0].p0.y)
- if lines := self._page._charlines_filtered(nbbox):
+ if lines := self._page.charlines_in_area(nbbox):
if all(c.char.isnumeric() or c.unicode in {0x20, 0xa, 0xd} for c in lines[0].chars):
if not len(cluster := lines[0].clusters(self._page._spacing["x_em"] / 2)) % 16:
clusters.append((cluster, nbbox))
self._bit_headers = len(ygrid) - yi - 1
else:
self.grid = (len(cluster), 0)
- LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
+ _LOGGER.warning(f"Second bit pattern does not have 16 or 32 clusters! {self} ({self._page})")
break
# Merge these clusters to find their positions
@@ -235,7 +133,7 @@ def _fix_borders(self, cells, x: int, y: int):
r = cells[(x + 1, y)].b if cells[(x + 1, y)] is not None else TableCell.Borders(0, 0, 1, 0)
t = cells[(x, y + 1)].b if cells[(x, y + 1)] is not None else TableCell.Borders(0, 1, 0, 0)
- # if (not c.t and c.l and c.r and c.b) and "Reset value" in cell.content:
+ # if (not c.t and c.l and c.r and c.b) and "Reset value" in cell.content:
# c.t = 1
# Open at the top into a span
@@ -401,7 +299,7 @@ def append_bottom(self, other, merge_headers=True) -> bool:
print(len(merged_xheaders), merged_xheaders)
# If they are not equal length the table layouts are not compatible at all!
if len(self_heads) != len(other_heads):
- LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
+ _LOGGER.error(f"Failure to append table {other} ({other._page}) onto table {self} ({self._page})")
return False
# We want to stuff/move the cell positions inplace, therefore we start
@@ -444,6 +342,7 @@ def _insert_cells(cell, src, dsts, insert_only):
assert new_positions
assert len(new_positions) == len(set(new_positions))
cell.positions = sorted(new_positions)
+ cell._invalidate()
def _move_cells(cells, own_xpos):
if debug:
@@ -497,7 +396,7 @@ def _move_cells(cells, own_xpos):
def append_side(self, other, expand=False) -> bool:
if self.grid[1] != other.grid[1]:
if expand:
- LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
+ _LOGGER.debug(f"Expanding bottom cells to match height: {self} ({self._page}) + {other} ({other._page})")
ymin = min(self.grid[1], other.grid[1])
ymax = max(self.grid[1], other.grid[1])
etable = other if self.grid[1] > other.grid[1] else self
@@ -506,7 +405,7 @@ def append_side(self, other, expand=False) -> bool:
cell._expand(0, ymax - ymin)
etable.grid = (etable.grid[0], ymax)
else:
- LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
+ _LOGGER.error(f"Unable to append table at side: {self} ({self._page}) + {other} ({other._page})")
return False
# We must move all cells to the right now
diff --git a/tools/scripts/search_html.py b/tools/scripts/search_html.py
index 8c9337b..a89b519 100644
--- a/tools/scripts/search_html.py
+++ b/tools/scripts/search_html.py
@@ -1,11 +1,5 @@
-# Copyright (c) 2022, Niklas Hauser
-#
-# This file is part of the modm-data project.
-#
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-# -----------------------------------------------------------------------------
+# Copyright 2022, Niklas Hauser
+# SPDX-License-Identifier: MPL-2.0
import re
import sys
@@ -15,7 +9,6 @@
from pathlib import Path
sys.path.append(".")
-from modm_data.utils import ext_path
from modm_data.html import Document
def _format_html(xmlnode, treenode):
@@ -93,7 +86,7 @@ def format_document(document):
link = etree.Element("link")
link.set("rel", "stylesheet")
- link.set("href", "ext/stmicro/html/style.css")
+ link.set("href", "ext/stmicro/html-archive/style.css")
head.append(link)
body = etree.Element("body")
@@ -113,8 +106,8 @@ def main():
parser.add_argument("--html", type=str)
args = parser.parse_args()
- documents = ext_path("stmicro/html").glob(args.document)
- documents = [Document(d) for d in documents]
+ documents = (Path(__file__).parents[2] / "ext/stmicro/html-archive").absolute()
+ documents = [Document(d) for d in documents.glob(args.document)]
rootnode = anytree.Node("root", document=args.document, chapter=args.chapter, table=args.table)