Commit
feat: patcher implementation
alexgarel committed Nov 8, 2024
1 parent dfca2b0 commit 4bd9f99
Showing 6 changed files with 282 additions and 45 deletions.
9 changes: 7 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -245,10 +245,15 @@ def _add_text_to_project(self, filename: str, taxonomy_name: str, branch_name: s
"""Add file content to the db"""
project_label = get_project_name(taxonomy_name, branch_name)
query = f"""
MATCH (n:{project_label})
MATCH (n:PROJECT)
WHERE n.branch_name = $branch_name AND n.taxonomy_name = $taxonomy_name
SET n.original_text = $original_text
"""
params = {"original_text": open(filename, "r", encoding="utf-8").read()}
params = {
"branch_name": branch_name,
"taxonomy_name": taxonomy_name,
"original_text": open(filename, "r", encoding="utf-8").read(),
}
self.session.run(query, params)

def __call__(
11 changes: 6 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -27,9 +27,10 @@ class NodeData:
preceding_lines: list[str] = field(default_factory=list)
parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
# lines taken by this entry in the source file
# this can be more than (start, end) if we merged duplicates
src_lines: list[tuple[int, int]] | None = None
# lines taken by this entry in the source file,
# stored a "start,end" strings, because nested collections are not supported
# this can be more than )start, end) if we merged duplicates
src_lines: list[str] | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
@@ -236,7 +237,7 @@ def is_entry_synonyms_line(self, line):

def finalize_data(self, data, comments, saved_nodes, line_number: int):
data = self._remove_separating_line(data)
data.src_lines = [(data.src_position, line_number)]
data.src_lines = [f"{data.src_position},{line_number}"]
if data.get_node_type() == NodeType.ENTRY:
self._add_comments(data, comments, "end")
if data.id in saved_nodes:
@@ -598,7 +599,7 @@ def _create_taxonomy(
id="__header__",
preceding_lines=harvested_header_data,
src_position=1,
src_lines=[(1, entries_start_line - 1)],
src_lines=[f"1,{entries_start_line - 1}"],
)
]
previous_links: list[PreviousLink] = []
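Note: the switch from (start, end) tuples to "start,end" strings is forced by Neo4j, whose node properties accept primitives and flat lists of primitives but not nested collections. A minimal round trip, with made-up line numbers:

# Illustrative only: an entry parsed at lines 12-20 and merged with a duplicate
# found at lines 55-60 carries two flattened ranges in its Neo4j property.
flattened = ["12,20", "55,60"]                                    # stored form
ranges = [tuple(int(n) for n in item.split(",")) for item in flattened]
assert ranges == [(12, 20), (55, 60)]                             # in-memory form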
107 changes: 106 additions & 1 deletion parser/openfoodfacts_taxonomy_parser/patcher.py
@@ -1 +1,106 @@
FIXME
"""This module provide a function to dump a taxonomy from a neo4j database into a file,
but taking the original taxonomy content and only modifying nodes that where modified or added
"""

from .unparser import WriteTaxonomy
from .utils import get_project_name, src_lines


class PatchTaxonomy(WriteTaxonomy):
"""Implementation to dump a taxonomy from neoo4j database into a file,
while taking the original content and
only modifying lines corresponding to nodes that where modified or added
"""

def get_all_nodes(self, project_label):
"""Get modified and removed nodes, in the start line order"""
query = f"""
MATCH (n:{project_label})
WHERE
// no external node
n.is_external = false
AND (
// modified nodes
((n:TEXT OR n:SYNONYMS OR n:STOPWORDS OR n:ENTRY) AND n.modified IS NOT NULL) OR
// removed nodes
(n:REMOVED_TEXT OR n:REMOVED_SYNONYMS OR n:REMOVED_STOPWORDS OR n:REMOVED_ENTRY)
)
            // OPTIONAL MATCH, because a node might not have parents
            OPTIONAL MATCH (n)-[r:is_child_of]->(parent)
WITH n, r, parent ORDER BY n.src_position, r.position
RETURN n, collect(parent)
"""
results = self.session.run(query)
for result in results:
node, parents = result.values()
yield node, parents

def get_original_text(self, branch_name, taxonomy_name):
"""Get the original text of the taxonomy"""
query = f"""
MATCH (n:PROJECT)
WHERE n.branch_name = $branch_name AND n.taxonomy_name = $taxonomy_name
RETURN n.original_text
"""
results = self.session.run(
query, {"branch_name": branch_name, "taxonomy_name": taxonomy_name}
)
for result in results:
return result.values()[0]

def iter_lines(self, branch_name, taxonomy_name):
        original_text = self.get_original_text(branch_name, taxonomy_name)
        # get nodes to splice in, keyed by the line number where they must go
        nodes_by_lines = self.nodes_by_lines(branch_name, taxonomy_name)
# get lines to skip in original text
skip_lines = {
num_line
for node, _ in nodes_by_lines.values()
            for start, end in src_lines(node["src_lines"])
for num_line in range(start, end)
}
previous_line = None
for line_num, line in enumerate(original_text.split("\n")):
            if line_num in nodes_by_lines:
                node, parents = nodes_by_lines.pop(line_num)
                # removed nodes are simply dropped; their original lines are in skip_lines
                if not any(label.startswith("REMOVED") for label in node.labels):
                    if previous_line != "":
                        # we need a blank line between 2 nodes
                        yield ""
                    node_lines = list(self.iter_node_lines(dict(node), parents))
                    yield from node_lines
                    previous_line = node_lines[-1]
            # not an elif: the previous entry might not replace any content (e.g. a new entry)
if line_num in skip_lines:
continue
else:
yield line
previous_line = line
# add remaining nodes
        if nodes_by_lines and previous_line != "":
yield ""
for node, parents in nodes_by_lines.values():
yield from self.iter_node_lines(dict(node), parents)
yield ""

def nodes_by_lines(self, branch_name, taxonomy_name):
"""Get the lines to replace in the original text"""
project_label = get_project_name(taxonomy_name, branch_name)
# get nodes by future position in the file
nodes_by_lines = {}
new_lines = -1 # we will use negative positions to add new nodes at the end
for node, parents in self.get_all_nodes(project_label):
            node_position = node.get("src_position")
if not node_position:
# this is a new node
                # we try to add it near its last positioned parent; failing that, at the end
                parents_with_position = filter(lambda p: p.get("src_position") is not None, parents)
                parents_positions = sorted(parents_with_position, key=lambda p: p["src_position"])
                if parents_positions:
                    node_position = int(parents_positions[-1]["src_lines"][-1].split(",")[-1])
else:
node_position = new_lines
new_lines -= 1
nodes_by_lines[node_position] = (node, parents)
return nodes_by_lines
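A possible way to drive the new patcher, mirroring how WriteTaxonomy is used elsewhere in the project; the session-based constructor, the Neo4j URI and the file/branch/taxonomy names below are assumptions, not shown in this diff:

# Hypothetical usage sketch (constructor and connection details are assumed).
from neo4j import GraphDatabase

from openfoodfacts_taxonomy_parser.patcher import PatchTaxonomy

driver = GraphDatabase.driver("bolt://localhost:7687")
with driver.session() as session:
    patcher = PatchTaxonomy(session)  # assumption: same constructor as WriteTaxonomy
    # __call__ is inherited from WriteTaxonomy: it rewrites the file, keeping
    # untouched lines from the PROJECT node's original_text
    patcher("taxonomies/test.txt", "my_branch", "test")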
70 changes: 38 additions & 32 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -22,6 +22,8 @@ def get_all_nodes(self, project_label):
# and finally it returns the node and its parents
# (the parents are ordered in the same order as in the original file)
# Note: OPTIONAL MATCH is used to return nodes without parents
# Note that as we follow the is_before relation,
# we won't get removed nodes, as intended
query = f"""
MATCH path = ShortestPath(
(h:{project_label}:TEXT)-[:is_before*]->(f:{project_label}:TEXT)
@@ -92,11 +94,12 @@ def get_parents_lines(self, parents):
parent_id = parent["tags_" + lc][0]
yield "< " + lc + ":" + parent_id

def iter_lines(self, project_label):
def iter_lines(self, branch_name, taxonomy_name):
project_label = get_project_name(taxonomy_name, branch_name)
previous_block_id = ""
for node, parents in self.get_all_nodes(project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
node = dict(node)
            # possibly add a blank line, but only in specific cases
following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith(
"synonyms"
@@ -107,36 +110,40 @@ def iter_lines(self, project_label):
add_blank = has_content and not (following_synonyms or following_stopwords)
if add_blank:
yield ""
# comments
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])
yield from self.iter_node_lines(node, parents)
previous_block_id = node["id"]

def iter_node_lines(self, node, parents):
has_content = node["id"] not in ["__header__", "__footer__"]
# comments
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])

def rewrite_file(self, filename, lines):
"""Write a .txt file with the given name"""
filename = normalize_filename(filename)
@@ -147,8 +154,7 @@ def __call__(self, filename, branch_name, taxonomy_name):
def __call__(self, filename, branch_name, taxonomy_name):
filename = normalize_filename(filename)
branch_name = normalize_text(branch_name, char="_")
project_label = get_project_name(taxonomy_name, branch_name)
lines = self.iter_lines(project_label)
lines = self.iter_lines(branch_name, taxonomy_name)
self.rewrite_file(filename, lines)


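The net effect of this refactor is that per-node serialization now lives in iter_node_lines, which the patcher reuses to splice a single modified node into the original text. A simplified stand-in (not the real method: tag and property handling is reduced to plain dict lookups, and the sample entry is invented) shows the emission order:

# Simplified illustration of the per-node line order: comments, parents,
# main-language tags, other languages, properties, trailing comments.
def sketch_node_lines(node: dict, parents: list[str]):
    yield from node.get("preceding_lines", [])
    for parent in parents:
        yield "< " + parent
    main_lc = node["main_language"]
    yield main_lc + ": " + ", ".join(node["tags"][main_lc])
    for lc, tags in node["tags"].items():
        if lc != main_lc:
            yield lc + ": " + ", ".join(tags)
    for prop, value in node.get("properties", {}).items():
        yield prop + ": " + value
    yield from node.get("end_comments", [])

entry = {
    "main_language": "en",
    "tags": {"en": ["Milk chocolate"], "fr": ["Chocolat au lait"]},
    "properties": {"description:en": "An invented example entry"},
    "preceding_lines": ["# example entry"],
}
print("\n".join(sketch_node_lines(entry, ["en:chocolates"])))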
6 changes: 6 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/utils.py
@@ -62,3 +62,9 @@ def normalize_filename(filename: str) -> str:
def get_project_name(taxonomy_name: str, branch_name: str) -> str:
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name


def src_lines(src_lines_str: list[str]):
for line in src_lines_str:
start, end = line.split(",")
yield int(start), int(end)
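For example, decoding the ranges stored on a node looks like this (the values are illustrative):

from openfoodfacts_taxonomy_parser.utils import src_lines

for start, end in src_lines(["1,12", "40,45"]):
    print(start, end)  # 1 12, then 40 45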