Commit
feat: patcher implementation
alexgarel committed Nov 8, 2024
1 parent dfca2b0 commit 4bd9f99
Showing 6 changed files with 282 additions and 45 deletions.
9 changes: 7 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -245,10 +245,15 @@ def _add_text_to_project(self, filename: str, taxonomy_name: str, branch_name: s
"""Add file content to the db"""
project_label = get_project_name(taxonomy_name, branch_name)
query = f"""
MATCH (n:{project_label})
MATCH (n:PROJECT)
WHERE n.branch_name = $branch_name AND n.taxonomy_name = $taxonomy_name
SET n.original_text = $original_text
"""
params = {"original_text": open(filename, "r", encoding="utf-8").read()}
params = {
"branch_name": branch_name,
"taxonomy_name": taxonomy_name,
"original_text": open(filename, "r", encoding="utf-8").read(),
}
self.session.run(query, params)

def __call__(
11 changes: 6 additions & 5 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -27,9 +27,10 @@ class NodeData:
preceding_lines: list[str] = field(default_factory=list)
parent_tags: list[tuple[str, int]] = field(default_factory=list)
src_position: int | None = None
# lines taken by this entry in the source file
# this can be more than (start, end) if we merged duplicates
src_lines: list[tuple[int, int]] | None = None
# lines taken by this entry in the source file,
# stored a "start,end" strings, because nested collections are not supported
# this can be more than )start, end) if we merged duplicates
src_lines: list[str] | None = None
properties: dict[str, str] = field(default_factory=dict)
tags: dict[str, list[str]] = field(default_factory=dict)
comments: dict[str, list[str]] = field(default_factory=dict)
@@ -236,7 +237,7 @@ def is_entry_synonyms_line(self, line):

def finalize_data(self, data, comments, saved_nodes, line_number: int):
data = self._remove_separating_line(data)
data.src_lines = [(data.src_position, line_number)]
data.src_lines = [f"{data.src_position},{line_number}"]
if data.get_node_type() == NodeType.ENTRY:
self._add_comments(data, comments, "end")
if data.id in saved_nodes:
@@ -598,7 +599,7 @@ def _create_taxonomy(
id="__header__",
preceding_lines=harvested_header_data,
src_position=1,
src_lines=[(1, entries_start_line - 1)],
src_lines=[f"1,{entries_start_line - 1}"],
)
]
previous_links: list[PreviousLink] = []
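Note: the switch from (start, end) tuples to "start,end" strings is forced by Neo4j, whose node properties accept primitives and flat lists of primitives but not nested collections. A minimal round trip, with made-up line numbers:

# Illustrative only: an entry parsed at lines 12-20 and merged with a duplicate
# found at lines 55-60 carries two flattened ranges in its Neo4j property.
flattened = ["12,20", "55,60"]                                    # stored form
ranges = [tuple(int(n) for n in item.split(",")) for item in flattened]
assert ranges == [(12, 20), (55, 60)]                             # in-memory form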
107 changes: 106 additions & 1 deletion parser/openfoodfacts_taxonomy_parser/patcher.py
@@ -1 +1,106 @@
FIXME
"""This module provide a function to dump a taxonomy from a neo4j database into a file,
but taking the original taxonomy content and only modifying nodes that where modified or added
"""

from .unparser import WriteTaxonomy
from .utils import get_project_name, src_lines


class PatchTaxonomy(WriteTaxonomy):
"""Implementation to dump a taxonomy from neoo4j database into a file,
while taking the original content and
only modifying lines corresponding to nodes that where modified or added
"""

def get_all_nodes(self, project_label):
"""Get modified and removed nodes, in the start line order"""
query = f"""
MATCH (n:{project_label})
WHERE
// no external node
n.is_external = false
AND (
// modified nodes
((n:TEXT OR n:SYNONYMS OR n:STOPWORDS OR n:ENTRY) AND n.modified IS NOT NULL) OR
// removed nodes
(n:REMOVED_TEXT OR n:REMOVED_SYNONYMS OR n:REMOVED_STOPWORDS OR n:REMOVED_ENTRY)
)
            // OPTIONAL MATCH, because a node might not have parents
            OPTIONAL MATCH (n)-[r:is_child_of]->(parent)
WITH n, r, parent ORDER BY n.src_position, r.position
RETURN n, collect(parent)
"""
results = self.session.run(query)
for result in results:
node, parents = result.values()
yield node, parents

def get_original_text(self, branch_name, taxonomy_name):
"""Get the original text of the taxonomy"""
query = f"""
MATCH (n:PROJECT)
WHERE n.branch_name = $branch_name AND n.taxonomy_name = $taxonomy_name
RETURN n.original_text
"""
results = self.session.run(
query, {"branch_name": branch_name, "taxonomy_name": taxonomy_name}
)
for result in results:
return result.values()[0]

def iter_lines(self, branch_name, taxonomy_name):
        original_text = self.get_original_text(branch_name, taxonomy_name)
        # get nodes to splice in, keyed by the line number where they must go
        nodes_by_lines = self.nodes_by_lines(branch_name, taxonomy_name)
# get lines to skip in original text
skip_lines = {
num_line
for node, _ in nodes_by_lines.values()
            for start, end in src_lines(node["src_lines"])
for num_line in range(start, end)
}
previous_line = None
for line_num, line in enumerate(original_text.split("\n")):
            if line_num in nodes_by_lines:
                node, parents = nodes_by_lines.pop(line_num)
                # removed nodes are simply dropped; their original lines are in skip_lines
                if not any(label.startswith("REMOVED") for label in node.labels):
                    if previous_line != "":
                        # we need a blank line between 2 nodes
                        yield ""
                    node_lines = list(self.iter_node_lines(dict(node), parents))
                    yield from node_lines
                    previous_line = node_lines[-1]
            # not an elif: the previous entry might not replace any content (e.g. a new entry)
if line_num in skip_lines:
continue
else:
yield line
previous_line = line
# add remaining nodes
        if nodes_by_lines and previous_line != "":
yield ""
for node, parents in nodes_by_lines.values():
yield from self.iter_node_lines(dict(node), parents)
yield ""

def nodes_by_lines(self, branch_name, taxonomy_name):
"""Get the lines to replace in the original text"""
project_label = get_project_name(taxonomy_name, branch_name)
# get nodes by future position in the file
nodes_by_lines = {}
new_lines = -1 # we will use negative positions to add new nodes at the end
for node, parents in self.get_all_nodes(project_label):
            node_position = node.get("src_position")
if not node_position:
# this is a new node
                # we try to add it near its last positioned parent; failing that, at the end
                parents_with_position = filter(lambda p: p.get("src_position") is not None, parents)
                parents_positions = sorted(parents_with_position, key=lambda p: p["src_position"])
                if parents_positions:
                    node_position = int(parents_positions[-1]["src_lines"][-1].split(",")[-1])
else:
node_position = new_lines
new_lines -= 1
nodes_by_lines[node_position] = (node, parents)
return nodes_by_lines
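A possible way to drive the new patcher, mirroring how WriteTaxonomy is used elsewhere in the project; the session-based constructor, the Neo4j URI and the file/branch/taxonomy names below are assumptions, not shown in this diff:

# Hypothetical usage sketch (constructor and connection details are assumed).
from neo4j import GraphDatabase

from openfoodfacts_taxonomy_parser.patcher import PatchTaxonomy

driver = GraphDatabase.driver("bolt://localhost:7687")
with driver.session() as session:
    patcher = PatchTaxonomy(session)  # assumption: same constructor as WriteTaxonomy
    # __call__ is inherited from WriteTaxonomy: it rewrites the file, keeping
    # untouched lines from the PROJECT node's original_text
    patcher("taxonomies/test.txt", "my_branch", "test")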
70 changes: 38 additions & 32 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -22,6 +22,8 @@ def get_all_nodes(self, project_label):
# and finally it returns the node and its parents
# (the parents are ordered in the same order as in the original file)
# Note: OPTIONAL MATCH is used to return nodes without parents
# Note that as we follow the is_before relation,
# we won't get removed nodes, as intended
query = f"""
MATCH path = ShortestPath(
(h:{project_label}:TEXT)-[:is_before*]->(f:{project_label}:TEXT)
@@ -92,11 +94,12 @@ def get_parents_lines(self, parents):
parent_id = parent["tags_" + lc][0]
yield "< " + lc + ":" + parent_id

def iter_lines(self, project_label):
def iter_lines(self, branch_name, taxonomy_name):
project_label = get_project_name(taxonomy_name, branch_name)
previous_block_id = ""
for node, parents in self.get_all_nodes(project_label):
node = dict(node)
has_content = node["id"] not in ["__header__", "__footer__"]
node = dict(node)
            # possibly add a blank line, but only in specific cases
following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith(
"synonyms"
@@ -107,36 +110,40 @@ def iter_lines(self, project_label):
add_blank = has_content and not (following_synonyms or following_stopwords)
if add_blank:
yield ""
# comments
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])
yield from self.iter_node_lines(node, parents)
previous_block_id = node["id"]

def iter_node_lines(self, node, parents):
has_content = node["id"] not in ["__header__", "__footer__"]
# comments
yield from node.get("preceding_lines", [])
if has_content:
tags_lc = self.list_tags_lc(node)
if node["id"].startswith("stopwords"):
yield "stopwords:" + self.get_tags_line(node, tags_lc[0])
elif node["id"].startswith("synonyms"):
yield "synonyms:" + self.get_tags_line(node, tags_lc[0])
else:
# parents
yield from node.get("parent_comments", [])
yield from self.get_parents_lines(parents)
# main language synonyms first
main_language = node.pop("main_language")
tags_lc.remove(main_language)
yield from node.get("tags_" + main_language + "_comments", [])
yield self.get_tags_line(node, main_language)
# more synonyms after
for lc in tags_lc:
yield from node.get("tags_" + lc + "_comments", [])
yield self.get_tags_line(node, lc)
# properties
properties_list = self.list_property_and_lc(node)
for property in properties_list:
yield from node.get("prop_" + property + "_comments", [])
yield self.get_property_line(node, property)
# final comments
yield from node.get("end_comments", [])

def rewrite_file(self, filename, lines):
"""Write a .txt file with the given name"""
filename = normalize_filename(filename)
@@ -147,8 +154,7 @@ def __call__(self, filename, branch_name, taxonomy_name):
def __call__(self, filename, branch_name, taxonomy_name):
filename = normalize_filename(filename)
branch_name = normalize_text(branch_name, char="_")
project_label = get_project_name(taxonomy_name, branch_name)
lines = self.iter_lines(project_label)
lines = self.iter_lines(branch_name, taxonomy_name)
self.rewrite_file(filename, lines)


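The net effect of this refactor is that per-node serialization now lives in iter_node_lines, which the patcher reuses to splice a single modified node into the original text. A simplified stand-in (not the real method: tag and property handling is reduced to plain dict lookups, and the sample entry is invented) shows the emission order:

# Simplified illustration of the per-node line order: comments, parents,
# main-language tags, other languages, properties, trailing comments.
def sketch_node_lines(node: dict, parents: list[str]):
    yield from node.get("preceding_lines", [])
    for parent in parents:
        yield "< " + parent
    main_lc = node["main_language"]
    yield main_lc + ": " + ", ".join(node["tags"][main_lc])
    for lc, tags in node["tags"].items():
        if lc != main_lc:
            yield lc + ": " + ", ".join(tags)
    for prop, value in node.get("properties", {}).items():
        yield prop + ": " + value
    yield from node.get("end_comments", [])

entry = {
    "main_language": "en",
    "tags": {"en": ["Milk chocolate"], "fr": ["Chocolat au lait"]},
    "properties": {"description:en": "An invented example entry"},
    "preceding_lines": ["# example entry"],
}
print("\n".join(sketch_node_lines(entry, ["en:chocolates"])))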
6 changes: 6 additions & 0 deletions parser/openfoodfacts_taxonomy_parser/utils.py
@@ -62,3 +62,9 @@ def normalize_filename(filename: str) -> str:
def get_project_name(taxonomy_name: str, branch_name: str) -> str:
"""Create a project name for given branch and taxonomy"""
return "p_" + taxonomy_name + "_" + branch_name


def src_lines(src_lines_str: list[str]):
for line in src_lines_str:
start, end = line.split(",")
yield int(start), int(end)
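For example, decoding the ranges stored on a node looks like this (the values are illustrative):

from openfoodfacts_taxonomy_parser.utils import src_lines

for start, end in src_lines(["1,12", "40,45"]):
    print(start, end)  # 1 12, then 40 45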