From 1fe888e5240f71da376a3feb45738d43d2814404 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 15:28:13 +0200 Subject: [PATCH] fix: less transformations on comment lines and others --- Makefile | 4 +- .../parser/parser.py | 39 ++++-- .../parser/taxonomy_parser.py | 121 +++++++++++------- .../openfoodfacts_taxonomy_parser/unparser.py | 12 +- parser/openfoodfacts_taxonomy_parser/utils.py | 11 +- parser/tests/data/test.txt | 16 +-- .../test_parse_unparse_integration.py | 32 +++-- .../integration/test_parser_integration.py | 84 ++++++++++-- parser/tests/unit/test_parser_unit.py | 1 - parser/tests/unit/test_unparser_unit.py | 38 ++++++ 10 files changed, 263 insertions(+), 95 deletions(-) create mode 100644 parser/tests/unit/test_unparser_unit.py diff --git a/Makefile b/Makefile index cf31b416..611b4fbd 100644 --- a/Makefile +++ b/Makefile @@ -127,8 +127,8 @@ lint: backend_lint frontend_lint config_lint ## Run all linters backend_lint: ## Run lint on backend code @echo "🍜 Linting python code" - ${DOCKER_COMPOSE} run --rm taxonomy_api isort . - ${DOCKER_COMPOSE} run --rm taxonomy_api black . + ${DOCKER_COMPOSE} run --rm taxonomy_api isort . /parser + ${DOCKER_COMPOSE} run --rm taxonomy_api black . /parser frontend_lint: ## Run lint on frontend code @echo "🍜 Linting react code" diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index e2f051a3..f9dca464 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -9,7 +9,14 @@ from ..utils import get_project_name, normalize_text from .logger import ParserConsoleLogger -from .taxonomy_parser import ChildLink, NodeData, NodeType, PreviousLink, Taxonomy, TaxonomyParser +from .taxonomy_parser import ( + ChildLink, + NodeData, + NodeType, + PreviousLink, + Taxonomy, + TaxonomyParser, +) class Parser: @@ -19,7 +26,9 @@ def __init__(self, session: Session): self.session = session self.parser_logger = ParserConsoleLogger() - def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str): + def _create_other_node( + self, tx: Transaction, node_data: NodeData, project_label: str + ): """Create a TEXT, SYNONYMS or STOPWORDS node""" if node_data.get_node_type() == NodeType.TEXT: type_label = "TEXT" @@ -88,20 +97,26 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str): original_taxonomy: entry_node.original_taxonomy """ - properties_query = ",\n".join([base_properties_query, *additional_properties_queries]) + properties_query = ",\n".join( + [base_properties_query, *additional_properties_queries] + ) query = f""" WITH $entry_nodes as entry_nodes UNWIND entry_nodes as entry_node CREATE (n:{project_label}:ENTRY {{ {properties_query} }}) """ - self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes]) + self.session.run( + query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes] + ) self.parser_logger.info( f"Created {len(entry_nodes)} ENTRY nodes in {timeit.default_timer() - start_time} seconds" ) - def _create_previous_links(self, previous_links: list[PreviousLink], project_label: str): + def _create_previous_links( + self, previous_links: list[PreviousLink], project_label: str + ): """Create the 'is_before' relations between nodes""" self.parser_logger.info("Creating 'is_before' links") start_time = timeit.default_timer() @@ -142,9 +157,9 @@ def _create_child_links(self, child_links: list[ChildLink], project_label: str): MATCH (p:{project_label}) USING INDEX p:{project_label}(id) WHERE p.id = child_link.parent_id MATCH (c:{project_label}) USING INDEX c:{project_label}(id) + WHERE c.id = child_link.id """ + """ - WHERE c.id = child_link.id CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p) WITH relations UNWIND relations AS relation @@ -210,7 +225,9 @@ def _create_node_fulltext_index(self, project_label: str): ) self.session.run(query) - language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""] + language_codes = [ + lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != "" + ] tags_prefixed_lc = ["n.tags_ids_" + lc for lc in language_codes] tags_prefixed_lc = ", ".join(tags_prefixed_lc) query = f"""CREATE FULLTEXT INDEX {project_label+'_SearchTagsIds'} IF NOT EXISTS @@ -225,9 +242,13 @@ def _create_node_indexes(self, project_label: str): self._create_node_id_index(project_label) self._create_node_fulltext_index(project_label) - self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds") + self.parser_logger.info( + f"Created indexes in {timeit.default_timer() - start_time} seconds" + ) - def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str): + def _write_to_database( + self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str + ): project_label = get_project_name(taxonomy_name, branch_name) # First create nodes, then create node indexes to accelerate relationship creation, then create relationships self._create_other_nodes(taxonomy.other_nodes, project_label) diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py index 9eb4fa78..84eeb11f 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py @@ -96,15 +96,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]] for line_number, line in enumerate(file): if line_number < start: continue - # sanitizing - # remove any space characters at end of line - line = line.rstrip() - # replace commas between digits and that have no space around by a lower comma character - # and do the same for escaped comma (preceded by a \) - # (to distinguish them from commas acting as tags separators) - line = re.sub(r"(\d),(\d)", r"\1‚\2", line) - line = re.sub(r"\\,", "\\‚", line) - yield line_number, line + yield line_number, line.rstrip("\n") line_count += 1 yield line_count, "" # to end the last entry if not ended @@ -113,13 +105,30 @@ def _normalize_entry_id(self, raw_id: str) -> str: Get a normalized string but keeping the language code "lc:", used for id and parent tag """ + raw_id = raw_id.strip() lc, main_tag = raw_id.split(":", 1) normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords) normalized_id = f"{lc}:{normalized_main_tag}" return normalized_id + def prepare_line(self, line: str) -> str: + """prepare line for parsing + + This is different from `normalize_text` which is to compute ids + """ + line = line.strip() + # sanitizing + # remove any space or commas characters at end of line + line = re.sub(r"[\s,]+$", "", line) + # replace commas between digits and that have no space around by a lower comma character + # and do the same for escaped comma (preceded by a \) + # (to distinguish them from commas acting as tags separators) + line = re.sub(r"(\d),(\d)", r"\1‚\2", line) + line = re.sub(r"\\,", "\\‚", line) + return line + def undo_normalize_text(self, text: str) -> str: - """Undo some normalizations made in `_file_iter`""" + """Undo some normalizations made in `prepare_line`""" # restore commas from lower comma characters text = re.sub(r"(\d)‚(\d)", r"\1,\2", text) text = re.sub(r"\\‚", "\\,", text) @@ -148,7 +157,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: h = 0 header: list[str] = [] for _, line in self._file_iter(filename): - if not (line) or line[0] == "#": + if not (line.strip()) or line[0] == "#": header.append(line) else: break @@ -157,7 +166,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: # we don't want to eat the comments of the next block # and it removes the last separating line for i in range(len(header)): - if header.pop(): + if header.pop().strip(): h -= 1 else: break @@ -169,7 +178,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool: if data.id.startswith("stopwords") or data.id.startswith("synonyms"): # stopwords and synonyms are one-liners; if the id is set, it's the end return True - if not line and data.id: + if not line.strip() and data.id: # entries are separated by a blank line return True return False @@ -181,7 +190,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: """ is_before = data.is_before # first, check if there is at least one preceding line - if data.preceding_lines and not data.preceding_lines[0]: + if data.preceding_lines and not data.preceding_lines[0].strip(): if data.id.startswith("synonyms"): # it's a synonyms block, # if the previous block is a stopwords block, @@ -211,7 +220,9 @@ def _get_node_data_with_comments_above_key( # Get comments just above the given line comments_above = [] current_line = line_number - 1 - while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line: + while ( + new_data.comments_stack and new_data.comments_stack[-1][0] == current_line + ): comments_above.append(new_data.comments_stack.pop()[1]) current_line -= 1 if comments_above: @@ -246,12 +257,12 @@ def is_entry_synonyms_line(self, line): matching_prefix = self._language_code_prefix.match(line) if matching_prefix: # verify it's not a property, that is a name followed by a colon and a language - return not ( - self._language_code_prefix.match(line[matching_prefix.end():]) - ) + return not (self._language_code_prefix.match(line[matching_prefix.end() :])) return False - def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: + def _harvest_entries( + self, filename: str, entries_start_line: int + ) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 @@ -263,12 +274,10 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N self.stopwords = {} # the first entry is after __header__ which was created before data = NodeData(is_before="__header__") - line_number = ( - entries_start_line # if the iterator is empty, line_number will not be unbound - ) - for line_number, line in self._file_iter(filename, entries_start_line): + line_number = entries_start_line # if the iterator is empty, line_number will not be unbound + for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended - if self._entry_end(line, data): + if self._entry_end(raw_line, data): data = self._remove_separating_line(data) if data.get_node_type() == NodeType.ENTRY: data = self._get_node_data_with_parent_and_end_comments(data) @@ -289,16 +298,16 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data = NodeData(is_before=is_before) # harvest the line - if not (line) or line[0] == "#": + if not (raw_line.strip()) or raw_line[0] == "#": # comment or blank line if data.id: # we are within the node definition - data.comments_stack.append((line_number, line)) + data.comments_stack.append((line_number, raw_line)) else: # we are before the actual node - data.preceding_lines.append(line) + data.preceding_lines.append(raw_line) else: - line = line.rstrip(",") + line = self.prepare_line(raw_line) if not data.src_position: data.src_position = line_number + 1 if line.startswith("stopwords"): @@ -309,7 +318,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N # remove "stopwords:" part line = line[10:] try: - lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False) + lc, tags, tags_ids = self._get_lc_value( + line, remove_stopwords=False + ) except ValueError: self.parser_logger.error( f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'" @@ -337,7 +348,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.tags["tags_ids_" + lc] = tags_ids elif line[0] == "<": # parent definition - data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + data.parent_tags.append( + (self._normalize_entry_id(line[1:]), line_number + 1) + ) elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -353,7 +366,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list = [] for word in line.split(","): tags_list.append(self.undo_normalize_text(word.strip())) - word_normalized = normalize_text(word, lang, stopwords=self.stopwords) + word_normalized = normalize_text( + word, lang, stopwords=self.stopwords + ) if word_normalized not in tagsids_list: # in case 2 normalized synonyms are the same tagsids_list.append(word_normalized) @@ -376,14 +391,17 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N property_name = property_name.strip() lc = lc.strip().replace("-", "_") if not ( - correctly_written.match(property_name) and correctly_written.match(lc) + correctly_written.match(property_name) + and correctly_written.match(lc) ): self.parser_logger.error( f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'" ) if property_name: prop_key = "prop_" + property_name + "_" + lc - data.properties[prop_key] = self.undo_normalize_text(property_value.strip()) + data.properties[prop_key] = self.undo_normalize_text( + property_value.strip() + ) data = self._get_node_data_with_comments_above_key( data, line_number, prop_key ) @@ -409,7 +427,9 @@ def _normalise_and_validate_child_links( # we collect all the tags_ids in a certain language tags_ids = {} for node in entry_nodes: - node_tags_ids = {tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])} + node_tags_ids = { + tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", []) + } tags_ids.update(node_tags_ids) # we check if the parent_id exists in the tags_ids @@ -418,7 +438,9 @@ def _normalise_and_validate_child_links( if parent_id not in tags_ids: missing_child_links.append(child_link) else: - child_link["parent_id"] = tags_ids[parent_id] # normalise the parent_id + child_link["parent_id"] = tags_ids[ + parent_id + ] # normalise the parent_id normalised_child_links.append(child_link) return normalised_child_links, missing_child_links @@ -444,8 +466,10 @@ def _get_valid_child_links( ] # Normalise and validate the unnormalised links - normalised_child_links, missing_child_links = self._normalise_and_validate_child_links( - entry_nodes, child_links_to_normalise + normalised_child_links, missing_child_links = ( + self._normalise_and_validate_child_links( + entry_nodes, child_links_to_normalise + ) ) valid_child_links.extend(normalised_child_links) @@ -459,7 +483,9 @@ def _get_valid_child_links( return valid_child_links - def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]: + def _remove_duplicate_child_links( + self, child_links: list[ChildLink] + ) -> list[ChildLink]: """Remove duplicate child links (i.e child links with the same parent_id and id)""" unique_child_links = [] children_to_parents = collections.defaultdict(set) @@ -470,10 +496,13 @@ def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[Ch unique_child_links.append(child_link) return unique_child_links - def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]: + def _merge_duplicate_entry_nodes( + self, entry_nodes: list[NodeData] + ) -> list[NodeData]: """Merge entry nodes with the same id: - merge their tags (union) - - merge their properties (union, and in case of conflict, keep the last value)""" + - merge their properties (union, and in case of conflict, keep the last value) + """ unique_entry_nodes = [] ids_to_nodes = dict() for node in entry_nodes: @@ -539,7 +568,9 @@ def _create_taxonomy( entry_nodes: list[NodeData] = [] entry_nodes.extend(external_entry_nodes) other_nodes = [ - NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1) + NodeData( + id="__header__", preceding_lines=harvested_header_data, src_position=1 + ) ] previous_links: list[PreviousLink] = [] raw_child_links: list[ChildLink] = [] @@ -551,7 +582,9 @@ def _create_taxonomy( else: other_nodes.append(entry) if entry.is_before: - previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id)) + previous_links.append( + PreviousLink(before_id=entry.is_before, id=entry.id) + ) if entry.parent_tags: for position, (parent, line_position) in enumerate(entry.parent_tags): raw_child_links.append( @@ -584,7 +617,9 @@ def parse_file( start_time = timeit.default_timer() filename = normalize_filename(filename) taxonomy = self._create_taxonomy(filename, external_filenames) - self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.") + self.parser_logger.info( + f"Parsing done in {timeit.default_timer() - start_time} seconds." + ) self.parser_logger.info( f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes" ) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 158c2578..4a4770aa 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -67,7 +67,11 @@ def list_property_and_lc(self, node): """return an ordered list of properties with their language code (lc)""" # there is no rule for the order of properties # properties will be arranged in alphabetical order - values = [property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")] + values = [ + property[5:] + for property in node + if property.startswith("prop_") and not property.endswith("_comments") + ] # note: using the fact that we are sure to find language code after the first underscore return sorted(values, key=self.property_sort_key) @@ -83,7 +87,7 @@ def get_parents_lines(self, parents): parent = dict(parent) lc = parent["main_language"] parent_id = parent["tags_" + lc][0] - yield "<" + lc + ": " + parent_id + yield "< " + lc + ":" + parent_id def iter_lines(self, project_label): previous_block_id = "" @@ -91,9 +95,9 @@ def iter_lines(self, project_label): node = dict(node) has_content = node["id"] not in ["__header__", "__footer__"] # eventually add a blank line but in specific case - following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith( + following_synonyms = node["id"].startswith( "synonyms" - ) + ) and previous_block_id.startswith("synonyms") following_stopwords = node["id"].startswith( "stopwords" ) and previous_block_id.startswith("stopwords") diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py index 36225921..fa8a61d6 100644 --- a/parser/openfoodfacts_taxonomy_parser/utils.py +++ b/parser/openfoodfacts_taxonomy_parser/utils.py @@ -4,7 +4,12 @@ import unidecode -def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: dict[str, list[str]] | None = None) -> str: +def normalize_text( + line: str, + lang: str = "default", + char: str = "-", + stopwords: dict[str, list[str]] | None = None, +) -> str: """Normalize a string depending on the language code""" if stopwords is None: stopwords = {} @@ -43,7 +48,9 @@ def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: stopwords = stopwords[lang] line_surrounded_by_char = char + line + char for stopword in stopwords: - line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char) + line_surrounded_by_char = line_surrounded_by_char.replace( + char + stopword + char, char + ) line = line_surrounded_by_char[1:-1] return line diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index 92f38f86..c501a4b5 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit synonyms:fr: fruit de la passion, maracuja, passion -