openfoodfacts · hangy · Dec 22, 2024 · Dec 22, 2024
@@ -8,114 +8,118 @@
 
 # In[]:
 
+import pandas as pd
 urls = ['https://oapi.fsai.ie/LAApprovedEstablishments.aspx',
         'https://oapi.fsai.ie/AuthReg99901Establishments.aspx',
         'https://oapi.fsai.ie/HSEApprovedEstablishments.aspx'
-       ]
+        ]
 urls_second_format = ['https://www.sfpa.ie/Seafood-Safety/Registration-Approval-of-Businesses/List-of-Approved-Establishments-and-Vessels/Approved-Establishments',
                       'https://www.sfpa.ie/Seafood-Safety/Registration-Approval-of-Businesses/Approved-Freezer-Vessels'
-                     ]
+                      ]
 
 csv_file = 'Ireland_concatenated.csv'
 
-import pandas as pd
 pages = [pd.read_html(url) for url in urls]
-pages2= [pd.read_html(url) for url in urls_second_format]
+pages2 = [pd.read_html(url) for url in urls_second_format]
 
 
 # In[]:
 
-def ireland_correction_of_1_dataframe(df):     #Version to get anything
-    #print ("df as recuperated :")
-    #print(df.head())
+def ireland_correction_of_1_dataframe(df):  # Version to get anything
+    # print ("df as recuperated :")
+    # print(df.head())
     df.columns = df.iloc[[0]].values.tolist()
     df = df.rename(columns={' Address': 'Address'})
-    df=df.drop(df.index[0]) #
+    df = df.drop(df.index[0])
     row_reference = df.iloc[0]
 
     if 'Approval_Number' not in df.columns:
         print("this table has no approval number and was not added")
         return pd.DataFrame()
 
-    df_is_null=df.isnull()
-    for i in range(1,len(df)): #len(df)
-        if df_is_null.iloc[i,len(df.columns)-1]:   #We assume that on a row, there is no merged cell(null in pandas) on the webpage after an unmerged cell (not null)
-            row_retrieved=[]
+    df_is_null = df.isnull()
+    for i in range(1, len(df)):  # len(df)
+        # We assume that, within a row, no merged cell (null in pandas) follows an unmerged (non-null) cell on the webpage.
+        if df_is_null.iloc[i, len(df.columns)-1]:
+            row_retrieved = []
             value = ""
-            j=0
-            while not df_is_null.iloc[i,j]:
-                value=df.iloc[i,j]
+            j = 0
+            while not df_is_null.iloc[i, j]:
+                value = df.iloc[i, j]
                 row_retrieved.append(value)
-                #print("while loop - j:"+str(j)+ "value : "+str(value))
-                j+=1
+                # print("while loop - j:"+str(j)+ "value : "+str(value))
+                j += 1
             row = row_reference.copy()
-            row[len(row)-len(row_retrieved):len(row)]=row_retrieved
-            df.iloc[i]= row
-
-        row_reference =df.iloc[i]
+            row[len(row)-len(row_retrieved):len(row)] = row_retrieved
+            df.iloc[i] = row
 
+        row_reference = df.iloc[i]
 
-    df["Address"]=df["Address"].apply(add_space_before_uppercase)
+    df["Address"] = df["Address"].apply(add_space_before_uppercase)
 
-    #print ("result corrected : ")
-    #print(df.head())
+    # print ("result corrected : ")
+    # print(df.head())
     return df
 
-#df=pages[0][18]
-#ireland_correction_of_1_dataframe(df)
+# df=pages[0][18]
+# ireland_correction_of_1_dataframe(df)
 
 
 # In[]:
 
 def add_space_before_uppercase(words):
-        result=""
-        for s in words:
-            if isinstance(s, str):
-                if s.isupper():
-                    result+=" "
-            result+=s
-        return result
+    result = ""
+    for s in words:
+        if isinstance(s, str):
+            if s.isupper():
+                result += " "
+        result += s
+    return result
+
+
 """ This could have been done more efficienty using Regex r"[a-z][A-Z]"" and avoid r" [A-Z]". But google maps recognize it this way."""
 
 
 # In[ ]:
 
-df=pd.DataFrame()
+df = pd.DataFrame()
 
 
 # In[]:
 
-i=0
+i = 0
 for page in pages:
-    j=0
+    j = 0
     for table in page:
-        df=df.append(ireland_correction_of_1_dataframe(table), ignore_index=True)
-        #print ("table "+str(j)+" is ok")
-        #j+=1
-    print ("page "+str(i)+" is done")
-    i+=1
+        df = df.append(ireland_correction_of_1_dataframe(
+            table), ignore_index=True)
+        # print ("table "+str(j)+" is ok")
+        # j+=1
+    print("page "+str(i)+" is done")
+    i += 1
 print("finished for all in urls!")
 
 
 # In[]:
 
-i=0
+i = 0
 for page2 in pages2:
-    j=0
+    j = 0
     for table in page2:
-        #print (table.head(3))
-        table=table.drop(table.index[0])
-        table.loc[0,0]='Approval_Number'
-        #print (ireland_correction_of_1_dataframe(table).head())
-        df=df.append(ireland_correction_of_1_dataframe(table), ignore_index=True)
-        print ("table "+str(j)+" is ok")
-        j+=1
-    print ("page "+str(i)+" is done")
-    i+=1
+        # print (table.head(3))
+        table = table.drop(table.index[0])
+        table.loc[0, 0] = 'Approval_Number'
+        # print (ireland_correction_of_1_dataframe(table).head())
+        df = df.append(ireland_correction_of_1_dataframe(
+            table), ignore_index=True)
+        print("table "+str(j)+" is ok")
+        j += 1
+    print("page "+str(i)+" is done")
+    i += 1
 print("finished for table in urls_second_format!")
 
 
 # In[]:
 
 
-df.to_csv(csv_file, index = False)
+df.to_csv(csv_file, index=False)
@@ -3,6 +3,7 @@
 import os
 import pandas
 
+
 def main():
     if not (os.getenv('OFF_PUBLIC_DATA_DIR') and os.getenv('PRODUCT_OPENER_FLAVOR') and os.getenv('PRODUCT_OPENER_FLAVOR_SHORT')):
         print("Environment variables OFF_PUBLIC_DATA_DIR, PRODUCT_OPENER_FLAVOR and PRODUCT_OPENER_FLAVOR_SHORT are required")
@@ -13,15 +14,18 @@ def main():
 
     if not os.path.exists(off_public_data_dir + '/offline'):
         os.makedirs(off_public_data_dir + '/offline')
-
-    df = pandas.read_csv(off_public_data_dir + '/en.' + product_opener_flavor + '.org.products.csv', sep='\t', low_memory=False)
-    colnames = ['code','product_name','quantity','brands']
+
+    df = pandas.read_csv(off_public_data_dir + '/en.' + product_opener_flavor +
+                         '.org.products.csv', sep='\t', low_memory=False)
+    colnames = ['code', 'product_name', 'quantity', 'brands']
     # add 'nutriscore_grade','nova_group','environmental_score_grade' columns if the flavor is off
     if product_opener_flavor_short == 'off':
-        colnames = colnames + ['nutriscore_grade','nova_group','environmental_score_grade']
+        colnames = colnames + ['nutriscore_grade',
+                               'nova_group', 'environmental_score_grade']
+
+    df.rename(columns={'nutriscore_grade': 'nutrition_grade_fr'}).to_csv(off_public_data_dir + '/offline/en.' +
+                                                                         product_opener_flavor + '.org.products.small.csv', columns=colnames, sep='\t', index=False)
+
 
-    df.rename(columns={'nutriscore_grade': 'nutrition_grade_fr'}).to_csv(off_public_data_dir + '/offline/en.' + product_opener_flavor + '.org.products.small.csv', columns = colnames,sep='\t',index=False)
-
 if __name__ == '__main__':
     main()
-
@@ -14,75 +14,81 @@
 
 temporary_exists = os.path.isfile(PATH_TO_TEMPORARY)
 if temporary_exists:
-	print "The temporary file already exists"
-	exit()
+    print "The temporary file already exists"
+    exit()
 
 ingredients_exists = os.path.isfile(PATH_TO_INGREDIENTS)
 if not ingredients_exists:
-        print "The ingredient file does not exist, check the path :" + PATH_TO_INGREDIENTS
-        exit()
+    print "The ingredient file does not exist, check the path :" + PATH_TO_INGREDIENTS
+    exit()
 
 foodGES_exists = os.path.isfile(PATH_TO_FOODGES)
 if not foodGES_exists:
-        print "The foodGES file does not exist, check the path :" + PATH_TO_FOODGES
-        exit()
+    print "The foodGES file does not exist, check the path :" + PATH_TO_FOODGES
+    exit()
+
 
 def check_next_lines(ingredients):
-	next_line_is_not_foodges = True
-	keep_lines = []
-	while next_line_is_not_foodges:
-		next_line = ingredients.readline()
-		keep_lines.append(next_line)	  
-		if STRING_FOODGES_VALUE not in next_line and STRING_FOODGES_INGREDIENT not in next_line:
-			next_line_is_not_foodges = False
-	return keep_lines
+    next_line_is_not_foodges = True
+    keep_lines = []
+    while next_line_is_not_foodges:
+        next_line = ingredients.readline()
+        keep_lines.append(next_line)
+        if STRING_FOODGES_VALUE not in next_line and STRING_FOODGES_INGREDIENT not in next_line:
+            next_line_is_not_foodges = False
+    return keep_lines
+
 
 def write_next_lines(next_lines, temporary_file):
-	size = len(next_lines)
-	for i in range(0, size-1):
-		line = next_lines[i]
-		if STRING_FOODGES_INGREDIENT in line:
-			temporary_file.write(line)
-			if line.rstrip("\n") not in dict:
-                        	print("this mapping is not known : " + line.rstrip("\n"))
-			else:
-				temporary_file.write(STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n")
-				if line.rstrip("\n") in unused_mappings:
-					unused_mappings.remove(line.rstrip("\n"))
-	temporary_file.write(next_lines[size-1])
+    size = len(next_lines)
+    for i in range(0, size-1):
+        line = next_lines[i]
+        if STRING_FOODGES_INGREDIENT in line:
+            temporary_file.write(line)
+            if line.rstrip("\n") not in dict:
+                print("this mapping is not known : " + line.rstrip("\n"))
+            else:
+                temporary_file.write(
+                    STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n")
+                if line.rstrip("\n") in unused_mappings:
+                    unused_mappings.remove(line.rstrip("\n"))
+    temporary_file.write(next_lines[size-1])
+
 
 with open(PATH_TO_FOODGES, 'r') as csvFile:
-	reader = csv.reader(csvFile)
-	for row in reader:
-		dict[row[2]]=row[1]
-		unused_mappings.append(row[2])
+    reader = csv.reader(csvFile)
+    for row in reader:
+        dict[row[2]] = row[1]
+        unused_mappings.append(row[2])
 
 csvFile.close()
 
-temporary_file = open(PATH_TO_TEMPORARY,"w+")
+temporary_file = open(PATH_TO_TEMPORARY, "w+")
 ingredients = file(PATH_TO_INGREDIENTS)
 
 while True:
-	line = ingredients.readline()
-	temporary_file.write(line)
-	if not line: break
-	if STRING_FOODGES_INGREDIENT in line:
-		if line.rstrip("\n") not in dict:
-			print("this mapping is not known : " + line.rstrip("\n"))
-		else:
-			temporary_file.write(STRING_FOODGES_VALUE + dict.get(line.rstrip("\n")) + "\n")
-			if line.rstrip("\n") in unused_mappings:
-				unused_mappings.remove(line.rstrip("\n"))
-			next_lines = check_next_lines(ingredients)
-			write_next_lines(next_lines, temporary_file)
+    line = ingredients.readline()
+    temporary_file.write(line)
+    if not line:
+        break
+    if STRING_FOODGES_INGREDIENT in line:
+        if line.rstrip("\n") not in dict:
+            print("this mapping is not known : " + line.rstrip("\n"))
+        else:
+            temporary_file.write(STRING_FOODGES_VALUE +
+                                 dict.get(line.rstrip("\n")) + "\n")
+            if line.rstrip("\n") in unused_mappings:
+                unused_mappings.remove(line.rstrip("\n"))
+            next_lines = check_next_lines(ingredients)
+            write_next_lines(next_lines, temporary_file)
 
 ingredients.close()
-temporary_file.close() 
+temporary_file.close()
 
 os.remove(PATH_TO_INGREDIENTS)
 os.rename(PATH_TO_TEMPORARY, PATH_TO_INGREDIENTS)
 
 print("\n")
 print "This is the list of unused mapping : "
 for mapping in unused_mappings:
-	print mapping
+    print mapping