-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimport_parallel_to_tfcsv.py
121 lines (107 loc) · 3.59 KB
/
import_parallel_to_tfcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import cProfile
import csv
import re
from betacode import decode
# Counters used only by the betacode decoding pass, which is currently
# disabled (see the NOTE inside the loop); kept so the names stay defined.
ketiv = 0
qere = 0
rafe = 0
line = ""

# Every row of tfdata.csv, in file order, as dicts from csv.DictReader.
tf_content = []
# Verse id (the RecordId text before its last comma) -> RecordIds of that verse.
verses = {}
# RecordId -> index of the corresponding row in tf_content.
recordid_map = {}

print("Reading tfdata ...")
# Columns: RecordId, HebrewText, HebLemma, SDBH, LexDomain, GLemma
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to csv.reader/DictReader.
# NOTE(review): the explicit encoding assumes the input is UTF-8, matching
# the encoding this script uses when writing processed_tfdata.csv - confirm.
with open('tfdata.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        tf_content.append(row)
        record_id = row["RecordId"]
        # Greedy match: group(1) is everything before the LAST comma.
        v = re.match(r'(^.*),', record_id)
        if v:
            verses.setdefault(v.group(1), []).append(record_id)
        # A repeated RecordId overwrites the earlier index (last row wins).
        recordid_map[record_id] = len(tf_content) - 1
        # NOTE: an earlier revision decoded row['HebrewText'] from betacode
        # here (stripping digits, '/', '*' and ',' while counting qere, ketiv
        # and rafe markers) and stored a reshaped row with id/betacode/hebrew/
        # lemma/sdbh/domain/glemma keys. That pass is disabled; rows are kept
        # exactly as read.
        if len(tf_content) % 50000 == 0:
            print(" |", len(tf_content), "lines processed")
print(" ^", len(tf_content), "lines processed")
print("Completed tfdata prep\n")
# Columns: HebLemma, MTVerseID, MTSTartSequence, MTEndSequence, GrkLemma, LXXVerseID, LXXPhrase
# Every row of parallelsearch.csv, in file order, as dicts from csv.DictReader.
og_content = []
# Number of rows whose GrkLemma field is the empty string (reported below).
emptylemma = 0
print("Reading ogdata ...")
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to csv.reader/DictReader.
# NOTE(review): the explicit encoding assumes the input is UTF-8, matching
# the encoding this script uses when writing processed_tfdata.csv - confirm.
with open('parallelsearch.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        og_content.append(row)
        if row["GrkLemma"] == "":
            emptylemma += 1
        if len(og_content) % 50000 == 0:
            print(" |", len(og_content), "lines processed")
print(" ^", len(og_content), "lines processed")
print("Completed ogdata prep ({} empty lemmas)\n".format(emptylemma))
# Group 1: word characters, a space, digits ':' digits and a comma, anchored
# at the start of the id. Group 2: the position that follows the comma.
# Written as a raw string so that \w and \d are regex escapes rather than
# invalid string escapes.
# NOTE(review): the '.' in group 2 is unescaped and therefore matches any
# character; it is kept as-is so every id that matched before still matches.
# Confirm whether a literal decimal point was intended.
_RECORD_ID_RE = re.compile(r"(^\w+ \d+:\d+,)(\d+.\d+)")


def id_in_range(rid, lower, upper):
    """Return True if the position encoded in *rid* lies in [lower, upper].

    The position is the text captured after the comma in *rid*, converted
    with float(); both bounds are inclusive.

    Args:
        rid: record id string to inspect.
        lower: inclusive lower bound, compared against the float position.
        upper: inclusive upper bound, compared against the float position.

    Raises:
        ValueError: if *rid* does not match the expected pattern, or if the
            captured position text cannot be converted by float().
    """
    m = _RECORD_ID_RE.search(rid)
    if m is None:
        # Fail with a message naming the offending id instead of an
        # AttributeError on None.
        raise ValueError("unrecognised record id: {!r}".format(rid))
    return lower <= float(m.group(2)) <= upper
def partial_match_id_range(abbreviated_map_ids, id_partial_start, id_partial_end):
    """Return the ids whose position falls between the two given bounds.

    Both bounds are converted with float() once, then every id in
    *abbreviated_map_ids* is checked with id_in_range(); ids that pass are
    returned as a new list in their original order.
    """
    low = float(id_partial_start)
    high = float(id_partial_end)
    return [rid for rid in abbreviated_map_ids if id_in_range(rid, low, high)]
def process_everything():
    """Copy Greek lemmas from the parallel rows into the tfdata rows.

    Walks og_content and, for each row, writes its GrkLemma into the
    "GLemma" field of matching tf_content rows (modified in place):

    * Rows whose MTSTartSequence is empty, or equal to the literal column
      name "MTSTartSequence", are skipped.
    * When start and end sequence are equal, the single record
      "<MTVerseID>,<MTSTartSequence>" receives the lemma unconditionally.
    * Otherwise every record of the verse whose position lies in the
      start..end range is examined, and only those whose HebLemma equals
      the parallel row's HebLemma receive the lemma. A row that matches no
      record counts as a failure.

    Progress and the final failure count are printed; nothing is returned.

    Raises:
        KeyError: if a referenced verse id or record id is missing from
            verses / recordid_map.
    """
    # inject greek stuff...
    failures = 0
    print("Processing parallels ({})".format(len(og_content)))
    for i, og in enumerate(og_content):
        if i % 50000 == 0:
            print(" |", i, "matched rows")

        start = og["MTSTartSequence"]
        if start == "" or start == "MTSTartSequence":
            continue

        end = og["MTEndSequence"]
        if start == end:
            # Single position: address the record directly by its full id.
            rid = og["MTVerseID"] + "," + start
            tf_content[recordid_map[rid]]["GLemma"] = og["GrkLemma"]
            continue

        # Position range: only records sharing the Hebrew lemma are updated.
        success = False
        for rid in partial_match_id_range(verses[og["MTVerseID"]], start, end):
            tf_row = tf_content[recordid_map[rid]]
            if tf_row["HebLemma"] == og["HebLemma"]:
                success = True
                tf_row["GLemma"] = og["GrkLemma"]
        if not success:
            failures += 1
    print(" ^ DONE:", failures, "words without greek lemmas\n")
# To profile the run, use this instead of the plain call below:
# cProfile.run('process_everything()')
process_everything()

print("Writing csv file...")
filename = "processed_tfdata.csv"
# newline='' stops the text layer from translating the csv writer's own
# "\r\n" row terminator, which would otherwise come out as "\r\r\n" on
# platforms that use "\r\n" line endings.
with open(filename, mode='wt', encoding='utf-8', newline='') as out:
    csvout = csv.DictWriter(out, ["RecordId", "HebrewText", "HebLemma", "SDBH", "LexDomain", "GLemma"])
    csvout.writeheader()
    # Rows are the tfdata dicts, with GLemma filled in by process_everything().
    csvout.writerows(tf_content)
print("done")