-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
97 lines (91 loc) · 3.31 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
""" Copied from https://github.com/gyyang/multitask.
Some functions are moved from network.py
"""
import os
from io import open
import json
import numpy as np
class Dictionary(object):
    """Two-way mapping between words and integer indices.

    ``word2idx`` maps each word to its index and ``idx2word`` is the list of
    words ordered by index, so ``idx2word[word2idx[w]] == w`` for every word
    added through ``add_word``.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is new and return its index.

        A new word receives the next free index (the current vocabulary
        size); a word that is already present keeps its existing index.
        """
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
        return self.word2idx[word]

    def __len__(self):
        """Return the number of words stored."""
        return len(self.idx2word)

    def load(self, path):
        """Replace the vocabulary with the one stored in a JSON file.

        The file must hold a JSON object with the keys ``'word2idx'`` and
        ``'idx2word'``.

        Raises:
            IOError/OSError: if ``path`` cannot be opened. The file is opened
                directly instead of being guarded by an ``assert``, because
                assertions are removed when Python runs with ``-O``.
            KeyError: if one of the two expected keys is missing.
        """
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(path, encoding='utf-8') as f:
            dicts = json.load(f)
        # Read both entries before assigning so that a malformed file cannot
        # leave the instance half-updated.
        word2idx = dicts['word2idx']
        idx2word = dicts['idx2word']
        self.word2idx = word2idx
        self.idx2word = idx2word
def unkify(tokens, words_dict):
    """Map out-of-vocabulary tokens to fine-grained UNK symbols.

    Args:
        tokens: iterable of strings; trailing whitespace of each token is
            ignored.
        words_dict: container supporting ``in`` that holds the known words.

    Returns:
        A list with one entry per token: the right-stripped token itself when
        it is in ``words_dict``, the plain string ``'UNK'`` when the token is
        empty after stripping, and otherwise ``'UNK'`` followed by signature
        tags describing capitalisation (``-INITC``, ``-KNOWNLC``, ``-CAPS``,
        ``-LC``), digits (``-NUM``), dashes (``-DASH``) and the word ending
        (``-s``, ``-ed``, ``-ing``, ...).
    """
    # Endings are tested in this order and the first match wins, so e.g.
    # 'ly' and 'ity' take precedence over the bare 'y'.
    suffixes = ('ed', 'ing', 'ion', 'er', 'est', 'ly', 'ity', 'y', 'al')

    final = []
    for token in tokens:
        word = token.rstrip()
        if not word:
            final.append('UNK')
            continue
        if word in words_dict:
            final.append(word)
            continue

        # Gather character-level evidence. A character counts as a digit
        # first, then as a dash, and only then as a letter.
        num_caps = 0
        has_digit = False
        has_dash = False
        has_lower = False
        for char in word:
            if char.isdigit():
                has_digit = True
            elif char == '-':
                has_dash = True
            elif char.isalpha():
                if char.islower():
                    has_lower = True
                elif char.isupper():
                    num_caps += 1

        result = 'UNK'
        lower = word.lower()
        ch0 = word[0]

        # Capitalisation tag.
        if ch0.isupper():
            if num_caps == 1:
                result += '-INITC'
                # Capitalised form of a word known in lower case.
                if lower in words_dict:
                    result += '-KNOWNLC'
            else:
                result += '-CAPS'
        elif not ch0.isalpha() and num_caps > 0:
            result += '-CAPS'
        elif has_lower:
            result += '-LC'

        if has_digit:
            result += '-NUM'
        if has_dash:
            result += '-DASH'

        # Ending tag. A final 's' on a word of three or more characters
        # closes this step even when no '-s' is emitted (endings 'ss', 'is'
        # and 'us' are excluded), so the other suffixes are not tried then.
        if lower[-1] == 's' and len(lower) >= 3:
            if lower[-2] not in ('s', 'i', 'u'):
                result += '-s'
        elif (len(lower) >= 5 and not has_dash
                and not (has_digit and num_caps > 0)):
            for suffix in suffixes:
                if lower.endswith(suffix):
                    result += '-' + suffix
                    break

        final.append(result)
    return final