"""
Functions for preprocessing data for training purposes
Author: Jeanne Elizabeth Daniel
April 2019
"""
import pandas as pd
import numpy as np
import gensim
def preprocess(text, min_token_length=0, join=False):
    """ Preprocess a string of text into a list of cleaned tokens.

    Args:
        text: string of text
        min_token_length: minimum number of characters a token must have to be kept by the tokenizer
        join: boolean indicating whether to join the list of tokens back into a single string

    Returns:
        list of cleaned words, or a joined string if join is True
    """
    if not isinstance(text, str):
        return []
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=min_token_length):
        # tokens of two characters or fewer are always dropped, regardless of min_token_length
        if len(token) > 2:
            result.append(token)
    if join:
        return ' '.join(result)
    return result
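# Illustrative example (assumes a short free-text helpdesk question):
#   preprocess("How do I reset my PIN?", min_token_length=2)
#   -> ['how', 'reset', 'pin']
#   preprocess("How do I reset my PIN?", min_token_length=2, join=True)
#   -> 'how reset pin'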
def create_dictionary(train_data, no_below=1, no_above=0.25, keep_n=95000, min_token_length=0):
    """ Create a dictionary of all words in the training set that satisfy the filtering conditions below.

    Args:
        train_data: dataframe with questions
        no_below: integer - minimum number of documents a word must occur in
        no_above: float between 0 and 1 - maximum proportion of documents containing the word
        keep_n: maximum number of words in the vocabulary
        min_token_length: minimum number of characters a token must have

    Returns:
        dictionary mapping each word in the training set to an integer ID, in plain "dict" format
    """
    documents = train_data[['helpdesk_question']].copy()  # copy to avoid modifying a slice of train_data
    documents['index'] = documents.index
    processed_docs = documents['helpdesk_question'].apply(preprocess, args=[min_token_length])
    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    # dict(dictionary) maps ID -> token; invert it to obtain a token -> ID lookup
    dictionary_of_words = pd.DataFrame(pd.Series(dict(dictionary)))
    dictionary_of_words['index'] = dictionary_of_words.index
    return dictionary_of_words.set_index(0)['index'].to_dict()
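# Illustrative sketch (assumes a small dataframe with a 'helpdesk_question' column, as used above;
# the exact IDs depend on gensim's internal ordering):
#   train_data = pd.DataFrame({'helpdesk_question': ['How do I reset my PIN?',
#                                                    'Where can I reset my password?']})
#   create_dictionary(train_data, no_below=1, no_above=1.0)
#   -> a dict mapping each surviving token to an integer ID, e.g. {'how': ..., 'reset': ..., ...}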
def filter_words(text_list, dictionary):
    """ Filter a sentence to remove any words that do not appear in our dictionary.

    Args:
        text_list: list of words in a sentence
        dictionary: dictionary of words in the training set

    Returns:
        filtered list of words in the sentence
    """
    result = []
    for t in text_list:
        if dictionary.get(t) is not None:
            result.append(t)
    return result
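# Illustrative example (assumes a token -> ID dict like the one produced by create_dictionary):
#   filter_words(['how', 'reset', 'pin'], {'reset': 0, 'pin': 1})
#   -> ['reset', 'pin']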
def preprocess_question(question, dictionary, minimum_token_length):
    """ Create a list of cleaned and filtered words for a sentence.

    Args:
        question: string of text
        dictionary: dictionary of words in the training set
        minimum_token_length: minimum number of characters a token must have

    Returns:
        cleaned and filtered list of words
    """
    return filter_words(preprocess(question, minimum_token_length), dictionary)
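# Illustrative example, combining the two helpers above (dictionary values are hypothetical):
#   preprocess_question("How do I reset my PIN?", {'reset': 0, 'pin': 1}, 2)
#   -> ['reset', 'pin']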
def create_lookup_tables(unique_words):
    """ Create lookup tables for word_to_id and id_to_word.

    Args:
        unique_words: dictionary of words in the training set

    Returns:
        word_to_id: dict with words as keys and corresponding IDs as values
        id_to_word: dict with IDs as keys and corresponding words as values
    """
    word_to_id = {}  # word -> id lookup
    id_to_word = {}  # id -> word lookup
    # IDs start at 1 so that 0 stays free as the padding ID
    for index, word in enumerate(sorted(list(unique_words))):
        word_to_id[word] = index + 1
        id_to_word[index + 1] = word
    return word_to_id, id_to_word
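# Illustrative example (words are sorted alphabetically before IDs are assigned):
#   create_lookup_tables({'reset': 0, 'pin': 1})
#   -> ({'pin': 1, 'reset': 2}, {1: 'pin', 2: 'reset'})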
def transform_sequence_to_word_ids(seq, word_to_id):
    """ Create a list of word IDs for a sequence of words, padded with zeros and truncated to a fixed length of 30.

    Args:
        seq: list of words (already filtered, so every word appears in word_to_id)
        word_to_id: dict with words as keys and corresponding IDs as values

    Returns:
        list of single-element lists of word IDs, padded and truncated to 30 items
    """
    seq_word_ids = []
    for word in seq:
        seq_word_ids.append([word_to_id[word]])
    # pad the sequence with zeros (ID 0 is reserved for padding)
    for i in range(30 - len(seq_word_ids)):
        seq_word_ids.append([0])
    # truncate to the fixed length of 30
    return seq_word_ids[:30]
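# Illustrative example (assumes the lookup table from create_lookup_tables above):
#   transform_sequence_to_word_ids(['reset', 'pin'], {'pin': 1, 'reset': 2})
#   -> [[2], [1], [0], [0], ..., [0]]   (28 zero entries of padding, 30 entries in total)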
def create_one_hot_vector_for_reply(reply, all_responses):
    """ Construct a one-hot vector for a reply.

    Args:
        reply: the reply (template response) to encode
        all_responses: dict containing all the template responses with their corresponding IDs

    Returns:
        a one-hot vector where the corresponding ID of the reply is the one-hot index
    """
    Y = np.zeros(len(all_responses), dtype=int)
    Y[all_responses[reply]] += 1
    return Y
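# Illustrative example (uses two hypothetical template responses):
#   create_one_hot_vector_for_reply('reply B', {'reply A': 0, 'reply B': 1})
#   -> array([0, 1])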
def label_preprocess(entry, responses):
    """ Return the integer ID corresponding to a response, for easy comparison and classification.

    Args:
        entry: the response to encode
        responses: dict containing all the template responses with their corresponding IDs

    Returns:
        integer ID corresponding to the response, or len(responses) for the default unknown class
    """
    if responses.get(entry) is not None:
        return responses[entry]
    else:
        return len(responses)  # default unknown class
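# Illustrative example (same hypothetical response dict as above):
#   label_preprocess('reply B', {'reply A': 0, 'reply B': 1})            -> 1
#   label_preprocess('some unseen reply', {'reply A': 0, 'reply B': 1})  -> 2   (unknown class)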
def sample_pairs_offline(df, sample_size=10):
    """ Offline sampling of sentence pairs for training.

    Args:
        df: dataframe of questions and answers
        sample_size: number of positive/negative samples per sentence

    Returns:
        a dataframe of positive and negative pairs
    """
    sentences_1 = []
    sentences_2 = []
    labels = []
    df['helpdesk_question_clean'] = df['helpdesk_question'].apply(preprocess, args=[0, True])
    for group in df.groupby('helpdesk_reply'):
        questions = list(group[1]['helpdesk_question_clean'])
        low_resource = list(group[1]['low_resource'])
        for i in range(len(questions)):
            q = questions[i]
            if len(preprocess(q, 0)) > 0:
                # sample positives: other questions answered by the same reply
                for s in list(group[1]['helpdesk_question_clean'].sample(sample_size)):
                    if s != q and len(preprocess(s, 0)) > 0:
                        # order each pair consistently so that duplicates can be dropped later
                        if s > q:
                            sentences_1.append(s)
                            sentences_2.append(q)
                            labels.append(1)  # positive
                        else:
                            sentences_1.append(q)
                            sentences_2.append(s)
                            labels.append(1)  # positive
                # sample negatives: questions answered by a different reply
                negatives = df.loc[df['helpdesk_reply'] != group[0]]
                samples = negatives['helpdesk_question_clean'].sample(sample_size)
                if samples.shape[0] > 0:
                    for s in list(samples):
                        if len(preprocess(s, 0)) > 0:
                            if s > q:
                                sentences_1.append(s)
                                sentences_2.append(q)
                                labels.append(0)  # negative
                            else:
                                sentences_1.append(q)
                                sentences_2.append(s)
                                labels.append(0)  # negative
    data_pairs = pd.concat([pd.Series(sentences_1), pd.Series(sentences_2), pd.Series(labels)], axis=1)
    del sentences_1, sentences_2, labels
    return data_pairs.drop_duplicates()
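# Minimal usage sketch (assumes a dataframe with 'helpdesk_question', 'helpdesk_reply' and
# 'low_resource' columns, as referenced above; note that pandas' .sample raises a ValueError
# when a reply group holds fewer than sample_size rows):
#   pairs = sample_pairs_offline(df, sample_size=10)
#   # pairs has three unnamed columns: sentence 1, sentence 2, and a 0/1 label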