forked from Networks-Learning/prediction-powered-ranking
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_process.py
197 lines (155 loc) · 6.87 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import json
import random
import numpy as np
def input_json(path, ignore_ties=False):
    """
    Parses a json file of pairwise comparison data

    The file is read line by line and every non-blank line is decoded as one
    JSON object. Blank (or whitespace-only) lines are skipped.

    Parameters
    ----------
    path : string
        file path of json file with data (read as UTF-8)
        each JSON object must include the keys "question_id", "winner",
        "model_a", "model_b"
    ignore_ties : bool, optional
        The default is False.
        If True, ignores the pairwise comparisons that are ties, i.e. every
        row whose "winner" is neither 'model_a' nor 'model_b'

    Returns
    -------
    data : list of dictionaries
        Keys: 'question_id', 'model_a', 'model_b', 'winner'
        Each element corresponds to one kept row of the json dataset
    instances : dictionary
        Keys: every distinct tuple ('question_id', 'model_a', 'model_b')
        Values: list of indices into data of the rows that correspond to tuple

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a row lacks one of the required keys.
    """
    decisive_outcomes = ('model_a', 'model_b')
    data = []
    instances = {}
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # An empty line (e.g. an extra trailing newline) is not a record.
            if not line.strip():
                continue
            row = json.loads(line)
            if ignore_ties and row['winner'] not in decisive_outcomes:
                continue
            data.append({'question_id': row['question_id'],
                         'model_a': row['model_a'],
                         'model_b': row['model_b'],
                         'winner': row['winner']})
            key = (row['question_id'], row['model_a'], row['model_b'])
            # Indices refer to positions in data, not to line numbers in the file.
            instances.setdefault(key, []).append(len(data) - 1)
    return data, instances
def model_list(data):
    """
    Collects every distinct model that appears in data

    Parameters
    ----------
    data : list of dictionaries
        Keys: must include 'model_a', 'model_b'
        Each element corresponds to one row of the json dataset

    Returns
    -------
    list
        One entry per unique model, gathered from both the 'model_a' and
        the 'model_b' side of every row. The order is that of a set and is
        therefore not meaningful.
    """
    sides = ('model_a', 'model_b')
    unique_models = {row[side] for row in data for side in sides}
    return list(unique_models)
def sample_data(human_data, llm_data, human_instances, llm_instances, n=0, N=0):
    """
    Draws random samples from human and llm pairwise comparison datasets

    Parameters
    ----------
    human_data : list of dictionaries
        Human pairwise comparisons as returned by input_json
    llm_data : list of dictionaries
        LLM pairwise comparisons as returned by input_json
    human_instances : dictionary
        Indices of distinct human pairwise comparisons as returned by input_json
    llm_instances : dictionary
        Indices of distinct llm pairwise comparisons as returned by input_json
    n : int, optional
        Number of human samples. The default is 0, in which case all the
        matched samples are used. Values larger than the number of matched
        human/llm pairs are reduced to that number.
    N : int, optional
        Number of predicted samples. The default is 0, in which case all the
        remaining samples are used. Values larger than len(llm_data) - n are
        reduced to that number.

    Returns
    -------
    small_dataset_human : list of dictionaries
        Random samples from human_data
    small_dataset_llm : list of dictionaries
        Samples from llm_data where 'question_id','model_a','model_b' are the
        same as small_dataset_human (aligned position by position)
    big_dataset_llm : list of dictionaries
        Random samples from llm_data that are not in small_dataset_llm
    """
    # Match human rows with llm rows that share (question_id, model_a, model_b).
    # When one side has more rows than the other for a tuple, a random subset
    # of the larger side is used so that rows are paired one to one.
    paired_rows = {}
    for key in human_instances:
        if key not in llm_instances:
            continue
        human_rows = human_instances[key]
        llm_rows = llm_instances[key]
        pair_count = min(len(human_rows), len(llm_rows))
        human_pick = random.sample(human_rows, pair_count)
        llm_pick = random.sample(llm_rows, pair_count)
        for position, (human_idx, llm_idx) in enumerate(zip(human_pick, llm_pick)):
            paired_rows[(key, position)] = [human_idx, llm_idx]

    # Clamp the requested sizes to what is actually available.
    available_pairs = len(paired_rows)
    if n == 0 or n > available_pairs:
        n = available_pairs
    available_llm = len(llm_data) - n
    if N == 0 or N > available_llm:
        N = available_llm

    chosen_pairs = random.sample(list(paired_rows), n)
    small_dataset_human = [human_data[paired_rows[pair][0]] for pair in chosen_pairs]
    used_llm_rows = [paired_rows[pair][1] for pair in chosen_pairs]
    small_dataset_llm = [llm_data[idx] for idx in used_llm_rows]

    # Everything in llm_data that was not paired with a chosen human row.
    excluded = set(used_llm_rows)
    leftover_llm = [row for idx, row in enumerate(llm_data) if idx not in excluded]
    big_dataset_llm = random.sample(leftover_llm, N)
    return small_dataset_human, small_dataset_llm, big_dataset_llm
def summarize_dataset(models, llm_dataset, human_dataset=None):
    """
    Transforms data into matrix form

    Parameters
    ----------
    models : list
        List of unique models. The position of a model in this list is the
        row it occupies in the returned model matrices.
    llm_dataset : list of dictionaries
        List of llm pairwise comparison samples
    human_dataset : list of dictionaries, optional
        List of human pairwise comparison samples, for the same
        'question_id','model_a','model_b' as llm_dataset.
        The default is None; None and an empty list both mean that no human
        data is available.

    Returns
    -------
    summarized : dictionary
        Keys: 'winner_predicted', numpy.ndarray of shape (n, 1) indicating the winner of each pairwise comparison in llm_dataset (1:model_a won, 0:model_b won, 0.5:tie)
              'model_a_matrix', numpy.ndarray of shape (len(models), n) where each column is a one-hot vector corresponding to each sample indicating which model was model_a
              'model_b_matrix', numpy.ndarray of shape (len(models), n) where each column is a one-hot vector corresponding to each sample indicating which model was model_b
              'winner_human' (only when human_dataset is non-empty), numpy.ndarray of shape (n, 1) indicating the winner of each pairwise comparison in human_dataset (1:model_a won, 0:model_b won, 0.5:tie)

    Raises
    ------
    AssertionError
        If human_dataset is non-empty and its length differs from llm_dataset.
    ValueError
        If a sample in llm_dataset names a model that is not in models.
    """
    if human_dataset is None:
        human_dataset = []

    n = len(llm_dataset)
    has_human = len(human_dataset) > 0
    if has_human:
        assert len(human_dataset) == len(llm_dataset), 'human_dataset and llm_dataset must have the same length'

    def score(winner):
        # Any value other than 'model_a' / 'model_b' is treated as a tie.
        if winner == 'model_a':
            return 1.0
        if winner == 'model_b':
            return 0.0
        return 0.5

    def winner_column(dataset):
        scores = [score(sample['winner']) for sample in dataset]
        return np.array(scores, dtype=float).reshape(len(dataset), 1)

    # Build the model -> row lookup once instead of scanning models for every
    # sample. setdefault keeps the first position of a repeated model, which
    # is what list.index would return.
    row_of = {}
    for row, model in enumerate(models):
        row_of.setdefault(model, row)

    model_a_matrix = np.zeros(shape=(len(models), n))
    model_b_matrix = np.zeros(shape=(len(models), n))
    sides = ((model_a_matrix, 'model_a'), (model_b_matrix, 'model_b'))
    for col, sample in enumerate(llm_dataset):
        for matrix, side in sides:
            model = sample[side]
            if model not in row_of:
                raise ValueError(
                    '{!r} (sample {}, {}) is not in models'.format(model, col, side))
            matrix[row_of[model], col] = 1

    summarized = {'winner_predicted': winner_column(llm_dataset),
                  'model_a_matrix': model_a_matrix,
                  'model_b_matrix': model_b_matrix}
    if has_human:
        summarized['winner_human'] = winner_column(human_dataset)
    return summarized