imbalanced_weights.py
import numpy as np
import matplotlib.pyplot as plt


def cal_multilabel_weights(Data_labels, threshold_percentile=0):
    """
    Encode each multi-hot label vector into a single class index and compute
    per-sample weights inversely proportional to the frequency of that index.
    Label combinations rarer than the given percentile threshold are collapsed
    into a special class (-1).
    input: Data_labels (multi-hot encoded labels, one row per sample)
    output: sample_weights, Data_labels_new, label_to_count, new_index_dict
    """
    # key: multi-hot encoding label as a string
    # val: list of sample indices carrying that label combination
    labels_dict = {}
    for idx, data_label in enumerate(Data_labels):
        key = ''.join([str(l) for l in data_label])
        if key not in labels_dict:
            labels_dict[key] = [idx]
        else:
            labels_dict[key].append(idx)

    # for each label combination (multi-hot encoding), record its frequency
    labels_dict_len = []
    labels_dict_key = []
    for key in labels_dict.keys():
        labels_dict_len.append(len(labels_dict[key]))
        labels_dict_key.append(key)
    labels_dict_len = np.array(labels_dict_len)
    labels_dict_key = np.array(labels_dict_key)

    # keep only combinations at least as frequent as the percentile threshold
    threshold = 0
    if threshold_percentile > 0:
        threshold = np.percentile(labels_dict_len, threshold_percentile)
    labels_dict_len_threshold = labels_dict_len[labels_dict_len >= threshold]
    labels_dict_key_threshold = labels_dict_key[labels_dict_len >= threshold]

    # old label (multi-hot encoding) => new label (class index)
    new_index_dict = {}
    for i in range(len(labels_dict_key_threshold)):
        new_index_dict[labels_dict_key_threshold[i]] = i

    # map every sample to its new label; samples whose label combination was
    # filtered out by the threshold fall into the special class (-1)
    Data_labels_new = np.zeros((len(Data_labels),), dtype=int)
    special_class = -1
    for key, vals in labels_dict.items():
        for val in vals:
            if key not in labels_dict_key_threshold:
                Data_labels_new[val] = special_class
            else:
                Data_labels_new[val] = new_index_dict[key]

    # distribution of classes in the dataset
    label_to_count = {}
    for label in Data_labels_new:
        if label in label_to_count:
            label_to_count[label] += 1
        else:
            label_to_count[label] = 1

    # weight for each sample: inverse of its class frequency
    sample_weights = [1.0 / label_to_count[label]
                      for label in Data_labels_new]
    return sample_weights, Data_labels_new, label_to_count, new_index_dict
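
# Illustrative sketch (not part of the original module): a tiny worked example
# of cal_multilabel_weights. The toy label matrix below is an assumption made
# purely for demonstration; three samples share the combination "10" and one
# sample has "01", so the rare sample receives the largest weight.
#
#   toy_labels = np.array([[1, 0], [1, 0], [1, 0], [0, 1]])
#   w, y_new, counts, index = cal_multilabel_weights(toy_labels)
#   # y_new  -> [0, 0, 0, 1]
#   # counts -> {0: 3, 1: 1}
#   # w      -> [1/3, 1/3, 1/3, 1.0]
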
def proportional_weight(Data_labels, class_idx):
    # weight of each selected class proportional to its positive-label count,
    # normalised to sum to 1
    class_weights = np.sum(Data_labels, axis=0)[class_idx]
    return class_weights / np.sum(class_weights)

def inverse_weight(Data_labels, class_idx, K=1):
    # alternative (kept commented out): plain inverse frequency, normalised
    # inv_class_weights = 1.0 / np.sum(Data_labels, axis=0)[class_idx]
    # inv_class_weights = inv_class_weights / np.sum(inv_class_weights)
    # return inv_class_weights
    # log-scaled inverse frequency: log((N - n_c) / n_c + K) for each class c,
    # where n_c is the positive count (+1 smoothing) and N is the total count
    Data_labels = Data_labels[:, class_idx]
    N_labels = np.sum(Data_labels, axis=0).flatten() + 1
    return np.log((np.sum(N_labels) - N_labels) / N_labels + K)

def inverse_weight_no_log(Data_labels, class_idx, K=1):
    # alternative (kept commented out): plain inverse frequency, normalised
    # inv_class_weights = 1.0 / np.sum(Data_labels, axis=0)[class_idx]
    # inv_class_weights = inv_class_weights / np.sum(inv_class_weights)
    # return inv_class_weights
    # same ratio as inverse_weight, but without the log scaling
    Data_labels = Data_labels[:, class_idx]
    N_labels = np.sum(Data_labels, axis=0).flatten() + 1
    return (np.sum(N_labels) - N_labels) / N_labels + K
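

if __name__ == "__main__":
    # Illustrative sketch only (not part of the original module): compare the
    # three class-weighting schemes on an assumed toy multi-hot label matrix
    # with class counts [3, 2, 1].
    toy_labels = np.array([[1, 0, 0],
                           [1, 0, 0],
                           [1, 0, 0],
                           [0, 1, 0],
                           [0, 1, 1]])
    class_idx = [0, 1, 2]

    # proportional: frequent classes get larger weights (sums to 1)
    print(proportional_weight(toy_labels, class_idx))
    # inverse weights (log-scaled and raw): rare classes get larger weights
    print(inverse_weight(toy_labels, class_idx, K=1))
    print(inverse_weight_no_log(toy_labels, class_idx, K=1))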