-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_data.py
74 lines (52 loc) · 2.73 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 20 18:41:02 2017
@author: Gerardo Cervantes
"""
import numpy as np
from sklearn import preprocessing
from sklearn.utils import shuffle
#Preprocess the data to prepare it for the neural network.
#Higher level function to preprocess the data with various techniques.
def preprocess(x_train, y_train, x_validation):
# x_train, x_validation = add_missing_column_features(x_train, x_validation)
# x_train = change_missing_values_to_mean(x_train)
# x_validation = change_missing_values_to_mean(x_validation)
# n = 60
# x_train, x_validation, mn, evals = pca(x_train, x_validation, n)
x_train, x_validation = normalizeData(x_train, x_validation)
return x_train, y_train, x_validation
#Normalizes the data so the mean is 0 and standard deviation is 1.
#Parameters is the data as numpy matrix of size (samples, feature)
#Normalizes x_train and x_val with the same normalization.
#Returns the normalized data.
def normalizeData(x_train, x_validation):
x_train_len = len(x_train)
normalized_x = preprocessing.normalize(np.concatenate((x_train, x_validation), axis = 0))
x_train = normalized_x[:x_train_len,:]
x_validation = normalized_x[x_train_len:,:]
return x_train, x_validation
#x_matrix in a numpy matrix of size (samples, feature).
#Replaces missing values (values with -1) to the mean of that column
def change_missing_values_to_mean(x_matrix):
x_matrix[x_matrix == -1] = np.nan #Replaces negative values with NaN, so we can take mean
avg_features = np.nanmean(x_matrix, axis = 0).astype(np.float32) #Takes mean, ignores NaN values
avg_features = np.resize(avg_features, (1,len(avg_features)))
avg_features = np.repeat(avg_features, len(x_matrix), axis = 0)
np.copyto(x_matrix, avg_features, where = np.isnan(x_matrix))
return x_matrix
#Original data is imalanced with ratio of 1:26, with most not claiming insurance
#Balances data by duplicating data of those who claimed insurance.
#This technique can sometimes help, but can usually cause overfitting
def balance_data_upscaling(x, y, factor):
train_zero_indices = np.where(y == 0)[0] #Returns indices where y_train == 1
train_one_indices = np.where(y == 1)[0] #Returns indices where y_train == 1
x_train_zero = x[train_zero_indices]
y_train_zero = y[train_zero_indices]
x_train_one = x[train_one_indices]
y_train_one = y[train_one_indices]
x_train_one = np.repeat(x_train_one, factor, axis = 0)
y_train_one = np.repeat(y_train_one, factor, axis = 0)
x,y = np.concatenate((x_train_zero, x_train_one), axis = 0), np.concatenate((y_train_zero, y_train_one), axis = 0)
x, y = shuffle(x, y, random_state=0)
return x, y