-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathclassify_nodes.py
147 lines (122 loc) · 4.69 KB
/
classify_nodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# usage: python classify_nodes.py nodes.npy
import numpy as np
import pickle
import skimage.measure as skm
from glob import glob
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb
def getRegionFromMap(slice_npy):
thr = np.where(slice_npy > np.mean(slice_npy),0.,1.0)
label_image = skm.label(thr)
labels = label_image.astype(int)
regions = skm.regionprops(labels)
return regions
def getRegionMetricRow(fname = "nodules.npy"):
# fname, numpy array of dimension [#slices, 1, 512, 512] containing the images
seg = np.load(fname)
nslices = seg.shape[0]
#metrics
totalArea = 0.
avgArea = 0.
maxArea = 0.
avgEcc = 0.
avgEquivlentDiameter = 0.
stdEquivlentDiameter = 0.
weightedX = 0.
weightedY = 0.
numNodes = 0.
numNodesperSlice = 0.
# crude hueristic to filter some bad segmentaitons
# do not allow any nodes to be larger than 10% of the pixels to eliminate background regions
maxAllowedArea = 0.10 * 512 * 512
areas = []
eqDiameters = []
for slicen in range(nslices):
regions = getRegionFromMap(seg[slicen,0,:,:])
for region in regions:
if region.area > maxAllowedArea:
continue
totalArea += region.area
areas.append(region.area)
avgEcc += region.eccentricity
avgEquivlentDiameter += region.equivalent_diameter
eqDiameters.append(region.equivalent_diameter)
weightedX += region.centroid[0]*region.area
weightedY += region.centroid[1]*region.area
numNodes += 1
weightedX = weightedX / totalArea
weightedY = weightedY / totalArea
avgArea = totalArea / numNodes
avgEcc = avgEcc / numNodes
avgEquivlentDiameter = avgEquivlentDiameter / numNodes
stdEquivlentDiameter = np.std(eqDiameters)
maxArea = max(areas)
numNodesperSlice = numNodes*1. / nslices
return np.array([avgArea,maxArea,avgEcc,avgEquivlentDiameter,\
stdEquivlentDiameter, weightedX, weightedY, numNodes, numNodesperSlice])
def createFeatureDataset(nodfiles=None):
if nodfiles == None:
# directory of numpy arrays containing masks for nodules
# found via unet segmentation
noddir = "/training_set/"
nodfiles = glob(noddir +"*npy")
# dict with mapping between training examples and true labels
# the training set is the output masks from the unet segmentation
truthdata = pickle.load(open("truthdict.pkl",'r'))
numfeatures = 9
feature_array = np.zeros((len(nodfiles),numfeatures))
truth_metric = np.zeros((len(nodfiles)))
for i,nodfile in enumerate(nodfiles):
patID = nodfile.split("_")[2]
truth_metric[i] = truthdata[int(patID)]
feature_array[i] = getRegionMetricRow(nodfile)
np.save("dataY.npy", truth_metric)
np.save("dataX.npy", feature_array)
import scipy as sp
def logloss(act, pred):
epsilon = 1e-15
pred = sp.maximum(epsilon, pred)
pred = sp.minimum(1-epsilon, pred)
ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
ll = ll * -1.0/len(act)
return ll
def classifyData():
X = np.load("dataX.npy")
Y = np.load("dataY.npy")
kf = KFold(Y, n_folds=3)
y_pred = Y * 0
for train, test in kf:
X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
clf = RF(n_estimators=100, n_jobs=3)
clf.fit(X_train, y_train)
y_pred[test] = clf.predict(X_test)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
# All Cancer
print("Predicting all positive")
y_pred = np.ones(Y.shape)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
# No Cancer
print("Predicting all negative")
y_pred = Y*0
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
# try XGBoost
print("XGBoost")
kf = KFold(Y, n_folds=3)
y_pred = Y * 0
for train, test in kf:
X_train, X_test, y_train, y_test = X[train,:], X[test,:], Y[train], Y[test]
clf = xgb.XGBClassifier(objective="binary:logistic")
clf.fit(X_train, y_train)
y_pred[test] = clf.predict(X_test)
print(classification_report(Y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",logloss(Y, y_pred))
if __name__ == "__main__":
from sys import argv
getRegionMetricRow(argv[1:])
classifyData()