-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload-crossvalidation.R
144 lines (137 loc) · 7.31 KB
/
load-crossvalidation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
##
## load cross validation results from an SVM run output
##
## HBB-0.2-0-80.2.8-2.crossvalidation.txt
## alpha=0.2 kappa=0 minsup=80 minsize=2 minlen=8 run=2
##
## these will be used by plot-crossvalidation.R as well
study = readline(prompt="Study (ex. SCD): ")
graph = readline(prompt="Graph (ex. HBB): ")
requireHomozygous = as.logical(readline(prompt="Require homozygous calls (TRUE/FALSE): "))
if (requireHomozygous) {
mergeMode = "HOM mode: sample is called CASE iff both paths are called CASE"
} else {
mergeMode = "HET mode: sample is called CASE if at least ONE path is called CASE"
}
## standard, nodes-based kappa
alphaValues = c("0.2","0.5","0.8","1.0")
kappaValues = c(0,1,2,3,4,5,10)
minsupValues = c(1,2,4,10,20,50,100)
minsizeValues = c(1,2,4,10,20,50,100)
minlenValues = c(1,2,4,10,20,50,100,500)
## bases-based kappa values
## kappaValues = c(0,1,2,3,12,48,192,768)
## single alpha, kappa
## alphaValues = c("0.2")
## kappaValues = c(3)
## our resulting data frame
results = data.frame(graph=character(), alpha=numeric(), kappa=numeric(),
minsup=numeric(), minsize=numeric(), minlen=numeric(),
maxIndex=numeric(), C=numeric(), gamma=numeric(), nrFold=numeric(),
cases=numeric(), controls=numeric(), caseFails=numeric(), controlFails=numeric(),
TPR=numeric(), FPR=numeric(), precision=numeric(), recall=numeric(), MCC=numeric(), F1=numeric())
## loop through all the files, adding to results
for (alpha in alphaValues) {
for (kappa in kappaValues) {
prefix = paste(graph,"-",alpha,"-",kappa, sep="")
for (minsup in minsupValues) {
for (minsize in minsizeValues) {
for (minlen in minlenValues) {
for (run in 1:3) {
filename = paste(prefix,"-",minsup,".",minsize,".",minlen,"-",run,".crossvalidation.txt", sep="")
filesize = file.size(filename)
if (!is.na(filesize) && filesize>0) {
cat(filename, "\n")
## read header
## HBB-0.2-0-10.4.1.svm.scale.txt 28 0.03125 0.125 10 40 40 2 0
con = file(filename,"r")
header = readLines(con, n=1)
close(con)
parts = strsplit(header, "\t", fixed=TRUE)[[1]]
svmfile = parts[1]
maxIndex = as.numeric(parts[2])
C = as.numeric(parts[3])
gamma = as.numeric(parts[4])
nrFold = as.numeric(parts[5])
## cases and controls are PATH counts, not sample counts!
cases = as.numeric(parts[6])
controls = as.numeric(parts[7])
## read data to merge the path calls per sample
## HG00122.1 ctrl true
df = read.table(filename, skip=1, colClasses=c("character", "character", "logical"))
colnames(df) = c("Path", "Label", "Correct")
caseFails = 0
controlFails = 0
sample0 = ""
correct0 = FALSE
for (i in 1:length(df$Path)) {
path = df$Path[i]
label = df$Label[i]
correct = df$Correct[i]
sample = strsplit(path, ".", fixed=TRUE)[[1]][1]
if (sample==sample0) {
## second path for this sample, so make the combined call
if (requireHomozygous) {
if (label=="case") {
## HOM case: both paths must be called "case"
correct = correct && correct0
} else {
## HOM control: either path may be called "ctrl"
correct = correct || correct0
}
} else {
if (label=="case") {
## HET case: either path may be called "case"
correct = correct || correct0
} else {
## HET control: both paths must be called "ctrl"
correct = correct && correct0
}
}
## increment the case/control failure counts per SAMPLE
if (!correct) {
if (label=="case") {
caseFails = caseFails + 1
} else if (label=="ctrl") {
controlFails = controlFails + 1
}
}
} else {
## first path for this sample
sample0 = sample
correct0 = correct
}
}
## standard classifier rates for samples, not paths
caseSamples = cases/2
controlSamples = controls/2
TP = caseSamples - caseFails
FP = controlFails
TN = controlSamples - controlFails
FN = caseFails
TPR = TP/caseSamples
FPR = FP/controlSamples
precision = TP/(TP+FP)
recall = TP/(TP+FN)
## Matthews correlation coefficient handling zero denominator
denom = (TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)
if (denom==0) {
denom = 1
}
MCC = (TP*TN-FP*FN)/sqrt(denom)
## F1 score
F1 = 2*precision*recall/(precision+recall)
## append to the results dataframe
df = data.frame(graph=graph, alpha=as.numeric(alpha), kappa=kappa,
minsup=minsup, minsize=minsize, minlen=minlen,
maxIndex=maxIndex, C=C, gamma=gamma, nrFold=nrFold,
cases=cases, controls=controls, caseFails=caseFails, controlFails=controlFails,
TPR=TPR, FPR=FPR, precision=precision, recall=recall, MCC=MCC, F1=F1)
results = rbind(results, df)
}
}
}
}
}
}
}