-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPED_APPEND_ANALYSIS_R.R
123 lines (79 loc) · 3.02 KB
/
PED_APPEND_ANALYSIS_R.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Load the tidyverse (dplyr verbs, ggplot2 and tibble helpers used below).
library(tidyverse)

# Read the raw data; the path is resolved relative to the working directory.
append <- read.csv("ped_append.csv", header = TRUE)

# Add a sequential row identifier column named "ID" as the first column.
append <- tibble::rowid_to_column(append, "ID")

# View() opens a data viewer, so only call it in an interactive session;
# this keeps the script runnable in batch mode (e.g. via Rscript).
if (interactive()) {
  View(append)
}
# Clean the data ----
# Build one table per measurement with the NA rows removed. Each table keeps
# the row ID so the measurements can be re-aligned by joining.
wbc <- append %>%
  select(WBC_Count, ID) %>%
  na.omit()
rbc <- append %>%
  select(RBC_Count, ID) %>%
  na.omit()
np <- append %>%
  select(Neutrophil_Percentage, ID) %>%
  na.omit()

# Join on ID explicitly instead of relying on the implicit natural join, so
# the key cannot change silently if the tables ever share another column.
# NOTE(review): this is a left join, so neutrophil rows without a WBC
# measurement stay in the result with WBC_Count = NA -- confirm that is
# intended rather than an inner join.
wbc_np <- left_join(np, wbc, by = "ID")

# View() needs an interactive session; skip it in batch runs.
if (interactive()) {
  View(wbc_np)
}

# Keep only the rows that also have an RBC measurement.
whole <- inner_join(wbc_np, rbc, by = "ID")
# Test quantitatively for linear relationships by computing correlation
# coefficients (cor.test() uses Pearson's method by default). Every estimate
# is printed explicitly so it is also shown when the script is run through
# source(), where bare expressions are not auto-printed.

# WBC count vs. age, on the raw data.
coeff_1 <- cor.test(append$WBC_Count, append$Age)
print(coeff_1$estimate)
# This demonstrates that there is a very low correlation, and therefore a weak
# linear relationship, between these two variables.

# Neutrophil percentage vs. WBC count, on the joined data.
coeff_2 <- cor.test(whole$Neutrophil_Percentage, whole$WBC_Count)
print(coeff_2$estimate)
# The value 0.6634112 demonstrates a stronger linear relationship than the one
# found in coeff_1.

# Neutrophil percentage vs. RBC count, on the joined data.
coeff_3 <- cor.test(whole$Neutrophil_Percentage, whole$RBC_Count)
print(coeff_3$estimate)
# coeff_3 returns a value of 0.006627787, which indicates a low correlation.
# Inspect the column types of the raw data.
str(append)

# Distribution of WBC counts. WBC_Count is numeric (it is passed to cor.test),
# so bin it with a histogram; geom_bar() would draw one bar per distinct value.
WBC_dist <- append %>%
  ggplot(aes(WBC_Count)) +
  geom_histogram(bins = 30)
WBC_dist

# Check for outliers in the WBC distribution. Map WBC_Count to y only, which
# yields a single box; mapping it to both x and y makes ggplot2 warn about a
# continuous x aesthetic without a grouping.
plot <- whole %>%
  ggplot(aes(y = WBC_Count)) +
  geom_boxplot()
plot

# Keep only rows with a WBC count strictly below the cut-off. filter() also
# drops rows where the comparison is NA, i.e. rows with a missing WBC_Count.
threshold <- 32
wbc_clean <- whole %>%
  filter(WBC_Count < threshold)

# Boxplot of the retained WBC counts.
plot_2 <- wbc_clean %>%
  ggplot(aes(y = WBC_Count)) +
  geom_boxplot() +
  ggtitle("WBC Count Boxplot")
plot_2
## Building a simple linear model ----
# Split the data roughly 60/40: every row is independently assigned to the
# training set with probability 0.6, so the realised proportions vary.
# Fix the RNG seed first so the split, and every model statistic derived from
# it, can be reproduced from run to run.
set.seed(42)
sample <- sample(
  c(TRUE, FALSE),
  nrow(wbc_clean),
  replace = TRUE,
  prob = c(0.6, 0.4)
)

# Subset data points into train and test sets using the logical mask.
train <- wbc_clean[sample, ]
test <- wbc_clean[!sample, ]
# Fit the first simple linear model: neutrophil percentage explained by WBC
# count, using the training rows only.
model <- lm(Neutrophil_Percentage ~ WBC_Count, data = train)

# Quantify the model fit.
# The Residual Standard Error (RSE) can be read off the printed summary or
# obtained directly with sigma(); both are shown here.
model_summary <- summary(model)
model_summary
sigma(model)

# R^2 is the proportion of the variance in the response
# (Neutrophil_Percentage) that the model explains. An earlier run reported
# 0.4780326, i.e. about 48% of the variance in neutrophil percentage is
# explained by WBC count.
model_summary$r.squared

# To put that fit in context, build a second model with RBC count as the
# single predictor and compare the two R^2 values.
model_2 <- lm(Neutrophil_Percentage ~ RBC_Count, data=train)
model_2
r_sq_2 <- summary(model_2)$r.squared
r_sq_2
r_sq <- model_summary$r.squared
r_sq
## Checking model residuals ----
# Store the fitted values and residuals of `model` alongside the training rows.
train[["estimate"]] <- predict(model)
train[["residuals"]] <- resid(model)

# Refit both single-predictor formulas on the test split and count their
# residuals, then show nrow(test) for comparison: the counts differ from it
# only if lm() dropped rows (e.g. because of missing values).
# NOTE(review): the test split is only used for these counts; no
# out-of-sample prediction is made with `model` or `model_2` here.
rbc_test_fit <- lm(Neutrophil_Percentage ~ RBC_Count, data = test)
wbc_test_fit <- lm(Neutrophil_Percentage ~ WBC_Count, data = test)
length(residuals(rbc_test_fit))
length(residuals(wbc_test_fit))
nrow(test)

# Show the full summaries of both training-set models.
summary(model)
summary(model_2)
# Visualize the fit.
# The response (Neutrophil_Percentage) goes on the y axis and the predictor on
# the x axis, matching the model formulas, so the "lm" smoother draws the
# regression of neutrophil percentage on the predictor rather than the
# reverse. The red curve is ggplot2's default smoother, drawn without a
# confidence band, to reveal departures from linearity.
ggplot(train, aes(WBC_Count, Neutrophil_Percentage)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(se = FALSE, color = "red")

ggplot(train, aes(RBC_Count, Neutrophil_Percentage)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_smooth(se = FALSE, color = "red")
# Multiple linear regression: neutrophil percentage explained jointly by RBC
# and WBC counts, fitted on the training rows.
model_3 <- lm(Neutrophil_Percentage ~ RBC_Count + WBC_Count, data=train)

# Show the fitted coefficients first, then the full summary.
model_3
model_3_summary <- summary(model_3)
model_3_summary