My Functions
# Changing class of varibles
convert.magic <- function(obj, type)
{
FUN1 <- switch(type,
character = as.character,
numeric = as.numeric,
factor = as.factor,
logical = as.logical)
out <- lapply(obj, FUN1)
as.data.frame(out)
}
# remove outlier function
outlier.qr <- function(data, var_name)
{
data$var_name <- eval(substitute(var_name), data)
iqr <- IQR(data$`var_name`, na.rm = TRUE)
qr <- quantile(data$var_name, c(0.25,0.75))
Q1 <- qr[[1]] - 1.5*iqr
Q3 <- qr[[2]] + 1.5*iqr
data <- data %>% filter(var_name < Q3 & var_name > Q1)
return(data)
}
Pre-processing
# open
data <- read_csv('Dataset/heart.csv')
# varible types
# look at type of each varible
sapply(data, class)
## age sex cp trestbps chol fbs restecg
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## thalach exang oldpeak slope ca thal target
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# use convert.magic function
factor.varibles <- c('sex', 'cp', 'restecg', 'slope', 'ca','thal')
logical.varibles <- c('fbs', 'exang', 'target')
numerical.varibles <- c('age', 'trestbps', 'chol', 'thalach', 'oldpeak')
data[,factor.varibles] <- convert.magic(data[,factor.varibles], "factor")
data[,logical.varibles] <- convert.magic(data[,logical.varibles], "logical")
# check
sapply(data, class)
## age sex cp trestbps chol fbs restecg
## "numeric" "factor" "factor" "numeric" "numeric" "logical" "factor"
## thalach exang oldpeak slope ca thal target
## "numeric" "logical" "numeric" "factor" "factor" "factor" "logical"
# Missing vaules
data %>% summarise_each(funs(sum(is.na(.))))
# outliers
# numerical varibles
plot.table <- data[, numerical.varibles] %>% gather(key = "variable", value = "value")
ggplot(data = plot.table, aes(x=variable, y=value)) + geom_violin(scale = 'area', fill = '#595959', colour = NA) +
geom_boxplot(fill = NA, colour = 'orange', outlier.colour = '#00bbff') + facet_wrap( ~ variable, scales="free")

# other varibles
plot.table <- data[, c(factor.varibles, logical.varibles)] %>% gather(key = "variable", value = "value") %>%
group_by(variable, value) %>% summarise(count = n())
ggplot(data = plot.table, aes(x = value, y = count)) + geom_bar(stat = 'identity', colour = "orange") +
facet_wrap( ~ variable, scales="free")

# remove outliers
# List of Vraibles That are Having Outliers Removed
varNames <- lapply(numerical.varibles, as.name)
# For Loop to Remove Outliers
for (i in seq_along(varNames))
{
data <- outlier.qr(data, eval(varNames[[i]]))
}
# remove extra coloum
data <- data[, !(names(data) %in% c('var_name'))]
# resahpe for glm model
dmy <- dummyVars(" ~ + cp + restecg + slope + ca + thal", data = data[])
trsf <- data.frame(predict(dmy, newdata = data))
data <- trsf %>% cbind(data[,c("age", "sex", "trestbps", "chol", "fbs", "thalach", "exang",
"oldpeak", "target")])
Data sample
data %>% head()
Subset data
# take 80% of rows as trning data
sam.size <- 0.8 * nrow(data)
# traning rows
train.rows <- sample(seq_len(nrow(data)), size = sam.size)
# subseting
train.data <- data[train.rows, ]
test.data <- data[-train.rows, ]
Models
# full modle
full.fit <- glm(target ~ ., data = train.data)
summary(full.fit)
##
## Call:
## glm(formula = target ~ ., data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.04735 -0.17197 0.03379 0.21579 0.98640
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7679793 0.5673232 1.354 0.177363
## cp.0 -0.2216834 0.0955712 -2.320 0.021374 *
## cp.1 -0.0670189 0.1036497 -0.647 0.518638
## cp.2 -0.0047483 0.0964950 -0.049 0.960803
## cp.3 NA NA NA NA
## restecg.0 -0.1143420 0.3612756 -0.316 0.751957
## restecg.1 -0.0158170 0.3617621 -0.044 0.965169
## restecg.2 NA NA NA NA
## slope.0 0.0004451 0.1164068 0.004 0.996953
## slope.1 -0.0963362 0.0617723 -1.560 0.120451
## slope.2 NA NA NA NA
## ca.0 -0.0200994 0.1638379 -0.123 0.902485
## ca.1 -0.2996342 0.1701429 -1.761 0.079754 .
## ca.2 -0.3536612 0.1835665 -1.927 0.055445 .
## ca.3 -0.2881011 0.1923433 -1.498 0.135749
## ca.4 NA NA NA NA
## thal.0 -0.0496098 0.2540504 -0.195 0.845375
## thal.1 0.1865771 0.1115010 1.673 0.095827 .
## thal.2 0.2214727 0.0604016 3.667 0.000315 ***
## thal.3 NA NA NA NA
## age 0.0044995 0.0032716 1.375 0.170573
## sex1 -0.1197558 0.0597398 -2.005 0.046352 *
## trestbps -0.0025390 0.0016950 -1.498 0.135726
## chol -0.0003307 0.0005615 -0.589 0.556499
## fbsTRUE 0.0226063 0.0718247 0.315 0.753286
## thalach 0.0023778 0.0014050 1.692 0.092114 .
## exangTRUE -0.1149108 0.0623654 -1.843 0.066876 .
## oldpeak -0.0594872 0.0310970 -1.913 0.057182 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1191267)
##
## Null deviance: 54.215 on 222 degrees of freedom
## Residual deviance: 23.825 on 200 degrees of freedom
## AIC: 182.12
##
## Number of Fisher Scoring iterations: 2
# backwards
backwards.fit <- step(full.fit, direction = 'backward')
## Start: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 +
## ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + thal.3 + age + sex +
## trestbps + chol + fbs + thalach + exang + oldpeak
##
##
## Step: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 +
## ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + age + sex + trestbps +
## chol + fbs + thalach + exang + oldpeak
##
##
## Step: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 +
## ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps +
## chol + fbs + thalach + exang + oldpeak
##
##
## Step: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## restecg.2 + slope.0 + slope.1 + ca.0 + ca.1 + ca.2 + ca.3 +
## thal.0 + thal.1 + thal.2 + age + sex + trestbps + chol +
## fbs + thalach + exang + oldpeak
##
##
## Step: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## slope.0 + slope.1 + ca.0 + ca.1 + ca.2 + ca.3 + thal.0 +
## thal.1 + thal.2 + age + sex + trestbps + chol + fbs + thalach +
## exang + oldpeak
##
##
## Step: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + restecg.1 + slope.0 +
## slope.1 + ca.0 + ca.1 + ca.2 + ca.3 + thal.0 + thal.1 + thal.2 +
## age + sex + trestbps + chol + fbs + thalach + exang + oldpeak
##
## Df Deviance AIC
## - slope.0 1 23.825 180.12
## - restecg.1 1 23.826 180.13
## - cp.2 1 23.826 180.13
## - ca.0 1 23.827 180.14
## - thal.0 1 23.830 180.17
## - fbs 1 23.837 180.24
## - restecg.0 1 23.837 180.24
## - chol 1 23.867 180.51
## - cp.1 1 23.875 180.59
## <none> 23.825 182.12
## - age 1 24.051 182.22
## - ca.3 1 24.093 182.61
## - trestbps 1 24.093 182.61
## - slope.1 1 24.115 182.82
## - thal.1 1 24.159 183.22
## - thalach 1 24.167 183.30
## - ca.1 1 24.195 183.56
## - exang 1 24.230 183.88
## - oldpeak 1 24.261 184.17
## - ca.2 1 24.267 184.22
## - sex 1 24.304 184.56
## - cp.0 1 24.466 186.04
## - thal.2 1 25.427 194.63
##
## Step: AIC=180.12
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + restecg.1 + slope.1 +
## ca.0 + ca.1 + ca.2 + ca.3 + thal.0 + thal.1 + thal.2 + age +
## sex + trestbps + chol + fbs + thalach + exang + oldpeak
##
## Df Deviance AIC
## - restecg.1 1 23.826 178.13
## - cp.2 1 23.826 178.13
## - ca.0 1 23.827 178.14
## - thal.0 1 23.830 178.17
## - restecg.0 1 23.837 178.24
## - fbs 1 23.838 178.24
## - chol 1 23.867 178.51
## - cp.1 1 23.875 178.59
## <none> 23.825 180.12
## - age 1 24.051 180.22
## - trestbps 1 24.093 180.61
## - ca.3 1 24.094 180.63
## - slope.1 1 24.164 181.27
## - thal.1 1 24.165 181.28
## - thalach 1 24.170 181.32
## - ca.1 1 24.195 181.56
## - exang 1 24.230 181.88
## - ca.2 1 24.269 182.24
## - sex 1 24.304 182.56
## - oldpeak 1 24.340 182.89
## - cp.0 1 24.466 184.04
## - thal.2 1 25.429 192.65
##
## Step: AIC=178.13
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + slope.1 + ca.0 + ca.1 +
## ca.2 + ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps +
## chol + fbs + thalach + exang + oldpeak
##
## Df Deviance AIC
## - cp.2 1 23.826 176.13
## - ca.0 1 23.827 176.14
## - thal.0 1 23.830 176.17
## - fbs 1 23.838 176.24
## - chol 1 23.868 176.52
## - cp.1 1 23.875 176.59
## <none> 23.826 178.13
## - age 1 24.059 178.30
## - trestbps 1 24.093 178.61
## - ca.3 1 24.095 178.63
## - slope.1 1 24.164 179.27
## - thal.1 1 24.165 179.28
## - thalach 1 24.170 179.33
## - ca.1 1 24.196 179.56
## - exang 1 24.231 179.89
## - ca.2 1 24.269 180.24
## - sex 1 24.305 180.57
## - restecg.0 1 24.312 180.63
## - oldpeak 1 24.341 180.90
## - cp.0 1 24.466 182.04
## - thal.2 1 25.435 190.71
##
## Step: AIC=176.13
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.0 + ca.1 + ca.2 +
## ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps +
## chol + fbs + thalach + exang + oldpeak
##
## Df Deviance AIC
## - ca.0 1 23.828 174.14
## - thal.0 1 23.831 174.17
## - fbs 1 23.839 174.25
## - chol 1 23.869 174.53
## - cp.1 1 23.926 175.07
## <none> 23.826 176.13
## - age 1 24.061 176.32
## - trestbps 1 24.094 176.63
## - ca.3 1 24.095 176.63
## - slope.1 1 24.165 177.28
## - thal.1 1 24.168 177.30
## - thalach 1 24.172 177.35
## - ca.1 1 24.196 177.57
## - exang 1 24.231 177.89
## - ca.2 1 24.272 178.27
## - sex 1 24.307 178.59
## - restecg.0 1 24.312 178.63
## - oldpeak 1 24.342 178.91
## - thal.2 1 25.436 188.71
## - cp.0 1 25.464 188.96
##
## Step: AIC=174.14
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 +
## thal.0 + thal.1 + thal.2 + age + sex + trestbps + chol +
## fbs + thalach + exang + oldpeak
##
## Df Deviance AIC
## - thal.0 1 23.832 172.19
## - fbs 1 23.841 172.27
## - chol 1 23.872 172.56
## - cp.1 1 23.930 173.10
## <none> 23.828 174.14
## - age 1 24.061 174.32
## - trestbps 1 24.095 174.63
## - slope.1 1 24.165 175.28
## - thal.1 1 24.168 175.31
## - thalach 1 24.173 175.35
## - exang 1 24.232 175.90
## - sex 1 24.309 176.60
## - restecg.0 1 24.320 176.70
## - oldpeak 1 24.364 177.11
## - ca.3 1 24.539 178.70
## - thal.2 1 25.439 186.74
## - cp.0 1 25.476 187.06
## - ca.2 1 25.759 189.53
## - ca.1 1 26.189 193.22
##
## Step: AIC=172.19
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 +
## thal.1 + thal.2 + age + sex + trestbps + chol + fbs + thalach +
## exang + oldpeak
##
## Df Deviance AIC
## - fbs 1 23.844 170.30
## - chol 1 23.875 170.59
## - cp.1 1 23.933 171.13
## <none> 23.832 172.19
## - age 1 24.067 172.38
## - trestbps 1 24.100 172.68
## - slope.1 1 24.168 173.31
## - thal.1 1 24.180 173.42
## - thalach 1 24.185 173.47
## - exang 1 24.241 173.98
## - sex 1 24.309 174.61
## - restecg.0 1 24.329 174.79
## - oldpeak 1 24.365 175.12
## - ca.3 1 24.539 176.71
## - cp.0 1 25.477 185.07
## - thal.2 1 25.506 185.33
## - ca.2 1 25.759 187.53
## - ca.1 1 26.193 191.25
##
## Step: AIC=170.3
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 +
## thal.1 + thal.2 + age + sex + trestbps + chol + thalach +
## exang + oldpeak
##
## Df Deviance AIC
## - chol 1 23.885 168.68
## - cp.1 1 23.951 169.30
## <none> 23.844 170.30
## - age 1 24.082 170.51
## - trestbps 1 24.108 170.76
## - thal.1 1 24.190 171.51
## - slope.1 1 24.197 171.57
## - thalach 1 24.198 171.59
## - exang 1 24.245 172.02
## - sex 1 24.317 172.68
## - restecg.0 1 24.331 172.80
## - oldpeak 1 24.385 173.31
## - ca.3 1 24.540 174.71
## - thal.2 1 25.506 183.33
## - cp.0 1 25.543 183.65
## - ca.2 1 25.764 185.57
## - ca.1 1 26.210 189.39
##
## Step: AIC=168.68
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 +
## thal.1 + thal.2 + age + sex + trestbps + thalach + exang +
## oldpeak
##
## Df Deviance AIC
## - cp.1 1 24.009 167.84
## <none> 23.885 168.68
## - age 1 24.102 168.70
## - trestbps 1 24.160 169.23
## - thalach 1 24.228 169.86
## - slope.1 1 24.233 169.91
## - thal.1 1 24.277 170.31
## - exang 1 24.305 170.57
## - sex 1 24.325 170.75
## - restecg.0 1 24.405 171.49
## - oldpeak 1 24.424 171.66
## - ca.3 1 24.608 173.34
## - thal.2 1 25.572 181.90
## - cp.0 1 25.619 182.31
## - ca.2 1 25.922 184.93
## - ca.1 1 26.279 187.99
##
## Step: AIC=167.84
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 +
## thal.2 + age + sex + trestbps + thalach + exang + oldpeak
##
## Df Deviance AIC
## - age 1 24.212 167.72
## <none> 24.009 167.84
## - trestbps 1 24.264 168.20
## - thalach 1 24.357 169.04
## - slope.1 1 24.362 169.09
## - thal.1 1 24.400 169.44
## - exang 1 24.433 169.74
## - oldpeak 1 24.474 170.12
## - sex 1 24.497 170.32
## - restecg.0 1 24.513 170.47
## - ca.3 1 24.714 172.29
## - cp.0 1 25.626 180.37
## - thal.2 1 25.641 180.51
## - ca.2 1 26.055 184.07
## - ca.1 1 26.336 186.46
##
## Step: AIC=167.72
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 +
## thal.2 + sex + trestbps + thalach + exang + oldpeak
##
## Df Deviance AIC
## - trestbps 1 24.382 167.27
## - thalach 1 24.425 167.67
## <none> 24.212 167.72
## - slope.1 1 24.564 168.93
## - thal.1 1 24.648 169.70
## - restecg.0 1 24.661 169.81
## - exang 1 24.668 169.88
## - oldpeak 1 24.674 169.93
## - ca.3 1 24.823 171.27
## - sex 1 24.825 171.29
## - thal.2 1 25.823 180.08
## - cp.0 1 25.927 180.97
## - ca.2 1 26.067 182.18
## - ca.1 1 26.342 184.52
##
## Step: AIC=167.27
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 +
## thal.2 + sex + thalach + exang + oldpeak
##
## Df Deviance AIC
## - thalach 1 24.601 167.27
## <none> 24.382 167.27
## - slope.1 1 24.711 168.26
## - thal.1 1 24.769 168.78
## - exang 1 24.803 169.09
## - restecg.0 1 24.863 169.63
## - oldpeak 1 24.949 170.40
## - sex 1 24.954 170.45
## - ca.3 1 25.088 171.64
## - thal.2 1 26.003 179.63
## - cp.0 1 26.018 179.75
## - ca.2 1 26.273 181.93
## - ca.1 1 26.483 183.71
##
## Step: AIC=167.27
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 +
## thal.2 + sex + exang + oldpeak
##
## Df Deviance AIC
## <none> 24.601 167.27
## - thal.1 1 24.960 168.50
## - sex 1 25.062 169.41
## - restecg.0 1 25.093 169.69
## - slope.1 1 25.102 169.76
## - exang 1 25.147 170.17
## - oldpeak 1 25.258 171.14
## - ca.3 1 25.578 173.95
## - thal.2 1 26.404 181.04
## - cp.0 1 26.459 181.51
## - ca.2 1 26.536 182.15
## - ca.1 1 27.143 187.20
# fowards
fowards.fit <- step(full.fit, direction = 'forward')
## Start: AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 +
## restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 +
## ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + thal.3 + age + sex +
## trestbps + chol + fbs + thalach + exang + oldpeak
Test
# set threshold
thres <- 0.35
# full model
full.result <- predict.glm(full.fit, newdata = test.data, type="response") %>% unname()
full.result <- ifelse(full.result > thres, TRUE, FALSE)
table(test.data$target, full.result, dnn=c("actual", "predicted"))
## predicted
## actual FALSE TRUE
## FALSE 23 5
## TRUE 1 27
# backwards model
backwards.result <- predict.glm(backwards.fit, newdata = test.data, type="response") %>%
unname()
backwards.result <- ifelse(backwards.result > thres, TRUE, FALSE)
table(test.data$target, backwards.result, dnn=c("actual", "predicted"))
## predicted
## actual FALSE TRUE
## FALSE 22 6
## TRUE 0 28
# fowards model
fowards.result <- predict.glm(fowards.fit, newdata = test.data, type="response") %>%
unname()
fowards.result <- ifelse(fowards.result > thres, TRUE, FALSE)
table(test.data$target, fowards.result, dnn=c("actual", "predicted"))
## predicted
## actual FALSE TRUE
## FALSE 23 5
## TRUE 1 27