Packges

# Packges
  library(dplyr)
  library(readr)
  library(ggplot2)
  library(tidyr)
  library(caret)
# set seed
  set.seed(18102019)

My Functions

# Changing class of varibles
  convert.magic <- function(obj, type)
  {
    FUN1 <- switch(type,
                   character = as.character,
                   numeric = as.numeric,
                   factor = as.factor,
                   logical = as.logical)
    out <- lapply(obj, FUN1)
    as.data.frame(out)
  }
# remove outlier function
outlier.qr <- function(data, var_name)
  {
    data$var_name <- eval(substitute(var_name), data) 
    iqr <- IQR(data$`var_name`, na.rm = TRUE)
    qr <- quantile(data$var_name, c(0.25,0.75))
    Q1 <- qr[[1]] - 1.5*iqr
    Q3 <- qr[[2]] + 1.5*iqr
    data <- data %>% filter(var_name < Q3 & var_name > Q1)
    return(data)
  }

Pre-processing

# open
  data <- read_csv('Dataset/heart.csv')
# varible types
  # look at type of each varible
    sapply(data, class)
##       age       sex        cp  trestbps      chol       fbs   restecg 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" 
##   thalach     exang   oldpeak     slope        ca      thal    target 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
  # use convert.magic function
    factor.varibles <- c('sex', 'cp', 'restecg', 'slope', 'ca','thal')
    logical.varibles <- c('fbs', 'exang', 'target')
    numerical.varibles <- c('age', 'trestbps', 'chol', 'thalach', 'oldpeak')
    data[,factor.varibles] <- convert.magic(data[,factor.varibles], "factor")
    data[,logical.varibles] <- convert.magic(data[,logical.varibles], "logical")
  # check
    sapply(data, class)
##       age       sex        cp  trestbps      chol       fbs   restecg 
## "numeric"  "factor"  "factor" "numeric" "numeric" "logical"  "factor" 
##   thalach     exang   oldpeak     slope        ca      thal    target 
## "numeric" "logical" "numeric"  "factor"  "factor"  "factor" "logical"
# Missing vaules
  data %>% summarise_each(funs(sum(is.na(.))))
# outliers
  # numerical varibles
    plot.table <- data[, numerical.varibles] %>% gather(key = "variable", value = "value")
    ggplot(data = plot.table, aes(x=variable, y=value)) + geom_violin(scale = 'area', fill = '#595959', colour = NA) + 
      geom_boxplot(fill = NA, colour = 'orange', outlier.colour = '#00bbff') + facet_wrap( ~ variable, scales="free")

  # other varibles
    plot.table <- data[, c(factor.varibles, logical.varibles)] %>% gather(key = "variable", value = "value") %>% 
      group_by(variable, value) %>% summarise(count = n())
    ggplot(data = plot.table, aes(x = value, y = count)) + geom_bar(stat = 'identity', colour = "orange") + 
      facet_wrap( ~ variable, scales="free")

  # remove outliers
    # List of Vraibles That are Having Outliers Removed
      varNames <- lapply(numerical.varibles, as.name)
    
    # For Loop to Remove Outliers
      for (i in seq_along(varNames)) 
      {
        data <- outlier.qr(data, eval(varNames[[i]]))
      }
    # remove extra coloum
      data <- data[, !(names(data) %in% c('var_name'))]

# resahpe for glm model
  dmy <- dummyVars(" ~ + cp + restecg + slope + ca + thal", data = data[])
  trsf <- data.frame(predict(dmy, newdata = data))
  data <- trsf %>% cbind(data[,c("age", "sex", "trestbps", "chol", "fbs", "thalach", "exang",
                                 "oldpeak", "target")]) 

Data sample

data %>% head()

Subset data

# take 80% of rows as trning data
  sam.size <- 0.8 * nrow(data)
# traning rows
  train.rows <- sample(seq_len(nrow(data)), size = sam.size)
# subseting  
  train.data <- data[train.rows, ]
  test.data <- data[-train.rows, ]

Models

# full modle
  full.fit <- glm(target ~ ., data = train.data)
  summary(full.fit)
## 
## Call:
## glm(formula = target ~ ., data = train.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.04735  -0.17197   0.03379   0.21579   0.98640  
## 
## Coefficients: (5 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.7679793  0.5673232   1.354 0.177363    
## cp.0        -0.2216834  0.0955712  -2.320 0.021374 *  
## cp.1        -0.0670189  0.1036497  -0.647 0.518638    
## cp.2        -0.0047483  0.0964950  -0.049 0.960803    
## cp.3                NA         NA      NA       NA    
## restecg.0   -0.1143420  0.3612756  -0.316 0.751957    
## restecg.1   -0.0158170  0.3617621  -0.044 0.965169    
## restecg.2           NA         NA      NA       NA    
## slope.0      0.0004451  0.1164068   0.004 0.996953    
## slope.1     -0.0963362  0.0617723  -1.560 0.120451    
## slope.2             NA         NA      NA       NA    
## ca.0        -0.0200994  0.1638379  -0.123 0.902485    
## ca.1        -0.2996342  0.1701429  -1.761 0.079754 .  
## ca.2        -0.3536612  0.1835665  -1.927 0.055445 .  
## ca.3        -0.2881011  0.1923433  -1.498 0.135749    
## ca.4                NA         NA      NA       NA    
## thal.0      -0.0496098  0.2540504  -0.195 0.845375    
## thal.1       0.1865771  0.1115010   1.673 0.095827 .  
## thal.2       0.2214727  0.0604016   3.667 0.000315 ***
## thal.3              NA         NA      NA       NA    
## age          0.0044995  0.0032716   1.375 0.170573    
## sex1        -0.1197558  0.0597398  -2.005 0.046352 *  
## trestbps    -0.0025390  0.0016950  -1.498 0.135726    
## chol        -0.0003307  0.0005615  -0.589 0.556499    
## fbsTRUE      0.0226063  0.0718247   0.315 0.753286    
## thalach      0.0023778  0.0014050   1.692 0.092114 .  
## exangTRUE   -0.1149108  0.0623654  -1.843 0.066876 .  
## oldpeak     -0.0594872  0.0310970  -1.913 0.057182 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1191267)
## 
##     Null deviance: 54.215  on 222  degrees of freedom
## Residual deviance: 23.825  on 200  degrees of freedom
## AIC: 182.12
## 
## Number of Fisher Scoring iterations: 2
# backwards
  backwards.fit <- step(full.fit, direction = 'backward')
## Start:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 + 
##     ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + thal.3 + age + sex + 
##     trestbps + chol + fbs + thalach + exang + oldpeak
## 
## 
## Step:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 + 
##     ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + age + sex + trestbps + 
##     chol + fbs + thalach + exang + oldpeak
## 
## 
## Step:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 + 
##     ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps + 
##     chol + fbs + thalach + exang + oldpeak
## 
## 
## Step:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     restecg.2 + slope.0 + slope.1 + ca.0 + ca.1 + ca.2 + ca.3 + 
##     thal.0 + thal.1 + thal.2 + age + sex + trestbps + chol + 
##     fbs + thalach + exang + oldpeak
## 
## 
## Step:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     slope.0 + slope.1 + ca.0 + ca.1 + ca.2 + ca.3 + thal.0 + 
##     thal.1 + thal.2 + age + sex + trestbps + chol + fbs + thalach + 
##     exang + oldpeak
## 
## 
## Step:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + restecg.1 + slope.0 + 
##     slope.1 + ca.0 + ca.1 + ca.2 + ca.3 + thal.0 + thal.1 + thal.2 + 
##     age + sex + trestbps + chol + fbs + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - slope.0    1   23.825 180.12
## - restecg.1  1   23.826 180.13
## - cp.2       1   23.826 180.13
## - ca.0       1   23.827 180.14
## - thal.0     1   23.830 180.17
## - fbs        1   23.837 180.24
## - restecg.0  1   23.837 180.24
## - chol       1   23.867 180.51
## - cp.1       1   23.875 180.59
## <none>           23.825 182.12
## - age        1   24.051 182.22
## - ca.3       1   24.093 182.61
## - trestbps   1   24.093 182.61
## - slope.1    1   24.115 182.82
## - thal.1     1   24.159 183.22
## - thalach    1   24.167 183.30
## - ca.1       1   24.195 183.56
## - exang      1   24.230 183.88
## - oldpeak    1   24.261 184.17
## - ca.2       1   24.267 184.22
## - sex        1   24.304 184.56
## - cp.0       1   24.466 186.04
## - thal.2     1   25.427 194.63
## 
## Step:  AIC=180.12
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + restecg.1 + slope.1 + 
##     ca.0 + ca.1 + ca.2 + ca.3 + thal.0 + thal.1 + thal.2 + age + 
##     sex + trestbps + chol + fbs + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - restecg.1  1   23.826 178.13
## - cp.2       1   23.826 178.13
## - ca.0       1   23.827 178.14
## - thal.0     1   23.830 178.17
## - restecg.0  1   23.837 178.24
## - fbs        1   23.838 178.24
## - chol       1   23.867 178.51
## - cp.1       1   23.875 178.59
## <none>           23.825 180.12
## - age        1   24.051 180.22
## - trestbps   1   24.093 180.61
## - ca.3       1   24.094 180.63
## - slope.1    1   24.164 181.27
## - thal.1     1   24.165 181.28
## - thalach    1   24.170 181.32
## - ca.1       1   24.195 181.56
## - exang      1   24.230 181.88
## - ca.2       1   24.269 182.24
## - sex        1   24.304 182.56
## - oldpeak    1   24.340 182.89
## - cp.0       1   24.466 184.04
## - thal.2     1   25.429 192.65
## 
## Step:  AIC=178.13
## target ~ cp.0 + cp.1 + cp.2 + restecg.0 + slope.1 + ca.0 + ca.1 + 
##     ca.2 + ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps + 
##     chol + fbs + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - cp.2       1   23.826 176.13
## - ca.0       1   23.827 176.14
## - thal.0     1   23.830 176.17
## - fbs        1   23.838 176.24
## - chol       1   23.868 176.52
## - cp.1       1   23.875 176.59
## <none>           23.826 178.13
## - age        1   24.059 178.30
## - trestbps   1   24.093 178.61
## - ca.3       1   24.095 178.63
## - slope.1    1   24.164 179.27
## - thal.1     1   24.165 179.28
## - thalach    1   24.170 179.33
## - ca.1       1   24.196 179.56
## - exang      1   24.231 179.89
## - ca.2       1   24.269 180.24
## - sex        1   24.305 180.57
## - restecg.0  1   24.312 180.63
## - oldpeak    1   24.341 180.90
## - cp.0       1   24.466 182.04
## - thal.2     1   25.435 190.71
## 
## Step:  AIC=176.13
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.0 + ca.1 + ca.2 + 
##     ca.3 + thal.0 + thal.1 + thal.2 + age + sex + trestbps + 
##     chol + fbs + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - ca.0       1   23.828 174.14
## - thal.0     1   23.831 174.17
## - fbs        1   23.839 174.25
## - chol       1   23.869 174.53
## - cp.1       1   23.926 175.07
## <none>           23.826 176.13
## - age        1   24.061 176.32
## - trestbps   1   24.094 176.63
## - ca.3       1   24.095 176.63
## - slope.1    1   24.165 177.28
## - thal.1     1   24.168 177.30
## - thalach    1   24.172 177.35
## - ca.1       1   24.196 177.57
## - exang      1   24.231 177.89
## - ca.2       1   24.272 178.27
## - sex        1   24.307 178.59
## - restecg.0  1   24.312 178.63
## - oldpeak    1   24.342 178.91
## - thal.2     1   25.436 188.71
## - cp.0       1   25.464 188.96
## 
## Step:  AIC=174.14
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + 
##     thal.0 + thal.1 + thal.2 + age + sex + trestbps + chol + 
##     fbs + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - thal.0     1   23.832 172.19
## - fbs        1   23.841 172.27
## - chol       1   23.872 172.56
## - cp.1       1   23.930 173.10
## <none>           23.828 174.14
## - age        1   24.061 174.32
## - trestbps   1   24.095 174.63
## - slope.1    1   24.165 175.28
## - thal.1     1   24.168 175.31
## - thalach    1   24.173 175.35
## - exang      1   24.232 175.90
## - sex        1   24.309 176.60
## - restecg.0  1   24.320 176.70
## - oldpeak    1   24.364 177.11
## - ca.3       1   24.539 178.70
## - thal.2     1   25.439 186.74
## - cp.0       1   25.476 187.06
## - ca.2       1   25.759 189.53
## - ca.1       1   26.189 193.22
## 
## Step:  AIC=172.19
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + 
##     thal.1 + thal.2 + age + sex + trestbps + chol + fbs + thalach + 
##     exang + oldpeak
## 
##             Df Deviance    AIC
## - fbs        1   23.844 170.30
## - chol       1   23.875 170.59
## - cp.1       1   23.933 171.13
## <none>           23.832 172.19
## - age        1   24.067 172.38
## - trestbps   1   24.100 172.68
## - slope.1    1   24.168 173.31
## - thal.1     1   24.180 173.42
## - thalach    1   24.185 173.47
## - exang      1   24.241 173.98
## - sex        1   24.309 174.61
## - restecg.0  1   24.329 174.79
## - oldpeak    1   24.365 175.12
## - ca.3       1   24.539 176.71
## - cp.0       1   25.477 185.07
## - thal.2     1   25.506 185.33
## - ca.2       1   25.759 187.53
## - ca.1       1   26.193 191.25
## 
## Step:  AIC=170.3
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + 
##     thal.1 + thal.2 + age + sex + trestbps + chol + thalach + 
##     exang + oldpeak
## 
##             Df Deviance    AIC
## - chol       1   23.885 168.68
## - cp.1       1   23.951 169.30
## <none>           23.844 170.30
## - age        1   24.082 170.51
## - trestbps   1   24.108 170.76
## - thal.1     1   24.190 171.51
## - slope.1    1   24.197 171.57
## - thalach    1   24.198 171.59
## - exang      1   24.245 172.02
## - sex        1   24.317 172.68
## - restecg.0  1   24.331 172.80
## - oldpeak    1   24.385 173.31
## - ca.3       1   24.540 174.71
## - thal.2     1   25.506 183.33
## - cp.0       1   25.543 183.65
## - ca.2       1   25.764 185.57
## - ca.1       1   26.210 189.39
## 
## Step:  AIC=168.68
## target ~ cp.0 + cp.1 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + 
##     thal.1 + thal.2 + age + sex + trestbps + thalach + exang + 
##     oldpeak
## 
##             Df Deviance    AIC
## - cp.1       1   24.009 167.84
## <none>           23.885 168.68
## - age        1   24.102 168.70
## - trestbps   1   24.160 169.23
## - thalach    1   24.228 169.86
## - slope.1    1   24.233 169.91
## - thal.1     1   24.277 170.31
## - exang      1   24.305 170.57
## - sex        1   24.325 170.75
## - restecg.0  1   24.405 171.49
## - oldpeak    1   24.424 171.66
## - ca.3       1   24.608 173.34
## - thal.2     1   25.572 181.90
## - cp.0       1   25.619 182.31
## - ca.2       1   25.922 184.93
## - ca.1       1   26.279 187.99
## 
## Step:  AIC=167.84
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 + 
##     thal.2 + age + sex + trestbps + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - age        1   24.212 167.72
## <none>           24.009 167.84
## - trestbps   1   24.264 168.20
## - thalach    1   24.357 169.04
## - slope.1    1   24.362 169.09
## - thal.1     1   24.400 169.44
## - exang      1   24.433 169.74
## - oldpeak    1   24.474 170.12
## - sex        1   24.497 170.32
## - restecg.0  1   24.513 170.47
## - ca.3       1   24.714 172.29
## - cp.0       1   25.626 180.37
## - thal.2     1   25.641 180.51
## - ca.2       1   26.055 184.07
## - ca.1       1   26.336 186.46
## 
## Step:  AIC=167.72
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 + 
##     thal.2 + sex + trestbps + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - trestbps   1   24.382 167.27
## - thalach    1   24.425 167.67
## <none>           24.212 167.72
## - slope.1    1   24.564 168.93
## - thal.1     1   24.648 169.70
## - restecg.0  1   24.661 169.81
## - exang      1   24.668 169.88
## - oldpeak    1   24.674 169.93
## - ca.3       1   24.823 171.27
## - sex        1   24.825 171.29
## - thal.2     1   25.823 180.08
## - cp.0       1   25.927 180.97
## - ca.2       1   26.067 182.18
## - ca.1       1   26.342 184.52
## 
## Step:  AIC=167.27
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 + 
##     thal.2 + sex + thalach + exang + oldpeak
## 
##             Df Deviance    AIC
## - thalach    1   24.601 167.27
## <none>           24.382 167.27
## - slope.1    1   24.711 168.26
## - thal.1     1   24.769 168.78
## - exang      1   24.803 169.09
## - restecg.0  1   24.863 169.63
## - oldpeak    1   24.949 170.40
## - sex        1   24.954 170.45
## - ca.3       1   25.088 171.64
## - thal.2     1   26.003 179.63
## - cp.0       1   26.018 179.75
## - ca.2       1   26.273 181.93
## - ca.1       1   26.483 183.71
## 
## Step:  AIC=167.27
## target ~ cp.0 + restecg.0 + slope.1 + ca.1 + ca.2 + ca.3 + thal.1 + 
##     thal.2 + sex + exang + oldpeak
## 
##             Df Deviance    AIC
## <none>           24.601 167.27
## - thal.1     1   24.960 168.50
## - sex        1   25.062 169.41
## - restecg.0  1   25.093 169.69
## - slope.1    1   25.102 169.76
## - exang      1   25.147 170.17
## - oldpeak    1   25.258 171.14
## - ca.3       1   25.578 173.95
## - thal.2     1   26.404 181.04
## - cp.0       1   26.459 181.51
## - ca.2       1   26.536 182.15
## - ca.1       1   27.143 187.20
# fowards
  fowards.fit <- step(full.fit, direction = 'forward')
## Start:  AIC=182.12
## target ~ cp.0 + cp.1 + cp.2 + cp.3 + restecg.0 + restecg.1 + 
##     restecg.2 + slope.0 + slope.1 + slope.2 + ca.0 + ca.1 + ca.2 + 
##     ca.3 + ca.4 + thal.0 + thal.1 + thal.2 + thal.3 + age + sex + 
##     trestbps + chol + fbs + thalach + exang + oldpeak

Test

# set threshold
  thres <- 0.35
#  full model
  full.result <- predict.glm(full.fit, newdata = test.data, type="response") %>% unname()
  full.result <- ifelse(full.result > thres, TRUE, FALSE)
  table(test.data$target, full.result, dnn=c("actual", "predicted"))
##        predicted
## actual  FALSE TRUE
##   FALSE    23    5
##   TRUE      1   27
# backwards model
  backwards.result <- predict.glm(backwards.fit, newdata = test.data, type="response") %>%
    unname()
  backwards.result <- ifelse(backwards.result > thres, TRUE, FALSE)
  table(test.data$target, backwards.result, dnn=c("actual", "predicted"))
##        predicted
## actual  FALSE TRUE
##   FALSE    22    6
##   TRUE      0   28
# fowards model
  fowards.result <- predict.glm(fowards.fit, newdata = test.data, type="response") %>%
    unname()
  fowards.result <- ifelse(fowards.result > thres, TRUE, FALSE)
  table(test.data$target, fowards.result, dnn=c("actual", "predicted"))  
##        predicted
## actual  FALSE TRUE
##   FALSE    23    5
##   TRUE      1   27