A Hearty Analysis

Part 0

Here we carry out the data pre-processing. We read the data in from the URL and make it easier to work with by adding column names. The 'sex' column is recoded as a binary factor (F/M), chest pain ('cp') becomes a categorical variable represented by numeric codes, and several other categorical columns ('fbs', 'restecg', 'exang', 'slope') are likewise converted to factors. The 'ca' and 'thal' columns, which were read in as factors because of the '?' placeholders, are converted to integer codes and then back to factors. The outcome 'hd' is recoded with ifelse into 'Healthy' versus 'Unhealthy' and converted to a factor. Finally, we fill in the missing values with the rfImpute function.

library(cowplot)
## Loading required package: ggplot2

## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(randomForest)
## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data <- read.csv(url, header=FALSE)
colnames(data) <- c(
  "age",
  "sex",# 0 = female, 1 = male
  "cp", # chest pain
          # 1 = typical angina,
          # 2 = atypical angina,
          # 3 = non-anginal pain,
          # 4 = asymptomatic
  "trestbps", # resting blood pressure (in mm Hg)
  "chol", # serum cholestoral in mg/dl
  "fbs",  # fasting blood sugar greater than 120 mg/dl, 1 = TRUE, 0 = FALSE
  "restecg", # resting electrocardiographic results
          # 1 = normal
          # 2 = having ST-T wave abnormality
          # 3 = showing probable or definite left ventricular hypertrophy
  "thalach", # maximum heart rate achieved
  "exang",   # exercise induced angina, 1 = yes, 0 = no
  "oldpeak", # ST depression induced by exercise relative to rest
  "slope", # the slope of the peak exercise ST segment
          # 1 = upsloping
          # 2 = flat
          # 3 = downsloping
  "ca", # number of major vessels (0-3) colored by fluoroscopy
  "thal", # this is short of thalium heart scan
          # 3 = normal (no cold spots)
          # 6 = fixed defect (cold spots during rest and exercise)
          # 7 = reversible defect (when cold spots only appear during exercise)
  "hd" # (the predicted attribute) - diagnosis of heart disease
          # 0 if less than or equal to 50% diameter narrowing
          # 1 if greater than 50% diameter narrowing
  )
str(data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : num  63 67 67 37 41 56 62 57 63 53 ...
##  $ sex     : num  1 1 1 1 0 1 0 0 1 1 ...
##  $ cp      : num  1 4 4 3 2 2 4 4 4 4 ...
##  $ trestbps: num  145 160 120 130 130 120 140 120 130 140 ...
##  $ chol    : num  233 286 229 250 204 236 268 354 254 203 ...
##  $ fbs     : num  1 0 0 0 0 0 0 0 0 1 ...
##  $ restecg : num  2 2 2 0 2 0 2 0 2 2 ...
##  $ thalach : num  150 108 129 187 172 178 160 163 147 155 ...
##  $ exang   : num  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
##  $ slope   : num  3 2 2 3 1 1 3 1 2 3 ...
##  $ ca      : Factor w/ 5 levels "?","0.0","1.0",..: 2 5 4 2 2 2 4 2 3 2 ...
##  $ thal    : Factor w/ 4 levels "?","3.0","6.0",..: 3 2 4 2 2 2 2 2 4 4 ...
##  $ hd      : int  0 2 1 0 0 0 3 0 2 1 ...
data[data == "?"] <- NA


# recode sex as a factor with labels F/M
data[data$sex == 0,]$sex <- "F"
data[data$sex == 1,]$sex <- "M"
data$sex <- as.factor(data$sex)

# convert the remaining categorical columns to factors
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)

# ca and thal were read in as factors because of the "?" placeholders,
# so convert them to integer codes and back to clean factors
data$ca <- as.integer(data$ca)
data$ca <- as.factor(data$ca)

data$thal <- as.integer(data$thal)
data$thal <- as.factor(data$thal)

# hd is coded 0-4 in the raw data; collapse 1-4 into "Unhealthy"
data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy")
data$hd <- as.factor(data$hd) # now convert to a factor


set.seed(123)

data.imputed <- rfImpute(hd ~ ., data = data, iter=6)
## ntree      OOB      1      2
##   300:  19.14% 15.85% 23.02%
## ntree      OOB      1      2
##   300:  15.51% 11.59% 20.14%
## ntree      OOB      1      2
##   300:  18.81% 15.85% 22.30%
## ntree      OOB      1      2
##   300:  16.50% 11.59% 22.30%
## ntree      OOB      1      2
##   300:  17.16% 12.20% 23.02%
## ntree      OOB      1      2
##   300:  17.49% 14.02% 21.58%
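
As a quick sanity check (a small sketch, not shown in the output above), we can confirm which columns actually contained missing values before imputation and that rfImpute filled them all in; in this file only ca and thal have '?' entries:

colSums(is.na(data))        # only ca and thal should contain NAs
sum(is.na(data.imputed))    # should be 0 after imputation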

Part 1

From our confusion matrix, we see that (71 + 41)/151 = 112/151 ≈ 74.2% of the test observations are classified correctly; this is also checked programmatically at the end of this part.

library(tree)
Train = sample(1:nrow(data.imputed), 152)  # 152 training rows; the remaining 151 form the test set
Test = -Train

training_data = data.imputed[Train, ]
testing_data = data.imputed[Test, ]
hd.test = data.imputed$hd[Test]

tree_model = tree(hd~., training_data)
tree.pred = predict(tree_model, testing_data, type="class")
tree.hd = tree(hd~., data = data.imputed, subset = Train)  # same fit on the training rows, used by cv.tree below
table(tree.pred, hd.test)
##            hd.test
## tree.pred   Healthy Unhealthy
##   Healthy        71        25
##   Unhealthy      14        41
plot(tree.hd)
text(tree.hd, pretty=0)
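
The accuracy quoted above can also be computed directly from the predictions rather than by hand; a minimal sketch using the confusion matrix:

conf <- table(tree.pred, hd.test)
sum(diag(conf)) / sum(conf)   # (71 + 41)/151, roughly 0.742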

Part 2

We use cross-validation to find the best tree size (number of terminal nodes). The plot of cross-validated deviance against size shows the lowest deviance at 9 terminal nodes, so we use that size for the pruned tree. The pruned tree classifies 112/151 × 100 ≈ 74.2% of the test observations correctly, the same as the unpruned tree.

set.seed(123)
hd_cv = cv.tree(tree.hd, FUN = prune.misclass)
names(hd_cv)
## [1] "size"   "dev"    "k"      "method"
plot(hd_cv$size, hd_cv$dev, type = 'b')
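
Rather than reading the minimum off the plot, the best size can also be pulled out of the cv.tree object directly (a small sketch; which.min simply returns the first minimum if there are ties):

best_size <- hd_cv$size[which.min(hd_cv$dev)]
best_size   # expected to be 9 here, matching the plot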

#minimum is at 9 
pruned_model = prune.misclass(tree.hd, best = 9)
plot(pruned_model)
text(pruned_model, pretty = 0)

pruned_predictions = predict(pruned_model, testing_data, type = 'class')
table(pruned_predictions, hd.test)
##                   hd.test
## pruned_predictions Healthy Unhealthy
##          Healthy        71        25
##          Unhealthy      14        41

Part 3

Here we see that the random forest gives a lower test error rate (about 20.5%) than the bagging model (about 24.5%).

# bagging model: mtry = 13 considers all 13 predictors at every split
hd_subset = data.imputed[-Train, "hd"]
bag.hd = randomForest(hd~., data=data.imputed, subset=Train, mtry=13, importance=TRUE)
hd.bag_predictions = predict(bag.hd, newdata=data.imputed[-Train,])

mean(hd.bag_predictions != hd_subset)
## [1] 0.2450331
# random forest: mtry left at its classification default
hd.rf = randomForest(hd~., data=data.imputed, subset=Train, proximity=TRUE)

hd.rf_predictions = predict(hd.rf, newdata=data.imputed[-Train,])

mean(hd.rf_predictions != hd_subset)
## [1] 0.205298
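
The only difference between the two fits is mtry: setting mtry = 13 makes bag.hd a bagging model, because every predictor is a candidate at every split, while hd.rf uses randomForest's classification default of floor(sqrt(p)) predictors per split. A minimal check of that default:

p <- ncol(data.imputed) - 1   # 13 predictors besides hd
floor(sqrt(p))                # 3, the mtry used by hd.rf above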

Part 4

The out-of-bag (OOB) error rates are 0.164 for bagging and 0.151 for the random forest, so the random forest's out-of-bag error is lower, in line with its lower test error in Part 3.

#bagging OOB
bag.oob<-bag.hd$err.rate[nrow(bag.hd$err.rate),1]
bag.oob
##       OOB 
## 0.1644737
#random forest oob 
rf.oob <-hd.rf$err.rate[nrow(hd.rf$err.rate),1]
rf.oob
##       OOB 
## 0.1513158
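
The numbers above come from the final row of each err.rate matrix, i.e. after all trees have been grown. To see how the OOB error stabilizes as trees are added, the full err.rate matrices from both fits can be plotted (a sketch, assuming the default ntree = 500 for both models above):

# OOB error as a function of the number of trees for both fits
matplot(cbind(bag.hd$err.rate[, "OOB"], hd.rf$err.rate[, "OOB"]),
        type = "l", lty = 1, col = c("black", "red"),
        xlab = "Number of trees", ylab = "OOB error")
legend("topright", legend = c("bagging", "random forest"),
       col = c("black", "red"), lty = 1)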

Part 6

Importance plots are created to see which predictors matter most. We conclude that thal, exang, and ca are the three most important variables (by mean decrease in accuracy), with thal standing out clearly in both fits.

importance(bag.hd)
##             Healthy  Unhealthy MeanDecreaseAccuracy MeanDecreaseGini
## age       2.5740946 -0.8159902            1.1917256        3.3853668
## sex       1.8637810  0.5150640            1.5729746        0.1581245
## cp       -0.2314854  9.7402595            7.0639621        5.5888162
## trestbps  0.4617004  4.7377298            3.6679917        3.7801897
## chol      3.8715363  0.1453287            2.7514070        5.0994866
## fbs       0.4965870 -2.4672812           -0.9177406        0.1203391
## restecg   1.9205530  1.7771895            2.4892878        0.6881156
## thalach   9.4716183  4.9782275            9.7320165        7.6800929
## exang    16.3007588 10.0728474           18.2964050        8.0351951
## oldpeak  10.6083995 10.0047144           13.9465489        6.7715779
## slope     5.0435943  3.6705378            6.1537444        1.4265211
## ca       18.2842292  7.1019466           17.0954636        7.1773147
## thal     28.8542443 21.0910932           32.7466136       25.4230967
varImpPlot(bag.hd)

importance(hd.rf)
##          MeanDecreaseGini
## age             4.2880268
## sex             1.5080880
## cp              7.5026267
## trestbps        4.7705163
## chol            4.6174582
## fbs             0.4910598
## restecg         1.1242381
## thalach         8.7751852
## exang           8.1130455
## oldpeak         7.9968242
## slope           3.6372945
## ca              8.0393918
## thal           14.0232293
varImpPlot(hd.rf)
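
To back up the reading of the plots, the predictors can also be ranked programmatically; a small sketch ranking the bagged model's predictors by mean decrease in accuracy:

imp <- importance(bag.hd)
head(imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ], 3)  # thal, exang, ca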

Part 7

Based on the plot, there is a clear separation between healthy and unhealthy individuals along the first MDS coordinate (MDS1, which accounts for 26.8% of the variation): unhealthy individuals tend to sit at lower MDS1 values, while healthy individuals tend to sit at higher ones. The second coordinate accounts for only 13.9% of the variation and separates the groups much less.

# drop the response (hd is the first column after rfImpute) and scale the numeric predictors
MDSdataframe = data.imputed[,-1]
for (i in 1:ncol(MDSdataframe)) {
  if (is.numeric(MDSdataframe[[i]])) {
    MDSdataframe[,i] = scale(MDSdataframe[,i])
  }
}
MDSdistance = dist(MDSdataframe)
## Warning in dist(MDSdataframe): NAs introduced by coercion
mds.var = cmdscale(MDSdistance, eig = T, x.ret = T)
mdscale = data.frame(cmdscale(MDSdistance))
mds.var.per <- round(mds.var$eig/sum(mds.var$eig)*100, 1)  # % of variation per MDS axis
health = data.imputed$hd
ggplot(data = mdscale, aes(x=X1, y=X2, label=health)) + geom_text(aes(color=health)) + theme_bw() +
  xlab(paste("MDS1 -", mds.var.per[1], "%", sep = "")) +
  ylab(paste("MDS2 -", mds.var.per[2], "%", sep = "")) +
  ggtitle("MDS plot Heart Disease")

Part 8

We see that the test classification error rate after boosting is 23.84%, which matches the confusion matrix: (17 + 19)/151 ≈ 0.2384.

library(fastAdaboost)
boost = adaboost(hd~., training_data, 13)  # 13 boosting iterations
boost_pred = predict(boost, newdata=testing_data)
boost_pred$error * 100
## [1] 23.84106
table(boost_pred$class, hd.test)
##            hd.test
##             Healthy Unhealthy
##   Healthy        68        19
##   Unhealthy      17        47
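
As with the tree in Part 1, the error rate can be recomputed from the confusion matrix as a cross-check on boost_pred$error; a minimal sketch:

conf_boost <- table(boost_pred$class, hd.test)
1 - sum(diag(conf_boost)) / sum(conf_boost)   # (17 + 19)/151, roughly 0.2384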