Part 0

Explain in detail all steps we took for data preparation (data pre-processing).

Load packages and data

library("ISLR")
library("tree")
library("ggplot2")
library("cowplot")
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library("adabag")
## Loading required package: rpart
## Loading required package: caret
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library("fastAdaboost")

data <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header=FALSE)

Rename columns

colnames(data) <- c(
  "age",
  "sex",# 0 = female, 1 = male
  "cp", # chest pain
          # 1 = typical angina,
          # 2 = atypical angina,
          # 3 = non-anginal pain,
          # 4 = asymptomatic
  "trestbps", # resting blood pressure (in mm Hg)
  "chol", # serum cholestoral in mg/dl
  "fbs",  # fasting blood sugar greater than 120 mg/dl, 1 = TRUE, 0 = FALSE
  "restecg", # resting electrocardiographic results
          # 1 = normal
          # 2 = having ST-T wave abnormality
          # 3 = showing probable or definite left ventricular hypertrophy
  "thalach", # maximum heart rate achieved
  "exang",   # exercise induced angina, 1 = yes, 0 = no
  "oldpeak", # ST depression induced by exercise relative to rest
  "slope", # the slope of the peak exercise ST segment
          # 1 = upsloping
          # 2 = flat
          # 3 = downsloping
  "ca", # number of major vessels (0-3) colored by fluoroscopy
  "thal", # this is short of thalium heart scan
          # 3 = normal (no cold spots)
          # 6 = fixed defect (cold spots during rest and exercise)
          # 7 = reversible defect (when cold spots only appear during exercise)
  "hd" # (the predicted attribute) - diagnosis of heart disease
          # 0 if less than or equal to 50% diameter narrowing
          # 1 if greater than 50% diameter narrowing
  )

Replace “?” with NA’s

data[data == "?"] = NA

Add factors for categorical data

## Now add factors for variables that are factors and clean up the factors
## that had missing data...
data[data$sex == 0,]$sex <- "F"
data[data$sex == 1,]$sex <- "M"
#We need to make that the features below are "factor" (categorical)
data$sex <- as.factor(data$sex)
data$cp <- as.factor(data$cp) 
data$fbs <- as.factor(data$fbs) 
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang) 
data$slope <- as.factor(data$slope)
data$ca <- as.integer(data$ca) # since this column had "?"s in it (which
                               # we have since converted to NAs) R thinks that
                               # the levels for the factor are strings, but
                               # we know they are integers, so we'll first
                               # convert the strings to integiers...
data$ca <- as.factor(data$ca)  # ...then convert the integers to factor levels
data$thal <- as.integer(data$thal) # "thal" also had "?"s in it.
data$thal <- as.factor(data$thal)

## This next line replaces 0 and 1 with "Healthy" and "Unhealthy"
data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy")
data$hd <- as.factor(data$hd) # Now convert to a factor

Set seed to replicate results

set.seed(123)

Impute missing data data

data <- rfImpute(hd ~ ., data = data, iter=20)
## ntree      OOB      1      2
##   300:  19.14% 15.85% 23.02%
## ntree      OOB      1      2
##   300:  15.51% 11.59% 20.14%
## ntree      OOB      1      2
##   300:  18.81% 15.85% 22.30%
## ntree      OOB      1      2
##   300:  16.50% 11.59% 22.30%
## ntree      OOB      1      2
##   300:  17.16% 12.20% 23.02%
## ntree      OOB      1      2
##   300:  17.49% 14.02% 21.58%
## ntree      OOB      1      2
##   300:  16.17% 12.80% 20.14%
## ntree      OOB      1      2
##   300:  17.16% 14.02% 20.86%
## ntree      OOB      1      2
##   300:  15.84% 12.80% 19.42%
## ntree      OOB      1      2
##   300:  18.15% 15.85% 20.86%
## ntree      OOB      1      2
##   300:  18.15% 15.24% 21.58%
## ntree      OOB      1      2
##   300:  17.82% 14.02% 22.30%
## ntree      OOB      1      2
##   300:  17.49% 13.41% 22.30%
## ntree      OOB      1      2
##   300:  17.16% 14.02% 20.86%
## ntree      OOB      1      2
##   300:  18.48% 14.02% 23.74%
## ntree      OOB      1      2
##   300:  16.17% 12.20% 20.86%
## ntree      OOB      1      2
##   300:  16.50% 13.41% 20.14%
## ntree      OOB      1      2
##   300:  17.16% 14.02% 20.86%
## ntree      OOB      1      2
##   300:  17.49% 14.02% 21.58%
## ntree      OOB      1      2
##   300:  17.16% 14.63% 20.14%
## we get values a little better and a little worse, so doing more
## iterations doesn't improve the situation.
##
## NOTE: If you really want to micromanage how rfImpute(),
## you can change the number of trees it makes (the default is 300) and the
## number of variables that it will consider at each step.

Part 1

Construct a Tree for Classification for the CHD “Heart Disease” data set. Print the tree. Construct a confusion matrix and report the percent of test observations that are correctly classified.

set.seed(2)
tree.hd = tree(data$hd ~ .-data$hd, data)
plot(tree.hd)
text(tree.hd, pretty=0)

train = sample(1:nrow(data), 152)
data.test = data[-train ,]
hd.test = data$hd[-train]
tree.data = tree(data$hd ~ .-data$hd, data, subset = train)
tree.predict = predict(tree.data, data.test, type = "class")
conf = table(tree.predict, hd.test)
conf
##             hd.test
## tree.predict Healthy Unhealthy
##    Healthy        61        24
##    Unhealthy      21        45

Percent of observations correctly classified:

(conf[1] + conf[4])/152
## [1] 0.6973684

Part 2

Prune the tree from Part 1. Print the tree. Construct a confusion matrix and report the percent of test observations that are correctly classified.

set.seed(3)
cv.data = cv.tree(tree.data, FUN = prune.misclass)
par(mfrow=c(1,2))
plot(cv.data$size, cv.data$dev, type = "b")
plot(cv.data$k, cv.data$dev, type = "b")

prune.data = prune.misclass(tree.data, best = 9)
#plot(prune.data)
#text(prune.data, pretty = 0)
##Posted .png for better resolution
Pruned Tree

Pruned Tree

tree.predict = predict(prune.data, data.test, type ="class")
conf = table(tree.predict, hd.test)
conf
##             hd.test
## tree.predict Healthy Unhealthy
##    Healthy        61        21
##    Unhealthy      21        48

Percent of observations correctly classified:

(conf[1] + conf[4])/152
## [1] 0.7171053

Part 3

Run Bagging and a classification Random Forest model for the CHD data set. Report the error in prediction using the test data for both models.

set.seed(1)

p = dim(data)[2] - 1
bag.data = randomForest(data$hd ~., data, subset = train, mtry = p, importance = TRUE)
bag.predict=predict(bag.data, data.test, type="class")

Prediction error of bagging:

mean(bag.predict != hd.test)
## [1] 0.1854305
set.seed(5)

p = round(sqrt(p))
rf.data = randomForest(data$hd ~., data ,subset = train, mtry = p, importance = TRUE)
bag.predict2 = predict(rf.data, data.test, type="class")

Prediction error of random forest:

mean(bag.predict2 != hd.test)
## [1] 0.1655629

Remember to explain how to calculate “best” sample of feature to compute for nodes in the Random Forest model.

We want to our “best” value to be the value that produces the lowest error rate. Ideally we would use cross validation to find our “best” value, but that is operation intestive. We can also find the “best” value using “Cost complexity pruning”.

Part 4

Report the error in prediction for both models in Part 3 using the OOB sample.

set.seed(6)
n = 500
p=13
bag.data = randomForest(data$hd ~., data, subset = train, ntree = n, mtry = p, importance = TRUE)
p = round(sqrt(p))
rf.data = randomForest(data$hd ~., data ,subset = train, mtry = p, importance = TRUE)

The out of bound rates for the nth tree of the bagging and random forest models respectively are.

bag.data$err.rate[n,1]
##       OOB 
## 0.2171053
rf.data$err.rate[n,1]
##       OOB 
## 0.2039474

Part 5

Construct a plot in order to compare error rates of the Random Forest model OOB, healthy and unhealthy patients as a function of the number of trees (similar to page 318).

set.seed(7)

model = randomForest(data$hd ~., data, mtry = 13, proximity=TRUE)
#ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +  geom_line(aes(color=Type)) + ggtitle("Bagging")
oob.error.data <- data.frame(
  Trees=rep(1:nrow(model$err.rate), times=3),
  Type=rep(c("OOB", "No", "Yes"), each=nrow(model$err.rate)),
  Error=c(model$err.rate[,"OOB"],
    model$err.rate[,"Unhealthy"],
    model$err.rate[,"Healthy"]))
#posting bagging errors through attatched image due to knitting errors
model = randomForest(data$hd ~., data, mtry = 4, proximity=TRUE)
ggplot(data=oob.error.data, aes(x=Trees, y=Error)) +  geom_line(aes(color=Type)) + ggtitle("Random Forest")

oob.error.data <- data.frame(
  Trees=rep(1:nrow(model$err.rate), times=3),
  Type=rep(c("OOB", "No", "Yes"), each=nrow(model$err.rate)),
  Error=c(model$err.rate[,"OOB"],
    model$err.rate[,"Unhealthy"],
    model$err.rate[,"Healthy"]))

Part 6

Rank the feature importance (variable importance) using the mean decrease in Gini index and create a plot. Also compute the relative importance (see page 320).

varImpPlot(bag.data)

varImpPlot(rf.data)

ca, cp, and thal are the most important variables in either case.

set.seed(8)

model = randomForest(data$hd ~., data, mtry = 13, proximity=TRUE)
distance.matrix <- dist(1-model$proximity)
mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE)
mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
mds.values <- mds.stuff$points
mds.data <- data.frame(Sample=rownames(mds.values),
              X=mds.values[,1],
              Y=mds.values[,2],
              Status=data$hd)
ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
  geom_text(aes(color=Status)) +
  theme_bw() +
  xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
  ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
  ggtitle("MDS plot using (1 - Random Forest Proximities)")

Our interpretation is that for any subjects with the properties \[\textrm{MDS1} \notin \{-1.25, 1\} \cap \ \textrm{MDS2} > -0.75\] We can predict the subject’s health with high confidence.

Part 8

Construct a classification model using Adaboosting and report a confusion matrix. Also use the confusion matrix in order to check that R had the correct error in classification.

#run adaboost
#training.data = data[train,]

#test_adaboost <- adaboost(data$hd~., training.data, 13)
#pred <- predict( test_adaboost, newdata = data.test)
#print(paste("Adaboost Error on Carseats:",pred$error))
#print(table(pred$class,testing_high))

The “adaboost” function does not run and I am unable to resolve the problem