library("ISLR")
library("tree")
library("ggplot2")
library("cowplot")
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library("randomForest")
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library("adabag")
## Loading required package: rpart
## Loading required package: caret
## Loading required package: lattice
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
library("fastAdaboost")
data <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data", header=FALSE)
colnames(data) <- c(
"age",
"sex",# 0 = female, 1 = male
"cp", # chest pain
# 1 = typical angina,
# 2 = atypical angina,
# 3 = non-anginal pain,
# 4 = asymptomatic
"trestbps", # resting blood pressure (in mm Hg)
"chol", # serum cholestoral in mg/dl
"fbs", # fasting blood sugar greater than 120 mg/dl, 1 = TRUE, 0 = FALSE
"restecg", # resting electrocardiographic results
# 1 = normal
# 2 = having ST-T wave abnormality
# 3 = showing probable or definite left ventricular hypertrophy
"thalach", # maximum heart rate achieved
"exang", # exercise induced angina, 1 = yes, 0 = no
"oldpeak", # ST depression induced by exercise relative to rest
"slope", # the slope of the peak exercise ST segment
# 1 = upsloping
# 2 = flat
# 3 = downsloping
"ca", # number of major vessels (0-3) colored by fluoroscopy
"thal", # this is short of thalium heart scan
# 3 = normal (no cold spots)
# 6 = fixed defect (cold spots during rest and exercise)
# 7 = reversible defect (when cold spots only appear during exercise)
"hd" # (the predicted attribute) - diagnosis of heart disease
# 0 if less than or equal to 50% diameter narrowing
# 1 if greater than 50% diameter narrowing
)
data[data == "?"] = NA
## Now add factors for variables that are factors and clean up the factors
## that had missing data...
data[data$sex == 0,]$sex <- "F"
data[data$sex == 1,]$sex <- "M"
#We need to make that the features below are "factor" (categorical)
data$sex <- as.factor(data$sex)
data$cp <- as.factor(data$cp)
data$fbs <- as.factor(data$fbs)
data$restecg <- as.factor(data$restecg)
data$exang <- as.factor(data$exang)
data$slope <- as.factor(data$slope)
data$ca <- as.integer(data$ca) # since this column had "?"s in it (which
# we have since converted to NAs) R thinks that
# the levels for the factor are strings, but
# we know they are integers, so we'll first
# convert the strings to integiers...
data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels
data$thal <- as.integer(data$thal) # "thal" also had "?"s in it.
data$thal <- as.factor(data$thal)
## This next line replaces 0 and 1 with "Healthy" and "Unhealthy"
data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy")
data$hd <- as.factor(data$hd) # Now convert to a factor
set.seed(123)
data <- rfImpute(hd ~ ., data = data, iter=20)
## ntree OOB 1 2
## 300: 19.14% 15.85% 23.02%
## ntree OOB 1 2
## 300: 15.51% 11.59% 20.14%
## ntree OOB 1 2
## 300: 18.81% 15.85% 22.30%
## ntree OOB 1 2
## 300: 16.50% 11.59% 22.30%
## ntree OOB 1 2
## 300: 17.16% 12.20% 23.02%
## ntree OOB 1 2
## 300: 17.49% 14.02% 21.58%
## ntree OOB 1 2
## 300: 16.17% 12.80% 20.14%
## ntree OOB 1 2
## 300: 17.16% 14.02% 20.86%
## ntree OOB 1 2
## 300: 15.84% 12.80% 19.42%
## ntree OOB 1 2
## 300: 18.15% 15.85% 20.86%
## ntree OOB 1 2
## 300: 18.15% 15.24% 21.58%
## ntree OOB 1 2
## 300: 17.82% 14.02% 22.30%
## ntree OOB 1 2
## 300: 17.49% 13.41% 22.30%
## ntree OOB 1 2
## 300: 17.16% 14.02% 20.86%
## ntree OOB 1 2
## 300: 18.48% 14.02% 23.74%
## ntree OOB 1 2
## 300: 16.17% 12.20% 20.86%
## ntree OOB 1 2
## 300: 16.50% 13.41% 20.14%
## ntree OOB 1 2
## 300: 17.16% 14.02% 20.86%
## ntree OOB 1 2
## 300: 17.49% 14.02% 21.58%
## ntree OOB 1 2
## 300: 17.16% 14.63% 20.14%
## we get values a little better and a little worse, so doing more
## iterations doesn't improve the situation.
##
## NOTE: If you really want to micromanage how rfImpute(),
## you can change the number of trees it makes (the default is 300) and the
## number of variables that it will consider at each step.
set.seed(2)
tree.hd = tree(data$hd ~ .-data$hd, data)
plot(tree.hd)
text(tree.hd, pretty=0)
train = sample(1:nrow(data), 152)
data.test = data[-train ,]
hd.test = data$hd[-train]
tree.data = tree(data$hd ~ .-data$hd, data, subset = train)
tree.predict = predict(tree.data, data.test, type = "class")
conf = table(tree.predict, hd.test)
conf
## hd.test
## tree.predict Healthy Unhealthy
## Healthy 61 24
## Unhealthy 21 45
Percent of observations correctly classified:
(conf[1] + conf[4])/152
## [1] 0.6973684
set.seed(3)
cv.data = cv.tree(tree.data, FUN = prune.misclass)
par(mfrow=c(1,2))
plot(cv.data$size, cv.data$dev, type = "b")
plot(cv.data$k, cv.data$dev, type = "b")
prune.data = prune.misclass(tree.data, best = 9)
#plot(prune.data)
#text(prune.data, pretty = 0)
##Posted .png for better resolution
Pruned Tree
tree.predict = predict(prune.data, data.test, type ="class")
conf = table(tree.predict, hd.test)
conf
## hd.test
## tree.predict Healthy Unhealthy
## Healthy 61 21
## Unhealthy 21 48
Percent of observations correctly classified:
(conf[1] + conf[4])/152
## [1] 0.7171053
set.seed(1)
p = dim(data)[2] - 1
bag.data = randomForest(data$hd ~., data, subset = train, mtry = p, importance = TRUE)
bag.predict=predict(bag.data, data.test, type="class")
Prediction error of bagging:
mean(bag.predict != hd.test)
## [1] 0.1854305
set.seed(5)
p = round(sqrt(p))
rf.data = randomForest(data$hd ~., data ,subset = train, mtry = p, importance = TRUE)
bag.predict2 = predict(rf.data, data.test, type="class")
Prediction error of random forest:
mean(bag.predict2 != hd.test)
## [1] 0.1655629
We want to our “best” value to be the value that produces the lowest error rate. Ideally we would use cross validation to find our “best” value, but that is operation intestive. We can also find the “best” value using “Cost complexity pruning”.
set.seed(6)
n = 500
p=13
bag.data = randomForest(data$hd ~., data, subset = train, ntree = n, mtry = p, importance = TRUE)
p = round(sqrt(p))
rf.data = randomForest(data$hd ~., data ,subset = train, mtry = p, importance = TRUE)
The out of bound rates for the nth tree of the bagging and random forest models respectively are.
bag.data$err.rate[n,1]
## OOB
## 0.2171053
rf.data$err.rate[n,1]
## OOB
## 0.2039474
set.seed(7)
model = randomForest(data$hd ~., data, mtry = 13, proximity=TRUE)
#ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + geom_line(aes(color=Type)) + ggtitle("Bagging")
oob.error.data <- data.frame(
Trees=rep(1:nrow(model$err.rate), times=3),
Type=rep(c("OOB", "No", "Yes"), each=nrow(model$err.rate)),
Error=c(model$err.rate[,"OOB"],
model$err.rate[,"Unhealthy"],
model$err.rate[,"Healthy"]))
#posting bagging errors through attatched image due to knitting errors
model = randomForest(data$hd ~., data, mtry = 4, proximity=TRUE)
ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + geom_line(aes(color=Type)) + ggtitle("Random Forest")
oob.error.data <- data.frame(
Trees=rep(1:nrow(model$err.rate), times=3),
Type=rep(c("OOB", "No", "Yes"), each=nrow(model$err.rate)),
Error=c(model$err.rate[,"OOB"],
model$err.rate[,"Unhealthy"],
model$err.rate[,"Healthy"]))
varImpPlot(bag.data)
varImpPlot(rf.data)
ca, cp, and thal are the most important variables in either case.
set.seed(8)
model = randomForest(data$hd ~., data, mtry = 13, proximity=TRUE)
distance.matrix <- dist(1-model$proximity)
mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE)
mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1)
mds.values <- mds.stuff$points
mds.data <- data.frame(Sample=rownames(mds.values),
X=mds.values[,1],
Y=mds.values[,2],
Status=data$hd)
ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) +
geom_text(aes(color=Status)) +
theme_bw() +
xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) +
ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) +
ggtitle("MDS plot using (1 - Random Forest Proximities)")
Our interpretation is that for any subjects with the properties \[\textrm{MDS1} \notin \{-1.25, 1\} \cap \ \textrm{MDS2} > -0.75\] We can predict the subject’s health with high confidence.
#run adaboost
#training.data = data[train,]
#test_adaboost <- adaboost(data$hd~., training.data, 13)
#pred <- predict( test_adaboost, newdata = data.test)
#print(paste("Adaboost Error on Carseats:",pred$error))
#print(table(pred$class,testing_high))
The “adaboost” function does not run and I am unable to resolve the problem