Use knn.reg in the FNN package to construct predictive models for brain weight using KNN regression for \(K=1,\ldots,15\) on a training set (use the exact same training/validation split – same seed and same split percentages – you used for linear regression). Hint: use a for loop to iterate over \(K\).
library(FNN)
brain <- read.table("/Users/ninasong/Desktop/PM591/assignment_2/brain.txt", header = TRUE)
set.seed(2018)
# 70/30 training/validation split (same seed and percentages as the linear regression homework)
n <- sample(1:nrow(brain), 0.7*nrow(brain), replace = FALSE)
train <- brain[n, ]
train_x <- train[, -4]        # predictors (Brain.weight is column 4)
train_y <- train$Brain.weight
test <- brain[-n, ]           # validation set
test_x <- test[, -4]
test_y <- test$Brain.weight
# Validation RMSE for each K = 1, ..., 15
k <- 1:15
rmse <- c()
for(i in k){
  model <- knn.reg(train_x, test_x, train_y, k = i)
  pred <- model$pred
  rmse <- c(rmse, sqrt(mean((test_y - pred)^2)))
}
a. Plot the validation RMSE as a function of \(K\) and select the best \(K\).
plot(k, rmse, type = "l", xlab = "K", ylab = "Validation RMSE")
The best \(K\) is 5, where the validation RMSE reaches its minimum.
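This can also be read off programmatically from the k and rmse vectors computed above:
k[which.min(rmse)]  # value of K minimizing the validation RMSE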
c. Using the validation RMSE, compare to the best linear regression model from Homework 1. Is there an improvement in prediction performance? Interpret your results based on the bias-variance tradeoff.
# With no test set supplied, knn.reg performs leave-one-out prediction on the
# training data, so this is the leave-one-out RMSE at K = 5
model <- knn.reg(train_x, y = train_y, k = 5)
sqrt(mean(model$residuals^2))
## [1] 76.2342
KNN regression with \(K = 5\) achieves a lower validation RMSE than the best linear regression model, so its predictive performance is better. In terms of the bias-variance tradeoff, KNN is the more flexible method: it makes no linearity assumption, giving it lower bias but higher variance than linear regression. The improvement suggests the relationship between the predictors and brain weight is not fully captured by a linear fit, and averaging over \(K = 5\) neighbors keeps the extra variance small enough that the bias reduction wins out.
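As a side-by-side check, the Homework 1 linear model can be refit on the same split and scored on the same validation set. A minimal sketch, where Head.size is a placeholder for whichever predictors the best Homework 1 model actually used:
lmfit <- lm(Brain.weight ~ Head.size, data = train)   # placeholder formula
lm_pred <- predict(lmfit, newdata = test)
c(linear = sqrt(mean((test_y - lm_pred)^2)), knn = min(rmse))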
Dataset notes: According to the Mayo Clinic, “ischemic stroke occurs when a blood clot blocks or narrows an artery leading to the brain. A blood clot often forms in arteries damaged by the buildup of plaques (atherosclerosis). It can occur in the carotid artery of the neck as well as other arteries. This is the most common type of stroke.” (https://www.mayoclinic.org/diseases-conditions/stroke/symptoms-causes/syc-20350113)
Note: Because of the moderate sample size we will not have a separate test set – we will learn later in the course about cross-validation, which will allow us to split the data into training and testing only and still perform model selection.
Use boxplot(your_predictor ~ Stroke, data=stroke_train) to get a boxplot for a quantitative predictor, and plot(Stroke, your_predictor, data=stroke_train) to get a mosaic plot for a categorical predictor. Visually determine the 3 most predictive imaging features, i.e. the imaging features that best separate the Stroke = 'Yes' vs. Stroke = 'No' classes. (This is an informal procedure since a visual assessment is inherently subjective; in a future class we will learn how to do feature selection in a systematic way.)
# Code for reading in stroke data and converting categorical variables to factors
# To run chunk set options above to eval=TRUE
setwd("/Users/ninasong/Desktop/PM591/assignment_2/") #replace "/Users/yourpath/" with the path to the folder where 'stroke.csv' file is stored
stroke = read.csv("stroke.csv")
stroke$Stroke <- factor(stroke$Stroke, levels=c('N', 'Y'), labels=c("No", "Yes"))
stroke$NASCET <- factor(stroke$NASCET, levels=0:1, labels=c("No", "Yes"))
stroke$sex <- factor(stroke$sex, levels=0:1, labels=c("Female", "Male"))
stroke$SmokingHistory <- factor(stroke$SmokingHistory, levels=0:1, labels=c("No", "Yes"))
stroke$AtrialFibrillation <- factor(stroke$AtrialFibrillation, levels=0:1, labels=c("No", "Yes"))
stroke$CoronaryArteryDisease <- factor(stroke$CoronaryArteryDisease, levels=0:1, labels=c("No", "Yes"))
stroke$DiabetesHistory <- factor(stroke$DiabetesHistory, levels=0:1, labels=c("No", "Yes"))
stroke$HypercholesterolemiaHistory <- factor(stroke$HypercholesterolemiaHistory, levels=0:1, labels=c("No", "Yes"))
stroke$HypertensionHistory <- factor(stroke$HypertensionHistory, levels=0:1, labels=c("No", "Yes"))
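# (Added sanity check, not part of the original assignment code): confirm the
# conversions took effect and each factor has the intended two levels
sapply(stroke[, c("Stroke", "NASCET", "sex", "SmokingHistory")], levels)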
stroke <- stroke[,-1]  # drop the first column, which is not used as a predictor
set.seed(303)
n = nrow(stroke)
# Stratified split: sample 70% of cases and 70% of controls separately so the
# training and validation sets preserve the Stroke class balance
positives = (1:n)[stroke$Stroke=='Yes']
negatives = (1:n)[stroke$Stroke=='No']
positives_train = sample(positives, floor(0.7*length(positives)))
positives_val = setdiff(positives, positives_train)
negatives_train = sample(negatives, floor(0.7*length(negatives)))
negatives_val = setdiff(negatives, negatives_train)
stroke_train = stroke[c(positives_train, negatives_train), ]
stroke_val = stroke[c(positives_val, negatives_val), ]
ntrain = nrow(stroke_train); nval=nrow(stroke_val)
table(stroke_train$Stroke)
##
## No Yes
## 43 44
table(stroke_val$Stroke)
##
## No Yes
## 19 20
boxplot(MATXVol ~ Stroke, data=stroke_train)
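# A sketch for screening many features at once, assuming the numeric columns of
# stroke_train are the quantitative imaging measurements (adjust if they are not):
num_cols <- names(stroke_train)[sapply(stroke_train, is.numeric)]
par(mfrow = c(2, 3))
for (v in num_cols) {
  boxplot(stroke_train[[v]] ~ stroke_train$Stroke, main = v, xlab = "Stroke")
}
par(mfrow = c(1, 1))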
# Convert the remaining categorical columns (columns 23-29) to factors in both sets
for(i in 23:29){
  stroke_train[,i] <- factor(stroke_train[,i])
  stroke_val[,i] <- factor(stroke_val[,i])
}
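# (Added check): converting factors separately on the training and validation
# subsets can silently produce mismatched levels if a category is absent from
# one subset; this verifies that the levels agree
for (i in 23:29) {
  stopifnot(identical(levels(stroke_train[, i]), levels(stroke_val[, i])))
}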
# Mosaic plots of the categorical predictors against Stroke
mosaicplot(Stroke ~ sex, data = stroke_train)
mosaicplot(Stroke ~ SmokingHistory, data = stroke_train)
library(MASS)
# LDA models of increasing complexity: demographics/smoking, all clinical
# history variables, then adding the selected imaging features
model1 <- lda(Stroke ~ age + sex + SmokingHistory, data = stroke_train)
model2 <- lda(Stroke ~ age + sex + SmokingHistory + AtrialFibrillation + CoronaryArteryDisease + DiabetesHistory + HypercholesterolemiaHistory + HypertensionHistory, data = stroke_train)
model3 <- lda(Stroke ~ age + sex + SmokingHistory + AtrialFibrillation + CoronaryArteryDisease + DiabetesHistory + HypercholesterolemiaHistory + HypertensionHistory + MATXVol, data = stroke_train)
model4 <- lda(Stroke ~ age + sex + SmokingHistory + AtrialFibrillation + CoronaryArteryDisease + DiabetesHistory + HypercholesterolemiaHistory + HypertensionHistory + MATXVol + WallVol + MaxDilationByArea, data = stroke_train)
Write a function classificationError to compute the overall misclassification error, specificity, and sensitivity of a classifier. The function should take a confusion matrix as its input (which you can create using table as shown in the lecture) and return a vector with the overall misclassification error, specificity, and sensitivity. (Hint: separately compute the three quantities error, spec, and sens inside the body of the function and then put them together in a vector using c(error=error, sensitivity=sens, specificity=spec) in the last line of the body of the function before the closing } – the last line is by default what a function returns. The returned object can be any R object including a single number, a vector, a data.frame, or even another function!)
classificationError <- function(x){
# x is assumed to be table(truth, predicted) with levels ordered (No, Yes),
# so x[1,1] counts true negatives and x[2,2] true positives
TN <- x[1,1]
TP <- x[2,2]
FP <- x[1,2]
FN <- x[2,1]
n <- sum(x)
error <- (FP+FN)/n
sens <- TP/(TP+FN)
spec <- TN/(FP+TN)
return(c(error=error, sensitivity=sens, specificity=spec))
}
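# Quick sanity check on a made-up confusion matrix (rows = truth, columns = predicted):
cm <- matrix(c(40, 5, 3, 39), nrow = 2, byrow = TRUE,
             dimnames = list(truth = c("No", "Yes"), predicted = c("No", "Yes")))
classificationError(cm)  # expect error (5+3)/87, sensitivity 39/42, specificity 40/45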
#model1
pred1_t <- predict(model1)
pred1_v <- predict(model1,stroke_val)
m1_t <- classificationError(table(stroke_train$Stroke,pred1_t$class))
m1_v <- classificationError(table(stroke_val$Stroke,pred1_v$class))
#model2
pred2_t <- predict(model2)
pred2_v <- predict(model2,stroke_val)
m2_t <- classificationError(table(stroke_train$Stroke,pred2_t$class))
m2_v <- classificationError(table(stroke_val$Stroke,pred2_v$class))
#model3
pred3_t <- predict(model3)
pred3_v <- predict(model3,stroke_val)
m3_t <- classificationError(table(stroke_train$Stroke,pred3_t$class))
m3_v <- classificationError(table(stroke_val$Stroke,pred3_v$class))
#model4
pred4_t <- predict(model4)
pred4_v <- predict(model4,stroke_val)
m4_t <- classificationError(table(stroke_train$Stroke,pred4_t$class))
m4_v <- classificationError(table(stroke_val$Stroke,pred4_v$class))
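# The four per-model blocks above could equivalently be collapsed into a loop
# (a sketch producing the same quantities without the repetition):
models <- list(model1, model2, model3, model4)
err_list <- lapply(models, function(m) rbind(
  train = classificationError(table(stroke_train$Stroke, predict(m)$class)),
  validation = classificationError(table(stroke_val$Stroke, predict(m, stroke_val)$class))))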
# Assemble one row per model x data set (the evaluation data is the validation
# set, not a separate test set)
class_error <- as.data.frame(rbind(m1_t, m1_v, m2_t, m2_v, m3_t, m3_v, m4_t, m4_v))
colnames(class_error) <- c("error", "sensitivity", "specificity")
class_error$model <- factor(rep(1:4, each = 2))
class_error$d <- factor(rep(c("train", "validation"), 4), levels = c("train", "validation"))
library(ggplot2)
ggplot(data=class_error,aes(d,error,color=model,group=model))+geom_point()+geom_line()
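To make the selection explicit, the model with the lowest validation error can be read off class_error directly:
val_err <- subset(class_error, d == "validation")
val_err[which.min(val_err$error), ]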
The more complex the model, the smaller the training error. The validation error decreases at first and then increases with model complexity: adding predictors initially reduces bias, but the most complex models begin to overfit the training data, so the added variance eventually outweighs the bias reduction.