Number 11
Part A
library(ISLR)
library(MASS)
library(class)
#Part A
auto1 <- Auto
median(auto1$mpg)
## [1] 22.75
for (i in 1:nrow(auto1)) {
  if (auto1$mpg[i] > 22.75) {
    auto1$mpg01[i] <- 1
  } else {
    auto1$mpg01[i] <- 0
  }
}
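The loop above could also be written as a single vectorized statement; the line below is only a sketch of that alternative, not part of the original output.
# Vectorized alternative: 1 if mpg is above its median, 0 otherwise
auto1$mpg01 <- as.numeric(auto1$mpg > median(auto1$mpg))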
Part B
pairs(auto1)

boxplot(mpg~displacement, data = auto1)

boxplot(mpg~horsepower, data = auto1)

boxplot(mpg~weight, data = auto1)

Visual representations involving a binary variable are rarely informative and are often difficult to interpret, and that is the case here. However, the correlations between mpg and the other variables show a modest positive correlation with year, which likely reflects tightening emission standards over time, and negative correlations with displacement, horsepower, weight, cylinders, and origin, all of which are reasonable to expect. Those variables should therefore be useful for predicting mpg01.
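Those correlations can be pulled out directly; the snippet below is a sketch that drops the non-numeric name column before calling cor().
# Correlation of every numeric variable with mpg01
cor(auto1[, sapply(auto1, is.numeric)])[, "mpg01"]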
Part C
#Create Training and Test set with 70:30 Ratio
set.seed(1)
train <- sample(1:length(auto1$mpg), length(auto1$mpg)*.7, rep=FALSE)
test <- -train
training_data <- auto1[train,]
testing_data <- auto1[test,]
Part D
#Part D
lda.1 <- lda(mpg01~ year + origin + horsepower + displacement + cylinders + weight, data = training_data )
lda.predict <- predict(lda.1, newdata = testing_data[1:8])
lda.predict.y <- lda.predict$class
mean(lda.predict.y != testing_data$mpg01)
## [1] 0.04237288
This LDA model gives a test error rate of just 4.2%, which is quite good.
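To see where those few misclassifications occur, a confusion matrix could be tabulated; the line below is a sketch and was not part of the original output.
# Cross-tabulate LDA predictions against the true mpg01 labels
table(Predicted = lda.predict.y, Actual = testing_data$mpg01)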
Part E
#Part E
qda.1 <- qda(mpg01~ year + origin + horsepower + displacement + cylinders + weight, data = training_data )
qda.predict <- predict(qda.1, newdata = testing_data[1:8])
qda.predict.y <- qda.predict$class
mean(qda.predict.y != testing_data$mpg01)
## [1] 0.05932203
This QDA model gives a 5.9% test error rate, which is also good but not as good as the LDA.
Part F
#Part F
log.1 <- glm(mpg01~ year + origin + horsepower + displacement + cylinders + weight, data = training_data, family ="binomial" )
log.predict <- predict(log.1, newdata = testing_data[1:8], type = "response")
log_pred_y = rep(0, length(testing_data$mpg))
log_pred_y[log.predict > 0.5] <- 1
mean(log_pred_y!= testing_data$mpg01)
## [1] 0.05932203
This logistic regression gives a 5.9% test error rate, matching the QDA model.
Part G
#Part G
data = as.data.frame(scale(auto1[,-c(8,9,10)]))
#Split Data
set.seed(1)
train <- sample(1:length(auto1$mpg), length(auto1$mpg)*.7, rep=FALSE)
test <- -train
#Set up Training and Test
training_data <- data[train,-c(1,6)]
training_data$origin <- auto1$origin[train]
training_y <- auto1$mpg01[train]
testing_data <- data[test,-c(1,6)]
testing_data$origin <- auto1$origin[test]
testing_y <- auto1$mpg01[test]
#Run models in loop to determine best K value
knn_pred_y = NULL
error_rate = NULL
for(i in 1:270){
  set.seed(1)
  knn_pred_y = knn(training_data, testing_data, training_y, k = i)
  error_rate[i] = mean(testing_y != knn_pred_y)
}
#find the minimum error rate
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.03389831
#get the index of the lowest error rate
K = which(error_rate == min_error_rate)
print(K)
## [1] 12
KNN gives a minimum error rate of 3.4% with K = 12.
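The same error-rate-versus-K plot produced for Number 13 below could also be drawn here; the call is a sketch using ggplot2's qplot in the same style as that section.
library(ggplot2)
qplot(1:270, error_rate, xlab = "K",
      ylab = "Error Rate",
      geom = c("point", "line"))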
Number 13
#Load data and set up new median variable
boston <- Boston
median(boston$crim)
## [1] 0.25651
for (i in 1:nrow(boston)) {
  if (boston$crim[i] > median(boston$crim)) {
    boston$crim01[i] <- 1
  } else {
    boston$crim01[i] <- 0
  }
}
#Check Correlations
cor(boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128
## ptratio black lstat medv crim01
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046 0.40939545
## zn -0.3916785 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252 0.60326017
## chas -0.1215152 0.04878848 -0.0539293 0.1752602 0.07009677
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208 0.72323480
## rm -0.3555015 0.12806864 -0.6138083 0.6953599 -0.15637178
## age 0.2615150 -0.27353398 0.6023385 -0.3769546 0.61393992
## dis -0.2324705 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262 0.61978625
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867 0.25356836
## black -0.1773833 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627 0.45326273
## medv -0.5077867 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 0.2535684 -0.35121093 0.4532627 -0.2630167 1.00000000
#Create Training and Test sets
set.seed(11)
train <- sample(1:length(boston$crim), length(boston$crim)*.7, rep=FALSE)
test <- -train
training_data <- boston[train,]
testing_data <- boston[test,]
#LDA Model
lda.1 <- lda(crim01~ indus + nox + age + dis + rad + tax, data = training_data )
lda.predict <- predict(lda.1, newdata = testing_data[2:14])
lda.predict.y <- lda.predict$class
mean(lda.predict.y != testing_data$crim01)
## [1] 0.1052632
#Logistic Regression Model
log.1 <- glm(crim01~ indus + nox + age + dis + rad + tax, data = training_data, family ="binomial" )
log.predict <- predict(log.1, newdata = testing_data[2:14], type = "response")
log_pred_y = rep(0, length(testing_data$crim))
log_pred_y[log.predict > 0.5] <- 1
mean(log_pred_y!= testing_data$crim01)
## [1] 0.125
#KNN Model
data = as.data.frame(scale(boston[,-c(15)]))
set.seed(11)
train <- sample(1:length(boston$crim), length(boston$crim)*.7, rep=FALSE)
test <- -train
training_data <- data[train,-c(1,2,4,6,11,12,13,14)]
training_y <- boston$crim01[train]
testing_data <- data[test,-c(1,2,4,6,11,12,13,14)]
testing_y <- boston$crim01[test]
knn_pred_y = NULL
error_rate = NULL
for(i in 1:30){
  set.seed(1)
  knn_pred_y = knn(training_data, testing_data, training_y, k = i)
  error_rate[i] = mean(testing_y != knn_pred_y)
}
### find the minimum error rate
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.05263158
### get the index of that error rate, which is the k
K = which(error_rate == min_error_rate)
print(K)
## [1] 3
#Show plot of Error Rates vs. K Value
library(ggplot2)
qplot(1:30, error_rate, xlab = "K",
      ylab = "Error Rate",
      geom = c("point", "line"))

The initial models for predicting whether the crime rate is above its median perform quite well. They use the variables whose absolute Pearson correlation with crim01 (the above-median indicator) is at least 0.6: indus, nox, age, dis, rad, and tax. With those variables the test error rates are 10.5% for LDA, 12.5% for logistic regression, and 5.3% for KNN (with K = 3). The plot of error rate versus K also clearly shows that K = 3 is the optimal number of nearest neighbors in this case.
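That variable screening can also be done programmatically; the snippet below is a sketch that picks out every predictor whose absolute correlation with crim01 is at least 0.6.
# Predictors with |correlation| >= 0.6 against crim01
cors <- cor(boston)[, "crim01"]
names(cors)[abs(cors) >= 0.6 & names(cors) != "crim01"]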
Different Variables
set.seed(11)
train <- sample(1:length(boston$crim), length(boston$crim)*.7, rep=FALSE)
test <- -train
training_data <- boston[train,]
testing_data <- boston[test,]
for (i in 1:nrow(boston)) {
  if (boston$crim[i] > median(boston$crim)) {
    boston$crim01[i] <- 1
  } else {
    boston$crim01[i] <- 0
  }
}
#LDA Model
lda.1 <- lda(crim01~ zn + nox + age + rad + tax + black + medv, data = training_data )
lda.predict <- predict(lda.1, newdata = testing_data[2:14])
lda.predict.y <- lda.predict$class
mean(lda.predict.y != testing_data$crim01)
## [1] 0.1184211
#Logistic Regression Model
log.1 <- glm(crim01~ zn + nox + age + rad + tax + black + medv, data = training_data, family ="binomial" )
log.predict <- predict(log.1, newdata = testing_data[2:14], type = "response")
log_pred_y = rep(0, length(testing_data$crim))
log_pred_y[log.predict > 0.5] <- 1
mean(log_pred_y!= testing_data$crim01)
## [1] 0.1315789
#KNN Model
data = as.data.frame(scale(boston[,-c(15)]))
set.seed(11)
train <- sample(1:length(boston$crim), length(boston$crim)*.7, rep=FALSE)
test <- -train
training_data <- data[train,-c(1,3,4,6,8,11,13)]
training_y <- boston$crim01[train]
testing_data <- data[test,-c(1,3,4,6,8,11,13)]
testing_y <- boston$crim01[test]
knn_pred_y = NULL
error_rate = NULL
for(i in 1:30){
  set.seed(1)
  knn_pred_y = knn(training_data, testing_data, training_y, k = i)
  error_rate[i] = mean(testing_y != knn_pred_y)
}
### find the minimum error rate
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.06578947
### get the index of that error rate, which is the k
K = which(error_rate == min_error_rate)
print(K)
## [1] 1 3
Using a slightly different set of variables (zn, nox, age, rad, tax, black, and medv) gives similar but slightly worse results: test error rates of 11.8% for LDA, 13.2% for logistic regression, and 6.6% for KNN (with K = 1 or 3). Overall, KNN gives the best results for this problem, and simply using every variable with an absolute Pearson correlation of at least 0.6 with crim01 produced both the better variable set and the best single model: KNN with K = 3 and a 5.3% test error rate.
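For reference, the test error rates reported above for the two variable sets can be collected side by side; the table below is a sketch built only from those printed numbers.
# Comparison of test error rates for the two variable sets
data.frame(Model = c("LDA", "Logistic Regression", "KNN (best K)"),
           CorrelationSet = c(0.105, 0.125, 0.053),
           AlternateSet = c(0.118, 0.132, 0.066))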