library(ISLR2) # Adding Packages
## Warning: package 'ISLR2' was built under R version 4.3.1
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.1
library(moments)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.1
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
library(class)
library("corrplot")
## Warning: package 'corrplot' was built under R version 4.3.1
## corrplot 0.92 loaded
13. This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
data("Weekly") #adding data
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
subset_data <- Weekly[, 2:8] #Subset Data
data_long <- gather(subset_data)
# Create side-by-side histograms of the lag variables, Volume, and Today
ggplot(data_long, aes(x = value, fill = key)) +
geom_histogram(position = "dodge", bins = 20) +
facet_wrap(~ key, scales = "free_x") +
labs(title = "Lag1-Lag5, Volume, and Today", x = "Value") +
theme_minimal()
kurtosis_lag<-kurtosis(Weekly$Lag1) # Sample kurtosis values
print(kurtosis_lag)
## [1] 5.668866
kurtosis_lag2<-kurtosis(Weekly$Lag2)
print(kurtosis_lag2)
## [1] 5.665817
week_sub<-subset(Weekly, select = -c(Direction))
week_sd<-sapply(week_sub, sd)
print(week_sd)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
## 6.033182 2.357013 2.357254 2.360502 2.360279 2.361285 1.686636 2.356927
plot(Weekly[-9])
cor(Weekly[-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
plot(Weekly$Volume, ylab = "Shares")
#plotting response variable
barplot(table(Weekly$Direction), main = "Direction: Up and Down", xlab = "Direction", col = "blue", border = "white")
(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary() function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
# Log Regression
week.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(week.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
#Creating Confusion Matrix
probs = predict(week.fit, type = "response")
preds_prob=rep("Down", 1089)
preds_prob[probs > 0.5] = "Up"
table(preds_prob, Weekly$Direction)
##
## preds_prob Down Up
## Down 54 48
## Up 430 557
mean(preds_prob == Weekly$Direction)
## [1] 0.5610652
The overall correct-prediction rate, (TP + TN) / total = (557 + 54) / 1089, is roughly 56%, only slightly better than chance. From the summary above, Lag2 is the only predictor that appears statistically significant (p ≈ 0.03); the confusion matrix also shows that the model predicts "Up" far more often than "Down".
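As a check on the arithmetic, the same rates can be computed directly from the confusion matrix (a sketch; object names follow the code above).
conf <- table(preds_prob, Weekly$Direction)
accuracy    <- sum(diag(conf)) / sum(conf)                 # (TP + TN) / total
sensitivity <- conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of true Up weeks called Up
specificity <- conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of true Down weeks called Down
c(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity)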
train<-subset(Weekly, Year < 2009)
test<- subset(Weekly, Year > 2008)
lag2_reg = glm(Direction ~ Lag2, data= train, family = "binomial")
print(lag2_reg)
##
## Call: glm(formula = Direction ~ Lag2, family = "binomial", data = train)
##
## Coefficients:
## (Intercept) Lag2
## 0.2033 0.0581
##
## Degrees of Freedom: 984 Total (i.e. Null); 983 Residual
## Null Deviance: 1355
## Residual Deviance: 1351 AIC: 1355
lag2_pred = predict(lag2_reg, type="response", newdata = test)
lag2_dir = Weekly$Direction[Weekly$Year>2008]
length(lag2_dir)
## [1] 104
lag2_prob = rep("Down", 104)
lag2_prob[lag2_pred > 0.5] = "Up"
table(lag2_prob, test$Direction)
##
## lag2_prob Down Up
## Down 9 5
## Up 34 56
mean(lag2_prob == test$Direction)
## [1] 0.625
The test error rate for this model is lower than for the full-predictor model fit on all of the data: it correctly predicts the market’s direction 62.5% of the time on the 2009-2010 hold-out weeks. It correctly predicts Down 9 times and Up 56 times; it incorrectly predicts Up 34 times when the market actually went down, and Down 5 times when it actually went up. In other words, it identifies only 9/43 ≈ 21% of the true Down weeks but 56/61 ≈ 92% of the true Up weeks. Looked at by prediction instead of by outcome, 56/90 ≈ 62% of the weeks it labels Up really are Up, and 9/14 ≈ 64% of the weeks it labels Down really are Down.
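Since caret is already loaded, the same sensitivity and specificity figures can be obtained in one call; this is just a sketch, treating "Up" as the positive class.
confusionMatrix(factor(lag2_prob, levels = levels(test$Direction)),
                test$Direction, positive = "Up")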
lag2.fit <- lda(Direction ~ Lag2, data = train)
lag2.fit
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lag2.pred<-predict(lag2.fit, test)
names(lag2.pred)
## [1] "class" "posterior" "x"
lda.class <-lag2.pred$class
table(lda.class, test$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class == test$Direction)
## [1] 0.625
qda.fit<-qda(Direction ~ Lag2 , data = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class<-predict(qda.fit, test)$class
table(qda.class, test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == test$Direction)
## [1] 0.5865385
nb.fit <-naiveBayes(Direction ~ Lag2 , data = train)
nb.fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
nb.class<-predict(nb.fit, test)
table(nb.class, test$Direction)
##
## nb.class Down Up
## Down 0 0
## Up 43 61
mean(nb.class == test$Direction)
## [1] 0.5865385
nb.preds<-predict(nb.fit, Weekly, type = "raw")
nb.preds[1:5, ]
## Down Up
## [1,] 0.4342284 0.5657716
## [2,] 0.4492059 0.5507941
## [3,] 0.4657684 0.5342316
## [4,] 0.4810977 0.5189023
## [5,] 0.3835799 0.6164201
# Splitting data for KNN. Note that cbind() coerces the Direction factor to
# its underlying numeric codes (1 = Down, 2 = Up), which is why the
# confusion matrix below is labelled 1 and 2 rather than Down and Up.
train.X <- cbind(train$Lag2)
test.X <- cbind(test$Lag2)
train.Y <- cbind(train$Direction)
test.Y <- cbind(test$Direction)
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
table(knn.pred, test$Direction)
##
## knn.pred Down Up
## 1 21 29
## 2 22 32
mean(knn.pred == test.Y)
## [1] 0.5096154
Comparing the test-set accuracies of the models above, logistic regression and LDA tie for the best performance (62.5%), followed by QDA and naive Bayes (58.7%), with KNN (k = 1) last at about 51%.
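As a small sketch, the held-out accuracies computed above can be gathered into one named vector for easier comparison (object names follow the code in this section).
weekly_acc <- c(
  logistic = mean(lag2_prob == test$Direction),
  lda      = mean(lda.class == test$Direction),
  qda      = mean(qda.class == test$Direction),
  nb       = mean(nb.class == test$Direction),
  knn1     = mean(knn.pred == test.Y)
)
sort(weekly_acc, decreasing = TRUE)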
14. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
median_mpg<-median(Auto$mpg)
Auto$mpg01<-ifelse(Auto$mpg > median_mpg, 1, 0)
auto01<-subset(Auto, select = -mpg)
corrplot(cor(auto01[,-8]), method= "number")
The correlation plot shows that mpg01 has clear negative correlations with several predictors, the most pronounced being cylinders, displacement, horsepower, and weight. I’ll move forward with these variables.
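A quick numeric check of the same screening (a sketch, dropping the non-numeric name column just as the corrplot call above does):
sort(cor(auto01[, -8])[, "mpg01"])  # correlations of each predictor with mpg01, ranked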
par(mfrow=c(1,2))
# cylinder
boxplot(cylinders ~ mpg01, data = auto01, col = c("lavender"), main = "mpg vs cyl", xlab = "mpg", ylab = "cyl")
# weight
boxplot(weight ~ mpg01, data = auto01, col = c("lightgreen"), main = "mpg vs weight", xlab = "mpg", ylab = "weight")
# displacement
boxplot(displacement ~ mpg01, data = auto01, col = c("orange"), main = "mpg vs displacement", xlab = "mpg", ylab = "displacement")
# horsepower
boxplot(horsepower ~ mpg01, data = auto01, col = c("magenta"), main = "mpg vs horsepower", xlab = "mpg", ylab = "horsepower")
# index
set.seed(42)
ind<-sample(1:nrow(auto01), size = 0.7*nrow(auto01))
# splitting data
train<-auto01[ind, ]
test<-auto01[-ind, ]
# LDA fit on the full auto01 data; the train/test split above is used only
# for the confusion matrix and error rate below
auto_lda<-lda(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01)
auto_lda
## Call:
## lda(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## cylinders horsepower displacement weight
## 0 6.765306 130.11224 273.1582 3620.403
## 1 4.178571 78.82653 115.6658 2334.765
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.4708126926
## horsepower 0.0044241441
## displacement -0.0014339977
## weight -0.0009846242
predict_auto<-predict(auto_lda, test)
table(predict_auto$class, test$mpg01)
##
## 0 1
## 0 42 4
## 1 5 67
mean(predict_auto$class != test$mpg01)
## [1] 0.07627119
The group means show that low-mpg cars (mpg01 = 0) have higher values on every predictor, suggesting that larger engines and heavier cars are associated with below-median mpg. The test error rate is about 7.6%, which is quite good.
auto_qda<-qda(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01)
auto_qda
## Call:
## qda(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## cylinders horsepower displacement weight
## 0 6.765306 130.11224 273.1582 3620.403
## 1 4.178571 78.82653 115.6658 2334.765
predict_qda<-predict(auto_qda, test)$class
table(predict_qda, test$mpg01)
##
## predict_qda 0 1
## 0 43 6
## 1 4 65
mean(predict_qda != test$mpg01)
## [1] 0.08474576
qda_coef<-coef(auto_qda) # qda objects have no coefficients component, so this returns NULL
The group means show the same trend as in the LDA model, but the test error rate is slightly higher at about 8.5%.
auto_reg = glm(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01, family = "binomial")
summary(auto_reg)
##
## Call:
## glm(formula = mpg01 ~ cylinders + horsepower + displacement +
## weight, family = "binomial", data = auto01)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.7966002 1.7091082 6.902 5.12e-12 ***
## cylinders -0.0129303 0.3457347 -0.037 0.97017
## horsepower -0.0421321 0.0139763 -3.015 0.00257 **
## displacement -0.0129820 0.0082203 -1.579 0.11428
## weight -0.0019458 0.0006918 -2.812 0.00492 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 207.27 on 387 degrees of freedom
## AIC: 217.27
##
## Number of Fisher Scoring iterations: 7
auto_pred = predict(auto_reg, type="response", newdata = test)
pred<-rep(0, length(auto_pred))
pred[auto_pred > 0.5] <- 1
table(pred, test$mpg01)
##
## pred 0 1
## 0 43 5
## 1 4 66
mean(pred != test$mpg01)
## [1] 0.07627119
Unlike the LDA discriminant coefficients, the logistic regression assigns negative coefficients to all four predictors, consistent with higher values lowering the odds of above-median mpg; only horsepower and weight are statistically significant. The test error rate is about 7.6%.
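One way to make the signs more interpretable, as a sketch, is to exponentiate the coefficients into odds ratios: values below 1 mean the odds of above-median mpg fall as that predictor increases.
exp(coef(auto_reg))  # odds ratios for the logistic regression above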
nb_auto<-naiveBayes(mpg01 ~ cylinders + horsepower + displacement + weight, data = auto01)
nb_auto
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5 0.5
##
## Conditional probabilities:
## cylinders
## Y [,1] [,2]
## 0 6.765306 1.4200110
## 1 4.178571 0.6746319
##
## horsepower
## Y [,1] [,2]
## 0 130.11224 37.35564
## 1 78.82653 15.91969
##
## displacement
## Y [,1] [,2]
## 0 273.1582 89.52399
## 1 115.6658 38.42951
##
## weight
## Y [,1] [,2]
## 0 3620.403 676.9322
## 1 2334.765 397.1924
nba_pred<-predict(nb_auto, test)
table(nba_pred, test$mpg01)
##
## nba_pred 0 1
## 0 43 5
## 1 4 66
mean(nba_pred != test$mpg01)
## [1] 0.07627119
Naive Bayes follows the same trends as the previous models, with a test error rate of about 7.6%.
# Splitting Data for KNN
train.auto <-cbind(train$cylinders, train$displacement, train$horsepower, train$weight)
test.auto <- cbind(test$cylinders, test$displacement, test$horsepower, test$weight)
train.Y_auto = cbind(train$mpg01)
test.Y_auto = cbind(test$mpg01)
knn_auto = knn(train.auto, test.auto, train.Y_auto, k=1)
table(knn_auto, test.Y_auto)
## test.Y_auto
## knn_auto 0 1
## 0 42 9
## 1 5 62
mean(knn_auto != test.Y_auto)
## [1] 0.1186441
knn_5 = knn(train.auto, test.auto, train.Y_auto, k=5)
table(knn_5, test.Y_auto)
## test.Y_auto
## knn_5 0 1
## 0 42 10
## 1 5 61
mean(knn_5 != test.Y_auto)
## [1] 0.1271186
knn_10 = knn(train.auto, test.auto, train.Y_auto, k=10)
table(knn_10, test.Y_auto)
## test.Y_auto
## knn_10 0 1
## 0 42 11
## 1 5 60
mean(knn_10 != test.Y_auto)
## [1] 0.1355932
Among the KNN fits, k = 1 performs best (error ≈ 11.9%); increasing k to 5 or 10 raises the error rate. All of the KNN models perform worse here than LDA, QDA, logistic regression, and naive Bayes.
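To confirm the pattern, here is a sketch sweeping over a few values of k (a seed is set because knn() breaks distance ties at random; object names follow the code above).
set.seed(42)
k_values <- c(1, 3, 5, 10, 20)
k_errors <- sapply(k_values, function(k) {
  pred_k <- knn(train.auto, test.auto, train.Y_auto, k = k)
  mean(pred_k != test.Y_auto)   # test error rate for this k
})
names(k_errors) <- k_values
k_errors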
15. This problem involves writing functions.
power<-(2^3)
print(power)
## [1] 8
power2<-function(x, a) {x^a}
power2(3, 8)
## [1] 6561
power2(10, 3)
## [1] 1000
power2(81, 7)
## [1] 2.287679e+13
power2(131, 3)
## [1] 2248091
power3<-function(x, a) return({x^a})
result = power3(5,5)
result
## [1] 3125
x<-1:10
plot(x, power3(x, 2), log = "x", xlab = "Log of x", ylab = "x^2", main = "Log of x versus x^2")
(f) Create a function, PlotPower(), that allows you to create a plot of x against x^a for a fixed a and for a range of values of x. For instance, if you call PlotPower(1:10, 3) then a plot should be created with an x-axis taking on values 1, 2, ..., 10, and a y-axis taking on values 1^3, 2^3, ..., 10^3.
plotpower<-function(x, a) {plot(x, x^a, xlab = "x", ylab = "x^a", main = "x^a")}
plotpower(1:10, 3)
16. Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
crime_median<-median(Boston$crim)
Boston$crim<-ifelse(Boston$crim > crime_median, 1, 0)
barplot(table(Boston$crim), main = "Crime: High vs Low", xlab = "Crime", col = "red", border = "white")
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.43615103 0.60326017 0.070096774 0.72323480
## zn -0.43615103 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.60326017 -0.53382819 1.00000000 0.062938027 0.76365145
## chas 0.07009677 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.72323480 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.15637178 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.61393992 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.61634164 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.61978625 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.60874128 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.25356836 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.35121093 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45326273 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.26301673 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax ptratio
## crim -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## black lstat medv
## crim -0.35121093 0.4532627 -0.2630167
## zn 0.17552032 -0.4129946 0.3604453
## indus -0.35697654 0.6037997 -0.4837252
## chas 0.04878848 -0.0539293 0.1752602
## nox -0.38005064 0.5908789 -0.4273208
## rm 0.12806864 -0.6138083 0.6953599
## age -0.27353398 0.6023385 -0.3769546
## dis 0.29151167 -0.4969958 0.2499287
## rad -0.44441282 0.4886763 -0.3816262
## tax -0.44180801 0.5439934 -0.4685359
## ptratio -0.17738330 0.3740443 -0.5077867
## black 1.00000000 -0.3660869 0.3334608
## lstat -0.36608690 1.0000000 -0.7376627
## medv 0.33346082 -0.7376627 1.0000000
correlations<-cor(Boston$crim, Boston[2:14])
print(correlations)
## zn indus chas nox rm age dis
## [1,] -0.436151 0.6032602 0.07009677 0.7232348 -0.1563718 0.6139399 -0.6163416
## rad tax ptratio black lstat medv
## [1,] 0.6197862 0.6087413 0.2535684 -0.3512109 0.4532627 -0.2630167
The binary crime outcome is correlated with several predictors, but many of the predictors are also correlated with one another, which conflicts with the independence assumption of the naive Bayes analysis. The variables most strongly correlated with the outcome are nox (nitric oxides concentration, parts per 10 million), indus (proportion of non-retail business acres per town), age (proportion of owner-occupied units built prior to 1940), dis (weighted distances to five Boston employment centres), rad (index of accessibility to radial highways), and tax (full-value property-tax rate per $10,000). There are also some moderately correlated variables, including zn (proportion of residential land zoned for lots over 25,000 sq. ft.), black (1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town), and lstat (percent lower-status population). Both the highly and the moderately correlated variables are included in this analysis. Note that some predictors are strongly correlated with each other, such as lstat and medv (-0.74). This analysis does not transform or combine predictors, but future work could explore that within the framework of these classification models.
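As a sketch, the same screening can be done numerically by keeping the predictors whose absolute correlation with the binary crim variable exceeds an (arbitrary) threshold of 0.4:
cors <- cor(Boston$crim, Boston[, 2:14])[1, ]  # named vector of correlations with crim
cors[abs(cors) > 0.4]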
subset_bos <- Boston[c(2, 3, 5, 7, 8, 9, 10, 12, 13)] #Subset Data
data_bo <- gather(subset_bos)
ggplot(data_bo, aes(x = value, fill = key)) +
geom_histogram(position = "dodge", bins = 20) +
facet_wrap(~ key, scales = "free_x") +
labs(title = "Distributions of Selected Boston Predictors", x = "Value") +
theme_minimal()
Looking at the distributions of the predictor variables, we see that they are not Gaussian, which violates an assumption of both the LDA model and naive Bayes.
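A rough numerical check of that impression (a sketch; because e1071 masks moments here, kurtosis() reports excess kurtosis, so values near 0 for both statistics would suggest an approximately Gaussian shape):
round(rbind(skewness = sapply(subset_bos, skewness),
            kurtosis = sapply(subset_bos, kurtosis)), 2)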
# Box plots of each predictor split by the binary crime variable
predictors<-c("zn", "tax", "black", "rad", "nox", "lstat", "indus", "dis", "age")
outcome<-"crim"
colors<-c("blue", "green", "lavender", "purple", "pink", "orange", "red", "yellow", "cyan")
layout(matrix(1:9, nrow = 3))
for(i in seq_along(predictors)) {boxplot(Boston[[predictors[i]]] ~ Boston[[outcome]], col=colors[i], xlab=outcome, ylab=predictors[i], main=paste("Boxplot of", predictors[i], "by", outcome))}
layout(1) # reset the plotting layout
Consistent with the correlations above, most of the selected predictors (indus, nox, age, rad, tax, and lstat) take higher values in the above-median-crime tracts, while zn, dis, and black tend to be lower in those tracts.
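The class-wise means back up this reading of the boxplots (a sketch; predictors is the vector defined above):
aggregate(Boston[, predictors], by = list(crim = Boston$crim), FUN = mean)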
# splitting Boston Data
set.seed(42)
ind<-sample(1:nrow(Boston), size = 0.7*nrow(Boston))
# splitting data
train_b<-Boston[ind, ]
test_b<-Boston[-ind, ]
# Logistic regression fit on the full Boston data; the train/test split
# above is used only for the confusion matrix and error rate below
boston_reg = glm(crim ~ zn + tax + black + rad + nox + lstat + indus + dis + age, data = Boston, family = "binomial")
summary(boston_reg)
##
## Call:
## glm(formula = crim ~ zn + tax + black + rad + nox + lstat + indus +
## dis + age, family = "binomial", data = Boston)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.059653 4.489141 -4.468 7.88e-06 ***
## zn -0.081306 0.030119 -2.699 0.00694 **
## tax -0.007723 0.002511 -3.076 0.00210 **
## black -0.011199 0.006034 -1.856 0.06344 .
## rad 0.665127 0.133407 4.986 6.17e-07 ***
## nox 40.193351 6.570294 6.117 9.51e-10 ***
## lstat -0.017439 0.034637 -0.503 0.61464
## indus -0.040343 0.041719 -0.967 0.33354
## dis 0.432404 0.184260 2.347 0.01894 *
## age 0.015372 0.009573 1.606 0.10833
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 229.30 on 496 degrees of freedom
## AIC: 249.3
##
## Number of Fisher Scoring iterations: 9
boston_pred = predict(boston_reg, type="response", newdata = test_b)
pred<-rep(0, length(boston_pred))
pred[boston_pred > 0.5] <- 1
table(pred, test_b$crim)
##
## pred 0 1
## 0 76 6
## 1 2 68
mean(pred != test_b$crim)
## [1] 0.05263158
This model has a test error rate of about 5.3%, very good! The coefficients indicate that lstat, indus, and age are not significant predictors, and black is only marginally so (p ≈ 0.06). The model seems to fit reasonably well, judging by the AIC and the drop in residual deviance relative to the null deviance.
#LDA
boston_lda<-lda(crim ~ zn + tax + black + rad + nox + lstat + indus + dis + age, data = Boston)
boston_lda
## Call:
## lda(crim ~ zn + tax + black + rad + nox + lstat + indus + dis +
## age, data = Boston)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## zn tax black rad nox lstat indus dis
## 0 21.525692 305.7431 388.7061 4.158103 0.4709711 9.419486 7.002292 5.091596
## 1 1.201581 510.7312 324.6420 14.940711 0.6384190 15.886640 15.271265 2.498489
## age
## 0 51.31028
## 1 85.83953
##
## Coefficients of linear discriminants:
## LD1
## zn -0.0026883085
## tax -0.0014982050
## black -0.0007710848
## rad 0.0878244826
## nox 7.2508340136
## lstat -0.0212585580
## indus 0.0107470871
## dis -0.0264048093
## age 0.0128439952
predict_bos<-predict(boston_lda, test_b)
table(predict_bos$class, test_b$crim)
##
## 0 1
## 0 76 19
## 1 2 55
mean(predict_bos$class != test_b$crim)
## [1] 0.1381579
The LDA model has a higher test error rate (about 13.8%) than the logistic regression, mostly because it misses more of the high-crime tracts. The group means separate the two classes clearly: above-median-crime tracts have higher tax, rad, nox, lstat, indus, and age, and lower zn, black, and dis.
nb_bos<-naiveBayes(crim ~ zn + tax + black + rad + nox + lstat + indus + dis + age, data = Boston)
nb_bos
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.5 0.5
##
## Conditional probabilities:
## zn
## Y [,1] [,2]
## 0 21.525692 29.319808
## 1 1.201581 4.798611
##
## tax
## Y [,1] [,2]
## 0 305.7431 87.4837
## 1 510.7312 167.8553
##
## black
## Y [,1] [,2]
## 0 388.7061 22.83774
## 1 324.6420 118.83084
##
## rad
## Y [,1] [,2]
## 0 4.158103 1.659121
## 1 14.940711 9.529843
##
## nox
## Y [,1] [,2]
## 0 0.4709711 0.05559789
## 1 0.6384190 0.09870365
##
## lstat
## Y [,1] [,2]
## 0 9.419486 4.923497
## 1 15.886640 7.546922
##
## indus
## Y [,1] [,2]
## 0 7.002292 5.514454
## 1 15.271265 5.439010
##
## dis
## Y [,1] [,2]
## 0 5.091596 2.081304
## 1 2.498489 1.085521
##
## age
## Y [,1] [,2]
## 0 51.31028 25.88190
## 1 85.83953 17.87423
nbb_pred<-predict(nb_bos, test_b)
table(nbb_pred, test_b$crim)
##
## nbb_pred 0 1
## 0 74 19
## 1 4 55
mean(nbb_pred != test_b$crim)
## [1] 0.1513158
Naive Bayes has the highest test error rate so far, about 15.1%. As the earlier EDA showed, several of this model's assumptions (independent, roughly Gaussian predictors) are violated by the data.
# Splitting Data for KNN
train.bos <-cbind(train_b$zn, train_b$tax, train_b$black, train_b$rad, train_b$nox, train_b$lstat, train_b$indus, train_b$dis, train_b$age)
test.bos <- cbind(test_b$zn, test_b$tax, test_b$black, test_b$rad, test_b$nox, test_b$lstat, test_b$indus, test_b$dis, test_b$age)
trainb.Y = cbind(train_b$crim)
testb.Y = cbind(test_b$crim)
knn_bos = knn(train.bos, test.bos, trainb.Y, k=1)
table(knn_bos, testb.Y)
## testb.Y
## knn_bos 0 1
## 0 67 4
## 1 11 70
mean(knn_bos != testb.Y)
## [1] 0.09868421
#knn5
knn_bos5 = knn(train.bos, test.bos, trainb.Y, k=5)
table(knn_bos5, testb.Y)
## testb.Y
## knn_bos5 0 1
## 0 70 4
## 1 8 70
mean(knn_bos5 != testb.Y)
## [1] 0.07894737
knn_bos10 = knn(train.bos, test.bos, trainb.Y, k=10)
table(knn_bos10, testb.Y)
## testb.Y
## knn_bos10 0 1
## 0 69 6
## 1 9 68
mean(knn_bos10 != testb.Y)
## [1] 0.09868421
Of all the models, logistic regression fits best (test error about 5.3%), with KNN at k = 5 the next best at about 7.9%. This makes sense, since some of the assumptions of the other models are violated by this data.
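Gathering the test error rates computed above into one sorted vector makes the ranking explicit (a sketch; object names follow the code in this section).
boston_err <- c(
  logistic = mean(pred != test_b$crim),
  lda      = mean(predict_bos$class != test_b$crim),
  nb       = mean(nbb_pred != test_b$crim),
  knn1     = mean(knn_bos != testb.Y),
  knn5     = mean(knn_bos5 != testb.Y),
  knn10    = mean(knn_bos10 != testb.Y)
)
sort(boston_err)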