library(ISLR2)
library(tidyverse)
library(caret)
library(corrplot)
library(MASS)
select = dplyr::select
library(class)
library(gridExtra)
data("Weekly")
A. Produce graphical and numerical summaries of the “Weekly” data set.
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
pairs(Weekly[,-9])
From these plots there do not appear to be any clear relationships among the lag variables. We can, however, see a relationship over time: the Year row shows a positive association with Volume.
corrplot(cor(Weekly[,-9]), method = "circle")
This plot shows that, aside from Year and Volume, none of the variables in the data set are highly correlated with one another.
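As a quick numeric check on that claim, the Year–Volume correlation can be pulled directly from the correlation matrix plotted above (a minimal sketch):
# Year and Volume are the only strongly correlated pair in the Weekly data
cor(Weekly$Year, Weekly$Volume)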
Weekly %>%
filter(lead(Lag1) != Today) %>%
nrow()
## [1] 0
This check confirms that the rows are ordered by week in ascending order: each week's Today value reappears as the following week's Lag1.
ggplot(Weekly) +
aes(x = Year, y = Volume) +
geom_point(shape = "circle", size = 1.5, colour = "#112446") +
theme_minimal()
This chart shows that as the years increased, so did the number of shares traded. Volume peaks around 2008 and then begins to decline around 2010.
B. Perform a logistic regression on the data set
glm.fit = glm(Direction ~ .-Today, data = Weekly, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ . - Today, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7071 -1.2578 0.9941 1.0873 1.4665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.225822 37.890522 0.455 0.6494
## Year -0.008500 0.018991 -0.448 0.6545
## Lag1 -0.040688 0.026447 -1.538 0.1239
## Lag2 0.059449 0.026970 2.204 0.0275 *
## Lag3 -0.015478 0.026703 -0.580 0.5622
## Lag4 -0.027316 0.026485 -1.031 0.3024
## Lag5 -0.014022 0.026409 -0.531 0.5955
## Volume 0.003256 0.068836 0.047 0.9623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.2 on 1081 degrees of freedom
## AIC: 1502.2
##
## Number of Fisher Scoring iterations: 4
These results show that the only statistically significant predictor is Lag2, with a p-value below .05.
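For reference, the Lag2 estimate and its p-value can be pulled straight from the fitted model (a small sketch using the summary object above):
# Coefficient, standard error, z value, and p-value for Lag2
summary(glm.fit)$coefficients["Lag2", ]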
C. Compute a confusion matrix and overall fraction of correct predictions.
attach(Weekly)
glm.probs = predict(glm.fit, Weekly, type = "response")
contrasts(Direction)
## Up
## Down 0
## Up 1
glm.pred = rep("Down", 1089)
glm.pred[glm.probs > .5] = "Up"
table(glm.pred, Direction)
## Direction
## glm.pred Down Up
## Down 56 47
## Up 428 558
Using this logistic regression, the overall fraction of correct predictions on the training data is 56.4%, calculated as (56 + 558)/1089, giving an error rate of 43.6%. The model predicted actual Down weeks correctly only 11.57% of the time, calculated as 56/(56 + 428), and predicted actual Up weeks correctly 92.2% of the time, calculated as 558/(47 + 558).
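These rates can also be recomputed directly from the confusion matrix (a minimal sketch using the objects defined above):
conf = table(glm.pred, Direction)
sum(diag(conf)) / sum(conf)                 # overall fraction of correct predictions
conf["Down", "Down"] / sum(conf[, "Down"])  # actual Down weeks predicted correctly
conf["Up", "Up"] / sum(conf[, "Up"])        # actual Up weeks predicted correctly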
D. Train and test - logistic regression
train = Weekly[Weekly$Year <= 2008,]
test = Weekly[Weekly$Year > 2008,]
glm_dr = glm(Direction ~ Lag2, data = train, family = "binomial")
predicted = factor(ifelse(predict(glm_dr, newdata = test, type = "response") < .5, "Down", "Up"))
confusionMatrix(predicted, test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
This confusion matrix shows that the model has an accuracy of .625 on the held-out data, which is higher than the accuracy of the previous model. The model is also slowly getting better at predicting when the market goes down: its specificity rises to about 21%, compared with roughly 12% before.
E. Repeat the model using LDA
library(MASS)
lda.fit = lda(Direction ~ Lag2, data = train)
predicted_lda = predict(lda.fit, newdata = test)
confusionMatrix(data = predicted_lda$class, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
The accuracy rate is again .625. While this is better than the original rate, the improvement may not be significant given the small size of the test set (104 weeks).
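One rough way to gauge that concern is a one-sided binomial test of the 65 correct predictions out of 104 against the no-information rate of .5865 (a sketch; caret's "P-Value [Acc > NIR]" above reflects the same idea):
# Does 62.5% accuracy on 104 test weeks beat always predicting "Up"?
binom.test(x = 9 + 56, n = 104, p = 0.5865, alternative = "greater")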
F. Repeat using QDA
qda_dir = qda(Direction ~ Lag2, data = train)
predicted_qda = predict(qda_dir, newdata = test)
confusionMatrix(data = predicted_qda$class, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
This model has an accuracy of .5865. It also has a sensitivity of 1.0 and a specificity of 0 because it predicts “Up” for every test observation.
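A quick check confirms that QDA never predicts “Down” on the test set:
# Every one of the 104 test weeks is classified as "Up"
table(predicted_qda$class)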
library(class)
set.seed(1)
predicted_knn = knn(train = data.frame(Lag2 = train$Lag2),
test = data.frame(Lag2 = test$Lag2), cl = train$Direction, k = 1, prob = T)
attr(predicted_knn, "prob")[100]
## [1] 0.5
confusionMatrix(data = predicted_knn, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 30
## Up 22 31
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.9700
##
## Kappa : -0.0033
##
## Mcnemar's Test P-Value : 0.3317
##
## Sensitivity : 0.5082
## Specificity : 0.4884
## Pos Pred Value : 0.5849
## Neg Pred Value : 0.4118
## Prevalence : 0.5865
## Detection Rate : 0.2981
## Detection Prevalence : 0.5096
## Balanced Accuracy : 0.4983
##
## 'Positive' Class : Up
##
Here we get an accuracy of .5, which is worse than the accuracy of our original model.
The LDA and logistic regression methods both have an accuracy of .625, so they are tied as the most effective methods on this test set.
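A small sketch that collects the four test-set accuracies computed above makes the comparison explicit:
# Test accuracy of each classifier on the held-out 2009-2010 weeks
accs = c(Logistic = mean(predicted == test$Direction),
         LDA = mean(predicted_lda$class == test$Direction),
         QDA = mean(predicted_qda$class == test$Direction),
         KNN_1 = mean(predicted_knn == test$Direction))
sort(accs, decreasing = TRUE)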
A. Develop a model to predict whether a given car gets high or low gas mileage based on the “Auto” data set. Create a binary variable, “mpg01”, that contains a 1 if “mpg” contains a value above its median, and a 0 if “mpg” contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both “mpg01” and the other “Auto” variables.
detach(Weekly)
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
mpg1 = rep(0,length(mpg))
mpg1[mpg > median(mpg)] = 1
Auto2 = data.frame(Auto, mpg1)
summary(Auto2)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## mpg1
## Min. :0.0
## 1st Qu.:0.0
## Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
B. Explore the data graphically
pairs(Auto2[,-9])
cor(Auto2[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg1 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg1
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg1 0.3468215 0.4299042 0.5136984 1.0000000
From this correlation data we can see that the variables most useful in predicting mpg01 are cylinders, displacement, horsepower, and weight, all of which are strongly negatively correlated with mpg1.
par(mfrow=c(2,2))
boxplot(cylinders~mpg1)
boxplot(displacement~mpg1)
boxplot(horsepower~mpg1)
boxplot(weight~mpg1)
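The same ranking can be read off numerically by sorting the mpg1 row of the correlation matrix by absolute value (a small sketch):
# Predictors ordered by the strength of their correlation with mpg1
sort(abs(cor(Auto2[, -9])["mpg1", ]), decreasing = TRUE)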
set.seed(1)
intrain = sample(1:nrow(Auto2),nrow(Auto2)*.7)
training = Auto2[intrain,-1]
summary(training)
## cylinders displacement horsepower weight acceleration
## Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1649 Min. : 8.00
## 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 78.0 1st Qu.:2224 1st Qu.:14.00
## Median :4.000 Median :151.0 Median : 93.5 Median :2790 Median :15.50
## Mean :5.464 Mean :193.2 Mean :103.8 Mean :2967 Mean :15.66
## 3rd Qu.:8.000 3rd Qu.:261.5 3rd Qu.:123.8 3rd Qu.:3612 3rd Qu.:17.27
## Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140 Max. :24.80
##
## year origin name
## Min. :70.00 Min. :1.00 amc gremlin : 4
## 1st Qu.:73.00 1st Qu.:1.00 amc hornet : 4
## Median :76.00 Median :1.00 amc matador : 3
## Mean :76.06 Mean :1.54 chevrolet caprice classic: 3
## 3rd Qu.:79.00 3rd Qu.:2.00 chevrolet chevette : 3
## Max. :82.00 Max. :3.00 chevrolet impala : 3
## (Other) :254
## mpg1
## Min. :0.0000
## 1st Qu.:0.0000
## Median :1.0000
## Mean :0.5073
## 3rd Qu.:1.0000
## Max. :1.0000
##
testing= Auto2[-intrain,-1]
dim(testing)
## [1] 118 9
test.mpg1 = mpg1[-intrain]
train.mpg1=mpg1[intrain]
length(test.mpg1)
## [1] 118
length(train.mpg1)
## [1] 274
lda.fit = lda(mpg1~ cylinders+ displacement+ horsepower+ weight, data = Auto2, subset = intrain)
plot(lda.fit)
lda.pred=predict(lda.fit, testing)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class, test.mpg1)
## test.mpg1
## lda.class 0 1
## 0 50 3
## 1 11 54
mean(lda.class==test.mpg1)
## [1] 0.8813559
The fraction of test values that were correctly predicted.
mean(lda.class!=test.mpg1)
## [1] 0.1186441
The misclassification rate of our model.
qda.fit = qda(mpg1~cylinders + displacement + horsepower + weight, data = Auto2, subset = intrain)
qda.class=predict(qda.fit, testing)$class
table(qda.class, test.mpg1)
## test.mpg1
## qda.class 0 1
## 0 52 5
## 1 9 52
mean(qda.class==test.mpg1)
## [1] 0.8813559
The fraction correctly predicted.
mean(qda.class!=test.mpg1)
## [1] 0.1186441
The misclassification rate
F. Perform a logistic regression on the training data set in order to predict mpg1
glm.fit = glm(mpg1 ~cylinders + displacement + horsepower + weight, data = Auto2, subset = intrain, family = binomial)
glm.probs = predict(glm.fit, testing, type = 'response')
glm.pred = rep(0,length(glm.probs))
glm.pred[glm.probs>.5]=1
table(glm.pred, test.mpg1)
## test.mpg1
## glm.pred 0 1
## 0 53 3
## 1 8 54
mean(glm.pred==test.mpg1)
## [1] 0.9067797
mean(glm.pred!=test.mpg1)
## [1] 0.09322034
G. Perform a KNN test on the data
From the tests below we can see that the highest accuracy comes from the K = 5 model.
train.X=cbind(training$cylinders, training$weight, training$displacement, training$horsepower)
test.X = cbind(testing$cylinders,testing$weight,testing$displacement,testing$horsepower)
knn.pred = knn(train.X,test.X,train.mpg1)
mean(knn.pred==test.mpg1)
## [1] 0.8644068
mean(knn.pred!=test.mpg1)
## [1] 0.1355932
knn.pred = knn(train.X,test.X,train.mpg1, k=5)
mean(knn.pred==test.mpg1)
## [1] 0.8728814
mean(knn.pred!=test.mpg1)
## [1] 0.1271186
knn.pred = knn(train.X,test.X,train.mpg1, k=10)
mean(knn.pred==test.mpg1)
## [1] 0.8559322
mean(knn.pred!=test.mpg1)
## [1] 0.1440678
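The same comparison can be run in one pass over a grid of k values (a sketch; results can shift slightly with the seed because knn() breaks distance ties at random):
# Test accuracy for several choices of k
set.seed(1)
sapply(c(1, 3, 5, 10, 20), function(k) {
  pred = knn(train.X, test.X, train.mpg1, k = k)
  mean(pred == test.mpg1)
})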
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## black lstat medv
## crim -0.38506394 0.4556215 -0.3883046
## zn 0.17552032 -0.4129946 0.3604453
## indus -0.35697654 0.6037997 -0.4837252
## chas 0.04878848 -0.0539293 0.1752602
## nox -0.38005064 0.5908789 -0.4273208
## rm 0.12806864 -0.6138083 0.6953599
## age -0.27353398 0.6023385 -0.3769546
## dis 0.29151167 -0.4969958 0.2499287
## rad -0.44441282 0.4886763 -0.3816262
## tax -0.44180801 0.5439934 -0.4685359
## ptratio -0.17738330 0.3740443 -0.5077867
## black 1.00000000 -0.3660869 0.3334608
## lstat -0.36608690 1.0000000 -0.7376627
## medv 0.33346082 -0.7376627 1.0000000
attach(Boston)
crim_ind=rep(0,length(crim))
crim_ind[crim>median(crim)]=1
Boston2 = data.frame(Boston, crim_ind)
summary(Boston2)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim_ind
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
Create training and testing splits.
set.seed(1)
train = sample(1:nrow(Boston2), .7*nrow(Boston2))
Boston2.train = Boston2[train,]
Boston2.test = Boston2[-train,]
crim.ind.test = crim_ind[-train]
crim.ind.test
## [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0
## [75] 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
## [149] 1 0 0 0
glm.fit = glm(crim_ind ~zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = Boston2, subset = train, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = crim_ind ~ zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1363 -0.1504 -0.0009 0.0018 3.5797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.731502 8.009551 -5.335 9.55e-08 ***
## zn -0.105323 0.045585 -2.310 0.020862 *
## indus -0.066308 0.057649 -1.150 0.250057
## chas 0.187896 0.810675 0.232 0.816711
## nox 56.344292 10.073420 5.593 2.23e-08 ***
## rm -0.234402 0.900292 -0.260 0.794585
## age 0.022052 0.014304 1.542 0.123141
## dis 1.143918 0.303909 3.764 0.000167 ***
## rad 0.657976 0.184373 3.569 0.000359 ***
## tax -0.006299 0.003239 -1.944 0.051849 .
## ptratio 0.292158 0.155755 1.876 0.060689 .
## black -0.008703 0.005212 -1.670 0.094933 .
## lstat 0.112414 0.060168 1.868 0.061715 .
## medv 0.191745 0.087816 2.184 0.028999 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 140.41 on 340 degrees of freedom
## AIC: 168.41
##
## Number of Fisher Scoring iterations: 9
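The reduced model fit next appears to keep the predictors that were significant at the .05 level in the full fit; a quick sketch to list them from the summary object above:
# Predictors in the full model with p-values below .05 (intercept dropped)
coefs = summary(glm.fit)$coefficients[-1, ]
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]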
glm.fit2 = glm(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train, family = binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = crim_ind ~ zn + nox + dis + rad + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0766 -0.3226 -0.0034 0.0069 3.2715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.10487 4.91615 -6.124 9.14e-10 ***
## zn -0.09780 0.03537 -2.765 0.005698 **
## nox 42.65837 7.07436 6.030 1.64e-09 ***
## dis 0.84586 0.23756 3.561 0.000370 ***
## rad 0.49353 0.13194 3.741 0.000184 ***
## medv 0.08028 0.02906 2.763 0.005730 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 164.99 on 348 degrees of freedom
## AIC: 176.99
##
## Number of Fisher Scoring iterations: 8
glm.probs=predict(glm.fit2, Boston2.test, type = 'response')
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > .5] = 1
table(glm.pred, crim.ind.test)
## crim.ind.test
## glm.pred 0 1
## 0 60 11
## 1 13 68
mean(glm.pred == crim.ind.test)
## [1] 0.8421053
mean(glm.pred!= crim.ind.test)
## [1] 0.1578947
lda.fit = lda(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train)
lda.fit
## Call:
## lda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
##
## Coefficients of linear discriminants:
## LD1
## zn -0.010534928
## nox 9.134260498
## dis 0.008656437
## rad 0.076273497
## medv 0.029024616
lda.pred = predict(lda.fit, Boston2.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class = lda.pred$class
table(lda.class, crim.ind.test)
## crim.ind.test
## lda.class 0 1
## 0 71 19
## 1 2 60
mean(lda.class==crim.ind.test)
## [1] 0.8618421
mean(lda.class!=crim.ind.test)
## [1] 0.1381579
qda.fit = qda(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train)
qda.fit
## Call:
## qda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
qda.pred = predict(qda.fit, Boston2.test)
names(qda.pred)
## [1] "class" "posterior"
qda.class = qda.pred$class
table(qda.class, crim.ind.test)
## crim.ind.test
## qda.class 0 1
## 0 66 15
## 1 7 64
mean(qda.class == crim.ind.test)
## [1] 0.8552632
mean(qda.class!= crim.ind.test)
## [1] 0.1447368
train.X = cbind(nox,rad,ptratio,medv)[train,]
test.X = cbind(nox,rad,ptratio,medv)[-train,]
train.crim_ind = crim_ind[train]
dim(train.X)
## [1] 354 4
KNN with k = 1
set.seed(1)
knn.pred=knn(train.X,test.X,train.crim_ind, k=1)
table(knn.pred, crim.ind.test)
## crim.ind.test
## knn.pred 0 1
## 0 58 6
## 1 15 73
KNN with k = 5
knn.pred=knn(train.X,test.X,train.crim_ind, k=5)
table(knn.pred, crim.ind.test)
## crim.ind.test
## knn.pred 0 1
## 0 61 12
## 1 12 67
KNN with k = 10
knn.pred=knn(train.X,test.X,train.crim_ind, k=10)
table(knn.pred, crim.ind.test)
## crim.ind.test
## knn.pred 0 1
## 0 67 20
## 1 6 59
KNN with k = 100
knn.pred=knn(train.X,test.X,train.crim_ind, k=100)
table(knn.pred, crim.ind.test)
## crim.ind.test
## knn.pred 0 1
## 0 73 39
## 1 0 40
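To put these four KNN fits on the same footing as the logistic regression, LDA, and QDA error rates above, the test accuracies can be recomputed in one pass (a small sketch using the objects defined above):
# Test accuracy for each value of k tried above
set.seed(1)
sapply(c(1, 5, 10, 100), function(k) {
  pred = knn(train.X, test.X, train.crim_ind, k = k)
  mean(pred == crim.ind.test)
})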