Loading libraries

library(ISLR2)
library(tidyverse)
library(caret)
library(corrplot)
library(MASS)
select = dplyr::select  # MASS also exports select(), so keep the dplyr version
library(class)
library(gridExtra)
data("Weekly")

Q10.

A. Produce graphical and numerical summaries of the “Weekly” data set.

str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
pairs(Weekly[,-9])

From the pairs plot there do not appear to be any clear relationships among the lag variables. We can, however, see a relationship over time, as the Year row shows a clear positive association with Volume.

corrplot(cor(Weekly[,-9]), method = "circle")

As we can see from this plot, apart from the strong positive correlation between Year and Volume, none of the variables in our data set are highly correlated with one another.

Weekly %>%
  filter(lead(Lag1) != Today) %>%
  nrow()
## [1] 0

This code checks that the rows are ordered by ascending week: if they are, each week’s Today return reappears as the next week’s Lag1, so the filter returns zero rows.
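The same check can also be written in base R (an equivalent sketch of the dplyr pipeline above); the count of mismatches should again be zero.

# count weeks whose Today value does not match the following week's Lag1
sum(head(Weekly$Today, -1) != tail(Weekly$Lag1, -1))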

ggplot(Weekly) +
  aes(x = Year, y = Volume) +
  geom_point(shape = "circle", size = 1.5, colour = "#112446") +
  theme_minimal()

This chart shows that as the years increased, so did the number of shares traded. Volume peaks around 2008 and then begins to decline around 2010.

Q10

B. Perform a logistic regression on the data set

glm.fit = glm(Direction ~ .-Today, data = Weekly, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ . - Today, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7071  -1.2578   0.9941   1.0873   1.4665  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 17.225822  37.890522   0.455   0.6494  
## Year        -0.008500   0.018991  -0.448   0.6545  
## Lag1        -0.040688   0.026447  -1.538   0.1239  
## Lag2         0.059449   0.026970   2.204   0.0275 *
## Lag3        -0.015478   0.026703  -0.580   0.5622  
## Lag4        -0.027316   0.026485  -1.031   0.3024  
## Lag5        -0.014022   0.026409  -0.531   0.5955  
## Volume       0.003256   0.068836   0.047   0.9623  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.2  on 1081  degrees of freedom
## AIC: 1502.2
## 
## Number of Fisher Scoring iterations: 4

We can see from these results that the only statistically significant predictor is Lag2, with a p-value below .05 (0.0275).

Q10

C. Compute a confusion matrix and overall fraction of correct predictions.

attach(Weekly)
glm.probs = predict(glm.fit, Weekly, type = "response")
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm.pred = rep("Down", 1089)
glm.pred[glm.probs > .5] = "Up"
table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down   56  47
##     Up    428 558

Using our logistic regression we can see that the overall fraction of correct predictions on the training data is about 56% ((56 + 558)/1089), so the training error rate is about 44%. The model predicted Down weeks correctly only 11.57% of the time (56/(56 + 428)), but predicted Up weeks correctly 92.2% of the time (558/(47 + 558)).
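These fractions can also be computed directly from the predictions rather than by hand (a small sketch reusing the glm.pred vector and the attached Direction column from above):

mean(glm.pred == Direction)                     # overall fraction of correct predictions
mean(glm.pred[Direction == "Down"] == "Down")   # fraction of Down weeks predicted correctly
mean(glm.pred[Direction == "Up"] == "Up")       # fraction of Up weeks predicted correctly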

Q10

D. Split into a training period (1990–2008) and a test period (2009–2010), and fit a logistic regression with Lag2 as the only predictor.

train = Weekly[Weekly$Year <= 2008,]
test = Weekly[Weekly$Year > 2008,]

glm_dr = glm(Direction ~ Lag2, data = train, family = "binomial")
predicted = factor(ifelse(predict(glm_dr, newdata = test, type = "response") < .5, "Down", "Up"))
confusionMatrix(predicted, test$Direction, positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9  5
##       Up     34 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.2439         
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.9180         
##             Specificity : 0.2093         
##          Pos Pred Value : 0.6222         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.5865         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.8654         
##       Balanced Accuracy : 0.5637         
##                                          
##        'Positive' Class : Up             
## 

With this confusion matrix we can see that the model has a test accuracy of .625, which is higher than the accuracy of the previous model. It is also somewhat better at predicting when the market goes down, correctly identifying 9 of the 43 Down weeks (about 21%, versus 11.6% before).

Q10

E. Repeat the model using LDA

library(MASS)
lda.fit = lda(Direction ~ Lag2, data = train)
predicted_lda = predict(lda.fit, newdata = test)

confusionMatrix(data = predicted_lda$class, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9  5
##       Up     34 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.5865         
##     P-Value [Acc > NIR] : 0.2439         
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.9180         
##             Specificity : 0.2093         
##          Pos Pred Value : 0.6222         
##          Neg Pred Value : 0.6429         
##              Prevalence : 0.5865         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.8654         
##       Balanced Accuracy : 0.5637         
##                                          
##        'Positive' Class : Up             
## 

The accuracy rate was .625, the same as the held-out logistic regression. While this rate is better than the original model’s, the improvement may not be significant given the small test sample of 104 weeks; the reported P-Value [Acc > NIR] of 0.24 makes the same point.
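That p-value can be reproduced with a one-sided binomial test of the 65 correct test predictions against the no-information rate (a sketch; as far as I understand, this mirrors how caret computes P-Value [Acc > NIR]):

binom.test(x = 9 + 56, n = nrow(test), p = mean(test$Direction == "Up"),
           alternative = "greater")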

Q10

F. Repeat using QDA

qda_dir = qda(Direction ~ Lag2, data = train)
predicted_qda = predict(qda_dir, newdata = test)

confusionMatrix(data = predicted_qda$class, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    0  0
##       Up     43 61
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.5419          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.5865          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.5865          
##          Detection Rate : 0.5865          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Up              
## 

This model has an accuracy of .5865, which is exactly the no-information rate. With a sensitivity of 1.0 and a specificity of 0, it predicts “Up” for every test observation.
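We can confirm this from the posterior probabilities returned by predict() (a quick check using the predicted_qda object above): since every test week is classified as Up, the posterior for Up never falls below .5.

min(predicted_qda$posterior[, "Up"])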

Q10

G. Repeat using KNN with K = 1

library(class)
set.seed(1)  # knn() breaks tied votes at random, so fix the seed for reproducibility
predicted_knn = knn(train = data.frame(Lag2 = train$Lag2), 
                    test = data.frame(Lag2 = test$Lag2), cl = train$Direction, k = 1, prob = T)
# proportion of votes for the winning class; 0.5 marks a tie between equidistant neighbours
attr(predicted_knn, "prob")[100]
## [1] 0.5
confusionMatrix(data = predicted_knn, reference = test$Direction, positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   21 30
##       Up     22 31
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.4003, 0.5997)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : 0.9700          
##                                           
##                   Kappa : -0.0033         
##                                           
##  Mcnemar's Test P-Value : 0.3317          
##                                           
##             Sensitivity : 0.5082          
##             Specificity : 0.4884          
##          Pos Pred Value : 0.5849          
##          Neg Pred Value : 0.4118          
##              Prevalence : 0.5865          
##          Detection Rate : 0.2981          
##    Detection Prevalence : 0.5096          
##       Balanced Accuracy : 0.4983          
##                                           
##        'Positive' Class : Up              
## 

Here we get an accuracy of .5, which is worse than the accuracy of our original model.

Q10

H. Which of these methods provides the best results on this data?

The LDA and logistic regression methods both have a test accuracy of .625, so they are tied as the most effective methods here.
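For a quick side-by-side view, the held-out accuracies of all four models can be collected in one call (a sketch reusing the prediction objects fitted above):

sapply(list(logistic = predicted,
            lda = predicted_lda$class,
            qda = predicted_qda$class,
            knn1 = predicted_knn),
       function(pred) mean(pred == test$Direction))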

Q11

A. Develop a model to predict whether a given car gets high or low gas mileage based on the “Auto” data set. Create a binary variable, “mpg01”, that contains a 1 if “mpg” contains a value above its median, and a 0 if “mpg” contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both “mpg01” and the other “Auto” variables.

detach(Weekly)
attach(Auto)
## The following object is masked from package:ggplot2:
## 
##     mpg
mpg1 = rep(0,length(mpg))  # mpg1 plays the role of the mpg01 indicator from the question
mpg1[mpg > median(mpg)] = 1
Auto2 = data.frame(Auto, mpg1)
summary(Auto2)
##       mpg          cylinders      displacement     horsepower        weight    
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140  
##                                                                                
##   acceleration        year           origin                      name    
##  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5  
##  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5  
##  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5  
##  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4  
##  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4  
##  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4  
##                                                  (Other)           :365  
##       mpg1    
##  Min.   :0.0  
##  1st Qu.:0.0  
##  Median :0.5  
##  Mean   :0.5  
##  3rd Qu.:1.0  
##  Max.   :1.0  
## 

Q11

B. Explore the data graphically

pairs(Auto2[,-9])

cor(Auto2[,-9])
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg1          0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin       mpg1
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg1            0.3468215  0.4299042  0.5136984  1.0000000

From this correlation data we can see that the variables most useful in predicting mpg1 would be cylinders, displacement, horsepower, and weight, all of which are strongly (negatively) correlated with mpg1; the boxplots and the sorted correlations below make this ranking clearer.

par(mfrow=c(2,2))
boxplot(cylinders~mpg1)
boxplot(displacement~mpg1)
boxplot(horsepower~mpg1)
boxplot(weight~mpg1)
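To make that ranking explicit, we can sort the absolute correlations of each variable with mpg1 (a small sketch using the Auto2 data frame built above; the name column is dropped as before):

# mpg1 itself and mpg will naturally appear at the top of this ranking
sort(abs(cor(Auto2[,-9])[, "mpg1"]), decreasing = TRUE)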

Q11

C. Split the data into training and testing sets

set.seed(1)
intrain = sample(1:nrow(Auto2),nrow(Auto2)*.7)
training = Auto2[intrain,-1]
summary(training)
##    cylinders      displacement     horsepower        weight      acceleration  
##  Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1649   Min.   : 8.00  
##  1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 78.0   1st Qu.:2224   1st Qu.:14.00  
##  Median :4.000   Median :151.0   Median : 93.5   Median :2790   Median :15.50  
##  Mean   :5.464   Mean   :193.2   Mean   :103.8   Mean   :2967   Mean   :15.66  
##  3rd Qu.:8.000   3rd Qu.:261.5   3rd Qu.:123.8   3rd Qu.:3612   3rd Qu.:17.27  
##  Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140   Max.   :24.80  
##                                                                                
##       year           origin                            name    
##  Min.   :70.00   Min.   :1.00   amc gremlin              :  4  
##  1st Qu.:73.00   1st Qu.:1.00   amc hornet               :  4  
##  Median :76.00   Median :1.00   amc matador              :  3  
##  Mean   :76.06   Mean   :1.54   chevrolet caprice classic:  3  
##  3rd Qu.:79.00   3rd Qu.:2.00   chevrolet chevette       :  3  
##  Max.   :82.00   Max.   :3.00   chevrolet impala         :  3  
##                                 (Other)                  :254  
##       mpg1       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :1.0000  
##  Mean   :0.5073  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
testing= Auto2[-intrain,-1]
dim(testing)
## [1] 118   9
test.mpg1 = mpg1[-intrain]
train.mpg1=mpg1[intrain]
length(test.mpg1)
## [1] 118
length(train.mpg1)
## [1] 274

Q11

D. Perform LDA on the training data

lda.fit = lda(mpg1~ cylinders+ displacement+ horsepower+ weight, data = Auto2, subset = intrain)
plot(lda.fit)

lda.pred=predict(lda.fit, testing)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class=lda.pred$class
table(lda.class, test.mpg1)
##          test.mpg1
## lda.class  0  1
##         0 50  3
##         1 11 54
mean(lda.class==test.mpg1)
## [1] 0.8813559

The fraction of test observations that were correctly predicted.

mean(lda.class!=test.mpg1)
## [1] 0.1186441

The rate of misclassification (test error) of our model.

Q11

E. Perform QDA on the training data set

qda.fit = qda(mpg1~cylinders + displacement + horsepower + weight, data = Auto2, subset = intrain)
qda.class=predict(qda.fit, testing)$class
table(qda.class, test.mpg1)
##          test.mpg1
## qda.class  0  1
##         0 52  5
##         1  9 52
mean(qda.class==test.mpg1)
## [1] 0.8813559

The fraction of test observations correctly predicted.

mean(qda.class!=test.mpg1)
## [1] 0.1186441

The misclassification rate

Q11

F. Perform a logistic regression on the training data set in order to predict mpg1

glm.fit = glm(mpg1 ~cylinders + displacement + horsepower + weight, data = Auto2, subset = intrain, family = binomial)
glm.probs = predict(glm.fit, testing, type = 'response')
glm.pred = rep(0,length(glm.probs))
glm.pred[glm.probs>.5]=1
table(glm.pred, test.mpg1)
##         test.mpg1
## glm.pred  0  1
##        0 53  3
##        1  8 54
mean(glm.pred==test.mpg1)
## [1] 0.9067797
mean(glm.pred!=test.mpg1)
## [1] 0.09322034

Q11

G. Perform a KNN test on the data

From the tests below we can see that the highest accuracy comes from the K = 5 model; a loop comparing several values of K follows the individual runs.

train.X=cbind(training$cylinders, training$weight, training$displacement, training$horsepower)
test.X = cbind(testing$cylinders,testing$weight,testing$displacement,testing$horsepower)
knn.pred = knn(train.X,test.X,train.mpg1)  # k defaults to 1
mean(knn.pred==test.mpg1)
## [1] 0.8644068
mean(knn.pred!=test.mpg1)
## [1] 0.1355932

K = 5

knn.pred = knn(train.X,test.X,train.mpg1, k=5)
mean(knn.pred==test.mpg1)
## [1] 0.8728814
mean(knn.pred!=test.mpg1)
## [1] 0.1271186

K = 10

knn.pred = knn(train.X,test.X,train.mpg1, k=10)
mean(knn.pred==test.mpg1)
## [1] 0.8559322
mean(knn.pred!=test.mpg1)
## [1] 0.1440678
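Rather than rerunning knn() by hand for each K, a short loop can scan several values at once (a sketch reusing train.X, test.X, and the mpg1 vectors from above; exact accuracies can shift slightly between runs because knn() breaks distance ties at random):

set.seed(1)
ks = c(1, 3, 5, 7, 10, 15, 20)
accuracy = sapply(ks, function(k) mean(knn(train.X, test.X, train.mpg1, k = k) == test.mpg1))
data.frame(k = ks, accuracy)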

Q13. Using the Boston data set, fit classification models (logistic regression, LDA, QDA, and KNN) in order to predict whether a given suburb has a crime rate above or below the median.

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
cor(Boston)
##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
##               black      lstat       medv
## crim    -0.38506394  0.4556215 -0.3883046
## zn       0.17552032 -0.4129946  0.3604453
## indus   -0.35697654  0.6037997 -0.4837252
## chas     0.04878848 -0.0539293  0.1752602
## nox     -0.38005064  0.5908789 -0.4273208
## rm       0.12806864 -0.6138083  0.6953599
## age     -0.27353398  0.6023385 -0.3769546
## dis      0.29151167 -0.4969958  0.2499287
## rad     -0.44441282  0.4886763 -0.3816262
## tax     -0.44180801  0.5439934 -0.4685359
## ptratio -0.17738330  0.3740443 -0.5077867
## black    1.00000000 -0.3660869  0.3334608
## lstat   -0.36608690  1.0000000 -0.7376627
## medv     0.33346082 -0.7376627  1.0000000
attach(Boston)
crim_ind=rep(0,length(crim))
crim_ind[crim>median(crim)]=1
Boston2 = data.frame(Boston, crim_ind)
summary(Boston2)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv          crim_ind  
##  Min.   : 1.73   Min.   : 5.00   Min.   :0.0  
##  1st Qu.: 6.95   1st Qu.:17.02   1st Qu.:0.0  
##  Median :11.36   Median :21.20   Median :0.5  
##  Mean   :12.65   Mean   :22.53   Mean   :0.5  
##  3rd Qu.:16.95   3rd Qu.:25.00   3rd Qu.:1.0  
##  Max.   :37.97   Max.   :50.00   Max.   :1.0

Create training and testing splits

set.seed(1)
train = sample(1:nrow(Boston2), .7*nrow(Boston2))
Boston2.train = Boston2[train,]
Boston2.test = Boston2[-train,]
crim.ind.test = crim_ind[-train]
crim.ind.test
##   [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0
##  [75] 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
## [149] 1 0 0 0

Logistic regression

glm.fit = glm(crim_ind ~zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = Boston2, subset = train, family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = crim_ind ~ zn + indus + chas + nox + rm + age + 
##     dis + rad + tax + ptratio + black + lstat + medv, family = binomial, 
##     data = Boston2, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1363  -0.1504  -0.0009   0.0018   3.5797  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -42.731502   8.009551  -5.335 9.55e-08 ***
## zn           -0.105323   0.045585  -2.310 0.020862 *  
## indus        -0.066308   0.057649  -1.150 0.250057    
## chas          0.187896   0.810675   0.232 0.816711    
## nox          56.344292  10.073420   5.593 2.23e-08 ***
## rm           -0.234402   0.900292  -0.260 0.794585    
## age           0.022052   0.014304   1.542 0.123141    
## dis           1.143918   0.303909   3.764 0.000167 ***
## rad           0.657976   0.184373   3.569 0.000359 ***
## tax          -0.006299   0.003239  -1.944 0.051849 .  
## ptratio       0.292158   0.155755   1.876 0.060689 .  
## black        -0.008703   0.005212  -1.670 0.094933 .  
## lstat         0.112414   0.060168   1.868 0.061715 .  
## medv          0.191745   0.087816   2.184 0.028999 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.65  on 353  degrees of freedom
## Residual deviance: 140.41  on 340  degrees of freedom
## AIC: 168.41
## 
## Number of Fisher Scoring iterations: 9
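Before refitting, the predictors significant at the 5% level can be pulled out of the full fit programmatically (a small sketch using the glm.fit object above); these are the variables used in the reduced model below.

coefs = summary(glm.fit)$coefficients
setdiff(rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05], "(Intercept)")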

Logistic regression with significant predictors

glm.fit2 = glm(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train, family = binomial)
summary(glm.fit2)
## 
## Call:
## glm(formula = crim_ind ~ zn + nox + dis + rad + medv, family = binomial, 
##     data = Boston2, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0766  -0.3226  -0.0034   0.0069   3.2715  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -30.10487    4.91615  -6.124 9.14e-10 ***
## zn           -0.09780    0.03537  -2.765 0.005698 ** 
## nox          42.65837    7.07436   6.030 1.64e-09 ***
## dis           0.84586    0.23756   3.561 0.000370 ***
## rad           0.49353    0.13194   3.741 0.000184 ***
## medv          0.08028    0.02906   2.763 0.005730 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.65  on 353  degrees of freedom
## Residual deviance: 164.99  on 348  degrees of freedom
## AIC: 176.99
## 
## Number of Fisher Scoring iterations: 8
glm.probs=predict(glm.fit2, Boston2.test, type = 'response')
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > .5] = 1
table(glm.pred, crim.ind.test)
##         crim.ind.test
## glm.pred  0  1
##        0 60 11
##        1 13 68
mean(glm.pred == crim.ind.test)
## [1] 0.8421053
mean(glm.pred!= crim.ind.test)
## [1] 0.1578947

LDA model

lda.fit = lda(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train)
lda.fit
## Call:
## lda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5084746 0.4915254 
## 
## Group means:
##          zn       nox      dis       rad     medv
## 0 21.188889 0.4695150 5.070315  4.127778 25.37056
## 1  1.172414 0.6377989 2.537681 14.873563 20.44770
## 
## Coefficients of linear discriminants:
##               LD1
## zn   -0.010534928
## nox   9.134260498
## dis   0.008656437
## rad   0.076273497
## medv  0.029024616
lda.pred = predict(lda.fit, Boston2.test)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class = lda.pred$class
table(lda.class, crim.ind.test)
##          crim.ind.test
## lda.class  0  1
##         0 71 19
##         1  2 60
mean(lda.class==crim.ind.test)
## [1] 0.8618421
mean(lda.class!=crim.ind.test)
## [1] 0.1381579

QDA

qda.fit = qda(crim_ind~zn + nox + dis + rad + medv, data = Boston2, subset = train)
qda.fit
## Call:
## qda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5084746 0.4915254 
## 
## Group means:
##          zn       nox      dis       rad     medv
## 0 21.188889 0.4695150 5.070315  4.127778 25.37056
## 1  1.172414 0.6377989 2.537681 14.873563 20.44770
qda.pred = predict(qda.fit, Boston2.test)
names(qda.pred)
## [1] "class"     "posterior"
qda.class = qda.pred$class
table(qda.class, crim.ind.test)
##          crim.ind.test
## qda.class  0  1
##         0 66 15
##         1  7 64
mean(qda.class == crim.ind.test)
## [1] 0.8552632
mean(qda.class!= crim.ind.test)
## [1] 0.1447368

KNN Classifier

train.X = cbind(nox,rad,ptratio,medv)[train,]
test.X =  cbind(nox,rad,ptratio,medv)[-train,]
train.crim_ind = crim_ind[train]
dim(train.X)
## [1] 354   4

KNN=1

set.seed(1)
knn.pred=knn(train.X,test.X,train.crim_ind, k=1)
table(knn.pred, crim.ind.test)
##         crim.ind.test
## knn.pred  0  1
##        0 58  6
##        1 15 73

KNN=5

knn.pred=knn(train.X,test.X,train.crim_ind, k=5)
table(knn.pred, crim.ind.test)
##         crim.ind.test
## knn.pred  0  1
##        0 61 12
##        1 12 67

KNN=10

knn.pred=knn(train.X,test.X,train.crim_ind, k=10)
table(knn.pred, crim.ind.test)
##         crim.ind.test
## knn.pred  0  1
##        0 67 20
##        1  6 59

KNN=100

knn.pred=knn(train.X,test.X,train.crim_ind, k=100)
table(knn.pred, crim.ind.test)
##         crim.ind.test
## knn.pred  0  1
##        0 73 39
##        1  0 40
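The tables above report only raw counts, so to compare the KNN fits directly each table can be turned into a test accuracy (a sketch reusing train.X, test.X, train.crim_ind, and crim.ind.test from above; from the counts shown, K = 1 is the strongest at roughly (58 + 73)/152 ≈ 0.86, while K = 100 drops to about (73 + 40)/152 ≈ 0.74):

set.seed(1)
knn.acc = sapply(c(1, 5, 10, 100), function(k)
  mean(knn(train.X, test.X, train.crim_ind, k = k) == crim.ind.test))
data.frame(k = c(1, 5, 10, 100), accuracy = knn.acc)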