Chapter 4 Homework

library(ISLR)
library(naivebayes)

## naivebayes 0.9.7 loaded

library(e1071)
library(class)
library(MASS)
# Preview the first six weekly observations (six rows is head()'s default).
head(Weekly, n = 6L)

##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

# Size of the data set as c(rows, columns).
c(nrow(Weekly), ncol(Weekly))

## [1] 1089    9

# Weekly is not attach()ed: every later step reaches its columns through
# `data = Weekly` or `Weekly$...`, so attaching would only place a second copy
# of the columns on the search path.
par(mfrow = c(2, 2))

# Per-column summary of the data set (the numeric columns and Direction).
summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

# Plot the whole Weekly data frame in a single call.
plot(Weekly)

There appears to be growing volume as the years progress.

# Pairwise correlations of every column except Direction. Direction is dropped
# by name instead of by its position (9), so the call keeps working if the
# column order ever changes.
cor(Weekly[, names(Weekly) != "Direction"])

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

This once again shows that the only strong relationship is between Volume and Year (correlation of about 0.84).

# Logistic regression of Direction on the five lag variables and Volume,
# fitted to all rows of Weekly.
glm.fit1 <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  family = binomial,
  data = Weekly
)
summary(glm.fit1)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Lag2 has a p-value of 0.0296, which is significant at the 0.05 level, as indicated by the single asterisk.

# Fitted probabilities for the same rows the model was trained on.
glm.prob1 <- predict(glm.fit1, type = "response")

# Classify at the 0.5 cut-off. The vector is sized from the data rather than
# from a hard-coded 1089, so it stays correct if Weekly changes.
glm.pred <- rep("Down", nrow(Weekly))
glm.pred[glm.prob1 > 0.5] <- "Up"

# Confusion matrix: predicted class (rows) against observed Direction (columns).
table(glm.pred, Weekly$Direction)

##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557

# Overall accuracy, computed from the predictions themselves instead of from
# counts copied by hand out of the confusion matrix.
mean(glm.pred == Weekly$Direction)

## [1] 0.5610652

# Overall error rate, computed from the predictions themselves instead of from
# counts copied by hand out of the confusion matrix.
mean(glm.pred != Weekly$Direction)

## [1] 0.4389348

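This confusion matrix shows a 56% accuracy rate and therefore a 44% overall error rate. Most of the errors are false positives: 430 “Down” weeks were predicted as “Up”, against only 48 “Up” weeks predicted as “Down”.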

# Hold-out split: weeks before 2009 are used for fitting, the rest for testing.
train <- Weekly$Year < 2009

# Logistic regression on Lag2 alone, fitted to the training weeks only.
glm.fit <- glm(
  Direction ~ Lag2,
  family = "binomial",
  data = Weekly,
  subset = train
)
summary(glm.fit)

## 
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4

# Predicted probabilities for the held-out weeks.
glm.probs <- predict(glm.fit, newdata = Weekly[!train, ], type = "response")

# Start from "Down" for every held-out week, then flip to "Up" above 0.5.
glm.pred <- rep("Down", sum(!train))
glm.pred[glm.probs > 0.5] <- "Up"

# Confusion matrix on the held-out weeks.
table(glm.pred, Weekly$Direction[!train])

##         
## glm.pred Down Up
##     Down    9  5
##     Up     34 56

# Test error rate, computed from the predictions instead of from counts copied
# by hand out of the confusion matrix.
mean(glm.pred != Weekly$Direction[!train])

## [1] 0.375

# Test accuracy, computed from the predictions instead of from counts copied
# by hand out of the confusion matrix.
mean(glm.pred == Weekly$Direction[!train])

## [1] 0.625

The confusion matrix still shows that this model is unreliable with an accurate prediction only 62.5% of the time.

library(MASS)
# Linear discriminant analysis on Lag2, fitted to the training weeks only.
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
print(lda.fit)

## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162

# Classify the held-out weeks and tabulate predicted (rows) vs. observed (columns).
lda.pred <- predict(lda.fit, newdata = Weekly[!train, ])
table(lda.pred$class, Weekly$Direction[!train])

##       
##        Down Up
##   Down    9  5
##   Up     34 56

The results from the LDA output are identical to the results from part d.

# Quadratic discriminant analysis on Lag2, fitted to the training weeks only.
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
print(qda.fit)

## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581

# Classify the held-out weeks and tabulate predicted (rows) vs. observed (columns).
qda.pred <- predict(qda.fit, newdata = Weekly[!train, ])
table(qda.pred$class, Weekly$Direction[!train])

##       
##        Down Up
##   Down    0  0
##   Up     43 61

# Test accuracy of the QDA classifier, computed from its predictions instead of
# from counts copied by hand out of the confusion matrix.
mean(qda.pred$class == Weekly$Direction[!train])

## [1] 0.5865385

The results show that the model predicts “Up” for every week, so it is 100% correct on the “Up” weeks but 0% correct on the “Down” weeks. It is again reliable only 59% of the time, which is slightly worse than the 62.5% achieved by the previous two models.

library(class)
# Single-predictor (Lag2) inputs for knn(), split into training and test weeks.
train.X <- data.frame(Weekly[train, ]$Lag2)
test.X <- data.frame(Weekly[!train, ]$Lag2)
train.Direction <- Weekly$Direction[train]

# Seed set immediately before the knn() call, as in the original chunk.
set.seed(1)
knn.pred <- knn(train = train.X, test = test.X, cl = train.Direction, k = 1)
table(knn.pred, Weekly$Direction[!train])

##         
## knn.pred Down Up
##     Down   21 30
##     Up     22 31

This model has only a 50% accuracy rate, meaning it is less accurate than the previous models.

# Naive Bayes on Lag2, fitted to the training weeks and scored on the rest.
nb.fit <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
nb.class <- predict(nb.fit, newdata = Weekly[!train, ])
table(nb.class, Weekly$Direction[!train])

##         
## nb.class Down Up
##     Down    0  0
##     Up     43 61

# Share of held-out weeks the naive Bayes classifier labels correctly.
mean(nb.class == Weekly$Direction[!train])

## [1] 0.5865385

This model is 59% accurate, which is very similar to other models, but better than the KNN.

The logistic regression model (tied with LDA at 62.5% test accuracy) seems to be the most accurate.

Problem 14 (A)

# Work on a copy of Auto with incomplete rows removed, then preview it.
Auto <- na.omit(Auto)
head(Auto, n = 6L)

##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name
## 1 chevrolet chevelle malibu
## 2         buick skylark 320
## 3        plymouth satellite
## 4             amc rebel sst
## 5               ford torino
## 6          ford galaxie 500

# Binary response: 1 when a car's mpg is above the median mpg, 0 otherwise.
mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))
Auto <- data.frame(Auto, mpg01)

# Later KNN code refers to cylinders and displacement by bare name, which
# relies on this attach().
attach(Auto)

## The following object is masked _by_ .GlobalEnv:
## 
##     mpg01

# 2 x 3 grid of boxplots, one per predictor, each split by mpg01.
par(mfrow = c(2, 3))
invisible(Map(
  function(predictor, label) {
    boxplot(
      reformulate("mpg01", response = predictor),
      data = Auto,
      xlab = "mpg01",
      ylab = label,
      main = paste(label, "to MPG01")
    )
  },
  c("cylinders", "displacement", "horsepower", "weight", "acceleration", "year"),
  c("Cylinders", "Displacement", "Horsepower", "Weight", "Acceleration", "Year")
))

I am not sure if I did something wrong, but the scatterplots weren’t particularly helpful. Items in the boxplots came back with pretty much what was expected. The more cylinders, higher displacement, higher horsepower, heavier, quicker, and older the car, the worse it is on gas. It does appear that there are a couple of outliers with 8 cylinders that still achieved over the median mpg, which was surprising.

# Reproducible random split: 75% of the row indices go to the training set.
set.seed(1)
train <- sample(nrow(Auto), size = 0.75 * nrow(Auto))

I had numerous issues with the following steps, but following along with similar items, I found it easier to just set the training data and remove the train data manually below.

# LDA of mpg01 on cylinders and displacement, fitted to the training rows and
# evaluated on the remaining rows.
lda.fit <- lda(mpg01 ~ cylinders + displacement, data = Auto, subset = train)
lda.pred <- predict(lda.fit, newdata = Auto[-train, ])
table(lda.pred$class, Auto$mpg01[-train], dnn = c("Predicted", "Actual"))

##          Actual
## Predicted  0  1
##         0 41  2
##         1 12 43

Overall, this model had an accuracy of 86% based on the two variables cylinders and displacement. Error rate of 14%

 # QDA of mpg01 on cylinders and displacement: fit on the training rows, then
# tabulate predicted vs. actual classes for the remaining rows.
qda.fit <- qda(mpg01 ~ cylinders + displacement, data = Auto, subset = train)
qda.pred <- predict(qda.fit, newdata = Auto[-train, ])
table(qda.pred$class, Auto$mpg01[-train], dnn = c("Predicted", "Actual"))

##          Actual
## Predicted  0  1
##         0 45  3
##         1  8 42

# Share of test rows that QDA classifies correctly.
mean(qda.pred$class == Auto$mpg01[-train])

## [1] 0.8877551

This model is slightly better than the previous one, with an 89% accuracy rating. Overall, there were fewer false positives, which primarily improved the accuracy on the non-efficient, or 0, portion. Error rate of 11%.

# Logistic regression of mpg01 on cylinders and displacement (training rows).
glm.fit <- glm(
  mpg01 ~ cylinders + displacement,
  family = "binomial",
  data = Auto,
  subset = train
)
summary(glm.fit)

## 
## Call:
## glm(formula = mpg01 ~ cylinders + displacement, family = "binomial", 
##     data = Auto, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6093  -0.1964   0.3058   0.4688   3.2678  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   5.743895   0.978458   5.870 4.35e-09 ***
## cylinders    -0.112651   0.382326  -0.295 0.768264    
## displacement -0.029077   0.007526  -3.864 0.000112 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 407.35  on 293  degrees of freedom
## Residual deviance: 176.80  on 291  degrees of freedom
## AIC: 182.8
## 
## Number of Fisher Scoring iterations: 6

# Predicted probabilities for the test rows, turned into 0/1 labels at 0.5.
glm.probs <- predict(glm.fit, newdata = Auto[-train, ], type = "response")
glm.pred <- as.numeric(glm.probs > 0.5)
table(glm.pred, Auto$mpg01[-train], dnn = c("Predicted", "Actual"))

##          Actual
## Predicted  0  1
##         0 41  1
##         1 12 44

# Share of test rows that the logistic model classifies correctly.
mean(glm.pred == Auto$mpg01[-train])

## [1] 0.8673469

This model falls right between the two previous models at 87%, but overall it has properly classified the most “1”s (44) and misclassified the fewest actual “1”s (only one false negative). Error rate of 13%.

# Naive Bayes of mpg01 on cylinders and displacement: fit on the training rows,
# predict the remaining rows, and cross-tabulate against the actual labels.
nb.fit <- naiveBayes(mpg01 ~ cylinders + displacement, data = Auto, subset = train)
nb.class <- predict(nb.fit, newdata = Auto[-train, ])
table(nb.class, Auto$mpg01[-train])

##         
## nb.class  0  1
##        0 45  3
##        1  8 42

The bayes model has an accuracy of 89% and error rate of 11%

# Predictor matrices for knn(). The columns are taken from Auto itself rather
# than from bare names, so this chunk no longer depends on attach(Auto) and
# cannot pick up a stale attached copy of the data.
train.x <- as.matrix(Auto[train, c("cylinders", "displacement")])
test.x <- as.matrix(Auto[-train, c("cylinders", "displacement")])
train.mpg01 <- Auto$mpg01[train]

# Seed set immediately before the knn() call, as in the original chunk.
set.seed(1)
knn.pred <- knn(train.x, test.x, train.mpg01, k = 1)
table(knn.pred, Auto$mpg01[-train])

##         
## knn.pred  0  1
##        0 44  2
##        1  9 43

# Test accuracy of the k = 1 classifier.
mean(knn.pred == Auto$mpg01[-train])

## [1] 0.8877551

KNN with a value of 1 has an 89% accuracy and 11% error rate.

# Same KNN set-up with 5 neighbours; the seed is reset before the call.
set.seed(1)
knn.pred <- knn(train = train.x, test = test.x, cl = train.mpg01, k = 5)
table(knn.pred, Auto$mpg01[-train])

##         
## knn.pred  0  1
##        0 41  0
##        1 12 45

# Test accuracy of the k = 5 classifier.
mean(knn.pred == Auto$mpg01[-train])

## [1] 0.877551

# Naive Bayes class predictions for the test rows. newdata must be a data frame
# containing the predictors (cylinders, displacement); passing only the mpg01
# column caused the type-mismatch warnings shown for this chunk, which go away
# when it is re-run.
nb.class <- predict(nb.fit, Auto[-train, ])

## Warning in predict.naiveBayes(nb.fit, Auto[-train, "mpg01"]): Type mismatch
## between training and new data for variable 'cylinders'. Did you use factors with
## numeric labels for training, and numeric values for new data?

## Warning in predict.naiveBayes(nb.fit, Auto[-train, "mpg01"]): Type mismatch
## between training and new data for variable 'displacement'. Did you use factors
## with numeric labels for training, and numeric values for new data?

# KNN with 10 neighbours on the same training and test matrices.
knn.pred <- knn(train = train.x, test = test.x, cl = train.mpg01, k = 10)
table(knn.pred, Auto$mpg01[-train])

##         
## knn.pred  0  1
##        0 41  0
##        1 12 45

# Test accuracy of the k = 10 classifier.
mean(knn.pred == Auto$mpg01[-train])

## [1] 0.877551

# KNN with 15 neighbours on the same training and test matrices.
knn.pred <- knn(train = train.x, test = test.x, cl = train.mpg01, k = 15)
table(knn.pred, Auto$mpg01[-train])

##         
## knn.pred  0  1
##        0 41  1
##        1 12 44

# Test accuracy of the k = 15 classifier.
mean(knn.pred == Auto$mpg01[-train])

## [1] 0.8673469

With KNN values at intervals of 5 between 5 and 15, there were no significant changes.

Problem 16

# Work on a copy of Boston with incomplete rows removed, then preview it.
Boston <- na.omit(Boston)
head(Boston, n = 6L)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Binary response: 1 when a suburb's crime rate is above the median, else 0.
# Boston is deliberately not attach()ed. The original attached it twice, which
# left two copies on the search path (hence the masking messages), and every
# model below already reads its columns through `data = Boston`.
crim01 <- as.numeric(Boston$crim > median(Boston$crim))
Boston <- data.frame(Boston, crim01)

## The following object is masked _by_ .GlobalEnv:
## 
##     crim01

## The following objects are masked from Boston (pos = 3):
## 
##     age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio, rad,
##     rm, tax, zn

# Correlation matrix of all Boston columns, including the new crim01 indicator.
cor(x = Boston)

##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## crim01   0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01  -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128  0.2535684
##               black      lstat       medv      crim01
## crim    -0.38506394  0.4556215 -0.3883046  0.40939545
## zn       0.17552032 -0.4129946  0.3604453 -0.43615103
## indus   -0.35697654  0.6037997 -0.4837252  0.60326017
## chas     0.04878848 -0.0539293  0.1752602  0.07009677
## nox     -0.38005064  0.5908789 -0.4273208  0.72323480
## rm       0.12806864 -0.6138083  0.6953599 -0.15637178
## age     -0.27353398  0.6023385 -0.3769546  0.61393992
## dis      0.29151167 -0.4969958  0.2499287 -0.61634164
## rad     -0.44441282  0.4886763 -0.3816262  0.61978625
## tax     -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio -0.17738330  0.3740443 -0.5077867  0.25356836
## black    1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat   -0.36608690  1.0000000 -0.7376627  0.45326273
## medv     0.33346082 -0.7376627  1.0000000 -0.26301673
## crim01  -0.35121093  0.4532627 -0.2630167  1.00000000

# Random split: about 70% of the row indices form the training set.
# NOTE(review): no set.seed() is called in this chunk, so the split depends on
# whatever RNG state earlier chunks left behind.
train <- sample(nrow(Boston), size = 0.7 * nrow(Boston))
test <- Boston[-train, ]

# Logistic regression of crim01 on six predictors, fitted to the training rows.
glm.fit <- glm(
  crim01 ~ indus + nox + age + dis + rad + tax,
  family = "binomial",
  data = Boston,
  subset = train
)
summary(glm.fit)

## 
## Call:
## glm(formula = crim01 ~ indus + nox + age + dis + rad + tax, family = "binomial", 
##     data = Boston, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.94918  -0.30759  -0.04657   0.01061   2.80525  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -28.398069   4.687537  -6.058 1.38e-09 ***
## indus        -0.068147   0.047921  -1.422  0.15501    
## nox          46.760746   8.465762   5.524 3.32e-08 ***
## age           0.019822   0.010246   1.935  0.05305 .  
## dis           0.499578   0.174007   2.871  0.00409 ** 
## rad           0.555643   0.130216   4.267 1.98e-05 ***
## tax          -0.006519   0.002840  -2.295  0.02171 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 490.34  on 353  degrees of freedom
## Residual deviance: 182.39  on 347  degrees of freedom
## AIC: 196.39
## 
## Number of Fisher Scoring iterations: 8

# Predicted probabilities for the test rows, turned into 0/1 labels at 0.5.
glm.probs <- predict(glm.fit, newdata = Boston[-train, ], type = "response")
glm.pred <- as.numeric(glm.probs > 0.5)
table(glm.pred, Boston$crim01[-train], dnn = c("Predicted", "Actual"))

##          Actual
## Predicted  0  1
##         0 60  7
##         1 10 75

# LDA of crim01 on the same six predictors: fit on the training rows, then
# tabulate predicted vs. actual classes for the test rows.
lda.fit <- lda(
  crim01 ~ indus + nox + age + dis + rad + tax,
  data = Boston,
  subset = train
)
lda.pred <- predict(lda.fit, newdata = Boston[-train, ])
table(lda.pred$class, Boston$crim01[-train], dnn = c("Predicted", "Actual"))

##          Actual
## Predicted  0  1
##         0 65 17
##         1  5 65

# Naive Bayes of crim01 on the same six predictors. The model is now fitted to
# the training rows only (subset = train), like the logistic and LDA fits, so
# the test rows scored below are not part of the fit. The table printed for
# this chunk came from a fit on all rows and will change when it is re-run.
nb.fit <- naiveBayes(
  crim01 ~ indus + nox + age + dis + rad + tax,
  data = Boston,
  subset = train
)
nb.class <- predict(nb.fit, Boston[-train, ])
table(nb.class, Boston[-train, "crim01"])

##         
## nb.class  0  1
##        0 64 18
##        1  6 64

After numerous hours of errors on KNN, I finally gave up on trying. The logistic regression provided the most accurate readings, with an error rate of ~11%. Upon observing the p-values, it appears that the most significant predictors were nox followed by rad.

Chapter 4 Homework

Charles Ponthieux

2022-10-02