This is an R HTML document. When you click the Knit HTML button, a web page is generated that includes both the content and the output of any embedded R code chunks. The chunks below work through classification exercises Q10, Q11, and Q13 on the Weekly, Auto, and Boston data sets.

##Q10.
#a
library(ISLR)   # Weekly and Auto data sets
library(MASS)   # lda(), qda(), and the Boston data set
library(class)  # knn()
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
attach(Weekly)
plot(Volume)
# [Plot: Volume against week index; trading volume rises markedly over time]
#The only pair of variables with a substantial correlation is Year and Volume (r = 0.84); the lag variables and Today are essentially uncorrelated with each other.

#b
fit.glm = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(fit.glm)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
#Lag2 is the only predictor with a p-value below 0.05, so it is the only one that appears statistically significant.

#c
probs = predict(fit.glm, type = "response")
pred.glm = rep("Down", length(probs))
pred.glm[probs > 0.5] = "Up"
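#The fitted probabilities are P(Direction = "Up"), since Up is the factor level coded 1; contrasts() verifies the coding:
contrasts(Direction)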
table(pred.glm, Direction)
##         Direction
## pred.glm Down  Up
##     Down   54  48
##     Up    430 557
#Taking Up as the positive class, the matrix shows 557 true positives, 54 true negatives, 430 false positives, and 48 false negatives. Overall the model is correct (54 + 557)/1089 = 56.1% of the time, but it predicts Up far too often: it is right for only 54/484 of the Down weeks versus 557/605 of the Up weeks.
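#These rates can be checked directly from the objects above:
mean(pred.glm == Direction)  # overall accuracy, (54 + 557)/1089
54 / (54 + 430)              # accuracy on weeks that actually went Down
557 / (48 + 557)             # accuracy on weeks that actually went Up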

#d
train = (Year < 2009)
Weekly.20092010 = Weekly[!train, ]
Direction.20092010 = Direction[!train]
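#A quick check of the split; the null-deviance degrees of freedom below (984) imply 985 training weeks:
sum(train)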
fit.glm2 = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(fit.glm2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
probs2 = predict(fit.glm2, Weekly.20092010, type = "response")
pred.glm2 = rep("Down", length(probs2))
pred.glm2[probs2 > 0.5] = "Up"
table(pred.glm2, Direction.20092010)
##          Direction.20092010
## pred.glm2 Down Up
##      Down    9  5
##      Up     34 56
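#Fraction of correct predictions on the held-out 2009-2010 data, (9 + 56)/104 = 62.5%:
mean(pred.glm2 == Direction.20092010)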
#e
fit.lda = lda(Direction ~ Lag2, data = Weekly, subset = train)
fit.lda
pred.lda = predict(fit.lda, Weekly.20092010)
table(pred.lda$class, Direction.20092010)
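#Test accuracy for LDA, for comparison with the logistic fit above:
mean(pred.lda$class == Direction.20092010)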
#f
fit.qda = qda(Direction ~ Lag2, data = Weekly, subset = train)
fit.qda
pred.qda = predict(fit.qda, Weekly.20092010)
table(pred.qda$class, Direction.20092010)
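#Test accuracy for QDA:
mean(pred.qda$class == Direction.20092010)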
#g
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(1)
pred.knn = knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, Direction.20092010)
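#Test accuracy for KNN with K = 1:
mean(pred.knn == Direction.20092010)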
#h
#The best results came from logistic regression and LDA; the logistic model above classifies (9 + 56)/104 = 62.5% of the 2009-2010 weeks correctly.

#i
fit.glm3 = glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)  # only the Lag2:Lag1 interaction as predictor
probs3 = predict(fit.glm3, Weekly.20092010, type = "response")
pred.glm3 = rep("Down", length(probs3))
pred.glm3[probs3 > 0.5] = "Up"
table(pred.glm3, Direction.20092010)
##          Direction.20092010
## pred.glm3 Down Up
##      Down    1  1
##      Up     42 60
mean(pred.glm3 == Direction.20092010)
## [1] 0.5865385
fit.lda2 = lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
pred.lda2 = predict(fit.lda2, Weekly.20092010)
mean(pred.lda2$class == Direction.20092010)
fit.qda2 = qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
pred.qda2 = predict(fit.qda2, Weekly.20092010)
table(pred.qda2$class, Direction.20092010)
mean(pred.qda2$class == Direction.20092010)
pred.knn2 = knn(train.X, test.X, train.Direction, k = 10)
table(pred.knn2, Direction.20092010)
mean(pred.knn2 == Direction.20092010)
pred.knn3 = knn(train.X, test.X, train.Direction, k = 100)
table(pred.knn3, Direction.20092010)
mean(pred.knn3 == Direction.20092010)
#The original logistic regression and LDA models, using Lag2 alone, still perform best.

##Q11.
#a
attach(Auto)
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
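#Equivalently, the indicator can be built in one step:
mpg01 = as.numeric(mpg > median(mpg))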
Auto = data.frame(Auto, mpg01)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5               ford torino     0
## 6          ford galaxie 500     0
#b
par(mfrow=c(2,3))
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")
# [Boxplots: cylinders, displacement, horsepower, weight, acceleration, and year against mpg01]
cor(Auto[, -9])  # drop the non-numeric name column
#The variables that correlate most strongly with mpg01 are cylinders, displacement, horsepower, and weight.

#c
train = (year %% 2 == 0)  # even model years form the training set
Auto.train = Auto[train, ]
Auto.test = Auto[!train, ]
mpg01.test = mpg01[!train]
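#The confusion matrices below show the resulting test set has 182 observations (the odd model years):
nrow(Auto.test)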

#d
lda.fit = lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train)
lda.fit
lda.pred = predict(lda.fit, Auto.test)
lda.class = lda.pred$class
table(lda.class, mpg01.test)
mean(lda.class != mpg01.test)
#The LDA test error rate is about 12.64%.

#e
qda.fit = qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train)
qda.fit
qda.pred = predict(qda.fit, Auto.test)
qda.class = qda.pred$class
table(qda.class, mpg01.test)
mean(qda.class != mpg01.test)
#The QDA test error rate is about 13.19%.

#f
glm.fit = glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train, family = binomial)
summary(glm.fit)$coef
##                  Estimate  Std. Error    z value     Pr(>|z|)
## (Intercept)  17.658730372 3.409012230  5.1800138 2.218695e-07
## cylinders    -1.028031664 0.653606999 -1.5728590 1.157515e-01
## displacement  0.002461740 0.015029620  0.1637926 8.698944e-01
## horsepower   -0.050610857 0.025209015 -2.0076491 4.468060e-02
## weight       -0.002922352 0.001137367 -2.5694006 1.018746e-02
glm.probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > .5] = 1
table(glm.pred, mpg01.test)
##         mpg01.test
## glm.pred  0  1
##        0 89 11
##        1 11 71
mean(glm.pred != mpg01.test)
## [1] 0.1208791
#The logistic regression test error rate is about 12.09%.

#g
train.X = cbind(cylinders, displacement, horsepower, weight)[train, ]
test.X = cbind(cylinders, displacement, horsepower, weight)[!train, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is about 15.38% for K = 1.
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is about 16.48% for K = 10.
knn.pred = knn(train.X, test.X, train.mpg01, k = 100)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is about 14.29% for K = 100.


##Q13.
library(MASS)
attach(Boston)
crim01 = rep(0, length(crim))
crim01[crim > median(crim)] = 1
Boston = data.frame(Boston, crim01)
train = 1:(length(crim) / 2)
test = (length(crim) / 2 + 1):length(crim)
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crim01.test = crim01[test]
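#Note the split above simply takes the first 253 rows for training and the last 253 for testing, not a random sample. A randomized split (a sketch, not used for the results below) would be:
#set.seed(1)
#train = sample(length(crim), length(crim) / 2)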
glm.fit = glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial, subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit)
## 
## Call:
## glm(formula = crim01 ~ . - crim01 - crim, family = binomial, 
##     data = Boston, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.83229  -0.06593   0.00000   0.06181   2.61513  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -91.319906  19.490273  -4.685 2.79e-06 ***
## zn           -0.815573   0.193373  -4.218 2.47e-05 ***
## indus         0.354172   0.173862   2.037  0.04164 *  
## chas          0.167396   0.991922   0.169  0.86599    
## nox          93.706326  21.202008   4.420 9.88e-06 ***
## rm           -4.719108   1.788765  -2.638  0.00833 ** 
## age           0.048634   0.024199   2.010  0.04446 *  
## dis           4.301493   0.979996   4.389 1.14e-05 ***
## rad           3.039983   0.719592   4.225 2.39e-05 ***
## tax          -0.006546   0.007855  -0.833  0.40461    
## ptratio       1.430877   0.359572   3.979 6.91e-05 ***
## black        -0.017552   0.006734  -2.606  0.00915 ** 
## lstat         0.190439   0.086722   2.196  0.02809 *  
## medv          0.598533   0.185514   3.226  0.00125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.367  on 252  degrees of freedom
## Residual deviance:  69.568  on 239  degrees of freedom
## AIC: 97.568
## 
## Number of Fisher Scoring iterations: 10
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim01.test)
##         crim01.test
## glm.pred   0   1
##        0  68  24
##        1  22 139
mean(glm.pred != crim01.test)
## [1] 0.1818182
#The logistic regression has a test error rate of about 18.18%.

glm.fit = glm(crim01 ~ . - crim01 - crim - chas - nox - tax, data = Boston, family = binomial, subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit)
## 
## Call:
## glm(formula = crim01 ~ . - crim01 - crim - chas - nox - tax, 
##     family = binomial, data = Boston, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.04443  -0.24461  -0.00114   0.38919   2.72999  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -17.291707   6.019497  -2.873 0.004071 ** 
## zn           -0.478891   0.104276  -4.593 4.38e-06 ***
## indus         0.362719   0.082969   4.372 1.23e-05 ***
## rm           -2.364642   0.967625  -2.444 0.014535 *  
## age           0.063371   0.015457   4.100 4.14e-05 ***
## dis           1.494535   0.397249   3.762 0.000168 ***
## rad           1.756498   0.357330   4.916 8.85e-07 ***
## ptratio       0.575045   0.161917   3.551 0.000383 ***
## black        -0.018916   0.006754  -2.801 0.005102 ** 
## lstat         0.057632   0.053051   1.086 0.277326    
## medv          0.237282   0.081326   2.918 0.003527 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.37  on 252  degrees of freedom
## Residual deviance: 139.59  on 242  degrees of freedom
## AIC: 161.59
## 
## Number of Fisher Scoring iterations: 9
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim01.test)
##         crim01.test
## glm.pred   0   1
##        0  78  28
##        1  12 135
mean(glm.pred != crim01.test)
## [1] 0.1581028
#The logistic regression without chas, nox, and tax has a test error rate of about 15.81%.

lda.fit = lda(crim01 ~ . - crim01 - crim, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crim01.test)
##    crim01.test
##       0   1
##   0  80  24
##   1  10 139
mean(lda.pred$class != crim01.test)
## [1] 0.1343874
#The LDA has a test error rate of about 13.44%.

lda.fit = lda(crim01 ~ . - crim01 - crim - chas - nox - tax, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crim01.test)
##    crim01.test
##       0   1
##   0  83  28
##   1   7 135
mean(lda.pred$class != crim01.test)
## [1] 0.1383399
#The LDA without chas, nox, and tax has a test error rate of about 13.83%.

train.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[train, ]
test.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[test, ]
train.crim01 = crim01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.crim01, k = 1)
table(knn.pred, crim01.test)
mean(knn.pred != crim01.test)
#KNN (K = 1) has a test error rate of about 45.85%.
knn.pred = knn(train.X, test.X, train.crim01, k = 10)
table(knn.pred, crim01.test)
mean(knn.pred != crim01.test)
#KNN (K = 10) has a test error rate of about 11.86%.
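#KNN uses the raw predictor scales, so large-scale variables such as tax dominate the distance computation, which may explain the very poor K = 1 result. A common refinement (a sketch, not run here) is to standardize the predictors first:
#X = scale(cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv))
#knn(X[train, ], X[test, ], train.crim01, k = 1)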