##Q10.
#a
library(ISLR)
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
cor(Weekly[, -9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
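#A scatterplot matrix is a quick visual complement to the correlation table; a minimal sketch
#using base R, dropping the qualitative Direction column (column 9) as in the cor() call above:
pairs(Weekly[, -9])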
attach(Weekly)
plot(Volume)

#The only pair of variables with a substantial correlation is Year and Volume (0.84); the lag variables are essentially uncorrelated with each other and with Today.
#b
fit.glm = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(fit.glm)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
#Lag2 is the only predictor with a p-value below 0.05, so it appears to be the only statistically significant one.
#c
probs = predict(fit.glm, type = "response")
pred.glm = rep("Down", length(probs))
pred.glm[probs > 0.5] = "Up"
table(pred.glm, Direction)
##         Direction
## pred.glm Down  Up
##     Down   54  48
##     Up    430 557
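#The overall training accuracy follows directly from this table:
mean(pred.glm == Direction)
# (54 + 557) / 1089 = 0.561, only slightly better than always predicting "Up" (605/1089 = 0.556)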
#Treating "Up" as the positive class, the matrix shows 557 true positives, 54 true negatives, 430 false positives, and 48 false negatives: the model catches 557/605 of the Up weeks but only 54/484 of the Down weeks.
#d
train = (Year < 2009)
Weekly.20092010 = Weekly[!train, ]
Direction.20092010 = Direction[!train]
fit.glm2 = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(fit.glm2)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly, 
##     subset = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
probs2 = predict(fit.glm2, Weekly.20092010, type = "response")
pred.glm2 = rep("Down", length(probs2))
pred.glm2[probs2 > 0.5] = "Up"
table(pred.glm2, Direction.20092010)
##          Direction.20092010
## pred.glm2 Down Up
##      Down    9  5
##      Up     34 56
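#The fraction of correct test predictions can be read off the table the same way:
mean(pred.glm2 == Direction.20092010)
# (9 + 56) / 104 = 0.625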
#e
library(MASS)  # provides lda() and qda()
fit.lda = lda(Direction ~ Lag2, data = Weekly, subset = train)
fit.lda
pred.lda = predict(fit.lda, Weekly.20092010)
table(pred.lda$class, Direction.20092010)
#f
fit.qda = qda(Direction ~ Lag2, data = Weekly, subset = train)
fit.qda
pred.qda = predict(fit.qda, Weekly.20092010)
table(pred.qda$class, Direction.20092010)
#g
library(class)  # provides knn()
train.X = as.matrix(Lag2[train])
test.X = as.matrix(Lag2[!train])
train.Direction = Direction[train]
set.seed(1)
pred.knn = knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, Direction.20092010)
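#As with the earlier classifiers, the overall test accuracy for KNN is the mean agreement with the held-out labels:
mean(pred.knn == Direction.20092010)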
#h
#The best results came from logistic regression and LDA.
#i
fit.glm3 = glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
probs3 = predict(fit.glm3, Weekly.20092010, type = "response")
pred.glm3 = rep("Down", length(probs3))
pred.glm3[probs3 > 0.5] = "Up"
table(pred.glm3, Direction.20092010)
##          Direction.20092010
## pred.glm3 Down Up
##      Down    1  1
##      Up     42 60
mean(pred.glm3 == Direction.20092010)
## [1] 0.5865385
fit.lda2 = lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
pred.lda2 = predict(fit.lda2, Weekly.20092010)
mean(pred.lda2$class == Direction.20092010)
fit.qda2 = qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
pred.qda2 = predict(fit.qda2, Weekly.20092010)
table(pred.qda2$class, Direction.20092010)
mean(pred.qda2$class == Direction.20092010)
pred.knn2 = knn(train.X, test.X, train.Direction, k = 10)
table(pred.knn2, Direction.20092010)
mean(pred.knn2 == Direction.20092010)
pred.knn3 = knn(train.X, test.X, train.Direction, k = 100)
table(pred.knn3, Direction.20092010)
mean(pred.knn3 == Direction.20092010)
#The original logistic regression and LDA fits still have the best performance.
##Q11.
#a
attach(Auto)
mpg01 = rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
Auto = data.frame(Auto, mpg01)
head(Auto)
##   mpg cylinders displacement horsepower weight acceleration year origin
## 1  18         8          307        130   3504         12.0   70      1
## 2  15         8          350        165   3693         11.5   70      1
## 3  18         8          318        150   3436         11.0   70      1
## 4  16         8          304        150   3433         12.0   70      1
## 5  17         8          302        140   3449         10.5   70      1
## 6  15         8          429        198   4341         10.0   70      1
##                        name mpg01
## 1 chevrolet chevelle malibu     0
## 2         buick skylark 320     0
## 3        plymouth satellite     0
## 4             amc rebel sst     0
## 5                ford torino     0
## 6          ford galaxie 500     0
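#An equivalent, more compact way to build the binary response; a sketch using base R's ifelse(),
#producing the same vector as the two-step construction above:
mpg01 = ifelse(mpg > median(mpg), 1, 0)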
#b
par(mfrow = c(2, 3))
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")

cor(na.omit(Auto[-9]))
#The variables that appear to correlate most strongly with mpg01 are cylinders, displacement, horsepower, and weight.
#c
train = (year %% 2 == 0)
Auto.train = Auto[train, ]
Auto.test = Auto[!train, ]
mpg01.test = mpg01[!train]
#d
lda.fit = lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train)
lda.fit
lda.pred = predict(lda.fit, Auto.test)
lda.class = lda.pred$class
table(lda.class, mpg01.test)
mean(lda.class != mpg01.test)
#The LDA test error rate is 12.64%.
#e
qda.fit = qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train)
qda.fit
qda.pred = predict(qda.fit, Auto.test)
qda.class = qda.pred$class
table(qda.class, mpg01.test)
mean(qda.class != mpg01.test)
#The QDA test error rate is 13.19%.
#f
glm.fit = glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto, subset = train, family = binomial)
summary(glm.fit)$coef
##                  Estimate  Std. Error    z value     Pr(>|z|)
## (Intercept)  17.658730372 3.409012230  5.1800138 2.218695e-07
## cylinders    -1.028031664 0.653606999 -1.5728590 1.157515e-01
## displacement  0.002461740 0.015029620  0.1637926 8.698944e-01
## horsepower   -0.050610857 0.025209015 -2.0076491 4.468060e-02
## weight       -0.002922352 0.001137367 -2.5694006 1.018746e-02
glm.probs = predict(glm.fit, Auto.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > .5] = 1
table(glm.pred, mpg01.test)
##         mpg01.test
## glm.pred  0  1
##        0 89 11
##        1 11 71
mean(glm.pred != mpg01.test)
## [1] 0.1208791
#The logistic regression test error rate is 12.09%.
#g
train.X = cbind(cylinders, displacement, horsepower, weight)[train, ]
test.X = cbind(cylinders, displacement, horsepower, weight)[!train, ]
train.mpg01 = mpg01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.mpg01, k = 1)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is 15.38% for K = 1.
knn.pred = knn(train.X, test.X, train.mpg01, k = 10)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is 16.48% for K = 10.
knn.pred = knn(train.X, test.X, train.mpg01, k = 100)
table(knn.pred, mpg01.test)
mean(knn.pred != mpg01.test)
#The test error rate is 14.29% for K = 100, so of the values tried, K = 100 performs best.
##Q13.
library(MASS)
attach(Boston)
crim01 = rep(0, length(crim))
crim01[crim > median(crim)] = 1
Boston = data.frame(Boston, crim01)
train = 1:(length(crim) / 2)
test = (length(crim) / 2 + 1):length(crim)
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crim01.test = crim01[test]
glm.fit = glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial, subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit)
## 
## Call:
## glm(formula = crim01 ~ . - crim01 - crim, family = binomial, 
##     data = Boston, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.83229  -0.06593   0.00000   0.06181   2.61513  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -91.319906  19.490273  -4.685 2.79e-06 ***
## zn           -0.815573   0.193373  -4.218 2.47e-05 ***
## indus         0.354172   0.173862   2.037  0.04164 *  
## chas          0.167396   0.991922   0.169  0.86599    
## nox          93.706326  21.202008   4.420 9.88e-06 ***
## rm           -4.719108   1.788765  -2.638  0.00833 ** 
## age           0.048634   0.024199   2.010  0.04446 *  
## dis           4.301493   0.979996   4.389 1.14e-05 ***
## rad           3.039983   0.719592   4.225 2.39e-05 ***
## tax          -0.006546   0.007855  -0.833  0.40461    
## ptratio       1.430877   0.359572   3.979 6.91e-05 ***
## black        -0.017552   0.006734  -2.606  0.00915 ** 
## lstat         0.190439   0.086722   2.196  0.02809 *  
## medv          0.598533   0.185514   3.226  0.00125 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.367  on 252  degrees of freedom
## Residual deviance:  69.568  on 239  degrees of freedom
## AIC: 97.568
## 
## Number of Fisher Scoring iterations: 10
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim01.test)
##         crim01.test
## glm.pred   0   1
##        0  68  24
##        1  22 139
mean(glm.pred != crim01.test)
## [1] 0.1818182
#The logistic regression has a test error rate of 18.18%.
glm.fit = glm(crim01 ~ . - crim01 - crim - chas - nox - tax, data = Boston, family = binomial, subset = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.fit)
## 
## Call:
## glm(formula = crim01 ~ . - crim01 - crim - chas - nox - tax, 
##     family = binomial, data = Boston, subset = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.04443  -0.24461  -0.00114   0.38919   2.72999  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -17.291707   6.019497  -2.873 0.004071 ** 
## zn           -0.478891   0.104276  -4.593 4.38e-06 ***
## indus         0.362719   0.082969   4.372 1.23e-05 ***
## rm           -2.364642   0.967625  -2.444 0.014535 *  
## age           0.063371   0.015457   4.100 4.14e-05 ***
## dis           1.494535   0.397249   3.762 0.000168 ***
## rad           1.756498   0.357330   4.916 8.85e-07 ***
## ptratio       0.575045   0.161917   3.551 0.000383 ***
## black        -0.018916   0.006754  -2.801 0.005102 ** 
## lstat         0.057632   0.053051   1.086 0.277326    
## medv          0.237282   0.081326   2.918 0.003527 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 329.37  on 252  degrees of freedom
## Residual deviance: 139.59  on 242  degrees of freedom
## AIC: 161.59
## 
## Number of Fisher Scoring iterations: 9
glm.probs = predict(glm.fit, Boston.test, type = "response")
glm.pred = rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim01.test)
##         crim01.test
## glm.pred   0   1
##        0  78  28
##        1  12 135
mean(glm.pred != crim01.test)
## [1] 0.1581028
#The logistic regression (- chas - nox - tax) has a test error rate of 15.81%.
lda.fit = lda(crim01 ~ . - crim01 - crim, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crim01.test)
##    crim01.test
##       0   1
##   0  80  24
##   1  10 139
mean(lda.pred$class != crim01.test)
## [1] 0.1343874
#The LDA has a test error rate of 13.44%.
lda.fit = lda(crim01 ~ . - crim01 - crim - chas - nox - tax, data = Boston, subset = train)
lda.pred = predict(lda.fit, Boston.test)
table(lda.pred$class, crim01.test)
##    crim01.test
##       0   1
##   0  83  28
##   1   7 135
mean(lda.pred$class != crim01.test)
## [1] 0.1383399
#The LDA (- chas - nox - tax) has a test error rate of 13.83%.
train.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[train, ]
test.X = cbind(zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat, medv)[test, ]
train.crim01 = crim01[train]
set.seed(1)
knn.pred = knn(train.X, test.X, train.crim01, k = 1)
table(knn.pred, crim01.test)
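#The error rates quoted below are the misclassification fractions from these tables; they can be computed directly:
mean(knn.pred != crim01.test)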
#KNN (k = 1) has a test error rate of 45.85%.
knn.pred = knn(train.X, test.X, train.crim01, k = 10)
table(knn.pred, crim01.test)
#KNN (k = 10) has a test error rate of 11.86%, the best of the models fitted here.
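#One caveat on the Q13 results: the train/test split above takes the first half of Boston's rows,
#which are not in random order, so the error estimates may be sensitive to that ordering.
#A minimal sketch of a random split instead, reusing the variable names from above:
set.seed(1)
train = sample(nrow(Boston), nrow(Boston) / 2)  # random half of the row indices
Boston.train = Boston[train, ]
Boston.test = Boston[-train, ]
crim01.test = crim01[-train]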