### 13
## a
library(MASS)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked from 'package:MASS':
##
## Boston
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
plot(Weekly)
The scatterplot matrix shows no strong pairwise patterns except between Year and Volume: trading volume grows over time.
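A quick numeric check (an addition, not in the original output) is the correlation matrix of the quantitative variables; only Year and Volume are strongly correlated (roughly 0.84):

```r
# Direction (column 9) is qualitative, so drop it before computing correlations
round(cor(Weekly[, -9]), 2)
```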
## b
glm.fits <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                data = Weekly, family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only statistically significant predictor (p ≈ 0.03); no other coefficient besides the intercept reaches the 0.05 level.
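To pull the p-values directly from the fit (a small addition; these are the same numbers shown in the coefficient table above):

```r
# Coefficient p-values from the fitted logistic regression
coef(summary(glm.fits))[, "Pr(>|z|)"]
```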
## c
glm.probs <- predict(glm.fits, type = "response")
glm.pred <- rep("Down", nrow(Weekly))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
The model identifies 557 of the 605 true Up weeks (92.1%) but only 54 of the 484 true Down weeks (11.2%); overall accuracy is (54 + 557)/1089, about 56.1%.
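These rates can be read off the confusion matrix directly; a small sketch:

```r
conf <- table(glm.pred, Weekly$Direction)
sum(diag(conf)) / sum(conf)                 # overall accuracy, ~0.561
conf["Up", "Up"] / sum(conf[, "Up"])        # fraction of true Up weeks caught, ~0.921
conf["Down", "Down"] / sum(conf[, "Down"])  # fraction of true Down weeks caught, ~0.112
```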
## d
train <- (Weekly$Year < 2009)
Weekly.0910 <- Weekly[!train, ]
Direction.0910 <- Weekly$Direction[!train]
glm.fits <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs <- predict(glm.fits, Weekly.0910, type = "response")
glm.pred <- rep("Down", nrow(Weekly.0910))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.0910)
##         Direction.0910
## glm.pred Down Up
##     Down    9  5
##     Up     34 56
mean(glm.pred == Direction.0910)
## [1] 0.625
## e
lda.fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)
lda.pred <- predict(lda.fit, Weekly.0910)
table(lda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(lda.pred$class==Direction.0910)
## [1] 0.625
## f
qda.fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.pred = predict(qda.fit, Weekly.0910)
table(qda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 0 0
## Up 43 61
mean(qda.pred$class==Direction.0910)
## [1] 0.5865385
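Note that QDA predicts Up for every week in the test period, so its 58.7% accuracy is just the share of Up weeks in 2009-2010 (61/104); it never identifies a Down week.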
## g
library(class)
# Use Lag2 only, matching parts (d)-(f); columns must be pulled from Weekly explicitly
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(1)
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.0910)
##         Direction.0910
## knn.pred Down Up
##     Down   21 30
##     Up     22 31
mean(knn.pred == Direction.0910)
## [1] 0.5
## h
## i Both logistic regression and LDA achieve the highest test accuracy (62.5%) on the 2009-2010 data; QDA and KNN do worse.
## j
### 14
## a
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
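Since mpg01 splits at the median, the two classes should come out balanced; a quick sanity check (an addition, not in the original):

```r
table(Auto$mpg01)  # expect roughly equal counts of 0 and 1
```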
## b
plot(Auto)
The variables most strongly associated with mpg01 are weight, displacement, and cylinders.
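One way to quantify these associations (a small addition beyond the scatterplot matrix) is to correlate each numeric predictor with mpg01:

```r
# Drop the qualitative name column (9) before computing correlations
round(cor(Auto[, -9])[, "mpg01"], 2)
```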
## c
# The earlier `train` vector belongs to the Weekly data, so draw a fresh 70/30 split of Auto
set.seed(1)
train <- sample(1:nrow(Auto), nrow(Auto) * 0.7)
train.auto <- Auto[train, ]
test.auto <- Auto[-train, ]
## d
lda_m <- lda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data = train.auto)
lda_p <- predict(lda_m, test.auto)
table(lda_p$class, test.auto$mpg01)
##
## 0 1
## 0 166 7
## 1 29 189
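The corresponding test error rate can be computed directly from the predictions (a small addition; the exact value depends on the train/test split):

```r
mean(lda_p$class != test.auto$mpg01)  # LDA test error rate
```

The same one-liner with `qda_p$class` gives the QDA test error in part (e).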
## e
qda_m <- qda(mpg01~displacement+horsepower+weight+year+cylinders+origin, data = train.auto)
qda_p <- predict(qda_m, test.auto)
table(qda_p$class, test.auto$mpg01)
##
## 0 1
## 0 175 14
## 1 20 182
## f
glm_m <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = train.auto, family = binomial)
summary(glm_m)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = train.auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4400 -0.1910 0.0476 0.3572 3.3790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.7966002 1.7091082 6.902 5.12e-12 ***
## cylinders -0.0129303 0.3457347 -0.037 0.97017
## weight -0.0019458 0.0006918 -2.812 0.00492 **
## displacement -0.0129820 0.0082203 -1.579 0.11428
## horsepower -0.0421321 0.0139763 -3.015 0.00257 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 207.27 on 387 degrees of freedom
## (593 observations deleted due to missingness)
## AIC: 217.27
##
## Number of Fisher Scoring iterations: 7
probs <- predict(glm_m, test.auto, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, test.auto$mpg01)
mean(pred.glm != test.auto$mpg01)  # logistic regression test error rate
## g
## h
### 16
data(Boston)
crim_1 <- rep(0, length(Boston$crim))
crim_1[Boston$crim > median(Boston$crim)] <- 1
Boston <- data.frame(Boston, crim_1)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv crim_1
## Min. : 5.00 Min. :0.0
## 1st Qu.:17.02 1st Qu.:0.0
## Median :21.20 Median :0.5
## Mean :22.53 Mean :0.5
## 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :50.00 Max. :1.0
set.seed(1337)
train <- sample(1:nrow(Boston), nrow(Boston) * 0.7, replace = FALSE)
test <- -train
Boston.train <- Boston[train, ]
Boston.test <- Boston[test, ]
crim_1.test <- crim_1[test]
## Logistic Regression
# Fit on the training set only; fitting on all of Boston would leak the test data
fit.glm <- glm(crim_1 ~ nox + indus + age + rad, data = Boston.train, family = binomial)
probs <- predict(fit.glm, Boston.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, crim_1.test)
## crim_1.test
## pred.glm 0 1
## 0 63 16
## 1 7 66
mean(pred.glm != crim_1.test)
## [1] 0.1513158
The test error rate for logistic regression is about 15.1%.
## Linear Discriminant Analysis
fit.lda <- lda(crim_1 ~ nox + indus + age + rad, data = Boston.train)
pred.lda <- predict(fit.lda, Boston.test)
table(pred.lda$class, crim_1.test)
## crim_1.test
## 0 1
## 0 67 22
## 1 3 60
mean(pred.lda$class != crim_1.test)
## [1] 0.1644737
The test error rate for the LDA model is about 16.4%.
## KNN
# Standardize the predictors, excluding crim and the response crim_1 by name
# (the ISLR2 Boston data has 14 columns here, so the old -c(1, 15) index was off)
data <- scale(Boston[, setdiff(names(Boston), c("crim", "crim_1"))])
set.seed(1234)
train <- sample(1:nrow(Boston), nrow(Boston) * 0.7, replace = FALSE)
test <- -train
training_data <- data[train, c("nox", "indus", "age", "rad")]
testing_data <- data[test, c("nox", "indus", "age", "rad")]
train.crime_1 <- Boston$crim_1[train]
test.crime_1 <- Boston$crim_1[test]
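The section stops before KNN is actually run; a minimal sketch to complete it (k = 1 as a starting point; the resulting error rate depends on k and the seed, so no value is quoted here):

```r
library(class)
set.seed(1234)
# KNN with k = 1 on the standardized predictors
knn.pred <- knn(training_data, testing_data, as.factor(train.crime_1), k = 1)
table(knn.pred, test.crime_1)
mean(knn.pred != test.crime_1)  # KNN test error rate
```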