library(ISLR)
## Warning: package 'ISLR' was built under R version 3.4.4
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
pairs(Weekly)
There seems to be a pattern at Volume and year
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data=Weekly, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag 2 seems to be significant
glm.prob <- predict(glm.fit, type="response")
glm.pred <- ifelse(glm.prob>.5, "Up", "Down")
cmatrix <- table(Weekly$Direction, glm.pred)
cmatrix
## glm.pred
## Down Up
## Down 54 430
## Up 48 557
There is prevalence of Up prediction. The model predicts Up direction very well, but for Down direction it performs badly.
train = (Weekly$Year<=2008)
test = Weekly[!train,]
glm.fit2 <- glm(Direction ~ Lag2, data=Weekly, subset=train, family="binomial")
glm.prob2 <- predict(glm.fit2, type="response", newdata=test)
glm.pred2 <- ifelse(glm.prob2>.5, "Up", "Down")
cmatrix2 <- table(test$Direction, glm.pred2)
cmatrix2
## glm.pred2
## Down Up
## Down 9 34
## Up 5 56
oa <- (cmatrix2["Down", "Down"] + cmatrix2["Up", "Up"])/sum(cmatrix2)
oa
## [1] 0.625
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda.fit <- lda(Direction ~ Lag2, data=Weekly, subset=train)
lda.pred <- predict(lda.fit, newdata=test)
cmatrix3 <- table(test$Direction, lda.pred$class)
cmatrix3
##
## Down Up
## Down 9 34
## Up 5 56
oa2<- (cmatrix3["Down", "Down"] + cmatrix3["Up", "Up"])/sum(cmatrix3)
oa2
## [1] 0.625
qda.fit <- qda(Direction ~ Lag2, data=Weekly, subset=train)
qda.pred <- predict(qda.fit, newdata=test)
cmatrix4 <- table(test$Direction, qda.pred$class)
cmatrix4
##
## Down Up
## Down 0 43
## Up 0 61
oa3<- (cmatrix4["Down", "Down"] + cmatrix4["Up", "Up"])/sum(cmatrix4)
oa3
## [1] 0.5865385
library(class)
train2 = Weekly[train, c("Lag2", "Direction")]
knn.pred = knn(train=data.frame(train2$Lag2), test=data.frame(test$Lag2), cl=train2$Direction, k=1)
cmatrix5 <- table(test$Direction, knn.pred)
cmatrix5
## knn.pred
## Down Up
## Down 21 22
## Up 29 32
oa4 <- (cmatrix5["Down", "Down"] + cmatrix5["Up", "Up"])/sum(cmatrix5)
oa4
## [1] 0.5096154
rbind(oa, oa2, oa3, oa4)
## [,1]
## oa 0.6250000
## oa2 0.6250000
## oa3 0.5865385
## oa4 0.5096154
models glm.fit2 oa from d and lda.fit oa2 from e
qda.fit <- qda(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly, subset=train)
qda.pred <- predict(qda.fit, test)
print( sum(qda.pred$class==test$Direction)/length(qda.pred$class))
## [1] 0.4326923
qda.fit <- qda(Direction ~ Lag1*Lag2*Lag3*Lag4*Lag5 + Volume, data=Weekly, subset=train)
qda.pred <- predict(qda.fit, test)
print( sum(qda.pred$class==test$Direction)/length(qda.pred$class))
## [1] 0.4134615
its worse than using only the Lag2 predictor shown in g and the second time even worse.
lda.fit <- lda(Direction ~ Lag1*Lag2*Lag3*Lag4*Lag5 + Volume, data=Weekly, subset=train)
lda.pred <- predict(lda.fit, test)
print( sum(lda.pred$class==test$Direction)/length(lda.pred$class))
## [1] 0.4423077
the lda is just as bad as when I ran qda the first time
mpg01 <- rep(0, length(Auto$mpg))
mpg01[Auto$mpg > median(Auto$mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name mpg01
## amc matador : 5 Min. :0.0
## ford pinto : 5 1st Qu.:0.0
## toyota corolla : 5 Median :0.5
## amc gremlin : 4 Mean :0.5
## amc hornet : 4 3rd Qu.:1.0
## chevrolet chevette: 4 Max. :1.0
## (Other) :365
cor(Auto[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(Auto)
par(mfrow=c(2,3))
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = Auto, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = Auto, main = "Year vs mpg01")
## Question b answer some association between “mpg01” and “cylinders”, “weight”, “displacement” and “horsepower” exsist.
set.seed(123)
train <- sample(1:dim(Auto)[1], dim(Auto)[1]*.7, rep=FALSE)
test <- -train
trainingset <- Auto[train, ]
testingset = Auto[test, ]
mpg01.test <- mpg01[test]
lda.fit <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = trainingset)
lda.fit
## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = trainingset)
##
## Prior probabilities of groups:
## 0 1
## 0.4817518 0.5182482
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.795455 3659.008 274.3333 130.88636
## 1 4.218310 2343.599 117.0528 79.71127
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.4483062756
## weight -0.0012201696
## displacement 0.0001078142
## horsepower 0.0046253024
lda.pred = predict(lda.fit, testingset)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.pred <- predict(lda.fit, testingset)
table(lda.pred$class, mpg01.test)
## mpg01.test
## 0 1
## 0 53 3
## 1 11 51
mean(lda.pred$class != mpg01.test)
## [1] 0.1186441
test error rate of 11.86%
qda.fit = qda(mpg01 ~ cylinders + horsepower + weight + acceleration, data=trainingset)
qda.fit
## Call:
## qda(mpg01 ~ cylinders + horsepower + weight + acceleration, data = trainingset)
##
## Prior probabilities of groups:
## 0 1
## 0.4817518 0.5182482
##
## Group means:
## cylinders horsepower weight acceleration
## 0 6.795455 130.88636 3659.008 14.65303
## 1 4.218310 79.71127 2343.599 16.46620
qda.class=predict(qda.fit, testingset)$class
table(qda.class, testingset$mpg01)
##
## qda.class 0 1
## 0 56 3
## 1 8 51
mean(qda.class != testingset$mpg01)
## [1] 0.09322034
test error rate of 0.093%
glm.fit <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = trainingset, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = trainingset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4870 -0.1763 0.1231 0.3633 3.2024
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.6661375 1.9885822 5.867 4.45e-09 ***
## cylinders 0.0365940 0.3817545 0.096 0.92363
## weight -0.0023706 0.0008319 -2.850 0.00438 **
## displacement -0.0106969 0.0096956 -1.103 0.26991
## horsepower -0.0327332 0.0154688 -2.116 0.03434 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 379.48 on 273 degrees of freedom
## Residual deviance: 147.50 on 269 degrees of freedom
## AIC: 157.5
##
## Number of Fisher Scoring iterations: 7
probs <- predict(glm.fit, testingset, type = "response")
glm.pred <- rep(0, length(probs))
glm.pred[probs > 0.5] <- 1
table(glm.pred, mpg01.test)
## mpg01.test
## glm.pred 0 1
## 0 54 3
## 1 10 51
mean(glm.pred != mpg01.test)
## [1] 0.1101695
Test error rate is 11.01695%.
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
data = scale(Auto[,-c(9,10)])
set.seed(1234)
train <- sample(1:dim(Auto)[1], 392*.7, rep=FALSE)
test <- -train
trainingset = data[train,c("cylinders","horsepower","weight","acceleration")]
testingset = data[test, c("cylinders", "horsepower","weight","acceleration")]
train.mpg01 = Auto$mpg01[train]
test.mpg01= Auto$mpg01[test]
library(class)
set.seed(1234)
knn_pred_y = knn(trainingset, testingset, train.mpg01, k = 1)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 50 4
## 1 6 58
mean(knn_pred_y != test.mpg01)
## [1] 0.08474576
Test error rate is 8.47%
library(MASS)
data("Boston")
crim01 <- rep(0, length(Boston$crim))
crim01[Boston$crim > median(Boston$crim)] <- 1
Boston <- data.frame(Boston, crim01)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
set.seed(1234)
train <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep=FALSE)
test <- -train
Boston.train <- Boston[train, ]
Boston.test <- Boston[test, ]
crim01.test <- crim01[test]
glm.fit <- glm(crim01 ~ . - crim01 - crim, data = Boston, family = binomial)
glm.fit
##
## Call: glm(formula = crim01 ~ . - crim01 - crim, family = binomial,
## data = Boston)
##
## Coefficients:
## (Intercept) zn indus chas nox
## -34.103704 -0.079918 -0.059389 0.785327 48.523782
## rm age dis rad tax
## -0.425596 0.022172 0.691400 0.656465 -0.006412
## ptratio black lstat medv
## 0.368716 -0.013524 0.043862 0.167130
##
## Degrees of Freedom: 505 Total (i.e. Null); 492 Residual
## Null Deviance: 701.5
## Residual Deviance: 211.9 AIC: 239.9
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.4
## corrplot 0.84 loaded
corrplot::corrplot.mixed(cor(Boston[, -1]), upper="shade")
glm.fit <- glm(crim01 ~ nox + indus + age + rad, data = Boston, family = binomial)
probs <- predict(glm.fit, Boston.test, type = "response")
glm.pred <- rep(0, length(probs))
glm.pred[probs > 0.5] <- 1
table(glm.pred, crim01.test)
## crim01.test
## glm.pred 0 1
## 0 65 15
## 1 4 68
mean(glm.pred != crim01.test)
## [1] 0.125
lda.fit <- lda(crim01 ~ nox + indus + age + rad , data = Boston)
lda.pred <- predict(lda.fit, Boston.test)
table(lda.pred$class, crim01.test)
## crim01.test
## 0 1
## 0 66 20
## 1 3 63
mean(lda.pred$class != crim01.test)
## [1] 0.1513158
data = scale(Boston[,-c(1,15)])
set.seed(1234)
train <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep=FALSE)
test <- -train
trainingset = data[train, c("nox" , "indus" , "age" , "rad")]
testingset = data[test, c("nox" , "indus" , "age" , "rad")]
train.crime01 = Boston$crim01[train]
test.crime01= Boston$crim01[test]
library(class)
set.seed(1234)
knn.pred.y = knn(trainingset, testingset, train.crime01, k = 1)
table(knn.pred.y, test.crime01)
## test.crime01
## knn.pred.y 0 1
## 0 62 7
## 1 7 76
mean(knn.pred.y != test.crime01)
## [1] 0.09210526
knn_pred_y = NULL
error_rate = NULL
for(i in 1:dim(testingset)[1]){
set.seed(1234)
knn.pred.y = knn(trainingset,testingset,train.crime01,k=i)
error_rate[i] = mean(test.crime01 != knn.pred.y)}
min_error_rate = min(error_rate)
print(min_error_rate)
## [1] 0.06578947
K = which(error_rate == min_error_rate)
print(K)
## [1] 3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
qplot(1:dim(testingset)[1], error_rate, xlab = "K",
ylab = "Error Rate",
geom=c("point", "line"))
library(rsconnect)
## Warning: package 'rsconnect' was built under R version 3.4.4