library(ISLR)
library(corrplot)
## corrplot 0.92 loaded
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
The correlation plot below suggests a period-two pattern: the even lags (Today, i.e. Lag0, together with Lag2 and Lag4) are positively correlated with one another, and likewise the odd lags.
require(corrplot)
cor_test <- cor.mtest(Weekly[, 1:8], conf.level = 0.90)
corrplot(cor(Weekly[, 1:8]), method = 'color',
         order = 'hclust', addrect = 3,
         p.mat = cor_test$p, sig.level = 0.10, tl.col = 'black')
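As a quick numeric check of the pattern described above, the lag correlations can also be printed directly (a minimal sketch):
round(cor(Weekly[, c("Today", paste0("Lag", 1:5))]), 3)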
Only the Lag2 variable turns out to be statistically significant, as the logistic regression below confirms (p = 0.0296).
attach(Weekly)
Weekly_fit<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial)
summary(Weekly_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
logWeekly.prob= predict(Weekly_fit, type='response')
logWeekly.pred =rep("Down", length(logWeekly.prob))
logWeekly.pred[logWeekly.prob > 0.5] = "Up"
table(logWeekly.pred, Direction)
## Direction
## logWeekly.pred Down Up
## Down 54 48
## Up 430 557
The model predicted the weekly market direction correctly 56.11% of the time ((54 + 557)/1089). Breaking this down by class: Up weeks were predicted correctly at a rate of 557/(48 + 557) = 0.9207, i.e. 92.07%, while Down weeks were predicted correctly at a much lower rate of 54/(430 + 54) = 0.1115, i.e. only 11.15%.
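These rates can be computed from the confusion matrix directly rather than by hand; a minimal sketch:
conf <- table(logWeekly.pred, Direction)
sum(diag(conf)) / sum(conf)                  # overall accuracy: (54 + 557) / 1089
conf["Up", "Up"] / sum(conf[, "Up"])         # Up weeks caught: 557 / 605
conf["Down", "Down"] / sum(conf[, "Down"])   # Down weeks caught: 54 / 484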
train = (Year<2009)
Weekly.0910 <-Weekly[!train,]
Weekly_fit<-glm(Direction~Lag2, data=Weekly,family=binomial, subset=train)
logWeekly.prob= predict(Weekly_fit, Weekly.0910, type = "response")
logWeekly.pred = rep("Down", length(logWeekly.prob))
logWeekly.pred[logWeekly.prob > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(logWeekly.pred, Direction.0910)
## Direction.0910
## logWeekly.pred Down Up
## Down 9 5
## Up 34 56
mean(logWeekly.pred == Direction.0910)
## [1] 0.625
The model correctly predicted the weekly trend on the held-out 2009-2010 data 62.5% of the time. It remains far better at identifying upward trends (56/61 = 91.8%) than downward ones (9/43 = 20.9%).
library(MASS)
Weeklylda_fit <- lda(Direction ~ Lag2, data = Weekly, subset = train)  # lda() has no family argument
Weeklylda.pred<-predict(Weeklylda_fit, Weekly.0910)
table(Weeklylda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(Weeklylda.pred$class==Direction.0910)
## [1] 0.625
Weeklyqda_fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
Weeklyqda.pred = predict(Weeklyqda_fit, Weekly.0910)$class
table(Weeklyqda.pred, Direction.0910)
## Direction.0910
## Weeklyqda.pred Down Up
## Down 0 0
## Up 43 61
mean(Weeklyqda.pred==Direction.0910)
## [1] 0.5865385
library(class)
Week.train=as.matrix(Lag2[train])
Week.test=as.matrix(Lag2[!train])
train.Direction =Direction[train]
set.seed(1)
Weekknn.pred=knn(Week.train,Week.test,train.Direction,k=1)
table(Weekknn.pred,Direction.0910)
## Direction.0910
## Weekknn.pred Down Up
## Down 21 30
## Up 22 31
mean(Weekknn.pred == Direction.0910)
## [1] 0.5
The methods with the highest accuracy are logistic regression and linear discriminant analysis, both at 62.5%.
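Collecting the held-out accuracies into one named vector makes the comparison explicit (values transcribed from the outputs above):
accuracy <- c(logistic = 0.625, lda = 0.625, qda = 0.5865, knn1 = 0.5)
sort(accuracy, decreasing = TRUE)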
Weekly.fit<-glm(Direction~Lag2:Lag4+Lag2, data=Weekly,family=binomial, subset=train)
logWeekly.prob= predict(Weekly.fit, Weekly.0910, type = "response")
logWeekly.pred = rep("Down", length(logWeekly.prob))
logWeekly.pred[logWeekly.prob > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(logWeekly.pred, Direction.0910)
## Direction.0910
## logWeekly.pred Down Up
## Down 3 4
## Up 40 57
mean(logWeekly.pred == Direction.0910)
## [1] 0.5769231
Weeklylda.fit <- lda(Direction ~ Lag2:Lag4 + Lag2, data = Weekly, subset = train)
Weeklylda.pred<-predict(Weeklylda.fit, Weekly.0910)
table(Weeklylda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 3 3
## Up 40 58
mean(Weeklylda.pred$class==Direction.0910)
## [1] 0.5865385
Weeklyqda.fit = qda(Direction ~ poly(Lag2,2), data = Weekly, subset = train)
Weeklyqda.pred = predict(Weeklyqda.fit, Weekly.0910)$class
table(Weeklyqda.pred, Direction.0910)
## Direction.0910
## Weeklyqda.pred Down Up
## Down 7 3
## Up 36 58
mean(Weeklyqda.pred==Direction.0910)
## [1] 0.625
Week.train=as.matrix(Lag2[train])
Week.test=as.matrix(Lag2[!train])
train.Direction =Direction[train]
set.seed(1)
Weekknn.pred=knn(Week.train,Week.test,train.Direction,k=10)
table(Weekknn.pred,Direction.0910)
## Direction.0910
## Weekknn.pred Down Up
## Down 17 21
## Up 26 40
mean(Weekknn.pred == Direction.0910)
## [1] 0.5480769
#K=100
Week.train=as.matrix(Lag2[train])
Week.test=as.matrix(Lag2[!train])
train.Direction =Direction[train]
set.seed(1)
Weekknn.pred=knn(Week.train,Week.test,train.Direction,k=100)
table(Weekknn.pred,Direction.0910)
## Direction.0910
## Weekknn.pred Down Up
## Down 10 11
## Up 33 50
mean(Weekknn.pred == Direction.0910)
## [1] 0.5769231
library(ISLR)
attach(Auto)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
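# Binary response: 1 if mpg is above the median, 0 otherwise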
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto = data.frame(Auto, mpg01)
corrplot(cor(Auto[,-9]), method="square")
The following variables are negatively correlated with mpg01: cylinders, displacement, horsepower, and weight. mpg itself is, unsurprisingly, positively correlated, since mpg01 is derived from it.
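The same relationships can be checked numerically; a sketch that sorts each numeric column's correlation with mpg01 (name, column 9, is dropped because it is a factor):
round(sort(cor(Auto[, -9])[, "mpg01"]), 2)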
set.seed(1)
num_train <- nrow(Auto) * 0.75
inTrain <- sample(nrow(Auto), size = num_train)
training <- Auto[inTrain,]
testing <- Auto[-inTrain,]
require(MASS)
fmla <- as.formula('mpg01 ~ displacement + horsepower + weight + year + cylinders')
lda_model <- lda(fmla, data = training)
pred <- predict(lda_model, testing)
table(pred$class, testing$mpg01)
##
## 0 1
## 0 41 0
## 1 12 45
mean(pred$class == testing$mpg01)
## [1] 0.877551
The test error rate is 1 - 0.8776 = 12.24%.
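Equivalently, the error rate can be computed directly instead of from the accuracy:
mean(pred$class != testing$mpg01)  # = 1 - 0.877551 = 0.122449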
autoqda.fit <- qda(mpg01~displacement+horsepower+weight+cylinders+mpg, data=training)
autoqda.pred <- predict(autoqda.fit, testing)
table(autoqda.pred$class, testing$mpg01)
##
## 0 1
## 0 47 1
## 1 6 44
mean(autoqda.pred$class != testing$mpg01)
## [1] 0.07142857
The test error rate is 7.14%. Note, however, that this model includes mpg among the predictors, which leaks the response: mpg01 is defined directly from mpg.
log_reg <- glm(fmla, data = training, family = binomial)
pred <- predict(log_reg, testing, type = 'response')
pred_values <- round(pred)
table(pred_values, testing$mpg01)
##
## pred_values 0 1
## 0 46 2
## 1 7 43
mean(pred_values == testing$mpg01)
## [1] 0.9081633
The test error rate is 1 - 0.9082 = 9.18%.
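Again, computed directly:
mean(pred_values != testing$mpg01)  # = 1 - 0.9081633 = 0.0918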
train <- (year %% 2 == 0)  # train on even model years, test on odd
train.auto <- Auto[train, ]
test.auto <- Auto[!train, ]  # logical masks are negated with !, not -
#K=1
train.K = cbind(displacement, horsepower, weight, cylinders, year, origin)[train, ]
test.K = cbind(displacement, horsepower, weight, cylinders, year, origin)[!train, ]
set.seed(1)
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=1)
mean(autok.pred != test.auto$mpg01)
## [1] 0.07161125
#K=5
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=5)
mean(autok.pred != test.auto$mpg01)
## [1] 0.112532
#K=10
autok.pred=knn(train.K,test.K,train.auto$mpg01,k=10)
mean(autok.pred != test.auto$mpg01)
## [1] 0.1253197
detach(Auto)
Of the values tried, K=1 has the lowest test error rate, 7.16%.
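Rather than repeating the block for each K, the sweep can be written as one loop; a minimal sketch assuming the train.K/test.K matrices and labels defined above:
set.seed(1)
ks <- c(1, 5, 10)
errors <- sapply(ks, function(k) {
  pred <- knn(train.K, test.K, train.auto$mpg01, k = k)
  mean(pred != test.auto$mpg01)
})
setNames(errors, paste0("k=", ks))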
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
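# Binary response: 1 if crim is above the median, 0 otherwise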
crime01 <- rep(0, length(crim))
crime01[crim > median(crim)] <- 1
Boston= data.frame(Boston,crime01)
train = 1:(dim(Boston)[1]/2)
test = (dim(Boston)[1]/2 + 1):dim(Boston)[1]
Boston.train = Boston[train, ]
Boston.test = Boston[test, ]
crime01.test = crime01[test]
corrplot(cor(Boston), method="square")
LOGISTIC REGRESSION MODEL
set.seed(1)
Boston.fit <-glm(crime01~ indus+nox+age+dis+rad+tax, data=Boston.train,family=binomial)
Boston.probs = predict(Boston.fit, Boston.test, type = "response")
Boston.pred = rep(0, length(Boston.probs))
Boston.pred[Boston.probs > 0.5] = 1
table(Boston.pred, crime01.test)
## crime01.test
## Boston.pred 0 1
## 0 75 8
## 1 15 155
mean(Boston.pred != crime01.test)
## [1] 0.09090909
summary(Boston.fit)
##
## Call:
## glm(formula = crime01 ~ indus + nox + age + dis + rad + tax,
## family = binomial, data = Boston.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.97810 -0.21406 -0.03454 0.47107 3.04502
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.214032 7.617440 -5.542 2.99e-08 ***
## indus -0.213126 0.073236 -2.910 0.00361 **
## nox 80.868029 16.066473 5.033 4.82e-07 ***
## age 0.003397 0.012032 0.282 0.77772
## dis 0.307145 0.190502 1.612 0.10690
## rad 0.847236 0.183767 4.610 4.02e-06 ***
## tax -0.013760 0.004956 -2.777 0.00549 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 144.44 on 246 degrees of freedom
## AIC: 158.44
##
## Number of Fisher Scoring iterations: 8
LINEAR DISCRIMINANT ANALYSIS
Boston.ldafit <- lda(crime01 ~ indus + nox + age + dis + rad + tax, data = Boston.train)  # lda() has no family argument
Bostonlda.pred = predict(Boston.ldafit, Boston.test)
table(Bostonlda.pred$class, crime01.test)
## crime01.test
## 0 1
## 0 81 18
## 1 9 145
mean(Bostonlda.pred$class != crime01.test)
## [1] 0.1067194
#K=1
train.K=cbind(indus,nox,age,dis,rad,tax)[train,]
test.K=cbind(indus,nox,age,dis,rad,tax)[test,]
Bosknn.pred = knn(train.K, test.K, crime01[train], k = 1)  # training labels go third, not test labels
table(Bosknn.pred,crime01.test)
## crime01.test
## Bosknn.pred 0 1
## 0 31 155
## 1 59 8
mean(Bosknn.pred !=crime01.test)
## [1] 0.8458498
#K=100
train.K=cbind(indus,nox,age,dis,rad,tax)[train,]
test.K=cbind(indus,nox,age,dis,rad,tax)[test,]
Bosknn.pred = knn(train.K, test.K, crime01[train], k = 100)
table(Bosknn.pred,crime01.test)
## crime01.test
## Bosknn.pred 0 1
## 0 21 6
## 1 69 157
mean(Bosknn.pred !=crime01.test)
## [1] 0.2964427
Logistic regression had the lowest test error rate, 9.09%.
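One caveat on the KNN results: KNN is distance-based and the predictors here sit on very different scales (tax runs into the hundreds while nox stays below one), so standardizing first may change the picture. A minimal sketch reusing the train/test indices above; k = 10 is an arbitrary illustrative choice:
X <- scale(Boston[, c("indus", "nox", "age", "dis", "rad", "tax")])
set.seed(1)
Bosknn.std <- knn(X[train, ], X[test, ], crime01[train], k = 10)
mean(Bosknn.std != crime01.test)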