Question 10
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(~Year+Lag1+Lag2+Lag3+Lag4+Lag5+Volume+Today,Weekly)
The only clear pattern in the pairs plot is that Volume grows over time (a strong positive association with Year); the lag variables show no obvious relationship with Today.
glm.fit = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only Lag2 has a p-value below 0.05 (0.0296), so it appears to be the only statistically significant predictor.
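This is easy to check programmatically; a minimal sketch that pulls the p-values out of the fitted model above:
coef(summary(glm.fit))[, "Pr(>|z|)"]       # p-values for every coefficient
coef(summary(glm.fit))["Lag2", "Pr(>|z|)"] # Lag2 alone, about 0.0296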
c)
glm.prob = predict(glm.fit,type='response')
glm.pred = rep('Down',1089)
glm.pred[glm.prob>0.5] = 'Up'
table(glm.pred,Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
Overall fraction of correct predictions (computed on the same data used to fit the model):
(54+557)/(54+48+430+557)
## [1] 0.5610652
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652
Error rate among weeks the model predicts Down (the market actually went Up):
48/(48+54)
## [1] 0.4705882
Error rate among weeks the model predicts Up (the market actually went Down):
430/(430+557)
## [1] 0.4356636
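All of these rates come straight from the confusion matrix, so a small helper function makes them reusable (a sketch, assuming the Down/Up labels used throughout this question):
confusion.rates = function(pred, actual){
  t = table(pred, actual)                            # rows = predicted, cols = actual
  c(accuracy      = sum(diag(t))/sum(t),             # overall correct fraction
    pred.down.err = t["Down","Up"]/sum(t["Down",]),  # wrong given a Down prediction
    pred.up.err   = t["Up","Down"]/sum(t["Up",]))    # wrong given an Up prediction
}
confusion.rates(glm.pred, Weekly$Direction)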
Train-Test Split Models
Logistic regression
train = Weekly[Weekly$Year>=1990&Weekly$Year<=2008,]
test = Weekly[Weekly$Year>2008,]
glm.fit1 = glm(Direction~Lag2,data=train,family=binomial)
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.prob1 = predict(glm.fit1,test,type = "response")
glm.pred1 = rep("Down",104)
glm.pred1[glm.prob1>0.5] = "Up"
table(glm.pred1,test$Direction)
##
## glm.pred1 Down Up
## Down 9 5
## Up 34 56
mean(glm.pred1 == test$Direction)
## [1] 0.625
LDA
library(MASS)
lda.fit = lda(Direction~Lag2,train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$Direction)
##
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class == test$Direction)
## [1] 0.625
QDA
qda.fit = qda(Direction~Lag2,train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class == test$Direction)
## [1] 0.5865385
Naive Bayes
library(e1071)
nb.fit = naiveBayes(Direction~Lag2,train)
nb.fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
nb.pred = predict(nb.fit,test)
table(nb.pred,test$Direction)
##
## nb.pred Down Up
## Down 0 0
## Up 43 61
mean(nb.pred == test$Direction)
## [1] 0.5865385
KNN (k = 1)
library(class)
train.x = as.matrix(train$Lag2)
test.x = as.matrix(test$Lag2)
train.y=train$Direction
set.seed(10)
knn.pred = knn(train.x,test.x,train.y,k=1)
table(knn.pred,test$Direction)
##
## knn.pred Down Up
## Down 21 29
## Up 22 32
mean(knn.pred == test$Direction)
## [1] 0.5096154
So far, logistic regression and LDA have shown the best test prediction rate, approximately 62.5%.
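The comparison is easy to verify in one place; a sketch that reuses the prediction objects created above for each classifier:
sapply(list(logistic = glm.pred1, lda = lda.class, qda = qda.class,
            naive.bayes = nb.pred, knn1 = knn.pred),
       function(p) mean(p == test$Direction))   # test accuracy per model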
j)
Experimenting with k values for KNN
k = c(1,2,5,7,10,20,30,50)
rate = rep(0,length(k))
a = 1
set.seed(10)
for(i in k){
  knn.pred = knn(train.x,test.x,train.y,k=i)
  rate[a] = mean(knn.pred == test$Direction)   # test accuracy for this k
  a = a+1
}
results = cbind(k,rate)   # avoid masking base::table
results
## k rate
## [1,] 1 0.5096154
## [2,] 2 0.5480769
## [3,] 5 0.5384615
## [4,] 7 0.5576923
## [5,] 10 0.5961538
## [6,] 20 0.5673077
## [7,] 30 0.5384615
## [8,] 50 0.5769231
With set.seed(10), the highest test accuracy occurs at k = 10 (about 59.6%), followed by k = 50 (57.7%) and k = 20 (56.7%).
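The relationship between k and test accuracy is easier to see in a plot; a sketch using the k and rate vectors from the loop above:
plot(k, rate, type = "b", xlab = "k", ylab = "Test accuracy",
     main = "KNN on the Weekly data")
abline(h = 0.625, lty = 2)   # logistic regression / LDA benchmark from above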
Question 11
?Auto
med = median(Auto$mpg)
mpg01 = rep(0,dim(Auto)[1])
mpg01[Auto$mpg >=med] = 1
new.Auto = data.frame(Auto,mpg01)
pairs(new.Auto)
set.seed(10)
tr <- sample(1:nrow(new.Auto),nrow(new.Auto)*0.6,replace = F)
train <- new.Auto[tr,]
test <- new.Auto[-tr,]
lda.fit = lda(mpg01 ~ weight+horsepower+acceleration+displacement+year,train)
lda.fit
## Call:
## lda(mpg01 ~ weight + horsepower + acceleration + displacement +
## year, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5191489 0.4808511
##
## Group means:
## weight horsepower acceleration displacement year
## 0 3696.910 133.57377 14.49016 280.2869 74.58197
## 1 2368.327 79.78761 16.60177 120.2699 77.44248
##
## Coefficients of linear discriminants:
## LD1
## weight -0.001372751
## horsepower 0.011595099
## acceleration 0.039057171
## displacement -0.006113288
## year 0.115264085
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$mpg01)
##
## lda.class 0 1
## 0 59 1
## 1 15 82
mean(lda.class != test$mpg01)
## [1] 0.1019108
We get a test error rate of about 10.19% (16 of 157 test observations misclassified).
QDA
qda.fit = qda(mpg01~weight+horsepower+acceleration+displacement+year,train)
qda.fit
## Call:
## qda(mpg01 ~ weight + horsepower + acceleration + displacement +
## year, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5191489 0.4808511
##
## Group means:
## weight horsepower acceleration displacement year
## 0 3696.910 133.57377 14.49016 280.2869 74.58197
## 1 2368.327 79.78761 16.60177 120.2699 77.44248
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$mpg01)
##
## qda.class 0 1
## 0 64 4
## 1 10 79
mean(qda.class != test$mpg01)
## [1] 0.08917197
The test error rate for QDA is about 8.92%, slightly better than LDA.
Logistic regression
glm.fit = glm(mpg01~weight+horsepower+acceleration+displacement+year,train,family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = mpg01 ~ weight + horsepower + acceleration + displacement +
## year, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.14705 -0.10756 -0.00123 0.28493 3.02524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.275740 6.565915 -2.631 0.00851 **
## weight -0.004043 0.001354 -2.987 0.00282 **
## horsepower -0.036106 0.028626 -1.261 0.20721
## acceleration 0.019956 0.171689 0.116 0.90747
## displacement -0.002589 0.008409 -0.308 0.75821
## year 0.424688 0.093301 4.552 5.32e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.43 on 234 degrees of freedom
## Residual deviance: 100.15 on 229 degrees of freedom
## AIC: 112.15
##
## Number of Fisher Scoring iterations: 8
glm.fit2 = glm(mpg01 ~ weight + year,train,family=binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = mpg01 ~ weight + year, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.32477 -0.13705 -0.00441 0.30349 2.79933
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.929e+01 5.760e+00 -3.350 0.000809 ***
## weight -4.902e-03 7.223e-04 -6.786 1.15e-11 ***
## year 4.369e-01 9.036e-02 4.835 1.33e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.43 on 234 degrees of freedom
## Residual deviance: 105.56 on 232 degrees of freedom
## AIC: 111.56
##
## Number of Fisher Scoring iterations: 7
glm.prob = predict(glm.fit2,test,type = "response")
glm.pred = rep(0,157)
glm.pred[glm.prob > 0.5]=1
table(glm.pred,test$mpg01)
##
## glm.pred 0 1
## 0 66 7
## 1 8 76
mean(glm.pred != test$mpg01)
## [1] 0.0955414
The test error rate is about 9.55%.
library(class)
train.x = dplyr::select(train,mpg,weight,horsepower,acceleration,displacement,year)
train.y = dplyr::select(train,mpg01)[,1]
test.x  = dplyr::select(test,mpg,weight,horsepower,acceleration,displacement,year)
test.y  = dplyr::select(test,mpg01)[,1]
# Caution: mpg01 is derived from mpg, so including mpg among the predictors
# leaks the target; see the sketch after the output below.
min_error_rate = 1   # error rates cannot exceed 1
k = 0
for (i in 1:200){
  knn.pred = knn(train.x,test.x,train.y,k=i)
  tab = table(knn.pred,test$mpg01)
  error_rate = (tab[2]+tab[3])/sum(tab)   # off-diagonal entries = misclassifications
  if(error_rate < min_error_rate){
    min_error_rate = error_rate
    k = i
  }
}
c(min_error_rate,k)
## [1] 0.1082803 3.0000000
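Because mpg01 is constructed directly from mpg, including mpg among the KNN predictors leaks the target and makes the error rate optimistic. A minimal sketch of a cleaner search, dropping mpg and standardizing the remaining predictors with training-set statistics (KNN is sensitive to scale); the numbers it produces will differ from those above:
vars = c("weight","horsepower","acceleration","displacement","year")
mu   = colMeans(train[,vars])          # center/scale from the training set only
sdev = apply(train[,vars], 2, sd)
train.x2 = scale(train[,vars], center = mu, scale = sdev)
test.x2  = scale(test[,vars],  center = mu, scale = sdev)
set.seed(10)                           # knn() breaks distance ties at random
err = sapply(1:50, function(i){
  mean(knn(train.x2, test.x2, factor(train$mpg01), k = i) != test$mpg01)
})
c(best_k = which.min(err), min_error = min(err))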
Question 13
?Boston
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
crim.med = rep(0,dim(Boston)[1])
crim.med[Boston$crim > median(Boston$crim)] = 1
boston = data.frame(Boston,crim.med)
head(boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv crim.med
## 1 24.0 0
## 2 21.6 0
## 3 34.7 0
## 4 33.4 0
## 5 36.2 0
## 6 28.7 0
cor(boston[,-15])[,"crim"]
## crim zn indus chas nox rm
## 1.00000000 -0.20046922 0.40658341 -0.05589158 0.42097171 -0.21924670
## age dis rad tax ptratio black
## 0.35273425 -0.37967009 0.62550515 0.58276431 0.28994558 -0.38506394
## lstat medv
## 0.45562148 -0.38830461
pairs(boston[,-15])
set.seed(123)
tr <- sample(1:nrow(boston),nrow(boston)*0.7,replace=F)
train<- boston[tr,]
test <- boston[-tr,]
glm.fit = glm(crim.med ~ zn+indus+chas+nox+rm+ age+rad+tax+ptratio+black+lstat+medv, data = train, family ="binomial")
glm.probs = predict(glm.fit,test, type = "response")
glm.pred = rep(0, 152)
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, test$crim.med, dnn = c("Predicted", "Actual"))
## Actual
## Predicted 0 1
## 0 63 6
## 1 12 71
mean(glm.pred != test$crim.med)
## [1] 0.1184211
The test error rate is about 11.84%.
glm.fit = glm(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train, family ="binomial")
glm.probs = predict(glm.fit,test, type = "response")
glm.pred = rep(0, 152)
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, test$crim.med, dnn = c("Predicted", "Actual"))
## Actual
## Predicted 0 1
## 0 66 6
## 1 9 71
mean(glm.pred != test$crim.med)
## [1] 0.09868421
After dropping the predictors that are only weakly or negatively correlated with crim (zn, chas, rm, black, medv), the test error rate falls to about 9.87%.
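One way to make that screening step reproducible is to rank predictors by the absolute value of their correlation with crim; a sketch (where to draw the cutoff remains a judgment call):
cors = cor(boston[,-15])[,"crim"]
sort(abs(cors[names(cors) != "crim"]), decreasing = TRUE)   # strongest first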
Building an LDA model with the above predictors
lda.fit = lda(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train)
lda.fit
## Call:
## lda(crim.med ~ indus + nox + age + dis + rad + tax + ptratio +
## lstat, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5028249 0.4971751
##
## Group means:
## indus nox age dis rad tax ptratio lstat
## 0 6.83382 0.4695708 50.20730 5.184995 4.224719 307.5562 17.94719 9.220562
## 1 15.21273 0.6344489 86.15795 2.510232 14.948864 509.1193 19.03295 16.029489
##
## Coefficients of linear discriminants:
## LD1
## indus 0.030698279
## nox 6.622350919
## age 0.015390817
## dis -0.008444875
## rad 0.105192097
## tax -0.002895752
## ptratio -0.001465541
## lstat -0.011599001
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$crim.med)
##
## lda.class 0 1
## 0 69 16
## 1 6 61
mean(lda.class!=test$crim.med)
## [1] 0.1447368
QDA
qda.fit = qda(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train)
qda.fit
## Call:
## qda(crim.med ~ indus + nox + age + dis + rad + tax + ptratio +
## lstat, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5028249 0.4971751
##
## Group means:
## indus nox age dis rad tax ptratio lstat
## 0 6.83382 0.4695708 50.20730 5.184995 4.224719 307.5562 17.94719 9.220562
## 1 15.21273 0.6344489 86.15795 2.510232 14.948864 509.1193 19.03295 16.029489
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$crim.med)
##
## qda.class 0 1
## 0 74 14
## 1 1 63
mean(qda.class!=test$crim.med)
## [1] 0.09868421
For the Boston data set, a QDA model with the predictors indus, nox, age, dis, rad, tax, ptratio, and lstat can be suggested: it matches the best logistic regression error rate (about 9.87%) while producing only one false positive.
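For a side-by-side view, a sketch that recomputes the three test error rates from the prediction vectors above:
c(logistic = mean(glm.pred != test$crim.med),   # reduced logistic model
  lda      = mean(lda.class != test$crim.med),
  qda      = mean(qda.class != test$crim.med))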