Question 10

head(Weekly)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
pairs(~Year+Lag1+Lag2+Lag3+Lag4+Lag5+Volume+Today,Weekly)
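
The scatterplot matrix can be complemented with the numeric correlation matrix; Direction (column 9) is a factor and must be dropped first. A quick sketch:

cor(Weekly[,-9])  # Year and Volume show the only strong pairwise correlation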

glm.fit = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only Lag2 has a p-value below 0.05 (p ≈ 0.0296), so it appears to be the only statistically significant predictor.
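
As a quick check, the Lag2 p-value can be read straight off the coefficient table (a minimal sketch using the glm.fit object above):

coefs = coef(summary(glm.fit))   # estimates, SEs, z values, p-values
coefs["Lag2","Pr(>|z|)"]         # ~0.0296, matching the summary output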

c)

glm.prob = predict(glm.fit,type='response')
glm.pred = rep('Down',1089)
glm.pred[glm.prob>0.5] = 'Up'
table(glm.pred,Weekly$Direction)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557

Overall fraction of correct predictions:

(54+557)/(54+48+430+557)
## [1] 0.5610652
mean(glm.pred == Weekly$Direction)
## [1] 0.5610652

The error rate among weeks predicted Down (i.e., weeks that actually went Up):

48/(48+54)
## [1] 0.4705882

The error rate among weeks predicted Up (i.e., weeks that actually went Down):

430/(430+557)
## [1] 0.4356636
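
Both rates can also be derived directly from the stored confusion matrix (a small sketch reusing the objects above):

cm = table(glm.pred, Weekly$Direction)  # rows = predicted, columns = actual
1 - diag(cm)/rowSums(cm)                # error rate within each predicted class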

Train-Test Split Models (train: 1990-2008, test: 2009-2010)

Logistic regression

train = Weekly[Weekly$Year>=1990&Weekly$Year<=2008,]
test = Weekly[Weekly$Year>2008,]
glm.fit1 = glm(Direction~Lag2,data=train,family=binomial)
summary(glm.fit1)
## 
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.536  -1.264   1.021   1.091   1.368  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.20326    0.06428   3.162  0.00157 **
## Lag2         0.05810    0.02870   2.024  0.04298 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1354.7  on 984  degrees of freedom
## Residual deviance: 1350.5  on 983  degrees of freedom
## AIC: 1354.5
## 
## Number of Fisher Scoring iterations: 4
glm.prob1 = predict(glm.fit1,test,type = "response")
glm.pred1 = rep("Down",104)
glm.pred1[glm.prob1>0.5] = "Up"
table(glm.pred1,test$Direction)
##          
## glm.pred1 Down Up
##      Down    9  5
##      Up     34 56
mean(glm.pred1 == test$Direction)
## [1] 0.625
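
The same threshold-and-compare pattern repeats for every classifier below, so a small helper avoids hard-coding the test-set length of 104 (a hedged sketch; classify_up is my own name, not from the text):

classify_up = function(prob, cutoff = 0.5) {
  ifelse(prob > cutoff, "Up", "Down")   # threshold posterior P(Up)
}
mean(classify_up(glm.prob1) == test$Direction)  # 0.625, as above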

LDA

library(MASS)
lda.fit = lda(Direction~Lag2,train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$Direction)
##          
## lda.class Down Up
##      Down    9  5
##      Up     34 56
mean(lda.class == test$Direction)
## [1] 0.625

QDA

qda.fit = qda(Direction~Lag2,train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$Direction)
##          
## qda.class Down Up
##      Down    0  0
##      Up     43 61
mean(qda.class == test$Direction)
## [1] 0.5865385

Naive Bayes

library(e1071)
nb.fit = naiveBayes(Direction~Lag2,train)
nb.fit
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Conditional probabilities:
##       Lag2
## Y             [,1]     [,2]
##   Down -0.03568254 2.199504
##   Up    0.26036581 2.317485
nb.pred = predict(nb.fit,test)
table(nb.pred,test$Direction)
##        
## nb.pred Down Up
##    Down    0  0
##    Up     43 61
mean(nb.pred == test$Direction)
## [1] 0.5865385
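
e1071's naiveBayes can also return the posterior probabilities (type = "raw"), which makes it clear why every test week is classified Up (a short sketch):

nb.prob = predict(nb.fit, test, type = "raw")  # columns: P(Down), P(Up)
head(nb.prob)                                  # P(Up) exceeds 0.5 in every test week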

KNN (k = 1)

library(class)
train.x = as.matrix(train$Lag2)
test.x = as.matrix(test$Lag2)

train.y=train$Direction

set.seed(10)
knn.pred = knn(train.x,test.x,train.y,k=1)
table(knn.pred,test$Direction)
##         
## knn.pred Down Up
##     Down   21 29
##     Up     22 32
mean(knn.pred == test$Direction)
## [1] 0.5096154

So far, logistic regression and LDA give the best prediction rate, approximately 62.5%.

j)

Experimenting with k values for KNN

k = c(1,2,5,7,10,20,30,50)
rate = rep(0,8)
a = 1
set.seed(10)
for(i in k){
  knn.pred = knn(train.x,test.x,train.y,k=i)
  rate[a] = mean(knn.pred == test$Direction)  # test accuracy for this k
  a = a+1
}

results = cbind(k,rate)  # 'results' avoids shadowing base::table
results
##       k      rate
## [1,]  1 0.5096154
## [2,]  2 0.5480769
## [3,]  5 0.5384615
## [4,]  7 0.5576923
## [5,] 10 0.5961538
## [6,] 20 0.5673077
## [7,] 30 0.5384615
## [8,] 50 0.5769231

With set.seed(10), the best test accuracy occurs at k = 10 (about 59.6%), with k = 50 and k = 20 close behind.
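
Plotting accuracy against k makes the comparison to the 62.5% logistic/LDA benchmark easier to see (a sketch using the k and rate vectors above):

plot(k, rate, type = "b", xlab = "k", ylab = "Test accuracy")
abline(h = 0.625, lty = 2)  # logistic regression / LDA benchmark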

Question 11

?Auto
med = median(Auto$mpg)
mpg01 = rep(0,dim(Auto)[1])
mpg01[Auto$mpg >=med] = 1
new.Auto = data.frame(Auto,mpg01)
pairs(new.Auto)
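
Per-variable boxplots against mpg01 are often easier to read than the full scatterplot matrix (a sketch; the variable list mirrors the predictors used below):

par(mfrow = c(2,3))
for (v in c("weight","horsepower","acceleration","displacement","year")) {
  boxplot(new.Auto[[v]] ~ new.Auto$mpg01, xlab = "mpg01", ylab = v)
}
par(mfrow = c(1,1))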

set.seed(10)
tr <- sample(1:nrow(new.Auto),nrow(new.Auto)*0.6,replace = F)
train <- new.Auto[tr,]
test <- new.Auto[-tr,]
lda.fit = lda(mpg01 ~ weight+horsepower+acceleration+displacement+year,train)
lda.fit
## Call:
## lda(mpg01 ~ weight + horsepower + acceleration + displacement + 
##     year, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5191489 0.4808511 
## 
## Group means:
##     weight horsepower acceleration displacement     year
## 0 3696.910  133.57377     14.49016     280.2869 74.58197
## 1 2368.327   79.78761     16.60177     120.2699 77.44248
## 
## Coefficients of linear discriminants:
##                       LD1
## weight       -0.001372751
## horsepower    0.011595099
## acceleration  0.039057171
## displacement -0.006113288
## year          0.115264085
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$mpg01)
##          
## lda.class  0  1
##         0 59  1
##         1 15 82
mean(lda.class != test$mpg01)
## [1] 0.1019108

We get an LDA test error rate of about 10.19%.

QDA

qda.fit = qda(mpg01~weight+horsepower+acceleration+displacement+year,train)
qda.fit
## Call:
## qda(mpg01 ~ weight + horsepower + acceleration + displacement + 
##     year, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5191489 0.4808511 
## 
## Group means:
##     weight horsepower acceleration displacement     year
## 0 3696.910  133.57377     14.49016     280.2869 74.58197
## 1 2368.327   79.78761     16.60177     120.2699 77.44248
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$mpg01)
##          
## qda.class  0  1
##         0 64  4
##         1 10 79
mean(qda.class != test$mpg01)
## [1] 0.08917197

The test error rate for QDA is about 8.92%, slightly better than LDA.

logistic regression

glm.fit = glm(mpg01~weight+horsepower+acceleration+displacement+year,train,family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = mpg01 ~ weight + horsepower + acceleration + displacement + 
##     year, family = binomial, data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.14705  -0.10756  -0.00123   0.28493   3.02524  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -17.275740   6.565915  -2.631  0.00851 ** 
## weight        -0.004043   0.001354  -2.987  0.00282 ** 
## horsepower    -0.036106   0.028626  -1.261  0.20721    
## acceleration   0.019956   0.171689   0.116  0.90747    
## displacement  -0.002589   0.008409  -0.308  0.75821    
## year           0.424688   0.093301   4.552 5.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 325.43  on 234  degrees of freedom
## Residual deviance: 100.15  on 229  degrees of freedom
## AIC: 112.15
## 
## Number of Fisher Scoring iterations: 8
glm.fit2 = glm(mpg01 ~ weight + year,train,family=binomial)
summary(glm.fit2)
## 
## Call:
## glm(formula = mpg01 ~ weight + year, family = binomial, data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.32477  -0.13705  -0.00441   0.30349   2.79933  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.929e+01  5.760e+00  -3.350 0.000809 ***
## weight      -4.902e-03  7.223e-04  -6.786 1.15e-11 ***
## year         4.369e-01  9.036e-02   4.835 1.33e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 325.43  on 234  degrees of freedom
## Residual deviance: 105.56  on 232  degrees of freedom
## AIC: 111.56
## 
## Number of Fisher Scoring iterations: 7
glm.prob = predict(glm.fit2,test,type = "response")
glm.pred = rep(0,157)
glm.pred[glm.prob > 0.5]=1
table(glm.pred,test$mpg01)
##         
## glm.pred  0  1
##        0 66  7
##        1  8 76
mean(glm.pred != test$mpg01)
## [1] 0.0955414

The test error rate for logistic regression is about 9.55%.

library(class)
# Note: mpg01 is derived from mpg, so including mpg as a predictor leaks the response
train.x = dplyr::select(train,mpg,weight,horsepower,acceleration,displacement,year)
train.y = dplyr::select(train,mpg01)[,1]
test.x = dplyr::select(test,mpg,weight,horsepower,acceleration,displacement,year)
test.y = dplyr::select(test,mpg01)[,1]
min_error_rate = 1  # error rates are bounded above by 1
k = 0
for (i in 1:200){
  knn.pred = knn(train.x,test.x,train.y,k=i)
  t = table(knn.pred,test$mpg01)
  error_rate = (t[2]+t[3])/sum(t)  # off-diagonal counts of the 2x2 table
  if(error_rate < min_error_rate){
    min_error_rate = error_rate
    k = i
  }
}
c(min_error_rate,k)
## [1] 0.1082803 3.0000000
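
KNN is distance-based, so the unscaled weight and displacement columns dominate the distance, and including mpg itself leaks the response. A hedged variant that standardizes with training-set statistics and drops mpg:

vars = c("weight","horsepower","acceleration","displacement","year")
mu = colMeans(train[,vars])
s = apply(train[,vars], 2, sd)
train.s = scale(train[,vars], center = mu, scale = s)
test.s = scale(test[,vars], center = mu, scale = s)  # reuse training statistics
set.seed(10)
knn.pred = knn(train.s, test.s, train$mpg01, k = 3)
mean(knn.pred != test$mpg01)  # leak-free error rate; expect it to differ from above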

Question 13

?Boston
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
crim.med = rep(0,dim(Boston)[1])
crim.med[Boston$crim > median(Boston$crim)] = 1
boston = data.frame(Boston,crim.med)
head(boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv crim.med
## 1 24.0        0
## 2 21.6        0
## 3 34.7        0
## 4 33.4        0
## 5 36.2        0
## 6 28.7        0
cor(boston[,-15])[,"crim"]
##        crim          zn       indus        chas         nox          rm 
##  1.00000000 -0.20046922  0.40658341 -0.05589158  0.42097171 -0.21924670 
##         age         dis         rad         tax     ptratio       black 
##  0.35273425 -0.37967009  0.62550515  0.58276431  0.28994558 -0.38506394 
##       lstat        medv 
##  0.45562148 -0.38830461
pairs(boston[,-15])
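
Ranking the absolute correlations makes the candidate predictors easier to pick out (a one-liner; rad and tax lead, per the output above):

sort(abs(cor(boston[,-15])[,"crim"]), decreasing = TRUE)  # crim itself (1.0) tops the list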

set.seed(123)


tr <- sample(1:nrow(boston),nrow(boston)*0.7,replace=F)
train<- boston[tr,]
test <- boston[-tr,]
glm.fit = glm(crim.med ~ zn+indus+chas+nox+rm+ age+rad+tax+ptratio+black+lstat+medv, data = train, family ="binomial")
glm.probs = predict(glm.fit,test, type = "response")
glm.pred = rep(0, 152)
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, test$crim.med, dnn = c("Predicted", "Actual"))
##          Actual
## Predicted  0  1
##         0 63  6
##         1 12 71
mean(glm.pred != test$crim.med)
## [1] 0.1184211

The test error rate is about 11.84%.

glm.fit = glm(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train, family ="binomial")
glm.probs = predict(glm.fit,test, type = "response")
glm.pred = rep(0, 152)
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, test$crim.med, dnn = c("Predicted", "Actual"))
##          Actual
## Predicted  0  1
##         0 66  6
##         1  9 71
mean(glm.pred != test$crim.med)
## [1] 0.09868421

After dropping several of the predictors negatively correlated with crim (zn, chas, rm, black, medv), the test error rate falls to about 9.87%.

Building an LDA model with the same predictors:

lda.fit = lda(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train)
lda.fit
## Call:
## lda(crim.med ~ indus + nox + age + dis + rad + tax + ptratio + 
##     lstat, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5028249 0.4971751 
## 
## Group means:
##      indus       nox      age      dis       rad      tax  ptratio     lstat
## 0  6.83382 0.4695708 50.20730 5.184995  4.224719 307.5562 17.94719  9.220562
## 1 15.21273 0.6344489 86.15795 2.510232 14.948864 509.1193 19.03295 16.029489
## 
## Coefficients of linear discriminants:
##                  LD1
## indus    0.030698279
## nox      6.622350919
## age      0.015390817
## dis     -0.008444875
## rad      0.105192097
## tax     -0.002895752
## ptratio -0.001465541
## lstat   -0.011599001
lda.pred = predict(lda.fit,test)
lda.class = lda.pred$class
table(lda.class,test$crim.med)
##          
## lda.class  0  1
##         0 69 16
##         1  6 61
mean(lda.class!=test$crim.med)
## [1] 0.1447368

QDA

qda.fit = qda(crim.med ~ indus+nox+ age+ dis+rad+tax+ptratio+lstat, data = train)
qda.fit
## Call:
## qda(crim.med ~ indus + nox + age + dis + rad + tax + ptratio + 
##     lstat, data = train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.5028249 0.4971751 
## 
## Group means:
##      indus       nox      age      dis       rad      tax  ptratio     lstat
## 0  6.83382 0.4695708 50.20730 5.184995  4.224719 307.5562 17.94719  9.220562
## 1 15.21273 0.6344489 86.15795 2.510232 14.948864 509.1193 19.03295 16.029489
qda.pred = predict(qda.fit,test)
qda.class = qda.pred$class
table(qda.class,test$crim.med)
##          
## qda.class  0  1
##         0 74 14
##         1  1 63
mean(qda.class!=test$crim.med)
## [1] 0.09868421

For the Boston data set, the QDA model with predictors indus, nox, age, dis, rad, tax, ptratio, and lstat can be suggested; it ties the reduced logistic regression at roughly a 9.87% test error rate, while LDA trails at about 14.47%.
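
For a side-by-side view, the three test error rates can be collected from the fitted objects (a sketch reusing glm.pred, lda.fit, and qda.fit from above):

errors = c(logistic = mean(glm.pred != test$crim.med),
           lda = mean(predict(lda.fit, test)$class != test$crim.med),
           qda = mean(predict(qda.fit, test)$class != test$crim.med))
sort(errors)  # qda and the reduced logistic fit tie near 0.0987; lda trails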