HW #3 data mining

Questions 10, 11, 13

10.

a) It looks like there is a log-based relationship between Year and Volume.

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

b) Only Lag2 is statistically significant.

m<- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, weekly, family="binomial")
summary(m)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = "binomial", data = weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

c) The regression model is pretty good at predicting in the “Up” direction, but there are a large number of false negatives in the “Down” direction (i.e. overpredicting “Up”).

m_probs <- predict(m, type = "response")
m_predict <- ifelse(m_probs >0.5, "up","down")
c_matrix<- table(weekly$Direction, m_predict)
c_matrix

##       m_predict
##        down  up
##   Down   54 430
##   Up     48 557

train_df = filter(weekly, Year<=2008)
test_df= filter(weekly, Year>2008)
p<- glm(Direction ~ Lag2, data = train_df, family = "binomial")
p_probs<- predict(p, type = "response", newdata = test_df)
p_preds<- ifelse(p_probs>0.5, "up", "down")
p_matrix<- table(test_df$Direction, p_preds)
p_matrix

##       p_preds
##        down up
##   Down    9 34
##   Up      5 56

acc = (9+56)/sum(p_matrix)
acc

## [1] 0.625

e<- lda(Direction ~ Lag2, data = train_df)
e

## Call:
## lda(Direction ~ Lag2, data = train_df)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162

e_preds<- predict(e, newdata = test_df)
e_matrix<- table(test_df$Direction, e_preds$class)
e_matrix

##       
##        Down Up
##   Down    9 34
##   Up      5 56

acc = (9+56)/sum(e_matrix)
acc

## [1] 0.625

f<- qda(Direction ~ Lag2, data = train_df)
f_preds<- predict(f, newdata = test_df)
f_matrix<- table(test_df$Direction, f_preds$class)
f_matrix

##       
##        Down Up
##   Down    0 43
##   Up      0 61

f_acc= (0)/104
f_acc

## [1] 0

g<- knn(train = data.frame(train_df$Lag2), test =data.frame(test_df$Lag2), cl = train_df$Direction, k = 1)
g_matrix<- table(test_df$Direction, g)
g_matrix

##       g
##        Down Up
##   Down   21 22
##   Up     29 32

g_acc= (21+32)/sum(g_matrix)
g_acc

## [1] 0.5096154

h) The logistic regression and LDA produce the highest accuracy.

i) LDA accuracy improves slightly by adding the other Lag variables.

j<- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, data = train_df)
j

## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5, data = train_df)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##              Lag1        Lag2       Lag3       Lag4       Lag5
## Down  0.289444444 -0.03568254 0.17080045 0.15925624 0.21409297
## Up   -0.009213235  0.26036581 0.08404044 0.09220956 0.04548897
## 
## Coefficients of linear discriminants:
##              LD1
## Lag1 -0.27604838
## Lag2  0.26309706
## Lag3 -0.04233861
## Lag4 -0.11994755
## Lag5 -0.15335694

j_probs<- predict(j, newdata = test_df)
j_matrix<- table(test_df$Direction, j_probs$class)
j_matrix

##       
##        Down Up
##   Down    9 34
##   Up     13 48

acc = (9+56)/sum(j_matrix)
acc

## [1] 0.625

11.

data(Auto)
Auto$mpg01<- ifelse(Auto$mpg>median(Auto$mpg), 1,0)

b) Displacement, horsepower, and weight seem to be the three biggest contributors to predicting mpg.

pairs(Auto)

noname<- Auto[,c(1:8,10)]
cor(noname)

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

bound <- floor((nrow(Auto)/4)*3)     
df <- Auto[sample(nrow(Auto)), ] 
train_df <- df[1:bound, ]            
test_df <- df[(bound+1):nrow(df), ]

d) The accuracy metric (using displacement, weight, and horsepower as predictors) is 0.89, so the error rate is ~0.11.

l<- lda(mpg01 ~ displacement + horsepower + weight, data = train_df)
l_preds<- predict(l, newdata = test_df)
l_matrix<- table(test_df$mpg01, l_preds$class)
l_matrix

##    
##      0  1
##   0 37  8
##   1  2 51

acc = (41+46)/sum(l_matrix)
acc

## [1] 0.8877551

e) The accuracy metric is 0.91, so the error rate is ~0.09.

q<- qda(mpg01 ~ displacement + horsepower + weight, data = train_df)
q_preds<- predict(q, newdata = test_df)
q_matrix<- table(test_df$mpg01, q_preds$class)
q_matrix

##    
##      0  1
##   0 37  8
##   1  4 49

q_acc= (44+45)/sum(q_matrix)
q_acc

## [1] 0.9081633

f) The logistic regression had an accuracy of 0.9, and a subsequent error rate of ~0.1.

p<- glm(mpg01 ~ displacement + horsepower + weight, data = train_df, family = "binomial")
p_probs<- predict(p, type = "response", newdata = test_df)
p_preds<- ifelse(p_probs>0.5, "up", "down")
p_matrix<- table(test_df$mpg01, p_preds)
p_matrix

##    p_preds
##     down up
##   0   40  5
##   1    5 48

acc = (43+45)/sum(p_matrix)
acc

## [1] 0.8979592

g) The KNN did not classify any points incorrectly, achieving an error rate of 0.0.

k<- knn(train = data.frame(train_df$mpg01), test =data.frame(test_df$mpg01), cl = train_df$mpg01, k = 1)
k_matrix<- table(test_df$mpg01, k)
k_matrix

##    k
##      0  1
##   0 45  0
##   1  0 53

k_acc= (50+48)/sum(k_matrix)
k_acc

## [1] 1

13. When we use all variables except “chas” in the models, we can obtain highly accurate results. The logistic regression had a 0.99 accuracy, the LDA had a 0.83 accuracy, and the knn did not have any incorrect classifications. Because these metrics were so high, I think there might be some over-fitting for this dataset.

Log Regression on Boston data Accuracy = 0.99

data("Boston")
Boston$median<- ifelse(Boston$crim> median(Boston$crim), 1,0)
bound <- floor((nrow(Boston)/4)*3)     
df <- Boston[sample(nrow(Boston)), ] 
df<- df[,c(1:3,5:15)]
df.train <- df[1:bound, ]            
df.test <- df[(bound+1):nrow(df), ]

logreg<- glm(as.factor(median) ~ ., data = df.train, family = "binomial")

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

logreg_probs <- predict(logreg, df.test, type="response")
logreg_preds <- ifelse(logreg_probs>.5, "1","0")
logreg_matrix<- table(df.test$median, logreg_preds)
logreg_matrix

##    logreg_preds
##      0  1
##   0 62  1
##   1  0 64

acc = (59+67)/sum(logreg_matrix)

LDA on Boston dataset Accuracy = 0.835

l<- lda(as.factor(median) ~ ., data = df.train)
l_preds<- predict(l, newdata = df.test)
l_matrix<- table(df.test$median, l_preds$class)
l_matrix

##    
##      0  1
##   0 62  1
##   1 12 52

acc = (59+47)/sum(l_matrix)
acc

## [1] 0.8346457

KNN on Boston dataset Accuracy = 1.0

k<- knn(train = data.frame(df.train$median), test =data.frame(df.test$median), cl = df.train$median, k = 1)
k_matrix<- table(df.test$median, k)
k_matrix

##    k
##      0  1
##   0 63  0
##   1  0 64

k_acc= (60+67)/sum(k_matrix)
k_acc

## [1] 1

HW #3 data mining

Amber Chin

3/4/2021