data(Auto)
Auto <- na.omit(Auto)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
sapply(Auto[, -9], range)
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 9.0 3 68 46 1613 8.0 70 1
## [2,] 46.6 8 455 230 5140 24.8 82 3
sapply(Auto[, -9], mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year origin
## 75.979592 1.576531
sapply(Auto[, -9], sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.8050075 1.7057832 104.6440039 38.4911599 849.4025600 2.7588641
## year origin
## 3.6837365 0.8055182
Auto_sub <- Auto[-(10:85), ]
sapply(Auto_sub[, -9], range)
## mpg cylinders displacement horsepower weight acceleration year origin
## [1,] 11.0 3 68 46 1649 8.5 70 1
## [2,] 46.6 8 455 230 4997 24.8 82 3
sapply(Auto_sub[, -9], mean)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year origin
## 77.145570 1.601266
sapply(Auto_sub[, -9], sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year origin
## 3.106217 0.819910
GGally::ggpairs(Auto %>% select_if(is.numeric))
Strong predictors: horsepower, weight, displacement.
GGally::ggpairs(Auto %>% select_if(is.numeric))
cor(Auto %>% select_if(is.numeric))
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
lm_fit <- lm(mpg ~ . - name, data = Auto)
summary(lm_fit)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_fit)
lm_interact <- lm(mpg ~ (.-name)^2, data=Auto)
summary(lm_interact)
##
## Call:
## lm(formula = mpg ~ (. - name)^2, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6303 -1.4481 0.0596 1.2739 11.1386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.548e+01 5.314e+01 0.668 0.50475
## cylinders 6.989e+00 8.248e+00 0.847 0.39738
## displacement -4.785e-01 1.894e-01 -2.527 0.01192 *
## horsepower 5.034e-01 3.470e-01 1.451 0.14769
## weight 4.133e-03 1.759e-02 0.235 0.81442
## acceleration -5.859e+00 2.174e+00 -2.696 0.00735 **
## year 6.974e-01 6.097e-01 1.144 0.25340
## origin -2.090e+01 7.097e+00 -2.944 0.00345 **
## cylinders:displacement -3.383e-03 6.455e-03 -0.524 0.60051
## cylinders:horsepower 1.161e-02 2.420e-02 0.480 0.63157
## cylinders:weight 3.575e-04 8.955e-04 0.399 0.69000
## cylinders:acceleration 2.779e-01 1.664e-01 1.670 0.09584 .
## cylinders:year -1.741e-01 9.714e-02 -1.793 0.07389 .
## cylinders:origin 4.022e-01 4.926e-01 0.816 0.41482
## displacement:horsepower -8.491e-05 2.885e-04 -0.294 0.76867
## displacement:weight 2.472e-05 1.470e-05 1.682 0.09342 .
## displacement:acceleration -3.479e-03 3.342e-03 -1.041 0.29853
## displacement:year 5.934e-03 2.391e-03 2.482 0.01352 *
## displacement:origin 2.398e-02 1.947e-02 1.232 0.21875
## horsepower:weight -1.968e-05 2.924e-05 -0.673 0.50124
## horsepower:acceleration -7.213e-03 3.719e-03 -1.939 0.05325 .
## horsepower:year -5.838e-03 3.938e-03 -1.482 0.13916
## horsepower:origin 2.233e-03 2.930e-02 0.076 0.93931
## weight:acceleration 2.346e-04 2.289e-04 1.025 0.30596
## weight:year -2.245e-04 2.127e-04 -1.056 0.29182
## weight:origin -5.789e-04 1.591e-03 -0.364 0.71623
## acceleration:year 5.562e-02 2.558e-02 2.174 0.03033 *
## acceleration:origin 4.583e-01 1.567e-01 2.926 0.00365 **
## year:origin 1.393e-01 7.399e-02 1.882 0.06062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.695 on 363 degrees of freedom
## Multiple R-squared: 0.8893, Adjusted R-squared: 0.8808
## F-statistic: 104.2 on 28 and 363 DF, p-value: < 2.2e-16
lm_poly <- lm(mpg ~ poly(horsepower,2), data=Auto)
summary(lm_poly)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.7135 -2.5943 -0.0859 2.2868 15.8961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.13 <2e-16 ***
## poly(horsepower, 2)1 -120.1377 4.3739 -27.47 <2e-16 ***
## poly(horsepower, 2)2 44.0895 4.3739 10.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.374 on 389 degrees of freedom
## Multiple R-squared: 0.6876, Adjusted R-squared: 0.686
## F-statistic: 428 on 2 and 389 DF, p-value: < 2.2e-16
data(Boston)
results <- map(names(Boston)[-1], function(var){
model <- lm(as.formula(paste("crim ~", var)), data=Boston)
summary(model)$coefficients[2,]
})
names(results) <- names(Boston)[-1]
results
## $zn
## Estimate Std. Error t value Pr(>|t|)
## -7.393498e-02 1.609460e-02 -4.593776e+00 5.506472e-06
##
## $indus
## Estimate Std. Error t value Pr(>|t|)
## 5.097763e-01 5.102433e-02 9.990848e+00 1.450349e-21
##
## $chas
## Estimate Std. Error t value Pr(>|t|)
## -1.8927766 1.5061155 -1.2567274 0.2094345
##
## $nox
## Estimate Std. Error t value Pr(>|t|)
## 3.124853e+01 2.999190e+00 1.041899e+01 3.751739e-23
##
## $rm
## Estimate Std. Error t value Pr(>|t|)
## -2.684051e+00 5.320411e-01 -5.044819e+00 6.346703e-07
##
## $age
## Estimate Std. Error t value Pr(>|t|)
## 1.077862e-01 1.273644e-02 8.462825e+00 2.854869e-16
##
## $dis
## Estimate Std. Error t value Pr(>|t|)
## -1.550902e+00 1.683300e-01 -9.213458e+00 8.519949e-19
##
## $rad
## Estimate Std. Error t value Pr(>|t|)
## 6.179109e-01 3.433182e-02 1.799820e+01 2.693844e-56
##
## $tax
## Estimate Std. Error t value Pr(>|t|)
## 2.974225e-02 1.847415e-03 1.609939e+01 2.357127e-47
##
## $ptratio
## Estimate Std. Error t value Pr(>|t|)
## 1.151983e+00 1.693736e-01 6.801430e+00 2.942922e-11
##
## $black
## Estimate Std. Error t value Pr(>|t|)
## -3.627964e-02 3.873154e-03 -9.366951e+00 2.487274e-19
##
## $lstat
## Estimate Std. Error t value Pr(>|t|)
## 5.488048e-01 4.776097e-02 1.149065e+01 2.654277e-27
##
## $medv
## Estimate Std. Error t value Pr(>|t|)
## -3.631599e-01 3.839017e-02 -9.459710e+00 1.173987e-19
lm_boston <- lm(crim ~ ., data=Boston)
summary(lm_boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
simple_coef <- sapply(results, function(x) x[1])
multi_coef <- coef(lm_boston)[-1]
plot(simple_coef, multi_coef)
abline(0,1,col="red")
poly_results <- map(names(Boston)[-1], function(var){
x <- Boston[[var]]
if(length(unique(x)) > 3){
model <- lm(as.formula(paste("crim ~ poly(", var, ",3)", sep="")), data=Boston)
} else {
model <- lm(as.formula(paste("crim ~", var)), data=Boston)
}
summary(model)
})
data(Weekly)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
GGally::ggpairs(Weekly %>% select_if(is.numeric))
glm_full <- glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Weekly, family=binomial)
summary(glm_full)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
prob <- predict(glm_full, type="response")
pred <- ifelse(prob>0.5,"Up","Down")
table(pred, Weekly$Direction)
##
## pred Down Up
## Down 54 48
## Up 430 557
mean(pred==Weekly$Direction)
## [1] 0.5610652
train <- Weekly$Year < 2009
glm_lag2 <- glm(Direction ~ Lag2,
data=Weekly, subset=train, family=binomial)
prob_test <- predict(glm_lag2, Weekly[!train,], type="response")
pred_test <- ifelse(prob_test>0.5,"Up","Down")
table(pred_test, Weekly$Direction[!train])
##
## pred_test Down Up
## Down 9 5
## Up 34 56
mean(pred_test==Weekly$Direction[!train])
## [1] 0.625
lda_fit <- lda(Direction ~ Lag2, data=Weekly, subset=train)
lda_pred <- predict(lda_fit, Weekly[!train,])
table(lda_pred$class, Weekly$Direction[!train])
##
## Down Up
## Down 9 5
## Up 34 56
mean(lda_pred$class==Weekly$Direction[!train])
## [1] 0.625
qda_fit <- qda(Direction ~ Lag2, data=Weekly, subset=train)
qda_pred <- predict(qda_fit, Weekly[!train,])
table(qda_pred$class, Weekly$Direction[!train])
##
## Down Up
## Down 0 0
## Up 43 61
mean(qda_pred$class==Weekly$Direction[!train])
## [1] 0.5865385
train_x <- as.matrix(Weekly[train,"Lag2",drop=FALSE])
test_x <- as.matrix(Weekly[!train,"Lag2",drop=FALSE])
train_y <- Weekly$Direction[train]
knn_pred <- knn(train_x, test_x, train_y, k=1)
table(knn_pred, Weekly$Direction[!train])
##
## knn_pred Down Up
## Down 21 30
## Up 22 31
mean(knn_pred==Weekly$Direction[!train])
## [1] 0.5
nb_fit <- naiveBayes(Direction ~ Lag2, data=Weekly[train,])
nb_pred <- predict(nb_fit, Weekly[!train,])
table(nb_pred, Weekly$Direction[!train])
##
## nb_pred Down Up
## Down 0 0
## Up 43 61
mean(nb_pred==Weekly$Direction[!train])
## [1] 0.5865385
Best: Logistic / LDA (~60%)
glm_best <- glm(Direction ~ Lag1+Lag2,
data=Weekly, subset=train, family=binomial)
prob_best <- predict(glm_best, Weekly[!train,], type="response")
pred_best <- ifelse(prob_best>0.5,"Up","Down")
table(pred_best, Weekly$Direction[!train])
##
## pred_best Down Up
## Down 7 8
## Up 36 53
mean(pred_best==Weekly$Direction[!train])
## [1] 0.5769231
All models run successfully. Nonlinearity exists in Boston data. Financial prediction remains difficult.