# Load the built-in mtcars data set into the workspace.
data("mtcars")
# Keep a copy of mtcars with any incomplete rows dropped.
# NOTE(review): the statements visible below operate on `mtcars` directly,
# not on this `data` copy -- confirm whether `data` is still needed.
data <- na.omit(mtcars)
# correlation matrix: just to compare with cor.prob output
#cor(mtcars)
# correlation matrix with p-values
#cor.prob(mtcars)
# "flatten" that table
#flattenSquareMatrix(cor.prob(mtcars))
# plot the data: EXAMPLE-1
# Attach PerformanceAnalytics without echoing its start-up messages.
suppressMessages(library("PerformanceAnalytics"))
# Pairwise correlation chart for every column of mtcars. This must run while
# all columns are still numeric, i.e. before `am` is recoded to labels below.
chart.Correlation(R = mtcars)
# Recode the transmission flag into readable labels: 0 -> "Automatic",
# anything else -> "Manual". Run this once only: on a second run no value
# equals 0 any more, so every row would be labelled "Manual".
mtcars[["am"]] <- ifelse(mtcars[["am"]] == 0, "Automatic", "Manual")
# Scatter plot of fuel economy (x) against transmission type (y).
# Only `am` is a data-driven aesthetic, so it alone is mapped inside aes().
# Point size and transparency are fixed constants and are therefore set
# outside aes(); placed inside aes() they would be treated as data, passed
# through a scale, and given their own legends.
# NOTE(review): alpha = 0.1 is now applied literally and is quite faint for
# 32 points -- raise it if the points are hard to see.
ggplot(data = mtcars, mapping = aes(x = mpg, y = am)) +
  geom_point(aes(color = am), size = 1.2, alpha = 0.1) +
  labs(x = "MPG (Miles Per Gallon)", y = "Transmission type") +
  ggtitle("MPG for Transmission types")
# Side-by-side box plots of mpg, one box per transmission type.
boxplot(
  mpg ~ as.factor(am),
  data = mtcars,
  main = "MPG by Transmission types",
  xlab = "Transmission type",
  ylab = "MPG (Miles Per Gallon)",
  col = c("red", "green")
)
# Per-transmission summary of mpg: mean, median and lower/upper quartiles,
# each rounded to one decimal place. `probs` is spelled out in full rather
# than relying on partial matching of `p`; `[[1]]` strips the "25%"/"75%"
# name that quantile() attaches to its result.
mtcars %>%
  group_by(am) %>%
  summarise(
    Mean = round(mean(mpg), 1),
    median = round(median(mpg), 1),
    Q1 = round(quantile(mpg, probs = 0.25)[[1]], 1),
    Q3 = round(quantile(mpg, probs = 0.75)[[1]], 1)
  )
## Source: local data frame [2 x 5]
##
## am Mean median Q1 Q3
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Automatic 17.1 17.3 14.9 19.2
## 2 Manual 24.4 22.8 21.0 30.4
# Treat transmission type as a categorical predictor.
mtcars$am <- factor(mtcars$am)

# Simple linear regression of mpg on transmission type alone.
lm.fit <- lm(formula = mpg ~ am, data = mtcars)
lm.fit
##
## Call:
## lm(formula = mpg ~ am, data = mtcars)
##
## Coefficients:
## (Intercept) amManual
## 17.147 7.245

# Residual summary, coefficient table and fit statistics for the same model.
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ am, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3923 -3.0923 -0.2974 3.2439 9.5077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.147 1.125 15.247 1.13e-15 ***
## amManual 7.245 1.764 4.106 0.000285 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.902 on 30 degrees of freedom
## Multiple R-squared: 0.3598, Adjusted R-squared: 0.3385
## F-statistic: 16.86 on 1 and 30 DF, p-value: 0.000285
The p-value of the “am” variable is < 0.05. This again confirms our earlier conclusion that “am” is a significant predictor of “mpg”, meaning the two have a statistically significant relationship.
INTERPRETATION OF INTERCEPT: the model predicts that cars with automatic transmission (am=0) have an average MPG of 17.1.
INTERPRETATION OF SLOPE: the model predicts that, on average, cars with manual transmission (am=1) have 7.245 higher MPG than cars with automatic transmission (am=0).
The adjusted R-squared of the model is 0.34, which means that only about 34% of the variability in MPG is explained by this model with “am” as the only predictor.
# Reproducible 80/20 split of mtcars into training and test rows.
set.seed(1123)
n <- nrow(mtcars)
# 80% of n is fractional here; the size is floored explicitly, matching the
# 25 training rows reported below.
index <- sample.int(n, size = floor(0.80 * n), replace = FALSE)
train_set <- mtcars[index, ]
dim(train_set)
## [1] 25 11
# Every row not drawn for training goes to the test set.
test_set <- mtcars[-index, ]
dim(test_set)
## [1] 7 11
# Forward stepwise selection (by AIC) on the training set.
# Upper bound of the search: the model containing every available predictor.
model_full <- lm(mpg ~ ., data = train_set)
# Starting point: the intercept-only (null) model, written with the
# conventional `mpg ~ 1` formula so the intent is explicit.
model_int <- lm(mpg ~ 1, data = train_set)
# A single formula passed as `scope` acts as the upper limit of the search.
scopeformula <- formula(model_full)
scopeformula
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
fwd_sel <- step(object = model_int, scope = scopeformula, direction = "forward")
## Start: AIC=91.59
## mpg ~ 1
##
## Df Sum of Sq RSS AIC
## + cyl 1 701.88 198.06 55.743
## + wt 1 650.66 249.28 61.493
## + disp 1 641.41 258.53 62.403
## + hp 1 554.58 345.36 69.643
## + drat 1 414.30 485.64 78.165
## + vs 1 385.60 514.34 79.600
## + am 1 304.21 595.73 83.273
## + carb 1 272.21 627.73 84.581
## + qsec 1 225.94 674.00 86.359
## + gear 1 136.81 763.13 89.464
## <none> 899.94 91.586
##
## Step: AIC=55.74
## mpg ~ cyl
##
## Df Sum of Sq RSS AIC
## + wt 1 69.420 128.65 46.954
## + am 1 31.539 166.53 53.407
## + carb 1 22.919 175.15 54.669
## <none> 198.06 55.743
## + disp 1 10.500 187.56 56.381
## + drat 1 8.377 189.69 56.663
## + qsec 1 6.726 191.34 56.879
## + hp 1 6.289 191.78 56.936
## + vs 1 1.409 196.66 57.564
## + gear 1 1.066 197.00 57.608
##
## Step: AIC=46.95
## mpg ~ cyl + wt
##
## Df Sum of Sq RSS AIC
## + carb 1 18.7522 109.89 45.016
## + gear 1 14.2174 114.43 46.027
## <none> 128.65 46.954
## + disp 1 8.3032 120.34 47.286
## + hp 1 6.1952 122.45 47.721
## + qsec 1 5.6870 122.96 47.824
## + am 1 0.7563 127.89 48.807
## + vs 1 0.5754 128.07 48.842
## + drat 1 0.3860 128.26 48.879
##
## Step: AIC=45.02
## mpg ~ cyl + wt + carb
##
## Df Sum of Sq RSS AIC
## + am 1 18.9262 90.966 42.290
## <none> 109.892 45.016
## + drat 1 3.5500 106.342 46.195
## + disp 1 1.7528 108.140 46.614
## + hp 1 1.4561 108.436 46.682
## + qsec 1 0.4230 109.469 46.919
## + vs 1 0.1895 109.703 46.972
## + gear 1 0.1720 109.720 46.976
##
## Step: AIC=42.29
## mpg ~ cyl + wt + carb + am
##
## Df Sum of Sq RSS AIC
## + gear 1 7.1492 83.817 42.244
## <none> 90.966 42.290
## + qsec 1 2.6838 88.282 43.542
## + vs 1 2.5061 88.460 43.592
## + drat 1 0.6771 90.289 44.104
## + disp 1 0.2536 90.713 44.221
## + hp 1 0.0112 90.955 44.287
##
## Step: AIC=42.24
## mpg ~ cyl + wt + carb + am + gear
##
## Df Sum of Sq RSS AIC
## <none> 83.817 42.244
## + vs 1 1.80719 82.010 43.699
## + drat 1 1.71891 82.098 43.726
## + qsec 1 1.34807 82.469 43.839
## + hp 1 0.29077 83.526 44.157
## + disp 1 0.00243 83.815 44.243
# Coefficient table and fit statistics for the forward-selected model.
summary(fwd_sel)
##
## Call:
## lm(formula = mpg ~ cyl + wt + carb + am + gear, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1535 -1.4578 -0.3146 1.3896 3.5355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.5197 6.5137 6.835 1.6e-06 ***
## cyl -2.0197 0.4732 -4.268 0.000415 ***
## wt -1.5867 1.0306 -1.540 0.140155
## carb -0.5958 0.4917 -1.212 0.240439
## amManual 4.1805 1.7252 2.423 0.025540 *
## gear -1.6873 1.3254 -1.273 0.218362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.1 on 19 degrees of freedom
## Multiple R-squared: 0.9069, Adjusted R-squared: 0.8824
## F-statistic: 37 on 5 and 19 DF, p-value: 3.737e-09
# Store the AIC of the forward-selected model for the comparison table.
# AIC() is on a different scale from the figures in the step() trace (which
# come from extractAIC() and omit an additive constant), so the number printed
# here is not expected to match the trace.
FwdSelection_AIC <- stats::AIC(fwd_sel)
FwdSelection_AIC # AIC of the model using forward selection method
## [1] 115.1909
# Backward stepwise elimination (by AIC): start from the model with every
# predictor and drop terms one at a time.
model_full <- lm(formula = mpg ~ ., data = train_set)
scopeformula <- stats::formula(model_full)
back_sel <- step(
  object = model_full,
  scope = scopeformula,
  direction = "backward"
)
## Start: AIC=50.87
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - disp 1 0.0026 79.330 48.868
## - hp 1 0.1028 79.430 48.900
## - vs 1 0.3859 79.713 48.989
## - qsec 1 0.8612 80.188 49.138
## - drat 1 2.0389 81.366 49.502
## - carb 1 2.7495 82.077 49.719
## - wt 1 2.8567 82.184 49.752
## <none> 79.327 50.868
## - gear 1 6.7757 86.103 50.917
## - cyl 1 8.9698 88.297 51.546
## - am 1 24.3651 103.692 55.564
##
## Step: AIC=48.87
## mpg ~ cyl + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - hp 1 0.1327 79.462 46.910
## - vs 1 0.3939 79.724 46.992
## - qsec 1 1.0179 80.348 47.187
## - drat 1 2.0604 81.390 47.509
## - carb 1 5.8782 85.208 48.655
## <none> 79.330 48.868
## - gear 1 6.7851 86.115 48.920
## - wt 1 8.2676 87.597 49.347
## - cyl 1 9.2855 88.615 49.636
## - am 1 24.4539 103.784 53.586
##
## Step: AIC=46.91
## mpg ~ cyl + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - vs 1 0.4602 79.923 45.055
## - qsec 1 0.9605 80.423 45.211
## - drat 1 2.1632 81.626 45.582
## - carb 1 5.9714 85.434 46.722
## <none> 79.462 46.910
## - gear 1 6.6903 86.153 46.931
## - wt 1 8.2754 87.738 47.387
## - cyl 1 11.7335 91.196 48.353
## - am 1 25.6991 105.161 51.916
##
## Step: AIC=45.05
## mpg ~ cyl + drat + wt + qsec + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - qsec 1 2.1755 82.098 43.726
## - drat 1 2.5463 82.469 43.839
## - carb 1 5.5560 85.479 44.735
## <none> 79.923 45.055
## - gear 1 6.7952 86.718 45.095
## - wt 1 10.1926 90.115 46.055
## - cyl 1 12.6622 92.585 46.731
## - am 1 25.4033 105.326 49.955
##
## Step: AIC=43.73
## mpg ~ cyl + drat + wt + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - drat 1 1.719 83.817 42.244
## <none> 82.098 43.726
## - carb 1 7.243 89.341 43.840
## - gear 1 8.191 90.289 44.104
## - wt 1 8.316 90.414 44.138
## - am 1 23.349 105.447 47.983
## - cyl 1 63.282 145.380 56.012
##
## Step: AIC=42.24
## mpg ~ cyl + wt + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - carb 1 6.478 90.295 42.105
## <none> 83.817 42.244
## - gear 1 7.149 90.966 42.290
## - wt 1 10.456 94.273 43.183
## - am 1 25.903 109.720 46.976
## - cyl 1 80.373 164.189 57.054
##
## Step: AIC=42.11
## mpg ~ cyl + wt + am + gear
##
## Df Sum of Sq RSS AIC
## <none> 90.295 42.105
## - wt 1 23.035 113.330 45.786
## - am 1 24.132 114.427 46.027
## - gear 1 37.593 127.888 48.807
## - cyl 1 151.069 241.364 64.686
# Coefficient table and fit statistics for the backward-selected model.
summary(back_sel)
##
## Call:
## lm(formula = mpg ~ cyl + wt + am + gear, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8425 -1.1382 -0.3879 1.0215 3.7350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.6645 4.1360 12.250 9.44e-11 ***
## cyl -2.3297 0.4027 -5.785 1.17e-05 ***
## wt -2.1250 0.9408 -2.259 0.03522 *
## amManual 4.0236 1.7404 2.312 0.03155 *
## gear -2.7971 0.9693 -2.886 0.00914 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.125 on 20 degrees of freedom
## Multiple R-squared: 0.8997, Adjusted R-squared: 0.8796
## F-statistic: 44.83 on 4 and 20 DF, p-value: 1.034e-09
# Store the AIC of the backward-selected model for the comparison table.
BackSelection_AIC <- stats::AIC(back_sel)
BackSelection_AIC # AIC of the model using backward selection method
## [1] 115.0521
# plots for linear regression assumptions:
# plot() on the fitted lm object draws its standard diagnostic plots.
plot(back_sel)
# Residuals plotted against their position in the training set. The
# expression is left exactly as written because the default y-axis label is
# taken from its text.
plot(back_sel$residuals)
# Bidirectional stepwise selection (by AIC): start from the full model and,
# at each step, consider both dropping a term and re-adding a dropped one.
model_full <- lm(formula = mpg ~ ., data = train_set)
scopeformula <- stats::formula(model_full)
both_sel <- step(
  object = model_full,
  scope = scopeformula,
  direction = "both"
)
## Start: AIC=50.87
## mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - disp 1 0.0026 79.330 48.868
## - hp 1 0.1028 79.430 48.900
## - vs 1 0.3859 79.713 48.989
## - qsec 1 0.8612 80.188 49.138
## - drat 1 2.0389 81.366 49.502
## - carb 1 2.7495 82.077 49.719
## - wt 1 2.8567 82.184 49.752
## <none> 79.327 50.868
## - gear 1 6.7757 86.103 50.917
## - cyl 1 8.9698 88.297 51.546
## - am 1 24.3651 103.692 55.564
##
## Step: AIC=48.87
## mpg ~ cyl + hp + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - hp 1 0.1327 79.462 46.910
## - vs 1 0.3939 79.724 46.992
## - qsec 1 1.0179 80.348 47.187
## - drat 1 2.0604 81.390 47.509
## - carb 1 5.8782 85.208 48.655
## <none> 79.330 48.868
## - gear 1 6.7851 86.115 48.920
## - wt 1 8.2676 87.597 49.347
## - cyl 1 9.2855 88.615 49.636
## + disp 1 0.0026 79.327 50.868
## - am 1 24.4539 103.784 53.586
##
## Step: AIC=46.91
## mpg ~ cyl + drat + wt + qsec + vs + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - vs 1 0.4602 79.923 45.055
## - qsec 1 0.9605 80.423 45.211
## - drat 1 2.1632 81.626 45.582
## - carb 1 5.9714 85.434 46.722
## <none> 79.462 46.910
## - gear 1 6.6903 86.153 46.931
## - wt 1 8.2754 87.738 47.387
## - cyl 1 11.7335 91.196 48.353
## + hp 1 0.1327 79.330 48.868
## + disp 1 0.0324 79.430 48.900
## - am 1 25.6991 105.161 51.916
##
## Step: AIC=45.05
## mpg ~ cyl + drat + wt + qsec + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - qsec 1 2.1755 82.098 43.726
## - drat 1 2.5463 82.469 43.839
## - carb 1 5.5560 85.479 44.735
## <none> 79.923 45.055
## - gear 1 6.7952 86.718 45.095
## - wt 1 10.1926 90.115 46.055
## - cyl 1 12.6622 92.585 46.731
## + vs 1 0.4602 79.462 46.910
## + hp 1 0.1989 79.724 46.992
## + disp 1 0.0348 79.888 47.044
## - am 1 25.4033 105.326 49.955
##
## Step: AIC=43.73
## mpg ~ cyl + drat + wt + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - drat 1 1.719 83.817 42.244
## <none> 82.098 43.726
## - carb 1 7.243 89.341 43.840
## - gear 1 8.191 90.289 44.104
## - wt 1 8.316 90.414 44.138
## + qsec 1 2.175 79.923 45.055
## + vs 1 1.675 80.423 45.211
## + hp 1 0.155 81.943 45.679
## + disp 1 0.135 81.963 45.685
## - am 1 23.349 105.447 47.983
## - cyl 1 63.282 145.380 56.012
##
## Step: AIC=42.24
## mpg ~ cyl + wt + am + gear + carb
##
## Df Sum of Sq RSS AIC
## - carb 1 6.478 90.295 42.105
## <none> 83.817 42.244
## - gear 1 7.149 90.966 42.290
## - wt 1 10.456 94.273 43.183
## + vs 1 1.807 82.010 43.699
## + drat 1 1.719 82.098 43.726
## + qsec 1 1.348 82.469 43.839
## + hp 1 0.291 83.526 44.157
## + disp 1 0.002 83.815 44.243
## - am 1 25.903 109.720 46.976
## - cyl 1 80.373 164.189 57.054
##
## Step: AIC=42.11
## mpg ~ cyl + wt + am + gear
##
## Df Sum of Sq RSS AIC
## <none> 90.295 42.105
## + carb 1 6.478 83.817 42.244
## + qsec 1 2.829 87.466 43.309
## + disp 1 1.540 88.755 43.675
## + vs 1 1.238 89.058 43.760
## + drat 1 0.954 89.341 43.840
## + hp 1 0.224 90.071 44.043
## - wt 1 23.035 113.330 45.786
## - am 1 24.132 114.427 46.027
## - gear 1 37.593 127.888 48.807
## - cyl 1 151.069 241.364 64.686
# Coefficient table and fit statistics for the bidirectionally selected model.
summary(both_sel)
##
## Call:
## lm(formula = mpg ~ cyl + wt + am + gear, data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8425 -1.1382 -0.3879 1.0215 3.7350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.6645 4.1360 12.250 9.44e-11 ***
## cyl -2.3297 0.4027 -5.785 1.17e-05 ***
## wt -2.1250 0.9408 -2.259 0.03522 *
## amManual 4.0236 1.7404 2.312 0.03155 *
## gear -2.7971 0.9693 -2.886 0.00914 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.125 on 20 degrees of freedom
## Multiple R-squared: 0.8997, Adjusted R-squared: 0.8796
## F-statistic: 44.83 on 4 and 20 DF, p-value: 1.034e-09
# Store the AIC of the bidirectionally selected model for the comparison table.
BidirSelection_AIC <- stats::AIC(both_sel)
BidirSelection_AIC # AIC of the model using bidirectional(both) selection method
## [1] 115.0521
# One-row table placing the AIC of the three selected models side by side.
AIC_df <- data.frame(
  FwdSelection = FwdSelection_AIC,
  BackSelection = BackSelection_AIC,
  BidirSelection = BidirSelection_AIC,
  row.names = "AIC"
)
AIC_df
## FwdSelection BackSelection BidirSelection
## AIC 115.1909 115.0521 115.0521
The p-values of all variables in the final model are below the commonly used significance level of 0.05, so all of these independent variables are significant predictors in the model.
The p-value of the “am” variable is < 0.05. This again confirms our earlier conclusion that “am” is a significant predictor of “mpg”, meaning MPG and AM have a statistically significant relationship.
INTERPRETATION OF SLOPE: the model predicts that, on average, cars with manual transmission (am=1) have 4.024 higher MPG than cars with automatic transmission (am=0), holding the other predictors (cyl, wt and gear) constant.
The adjusted R-squared of the model is 0.8796, which means that about 88% of the variability in MPG is explained by this model with its 4 predictor variables.