Stepwise Model Selection
data_set<-read.csv("Advertising.csv")
data_set<-data_set[ , c(2:5)]
data_set
library(caret)
set.seed(1234)
k_flodsTrain<-trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
verboseIter = TRUE)
trainIndexs<-createDataPartition(data_set$TV, p=0.7, list = FALSE)
trainData<-data_set[trainIndexs, ]
testData<-data_set[-trainIndexs, ]
trainData
modelo<-train(Sales ~ TV,
data = trainData,
method="lm",
trControl=k_flodsTrain)
+ Fold01.Rep1: intercept=TRUE
- Fold01.Rep1: intercept=TRUE
+ Fold02.Rep1: intercept=TRUE
- Fold02.Rep1: intercept=TRUE
+ Fold03.Rep1: intercept=TRUE
- Fold03.Rep1: intercept=TRUE
+ Fold04.Rep1: intercept=TRUE
- Fold04.Rep1: intercept=TRUE
+ Fold05.Rep1: intercept=TRUE
- Fold05.Rep1: intercept=TRUE
+ Fold06.Rep1: intercept=TRUE
- Fold06.Rep1: intercept=TRUE
+ Fold07.Rep1: intercept=TRUE
- Fold07.Rep1: intercept=TRUE
+ Fold08.Rep1: intercept=TRUE
- Fold08.Rep1: intercept=TRUE
+ Fold09.Rep1: intercept=TRUE
- Fold09.Rep1: intercept=TRUE
+ Fold10.Rep1: intercept=TRUE
- Fold10.Rep1: intercept=TRUE
+ Fold01.Rep2: intercept=TRUE
- Fold01.Rep2: intercept=TRUE
+ Fold02.Rep2: intercept=TRUE
- Fold02.Rep2: intercept=TRUE
+ Fold03.Rep2: intercept=TRUE
- Fold03.Rep2: intercept=TRUE
+ Fold04.Rep2: intercept=TRUE
- Fold04.Rep2: intercept=TRUE
+ Fold05.Rep2: intercept=TRUE
- Fold05.Rep2: intercept=TRUE
+ Fold06.Rep2: intercept=TRUE
- Fold06.Rep2: intercept=TRUE
+ Fold07.Rep2: intercept=TRUE
- Fold07.Rep2: intercept=TRUE
+ Fold08.Rep2: intercept=TRUE
- Fold08.Rep2: intercept=TRUE
+ Fold09.Rep2: intercept=TRUE
- Fold09.Rep2: intercept=TRUE
+ Fold10.Rep2: intercept=TRUE
- Fold10.Rep2: intercept=TRUE
+ Fold01.Rep3: intercept=TRUE
- Fold01.Rep3: intercept=TRUE
+ Fold02.Rep3: intercept=TRUE
- Fold02.Rep3: intercept=TRUE
+ Fold03.Rep3: intercept=TRUE
- Fold03.Rep3: intercept=TRUE
+ Fold04.Rep3: intercept=TRUE
- Fold04.Rep3: intercept=TRUE
+ Fold05.Rep3: intercept=TRUE
- Fold05.Rep3: intercept=TRUE
+ Fold06.Rep3: intercept=TRUE
- Fold06.Rep3: intercept=TRUE
+ Fold07.Rep3: intercept=TRUE
- Fold07.Rep3: intercept=TRUE
+ Fold08.Rep3: intercept=TRUE
- Fold08.Rep3: intercept=TRUE
+ Fold09.Rep3: intercept=TRUE
- Fold09.Rep3: intercept=TRUE
+ Fold10.Rep3: intercept=TRUE
- Fold10.Rep3: intercept=TRUE
+ Fold01.Rep4: intercept=TRUE
- Fold01.Rep4: intercept=TRUE
+ Fold02.Rep4: intercept=TRUE
- Fold02.Rep4: intercept=TRUE
+ Fold03.Rep4: intercept=TRUE
- Fold03.Rep4: intercept=TRUE
+ Fold04.Rep4: intercept=TRUE
- Fold04.Rep4: intercept=TRUE
+ Fold05.Rep4: intercept=TRUE
- Fold05.Rep4: intercept=TRUE
+ Fold06.Rep4: intercept=TRUE
- Fold06.Rep4: intercept=TRUE
+ Fold07.Rep4: intercept=TRUE
- Fold07.Rep4: intercept=TRUE
+ Fold08.Rep4: intercept=TRUE
- Fold08.Rep4: intercept=TRUE
+ Fold09.Rep4: intercept=TRUE
- Fold09.Rep4: intercept=TRUE
+ Fold10.Rep4: intercept=TRUE
- Fold10.Rep4: intercept=TRUE
+ Fold01.Rep5: intercept=TRUE
- Fold01.Rep5: intercept=TRUE
+ Fold02.Rep5: intercept=TRUE
- Fold02.Rep5: intercept=TRUE
+ Fold03.Rep5: intercept=TRUE
- Fold03.Rep5: intercept=TRUE
+ Fold04.Rep5: intercept=TRUE
- Fold04.Rep5: intercept=TRUE
+ Fold05.Rep5: intercept=TRUE
- Fold05.Rep5: intercept=TRUE
+ Fold06.Rep5: intercept=TRUE
- Fold06.Rep5: intercept=TRUE
+ Fold07.Rep5: intercept=TRUE
- Fold07.Rep5: intercept=TRUE
+ Fold08.Rep5: intercept=TRUE
- Fold08.Rep5: intercept=TRUE
+ Fold09.Rep5: intercept=TRUE
- Fold09.Rep5: intercept=TRUE
+ Fold10.Rep5: intercept=TRUE
- Fold10.Rep5: intercept=TRUE
Aggregating results
Fitting final model on full training set
modelo
Linear Regression
140 samples
1 predictor
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 126, 125, 127, 126, 126, 127, ...
Resampling results:
RMSE Rsquared MAE
3.213605 0.6292166 2.565841
Tuning parameter 'intercept' was held constant at a value of TRUE
summary(modelo)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-8.2543 -1.8500 -0.1661 2.1735 7.3204
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.932358 0.551085 12.58 <2e-16 ***
TV 0.047423 0.003239 14.64 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.266 on 138 degrees of freedom
Multiple R-squared: 0.6083, Adjusted R-squared: 0.6055
F-statistic: 214.3 on 1 and 138 DF, p-value: < 2.2e-16
predict(object = modelo, newdata = testData)
1 2 8 13 15 23 24 25 27 32 35
17.844356 9.042675 12.632585 8.061022 16.611361 7.558340 17.758994 9.886802 13.709083 12.286398 11.470725
36 37 39 40 43 54 56 58 61 62 63
20.718180 19.589516 8.976283 17.744768 20.855706 15.591770 16.364763 13.391350 9.469481 19.323948 18.280646
66 67 68 70 79 82 83 87 90 92 93
10.204535 8.426178 13.538361 17.213632 7.188442 18.304357 10.503299 10.550722 12.139387 8.288652 17.256312
94 97 100 106 109 116 118 119 125 132 133
18.830751 16.303113 13.343927 13.471969 7.553598 10.493814 10.555464 12.893410 17.815902 19.508897 7.330710
137 138 155 160 164 167 169 170 173 174 177
8.146383 19.911992 15.838369 13.177947 14.685994 7.781227 17.147240 20.414674 7.861846 14.918366 18.712194
178 181 188 194 198
15.003727 14.358776 15.994864 14.842489 15.326202
Forward Selection
null_model<-lm(formula = price ~ 1, data = diamonds)
summary(null_model)
Call:
lm(formula = price ~ 1, data = diamonds)
Residuals:
Min 1Q Median 3Q Max
-3607 -2983 -1532 1392 14890
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3932.80 17.18 229 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3989 on 53939 degrees of freedom
full_model<-lm(formula = price ~ ., data = diamonds)
summary(full_model)
Call:
lm(formula = price ~ ., data = diamonds)
Residuals:
Min 1Q Median 3Q Max
-21376.0 -592.4 -183.5 376.4 10694.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5753.762 396.630 14.507 < 2e-16 ***
carat 11256.978 48.628 231.494 < 2e-16 ***
cut.L 584.457 22.478 26.001 < 2e-16 ***
cut.Q -301.908 17.994 -16.778 < 2e-16 ***
cut.C 148.035 15.483 9.561 < 2e-16 ***
cut^4 -20.794 12.377 -1.680 0.09294 .
color.L -1952.160 17.342 -112.570 < 2e-16 ***
color.Q -672.054 15.777 -42.597 < 2e-16 ***
color.C -165.283 14.725 -11.225 < 2e-16 ***
color^4 38.195 13.527 2.824 0.00475 **
color^5 -95.793 12.776 -7.498 6.59e-14 ***
color^6 -48.466 11.614 -4.173 3.01e-05 ***
clarity.L 4097.431 30.259 135.414 < 2e-16 ***
clarity.Q -1925.004 28.227 -68.197 < 2e-16 ***
clarity.C 982.205 24.152 40.668 < 2e-16 ***
clarity^4 -364.918 19.285 -18.922 < 2e-16 ***
clarity^5 233.563 15.752 14.828 < 2e-16 ***
clarity^6 6.883 13.715 0.502 0.61575
clarity^7 90.640 12.103 7.489 7.06e-14 ***
depth -63.806 4.535 -14.071 < 2e-16 ***
table -26.474 2.912 -9.092 < 2e-16 ***
x -1008.261 32.898 -30.648 < 2e-16 ***
y 9.609 19.333 0.497 0.61918
z -50.119 33.486 -1.497 0.13448
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1130 on 53916 degrees of freedom
Multiple R-squared: 0.9198, Adjusted R-squared: 0.9198
F-statistic: 2.688e+04 on 23 and 53916 DF, p-value: < 2.2e-16
forward_selection<-step(
object = null_model,
scope = list(lower = null_model, upper = full_model),
direction = "forward"
)
Start: AIC=894477.9
price ~ 1
Df Sum of Sq RSS AIC
+ carat 1 7.2913e+11 1.2935e+11 792389
+ x 1 6.7152e+11 1.8695e+11 812259
+ y 1 6.4296e+11 2.1552e+11 819929
+ z 1 6.3677e+11 2.2170e+11 821454
+ color 6 2.6849e+10 8.3162e+11 892776
+ clarity 7 2.3308e+10 8.3517e+11 893007
+ table 1 1.3876e+10 8.4460e+11 893601
+ cut 4 1.1042e+10 8.4743e+11 893788
+ depth 1 9.7323e+07 8.5838e+11 894474
<none> 8.5847e+11 894478
Step: AIC=792389.4
price ~ carat
Df Sum of Sq RSS AIC
+ clarity 7 3.9082e+10 9.0264e+10 772998
+ color 6 1.2561e+10 1.1678e+11 786891
+ cut 4 6.1332e+09 1.2321e+11 789777
+ x 1 3.5206e+09 1.2583e+11 790903
+ z 1 2.8493e+09 1.2650e+11 791190
+ table 1 1.4377e+09 1.2791e+11 791789
+ y 1 1.2425e+09 1.2810e+11 791871
+ depth 1 1.1546e+09 1.2819e+11 791908
<none> 1.2935e+11 792389
Step: AIC=772998.5
price ~ carat + clarity
Df Sum of Sq RSS AIC
+ color 6 1.6402e+10 7.3862e+10 762193
+ x 1 1.8542e+09 8.8410e+10 771881
+ cut 4 1.7808e+09 8.8483e+10 771932
+ z 1 1.4814e+09 8.8783e+10 772108
+ y 1 7.4127e+08 8.9523e+10 772556
+ table 1 3.7751e+08 8.9886e+10 772774
+ depth 1 3.5822e+08 8.9906e+10 772786
<none> 9.0264e+10 772998
Step: AIC=762193.4
price ~ carat + clarity + color
Df Sum of Sq RSS AIC
+ x 1 2733710969 7.1128e+10 760161
+ z 1 1842294631 7.2020e+10 760833
+ cut 4 1699187372 7.2163e+10 760946
+ y 1 1145039064 7.2717e+10 761353
+ table 1 409645878 7.3452e+10 761895
+ depth 1 174658715 7.3687e+10 762068
<none> 7.3862e+10 762193
Step: AIC=760161.1
price ~ carat + clarity + color + x
Df Sum of Sq RSS AIC
+ cut 4 1918248123 6.9210e+10 758694
+ depth 1 722282102 7.0406e+10 759613
+ table 1 273738191 7.0855e+10 759955
+ z 1 199547343 7.0929e+10 760012
+ y 1 5354253 7.1123e+10 760159
<none> 7.1128e+10 760161
Step: AIC=758694.4
price ~ carat + clarity + color + x + cut
Df Sum of Sq RSS AIC
+ depth 1 244682865 6.8965e+10 758505
+ z 1 72666922 6.9137e+10 758640
+ table 1 9935285 6.9200e+10 758689
<none> 6.9210e+10 758694
+ y 1 982101 6.9209e+10 758696
Step: AIC=758505.4
price ~ carat + clarity + color + x + cut + depth
Df Sum of Sq RSS AIC
+ table 1 105497218 6.8860e+10 758425
<none> 6.8965e+10 758505
+ z 1 2323719 6.8963e+10 758506
+ y 1 298553 6.8965e+10 758507
Step: AIC=758424.8
price ~ carat + clarity + color + x + cut + depth + table
Df Sum of Sq RSS AIC
+ z 1 2662170 6.8857e+10 758425
<none> 6.8860e+10 758425
+ y 1 116788 6.8860e+10 758427
Step: AIC=758424.7
price ~ carat + clarity + color + x + cut + depth + table + z
Df Sum of Sq RSS AIC
<none> 6.8857e+10 758425
+ y 1 315487 6.8857e+10 758426
forward_selection
Call:
lm(formula = price ~ carat + clarity + color + x + cut + depth +
table + z, data = diamonds)
Coefficients:
(Intercept) carat clarity.L clarity.Q clarity.C clarity^4 clarity^5 clarity^6
5768.782 11257.752 4097.613 -1925.133 982.322 -364.976 233.635 6.871
clarity^7 color.L color.Q color.C color^4 color^5 color^6 x
90.622 -1952.179 -672.075 -165.277 38.193 -95.780 -48.452 -1000.354
cut.L cut.Q cut.C cut^4 depth table z
584.600 -302.211 148.446 -20.619 -64.003 -26.501 -47.925
Backward Elimination
backward_elimination<-step(
object = full_model,
scope = list(lower = null_model, upper = full_model),
direction = "backward"
)
Start: AIC=758426.5
price ~ carat + cut + color + clarity + depth + table + x + y +
z
Df Sum of Sq RSS AIC
- y 1 3.1549e+05 6.8857e+10 758425
<none> 6.8857e+10 758426
- z 1 2.8609e+06 6.8860e+10 758427
- table 1 1.0558e+08 6.8962e+10 758507
- depth 1 2.5286e+08 6.9110e+10 758622
- cut 4 8.6357e+08 6.9720e+10 759091
- x 1 1.1996e+09 7.0056e+10 759356
- color 6 1.7082e+10 8.5939e+10 770368
- clarity 7 3.5703e+10 1.0456e+11 780945
- carat 1 6.8440e+10 1.3730e+11 795649
Step: AIC=758424.7
price ~ carat + cut + color + clarity + depth + table + x + z
Df Sum of Sq RSS AIC
<none> 6.8857e+10 758425
- z 1 2.6622e+06 6.8860e+10 758425
- table 1 1.0584e+08 6.8963e+10 758506
- depth 1 2.5637e+08 6.9114e+10 758623
- cut 4 8.6409e+08 6.9721e+10 759089
- x 1 1.5413e+09 7.0398e+10 759617
- color 6 1.7082e+10 8.5940e+10 770366
- clarity 7 3.5708e+10 1.0457e+11 780946
- carat 1 6.8520e+10 1.3738e+11 795679
backward_elimination
Call:
lm(formula = price ~ carat + cut + color + clarity + depth +
table + x + z, data = diamonds)
Coefficients:
(Intercept) carat cut.L cut.Q cut.C cut^4 color.L color.Q
5768.782 11257.752 584.600 -302.211 148.446 -20.619 -1952.179 -672.075
color.C color^4 color^5 color^6 clarity.L clarity.Q clarity.C clarity^4
-165.277 38.193 -95.780 -48.452 4097.613 -1925.133 982.322 -364.976
clarity^5 clarity^6 clarity^7 depth table x z
233.635 6.871 90.622 -64.003 -26.501 -1000.354 -47.925
library(ggplot2)
diamonds
LS0tDQp0aXRsZTogIkRhdGEgTWluaW5nICYgQmlnIERhdGEgLSBDbGFzZSA0Ig0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyMjIFN0ZXB3aXNlIE1vZGVsIFNlbGVjdGlvbg0KDQpgYGB7cn0NCmRhdGFfc2V0PC1yZWFkLmNzdigiQWR2ZXJ0aXNpbmcuY3N2IikNCmRhdGFfc2V0PC1kYXRhX3NldFsgLCBjKDI6NSldDQpkYXRhX3NldA0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeShjYXJldCkNCmBgYA0KDQpgYGB7cn0NCnNldC5zZWVkKDEyMzQpDQprX2Zsb2RzVHJhaW48LXRyYWluQ29udHJvbChtZXRob2QgPSAicmVwZWF0ZWRjdiIsDQogICAgICAgICAgICAgICAgICAgICAgICAgICBudW1iZXIgPSAxMCwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgIHJlcGVhdHMgPSA1LA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgdmVyYm9zZUl0ZXIgPSBUUlVFKQ0KYGBgDQoNCg0KYGBge3J9DQp0cmFpbkluZGV4czwtY3JlYXRlRGF0YVBhcnRpdGlvbihkYXRhX3NldCRUViwgcD0wLjcsIGxpc3QgPSBGQUxTRSkNCnRyYWluRGF0YTwtZGF0YV9zZXRbdHJhaW5JbmRleHMsIF0NCnRlc3REYXRhPC1kYXRhX3NldFstdHJhaW5JbmRleHMsIF0NCnRyYWluRGF0YQ0KYGBgDQoNCmBgYHtyfQ0KbW9kZWxvPC10cmFpbihTYWxlcyB+IFRWLA0KICAgICAgICAgICAgICBkYXRhID0gdHJhaW5EYXRhLA0KICAgICAgICAgICAgICBtZXRob2Q9ImxtIiwNCiAgICAgICAgICAgICAgdHJDb250cm9sPWtfZmxvZHNUcmFpbikNCm1vZGVsbw0KYGBgDQoNCg0KYGBge3J9DQpzdW1tYXJ5KG1vZGVsbykNCmBgYA0KDQpgYGB7cn0NCnByZWRpY3Qob2JqZWN0ID0gbW9kZWxvLCBuZXdkYXRhID0gdGVzdERhdGEpDQpgYGANCg0KIyMjIEZvcndhcmQgU2VsZWN0aW9uDQoNCg0KYGBge3J9DQpudWxsX21vZGVsPC1sbShmb3JtdWxhID0gcHJpY2UgfiAxLCBkYXRhID0gZGlhbW9uZHMpDQpzdW1tYXJ5KG51bGxfbW9kZWwpDQpgYGANCg0KYGBge3J9DQpmdWxsX21vZGVsPC1sbShmb3JtdWxhID0gcHJpY2UgfiAuLCBkYXRhID0gZGlhbW9uZHMpDQpzdW1tYXJ5KGZ1bGxfbW9kZWwpDQpgYGANCg0KDQpgYGB7cn0NCmZvcndhcmRfc2VsZWN0aW9uPC1zdGVwKA0KICBvYmplY3QgPSBudWxsX21vZGVsLA0KICBzY29wZSA9IGxpc3QobG93ZXIgPSBudWxsX21vZGVsLCB1cHBlciA9IGZ1bGxfbW9kZWwpLA0KICBkaXJlY3Rpb24gPSAiZm9yd2FyZCINCikNCmZvcndhcmRfc2VsZWN0aW9uDQpgYGANCg0KDQojIyMgQmFja3dhcmQgRWxpbWluYXRpb24NCmBgYHtyfQ0KYmFja3dhcmRfZWxpbWluYXRpb248LXN0ZXAoDQogIG9iamVjdCA9IGZ1bGxfbW9kZWwsDQogIHNjb3BlID0gbGlzdChsb3dlciA9IG51bGxfbW9kZWwsIHVwcGVyID0gZnVsbF9tb2RlbCksDQogIGRpcmVjdGlvbiA9ICJiYWNrd2FyZCINCikNCmJhY2t3YXJkX2VsaW1pbmF0aW9uDQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoZ2dwbG90MikNCmRpYW1vbmRzDQpgYGANCg0K