library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(BloodBrain)
summary(bbbDescr)
## tpsa nbasic negative vsa_hyd
## Min. : 3.24 Min. :0.0000 Min. :0.000000 Min. : 65.44
## 1st Qu.: 35.83 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:163.88
## Median : 45.88 Median :0.0000 Median :0.000000 Median :231.43
## Mean : 52.87 Mean :0.3654 Mean :0.004808 Mean :234.94
## 3rd Qu.: 72.68 3rd Qu.:1.0000 3rd Qu.:0.000000 3rd Qu.:300.54
## Max. :166.75 Max. :1.0000 Max. :1.000000 Max. :493.00
## a_aro weight peoe_vsa.0 peoe_vsa.1
## Min. : 0.000 Min. :129.2 Min. : 0.00 Min. : 0.00
## 1st Qu.: 6.000 1st Qu.:250.4 1st Qu.: 38.73 1st Qu.: 40.97
## Median :11.000 Median :303.8 Median : 63.86 Median : 76.57
## Mean : 9.707 Mean :310.6 Mean : 63.18 Mean : 75.77
## 3rd Qu.:12.000 3rd Qu.:376.9 3rd Qu.: 84.91 3rd Qu.:107.16
## Max. :21.000 Max. :671.9 Max. :149.28 Max. :162.36
## peoe_vsa.2 peoe_vsa.3 peoe_vsa.4 peoe_vsa.5
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 9.425 Median : 5.442 Median : 0.000 Median : 0.00
## Mean : 14.955 Mean :10.977 Mean : 5.786 Mean : 3.96
## 3rd Qu.: 24.869 3rd Qu.:17.238 3rd Qu.:10.324 3rd Qu.: 6.70
## Max. :120.051 Max. :94.245 Max. :44.788 Max. :29.42
## peoe_vsa.6 peoe_vsa.0.1 peoe_vsa.1.1 peoe_vsa.2.1
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 8.619 1st Qu.: 24.51 1st Qu.: 24.51 1st Qu.: 0.000
## Median :17.238 Median : 38.73 Median : 54.09 Median : 0.000
## Mean :16.365 Mean : 44.49 Mean : 55.57 Mean : 2.992
## 3rd Qu.:24.301 3rd Qu.: 61.27 3rd Qu.: 73.53 3rd Qu.: 0.000
## Max. :68.301 Max. :143.62 Max. :227.81 Max. :44.603
## peoe_vsa.3.1 peoe_vsa.4.1 peoe_vsa.5.1 peoe_vsa.6.1
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 2.504 Median : 2.504
## Mean : 3.008 Mean : 4.090 Mean : 8.295 Mean : 6.419
## 3rd Qu.: 0.000 3rd Qu.: 5.683 3rd Qu.:13.567 3rd Qu.:10.079
## Max. :35.863 Max. :35.182 Max. :54.268 Max. :49.076
## a_acc a_acid a_base vsa_acc
## Min. :0.000 Min. :0.00000 Min. :0.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.: 4.382
## Median :2.000 Median :0.00000 Median :1.000 Median :13.567
## Mean :2.082 Mean :0.07212 Mean :1.154 Mean :17.652
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:29.638
## Max. :6.000 Max. :3.00000 Max. :6.000 Max. :63.569
## vsa_acid vsa_base vsa_don vsa_other
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:15.16
## Median : 0.0000 Median : 0.000 Median : 5.683 Median :23.99
## Mean : 0.9784 Mean : 5.326 Mean : 8.262 Mean :26.85
## 3rd Qu.: 0.0000 3rd Qu.: 7.103 3rd Qu.:11.365 3rd Qu.:36.69
## Max. :40.7008 Max. :58.215 Max. :58.910 Max. :70.45
## vsa_pol slogp_vsa0 slogp_vsa1 slogp_vsa2
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 8.592 1st Qu.: 5.259 1st Qu.: 0.00
## Median : 0.000 Median : 32.516 Median :16.786 Median :10.45
## Mean : 3.415 Mean : 31.122 Mean :18.399 Mean :16.02
## 3rd Qu.: 0.000 3rd Qu.: 47.021 3rd Qu.:28.590 3rd Qu.:23.86
## Max. :27.134 Max. :117.376 Max. :70.450 Max. :90.00
## slogp_vsa3 slogp_vsa4 slogp_vsa5 slogp_vsa6
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 12.77 1st Qu.: 3.186 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 35.67 Median : 6.371 Median : 19.36 Median : 0.000
## Mean : 33.40 Mean : 9.723 Mean : 29.03 Mean : 1.396
## 3rd Qu.: 43.20 3rd Qu.:10.930 3rd Qu.: 38.14 3rd Qu.: 3.501
## Max. :154.98 Max. :56.423 Max. :110.53 Max. :12.322
## slogp_vsa7 slogp_vsa8 slogp_vsa9 smr_vsa0
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 63.14 1st Qu.: 0.00 1st Qu.: 28.93 1st Qu.: 3.124
## Median :109.97 Median : 18.87 Median : 55.46 Median : 19.717
## Mean :103.55 Mean : 29.20 Mean : 60.23 Mean : 25.514
## 3rd Qu.:141.15 3rd Qu.: 41.72 3rd Qu.: 90.34 3rd Qu.: 47.724
## Max. :247.00 Max. :150.95 Max. :179.41 Max. :109.229
## smr_vsa1 smr_vsa2 smr_vsa3 smr_vsa4
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 3.982 1st Qu.: 0.000
## Median : 23.69 Median : 4.411 Median : 9.872 Median : 2.757
## Mean : 29.35 Mean :13.913 Mean :12.267 Mean : 8.367
## 3rd Qu.: 47.84 3rd Qu.:19.910 3rd Qu.:18.880 3rd Qu.: 7.841
## Max. :132.82 Max. :86.957 Max. :43.161 Max. :78.119
## smr_vsa5 smr_vsa6 smr_vsa7 tpsa.1
## Min. : 16.79 Min. : 0.00 Min. : 0.00 Min. : 4.44
## 1st Qu.: 88.15 1st Qu.: 15.35 1st Qu.: 33.33 1st Qu.: 38.03
## Median :142.16 Median : 35.67 Median : 66.65 Median : 50.41
## Mean :140.81 Mean : 35.15 Mean : 66.71 Mean : 55.08
## 3rd Qu.:196.53 3rd Qu.: 42.25 3rd Qu.: 99.98 3rd Qu.: 73.19
## Max. :326.15 Max. :154.98 Max. :171.65 Max. :167.95
## logp.o.w. frac.anion7. frac.cation7. andrewbind
## Min. :-2.075 Min. :0.00000 Min. :0.0000 Min. :-3.700
## 1st Qu.: 1.808 1st Qu.:0.00000 1st Qu.:0.0020 1st Qu.: 4.875
## Median : 2.781 Median :0.00000 Median :0.9610 Median : 9.500
## Mean : 2.865 Mean :0.04536 Mean :0.6313 Mean : 9.738
## 3rd Qu.: 3.839 3rd Qu.:0.00000 3rd Qu.:0.9960 3rd Qu.:14.400
## Max. : 7.653 Max. :0.99900 Max. :0.9990 Max. :35.500
## rotatablebonds mlogp clogp mw
## Min. : 0.000 Min. :-0.6995 Min. :-2.715 Min. :128.2
## 1st Qu.: 2.000 1st Qu.: 1.9553 1st Qu.: 1.365 1st Qu.:249.4
## Median : 4.000 Median : 2.8489 Median : 2.541 Median :303.3
## Mean : 4.659 Mean : 2.7255 Mean : 2.685 Mean :309.7
## 3rd Qu.: 7.000 3rd Qu.: 3.4994 3rd Qu.: 4.188 3rd Qu.:375.2
## Max. :14.000 Max. : 5.2125 Max. : 7.397 Max. :670.9
## nocount hbdnr rule.of.5violations alert
## Min. : 1.000 Min. :0.000 Min. :0.0000 Min. :0.000000
## 1st Qu.: 3.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.000000
## Median : 4.000 Median :1.000 Median :0.0000 Median :0.000000
## Mean : 4.303 Mean :1.452 Mean :0.2212 Mean :0.009615
## 3rd Qu.: 6.000 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:0.000000
## Max. :11.000 Max. :6.000 Max. :3.0000 Max. :1.000000
## prx ub pol inthb adistm
## Min. : 0 Min. :0.000 Min. :0.00 Min. :0.00000 Min. : 0.0
## 1st Qu.: 0 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:0.00000 1st Qu.: 147.1
## Median : 2 Median :4.200 Median :2.00 Median :0.00000 Median : 614.9
## Mean : 3 Mean :4.203 Mean :1.75 Mean :0.05288 Mean :1011.6
## 3rd Qu.: 5 3rd Qu.:5.300 3rd Qu.:3.00 3rd Qu.:0.00000 3rd Qu.:1251.0
## Max. :12 Max. :7.300 Max. :4.00 Max. :1.00000 Max. :7231.5
## adistd polar_area nonpolar_area psa_npsa
## Min. : 0.000 Min. : 3.577 Min. :179.7 Min. :0.0056
## 1st Qu.: 7.183 1st Qu.: 52.347 1st Qu.:371.7 1st Qu.:0.1038
## Median : 18.885 Median : 77.991 Median :470.8 Median :0.1663
## Mean : 31.378 Mean : 89.958 Mean :474.5 Mean :0.2312
## 3rd Qu.: 46.339 3rd Qu.:124.453 3rd Qu.:588.6 3rd Qu.:0.2716
## Max. :245.085 Max. :281.792 Max. :870.2 Max. :1.1730
## tcsa tcpa tcnp ovality
## Min. :0.00720 Min. :0.02780 Min. :0.00820 Min. :1.074
## 1st Qu.:0.01087 1st Qu.:0.06020 1st Qu.:0.01210 1st Qu.:1.138
## Median :0.01220 Median :0.08875 Median :0.01410 Median :1.183
## Mean :0.01280 Mean :0.14447 Mean :0.01623 Mean :1.187
## 3rd Qu.:0.01415 3rd Qu.:0.13032 3rd Qu.:0.01820 3rd Qu.:1.230
## Max. :0.02180 Max. :1.82840 Max. :0.04330 Max. :1.360
## surface_area volume most_negative_charge most_positive_charge
## Min. : 321.0 Min. : 481.1 Min. :-1.4191 Min. :0.1391
## 1st Qu.: 464.6 1st Qu.: 769.0 1st Qu.:-0.8821 1st Qu.:0.3539
## Median : 552.6 Median : 949.4 Median :-0.7011 Median :0.4782
## Mean : 564.5 Mean : 974.4 Mean :-0.7291 Mean :0.5312
## 3rd Qu.: 653.7 3rd Qu.:1163.8 3rd Qu.:-0.5934 3rd Qu.:0.7303
## Max. :1041.3 Max. :1992.4 Max. :-0.3203 Max. :1.4963
## sum_absolute_charge dipole_moment homo lumo
## Min. : 3.063 Min. :0.4893 Min. :-11.347 Min. :-1.4550
## 1st Qu.: 5.712 1st Qu.:1.9647 1st Qu.: -9.406 1st Qu.:-0.5046
## Median : 6.948 Median :2.8893 Median : -8.969 Median :-0.1563
## Mean : 7.215 Mean :3.1565 Mean : -9.025 Mean :-0.1690
## 3rd Qu.: 8.004 3rd Qu.:3.9541 3rd Qu.: -8.639 3rd Qu.: 0.1278
## Max. :20.300 Max. :9.9364 Max. : -7.325 Max. : 3.4038
## hardness ppsa1 ppsa2 ppsa3
## Min. :3.363 Min. :168.5 Min. : 289.0 Min. : 29.13
## 1st Qu.:4.181 1st Qu.:282.4 1st Qu.: 875.8 1st Qu.: 41.62
## Median :4.401 Median :362.2 Median :1216.4 Median : 46.97
## Mean :4.428 Mean :369.3 Mean :1412.4 Mean : 49.80
## 3rd Qu.:4.632 3rd Qu.:433.7 3rd Qu.:1747.7 3rd Qu.: 55.39
## Max. :6.535 Max. :739.5 Max. :7496.0 Max. :106.33
## pnsa1 pnsa2 pnsa3 fpsa1
## Min. : 51.06 Min. :-3072.94 Min. :-113.80 Min. :0.3946
## 1st Qu.:153.97 1st Qu.: -891.44 1st Qu.: -55.68 1st Qu.:0.5931
## Median :185.87 Median : -668.01 Median : -40.24 Median :0.6663
## Mean :195.20 Mean : -733.18 Mean : -43.89 Mean :0.6511
## 3rd Qu.:235.80 3rd Qu.: -469.95 3rd Qu.: -29.89 3rd Qu.:0.7191
## Max. :326.19 Max. : -99.35 Max. : -10.49 Max. :0.8724
## fpsa2 fpsa3 fnsa1 fnsa2
## Min. :0.8717 Min. :0.05700 Min. :0.1276 Min. :-2.9512
## 1st Qu.:1.8100 1st Qu.:0.07758 1st Qu.:0.2809 1st Qu.:-1.5390
## Median :2.2415 Median :0.08490 Median :0.3337 Median :-1.1742
## Mean :2.3579 Mean :0.08960 Mean :0.3489 Mean :-1.2496
## 3rd Qu.:2.7382 3rd Qu.:0.09638 3rd Qu.:0.4069 3rd Qu.:-0.9328
## Max. :7.1990 Max. :0.18510 Max. :0.6054 Max. :-0.2482
## fnsa3 wpsa1 wpsa2 wpsa3
## Min. :-0.21190 Min. : 54.11 Min. : 95.32 Min. : 11.57
## 1st Qu.:-0.09477 1st Qu.:130.52 1st Qu.: 409.41 1st Qu.: 19.16
## Median :-0.07125 Median :198.62 Median : 672.70 Median : 26.08
## Mean :-0.08017 Mean :220.36 Mean : 893.38 Mean : 29.19
## 3rd Qu.:-0.05185 3rd Qu.:283.56 3rd Qu.:1154.12 3rd Qu.: 35.19
## Max. :-0.02620 Max. :768.99 Max. :7805.33 Max. :110.71
## wnsa1 wnsa2 wnsa3 dpsa1
## Min. : 20.43 Min. :-3199.73 Min. :-92.489 Min. :-100.12
## 1st Qu.: 76.04 1st Qu.: -557.99 1st Qu.:-30.838 1st Qu.: 98.39
## Median :107.91 Median : -355.50 Median :-22.460 Median : 179.08
## Mean :114.48 Mean : -452.91 Mean :-25.329 Mean : 174.08
## 3rd Qu.:147.61 3rd Qu.: -247.09 3rd Qu.:-15.093 3rd Qu.: 248.45
## Max. :315.24 Max. : -39.76 Max. : -4.197 Max. : 485.27
## dpsa2 dpsa3 rpcg rncg
## Min. : 505.2 Min. : 41.46 Min. :0.0405 Min. :0.1005
## 1st Qu.: 1373.5 1st Qu.: 75.26 1st Qu.:0.1104 1st Qu.:0.1598
## Median : 1869.9 Median : 87.66 Median :0.1399 Median :0.2046
## Mean : 2145.6 Mean : 93.69 Mean :0.1499 Mean :0.2137
## 3rd Qu.: 2558.5 3rd Qu.:105.85 3rd Qu.:0.1895 3rd Qu.:0.2430
## Max. :10569.0 Max. :212.13 Max. :0.2839 Max. :0.5338
## wpcs wncs sadh1 sadh2
## Min. :0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.:0.3377 1st Qu.: 0.464 1st Qu.: 11.10 1st Qu.: 8.541
## Median :0.8821 Median : 1.262 Median : 20.92 Median :16.142
## Mean :1.2163 Mean : 2.087 Mean : 25.34 Mean :13.888
## 3rd Qu.:1.6464 3rd Qu.: 2.782 3rd Qu.: 36.37 3rd Qu.:21.361
## Max. :5.8819 Max. :14.050 Max. :134.74 Max. :30.809
## sadh3 chdh1 chdh2 chdh3
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:0.01617 1st Qu.:0.3069 1st Qu.:0.3069 1st Qu.:0.000500
## Median :0.03680 Median :0.4527 Median :0.3591 Median :0.000900
## Mean :0.04909 Mean :0.5737 Mean :0.3057 Mean :0.001069
## 3rd Qu.:0.07750 3rd Qu.:0.8537 3rd Qu.:0.4288 3rd Qu.:0.001600
## Max. :0.28920 Max. :2.6433 Max. :0.4822 Max. :0.005600
## scdh1 scdh2 scdh3 saaa1
## Min. : 0.000 Min. : 0.000 Min. :0.00000 Min. : 2.716
## 1st Qu.: 4.105 1st Qu.: 2.823 1st Qu.:0.00580 1st Qu.: 30.272
## Median : 7.516 Median : 5.836 Median :0.01395 Median : 56.356
## Mean :10.161 Mean : 5.459 Mean :0.01976 Mean : 58.284
## 3rd Qu.:15.960 3rd Qu.: 8.770 3rd Qu.:0.03122 3rd Qu.: 84.312
## Max. :58.806 Max. :12.668 Max. :0.12620 Max. :203.102
## saaa2 saaa3 chaa1 chaa2
## Min. : 1.52 Min. :0.00430 Min. :-2.9676 Min. :-0.8156
## 1st Qu.:11.37 1st Qu.:0.04992 1st Qu.:-1.5539 1st Qu.:-0.4621
## Median :17.44 Median :0.09605 Median :-1.1552 Median :-0.4111
## Mean :18.61 Mean :0.10775 Mean :-1.2148 Mean :-0.4247
## 3rd Qu.:24.50 3rd Qu.:0.14122 3rd Qu.:-0.7564 3rd Qu.:-0.3622
## Max. :43.43 Max. :0.41330 Max. :-0.3993 Max. :-0.1083
## chaa3 scaa1 scaa2 scaa3
## Min. :-0.005200 Min. :-77.547 Min. :-21.4632 Min. :-0.15780
## 1st Qu.:-0.002700 1st Qu.:-33.431 1st Qu.:-10.8593 1st Qu.:-0.05695
## Median :-0.002100 Median :-21.815 Median : -7.1684 Median :-0.03965
## Mean :-0.002169 Mean :-23.398 Mean : -7.8518 Mean :-0.04360
## 3rd Qu.:-0.001500 3rd Qu.:-12.233 3rd Qu.: -5.0563 3rd Qu.:-0.02180
## Max. :-0.000600 Max. : -1.189 Max. : -0.4237 Max. :-0.00170
## ctdh ctaa mchg achg
## Min. :0.000 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:0.8490 1st Qu.:0.8449
## Median :1.000 Median :3.000 Median :0.9255 Median :0.9229
## Mean :1.452 Mean :3.106 Mean :0.8985 Mean :0.8539
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:1.3327 3rd Qu.:1.2282
## Max. :6.000 Max. :7.000 Max. :1.6196 Max. :1.6189
## rdta n_sp2 n_sp3 o_sp2
## Min. :0.0000 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.:0.2000 1st Qu.: 0.00 1st Qu.: 4.419 1st Qu.: 0.00
## Median :0.5000 Median : 0.00 Median : 7.727 Median : 21.59
## Mean :0.5845 Mean : 12.54 Mean :10.660 Mean : 28.78
## 3rd Qu.:0.6667 3rd Qu.: 21.44 3rd Qu.:14.812 3rd Qu.: 49.46
## Max. :3.0000 Max. :105.21 Max. :61.979 Max. :151.82
## o_sp3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 3.292
## Mean :11.576
## 3rd Qu.:18.819
## Max. :78.612
nearZeroVar(bbbDescr)
## [1] 3 16 17 22 25 50 60
names(bbbDescr[nearZeroVar(bbbDescr)])
## [1] "negative" "peoe_vsa.2.1" "peoe_vsa.3.1" "a_acid" "vsa_acid"
## [6] "frac.anion7." "alert"
Yes, columns 3, 16, 17, 22, 25, 50, and 60 (negative, peoe_vsa.2.1, peoe_vsa.3.1, a_acid, vsa_acid, frac.anion7., and alert) are near zero variance, meaning that they might not contribute much to the model’s predictive power.
library(corrplot)
## corrplot 0.94 loaded
bbbCorr <- cor(bbbDescr)
corrplot(bbbCorr, method = "number", order = "hclust", tl.cex = .25)
highCorr <- findCorrelation(bbbCorr, cutoff = .75)
length(highCorr)
## [1] 66
colnames(bbbDescr)[highCorr]
## [1] "vsa_don" "slogp_vsa2" "slogp_vsa7"
## [4] "smr_vsa0" "smr_vsa5" "mw"
## [7] "nocount" "hbdnr" "ub"
## [10] "nonpolar_area" "tcnp" "ovality"
## [13] "surface_area" "volume" "ppsa1"
## [16] "ppsa2" "ppsa3" "pnsa2"
## [19] "pnsa3" "fpsa2" "fnsa2"
## [22] "fnsa3" "wpsa1" "wpsa2"
## [25] "wpsa3" "wnsa1" "wnsa2"
## [28] "wnsa3" "dpsa1" "dpsa2"
## [31] "dpsa3" "sadh1" "sadh3"
## [34] "chdh1" "chdh3" "scdh1"
## [37] "scdh2" "scdh3" "saaa1"
## [40] "saaa3" "scaa1" "scaa2"
## [43] "scaa3" "ctdh" "ctaa"
## [46] "mchg" "vsa_hyd" "tpsa"
## [49] "a_acid" "a_base" "vsa_acc"
## [52] "slogp_vsa3" "weight" "logp.o.w."
## [55] "tpsa.1" "a_acc" "adistm"
## [58] "polar_area" "psa_npsa" "homo"
## [61] "sum_absolute_charge" "fpsa1" "fpsa3"
## [64] "sadh2" "chaa1" "chdh2"
Yes, there are strong relationships between the predictor data. To reduce correlations in the predictor set, we can follow this algorithm: Find the two predictors associated with the largest absolute pairwise correlation (call them predictors A and B). Determine the average correlation between A and the other variables. Do the same for predictor B. If A has a larger average correlation, remove it; otherwise, remove predictor B. Continue until no absolute correlations are above the threshold.
If we remove all highly correlated predictors with 0.75 as the cutoff, we would reduce the number of predictors from 134 to 68.
bbb <- bbbDescr[,-nearZeroVar(bbbDescr)]
bbb[is.na(bbb)] <- 0
for (col in names(bbb)){
bbb[[col]] <- as.numeric(as.character(bbb[[col]]))
}
bbbCorr <- cor(bbb)
highCorr <- findCorrelation(bbbCorr, cutoff = .75)
bbb <- bbb[,-highCorr]
bbbCorr1 <- cor(bbb)
corrplot(bbbCorr1, method = "number", order = "hclust", tl.cex = .25)
The correlation matrix looks less correlated now. At this point, we can handle multicollinearity by removing any predictors with VIF >10. This further reduces the number of predictors from 68 to 21.
library(car)
## Loading required package: carData
multicollinearity <- vif(lm(logBBB ~ ., data = bbb))
final_predictors <- which(multicollinearity<10)
bbb <- bbb[final_predictors]
dim(bbb)
## [1] 208 21
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
bbb.lm <- lm(logBBB ~ ., data = bbb)
bbb.f <- ols_step_forward_p(bbb.lm, penter = 0.05)
bbb.b <- ols_step_backward_p(bbb.lm, prem = 0.1)
bbb.s <- ols_step_both_p(bbb.lm, pent = 0.05, prem = 0.1)
summary(bbb.f$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7480 -0.3017 0.0492 0.3816 1.6119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.293342 0.179437 1.635 0.10373
## dipole_moment -0.081529 0.028357 -2.875 0.00450 **
## pol 0.169293 0.042236 4.008 8.75e-05 ***
## slogp_vsa8 0.005965 0.001496 3.987 9.51e-05 ***
## slogp_vsa6 0.038883 0.017388 2.236 0.02649 *
## achg -0.276001 0.101631 -2.716 0.00722 **
## peoe_vsa.5 -0.016083 0.007200 -2.234 0.02666 *
## frac.cation7. 0.318917 0.109613 2.909 0.00405 **
## smr_vsa4 -0.011677 0.003998 -2.920 0.00392 **
## wpcs -0.083439 0.038200 -2.184 0.03015 *
## peoe_vsa.0.1 -0.003801 0.001766 -2.153 0.03257 *
## rule.of.5violations 0.206264 0.101235 2.037 0.04297 *
## peoe_vsa.2 -0.004987 0.002740 -1.820 0.07032 .
## nbasic -0.143390 0.101049 -1.419 0.15751
## slogp_vsa4 -0.005906 0.004319 -1.368 0.17307
## vsa_pol 0.007454 0.006956 1.071 0.28529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6038 on 192 degrees of freedom
## Multiple R-squared: 0.4433, Adjusted R-squared: 0.3998
## F-statistic: 10.19 on 15 and 192 DF, p-value: < 2.2e-16
summary(bbb.b$model)
##
## Call:
## lm(formula = paste(response, "~", paste(c(include, cterms), collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7480 -0.3017 0.0492 0.3816 1.6119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.293342 0.179437 1.635 0.10373
## nbasic -0.143390 0.101049 -1.419 0.15751
## peoe_vsa.2 -0.004987 0.002740 -1.820 0.07032 .
## peoe_vsa.5 -0.016083 0.007200 -2.234 0.02666 *
## peoe_vsa.0.1 -0.003801 0.001766 -2.153 0.03257 *
## vsa_pol 0.007454 0.006956 1.071 0.28529
## slogp_vsa4 -0.005906 0.004319 -1.368 0.17307
## slogp_vsa6 0.038883 0.017388 2.236 0.02649 *
## slogp_vsa8 0.005965 0.001496 3.987 9.51e-05 ***
## smr_vsa4 -0.011677 0.003998 -2.920 0.00392 **
## frac.cation7. 0.318917 0.109613 2.909 0.00405 **
## rule.of.5violations 0.206264 0.101235 2.037 0.04297 *
## pol 0.169293 0.042236 4.008 8.75e-05 ***
## dipole_moment -0.081529 0.028357 -2.875 0.00450 **
## wpcs -0.083439 0.038200 -2.184 0.03015 *
## achg -0.276001 0.101631 -2.716 0.00722 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6038 on 192 degrees of freedom
## Multiple R-squared: 0.4433, Adjusted R-squared: 0.3998
## F-statistic: 10.19 on 15 and 192 DF, p-value: < 2.2e-16
summary(bbb.s$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7533 -0.3027 0.0669 0.3782 1.7224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.281740 0.175047 1.610 0.109122
## dipole_moment -0.076342 0.027844 -2.742 0.006679 **
## pol 0.161712 0.042048 3.846 0.000163 ***
## slogp_vsa8 0.005777 0.001489 3.879 0.000143 ***
## slogp_vsa6 0.039323 0.017421 2.257 0.025101 *
## achg -0.274588 0.100091 -2.743 0.006648 **
## peoe_vsa.5 -0.020514 0.006583 -3.116 0.002109 **
## frac.cation7. 0.243654 0.099570 2.447 0.015288 *
## smr_vsa4 -0.011968 0.004005 -2.988 0.003169 **
## wpcs -0.083362 0.037871 -2.201 0.028896 *
## peoe_vsa.0.1 -0.004085 0.001736 -2.354 0.019578 *
## rule.of.5violations 0.209855 0.101378 2.070 0.039767 *
## peoe_vsa.2 -0.004514 0.002596 -1.739 0.083616 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6064 on 195 degrees of freedom
## Multiple R-squared: 0.4297, Adjusted R-squared: 0.3946
## F-statistic: 12.24 on 12 and 195 DF, p-value: < 2.2e-16
The forward selection model has 15 predictors. The backward elimination model has 15 predictors. The stepwise regression model has 12 predictors.
library(leaps)
mod.sel <- regsubsets(logBBB ~ ., data = bbb, nvmax = 21)
reg.summary <- summary(mod.sel)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$outmat
## nbasic peoe_vsa.2 peoe_vsa.5 peoe_vsa.0.1 peoe_vsa.4.1 vsa_pol
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " "*" " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " "
## 7 ( 1 ) " " " " "*" " " " " " "
## 8 ( 1 ) " " " " "*" " " " " " "
## 9 ( 1 ) " " " " "*" " " " " " "
## 10 ( 1 ) " " " " "*" "*" " " " "
## 11 ( 1 ) " " " " "*" "*" " " " "
## 12 ( 1 ) " " "*" "*" "*" " " " "
## 13 ( 1 ) "*" "*" "*" "*" " " " "
## 14 ( 1 ) "*" "*" "*" "*" " " " "
## 15 ( 1 ) "*" "*" "*" "*" " " "*"
## 16 ( 1 ) "*" "*" "*" "*" " " "*"
## 17 ( 1 ) "*" "*" "*" "*" " " "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" "*" "*"
## slogp_vsa4 slogp_vsa6 slogp_vsa8 smr_vsa3 smr_vsa4 smr_vsa6
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " " " " "
## 7 ( 1 ) " " "*" "*" " " " " " "
## 8 ( 1 ) " " "*" "*" " " "*" " "
## 9 ( 1 ) " " "*" "*" " " "*" " "
## 10 ( 1 ) " " "*" "*" " " "*" " "
## 11 ( 1 ) " " "*" "*" " " "*" " "
## 12 ( 1 ) " " "*" "*" " " "*" " "
## 13 ( 1 ) " " "*" "*" " " "*" " "
## 14 ( 1 ) "*" "*" "*" " " "*" " "
## 15 ( 1 ) "*" "*" "*" " " "*" " "
## 16 ( 1 ) "*" "*" "*" " " "*" " "
## 17 ( 1 ) "*" "*" "*" " " "*" " "
## 18 ( 1 ) "*" "*" "*" " " "*" " "
## 19 ( 1 ) "*" "*" "*" " " "*" " "
## 20 ( 1 ) "*" "*" "*" "*" "*" " "
## 21 ( 1 ) "*" "*" "*" "*" "*" "*"
## frac.cation7. rule.of.5violations pol inthb tcpa dipole_moment wpcs
## 1 ( 1 ) " " " " " " " " " " "*" " "
## 2 ( 1 ) " " " " "*" " " " " "*" " "
## 3 ( 1 ) " " " " "*" " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " "*" " "
## 6 ( 1 ) " " " " "*" " " " " "*" " "
## 7 ( 1 ) "*" " " "*" " " " " "*" " "
## 8 ( 1 ) "*" " " "*" " " " " "*" " "
## 9 ( 1 ) "*" " " "*" " " " " "*" "*"
## 10 ( 1 ) "*" " " "*" " " " " "*" "*"
## 11 ( 1 ) "*" "*" "*" " " " " "*" "*"
## 12 ( 1 ) "*" "*" "*" " " " " "*" "*"
## 13 ( 1 ) "*" "*" "*" " " " " "*" "*"
## 14 ( 1 ) "*" "*" "*" " " " " "*" "*"
## 15 ( 1 ) "*" "*" "*" " " " " "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## wncs achg
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " "*"
## 5 ( 1 ) " " "*"
## 6 ( 1 ) " " "*"
## 7 ( 1 ) " " "*"
## 8 ( 1 ) " " "*"
## 9 ( 1 ) " " "*"
## 10 ( 1 ) " " "*"
## 11 ( 1 ) " " "*"
## 12 ( 1 ) " " "*"
## 13 ( 1 ) " " "*"
## 14 ( 1 ) " " "*"
## 15 ( 1 ) " " "*"
## 16 ( 1 ) " " "*"
## 17 ( 1 ) " " "*"
## 18 ( 1 ) " " "*"
## 19 ( 1 ) "*" "*"
## 20 ( 1 ) "*" "*"
## 21 ( 1 ) "*" "*"
cat(which.max(reg.summary$rsq), reg.summary$rsq[which.max(reg.summary$rsq)])
## 21 0.449689
cat(which.min(reg.summary$rss), reg.summary$rss[which.min(reg.summary$rss)])
## 21 69.18491
cat(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)])
## 15 0.3998022
cat(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)])
## 12 10.74935
cat(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)])
## 7 -53.78173
For criterion results, RSQ and RSS agree on using all 21 predictors, AdjR2 with 15, Cp with 12, and BIC with 7.
As shown above, many of the methods give different results for number of important predictors but for the most part, share the same predictors. For example, BIC criterion selects peoe_vsa.5, slogp_vsa6, slogp_vsa8, frac.cation7., pol, dipole_moment, and achg. However, these selected predictors are also included within all the selection methods listed in part (i).
\(y = 50 + 20\times GPA + 0.07\times IQ + 35\times Level + 0.01\times GPA:IQ - 10\times GPA:level\) \(y(GPA,IQ,Level=college) - y(GPA,IQ,Level=highschool) = 35 - 10\times GPA)\)
Thus choice (iii) is correct: For a fixed IQ and GPA, high school graduates earn more, on average, than college graduates if GPA >3.5.
\(y = 50 + 20\times 4 + 0.07\times 110 + 35\times 1 + 0.01\times 4\times 110 - 10\times 4\times 1 = 137.1\)
The predicted salary would be $137,100.
False; The magnitude of the coefficient is not an indicator of statistical significance. You have to check the p-value and whether it is greater or less than the significance level.
library(ISLR2)
data(Boston)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
lm.crim <- lm(medv ~crim, data = Boston)
summary(lm.crim)
##
## Call:
## lm(formula = medv ~ crim, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.957 -5.449 -2.007 2.512 29.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.03311 0.40914 58.74 <2e-16 ***
## crim -0.41519 0.04389 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.484 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$crim, Boston$medv)
abline(lm.crim)
lm.zn <- lm(medv ~zn, data = Boston)
summary(lm.zn)
##
## Call:
## lm(formula = medv ~ zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.918 -5.518 -1.006 2.757 29.082
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.91758 0.42474 49.248 <2e-16 ***
## zn 0.14214 0.01638 8.675 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.587 on 504 degrees of freedom
## Multiple R-squared: 0.1299, Adjusted R-squared: 0.1282
## F-statistic: 75.26 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$zn, Boston$medv)
abline(lm.zn)
lm.indus <- lm(medv ~indus, data = Boston)
summary(lm.indus)
##
## Call:
## lm(formula = medv ~ indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.017 -4.917 -1.457 3.180 32.943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.75490 0.68345 43.54 <2e-16 ***
## indus -0.64849 0.05226 -12.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.234, Adjusted R-squared: 0.2325
## F-statistic: 154 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$indus, Boston$medv)
abline(lm.indus)
lm.chas <- lm(medv ~chas, data = Boston)
summary(lm.chas)
##
## Call:
## lm(formula = medv ~ chas, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.094 -5.894 -1.417 2.856 27.906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.0938 0.4176 52.902 < 2e-16 ***
## chas 6.3462 1.5880 3.996 7.39e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.064 on 504 degrees of freedom
## Multiple R-squared: 0.03072, Adjusted R-squared: 0.02879
## F-statistic: 15.97 on 1 and 504 DF, p-value: 7.391e-05
plot(Boston$chas, Boston$medv)
abline(lm.chas)
lm.nox <- lm(medv ~nox, data = Boston)
summary(lm.nox)
##
## Call:
## lm(formula = medv ~ nox, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.691 -5.121 -2.161 2.959 31.310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.346 1.811 22.83 <2e-16 ***
## nox -33.916 3.196 -10.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.323 on 504 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.181
## F-statistic: 112.6 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$nox, Boston$medv)
abline(lm.nox)
lm.rm <- lm(medv ~rm, data = Boston)
summary(lm.rm)
##
## Call:
## lm(formula = medv ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.346 -2.547 0.090 2.986 39.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.671 2.650 -13.08 <2e-16 ***
## rm 9.102 0.419 21.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.616 on 504 degrees of freedom
## Multiple R-squared: 0.4835, Adjusted R-squared: 0.4825
## F-statistic: 471.8 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$rm, Boston$medv)
abline(lm.rm)
lm.age <- lm(medv ~age, data = Boston)
summary(lm.age)
##
## Call:
## lm(formula = medv ~ age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.097 -5.138 -1.958 2.397 31.338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.97868 0.99911 31.006 <2e-16 ***
## age -0.12316 0.01348 -9.137 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.527 on 504 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.1404
## F-statistic: 83.48 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$age, Boston$medv)
abline(lm.age)
lm.dis <- lm(medv ~dis, data = Boston)
summary(lm.dis)
##
## Call:
## lm(formula = medv ~ dis, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.016 -5.556 -1.865 2.288 30.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.3901 0.8174 22.499 < 2e-16 ***
## dis 1.0916 0.1884 5.795 1.21e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.914 on 504 degrees of freedom
## Multiple R-squared: 0.06246, Adjusted R-squared: 0.0606
## F-statistic: 33.58 on 1 and 504 DF, p-value: 1.207e-08
plot(Boston$dis, Boston$medv)
abline(lm.dis)
lm.rad <- lm(medv ~rad, data = Boston)
summary(lm.rad)
##
## Call:
## lm(formula = medv ~ rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.770 -5.199 -1.967 3.321 33.292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.38213 0.56176 46.964 <2e-16 ***
## rad -0.40310 0.04349 -9.269 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.509 on 504 degrees of freedom
## Multiple R-squared: 0.1456, Adjusted R-squared: 0.1439
## F-statistic: 85.91 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$rad, Boston$medv)
abline(lm.rad)
lm.tax <- lm(medv ~tax, data = Boston)
summary(lm.tax)
##
## Call:
## lm(formula = medv ~ tax, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.091 -5.173 -2.085 3.158 34.058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.970654 0.948296 34.77 <2e-16 ***
## tax -0.025568 0.002147 -11.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.133 on 504 degrees of freedom
## Multiple R-squared: 0.2195, Adjusted R-squared: 0.218
## F-statistic: 141.8 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$tax, Boston$medv)
abline(lm.tax)
lm.ptratio <- lm(medv ~ptratio, data = Boston)
summary(lm.ptratio)
##
## Call:
## lm(formula = medv ~ ptratio, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.8342 -4.8262 -0.6426 3.1571 31.2303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.345 3.029 20.58 <2e-16 ***
## ptratio -2.157 0.163 -13.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.931 on 504 degrees of freedom
## Multiple R-squared: 0.2578, Adjusted R-squared: 0.2564
## F-statistic: 175.1 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$ptratio, Boston$medv)
abline(lm.ptratio)
lm.lstat <- lm(medv ~lstat, data = Boston)
summary(lm.lstat)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
plot(Boston$lstat, Boston$medv)
abline(lm.lstat)
I chose medv (median value of homes) as my response variable. In all of the linear models, there is a statistically significant association between the predictor and response, as shown with p<<0.05 and the plots.
lm.multiple <- lm(medv ~ ., data = Boston)
summary(lm.multiple)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1304 -2.7673 -0.5814 1.9414 26.2526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.617270 4.936039 8.431 3.79e-16 ***
## crim -0.121389 0.033000 -3.678 0.000261 ***
## zn 0.046963 0.013879 3.384 0.000772 ***
## indus 0.013468 0.062145 0.217 0.828520
## chas 2.839993 0.870007 3.264 0.001173 **
## nox -18.758022 3.851355 -4.870 1.50e-06 ***
## rm 3.658119 0.420246 8.705 < 2e-16 ***
## age 0.003611 0.013329 0.271 0.786595
## dis -1.490754 0.201623 -7.394 6.17e-13 ***
## rad 0.289405 0.066908 4.325 1.84e-05 ***
## tax -0.012682 0.003801 -3.337 0.000912 ***
## ptratio -0.937533 0.132206 -7.091 4.63e-12 ***
## lstat -0.552019 0.050659 -10.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.798 on 493 degrees of freedom
## Multiple R-squared: 0.7343, Adjusted R-squared: 0.7278
## F-statistic: 113.5 on 12 and 493 DF, p-value: < 2.2e-16
All predictors are significant except for indus and age, which have p-value of 0.83 and 0.79 >0.05 respectively. The rest can reject the null hypothesis.
coef_x <- c(coef(lm.crim)[2],
coef(lm.zn)[2],
coef(lm.indus)[2],
coef(lm.chas)[2],
coef(lm.nox)[2],
coef(lm.rm)[2],
coef(lm.age)[2],
coef(lm.dis)[2],
coef(lm.rad)[2],
coef(lm.tax)[2],
coef(lm.ptratio)[2],
coef(lm.lstat)[2])
coef_y <- coef(lm.multiple)[-1]
plot(coef_x, coef_y)
The simple linear regression models found that every predictor were statistically significant, but MLR ruled out indus and age. This may be due to there being a confounding variable that better explains medv than indus and age. The plot is not a perfect 1:1 since the MLR coefficients assume all other variables are held constant unlike in univariate linear regression.
multiple.f <- ols_step_forward_p(lm.multiple, penter = 0.05)
multiple.b <- ols_step_backward_p(lm.multiple, prem = 0.1)
multiple.s <- ols_step_both_p(lm.multiple, pent = 0.05, prem = 0.1)
summary(multiple.f$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1814 -2.7625 -0.6243 1.8448 26.3920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.451747 4.903283 8.454 3.18e-16 ***
## lstat -0.546509 0.047442 -11.519 < 2e-16 ***
## rm 3.672957 0.409127 8.978 < 2e-16 ***
## ptratio -0.930961 0.130423 -7.138 3.39e-12 ***
## dis -1.515951 0.187675 -8.078 5.08e-15 ***
## nox -18.262427 3.565247 -5.122 4.33e-07 ***
## chas 2.871873 0.862591 3.329 0.000935 ***
## zn 0.046191 0.013673 3.378 0.000787 ***
## crim -0.121665 0.032919 -3.696 0.000244 ***
## rad 0.283932 0.063945 4.440 1.11e-05 ***
## tax -0.012292 0.003407 -3.608 0.000340 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.789 on 495 degrees of freedom
## Multiple R-squared: 0.7342, Adjusted R-squared: 0.7289
## F-statistic: 136.8 on 10 and 495 DF, p-value: < 2.2e-16
summary(multiple.b$model)
##
## Call:
## lm(formula = paste(response, "~", paste(c(include, cterms), collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1814 -2.7625 -0.6243 1.8448 26.3920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.451747 4.903283 8.454 3.18e-16 ***
## crim -0.121665 0.032919 -3.696 0.000244 ***
## zn 0.046191 0.013673 3.378 0.000787 ***
## chas 2.871873 0.862591 3.329 0.000935 ***
## nox -18.262427 3.565247 -5.122 4.33e-07 ***
## rm 3.672957 0.409127 8.978 < 2e-16 ***
## dis -1.515951 0.187675 -8.078 5.08e-15 ***
## rad 0.283932 0.063945 4.440 1.11e-05 ***
## tax -0.012292 0.003407 -3.608 0.000340 ***
## ptratio -0.930961 0.130423 -7.138 3.39e-12 ***
## lstat -0.546509 0.047442 -11.519 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.789 on 495 degrees of freedom
## Multiple R-squared: 0.7342, Adjusted R-squared: 0.7289
## F-statistic: 136.8 on 10 and 495 DF, p-value: < 2.2e-16
summary(multiple.s$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1814 -2.7625 -0.6243 1.8448 26.3920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.451747 4.903283 8.454 3.18e-16 ***
## lstat -0.546509 0.047442 -11.519 < 2e-16 ***
## rm 3.672957 0.409127 8.978 < 2e-16 ***
## ptratio -0.930961 0.130423 -7.138 3.39e-12 ***
## dis -1.515951 0.187675 -8.078 5.08e-15 ***
## nox -18.262427 3.565247 -5.122 4.33e-07 ***
## chas 2.871873 0.862591 3.329 0.000935 ***
## zn 0.046191 0.013673 3.378 0.000787 ***
## crim -0.121665 0.032919 -3.696 0.000244 ***
## rad 0.283932 0.063945 4.440 1.11e-05 ***
## tax -0.012292 0.003407 -3.608 0.000340 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.789 on 495 degrees of freedom
## Multiple R-squared: 0.7342, Adjusted R-squared: 0.7289
## F-statistic: 136.8 on 10 and 495 DF, p-value: < 2.2e-16
The methods agree on all 10 predictors: lstat, rm, ptratio, dis, nox, chas, zn, crim, rad, and tax.
mod.sel <- regsubsets(Boston$medv ~ ., data = Boston, nvmax = 12)
reg.summary <- summary(mod.sel)
names(reg.summary)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
reg.summary$outmat
## crim zn indus chas nox rm age dis rad tax ptratio lstat
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " "*" " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " " " "*" " " " " " " " " "*" "*"
## 4 ( 1 ) " " " " " " " " " " "*" " " "*" " " " " "*" "*"
## 5 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " " "*" "*"
## 6 ( 1 ) " " " " " " "*" "*" "*" " " "*" " " " " "*" "*"
## 7 ( 1 ) " " "*" " " "*" "*" "*" " " "*" " " " " "*" "*"
## 8 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" " " " " "*" "*"
## 9 ( 1 ) "*" "*" " " " " "*" "*" " " "*" "*" "*" "*" "*"
## 10 ( 1 ) "*" "*" " " "*" "*" "*" " " "*" "*" "*" "*" "*"
## 11 ( 1 ) "*" "*" " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
cat(which.max(reg.summary$rsq), reg.summary$rsq[which.max(reg.summary$rsq)])
## 12 0.734307
cat(which.min(reg.summary$rss), reg.summary$rss[which.min(reg.summary$rss)])
## 12 11349.42
cat(which.max(reg.summary$adjr2), reg.summary$adjr2[which.max(reg.summary$adjr2)])
## 10 0.7288734
cat(which.min(reg.summary$cp), reg.summary$cp[which.min(reg.summary$cp)])
## 10 9.120223
cat(which.min(reg.summary$bic), reg.summary$bic[which.min(reg.summary$bic)])
## 10 -602.0442
RSQ and RSS agree with using all 12 predictors, but AdjR2, Cp, and BIC agree on 10 predictors, the same 10 as those chosen for forward selection, backward elimination, and stepwise regression. Only indus and age are rejected, confirming that they are not important predictors for medv.