library(ISLR2)
library(MASS)
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
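Note that both ISLR2 and MASS provide a data set named Boston; because MASS is attached second, it masks the ISLR2 version, so the 14-column MASS data frame (which still includes black) is the one used below. A quick check of the two versions, as a sketch:
dim(MASS::Boston)  # 506 rows, 14 columns (includes black)
dim(ISLR2::Boston) # 506 rows, 13 columns (black removed in ISLR2)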
We will seek to predict medv (median home value) using the 13 remaining variables as predictors, such as rm (average number of rooms per house), age (proportion of owner-occupied units built prior to 1940), and lstat (percent of households with low socioeconomic status).
lmf <- lm(medv ~ ., data = Boston) # "." means all variables other than the response
summary(lmf)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
coef(lmf)
## (Intercept) crim zn indus chas
## 3.645949e+01 -1.080114e-01 4.642046e-02 2.055863e-02 2.686734e+00
## nox rm age dis rad
## -1.776661e+01 3.809865e+00 6.922246e-04 -1.475567e+00 3.060495e-01
## tax ptratio black lstat
## -1.233459e-02 -9.527472e-01 9.311683e-03 -5.247584e-01
confint(lmf)
## 2.5 % 97.5 %
## (Intercept) 26.432226009 46.486750761
## crim -0.172584412 -0.043438304
## zn 0.019448778 0.073392139
## indus -0.100267941 0.141385193
## chas 0.993904193 4.379563446
## nox -25.271633564 -10.261588893
## rm 2.988726773 4.631003640
## age -0.025262320 0.026646769
## dis -1.867454981 -1.083678710
## rad 0.175692169 0.436406789
## tax -0.019723286 -0.004945902
## ptratio -1.209795296 -0.695699168
## black 0.004034306 0.014589060
## lstat -0.624403622 -0.425113133
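Individual components of the fit can also be extracted directly; for example, the R-squared and residual standard error are stored in the summary object. A minimal sketch:
sfit <- summary(lmf)
sfit$r.squared # multiple R-squared (0.7406 above)
sfit$sigma     # residual standard error (4.745 above)
names(sfit)    # all components available in the summary object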
What if we would like to perform a regression using all of the variables but one?
lmf_noindus <- lm(medv ~ . - indus, data = Boston)
# equivalently, modify the existing fit with update():
lmf_noindus <- update(lmf, ~ . - indus)
summary(lmf_noindus)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + age + dis +
## rad + tax + ptratio + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.587 -2.737 -0.506 1.742 26.212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.636e+01 5.091e+00 7.143 3.30e-12 ***
## crim -1.084e-01 3.281e-02 -3.304 0.001022 **
## zn 4.593e-02 1.364e-02 3.368 0.000816 ***
## chas 2.716e+00 8.562e-01 3.173 0.001605 **
## nox -1.743e+01 3.681e+00 -4.735 2.87e-06 ***
## rm 3.797e+00 4.158e-01 9.132 < 2e-16 ***
## age 6.971e-04 1.320e-02 0.053 0.957898
## dis -1.490e+00 1.948e-01 -7.648 1.08e-13 ***
## rad 2.999e-01 6.367e-02 4.710 3.22e-06 ***
## tax -1.178e-02 3.378e-03 -3.489 0.000529 ***
## ptratio -9.471e-01 1.296e-01 -7.308 1.10e-12 ***
## black 9.282e-03 2.682e-03 3.461 0.000586 ***
## lstat -5.235e-01 5.052e-02 -10.361 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.741 on 493 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7343
## F-statistic: 117.3 on 12 and 493 DF, p-value: < 2.2e-16
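The same syntax extends to excluding several variables at once; for instance, a sketch dropping both non-significant predictors, indus and age (output not shown):
lmf_small <- update(lmf, ~ . - indus - age) # drop two predictors from the full model
summary(lmf_small)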
It is easy to include interaction terms in a model formula: lstat:age adds only the interaction between lstat and age, while lstat * age is shorthand for lstat + age + lstat:age, i.e. both main effects plus their interaction.
summary(lm(medv ~ lstat * age, data = Boston))
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
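For reference, the shorthand above fits exactly the same model as the fully written-out formula; a sketch (its output is identical to that shown above):
lm(medv ~ lstat + age + lstat:age, data = Boston) # equivalent to medv ~ lstat * age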
The lm() function can also accommodate non-linear transformations of the predictors. For instance, given a predictor X, we can create a predictor X^2 using I(X^2); the wrapper I() is needed because ^ has a special meaning in a formula.
lm.fit2 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(lm.fit2)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
We next compare a simple linear fit of medv on lstat with a fit that uses the log-transformed predictor.
lm.org <- lm(medv ~ lstat, data = Boston)      # original (untransformed) predictor
lm.log <- lm(medv ~ log(lstat), data = Boston) # log-transformed predictor
head(residuals(lm.log)) # first few of the 506 residuals
##           1           2           3           4           5           6 
## -8.08743078 -2.90860389 -0.02919722 -5.26515659  4.96029366 -2.82391533
head(fitted(lm.log)) # first few of the 506 fitted values
##        1        2        3        4        5        6 
## 32.08743 24.50860 34.72920 38.66516 31.23971 31.52392
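Rather than scanning raw residuals, diagnostic plots and an information criterion summarize the comparison between the two fits more compactly; a minimal sketch:
plot(fitted(lm.log), residuals(lm.log),
     xlab = "Fitted values", ylab = "Residuals") # residuals vs fitted for the log model
abline(h = 0, lty = 2)
AIC(lm.org, lm.log) # lower AIC indicates the better-fitting model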
Next we use the anova() function to compare the quadratic fit lm.fit2 with the simple linear fit lm.org. The null hypothesis is that the two models fit the data equally well, and the alternative hypothesis is that the full (quadratic) model is superior.
anova(lm.fit2, lm.org)
## Analysis of Variance Table
##
## Model 1: medv ~ lstat + I(lstat^2)
## Model 2: medv ~ lstat
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 503 15347
## 2 504 19472 -1 -4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The essentially zero p-value provides very clear evidence that the model containing both lstat and lstat^2 is far superior to the model that contains only lstat.
We can likewise include a cubic term of the form I(X^3).
summary(lm(medv ~lstat +I(lstat^2)+ I(lstat^3),data=Boston))
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2) + I(lstat^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.6496253 1.4347240 33.909 < 2e-16 ***
## lstat -3.8655928 0.3287861 -11.757 < 2e-16 ***
## I(lstat^2) 0.1487385 0.0212987 6.983 9.18e-12 ***
## I(lstat^3) -0.0020039 0.0003997 -5.013 7.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6558
## F-statistic: 321.7 on 3 and 502 DF, p-value: < 2.2e-16
dd <- lm(medv ~ ., data = Boston) # "." in lm() means all variables other than the response
summary(update(dd, . ~ . + I(lstat^2))) # in update(), "." stands for the corresponding part of the old formula
##
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2282 -2.4793 -0.3538 1.8270 24.4107
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.581458 4.651116 9.370 < 2e-16 ***
## crim -0.150565 0.029909 -5.034 6.75e-07 ***
## zn 0.024663 0.012548 1.965 0.04992 *
## indus 0.045740 0.055515 0.824 0.41039
## chas 2.421636 0.777481 3.115 0.00195 **
## nox -16.284075 3.447939 -4.723 3.04e-06 ***
## rm 3.043719 0.383719 7.932 1.47e-14 ***
## age 0.028707 0.012200 2.353 0.01902 *
## dis -1.192984 0.181834 -6.561 1.36e-10 ***
## rad 0.294976 0.059849 4.929 1.13e-06 ***
## tax -0.010900 0.003394 -3.211 0.00141 **
## ptratio -0.817451 0.118676 -6.888 1.74e-11 ***
## black 0.007809 0.002427 3.218 0.00138 **
## lstat -1.771125 0.125465 -14.117 < 2e-16 ***
## I(lstat^2) 0.034814 0.003263 10.668 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.28 on 491 degrees of freedom
## Multiple R-squared: 0.7894, Adjusted R-squared: 0.7834
## F-statistic: 131.5 on 14 and 491 DF, p-value: < 2.2e-16
However, this approach can start to get cumbersome for higher-order polynomials. A better approach is to use the poly() function.
lm.fit5 <- lm(medv ~ poly(lstat, 5), data = Boston)
summary(lm.fit5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 5), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
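By default poly() produces orthogonal polynomials, so the individual coefficients above are not directly comparable to those from the I(lstat^2) and I(lstat^3) fits, although the fitted values and overall fit are unchanged. Setting raw = TRUE gives the raw polynomial basis instead; a sketch:
lm.fit5raw <- lm(medv ~ poly(lstat, 5, raw = TRUE), data = Boston) # raw polynomial basis
summary(lm.fit5raw)$r.squared # same R-squared as lm.fit5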
A log transformation of a predictor is another option; here we regress medv on log(rm).
summary(lm(medv~ log(rm), data = Boston))
##
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.487 -2.875 -0.104 2.837 39.816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76.488 5.028 -15.21 <2e-16 ***
## log(rm) 54.055 2.739 19.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared: 0.4358, Adjusted R-squared: 0.4347
## F-statistic: 389.3 on 1 and 504 DF, p-value: < 2.2e-16
We will now examine the Carseats data from the ISLR2 package, in which we attempt to predict Sales (child car seat sales) at 400 stores using a number of predictors.
str(Carseats)
## 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : num 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : num 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: num 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : num 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : num 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : num 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : num 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
Below we fit a multiple regression model that includes all predictors plus two interaction terms, Income:Advertising and Price:Age.
lm.fit <- lm(Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
summary(lm.fit)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Price:Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9208 -0.7503 0.0177 0.6754 3.3413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
## CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
## Income 0.0108940 0.0026044 4.183 3.57e-05 ***
## Advertising 0.0702462 0.0226091 3.107 0.002030 **
## Population 0.0001592 0.0003679 0.433 0.665330
## Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
## ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
## ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
## Age -0.0579466 0.0159506 -3.633 0.000318 ***
## Education -0.0208525 0.0196131 -1.063 0.288361
## UrbanYes 0.1401597 0.1124019 1.247 0.213171
## USYes -0.1575571 0.1489234 -1.058 0.290729
## Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
## Price:Age 0.0001068 0.0001333 0.801 0.423812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
## F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
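To check whether the two interaction terms jointly improve on the purely additive model, the nested fits can be compared with an F-test via anova(); a sketch (output not shown):
lm.add <- lm(Sales ~ ., data = Carseats) # additive model without the interactions
anova(lm.add, lm.fit)                    # joint F-test for Income:Advertising and Price:Age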
The predictor ShelveLoc is qualitative: a factor with levels Bad, Good, and Medium indicating the quality of the shelving location at each store.
class(Carseats$ShelveLoc)
## [1] "factor"
levels(Carseats$ShelveLoc)
## [1] "Bad" "Good" "Medium"
xtabs(~Carseats$ShelveLoc)
## Carseats$ShelveLoc
## Bad Good Medium
## 96 85 219
R has created a ShelveLocGood dummy variable that takes on a value of 1 if the shelving location is good, and 0 otherwise. It has also created a ShelveLocMedium dummy variable that equals 1 if the shelving location is medium, and 0 otherwise. A bad shelving location corresponds to a zero for each of the two dummy variables. The fact that the coefficient for ShelveLocGood in the regression output is positive indicates that a good shelving location is associated with high sales (relative to a bad location). And ShelveLocMedium has a smaller positive coefficient, indicating that a medium shelving location is associated with higher sales than a bad shelving location but lower sales than a good shelving location.
Given a qualitative variable such as ShelveLoc, R generates the dummy variables automatically. The contrasts() function returns the coding that R uses for them.
contrasts(Carseats$ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
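If a different baseline is more convenient, the reference level can be changed before refitting; a sketch using relevel() to make Medium the baseline:
Carseats$ShelveLoc <- relevel(Carseats$ShelveLoc, ref = "Medium") # make "Medium" the reference level
contrasts(Carseats$ShelveLoc)                                     # dummies are now Bad and Good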
This problem involves the Prestige dataset from the carData package. Answer the following questions, supporting each answer with evidence:
- Remove rows with missing values using na.omit().
- Build a model without the type variable. Which factors are significant predictors of prestige?
- Conduct model selection using anova() and step().
- What do your final model results suggest about the factors that most strongly influence occupational prestige?
- Include type (occupation type: blue-collar, white-collar, professional) as a categorical predictor. Which types of occupations have the highest prestige?
- Try a log transformation of income.
# It is usually a good idea to check the normality of the residuals as part of modeling,
# e.g. with a Q-Q plot: qqPlot(p5), where qqPlot() comes from the car package and p5 is the fitted model.
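A possible starting point for this exercise is sketched below; the object names (pres, fit_no_type, and so on) are illustrative only, and the resulting summaries are left for the reader to interpret.
library(carData)

pres <- na.omit(Prestige) # remove rows with missing values (type is NA for a few occupations)

# Model without the type variable
fit_no_type <- lm(prestige ~ education + income + women, data = pres)
summary(fit_no_type)

# Model selection with step() and a nested-model comparison with anova()
fit_step <- step(fit_no_type)
anova(fit_step, fit_no_type)

# Include type (blue-collar, white-collar, professional) as a categorical predictor
fit_type <- lm(prestige ~ education + income + women + type, data = pres)
summary(fit_type)

# Log-transform income
fit_log <- lm(prestige ~ education + log(income) + women + type, data = pres)
summary(fit_log)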