library(MASS)
library(car)
data("iris")
# Fit a linear model of Sepal.Length on the other three measurements plus
# Species, using every row of iris.
RegModel.2 <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species,
  data = iris
)
# Show the internal structure of the fitted "lm" object.
str(object = RegModel.2)
## List of 13
## $ coefficients : Named num [1:6] 2.171 0.496 -0.315 0.829 -0.724 ...
## ..- attr(*, "names")= chr [1:6] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## $ residuals : Named num [1:150] 0.0952 0.1432 -0.0731 -0.2894 -0.0544 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ effects : Named num [1:150] -71.5659 -1.1884 8.4169 3.9326 -0.0587 ...
## ..- attr(*, "names")= chr [1:150] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## $ rank : int 6
## $ fitted.values: Named num [1:150] 5 4.76 4.77 4.89 5.05 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ assign : int [1:6] 0 1 2 3 4 4
## $ qr :List of 5
## ..$ qr : num [1:150, 1:6] -12.2474 0.0816 0.0816 0.0816 0.0816 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:150] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:6] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## .. ..- attr(*, "assign")= int [1:6] 0 1 2 3 4 4
## .. ..- attr(*, "contrasts")=List of 1
## .. .. ..$ Species: chr "contr.treatment"
## ..$ qraux: num [1:6] 1.08 1.02 1.1 1.01 1.02 ...
## ..$ pivot: int [1:6] 1 2 3 4 5 6
## ..$ tol : num 1e-07
## ..$ rank : int 6
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 144
## $ contrasts :List of 1
## ..$ Species: chr "contr.treatment"
## $ xlevels :List of 1
## ..$ Species: chr [1:3] "setosa" "versicolor" "virginica"
## $ call : language lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species, data = iris)
## $ terms :Classes 'terms', 'formula' length 3 Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species
## .. ..- attr(*, "variables")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
## .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## .. .. .. ..$ : chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
## .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
## .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
## .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## $ model :'data.frame': 150 obs. of 5 variables:
## ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## ..$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species
## .. .. ..- attr(*, "variables")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
## .. .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## .. .. .. .. ..$ : chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
## .. .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
## .. .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
## .. .. .. ..- attr(*, "names")= chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
## - attr(*, "class")= chr "lm"
# List the component names stored in the fitted model object.
names(RegModel.2)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "contrasts" "xlevels" "call" "terms"
## [13] "model"
# Print the coefficient table and fit statistics of the full-data iris model.
summary(RegModel.2)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length +
## Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79424 -0.21874 0.00899 0.20255 0.73103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.17127 0.27979 7.760 1.43e-12 ***
## Sepal.Width 0.49589 0.08607 5.761 4.87e-08 ***
## Petal.Width -0.31516 0.15120 -2.084 0.03889 *
## Petal.Length 0.82924 0.06853 12.101 < 2e-16 ***
## Speciesversicolor -0.72356 0.24017 -3.013 0.00306 **
## Speciesvirginica -1.02350 0.33373 -3.067 0.00258 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared: 0.8673, Adjusted R-squared: 0.8627
## F-statistic: 188.3 on 5 and 144 DF, p-value: < 2.2e-16
# Store the full-data model's predictions next to the observations they
# were computed from.
iris$pred <- predict(RegModel.2, newdata = iris)
# Peek at the first rows, now including the new pred column.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species pred
## 1 5.1 3.5 1.4 0.2 setosa 5.004788
## 2 4.9 3.0 1.4 0.2 setosa 4.756844
## 3 4.7 3.2 1.3 0.2 setosa 4.773097
## 4 4.6 3.1 1.5 0.2 setosa 4.889357
## 5 5.0 3.6 1.4 0.2 setosa 5.054377
## 6 5.4 3.9 1.7 0.4 setosa 5.388886
# Demonstration draw: 100 distinct values from 1..150. The result is only
# printed, not stored.
sample.int(150, size = 100, replace = FALSE)
## [1] 138 104 130 92 17 120 150 58 136 26 113 43 55 105 1 47 68
## [18] 19 112 111 99 45 142 83 12 147 95 109 119 84 87 76 98 146
## [35] 73 7 96 29 85 20 108 78 52 123 129 25 18 132 8 23 57
## [52] 79 59 69 77 121 54 91 48 93 49 16 149 140 127 75 40 15
## [69] 35 5 53 33 97 50 41 61 32 145 100 39 116 21 90 38 72
## [86] 82 70 115 86 148 106 133 139 11 131 134 65 13 4 31
# Choose which iris rows go into the training set.
# a = number of rows available, b = number of rows to draw for training.
a <- nrow(iris)
b <- 100
a
## [1] 150
# Fix the RNG seed so the train/test split, and every model fitted on it,
# is identical on each run. The console output recorded in this file came
# from an earlier, unseeded run and will therefore differ.
set.seed(123)
# Draw b distinct row numbers from 1..a. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
random_row_numbs <- sample(a, b, replace = FALSE)
random_row_numbs
## [1] 142 2 59 63 44 20 66 75 9 1 150 41 38 138 50 55 43
## [18] 92 126 102 49 11 123 14 98 94 28 105 5 113 42 10 87 53
## [35] 25 100 114 124 69 12 149 34 71 122 80 118 115 79 30 52 8
## [52] 125 27 72 108 37 23 140 112 111 89 96 139 39 78 73 74 116
## [69] 120 147 91 36 110 95 18 128 117 131 61 24 67 6 65 132 7
## [86] 99 135 54 68 56 82 46 70 109 85 88 101 45 130 64
# The sampled rows form the training set; the rows left over form the test
# set.
iris_train <- iris[random_row_numbs, ]
iris_test <- iris[-random_row_numbs, ]
# Number of held-out observations.
nrow(iris_test)
## [1] 50
# Refit the same iris specification using only the training rows.
RegModel.3 <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species,
  data = iris_train
)
summary(RegModel.3)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length +
## Species, data = iris_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7654 -0.2421 0.0177 0.1894 0.7297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.41627 0.33121 7.295 9.33e-11 ***
## Sepal.Width 0.43405 0.10696 4.058 0.000102 ***
## Petal.Width -0.36676 0.17904 -2.048 0.043309 *
## Petal.Length 0.78367 0.08316 9.424 3.08e-15 ***
## Speciesversicolor -0.52942 0.30828 -1.717 0.089211 .
## Speciesvirginica -0.74473 0.42140 -1.767 0.080424 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3032 on 94 degrees of freedom
## Multiple R-squared: 0.8723, Adjusted R-squared: 0.8655
## F-statistic: 128.4 on 5 and 94 DF, p-value: < 2.2e-16
# Full-data model summary printed a second time, presumably so it can be
# compared with the training-only fit (RegModel.3).
summary(RegModel.2)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length +
## Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79424 -0.21874 0.00899 0.20255 0.73103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.17127 0.27979 7.760 1.43e-12 ***
## Sepal.Width 0.49589 0.08607 5.761 4.87e-08 ***
## Petal.Width -0.31516 0.15120 -2.084 0.03889 *
## Petal.Length 0.82924 0.06853 12.101 < 2e-16 ***
## Speciesversicolor -0.72356 0.24017 -3.013 0.00306 **
## Speciesvirginica -1.02350 0.33373 -3.067 0.00258 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared: 0.8673, Adjusted R-squared: 0.8627
## F-statistic: 188.3 on 5 and 144 DF, p-value: < 2.2e-16
# Scoring: predict the held-out iris rows with the model fitted on the
# training rows, then show the first few results.
iris_test$pred <- predict(RegModel.3, newdata = iris_test)
head(iris_test)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species pred
## 3 4.7 3.2 1.3 0.2 setosa 4.750653
## 4 4.6 3.1 1.5 0.2 setosa 4.863982
## 13 4.8 3.0 1.4 0.1 setosa 4.778886
## 15 5.8 4.0 1.2 0.2 setosa 5.019526
## 16 5.7 4.4 1.5 0.4 setosa 5.354896
## 17 5.4 3.9 1.3 0.4 setosa 4.981137
# Boston data: baseline model of medv on all thirteen predictors, fitted to
# the complete data set. The data is later split 2:1 into training and test
# rows.
data("Boston")
RegModel1.1 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat + nox +
    ptratio + rad + rm + tax + zn,
  data = Boston
)
summary(RegModel1.1)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus +
## lstat + nox + ptratio + rad + rm + tax + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
# Size the Boston training sample: a = rows available, b = 66% of them
# (rounded), approximating a 2:1 train/test split.
a <- nrow(Boston)
b <- round(0.66 * a)
a
## [1] 506
b
## [1] 334
# Fix the RNG seed so the split, and every model fitted on it, is identical
# on each run. The console output recorded in this file came from an
# earlier, unseeded run and will therefore differ.
set.seed(123)
# Draw b distinct row numbers from 1..a. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
random_row_numbs <- sample(a, b, replace = FALSE)
random_row_numbs
## [1] 415 11 406 371 187 394 32 334 1 355 240 106 255 40 155 232 262
## [18] 312 215 459 74 346 416 483 146 497 277 400 438 201 266 405 456 429
## [35] 445 17 104 72 14 330 332 54 237 193 319 356 28 44 259 46 200
## [52] 274 123 122 455 128 260 18 448 446 177 491 61 169 488 361 154 42
## [69] 60 35 49 23 107 282 289 383 467 164 424 482 261 369 287 450 127
## [86] 315 447 115 501 162 276 63 86 401 374 36 34 283 6 301 409 228
## [103] 306 226 349 291 381 48 233 165 427 344 358 297 293 310 422 95 55
## [120] 56 168 210 495 307 460 239 380 341 457 368 366 407 437 403 321 280
## [137] 382 204 378 24 43 317 197 314 425 110 309 119 85 298 503 8 159
## [154] 80 133 499 377 12 462 487 399 198 114 71 143 384 333 285 364 223
## [171] 354 22 340 39 411 170 81 421 21 203 430 208 360 337 238 351 132
## [188] 144 65 102 179 386 2 98 216 58 243 192 136 484 147 414 418 270
## [205] 64 62 247 236 140 433 286 492 67 304 205 494 94 387 175 338 182
## [222] 325 473 227 281 88 504 353 219 161 9 271 453 196 435 51 89 318
## [239] 152 410 395 256 30 199 398 265 326 180 33 41 313 303 443 212 92
## [256] 111 478 145 209 253 148 183 7 254 373 362 163 83 299 13 372 90
## [273] 444 442 481 345 186 500 211 202 496 485 158 419 490 426 249 79 167
## [290] 322 458 138 153 440 284 25 244 26 472 241 87 471 16 339 308 195
## [307] 231 118 417 5 375 502 498 166 476 327 78 379 116 357 75 99 103
## [324] 272 402 323 486 347 73 479 290 257 70 84
# Overfitting check: split Boston into training rows (the sampled ones) and
# held-out test rows (everything else).
Boston_train <- Boston[random_row_numbs, ]
Boston_test <- Boston[-random_row_numbs, ]
# Number of held-out observations.
nrow(Boston_test)
## [1] 172
# Refit the full thirteen-predictor specification on the training rows only.
RegModel1.2 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat + nox +
    ptratio + rad + rm + tax + zn,
  data = Boston_train
)
summary(RegModel1.2)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus +
## lstat + nox + ptratio + rad + rm + tax + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5053 -2.7723 -0.6659 1.4557 25.2114
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.064350 6.562089 5.648 3.58e-08 ***
## age 0.007661 0.016711 0.458 0.646921
## black 0.009131 0.003319 2.751 0.006277 **
## chas 3.872493 1.112600 3.481 0.000570 ***
## crim -0.100624 0.036018 -2.794 0.005525 **
## dis -1.408710 0.253714 -5.552 5.93e-08 ***
## indus 0.054962 0.086047 0.639 0.523447
## lstat -0.581514 0.061089 -9.519 < 2e-16 ***
## nox -18.705425 4.965826 -3.767 0.000197 ***
## ptratio -0.941298 0.165469 -5.689 2.89e-08 ***
## rad 0.305968 0.087327 3.504 0.000524 ***
## rm 3.701278 0.534387 6.926 2.38e-11 ***
## tax -0.012194 0.005184 -2.352 0.019260 *
## zn 0.046799 0.016872 2.774 0.005865 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.921 on 320 degrees of freedom
## Multiple R-squared: 0.7458, Adjusted R-squared: 0.7355
## F-statistic: 72.23 on 13 and 320 DF, p-value: < 2.2e-16
# Predict medv for the held-out Boston rows using the training-set model.
Boston_test$pred <- predict(RegModel1.2, newdata = Boston_test)
head(Boston_test)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## 15 0.63796 0.0 8.14 0 0.538 6.096 84.5 4.4619 4 307 21.0 380.02
## 19 0.80271 0.0 8.14 0 0.538 5.456 36.6 3.7965 4 307 21.0 288.99
## 20 0.72580 0.0 8.14 0 0.538 5.727 69.5 3.7965 4 307 21.0 390.95
## lstat medv pred
## 3 4.03 34.7 30.89124
## 4 2.94 33.4 28.82865
## 10 17.10 18.9 18.87431
## 15 10.26 18.2 19.52558
## 19 11.69 20.2 16.04781
## 20 11.28 18.2 18.48006
# Reduced model: the same specification without nox, rad and tax.
RegModel1.3 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat +
    ptratio + rm + zn,
  data = Boston_train
)
summary(RegModel1.3)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus +
## lstat + ptratio + rm + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5268 -3.0117 -0.8198 1.2548 27.2430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.299522 5.433475 3.552 0.000439 ***
## age -0.010181 0.016596 -0.613 0.540022
## black 0.009086 0.003347 2.714 0.006996 **
## chas 4.014595 1.137272 3.530 0.000476 ***
## crim -0.068776 0.034323 -2.004 0.045927 *
## dis -1.155275 0.247698 -4.664 4.55e-06 ***
## indus -0.097778 0.068636 -1.425 0.155241
## lstat -0.598274 0.062604 -9.556 < 2e-16 ***
## ptratio -0.699146 0.151619 -4.611 5.78e-06 ***
## rm 4.181650 0.540227 7.741 1.28e-13 ***
## zn 0.040445 0.016520 2.448 0.014890 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.072 on 323 degrees of freedom
## Multiple R-squared: 0.7274, Adjusted R-squared: 0.719
## F-statistic: 86.19 on 10 and 323 DF, p-value: < 2.2e-16
# Multicollinearity: variance inflation factor for each predictor in
# RegModel1.3.
vif(RegModel1.3)
## age black chas crim dis indus lstat ptratio
## 2.951697 1.285478 1.076716 1.447176 3.721734 2.783200 2.766016 1.423619
## rm zn
## 1.836707 2.283260
# Drop dis as well and refit on the training rows.
RegModel1.4 <- lm(
  formula = medv ~ age + black + chas + crim + indus + lstat + ptratio +
    rm + zn,
  data = Boston_train
)
summary(RegModel1.4)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat +
## ptratio + rm + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7879 -2.9683 -0.9062 1.4251 28.2141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.623496 5.406809 2.335 0.02017 *
## age 0.020826 0.015686 1.328 0.18522
## black 0.008764 0.003452 2.539 0.01159 *
## chas 4.060463 1.173086 3.461 0.00061 ***
## crim -0.043619 0.034965 -1.247 0.21312
## indus 0.028366 0.065072 0.436 0.66318
## lstat -0.610773 0.064519 -9.467 < 2e-16 ***
## ptratio -0.810364 0.154453 -5.247 2.80e-07 ***
## rm 4.396271 0.555234 7.918 3.89e-14 ***
## zn 0.007420 0.015397 0.482 0.63017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.232 on 324 degrees of freedom
## Multiple R-squared: 0.709, Adjusted R-squared: 0.701
## F-statistic: 87.73 on 9 and 324 DF, p-value: < 2.2e-16
# Recompute the variance inflation factors now that dis has been removed.
vif(RegModel1.4)
## age black chas crim indus lstat ptratio rm
## 2.478075 1.284930 1.076635 1.411437 2.351033 2.760947 1.388405 1.823381
## zn
## 1.863828
# Outliers: Bonferroni outlier test on the studentized residuals of
# RegModel1.4. The result is kept so the flagged rows can be looked up.
outliers_1_4 <- outlierTest(RegModel1.4)
outliers_1_4
## rstudent unadjusted p-value Bonferonni p
## 369 5.894128 9.4761e-09 3.1650e-06
## 372 5.403382 1.2719e-07 4.2480e-05
## 373 4.945052 1.2261e-06 4.0951e-04
# Inspect exactly the observations the test reported. The row labels are
# read from the test result rather than hard-coded: the earlier literal
# c(372, 413, 370, 365) did not match the flagged rows (369, 372, 373), and
# the flagged rows change whenever the training sample changes. The labels
# are the original Boston row names, so they index Boston directly. The
# recorded output that follows still shows the old hard-coded rows.
Boston[names(outliers_1_4$rstudent), ]
## crim zn indus chas nox rm age dis rad tax ptratio black
## 372 9.23230 0 18.1 0 0.631 6.216 100.0 1.1691 24 666 20.2 366.15
## 413 18.81100 0 18.1 0 0.597 4.628 100.0 1.5539 24 666 20.2 28.79
## 370 5.66998 0 18.1 1 0.631 6.683 96.8 1.3567 24 666 20.2 375.33
## 365 3.47428 0 18.1 1 0.718 8.780 82.9 1.9047 24 666 20.2 354.55
## lstat medv
## 372 9.53 50.0
## 413 34.37 17.9
## 370 3.73 50.0
## 365 5.29 21.9
# Heteroscedasticity: gvlma reports a global test of the linear-model
# assumptions for RegModel1.4, one component of which is heteroscedasticity.
library(gvlma)
gvlma(RegModel1.4)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat +
## ptratio + rm + zn, data = Boston_train)
##
## Coefficients:
## (Intercept) age black chas crim
## 12.623496 0.020826 0.008764 4.060463 -0.043619
## indus lstat ptratio rm zn
## 0.028366 -0.610773 -0.810364 4.396271 0.007420
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = RegModel1.4)
##
## Value p-value Decision
## Global Stat 850.6747 0.0000 Assumptions NOT satisfied!
## Skewness 206.9665 0.0000 Assumptions NOT satisfied!
## Kurtosis 544.6416 0.0000 Assumptions NOT satisfied!
## Link Function 98.9745 0.0000 Assumptions NOT satisfied!
## Heteroscedasticity 0.0921 0.7615 Assumptions acceptable.
# Help page lookup for gvlma, left commented out.
#?gvlma
# Variance inflation factors and model summary for RegModel1.4, printed
# again.
vif(RegModel1.4)
## age black chas crim indus lstat ptratio rm
## 2.478075 1.284930 1.076635 1.411437 2.351033 2.760947 1.388405 1.823381
## zn
## 1.863828
summary(RegModel1.4)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat +
## ptratio + rm + zn, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7879 -2.9683 -0.9062 1.4251 28.2141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.623496 5.406809 2.335 0.02017 *
## age 0.020826 0.015686 1.328 0.18522
## black 0.008764 0.003452 2.539 0.01159 *
## chas 4.060463 1.173086 3.461 0.00061 ***
## crim -0.043619 0.034965 -1.247 0.21312
## indus 0.028366 0.065072 0.436 0.66318
## lstat -0.610773 0.064519 -9.467 < 2e-16 ***
## ptratio -0.810364 0.154453 -5.247 2.80e-07 ***
## rm 4.396271 0.555234 7.918 3.89e-14 ***
## zn 0.007420 0.015397 0.482 0.63017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.232 on 324 degrees of freedom
## Multiple R-squared: 0.709, Adjusted R-squared: 0.701
## F-statistic: 87.73 on 9 and 324 DF, p-value: < 2.2e-16
# Compact model: only lstat, ptratio and rm are kept as predictors.
RegModel1.5 <- lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
summary(RegModel1.5)
##
## Call:
## lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.800 -3.175 -1.038 1.660 29.090
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.92854 5.00382 3.983 8.39e-05 ***
## lstat -0.61204 0.05204 -11.760 < 2e-16 ***
## ptratio -0.93983 0.14760 -6.367 6.44e-10 ***
## rm 4.41867 0.54929 8.044 1.58e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.401 on 330 degrees of freedom
## Multiple R-squared: 0.6842, Adjusted R-squared: 0.6813
## F-statistic: 238.3 on 3 and 330 DF, p-value: < 2.2e-16
# Variance inflation factors for the three predictors of RegModel1.5.
vif(RegModel1.5)
## lstat ptratio rm
## 1.685617 1.189784 1.674550
# Outliers: Bonferroni outlier test on the studentized residuals of
# RegModel1.5. The result is kept so the flagged rows can be looked up.
outliers_1_5 <- outlierTest(RegModel1.5)
outliers_1_5
## rstudent unadjusted p-value Bonferonni p
## 369 5.773195 1.7995e-08 6.0103e-06
## 373 5.543177 6.1006e-08 2.0376e-05
## 372 5.299931 2.1275e-07 7.1058e-05
# Inspect exactly the observations the test reported. The row labels are
# read from the test result rather than hard-coded: the earlier literal
# c(372, 413, 370, 365) did not match the flagged rows (369, 373, 372), and
# the flagged rows change whenever the training sample changes. The labels
# are the original Boston row names, so they index Boston directly. The
# recorded output that follows still shows the old hard-coded rows.
Boston[names(outliers_1_5$rstudent), ]
## crim zn indus chas nox rm age dis rad tax ptratio black
## 372 9.23230 0 18.1 0 0.631 6.216 100.0 1.1691 24 666 20.2 366.15
## 413 18.81100 0 18.1 0 0.597 4.628 100.0 1.5539 24 666 20.2 28.79
## 370 5.66998 0 18.1 1 0.631 6.683 96.8 1.3567 24 666 20.2 375.33
## 365 3.47428 0 18.1 1 0.718 8.780 82.9 1.9047 24 666 20.2 354.55
## lstat medv
## 372 9.53 50.0
## 413 34.37 17.9
## 370 3.73 50.0
## 365 5.29 21.9
# Heteroscedasticity: gvlma reports a global test of the linear-model
# assumptions for RegModel1.5, one component of which is heteroscedasticity.
gvlma(RegModel1.5)
##
## Call:
## lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
##
## Coefficients:
## (Intercept) lstat ptratio rm
## 19.9285 -0.6120 -0.9398 4.4187
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = RegModel1.5)
##
## Value p-value Decision
## Global Stat 950.04282 0.0000 Assumptions NOT satisfied!
## Skewness 222.29781 0.0000 Assumptions NOT satisfied!
## Kurtosis 638.03988 0.0000 Assumptions NOT satisfied!
## Link Function 89.68018 0.0000 Assumptions NOT satisfied!
## Heteroscedasticity 0.02495 0.8745 Assumptions acceptable.