getwd()
## [1] "C:/Users/dell/Desktop"
setwd("C:/Users/dell/Desktop")
memory.limit()
## [1] 1535
memory.size()
## [1] 18.04
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 365542 9.8 592000 15.9 460000 12.3
## Vcells 372990 2.9 1023718 7.9 752284 5.8
library(car)
library(MASS)
data(Boston, package="MASS")
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## ptratio black lstat medv
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046
## zn -0.3916785 0.17552032 -0.4129946 0.3604453
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252
## chas -0.1215152 0.04878848 -0.0539293 0.1752602
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208
## rm -0.3555015 0.12806864 -0.6138083 0.6953599
## age 0.2615150 -0.27353398 0.6023385 -0.3769546
## dis -0.2324705 0.29151167 -0.4969958 0.2499287
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867
## black -0.1773833 1.00000000 -0.3660869 0.3334608
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627
## medv -0.5077867 0.33346082 -0.7376627 1.0000000
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(corrgram)
corrgram(Boston)

attach(Boston)
boxplot(medv~black)

plot(medv~black)

plot(medv~rm)

str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
RegModel.1 <-
lm(medv~age+black+chas+crim+dis+indus+lstat+nox+ptratio+rad+rm+tax+zn,
data=Boston)
summary(RegModel.1)
##
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus +
## lstat + nox + ptratio + rad + rm + tax + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
vif(RegModel.1)
## age black chas crim dis indus lstat nox
## 3.100826 1.348521 1.073995 1.792192 3.955945 3.991596 2.941491 4.393720
## ptratio rad rm tax zn
## 1.799084 7.484496 1.933744 9.008554 2.298758
library(zoo, pos=15)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lmtest, pos=15)
bptest(RegModel.1)
##
## studentized Breusch-Pagan test
##
## data: RegModel.1
## BP = 65.122, df = 13, p-value = 6.265e-09
RegModel.2 <- lm(medv~black+chas+crim+dis+lstat+nox+ptratio+rm+zn,
data=Boston)
summary(RegModel.2)
##
## Call:
## lm(formula = medv ~ black + chas + crim + dis + lstat + nox +
## ptratio + rm + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.803 -2.832 -0.625 1.454 27.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.507997 4.872538 6.056 2.76e-09 ***
## black 0.008292 0.002688 3.084 0.002153 **
## chas 3.029924 0.868349 3.489 0.000527 ***
## crim -0.061174 0.030377 -2.014 0.044567 *
## dis -1.431665 0.188603 -7.591 1.59e-13 ***
## lstat -0.525004 0.048351 -10.858 < 2e-16 ***
## nox -16.088513 3.232702 -4.977 8.93e-07 ***
## ptratio -0.838640 0.117342 -7.147 3.19e-12 ***
## rm 4.149667 0.407685 10.179 < 2e-16 ***
## zn 0.042032 0.013422 3.131 0.001842 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.833 on 496 degrees of freedom
## Multiple R-squared: 0.7288, Adjusted R-squared: 0.7239
## F-statistic: 148.1 on 9 and 496 DF, p-value: < 2.2e-16
vif(RegModel.2)
## black chas crim dis lstat nox ptratio rm
## 1.302455 1.051879 1.476281 3.410535 2.577927 3.034316 1.395503 1.774261
## zn
## 2.119038
bptest(medv ~ black + chas + crim + dis + lstat + nox + ptratio + rm + zn,
varformula = ~ fitted.values(RegModel.2), studentize=FALSE, data=Boston)
##
## Breusch-Pagan test
##
## data: medv ~ black + chas + crim + dis + lstat + nox + ptratio + rm + zn
## BP = 8.817, df = 1, p-value = 0.002984
outlierTest(RegModel.2)
## rstudent unadjusted p-value Bonferonni p
## 369 6.093117 2.2275e-09 1.1271e-06
## 372 5.574335 4.0893e-08 2.0692e-05
## 373 5.360117 1.2776e-07 6.4644e-05
Boston <- Boston[-c(369,372,373),]
RegModel.3 <- lm(medv~black+chas+crim+lstat+nox+ptratio+rm+zn, data=Boston)
summary(RegModel.3)
##
## Call:
## lm(formula = medv ~ black + chas + crim + lstat + nox + ptratio +
## rm + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6944 -2.6544 -0.6449 1.6392 21.4155
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.386058 4.150172 2.744 0.006300 **
## black 0.008849 0.002545 3.476 0.000553 ***
## chas 2.670557 0.833048 3.206 0.001434 **
## crim -0.046149 0.028565 -1.616 0.106827
## lstat -0.392872 0.046578 -8.435 3.68e-16 ***
## nox -5.144940 2.539786 -2.026 0.043329 *
## ptratio -0.958587 0.111396 -8.605 < 2e-16 ***
## rm 5.318514 0.384910 13.818 < 2e-16 ***
## zn -0.010100 0.011040 -0.915 0.360736
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.578 on 494 degrees of freedom
## Multiple R-squared: 0.744, Adjusted R-squared: 0.7399
## F-statistic: 179.5 on 8 and 494 DF, p-value: < 2.2e-16
RegModel.4 <- lm(medv~black+lstat+ptratio+rm+zn, data=Boston)
summary(RegModel.4)
##
## Call:
## lm(formula = medv ~ black + lstat + ptratio + rm + zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2574 -2.8802 -0.6129 1.8640 22.9620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.253649 3.879508 2.385 0.0174 *
## black 0.011319 0.002462 4.598 5.43e-06 ***
## lstat -0.453250 0.041643 -10.884 < 2e-16 ***
## ptratio -0.991397 0.108946 -9.100 < 2e-16 ***
## rm 5.274984 0.386849 13.636 < 2e-16 ***
## zn -0.004863 0.010154 -0.479 0.6322
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.644 on 497 degrees of freedom
## Multiple R-squared: 0.735, Adjusted R-squared: 0.7323
## F-statistic: 275.7 on 5 and 497 DF, p-value: < 2.2e-16
RegModel.5 <- lm(medv~black+lstat+ptratio+rm, data=Boston)
summary(RegModel.5)
##
## Call:
## lm(formula = medv ~ black + lstat + ptratio + rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1780 -2.8640 -0.6212 1.8545 23.0366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.92877 3.81678 2.339 0.0197 *
## black 0.01130 0.00246 4.592 5.55e-06 ***
## lstat -0.44870 0.04051 -11.075 < 2e-16 ***
## ptratio -0.97746 0.10491 -9.317 < 2e-16 ***
## rm 5.26910 0.38635 13.638 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.64 on 498 degrees of freedom
## Multiple R-squared: 0.7348, Adjusted R-squared: 0.7327
## F-statistic: 345 on 4 and 498 DF, p-value: < 2.2e-16
vif(RegModel.5)
## black lstat ptratio rm
## 1.182491 1.954482 1.205162 1.715115
bptest(medv ~ black + lstat + ptratio + rm, varformula = ~
fitted.values(RegModel.5), studentize=FALSE, data=Boston)
##
## Breusch-Pagan test
##
## data: medv ~ black + lstat + ptratio + rm
## BP = 0.21772, df = 1, p-value = 0.6408
a=nrow(Boston)
a
## [1] 503
b=round(0.7*a)
b
## [1] 352
random_row_numbs=sample(a,b,F)
random_row_numbs
## [1] 417 35 286 410 352 408 346 274 195 379 180 165 318 110 231 72 366
## [18] 425 27 16 401 368 249 469 345 444 442 203 263 419 80 70 89 470
## [35] 57 88 58 234 177 478 152 473 420 182 247 77 190 313 457 303 186
## [52] 446 100 422 264 333 142 230 448 254 85 340 102 405 295 233 302 4
## [69] 135 243 236 81 34 397 139 222 48 20 240 371 188 259 158 199 464
## [86] 441 258 283 466 497 284 51 407 443 271 316 130 246 98 64 74 291
## [103] 117 296 46 86 336 28 13 227 450 118 106 140 131 374 490 14 23
## [120] 307 1 115 288 492 252 90 196 114 472 11 462 388 456 221 503 427
## [137] 79 418 59 421 260 373 458 451 493 431 133 334 200 282 56 146 424
## [154] 276 395 463 194 7 148 482 312 2 479 175 73 229 394 40 209 187
## [171] 17 261 269 141 400 347 322 432 239 41 381 159 306 10 370 277 49
## [188] 24 323 500 372 292 168 383 124 127 319 498 272 353 53 94 242 321
## [205] 314 392 328 213 255 459 308 486 166 361 294 440 87 8 217 445 423
## [222] 206 220 428 97 112 151 399 226 281 32 103 358 193 251 172 329 84
## [239] 205 477 237 33 309 491 364 484 216 433 65 447 39 275 359 300 439
## [256] 52 268 385 449 183 278 435 232 297 211 348 365 137 173 163 63 192
## [273] 105 357 76 437 378 18 341 202 75 452 489 31 273 162 413 317 26
## [290] 416 67 5 138 121 104 356 426 299 332 136 499 15 30 25 50 119
## [307] 384 455 176 335 289 210 393 93 107 305 429 406 280 496 483 480 108
## [324] 298 344 29 460 343 225 201 191 396 468 125 235 197 198 174 485 143
## [341] 387 409 156 290 71 350 132 149 331 218 212 109
plot(random_row_numbs)

Boston_train=Boston[random_row_numbs,]
Boston_test=Boston[-random_row_numbs,]
RegModel.6 <- lm(medv~black+lstat+ptratio+rm, data=Boston_train)
summary(RegModel.6)
##
## Call:
## lm(formula = medv ~ black + lstat + ptratio + rm, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6989 -2.8976 -0.6286 1.8782 20.9866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.082977 4.685939 2.579 0.010333 *
## black 0.010551 0.002876 3.668 0.000283 ***
## lstat -0.459176 0.049661 -9.246 < 2e-16 ***
## ptratio -1.081483 0.130743 -8.272 2.86e-15 ***
## rm 5.130985 0.465497 11.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.731 on 347 degrees of freedom
## Multiple R-squared: 0.7336, Adjusted R-squared: 0.7305
## F-statistic: 238.8 on 4 and 347 DF, p-value: < 2.2e-16
vif(RegModel.6)
## black lstat ptratio rm
## 1.181865 1.909594 1.266947 1.670012
outlierTest(RegModel.6)
## rstudent unadjusted p-value Bonferonni p
## 371 4.597485 6.0040e-06 0.0021134
## 413 4.320507 2.0369e-05 0.0071698
## 366 4.216016 3.1779e-05 0.0111860
## 368 4.012047 7.3806e-05 0.0259800
bptest(RegModel.6)
##
## studentized Breusch-Pagan test
##
## data: RegModel.6
## BP = 1.9743, df = 4, p-value = 0.7405
dbc=boxcox(RegModel.6)

dbc
## $x
## [1] -2.00000000 -1.95959596 -1.91919192 -1.87878788 -1.83838384
## [6] -1.79797980 -1.75757576 -1.71717172 -1.67676768 -1.63636364
## [11] -1.59595960 -1.55555556 -1.51515152 -1.47474747 -1.43434343
## [16] -1.39393939 -1.35353535 -1.31313131 -1.27272727 -1.23232323
## [21] -1.19191919 -1.15151515 -1.11111111 -1.07070707 -1.03030303
## [26] -0.98989899 -0.94949495 -0.90909091 -0.86868687 -0.82828283
## [31] -0.78787879 -0.74747475 -0.70707071 -0.66666667 -0.62626263
## [36] -0.58585859 -0.54545455 -0.50505051 -0.46464646 -0.42424242
## [41] -0.38383838 -0.34343434 -0.30303030 -0.26262626 -0.22222222
## [46] -0.18181818 -0.14141414 -0.10101010 -0.06060606 -0.02020202
## [51] 0.02020202 0.06060606 0.10101010 0.14141414 0.18181818
## [56] 0.22222222 0.26262626 0.30303030 0.34343434 0.38383838
## [61] 0.42424242 0.46464646 0.50505051 0.54545455 0.58585859
## [66] 0.62626263 0.66666667 0.70707071 0.74747475 0.78787879
## [71] 0.82828283 0.86868687 0.90909091 0.94949495 0.98989899
## [76] 1.03030303 1.07070707 1.11111111 1.15151515 1.19191919
## [81] 1.23232323 1.27272727 1.31313131 1.35353535 1.39393939
## [86] 1.43434343 1.47474747 1.51515152 1.55555556 1.59595960
## [91] 1.63636364 1.67676768 1.71717172 1.75757576 1.79797980
## [96] 1.83838384 1.87878788 1.91919192 1.95959596 2.00000000
##
## $y
## [1] -859.1799 -847.1039 -835.1407 -823.2931 -811.5635 -799.9545 -788.4687
## [8] -777.1090 -765.8782 -754.7796 -743.8162 -732.9914 -722.3087 -711.7718
## [15] -701.3844 -691.1505 -681.0742 -671.1596 -661.4114 -651.8339 -642.4319
## [22] -633.2102 -624.1740 -615.3282 -606.6782 -598.2292 -589.9869 -581.9566
## [29] -574.1441 -566.5549 -559.1948 -552.0694 -545.1843 -538.5451 -532.1573
## [36] -526.0261 -520.1569 -514.5544 -509.2236 -504.1689 -499.3943 -494.9038
## [43] -490.7007 -486.7882 -483.1687 -479.8445 -476.8171 -474.0877 -471.6570
## [50] -469.5250 -467.6913 -466.1549 -464.9143 -463.9677 -463.3123 -462.9453
## [57] -462.8632 -463.0621 -463.5379 -464.2858 -465.3009 -466.5779 -468.1114
## [64] -469.8955 -471.9244 -474.1919 -476.6919 -479.4181 -482.3641 -485.5236
## [71] -488.8903 -492.4578 -496.2198 -500.1703 -504.3031 -508.6121 -513.0917
## [78] -517.7360 -522.5394 -527.4966 -532.6022 -537.8512 -543.2387 -548.7598
## [85] -554.4099 -560.1847 -566.0799 -572.0912 -578.2149 -584.4471 -590.7842
## [92] -597.2227 -603.7593 -610.3907 -617.1139 -623.9259 -630.8240 -637.8056
## [99] -644.8680 -652.0084