# RegressioninR.R

library(MASS)
library(car)
# Load the built-in iris data set into the workspace.
data(iris)

# Linear model of Sepal.Length on the other three measurements plus Species,
# fitted to every row of iris.
RegModel.2 <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species,
  data = iris
)

# Display the internal structure of the fitted "lm" object.
str(RegModel.2)

## List of 13
##  $ coefficients : Named num [1:6] 2.171 0.496 -0.315 0.829 -0.724 ...
##   ..- attr(*, "names")= chr [1:6] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##  $ residuals    : Named num [1:150] 0.0952 0.1432 -0.0731 -0.2894 -0.0544 ...
##   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:150] -71.5659 -1.1884 8.4169 3.9326 -0.0587 ...
##   ..- attr(*, "names")= chr [1:150] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##  $ rank         : int 6
##  $ fitted.values: Named num [1:150] 5 4.76 4.77 4.89 5.05 ...
##   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
##  $ assign       : int [1:6] 0 1 2 3 4 4
##  $ qr           :List of 5
##   ..$ qr   : num [1:150, 1:6] -12.2474 0.0816 0.0816 0.0816 0.0816 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:150] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:6] "(Intercept)" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##   .. ..- attr(*, "assign")= int [1:6] 0 1 2 3 4 4
##   .. ..- attr(*, "contrasts")=List of 1
##   .. .. ..$ Species: chr "contr.treatment"
##   ..$ qraux: num [1:6] 1.08 1.02 1.1 1.01 1.02 ...
##   ..$ pivot: int [1:6] 1 2 3 4 5 6
##   ..$ tol  : num 1e-07
##   ..$ rank : int 6
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 144
##  $ contrasts    :List of 1
##   ..$ Species: chr "contr.treatment"
##  $ xlevels      :List of 1
##   ..$ Species: chr [1:3] "setosa" "versicolor" "virginica"
##  $ call         : language lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length +      Species, data = iris)
##  $ terms        :Classes 'terms', 'formula' length 3 Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species
##   .. ..- attr(*, "variables")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
##   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##   .. .. .. ..$ : chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
##   .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
##   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
##   .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##  $ model        :'data.frame':   150 obs. of  5 variables:
##   ..$ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##   ..$ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##   ..$ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##   ..$ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##   ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula' length 3 Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species
##   .. .. ..- attr(*, "variables")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
##   .. .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##   .. .. .. .. ..$ : chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
##   .. .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Width" "Petal.Width" "Petal.Length" "Species"
##   .. .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(Sepal.Length, Sepal.Width, Petal.Width, Petal.Length, Species)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:5] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. .. ..- attr(*, "names")= chr [1:5] "Sepal.Length" "Sepal.Width" "Petal.Width" "Petal.Length" ...
##  - attr(*, "class")= chr "lm"

# List the names of the components stored in the fitted lm object.
names(RegModel.2)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"

# Print the coefficient table and fit statistics of the full-data model.
summary(RegModel.2)

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + 
##     Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79424 -0.21874  0.00899  0.20255  0.73103 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
## Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
## Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
## Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
## Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
## Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
## F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16

# Store the predictions of RegModel.2 for every row of iris in a new
# column named "pred".
iris[["pred"]] <- predict(RegModel.2, newdata = iris)
# Preview the leading rows, now including the prediction column.
head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species     pred
## 1          5.1         3.5          1.4         0.2  setosa 5.004788
## 2          4.9         3.0          1.4         0.2  setosa 4.756844
## 3          4.7         3.2          1.3         0.2  setosa 4.773097
## 4          4.6         3.1          1.5         0.2  setosa 4.889357
## 5          5.0         3.6          1.4         0.2  setosa 5.054377
## 6          5.4         3.9          1.7         0.4  setosa 5.388886

# Demonstration draw: 100 distinct values from 1..150, sampled without
# replacement. The result is only printed, not stored.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn on sampling with replacement.
sample(150, size = 100, replace = FALSE)

##   [1] 138 104 130  92  17 120 150  58 136  26 113  43  55 105   1  47  68
##  [18]  19 112 111  99  45 142  83  12 147  95 109 119  84  87  76  98 146
##  [35]  73   7  96  29  85  20 108  78  52 123 129  25  18 132   8  23  57
##  [52]  79  59  69  77 121  54  91  48  93  49  16 149 140 127  75  40  15
##  [69]  35   5  53  33  97  50  41  61  32 145 100  39 116  21  90  38  72
##  [86]  82  70 115  86 148 106 133 139  11 131 134  65  13   4  31

# a: number of rows in iris; b: number of rows to draw for training.
a <- nrow(iris)
b <- 100
a

## [1] 150

# Row numbers of the training observations, drawn without replacement.
# No seed is set in this part of the script, so each run draws different rows.
random_row_numbs <- sample(a, size = b, replace = FALSE)
random_row_numbs

##   [1] 142   2  59  63  44  20  66  75   9   1 150  41  38 138  50  55  43
##  [18]  92 126 102  49  11 123  14  98  94  28 105   5 113  42  10  87  53
##  [35]  25 100 114 124  69  12 149  34  71 122  80 118 115  79  30  52   8
##  [52] 125  27  72 108  37  23 140 112 111  89  96 139  39  78  73  74 116
##  [69] 120 147  91  36 110  95  18 128 117 131  61  24  67   6  65 132   7
##  [86]  99 135  54  68  56  82  46  70 109  85  88 101  45 130  64

# Rows listed in random_row_numbs form the training set; all remaining rows
# of iris form the test set.
iris_train <- iris[random_row_numbs, ]
iris_test <- iris[-random_row_numbs, ]
# Report how many rows were held out.
nrow(iris_test)

## [1] 50

# Same specification as RegModel.2, fitted to the training rows only.
RegModel.3 <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + Species,
  data = iris_train
)

# Coefficient table and fit statistics of the training-set model.
summary(RegModel.3)

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + 
##     Species, data = iris_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7654 -0.2421  0.0177  0.1894  0.7297 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.41627    0.33121   7.295 9.33e-11 ***
## Sepal.Width        0.43405    0.10696   4.058 0.000102 ***
## Petal.Width       -0.36676    0.17904  -2.048 0.043309 *  
## Petal.Length       0.78367    0.08316   9.424 3.08e-15 ***
## Speciesversicolor -0.52942    0.30828  -1.717 0.089211 .  
## Speciesvirginica  -0.74473    0.42140  -1.767 0.080424 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3032 on 94 degrees of freedom
## Multiple R-squared:  0.8723, Adjusted R-squared:  0.8655 
## F-statistic: 128.4 on 5 and 94 DF,  p-value: < 2.2e-16

# Print the full-data model summary again, directly after the summary of the
# training-set model RegModel.3.
summary(RegModel.2)

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Width + Petal.Length + 
##     Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.79424 -0.21874  0.00899  0.20255  0.73103 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
## Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
## Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
## Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
## Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
## Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3068 on 144 degrees of freedom
## Multiple R-squared:  0.8673, Adjusted R-squared:  0.8627 
## F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16

# SCORING THE MODEL
# Predict Sepal.Length for the held-out rows with the training-set model and
# store the result in column "pred" of iris_test.
iris_test[["pred"]] <- predict(RegModel.3, newdata = iris_test)
head(iris_test)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species     pred
## 3           4.7         3.2          1.3         0.2  setosa 4.750653
## 4           4.6         3.1          1.5         0.2  setosa 4.863982
## 13          4.8         3.0          1.4         0.1  setosa 4.778886
## 15          5.8         4.0          1.2         0.2  setosa 5.019526
## 16          5.7         4.4          1.5         0.4  setosa 5.354896
## 17          5.4         3.9          1.3         0.4  setosa 4.981137

# Boston
# 2:1 for splitting the Boston Data
data(Boston)

# Baseline model: medv regressed on thirteen predictors, fitted to all rows
# of Boston.
RegModel1.1 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat + nox +
    ptratio + rad + rm + tax + zn,
  data = Boston
)
summary(RegModel1.1)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus + 
##     lstat + nox + ptratio + rad + rm + tax + zn, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16

# a: number of rows in Boston; b: training-set size, 66% of a rounded to the
# nearest whole number.
a <- nrow(Boston)
b <- round(0.66 * a)
a

## [1] 506

# Print the training-set size too: the recorded output below shows this value,
# but the statement that prints it was missing from the script.
b

## [1] 334

# Row numbers of the training observations, drawn without replacement.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned. No seed is set in this part of the script, so each run draws
# different rows.
random_row_numbs <- sample(a, size = b, replace = FALSE)
random_row_numbs

##   [1] 415  11 406 371 187 394  32 334   1 355 240 106 255  40 155 232 262
##  [18] 312 215 459  74 346 416 483 146 497 277 400 438 201 266 405 456 429
##  [35] 445  17 104  72  14 330 332  54 237 193 319 356  28  44 259  46 200
##  [52] 274 123 122 455 128 260  18 448 446 177 491  61 169 488 361 154  42
##  [69]  60  35  49  23 107 282 289 383 467 164 424 482 261 369 287 450 127
##  [86] 315 447 115 501 162 276  63  86 401 374  36  34 283   6 301 409 228
## [103] 306 226 349 291 381  48 233 165 427 344 358 297 293 310 422  95  55
## [120]  56 168 210 495 307 460 239 380 341 457 368 366 407 437 403 321 280
## [137] 382 204 378  24  43 317 197 314 425 110 309 119  85 298 503   8 159
## [154]  80 133 499 377  12 462 487 399 198 114  71 143 384 333 285 364 223
## [171] 354  22 340  39 411 170  81 421  21 203 430 208 360 337 238 351 132
## [188] 144  65 102 179 386   2  98 216  58 243 192 136 484 147 414 418 270
## [205]  64  62 247 236 140 433 286 492  67 304 205 494  94 387 175 338 182
## [222] 325 473 227 281  88 504 353 219 161   9 271 453 196 435  51  89 318
## [239] 152 410 395 256  30 199 398 265 326 180  33  41 313 303 443 212  92
## [256] 111 478 145 209 253 148 183   7 254 373 362 163  83 299  13 372  90
## [273] 444 442 481 345 186 500 211 202 496 485 158 419 490 426 249  79 167
## [290] 322 458 138 153 440 284  25 244  26 472 241  87 471  16 339 308 195
## [307] 231 118 417   5 375 502 498 166 476 327  78 379 116 357  75  99 103
## [324] 272 402 323 486 347  73 479 290 257  70  84

# OVERFITTING / SPLIT
# Rows listed in random_row_numbs go to the training set; every other row of
# Boston is held out for testing.
Boston_train <- Boston[random_row_numbs, ]
Boston_test <- Boston[-random_row_numbs, ]
nrow(Boston_test)

## [1] 172

# Same thirteen-predictor specification as RegModel1.1, fitted to
# Boston_train instead of the full data.
RegModel1.2 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat + nox +
    ptratio + rad + rm + tax + zn,
  data = Boston_train
)
summary(RegModel1.2)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus + 
##     lstat + nox + ptratio + rad + rm + tax + zn, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5053  -2.7723  -0.6659   1.4557  25.2114 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.064350   6.562089   5.648 3.58e-08 ***
## age           0.007661   0.016711   0.458 0.646921    
## black         0.009131   0.003319   2.751 0.006277 ** 
## chas          3.872493   1.112600   3.481 0.000570 ***
## crim         -0.100624   0.036018  -2.794 0.005525 ** 
## dis          -1.408710   0.253714  -5.552 5.93e-08 ***
## indus         0.054962   0.086047   0.639 0.523447    
## lstat        -0.581514   0.061089  -9.519  < 2e-16 ***
## nox         -18.705425   4.965826  -3.767 0.000197 ***
## ptratio      -0.941298   0.165469  -5.689 2.89e-08 ***
## rad           0.305968   0.087327   3.504 0.000524 ***
## rm            3.701278   0.534387   6.926 2.38e-11 ***
## tax          -0.012194   0.005184  -2.352 0.019260 *  
## zn            0.046799   0.016872   2.774 0.005865 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.921 on 320 degrees of freedom
## Multiple R-squared:  0.7458, Adjusted R-squared:  0.7355 
## F-statistic: 72.23 on 13 and 320 DF,  p-value: < 2.2e-16

# Score the held-out rows with the training-set model; predictions are stored
# next to the observed medv values in column "pred".
Boston_test[["pred"]] <- predict(RegModel1.2, newdata = Boston_test)

head(Boston_test)

##       crim   zn indus chas   nox    rm  age    dis rad tax ptratio  black
## 3  0.02729  0.0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 10 0.17004 12.5  7.87    0 0.524 6.004 85.9 6.5921   5 311    15.2 386.71
## 15 0.63796  0.0  8.14    0 0.538 6.096 84.5 4.4619   4 307    21.0 380.02
## 19 0.80271  0.0  8.14    0 0.538 5.456 36.6 3.7965   4 307    21.0 288.99
## 20 0.72580  0.0  8.14    0 0.538 5.727 69.5 3.7965   4 307    21.0 390.95
##    lstat medv     pred
## 3   4.03 34.7 30.89124
## 4   2.94 33.4 28.82865
## 10 17.10 18.9 18.87431
## 15 10.26 18.2 19.52558
## 19 11.69 20.2 16.04781
## 20 11.28 18.2 18.48006

# Reduced specification: the RegModel1.2 formula without nox, rad and tax.
RegModel1.3 <- lm(
  formula = medv ~ age + black + chas + crim + dis + indus + lstat +
    ptratio + rm + zn,
  data = Boston_train
)
summary(RegModel1.3)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + dis + indus + 
##     lstat + ptratio + rm + zn, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5268  -3.0117  -0.8198   1.2548  27.2430 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.299522   5.433475   3.552 0.000439 ***
## age         -0.010181   0.016596  -0.613 0.540022    
## black        0.009086   0.003347   2.714 0.006996 ** 
## chas         4.014595   1.137272   3.530 0.000476 ***
## crim        -0.068776   0.034323  -2.004 0.045927 *  
## dis         -1.155275   0.247698  -4.664 4.55e-06 ***
## indus       -0.097778   0.068636  -1.425 0.155241    
## lstat       -0.598274   0.062604  -9.556  < 2e-16 ***
## ptratio     -0.699146   0.151619  -4.611 5.78e-06 ***
## rm           4.181650   0.540227   7.741 1.28e-13 ***
## zn           0.040445   0.016520   2.448 0.014890 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.072 on 323 degrees of freedom
## Multiple R-squared:  0.7274, Adjusted R-squared:  0.719 
## F-statistic: 86.19 on 10 and 323 DF,  p-value: < 2.2e-16

# MULTICOLLINEARITY

# Variance inflation factor of each predictor in RegModel1.3 (car::vif).
vif(RegModel1.3)

##      age    black     chas     crim      dis    indus    lstat  ptratio 
## 2.951697 1.285478 1.076716 1.447176 3.721734 2.783200 2.766016 1.423619 
##       rm       zn 
## 1.836707 2.283260

# RegModel1.3 specification with dis removed; dis shows the largest value in
# the recorded vif() output for RegModel1.3.
RegModel1.4 <- lm(
  formula = medv ~ age + black + chas + crim + indus + lstat +
    ptratio + rm + zn,
  data = Boston_train
)
summary(RegModel1.4)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat + 
##     ptratio + rm + zn, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7879  -2.9683  -0.9062   1.4251  28.2141 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.623496   5.406809   2.335  0.02017 *  
## age          0.020826   0.015686   1.328  0.18522    
## black        0.008764   0.003452   2.539  0.01159 *  
## chas         4.060463   1.173086   3.461  0.00061 ***
## crim        -0.043619   0.034965  -1.247  0.21312    
## indus        0.028366   0.065072   0.436  0.66318    
## lstat       -0.610773   0.064519  -9.467  < 2e-16 ***
## ptratio     -0.810364   0.154453  -5.247 2.80e-07 ***
## rm           4.396271   0.555234   7.918 3.89e-14 ***
## zn           0.007420   0.015397   0.482  0.63017    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.232 on 324 degrees of freedom
## Multiple R-squared:  0.709,  Adjusted R-squared:  0.701 
## F-statistic: 87.73 on 9 and 324 DF,  p-value: < 2.2e-16

# Variance inflation factors of the predictors remaining in RegModel1.4.
vif(RegModel1.4)

##      age    black     chas     crim    indus    lstat  ptratio       rm 
## 2.478075 1.284930 1.076635 1.411437 2.351033 2.760947 1.388405 1.823381 
##       zn 
## 1.863828

# OUTLIERS
# Outlier test on the studentized residuals of RegModel1.4 (Bonferroni-adjusted
# p-values). The result is stored so the reported observations can be looked
# up below, then printed.
Outliers1.4 <- outlierTest(RegModel1.4)
Outliers1.4

##     rstudent unadjusted p-value Bonferonni p
## 369 5.894128         9.4761e-09   3.1650e-06
## 372 5.403382         1.2719e-07   4.2480e-05
## 373 4.945052         1.2261e-06   4.0951e-04

# Show the data rows of exactly the observations reported by the test.
# Boston_train keeps the row labels of Boston, so the names on the rstudent
# vector index Boston directly. The rows used to be hard-coded as
# c(372, 413, 370, 365), which does not match the observations reported above
# (369, 372, 373) and goes stale whenever the random split changes; the
# recorded output that follows still shows those hard-coded rows.
Boston[names(Outliers1.4$rstudent), ]

##         crim zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 372  9.23230  0  18.1    0 0.631 6.216 100.0 1.1691  24 666    20.2 366.15
## 413 18.81100  0  18.1    0 0.597 4.628 100.0 1.5539  24 666    20.2  28.79
## 370  5.66998  0  18.1    1 0.631 6.683  96.8 1.3567  24 666    20.2 375.33
## 365  3.47428  0  18.1    1 0.718 8.780  82.9 1.9047  24 666    20.2 354.55
##     lstat medv
## 372  9.53 50.0
## 413 34.37 17.9
## 370  3.73 50.0
## 365  5.29 21.9

# HETEROSCEDASTICITY
# gvlma is attached here, right before its first use in this script.
library(gvlma)
# Assess the linear-model assumptions of RegModel1.4. The recorded output
# reports a global statistic plus skewness, kurtosis, link-function and
# heteroscedasticity checks at the 0.05 level.
gvlma(RegModel1.4)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat + 
##     ptratio + rm + zn, data = Boston_train)
## 
## Coefficients:
## (Intercept)          age        black         chas         crim  
##   12.623496     0.020826     0.008764     4.060463    -0.043619  
##       indus        lstat      ptratio           rm           zn  
##    0.028366    -0.610773    -0.810364     4.396271     0.007420  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = RegModel1.4) 
## 
##                       Value p-value                   Decision
## Global Stat        850.6747  0.0000 Assumptions NOT satisfied!
## Skewness           206.9665  0.0000 Assumptions NOT satisfied!
## Kurtosis           544.6416  0.0000 Assumptions NOT satisfied!
## Link Function       98.9745  0.0000 Assumptions NOT satisfied!
## Heteroscedasticity   0.0921  0.7615    Assumptions acceptable.

#?gvlma
# Variance inflation factors of RegModel1.4, printed a second time.
vif(RegModel1.4)

##      age    black     chas     crim    indus    lstat  ptratio       rm 
## 2.478075 1.284930 1.076635 1.411437 2.351033 2.760947 1.388405 1.823381 
##       zn 
## 1.863828

# Summary of RegModel1.4, printed again ahead of the smaller model fitted next.
summary(RegModel1.4)

## 
## Call:
## lm(formula = medv ~ age + black + chas + crim + indus + lstat + 
##     ptratio + rm + zn, data = Boston_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7879  -2.9683  -0.9062   1.4251  28.2141 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.623496   5.406809   2.335  0.02017 *  
## age          0.020826   0.015686   1.328  0.18522    
## black        0.008764   0.003452   2.539  0.01159 *  
## chas         4.060463   1.173086   3.461  0.00061 ***
## crim        -0.043619   0.034965  -1.247  0.21312    
## indus        0.028366   0.065072   0.436  0.66318    
## lstat       -0.610773   0.064519  -9.467  < 2e-16 ***
## ptratio     -0.810364   0.154453  -5.247 2.80e-07 ***
## rm           4.396271   0.555234   7.918 3.89e-14 ***
## zn           0.007420   0.015397   0.482  0.63017    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.232 on 324 degrees of freedom
## Multiple R-squared:  0.709,  Adjusted R-squared:  0.701 
## F-statistic: 87.73 on 9 and 324 DF,  p-value: < 2.2e-16

# Compact model with only lstat, ptratio and rm; presumably chosen because
# they have the smallest p-values in the recorded RegModel1.4 summary.
RegModel1.5 <- lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
summary(RegModel1.5)

## 
## Call:
## lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.800  -3.175  -1.038   1.660  29.090 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.92854    5.00382   3.983 8.39e-05 ***
## lstat       -0.61204    0.05204 -11.760  < 2e-16 ***
## ptratio     -0.93983    0.14760  -6.367 6.44e-10 ***
## rm           4.41867    0.54929   8.044 1.58e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.401 on 330 degrees of freedom
## Multiple R-squared:  0.6842, Adjusted R-squared:  0.6813 
## F-statistic: 238.3 on 3 and 330 DF,  p-value: < 2.2e-16

# Variance inflation factors of the three predictors in RegModel1.5.
vif(RegModel1.5)

##    lstat  ptratio       rm 
## 1.685617 1.189784 1.674550

# OUTLIERS
# Outlier test on the studentized residuals of RegModel1.5 (Bonferroni-adjusted
# p-values). The result is stored so the reported observations can be looked
# up below, then printed.
Outliers1.5 <- outlierTest(RegModel1.5)
Outliers1.5

##     rstudent unadjusted p-value Bonferonni p
## 369 5.773195         1.7995e-08   6.0103e-06
## 373 5.543177         6.1006e-08   2.0376e-05
## 372 5.299931         2.1275e-07   7.1058e-05

# Show the data rows of exactly the observations reported by the test.
# Boston_train keeps the row labels of Boston, so the names on the rstudent
# vector index Boston directly. The rows used to be hard-coded as
# c(372, 413, 370, 365), which does not match the observations reported above
# (369, 373, 372) and goes stale whenever the random split changes; the
# recorded output that follows still shows those hard-coded rows.
Boston[names(Outliers1.5$rstudent), ]

##         crim zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 372  9.23230  0  18.1    0 0.631 6.216 100.0 1.1691  24 666    20.2 366.15
## 413 18.81100  0  18.1    0 0.597 4.628 100.0 1.5539  24 666    20.2  28.79
## 370  5.66998  0  18.1    1 0.631 6.683  96.8 1.3567  24 666    20.2 375.33
## 365  3.47428  0  18.1    1 0.718 8.780  82.9 1.9047  24 666    20.2 354.55
##     lstat medv
## 372  9.53 50.0
## 413 34.37 17.9
## 370  3.73 50.0
## 365  5.29 21.9

# HETEROSCEDASTICITY
# Assess the linear-model assumptions of RegModel1.5; the recorded output
# reports the same five checks as for RegModel1.4.
gvlma(RegModel1.5)

## 
## Call:
## lm(formula = medv ~ lstat + ptratio + rm, data = Boston_train)
## 
## Coefficients:
## (Intercept)        lstat      ptratio           rm  
##     19.9285      -0.6120      -0.9398       4.4187  
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = RegModel1.5) 
## 
##                        Value p-value                   Decision
## Global Stat        950.04282  0.0000 Assumptions NOT satisfied!
## Skewness           222.29781  0.0000 Assumptions NOT satisfied!
## Kurtosis           638.03988  0.0000 Assumptions NOT satisfied!
## Link Function       89.68018  0.0000 Assumptions NOT satisfied!
## Heteroscedasticity   0.02495  0.8745    Assumptions acceptable.

# RegressioninR.R
#
# dell
#
# Sun Jan 03 20:33:23 2016