Data

library(car);library(ggplot2);library(GGally);library(glmnet);library(MASS); library(caret)
## Loading required package: carData
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Loading required package: Matrix
## Loaded glmnet 3.0-1
## Loading required package: lattice
df = read.csv(file = "C:\\Users\\chae\\Dropbox\\Konkuk\\2-1\\회귀분석\\boston housing\\Boston.csv")
df[which(df[,"TRACT"] %in%  c(2042,2084,3585,3823,905,911,923,1805)),"MEDV"]=c(22.1,24.2,33,27,8.2,14.8,14.4,19)
df=subset(df,select=-CMEDV)
attach(df)
summary(df)
##        NO                       TOWN         TOWNNO          TRACT     
##  Min.   :  1.0   Cambridge        : 30   Min.   : 0.00   Min.   :   1  
##  1st Qu.:127.2   Boston Savin Hill: 23   1st Qu.:26.25   1st Qu.:1303  
##  Median :253.5   Lynn             : 22   Median :42.00   Median :3394  
##  Mean   :253.5   Boston Roxbury   : 19   Mean   :47.53   Mean   :2700  
##  3rd Qu.:379.8   Newton           : 18   3rd Qu.:78.00   3rd Qu.:3740  
##  Max.   :506.0   Somerville       : 15   Max.   :91.00   Max.   :5082  
##                  (Other)          :379                                 
##       LON              LAT             MEDV            CRIM         
##  Min.   :-71.29   Min.   :42.03   Min.   : 5.00   Min.   : 0.00632  
##  1st Qu.:-71.09   1st Qu.:42.18   1st Qu.:17.02   1st Qu.: 0.08204  
##  Median :-71.05   Median :42.22   Median :21.20   Median : 0.25651  
##  Mean   :-71.06   Mean   :42.22   Mean   :22.53   Mean   : 3.61352  
##  3rd Qu.:-71.02   3rd Qu.:42.25   3rd Qu.:25.00   3rd Qu.: 3.67708  
##  Max.   :-70.81   Max.   :42.38   Max.   :50.00   Max.   :88.97620  
##                                                                     
##        ZN             INDUS            CHAS              NOX        
##  Min.   :  0.00   Min.   : 0.46   Min.   :0.00000   Min.   :0.3850  
##  1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000   1st Qu.:0.4490  
##  Median :  0.00   Median : 9.69   Median :0.00000   Median :0.5380  
##  Mean   : 11.36   Mean   :11.14   Mean   :0.06917   Mean   :0.5547  
##  3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000   3rd Qu.:0.6240  
##  Max.   :100.00   Max.   :27.74   Max.   :1.00000   Max.   :0.8710  
##                                                                     
##        RM             AGE              DIS              RAD        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##                                                                    
##       TAX           PTRATIO            B              LSTAT      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
## 
str(df)
## 'data.frame':    506 obs. of  20 variables:
##  $ NO     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ TOWN   : Factor w/ 92 levels "Arlington","Ashland",..: 54 77 77 46 46 46 69 69 69 69 ...
##  $ TOWNNO : int  0 1 1 2 2 2 3 3 3 3 ...
##  $ TRACT  : int  2011 2021 2022 2031 2032 2033 2041 2042 2043 2044 ...
##  $ LON    : num  -71 -71 -70.9 -70.9 -70.9 ...
##  $ LAT    : num  42.3 42.3 42.3 42.3 42.3 ...
##  $ MEDV   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
##  $ CRIM   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ ZN     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ INDUS  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ CHAS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NOX    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ RM     : num  6.58 6.42 7.18 7 7.15 ...
##  $ AGE    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ DIS    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ RAD    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ TAX    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ PTRATIO: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ B      : num  397 397 393 395 397 ...
##  $ LSTAT  : num  4.98 9.14 4.03 2.94 5.33 ...

Regression Models

model 1

df1 = df[,-c(1:6)]
fit.lm=lm(MEDV~.,data=df1)
summary(fit.lm)
## 
## Call:
## lm(formula = MEDV ~ ., data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5651  -2.6908  -0.5352   1.8446  26.1319 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.637e+01  5.058e+00   7.191 2.40e-12 ***
## CRIM        -1.062e-01  3.257e-02  -3.261 0.001189 ** 
## ZN           4.772e-02  1.360e-02   3.508 0.000493 ***
## INDUS        2.325e-02  6.094e-02   0.382 0.702970    
## CHAS         2.692e+00  8.539e-01   3.152 0.001718 ** 
## NOX         -1.774e+01  3.785e+00  -4.687 3.59e-06 ***
## RM           3.789e+00  4.142e-01   9.149  < 2e-16 ***
## AGE          5.749e-04  1.309e-02   0.044 0.964989    
## DIS         -1.502e+00  1.977e-01  -7.598 1.53e-13 ***
## RAD          3.038e-01  6.575e-02   4.620 4.91e-06 ***
## TAX         -1.270e-02  3.727e-03  -3.409 0.000706 ***
## PTRATIO     -9.239e-01  1.297e-01  -7.126 3.70e-12 ***
## B            9.228e-03  2.662e-03   3.467 0.000573 ***
## LSTAT       -5.307e-01  5.026e-02 -10.558  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.703 on 492 degrees of freedom
## Multiple R-squared:  0.7444, Adjusted R-squared:  0.7377 
## F-statistic: 110.2 on 13 and 492 DF,  p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm)

model 2

hist(MEDV)

hist(log(MEDV))

fit.lm1 = lm(log(MEDV)~.,data = df1)
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm1)

model 3

fit.lm2=lm(MEDV~.+I(LSTAT^2),data=df1)
summary(fit.lm2)
## 
## Call:
## lm(formula = MEDV ~ . + I(LSTAT^2), data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2075  -2.5063  -0.3202   1.8247  24.3895 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  43.531669   4.593869   9.476  < 2e-16 ***
## CRIM         -0.148980   0.029541  -5.043 6.45e-07 ***
## ZN            0.025851   0.012394   2.086 0.037512 *  
## INDUS         0.048567   0.054832   0.886 0.376186    
## CHAS          2.425222   0.767911   3.158 0.001685 ** 
## NOX         -16.252217   3.405501  -4.772 2.41e-06 ***
## RM            3.019182   0.378996   7.966 1.15e-14 ***
## AGE           0.028738   0.012050   2.385 0.017463 *  
## DIS          -1.217712   0.179596  -6.780 3.45e-11 ***
## RAD           0.292628   0.059112   4.950 1.02e-06 ***
## TAX          -0.011263   0.003353  -3.359 0.000842 ***
## PTRATIO      -0.787897   0.117215  -6.722 4.99e-11 ***
## B             0.007718   0.002397   3.220 0.001366 ** 
## LSTAT        -1.783645   0.123920 -14.393  < 2e-16 ***
## I(LSTAT^2)    0.034999   0.003223  10.859  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.227 on 491 degrees of freedom
## Multiple R-squared:  0.7939, Adjusted R-squared:  0.7881 
## F-statistic: 135.1 on 14 and 491 DF,  p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm2)

model 4

aaa = as.numeric(names(fit.lm2$fitted.values[fit.lm2$fitted.values>=35]))
df[aaa,]
##      NO               TOWN TOWNNO TRACT      LON     LAT MEDV    CRIM   ZN
## 41   41          Lynnfield      6  2092 -71.0300 42.3240 34.9 0.03359 75.0
## 98   98         Winchester     23  3383 -71.0890 42.2715 38.7 0.12083  0.0
## 99   99         Winchester     23  3384 -71.0982 42.2685 43.8 0.08187  0.0
## 158 158          Cambridge     28  3536 -71.0690 42.2285 41.3 1.22358  0.0
## 161 161          Cambridge     28  3539 -71.0700 42.2214 27.0 1.27346  0.0
## 162 162          Cambridge     28  3540 -71.0735 42.2282 50.0 1.46336  0.0
## 163 163          Cambridge     28  3541 -71.0770 42.2250 50.0 1.83377  0.0
## 164 164          Cambridge     28  3542 -71.0815 42.2250 50.0 1.51902  0.0
## 167 167          Cambridge     28  3545 -71.0770 42.2305 50.0 2.01019  0.0
## 183 183            Belmont     30  3574 -71.1005 42.2287 37.9 0.09103  0.0
## 187 187            Belmont     30  3578 -71.1115 42.2442 50.0 0.05602  0.0
## 193 193          Lexington     31  3587 -71.1340 42.2785 36.4 0.08664 45.0
## 196 196            Lincoln     33  3602 -71.1890 42.2525 50.0 0.01381 80.0
## 197 197            Concord     34  3611 -71.2200 42.2715 33.3 0.04011 80.0
## 203 203            Wayland     36  3662 -71.2140 42.2180 42.3 0.02177 82.5
## 204 204             Weston     37  3671 -71.1990 42.2320 48.5 0.03510 95.0
## 205 205             Weston     37  3672 -71.1990 42.2058 50.0 0.02009 95.0
## 225 225             Newton     40  3735 -71.1100 42.2060 44.8 0.31533  0.0
## 226 226             Newton     40  3736 -71.1012 42.1975 50.0 0.52693  0.0
## 227 227             Newton     40  3737 -71.1215 42.2025 37.6 0.38214  0.0
## 229 229             Newton     40  3739 -71.1100 42.1850 46.7 0.29819  0.0
## 232 232             Newton     40  3742 -71.1285 42.1930 31.7 0.46296  0.0
## 233 233             Newton     40  3743 -71.1400 42.1946 41.7 0.57529  0.0
## 234 234             Newton     40  3744 -71.1340 42.2040 48.3 0.33147  0.0
## 238 238             Newton     40  3748 -71.1491 42.2030 31.5 0.51183  0.0
## 257 257           Sherborn     44  3861 -71.2230 42.1450 44.0 0.01538 90.0
## 258 258          Brookline     45  4001 -71.0679 42.2073 50.0 0.61154 20.0
## 259 259          Brookline     45  4002 -71.0727 42.2077 36.0 0.66351 20.0
## 260 260          Brookline     45  4003 -71.0765 42.2075 30.1 0.65665 20.0
## 262 262          Brookline     45  4005 -71.0790 42.2020 43.1 0.53412 20.0
## 263 263          Brookline     45  4006 -71.0835 42.2000 48.8 0.52014 20.0
## 268 268          Brookline     45  4011 -71.0850 42.1925 50.0 0.57834 20.0
## 269 269          Brookline     45  4012 -71.0925 42.2075 43.5 0.54050 20.0
## 274 274             Dedham     46  4025 -71.1170 42.1510 35.2 0.22188 20.0
## 275 275            Needham     47  4031 -71.1305 42.1675 32.4 0.05644 40.0
## 276 276            Needham     47  4032 -71.1380 42.1733 32.0 0.09604 40.0
## 277 277            Needham     47  4033 -71.1405 42.1632 33.2 0.10469 40.0
## 278 278            Needham     47  4034 -71.1495 42.1730 33.1 0.06127 40.0
## 280 280          Wellesley     48  4041 -71.1480 42.1880 35.1 0.21038 20.0
## 281 281          Wellesley     48  4042 -71.1660 42.1870 45.4 0.03578 20.0
## 282 282          Wellesley     48  4043 -71.1850 42.1848 35.4 0.03705 20.0
## 283 283          Wellesley     48  4044 -71.1775 42.1735 46.0 0.06129 20.0
## 284 284              Dover     49  4051 -71.1730 42.1475 50.0 0.01501 90.0
## 291 291           Westwood     54  4121 -71.1385 42.1235 28.5 0.03502 80.0
## 292 292           Westwood     54  4122 -71.1330 42.1408 37.3 0.07886 80.0
## 307 307             Milton     58  4163 -71.0450 42.1590 33.4 0.07503 33.0
## 365 365    Boston Back Bay     75   101 -71.0590 42.2098 21.9 3.47428  0.0
## 370 370    Boston Back Bay     75   108 -71.0497 42.2125 50.0 5.66998  0.0
## 371 371 Boston Beacon Hill     76   201 -71.0422 42.2144 50.0 6.53876  0.0
##     INDUS CHAS    NOX    RM   AGE    DIS RAD TAX PTRATIO      B LSTAT
## 41   2.95    0 0.4280 7.024  15.8 5.4011   3 252    18.3 395.62  1.98
## 98   2.89    0 0.4450 8.069  76.0 3.4952   2 276    18.0 396.90  4.21
## 99   2.89    0 0.4450 7.820  36.9 3.4952   2 276    18.0 393.53  3.57
## 158 19.58    0 0.6050 6.943  97.4 1.8773   5 403    14.7 363.43  4.59
## 161 19.58    1 0.6050 6.250  92.6 1.7984   5 403    14.7 338.92  5.50
## 162 19.58    0 0.6050 7.489  90.8 1.9709   5 403    14.7 374.43  1.73
## 163 19.58    1 0.6050 7.802  98.2 2.0407   5 403    14.7 389.61  1.92
## 164 19.58    1 0.6050 8.375  93.9 2.1620   5 403    14.7 388.45  3.32
## 167 19.58    0 0.6050 7.929  96.2 2.0459   5 403    14.7 369.30  3.70
## 183  2.46    0 0.4880 7.155  92.2 2.7006   3 193    17.8 394.12  4.82
## 187  2.46    0 0.4880 7.831  53.6 3.1992   3 193    17.8 392.63  4.45
## 193  3.44    0 0.4370 7.178  26.3 6.4798   5 398    15.2 390.49  2.87
## 196  0.46    0 0.4220 7.875  32.0 5.6484   4 255    14.4 394.23  2.97
## 197  1.52    0 0.4040 7.287  34.1 7.3090   2 329    12.6 396.90  4.08
## 203  2.03    0 0.4150 7.610  15.7 6.2700   2 348    14.7 395.38  3.11
## 204  2.68    0 0.4161 7.853  33.2 5.1180   4 224    14.7 392.78  3.81
## 205  2.68    0 0.4161 8.034  31.9 5.1180   4 224    14.7 390.55  2.88
## 225  6.20    0 0.5040 8.266  78.3 2.8944   8 307    17.4 385.05  4.14
## 226  6.20    0 0.5040 8.725  83.0 2.8944   8 307    17.4 382.00  4.63
## 227  6.20    0 0.5040 8.040  86.5 3.2157   8 307    17.4 387.38  3.13
## 229  6.20    0 0.5040 7.686  17.0 3.3751   8 307    17.4 377.51  3.92
## 232  6.20    0 0.5040 7.412  76.9 3.6715   8 307    17.4 376.14  5.25
## 233  6.20    0 0.5070 8.337  73.3 3.8384   8 307    17.4 385.91  2.47
## 234  6.20    0 0.5070 8.247  70.4 3.6519   8 307    17.4 378.95  3.95
## 238  6.20    0 0.5070 7.358  71.6 4.1480   8 307    17.4 390.07  4.73
## 257  3.75    0 0.3940 7.454  34.2 6.3361   3 244    15.9 386.34  3.11
## 258  3.97    0 0.6470 8.704  86.9 1.8010   5 264    13.0 389.70  5.12
## 259  3.97    0 0.6470 7.333 100.0 1.8946   5 264    13.0 383.29  7.79
## 260  3.97    0 0.6470 6.842 100.0 2.0107   5 264    13.0 391.93  6.90
## 262  3.97    0 0.6470 7.520  89.4 2.1398   5 264    13.0 388.37  7.26
## 263  3.97    0 0.6470 8.398  91.5 2.2885   5 264    13.0 386.86  5.91
## 268  3.97    0 0.5750 8.297  67.0 2.4216   5 264    13.0 384.54  7.44
## 269  3.97    0 0.5750 7.470  52.6 2.8720   5 264    13.0 390.30  3.16
## 274  6.96    1 0.4640 7.691  51.8 4.3665   3 223    18.6 390.77  6.58
## 275  6.41    1 0.4470 6.758  32.9 4.0776   4 254    17.6 396.90  3.53
## 276  6.41    0 0.4470 6.854  42.8 4.2673   4 254    17.6 396.90  2.98
## 277  6.41    1 0.4470 7.267  49.0 4.7872   4 254    17.6 389.25  6.05
## 278  6.41    1 0.4470 6.826  27.6 4.8628   4 254    17.6 393.45  4.16
## 280  3.33    0 0.4429 6.812  32.2 4.1007   5 216    14.9 396.90  4.85
## 281  3.33    0 0.4429 7.820  64.5 4.6947   5 216    14.9 387.31  3.76
## 282  3.33    0 0.4429 6.968  37.2 5.2447   5 216    14.9 392.23  4.59
## 283  3.33    1 0.4429 7.645  49.7 5.2119   5 216    14.9 377.07  3.01
## 284  1.21    1 0.4010 7.923  24.8 5.8850   1 198    13.6 395.52  3.16
## 291  4.95    0 0.4110 6.861  27.9 5.1167   4 245    19.2 396.90  3.33
## 292  4.95    0 0.4110 7.148  27.7 5.1167   4 245    19.2 396.90  3.56
## 307  2.18    0 0.4720 7.420  71.9 3.0992   7 222    18.4 396.90  6.47
## 365 18.10    1 0.7180 8.780  82.9 1.9047  24 666    20.2 354.55  5.29
## 370 18.10    1 0.6310 6.683  96.8 1.3567  24 666    20.2 375.33  3.73
## 371 18.10    1 0.6310 7.016  97.5 1.2024  24 666    20.2 392.05  2.96
par(mfrow=c(2,1))
boxplot(df[-aaa,-c(11,13)])
boxplot(df[aaa,-c(11,13)])

# town plot
library(ggplot2)
ggplot(data=df,aes(x=as.factor(TOWN),y=MEDV))+
  geom_bar(stat='summary',fun.y='median',fill='slateblue',color='darkslategrey')+
  theme(axis.text.x=element_text(angle=45,hjust=1))+
  geom_hline(yintercept=median(df$MEDV),linetype='dashed',color='blue')+
  geom_hline(yintercept=median(df$MEDV)*0.5,linetype='dashed',color='black')+
  geom_hline(yintercept=median(df$MEDV)*1.5,linetype='dashed',color='red')

model 5

fit.lm3=lm(MEDV~CRIM+ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT+I(LSTAT^2),data=df1)
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm3)

summary(fit.lm3)
## 
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + 
##     DIS + RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2075  -2.5063  -0.3202   1.8247  24.3895 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  43.531669   4.593869   9.476  < 2e-16 ***
## CRIM         -0.148980   0.029541  -5.043 6.45e-07 ***
## ZN            0.025851   0.012394   2.086 0.037512 *  
## INDUS         0.048567   0.054832   0.886 0.376186    
## CHAS          2.425222   0.767911   3.158 0.001685 ** 
## NOX         -16.252217   3.405501  -4.772 2.41e-06 ***
## RM            3.019182   0.378996   7.966 1.15e-14 ***
## AGE           0.028738   0.012050   2.385 0.017463 *  
## DIS          -1.217712   0.179596  -6.780 3.45e-11 ***
## RAD           0.292628   0.059112   4.950 1.02e-06 ***
## TAX          -0.011263   0.003353  -3.359 0.000842 ***
## PTRATIO      -0.787897   0.117215  -6.722 4.99e-11 ***
## B             0.007718   0.002397   3.220 0.001366 ** 
## LSTAT        -1.783645   0.123920 -14.393  < 2e-16 ***
## I(LSTAT^2)    0.034999   0.003223  10.859  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.227 on 491 degrees of freedom
## Multiple R-squared:  0.7939, Adjusted R-squared:  0.7881 
## F-statistic: 135.1 on 14 and 491 DF,  p-value: < 2.2e-16

model 6

# Variable Selection
nullmodel = lm(MEDV ~ 1, data=df1)
fullmodel = lm(MEDV~.+I(LSTAT^2),data=df1)
# forward selection
model.step.f = step(nullmodel, scope = list(lower = nullmodel, upper = fullmodel), direction = "forward",trace=F)
# Backward selection
model.step.b = step(fullmodel, direction = "backward",trace=F)
# stepwise selection
model.step = step(nullmodel, scope = list(lower = nullmodel, upper = fullmodel), direction = "both",trace=F)

model.step.f
## 
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS + 
##     CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
## 
## Coefficients:
## (Intercept)        LSTAT   I(LSTAT^2)           RM      PTRATIO          DIS  
##   43.281387    -1.776253     0.034878     2.991565    -0.774971    -1.251890  
##        CRIM          NOX         CHAS          RAD            B          TAX  
##   -0.149793   -15.462506     2.496108     0.278106     0.007653    -0.009970  
##         AGE           ZN  
##    0.028652     0.024776
model.step.b
## 
## Call:
## lm(formula = MEDV ~ CRIM + ZN + CHAS + NOX + RM + AGE + DIS + 
##     RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data = df1)
## 
## Coefficients:
## (Intercept)         CRIM           ZN         CHAS          NOX           RM  
##   43.281387    -0.149793     0.024776     2.496108   -15.462506     2.991565  
##         AGE          DIS          RAD          TAX      PTRATIO            B  
##    0.028652    -1.251890     0.278106    -0.009970    -0.774971     0.007653  
##       LSTAT   I(LSTAT^2)  
##   -1.776253     0.034878
model.step
## 
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS + 
##     CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
## 
## Coefficients:
## (Intercept)        LSTAT   I(LSTAT^2)           RM      PTRATIO          DIS  
##   43.281387    -1.776253     0.034878     2.991565    -0.774971    -1.251890  
##        CRIM          NOX         CHAS          RAD            B          TAX  
##   -0.149793   -15.462506     2.496108     0.278106     0.007653    -0.009970  
##         AGE           ZN  
##    0.028652     0.024776
c(AIC(model.step.f),AIC(model.step.b),AIC(model.step))
## [1] 2910.396 2910.396 2910.396
c(BIC(model.step.f),BIC(model.step.b),BIC(model.step))
## [1] 2973.794 2973.794 2973.794
summary(model.step)
## 
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS + 
##     CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.1851  -2.5314  -0.3066   1.9376  24.4388 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  43.281387   4.584167   9.441  < 2e-16 ***
## LSTAT        -1.776253   0.123612 -14.370  < 2e-16 ***
## I(LSTAT^2)    0.034878   0.003220  10.833  < 2e-16 ***
## RM            2.991565   0.377629   7.922 1.57e-14 ***
## PTRATIO      -0.774971   0.116278  -6.665 7.12e-11 ***
## DIS          -1.251890   0.175363  -7.139 3.40e-12 ***
## CRIM         -0.149793   0.029520  -5.074 5.52e-07 ***
## NOX         -15.462506   3.286001  -4.706 3.29e-06 ***
## CHAS          2.496108   0.763563   3.269  0.00115 ** 
## RAD           0.278106   0.056781   4.898 1.32e-06 ***
## B             0.007653   0.002395   3.195  0.00149 ** 
## TAX          -0.009970   0.003017  -3.304  0.00102 ** 
## AGE           0.028652   0.012047   2.378  0.01777 *  
## ZN            0.024776   0.012331   2.009  0.04507 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.226 on 492 degrees of freedom
## Multiple R-squared:  0.7936, Adjusted R-squared:  0.7881 
## F-statistic: 145.5 on 13 and 492 DF,  p-value: < 2.2e-16

model 7 제곱항

fo1=paste(names(df1)[-1],sep="",collapse="+")
fo2=paste(paste("I(",names(df1)[-1],"^2)",sep=""),sep="",collapse = "+")
full.fo = paste("MEDV","~",paste(fo1,"+",fo2,sep=""),sep = "")
formula = as.formula(full.fo)
lmlm=lm(formula,data = df1)
summary(lmlm)
## 
## Call:
## lm(formula = formula, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.9503  -2.1642  -0.2243   1.7648  24.9448 
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.619e+02  2.076e+01   7.799 3.92e-14 ***
## CRIM         -3.709e-01  8.343e-02  -4.446 1.09e-05 ***
## ZN           -5.703e-02  3.123e-02  -1.826 0.068434 .  
## INDUS        -1.089e-01  1.956e-01  -0.557 0.578055    
## CHAS          2.618e+00  7.136e-01   3.668 0.000271 ***
## NOX          -2.284e+01  2.461e+01  -0.928 0.354033    
## RM           -1.917e+01  2.751e+00  -6.968 1.07e-11 ***
## AGE          -9.307e-03  3.680e-02  -0.253 0.800471    
## DIS          -2.721e+00  5.602e-01  -4.857 1.62e-06 ***
## RAD           5.721e-01  2.073e-01   2.760 0.006008 ** 
## TAX          -2.706e-02  1.442e-02  -1.876 0.061238 .  
## PTRATIO      -5.338e+00  1.650e+00  -3.236 0.001298 ** 
## B             2.347e-02  1.087e-02   2.159 0.031345 *  
## LSTAT        -1.333e+00  1.261e-01 -10.569  < 2e-16 ***
## I(CRIM^2)     2.955e-03  1.076e-03   2.747 0.006236 ** 
## I(ZN^2)       7.141e-04  3.369e-04   2.120 0.034545 *  
## I(INDUS^2)    5.503e-03  7.474e-03   0.736 0.461948    
## I(CHAS^2)            NA         NA      NA       NA    
## I(NOX^2)     -4.543e-01  1.805e+01  -0.025 0.979935    
## I(RM^2)       1.740e+00  2.153e-01   8.082 5.21e-15 ***
## I(AGE^2)      6.689e-05  3.082e-04   0.217 0.828279    
## I(DIS^2)      1.464e-01  4.630e-02   3.161 0.001671 ** 
## I(RAD^2)     -7.724e-03  8.236e-03  -0.938 0.348756    
## I(TAX^2)      1.662e-05  1.904e-05   0.873 0.383298    
## I(PTRATIO^2)  1.291e-01  4.662e-02   2.770 0.005819 ** 
## I(B^2)       -3.919e-05  2.387e-05  -1.642 0.101315    
## I(LSTAT^2)    2.295e-02  3.415e-03   6.720 5.17e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.813 on 480 degrees of freedom
## Multiple R-squared:  0.8361, Adjusted R-squared:  0.8276 
## F-statistic: 97.94 on 25 and 480 DF,  p-value: < 2.2e-16

model 8 교호작용

fo1=paste(names(df1)[-1],sep="",collapse="+")
fo2=paste(paste("I(",names(df1)[-1],"^2)",sep=""),sep="",collapse = "+")
fo3=paste("(",paste(names(df1)[-1],collapse = "+"),")^2",sep="")
full.fo = paste("MEDV","~",paste(fo1,fo2,fo3,sep="+"),sep = "")
formula1 = as.formula(full.fo)
lmlm1=lm(formula1,data = df1)
summary(lmlm1)
## 
## Call:
## lm(formula = formula1, data = df1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.939 -1.363 -0.062  1.252 18.284 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.980e+02  8.296e+01  -2.387 0.017449 *  
## CRIM          -4.234e+00  6.716e+00  -0.631 0.528722    
## ZN             1.929e-01  4.682e-01   0.412 0.680459    
## INDUS         -4.247e+00  1.849e+00  -2.297 0.022107 *  
## CHAS           5.665e+01  1.958e+01   2.894 0.004017 ** 
## NOX            1.352e+02  1.255e+02   1.077 0.281931    
## RM             2.452e+01  7.691e+00   3.188 0.001543 ** 
## AGE            9.115e-01  2.818e-01   3.235 0.001319 ** 
## DIS           -7.117e+00  4.891e+00  -1.455 0.146468    
## RAD            2.159e+00  2.470e+00   0.874 0.382473    
## TAX            2.126e-02  1.381e-01   0.154 0.877752    
## PTRATIO        7.642e+00  3.891e+00   1.964 0.050223 .  
## B              9.474e-02  7.262e-02   1.305 0.192770    
## LSTAT          8.412e-01  8.709e-01   0.966 0.334717    
## I(CRIM^2)      1.459e-03  1.220e-03   1.195 0.232637    
## I(ZN^2)       -4.078e-04  5.990e-04  -0.681 0.496379    
## I(INDUS^2)     3.820e-02  1.514e-02   2.523 0.012029 *  
## I(CHAS^2)             NA         NA      NA       NA    
## I(NOX^2)      -3.100e+01  6.410e+01  -0.484 0.628987    
## I(RM^2)        3.361e-01  2.668e-01   1.260 0.208433    
## I(AGE^2)       3.050e-05  4.076e-04   0.075 0.940390    
## I(DIS^2)       4.187e-01  9.915e-02   4.223 2.99e-05 ***
## I(RAD^2)      -1.044e-01  4.139e-02  -2.522 0.012045 *  
## I(TAX^2)      -8.626e-05  4.905e-05  -1.759 0.079402 .  
## I(PTRATIO^2)   2.939e-03  5.829e-02   0.050 0.959806    
## I(B^2)        -4.059e-05  2.191e-05  -1.852 0.064724 .  
## I(LSTAT^2)     1.443e-02  4.824e-03   2.992 0.002946 ** 
## CRIM:ZN        2.434e-01  1.890e-01   1.288 0.198509    
## CRIM:INDUS     3.944e-01  4.562e-01   0.865 0.387795    
## CRIM:CHAS      2.463e+00  5.463e-01   4.509 8.57e-06 ***
## CRIM:NOX      -6.452e-01  8.900e-01  -0.725 0.468906    
## CRIM:RM        1.322e-01  5.918e-02   2.234 0.026018 *  
## CRIM:AGE      -2.861e-03  3.693e-03  -0.775 0.439032    
## CRIM:DIS      -9.542e-02  1.045e-01  -0.913 0.361883    
## CRIM:RAD       2.075e-01  6.045e-01   0.343 0.731660    
## CRIM:TAX      -2.329e-02  4.443e-02  -0.524 0.600476    
## CRIM:PTRATIO   3.516e-01  3.225e-01   1.090 0.276206    
## CRIM:B        -3.164e-04  1.802e-04  -1.756 0.079777 .  
## CRIM:LSTAT     1.964e-02  6.599e-03   2.976 0.003099 ** 
## ZN:INDUS      -5.690e-03  4.927e-03  -1.155 0.248848    
## ZN:CHAS       -5.486e-02  6.224e-02  -0.881 0.378643    
## ZN:NOX        -1.092e+00  6.038e-01  -1.808 0.071347 .  
## ZN:RM          8.191e-03  2.641e-02   0.310 0.756660    
## ZN:AGE         2.834e-04  8.527e-04   0.332 0.739806    
## ZN:DIS        -1.146e-02  9.416e-03  -1.217 0.224440    
## ZN:RAD        -5.849e-03  6.789e-03  -0.862 0.389463    
## ZN:TAX         6.363e-04  1.991e-04   3.196 0.001502 ** 
## ZN:PTRATIO    -6.886e-03  7.312e-03  -0.942 0.346844    
## ZN:B           8.235e-04  7.554e-04   1.090 0.276269    
## ZN:LSTAT      -4.973e-03  4.614e-03  -1.078 0.281743    
## INDUS:CHAS     7.106e-02  3.844e-01   0.185 0.853439    
## INDUS:NOX      8.912e-01  2.019e+00   0.441 0.659104    
## INDUS:RM       2.897e-01  1.323e-01   2.189 0.029149 *  
## INDUS:AGE      3.162e-03  3.651e-03   0.866 0.387022    
## INDUS:DIS      1.204e-01  7.986e-02   1.508 0.132342    
## INDUS:RAD     -3.185e-02  6.804e-02  -0.468 0.640020    
## INDUS:TAX      6.489e-04  9.492e-04   0.684 0.494565    
## INDUS:PTRATIO -1.941e-02  4.005e-02  -0.485 0.628254    
## INDUS:B        2.464e-03  1.989e-03   1.239 0.216001    
## INDUS:LSTAT   -1.094e-02  1.472e-02  -0.743 0.457961    
## CHAS:NOX      -3.381e+01  1.182e+01  -2.861 0.004446 ** 
## CHAS:RM       -5.328e+00  1.190e+00  -4.478 9.82e-06 ***
## CHAS:AGE       1.754e-02  5.639e-02   0.311 0.755911    
## CHAS:DIS       1.156e+00  1.279e+00   0.904 0.366407    
## CHAS:RAD      -8.057e-03  5.724e-01  -0.014 0.988777    
## CHAS:TAX       3.001e-04  3.666e-02   0.008 0.993472    
## CHAS:PTRATIO  -8.807e-01  6.649e-01  -1.324 0.186094    
## CHAS:B         1.949e-02  1.485e-02   1.313 0.190097    
## CHAS:LSTAT    -2.312e-01  1.777e-01  -1.301 0.194071    
## NOX:RM         3.753e-01  5.545e+00   0.068 0.946065    
## NOX:AGE       -4.591e-01  2.695e-01  -1.704 0.089225 .  
## NOX:DIS        1.338e+01  5.471e+00   2.446 0.014874 *  
## NOX:RAD       -2.099e+00  1.857e+00  -1.130 0.259158    
## NOX:TAX        1.863e-01  1.364e-01   1.366 0.172733    
## NOX:PTRATIO   -1.050e+01  3.929e+00  -2.672 0.007844 ** 
## NOX:B         -1.991e-02  3.468e-02  -0.574 0.566200    
## NOX:LSTAT      9.573e-01  6.524e-01   1.467 0.143071    
## RM:AGE        -5.930e-02  2.116e-02  -2.803 0.005307 ** 
## RM:DIS         1.337e-01  3.177e-01   0.421 0.674013    
## RM:RAD        -1.458e-02  1.481e-01  -0.098 0.921645    
## RM:TAX        -1.885e-02  9.708e-03  -1.941 0.052901 .  
## RM:PTRATIO    -6.804e-01  2.354e-01  -2.890 0.004057 ** 
## RM:B          -3.489e-03  3.232e-03  -1.079 0.281035    
## RM:LSTAT      -1.761e-01  5.655e-02  -3.113 0.001983 ** 
## AGE:DIS       -7.814e-03  9.134e-03  -0.855 0.392798    
## AGE:RAD        1.531e-02  4.069e-03   3.763 0.000193 ***
## AGE:TAX       -5.478e-04  2.135e-04  -2.565 0.010671 *  
## AGE:PTRATIO    2.431e-03  7.086e-03   0.343 0.731790    
## AGE:B         -6.595e-04  2.060e-04  -3.201 0.001477 ** 
## AGE:LSTAT     -7.581e-03  1.916e-03  -3.957 8.97e-05 ***
## DIS:RAD       -8.641e-02  6.936e-02  -1.246 0.213515    
## DIS:TAX       -5.018e-03  2.614e-03  -1.919 0.055629 .  
## DIS:PTRATIO   -1.499e-01  9.761e-02  -1.535 0.125504    
## DIS:B         -4.126e-03  5.491e-03  -0.752 0.452768    
## DIS:LSTAT      8.867e-02  4.791e-02   1.851 0.064964 .  
## RAD:TAX        7.165e-03  2.787e-03   2.571 0.010488 *  
## RAD:PTRATIO   -1.087e-01  9.045e-02  -1.202 0.230057    
## RAD:B          2.087e-04  2.419e-03   0.086 0.931307    
## RAD:LSTAT     -2.684e-02  1.769e-02  -1.518 0.129871    
## TAX:PTRATIO    5.606e-03  2.528e-03   2.217 0.027174 *  
## TAX:B         -9.436e-05  1.943e-04  -0.486 0.627496    
## TAX:LSTAT     -6.082e-04  1.183e-03  -0.514 0.607590    
## PTRATIO:B      2.675e-03  3.271e-03   0.818 0.413914    
## PTRATIO:LSTAT -5.257e-03  2.958e-02  -0.178 0.859032    
## B:LSTAT       -4.646e-04  4.039e-04  -1.150 0.250732    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.675 on 402 degrees of freedom
## Multiple R-squared:  0.9324, Adjusted R-squared:  0.9151 
## F-statistic: 53.85 on 103 and 402 DF,  p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(lmlm1)