df=read.csv("C:/Users/Soyeon/Downloads/boston_csv.csv")
df[which(df[,"TRACT"] %in%  c(2042,2084,3585,3823,905,911,923,1805)),"MEDV"]=c(22.1,24.2,33,27,8.2,14.8,14.4,19)
df=subset(df,select=-CMEDV)
df1 = df[,-c(1:6)]
nullmodel = lm(MEDV ~ 1, data=df1)

data scale

df_tmp = preProcess(df1, c('center','scale'))
df_scale = predict(df_tmp,df1)

fit.lm.scale = lm(MEDV~.,data=df_scale)
summary(fit.lm.scale)
## 
## Call:
## lm(formula = MEDV ~ ., data = df_scale)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.69514 -0.29304 -0.05828  0.20089  2.84594 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.805e-16  2.277e-02   0.000 1.000000    
## CRIM        -9.948e-02  3.051e-02  -3.261 0.001189 ** 
## ZN           1.212e-01  3.455e-02   3.508 0.000493 ***
## INDUS        1.737e-02  4.553e-02   0.382 0.702970    
## CHAS         7.446e-02  2.362e-02   3.152 0.001718 ** 
## NOX         -2.239e-01  4.777e-02  -4.687 3.59e-06 ***
## RM           2.900e-01  3.169e-02   9.149  < 2e-16 ***
## AGE          1.762e-03  4.013e-02   0.044 0.964989    
## DIS         -3.444e-01  4.533e-02  -7.598 1.53e-13 ***
## RAD          2.880e-01  6.235e-02   4.620 4.91e-06 ***
## TAX         -2.332e-01  6.840e-02  -3.409 0.000706 ***
## PTRATIO     -2.178e-01  3.057e-02  -7.126 3.70e-12 ***
## B            9.175e-02  2.647e-02   3.467 0.000573 ***
## LSTAT       -4.127e-01  3.909e-02 -10.558  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5122 on 492 degrees of freedom
## Multiple R-squared:  0.7444, Adjusted R-squared:  0.7377 
## F-statistic: 110.2 on 13 and 492 DF,  p-value: < 2.2e-16

data origin

fit.lm=lm(MEDV~.,data=df1)
summary(fit.lm)
## 
## Call:
## lm(formula = MEDV ~ ., data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5651  -2.6908  -0.5352   1.8446  26.1319 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.637e+01  5.058e+00   7.191 2.40e-12 ***
## CRIM        -1.062e-01  3.257e-02  -3.261 0.001189 ** 
## ZN           4.772e-02  1.360e-02   3.508 0.000493 ***
## INDUS        2.325e-02  6.094e-02   0.382 0.702970    
## CHAS         2.692e+00  8.539e-01   3.152 0.001718 ** 
## NOX         -1.774e+01  3.785e+00  -4.687 3.59e-06 ***
## RM           3.789e+00  4.142e-01   9.149  < 2e-16 ***
## AGE          5.749e-04  1.309e-02   0.044 0.964989    
## DIS         -1.502e+00  1.977e-01  -7.598 1.53e-13 ***
## RAD          3.038e-01  6.575e-02   4.620 4.91e-06 ***
## TAX         -1.270e-02  3.727e-03  -3.409 0.000706 ***
## PTRATIO     -9.239e-01  1.297e-01  -7.126 3.70e-12 ***
## B            9.228e-03  2.662e-03   3.467 0.000573 ***
## LSTAT       -5.307e-01  5.026e-02 -10.558  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.703 on 492 degrees of freedom
## Multiple R-squared:  0.7444, Adjusted R-squared:  0.7377 
## F-statistic: 110.2 on 13 and 492 DF,  p-value: < 2.2e-16

stepwise : model1 (ols)

fit.lm=lm(MEDV~.,data=df1)
#summary(fit.lm)

t1=summary(stepAIC(fit.lm, direction = 'both',trace=F)) ; t1$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  36.244826574 5.022209186   7.216909 2.017013e-12
## CRIM         -0.106656728 0.032486529  -3.283106 1.099454e-03
## ZN            0.047098851 0.013401831   3.514359 4.813561e-04
## CHAS          2.727209191 0.846606352   3.221343 1.360054e-03
## NOX         -17.316822520 3.503652199  -4.942506 1.058023e-06
## RM            3.778662417 0.402685075   9.383666 2.317178e-19
## DIS          -1.520269827 0.184071092  -8.259145 1.354236e-15
## RAD           0.296555180 0.062835544   4.719545 3.082374e-06
## TAX          -0.012076874 0.003342197  -3.613453 3.330966e-04
## PTRATIO      -0.917035153 0.127912290  -7.169250 2.766814e-12
## B             0.009202182 0.002650011   3.472507 5.608988e-04
## LSTAT        -0.528440595 0.047000576 -11.243279 2.855042e-26
f1=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm), direction = 'forward',trace=F)) ; f1$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  36.244826574 5.022209186   7.216909 2.017013e-12
## LSTAT        -0.528440595 0.047000576 -11.243279 2.855042e-26
## RM            3.778662417 0.402685075   9.383666 2.317178e-19
## PTRATIO      -0.917035153 0.127912290  -7.169250 2.766814e-12
## DIS          -1.520269827 0.184071092  -8.259145 1.354236e-15
## NOX         -17.316822520 3.503652199  -4.942506 1.058023e-06
## CHAS          2.727209191 0.846606352   3.221343 1.360054e-03
## B             0.009202182 0.002650011   3.472507 5.608988e-04
## ZN            0.047098851 0.013401831   3.514359 4.813561e-04
## CRIM         -0.106656728 0.032486529  -3.283106 1.099454e-03
## RAD           0.296555180 0.062835544   4.719545 3.082374e-06
## TAX          -0.012076874 0.003342197  -3.613453 3.330966e-04
b1=summary(stepAIC(fit.lm, direction = 'backward',trace=F)); b1$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  36.244826574 5.022209186   7.216909 2.017013e-12
## CRIM         -0.106656728 0.032486529  -3.283106 1.099454e-03
## ZN            0.047098851 0.013401831   3.514359 4.813561e-04
## CHAS          2.727209191 0.846606352   3.221343 1.360054e-03
## NOX         -17.316822520 3.503652199  -4.942506 1.058023e-06
## RM            3.778662417 0.402685075   9.383666 2.317178e-19
## DIS          -1.520269827 0.184071092  -8.259145 1.354236e-15
## RAD           0.296555180 0.062835544   4.719545 3.082374e-06
## TAX          -0.012076874 0.003342197  -3.613453 3.330966e-04
## PTRATIO      -0.917035153 0.127912290  -7.169250 2.766814e-12
## B             0.009202182 0.002650011   3.472507 5.608988e-04
## LSTAT        -0.528440595 0.047000576 -11.243279 2.855042e-26

r square

c(t1$r.squared, f1$r.squared, b1$r.squared)
## [1] 0.7443687 0.7443687 0.7443687
c(t1$adj.r.squared, f1$adj.r.squared, b1$adj.r.squared)
## [1] 0.7386765 0.7386765 0.7386765

stepwise : model2 (add LSTAT^2)

fit.lm2=lm(MEDV~.+I(LSTAT^2),data=df1)
#summary(fit.lm2)

t2=summary(stepAIC(fit.lm2, direction = 'both',trace=F));t2$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  43.281387213 4.584166806   9.441495 1.472732e-19
## CRIM         -0.149793391 0.029520211  -5.074266 5.523877e-07
## ZN            0.024775632 0.012331333   2.009161 4.506576e-02
## CHAS          2.496108282 0.763562609   3.269029 1.154721e-03
## NOX         -15.462505998 3.286001115  -4.705569 3.294921e-06
## RM            2.991564562 0.377628800   7.921971 1.571532e-14
## AGE           0.028651721 0.012047041   2.378320 1.777254e-02
## DIS          -1.251889943 0.175362990  -7.138849 3.398511e-12
## RAD           0.278106268 0.056780683   4.897903 1.316302e-06
## TAX          -0.009969506 0.003017291  -3.304125 1.022190e-03
## PTRATIO      -0.774970710 0.116277777  -6.664822 7.123253e-11
## B             0.007652755 0.002395118   3.195148 1.487328e-03
## LSTAT        -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2)    0.034877783 0.003219567  10.833067 1.135152e-24
f2=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm2), direction = 'forward',trace=F));f2$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  43.281387213 4.584166806   9.441495 1.472732e-19
## LSTAT        -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2)    0.034877783 0.003219567  10.833067 1.135152e-24
## RM            2.991564562 0.377628800   7.921971 1.571532e-14
## PTRATIO      -0.774970710 0.116277777  -6.664822 7.123253e-11
## DIS          -1.251889943 0.175362990  -7.138849 3.398511e-12
## CRIM         -0.149793391 0.029520211  -5.074266 5.523877e-07
## NOX         -15.462505998 3.286001115  -4.705569 3.294921e-06
## CHAS          2.496108282 0.763562609   3.269029 1.154721e-03
## RAD           0.278106268 0.056780683   4.897903 1.316302e-06
## B             0.007652755 0.002395118   3.195148 1.487328e-03
## TAX          -0.009969506 0.003017291  -3.304125 1.022190e-03
## AGE           0.028651721 0.012047041   2.378320 1.777254e-02
## ZN            0.024775632 0.012331333   2.009161 4.506576e-02
b2=summary(stepAIC(fit.lm2, direction = 'backward',trace=F));b2$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  43.281387213 4.584166806   9.441495 1.472732e-19
## CRIM         -0.149793391 0.029520211  -5.074266 5.523877e-07
## ZN            0.024775632 0.012331333   2.009161 4.506576e-02
## CHAS          2.496108282 0.763562609   3.269029 1.154721e-03
## NOX         -15.462505998 3.286001115  -4.705569 3.294921e-06
## RM            2.991564562 0.377628800   7.921971 1.571532e-14
## AGE           0.028651721 0.012047041   2.378320 1.777254e-02
## DIS          -1.251889943 0.175362990  -7.138849 3.398511e-12
## RAD           0.278106268 0.056780683   4.897903 1.316302e-06
## TAX          -0.009969506 0.003017291  -3.304125 1.022190e-03
## PTRATIO      -0.774970710 0.116277777  -6.664822 7.123253e-11
## B             0.007652755 0.002395118   3.195148 1.487328e-03
## LSTAT        -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2)    0.034877783 0.003219567  10.833067 1.135152e-24

r square

c(t2$r.squared, f2$r.squared, b2$r.squared)
## [1] 0.7936014 0.7936014 0.7936014
c(t2$adj.r.squared, f2$adj.r.squared, b2$adj.r.squared)
## [1] 0.7881478 0.7881478 0.7881478

stepwise : model3 (add LSTAT^2 + log(crim))

fit.lm3=lm(MEDV~log(CRIM)+ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT+I(LSTAT^2),data=df1)
#summary(fit.lm3)

t3=summary(stepAIC(fit.lm3, direction = 'both',trace=F));t3$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  41.318267964 4.681058797   8.826693 1.884265e-17
## ZN            0.020586380 0.012608700   1.632712 1.031680e-01
## CHAS          2.718514621 0.781202804   3.479909 5.460849e-04
## NOX         -14.394384046 3.360552041  -4.283339 2.214117e-05
## RM            3.083966925 0.386540855   7.978372 1.045983e-14
## AGE           0.026682962 0.012339309   2.162436 3.106484e-02
## DIS          -1.171279322 0.178971707  -6.544494 1.499287e-10
## RAD           0.196847671 0.055826277   3.526076 4.611389e-04
## TAX          -0.009629406 0.003091333  -3.114969 1.946774e-03
## PTRATIO      -0.768868115 0.119154190  -6.452716 2.629435e-10
## B             0.009148468 0.002435839   3.755777 1.934393e-04
## LSTAT        -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2)    0.032674358 0.003269240   9.994481 1.536390e-21
f3=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm3), direction = 'forward',trace=F));f3$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  41.318267964 4.681058797   8.826693 1.884265e-17
## LSTAT        -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2)    0.032674358 0.003269240   9.994481 1.536390e-21
## RM            3.083966925 0.386540855   7.978372 1.045983e-14
## PTRATIO      -0.768868115 0.119154190  -6.452716 2.629435e-10
## DIS          -1.171279322 0.178971707  -6.544494 1.499287e-10
## NOX         -14.394384046 3.360552041  -4.283339 2.214117e-05
## CHAS          2.718514621 0.781202804   3.479909 5.460849e-04
## B             0.009148468 0.002435839   3.755777 1.934393e-04
## AGE           0.026682962 0.012339309   2.162436 3.106484e-02
## RAD           0.196847671 0.055826277   3.526076 4.611389e-04
## TAX          -0.009629406 0.003091333  -3.114969 1.946774e-03
## ZN            0.020586380 0.012608700   1.632712 1.031680e-01
b3=summary(stepAIC(fit.lm3, direction = 'backward',trace=F));b3$coef
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  41.318267964 4.681058797   8.826693 1.884265e-17
## ZN            0.020586380 0.012608700   1.632712 1.031680e-01
## CHAS          2.718514621 0.781202804   3.479909 5.460849e-04
## NOX         -14.394384046 3.360552041  -4.283339 2.214117e-05
## RM            3.083966925 0.386540855   7.978372 1.045983e-14
## AGE           0.026682962 0.012339309   2.162436 3.106484e-02
## DIS          -1.171279322 0.178971707  -6.544494 1.499287e-10
## RAD           0.196847671 0.055826277   3.526076 4.611389e-04
## TAX          -0.009629406 0.003091333  -3.114969 1.946774e-03
## PTRATIO      -0.768868115 0.119154190  -6.452716 2.629435e-10
## B             0.009148468 0.002435839   3.755777 1.934393e-04
## LSTAT        -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2)    0.032674358 0.003269240   9.994481 1.536390e-21

r square

c(t3$r.squared, f3$r.squared, b3$r.squared)
## [1] 0.7827998 0.7827998 0.7827998
c(t3$adj.r.squared, f3$adj.r.squared, b3$adj.r.squared)
## [1] 0.777513 0.777513 0.777513

model4 (only RM+PTRATIO+LSTAT)

fit.lm4=lm(MEDV~RM+PTRATIO+LSTAT,data=df1)

r square

summary(fit.lm4)$r.squared ; summary(fit.lm4)$adj.r.squared
## [1] 0.6811822
## [1] 0.6792769