df=read.csv("C:/Users/Soyeon/Downloads/boston_csv.csv")
df[which(df[,"TRACT"] %in% c(2042,2084,3585,3823,905,911,923,1805)),"MEDV"]=c(22.1,24.2,33,27,8.2,14.8,14.4,19)
df=subset(df,select=-CMEDV)
df1 = df[,-c(1:6)]
nullmodel = lm(MEDV ~ 1, data=df1)
data scale
df_tmp = preProcess(df1, c('center','scale'))
df_scale = predict(df_tmp,df1)
fit.lm.scale = lm(MEDV~.,data=df_scale)
summary(fit.lm.scale)
##
## Call:
## lm(formula = MEDV ~ ., data = df_scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.69514 -0.29304 -0.05828 0.20089 2.84594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.805e-16 2.277e-02 0.000 1.000000
## CRIM -9.948e-02 3.051e-02 -3.261 0.001189 **
## ZN 1.212e-01 3.455e-02 3.508 0.000493 ***
## INDUS 1.737e-02 4.553e-02 0.382 0.702970
## CHAS 7.446e-02 2.362e-02 3.152 0.001718 **
## NOX -2.239e-01 4.777e-02 -4.687 3.59e-06 ***
## RM 2.900e-01 3.169e-02 9.149 < 2e-16 ***
## AGE 1.762e-03 4.013e-02 0.044 0.964989
## DIS -3.444e-01 4.533e-02 -7.598 1.53e-13 ***
## RAD 2.880e-01 6.235e-02 4.620 4.91e-06 ***
## TAX -2.332e-01 6.840e-02 -3.409 0.000706 ***
## PTRATIO -2.178e-01 3.057e-02 -7.126 3.70e-12 ***
## B 9.175e-02 2.647e-02 3.467 0.000573 ***
## LSTAT -4.127e-01 3.909e-02 -10.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5122 on 492 degrees of freedom
## Multiple R-squared: 0.7444, Adjusted R-squared: 0.7377
## F-statistic: 110.2 on 13 and 492 DF, p-value: < 2.2e-16
data origin
fit.lm=lm(MEDV~.,data=df1)
summary(fit.lm)
##
## Call:
## lm(formula = MEDV ~ ., data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5651 -2.6908 -0.5352 1.8446 26.1319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.637e+01 5.058e+00 7.191 2.40e-12 ***
## CRIM -1.062e-01 3.257e-02 -3.261 0.001189 **
## ZN 4.772e-02 1.360e-02 3.508 0.000493 ***
## INDUS 2.325e-02 6.094e-02 0.382 0.702970
## CHAS 2.692e+00 8.539e-01 3.152 0.001718 **
## NOX -1.774e+01 3.785e+00 -4.687 3.59e-06 ***
## RM 3.789e+00 4.142e-01 9.149 < 2e-16 ***
## AGE 5.749e-04 1.309e-02 0.044 0.964989
## DIS -1.502e+00 1.977e-01 -7.598 1.53e-13 ***
## RAD 3.038e-01 6.575e-02 4.620 4.91e-06 ***
## TAX -1.270e-02 3.727e-03 -3.409 0.000706 ***
## PTRATIO -9.239e-01 1.297e-01 -7.126 3.70e-12 ***
## B 9.228e-03 2.662e-03 3.467 0.000573 ***
## LSTAT -5.307e-01 5.026e-02 -10.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.703 on 492 degrees of freedom
## Multiple R-squared: 0.7444, Adjusted R-squared: 0.7377
## F-statistic: 110.2 on 13 and 492 DF, p-value: < 2.2e-16
stepwise : model1 (ols)
fit.lm=lm(MEDV~.,data=df1)
#summary(fit.lm)
t1=summary(stepAIC(fit.lm, direction = 'both',trace=F)) ; t1$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.244826574 5.022209186 7.216909 2.017013e-12
## CRIM -0.106656728 0.032486529 -3.283106 1.099454e-03
## ZN 0.047098851 0.013401831 3.514359 4.813561e-04
## CHAS 2.727209191 0.846606352 3.221343 1.360054e-03
## NOX -17.316822520 3.503652199 -4.942506 1.058023e-06
## RM 3.778662417 0.402685075 9.383666 2.317178e-19
## DIS -1.520269827 0.184071092 -8.259145 1.354236e-15
## RAD 0.296555180 0.062835544 4.719545 3.082374e-06
## TAX -0.012076874 0.003342197 -3.613453 3.330966e-04
## PTRATIO -0.917035153 0.127912290 -7.169250 2.766814e-12
## B 0.009202182 0.002650011 3.472507 5.608988e-04
## LSTAT -0.528440595 0.047000576 -11.243279 2.855042e-26
f1=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm), direction = 'forward',trace=F)) ; f1$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.244826574 5.022209186 7.216909 2.017013e-12
## LSTAT -0.528440595 0.047000576 -11.243279 2.855042e-26
## RM 3.778662417 0.402685075 9.383666 2.317178e-19
## PTRATIO -0.917035153 0.127912290 -7.169250 2.766814e-12
## DIS -1.520269827 0.184071092 -8.259145 1.354236e-15
## NOX -17.316822520 3.503652199 -4.942506 1.058023e-06
## CHAS 2.727209191 0.846606352 3.221343 1.360054e-03
## B 0.009202182 0.002650011 3.472507 5.608988e-04
## ZN 0.047098851 0.013401831 3.514359 4.813561e-04
## CRIM -0.106656728 0.032486529 -3.283106 1.099454e-03
## RAD 0.296555180 0.062835544 4.719545 3.082374e-06
## TAX -0.012076874 0.003342197 -3.613453 3.330966e-04
b1=summary(stepAIC(fit.lm, direction = 'backward',trace=F)); b1$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.244826574 5.022209186 7.216909 2.017013e-12
## CRIM -0.106656728 0.032486529 -3.283106 1.099454e-03
## ZN 0.047098851 0.013401831 3.514359 4.813561e-04
## CHAS 2.727209191 0.846606352 3.221343 1.360054e-03
## NOX -17.316822520 3.503652199 -4.942506 1.058023e-06
## RM 3.778662417 0.402685075 9.383666 2.317178e-19
## DIS -1.520269827 0.184071092 -8.259145 1.354236e-15
## RAD 0.296555180 0.062835544 4.719545 3.082374e-06
## TAX -0.012076874 0.003342197 -3.613453 3.330966e-04
## PTRATIO -0.917035153 0.127912290 -7.169250 2.766814e-12
## B 0.009202182 0.002650011 3.472507 5.608988e-04
## LSTAT -0.528440595 0.047000576 -11.243279 2.855042e-26
r square
c(t1$r.squared, f1$r.squared, b1$r.squared)
## [1] 0.7443687 0.7443687 0.7443687
c(t1$adj.r.squared, f1$adj.r.squared, b1$adj.r.squared)
## [1] 0.7386765 0.7386765 0.7386765
stepwise : model2 (add LSTAT^2)
fit.lm2=lm(MEDV~.+I(LSTAT^2),data=df1)
#summary(fit.lm2)
t2=summary(stepAIC(fit.lm2, direction = 'both',trace=F));t2$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.281387213 4.584166806 9.441495 1.472732e-19
## CRIM -0.149793391 0.029520211 -5.074266 5.523877e-07
## ZN 0.024775632 0.012331333 2.009161 4.506576e-02
## CHAS 2.496108282 0.763562609 3.269029 1.154721e-03
## NOX -15.462505998 3.286001115 -4.705569 3.294921e-06
## RM 2.991564562 0.377628800 7.921971 1.571532e-14
## AGE 0.028651721 0.012047041 2.378320 1.777254e-02
## DIS -1.251889943 0.175362990 -7.138849 3.398511e-12
## RAD 0.278106268 0.056780683 4.897903 1.316302e-06
## TAX -0.009969506 0.003017291 -3.304125 1.022190e-03
## PTRATIO -0.774970710 0.116277777 -6.664822 7.123253e-11
## B 0.007652755 0.002395118 3.195148 1.487328e-03
## LSTAT -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2) 0.034877783 0.003219567 10.833067 1.135152e-24
f2=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm2), direction = 'forward',trace=F));f2$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.281387213 4.584166806 9.441495 1.472732e-19
## LSTAT -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2) 0.034877783 0.003219567 10.833067 1.135152e-24
## RM 2.991564562 0.377628800 7.921971 1.571532e-14
## PTRATIO -0.774970710 0.116277777 -6.664822 7.123253e-11
## DIS -1.251889943 0.175362990 -7.138849 3.398511e-12
## CRIM -0.149793391 0.029520211 -5.074266 5.523877e-07
## NOX -15.462505998 3.286001115 -4.705569 3.294921e-06
## CHAS 2.496108282 0.763562609 3.269029 1.154721e-03
## RAD 0.278106268 0.056780683 4.897903 1.316302e-06
## B 0.007652755 0.002395118 3.195148 1.487328e-03
## TAX -0.009969506 0.003017291 -3.304125 1.022190e-03
## AGE 0.028651721 0.012047041 2.378320 1.777254e-02
## ZN 0.024775632 0.012331333 2.009161 4.506576e-02
b2=summary(stepAIC(fit.lm2, direction = 'backward',trace=F));b2$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.281387213 4.584166806 9.441495 1.472732e-19
## CRIM -0.149793391 0.029520211 -5.074266 5.523877e-07
## ZN 0.024775632 0.012331333 2.009161 4.506576e-02
## CHAS 2.496108282 0.763562609 3.269029 1.154721e-03
## NOX -15.462505998 3.286001115 -4.705569 3.294921e-06
## RM 2.991564562 0.377628800 7.921971 1.571532e-14
## AGE 0.028651721 0.012047041 2.378320 1.777254e-02
## DIS -1.251889943 0.175362990 -7.138849 3.398511e-12
## RAD 0.278106268 0.056780683 4.897903 1.316302e-06
## TAX -0.009969506 0.003017291 -3.304125 1.022190e-03
## PTRATIO -0.774970710 0.116277777 -6.664822 7.123253e-11
## B 0.007652755 0.002395118 3.195148 1.487328e-03
## LSTAT -1.776253363 0.123611954 -14.369592 2.393569e-39
## I(LSTAT^2) 0.034877783 0.003219567 10.833067 1.135152e-24
r square
c(t2$r.squared, f2$r.squared, b2$r.squared)
## [1] 0.7936014 0.7936014 0.7936014
c(t2$adj.r.squared, f2$adj.r.squared, b2$adj.r.squared)
## [1] 0.7881478 0.7881478 0.7881478
stepwise : model3 (add LSTAT^2 + log(crim))
fit.lm3=lm(MEDV~log(CRIM)+ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT+I(LSTAT^2),data=df1)
#summary(fit.lm3)
t3=summary(stepAIC(fit.lm3, direction = 'both',trace=F));t3$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.318267964 4.681058797 8.826693 1.884265e-17
## ZN 0.020586380 0.012608700 1.632712 1.031680e-01
## CHAS 2.718514621 0.781202804 3.479909 5.460849e-04
## NOX -14.394384046 3.360552041 -4.283339 2.214117e-05
## RM 3.083966925 0.386540855 7.978372 1.045983e-14
## AGE 0.026682962 0.012339309 2.162436 3.106484e-02
## DIS -1.171279322 0.178971707 -6.544494 1.499287e-10
## RAD 0.196847671 0.055826277 3.526076 4.611389e-04
## TAX -0.009629406 0.003091333 -3.114969 1.946774e-03
## PTRATIO -0.768868115 0.119154190 -6.452716 2.629435e-10
## B 0.009148468 0.002435839 3.755777 1.934393e-04
## LSTAT -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2) 0.032674358 0.003269240 9.994481 1.536390e-21
f3=summary(stepAIC(nullmodel, scope = list(lower = nullmodel, upper = fit.lm3), direction = 'forward',trace=F));f3$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.318267964 4.681058797 8.826693 1.884265e-17
## LSTAT -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2) 0.032674358 0.003269240 9.994481 1.536390e-21
## RM 3.083966925 0.386540855 7.978372 1.045983e-14
## PTRATIO -0.768868115 0.119154190 -6.452716 2.629435e-10
## DIS -1.171279322 0.178971707 -6.544494 1.499287e-10
## NOX -14.394384046 3.360552041 -4.283339 2.214117e-05
## CHAS 2.718514621 0.781202804 3.479909 5.460849e-04
## B 0.009148468 0.002435839 3.755777 1.934393e-04
## AGE 0.026682962 0.012339309 2.162436 3.106484e-02
## RAD 0.196847671 0.055826277 3.526076 4.611389e-04
## TAX -0.009629406 0.003091333 -3.114969 1.946774e-03
## ZN 0.020586380 0.012608700 1.632712 1.031680e-01
b3=summary(stepAIC(fit.lm3, direction = 'backward',trace=F));b3$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.318267964 4.681058797 8.826693 1.884265e-17
## ZN 0.020586380 0.012608700 1.632712 1.031680e-01
## CHAS 2.718514621 0.781202804 3.479909 5.460849e-04
## NOX -14.394384046 3.360552041 -4.283339 2.214117e-05
## RM 3.083966925 0.386540855 7.978372 1.045983e-14
## AGE 0.026682962 0.012339309 2.162436 3.106484e-02
## DIS -1.171279322 0.178971707 -6.544494 1.499287e-10
## RAD 0.196847671 0.055826277 3.526076 4.611389e-04
## TAX -0.009629406 0.003091333 -3.114969 1.946774e-03
## PTRATIO -0.768868115 0.119154190 -6.452716 2.629435e-10
## B 0.009148468 0.002435839 3.755777 1.934393e-04
## LSTAT -1.731530675 0.126354161 -13.703788 1.910336e-36
## I(LSTAT^2) 0.032674358 0.003269240 9.994481 1.536390e-21
r square
c(t3$r.squared, f3$r.squared, b3$r.squared)
## [1] 0.7827998 0.7827998 0.7827998
c(t3$adj.r.squared, f3$adj.r.squared, b3$adj.r.squared)
## [1] 0.777513 0.777513 0.777513
model4 (only RM+PTRATIO+LSTAT)
fit.lm4=lm(MEDV~RM+PTRATIO+LSTAT,data=df1)
r square
summary(fit.lm4)$r.squared ; summary(fit.lm4)$adj.r.squared
## [1] 0.6811822
## [1] 0.6792769