# Load the Boston housing data and build the modelling data frame.
df <- read.csv("C:/Users/Soyeon/Downloads/boston_csv.csv")

# Overwrite MEDV for eight census tracts. Each replacement value is paired
# with its TRACT explicitly (via match()), so the result does not depend on
# the order in which those tracts happen to appear in the CSV.
df <- local({
  fix_tract <- c(2042, 2084, 3585, 3823, 905, 911, 923, 1805)
  fix_medv <- c(22.1, 24.2, 33, 27, 8.2, 14.8, 14.4, 19)
  # Fail loudly if any of the listed tracts is absent from the data.
  stopifnot(all(fix_tract %in% df[, "TRACT"]))
  hit <- match(df[, "TRACT"], fix_tract)
  df[!is.na(hit), "MEDV"] <- fix_medv[hit[!is.na(hit)]]
  df
})

# Drop the CMEDV column; only MEDV is used as the response below.
df <- subset(df, select = -CMEDV)

# Remove the first six columns by position.
# NOTE(review): presumably identifier/location columns (TOWN and TRACT are
# among the columns of df) -- confirm against the CSV header.
df1 <- df[, -(1:6)]

# Intercept-only baseline model for MEDV.
nullmodel <- lm(MEDV ~ 1, data = df1)
# Median MEDV by town ----
library(ggplot2)
# Bar chart of the median MEDV for each TOWN, with dashed reference lines at
# the overall median of MEDV (blue), half of it (black) and 1.5 times it (red).
# `fun` replaces the `fun.y` argument, which was deprecated in ggplot2 3.3.0.
ggplot(data = df, aes(x = as.factor(TOWN), y = MEDV)) +
  geom_bar(
    stat = "summary", fun = "median",
    fill = "slateblue", color = "darkslategrey"
  ) +
  # Rotate the town labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(
    yintercept = median(df$MEDV),
    linetype = "dashed", color = "blue"
  ) +
  geom_hline(
    yintercept = median(df$MEDV) * 0.5,
    linetype = "dashed", color = "black"
  ) +
  geom_hline(
    yintercept = median(df$MEDV) * 1.5,
    linetype = "dashed", color = "red"
  )

# Reference lines (thresholds based on the median of MEDV) ----
# Thresholds derived from the overall median of MEDV:
# v2 is the median itself, v1 is 1.5 times it and v3 is half of it.
v2 <- median(df$MEDV)
v1 <- v2 * 1.5
v3 <- v2 * 0.5
# Data split ----
# Build the four modelling subsets, selecting rows by MEDV threshold and
# dropping the first six columns by position in the same step.
# The ">=" subsets overlap: every row of data1 that also meets v2 is in data2,
# and so on. Only data3 (MEDV >= v3) and data4 (MEDV < v3) are complementary.
data1 <- df[df$MEDV >= v1, -(1:6)]
data2 <- df[df$MEDV >= v2, -(1:6)]
data3 <- df[df$MEDV >= v3, -(1:6)]
data4 <- df[df$MEDV < v3, -(1:6)]
# OLS (full models) ----
# Full OLS fit of MEDV on all remaining columns of data1.
summary(lm(formula = MEDV ~ ., data = data1))
##
## Call:
## lm(formula = MEDV ~ ., data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3784 -2.2160 -0.3298 1.9769 10.3782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.133917 32.088454 2.373 0.02119 *
## CRIM 2.845555 0.833171 3.415 0.00120 **
## ZN 0.023641 0.024143 0.979 0.33177
## INDUS 0.075276 0.203811 0.369 0.71329
## CHAS -1.350325 1.444603 -0.935 0.35401
## NOX -1.270119 14.814804 -0.086 0.93199
## RM 5.458100 0.879607 6.205 7.49e-08 ***
## AGE -0.023467 0.034458 -0.681 0.49872
## DIS -0.789209 0.448804 -1.758 0.08423 .
## RAD 0.312064 0.261931 1.191 0.23861
## TAX -0.023539 0.009794 -2.404 0.01964 *
## PTRATIO -0.507316 0.357298 -1.420 0.16129
## B -0.140430 0.074393 -1.888 0.06435 .
## LSTAT -1.095182 0.324146 -3.379 0.00134 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.7 on 55 degrees of freedom
## Multiple R-squared: 0.7653, Adjusted R-squared: 0.7098
## F-statistic: 13.79 on 13 and 55 DF, p-value: 6.28e-13
# Full OLS fit of MEDV on all remaining columns of data2.
summary(lm(formula = MEDV ~ ., data = data2))
##
## Call:
## lm(formula = MEDV ~ ., data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.8027 -2.8150 -0.7192 2.3930 23.5108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.733985 8.773119 2.819 0.005213 **
## CRIM 0.753561 0.319874 2.356 0.019284 *
## ZN 0.034051 0.017530 1.943 0.053240 .
## INDUS -0.021778 0.090497 -0.241 0.810036
## CHAS 1.713507 1.090644 1.571 0.117472
## NOX -8.049136 7.065586 -1.139 0.255750
## RM 5.510101 0.594500 9.268 < 2e-16 ***
## AGE -0.014084 0.018534 -0.760 0.448042
## DIS -1.361005 0.277668 -4.902 1.75e-06 ***
## RAD 0.397122 0.139524 2.846 0.004805 **
## TAX -0.017863 0.006378 -2.801 0.005515 **
## PTRATIO -0.700572 0.203118 -3.449 0.000664 ***
## B -0.005400 0.014698 -0.367 0.713633
## LSTAT -0.638199 0.103402 -6.172 2.83e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.94 on 241 degrees of freedom
## Multiple R-squared: 0.658, Adjusted R-squared: 0.6396
## F-statistic: 35.68 on 13 and 241 DF, p-value: < 2.2e-16
# Full OLS fit of MEDV on all remaining columns of data3.
summary(lm(formula = MEDV ~ ., data = data3))
##
## Call:
## lm(formula = MEDV ~ ., data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6285 -2.6702 -0.6423 1.5991 26.4182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.804166 5.208879 6.298 7.05e-10 ***
## CRIM -0.072140 0.070319 -1.026 0.305479
## ZN 0.046581 0.013618 3.421 0.000680 ***
## INDUS 0.026938 0.061075 0.441 0.659370
## CHAS 2.536719 0.851409 2.979 0.003040 **
## NOX -16.761981 3.838477 -4.367 1.56e-05 ***
## RM 4.157814 0.429872 9.672 < 2e-16 ***
## AGE -0.002288 0.013297 -0.172 0.863480
## DIS -1.489785 0.199238 -7.477 3.86e-13 ***
## RAD 0.313269 0.072307 4.332 1.81e-05 ***
## TAX -0.013086 0.003883 -3.370 0.000815 ***
## PTRATIO -0.891252 0.129545 -6.880 1.96e-11 ***
## B 0.009768 0.003194 3.058 0.002359 **
## LSTAT -0.520934 0.054225 -9.607 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.681 on 461 degrees of freedom
## Multiple R-squared: 0.7181, Adjusted R-squared: 0.7101
## F-statistic: 90.33 on 13 and 461 DF, p-value: < 2.2e-16
# Full OLS fit of MEDV on all remaining columns of data4.
summary(lm(formula = MEDV ~ ., data = data4))
##
## Call:
## lm(formula = MEDV ~ ., data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0999 -0.6267 -0.2893 0.9024 2.7954
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.784830 10.616189 1.581 0.1281
## CRIM -0.007291 0.017488 -0.417 0.6808
## ZN NA NA NA NA
## INDUS -0.046733 0.193220 -0.242 0.8111
## CHAS NA NA NA NA
## NOX 7.816987 12.789745 0.611 0.5473
## RM -1.110628 0.604163 -1.838 0.0796 .
## AGE -0.021139 0.038256 -0.553 0.5861
## DIS 0.535069 3.158936 0.169 0.8670
## RAD NA NA NA NA
## TAX NA NA NA NA
## PTRATIO NA NA NA NA
## B -0.001742 0.003417 -0.510 0.6152
## LSTAT -0.192668 0.089469 -2.153 0.0425 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.564 on 22 degrees of freedom
## Multiple R-squared: 0.2974, Adjusted R-squared: 0.04186
## F-statistic: 1.164 on 8 and 22 DF, p-value: 0.3633
# R-squared (full models) ----
# R-squared of the full OLS model fitted to each subset, in the order
# data1, data2, data3, data4.
vapply(
  list(data1, data2, data3, data4),
  function(d) summary(lm(MEDV ~ ., data = d))$r.squared,
  numeric(1)
)
## [1] 0.7652835 0.6580495 0.7180884 0.2973664
# Adjusted R-squared (full models) ----
# Adjusted R-squared of the full OLS model fitted to each subset, in the
# order data1, data2, data3, data4.
vapply(
  list(data1, data2, data3, data4),
  function(d) summary(lm(MEDV ~ ., data = d))$adj.r.squared,
  numeric(1)
)
## [1] 0.70980503 0.63960406 0.71013863 0.04186322
# Stepwise selection (direction = "both") ----
# Stepwise AIC selection (both directions) starting from the full OLS model on
# data1; d1 holds the summary of the selected model and is printed.
# stepAIC is namespace-qualified so the call does not rely on MASS being
# attached, and trace uses FALSE rather than the reassignable F.
d1 <- summary(
  MASS::stepAIC(lm(MEDV ~ ., data = data1), direction = "both", trace = FALSE)
)
d1
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + RM + DIS + RAD + TAX + PTRATIO +
## B + LSTAT, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6654 -2.2911 -0.2838 2.0756 10.2981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.166949 27.830680 2.845 0.006103 **
## CRIM 2.617029 0.737955 3.546 0.000773 ***
## ZN 0.028691 0.020397 1.407 0.164785
## RM 5.291905 0.837078 6.322 3.76e-08 ***
## DIS -0.656910 0.354964 -1.851 0.069230 .
## RAD 0.306250 0.226875 1.350 0.182216
## TAX -0.021362 0.008448 -2.529 0.014145 *
## PTRATIO -0.458703 0.290194 -1.581 0.119299
## B -0.154094 0.067374 -2.287 0.025791 *
## LSTAT -1.135433 0.261199 -4.347 5.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.612 on 59 degrees of freedom
## Multiple R-squared: 0.76, Adjusted R-squared: 0.7234
## F-statistic: 20.76 on 9 and 59 DF, p-value: 3.207e-15
# Stepwise AIC selection (both directions) starting from the full OLS model on
# data2; d2 holds the summary of the selected model and is printed.
# stepAIC is namespace-qualified so the call does not rely on MASS being
# attached, and trace uses FALSE rather than the reassignable F.
d2 <- summary(
  MASS::stepAIC(lm(MEDV ~ ., data = data2), direction = "both", trace = FALSE)
)
d2
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + CHAS + NOX + RM + DIS + RAD +
## TAX + PTRATIO + LSTAT, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.2666 -2.8281 -0.6053 2.4246 22.9893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.463355 7.535659 3.114 0.002069 **
## CRIM 0.769255 0.266863 2.883 0.004296 **
## ZN 0.035943 0.016867 2.131 0.034090 *
## CHAS 1.577904 1.060425 1.488 0.138044
## NOX -10.330693 6.356655 -1.625 0.105416
## RM 5.415554 0.571937 9.469 < 2e-16 ***
## DIS -1.301917 0.266789 -4.880 1.92e-06 ***
## RAD 0.398558 0.132190 3.015 0.002841 **
## TAX -0.018024 0.005876 -3.067 0.002402 **
## PTRATIO -0.709435 0.195977 -3.620 0.000358 ***
## LSTAT -0.648331 0.101863 -6.365 9.61e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.917 on 244 degrees of freedom
## Multiple R-squared: 0.657, Adjusted R-squared: 0.643
## F-statistic: 46.74 on 10 and 244 DF, p-value: < 2.2e-16
# Stepwise AIC selection (both directions) starting from the full OLS model on
# data3; d3 holds the summary of the selected model and is printed.
# stepAIC is namespace-qualified so the call does not rely on MASS being
# attached, and trace uses FALSE rather than the reassignable F.
d3 <- summary(
  MASS::stepAIC(lm(MEDV ~ ., data = data3), direction = "both", trace = FALSE)
)
d3
##
## Call:
## lm(formula = MEDV ~ ZN + CHAS + NOX + RM + DIS + RAD + TAX +
## PTRATIO + B + LSTAT, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.418 -2.627 -0.605 1.590 26.614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.91288 5.09971 6.258 8.89e-10 ***
## ZN 0.04488 0.01336 3.358 0.000848 ***
## CHAS 2.61140 0.84337 3.096 0.002078 **
## NOX -16.08468 3.53139 -4.555 6.71e-06 ***
## RM 4.15975 0.41489 10.026 < 2e-16 ***
## DIS -1.46480 0.18305 -8.002 9.88e-15 ***
## RAD 0.27952 0.06481 4.313 1.97e-05 ***
## TAX -0.01244 0.00355 -3.504 0.000502 ***
## PTRATIO -0.88072 0.12772 -6.896 1.76e-11 ***
## B 0.01061 0.00306 3.468 0.000573 ***
## LSTAT -0.52649 0.04989 -10.553 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.673 on 464 degrees of freedom
## Multiple R-squared: 0.7173, Adjusted R-squared: 0.7112
## F-statistic: 117.7 on 10 and 464 DF, p-value: < 2.2e-16
# Stepwise AIC selection (both directions) starting from the full OLS model on
# data4; d4 holds the summary of the selected model and is printed.
# stepAIC is namespace-qualified so the call does not rely on MASS being
# attached, and trace uses FALSE rather than the reassignable F.
d4 <- summary(
  MASS::stepAIC(lm(MEDV ~ ., data = data4), direction = "both", trace = FALSE)
)
d4
##
## Call:
## lm(formula = MEDV ~ RM + B + LSTAT, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4805 -0.8169 -0.1452 1.0899 2.2740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.251490 4.455823 4.321 0.000189 ***
## RM -0.991539 0.534156 -1.856 0.074359 .
## B -0.002548 0.001777 -1.434 0.163083
## LSTAT -0.180817 0.064778 -2.791 0.009520 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.469 on 27 degrees of freedom
## Multiple R-squared: 0.239, Adjusted R-squared: 0.1545
## F-statistic: 2.827 on 3 and 27 DF, p-value: 0.05737
# R-squared (stepwise models) ----
# R-squared of the stepwise-selected models, in the order d1, d2, d3, d4.
vapply(
  list(d1, d2, d3, d4),
  function(s) s[["r.squared"]],
  numeric(1)
)
## [1] 0.7599718 0.6570164 0.7172895 0.2390412
# Adjusted R-squared (stepwise models) ----
# Adjusted R-squared of the stepwise-selected models, in the order
# d1, d2, d3, d4.
vapply(
  list(d1, d2, d3, d4),
  function(s) s[["adj.r.squared"]],
  numeric(1)
)
## [1] 0.7233573 0.6429597 0.7111966 0.1544902