Data
library(car);library(ggplot2);library(GGally);library(glmnet);library(MASS); library(caret)
## Loading required package: carData
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading required package: Matrix
## Loaded glmnet 3.0-1
## Loading required package: lattice
df = read.csv(file = "C:\\Users\\chae\\Dropbox\\Konkuk\\2-1\\회귀분석\\boston housing\\Boston.csv")
df[which(df[,"TRACT"] %in% c(2042,2084,3585,3823,905,911,923,1805)),"MEDV"]=c(22.1,24.2,33,27,8.2,14.8,14.4,19)
df=subset(df,select=-CMEDV)
attach(df)
summary(df)
## NO TOWN TOWNNO TRACT
## Min. : 1.0 Cambridge : 30 Min. : 0.00 Min. : 1
## 1st Qu.:127.2 Boston Savin Hill: 23 1st Qu.:26.25 1st Qu.:1303
## Median :253.5 Lynn : 22 Median :42.00 Median :3394
## Mean :253.5 Boston Roxbury : 19 Mean :47.53 Mean :2700
## 3rd Qu.:379.8 Newton : 18 3rd Qu.:78.00 3rd Qu.:3740
## Max. :506.0 Somerville : 15 Max. :91.00 Max. :5082
## (Other) :379
## LON LAT MEDV CRIM
## Min. :-71.29 Min. :42.03 Min. : 5.00 Min. : 0.00632
## 1st Qu.:-71.09 1st Qu.:42.18 1st Qu.:17.02 1st Qu.: 0.08204
## Median :-71.05 Median :42.22 Median :21.20 Median : 0.25651
## Mean :-71.06 Mean :42.22 Mean :22.53 Mean : 3.61352
## 3rd Qu.:-71.02 3rd Qu.:42.25 3rd Qu.:25.00 3rd Qu.: 3.67708
## Max. :-70.81 Max. :42.38 Max. :50.00 Max. :88.97620
##
## ZN INDUS CHAS NOX
## Min. : 0.00 Min. : 0.46 Min. :0.00000 Min. :0.3850
## 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000 1st Qu.:0.4490
## Median : 0.00 Median : 9.69 Median :0.00000 Median :0.5380
## Mean : 11.36 Mean :11.14 Mean :0.06917 Mean :0.5547
## 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000 3rd Qu.:0.6240
## Max. :100.00 Max. :27.74 Max. :1.00000 Max. :0.8710
##
## RM AGE DIS RAD
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
##
## TAX PTRATIO B LSTAT
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
##
str(df)
## 'data.frame': 506 obs. of 20 variables:
## $ NO : int 1 2 3 4 5 6 7 8 9 10 ...
## $ TOWN : Factor w/ 92 levels "Arlington","Ashland",..: 54 77 77 46 46 46 69 69 69 69 ...
## $ TOWNNO : int 0 1 1 2 2 2 3 3 3 3 ...
## $ TRACT : int 2011 2021 2022 2031 2032 2033 2041 2042 2043 2044 ...
## $ LON : num -71 -71 -70.9 -70.9 -70.9 ...
## $ LAT : num 42.3 42.3 42.3 42.3 42.3 ...
## $ MEDV : num 24 21.6 34.7 33.4 36.2 28.7 22.9 22.1 16.5 18.9 ...
## $ CRIM : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ ZN : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ INDUS : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ CHAS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NOX : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ RM : num 6.58 6.42 7.18 7 7.15 ...
## $ AGE : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ DIS : num 4.09 4.97 4.97 6.06 6.06 ...
## $ RAD : int 1 2 2 3 3 3 5 5 5 5 ...
## $ TAX : int 296 242 242 222 222 222 311 311 311 311 ...
## $ PTRATIO: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ B : num 397 397 393 395 397 ...
## $ LSTAT : num 4.98 9.14 4.03 2.94 5.33 ...
Regression Models
model 1
df1 = df[,-c(1:6)]
fit.lm=lm(MEDV~.,data=df1)
summary(fit.lm)
##
## Call:
## lm(formula = MEDV ~ ., data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5651 -2.6908 -0.5352 1.8446 26.1319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.637e+01 5.058e+00 7.191 2.40e-12 ***
## CRIM -1.062e-01 3.257e-02 -3.261 0.001189 **
## ZN 4.772e-02 1.360e-02 3.508 0.000493 ***
## INDUS 2.325e-02 6.094e-02 0.382 0.702970
## CHAS 2.692e+00 8.539e-01 3.152 0.001718 **
## NOX -1.774e+01 3.785e+00 -4.687 3.59e-06 ***
## RM 3.789e+00 4.142e-01 9.149 < 2e-16 ***
## AGE 5.749e-04 1.309e-02 0.044 0.964989
## DIS -1.502e+00 1.977e-01 -7.598 1.53e-13 ***
## RAD 3.038e-01 6.575e-02 4.620 4.91e-06 ***
## TAX -1.270e-02 3.727e-03 -3.409 0.000706 ***
## PTRATIO -9.239e-01 1.297e-01 -7.126 3.70e-12 ***
## B 9.228e-03 2.662e-03 3.467 0.000573 ***
## LSTAT -5.307e-01 5.026e-02 -10.558 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.703 on 492 degrees of freedom
## Multiple R-squared: 0.7444, Adjusted R-squared: 0.7377
## F-statistic: 110.2 on 13 and 492 DF, p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm)

model 2
hist(MEDV)

hist(log(MEDV))

fit.lm1 = lm(log(MEDV)~.,data = df1)
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm1)

model 3
fit.lm2=lm(MEDV~.+I(LSTAT^2),data=df1)
summary(fit.lm2)
##
## Call:
## lm(formula = MEDV ~ . + I(LSTAT^2), data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2075 -2.5063 -0.3202 1.8247 24.3895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.531669 4.593869 9.476 < 2e-16 ***
## CRIM -0.148980 0.029541 -5.043 6.45e-07 ***
## ZN 0.025851 0.012394 2.086 0.037512 *
## INDUS 0.048567 0.054832 0.886 0.376186
## CHAS 2.425222 0.767911 3.158 0.001685 **
## NOX -16.252217 3.405501 -4.772 2.41e-06 ***
## RM 3.019182 0.378996 7.966 1.15e-14 ***
## AGE 0.028738 0.012050 2.385 0.017463 *
## DIS -1.217712 0.179596 -6.780 3.45e-11 ***
## RAD 0.292628 0.059112 4.950 1.02e-06 ***
## TAX -0.011263 0.003353 -3.359 0.000842 ***
## PTRATIO -0.787897 0.117215 -6.722 4.99e-11 ***
## B 0.007718 0.002397 3.220 0.001366 **
## LSTAT -1.783645 0.123920 -14.393 < 2e-16 ***
## I(LSTAT^2) 0.034999 0.003223 10.859 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.227 on 491 degrees of freedom
## Multiple R-squared: 0.7939, Adjusted R-squared: 0.7881
## F-statistic: 135.1 on 14 and 491 DF, p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm2)

model 4
aaa = as.numeric(names(fit.lm2$fitted.values[fit.lm2$fitted.values>=35]))
df[aaa,]
## NO TOWN TOWNNO TRACT LON LAT MEDV CRIM ZN
## 41 41 Lynnfield 6 2092 -71.0300 42.3240 34.9 0.03359 75.0
## 98 98 Winchester 23 3383 -71.0890 42.2715 38.7 0.12083 0.0
## 99 99 Winchester 23 3384 -71.0982 42.2685 43.8 0.08187 0.0
## 158 158 Cambridge 28 3536 -71.0690 42.2285 41.3 1.22358 0.0
## 161 161 Cambridge 28 3539 -71.0700 42.2214 27.0 1.27346 0.0
## 162 162 Cambridge 28 3540 -71.0735 42.2282 50.0 1.46336 0.0
## 163 163 Cambridge 28 3541 -71.0770 42.2250 50.0 1.83377 0.0
## 164 164 Cambridge 28 3542 -71.0815 42.2250 50.0 1.51902 0.0
## 167 167 Cambridge 28 3545 -71.0770 42.2305 50.0 2.01019 0.0
## 183 183 Belmont 30 3574 -71.1005 42.2287 37.9 0.09103 0.0
## 187 187 Belmont 30 3578 -71.1115 42.2442 50.0 0.05602 0.0
## 193 193 Lexington 31 3587 -71.1340 42.2785 36.4 0.08664 45.0
## 196 196 Lincoln 33 3602 -71.1890 42.2525 50.0 0.01381 80.0
## 197 197 Concord 34 3611 -71.2200 42.2715 33.3 0.04011 80.0
## 203 203 Wayland 36 3662 -71.2140 42.2180 42.3 0.02177 82.5
## 204 204 Weston 37 3671 -71.1990 42.2320 48.5 0.03510 95.0
## 205 205 Weston 37 3672 -71.1990 42.2058 50.0 0.02009 95.0
## 225 225 Newton 40 3735 -71.1100 42.2060 44.8 0.31533 0.0
## 226 226 Newton 40 3736 -71.1012 42.1975 50.0 0.52693 0.0
## 227 227 Newton 40 3737 -71.1215 42.2025 37.6 0.38214 0.0
## 229 229 Newton 40 3739 -71.1100 42.1850 46.7 0.29819 0.0
## 232 232 Newton 40 3742 -71.1285 42.1930 31.7 0.46296 0.0
## 233 233 Newton 40 3743 -71.1400 42.1946 41.7 0.57529 0.0
## 234 234 Newton 40 3744 -71.1340 42.2040 48.3 0.33147 0.0
## 238 238 Newton 40 3748 -71.1491 42.2030 31.5 0.51183 0.0
## 257 257 Sherborn 44 3861 -71.2230 42.1450 44.0 0.01538 90.0
## 258 258 Brookline 45 4001 -71.0679 42.2073 50.0 0.61154 20.0
## 259 259 Brookline 45 4002 -71.0727 42.2077 36.0 0.66351 20.0
## 260 260 Brookline 45 4003 -71.0765 42.2075 30.1 0.65665 20.0
## 262 262 Brookline 45 4005 -71.0790 42.2020 43.1 0.53412 20.0
## 263 263 Brookline 45 4006 -71.0835 42.2000 48.8 0.52014 20.0
## 268 268 Brookline 45 4011 -71.0850 42.1925 50.0 0.57834 20.0
## 269 269 Brookline 45 4012 -71.0925 42.2075 43.5 0.54050 20.0
## 274 274 Dedham 46 4025 -71.1170 42.1510 35.2 0.22188 20.0
## 275 275 Needham 47 4031 -71.1305 42.1675 32.4 0.05644 40.0
## 276 276 Needham 47 4032 -71.1380 42.1733 32.0 0.09604 40.0
## 277 277 Needham 47 4033 -71.1405 42.1632 33.2 0.10469 40.0
## 278 278 Needham 47 4034 -71.1495 42.1730 33.1 0.06127 40.0
## 280 280 Wellesley 48 4041 -71.1480 42.1880 35.1 0.21038 20.0
## 281 281 Wellesley 48 4042 -71.1660 42.1870 45.4 0.03578 20.0
## 282 282 Wellesley 48 4043 -71.1850 42.1848 35.4 0.03705 20.0
## 283 283 Wellesley 48 4044 -71.1775 42.1735 46.0 0.06129 20.0
## 284 284 Dover 49 4051 -71.1730 42.1475 50.0 0.01501 90.0
## 291 291 Westwood 54 4121 -71.1385 42.1235 28.5 0.03502 80.0
## 292 292 Westwood 54 4122 -71.1330 42.1408 37.3 0.07886 80.0
## 307 307 Milton 58 4163 -71.0450 42.1590 33.4 0.07503 33.0
## 365 365 Boston Back Bay 75 101 -71.0590 42.2098 21.9 3.47428 0.0
## 370 370 Boston Back Bay 75 108 -71.0497 42.2125 50.0 5.66998 0.0
## 371 371 Boston Beacon Hill 76 201 -71.0422 42.2144 50.0 6.53876 0.0
## INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
## 41 2.95 0 0.4280 7.024 15.8 5.4011 3 252 18.3 395.62 1.98
## 98 2.89 0 0.4450 8.069 76.0 3.4952 2 276 18.0 396.90 4.21
## 99 2.89 0 0.4450 7.820 36.9 3.4952 2 276 18.0 393.53 3.57
## 158 19.58 0 0.6050 6.943 97.4 1.8773 5 403 14.7 363.43 4.59
## 161 19.58 1 0.6050 6.250 92.6 1.7984 5 403 14.7 338.92 5.50
## 162 19.58 0 0.6050 7.489 90.8 1.9709 5 403 14.7 374.43 1.73
## 163 19.58 1 0.6050 7.802 98.2 2.0407 5 403 14.7 389.61 1.92
## 164 19.58 1 0.6050 8.375 93.9 2.1620 5 403 14.7 388.45 3.32
## 167 19.58 0 0.6050 7.929 96.2 2.0459 5 403 14.7 369.30 3.70
## 183 2.46 0 0.4880 7.155 92.2 2.7006 3 193 17.8 394.12 4.82
## 187 2.46 0 0.4880 7.831 53.6 3.1992 3 193 17.8 392.63 4.45
## 193 3.44 0 0.4370 7.178 26.3 6.4798 5 398 15.2 390.49 2.87
## 196 0.46 0 0.4220 7.875 32.0 5.6484 4 255 14.4 394.23 2.97
## 197 1.52 0 0.4040 7.287 34.1 7.3090 2 329 12.6 396.90 4.08
## 203 2.03 0 0.4150 7.610 15.7 6.2700 2 348 14.7 395.38 3.11
## 204 2.68 0 0.4161 7.853 33.2 5.1180 4 224 14.7 392.78 3.81
## 205 2.68 0 0.4161 8.034 31.9 5.1180 4 224 14.7 390.55 2.88
## 225 6.20 0 0.5040 8.266 78.3 2.8944 8 307 17.4 385.05 4.14
## 226 6.20 0 0.5040 8.725 83.0 2.8944 8 307 17.4 382.00 4.63
## 227 6.20 0 0.5040 8.040 86.5 3.2157 8 307 17.4 387.38 3.13
## 229 6.20 0 0.5040 7.686 17.0 3.3751 8 307 17.4 377.51 3.92
## 232 6.20 0 0.5040 7.412 76.9 3.6715 8 307 17.4 376.14 5.25
## 233 6.20 0 0.5070 8.337 73.3 3.8384 8 307 17.4 385.91 2.47
## 234 6.20 0 0.5070 8.247 70.4 3.6519 8 307 17.4 378.95 3.95
## 238 6.20 0 0.5070 7.358 71.6 4.1480 8 307 17.4 390.07 4.73
## 257 3.75 0 0.3940 7.454 34.2 6.3361 3 244 15.9 386.34 3.11
## 258 3.97 0 0.6470 8.704 86.9 1.8010 5 264 13.0 389.70 5.12
## 259 3.97 0 0.6470 7.333 100.0 1.8946 5 264 13.0 383.29 7.79
## 260 3.97 0 0.6470 6.842 100.0 2.0107 5 264 13.0 391.93 6.90
## 262 3.97 0 0.6470 7.520 89.4 2.1398 5 264 13.0 388.37 7.26
## 263 3.97 0 0.6470 8.398 91.5 2.2885 5 264 13.0 386.86 5.91
## 268 3.97 0 0.5750 8.297 67.0 2.4216 5 264 13.0 384.54 7.44
## 269 3.97 0 0.5750 7.470 52.6 2.8720 5 264 13.0 390.30 3.16
## 274 6.96 1 0.4640 7.691 51.8 4.3665 3 223 18.6 390.77 6.58
## 275 6.41 1 0.4470 6.758 32.9 4.0776 4 254 17.6 396.90 3.53
## 276 6.41 0 0.4470 6.854 42.8 4.2673 4 254 17.6 396.90 2.98
## 277 6.41 1 0.4470 7.267 49.0 4.7872 4 254 17.6 389.25 6.05
## 278 6.41 1 0.4470 6.826 27.6 4.8628 4 254 17.6 393.45 4.16
## 280 3.33 0 0.4429 6.812 32.2 4.1007 5 216 14.9 396.90 4.85
## 281 3.33 0 0.4429 7.820 64.5 4.6947 5 216 14.9 387.31 3.76
## 282 3.33 0 0.4429 6.968 37.2 5.2447 5 216 14.9 392.23 4.59
## 283 3.33 1 0.4429 7.645 49.7 5.2119 5 216 14.9 377.07 3.01
## 284 1.21 1 0.4010 7.923 24.8 5.8850 1 198 13.6 395.52 3.16
## 291 4.95 0 0.4110 6.861 27.9 5.1167 4 245 19.2 396.90 3.33
## 292 4.95 0 0.4110 7.148 27.7 5.1167 4 245 19.2 396.90 3.56
## 307 2.18 0 0.4720 7.420 71.9 3.0992 7 222 18.4 396.90 6.47
## 365 18.10 1 0.7180 8.780 82.9 1.9047 24 666 20.2 354.55 5.29
## 370 18.10 1 0.6310 6.683 96.8 1.3567 24 666 20.2 375.33 3.73
## 371 18.10 1 0.6310 7.016 97.5 1.2024 24 666 20.2 392.05 2.96
par(mfrow=c(2,1))
boxplot(df[-aaa,-c(11,13)])
boxplot(df[aaa,-c(11,13)])

# town plot
library(ggplot2)
ggplot(data=df,aes(x=as.factor(TOWN),y=MEDV))+
geom_bar(stat='summary',fun.y='median',fill='slateblue',color='darkslategrey')+
theme(axis.text.x=element_text(angle=45,hjust=1))+
geom_hline(yintercept=median(df$MEDV),linetype='dashed',color='blue')+
geom_hline(yintercept=median(df$MEDV)*0.5,linetype='dashed',color='black')+
geom_hline(yintercept=median(df$MEDV)*1.5,linetype='dashed',color='red')

model 5
fit.lm3=lm(MEDV~CRIM+ZN+INDUS+CHAS+NOX+RM+AGE+DIS+RAD+TAX+PTRATIO+B+LSTAT+I(LSTAT^2),data=df1)
layout(matrix(c(1,2,3,4),2,2))
plot(fit.lm3)

summary(fit.lm3)
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE +
## DIS + RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2075 -2.5063 -0.3202 1.8247 24.3895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.531669 4.593869 9.476 < 2e-16 ***
## CRIM -0.148980 0.029541 -5.043 6.45e-07 ***
## ZN 0.025851 0.012394 2.086 0.037512 *
## INDUS 0.048567 0.054832 0.886 0.376186
## CHAS 2.425222 0.767911 3.158 0.001685 **
## NOX -16.252217 3.405501 -4.772 2.41e-06 ***
## RM 3.019182 0.378996 7.966 1.15e-14 ***
## AGE 0.028738 0.012050 2.385 0.017463 *
## DIS -1.217712 0.179596 -6.780 3.45e-11 ***
## RAD 0.292628 0.059112 4.950 1.02e-06 ***
## TAX -0.011263 0.003353 -3.359 0.000842 ***
## PTRATIO -0.787897 0.117215 -6.722 4.99e-11 ***
## B 0.007718 0.002397 3.220 0.001366 **
## LSTAT -1.783645 0.123920 -14.393 < 2e-16 ***
## I(LSTAT^2) 0.034999 0.003223 10.859 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.227 on 491 degrees of freedom
## Multiple R-squared: 0.7939, Adjusted R-squared: 0.7881
## F-statistic: 135.1 on 14 and 491 DF, p-value: < 2.2e-16
model 6
# Variable Selection
nullmodel = lm(MEDV ~ 1, data=df1)
fullmodel = lm(MEDV~.+I(LSTAT^2),data=df1)
# forward selection
model.step.f = step(nullmodel, scope = list(lower = nullmodel, upper = fullmodel), direction = "forward",trace=F)
# Backward selection
model.step.b = step(fullmodel, direction = "backward",trace=F)
# stepwise selection
model.step = step(nullmodel, scope = list(lower = nullmodel, upper = fullmodel), direction = "both",trace=F)
model.step.f
##
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS +
## CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
##
## Coefficients:
## (Intercept) LSTAT I(LSTAT^2) RM PTRATIO DIS
## 43.281387 -1.776253 0.034878 2.991565 -0.774971 -1.251890
## CRIM NOX CHAS RAD B TAX
## -0.149793 -15.462506 2.496108 0.278106 0.007653 -0.009970
## AGE ZN
## 0.028652 0.024776
model.step.b
##
## Call:
## lm(formula = MEDV ~ CRIM + ZN + CHAS + NOX + RM + AGE + DIS +
## RAD + TAX + PTRATIO + B + LSTAT + I(LSTAT^2), data = df1)
##
## Coefficients:
## (Intercept) CRIM ZN CHAS NOX RM
## 43.281387 -0.149793 0.024776 2.496108 -15.462506 2.991565
## AGE DIS RAD TAX PTRATIO B
## 0.028652 -1.251890 0.278106 -0.009970 -0.774971 0.007653
## LSTAT I(LSTAT^2)
## -1.776253 0.034878
model.step
##
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS +
## CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
##
## Coefficients:
## (Intercept) LSTAT I(LSTAT^2) RM PTRATIO DIS
## 43.281387 -1.776253 0.034878 2.991565 -0.774971 -1.251890
## CRIM NOX CHAS RAD B TAX
## -0.149793 -15.462506 2.496108 0.278106 0.007653 -0.009970
## AGE ZN
## 0.028652 0.024776
c(AIC(model.step.f),AIC(model.step.b),AIC(model.step))
## [1] 2910.396 2910.396 2910.396
c(BIC(model.step.f),BIC(model.step.b),BIC(model.step))
## [1] 2973.794 2973.794 2973.794
summary(model.step)
##
## Call:
## lm(formula = MEDV ~ LSTAT + I(LSTAT^2) + RM + PTRATIO + DIS +
## CRIM + NOX + CHAS + RAD + B + TAX + AGE + ZN, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.1851 -2.5314 -0.3066 1.9376 24.4388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.281387 4.584167 9.441 < 2e-16 ***
## LSTAT -1.776253 0.123612 -14.370 < 2e-16 ***
## I(LSTAT^2) 0.034878 0.003220 10.833 < 2e-16 ***
## RM 2.991565 0.377629 7.922 1.57e-14 ***
## PTRATIO -0.774971 0.116278 -6.665 7.12e-11 ***
## DIS -1.251890 0.175363 -7.139 3.40e-12 ***
## CRIM -0.149793 0.029520 -5.074 5.52e-07 ***
## NOX -15.462506 3.286001 -4.706 3.29e-06 ***
## CHAS 2.496108 0.763563 3.269 0.00115 **
## RAD 0.278106 0.056781 4.898 1.32e-06 ***
## B 0.007653 0.002395 3.195 0.00149 **
## TAX -0.009970 0.003017 -3.304 0.00102 **
## AGE 0.028652 0.012047 2.378 0.01777 *
## ZN 0.024776 0.012331 2.009 0.04507 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.226 on 492 degrees of freedom
## Multiple R-squared: 0.7936, Adjusted R-squared: 0.7881
## F-statistic: 145.5 on 13 and 492 DF, p-value: < 2.2e-16
model 7 제곱항
fo1=paste(names(df1)[-1],sep="",collapse="+")
fo2=paste(paste("I(",names(df1)[-1],"^2)",sep=""),sep="",collapse = "+")
full.fo = paste("MEDV","~",paste(fo1,"+",fo2,sep=""),sep = "")
formula = as.formula(full.fo)
lmlm=lm(formula,data = df1)
summary(lmlm)
##
## Call:
## lm(formula = formula, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.9503 -2.1642 -0.2243 1.7648 24.9448
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.619e+02 2.076e+01 7.799 3.92e-14 ***
## CRIM -3.709e-01 8.343e-02 -4.446 1.09e-05 ***
## ZN -5.703e-02 3.123e-02 -1.826 0.068434 .
## INDUS -1.089e-01 1.956e-01 -0.557 0.578055
## CHAS 2.618e+00 7.136e-01 3.668 0.000271 ***
## NOX -2.284e+01 2.461e+01 -0.928 0.354033
## RM -1.917e+01 2.751e+00 -6.968 1.07e-11 ***
## AGE -9.307e-03 3.680e-02 -0.253 0.800471
## DIS -2.721e+00 5.602e-01 -4.857 1.62e-06 ***
## RAD 5.721e-01 2.073e-01 2.760 0.006008 **
## TAX -2.706e-02 1.442e-02 -1.876 0.061238 .
## PTRATIO -5.338e+00 1.650e+00 -3.236 0.001298 **
## B 2.347e-02 1.087e-02 2.159 0.031345 *
## LSTAT -1.333e+00 1.261e-01 -10.569 < 2e-16 ***
## I(CRIM^2) 2.955e-03 1.076e-03 2.747 0.006236 **
## I(ZN^2) 7.141e-04 3.369e-04 2.120 0.034545 *
## I(INDUS^2) 5.503e-03 7.474e-03 0.736 0.461948
## I(CHAS^2) NA NA NA NA
## I(NOX^2) -4.543e-01 1.805e+01 -0.025 0.979935
## I(RM^2) 1.740e+00 2.153e-01 8.082 5.21e-15 ***
## I(AGE^2) 6.689e-05 3.082e-04 0.217 0.828279
## I(DIS^2) 1.464e-01 4.630e-02 3.161 0.001671 **
## I(RAD^2) -7.724e-03 8.236e-03 -0.938 0.348756
## I(TAX^2) 1.662e-05 1.904e-05 0.873 0.383298
## I(PTRATIO^2) 1.291e-01 4.662e-02 2.770 0.005819 **
## I(B^2) -3.919e-05 2.387e-05 -1.642 0.101315
## I(LSTAT^2) 2.295e-02 3.415e-03 6.720 5.17e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.813 on 480 degrees of freedom
## Multiple R-squared: 0.8361, Adjusted R-squared: 0.8276
## F-statistic: 97.94 on 25 and 480 DF, p-value: < 2.2e-16
model 8 교호작용
fo1=paste(names(df1)[-1],sep="",collapse="+")
fo2=paste(paste("I(",names(df1)[-1],"^2)",sep=""),sep="",collapse = "+")
fo3=paste("(",paste(names(df1)[-1],collapse = "+"),")^2",sep="")
full.fo = paste("MEDV","~",paste(fo1,fo2,fo3,sep="+"),sep = "")
formula1 = as.formula(full.fo)
lmlm1=lm(formula1,data = df1)
summary(lmlm1)
##
## Call:
## lm(formula = formula1, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.939 -1.363 -0.062 1.252 18.284
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.980e+02 8.296e+01 -2.387 0.017449 *
## CRIM -4.234e+00 6.716e+00 -0.631 0.528722
## ZN 1.929e-01 4.682e-01 0.412 0.680459
## INDUS -4.247e+00 1.849e+00 -2.297 0.022107 *
## CHAS 5.665e+01 1.958e+01 2.894 0.004017 **
## NOX 1.352e+02 1.255e+02 1.077 0.281931
## RM 2.452e+01 7.691e+00 3.188 0.001543 **
## AGE 9.115e-01 2.818e-01 3.235 0.001319 **
## DIS -7.117e+00 4.891e+00 -1.455 0.146468
## RAD 2.159e+00 2.470e+00 0.874 0.382473
## TAX 2.126e-02 1.381e-01 0.154 0.877752
## PTRATIO 7.642e+00 3.891e+00 1.964 0.050223 .
## B 9.474e-02 7.262e-02 1.305 0.192770
## LSTAT 8.412e-01 8.709e-01 0.966 0.334717
## I(CRIM^2) 1.459e-03 1.220e-03 1.195 0.232637
## I(ZN^2) -4.078e-04 5.990e-04 -0.681 0.496379
## I(INDUS^2) 3.820e-02 1.514e-02 2.523 0.012029 *
## I(CHAS^2) NA NA NA NA
## I(NOX^2) -3.100e+01 6.410e+01 -0.484 0.628987
## I(RM^2) 3.361e-01 2.668e-01 1.260 0.208433
## I(AGE^2) 3.050e-05 4.076e-04 0.075 0.940390
## I(DIS^2) 4.187e-01 9.915e-02 4.223 2.99e-05 ***
## I(RAD^2) -1.044e-01 4.139e-02 -2.522 0.012045 *
## I(TAX^2) -8.626e-05 4.905e-05 -1.759 0.079402 .
## I(PTRATIO^2) 2.939e-03 5.829e-02 0.050 0.959806
## I(B^2) -4.059e-05 2.191e-05 -1.852 0.064724 .
## I(LSTAT^2) 1.443e-02 4.824e-03 2.992 0.002946 **
## CRIM:ZN 2.434e-01 1.890e-01 1.288 0.198509
## CRIM:INDUS 3.944e-01 4.562e-01 0.865 0.387795
## CRIM:CHAS 2.463e+00 5.463e-01 4.509 8.57e-06 ***
## CRIM:NOX -6.452e-01 8.900e-01 -0.725 0.468906
## CRIM:RM 1.322e-01 5.918e-02 2.234 0.026018 *
## CRIM:AGE -2.861e-03 3.693e-03 -0.775 0.439032
## CRIM:DIS -9.542e-02 1.045e-01 -0.913 0.361883
## CRIM:RAD 2.075e-01 6.045e-01 0.343 0.731660
## CRIM:TAX -2.329e-02 4.443e-02 -0.524 0.600476
## CRIM:PTRATIO 3.516e-01 3.225e-01 1.090 0.276206
## CRIM:B -3.164e-04 1.802e-04 -1.756 0.079777 .
## CRIM:LSTAT 1.964e-02 6.599e-03 2.976 0.003099 **
## ZN:INDUS -5.690e-03 4.927e-03 -1.155 0.248848
## ZN:CHAS -5.486e-02 6.224e-02 -0.881 0.378643
## ZN:NOX -1.092e+00 6.038e-01 -1.808 0.071347 .
## ZN:RM 8.191e-03 2.641e-02 0.310 0.756660
## ZN:AGE 2.834e-04 8.527e-04 0.332 0.739806
## ZN:DIS -1.146e-02 9.416e-03 -1.217 0.224440
## ZN:RAD -5.849e-03 6.789e-03 -0.862 0.389463
## ZN:TAX 6.363e-04 1.991e-04 3.196 0.001502 **
## ZN:PTRATIO -6.886e-03 7.312e-03 -0.942 0.346844
## ZN:B 8.235e-04 7.554e-04 1.090 0.276269
## ZN:LSTAT -4.973e-03 4.614e-03 -1.078 0.281743
## INDUS:CHAS 7.106e-02 3.844e-01 0.185 0.853439
## INDUS:NOX 8.912e-01 2.019e+00 0.441 0.659104
## INDUS:RM 2.897e-01 1.323e-01 2.189 0.029149 *
## INDUS:AGE 3.162e-03 3.651e-03 0.866 0.387022
## INDUS:DIS 1.204e-01 7.986e-02 1.508 0.132342
## INDUS:RAD -3.185e-02 6.804e-02 -0.468 0.640020
## INDUS:TAX 6.489e-04 9.492e-04 0.684 0.494565
## INDUS:PTRATIO -1.941e-02 4.005e-02 -0.485 0.628254
## INDUS:B 2.464e-03 1.989e-03 1.239 0.216001
## INDUS:LSTAT -1.094e-02 1.472e-02 -0.743 0.457961
## CHAS:NOX -3.381e+01 1.182e+01 -2.861 0.004446 **
## CHAS:RM -5.328e+00 1.190e+00 -4.478 9.82e-06 ***
## CHAS:AGE 1.754e-02 5.639e-02 0.311 0.755911
## CHAS:DIS 1.156e+00 1.279e+00 0.904 0.366407
## CHAS:RAD -8.057e-03 5.724e-01 -0.014 0.988777
## CHAS:TAX 3.001e-04 3.666e-02 0.008 0.993472
## CHAS:PTRATIO -8.807e-01 6.649e-01 -1.324 0.186094
## CHAS:B 1.949e-02 1.485e-02 1.313 0.190097
## CHAS:LSTAT -2.312e-01 1.777e-01 -1.301 0.194071
## NOX:RM 3.753e-01 5.545e+00 0.068 0.946065
## NOX:AGE -4.591e-01 2.695e-01 -1.704 0.089225 .
## NOX:DIS 1.338e+01 5.471e+00 2.446 0.014874 *
## NOX:RAD -2.099e+00 1.857e+00 -1.130 0.259158
## NOX:TAX 1.863e-01 1.364e-01 1.366 0.172733
## NOX:PTRATIO -1.050e+01 3.929e+00 -2.672 0.007844 **
## NOX:B -1.991e-02 3.468e-02 -0.574 0.566200
## NOX:LSTAT 9.573e-01 6.524e-01 1.467 0.143071
## RM:AGE -5.930e-02 2.116e-02 -2.803 0.005307 **
## RM:DIS 1.337e-01 3.177e-01 0.421 0.674013
## RM:RAD -1.458e-02 1.481e-01 -0.098 0.921645
## RM:TAX -1.885e-02 9.708e-03 -1.941 0.052901 .
## RM:PTRATIO -6.804e-01 2.354e-01 -2.890 0.004057 **
## RM:B -3.489e-03 3.232e-03 -1.079 0.281035
## RM:LSTAT -1.761e-01 5.655e-02 -3.113 0.001983 **
## AGE:DIS -7.814e-03 9.134e-03 -0.855 0.392798
## AGE:RAD 1.531e-02 4.069e-03 3.763 0.000193 ***
## AGE:TAX -5.478e-04 2.135e-04 -2.565 0.010671 *
## AGE:PTRATIO 2.431e-03 7.086e-03 0.343 0.731790
## AGE:B -6.595e-04 2.060e-04 -3.201 0.001477 **
## AGE:LSTAT -7.581e-03 1.916e-03 -3.957 8.97e-05 ***
## DIS:RAD -8.641e-02 6.936e-02 -1.246 0.213515
## DIS:TAX -5.018e-03 2.614e-03 -1.919 0.055629 .
## DIS:PTRATIO -1.499e-01 9.761e-02 -1.535 0.125504
## DIS:B -4.126e-03 5.491e-03 -0.752 0.452768
## DIS:LSTAT 8.867e-02 4.791e-02 1.851 0.064964 .
## RAD:TAX 7.165e-03 2.787e-03 2.571 0.010488 *
## RAD:PTRATIO -1.087e-01 9.045e-02 -1.202 0.230057
## RAD:B 2.087e-04 2.419e-03 0.086 0.931307
## RAD:LSTAT -2.684e-02 1.769e-02 -1.518 0.129871
## TAX:PTRATIO 5.606e-03 2.528e-03 2.217 0.027174 *
## TAX:B -9.436e-05 1.943e-04 -0.486 0.627496
## TAX:LSTAT -6.082e-04 1.183e-03 -0.514 0.607590
## PTRATIO:B 2.675e-03 3.271e-03 0.818 0.413914
## PTRATIO:LSTAT -5.257e-03 2.958e-02 -0.178 0.859032
## B:LSTAT -4.646e-04 4.039e-04 -1.150 0.250732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.675 on 402 degrees of freedom
## Multiple R-squared: 0.9324, Adjusted R-squared: 0.9151
## F-statistic: 53.85 on 103 and 402 DF, p-value: < 2.2e-16
layout(matrix(c(1,2,3,4),2,2))
plot(lmlm1)
