Loading DataSet

crim: per capita crime rate by town. zn: proportion of residential land zoned for lots over 25,000 sq.ft. indus: proportion of non-retail business acres per town. chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). nox: nitrogen oxides concentration (parts per 10 million). rm: average number of rooms per dwelling. age: proportion of owner-occupied units built prior to 1940. dis: weighted mean of distances to five Boston employment centres. rad: index of accessibility to radial highways. tax: full-value property-tax rate per $10,000. ptratio: pupil-teacher ratio by town. black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town. lstat: lower status of the population (percent). medv: median value of owner-occupied homes in $1000s.

# Put Boston's columns on the search path.
# NOTE(review): the code visible in this report always uses `Boston$col` or a
# `data =` argument, so this attach() may be unnecessary — confirm before removing.
attach(Boston)

# Preview the first rows of the data.
head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Show the dimensions and each column's type and leading values.
str(Boston)

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

Missing Data

# Count missing values per column.
colSums(is.na(Boston))

##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##       0       0       0       0       0       0       0       0       0       0 
## ptratio   black   lstat    medv 
##       0       0       0       0

# List every row that contains at least one missing value.
Boston[!complete.cases(Boston),]

##  [1] crim    zn      indus   chas    nox     rm      age     dis     rad    
## [10] tax     ptratio black   lstat   medv   
## <0 rows> (or 0-length row.names)

# Visual overview of missing values across the columns.
vis_miss(Boston, cluster = TRUE)

Correlation - Pearson

# Pearson correlation matrix of all columns, drawn as an upper-triangle colour
# map with the coefficient printed in each cell.
corr <- cor(Boston, method = "pearson")
corrplot::corrplot(
  corr,
  method = "color", type = "upper",
  tl.col = "black", tl.srt = 45,
  addCoef.col = "gray8",
  diag = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
)

EDA - Crime in Boston

# Five-number summary and mean of the per-capita crime rate.
summary(Boston$crim)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00632  0.08204  0.25651  3.61352  3.67708 88.97620

Crime and lower status of the population

# Scatter chart: crime rate (y) against lower-status share of the population (x).
highchart() %>%
  hc_add_series(
    Boston, "scatter",
    hcaes(x = lstat, y = crim)
  ) %>%
  hc_add_theme(hc_theme_flatdark()) %>%
  hc_title(text = "Crime and lower status of the population") %>%
  hc_xAxis(title = list(text = "Lower status of population")) %>%
  hc_yAxis(title = list(text = "Crime rate"))

# Scatter chart: crime rate (y) against the property-tax rate (x).
highchart() %>%
  hc_add_series(
    Boston, "scatter",
    hcaes(x = tax, y = crim)
  ) %>%
  hc_add_theme(hc_theme_flatdark()) %>%
  hc_title(text = "Crime and tax") %>%
  hc_xAxis(title = list(text = "property-tax rate per $10,000")) %>%
  hc_yAxis(title = list(text = "Crime rate"))

# Interactive chart of the distribution of crim.
hchart(Boston$crim)

# Density-scaled histogram of crim, overlaid with the normal density that has
# the same mean and standard deviation as crim.
ggplot(Boston, aes(crim)) +
  geom_histogram(
    aes(y = ..density..),
    col = "Red",
    fill = "White"
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(Boston$crim), sd = sd(Boston$crim))
  )

# 90th percentile of the crime rate.
quantile(Boston$crim, .90)

##    90% 
## 10.753

# Keep only the tracts whose crime rate is above the 90th percentile, then
# summarise every column for that subset.
high_crim <- subset(Boston, crim > quantile(crim, 0.90))

summary(high_crim)

##       crim             zn        indus           chas        nox        
##  Min.   :10.83   Min.   :0   Min.   :18.1   Min.   :0   Min.   :0.5800  
##  1st Qu.:13.60   1st Qu.:0   1st Qu.:18.1   1st Qu.:0   1st Qu.:0.6590  
##  Median :15.86   Median :0   Median :18.1   Median :0   Median :0.6790  
##  Mean   :22.65   Mean   :0   Mean   :18.1   Mean   :0   Mean   :0.6723  
##  3rd Qu.:24.02   3rd Qu.:0   3rd Qu.:18.1   3rd Qu.:0   3rd Qu.:0.7000  
##  Max.   :88.98   Max.   :0   Max.   :18.1   Max.   :0   Max.   :0.7400  
##        rm             age              dis             rad          tax     
##  Min.   :3.863   Min.   : 56.70   Min.   :1.137   Min.   :24   Min.   :666  
##  1st Qu.:5.290   1st Qu.: 91.55   1st Qu.:1.452   1st Qu.:24   1st Qu.:666  
##  Median :5.854   Median : 97.30   Median :1.607   Median :24   Median :666  
##  Mean   :5.750   Mean   : 93.74   Mean   :1.693   Mean   :24   Mean   :666  
##  3rd Qu.:6.362   3rd Qu.:100.00   3rd Qu.:1.865   3rd Qu.:24   3rd Qu.:666  
##  Max.   :7.313   Max.   :100.00   Max.   :2.908   Max.   :24   Max.   :666  
##     ptratio         black            lstat            medv      
##  Min.   :20.2   Min.   :  2.60   Min.   :10.11   Min.   : 5.00  
##  1st Qu.:20.2   1st Qu.: 78.61   1st Qu.:17.63   1st Qu.: 8.45  
##  Median :20.2   Median :332.09   Median :22.98   Median :10.50  
##  Mean   :20.2   Mean   :251.20   Mean   :22.92   Mean   :12.27  
##  3rd Qu.:20.2   3rd Qu.:396.90   3rd Qu.:27.07   3rd Qu.:14.45  
##  Max.   :20.2   Max.   :396.90   Max.   :37.97   Max.   :27.90

범죄율이 높은 지역을 분류하여 저소득층 주민 비율(lstat) 알아보기

# Box plots of lstat: all tracts vs. the high-crime subset.
# `data =` must be named in add_boxplot(): after the piped plot object, the
# first positional argument is `x`, so an unnamed data frame would be taken as
# the x values instead of the trace's data source.
plot_ly(data = Boston, y = ~lstat, name = "Boston", type = "box") %>%
  add_boxplot(data = high_crim, y = ~lstat,
              name = "Area with 90th percentile", type = "box")

범죄율이 높은 지역을 분류 - medv 주택 가격 알아보기

# Box plots of medv: all tracts vs. the high-crime subset.
# The second trace is drawn from `high_crim`, so it is labelled by crim rather
# than lstat.
plot_ly(data = Boston, y = ~medv, name = "Boston", type = "box") %>%
  add_boxplot(data = high_crim, y = ~medv,
              name = "Area with 90th percentile crim", type = "box")

EDA - lower status of the population in Boston

# Five-number summary and mean of lstat.
summary(Boston$lstat)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.73    6.95   11.36   12.65   16.95   37.97

저소득층 주민 비율: 중앙값 약 11.4%, 평균 약 12.7%

저소득층 비율에 대한 평균 집값

# Simple linear regression of median home value on lstat, using every row.
lm_md_one <- lm(formula = medv ~ lstat, data = Boston)

summary(lm_md_one)

## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16

# Scatter chart of medv (y) against lstat (x); `regression = TRUE` is passed
# through to the series options.
# Fixes the "Meidan" misspelling in the chart title and y-axis label.
highchart() %>%
  hc_add_series(Boston, "scatter", hcaes(x = lstat, y = medv),
                regression = TRUE) %>%
  hc_title(text = "Median value of home and lower status of the population") %>%
  hc_xAxis(title = list(text = "lower status of the population")) %>%
  hc_yAxis(title = list(text = "Median value of home")) %>%
  hc_add_theme(hc_theme_flat())

저소득층 비율과 방의 개수 (rm)

# Scatter chart of rooms per dwelling (y) against lstat (x).
# - The title now describes what is plotted (it was copied from the medv chart).
# - The tooltip is a single string: the old paste() literal contained a stray
#   comma and a line break that ended up inside the tooltip text.
# - "scatter" is the series type used by the other charts in this report.
highchart() %>%
  hc_add_series(Boston, "scatter", hcaes(x = lstat, y = rm),
                name = "Lower Status VS The number of rooms") %>%
  hc_title(text = "Lower status of the population and number of rooms") %>%
  hc_tooltip(
    crosshairs = TRUE,
    pointFormat = "Lower Status: <b>{point.x}</b><br>Number of rooms: <b>{point.y}</b>"
  ) %>%
  hc_xAxis(title = list(text = "lower status of the population")) %>%
  hc_yAxis(title = list(text = "Number of rooms")) %>%
  hc_add_theme(hc_theme_flat())

# Scatter chart of age (y) against lstat (x).
# - The title now describes what is plotted (it was copied from another chart).
# - The tooltip is a single string: the old paste() literal contained a stray
#   comma and a line break that ended up inside the tooltip text.
# - x/y are named in hcaes() and "scatter" matches the other charts here.
highchart() %>%
  hc_add_series(Boston, "scatter", hcaes(x = lstat, y = age),
                name = "Lower Status VS Age") %>%
  hc_title(text = "Lower status of the population and age") %>%
  hc_tooltip(
    crosshairs = TRUE,
    pointFormat = "Lower Status: <b>{point.x}</b><br>Age: <b>{point.y}</b>"
  ) %>%
  hc_xAxis(title = list(text = "lower status of the population")) %>%
  hc_yAxis(title = list(text = "Age")) %>%
  hc_add_theme(hc_theme_flat())

lstat - 확인

# Interactive chart of the lstat distribution (label fixed: the column is
# "lstat" with a lower-case L, not "Istat").
hchart(Boston$lstat, name = "Histogram of lstat")

# Tracts whose lstat is above the 90th percentile.
hlstat <- subset(Boston, lstat > quantile(Boston$lstat, .90))

Boston 지역과 저소득비율이 많은 지역 비교하기

방의 갯수

# Box plots of rm: all tracts vs. the high-lstat subset.
plot_ly(data = Boston, y = ~rm, name = "Boston", type = "box") %>%
  add_boxplot(
    data = hlstat, y = ~rm,
    name = "Area with 90th percentile lstat", type = "box"
  )

범죄율

# Box plots of crim: all tracts vs. the high-lstat subset.
plot_ly(data = Boston, y = ~crim, name = "Boston", type = "box") %>%
  add_boxplot(
    data = hlstat, y = ~crim,
    name = "Area with 90th percentile lstat", type = "box"
  )

연령계층

# Box plots of age: all tracts vs. the high-lstat subset.
plot_ly(data = Boston, y = ~age, name = "Boston", type = "box") %>%
  add_boxplot(
    data = hlstat, y = ~age,
    name = "Area with 90th percentile lstat", type = "box"
  )

집값

# Box plots of medv: all tracts vs. the high-lstat subset.
plot_ly(data = Boston, y = ~medv, name = "Boston", type = "box") %>%
  add_boxplot(
    data = hlstat, y = ~medv,
    name = "Area with 90th percentile lstat", type = "box"
  )

각 변수에 따른 medv 값의 변화

#Boston %>%
#  select(crim, rm, age, rad, tax, lstat, medv) %>%
#  melt(., id.vars = "medv") %>%
#  ggplot(aes(x = value, y = medv, colour = variable)) +
#  geom_point(alpha = 0.7) +
#  stat_smooth(aes(colour = "black")) +
#  facet_wrap(~variable, scales = "free", ncol = 2) +
#  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
#  theme_minimal()

Simple Linear Regression

# Training-set size: 75% of the row count, rounded down to a whole number.
sample_size <- floor(nrow(Boston) * 0.75)

# Fix the random seed so the split is reproducible.
set.seed(12)

# Draw the training row indices, then split the data into train and test sets.
train_ind <- sample(x = seq_len(nrow(Boston)), size = sample_size)
test <- Boston[-train_ind, ]
train <- Boston[train_ind, ]

# Simple linear regression of medv on lstat, fitted on the training rows.
lm_one <- lm(medv ~ lstat, data = train)
summary(lm_one)

## 
## Call:
## lm(formula = medv ~ lstat, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.800  -3.753  -1.161   1.690  24.242 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.12056    0.63739   53.53   <2e-16 ***
## lstat       -0.94171    0.04372  -21.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.007 on 377 degrees of freedom
## Multiple R-squared:  0.5517, Adjusted R-squared:  0.5505 
## F-statistic:   464 on 1 and 377 DF,  p-value: < 2.2e-16

# Evaluate the model: draw the diagnostic plots for the fitted lm object.
plot(lm_one)

# Predicted medv from lm_one on a grid of lstat values 1..35.
dat <- data.frame(
  lstat = 1:35,
  medv = predict(lm_one, data.frame(lstat = 1:35))
)

dat

##    lstat      medv
## 1      1 33.178848
## 2      2 32.237140
## 3      3 31.295433
## 4      4 30.353725
## 5      5 29.412017
## 6      6 28.470310
## 7      7 27.528602
## 8      8 26.586894
## 9      9 25.645187
## 10    10 24.703479
## 11    11 23.761771
## 12    12 22.820064
## 13    13 21.878356
## 14    14 20.936648
## 15    15 19.994941
## 16    16 19.053233
## 17    17 18.111526
## 18    18 17.169818
## 19    19 16.228110
## 20    20 15.286403
## 21    21 14.344695
## 22    22 13.402987
## 23    23 12.461280
## 24    24 11.519572
## 25    25 10.577864
## 26    26  9.636157
## 27    27  8.694449
## 28    28  7.752741
## 29    29  6.811034
## 30    30  5.869326
## 31    31  4.927619
## 32    32  3.985911
## 33    33  3.044203
## 34    34  2.102496
## 35    35  1.160788

# Overlay the fitted line (from `dat`) on the held-out test observations.
plot_ly() %>%
  add_trace(
    data = dat, x = ~lstat, y = ~medv,
    type = "scatter", mode = "lines", name = "Predicted Value"
  ) %>%
  add_trace(
    data = test, x = ~lstat, y = ~medv,
    type = "scatter", name = "Actual Value"
  )

# Quadratic regression of medv on lstat, fitted on the training rows.
lm.fit_2 <- lm(medv ~ lstat + I(lstat^2), data = train)

# Predicted medv from the quadratic model on a grid of lstat values 1..40.
dat <- data.frame(
  lstat = 1:40,
  medv = predict(lm.fit_2, data.frame(lstat = 1:40))
)

# Overlay the fitted curve on the held-out test observations.
plot_ly() %>%
  add_trace(
    data = dat, x = ~lstat, y = ~medv,
    type = "scatter", mode = "lines", name = "Predicted Value"
  ) %>%
  add_trace(
    data = test, x = ~lstat, y = ~medv,
    type = "scatter", name = "Actual Value"
  )

Multi-regression

# Multiple regression of medv on all thirteen predictors plus a quadratic
# lstat term, fitted on the training rows. Uses `<-` for assignment, like the
# other model fits in this report.
lm.fit <- lm(
  medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + tax +
    ptratio + black + lstat + I(lstat^2),
  data = train
)

summary(lm.fit)

## 
## Call:
## lm(formula = medv ~ crim + zn + indus + chas + nox + rm + age + 
##     dis + rad + tax + ptratio + black + lstat + I(lstat^2), data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5528  -2.3805  -0.3195   1.7342  24.7913 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  43.799421   5.151894   8.502 4.89e-16 ***
## crim         -0.135197   0.030422  -4.444 1.17e-05 ***
## zn            0.024778   0.014222   1.742 0.082318 .  
## indus         0.043005   0.060892   0.706 0.480485    
## chas          2.213046   0.889561   2.488 0.013301 *  
## nox         -19.393777   4.040227  -4.800 2.32e-06 ***
## rm            3.329732   0.448026   7.432 7.71e-13 ***
## age           0.021802   0.013471   1.618 0.106438    
## dis          -1.187884   0.214759  -5.531 6.08e-08 ***
## rad           0.334585   0.069606   4.807 2.25e-06 ***
## tax          -0.014200   0.003915  -3.627 0.000328 ***
## ptratio      -0.863652   0.130906  -6.598 1.48e-10 ***
## black         0.006460   0.002552   2.531 0.011787 *  
## lstat        -1.504056   0.143232 -10.501  < 2e-16 ***
## I(lstat^2)    0.029150   0.003678   7.925 2.82e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.156 on 364 degrees of freedom
## Multiple R-squared:  0.7928, Adjusted R-squared:  0.7848 
## F-statistic: 99.47 on 14 and 364 DF,  p-value: < 2.2e-16

# Number of estimated coefficients, intercept included.
length(coef(lm.fit)) #15

## [1] 15

# Flag terms whose vif() value exceeds 10 (presumably variance inflation
# factors, used here as a multicollinearity check — confirm which package's vif).
vif(lm.fit) > 10

##       crim         zn      indus       chas        nox         rm        age 
##      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE 
##        dis        rad        tax    ptratio      black      lstat I(lstat^2) 
##      FALSE      FALSE      FALSE      FALSE      FALSE       TRUE       TRUE

# Predicted medv for each held-out test row.
predict(lm.fit, test)

##         4         5         6         7         9        18        19        27 
## 32.185569 29.856010 27.697662 21.631535 15.684892 15.827432 15.172403 14.782951 
##        28        33        37        52        57        59        70        76 
## 13.902636 11.438046 21.341769 24.014589 25.240643 22.855537 20.728622 23.656519 
##        80        95        97        98       101       102       117       122 
## 22.029421 26.024981 23.592516 37.602497 24.042342 25.711869 21.663219 21.256561 
##       128       130       135       142       143       144       145       146 
## 13.644318 12.610141 12.254930 11.749025 14.562059 12.334180 11.005436 13.420627 
##       148       149       150       162       164       175       178       179 
## 10.972334 11.275161 12.950493 40.443716 43.373142 25.751562 30.143505 31.723202 
##       182       185       186       191       193       195       198       205 
## 27.003583 21.584517 23.177962 30.912444 34.692200 32.147934 30.400816 43.457847 
##       209       214       220       223       225       227       230       232 
## 21.429856 24.286794 28.721966 30.825127 40.091287 40.446955 33.009066 34.850877 
##       235       238       240       242       243       248       249       251 
## 31.291339 34.645522 28.393525 22.419195 23.019604 20.426911 21.313377 25.176781 
##       254       255       256       258       262       267       270       272 
## 32.056319 24.024553 20.192338 42.849857 36.114979 27.786321 23.397811 27.136720 
##       273       280       290       292       293       302       308       310 
## 28.416140 36.025435 25.379950 35.195875 31.942963 27.465878 32.297320 23.091730 
##       317       319       322       323       328       332       334       335 
## 16.266924 23.369780 25.828604 23.483788 18.062541 17.933886 24.178563 22.945480 
##       337       339       343       350       353       354       358       363 
## 20.037938 22.218935 21.765125 23.167939 16.934948 27.588134 20.468116 17.887118 
##       366       367       370       371       372       380       382       383 
## 16.081499 14.414937 34.885170 37.191038 24.387184 15.143598 16.506088 12.926895 
##       387       392       398       403       407       409       417       422 
##  8.100571 15.272436 14.725001 16.430908  7.849194 14.877027 14.210219 16.318389 
##       428       441       446       447       463       468       472       477 
## 12.026845 11.066235 12.096042 15.780114 17.799987 16.135509 21.902882 18.665946 
##       480       487       489       493       494       495       498 
## 19.968841 18.061481  9.958772 13.964971 18.776169 18.045120 17.255152

Stepwise - linear

# Regression with every main effect and every pairwise interaction (`.^2`),
# fitted on the training rows.
lm_full2 <- lm(medv ~ .^2, data = train)

summary(lm_full2)

## 
## Call:
## lm(formula = medv ~ .^2, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.114 -1.414 -0.122  1.233  9.722 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -7.509e+01  8.526e+01  -0.881 0.379222    
## crim          -1.056e+01  8.818e+00  -1.197 0.232227    
## zn             1.067e-01  5.354e-01   0.199 0.842099    
## indus         -3.141e+00  2.111e+00  -1.488 0.137820    
## chas           4.071e+01  2.049e+01   1.987 0.047889 *  
## nox           -5.647e+01  8.740e+01  -0.646 0.518710    
## rm             2.563e+01  6.583e+00   3.893 0.000123 ***
## age            1.233e+00  3.004e-01   4.105 5.27e-05 ***
## dis           -5.906e+00  5.377e+00  -1.098 0.272961    
## rad            4.164e+00  2.881e+00   1.445 0.149487    
## tax           -2.591e-02  1.720e-01  -0.151 0.880343    
## ptratio       -7.892e-01  3.685e+00  -0.214 0.830574    
## black         -1.105e-03  1.188e-01  -0.009 0.992589    
## lstat          1.527e+00  9.375e-01   1.629 0.104358    
## crim:zn        2.232e-01  2.085e-01   1.070 0.285299    
## crim:indus     9.844e-02  4.696e-01   0.210 0.834125    
## crim:chas      2.850e+00  6.100e-01   4.673 4.57e-06 ***
## crim:nox      -2.220e+00  1.060e+00  -2.094 0.037124 *  
## crim:rm        2.287e-01  5.427e-02   4.214 3.36e-05 ***
## crim:age      -2.360e-03  4.550e-03  -0.519 0.604347    
## crim:dis      -2.147e-01  1.147e-01  -1.872 0.062274 .  
## crim:rad      -3.392e-01  6.198e-01  -0.547 0.584635    
## crim:tax       1.314e-02  4.504e-02   0.292 0.770720    
## crim:ptratio   4.018e-01  4.068e-01   0.988 0.324097    
## crim:black    -4.195e-04  1.984e-04  -2.114 0.035358 *  
## crim:lstat     3.303e-02  7.214e-03   4.578 7.00e-06 ***
## zn:indus      -4.763e-03  5.341e-03  -0.892 0.373221    
## zn:chas       -1.443e-02  8.006e-02  -0.180 0.857081    
## zn:nox         2.545e-01  5.515e-01   0.462 0.644760    
## zn:rm         -2.594e-02  2.963e-02  -0.876 0.381981    
## zn:age        -2.045e-05  9.503e-04  -0.022 0.982846    
## zn:dis         4.753e-03  9.954e-03   0.477 0.633387    
## zn:rad        -6.615e-03  8.578e-03  -0.771 0.441275    
## zn:tax         3.354e-04  1.931e-04   1.736 0.083554 .  
## zn:ptratio    -1.201e-02  7.994e-03  -1.503 0.133992    
## zn:black       3.848e-04  9.225e-04   0.417 0.676873    
## zn:lstat      -1.101e-02  5.077e-03  -2.168 0.030947 *  
## indus:chas    -4.249e-02  4.745e-01  -0.090 0.928706    
## indus:nox      3.256e+00  1.840e+00   1.770 0.077769 .  
## indus:rm       2.450e-01  1.537e-01   1.594 0.112047    
## indus:age     -3.955e-04  3.944e-03  -0.100 0.920193    
## indus:dis      2.742e-03  7.478e-02   0.037 0.970779    
## indus:rad     -3.288e-02  5.959e-02  -0.552 0.581556    
## indus:tax     -4.755e-04  7.869e-04  -0.604 0.546108    
## indus:ptratio -5.893e-02  4.490e-02  -1.312 0.190415    
## indus:black    3.490e-03  3.428e-03   1.018 0.309513    
## indus:lstat    3.766e-03  1.677e-02   0.225 0.822434    
## chas:nox      -3.203e+01  1.355e+01  -2.364 0.018764 *  
## chas:rm       -4.658e+00  1.409e+00  -3.307 0.001065 ** 
## chas:age       4.330e-02  7.193e-02   0.602 0.547700    
## chas:dis      -4.644e-01  1.505e+00  -0.309 0.757906    
## chas:rad      -4.855e-01  7.170e-01  -0.677 0.498838    
## chas:tax       7.091e-03  4.714e-02   0.150 0.880540    
## chas:ptratio  -2.496e-01  7.572e-01  -0.330 0.741956    
## chas:black     3.323e-02  1.911e-02   1.739 0.083144 .  
## chas:lstat    -3.337e-01  2.399e-01  -1.391 0.165286    
## nox:rm         8.032e+00  6.748e+00   1.190 0.234974    
## nox:age       -7.107e-01  2.584e-01  -2.750 0.006340 ** 
## nox:dis        3.506e+00  4.320e+00   0.812 0.417681    
## nox:rad       -1.294e+00  2.409e+00  -0.537 0.591720    
## nox:tax        4.445e-02  1.712e-01   0.260 0.795345    
## nox:ptratio   -1.201e+00  3.582e+00  -0.335 0.737677    
## nox:black     -1.207e-02  3.936e-02  -0.307 0.759330    
## nox:lstat      1.389e+00  9.221e-01   1.506 0.133062    
## rm:age        -5.978e-02  2.395e-02  -2.496 0.013127 *  
## rm:dis         5.077e-01  4.101e-01   1.238 0.216726    
## rm:rad        -1.597e-01  1.822e-01  -0.877 0.381290    
## rm:tax        -2.136e-02  1.175e-02  -1.817 0.070275 .  
## rm:ptratio    -4.789e-01  2.434e-01  -1.967 0.050093 .  
## rm:black      -8.012e-03  3.602e-03  -2.224 0.026914 *  
## rm:lstat      -2.629e-01  5.023e-02  -5.234 3.21e-07 ***
## age:dis       -1.707e-02  1.050e-02  -1.626 0.104977    
## age:rad        1.353e-02  4.844e-03   2.792 0.005584 ** 
## age:tax       -2.792e-04  2.374e-04  -1.176 0.240558    
## age:ptratio   -8.427e-03  7.290e-03  -1.156 0.248688    
## age:black     -6.755e-04  2.216e-04  -3.048 0.002516 ** 
## age:lstat     -6.190e-03  2.086e-03  -2.967 0.003262 ** 
## dis:rad       -7.510e-02  8.408e-02  -0.893 0.372534    
## dis:tax       -2.093e-03  3.549e-03  -0.590 0.555741    
## dis:ptratio    1.161e-01  1.151e-01   1.008 0.314233    
## dis:black     -4.022e-03  5.810e-03  -0.692 0.489373    
## dis:lstat      1.399e-01  5.795e-02   2.414 0.016420 *  
## rad:tax        3.173e-04  1.643e-03   0.193 0.847042    
## rad:ptratio   -6.484e-02  9.991e-02  -0.649 0.516826    
## rad:black     -1.630e-03  2.931e-03  -0.556 0.578435    
## rad:lstat     -4.612e-02  2.210e-02  -2.086 0.037827 *  
## tax:ptratio    8.142e-03  2.960e-03   2.750 0.006330 ** 
## tax:black      1.142e-05  2.381e-04   0.048 0.961775    
## tax:lstat     -5.029e-04  1.392e-03  -0.361 0.718077    
## ptratio:black  5.935e-03  5.672e-03   1.046 0.296282    
## ptratio:lstat -8.570e-03  3.719e-02  -0.230 0.817910    
## black:lstat   -8.763e-04  4.521e-04  -1.938 0.053561 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.659 on 287 degrees of freedom
## Multiple R-squared:  0.9331, Adjusted R-squared:  0.9119 
## F-statistic: 44.02 on 91 and 287 DF,  p-value: < 2.2e-16

이차 상호 작용 모형은 모수의 개수가 92개

# Number of estimated coefficients in the pairwise-interaction model.
length(coef(lm_full2))

## [1] 92

변수 선택 모형으로 중요변수 자동 선별

# Scope models for stepwise selection. They are fitted on the training rows
# only, so that `test` stays unseen when the selected model is later evaluated
# on it (fitting on all of Boston would leak the test rows into the model).
full_m <- lm(medv ~ ., data = train) # upper scope: all predictors
null_m <- lm(medv ~ 1, data = train) # lower scope: intercept only

# Stepwise search in both directions, starting from the intercept-only model;
# trace = 1 prints each step.
step_one <- step(
  null_m,
  direction = "both", trace = 1,
  scope = list(lower = null_m, upper = full_m)
)

## Start:  AIC=2246.51
## medv ~ 1
## 
##           Df Sum of Sq   RSS    AIC
## + lstat    1   23243.9 19472 1851.0
## + rm       1   20654.4 22062 1914.2
## + ptratio  1   11014.3 31702 2097.6
## + indus    1    9995.2 32721 2113.6
## + tax      1    9377.3 33339 2123.1
## + nox      1    7800.1 34916 2146.5
## + crim     1    6440.8 36276 2165.8
## + rad      1    6221.1 36495 2168.9
## + age      1    6069.8 36647 2171.0
## + zn       1    5549.7 37167 2178.1
## + black    1    4749.9 37966 2188.9
## + dis      1    2668.2 40048 2215.9
## + chas     1    1312.1 41404 2232.7
## <none>                 42716 2246.5
## 
## Step:  AIC=1851.01
## medv ~ lstat
## 
##           Df Sum of Sq   RSS    AIC
## + rm       1    4033.1 15439 1735.6
## + ptratio  1    2670.1 16802 1778.4
## + chas     1     786.3 18686 1832.2
## + dis      1     772.4 18700 1832.5
## + age      1     304.3 19168 1845.0
## + tax      1     274.4 19198 1845.8
## + black    1     198.3 19274 1847.8
## + zn       1     160.3 19312 1848.8
## + crim     1     146.9 19325 1849.2
## + indus    1      98.7 19374 1850.4
## <none>                 19472 1851.0
## + rad      1      25.1 19447 1852.4
## + nox      1       4.8 19468 1852.9
## - lstat    1   23243.9 42716 2246.5
## 
## Step:  AIC=1735.58
## medv ~ lstat + rm
## 
##           Df Sum of Sq   RSS    AIC
## + ptratio  1    1711.3 13728 1678.1
## + chas     1     548.5 14891 1719.3
## + black    1     512.3 14927 1720.5
## + tax      1     425.2 15014 1723.5
## + dis      1     351.2 15088 1725.9
## + crim     1     311.4 15128 1727.3
## + rad      1     180.5 15259 1731.6
## + indus    1      61.1 15378 1735.6
## <none>                 15439 1735.6
## + zn       1      56.6 15383 1735.7
## + age      1      20.2 15419 1736.9
## + nox      1      14.9 15424 1737.1
## - rm       1    4033.1 19472 1851.0
## - lstat    1    6622.6 22062 1914.2
## 
## Step:  AIC=1678.13
## medv ~ lstat + rm + ptratio
## 
##           Df Sum of Sq   RSS    AIC
## + dis      1     499.1 13229 1661.4
## + black    1     389.7 13338 1665.6
## + chas     1     378.0 13350 1666.0
## + crim     1     122.5 13606 1675.6
## + age      1      66.2 13662 1677.7
## <none>                 13728 1678.1
## + tax      1      44.4 13684 1678.5
## + nox      1      24.8 13703 1679.2
## + zn       1      15.0 13713 1679.6
## + rad      1       6.1 13722 1679.9
## + indus    1       0.8 13727 1680.1
## - ptratio  1    1711.3 15439 1735.6
## - rm       1    3074.3 16802 1778.4
## - lstat    1    5013.6 18742 1833.7
## 
## Step:  AIC=1661.39
## medv ~ lstat + rm + ptratio + dis
## 
##           Df Sum of Sq   RSS    AIC
## + nox      1     759.6 12469 1633.5
## + black    1     502.6 12726 1643.8
## + chas     1     267.4 12962 1653.1
## + indus    1     242.6 12986 1654.0
## + tax      1     240.3 12989 1654.1
## + crim     1     233.5 12995 1654.4
## + zn       1     144.8 13084 1657.8
## + age      1      61.4 13168 1661.0
## <none>                 13229 1661.4
## + rad      1      22.4 13206 1662.5
## - dis      1     499.1 13728 1678.1
## - ptratio  1    1859.3 15088 1725.9
## - rm       1    2622.6 15852 1750.9
## - lstat    1    5349.2 18578 1831.2
## 
## Step:  AIC=1633.47
## medv ~ lstat + rm + ptratio + dis + nox
## 
##           Df Sum of Sq   RSS    AIC
## + chas     1     328.3 12141 1622.0
## + black    1     311.8 12158 1622.7
## + zn       1     151.7 12318 1629.3
## + crim     1     141.4 12328 1629.7
## + rad      1      53.5 12416 1633.3
## <none>                 12469 1633.5
## + indus    1      17.1 12452 1634.8
## + tax      1      10.5 12459 1635.0
## + age      1       0.2 12469 1635.5
## - nox      1     759.6 13229 1661.4
## - dis      1    1233.8 13703 1679.2
## - ptratio  1    2116.5 14586 1710.8
## - rm       1    2546.2 15016 1725.5
## - lstat    1    3664.3 16134 1761.8
## 
## Step:  AIC=1621.97
## medv ~ lstat + rm + ptratio + dis + nox + chas
## 
##           Df Sum of Sq   RSS    AIC
## + black    1     272.8 11868 1612.5
## + zn       1     164.4 11977 1617.1
## + crim     1     116.3 12025 1619.1
## + rad      1      58.6 12082 1621.5
## <none>                 12141 1622.0
## + indus    1      26.3 12115 1622.9
## + tax      1       4.2 12137 1623.8
## + age      1       2.3 12139 1623.9
## - chas     1     328.3 12469 1633.5
## - nox      1     820.4 12962 1653.1
## - dis      1    1146.8 13288 1665.6
## - ptratio  1    1924.9 14066 1694.4
## - rm       1    2480.7 14622 1714.0
## - lstat    1    3509.3 15650 1748.5
## 
## Step:  AIC=1612.47
## medv ~ lstat + rm + ptratio + dis + nox + chas + black
## 
##           Df Sum of Sq   RSS    AIC
## + zn       1    189.94 11678 1606.3
## + rad      1    144.32 11724 1608.3
## + crim     1     55.63 11813 1612.1
## <none>                 11868 1612.5
## + indus    1     15.58 11853 1613.8
## + age      1      9.45 11859 1614.1
## + tax      1      2.70 11866 1614.4
## - black    1    272.84 12141 1622.0
## - chas     1    289.27 12158 1622.7
## - nox      1    626.85 12495 1636.5
## - dis      1   1103.33 12972 1655.5
## - ptratio  1   1804.30 13672 1682.1
## - rm       1   2658.21 14526 1712.7
## - lstat    1   2991.55 14860 1724.2
## 
## Step:  AIC=1606.31
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
## 
##           Df Sum of Sq   RSS    AIC
## + crim     1     94.71 11584 1604.2
## + rad      1     93.61 11585 1604.2
## <none>                 11678 1606.3
## + indus    1     16.05 11662 1607.6
## + tax      1      3.95 11674 1608.1
## + age      1      1.49 11677 1608.2
## - zn       1    189.94 11868 1612.5
## - black    1    298.37 11977 1617.1
## - chas     1    300.42 11979 1617.2
## - nox      1    627.62 12306 1630.8
## - dis      1   1276.45 12955 1656.8
## - ptratio  1   1364.63 13043 1660.2
## - rm       1   2384.55 14063 1698.3
## - lstat    1   3052.50 14731 1721.8
## 
## Step:  AIC=1604.19
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim
## 
##           Df Sum of Sq   RSS    AIC
## + rad      1    228.60 11355 1596.1
## <none>                 11584 1604.2
## + indus    1     15.77 11568 1605.5
## + age      1      2.47 11581 1606.1
## + tax      1      1.31 11582 1606.1
## - crim     1     94.71 11678 1606.3
## - black    1    222.18 11806 1611.8
## - zn       1    229.02 11813 1612.1
## - chas     1    284.34 11868 1614.5
## - nox      1    578.44 12162 1626.8
## - ptratio  1   1192.90 12776 1651.8
## - dis      1   1345.70 12929 1657.8
## - rm       1   2419.57 14003 1698.2
## - lstat    1   2753.42 14337 1710.1
## 
## Step:  AIC=1596.1
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim + rad
## 
##           Df Sum of Sq   RSS    AIC
## + tax      1    273.62 11081 1585.8
## <none>                 11355 1596.1
## + indus    1     33.89 11321 1596.6
## + age      1      0.10 11355 1598.1
## - zn       1    171.14 11526 1601.7
## - rad      1    228.60 11584 1604.2
## - crim     1    229.70 11585 1604.2
## - chas     1    272.67 11628 1606.1
## - black    1    295.78 11651 1607.1
## - nox      1    785.16 12140 1627.9
## - dis      1   1341.37 12696 1650.6
## - ptratio  1   1419.77 12775 1653.7
## - rm       1   2182.57 13538 1683.1
## - lstat    1   2785.28 14140 1705.1
## 
## Step:  AIC=1585.76
## medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
##     crim + rad + tax
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 11081 1585.8
## + indus    1      2.52 11079 1587.7
## + age      1      0.06 11081 1587.8
## - chas     1    227.21 11309 1594.0
## - crim     1    245.37 11327 1594.8
## - zn       1    257.82 11339 1595.4
## - black    1    270.82 11352 1596.0
## - tax      1    273.62 11355 1596.1
## - rad      1    500.92 11582 1606.1
## - nox      1    541.91 11623 1607.9
## - ptratio  1   1206.45 12288 1636.0
## - dis      1   1448.94 12530 1645.9
## - rm       1   1963.66 13045 1666.3
## - lstat    1   2723.48 13805 1695.0

# Coefficient table and fit statistics of the model chosen by step().
summary(step_one)

## 
## Call:
## lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas + 
##     black + zn + crim + rad + tax, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## black         0.009291   0.002674   3.475 0.000557 ***
## zn            0.045845   0.013523   3.390 0.000754 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

모형의 계수(모수) 개수 확인하기

# Number of estimated coefficients in the selected model, intercept included.
length(coef(step_one))

## [1] 12

다중 공선성 확인하기

# Flag terms whose vif() value exceeds 10 (presumably variance inflation
# factors, used here as a multicollinearity check — confirm which package's vif).
vif(step_one) > 10

##   lstat      rm ptratio     dis     nox    chas   black      zn    crim     rad 
##   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE 
##     tax 
##   FALSE

결과 모형 시각화

# Predicted medv from the stepwise-selected model for each test row.
pred_step <- predict(step_one, newdata = test)

pred_step

##         4         5         6         7         9        18        19        27 
## 28.647995 27.982641 25.298037 22.982506 11.514153 16.956423 16.262046 15.497800 
##        28        33        37        52        57        59        70        76 
## 14.752117  8.892517 22.406970 23.912795 24.872847 21.745065 20.806181 23.912454 
##        80        95        97        98       101       102       117       122 
## 22.383017 26.862916 24.833877 35.851970 24.654400 25.668616 23.449469 22.140877 
##       128       130       135       142       143       144       145       146 
## 15.088861 13.966092 13.157465  3.945066 14.642722 12.161321  8.754922 12.051149 
##       148       149       150       162       164       175       178       179 
##  8.541784  9.741287 14.808057 36.610403 41.705425 26.625987 29.216443 31.517915 
##       182       185       186       191       193       195       198       205 
## 27.860109 22.801576 24.853099 30.820495 32.968202 31.567793 32.682068 43.136101 
##       209       214       220       223       225       227       230       232 
## 23.485857 25.188284 29.593372 32.198974 38.378361 37.581779 31.286010 33.293005 
##       235       238       240       242       243       248       249       251 
## 31.771354 32.720166 28.409392 23.719345 24.088929 19.806099 21.254268 24.195779 
##       254       255       256       258       262       267       270       272 
## 29.892381 23.932557 21.672802 43.379033 37.250005 31.334653 25.732112 27.199516 
##       273       280       290       292       293       302       308       310 
## 28.487324 35.318524 26.859459 34.361378 31.801079 28.922955 32.752564 23.596024 
##       317       319       322       323       328       332       334       335 
## 17.621271 24.293736 24.879915 22.884552 19.359223 19.998750 22.149838 21.566474 
##       337       339       343       350       353       354       358       363 
## 20.198213 22.212974 22.227842 22.224222 16.934217 25.241934 22.762791 18.246554 
##       366       367       370       371       372       380       382       383 
## 14.302181 15.563032 32.626363 34.586136 24.910039 16.836662 18.473228 13.450343 
##       387       392       398       403       407       409       417       422 
##  6.164554 17.310588 16.346933 18.266010  8.086738 13.701536 13.384999 18.133335 
##       428       441       446       447       463       468       472       477 
## 14.070446 12.734062 12.008867 17.699122 19.767151 16.906857 22.887320 20.498934 
##       480       487       489       493       494       495       498 
## 21.823125 19.590864 11.791079 15.853051 20.728662 20.724054 19.194181

# Actual vs. predicted medv for the stepwise model: training rows as small
# circles, test rows as red triangles, and the y = x reference line in blue.
# Training predictions come from predict(step_one, train) so that both point
# sets show the same model (the fitted values of `lm.fit` are a different model).
plot(train$medv, predict(step_one, train),
     xlab = "Actual", ylab = "Predicted", cex = 0.5)
points(test$medv, pred_step, cex = 0.5, col = "red", pch = 2)
abline(a = 0, b = 1, col = "blue")

KNN

# Training / test setup for the KNN section.
# Boston was already attach()ed at the top of the document and every
# statement in this section indexes it explicitly (Boston[...]), so it is not
# attached a second time: a repeated attach() only stacks another copy on the
# search path and masks the first one.

# Preview the data.
# NOTE(review): as_tibble(Boston) is the usual data-frame-to-tibble
# conversion; tibble(Boston) is kept so the printed preview stays as shown.
tibble(Boston)

## # A tibble: 506 x 1
##    Boston$crim   $zn $indus $chas  $nox   $rm  $age  $dis  $rad  $tax $ptratio
##          <dbl> <dbl>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>    <dbl>
##  1     0.00632  18     2.31     0 0.538  6.58  65.2  4.09     1   296     15.3
##  2     0.0273    0     7.07     0 0.469  6.42  78.9  4.97     2   242     17.8
##  3     0.0273    0     7.07     0 0.469  7.18  61.1  4.97     2   242     17.8
##  4     0.0324    0     2.18     0 0.458  7.00  45.8  6.06     3   222     18.7
##  5     0.0690    0     2.18     0 0.458  7.15  54.2  6.06     3   222     18.7
##  6     0.0298    0     2.18     0 0.458  6.43  58.7  6.06     3   222     18.7
##  7     0.0883   12.5   7.87     0 0.524  6.01  66.6  5.56     5   311     15.2
##  8     0.145    12.5   7.87     0 0.524  6.17  96.1  5.95     5   311     15.2
##  9     0.211    12.5   7.87     0 0.524  5.63 100    6.08     5   311     15.2
## 10     0.170    12.5   7.87     0 0.524  6.00  85.9  6.59     5   311     15.2
## # ... with 496 more rows, and 3 more variables: $black <dbl>, $lstat <dbl>,
## #   $medv <dbl>

set.seed(1004)

# Label every row of Boston as "train" or "test", drawn with replacement
# using 8:2 sampling weights.
flag <- sample(
  c("train", "test"),
  size = nrow(Boston),
  replace = TRUE,
  prob = c(8, 2)
)

train <- Boston[flag == "train", ]
test <- Boston[flag == "test", ]

# Assign a validation set: 30% of the training rows, chosen at random.
valid_flag <- sample(seq_len(nrow(train)), size = nrow(train) * 0.3)

valid_knn <- train[valid_flag, ]
train_knn <- train[-valid_flag, ]

# Standardization
# Centering/scaling statistics are estimated on the training fold only and
# then applied to the training, validation and test rows. The scaler is fit
# on every column except the first (crim), so crim passes through unscaled
# while all other columns, including the response medv, are standardized.
knn_model <- preProcess(train_knn[, -1], method = c("center", "scale"))

# NOTE: `train` and `test` are overwritten with their standardized versions;
# the random forest and rpart sections below use these objects.
train <- predict(knn_model, train_knn)
valid <- predict(knn_model, valid_knn)
test <- predict(knn_model, test)

# Predictor matrices for KNN.
# The response medv must not be among the predictors: the previous
# `[, -1]` indexing dropped crim but kept medv, so KNN was matching
# neighbours on the very value it predicts (target leakage). Columns are now
# selected by name. crim stays excluded, as before, because knn_model does
# not standardize it and a raw-scale column would dominate the distance.
# NOTE(review): consider standardizing crim as well and adding it back.
# NOTE(review): the selected k and the KNN MSEs printed further down were
# produced with medv among the predictors; re-knit to refresh them.
feature_cols <- setdiff(names(train), c("crim", "medv"))

tr_x <- train[, feature_cols]
va_x <- valid[, feature_cols]
te_x <- test[, feature_cols]

str(tr_x)

## 'data.frame':    283 obs. of  12 variables:
##  $ zn     : num  0.342 -0.4539 0.0988 0.0988 0.0988 ...
##  $ indus  : num  -1.322 -0.632 -0.516 -0.516 -0.516 ...
##  $ chas   : num  -0.244 -0.244 -0.244 -0.244 -0.244 ...
##  $ nox    : num  -0.191 -0.796 -0.314 -0.314 -0.314 ...
##  $ rm     : num  0.456 0.236 -0.893 -0.36 -0.524 ...
##  $ age    : num  -0.117 0.369 1.118 0.618 -1.047 ...
##  $ dis    : num  0.21 0.646 1.2 1.453 0.886 ...
##  $ rad    : num  -0.997 -0.883 -0.543 -0.543 -0.543 ...
##  $ tax    : num  -0.678 -0.992 -0.591 -0.591 -0.591 ...
##  $ ptratio: num  -1.547 -0.373 -1.594 -1.594 -1.594 ...
##  $ black  : num  0.447 0.447 0.339 0.34 0.38 ...
##  $ lstat  : num  -1.094 -0.515 2.38 0.593 0.4 ...

# Response vectors (medv) for the training, validation and test folds.

tr_y <- train[["medv"]]
va_y <- valid[["medv"]]
te_y <- test[["medv"]]

# Finding an optimal K
# For each k from 1 to 30, fit a KNN regressor on the training fold and
# record its mean squared error on the validation fold.

MSE_k <- numeric(30)
for (i in seq_len(30)) {
  m_knn <- knnreg(tr_x, tr_y, k = i)
  va_pred <- predict(m_knn, va_x)
  MSE_va <- mean((va_pred - va_y)^2)
  MSE_k[i] <- MSE_va
}

# k with the smallest validation MSE
which.min(MSE_k)

## [1] 2

# Refit KNN on the training fold with the k that minimised validation MSE.
best_k <- which.min(MSE_k)
m_knn <- knnreg(tr_x, tr_y, k = best_k)

# Predictions for the training and test rows; pred_knn is an alias for the
# test predictions.
pred_knn_tr <- predict(m_knn, tr_x)
pred_knn_te <- predict(m_knn, te_x)
pred_knn <- pred_knn_te

# training MSE
MSE_tr_knn <- mean((pred_knn_tr - train$medv)^2)
MSE_tr_knn

## [1] 0.02429114

# test MSE: mean squared difference between KNN test predictions and medv
test_sq_err <- (pred_knn_te - test$medv)^2
MSE_te_knn <- mean(test_sq_err)
MSE_te_knn

## [1] 0.107748

# Actual vs. predicted medv for KNN: training predictions as filled black
# dots, test predictions as red triangles, and a dashed blue y = x reference.
plot(
  train$medv, pred_knn_tr,
  cex = 0.5, pch = 19,
  xlab = "Actual price", ylab = "Predicted price", main = "KNN"
)
points(test$medv, pred_knn_te, pch = 2, cex = 0.5, col = "red")
lines(train$medv, train$medv, lty = 2, col = "blue")
# The legend's line type must match the reference line drawn above
# (lty = 2, dashed); it previously showed a solid line (lty = 1).
legend(
  "topright",
  legend = c("training", "test", "Actual price"),
  pch = c(19, 2, NA),
  col = c("black", "red", "blue"),
  lty = c(NA, NA, 2)
)

Random Forest

# Fit a regression random forest for medv on all other columns of `train`.
# Uses TRUE rather than T (T is an ordinary variable that can be reassigned)
# and left-assignment so the target name is visible at the start.
rm_lm <- randomForest::randomForest(
  medv ~ .,
  data = train,
  ntree = 100,        # number of trees
  mtry = 12,          # number of variables considered at each split
  importance = TRUE,  # compute variable-importance measures
  na.action = na.omit
)

rm_lm

## 
## Call:
##  randomForest(formula = medv ~ ., data = train, ntree = 100, mtry = 12,      importance = T, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 12
## 
##           Mean of squared residuals: 0.1273245
##                     % Var explained: 87.22

rm_lm$mse # check the error: MSE values stored on the fitted forest

##   [1] 0.1940397 0.2374696 0.2295626 0.2738142 0.2235058 0.2207807 0.1937333
##   [8] 0.2162865 0.2027402 0.2001209 0.1884504 0.1765167 0.1832419 0.1621718
##  [15] 0.1589990 0.1637178 0.1560163 0.1479209 0.1467187 0.1436222 0.1385999
##  [22] 0.1366274 0.1350043 0.1360038 0.1336296 0.1351691 0.1374413 0.1376752
##  [29] 0.1332711 0.1382791 0.1375132 0.1351487 0.1383579 0.1370608 0.1384346
##  [36] 0.1380418 0.1418443 0.1425056 0.1408705 0.1394818 0.1374067 0.1385827
##  [43] 0.1385760 0.1389907 0.1373392 0.1379532 0.1379855 0.1362088 0.1365497
##  [50] 0.1361787 0.1396227 0.1384502 0.1368060 0.1353351 0.1340874 0.1325548
##  [57] 0.1314070 0.1300176 0.1303804 0.1294095 0.1282636 0.1281030 0.1287540
##  [64] 0.1285293 0.1284314 0.1284836 0.1288628 0.1285740 0.1297792 0.1293553
##  [71] 0.1282734 0.1282212 0.1279820 0.1283967 0.1292654 0.1304802 0.1304559
##  [78] 0.1308265 0.1290593 0.1285534 0.1291263 0.1285451 0.1292779 0.1288247
##  [85] 0.1281480 0.1286090 0.1275658 0.1276529 0.1269840 0.1266019 0.1269341
##  [92] 0.1277946 0.1284814 0.1281689 0.1285915 0.1285107 0.1279970 0.1276308
##  [99] 0.1263938 0.1273245

# Plot the fitted forest (presumably error against number of trees; confirm
# against the randomForest plot method).
plot(rm_lm)

# Importance matrix as stored on the fit, rounded to two decimals.
round(rm_lm$importance, 2)

##         %IncMSE IncNodePurity
## crim       0.05         10.19
## zn         0.00          0.16
## indus      0.02          1.70
## chas       0.00          0.07
## nox        0.08         11.96
## rm         0.51         93.81
## age        0.04          4.46
## dis        0.18         21.24
## rad        0.00          1.04
## tax        0.02          2.76
## ptratio    0.02          3.01
## black      0.01          3.15
## lstat      1.02        122.57

# Importance via the extractor function. Its %IncMSE column differs from the
# stored rm_lm$importance matrix, presumably because importance() rescales
# by default (confirm in the randomForest documentation).
importance(rm_lm)

##            %IncMSE IncNodePurity
## crim     5.3246446   10.19457052
## zn       1.0863427    0.16020563
## indus    4.4415660    1.70233787
## chas    -1.5376012    0.07409163
## nox      5.0659436   11.96372353
## rm      23.0203280   93.81098450
## age      7.1519167    4.45958224
## dis     10.2965283   21.23655246
## rad      0.6509553    1.04090473
## tax      2.5937085    2.76472640
## ptratio  5.4895482    3.01486756
## black    4.7811635    3.14743756
## lstat   18.4976580  122.56528773

# Plot the variable-importance measures of the fitted forest.
varImpPlot(rm_lm)

# In-sample predictions for the training rows (used in the plot below).
pred_train_rm <- predict(rm_lm, train)

# Training MSE: mean of (predicted - actual)^2.
# The previous code was mean(x, 2), which passes 2 as mean()'s `trim`
# argument and therefore returned the median residual instead of the mean
# squared error. rm_lm$predicted holds the forest's out-of-bag predictions,
# so this equals the "Mean of squared residuals" printed for the fit.
MSE_train_rf <- mean((rm_lm$predicted - train$medv)^2)

MSE_train_rf

## [1] 0.1273245

# Score the test rows with the fitted forest; pred_rf is an alias for the
# test predictions.
pred_test_rm <- predict(rm_lm, test)
pred_rf <- pred_test_rm


# Test MSE. The object name is spelled MES_test_rf as in the original so any
# other reference to it keeps working.
MES_test_rf <- mean((test$medv - pred_test_rm)^2)
MES_test_rf

## [1] 0.1000575

# Actual vs. predicted medv for the random forest: training predictions
# (default marker), test predictions (red triangles), y = x line in blue.
plot(
  x = train$medv,
  y = pred_train_rm,
  xlab = "Actual",
  ylab = "Predicted",
  cex = 0.5
)
points(x = test$medv, y = pred_test_rm, col = "red", pch = 2)
lines(x = train$medv, y = train$medv, col = "blue")

Rpart

# Fit a single regression tree for medv on all other columns of `train`,
# then print its node/split listing.
tree <- rpart::rpart(formula = medv ~ ., data = train)
print(tree)

## n= 283 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 283 282.000000 -1.004301e-16  
##    2) lstat>=-0.4349859 171  44.082330 -5.421342e-01  
##      4) lstat>=0.4909677 77  13.384410 -9.134104e-01  
##        8) crim>=7.537955 40   3.953395 -1.156044e+00 *
##        9) crim< 7.537955 37   4.530396 -6.511039e-01 *
##      5) lstat< 0.4909677 94  11.389210 -2.380036e-01 *
##    3) lstat< -0.4349859 112 110.925200  8.277227e-01  
##      6) rm< 1.709961 96  52.348050  5.390202e-01  
##       12) dis>=-0.8508877 89  26.177900  4.082646e-01  
##         24) rm< 0.7733449 64   8.072064  1.486750e-01  
##           48) rm< 0.4104152 44   3.631000  6.375280e-03 *
##           49) rm>=0.4104152 20   1.589974  4.617345e-01 *
##         25) rm>=0.7733449 25   2.752436  1.072814e+00 *
##       13) dis< -0.8508877 7   5.302009  2.201484e+00 *
##      7) rm>=1.709961 16   2.566485  2.559938e+00 *

# Use a single-panel layout with xpd = NA, keeping the previous graphics
# settings so they can be restored afterwards.
opar <- par(mfrow = c(1, 1), xpd = NA)

# Draw the tree and label its nodes (use.n = TRUE).
plot(tree)
text(tree, use.n = TRUE)

# Restore the graphics settings saved above.
par(opar)

즉, 나무 모형은 데이터의 서로 다른 영역에서 변수들 간의 변화하는 상관관계를 찾아낼 수 있다.

Boston Housing Prediction with linear regression

DoEUN GIM

2020 3 3