1. Introduction

1.1. House Sales in King County, USA

2014년 5월부터 2015년 5월 까지의 시애틀 킹 카운티의 주택 실거래가 정보(집과 관련된 다양한 변수들을 기준으로)에 머신 러닝 모델을 적용하여 주택 가격을 예측하는 모형을 만드는 것이 목표이다.

1.2. Data description

  • id : a notation for a house
  • date : Date house was sold
  • price : Price is prediction target
  • bedrooms : Number of Bedrooms/House
  • bathrooms : Number of bathrooms/bedrooms
  • sqft_living : square footage of the home
  • sqft_lot : square footage of the lot
  • floors : Total floors (levels) in house
  • waterfront : House which has a view to a waterfront
  • view : Has been viewed
  • condition : How good the condition is ( Overall )
  • grade : overall grade given to the housing unit, based on King County grading system
  • sqft_above : square footage of house apart from basement
  • sqft_basement : square footage of the basement
  • yr_built : Built Year
  • yr_renovated : Year when house was renovated
  • zipcode : zip
  • lat : Latitude coordinate
  • long : Longitude coordinate
  • sqft_living15 : Living room area in 2015(implies– some renovations) This might or might not have affected the lotsize area
  • sqft_lot15 : lotSize area in 2015(implies– some renovations)

2. Collect the data

Read the data

suppressMessages(library(tidyverse))
suppressMessages(library(stringr))
suppressMessages(library(lubridate))
suppressMessages(library(DT))
suppressMessages(library(caret))
suppressMessages(library(leaflet))
suppressMessages(library(corrplot))
suppressMessages(library(boot)) #for diagnostic plots
suppressMessages(library(ggthemes))


rm(list=ls())

fillColor = "#FFA07A"
fillColor2 = "#F1C40F"

house_df = read_csv("./input/house.csv")

Peek into the Data

datatable(house_df, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))

3. Exploratory Data Analysis

EDA 1 : Bedrooms and Price

침실 수(bedrooms)에 따라 가격이 어떻게 달라지는지 확인해보겠다.

house_df %>%
  group_by(bedrooms) %>%
  summarise(price_median = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(bedrooms = reorder(bedrooms,price_median)) %>%
  arrange(desc(price_median)) %>%
 
  ggplot(aes(x = bedrooms, y = price_median)) +
  geom_bar(stat='identity', colour="white", fill = fillColor2) +
  geom_text(aes(x = bedrooms, y = 1, label = paste0("(",price_median,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'bedrooms', 
       y = 'Median Price', 
       title = 'bedrooms and Median Price') +
  coord_flip() +
  theme_economist()

EDA 2 : Bathrooms and Price

욕실의 수(bathrooms)가 가격에 어떻게 영향을 미치는지 조사해보겠다.

house_df %>%
  group_by(bathrooms) %>%
  summarise(price_median = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(bathrooms = reorder(bathrooms, price_median)) %>%
  arrange(desc(price_median)) %>%
  head(10) %>%
  
  ggplot(aes(x = bathrooms, y = price_median)) +
  geom_bar(stat = "identity", colour="white", fill = fillColor2) +
  geom_text(aes(x = bathrooms, y = 1, label = paste0("(",price_median,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'bathrooms', 
       y = 'Median Price', 
       title = 'bathrooms and Median Price') +
  coord_flip() + 
  theme_economist()

EDA 3 : Grade and Price

등급(grade)이 가격에 미치는 영향을 살펴보겠다.

house_df %>%
  group_by(grade) %>%
  summarise(price_median = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(grade = reorder(grade, price_median)) %>%
  arrange(desc(price_median)) %>%
  
  ggplot(aes(x = grade, y = price_median)) +
  geom_bar(stat = "identity", colour="white", fill = fillColor2) +
  geom_text(aes(x = grade, y = 1, label = paste0("(",price_median,")", sep="")),
            hjust = 0, vjust = .5, size = 4, colour = "black",
            fontface = 'bold') +
  labs(x = 'grade',
       y = 'Median Price',
       title = 'grade and Median price') +
  coord_flip() +
  theme_economist()

EDA 4 : waterfront and Price

해안가의 전망 유무가(waterfront)이 가격에 영향을 끼치는지 확인해보기

house_df %>%
  group_by(waterfront) %>%
  summarise(price_median = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(waterfront = reorder(waterfront, price_median)) %>%
  arrange(desc(price_median)) %>%
  
  ggplot(aes(x = waterfront,y = price_median)) +
  geom_bar(stat='identity', colour="white", fill = fillColor2) +
  
  labs(x = 'waterfront', 
       y = 'Median Price', 
       title = 'waterfront and Median Price') +
  theme_economist()

EDA 5 : Sqft Living and Price

집면적(sqft_living)에 따른 주택 가격의 산점도를 그려보겠다.

house_df %>%
  filter(!is.na(price)) %>%
  filter(!is.na(sqft_living)) %>%
  
  ggplot(aes(x = sqft_living, y = price)) +
  geom_point(color = "orange") +
  
  stat_smooth(aes(x = sqft_living, y = price), method = "lm", color = "red") +
  theme_economist() +
  theme(axis.title = element_text(size = 16), axis.text = element_text(size = 14)) +
  xlab("(Sqft Living)") +
  ylab("Price")

EDA 6 : Sqft Lot and Price

Plot

house_df %>% 
  filter(!is.na(price)) %>% 
  filter(!is.na(sqft_lot)) %>% 
  
  ggplot(aes(x=sqft_lot,y=price)) +
  geom_point(color = "orange") +
  
  stat_smooth(aes(x=sqft_lot,y=price),method="lm", color="red") +
  theme_economist() +
  theme(axis.title = element_text(size=16),axis.text = element_text(size=14)) +
  xlab("(Sqft Lot)") +
  ylab("Price")

Plot with limits x axis

house_df %>% 
  filter(!is.na(price)) %>% 
  filter(!is.na(sqft_lot)) %>% 
  
  ggplot(aes(x=sqft_lot,y=price)) +
  geom_point(color = "orange") +
  
  scale_x_continuous(limits=c(0,500e3)) +
  stat_smooth(aes(x=sqft_lot,y=price),method="lm", color="red") +
  theme_economist() +
  theme(axis.title = element_text(size=16),axis.text = element_text(size=14)) +
  xlab("(Sqft Lot)") +
  ylab("Price")

EDA 7 : Year Built and Price

house_df %>%
  group_by(yr_built) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(yr_built = reorder(yr_built,PriceMedian)) %>%
  arrange(desc(PriceMedian)) %>%
  head(10) %>%
  
  
  ggplot(aes(x = yr_built,y = PriceMedian)) +
  geom_bar(stat='identity', colour="white", fill = fillColor2) +
  geom_text(aes(x = yr_built, y = 1, label = paste0("(",PriceMedian,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'year built', 
       y = 'Median Price', 
       title = 'year built and Median Price') +
  coord_flip() + 
  theme_economist()

EDA 8 : Year Renovated and Price

house_df %>%
  group_by(yr_renovated) %>%
  summarise(PriceMedian = median(price, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(yr_renovated = reorder(yr_renovated,PriceMedian)) %>%
  arrange(desc(PriceMedian)) %>%
  head(10) %>%
  
  
  ggplot(aes(x = yr_renovated,y = PriceMedian)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = yr_renovated, y = 1, label = paste0("(",PriceMedian,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'year renovated', 
       y = 'Median Price', 
       title = 'year renovated and Median Price') +
  coord_flip() + 
  theme_economist()

EDA 9 : Price Plots

plot 1

house_df %>%
  
  ggplot(aes(x = price)) +    
  geom_histogram(alpha = 0.8,fill = fillColor2) +
  
  labs(x= 'Price',y = 'Count', title = paste("Distribution of", ' Price ')) +
  theme_economist()

Price Plots highest price being 2 million

최대값을 2백만 달러로 한정

house_df %>%
  
  ggplot(aes(x = price)) +    
  geom_histogram(alpha = 0.8,fill = fillColor2) +
  scale_x_continuous(limits=c(0,2e6)) +
  
  labs(x= 'Price',y = 'Count', title = paste("Distribution of", ' Price ')) +
  theme_economist()

Price Plots highest price being 1 million

최대값을 1백만 달러로 한정

house_df %>%
  
  ggplot(aes(x = price)) +    
  geom_histogram(alpha = 0.8,fill = fillColor2) +
  scale_x_continuous(limits=c(0,1e6)) +
  
  labs(x= 'Price',y = 'Count', title = paste("Distribution of", ' Price ')) +
  theme_economist()

EDA 10 : Maps of Houses

해안 가까이에있는 주택은 비용이 많이 든다.

house_df$PriceBin<-cut(house_df$price, c(0,250e3,500e3,750e3,1e6,2e6,999e6))

center_lon = median(house_df$long,na.rm = TRUE)
center_lat = median(house_df$lat,na.rm = TRUE)

factpal <- colorFactor(c("black","blue","yellow","orange","#0B5345","red"), 
                       house_df$PriceBin)



leaflet(house_df) %>% addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~long, lat = ~lat, 
             color = ~factpal(PriceBin))  %>%
  # controls
  setView(lng=center_lon, lat=center_lat,zoom = 12) %>%
  
  addLegend("bottomright", pal = factpal, values = ~PriceBin,
            title = "House Price Distribution",
            opacity = 1)

EDA 11: Price Bins Count

대부분의 주택은 250 thousand에서 500 thousand 범위에 있다. 다음으로 가장 높은 카테고리는 아래와 같다.

  • 500 to 750 thousand

  • 0 to 250 thousand

  • 750 thousand to 1 million

  • 1 million to 2 million

  • and the least is above 2 million

house_df %>%
  mutate(PriceBin = as.factor(PriceBin)) %>%
  group_by(PriceBin) %>%
  dplyr::summarise(Count = n()) %>%
  ungroup() %>%
  mutate(PriceBin = reorder(PriceBin,Count)) %>%
  arrange(desc(Count)) %>%
  
  ggplot(aes(x = PriceBin,y = Count)) +
  geom_bar(stat='identity',colour="white", fill = fillColor2) +
  geom_text(aes(x = PriceBin, y = 1, label = paste0("(",Count,")",sep="")),
            hjust=0, vjust=.5, size = 4, colour = 'black',
            fontface = 'bold') +
  labs(x = 'PriceBin', 
       y = 'Count', 
       title = 'PriceBin and Count') +
  coord_flip() + 
  theme_bw()

EDA 12 : Price Bins and Maps

PriceBinGrouping = function(limit1, limit2)
{
  return(
    
    house_df %>%
      filter(price > limit1) %>%
      filter(price <= limit2)
  )
}

PriceGroup1 = PriceBinGrouping(0,250e3)

PriceGroup2 = PriceBinGrouping(250e3,500e3)

PriceGroup3 = PriceBinGrouping(500e3,750e3)

PriceGroup4 = PriceBinGrouping(750e3,1e6)

PriceGroup5 = PriceBinGrouping(1e6,2e6)

PriceGroup6 = PriceBinGrouping(2e6,999e6)

Price Group 1

The map shows houses in the range 0 to 250 thousands.

MapPriceGroups = function(PriceGroupName,color)
{
  center_lon = median(PriceGroupName$long,na.rm = TRUE)
  center_lat = median(PriceGroupName$lat,na.rm = TRUE)

leaflet(PriceGroup2) %>% addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~long, lat = ~lat, 
             color = ~c(color))  %>%
  # controls
  setView(lng=center_lon, lat=center_lat,zoom = 12)
}

MapPriceGroups(PriceGroup1, "black")

Price Group 2

The map shows houses in the range 250 to 500 thousands.

MapPriceGroups(PriceGroup2, "blue")

Price Group 3

The map shows houses in the range 500 to 750 thousands.

MapPriceGroups(PriceGroup3, "yellow")

Price Group 4

The map shows houses in the range 750 to 1 million.

MapPriceGroups(PriceGroup4, "orange")

Price Group 5

The map shows houses in the range 1 million to 2 million.

MapPriceGroups(PriceGroup5, "#0B5345")

Price Group 6

The map shows houses in the range 2 million and above.

MapPriceGroups(PriceGroup6, "red")

Correlation Matrix

sqft_living, sqft_above, grade, sqft_living_15. bathrooms, bedrooms, floors 변수는 price 변수와 강한 상관관계를 갖고 있음을 알 수 있다.

house_df = house_df %>% select(-PriceBin)

house_df3 = house_df %>%
  select(-id,-date)

CorrelationResults = cor(house_df3)

corrplot(CorrelationResults, method="circle", type="full", order="hclust", addrect=5)

4. Modeling

4.1. Setting

-Load libraies

suppressMessages(library(corrplot)) # correlation
suppressMessages(library(caret)) # Machine learning
suppressMessages(library(car)) # 다중공선성
suppressMessages(library(relaimpo)) # 변수 중요도
suppressMessages(library(glmnet)) # Shrinkage method(릿지, 라쏘)
suppressMessages(library(rpart)) # Regression tree
suppressMessages(library(randomForest)) # randomForest
suppressMessages(library(gbm)) # Gradient boosting

- Load dataset

# data loading
house <- read.csv("./input/house.csv", na.string = c("", " "))
house <- house[, -c(1, 2)] # 불필요한 변수 제거 (id, date)

- Split the dataset

# training / test
set.seed(1004)
flag <- sample(c("tr", "te"), size = nrow(house), c(8, 2), replace = T) # 8:2로 training / test set
train <- house[which(flag == "tr"), ]
test <- house[which(flag == "te"), ]

4.2. Preprocessing

# Preprocessing
boxplot(train)

b1 <- boxplot(train$price)

out1 <- which(train$price > b1$stats[5]) # price outlier index

- shapiro-wilk normality test

# shapiro-wilk normality test
shapiro.test(sample(house$price, size = 5000))
## 
##  Shapiro-Wilk normality test
## 
## data:  sample(house$price, size = 5000)
## W = 0.66283, p-value < 2.2e-16

- Distribution

# histogram
par(mfrow = c(4,4))
for(i in 1:17){
  hist(house[, i], main = colnames(house)[i], xlab = colnames(house)[i])
}

par(mfrow = c(1,1))

# original data / transformed data
plot(sort(house$price), ylab = "Price", pch = 19, cex = 0.5)

hist(sort(house$price), xlab = "Price", main = "Price")

plot(sort(log(house$price)), ylab = "ln(Price)", pch = 19, cex = 0.5)

hist(sort(log(house$price)), xlab = "ln(Price)", main = "ln(Price)")

- qqplot

### q-q plot
# original data
qqnorm(house$price, main = "Normal Q-Q Plot of ln(Price)")
qqline(house$price, col = "red")

# log-transformed data
qqnorm(log(house$price), main = "Normal Q-Q Plot of ln(Price)")
qqline(log(house$price), col = "red")

- box-cow transformation

### box-cow transformation
b <- BoxCoxTrans(house$price)
price_tr <- predict(b, house$price)

hist(price_tr)

qqnorm(price_tr)
qqline(price_tr, col = "red")

4.3. Correlation analysis

# Correlation analysis

c <- cor(train)
corrplot(c, method = "circle", order = "hclust", addrect = 5)

4.4. Simple linear regression

# Simple linear regression
lm <- lm(price~sqft_living, data = train) # y변수: price, x변수: sqft_living
summary(lm)
## 
## Call:
## lm(formula = price ~ sqft_living, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1449470  -146407   -24015   105542  4385229 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -39005.691   4876.094  -7.999 1.33e-15 ***
## sqft_living    278.322      2.145 129.743  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 260200 on 17351 degrees of freedom
## Multiple R-squared:  0.4924, Adjusted R-squared:  0.4924 
## F-statistic: 1.683e+04 on 1 and 17351 DF,  p-value: < 2.2e-16
plot(train$sqft_living, train$price, pch = 19, cex = 0.5, xlab = "Sqft_living", ylab = "Price") 
abline(lm, col = "red", lwd = 2) # 예측 회귀선

# Prediction 
pred_lm <- predict(lm, test) # test dataset을 이용하여 예측
# 결과 plotting
plot(train$price, lm$fitted.values, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price")
points(test$price, pred_lm, pch = 2, cex = 0.5, col = "red")
abline(1,1, lty = 2, col = "blue", lwd = 2)
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

# Performance evaluation
MSE <- function(y, y_hat){ return(mean((y - y_hat)^2)) }
MAPE <- function(y, y_hat){ return(mean(abs((y - y_hat) / y)) * 100) }
# MSE
MSE_tr1 <- MSE(train$price, lm$fitted.values)
MSE_tr1 # training MSE
## [1] 67717512084
MSE_te1 <- MSE(test$price, pred_lm)
MSE_te1 # test MSE
## [1] 70955856318
# MAPE
MAPE_tr1 <- MAPE(train$price, lm$fitted.values)
MAPE_tr1 # training MAPE
## [1] 35.86519
MAPE_te1 <- MAPE(test$price, pred_lm)
MAPE_te1 # test MAPE
## [1] 36.21707

4.5. Multiple Linear Regression

# Multiple Linear Regression
train <- subset(train, select = -(sqft_basement)) # sqft_basement 변수 제거
test <- subset(test, select = -(sqft_basement))

# Multiple Linear Regression
multi <- lm(price~., data = train) # 다중 회귀 모형
pred_multi <- predict(multi, test)

summary(multi)
## 
## Call:
## lm(formula = price ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1322694   -98400   -10705    75667  4386236 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.480e+06  3.246e+06   2.305   0.0212 *  
## bedrooms      -3.284e+04  2.067e+03 -15.889  < 2e-16 ***
## bathrooms      4.641e+04  3.601e+03  12.888  < 2e-16 ***
## sqft_living    1.403e+02  4.848e+00  28.943  < 2e-16 ***
## sqft_lot       1.514e-01  5.136e-02   2.948   0.0032 ** 
## floors         5.272e+03  3.980e+03   1.324   0.1854    
## waterfront     6.354e+05  1.940e+04  32.744  < 2e-16 ***
## view           4.838e+04  2.391e+03  20.237  < 2e-16 ***
## condition      2.708e+04  2.604e+03  10.402  < 2e-16 ***
## grade          9.746e+04  2.393e+03  40.730  < 2e-16 ***
## sqft_above     3.293e+01  4.804e+00   6.855 7.36e-12 ***
## yr_built      -2.654e+03  8.041e+01 -33.011  < 2e-16 ***
## yr_renovated   1.895e+01  4.043e+00   4.688 2.78e-06 ***
## zipcode       -5.811e+02  3.652e+01 -15.913  < 2e-16 ***
## lat            5.997e+05  1.190e+04  50.418  < 2e-16 ***
## long          -2.088e+05  1.452e+04 -14.380  < 2e-16 ***
## sqft_living15  2.124e+01  3.824e+00   5.555 2.82e-08 ***
## sqft_lot15    -3.961e-01  7.877e-02  -5.029 4.99e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 199700 on 17335 degrees of freedom
## Multiple R-squared:  0.7015, Adjusted R-squared:  0.7012 
## F-statistic:  2396 on 17 and 17335 DF,  p-value: < 2.2e-16

4.5.2. 다중공선성 확인

# 다중공선성 확인
vif(multi) # 분산 팽창 지수 계산 (VIF)
##      bedrooms     bathrooms   sqft_living      sqft_lot        floors 
##      1.647449      3.373018      8.675272      2.096439      2.008616 
##    waterfront          view     condition         grade    sqft_above 
##      1.209314      1.435016      1.245598      3.426006      6.943777 
##      yr_built  yr_renovated       zipcode           lat          long 
##      2.422221      1.144992      1.659538      1.182860      1.819474 
## sqft_living15    sqft_lot15 
##      2.973103      2.131906

4.6. Stepwise regression

## stepwise regression
full_m <- lm(price~., data = train) # 모든 변수를 이용한 full model
null_m <- lm(price~1, data = train) # 변수를 한 개도 이용하지 않은 null model

# stepwise 
step_m <- step(null_m, direction = "both", trace = 1, scope = list(lower = null_m, upper = full_m))
## Start:  AIC=444529
## price ~ 1
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living    1 1.1400e+15 1.1751e+15 432764
## + grade          1 1.0393e+15 1.2758e+15 434191
## + sqft_above     1 8.5295e+14 1.4622e+15 436557
## + sqft_living15  1 7.8899e+14 1.5261e+15 437300
## + bathrooms      1 6.5027e+14 1.6649e+15 438809
## + view           1 3.5877e+14 1.9564e+15 441609
## + lat            1 2.1916e+14 2.0960e+15 442805
## + bedrooms       1 2.1895e+14 2.0962e+15 442807
## + waterfront     1 1.8868e+14 2.1265e+15 443056
## + floors         1 1.5455e+14 2.1606e+15 443332
## + yr_renovated   1 3.4517e+13 2.2806e+15 444270
## + sqft_lot       1 1.8614e+13 2.2965e+15 444391
## + sqft_lot15     1 1.6979e+13 2.2982e+15 444403
## + zipcode        1 6.5539e+12 2.3086e+15 444482
## + yr_built       1 6.2839e+12 2.3089e+15 444484
## + condition      1 3.1984e+12 2.3119e+15 444507
## + long           1 1.6441e+12 2.3135e+15 444519
## <none>                        2.3151e+15 444529
## 
## Step:  AIC=432763.7
## price ~ sqft_living
## 
##                 Df  Sum of Sq        RSS    AIC
## + lat            1 1.6883e+14 1.0063e+15 430074
## + grade          1 1.0019e+14 1.0749e+15 431219
## + waterfront     1 9.6603e+13 1.0785e+15 431277
## + view           1 9.2294e+13 1.0828e+15 431346
## + yr_built       1 7.3147e+13 1.1020e+15 431650
## + long           1 5.0495e+13 1.1246e+15 432004
## + bedrooms       1 3.0373e+13 1.1447e+15 432311
## + zipcode        1 1.8067e+13 1.1570e+15 432497
## + yr_renovated   1 1.7055e+13 1.1580e+15 432512
## + sqft_living15  1 1.5109e+13 1.1600e+15 432541
## + condition      1 1.4711e+13 1.1604e+15 432547
## + sqft_lot15     1 5.1480e+12 1.1700e+15 432690
## + sqft_lot       1 2.4505e+12 1.1727e+15 432729
## + sqft_above     1 7.2854e+11 1.1744e+15 432755
## + floors         1 1.8474e+11 1.1749e+15 432763
## <none>                        1.1751e+15 432764
## + bathrooms      1 1.5555e+08 1.1751e+15 432766
## - sqft_living    1 1.1400e+15 2.3151e+15 444529
## 
## Step:  AIC=430074.1
## price ~ sqft_living + lat
## 
##                 Df  Sum of Sq        RSS    AIC
## + waterfront     1 1.0145e+14 9.0481e+14 428232
## + view           1 9.4518e+13 9.1175e+14 428364
## + grade          1 7.4793e+13 9.3148e+14 428736
## + yr_built       1 3.9793e+13 9.6647e+14 429376
## + long           1 2.6998e+13 9.7927e+14 429604
## + bedrooms       1 2.4121e+13 9.8215e+14 429655
## + condition      1 1.5656e+13 9.9061e+14 429804
## + yr_renovated   1 1.4265e+13 9.9200e+14 429828
## + sqft_living15  1 1.3927e+13 9.9234e+14 429834
## + sqft_lot15     1 1.1401e+12 1.0051e+15 430056
## + zipcode        1 3.0891e+11 1.0060e+15 430071
## + sqft_above     1 1.9851e+11 1.0061e+15 430073
## + sqft_lot       1 1.3763e+11 1.0061e+15 430074
## <none>                        1.0063e+15 430074
## + bathrooms      1 1.0545e+11 1.0062e+15 430074
## + floors         1 1.9182e+09 1.0063e+15 430076
## - lat            1 1.6883e+14 1.1751e+15 432764
## - sqft_living    1 1.0897e+15 2.0960e+15 442805
## 
## Step:  AIC=428232
## price ~ sqft_living + lat + waterfront
## 
##                 Df  Sum of Sq        RSS    AIC
## + grade          1 7.1313e+13 8.3350e+14 426809
## + view           1 3.9318e+13 8.6550e+14 427463
## + yr_built       1 3.1772e+13 8.7304e+14 427614
## + long           1 2.0343e+13 8.8447e+14 427839
## + bedrooms       1 1.6537e+13 8.8828e+14 427914
## + condition      1 1.3700e+13 8.9111e+14 427969
## + sqft_living15  1 1.3469e+13 8.9135e+14 427974
## + yr_renovated   1 8.6909e+12 8.9612e+14 428066
## + sqft_lot15     1 1.1803e+12 9.0363e+14 428211
## + sqft_above     1 8.2886e+11 9.0399e+14 428218
## + bathrooms      1 2.8435e+11 9.0453e+14 428229
## <none>                        9.0481e+14 428232
## + sqft_lot       1 7.4972e+10 9.0474e+14 428233
## + floors         1 3.9692e+10 9.0477e+14 428233
## + zipcode        1 1.9987e+10 9.0479e+14 428234
## - waterfront     1 1.0145e+14 1.0063e+15 430074
## - lat            1 1.7368e+14 1.0785e+15 431277
## - sqft_living    1 9.9750e+14 1.9023e+15 441125
## 
## Step:  AIC=426809.4
## price ~ sqft_living + lat + waterfront + grade
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_built       1 8.7257e+13 7.4624e+14 424892
## + view           1 3.3799e+13 7.9970e+14 426093
## + condition      1 2.5342e+13 8.0816e+14 426276
## + long           1 2.3965e+13 8.0954e+14 426305
## + yr_renovated   1 1.1125e+13 8.2238e+14 426578
## + bedrooms       1 7.6431e+12 8.2586e+14 426652
## + floors         1 6.3231e+12 8.2718e+14 426679
## + sqft_above     1 2.8307e+12 8.3067e+14 426752
## + bathrooms      1 1.7123e+12 8.3179e+14 426776
## + sqft_living15  1 9.9394e+11 8.3251e+14 426791
## + sqft_lot15     1 8.6420e+11 8.3264e+14 426793
## + zipcode        1 3.9330e+11 8.3311e+14 426803
## <none>                        8.3350e+14 426809
## + sqft_lot       1 1.7077e+10 8.3348e+14 426811
## - grade          1 7.1313e+13 9.0481e+14 428232
## - waterfront     1 9.7973e+13 9.3148e+14 428736
## - lat            1 1.4833e+14 9.8183e+14 429650
## - sqft_living    1 1.9875e+14 1.0322e+15 430518
## 
## Step:  AIC=424892.5
## price ~ sqft_living + lat + waterfront + grade + yr_built
## 
##                 Df  Sum of Sq        RSS    AIC
## + view           1 1.7965e+13 7.2828e+14 424472
## + bedrooms       1 7.2505e+12 7.3899e+14 424725
## + bathrooms      1 6.2833e+12 7.3996e+14 424748
## + condition      1 3.9958e+12 7.4225e+14 424801
## + zipcode        1 3.3173e+12 7.4293e+14 424817
## + long           1 2.9919e+12 7.4325e+14 424825
## + sqft_living15  1 1.6976e+12 7.4455e+14 424855
## + yr_renovated   1 1.1185e+12 7.4513e+14 424868
## + floors         1 9.3267e+11 7.4531e+14 424873
## + sqft_lot15     1 7.7647e+11 7.4547e+14 424876
## + sqft_above     1 1.3670e+11 7.4611e+14 424891
## <none>                        7.4624e+14 424892
## + sqft_lot       1 6.2536e+10 7.4618e+14 424893
## - waterfront     1 8.3015e+13 8.2926e+14 426721
## - yr_built       1 8.7257e+13 8.3350e+14 426809
## - lat            1 9.3296e+13 8.3954e+14 426935
## - grade          1 1.2680e+14 8.7304e+14 427614
## - sqft_living    1 1.8527e+14 9.3152e+14 428739
## 
## Step:  AIC=424471.6
## price ~ sqft_living + lat + waterfront + grade + yr_built + view
## 
##                 Df  Sum of Sq        RSS    AIC
## + bathrooms      1 5.9131e+12 7.2237e+14 424332
## + bedrooms       1 5.6159e+12 7.2266e+14 424339
## + zipcode        1 5.3127e+12 7.2297e+14 424347
## + condition      1 3.9586e+12 7.2432e+14 424379
## + long           1 1.8724e+12 7.2641e+14 424429
## + floors         1 1.3566e+12 7.2692e+14 424441
## + sqft_above     1 1.1961e+12 7.2708e+14 424445
## + sqft_lot15     1 9.4446e+11 7.2733e+14 424451
## + yr_renovated   1 9.4257e+11 7.2734e+14 424451
## + sqft_living15  1 7.5400e+11 7.2753e+14 424456
## + sqft_lot       1 1.2014e+11 7.2816e+14 424471
## <none>                        7.2828e+14 424472
## - view           1 1.7965e+13 7.4624e+14 424892
## - waterfront     1 4.5914e+13 7.7419e+14 425531
## - yr_built       1 7.1423e+13 7.9970e+14 426093
## - lat            1 9.7138e+13 8.2542e+14 426642
## - grade          1 1.1430e+14 8.4258e+14 426999
## - sqft_living    1 1.6776e+14 8.9604e+14 428067
## 
## Step:  AIC=424332.1
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + bedrooms       1 8.6539e+12 7.1371e+14 424125
## + zipcode        1 5.7487e+12 7.1662e+14 424195
## + condition      1 3.5035e+12 7.1886e+14 424250
## + sqft_above     1 1.6975e+12 7.2067e+14 424293
## + long           1 1.3033e+12 7.2106e+14 424303
## + sqft_living15  1 1.2587e+12 7.2111e+14 424304
## + sqft_lot15     1 5.8283e+11 7.2178e+14 424320
## + yr_renovated   1 3.9939e+11 7.2197e+14 424325
## + floors         1 3.8016e+11 7.2199e+14 424325
## <none>                        7.2237e+14 424332
## + sqft_lot       1 3.6627e+10 7.2233e+14 424333
## - bathrooms      1 5.9131e+12 7.2828e+14 424472
## - view           1 1.7595e+13 7.3996e+14 424748
## - waterfront     1 4.5951e+13 7.6832e+14 425400
## - yr_built       1 7.6320e+13 7.9869e+14 426073
## - sqft_living    1 8.9911e+13 8.1228e+14 426366
## - lat            1 9.4657e+13 8.1702e+14 426467
## - grade          1 1.1036e+14 8.3273e+14 426797
## 
## Step:  AIC=424125
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms
## 
##                 Df  Sum of Sq        RSS    AIC
## + zipcode        1 6.3874e+12 7.0733e+14 423971
## + condition      1 4.1455e+12 7.0957e+14 424026
## + long           1 1.4050e+12 7.1231e+14 424093
## + sqft_above     1 1.3899e+12 7.1232e+14 424093
## + sqft_living15  1 1.2254e+12 7.1249e+14 424097
## + sqft_lot15     1 1.0586e+12 7.1265e+14 424101
## + yr_renovated   1 2.5683e+11 7.1346e+14 424121
## + floors         1 2.2614e+11 7.1349e+14 424121
## + sqft_lot       1 1.8696e+11 7.1353e+14 424122
## <none>                        7.1371e+14 424125
## - bedrooms       1 8.6539e+12 7.2237e+14 424332
## - bathrooms      1 8.9511e+12 7.2266e+14 424339
## - view           1 1.5483e+13 7.2920e+14 424495
## - waterfront     1 4.3991e+13 7.5770e+14 425161
## - yr_built       1 8.0283e+13 7.9400e+14 425973
## - lat            1 9.2024e+13 8.0574e+14 426227
## - sqft_living    1 9.8068e+13 8.1178e+14 426357
## - grade          1 9.8216e+13 8.1193e+14 426360
## 
## Step:  AIC=423971
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode
## 
##                 Df  Sum of Sq        RSS    AIC
## + long           1 7.2612e+12 7.0006e+14 423794
## + condition      1 3.0564e+12 7.0427e+14 423898
## + sqft_lot15     1 1.6597e+12 7.0567e+14 423932
## + sqft_above     1 1.0481e+12 7.0628e+14 423947
## + floors         1 7.1414e+11 7.0661e+14 423955
## + sqft_lot       1 4.3094e+11 7.0689e+14 423962
## + sqft_living15  1 3.6891e+11 7.0696e+14 423964
## + yr_renovated   1 2.3815e+11 7.0709e+14 423967
## <none>                        7.0733e+14 423971
## - zipcode        1 6.3874e+12 7.1371e+14 424125
## - bedrooms       1 9.2927e+12 7.1662e+14 424195
## - bathrooms      1 9.6356e+12 7.1696e+14 424204
## - view           1 1.7483e+13 7.2481e+14 424393
## - waterfront     1 4.4114e+13 7.5144e+14 425019
## - yr_built       1 8.6544e+13 7.9387e+14 425972
## - sqft_living    1 9.2694e+13 8.0002e+14 426106
## - grade          1 9.7500e+13 8.0483e+14 426210
## - lat            1 9.8402e+13 8.0573e+14 426229
## 
## Step:  AIC=423793.9
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long
## 
##                 Df  Sum of Sq        RSS    AIC
## + condition      1 2.8359e+12 6.9723e+14 423725
## + sqft_above     1 2.3317e+12 6.9773e+14 423738
## + sqft_living15  1 1.2588e+12 6.9881e+14 423765
## + sqft_lot15     1 5.9123e+11 6.9947e+14 423781
## + floors         1 5.8764e+11 6.9948e+14 423781
## + yr_renovated   1 3.4459e+11 6.9972e+14 423787
## <none>                        7.0006e+14 423794
## + sqft_lot       1 2.6057e+10 7.0004e+14 423795
## - long           1 7.2612e+12 7.0733e+14 423971
## - bathrooms      1 8.3200e+12 7.0838e+14 423997
## - bedrooms       1 9.9443e+12 7.1001e+14 424037
## - zipcode        1 1.2244e+13 7.1231e+14 424093
## - view           1 1.6552e+13 7.1662e+14 424197
## - waterfront     1 4.3830e+13 7.4389e+14 424846
## - yr_built       1 6.8163e+13 7.6823e+14 425404
## - grade          1 9.1778e+13 7.9184e+14 425930
## - sqft_living    1 9.8857e+13 7.9892e+14 426084
## - lat            1 1.0060e+14 8.0067e+14 426122
## 
## Step:  AIC=423725.5
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_above     1 3.1062e+12 6.9412e+14 423650
## + sqft_living15  1 1.3623e+12 6.9587e+14 423694
## + floors         1 9.4107e+11 6.9629e+14 423704
## + yr_renovated   1 7.9098e+11 6.9644e+14 423708
## + sqft_lot15     1 6.1008e+11 6.9662e+14 423712
## <none>                        6.9723e+14 423725
## + sqft_lot       1 2.2198e+10 6.9721e+14 423727
## - condition      1 2.8359e+12 7.0006e+14 423794
## - long           1 7.0407e+12 7.0427e+14 423898
## - bathrooms      1 7.9162e+12 7.0514e+14 423919
## - bedrooms       1 1.0447e+13 7.0768e+14 423982
## - zipcode        1 1.0758e+13 7.0799e+14 423989
## - view           1 1.6306e+13 7.1353e+14 424125
## - waterfront     1 4.3750e+13 7.4098e+14 424780
## - yr_built       1 5.2863e+13 7.5009e+14 424992
## - grade          1 9.2371e+13 7.8960e+14 425882
## - sqft_living    1 9.9014e+13 7.9624e+14 426028
## - lat            1 1.0188e+14 7.9911e+14 426090
## 
## Step:  AIC=423650
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition + sqft_above
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_living15  1 9.8608e+11 6.9314e+14 423627
## + yr_renovated   1 7.7984e+11 6.9334e+14 423632
## + sqft_lot15     1 6.9467e+11 6.9343e+14 423635
## <none>                        6.9412e+14 423650
## + floors         1 5.7106e+10 6.9406e+14 423651
## + sqft_lot       1 4.2330e+10 6.9408e+14 423651
## - sqft_above     1 3.1062e+12 6.9723e+14 423725
## - condition      1 3.6104e+12 6.9773e+14 423738
## - bathrooms      1 8.4262e+12 7.0255e+14 423857
## - long           1 8.5404e+12 7.0266e+14 423860
## - bedrooms       1 1.0069e+13 7.0419e+14 423898
## - zipcode        1 1.0919e+13 7.0504e+14 423919
## - view           1 1.8186e+13 7.1231e+14 424097
## - sqft_living    1 4.0452e+13 7.3457e+14 424631
## - waterfront     1 4.2991e+13 7.3711e+14 424691
## - yr_built       1 5.4709e+13 7.4883e+14 424965
## - grade          1 7.8936e+13 7.7306e+14 425517
## - lat            1 1.0430e+14 7.9842e+14 426077
## 
## Step:  AIC=423627.3
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition + sqft_above + 
##     sqft_living15
## 
##                 Df  Sum of Sq        RSS    AIC
## + yr_renovated   1 8.8464e+11 6.9225e+14 423607
## + sqft_lot15     1 7.0961e+11 6.9243e+14 423612
## + floors         1 1.2542e+11 6.9301e+14 423626
## <none>                        6.9314e+14 423627
## + sqft_lot       1 2.8026e+10 6.9311e+14 423629
## - sqft_living15  1 9.8608e+11 6.9412e+14 423650
## - sqft_above     1 2.7301e+12 6.9587e+14 423694
## - condition      1 3.6595e+12 6.9680e+14 423717
## - bathrooms      1 8.8013e+12 7.0194e+14 423844
## - long           1 9.2767e+12 7.0241e+14 423856
## - bedrooms       1 1.0054e+13 7.0319e+14 423875
## - zipcode        1 1.0155e+13 7.0329e+14 423878
## - view           1 1.6505e+13 7.0964e+14 424034
## - sqft_living    1 3.5217e+13 7.2835e+14 424485
## - waterfront     1 4.3455e+13 7.3659e+14 424681
## - yr_built       1 5.4210e+13 7.4735e+14 424932
## - grade          1 6.8291e+13 7.6143e+14 425256
## - lat            1 1.0318e+14 7.9632e+14 426033
## 
## Step:  AIC=423607.2
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition + sqft_above + 
##     sqft_living15 + yr_renovated
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_lot15     1 7.1349e+11 6.9154e+14 423591
## + floors         1 9.4742e+10 6.9216e+14 423607
## <none>                        6.9225e+14 423607
## + sqft_lot       1 2.5557e+10 6.9223e+14 423609
## - yr_renovated   1 8.8464e+11 6.9314e+14 423627
## - sqft_living15  1 1.0909e+12 6.9334e+14 423632
## - sqft_above     1 2.7010e+12 6.9495e+14 423673
## - condition      1 4.1841e+12 6.9644e+14 423710
## - bathrooms      1 7.7556e+12 7.0001e+14 423799
## - long           1 9.4958e+12 7.0175e+14 423842
## - bedrooms       1 9.8431e+12 7.0209e+14 423850
## - zipcode        1 1.0062e+13 7.0231e+14 423856
## - view           1 1.6272e+13 7.0852e+14 424008
## - sqft_living    1 3.5323e+13 7.2757e+14 424469
## - waterfront     1 4.2707e+13 7.3496e+14 424644
## - yr_built       1 4.4893e+13 7.3714e+14 424696
## - grade          1 6.7501e+13 7.5975e+14 425220
## - lat            1 1.0375e+14 7.9600e+14 426029
## 
## Step:  AIC=423591.3
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition + sqft_above + 
##     sqft_living15 + yr_renovated + sqft_lot15
## 
##                 Df  Sum of Sq        RSS    AIC
## + sqft_lot       1 3.3910e+11 6.9120e+14 423585
## <none>                        6.9154e+14 423591
## + floors         1 6.2604e+10 6.9147e+14 423592
## - sqft_lot15     1 7.1349e+11 6.9225e+14 423607
## - yr_renovated   1 8.8852e+11 6.9243e+14 423612
## - sqft_living15  1 1.1068e+12 6.9264e+14 423617
## - sqft_above     1 2.7783e+12 6.9432e+14 423659
## - condition      1 4.2230e+12 6.9576e+14 423695
## - bathrooms      1 7.5264e+12 6.9906e+14 423777
## - long           1 8.2357e+12 6.9977e+14 423795
## - zipcode        1 1.0028e+13 7.0157e+14 423839
## - bedrooms       1 1.0231e+13 7.0177e+14 423844
## - view           1 1.6503e+13 7.0804e+14 423999
## - sqft_living    1 3.5918e+13 7.2746e+14 424468
## - waterfront     1 4.2603e+13 7.3414e+14 424627
## - yr_built       1 4.5265e+13 7.3680e+14 424690
## - grade          1 6.7173e+13 7.5871e+14 425198
## - lat            1 1.0217e+14 7.9371e+14 425981
## 
## Step:  AIC=423584.8
## price ~ sqft_living + lat + waterfront + grade + yr_built + view + 
##     bathrooms + bedrooms + zipcode + long + condition + sqft_above + 
##     sqft_living15 + yr_renovated + sqft_lot15 + sqft_lot
## 
##                 Df  Sum of Sq        RSS    AIC
## <none>                        6.9120e+14 423585
## + floors         1 6.9940e+10 6.9113e+14 423585
## - sqft_lot       1 3.3910e+11 6.9154e+14 423591
## - yr_renovated   1 9.0312e+11 6.9210e+14 423605
## - sqft_lot15     1 1.0270e+12 6.9223e+14 423609
## - sqft_living15  1 1.1803e+12 6.9238e+14 423612
## - sqft_above     1 2.7331e+12 6.9393e+14 423651
## - condition      1 4.2640e+12 6.9546e+14 423689
## - bathrooms      1 7.5066e+12 6.9870e+14 423770
## - long           1 8.4330e+12 6.9963e+14 423793
## - zipcode        1 1.0027e+13 7.0123e+14 423833
## - bedrooms       1 1.0129e+13 7.0133e+14 423835
## - view           1 1.6365e+13 7.0756e+14 423989
## - sqft_living    1 3.5693e+13 7.2689e+14 424456
## - waterfront     1 4.2756e+13 7.3395e+14 424624
## - yr_built       1 4.4760e+13 7.3596e+14 424672
## - grade          1 6.6994e+13 7.5819e+14 425188
## - lat            1 1.0246e+14 7.9366e+14 425981
anova(step_m)
## Analysis of Variance Table
## 
## Response: price
##                  Df     Sum Sq    Mean Sq   F value    Pr(>F)    
## sqft_living       1 1.1400e+15 1.1400e+15 28593.241 < 2.2e-16 ***
## lat               1 1.6883e+14 1.6883e+14  4234.547 < 2.2e-16 ***
## waterfront        1 1.0145e+14 1.0145e+14  2544.556 < 2.2e-16 ***
## grade             1 7.1313e+13 7.1313e+13  1788.596 < 2.2e-16 ***
## yr_built          1 8.7257e+13 8.7257e+13  2188.503 < 2.2e-16 ***
## view              1 1.7965e+13 1.7965e+13   450.588 < 2.2e-16 ***
## bathrooms         1 5.9131e+12 5.9131e+12   148.306 < 2.2e-16 ***
## bedrooms          1 8.6539e+12 8.6539e+12   217.050 < 2.2e-16 ***
## zipcode           1 6.3874e+12 6.3874e+12   160.204 < 2.2e-16 ***
## long              1 7.2612e+12 7.2612e+12   182.119 < 2.2e-16 ***
## condition         1 2.8359e+12 2.8359e+12    71.127 < 2.2e-16 ***
## sqft_above        1 3.1062e+12 3.1062e+12    77.908 < 2.2e-16 ***
## sqft_living15     1 9.8608e+11 9.8608e+11    24.732 6.651e-07 ***
## yr_renovated      1 8.8464e+11 8.8464e+11    22.188 2.492e-06 ***
## sqft_lot15        1 7.1349e+11 7.1349e+11    17.895 2.346e-05 ***
## sqft_lot          1 3.3910e+11 3.3910e+11     8.505  0.003546 ** 
## Residuals     17336 6.9120e+14 3.9871e+10                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(step_m)
## 
## Call:
## lm(formula = price ~ sqft_living + lat + waterfront + grade + 
##     yr_built + view + bathrooms + bedrooms + zipcode + long + 
##     condition + sqft_above + sqft_living15 + yr_renovated + sqft_lot15 + 
##     sqft_lot, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1323062   -98494   -10496    75884  4384765 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.737e+06  3.197e+06   2.107  0.03512 *  
## sqft_living    1.384e+02  4.625e+00  29.920  < 2e-16 ***
## lat            6.010e+05  1.186e+04  50.694  < 2e-16 ***
## waterfront     6.355e+05  1.941e+04  32.747  < 2e-16 ***
## grade          9.773e+04  2.384e+03  40.991  < 2e-16 ***
## yr_built      -2.632e+03  7.854e+01 -33.506  < 2e-16 ***
## view           4.843e+04  2.390e+03  20.260  < 2e-16 ***
## bathrooms      4.766e+04  3.474e+03  13.721  < 2e-16 ***
## bedrooms      -3.293e+04  2.066e+03 -15.939  < 2e-16 ***
## zipcode       -5.766e+02  3.636e+01 -15.859  < 2e-16 ***
## long          -2.104e+05  1.447e+04 -14.543  < 2e-16 ***
## condition      2.688e+04  2.599e+03  10.341  < 2e-16 ***
## sqft_above     3.573e+01  4.315e+00   8.279  < 2e-16 ***
## sqft_living15  2.068e+01  3.800e+00   5.441 5.37e-08 ***
## yr_renovated   1.922e+01  4.038e+00   4.759 1.96e-06 ***
## sqft_lot15    -3.996e-01  7.873e-02  -5.075 3.91e-07 ***
## sqft_lot       1.497e-01  5.135e-02   2.916  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 199700 on 17336 degrees of freedom
## Multiple R-squared:  0.7014, Adjusted R-squared:  0.7012 
## F-statistic:  2546 on 16 and 17336 DF,  p-value: < 2.2e-16
## Stepwise regression prediction
pred_step <- predict(step_m, test)

# 결과 plotting
plot(train$price, multi$fitted.values, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price")
points(test$price, pred_step, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

## Stepwise regression performance evaluation
MSE <- function(y, y_hat){ return(mean((y - y_hat)^2)) }
MAPE <- function(y, y_hat){ return(mean(abs((y - y_hat) / y)) * 100) }

# MSE
MSE_tr2 <- mean((train$price - multi$fitted.values)^2)
MSE_tr2 # training MSE
## [1] 39827604934
MSE_te2 <- mean((test$price - pred_step)^2)
MSE_te2 # test MSE
## [1] 43297626604
# MAPE
MAPE_tr2 <- mean(abs((train$price - multi$fitted.values)/train$price))*100
MAPE_tr2 # training MAPE
## [1] 25.33839
MAPE_te2 <- mean(abs((test$price - pred_step)/test$price))*100
MAPE_te2 # test MAPE
## [1] 25.91058
## 변수의 중요도 파악
imp <- calc.relimp(step_m, rela = T)
plot(imp)

imp_sort <- sort(imp@lmg, decreasing = T) # 변수의 중요도를 정렬해서 다시 변수에 대입하기
barplot(imp_sort, ylim = c(0, 0.2), ylab = "% of R2")

4.7. Shrinkage method

4.7.1 Ridge regression

### Ridge
dim(train)
## [1] 17353    18
cvfit_ridge <- cv.glmnet(as.matrix(train[, -1]), train$price, alpha = 0) # ridge regression
plot(cvfit_ridge)

lambda_ridge <- cvfit_ridge$lambda.min # cross validation error를 최소로 만드는 람다값

coef_ridge <- predict(cvfit_ridge, s = lambda_ridge, type = "coefficients") # ridge coefficients
coef_ridge
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                           1
## (Intercept)   -5.873356e+06
## bedrooms      -2.314040e+04
## bathrooms      4.959574e+04
## sqft_living    1.088586e+02
## sqft_lot       1.327548e-01
## floors         8.236528e+02
## waterfront     6.000204e+05
## view           5.152237e+04
## condition      3.018267e+04
## grade          8.636864e+04
## sqft_above     5.176549e+01
## yr_built      -2.264691e+03
## yr_renovated   2.562505e+01
## zipcode       -4.349845e+02
## lat            5.723959e+05
## long          -2.054832e+05
## sqft_living15  3.795842e+01
## sqft_lot15    -3.119631e-01

4.7.2. LASSO regression

# LASSO regression
cvfit_lasso <- cv.glmnet(as.matrix(train[, -1]), train$price, alpha = 1) # LASSO regression
plot(cvfit_lasso)

lambda_lasso <- cvfit_lasso$lambda.min # cross validation error를 최소로 만드는 람다값

coef_lasso <- predict(cvfit_lasso, s = lambda_lasso, type = "coefficients") # ridge coefficients
coef_lasso
## 18 x 1 sparse Matrix of class "dgCMatrix"
##                           1
## (Intercept)    5.985070e+06
## bedrooms      -3.149896e+04
## bathrooms      4.526357e+04
## sqft_living    1.404097e+02
## sqft_lot       1.105991e-01
## floors         4.253484e+03
## waterfront     6.320584e+05
## view           4.826378e+04
## condition      2.646562e+04
## grade          9.775651e+04
## sqft_above     3.216392e+01
## yr_built      -2.622562e+03
## yr_renovated   1.839798e+01
## zipcode       -5.555733e+02
## lat            5.961406e+05
## long          -2.014590e+05
## sqft_living15  2.056046e+01
## sqft_lot15    -3.341489e-01

4.7.3. Ridge / LASSO regression performance evaluation

4.7.3.1. Ridge

# Ridge / LASSO regression performance evaluation

### Ridge
pred_ridge_tr <- predict(cvfit_ridge, as.matrix(train[, -1]), s = lambda_ridge)
pred_ridge <- predict(cvfit_ridge, as.matrix(test[, -1]), s = lambda_ridge)

# 결과 plotting
plot(train$price, pred_ridge_tr, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price", main = "Ridge")
points(test$price, pred_ridge, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

### Performance evaluation (Ridge)
MSE <- function(y, y_hat){ return(mean((y - y_hat)^2)) }
MAPE <- function(y, y_hat){ return(mean(abs((y - y_hat) / y)) * 100) }

# MSE
MSE_tr3 <- mean((train$price - pred_ridge_tr)^2)
MSE_tr3 # training MSE
## [1] 40182554841
MSE_te3 <- mean((test$price - pred_ridge)^2)
MSE_te3 # test MSE
## [1] 43957704007
# MAPE
MAPE_tr3 <- mean(abs((train$price - pred_ridge_tr)/train$price))*100
MAPE_tr3 # training MAPE
## [1] 24.63661
MAPE_te3 <- mean(abs((test$price - pred_ridge)/test$price))*100
MAPE_te3 # test MAPE
## [1] 25.19801

4.7.3.2. LASSO

### LASSO
pred_lasso_tr <- predict(cvfit_lasso, as.matrix(train[, -1]), s = lambda_lasso)
pred_lasso <- predict(cvfit_lasso, as.matrix(test[, -1]), s = lambda_lasso)

# 결과 plotting
plot(train$price, pred_lasso_tr, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price", main = "LASSO")
points(test$price, pred_lasso, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

### Performance evaluation (LASSO)
MSE <- function(y, y_hat){ return(mean((y - y_hat)^2)) }
MAPE <- function(y, y_hat){ return(mean(abs((y - y_hat) / y)) * 100) }

# MSE
MSE_tr4 <- mean((train$price - pred_lasso_tr)^2)
MSE_tr4 # training MSE
## [1] 39833116443
MSE_te4 <- mean((test$price - pred_lasso)^2)
MSE_te4 # test MSE
## [1] 43300523173
# MAPE
MAPE_tr4 <- mean(abs((train$price - pred_lasso_tr)/train$price))*100
MAPE_tr4 # training MAPE
## [1] 25.24949
MAPE_te4 <- mean(abs((test$price - pred_lasso)/test$price))*100
MAPE_te4 # test MAPE
## [1] 25.79931
# Experimental result 
result <- cbind(c(MSE_tr1, MSE_tr2, MSE_tr3, MSE_tr4), c(MSE_te1, MSE_te2, MSE_te3, MSE_te4), c(MAPE_tr1, MAPE_tr2, MAPE_tr3, MAPE_tr4), c(MAPE_te1, MAPE_te2, MAPE_te3, MAPE_te4))
result <- data.frame(result)
rownames(result) <- c("Simple", "Stepwise", "Ridge", "LASSO")
colnames(result) <- c("MSE_tr", "MSE_te", "MAPE_tr", "MAPE_te")
round(result, 2)
##               MSE_tr      MSE_te MAPE_tr MAPE_te
## Simple   67717512084 70955856318   35.87   36.22
## Stepwise 39827604934 43297626604   25.34   25.91
## Ridge    40182554841 43957704007   24.64   25.20
## LASSO    39833116443 43300523173   25.25   25.80

5. Machine Learning

1. k-NN

Setting - load dataset

# KNN regression

# data reading
house <- read.csv("./input/house.csv", na.string = c("", " "))
house <- house[, -c(1, 2)] # 불필요한 변수 제거

- training / validation / test

# training / test
set.seed(1004)
flag <- sample(c("tr", "te"), size = nrow(house), c(8, 2), replace = T) # 8:2로 training / test set
train <- house[which(flag == "tr"), ]
test <- house[which(flag == "te"), ]
## training set에서 30%를 validation set으로 할당
valid_idx <- sample(1:nrow(train), size = nrow(train)*0.3)

train_knn <- train[-valid_idx, ]
valid_knn <- train[valid_idx, ]

- Standardization

# Standardization 
pp_model <- preProcess(train_knn[, -1], method = c("center", "scale"))
train <- predict(pp_model, train_knn)
valid <- predict(pp_model, valid_knn)
test <- predict(pp_model, test)
# training data set
tr_x <- train[, -1]
va_x <- valid[, -1]
te_x <- test[, -1]

# test data set
tr_y <- train$price
va_y <- valid$price
te_y <- test$price

- find optimal k

# knn을 위한 optimal k 찾기

MSE_k <- NULL
for(i in 1:30){
  m_knn <- knnreg(tr_x, tr_y, k = i)
  MSE_va <- mean((predict(m_knn, va_x) - va_y)^2)
  MSE_k <- c(MSE_k, MSE_va)
}

which.min(MSE_k)
## [1] 6
plot(MSE_k, type = "l", ylab = "MSE", xlab = "k")
abline(v = which.min(MSE_k), lty = 2, col = "red")
abline(h = min(MSE_k), lty = 2, col = "red")
text(which.min(MSE_k), min(MSE_k), labels = which.min(MSE_k), pos = 3, col = "red")

- final knn regression model

# knn regression
# final knn regression model
m_knn <- knnreg(tr_x, tr_y, k = which.min(MSE_k))
pred_knn_tr <- predict(m_knn, tr_x)
pred_knn_te <- predict(m_knn, te_x)
pred_knn <- pred_knn_te

- knn regression performance evaluation

# knn regression performance evaluation
# Training set prediction
MSE_tr_knn <- mean((train$price - pred_knn_tr)^2)
MSE_tr_knn # training MSE
## [1] 20373311172
# Test set prediction
MSE_te_knn <- mean((test$price - pred_knn_te)^2)
MSE_te_knn # test MSE
## [1] 32216110085
# 결과 plotting
plot(train$price, pred_knn_tr, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price", main = "KNN")
points(test$price, pred_knn_te, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

2. Regression tree

2.1. full tree

# Regression tree
set.seed(111)

# full tree
dt <- rpart(price~., data = train, cp = 0.1^20) # 모든 변수 사용하여 full tree 생성
# printcp(dt) # cptable 출력
plotcp(dt) # cpplot 출력

2.2. pruning tree

# pruning tree
dt_prune <- prune(dt, cp = dt$cptable[which.min(dt$cptable[, "xerror"]), "CP"])

plot(dt_prune, margin = 0.1) # tree plotting
text(dt_prune, use.n = T)

2.3. variable importance

# variable importance
dt_prune$variable.importance
##         grade   sqft_living    sqft_above sqft_living15           lat 
##  6.209976e+14  5.630485e+14  4.293959e+14  3.347341e+14  2.550235e+14 
##     bathrooms          long       zipcode sqft_basement      sqft_lot 
##  1.997829e+14  1.136858e+14  9.733735e+13  6.646620e+13  6.058923e+13 
##      yr_built    sqft_lot15          view    waterfront      bedrooms 
##  6.018883e+13  5.843219e+13  4.883048e+13  3.196875e+13  2.890691e+13 
##     condition  yr_renovated        floors 
##  1.278551e+13  7.374427e+12  5.573016e+12
barplot(dt_prune$variable.importance)

2.4. Regression tree performance evaluation

# Regression tree performance evaluation

# Training / test set prediction
pred_tree_tr <- predict(dt_prune, train)
pred_tree_te <- predict(dt_prune, test)
pred_tree <- pred_tree_te
pred_tree <- pred_tree_te

# MSE calculation
MSE_tr_dt <- mean((train$price - pred_tree_tr)^2)
MSE_tr_dt # training MSE
## [1] 16468776577
MSE_te_dt <- mean((test$price - pred_tree_te)^2)
MSE_te_dt # test MSE
## [1] 28764794894
# 결과 plotting
plot(train$price, pred_tree_tr, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price", main = "Regression Tree")
points(test$price, pred_tree_te, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

3. Bagged tree

# Bagged tree
set.seed(1004)
bagg_lm <- ipred::bagging(price~., data = train, coob = T, nbag = 80)
pred_bg <- predict(bagg_lm, test)

4. Random Forest

4.1. Learning RF

# Random forest
set.seed(1004)
rf_lm <- randomForest(price~., data = train, ntree = 100, mtry = 12, importance = T, na.action = na.omit) # rf model
rf_lm
## 
## Call:
##  randomForest(formula = price ~ ., data = train, ntree = 100,      mtry = 12, importance = T, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 12
## 
##           Mean of squared residuals: 18920283974
##                     % Var explained: 85.6

4.2. optimal mtry 탐색

# set.seed(1004)
# tune.rf <- tuneRF(train[, -1], train$price)

4.3. tree의 개수 vs MSE

rf_lm$mse # error율 확인
##   [1] 39760621252 34371370888 34854873385 32456170618 29974234760
##   [6] 30103847052 27885683055 26351213103 25326743133 24438536742
##  [11] 24170695131 24099796694 22972728597 22248644741 22162362112
##  [16] 21858449739 21637907187 21220677354 20869837800 20406327810
##  [21] 20609664420 20635714746 20890667235 20869034130 20986497420
##  [26] 21045078671 20475414387 20465659938 20142239087 20116959955
##  [31] 20268905359 20208598488 20145393468 20261368596 20211229449
##  [36] 20190865714 20227170157 20137276273 20315641991 20258704516
##  [41] 20308804883 20213553524 20156987988 20080139791 20032037324
##  [46] 19995213869 20002168938 19801999066 19734918457 19788455233
##  [51] 19699623189 19758297750 19696687621 19631038599 19507428073
##  [56] 19511004714 19452264251 19525858066 19494417080 19521198316
##  [61] 19505888819 19425244584 19376349258 19398736324 19394764554
##  [66] 19389465983 19384272094 19345096494 19244735284 19174028652
##  [71] 19196699746 19230350223 19215158927 19155519717 19184388051
##  [76] 19211271541 19033283486 18997543695 19013816349 18977198065
##  [81] 18972412090 18917409513 18909972081 18985673416 18989043078
##  [86] 18968666890 19026795163 18994969977 19010175179 19014183488
##  [91] 19039942632 18975643366 18964871397 18912808644 18918671557
##  [96] 18900794257 18880455647 18913404163 18951921332 18920283974
plot(rf_lm)

4.4. variable importance

# variable importance
round(rf_lm$importance, 3)
##                   %IncMSE IncNodePurity
## bedrooms        793511912  6.726242e+12
## bathrooms      1060025457  4.205045e+13
## sqft_living   35537019825  3.980277e+14
## sqft_lot       2810479709  2.559008e+13
## floors          650567538  3.641708e+12
## waterfront     2442025896  3.097658e+13
## view           2464001950  3.199819e+13
## condition       538713369  5.660933e+12
## grade         37105536660  4.642863e+14
## sqft_above     5490322366  5.343239e+13
## sqft_basement   412006183  1.220180e+13
## yr_built      16287869089  5.258209e+13
## yr_renovated    190156694  5.166900e+12
## zipcode        5835564115  2.886005e+13
## lat           47843167872  2.397312e+14
## long          26761099768  8.743448e+13
## sqft_living15  7063640462  6.022626e+13
## sqft_lot15     1915974550  2.292937e+13
varImpPlot(rf_lm)

4.5. random forest prediction results

# random forest prediction results

# training set
pred_tr_rf_lm <- predict(rf_lm, train)
MSE_tr_rf <- mean((rf_lm$predicted - train$price)^2)
MSE_tr_rf
## [1] 18920283974
# test set
pred_te_rf_lm <- predict(rf_lm, test)
pred_rf <- pred_te_rf_lm
MSE_te_rf <- mean((pred_te_rf_lm-test$price)^2)
MSE_te_rf
## [1] 19171588484
# 결과 plotting
#png("plot_2.png", width = 400, height = 400)
plot(train$price, pred_tr_rf_lm, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price")
points(test$price, pred_te_rf_lm, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

#dev.off()

5. Gradient boosting

5.1. Learning GBM

# Gradient boosting
set.seed(1004)
gbm_st <- gbm(price~., data = train, distribution = "gaussian", n.trees = 1000, shrinkage = 0.05, interaction.depth = 1, cv.folds = 10, bag.fraction = 0.3)

5.2. optimal n.trees

# optimal n.trees
tune.gbm <- gbm.perf(gbm_st, method = "cv")

tune.gbm
## [1] 997

5.3. variable importance

# variable importance
summary(gbm_st, n.trees = tune.gbm)

##                         var     rel.inf
## sqft_living     sqft_living 36.18343510
## grade                 grade 21.41642517
## lat                     lat 12.67181837
## bathrooms         bathrooms  6.74694338
## sqft_above       sqft_above  6.52427469
## sqft_basement sqft_basement  4.25449336
## view                   view  4.10646277
## sqft_living15 sqft_living15  2.74509329
## waterfront       waterfront  2.42166581
## yr_renovated   yr_renovated  0.76345257
## long                   long  0.61317815
## condition         condition  0.57715176
## zipcode             zipcode  0.42326748
## yr_built           yr_built  0.36784008
## bedrooms           bedrooms  0.13404888
## sqft_lot           sqft_lot  0.03617958
## sqft_lot15       sqft_lot15  0.01426954
## floors               floors  0.00000000

5.4. gradient boosting prediction results

# gradient boosting prediction results
# training set
pred_tr_bs <- predict(gbm_st, newdata = train, n.trees = tune.gbm)
MSE_tr_bs <- mean((pred_tr_bs - train$price)^2)
MSE_tr_bs
## [1] 26564778553
# test set
pred_te_bs <- predict(gbm_st, newdata = test, n.trees = tune.gbm)
MSE_te_bs <- mean((pred_te_bs-test$price)^2)
pred_bs <- pred_te_bs
MSE_te_bs
## [1] 31690349408
# 결과 plotting
#png("plot_3.png", width = 400, height = 400)
plot(train$price, pred_tr_bs, cex = 0.5, pch = 19, xlim = c(5000, 4000000), ylim = c(5000, 4000000), xlab = "Actual price", ylab = "Predicted price")
points(test$price, pred_te_bs, pch = 2, cex = 0.5, col = "red")
lines(train$price, train$price, lty = 2, col = "blue")
legend("topright", legend = c("training", "test", "Actual price"), pch = c(19, 2, NA), col = c("black", "red", "blue"), lty = c(NA, NA, 1))

#dev.off()

6. Performance evaluation

6.1. final results of regression

# final results of regression
perf_reg <- function(act, pred){
  MAE <- mean(abs(act - pred))
  MAPE <- mean(abs(act-pred)/abs(act)) * 100
  MSE <- mean((act-pred)^2)
  RMSE <- sqrt(MSE)
  pf <- c(MAE, MAPE, MSE, RMSE)
  return(pf)
}
names <- c("Multiple", "Stepwise", "Ridge", "LASSO", "Knn", "Tree", "Bagged tree", "Random forest", "Boosting")
pred <- cbind(pred_multi, pred_step, pred_ridge, pred_lasso, pred_knn, pred_tree, pred_bg, pred_rf, pred_bs)
pred <- data.frame(pred)
colnames(pred) <- names
pred[1:10, ]
##     Multiple  Stepwise     Ridge     LASSO       Knn      Tree Bagged tree
## 4   462466.0  464519.3  470281.4  462742.9  489333.3  631203.8    302475.5
## 6  1448028.6 1452308.2 1417429.6 1448325.0 1125021.2 1455621.9   1290901.3
## 18  543449.2  541487.9  537691.3  542926.9  574833.3  668294.1    483000.8
## 19  330241.8  330026.3  300487.3  327848.2  186416.7  232930.2    299312.4
## 23  339441.4  338367.4  373031.8  342895.9  328166.7  300542.9    351762.5
## 26  309285.0  308063.5  305388.8  306121.0  223291.7  240557.0    299312.4
## 29  459167.7  459766.8  444634.1  459786.3  393383.3  502609.8    473832.4
## 33  647433.4  645262.5  623500.0  646835.9  575216.7  738373.3    647369.9
## 44  511426.9  507335.1  495669.0  510221.0  685000.0  581260.0    483000.8
## 45  149401.0  148963.6  156934.0  150715.0  219833.3  232930.2    299312.4
##    Random forest  Boosting
## 4       477386.1  373017.8
## 6      1396788.9 1822386.9
## 18      648861.7  594251.2
## 19      193174.3  259276.5
## 23      321917.9  324442.7
## 26      209347.4  318724.2
## 29      472844.4  441505.4
## 33      683207.4  651951.2
## 44      607811.3  553387.4
## 45      240895.0  205991.0

6.2. actual price / predicted price plotting

### actual price / predicted price plotting
par(mfrow = c(3, 3))
perf_mat <- NULL
for(i in 1:dim(pred)[2]){
  perf_mat <- rbind(perf_mat, perf_reg(test$price, pred[,i]))
  plot(test$price, pred[,i], pch = 2, cex = 0.5, col = "red", xlab = "Actual price", ylab = "Predicted price", main = names[i])
  lines(train$price, train$price, lty = 2, col = "blue")
}

6.3. performance matrix

# performance matrix
perf_mat <- data.frame(perf_mat)
colnames(perf_mat) <- c("MAE", "MAPE", "MSE", "RMSE")
rownames(perf_mat) <- names
round(perf_mat, 3)
##                     MAE   MAPE         MSE     RMSE
## Multiple      127814.85 25.889 43282595177 208044.7
## Stepwise      127911.98 25.911 43297626604 208080.8
## Ridge         126334.70 25.198 43957704007 209660.9
## LASSO         127596.86 25.799 43300523173 208087.8
## Knn            93911.00 16.777 32216110085 179488.5
## Tree           91137.49 16.355 28764794894 169601.9
## Bagged tree   121554.80 24.009 39361868752 198398.3
## Random forest  71645.05 13.272 19171588484 138461.5
## Boosting       99965.80 18.593 31690349408 178017.8

6.4. performance plotting

# performance plotting
par(mfrow = c(2, 2))
for(i in 1:4){
  plot(perf_mat[, i], type = "o", main = colnames(perf_mat)[i], ylab = "Error")
}