Load Some house sales data

library(RCurl)
## Loading required package: bitops
csv <- getURL("https://s3-ap-northeast-1.amazonaws.com/ldktorage/coursera_dato/home_data.csv")
sales <- read.csv(text = csv)
colnames(sales)
##  [1] "id"            "date"          "price"         "bedrooms"     
##  [5] "bathrooms"     "sqft_living"   "sqft_lot"      "floors"       
##  [9] "waterfront"    "view"          "condition"     "grade"        
## [13] "sqft_above"    "sqft_basement" "yr_built"      "yr_renovated" 
## [17] "zipcode"       "lat"           "long"          "sqft_living15"
## [21] "sqft_lot15"
head(sales)
##           id            date   price bedrooms bathrooms sqft_living
## 1 7129300520 20141013T000000  221900        3      1.00        1180
## 2 6414100192 20141209T000000  538000        3      2.25        2570
## 3 5631500400 20150225T000000  180000        2      1.00         770
## 4 2487200875 20141209T000000  604000        4      3.00        1960
## 5 1954400510 20150218T000000  510000        3      2.00        1680
## 6 7237550310 20140512T000000 1225000        4      4.50        5420
##   sqft_lot floors waterfront view condition grade sqft_above sqft_basement
## 1     5650      1          0    0         3     7       1180             0
## 2     7242      2          0    0         3     7       2170           400
## 3    10000      1          0    0         3     6        770             0
## 4     5000      1          0    0         5     7       1050           910
## 5     8080      1          0    0         3     8       1680             0
## 6   101930      1          0    0         3    11       3890          1530
##   yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1     1955            0   98178 47.5112 -122.257          1340       5650
## 2     1951         1991   98125 47.7210 -122.319          1690       7639
## 3     1933            0   98028 47.7379 -122.233          2720       8062
## 4     1965            0   98136 47.5208 -122.393          1360       5000
## 5     1987            0   98074 47.6168 -122.045          1800       7503
## 6     2001            0   98053 47.6561 -122.005          4760     101930
str(sales)
## 'data.frame':    21613 obs. of  21 variables:
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ date         : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
##  $ price        : int  221900 538000 180000 604000 510000 1225000 257500 291850 229500 323000 ...
##  $ bedrooms     : int  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : int  1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
##  $ sqft_lot     : int  5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : int  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : int  1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
##  $ sqft_basement: int  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : int  1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
##  $ yr_renovated : int  0 1991 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
##  $ sqft_lot15   : int  5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...

Exploring the data for Housing

library(ggplot2)
qplot(sales$sqft_living,sales$price,colour=sales$price,
      xlab ="Squre feet", ylab="price($)")

Create a simple regression model of sqft_living to Price

set.seed(0)
prob <- c(0.8,0.2)
tags <- c("train", "test")
sample_tag <-  sample(tags,size=length(sales$id),replace=TRUE,prob=prob)
sales_tag <- cbind(sales,sample_tag)
train_data <- subset(sales_tag,sample_tag == 'train',select=-sample_tag)
test_data <- subset(sales_tag,sample_tag == 'test',select=-sample_tag)
length(train_data[,1])
## [1] 17226
length(test_data[,1])
## [1] 4387

Build the regression model

sqft_model <- lm(formula=price~sqft_living,data=train_data)

Evaludate the simple model

print(mean(train_data$price))
## [1] 540610.4
range(fitted(sqft_model)) #예측값의 범위
## [1]   33358.98 3363212.06
range(residuals(sqft_model)) #잔차의 범위
## [1] -1269213  4336788
dist <- fitted(sqft_model) + residuals(sqft_model)
range(dist)
## [1]   80000 7700000
deviance(sqft_model) #SSE(잔차제곱)
## [1] 1.203431e+15
sqrt(sum(residuals(sqft_model)^2) / length(train_data$id)) #RMSE
## [1] 264312.9
real_test <- test_data$price
predict_test <- predict(sqft_model,test_data)
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 249897.2

Let’s show what our predictions look like

ggplot(train_data, aes(x=sqft_living, y=price)) +
    geom_point(alpha=1, colour='blue') +
    geom_abline(intercept=sqft_model$coefficient[1], slope=sqft_model$coefficients[2],colour="red")

ggplot(test_data, aes(x=sqft_living, y=price)) +
    geom_point(alpha=1, colour='blue') +
    geom_abline(intercept=sqft_model$coefficient[1], slope=sqft_model$coefficients[2],colour="red")

summary(sqft_model)
## 
## Call:
## lm(formula = price ~ sqft_living, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1269213  -148981   -24037   108087  4336788 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -48754.7     5002.7  -9.746   <2e-16 ***
## sqft_living    283.2        2.2 128.699   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 264300 on 17224 degrees of freedom
## Multiple R-squared:  0.4902, Adjusted R-squared:  0.4902 
## F-statistic: 1.656e+04 on 1 and 17224 DF,  p-value: < 2.2e-16
predict_val <- predict(sqft_model, newdata=data.frame(sqft_living=seq(1000,2000,by=100)),interval = "confidence")
sqftliving = seq(1000,2000,by=100)
predict_val <- cbind(predict_val,sqftliving)
predictdf <- data.frame(predict_val)

ggplot(predictdf, aes(sqftliving,fit)) +
  geom_line() +
  geom_smooth(aes(ymin = lwr, ymax = upr), data=predictdf, stat="identity")

par(mfrow = c(2, 2))
plot(sqft_model)

par(mfrow = c(1, 2))
plot(sqft_model, which = c(4, 6))

Explorer other Feautres in the data

my_features <- c('bedrooms','bathrooms','sqft_living','sqft_lot','floors','zipcode')
summary(sales[,my_features])
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 0.000   Min.   :0.000   Min.   :  290   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7618  
##  Mean   : 3.371   Mean   :2.115   Mean   : 2080   Mean   :  15107  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1651359  
##      floors         zipcode     
##  Min.   :1.000   Min.   :98001  
##  1st Qu.:1.000   1st Qu.:98033  
##  Median :1.500   Median :98065  
##  Mean   :1.494   Mean   :98078  
##  3rd Qu.:2.000   3rd Qu.:98118  
##  Max.   :3.500   Max.   :98199
ggplot(sales, aes(x=as.factor(zipcode), y=price)) +
  geom_boxplot(fill="yellow", color="blue")

sort(table(sales$sqft_living),decreasing=TRUE)[1]
## 1300 
##  138
sales_sqr_ft_1300 <- subset(sales,sqft_living==1300)
range(sales_sqr_ft_1300$sqft_living)
## [1] 1300 1300
ggplot(sales_sqr_ft_1300, aes(as.factor(zipcode),price)) +
  geom_boxplot(fill="yellow", color="blue")

sort(table(sales$zipcode),decreasing=TRUE)[1]
## 98103 
##   602
sales_zipcode_98103 <- subset(sales,zipcode==98103)
ggplot(sales_zipcode_98103, aes(as.factor(bedrooms),price)) +
  geom_boxplot(color="blue", fill="green") +
  ggtitle("Box Plot for Bedrooms in Seattle") +
  xlab("Bedrooms") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(bathrooms),price)) +
  geom_boxplot(color="blue", fill="green") +
  ggtitle("Box Plot for Bathrooms in Seattle") +
  xlab("Bathrooms") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(sqft_lot),price)) +
  geom_boxplot(color="blue", fill="green") +
  ggtitle("Box Plot for Sqft lot in Seattle") +
  xlab("Sqft lot") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(floors),price),price) +
  geom_boxplot(color="blue", fill="green") +
  ggtitle("Box Plot for Floors in Seattle") +
  xlab("Floors") + ylab("Price")

Build a regression model with more features

my_features_model <- lm(formula=price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + zipcode, data=train_data)
summary(my_features_model)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot + 
##     floors + zipcode, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1342194  -142693   -22608   101827  4089210 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.615e+07  3.713e+06 -15.121  < 2e-16 ***
## bedrooms    -5.737e+04  2.632e+03 -21.797  < 2e-16 ***
## bathrooms    6.771e+03  4.320e+03   1.567    0.117    
## sqft_living  3.216e+02  3.539e+00  90.858  < 2e-16 ***
## sqft_lot    -2.672e-01  4.889e-02  -5.465  4.7e-08 ***
## floors      -2.275e+03  4.263e+03  -0.534    0.594    
## zipcode      5.730e+02  3.785e+01  15.142  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 258600 on 17219 degrees of freedom
## Multiple R-squared:  0.5121, Adjusted R-squared:  0.5119 
## F-statistic:  3012 on 6 and 17219 DF,  p-value: < 2.2e-16
sqrt(sum(residuals(my_features_model)^2) / length(train_data$id)) #RMSE
## [1] 258590.8
real_test <- test_data$price
predict_test <- predict(my_features_model,test_data)
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 243852.3

Apply learned models to predict prices of any house

house1 = sales[which(sales$id=='5309101200'),]
house1
##              id            date  price bedrooms bathrooms sqft_living
## 1055 5309101200 20140605T000000 620000        4      2.25        2400
##      sqft_lot floors waterfront view condition grade sqft_above
## 1055     5350    1.5          0    0         4     7       1460
##      sqft_basement yr_built yr_renovated zipcode     lat    long
## 1055           940     1929            0   98117 47.6763 -122.37
##      sqft_living15 sqft_lot15
## 1055          1250       4880
house1$price
## [1] 620000
predict(sqft_model,house1)
##     1055 
## 630807.1
predict(my_features_model,house1)
##     1055 
## 632955.7

house2 = sales[which(sales$id=='1925069082'),]
house2
##              id            date   price bedrooms bathrooms sqft_living
## 1362 1925069082 20150511T000000 2200000        5      4.25        4640
##      sqft_lot floors waterfront view condition grade sqft_above
## 1362    22703      2          1    4         5     8       2860
##      sqft_basement yr_built yr_renovated zipcode     lat     long
## 1362          1780     1952            0   98052 47.6393 -122.097
##      sqft_living15 sqft_lot15
## 1362          3140      14200
house2$price
## [1] 2200000
predict(sqft_model,house2)
##    1362 
## 1265065
predict(my_features_model,house2)
##    1362 
## 1266452

image of a fancier house

Build a regression model with advanced features

power_model <- lm(formula=price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + zipcode + condition + grade + waterfront + view + sqft_above + sqft_basement + yr_built + yr_renovated + lat + long + sqft_living15 + sqft_lot15, data=train_data)

summary(power_model)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot + 
##     floors + zipcode + condition + grade + waterfront + view + 
##     sqft_above + sqft_basement + yr_built + yr_renovated + lat + 
##     long + sqft_living15 + sqft_lot15, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1262846  -100471   -10032    78809  4304113 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.923e+06  3.315e+06   1.786  0.07404 .  
## bedrooms      -3.489e+04  2.138e+03 -16.317  < 2e-16 ***
## bathrooms      3.840e+04  3.713e+03  10.342  < 2e-16 ***
## sqft_living    1.491e+02  5.010e+00  29.765  < 2e-16 ***
## sqft_lot       1.547e-01  5.477e-02   2.825  0.00474 ** 
## floors         5.805e+03  4.101e+03   1.415  0.15695    
## zipcode       -5.803e+02  3.732e+01 -15.551  < 2e-16 ***
## condition      2.805e+04  2.666e+03  10.522  < 2e-16 ***
## grade          9.715e+04  2.448e+03  39.685  < 2e-16 ***
## waterfront     5.439e+05  1.930e+04  28.174  < 2e-16 ***
## view           5.607e+04  2.440e+03  22.982  < 2e-16 ***
## sqft_above     3.736e+01  4.948e+00   7.552 4.51e-14 ***
## sqft_basement         NA         NA      NA       NA    
## yr_built      -2.619e+03  8.301e+01 -31.554  < 2e-16 ***
## yr_renovated   1.891e+01  4.143e+00   4.565 5.04e-06 ***
## lat            6.085e+05  1.217e+04  49.994  < 2e-16 ***
## long          -2.169e+05  1.498e+04 -14.482  < 2e-16 ***
## sqft_living15  1.924e+01  3.917e+00   4.913 9.07e-07 ***
## sqft_lot15    -4.295e-01  8.331e-02  -5.156 2.56e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 204200 on 17208 degrees of freedom
## Multiple R-squared:  0.6961, Adjusted R-squared:  0.6958 
## F-statistic:  2319 on 17 and 17208 DF,  p-value: < 2.2e-16
sqrt(sum(residuals(power_model)^2) / length(train_data$id)) #RMSE
## [1] 204071.8
real_test <- test_data$price
predict_test <- predict(power_model,test_data)
## Warning in predict.lm(power_model, test_data): prediction from a rank-
## deficient fit may be misleading
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 189642.1
par(mfrow=c(2,2))
plot(power_model)

predict(power_model, house1)
## Warning in predict.lm(power_model, house1): prediction from a rank-
## deficient fit may be misleading
##     1055 
## 672306.2
predict(power_model, house2)
## Warning in predict.lm(power_model, house2): prediction from a rank-
## deficient fit may be misleading
##    1362 
## 1927580