Load Some house sales data
library(RCurl)
## Loading required package: bitops
csv <- getURL("https://s3-ap-northeast-1.amazonaws.com/ldktorage/coursera_dato/home_data.csv")
sales <- read.csv(text = csv)
colnames(sales)
## [1] "id" "date" "price" "bedrooms"
## [5] "bathrooms" "sqft_living" "sqft_lot" "floors"
## [9] "waterfront" "view" "condition" "grade"
## [13] "sqft_above" "sqft_basement" "yr_built" "yr_renovated"
## [17] "zipcode" "lat" "long" "sqft_living15"
## [21] "sqft_lot15"
head(sales)
## id date price bedrooms bathrooms sqft_living
## 1 7129300520 20141013T000000 221900 3 1.00 1180
## 2 6414100192 20141209T000000 538000 3 2.25 2570
## 3 5631500400 20150225T000000 180000 2 1.00 770
## 4 2487200875 20141209T000000 604000 4 3.00 1960
## 5 1954400510 20150218T000000 510000 3 2.00 1680
## 6 7237550310 20140512T000000 1225000 4 4.50 5420
## sqft_lot floors waterfront view condition grade sqft_above sqft_basement
## 1 5650 1 0 0 3 7 1180 0
## 2 7242 2 0 0 3 7 2170 400
## 3 10000 1 0 0 3 6 770 0
## 4 5000 1 0 0 5 7 1050 910
## 5 8080 1 0 0 3 8 1680 0
## 6 101930 1 0 0 3 11 3890 1530
## yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1 1955 0 98178 47.5112 -122.257 1340 5650
## 2 1951 1991 98125 47.7210 -122.319 1690 7639
## 3 1933 0 98028 47.7379 -122.233 2720 8062
## 4 1965 0 98136 47.5208 -122.393 1360 5000
## 5 1987 0 98074 47.6168 -122.045 1800 7503
## 6 2001 0 98053 47.6561 -122.005 4760 101930
str(sales)
## 'data.frame': 21613 obs. of 21 variables:
## $ id : num 7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
## $ date : Factor w/ 372 levels "20140502T000000",..: 165 221 291 221 284 11 57 252 340 306 ...
## $ price : int 221900 538000 180000 604000 510000 1225000 257500 291850 229500 323000 ...
## $ bedrooms : int 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : int 1180 2570 770 1960 1680 5420 1715 1060 1780 1890 ...
## $ sqft_lot : int 5650 7242 10000 5000 8080 101930 6819 9711 7470 6560 ...
## $ floors : num 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : int 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : int 1180 2170 770 1050 1680 3890 1715 1060 1050 1890 ...
## $ sqft_basement: int 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : int 1955 1951 1933 1965 1987 2001 1995 1963 1960 2003 ...
## $ yr_renovated : int 0 1991 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98178 98125 98028 98136 98074 98053 98003 98198 98146 98038 ...
## $ lat : num 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1340 1690 2720 1360 1800 4760 2238 1650 1780 2390 ...
## $ sqft_lot15 : int 5650 7639 8062 5000 7503 101930 6819 9711 8113 7570 ...
Exploring the data for Housing
library(ggplot2)
qplot(sales$sqft_living,sales$price,colour=sales$price,
xlab ="Squre feet", ylab="price($)")

Create a simple regression model of sqft_living to Price
set.seed(0)
prob <- c(0.8,0.2)
tags <- c("train", "test")
sample_tag <- sample(tags,size=length(sales$id),replace=TRUE,prob=prob)
sales_tag <- cbind(sales,sample_tag)
train_data <- subset(sales_tag,sample_tag == 'train',select=-sample_tag)
test_data <- subset(sales_tag,sample_tag == 'test',select=-sample_tag)
length(train_data[,1])
## [1] 17226
length(test_data[,1])
## [1] 4387
Build the regression model
sqft_model <- lm(formula=price~sqft_living,data=train_data)
Evaludate the simple model
print(mean(train_data$price))
## [1] 540610.4
range(fitted(sqft_model)) #예측값의 범위
## [1] 33358.98 3363212.06
range(residuals(sqft_model)) #잔차의 범위
## [1] -1269213 4336788
dist <- fitted(sqft_model) + residuals(sqft_model)
range(dist)
## [1] 80000 7700000
deviance(sqft_model) #SSE(잔차제곱)
## [1] 1.203431e+15
sqrt(sum(residuals(sqft_model)^2) / length(train_data$id)) #RMSE
## [1] 264312.9
real_test <- test_data$price
predict_test <- predict(sqft_model,test_data)
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 249897.2
Let’s show what our predictions look like
ggplot(train_data, aes(x=sqft_living, y=price)) +
geom_point(alpha=1, colour='blue') +
geom_abline(intercept=sqft_model$coefficient[1], slope=sqft_model$coefficients[2],colour="red")

ggplot(test_data, aes(x=sqft_living, y=price)) +
geom_point(alpha=1, colour='blue') +
geom_abline(intercept=sqft_model$coefficient[1], slope=sqft_model$coefficients[2],colour="red")

summary(sqft_model)
##
## Call:
## lm(formula = price ~ sqft_living, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1269213 -148981 -24037 108087 4336788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -48754.7 5002.7 -9.746 <2e-16 ***
## sqft_living 283.2 2.2 128.699 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 264300 on 17224 degrees of freedom
## Multiple R-squared: 0.4902, Adjusted R-squared: 0.4902
## F-statistic: 1.656e+04 on 1 and 17224 DF, p-value: < 2.2e-16
predict_val <- predict(sqft_model, newdata=data.frame(sqft_living=seq(1000,2000,by=100)),interval = "confidence")
sqftliving = seq(1000,2000,by=100)
predict_val <- cbind(predict_val,sqftliving)
predictdf <- data.frame(predict_val)
ggplot(predictdf, aes(sqftliving,fit)) +
geom_line() +
geom_smooth(aes(ymin = lwr, ymax = upr), data=predictdf, stat="identity")

par(mfrow = c(2, 2))
plot(sqft_model)

par(mfrow = c(1, 2))
plot(sqft_model, which = c(4, 6))

Explorer other Feautres in the data
my_features <- c('bedrooms','bathrooms','sqft_living','sqft_lot','floors','zipcode')
summary(sales[,my_features])
## bedrooms bathrooms sqft_living sqft_lot
## Min. : 0.000 Min. :0.000 Min. : 290 Min. : 520
## 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427 1st Qu.: 5040
## Median : 3.000 Median :2.250 Median : 1910 Median : 7618
## Mean : 3.371 Mean :2.115 Mean : 2080 Mean : 15107
## 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10688
## Max. :33.000 Max. :8.000 Max. :13540 Max. :1651359
## floors zipcode
## Min. :1.000 Min. :98001
## 1st Qu.:1.000 1st Qu.:98033
## Median :1.500 Median :98065
## Mean :1.494 Mean :98078
## 3rd Qu.:2.000 3rd Qu.:98118
## Max. :3.500 Max. :98199
ggplot(sales, aes(x=as.factor(zipcode), y=price)) +
geom_boxplot(fill="yellow", color="blue")

sort(table(sales$sqft_living),decreasing=TRUE)[1]
## 1300
## 138
sales_sqr_ft_1300 <- subset(sales,sqft_living==1300)
range(sales_sqr_ft_1300$sqft_living)
## [1] 1300 1300
ggplot(sales_sqr_ft_1300, aes(as.factor(zipcode),price)) +
geom_boxplot(fill="yellow", color="blue")

sort(table(sales$zipcode),decreasing=TRUE)[1]
## 98103
## 602
sales_zipcode_98103 <- subset(sales,zipcode==98103)
ggplot(sales_zipcode_98103, aes(as.factor(bedrooms),price)) +
geom_boxplot(color="blue", fill="green") +
ggtitle("Box Plot for Bedrooms in Seattle") +
xlab("Bedrooms") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(bathrooms),price)) +
geom_boxplot(color="blue", fill="green") +
ggtitle("Box Plot for Bathrooms in Seattle") +
xlab("Bathrooms") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(sqft_lot),price)) +
geom_boxplot(color="blue", fill="green") +
ggtitle("Box Plot for Sqft lot in Seattle") +
xlab("Sqft lot") + ylab("Price")

ggplot(sales_zipcode_98103, aes(as.factor(floors),price),price) +
geom_boxplot(color="blue", fill="green") +
ggtitle("Box Plot for Floors in Seattle") +
xlab("Floors") + ylab("Price")

Build a regression model with more features
my_features_model <- lm(formula=price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + zipcode, data=train_data)
summary(my_features_model)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + zipcode, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1342194 -142693 -22608 101827 4089210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.615e+07 3.713e+06 -15.121 < 2e-16 ***
## bedrooms -5.737e+04 2.632e+03 -21.797 < 2e-16 ***
## bathrooms 6.771e+03 4.320e+03 1.567 0.117
## sqft_living 3.216e+02 3.539e+00 90.858 < 2e-16 ***
## sqft_lot -2.672e-01 4.889e-02 -5.465 4.7e-08 ***
## floors -2.275e+03 4.263e+03 -0.534 0.594
## zipcode 5.730e+02 3.785e+01 15.142 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 258600 on 17219 degrees of freedom
## Multiple R-squared: 0.5121, Adjusted R-squared: 0.5119
## F-statistic: 3012 on 6 and 17219 DF, p-value: < 2.2e-16
sqrt(sum(residuals(my_features_model)^2) / length(train_data$id)) #RMSE
## [1] 258590.8
real_test <- test_data$price
predict_test <- predict(my_features_model,test_data)
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 243852.3
Apply learned models to predict prices of any house
house1 = sales[which(sales$id=='5309101200'),]
house1
## id date price bedrooms bathrooms sqft_living
## 1055 5309101200 20140605T000000 620000 4 2.25 2400
## sqft_lot floors waterfront view condition grade sqft_above
## 1055 5350 1.5 0 0 4 7 1460
## sqft_basement yr_built yr_renovated zipcode lat long
## 1055 940 1929 0 98117 47.6763 -122.37
## sqft_living15 sqft_lot15
## 1055 1250 4880
house1$price
## [1] 620000
predict(sqft_model,house1)
## 1055
## 630807.1
predict(my_features_model,house1)
## 1055
## 632955.7
house2 = sales[which(sales$id=='1925069082'),]
house2
## id date price bedrooms bathrooms sqft_living
## 1362 1925069082 20150511T000000 2200000 5 4.25 4640
## sqft_lot floors waterfront view condition grade sqft_above
## 1362 22703 2 1 4 5 8 2860
## sqft_basement yr_built yr_renovated zipcode lat long
## 1362 1780 1952 0 98052 47.6393 -122.097
## sqft_living15 sqft_lot15
## 1362 3140 14200
house2$price
## [1] 2200000
predict(sqft_model,house2)
## 1362
## 1265065
predict(my_features_model,house2)
## 1362
## 1266452

Build a regression model with advanced features
power_model <- lm(formula=price ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + zipcode + condition + grade + waterfront + view + sqft_above + sqft_basement + yr_built + yr_renovated + lat + long + sqft_living15 + sqft_lot15, data=train_data)
summary(power_model)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft_living + sqft_lot +
## floors + zipcode + condition + grade + waterfront + view +
## sqft_above + sqft_basement + yr_built + yr_renovated + lat +
## long + sqft_living15 + sqft_lot15, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1262846 -100471 -10032 78809 4304113
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.923e+06 3.315e+06 1.786 0.07404 .
## bedrooms -3.489e+04 2.138e+03 -16.317 < 2e-16 ***
## bathrooms 3.840e+04 3.713e+03 10.342 < 2e-16 ***
## sqft_living 1.491e+02 5.010e+00 29.765 < 2e-16 ***
## sqft_lot 1.547e-01 5.477e-02 2.825 0.00474 **
## floors 5.805e+03 4.101e+03 1.415 0.15695
## zipcode -5.803e+02 3.732e+01 -15.551 < 2e-16 ***
## condition 2.805e+04 2.666e+03 10.522 < 2e-16 ***
## grade 9.715e+04 2.448e+03 39.685 < 2e-16 ***
## waterfront 5.439e+05 1.930e+04 28.174 < 2e-16 ***
## view 5.607e+04 2.440e+03 22.982 < 2e-16 ***
## sqft_above 3.736e+01 4.948e+00 7.552 4.51e-14 ***
## sqft_basement NA NA NA NA
## yr_built -2.619e+03 8.301e+01 -31.554 < 2e-16 ***
## yr_renovated 1.891e+01 4.143e+00 4.565 5.04e-06 ***
## lat 6.085e+05 1.217e+04 49.994 < 2e-16 ***
## long -2.169e+05 1.498e+04 -14.482 < 2e-16 ***
## sqft_living15 1.924e+01 3.917e+00 4.913 9.07e-07 ***
## sqft_lot15 -4.295e-01 8.331e-02 -5.156 2.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 204200 on 17208 degrees of freedom
## Multiple R-squared: 0.6961, Adjusted R-squared: 0.6958
## F-statistic: 2319 on 17 and 17208 DF, p-value: < 2.2e-16
sqrt(sum(residuals(power_model)^2) / length(train_data$id)) #RMSE
## [1] 204071.8
real_test <- test_data$price
predict_test <- predict(power_model,test_data)
## Warning in predict.lm(power_model, test_data): prediction from a rank-
## deficient fit may be misleading
sqrt(sum((predict_test-real_test)^2) / length(test_data$id)) #RMSE
## [1] 189642.1
par(mfrow=c(2,2))
plot(power_model)

predict(power_model, house1)
## Warning in predict.lm(power_model, house1): prediction from a rank-
## deficient fit may be misleading
## 1055
## 672306.2
predict(power_model, house2)
## Warning in predict.lm(power_model, house2): prediction from a rank-
## deficient fit may be misleading
## 1362
## 1927580