This report provides house price prediction using regression.
The dataset used in this report is Australia house data.
The Kaggle’s dataset link: here
Report Structure:
Data Extraction.
Exploratory Data Analysis.
Data Preparation/ Preprocessing.
Modelling
Evaluations
Recommendations
Read the house dataset and its structure
house_df <- read.csv("house.csv")
str(house_df)
## 'data.frame': 4600 obs. of 18 variables:
## $ date : chr "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
## $ price : num 313000 2384000 342000 420000 550000 ...
## $ bedrooms : num 3 5 3 3 4 2 2 4 3 4 ...
## $ bathrooms : num 1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
## $ sqft_living : int 1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
## $ sqft_lot : int 7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
## $ floors : num 1.5 2 1 1 1 1 1 2 1 1.5 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 4 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 5 4 4 4 3 3 3 4 3 ...
## $ sqft_above : int 1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
## $ sqft_basement: int 0 280 0 1000 800 0 0 0 860 0 ...
## $ yr_built : int 1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
## $ yr_renovated : int 2005 0 0 0 1992 1994 0 0 0 2010 ...
## $ street : chr "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
## $ city : chr "Shoreline" "Seattle" "Kent" "Bellevue" ...
## $ statezip : chr "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
The dataset contains 4600 observations and 18 variables. The target variable is price
## date price bedrooms bathrooms
## Length:4600 Min. : 0 Min. :0.000 Min. :0.000
## Class :character 1st Qu.: 322875 1st Qu.:3.000 1st Qu.:1.750
## Mode :character Median : 460943 Median :3.000 Median :2.250
## Mean : 551963 Mean :3.401 Mean :2.161
## 3rd Qu.: 654962 3rd Qu.:4.000 3rd Qu.:2.500
## Max. :26590000 Max. :9.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 370 Min. : 638 Min. :1.000 Min. :0.000000
## 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1980 Median : 7683 Median :1.500 Median :0.000000
## Mean : 2139 Mean : 14852 Mean :1.512 Mean :0.007174
## 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1074218 Max. :3.500 Max. :1.000000
## view condition sqft_above sqft_basement
## Min. :0.0000 Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :0.0000 Median :3.000 Median :1590 Median : 0.0
## Mean :0.2407 Mean :3.452 Mean :1827 Mean : 312.1
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 610.0
## Max. :4.0000 Max. :5.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated street city
## Min. :1900 Min. : 0.0 Length:4600 Length:4600
## 1st Qu.:1951 1st Qu.: 0.0 Class :character Class :character
## Median :1976 Median : 0.0 Mode :character Mode :character
## Mean :1971 Mean : 808.6
## 3rd Qu.:1997 3rd Qu.:1999.0
## Max. :2014 Max. :2014.0
## statezip country
## Length:4600 Length:4600
## Class :character Class :character
## Mode :character Mode :character
##
##
##
ggplot(data=house_df,
aes(y=price))+
geom_boxplot()+
scale_y_continuous(limits=c(0,2000000))
ggplot(data=house_df,
aes(x=bedrooms2, y=price))+
geom_boxplot()+
scale_y_continuous(limits=c(0,2000000))
house_df_num <- house_df[, 2:12]
r <- cor(house_df_num)
r
## price bedrooms bathrooms sqft_living sqft_lot
## price 1.00000000 0.200336289 0.32710992 0.43041003 0.050451295
## bedrooms 0.20033629 1.000000000 0.54591993 0.59488406 0.068819355
## bathrooms 0.32710992 0.545919926 1.00000000 0.76115370 0.107837479
## sqft_living 0.43041003 0.594884059 0.76115370 1.00000000 0.210538454
## sqft_lot 0.05045130 0.068819355 0.10783748 0.21053845 1.000000000
## floors 0.15146080 0.177894903 0.48642757 0.34485027 0.003749750
## waterfront 0.13564832 -0.003482794 0.07623225 0.11761601 0.017241054
## view 0.22850417 0.111028000 0.21196025 0.31100944 0.073906741
## condition 0.03491454 0.025079856 -0.11999434 -0.06282598 0.000558114
## sqft_above 0.36756960 0.484705342 0.68991841 0.87644325 0.216454651
## sqft_basement 0.21042657 0.334165246 0.29802018 0.44720554 0.034842303
## floors waterfront view condition sqft_above
## price 0.15146080 0.135648322 0.22850417 0.034914537 0.36756960
## bedrooms 0.17789490 -0.003482794 0.11102800 0.025079856 0.48470534
## bathrooms 0.48642757 0.076232247 0.21196025 -0.119994341 0.68991841
## sqft_living 0.34485027 0.117616006 0.31100944 -0.062825979 0.87644325
## sqft_lot 0.00374975 0.017241054 0.07390674 0.000558114 0.21645465
## floors 1.00000000 0.022023786 0.03121095 -0.275013395 0.52281374
## waterfront 0.02202379 1.000000000 0.36093476 0.000352282 0.07891113
## view 0.03121095 0.360934761 1.00000000 0.063077281 0.17432671
## condition -0.27501339 0.000352282 0.06307728 1.000000000 -0.17819634
## sqft_above 0.52281374 0.078911129 0.17432671 -0.178196344 1.00000000
## sqft_basement -0.25550982 0.097500955 0.32160180 0.200632350 -0.03872299
## sqft_basement
## price 0.21042657
## bedrooms 0.33416525
## bathrooms 0.29802018
## sqft_living 0.44720554
## sqft_lot 0.03484230
## floors -0.25550982
## waterfront 0.09750095
## view 0.32160180
## condition 0.20063235
## sqft_above -0.03872299
## sqft_basement 1.00000000
library(corrgram)
corrgram(house_df_num, order=TRUE,
upper.panel=panel.pie)
Remove rows with incorrect prices.
idx <- which(house_df_num$price %in% c(0))
#Remove the rows with incorrect values
house_df_num <- house_df_num[-idx,]
summary(house_df_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 326264 1st Qu.:3.000 1st Qu.:1.750 1st Qu.: 1460
## Median : 465000 Median :3.000 Median :2.250 Median : 1970
## Mean : 557906 Mean :3.395 Mean :2.155 Mean : 2132
## 3rd Qu.: 657500 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2610
## Max. :26590000 Max. :9.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7680 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14835 Mean :1.512 Mean :0.006592 Mean :0.2347
## 3rd Qu.: 10978 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median :1590 Median : 0.0
## Mean :3.449 Mean :1822 Mean : 310.2
## 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 600.0
## Max. :5.000 Max. :9410 Max. :4820.0
Remove rows with outlier prices
out_price <- boxplot.stats(house_df_num$price)$out
out_idx <- which(house_df_num$price %in% c(out_price))
house_df_num <- house_df_num[-out_idx,]
summary(house_df_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 320000 1st Qu.:3.000 1st Qu.:1.750 1st Qu.:1430
## Median : 450000 Median :3.000 Median :2.250 Median :1920
## Mean : 487457 Mean :3.352 Mean :2.094 Mean :2031
## 3rd Qu.: 615000 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.:2510
## Max. :1150000 Max. :9.000 Max. :5.750 Max. :7320
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7566 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14599 Mean :1.495 Mean :0.003711 Mean :0.1737
## 3rd Qu.: 10696 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1170 1st Qu.: 0.0
## Median :3.000 Median :1540 Median : 0.0
## Mean :3.444 Mean :1747 Mean : 283.7
## 3rd Qu.:4.000 3rd Qu.:2190 3rd Qu.: 560.0
## Max. :5.000 Max. :7320 Max. :2300.0
nrow(house_df_num)
## [1] 4311
Number of observations after the preprocessing is 4311.
house_df <- house_df[rownames(house_df_num),]
#Create a dataframe for statezip
statezip <- house_df$statezip
statezip_df <- data.frame(statezip)
colnames(statezip_df) <- c("loc.")
#One Hot Encoding on Statezip
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
df1 <- dummyVars("~.", data=statezip_df)
df2 <- data.frame(predict(df1, newdata=statezip_df))
## Combine house_df with satezip_df
house_df_num <- cbind(house_df_num, df2)
dim(house_df_num)
## [1] 4311 88
set.seed(2022)
train_idx <- sample(m, m*0.7)
train_idx[1:5]
## [1] 4324 1459 2871 3915 708
train_data <- house_df_num[train_idx,]
test_data <- house_df_num[-train_idx,]
dim(train_data)
## [1] 3220 88
dim(test_data)
## [1] 1299 88
mymodel <- lm(formula=price~. +
I(sqft_living^2), data = train_data)
summary(mymodel)
##
## Call:
## lm(formula = price ~ . + I(sqft_living^2), data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -881028 -51239 -908 48649 486215
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.604e+04 7.329e+04 -0.765 0.444519
## bedrooms -1.168e+04 2.835e+03 -4.120 3.89e-05 ***
## bathrooms 1.581e+04 4.319e+03 3.661 0.000255 ***
## sqft_living 1.549e+02 1.137e+01 13.622 < 2e-16 ***
## sqft_lot 1.829e-01 5.253e-02 3.482 0.000505 ***
## floors -2.557e+04 5.151e+03 -4.964 7.29e-07 ***
## waterfront 1.681e+05 3.017e+04 5.573 2.74e-08 ***
## view 3.455e+04 3.188e+03 10.837 < 2e-16 ***
## condition 2.544e+04 3.008e+03 8.456 < 2e-16 ***
## sqft_above 7.585e+01 6.343e+00 11.958 < 2e-16 ***
## sqft_basement NA NA NA NA
## loc.WA.98001 -9.653e+04 7.329e+04 -1.317 0.187925
## loc.WA.98002 -1.014e+05 7.502e+04 -1.352 0.176499
## loc.WA.98003 -7.978e+04 7.359e+04 -1.084 0.278352
## loc.WA.98004 4.598e+05 7.404e+04 6.209 6.07e-10 ***
## loc.WA.98005 2.610e+05 7.464e+04 3.497 0.000477 ***
## loc.WA.98006 2.338e+05 7.282e+04 3.210 0.001341 **
## loc.WA.98007 2.117e+05 7.464e+04 2.836 0.004595 **
## loc.WA.98008 1.304e+05 7.440e+04 1.752 0.079834 .
## loc.WA.98010 1.054e+04 8.266e+04 0.128 0.898503
## loc.WA.98011 5.193e+04 7.468e+04 0.695 0.486899
## loc.WA.98014 1.327e+04 7.632e+04 0.174 0.861987
## loc.WA.98019 -6.587e+03 7.367e+04 -0.089 0.928761
## loc.WA.98022 -9.111e+04 7.450e+04 -1.223 0.221403
## loc.WA.98023 -1.036e+05 7.264e+04 -1.427 0.153824
## loc.WA.98024 4.857e+04 8.470e+04 0.573 0.566388
## loc.WA.98027 1.044e+05 7.280e+04 1.434 0.151556
## loc.WA.98028 5.498e+04 7.311e+04 0.752 0.452110
## loc.WA.98029 1.752e+05 7.293e+04 2.402 0.016358 *
## loc.WA.98030 -9.076e+04 7.401e+04 -1.226 0.220158
## loc.WA.98031 -7.498e+04 7.306e+04 -1.026 0.304813
## loc.WA.98032 -1.186e+05 7.593e+04 -1.562 0.118305
## loc.WA.98033 2.660e+05 7.297e+04 3.645 0.000272 ***
## loc.WA.98034 1.069e+05 7.259e+04 1.472 0.141120
## loc.WA.98038 -6.574e+04 7.281e+04 -0.903 0.366696
## loc.WA.98039 -3.734e+05 1.240e+05 -3.010 0.002632 **
## loc.WA.98040 2.908e+05 7.379e+04 3.942 8.28e-05 ***
## loc.WA.98042 -8.641e+04 7.255e+04 -1.191 0.233698
## loc.WA.98045 1.507e+02 7.351e+04 0.002 0.998364
## loc.WA.98047 -5.808e+04 8.465e+04 -0.686 0.492684
## loc.WA.98050 4.640e+04 1.245e+05 0.373 0.709432
## loc.WA.98051 1.249e+04 8.502e+04 0.147 0.883251
## loc.WA.98052 2.004e+05 7.236e+04 2.769 0.005654 **
## loc.WA.98053 1.753e+05 7.288e+04 2.405 0.016222 *
## loc.WA.98055 -2.244e+04 7.529e+04 -0.298 0.765746
## loc.WA.98056 3.456e+04 7.274e+04 0.475 0.634726
## loc.WA.98057 -8.825e+04 7.781e+04 -1.134 0.256768
## loc.WA.98058 -4.027e+04 7.254e+04 -0.555 0.578828
## loc.WA.98059 3.127e+04 7.251e+04 0.431 0.666377
## loc.WA.98065 4.399e+04 7.334e+04 0.600 0.548648
## loc.WA.98068 NA NA NA NA
## loc.WA.98070 1.910e+04 7.537e+04 0.253 0.799944
## loc.WA.98072 1.142e+05 7.340e+04 1.556 0.119836
## loc.WA.98074 1.476e+05 7.291e+04 2.025 0.042998 *
## loc.WA.98075 2.108e+05 7.295e+04 2.889 0.003891 **
## loc.WA.98077 1.269e+05 7.383e+04 1.718 0.085852 .
## loc.WA.98092 -9.043e+04 7.285e+04 -1.241 0.214557
## loc.WA.98102 3.441e+05 7.796e+04 4.413 1.05e-05 ***
## loc.WA.98103 2.498e+05 7.235e+04 3.453 0.000561 ***
## loc.WA.98105 3.248e+05 7.465e+04 4.351 1.40e-05 ***
## loc.WA.98106 3.359e+04 7.321e+04 0.459 0.646394
## loc.WA.98107 2.448e+05 7.333e+04 3.338 0.000854 ***
## loc.WA.98108 3.884e+04 7.401e+04 0.525 0.599756
## loc.WA.98109 4.300e+05 7.694e+04 5.589 2.50e-08 ***
## loc.WA.98112 3.690e+05 7.388e+04 4.994 6.25e-07 ***
## loc.WA.98115 2.299e+05 7.246e+04 3.173 0.001522 **
## loc.WA.98116 2.353e+05 7.316e+04 3.217 0.001311 **
## loc.WA.98117 2.214e+05 7.247e+04 3.055 0.002269 **
## loc.WA.98118 9.917e+04 7.286e+04 1.361 0.173609
## loc.WA.98119 3.472e+05 7.484e+04 4.639 3.66e-06 ***
## loc.WA.98122 2.546e+05 7.297e+04 3.489 0.000493 ***
## loc.WA.98125 8.640e+04 7.274e+04 1.188 0.235001
## loc.WA.98126 1.158e+05 7.291e+04 1.589 0.112262
## loc.WA.98133 6.753e+04 7.268e+04 0.929 0.352904
## loc.WA.98136 1.805e+05 7.346e+04 2.457 0.014049 *
## loc.WA.98144 1.935e+05 7.300e+04 2.651 0.008080 **
## loc.WA.98146 3.629e+04 7.333e+04 0.495 0.620680
## loc.WA.98148 -1.805e+04 7.781e+04 -0.232 0.816559
## loc.WA.98155 5.622e+04 7.283e+04 0.772 0.440188
## loc.WA.98166 2.513e+04 7.350e+04 0.342 0.732419
## loc.WA.98168 -4.070e+04 7.321e+04 -0.556 0.578234
## loc.WA.98177 1.584e+05 7.368e+04 2.149 0.031696 *
## loc.WA.98178 -6.624e+04 7.353e+04 -0.901 0.367777
## loc.WA.98188 -6.869e+04 7.783e+04 -0.883 0.377544
## loc.WA.98198 -6.901e+04 7.343e+04 -0.940 0.347358
## loc.WA.98199 2.480e+05 7.336e+04 3.380 0.000734 ***
## loc.WA.98288 -3.587e+04 9.241e+04 -0.388 0.697978
## loc.WA.98354 NA NA NA NA
## I(sqft_living^2) -1.212e-02 1.916e-03 -6.323 2.96e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101100 on 2926 degrees of freedom
## (208 observations deleted due to missingness)
## Multiple R-squared: 0.7854, Adjusted R-squared: 0.7792
## F-statistic: 126 on 85 and 2926 DF, p-value: < 2.2e-16
actual <- test_data$price
pred.mymodel <- predict(mymodel, test_data)
## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading
Plot Actual vs Prediction Series
price_df <- data.frame(actual, pred.mymodel)
ggplot(data=price_df,
aes(x=actual, y=pred.mymodel))+
geom_point()+
scale_x_continuous(limits=c(0,2000000))+
scale_y_continuous(limits=c(0,2000000))
Regression Metrics Measurement
performance <- function(actual, prediction, method){
e <- prediction-actual
se <- e^2
sse<- sum(se)
mse<- mean(se)
rmse<-sqrt(mse)
result <- paste("Method:", method,
"\n SSE:", round(sse,3),
"\n MSE:", round(mse,3),
"\n RMSE:", round(rmse,3))
cat(result)
}
performance(actual, pred.mymodel, "Regression Model")
## Method: Regression Model
## SSE: 12666481237787
## MSE: 9750947835.094
## RMSE: 98746.888