This project predicts house prices using regression models. The dataset used for the modelling covers houses located in Washington State, USA (see the city, statezip and country columns below). Dataset link: Click Here
The project is structured as follows :
Data Understanding
Exploratory Data Analysis
Data Preparations
Modeling
Performance Evaluations
Read the house dataset and inspect its structure
# Load the raw listings from disk, then print the column types.
house_df <- read.csv(file = "house.csv")
str(object = house_df)
## 'data.frame': 4600 obs. of 18 variables:
## $ date : chr "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
## $ price : num 313000 2384000 342000 420000 550000 ...
## $ bedrooms : num 3 5 3 3 4 2 2 4 3 4 ...
## $ bathrooms : num 1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
## $ sqft_living : int 1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
## $ sqft_lot : int 7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
## $ floors : num 1.5 2 1 1 1 1 1 2 1 1.5 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 4 0 0 0 0 0 0 0 0 ...
## $ condition : int 3 5 4 4 4 3 3 3 4 3 ...
## $ sqft_above : int 1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
## $ sqft_basement: int 0 280 0 1000 800 0 0 0 860 0 ...
## $ yr_built : int 1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
## $ yr_renovated : int 2005 0 0 0 1992 1994 0 0 0 2010 ...
## $ street : chr "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
## $ city : chr "Shoreline" "Seattle" "Kent" "Bellevue" ...
## $ statezip : chr "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
The dataset contains 4600 rows and 18 columns. The target variable is price; the remaining columns are the features.
# Per-column summary statistics of the raw data.
summary(object = house_df)
## date price bedrooms bathrooms
## Length:4600 Min. : 0 Min. :0.000 Min. :0.000
## Class :character 1st Qu.: 322875 1st Qu.:3.000 1st Qu.:1.750
## Mode :character Median : 460943 Median :3.000 Median :2.250
## Mean : 551963 Mean :3.401 Mean :2.161
## 3rd Qu.: 654962 3rd Qu.:4.000 3rd Qu.:2.500
## Max. :26590000 Max. :9.000 Max. :8.000
## sqft_living sqft_lot floors waterfront
## Min. : 370 Min. : 638 Min. :1.000 Min. :0.000000
## 1st Qu.: 1460 1st Qu.: 5001 1st Qu.:1.000 1st Qu.:0.000000
## Median : 1980 Median : 7683 Median :1.500 Median :0.000000
## Mean : 2139 Mean : 14852 Mean :1.512 Mean :0.007174
## 3rd Qu.: 2620 3rd Qu.: 11001 3rd Qu.:2.000 3rd Qu.:0.000000
## Max. :13540 Max. :1074218 Max. :3.500 Max. :1.000000
## view condition sqft_above sqft_basement
## Min. :0.0000 Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:1190 1st Qu.: 0.0
## Median :0.0000 Median :3.000 Median :1590 Median : 0.0
## Mean :0.2407 Mean :3.452 Mean :1827 Mean : 312.1
## 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:2300 3rd Qu.: 610.0
## Max. :4.0000 Max. :5.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated street city
## Min. :1900 Min. : 0.0 Length:4600 Length:4600
## 1st Qu.:1951 1st Qu.: 0.0 Class :character Class :character
## Median :1976 Median : 0.0 Mode :character Mode :character
## Mean :1971 Mean : 808.6
## 3rd Qu.:1997 3rd Qu.:1999.0
## Max. :2014 Max. :2014.0
## statezip country
## Length:4600 Length:4600
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Plot distribution of Price with Boxplot
# Boxplot of price, with the y axis limited to the 0 to 2,000,000 range.
ggplot(house_df, aes(y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 2e6))
# Factor versions of the discrete / categorical columns, used for grouping.
# bedrooms2 must be derived from `bedrooms`: it was previously derived from
# `bathrooms`, so the boxplot below was grouped by the wrong column.
house_df$bedrooms2 <- factor(house_df$bedrooms)
house_df$city2 <- factor(house_df$city)
house_df$statezip2 <- factor(house_df$statezip)
house_df$street2 <- factor(house_df$street)
house_df$country2 <- factor(house_df$country)

# Price distribution per bedroom count, with the y axis limited to
# 0 to 2,000,000.
ggplot(data = house_df, aes(x = bedrooms2, y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 2000000))
# Pairwise correlations between price and the two room-count columns.
with(house_df, cor(price, bedrooms))
## [1] 0.2003363
with(house_df, cor(price, bathrooms))
## [1] 0.3271099
with(house_df, cor(bedrooms, bathrooms))
## [1] 0.5459199
# Keep columns 2 to 12 (price through sqft_basement), i.e. the numeric
# columns that precede the year and location fields.
house_df_num <- house_df[2:12]
# Correlation matrix of the numeric columns.
r <- cor(house_df_num)
# Correlogram with pie glyphs in the upper triangle.
library(corrgram)
corrgram(house_df_num, order = TRUE, upper.panel = panel.pie)
Remove rows with an invalid price (price equal to 0)
# Positions of listings whose price is recorded as 0.
idx <- which(house_df_num$price %in% 0)
# Guard against an empty match: house_df_num[-idx, ] with a zero-length idx
# selects zero rows and would silently drop the whole data frame.
if (length(idx) > 0) {
  house_df_num <- house_df_num[-idx, ]
}
summary(house_df_num$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7800 326264 465000 557906 657500 26590000
Remove rows with outlier prices
# Prices flagged as outliers by boxplot.stats().
out_price <- boxplot.stats(house_df_num$price)$out
out_idx <- which(house_df_num$price %in% out_price)
# Guard against an empty match: negative indexing with a zero-length vector
# selects zero rows instead of keeping all of them.
if (length(out_idx) > 0) {
  house_df_num <- house_df_num[-out_idx, ]
}
summary(house_df_num)
## price bedrooms bathrooms sqft_living
## Min. : 7800 Min. :0.000 Min. :0.000 Min. : 370
## 1st Qu.: 320000 1st Qu.:3.000 1st Qu.:1.750 1st Qu.:1430
## Median : 450000 Median :3.000 Median :2.250 Median :1920
## Mean : 487457 Mean :3.352 Mean :2.094 Mean :2031
## 3rd Qu.: 615000 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.:2510
## Max. :1150000 Max. :9.000 Max. :5.750 Max. :7320
## sqft_lot floors waterfront view
## Min. : 638 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5000 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7566 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 14599 Mean :1.495 Mean :0.003711 Mean :0.1737
## 3rd Qu.: 10696 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1074218 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition sqft_above sqft_basement
## Min. :1.000 Min. : 370 Min. : 0.0
## 1st Qu.:3.000 1st Qu.:1170 1st Qu.: 0.0
## Median :3.000 Median :1540 Median : 0.0
## Mean :3.444 Mean :1747 Mean : 283.7
## 3rd Qu.:4.000 3rd Qu.:2190 3rd Qu.: 560.0
## Max. :5.000 Max. :7320 Max. :2300.0
After removing the outliers, the minimum price is 7800 and the maximum price is 1150000.
One Hot Encoding for Location Features
# Keep only the rows of house_df that are still present in house_df_num,
# matched by row name.
house_df <- house_df[rownames(house_df_num), ]
### 1. Create dataframe for statezip
statezip <- house_df$statezip
# Single-column data frame; the column name "loc." becomes the prefix of the
# generated dummy columns.
statezip_df <- data.frame(loc. = statezip)
### 2. One Hot Encoding the statezip dataframe
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
# Build the encoder, then expand statezip into its dummy columns.
df1 <- dummyVars("~.", data = statezip_df)
df2 <- data.frame(predict(df1, newdata = statezip_df))
### 3. Combine house_df_num dataframe
# Same column-bind as cbind() on data frames (names are left unchecked).
house_df_num <- data.frame(house_df_num, df2, check.names = FALSE)
dim(house_df_num)
## [1] 4311 88
# Seeded 70/30 split of house_df_num into training and test rows.
set.seed(2022)
row <- nrow(house_df_num)
train_idx <- sample.int(row, 0.7 * row)
train_data <- house_df_num[train_idx, ]
test_data <- house_df_num[-train_idx, ]
dim(train_data)
## [1] 3017 88
dim(test_data)
## [1] 1294 88
Create Regression Model
# Fit a linear model of price on every other column of train_data (`.`),
# plus a quadratic term in sqft_living.
# NOTE(review): `sqft_living` and `bedrooms` are already columns of
# train_data, so `.` covers them and naming them again adds no terms.
# NOTE(review): the summary reports 3 coefficients "not defined because of
# singularities" (sqft_basement, loc.WA.98068, loc.WA.98354). Presumably
# sqft_living = sqft_above + sqft_basement, and the full set of one-hot
# location columns is collinear with the intercept -- confirm, then drop the
# redundant columns before fitting.
mymodel <- lm(formula = price~. +
I(sqft_living^2) +
sqft_living + bedrooms, data = train_data)
summary(mymodel)
##
## Call:
## lm(formula = price ~ . + I(sqft_living^2) + sqft_living + bedrooms,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -825231 -53195 -969 50175 448548
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.720e+04 7.255e+04 -0.926 0.354418
## bedrooms -1.362e+04 2.862e+03 -4.760 2.03e-06 ***
## bathrooms 1.422e+04 4.301e+03 3.307 0.000953 ***
## sqft_living 1.676e+02 1.156e+01 14.507 < 2e-16 ***
## sqft_lot 1.843e-01 5.238e-02 3.519 0.000439 ***
## floors -2.456e+04 5.228e+03 -4.699 2.74e-06 ***
## waterfront 1.580e+05 3.035e+04 5.205 2.07e-07 ***
## view 3.376e+04 3.197e+03 10.558 < 2e-16 ***
## condition 2.627e+04 2.965e+03 8.861 < 2e-16 ***
## sqft_above 7.857e+01 6.298e+00 12.477 < 2e-16 ***
## sqft_basement NA NA NA NA
## loc.WA.98001 -1.018e+05 7.237e+04 -1.407 0.159655
## loc.WA.98002 -1.017e+05 7.406e+04 -1.373 0.169741
## loc.WA.98003 -9.118e+04 7.287e+04 -1.251 0.210939
## loc.WA.98004 4.467e+05 7.318e+04 6.105 1.17e-09 ***
## loc.WA.98005 2.526e+05 7.385e+04 3.421 0.000633 ***
## loc.WA.98006 2.147e+05 7.210e+04 2.977 0.002930 **
## loc.WA.98007 2.337e+05 7.372e+04 3.169 0.001544 **
## loc.WA.98008 1.473e+05 7.332e+04 2.010 0.044565 *
## loc.WA.98010 2.707e+04 8.676e+04 0.312 0.755062
## loc.WA.98011 4.764e+04 7.367e+04 0.647 0.517914
## loc.WA.98014 3.412e+04 7.552e+04 0.452 0.651431
## loc.WA.98019 -1.245e+04 7.303e+04 -0.170 0.864642
## loc.WA.98022 -1.005e+05 7.390e+04 -1.361 0.173730
## loc.WA.98023 -1.072e+05 7.191e+04 -1.491 0.135986
## loc.WA.98024 7.457e+04 7.927e+04 0.941 0.346933
## loc.WA.98027 1.025e+05 7.210e+04 1.422 0.155248
## loc.WA.98028 5.407e+04 7.248e+04 0.746 0.455718
## loc.WA.98029 1.668e+05 7.235e+04 2.305 0.021236 *
## loc.WA.98030 -8.562e+04 7.331e+04 -1.168 0.242931
## loc.WA.98031 -8.197e+04 7.234e+04 -1.133 0.257258
## loc.WA.98032 -1.311e+05 7.543e+04 -1.738 0.082391 .
## loc.WA.98033 2.447e+05 7.210e+04 3.394 0.000698 ***
## loc.WA.98034 1.117e+05 7.180e+04 1.555 0.119973
## loc.WA.98038 -6.483e+04 7.193e+04 -0.901 0.367493
## loc.WA.98039 -3.752e+05 1.227e+05 -3.058 0.002251 **
## loc.WA.98040 2.883e+05 7.313e+04 3.942 8.26e-05 ***
## loc.WA.98042 -9.472e+04 7.180e+04 -1.319 0.187163
## loc.WA.98045 5.362e+03 7.275e+04 0.074 0.941245
## loc.WA.98047 -5.793e+04 8.375e+04 -0.692 0.489187
## loc.WA.98050 3.869e+04 1.232e+05 0.314 0.753533
## loc.WA.98051 8.437e+03 8.413e+04 0.100 0.920126
## loc.WA.98052 2.000e+05 7.154e+04 2.795 0.005216 **
## loc.WA.98053 1.722e+05 7.203e+04 2.391 0.016851 *
## loc.WA.98055 -4.763e+04 7.486e+04 -0.636 0.524656
## loc.WA.98056 2.345e+04 7.212e+04 0.325 0.745066
## loc.WA.98057 -7.884e+04 7.650e+04 -1.031 0.302822
## loc.WA.98058 -4.386e+04 7.180e+04 -0.611 0.541289
## loc.WA.98059 3.217e+04 7.172e+04 0.449 0.653754
## loc.WA.98065 4.570e+04 7.237e+04 0.632 0.527751
## loc.WA.98068 NA NA NA NA
## loc.WA.98070 -8.824e+03 7.473e+04 -0.118 0.906013
## loc.WA.98072 1.033e+05 7.250e+04 1.425 0.154395
## loc.WA.98074 1.455e+05 7.205e+04 2.020 0.043466 *
## loc.WA.98075 1.897e+05 7.219e+04 2.627 0.008647 **
## loc.WA.98077 1.115e+05 7.312e+04 1.525 0.127260
## loc.WA.98092 -9.585e+04 7.192e+04 -1.333 0.182762
## loc.WA.98102 3.462e+05 7.664e+04 4.518 6.49e-06 ***
## loc.WA.98103 2.415e+05 7.160e+04 3.373 0.000752 ***
## loc.WA.98105 3.197e+05 7.384e+04 4.330 1.54e-05 ***
## loc.WA.98106 4.222e+04 7.247e+04 0.583 0.560208
## loc.WA.98107 2.471e+05 7.252e+04 3.408 0.000663 ***
## loc.WA.98108 4.752e+04 7.322e+04 0.649 0.516431
## loc.WA.98109 4.118e+05 7.657e+04 5.378 8.12e-08 ***
## loc.WA.98112 3.561e+05 7.317e+04 4.867 1.19e-06 ***
## loc.WA.98115 2.190e+05 7.177e+04 3.052 0.002296 **
## loc.WA.98116 2.235e+05 7.255e+04 3.080 0.002090 **
## loc.WA.98117 2.179e+05 7.166e+04 3.041 0.002381 **
## loc.WA.98118 8.803e+04 7.204e+04 1.222 0.221830
## loc.WA.98119 3.576e+05 7.345e+04 4.869 1.18e-06 ***
## loc.WA.98122 2.505e+05 7.234e+04 3.463 0.000541 ***
## loc.WA.98125 9.288e+04 7.223e+04 1.286 0.198552
## loc.WA.98126 1.129e+05 7.200e+04 1.568 0.116894
## loc.WA.98133 6.475e+04 7.189e+04 0.901 0.367804
## loc.WA.98136 1.814e+05 7.283e+04 2.491 0.012798 *
## loc.WA.98144 1.830e+05 7.226e+04 2.533 0.011359 *
## loc.WA.98146 4.867e+04 7.258e+04 0.671 0.502574
## loc.WA.98148 -2.905e+04 7.756e+04 -0.375 0.707989
## loc.WA.98155 5.062e+04 7.188e+04 0.704 0.481361
## loc.WA.98166 2.944e+04 7.263e+04 0.405 0.685229
## loc.WA.98168 -4.247e+04 7.235e+04 -0.587 0.557208
## loc.WA.98177 1.319e+05 7.354e+04 1.794 0.072956 .
## loc.WA.98178 -6.891e+04 7.291e+04 -0.945 0.344613
## loc.WA.98188 -7.984e+04 7.513e+04 -1.063 0.288015
## loc.WA.98198 -8.127e+04 7.267e+04 -1.118 0.263557
## loc.WA.98199 2.635e+05 7.267e+04 3.625 0.000294 ***
## loc.WA.98288 -3.570e+04 9.144e+04 -0.390 0.696250
## loc.WA.98354 NA NA NA NA
## I(sqft_living^2) -1.462e-02 2.010e-03 -7.272 4.52e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 100100 on 2931 degrees of freedom
## Multiple R-squared: 0.7868, Adjusted R-squared: 0.7806
## F-statistic: 127.2 on 85 and 2931 DF, p-value: < 2.2e-16
# Observed prices of the held-out rows.
actual <- test_data[["price"]]
# Model predictions for the same rows.
pred.myModel <- predict(mymodel, test_data)
## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading
Plot Actual Data VS Predicted Data Prices
# Pair each observed price with its prediction.
price_df <- data.frame(actual = actual, pred.myModel = pred.myModel)
# Scatter plot of predicted against actual price, with both axes limited to
# 0 to 1,500,000.
ggplot(price_df, aes(actual, pred.myModel)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 1500000)) +
  scale_y_continuous(limits = c(0, 1500000))
# Correlation between observed and predicted prices.
with(price_df, cor(actual, pred.myModel))
## [1] 0.8889122
#' Print RMSE and correlation for a set of predictions
#'
#' @param prediction Vector of predicted values.
#' @param actual Vector of observed values, same length as `prediction`.
#' @param method Label for the model, printed on the first line.
#' @return Invisibly, a list with elements `rmse` and `r`. The report itself
#'   is written to the console with `cat()`.
performance <- function(prediction, actual, method) {
  # Fail early on mismatched lengths: the subtraction below would otherwise
  # recycle the shorter vector before cor() rejects the inputs.
  stopifnot(length(prediction) == length(actual))

  error <- prediction - actual
  rmse <- sqrt(mean(error^2))
  r <- cor(prediction, actual)

  result <- paste("Method", method,
                  "\nRMSE =", round(rmse, 3),
                  "\nR =", round(r, 3),
                  "\n")
  cat(result)

  # Hand the unrounded metrics back so callers can reuse them; invisible()
  # keeps the console output unchanged.
  invisible(list(rmse = rmse, r = r))
}
# Report test-set RMSE and correlation for the fitted model.
performance(
  prediction = pred.myModel,
  actual = actual,
  method = "My Regression Model"
)
## Method My Regression Model
## RMSE = 101125.499
## R = 0.889