Description

This project predicts house prices using regression models. The dataset used for the modelling covers houses located in Washington State, USA. The dataset link : Click Here

The project is structured as follows :

  1. Data Understanding

  2. Exploratory Data Analysis

  3. Data Preprocessing

  4. Modeling

  5. Performance Evaluations

1. Data Understanding

Read the house dataset and inspect its structure

# Load the raw listings from "house.csv" in the working directory.
house_df <- utils::read.csv(file = "house.csv")

# Compact overview: one line per column with its type and first values.
utils::str(object = house_df)
## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

The dataset contains 4600 rows and 18 columns. The target variable is price and the remaining columns are the features.

# Per-column summary: quartiles/mean for numeric columns, length/class/mode
# for character columns.
summary(object = house_df)
##      date               price             bedrooms       bathrooms    
##  Length:4600        Min.   :       0   Min.   :0.000   Min.   :0.000  
##  Class :character   1st Qu.:  322875   1st Qu.:3.000   1st Qu.:1.750  
##  Mode  :character   Median :  460943   Median :3.000   Median :2.250  
##                     Mean   :  551963   Mean   :3.401   Mean   :2.161  
##                     3rd Qu.:  654962   3rd Qu.:4.000   3rd Qu.:2.500  
##                     Max.   :26590000   Max.   :9.000   Max.   :8.000  
##   sqft_living       sqft_lot           floors        waterfront      
##  Min.   :  370   Min.   :    638   Min.   :1.000   Min.   :0.000000  
##  1st Qu.: 1460   1st Qu.:   5001   1st Qu.:1.000   1st Qu.:0.000000  
##  Median : 1980   Median :   7683   Median :1.500   Median :0.000000  
##  Mean   : 2139   Mean   :  14852   Mean   :1.512   Mean   :0.007174  
##  3rd Qu.: 2620   3rd Qu.:  11001   3rd Qu.:2.000   3rd Qu.:0.000000  
##  Max.   :13540   Max.   :1074218   Max.   :3.500   Max.   :1.000000  
##       view          condition       sqft_above   sqft_basement   
##  Min.   :0.0000   Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:1190   1st Qu.:   0.0  
##  Median :0.0000   Median :3.000   Median :1590   Median :   0.0  
##  Mean   :0.2407   Mean   :3.452   Mean   :1827   Mean   : 312.1  
##  3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.:2300   3rd Qu.: 610.0  
##  Max.   :4.0000   Max.   :5.000   Max.   :9410   Max.   :4820.0  
##     yr_built     yr_renovated       street              city          
##  Min.   :1900   Min.   :   0.0   Length:4600        Length:4600       
##  1st Qu.:1951   1st Qu.:   0.0   Class :character   Class :character  
##  Median :1976   Median :   0.0   Mode  :character   Mode  :character  
##  Mean   :1971   Mean   : 808.6                                        
##  3rd Qu.:1997   3rd Qu.:1999.0                                        
##  Max.   :2014   Max.   :2014.0                                        
##    statezip           country         
##  Length:4600        Length:4600       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

2. Exploratory Data Analysis

2.1 Univariate Data Analysis

Plot distribution of Price with Boxplot

# Box plot of price. The y axis is limited to 0-2,000,000; the data summary
# shows a maximum price of 26,590,000, so the most extreme listings fall
# outside the plotted range.
ggplot(house_df, mapping = aes(y = price)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 2e6))

2.2 Bivariate Data Analysis

# Factor copies of the discrete / categorical columns, used for grouped plots.
# bedrooms2 is the bedroom count as a factor so that each count gets its own
# box in the plot below (it must be derived from `bedrooms`, not `bathrooms`).
house_df$bedrooms2 <- factor(house_df$bedrooms)
house_df$city2 <- factor(house_df$city)
house_df$statezip2 <- factor(house_df$statezip)
house_df$street2 <- factor(house_df$street)
house_df$country2 <- factor(house_df$country)

# Price distribution per bedroom count, y axis limited to 0-2,000,000.
ggplot(data = house_df,
       aes(x = bedrooms2,
           y = price)) + geom_boxplot() +
  scale_y_continuous(limits = c(0, 2000000))

2.3 Multivariate Data Analysis

# Pairwise Pearson correlations between price, bedrooms and bathrooms.
with(house_df, cor(price, bedrooms))
## [1] 0.2003363
with(house_df, cor(price, bathrooms))
## [1] 0.3271099
with(house_df, cor(bedrooms, bathrooms))
## [1] 0.5459199
# Keep the target and the numeric features (columns 2 to 12 of house_df:
# price through sqft_basement), selected by name.
house_df_num <- house_df[, c(
  "price", "bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors",
  "waterfront", "view", "condition", "sqft_above", "sqft_basement"
)]
# Full correlation matrix of the numeric columns.
r <- cor(house_df_num)

# Correlogram of the same columns, with pie glyphs in the upper triangle.
library(corrgram)
corrgram(house_df_num, order = TRUE, upper.panel = panel.pie)

3. Data Preprocessing

3.1 Data Cleaning

Remove rows with invalid (zero) prices

# Row positions of listings whose recorded price is 0.
idx <- which(house_df_num$price %in% c(0))

# Only subset when something matched: with an empty `idx`,
# house_df_num[-idx, ] would select zero rows and wipe the whole data frame.
if (length(idx) > 0) {
  house_df_num <- house_df_num[-idx, ]
}
summary(house_df_num$price)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7800   326264   465000   557906   657500 26590000

Remove rows with outlier prices

# Prices flagged as outliers by the box-plot rule (boxplot.stats()$out).
out_price <- boxplot.stats(house_df_num$price)$out

# Row positions of the listings carrying one of those outlier prices.
out_idx <- which(house_df_num$price %in% c(out_price))

# Only subset when outliers exist: with an empty `out_idx`,
# house_df_num[-out_idx, ] would select zero rows and drop all the data.
if (length(out_idx) > 0) {
  house_df_num <- house_df_num[-out_idx, ]
}

summary(house_df_num)
##      price            bedrooms       bathrooms      sqft_living  
##  Min.   :   7800   Min.   :0.000   Min.   :0.000   Min.   : 370  
##  1st Qu.: 320000   1st Qu.:3.000   1st Qu.:1.750   1st Qu.:1430  
##  Median : 450000   Median :3.000   Median :2.250   Median :1920  
##  Mean   : 487457   Mean   :3.352   Mean   :2.094   Mean   :2031  
##  3rd Qu.: 615000   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.:2510  
##  Max.   :1150000   Max.   :9.000   Max.   :5.750   Max.   :7320  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    638   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5000   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7566   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  14599   Mean   :1.495   Mean   :0.003711   Mean   :0.1737  
##  3rd Qu.:  10696   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1074218   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition       sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.:1170   1st Qu.:   0.0  
##  Median :3.000   Median :1540   Median :   0.0  
##  Mean   :3.444   Mean   :1747   Mean   : 283.7  
##  3rd Qu.:4.000   3rd Qu.:2190   3rd Qu.: 560.0  
##  Max.   :5.000   Max.   :7320   Max.   :2300.0

After cleaning, the minimum price is 7800 and the maximum price is 1150000.

3.2 Feature Engineering

One Hot Encoding for Location Features

# Keep only the rows of house_df that survived cleaning in house_df_num,
# matched through their row names, so both frames stay aligned row by row.
house_df <- house_df[rownames(house_df_num), ]

### 1. Create dataframe for statezip
# Single-column data frame named "loc."; the name becomes the prefix of the
# dummy columns created in the next step.
statezip <- house_df[["statezip"]]
statezip_df <- data.frame(loc. = statezip)

### 2. One Hot Encoding the statezip dataframe
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
# Build the encoder and expand statezip into 0/1 indicator columns.
# data.frame() is used on the prediction matrix so the column names become
# syntactic (e.g. "loc.WA.98001").
# NOTE(review): every zip code appears to get its own column (no reference
# level dropped), which lines up with the "not defined because of
# singularities" rows in the model summary - consider fullRank = TRUE.
df1 <- dummyVars("~.", data = statezip_df)
df2 <- data.frame(predict(df1, statezip_df))

### 3. Combine house_df_num dataframe
# Column-bind by position; both frames hold the same rows in the same order.
house_df_num <- cbind(house_df_num, df2)
dim(house_df_num)
## [1] 4311   88

3.3 Training and Testing Split

# Reproducible 70/30 split of the prepared data.
set.seed(2022)
row <- nrow(house_df_num)

# Draw 70% of the row positions (the fractional size is truncated by sample()).
train_idx <- sample(row, 0.7 * row)

# Sampled rows form the training set; the remaining rows form the test set.
train_data <- house_df_num[train_idx, ]
test_data <- house_df_num[-train_idx, ]

dim(train_data)
## [1] 3017   88
dim(test_data)
## [1] 1294   88

4. Modeling

Create Regression Model

# Fit a linear regression of price on every other column of train_data
# (the `.` term), plus a quadratic term for living area.
# NOTE(review): `.` already expands to all remaining columns, so the explicit
# `sqft_living` and `bedrooms` terms look redundant - each appears only once
# in the coefficient table printed below.
mymodel <- lm(formula = price~. +
                I(sqft_living^2) +
                sqft_living + bedrooms, data = train_data)

# Coefficient table and fit statistics; coefficients that could not be
# estimated because of singularities are reported as NA.
summary(mymodel)
## 
## Call:
## lm(formula = price ~ . + I(sqft_living^2) + sqft_living + bedrooms, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -825231  -53195    -969   50175  448548 
## 
## Coefficients: (3 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -6.720e+04  7.255e+04  -0.926 0.354418    
## bedrooms         -1.362e+04  2.862e+03  -4.760 2.03e-06 ***
## bathrooms         1.422e+04  4.301e+03   3.307 0.000953 ***
## sqft_living       1.676e+02  1.156e+01  14.507  < 2e-16 ***
## sqft_lot          1.843e-01  5.238e-02   3.519 0.000439 ***
## floors           -2.456e+04  5.228e+03  -4.699 2.74e-06 ***
## waterfront        1.580e+05  3.035e+04   5.205 2.07e-07 ***
## view              3.376e+04  3.197e+03  10.558  < 2e-16 ***
## condition         2.627e+04  2.965e+03   8.861  < 2e-16 ***
## sqft_above        7.857e+01  6.298e+00  12.477  < 2e-16 ***
## sqft_basement            NA         NA      NA       NA    
## loc.WA.98001     -1.018e+05  7.237e+04  -1.407 0.159655    
## loc.WA.98002     -1.017e+05  7.406e+04  -1.373 0.169741    
## loc.WA.98003     -9.118e+04  7.287e+04  -1.251 0.210939    
## loc.WA.98004      4.467e+05  7.318e+04   6.105 1.17e-09 ***
## loc.WA.98005      2.526e+05  7.385e+04   3.421 0.000633 ***
## loc.WA.98006      2.147e+05  7.210e+04   2.977 0.002930 ** 
## loc.WA.98007      2.337e+05  7.372e+04   3.169 0.001544 ** 
## loc.WA.98008      1.473e+05  7.332e+04   2.010 0.044565 *  
## loc.WA.98010      2.707e+04  8.676e+04   0.312 0.755062    
## loc.WA.98011      4.764e+04  7.367e+04   0.647 0.517914    
## loc.WA.98014      3.412e+04  7.552e+04   0.452 0.651431    
## loc.WA.98019     -1.245e+04  7.303e+04  -0.170 0.864642    
## loc.WA.98022     -1.005e+05  7.390e+04  -1.361 0.173730    
## loc.WA.98023     -1.072e+05  7.191e+04  -1.491 0.135986    
## loc.WA.98024      7.457e+04  7.927e+04   0.941 0.346933    
## loc.WA.98027      1.025e+05  7.210e+04   1.422 0.155248    
## loc.WA.98028      5.407e+04  7.248e+04   0.746 0.455718    
## loc.WA.98029      1.668e+05  7.235e+04   2.305 0.021236 *  
## loc.WA.98030     -8.562e+04  7.331e+04  -1.168 0.242931    
## loc.WA.98031     -8.197e+04  7.234e+04  -1.133 0.257258    
## loc.WA.98032     -1.311e+05  7.543e+04  -1.738 0.082391 .  
## loc.WA.98033      2.447e+05  7.210e+04   3.394 0.000698 ***
## loc.WA.98034      1.117e+05  7.180e+04   1.555 0.119973    
## loc.WA.98038     -6.483e+04  7.193e+04  -0.901 0.367493    
## loc.WA.98039     -3.752e+05  1.227e+05  -3.058 0.002251 ** 
## loc.WA.98040      2.883e+05  7.313e+04   3.942 8.26e-05 ***
## loc.WA.98042     -9.472e+04  7.180e+04  -1.319 0.187163    
## loc.WA.98045      5.362e+03  7.275e+04   0.074 0.941245    
## loc.WA.98047     -5.793e+04  8.375e+04  -0.692 0.489187    
## loc.WA.98050      3.869e+04  1.232e+05   0.314 0.753533    
## loc.WA.98051      8.437e+03  8.413e+04   0.100 0.920126    
## loc.WA.98052      2.000e+05  7.154e+04   2.795 0.005216 ** 
## loc.WA.98053      1.722e+05  7.203e+04   2.391 0.016851 *  
## loc.WA.98055     -4.763e+04  7.486e+04  -0.636 0.524656    
## loc.WA.98056      2.345e+04  7.212e+04   0.325 0.745066    
## loc.WA.98057     -7.884e+04  7.650e+04  -1.031 0.302822    
## loc.WA.98058     -4.386e+04  7.180e+04  -0.611 0.541289    
## loc.WA.98059      3.217e+04  7.172e+04   0.449 0.653754    
## loc.WA.98065      4.570e+04  7.237e+04   0.632 0.527751    
## loc.WA.98068             NA         NA      NA       NA    
## loc.WA.98070     -8.824e+03  7.473e+04  -0.118 0.906013    
## loc.WA.98072      1.033e+05  7.250e+04   1.425 0.154395    
## loc.WA.98074      1.455e+05  7.205e+04   2.020 0.043466 *  
## loc.WA.98075      1.897e+05  7.219e+04   2.627 0.008647 ** 
## loc.WA.98077      1.115e+05  7.312e+04   1.525 0.127260    
## loc.WA.98092     -9.585e+04  7.192e+04  -1.333 0.182762    
## loc.WA.98102      3.462e+05  7.664e+04   4.518 6.49e-06 ***
## loc.WA.98103      2.415e+05  7.160e+04   3.373 0.000752 ***
## loc.WA.98105      3.197e+05  7.384e+04   4.330 1.54e-05 ***
## loc.WA.98106      4.222e+04  7.247e+04   0.583 0.560208    
## loc.WA.98107      2.471e+05  7.252e+04   3.408 0.000663 ***
## loc.WA.98108      4.752e+04  7.322e+04   0.649 0.516431    
## loc.WA.98109      4.118e+05  7.657e+04   5.378 8.12e-08 ***
## loc.WA.98112      3.561e+05  7.317e+04   4.867 1.19e-06 ***
## loc.WA.98115      2.190e+05  7.177e+04   3.052 0.002296 ** 
## loc.WA.98116      2.235e+05  7.255e+04   3.080 0.002090 ** 
## loc.WA.98117      2.179e+05  7.166e+04   3.041 0.002381 ** 
## loc.WA.98118      8.803e+04  7.204e+04   1.222 0.221830    
## loc.WA.98119      3.576e+05  7.345e+04   4.869 1.18e-06 ***
## loc.WA.98122      2.505e+05  7.234e+04   3.463 0.000541 ***
## loc.WA.98125      9.288e+04  7.223e+04   1.286 0.198552    
## loc.WA.98126      1.129e+05  7.200e+04   1.568 0.116894    
## loc.WA.98133      6.475e+04  7.189e+04   0.901 0.367804    
## loc.WA.98136      1.814e+05  7.283e+04   2.491 0.012798 *  
## loc.WA.98144      1.830e+05  7.226e+04   2.533 0.011359 *  
## loc.WA.98146      4.867e+04  7.258e+04   0.671 0.502574    
## loc.WA.98148     -2.905e+04  7.756e+04  -0.375 0.707989    
## loc.WA.98155      5.062e+04  7.188e+04   0.704 0.481361    
## loc.WA.98166      2.944e+04  7.263e+04   0.405 0.685229    
## loc.WA.98168     -4.247e+04  7.235e+04  -0.587 0.557208    
## loc.WA.98177      1.319e+05  7.354e+04   1.794 0.072956 .  
## loc.WA.98178     -6.891e+04  7.291e+04  -0.945 0.344613    
## loc.WA.98188     -7.984e+04  7.513e+04  -1.063 0.288015    
## loc.WA.98198     -8.127e+04  7.267e+04  -1.118 0.263557    
## loc.WA.98199      2.635e+05  7.267e+04   3.625 0.000294 ***
## loc.WA.98288     -3.570e+04  9.144e+04  -0.390 0.696250    
## loc.WA.98354             NA         NA      NA       NA    
## I(sqft_living^2) -1.462e-02  2.010e-03  -7.272 4.52e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 100100 on 2931 degrees of freedom
## Multiple R-squared:  0.7868, Adjusted R-squared:  0.7806 
## F-statistic: 127.2 on 85 and 2931 DF,  p-value: < 2.2e-16

5. Performance Evaluations

# Observed prices of the held-out rows and the model's predictions for them.
actual <- test_data[["price"]]
pred.myModel <- predict(mymodel, test_data)
## Warning in predict.lm(mymodel, test_data): prediction from a rank-deficient fit
## may be misleading

Plot Actual Data VS Predicted Data Prices

# Pair each observed price with its prediction.
price_df <- data.frame(actual = actual, pred.myModel = pred.myModel)

# Scatter plot of predicted against actual price, both axes limited to
# 0-1,500,000.
ggplot(price_df, aes(actual, pred.myModel)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 1.5e6)) +
  scale_y_continuous(limits = c(0, 1.5e6))

# Pearson correlation between actual and predicted prices.
with(price_df, cor(actual, pred.myModel))
## [1] 0.8889122
#' Report RMSE and Pearson correlation for a set of predictions
#'
#' Prints a three-line report (method label, RMSE, correlation) with `cat()`
#' and invisibly returns the unrounded metrics so callers can reuse them.
#'
#' @param prediction Numeric vector of predicted values.
#' @param actual Numeric vector of observed values, same length as
#'   `prediction`.
#' @param method Label printed on the first line of the report.
#' @return Invisibly, a list with elements `rmse` and `r`.
performance <- function(prediction, actual, method){
  # Fail early with a clear message instead of relying on vector recycling
  # in the subtraction below.
  stopifnot(length(prediction) == length(actual))

  error <- prediction - actual
  rmse <- sqrt(mean(error^2))
  r <- cor(prediction, actual)

  # paste() (rather than passing the pieces straight to cat()) keeps the
  # numbers formatted with as.character(), i.e. with all rounded digits.
  result <- paste("Method", method,
                  "\nRMSE =", round(rmse, 3),
                  "\nR =", round(r, 3),
                  "\n")

  cat(result)
  invisible(list(rmse = rmse, r = r))
}

# Report test-set RMSE and correlation for the fitted regression model.
performance(
  prediction = pred.myModel,
  actual = actual,
  method = "My Regression Model"
)
## Method My Regression Model 
## RMSE = 101125.499 
## R = 0.889