Description

This document demonstrates house price prediction using regression algorithms. The dataset used for modeling contains house sale records from Washington State, USA (the sample rows below list cities such as Seattle, Shoreline, Kent and Bellevue).

The dataset link: Here

1. Data Extraction

Import Libraries

library(ggplot2)

Read the house dataset

# Load the raw house table from "house.csv" (resolved against the current
# working directory) and print a compact overview of its columns.
house_df <- read.csv(file = "house.csv")
str(object = house_df)
## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

This dataset contains 4600 rows and 18 columns. The target variable is price.

2. Exploratory Data Analysis

Plot Distribution of price (Boxplot)

# Box plot of the sale-price distribution, zoomed to 0 - 2,000,000.
# coord_cartesian() only restricts the visible y-range. Setting
# scale_y_continuous(limits = ...) instead would turn every price above the
# limit into NA *before* the box-plot statistics are computed, so the
# median, hinges and whiskers would be drawn from truncated data.
ggplot(house_df, aes(y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

Convert the features to factor variables

# Categorical (factor) copies of selected columns, stored next to the
# originals under a "2" suffix.
house_df$bedrooms2 <- factor(house_df$bedrooms)
house_df$city2 <- factor(house_df$city)
house_df$statezip2 <- factor(house_df$statezip)
house_df$street2 <- factor(house_df$street)
house_df$country2 <- factor(house_df$country)

# Price distribution per bedroom count, zoomed to 0 - 2,000,000.
# coord_cartesian() zooms without discarding data; limits set through
# scale_y_continuous() would drop prices above the limit before the
# per-group box-plot statistics are computed and distort the boxes.
ggplot(data = house_df, aes(x = bedrooms2, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

3. Data Preprocessing

3.1 Data Cleansing

Remove rows with incorrect price

# Keep columns 2-12 (price through sqft_basement in the str() listing).
house_df_num <- house_df[, 2:12]

# Rows whose price is exactly 0 are treated as invalid records.
idx <- which(house_df_num$price %in% c(0))

# Select the rows to KEEP instead of using `house_df_num[-idx, ]`: when no
# row matches, `idx` is integer(0) and `-integer(0)` selects zero rows,
# which would silently empty the whole table. Row names are preserved.
keep_rows <- setdiff(seq_len(nrow(house_df_num)), idx)
house_df_num <- house_df_num[keep_rows, ]
summary(house_df_num$price)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7800   326264   465000   557906   657500 26590000

Remove rows with outliers

# Prices flagged as outliers by the box-plot rule of boxplot.stats().
out_price <- boxplot.stats(house_df_num$price)$out
out_idx <- which(house_df_num$price %in% c(out_price))

# Select the rows to KEEP instead of using `house_df_num[-out_idx, ]`: with
# no outliers, `out_idx` is integer(0) and the negative index would drop
# every row rather than none. Row names are preserved.
keep_rows <- setdiff(seq_len(nrow(house_df_num)), out_idx)
house_df_num <- house_df_num[keep_rows, ]
summary(house_df_num$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7800  320000  450000  487457  615000 1150000

The preprocessed data has 4311 rows.

3.2 Feature Extraction

One Hot Encoding for location features

# Restrict the full table to the rows still present in the cleaned numeric
# table; the row names carried through the cleansing steps act as the key.
house_df <- house_df[rownames(house_df_num), ]

### 1. Single-column data frame holding the state/zip code, named "loc."
statezip <- house_df$statezip
statezip_df <- setNames(data.frame(statezip), "loc.")

### 2. One-hot encode the state/zip data frame
library(caret)
## Loading required package: lattice
# NOTE(review): every level gets its own indicator column (no
# `fullRank = TRUE`), so the indicators sum to 1 in each row and are
# collinear with the intercept of a later `lm()` fit -- confirm intended.
df1 <- dummyVars("~.", data = statezip_df)
df2 <- data.frame(predict(df1, newdata = statezip_df))

# Append the indicator columns to the numeric features.
house_df_num <- cbind(house_df_num, df2)

### 3.Training and Testing Split
# Number of rows available for modeling.
d <- dim(house_df_num)
m <- d[1]

# Fixed seed so the random split is reproducible.
set.seed(2022)

# Draw 85% of the row positions for training; the rest form the test set.
train_idx <- sample(x = m, size = 0.85 * m)
train_idx[seq_len(5)]
## [1] 1459 2871 3915  708 2751
train_data <- house_df_num[train_idx, ]
test_data <- house_df_num[-train_idx, ]

4. Modeling

Create Regression Models

# Fit a linear model (lm) of price on every other column of train_data
# (`price ~ .`), i.e. the numeric features plus the state/zip indicators.
mymodel <- lm(formula = price~., data = train_data)

5. Evaluations

# Observed prices of the held-out rows.
actual <- test_data[["price"]]

# Model predictions for the same held-out rows.
pred.mymodel <- predict(mymodel, newdata = test_data)

Plot Actual vs. Predicted Prices

# Pair each observed price with its prediction for plotting.
price_df <- data.frame(actual, pred.mymodel)

# Scatter plot of predicted against observed prices; both axes share the
# same 0 - 2,000,000 range.
ggplot(data = price_df, mapping = aes(x = actual, y = pred.mymodel)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 2e6)) +
  scale_y_continuous(limits = c(0, 2e6))

# Report how well a set of predictions matches the observed values.
#
# Prints a three-line report (method label, root-mean-squared error and the
# Pearson correlation between predictions and observations, both rounded to
# three decimals) and returns the unrounded metrics for further use.
#
# Args:
#   pred:   numeric vector of predicted values.
#   actual: numeric vector of observed values, same length as `pred`.
#   method: label printed on the first line of the report.
#
# Returns:
#   Invisibly, a list with elements `method`, `rmse` and `r`.
performance <- function(pred, actual, method){
  # Validate up front so a length mismatch is reported clearly instead of
  # surfacing later as an "incompatible dimensions" error from cor().
  stopifnot(
    is.numeric(pred),
    is.numeric(actual),
    length(pred) == length(actual)
  )

  rmse <- sqrt(mean((pred - actual)^2))
  r <- cor(pred, actual)

  result <- paste("Method : ", method,
                  "\nRMSE   : ", round(rmse,3),
                  "\nR      : ", round(r,3))

  # End the report with a newline so subsequent console output starts on
  # its own line.
  cat(result, "\n", sep = "")

  # Hand the metrics back so callers can compare models programmatically.
  invisible(list(method = method, rmse = rmse, r = r))
}

# The model is a plain lm() fit of `price ~ .` with no polynomial terms, so
# it is reported as a linear regression.
performance(pred.mymodel, actual, "Linear Regression")
## Method :  Linear Regression 
## RMSE   :  105545.837 
## R      :  0.886