Description

This report predicts house prices using regression algorithms. The dataset used for modeling contains house sales in Washington State, USA (the `statezip` values start with "WA" and `country` is "USA").

The dataset Link: Here

1. Data Extraction

Import Libraries

Read house datasets

# Report the current working directory, then switch to the folder that holds
# the data file and load it.
# NOTE(review): the absolute path below is specific to one machine.
getwd()
## [1] "D:/Document/R/Final Project"
setwd(file.path("D:", "Document", "R"))
house_df <- read.csv(file = "house.csv")

# Inspect the column names and types of the loaded data frame.
str(object = house_df)
## 'data.frame':    4600 obs. of  18 variables:
##  $ date         : chr  "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" "2014-05-02 00:00:00" ...
##  $ price        : num  313000 2384000 342000 420000 550000 ...
##  $ bedrooms     : num  3 5 3 3 4 2 2 4 3 4 ...
##  $ bathrooms    : num  1.5 2.5 2 2.25 2.5 1 2 2.5 2.5 2 ...
##  $ sqft_living  : int  1340 3650 1930 2000 1940 880 1350 2710 2430 1520 ...
##  $ sqft_lot     : int  7912 9050 11947 8030 10500 6380 2560 35868 88426 6200 ...
##  $ floors       : num  1.5 2 1 1 1 1 1 2 1 1.5 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 4 0 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 5 4 4 4 3 3 3 4 3 ...
##  $ sqft_above   : int  1340 3370 1930 1000 1140 880 1350 2710 1570 1520 ...
##  $ sqft_basement: int  0 280 0 1000 800 0 0 0 860 0 ...
##  $ yr_built     : int  1955 1921 1966 1963 1976 1938 1976 1989 1985 1945 ...
##  $ yr_renovated : int  2005 0 0 0 1992 1994 0 0 0 2010 ...
##  $ street       : chr  "18810 Densmore Ave N" "709 W Blaine St" "26206-26214 143rd Ave SE" "857 170th Pl NE" ...
##  $ city         : chr  "Shoreline" "Seattle" "Kent" "Bellevue" ...
##  $ statezip     : chr  "WA 98133" "WA 98119" "WA 98042" "WA 98008" ...
##  $ country      : chr  "USA" "USA" "USA" "USA" ...

This dataset contains 4600 rows and 18 columns. The target variable is `price`.

2. Exploratory Data Analysis

Plot Distribution of Price (Boxplot)

# Boxplot of sale prices, zoomed to the 0-2,000,000 range.
# coord_cartesian() only zooms the view. Setting limits on the y scale
# instead would discard every row above the limit *before* the box statistics
# are computed, so the median, quartiles and whiskers would describe a
# truncated sample (and ggplot2 would warn about removed rows).
ggplot(house_df, aes(y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

# Categorical (factor) copies of the bedroom count and the location columns.
house_df$bedrooms2 <- factor(house_df$bedrooms)
house_df$city2 <- factor(house_df$city)
house_df$statezip2 <- factor(house_df$statezip)
house_df$street2 <- factor(house_df$street)
house_df$country2 <- factor(house_df$country)

# Price distribution per bedroom count, zoomed to the 0-2,000,000 range.
# coord_cartesian() zooms without dropping data; limits on the y scale would
# remove the expensive houses before each box's statistics are computed.
ggplot(data = house_df, aes(x = bedrooms2, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 2000000))

3. Data Preprocessing

3.1 Data Cleaning

Remove rows with incorrect prices

# Keep columns 2-12: price plus the ten numeric house attributes
# (bedrooms ... sqft_basement).
house_df_num <- house_df[, 2:12]

# A price of 0 is not a real sale price, so those rows are dropped.
# Filter with a logical mask rather than negative indices: when no row
# matches, which() returns integer(0) and x[-integer(0), ] selects ZERO rows,
# which would silently empty the data frame. The mask also preserves the
# original row names, which later steps rely on.
is_zero_price <- house_df_num$price %in% c(0)
idx <- which(is_zero_price)

house_df_num <- house_df_num[!is_zero_price, ]
summary(house_df_num$price)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     7800   326264   465000   557906   657500 26590000

Remove rows with outliers

# Prices flagged as outliers by the boxplot rule (boxplot.stats()$out).
out_price <- boxplot.stats(house_df_num$price)$out

# Drop the flagged rows with a logical mask. Negative indexing with an empty
# index vector (no outliers found) would select zero rows and wipe the data
# frame; the mask keeps every row in that case.
is_outlier <- house_df_num$price %in% c(out_price)
out_idx <- which(is_outlier)

house_df_num <- house_df_num[!is_outlier, ]
summary(house_df_num)
##      price            bedrooms       bathrooms      sqft_living  
##  Min.   :   7800   Min.   :0.000   Min.   :0.000   Min.   : 370  
##  1st Qu.: 320000   1st Qu.:3.000   1st Qu.:1.750   1st Qu.:1430  
##  Median : 450000   Median :3.000   Median :2.250   Median :1920  
##  Mean   : 487457   Mean   :3.352   Mean   :2.094   Mean   :2031  
##  3rd Qu.: 615000   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.:2510  
##  Max.   :1150000   Max.   :9.000   Max.   :5.750   Max.   :7320  
##     sqft_lot           floors        waterfront            view       
##  Min.   :    638   Min.   :1.000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:   5000   1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :   7566   Median :1.500   Median :0.000000   Median :0.0000  
##  Mean   :  14599   Mean   :1.495   Mean   :0.003711   Mean   :0.1737  
##  3rd Qu.:  10696   3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :1074218   Max.   :3.500   Max.   :1.000000   Max.   :4.0000  
##    condition       sqft_above   sqft_basement   
##  Min.   :1.000   Min.   : 370   Min.   :   0.0  
##  1st Qu.:3.000   1st Qu.:1170   1st Qu.:   0.0  
##  Median :3.000   Median :1540   Median :   0.0  
##  Mean   :3.444   Mean   :1747   Mean   : 283.7  
##  3rd Qu.:4.000   3rd Qu.:2190   3rd Qu.: 560.0  
##  Max.   :5.000   Max.   :7320   Max.   :2300.0

After preprocessing, the data has 4311 rows.

3.2 Feature Extraction

One-Hot Encoding for location features

# Restrict house_df to the rows kept in house_df_num (matched by row name).
house_df <- house_df[rownames(house_df_num), ]
### 1. dataframe statezip
statezip <- house_df$statezip
# Single-column data frame named "loc." so the dummy columns get that prefix.
statezip_df <- setNames(data.frame(statezip), "loc.")

### 2. One HOT encoding statezip dataframe
library(caret)
## Loading required package: lattice
df1 <- dummyVars("~.", data = statezip_df)
df2 <- data.frame(predict(df1, newdata = statezip_df))

# Append the indicator columns to the numeric features (rows align by position).
house_df_num <- cbind(house_df_num, df2)

### 3. Training and testing split
d <- dim(house_df_num)
m <- d[[1]]

# Fixed seed so the 85% / 15% split is reproducible.
set.seed(2022)
train_idx <- sample(m, size = 0.85 * m)
train_idx[seq_len(5)]
## [1] 1459 2871 3915  708 2751
# Sampled rows form the training set; the remaining rows form the test set.
train_data <- house_df_num[train_idx, ]
test_data <- house_df_num[-train_idx, ]

4. Modeling

Create Regression Models

# Fit a linear model of price on every other column of the training data.
mymodel <- lm(
  formula = price ~ .,
  data = train_data
)

5. Evaluation

# Observed prices of the held-out rows.
actual <- test_data[["price"]]

# Model predictions for the same held-out rows.
pred.mymodel <- predict(mymodel, newdata = test_data)

Plot Actual vs. Predicted Prices

# Pair each held-out price with its prediction and draw one against the other,
# with both axes limited to the 0-2,000,000 range.
price_df <- data.frame(actual, pred.mymodel)
ggplot(data = price_df, mapping = aes(x = actual, y = pred.mymodel)) +
  geom_point() +
  scale_x_continuous(limits = c(0, 2000000)) +
  scale_y_continuous(limits = c(0, 2000000))

### Function Performance
# Print the root-mean-squared error and the Pearson correlation between
# predicted and actual values.
#
# pred   : numeric vector of predictions.
# actual : numeric vector of observed values, same length as `pred`.
# method : label printed on the first line of the report.
#
# Writes a three-line summary with cat(); there is no useful return value.
performance <- function(pred, actual, method) {
  # Guard against silent recycling in `pred - actual`.
  stopifnot(length(pred) == length(actual))

  # The line labelled "RMSE" must show the square root of the mean squared
  # error; the previous version printed the MSE itself under that label.
  rmse <- sqrt(mean((pred - actual)^2))

  r <- cor(pred, actual)

  result <- paste("Method  : ", method,
                  "\n RMSE : ", round(rmse, 3),
                  "\n R    : ", round(r, 3))
  cat(result)
}

# mymodel is fit with lm(price ~ .), i.e. a multiple linear regression, so it
# is reported under that name rather than as a polynomial model.
performance(pred.mymodel, actual, "Linear Regression")
## Method  :  Linear Regression 
##  RMSE :  11139923787.583 
##  R    :  0.886