Multiple Regression

Real estate price prediction: This real estate dataset is suited to regression analysis, including simple linear regression, multiple regression, and prediction models. It covers the transaction date, the age of the house, the distance to the nearest MRT station, the number of nearby convenience stores, the location (latitude and longitude), and the house price per unit area.

The data set was obtained from https://www.kaggle.com/datasets/quantbruce/real-estate-price-prediction

# Load Libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: package 'readr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read the real-estate data from the CSV file hosted on GitHub
real_estate <- read.csv(
  file = "https://raw.githubusercontent.com/IvanGrozny88/DATA605Assignment_12/main/Real%20estate.csv"
)
# Preview the first five rows
head(x = real_estate, n = 5)
##   No X1.transaction.date X2.house.age X3.distance.to.the.nearest.MRT.station
## 1  1            2012.917         32.0                               84.87882
## 2  2            2012.917         19.5                              306.59470
## 3  3            2013.583         13.3                              561.98450
## 4  4            2013.500         13.3                              561.98450
## 5  5            2012.833          5.0                              390.56840
##   X4.number.of.convenience.stores X5.latitude X6.longitude
## 1                              10    24.98298     121.5402
## 2                               9    24.98034     121.5395
## 3                               5    24.98746     121.5439
## 4                               5    24.98746     121.5439
## 5                               5    24.97937     121.5425
##   Y.house.price.of.unit.area
## 1                       37.9
## 2                       42.2
## 3                       47.3
## 4                       54.8
## 5                       43.1

This dataset has no missing values.

# Check the whole data set for missing values.
# head(is.na(...)) would only show the first six rows, so count the NA cells
# across every row and column instead; a total of 0 means nothing is missing.
sum(is.na(real_estate))
## [1] 0

There are 414 rows and 8 columns in this data set.

# Summary of data set: minimum, quartiles, median, mean and maximum
# of every column
summary(real_estate)
##        No        X1.transaction.date  X2.house.age   
##  Min.   :  1.0   Min.   :2013        Min.   : 0.000  
##  1st Qu.:104.2   1st Qu.:2013        1st Qu.: 9.025  
##  Median :207.5   Median :2013        Median :16.100  
##  Mean   :207.5   Mean   :2013        Mean   :17.713  
##  3rd Qu.:310.8   3rd Qu.:2013        3rd Qu.:28.150  
##  Max.   :414.0   Max.   :2014        Max.   :43.800  
##  X3.distance.to.the.nearest.MRT.station X4.number.of.convenience.stores
##  Min.   :  23.38                        Min.   : 0.000                 
##  1st Qu.: 289.32                        1st Qu.: 1.000                 
##  Median : 492.23                        Median : 4.000                 
##  Mean   :1083.89                        Mean   : 4.094                 
##  3rd Qu.:1454.28                        3rd Qu.: 6.000                 
##  Max.   :6488.02                        Max.   :10.000                 
##   X5.latitude     X6.longitude   Y.house.price.of.unit.area
##  Min.   :24.93   Min.   :121.5   Min.   :  7.60            
##  1st Qu.:24.96   1st Qu.:121.5   1st Qu.: 27.70            
##  Median :24.97   Median :121.5   Median : 38.45            
##  Mean   :24.97   Mean   :121.5   Mean   : 37.98            
##  3rd Qu.:24.98   3rd Qu.:121.5   3rd Qu.: 46.60            
##  Max.   :25.01   Max.   :121.6   Max.   :117.50
# List the current column headers before renaming them
names(real_estate)
## [1] "No"                                    
## [2] "X1.transaction.date"                   
## [3] "X2.house.age"                          
## [4] "X3.distance.to.the.nearest.MRT.station"
## [5] "X4.number.of.convenience.stores"       
## [6] "X5.latitude"                           
## [7] "X6.longitude"                          
## [8] "Y.house.price.of.unit.area"
# Swap the auto-generated headers for shorter, descriptive names
# (assigned by position, in the original column order)
real_estate <- setNames(
  real_estate,
  c(
    "No", "Transaction_Date", "House_Age", "Nearest_MRT_Station",
    "Number_Convenience_Stores", "Latitude", "Longitude", "House_Price"
  )
)
# Show the first six rows under the new names
head(real_estate, n = 6)
##   No Transaction_Date House_Age Nearest_MRT_Station Number_Convenience_Stores
## 1  1         2012.917      32.0            84.87882                        10
## 2  2         2012.917      19.5           306.59470                         9
## 3  3         2013.583      13.3           561.98450                         5
## 4  4         2013.500      13.3           561.98450                         5
## 5  5         2012.833       5.0           390.56840                         5
## 6  6         2012.667       7.1          2175.03000                         3
##   Latitude Longitude House_Price
## 1 24.98298  121.5402        37.9
## 2 24.98034  121.5395        42.2
## 3 24.98746  121.5439        47.3
## 4 24.98746  121.5439        54.8
## 5 24.97937  121.5425        43.1
## 6 24.96305  121.5125        32.1

Regression Analysis: I started by fitting a simple linear model of house price on house age. The multiple R² is 0.04434, so according to the summary the model explains only about 4.43% of the variance in house price. The House_Age coefficient is nevertheless statistically significant, with a p-value of 1.56e-05, which is less than 0.05.

# Fit the simple linear model: house price explained by house age alone
my_lm <- lm(formula = House_Price ~ House_Age, data = real_estate)
# Coefficients, residual spread and R-squared of the fit
summary(my_lm)
## 
## Call:
## lm(formula = House_Price ~ House_Age, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.113 -10.738   1.626   8.199  77.781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.43470    1.21098  35.042  < 2e-16 ***
## House_Age   -0.25149    0.05752  -4.372 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared:  0.04434,    Adjusted R-squared:  0.04202 
## F-statistic: 19.11 on 1 and 412 DF,  p-value: 1.56e-05

Multiple Regression: To see how the two models differ from one another, I fitted a second model, this time adding the remaining predictors. The residuals, coefficients, and multiple R² already differ from those of the simple model. The multiple R² is now 0.5824 (58.24%), and the overall p-value is less than 2.2e-16.

# Fit the multiple regression of house price on all six predictors
my_lm2 <- lm(
  formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station +
    Number_Convenience_Stores + Latitude + Longitude,
  data = real_estate
)
# Coefficients, residual spread and R-squared of the fit
summary(my_lm2)
## 
## Call:
## lm(formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station + 
##     Number_Convenience_Stores + Latitude + Longitude, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.664  -5.410  -0.966   4.217  75.193 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.444e+04  6.776e+03  -2.131  0.03371 *  
## Transaction_Date           5.146e+00  1.557e+00   3.305  0.00103 ** 
## House_Age                 -2.697e-01  3.853e-02  -7.000 1.06e-11 ***
## Nearest_MRT_Station       -4.488e-03  7.180e-04  -6.250 1.04e-09 ***
## Number_Convenience_Stores  1.133e+00  1.882e-01   6.023 3.84e-09 ***
## Latitude                   2.255e+02  4.457e+01   5.059 6.38e-07 ***
## Longitude                 -1.242e+01  4.858e+01  -0.256  0.79829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.858 on 407 degrees of freedom
## Multiple R-squared:  0.5824, Adjusted R-squared:  0.5762 
## F-statistic: 94.59 on 6 and 407 DF,  p-value: < 2.2e-16

Since Longitude has a p-value above 0.05 (0.798), as can be seen above, I will eliminate it to create a third regression model. The coefficient estimates below change only slightly, and all of their p-values are less than 0.05.

# Refit the multiple regression with the Longitude predictor left out
my_lm3 <- lm(
  formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station +
    Number_Convenience_Stores + Latitude,
  data = real_estate
)
# Coefficients, residual spread and R-squared of the reduced fit
summary(my_lm3)
## 
## Call:
## lm(formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station + 
##     Number_Convenience_Stores + Latitude, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.623  -5.371  -1.020   4.244  75.346 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.596e+04  3.233e+03  -4.936 1.17e-06 ***
## Transaction_Date           5.135e+00  1.555e+00   3.303  0.00104 ** 
## House_Age                 -2.694e-01  3.847e-02  -7.003 1.04e-11 ***
## Nearest_MRT_Station       -4.353e-03  4.899e-04  -8.887  < 2e-16 ***
## Number_Convenience_Stores  1.136e+00  1.876e-01   6.056 3.17e-09 ***
## Latitude                   2.269e+02  4.417e+01   5.136 4.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.848 on 408 degrees of freedom
## Multiple R-squared:  0.5823, Adjusted R-squared:  0.5772 
## F-statistic: 113.8 on 5 and 408 DF,  p-value: < 2.2e-16

Residual Analysis

# Diagnostic plots for each fitted model, drawn on a 2 x 2 grid.
# par() returns the previous settings, so they are saved and restored after
# every set of plots; otherwise the 2 x 2 layout would leak into later figures.

# my_lm
old_par <- par(mfrow = c(2, 2))
plot(my_lm)
par(old_par)

# my_lm2
old_par <- par(mfrow = c(2, 2))
plot(my_lm2)
par(old_par)

# my_lm3
old_par <- par(mfrow = c(2, 2))
plot(my_lm3)
par(old_par)

# Histograms of the residuals of each model, used to judge how close the
# residuals are to a normal distribution. residuals() is the accessor for a
# fitted model's residuals.

# my_lm
hist(
  residuals(my_lm),
  xlab = "Residuals",
  main = "Histogram of Simple Linear Model"
)

# my_lm2
hist(
  residuals(my_lm2),
  xlab = "Residuals",
  main = "Histogram of Multiple Linear Model"
)

# my_lm3
hist(
  residuals(my_lm3),
  xlab = "Residuals",
  main = "Histogram of Multiple Linear Model without Longitude"
)

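Conclusion: These models do fit the data because the residuals are approximately normally distributed, particularly for the second and third models that were developed. The residual summaries do, however, show a long right tail (a maximum residual of about 75 against a minimum of about -36), which points to at least one large outlier.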