\(~\)

Multiple Regression

Real estate price prediction: This real estate dataset was built for regression analysis, linear regression, multiple regression, and prediction models. It includes the date of purchase, house age, location, distance to nearest MRT station, and house price of unit area.

\(~\)

Data set was obtained through https://www.kaggle.com/datasets/quantbruce/real-estate-price-prediction

# Load Libraries
library(tidyverse)

\(~\)

# Import data
# Read the real-estate CSV from its GitHub location and preview five rows
data_url <- "https://raw.githubusercontent.com/letisalba/Data-605/main/Week-12/Real_estate.csv"
real_estate <- read.csv(file = data_url)
head(real_estate, n = 5)
##   No X1.transaction.date X2.house.age X3.distance.to.the.nearest.MRT.station
## 1  1            2012.917         32.0                               84.87882
## 2  2            2012.917         19.5                              306.59470
## 3  3            2013.583         13.3                              561.98450
## 4  4            2013.500         13.3                              561.98450
## 5  5            2012.833          5.0                              390.56840
##   X4.number.of.convenience.stores X5.latitude X6.longitude
## 1                              10    24.98298     121.5402
## 2                               9    24.98034     121.5395
## 3                               5    24.98746     121.5439
## 4                               5    24.98746     121.5439
## 5                               5    24.97937     121.5425
##   Y.house.price.of.unit.area
## 1                       37.9
## 2                       42.2
## 3                       47.3
## 4                       54.8
## 5                       43.1

There are no missing values in this dataset.

# Checking for missing values across the whole data set.
# anyNA() scans every cell; previewing is.na() with head() would only
# cover the first six rows and could miss NAs further down.
anyNA(real_estate)
## [1] FALSE

This dataset is composed of 414 rows and 8 columns.

# Compact overview of the data set: dimensions, column types, first values
real_estate %>% glimpse()
## Rows: 414
## Columns: 8
## $ No                                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …
## $ X1.transaction.date                    <dbl> 2012.917, 2012.917, 2013.583, 2…
## $ X2.house.age                           <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.…
## $ X3.distance.to.the.nearest.MRT.station <dbl> 84.87882, 306.59470, 561.98450,…
## $ X4.number.of.convenience.stores        <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, …
## $ X5.latitude                            <dbl> 24.98298, 24.98034, 24.98746, 2…
## $ X6.longitude                           <dbl> 121.5402, 121.5395, 121.5439, 1…
## $ Y.house.price.of.unit.area             <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 3…
# Per-column summary statistics (min, quartiles, mean, max)
real_estate %>% summary()
##        No        X1.transaction.date  X2.house.age   
##  Min.   :  1.0   Min.   :2013        Min.   : 0.000  
##  1st Qu.:104.2   1st Qu.:2013        1st Qu.: 9.025  
##  Median :207.5   Median :2013        Median :16.100  
##  Mean   :207.5   Mean   :2013        Mean   :17.713  
##  3rd Qu.:310.8   3rd Qu.:2013        3rd Qu.:28.150  
##  Max.   :414.0   Max.   :2014        Max.   :43.800  
##  X3.distance.to.the.nearest.MRT.station X4.number.of.convenience.stores
##  Min.   :  23.38                        Min.   : 0.000                 
##  1st Qu.: 289.32                        1st Qu.: 1.000                 
##  Median : 492.23                        Median : 4.000                 
##  Mean   :1083.89                        Mean   : 4.094                 
##  3rd Qu.:1454.28                        3rd Qu.: 6.000                 
##  Max.   :6488.02                        Max.   :10.000                 
##   X5.latitude     X6.longitude   Y.house.price.of.unit.area
##  Min.   :24.93   Min.   :121.5   Min.   :  7.60            
##  1st Qu.:24.96   1st Qu.:121.5   1st Qu.: 27.70            
##  Median :24.97   Median :121.5   Median : 38.45            
##  Mean   :24.97   Mean   :121.5   Mean   : 37.98            
##  3rd Qu.:24.98   3rd Qu.:121.5   3rd Qu.: 46.60            
##  Max.   :25.01   Max.   :121.6   Max.   :117.50
# List the current column names before renaming them
names(real_estate)
## [1] "No"                                    
## [2] "X1.transaction.date"                   
## [3] "X2.house.age"                          
## [4] "X3.distance.to.the.nearest.MRT.station"
## [5] "X4.number.of.convenience.stores"       
## [6] "X5.latitude"                           
## [7] "X6.longitude"                          
## [8] "Y.house.price.of.unit.area"
# Replace the auto-generated headers with shorter, readable names.
# The assignment is positional, so the order below must match the
# current column order of the data frame.
real_estate <- setNames(
  real_estate,
  c(
    "No", "Transaction_Date", "House_Age",
    "Nearest_MRT_Station", "Number_Convenience_Stores",
    "Latitude", "Longitude", "House_Price"
  )
)
head(real_estate)
##   No Transaction_Date House_Age Nearest_MRT_Station Number_Convenience_Stores
## 1  1         2012.917      32.0            84.87882                        10
## 2  2         2012.917      19.5           306.59470                         9
## 3  3         2013.583      13.3           561.98450                         5
## 4  4         2013.500      13.3           561.98450                         5
## 5  5         2012.833       5.0           390.56840                         5
## 6  6         2012.667       7.1          2175.03000                         3
##   Latitude Longitude House_Price
## 1 24.98298  121.5402        37.9
## 2 24.98034  121.5395        42.2
## 3 24.98746  121.5439        47.3
## 4 24.98746  121.5439        54.8
## 5 24.97937  121.5425        43.1
## 6 24.96305  121.5125        32.1

Regression Analysis

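First I fit a simple (single-predictor) linear model of House_Price on House_Age. Based on the summary, the multiple \(R^2\) is 0.04434, so the model explains only 4.43% of the variance in house price. The p-value of 1.56e-05 is less than 0.05, so House_Age is a statistically significant predictor even though it explains little of the variance on its own.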

# Simple linear model: house price regressed on house age only
my_lm <- lm(
  formula = House_Price ~ House_Age,
  data = real_estate
)
summary(my_lm)
## 
## Call:
## lm(formula = House_Price ~ House_Age, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.113 -10.738   1.626   8.199  77.781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.43470    1.21098  35.042  < 2e-16 ***
## House_Age   -0.25149    0.05752  -4.372 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared:  0.04434,    Adjusted R-squared:  0.04202 
## F-statistic: 19.11 on 1 and 412 DF,  p-value: 1.56e-05

Moving on to multiple regression, I created another model, this time adding more predictors to see how the two models differ. You can already see differences in the residuals and the coefficients; the multiple \(R^2\) is now 0.5824 (58.24% of the variance explained) and the p-value is < 2.2e-16.

# Multiple regression: house price on all six candidate predictors
my_lm2 <- lm(
  formula = House_Price ~
    Transaction_Date +
    House_Age +
    Nearest_MRT_Station +
    Number_Convenience_Stores +
    Latitude +
    Longitude,
  data = real_estate
)
summary(my_lm2)
## 
## Call:
## lm(formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station + 
##     Number_Convenience_Stores + Latitude + Longitude, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.664  -5.410  -0.966   4.217  75.193 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.444e+04  6.776e+03  -2.131  0.03371 *  
## Transaction_Date           5.146e+00  1.557e+00   3.305  0.00103 ** 
## House_Age                 -2.697e-01  3.853e-02  -7.000 1.06e-11 ***
## Nearest_MRT_Station       -4.488e-03  7.180e-04  -6.250 1.04e-09 ***
## Number_Convenience_Stores  1.133e+00  1.882e-01   6.023 3.84e-09 ***
## Latitude                   2.255e+02  4.457e+01   5.059 6.38e-07 ***
## Longitude                 -1.242e+01  4.858e+01  -0.256  0.79829    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.858 on 407 degrees of freedom
## Multiple R-squared:  0.5824, Adjusted R-squared:  0.5762 
## F-statistic: 94.59 on 6 and 407 DF,  p-value: < 2.2e-16

We can see that Longitude has a p-value greater than 0.05 (0.798), so I will remove it to create a third regression model. Notice the slight changes in the coefficient values below; the p-values of all remaining predictors are less than 0.05.

# Reduced multiple regression: same predictors as before, minus Longitude
my_lm3 <- lm(
  formula = House_Price ~
    Transaction_Date +
    House_Age +
    Nearest_MRT_Station +
    Number_Convenience_Stores +
    Latitude,
  data = real_estate
)
summary(my_lm3)
## 
## Call:
## lm(formula = House_Price ~ Transaction_Date + House_Age + Nearest_MRT_Station + 
##     Number_Convenience_Stores + Latitude, data = real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.623  -5.371  -1.020   4.244  75.346 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -1.596e+04  3.233e+03  -4.936 1.17e-06 ***
## Transaction_Date           5.135e+00  1.555e+00   3.303  0.00104 ** 
## House_Age                 -2.694e-01  3.847e-02  -7.003 1.04e-11 ***
## Nearest_MRT_Station       -4.353e-03  4.899e-04  -8.887  < 2e-16 ***
## Number_Convenience_Stores  1.136e+00  1.876e-01   6.056 3.17e-09 ***
## Latitude                   2.269e+02  4.417e+01   5.136 4.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.848 on 408 degrees of freedom
## Multiple R-squared:  0.5823, Adjusted R-squared:  0.5772 
## F-statistic: 113.8 on 5 and 408 DF,  p-value: < 2.2e-16

\(~\)

Residual Analysis

# Diagnostic plots for each fitted model, laid out on a 2 x 2 grid.
# par() returns the previous settings, so they are saved here and restored
# afterwards; otherwise later figures would also be drawn into the grid.
old_par <- par(mfrow = c(2, 2))

# my_lm
plot(my_lm)

# my_lm2
plot(my_lm2)

# my_lm3
plot(my_lm3)

# Restore the graphics layout that was active before the diagnostics
par(old_par)

\(~\)

# Histograms of the residuals of each model, used to judge by eye whether
# the residuals look roughly normally distributed.

# my_lm
hist(residuals(my_lm), xlab = "Residuals",
     main = "Histogram of Simple Linear Model")

# my_lm2
hist(residuals(my_lm2), xlab = "Residuals",
     main = "Histogram of Multiple Linear Model")

# my_lm3
hist(residuals(my_lm3), xlab = "Residuals",
     main = "Histogram of Multiple Linear Model without Longitude")

Conclusion:

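The residuals appear approximately normally distributed, especially for the second and third models, although the residual summaries show a long right tail (maximum residuals near 75 versus minimums near -36). On that basis, the multiple regression models are a reasonable fit for this dataset.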