library(tidyr)
library(dplyr)
library(ggplot2)
library(reshape2)
library(corrplot)
library(knitr)
library(tidymodels)
library(tidyverse)
library(mice)
library(caret)
library(performance)
library(kableExtra)
library(MASS)
# Loading the real estate training data set
real_estate_training <- read.csv("https://raw.githubusercontent.com/GitHub-Vlad/Data-science/main/Real%20estate%20valuation%20data%20set.csv")
# Summary of the data set
summary(real_estate_training)
##        No        X1.transaction.date  X2.house.age   
##  Min.   :  1.0   Min.   :2013        Min.   : 0.000  
##  1st Qu.:104.2   1st Qu.:2013        1st Qu.: 9.025  
##  Median :207.5   Median :2013        Median :16.100  
##  Mean   :207.5   Mean   :2013        Mean   :17.713  
##  3rd Qu.:310.8   3rd Qu.:2013        3rd Qu.:28.150  
##  Max.   :414.0   Max.   :2014        Max.   :43.800  
##  X3.distance.to.the.nearest.MRT.station X4.number.of.convenience.stores
##  Min.   :  23.38                        Min.   : 0.000                 
##  1st Qu.: 289.32                        1st Qu.: 1.000                 
##  Median : 492.23                        Median : 4.000                 
##  Mean   :1083.89                        Mean   : 4.094                 
##  3rd Qu.:1454.28                        3rd Qu.: 6.000                 
##  Max.   :6488.02                        Max.   :10.000                 
##   X5.latitude     X6.longitude   Y.house.price.of.unit.area
##  Min.   :24.93   Min.   :121.5   Min.   :  7.60            
##  1st Qu.:24.96   1st Qu.:121.5   1st Qu.: 27.70            
##  Median :24.97   Median :121.5   Median : 38.45            
##  Mean   :24.97   Mean   :121.5   Mean   : 37.98            
##  3rd Qu.:24.98   3rd Qu.:121.5   3rd Qu.: 46.60            
##  Max.   :25.01   Max.   :121.6   Max.   :117.50
# creating distribution charts to see how the data is distributed for each of
# the variables
real_estate_training_dist <- real_estate_training %>%
    gather(key = "variable", value = "value")
# Histogram plots of each variable
ggplot(real_estate_training_dist) + geom_histogram(aes(x = value, y = ..density..),
    bins = 30) + geom_density(aes(x = value), color = "red") + facet_wrap(. ~ variable,
    scales = "free", ncol = 4)

# creating box plots for each variable to get an idea of the spread of each
# variable
real_estate_training %>%
    keep(is.numeric) %>%
    gather() %>%
    ggplot(aes(value)) + facet_wrap(~key, scales = "free") + geom_boxplot(fill = "black",
    color = "blue", outlier.colour = "green", outlier.shape = 14, outlier.size = 2,
    notch = FALSE)

# checking for NA values for each column in the data set
colSums(is.na(real_estate_training))
##                                     No                    X1.transaction.date 
##                                      0                                      0 
##                           X2.house.age X3.distance.to.the.nearest.MRT.station 
##                                      0                                      0 
##        X4.number.of.convenience.stores                            X5.latitude 
##                                      0                                      0 
##                           X6.longitude             Y.house.price.of.unit.area 
##                                      0                                      0
# creating a correlation matrix to observe correlation of all the variables in
# data set
corrplot(cor(real_estate_training %>%
    keep(is.numeric)), method = "shade", shade.col = NA, tl.col = "brown", tl.srt = 45)

Data Preparation:

# renaming columns to remove 'X' or 'Y' in the beginning of column and
# replacing: '.' with '_'.
colnames(real_estate_training)[colnames(real_estate_training) == "X2.house.age"] <- "house_age"
colnames(real_estate_training)[colnames(real_estate_training) == "X3.distance.to.the.nearest.MRT.station"] <- "distance_to_the_nearest_MRT_station"
colnames(real_estate_training)[colnames(real_estate_training) == "X4.number.of.convenience.stores"] <- "number_of_convenience_stores"
colnames(real_estate_training)[colnames(real_estate_training) == "X5.latitude"] <- "latitude"
colnames(real_estate_training)[colnames(real_estate_training) == "X6.longitude"] <- "longitude"
colnames(real_estate_training)[colnames(real_estate_training) == "Y.house.price.of.unit.area"] <- "house_price_of_unit_area"
# removing the 'No' and 'X1.transaction.date' columns from the data set
real_estate_training = dplyr::select(real_estate_training, -c(1, 2))
# preview the meta data of updated data set
glimpse(real_estate_training)
## Rows: 414
## Columns: 6
## $ house_age                           <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, …
## $ distance_to_the_nearest_MRT_station <dbl> 84.87882, 306.59470, 561.98450, 56…
## $ number_of_convenience_stores        <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, …
## $ latitude                            <dbl> 24.98298, 24.98034, 24.98746, 24.9…
## $ longitude                           <dbl> 121.5402, 121.5395, 121.5439, 121.…
## $ house_price_of_unit_area            <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1…
# I will now split the data 0.8(train):20(test)
set.seed(400)
train_bucket <- createDataPartition(real_estate_training$house_price_of_unit_area,
    p = 0.8, list = FALSE, times = 1)
training <- real_estate_training[train_bucket, ]
testing <- real_estate_training[-train_bucket, ]
# I will now perform log transformation to make
# distance_to_the_nearest_MRT_station predictor variable more evenly
# distributed.
training_log <- real_estate_training  # copy of basic model for log transformation
training_log$distance_to_the_nearest_MRT_station <- log10(training_log$distance_to_the_nearest_MRT_station +
    1)
# I will now perform the box-cox method to reduce skewness of predictor
# variables
training_boxcox <- training
value <- preProcess(training[, -1], c("BoxCox", "center", "scale"))
training_boxcox <- predict(value, training_boxcox)
# Plotting results of both the log transformation and Box-Cox methods
ggplot(melt(training_log), aes(x = value)) + geom_density() + facet_wrap(~variable,
    scales = "free") + labs(title = "Log Transformation")

ggplot(melt(training_boxcox), aes(x = value)) + geom_density() + facet_wrap(~variable,
    scales = "free") + labs(title = "BoxCox Transformation")


library(ResourceSelection)
require(rpart)
require(Metrics)
require(car)

Model Building

kdepairs(training)

model 1 boxcox data

model_1 <- lm(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
    number_of_convenience_stores, data = training_boxcox)
summary(model_1)
## 
## Call:
## lm(formula = house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station + 
##     number_of_convenience_stores, data = training_boxcox)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1218 -0.3659 -0.0416  0.2989  4.0526 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          0.249269   0.064181   3.884 0.000124 ***
## house_age                           -0.014103   0.003066  -4.599 6.06e-06 ***
## distance_to_the_nearest_MRT_station -0.642125   0.048389 -13.270  < 2e-16 ***
## number_of_convenience_stores         0.157435   0.048376   3.254 0.001255 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6264 on 328 degrees of freedom
## Multiple R-squared:  0.6112, Adjusted R-squared:  0.6076 
## F-statistic: 171.9 on 3 and 328 DF,  p-value: < 2.2e-16
m1_mse <- mse(predict(model_1), training_boxcox$house_price_of_unit_area)
m1_rmse <- rmse(predict(model_1), training_boxcox$house_price_of_unit_area)

print(paste("model_1 MSE:", m1_mse))
## [1] "model_1 MSE: 0.387655424553808"
print(paste("model_1 RMSE:", m1_rmse))
## [1] "model_1 RMSE: 0.622619807389556"
par(mfrow = c(2, 2))
plot(model_1)

kdepairs(training)

model 2 log data

model_2 <- lm(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
    number_of_convenience_stores, data = training_log)
summary(model_2)
## 
## Call:
## lm(formula = house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station + 
##     number_of_convenience_stores, data = training_log)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.521  -4.923  -1.003   3.621  73.900 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          86.56270    4.07135  21.261  < 2e-16 ***
## house_age                            -0.21182    0.03869  -5.474 7.66e-08 ***
## distance_to_the_nearest_MRT_station -17.22430    1.24728 -13.809  < 2e-16 ***
## number_of_convenience_stores          0.73387    0.20513   3.578 0.000388 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.86 on 410 degrees of freedom
## Multiple R-squared:  0.579,  Adjusted R-squared:  0.576 
## F-statistic:   188 on 3 and 410 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model_2)

kdepairs(training_log)

m2_mse <- mse(predict(model_2), training_log$house_price_of_unit_area)
m2_rmse <- rmse(predict(model_2), training_log$house_price_of_unit_area)

print(paste("model_2 MSE:", m2_mse))
## [1] "model_2 MSE: 77.7451851685343"
print(paste("model_2 RMSE:", m2_rmse))
## [1] "model_2 RMSE: 8.81732301600289"

model 3 usign decision tree

model_tree <- rpart(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
    number_of_convenience_stores, data = training_boxcox)
tree_predictions <- predict(model_tree, testing)

tree_mse <- mean((tree_predictions - testing$house_price_of_unit_area)^2)
tree_rmse <- sqrt(tree_mse)
print(paste("Mean Squared Error:", tree_mse))
## [1] "Mean Squared Error: 1774.7136429016"

model 4 using cross-validation k fold with training_boxcox

# linear model using cross validation
train_control <- trainControl(method = "cv", number = 10)

model_4 <- train(y = training_boxcox$house_price_of_unit_area, x = training[, 1:5],
    method = "lm", trControl = train_control)

summary(model_4)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0167 -0.3374 -0.0528  0.3146  4.2135 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -6.391e+02  4.638e+02  -1.378    0.169    
## house_age                           -1.779e-02  3.017e-03  -5.898 9.20e-09 ***
## distance_to_the_nearest_MRT_station -3.016e-04  5.581e-05  -5.405 1.25e-07 ***
## number_of_convenience_stores         7.932e-02  1.488e-02   5.331 1.83e-07 ***
## latitude                             2.126e+01  3.677e+00   5.781 1.74e-08 ***
## longitude                            8.939e-01  3.643e+00   0.245    0.806    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6164 on 326 degrees of freedom
## Multiple R-squared:  0.6258, Adjusted R-squared:   0.62 
## F-statistic:   109 on 5 and 326 DF,  p-value: < 2.2e-16
model_4$results
##   intercept      RMSE  Rsquared       MAE    RMSESD RsquaredSD      MAESD
## 1      TRUE 0.6127601 0.6418179 0.4325541 0.1338502  0.1061114 0.05789568

model 5 multivariate power transformation

c <- 0.01
training$house_age <- training$house_age + c
training$number_of_convenience_stores <- training$number_of_convenience_stores +
    c
training <- training |>
    dplyr::select(-longitude, -latitude)

myvars <- as.matrix(training)

# multivariate power transform
my_transforms <- powerTransform(myvars ~ 1)
my_transforms
## Estimated transformation parameters 
##                           house_age distance_to_the_nearest_MRT_station 
##                          0.53230245                          0.09806154 
##        number_of_convenience_stores            house_price_of_unit_area 
##                          0.55245728                          0.40467240
new_training = training
for (i in 1:4) {
    new_training[, i] = new_training[, i]^my_transforms$lambda[i]
}
model_5 <- lm(house_price_of_unit_area ~ ., data = new_training)
summary(model_5)
## 
## Call:
## lm(formula = house_price_of_unit_area ~ ., data = new_training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.27356 -0.22592 -0.02604  0.18351  2.31597 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          8.13185    0.30862  26.349  < 2e-16 ***
## house_age                           -0.06610    0.01276  -5.180 3.89e-07 ***
## distance_to_the_nearest_MRT_station -1.99746    0.14326 -13.943  < 2e-16 ***
## number_of_convenience_stores         0.10393    0.02814   3.694 0.000259 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3982 on 328 degrees of freedom
## Multiple R-squared:  0.6383, Adjusted R-squared:  0.635 
## F-statistic:   193 on 3 and 328 DF,  p-value: < 2.2e-16
m5_mse <- mse(predict(model_5), new_training$house_price_of_unit_area)
m5_rmse <- rmse(predict(model_5), new_training$house_price_of_unit_area)
par(mfrow = c(2, 2))
plot(model_5)

kdepairs(new_training)

Model Evaluation

model_name <- c("Model 1", "Model 2", "Model Tree", "Model 4", "Model 5")
adj_r_squared <- c(0.6076, 0.576, NA, 0.62, 0.635)
mse_list <- c(m1_mse, m2_mse, tree_mse, sqrt(model_4$results[, "RMSE"]), m5_mse)
rmse_list <- c(m1_rmse, m2_rmse, tree_rmse, model_4$results[, "RMSE"], m5_rmse)

# Create the model_table data frame
model_table <- data.frame(Model_Name = model_name, Adjusted_R_squared = adj_r_squared,
    MSE = mse_list, RMSE = rmse_list)

model_table
##   Model_Name Adjusted_R_squared          MSE       RMSE
## 1    Model 1             0.6076    0.3876554  0.6226198
## 2    Model 2             0.5760   77.7451852  8.8173230
## 3 Model Tree                 NA 1774.7136429 42.1273503
## 4    Model 4             0.6200    0.7827899  0.6127601
## 5    Model 5             0.6350    0.1566916  0.3958429

Model Selection