library(tidyr)
library(dplyr)
library(ggplot2)
library(reshape2)
library(corrplot)
library(knitr)
library(tidymodels)
library(tidyverse)
library(mice)
library(caret)
library(performance)
library(kableExtra)
library(MASS)
# Loading the real estate training data set
real_estate_training <- read.csv("https://raw.githubusercontent.com/GitHub-Vlad/Data-science/main/Real%20estate%20valuation%20data%20set.csv")
# Summary of the data set
summary(real_estate_training)
## No X1.transaction.date X2.house.age
## Min. : 1.0 Min. :2013 Min. : 0.000
## 1st Qu.:104.2 1st Qu.:2013 1st Qu.: 9.025
## Median :207.5 Median :2013 Median :16.100
## Mean :207.5 Mean :2013 Mean :17.713
## 3rd Qu.:310.8 3rd Qu.:2013 3rd Qu.:28.150
## Max. :414.0 Max. :2014 Max. :43.800
## X3.distance.to.the.nearest.MRT.station X4.number.of.convenience.stores
## Min. : 23.38 Min. : 0.000
## 1st Qu.: 289.32 1st Qu.: 1.000
## Median : 492.23 Median : 4.000
## Mean :1083.89 Mean : 4.094
## 3rd Qu.:1454.28 3rd Qu.: 6.000
## Max. :6488.02 Max. :10.000
## X5.latitude X6.longitude Y.house.price.of.unit.area
## Min. :24.93 Min. :121.5 Min. : 7.60
## 1st Qu.:24.96 1st Qu.:121.5 1st Qu.: 27.70
## Median :24.97 Median :121.5 Median : 38.45
## Mean :24.97 Mean :121.5 Mean : 37.98
## 3rd Qu.:24.98 3rd Qu.:121.5 3rd Qu.: 46.60
## Max. :25.01 Max. :121.6 Max. :117.50
# creating distribution charts to see how the data is distributed for each of
# the variables
real_estate_training_dist <- real_estate_training %>%
gather(key = "variable", value = "value")
# Histogram plots of each variable
ggplot(real_estate_training_dist) + geom_histogram(aes(x = value, y = ..density..),
bins = 30) + geom_density(aes(x = value), color = "red") + facet_wrap(. ~ variable,
scales = "free", ncol = 4)
# creating box plots for each variable to get an idea of the spread of each
# variable
real_estate_training %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) + facet_wrap(~key, scales = "free") + geom_boxplot(fill = "black",
color = "blue", outlier.colour = "green", outlier.shape = 14, outlier.size = 2,
notch = FALSE)
# checking for NA values for each column in the data set
colSums(is.na(real_estate_training))
## No X1.transaction.date
## 0 0
## X2.house.age X3.distance.to.the.nearest.MRT.station
## 0 0
## X4.number.of.convenience.stores X5.latitude
## 0 0
## X6.longitude Y.house.price.of.unit.area
## 0 0
# creating a correlation matrix to observe correlation of all the variables in
# data set
corrplot(cor(real_estate_training %>%
keep(is.numeric)), method = "shade", shade.col = NA, tl.col = "brown", tl.srt = 45)
Data Preparation:
# renaming columns to remove 'X' or 'Y' in the beginning of column and
# replacing: '.' with '_'.
colnames(real_estate_training)[colnames(real_estate_training) == "X2.house.age"] <- "house_age"
colnames(real_estate_training)[colnames(real_estate_training) == "X3.distance.to.the.nearest.MRT.station"] <- "distance_to_the_nearest_MRT_station"
colnames(real_estate_training)[colnames(real_estate_training) == "X4.number.of.convenience.stores"] <- "number_of_convenience_stores"
colnames(real_estate_training)[colnames(real_estate_training) == "X5.latitude"] <- "latitude"
colnames(real_estate_training)[colnames(real_estate_training) == "X6.longitude"] <- "longitude"
colnames(real_estate_training)[colnames(real_estate_training) == "Y.house.price.of.unit.area"] <- "house_price_of_unit_area"
# removing the 'No' and 'X1.transaction.date' columns from the data set
real_estate_training = dplyr::select(real_estate_training, -c(1, 2))
# preview the meta data of updated data set
glimpse(real_estate_training)
## Rows: 414
## Columns: 6
## $ house_age <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, …
## $ distance_to_the_nearest_MRT_station <dbl> 84.87882, 306.59470, 561.98450, 56…
## $ number_of_convenience_stores <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, …
## $ latitude <dbl> 24.98298, 24.98034, 24.98746, 24.9…
## $ longitude <dbl> 121.5402, 121.5395, 121.5439, 121.…
## $ house_price_of_unit_area <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1…
# I will now split the data 0.8(train):20(test)
set.seed(400)
train_bucket <- createDataPartition(real_estate_training$house_price_of_unit_area,
p = 0.8, list = FALSE, times = 1)
training <- real_estate_training[train_bucket, ]
testing <- real_estate_training[-train_bucket, ]
# I will now perform log transformation to make
# distance_to_the_nearest_MRT_station predictor variable more evenly
# distributed.
training_log <- real_estate_training # copy of basic model for log transformation
training_log$distance_to_the_nearest_MRT_station <- log10(training_log$distance_to_the_nearest_MRT_station +
1)
# I will now perform the box-cox method to reduce skewness of predictor
# variables
training_boxcox <- training
value <- preProcess(training[, -1], c("BoxCox", "center", "scale"))
training_boxcox <- predict(value, training_boxcox)
# Plotting results of both the log transformation and Box-Cox methods
ggplot(melt(training_log), aes(x = value)) + geom_density() + facet_wrap(~variable,
scales = "free") + labs(title = "Log Transformation")
ggplot(melt(training_boxcox), aes(x = value)) + geom_density() + facet_wrap(~variable,
scales = "free") + labs(title = "BoxCox Transformation")
library(ResourceSelection)
require(rpart)
require(Metrics)
require(car)
kdepairs(training)
model_1 <- lm(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
number_of_convenience_stores, data = training_boxcox)
summary(model_1)
##
## Call:
## lm(formula = house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
## number_of_convenience_stores, data = training_boxcox)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1218 -0.3659 -0.0416 0.2989 4.0526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.249269 0.064181 3.884 0.000124 ***
## house_age -0.014103 0.003066 -4.599 6.06e-06 ***
## distance_to_the_nearest_MRT_station -0.642125 0.048389 -13.270 < 2e-16 ***
## number_of_convenience_stores 0.157435 0.048376 3.254 0.001255 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6264 on 328 degrees of freedom
## Multiple R-squared: 0.6112, Adjusted R-squared: 0.6076
## F-statistic: 171.9 on 3 and 328 DF, p-value: < 2.2e-16
m1_mse <- mse(predict(model_1), training_boxcox$house_price_of_unit_area)
m1_rmse <- rmse(predict(model_1), training_boxcox$house_price_of_unit_area)
print(paste("model_1 MSE:", m1_mse))
## [1] "model_1 MSE: 0.387655424553808"
print(paste("model_1 RMSE:", m1_rmse))
## [1] "model_1 RMSE: 0.622619807389556"
par(mfrow = c(2, 2))
plot(model_1)
kdepairs(training)
model_2 <- lm(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
number_of_convenience_stores, data = training_log)
summary(model_2)
##
## Call:
## lm(formula = house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
## number_of_convenience_stores, data = training_log)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.521 -4.923 -1.003 3.621 73.900
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.56270 4.07135 21.261 < 2e-16 ***
## house_age -0.21182 0.03869 -5.474 7.66e-08 ***
## distance_to_the_nearest_MRT_station -17.22430 1.24728 -13.809 < 2e-16 ***
## number_of_convenience_stores 0.73387 0.20513 3.578 0.000388 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.86 on 410 degrees of freedom
## Multiple R-squared: 0.579, Adjusted R-squared: 0.576
## F-statistic: 188 on 3 and 410 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model_2)
kdepairs(training_log)
m2_mse <- mse(predict(model_2), training_log$house_price_of_unit_area)
m2_rmse <- rmse(predict(model_2), training_log$house_price_of_unit_area)
print(paste("model_2 MSE:", m2_mse))
## [1] "model_2 MSE: 77.7451851685343"
print(paste("model_2 RMSE:", m2_rmse))
## [1] "model_2 RMSE: 8.81732301600289"
model_tree <- rpart(house_price_of_unit_area ~ house_age + distance_to_the_nearest_MRT_station +
number_of_convenience_stores, data = training_boxcox)
tree_predictions <- predict(model_tree, testing)
tree_mse <- mean((tree_predictions - testing$house_price_of_unit_area)^2)
tree_rmse <- sqrt(tree_mse)
print(paste("Mean Squared Error:", tree_mse))
## [1] "Mean Squared Error: 1774.7136429016"
# linear model using cross validation
train_control <- trainControl(method = "cv", number = 10)
model_4 <- train(y = training_boxcox$house_price_of_unit_area, x = training[, 1:5],
method = "lm", trControl = train_control)
summary(model_4)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0167 -0.3374 -0.0528 0.3146 4.2135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.391e+02 4.638e+02 -1.378 0.169
## house_age -1.779e-02 3.017e-03 -5.898 9.20e-09 ***
## distance_to_the_nearest_MRT_station -3.016e-04 5.581e-05 -5.405 1.25e-07 ***
## number_of_convenience_stores 7.932e-02 1.488e-02 5.331 1.83e-07 ***
## latitude 2.126e+01 3.677e+00 5.781 1.74e-08 ***
## longitude 8.939e-01 3.643e+00 0.245 0.806
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6164 on 326 degrees of freedom
## Multiple R-squared: 0.6258, Adjusted R-squared: 0.62
## F-statistic: 109 on 5 and 326 DF, p-value: < 2.2e-16
model_4$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.6127601 0.6418179 0.4325541 0.1338502 0.1061114 0.05789568
c <- 0.01
training$house_age <- training$house_age + c
training$number_of_convenience_stores <- training$number_of_convenience_stores +
c
training <- training |>
dplyr::select(-longitude, -latitude)
myvars <- as.matrix(training)
# multivariate power transform
my_transforms <- powerTransform(myvars ~ 1)
my_transforms
## Estimated transformation parameters
## house_age distance_to_the_nearest_MRT_station
## 0.53230245 0.09806154
## number_of_convenience_stores house_price_of_unit_area
## 0.55245728 0.40467240
new_training = training
for (i in 1:4) {
new_training[, i] = new_training[, i]^my_transforms$lambda[i]
}
model_5 <- lm(house_price_of_unit_area ~ ., data = new_training)
summary(model_5)
##
## Call:
## lm(formula = house_price_of_unit_area ~ ., data = new_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27356 -0.22592 -0.02604 0.18351 2.31597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.13185 0.30862 26.349 < 2e-16 ***
## house_age -0.06610 0.01276 -5.180 3.89e-07 ***
## distance_to_the_nearest_MRT_station -1.99746 0.14326 -13.943 < 2e-16 ***
## number_of_convenience_stores 0.10393 0.02814 3.694 0.000259 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3982 on 328 degrees of freedom
## Multiple R-squared: 0.6383, Adjusted R-squared: 0.635
## F-statistic: 193 on 3 and 328 DF, p-value: < 2.2e-16
m5_mse <- mse(predict(model_5), new_training$house_price_of_unit_area)
m5_rmse <- rmse(predict(model_5), new_training$house_price_of_unit_area)
par(mfrow = c(2, 2))
plot(model_5)
kdepairs(new_training)
model_name <- c("Model 1", "Model 2", "Model Tree", "Model 4", "Model 5")
adj_r_squared <- c(0.6076, 0.576, NA, 0.62, 0.635)
mse_list <- c(m1_mse, m2_mse, tree_mse, sqrt(model_4$results[, "RMSE"]), m5_mse)
rmse_list <- c(m1_rmse, m2_rmse, tree_rmse, model_4$results[, "RMSE"], m5_rmse)
# Create the model_table data frame
model_table <- data.frame(Model_Name = model_name, Adjusted_R_squared = adj_r_squared,
MSE = mse_list, RMSE = rmse_list)
model_table
## Model_Name Adjusted_R_squared MSE RMSE
## 1 Model 1 0.6076 0.3876554 0.6226198
## 2 Model 2 0.5760 77.7451852 8.8173230
## 3 Model Tree NA 1774.7136429 42.1273503
## 4 Model 4 0.6200 0.7827899 0.6127601
## 5 Model 5 0.6350 0.1566916 0.3958429