datos <- read.csv("Real estate.csv")
str(datos)
## 'data.frame': 414 obs. of 8 variables:
## $ No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X1.transaction.date : num 2013 2013 2014 2014 2013 ...
## $ X2.house.age : num 32 19.5 13.3 13.3 5 7.1 34.5 20.3 31.7 17.9 ...
## $ X3.distance.to.the.nearest.MRT.station: num 84.9 306.6 562 562 390.6 ...
## $ X4.number.of.convenience.stores : int 10 9 5 5 5 3 7 6 1 3 ...
## $ X5.latitude : num 25 25 25 25 25 ...
## $ X6.longitude : num 122 122 122 122 122 ...
## $ Y.house.price.of.unit.area : num 37.9 42.2 47.3 54.8 43.1 32.1 40.3 46.7 18.8 22.1 ...
#SIMPLE LINEAR REGRESSION
modelo1 <- lm(Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station, data = datos)
summary(modelo1)
##
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station,
## data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.396 -6.007 -1.195 4.831 73.483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.8514271 0.6526105 70.26 <2e-16
## X3.distance.to.the.nearest.MRT.station -0.0072621 0.0003925 -18.50 <2e-16
##
## (Intercept) ***
## X3.distance.to.the.nearest.MRT.station ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.07 on 412 degrees of freedom
## Multiple R-squared: 0.4538, Adjusted R-squared: 0.4524
## F-statistic: 342.2 on 1 and 412 DF, p-value: < 2.2e-16
# R^2 of 0.45: distance alone explains less than half of the variance in price, so this simple model predicts rather poorly.
# The p-value is below 2.2e-16; being essentially 0, it indicates a highly significant relationship between the two variables
# (each extra meter of distance to the MRT station is associated with a drop of about 0.0073 price units).
plot(datos$X3.distance.to.the.nearest.MRT.station, datos$Y.house.price.of.unit.area,
     xlab = "Distance to nearest MRT station", ylab = "House price per unit area")
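# To visualize the fit, we can overlay the regression line on the scatterplot just produced;
# a minimal sketch using the modelo1 object fitted above:
abline(modelo1, col = "red", lwd = 2)
# confint() also gives 95% confidence intervals for the intercept and slope:
confint(modelo1)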
#MULTIPLE LINEAR REGRESSION
modelo_completo <- lm(Y.house.price.of.unit.area ~ ., data = datos)
summary(modelo_completo)
##
## Call:
## lm(formula = Y.house.price.of.unit.area ~ ., data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.003 -5.196 -0.990 4.181 75.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.404e+04 6.788e+03 -2.068 0.03927
## No -3.593e-03 3.653e-03 -0.984 0.32590
## X1.transaction.date 5.079e+00 1.559e+00 3.259 0.00121
## X2.house.age -2.708e-01 3.855e-02 -7.026 9.04e-12
## X3.distance.to.the.nearest.MRT.station -4.521e-03 7.189e-04 -6.289 8.28e-10
## X4.number.of.convenience.stores 1.129e+00 1.882e-01 6.000 4.37e-09
## X5.latitude 2.247e+02 4.458e+01 5.040 7.02e-07
## X6.longitude -1.442e+01 4.863e+01 -0.297 0.76691
##
## (Intercept) *
## No
## X1.transaction.date **
## X2.house.age ***
## X3.distance.to.the.nearest.MRT.station ***
## X4.number.of.convenience.stores ***
## X5.latitude ***
## X6.longitude
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.858 on 406 degrees of freedom
## Multiple R-squared: 0.5834, Adjusted R-squared: 0.5762
## F-statistic: 81.21 on 7 and 406 DF, p-value: < 2.2e-16
# We drop the row index No and the non-significant X6.longitude; X1.transaction.date, although significant, is also removed here.
modelo2 <- lm(Y.house.price.of.unit.area ~ . - (No + X1.transaction.date + X6.longitude), data = datos)
summary(modelo2)
##
## Call:
## lm(formula = Y.house.price.of.unit.area ~ . - (No + X1.transaction.date +
## X6.longitude), data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.522 -5.292 -1.579 4.264 76.466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.916e+03 1.113e+03 -5.317 1.74e-07
## X2.house.age -2.687e-01 3.893e-02 -6.903 1.95e-11
## X3.distance.to.the.nearest.MRT.station -4.175e-03 4.928e-04 -8.473 4.37e-16
## X4.number.of.convenience.stores 1.165e+00 1.897e-01 6.141 1.94e-09
## X5.latitude 2.386e+02 4.456e+01 5.355 1.43e-07
##
## (Intercept) ***
## X2.house.age ***
## X3.distance.to.the.nearest.MRT.station ***
## X4.number.of.convenience.stores ***
## X5.latitude ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.954 on 409 degrees of freedom
## Multiple R-squared: 0.5711, Adjusted R-squared: 0.5669
## F-statistic: 136.2 on 4 and 409 DF, p-value: < 2.2e-16
# Our R^2 rose to 0.57 (from 0.45 in the simple model), which indicates a better fit.
# We can observe that all of the remaining variables are significant for the prediction model.
# Every predictor is related to the response: each p-value is far below 0.05.
# With this model we also lowered the residual standard error from 10.07 to 8.95, which tells us the model fits the data better.
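# Since modelo1 (distance only) is nested inside modelo2, the two fits can be compared
# formally with a partial F-test; a minimal sketch:
anova(modelo1, modelo2)
# An automated alternative to dropping terms by hand is backward stepwise selection,
# starting from the full model fitted above (modelo_step is a hypothetical name):
modelo_step <- step(modelo_completo, direction = "backward", trace = 0)
summary(modelo_step)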
#POLYNOMIAL REGRESSION
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
theme_set(theme_classic())
set.seed(123)
training.samples <- datos$Y.house.price.of.unit.area %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- datos[training.samples, ]
test.data <- datos[-training.samples, ]
modelo <- lm(Y.house.price.of.unit.area ~ poly(X3.distance.to.the.nearest.MRT.station, 5, raw = TRUE),
data = train.data)
# Make predictions
predicciones <- modelo %>% predict(test.data)
# Model performance
modelPerformance <- data.frame(
RMSE = RMSE(predicciones, test.data$Y.house.price.of.unit.area),
R2 = R2(predicciones, test.data$Y.house.price.of.unit.area)
)
# For comparison, a reduced fit with only the linear and cubic terms of the distance:
print(lm(Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station + I(X3.distance.to.the.nearest.MRT.station^3), data = train.data))
##
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station +
## I(X3.distance.to.the.nearest.MRT.station^3), data = train.data)
##
## Coefficients:
## (Intercept)
## 4.822e+01
## X3.distance.to.the.nearest.MRT.station
## -1.104e-02
## I(X3.distance.to.the.nearest.MRT.station^3)
## 1.738e-10
print(modelPerformance)
## RMSE R2
## 1 7.601113 0.622119
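# The degree 5 above was chosen by hand; a minimal sketch for comparing test-set RMSE
# across polynomial degrees, assuming the same train/test split created above:
for (grado in 1:5) {
  fit <- lm(Y.house.price.of.unit.area ~ poly(X3.distance.to.the.nearest.MRT.station, grado, raw = TRUE),
            data = train.data)
  pred <- predict(fit, test.data)
  cat("degree", grado, "test RMSE:", RMSE(pred, test.data$Y.house.price.of.unit.area), "\n")
}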
# Note the aesthetics: distance goes on x and price on y, matching the fitted model.
ggplot(train.data, aes(X3.distance.to.the.nearest.MRT.station, Y.house.price.of.unit.area)) + geom_point() +
stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))

# The RMSE is 7.6, a result far from 0, which indicates our predictions and the real values still differ substantially.
# The model improved in terms of R^2, reaching about 0.62 on the test set (versus 0.45 for the simple linear model).
# With these results we can infer that, of the models tried here, the polynomial regression predicts this data set best.
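# A single train/test split can be noisy; a minimal sketch of 10-fold cross-validation
# with caret for a more stable estimate (cv_model is a hypothetical name):
set.seed(123)
cv_model <- train(
  Y.house.price.of.unit.area ~ poly(X3.distance.to.the.nearest.MRT.station, 5, raw = TRUE),
  data = datos,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
cv_model$results # cross-validated RMSE, R^2 and MAE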