datos <- read.csv("Real estate.csv")
str(datos)
## 'data.frame':    414 obs. of  8 variables:
##  $ No                                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ X1.transaction.date                   : num  2013 2013 2014 2014 2013 ...
##  $ X2.house.age                          : num  32 19.5 13.3 13.3 5 7.1 34.5 20.3 31.7 17.9 ...
##  $ X3.distance.to.the.nearest.MRT.station: num  84.9 306.6 562 562 390.6 ...
##  $ X4.number.of.convenience.stores       : int  10 9 5 5 5 3 7 6 1 3 ...
##  $ X5.latitude                           : num  25 25 25 25 25 ...
##  $ X6.longitude                          : num  122 122 122 122 122 ...
##  $ Y.house.price.of.unit.area            : num  37.9 42.2 47.3 54.8 43.1 32.1 40.3 46.7 18.8 22.1 ...
# SIMPLE LINEAR REGRESSION
modelo1 <- lm(Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station, data = datos)
summary(modelo1)
## 
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station, 
##     data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.396  -6.007  -1.195   4.831  73.483 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            45.8514271  0.6526105   70.26   <2e-16
## X3.distance.to.the.nearest.MRT.station -0.0072621  0.0003925  -18.50   <2e-16
##                                           
## (Intercept)                            ***
## X3.distance.to.the.nearest.MRT.station ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.07 on 412 degrees of freedom
## Multiple R-squared:  0.4538, Adjusted R-squared:  0.4524 
## F-statistic: 342.2 on 1 and 412 DF,  p-value: < 2.2e-16
# R^2 is 0.45: the model explains only about 45% of the variance in price, so distance to the MRT station alone is a weak predictor.
# The p-value is < 2.2e-16; being essentially zero, it indicates a highly significant (negative) relationship between distance and price.
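# Both numbers can also be pulled out programmatically as a sanity check
# (a minimal sketch using the objects defined above; output not shown):
confint(modelo1, level = 0.95)   # 95% confidence interval for intercept and slope
summary(modelo1)$r.squared       # the 0.4538 reported above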

plot(datos$X3.distance.to.the.nearest.MRT.station, datos$Y.house.price.of.unit.area,
     xlab = "Distance to nearest MRT station", ylab = "House price per unit area")
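# Overlaying the fitted line on the scatterplot makes the negative, and visibly
# non-linear, trend easier to see (a minimal sketch, added for illustration):
abline(modelo1, col = "red", lwd = 2)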

# MULTIPLE LINEAR REGRESSION
modelo_todos <- lm(Y.house.price.of.unit.area ~ ., data = datos)
summary(modelo_todos)
## 
## Call:
## lm(formula = Y.house.price.of.unit.area ~ ., data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.003  -5.196  -0.990   4.181  75.384 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            -1.404e+04  6.788e+03  -2.068  0.03927
## No                                     -3.593e-03  3.653e-03  -0.984  0.32590
## X1.transaction.date                     5.079e+00  1.559e+00   3.259  0.00121
## X2.house.age                           -2.708e-01  3.855e-02  -7.026 9.04e-12
## X3.distance.to.the.nearest.MRT.station -4.521e-03  7.189e-04  -6.289 8.28e-10
## X4.number.of.convenience.stores         1.129e+00  1.882e-01   6.000 4.37e-09
## X5.latitude                             2.247e+02  4.458e+01   5.040 7.02e-07
## X6.longitude                           -1.442e+01  4.863e+01  -0.297  0.76691
##                                           
## (Intercept)                            *  
## No                                        
## X1.transaction.date                    ** 
## X2.house.age                           ***
## X3.distance.to.the.nearest.MRT.station ***
## X4.number.of.convenience.stores        ***
## X5.latitude                            ***
## X6.longitude                              
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.858 on 406 degrees of freedom
## Multiple R-squared:  0.5834, Adjusted R-squared:  0.5762 
## F-statistic: 81.21 on 7 and 406 DF,  p-value: < 2.2e-16
# We identify the variables that are not significant for the model (No, X1.transaction.date, X6.longitude) and remove them.
modelo2 <- lm(Y.house.price.of.unit.area ~ . - (No + X1.transaction.date + X6.longitude), data = datos)
summary(modelo2)
## 
## Call:
## lm(formula = Y.house.price.of.unit.area ~ . - (No + X1.transaction.date + 
##     X6.longitude), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.522  -5.292  -1.579   4.264  76.466 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            -5.916e+03  1.113e+03  -5.317 1.74e-07
## X2.house.age                           -2.687e-01  3.893e-02  -6.903 1.95e-11
## X3.distance.to.the.nearest.MRT.station -4.175e-03  4.928e-04  -8.473 4.37e-16
## X4.number.of.convenience.stores         1.165e+00  1.897e-01   6.141 1.94e-09
## X5.latitude                             2.386e+02  4.456e+01   5.355 1.43e-07
##                                           
## (Intercept)                            ***
## X2.house.age                           ***
## X3.distance.to.the.nearest.MRT.station ***
## X4.number.of.convenience.stores        ***
## X5.latitude                            ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.954 on 409 degrees of freedom
## Multiple R-squared:  0.5711, Adjusted R-squared:  0.5669 
## F-statistic: 136.2 on 4 and 409 DF,  p-value: < 2.2e-16
# R^2 rose from 0.45 (simple model) to 0.57, indicating a better model.
# All the remaining variables are significant for the prediction model:
# each predictor's p-value is far below 0.05, so each is clearly related to the response.
# This model also decreases the residual standard error from 10.07 to 8.95, which tells us the model fits the dataset better.
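# The reduced model can also be checked formally (a minimal sketch, not part of
# the original analysis): modelo1 is nested in modelo2, so a partial F-test
# applies, and AIC compares all three fits at once.
anova(modelo1, modelo2)               # do the extra predictors improve the fit?
AIC(modelo1, modelo_todos, modelo2)   # lower AIC = better fit/complexity trade-off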

# POLYNOMIAL REGRESSION
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
theme_set(theme_classic())

set.seed(123)
training.samples <- datos$Y.house.price.of.unit.area %>%
  createDataPartition(p = 0.8, list = FALSE)
train.data <- datos[training.samples, ]
test.data <- datos[-training.samples, ]

modelo <- lm(Y.house.price.of.unit.area ~ poly(X3.distance.to.the.nearest.MRT.station, 5, raw = TRUE),
             data = train.data)
# Make predictions
predicciones <- modelo %>% predict(test.data)

# Model performance
modelPerformance <- data.frame(
  RMSE = RMSE(predicciones, test.data$Y.house.price.of.unit.area),
  R2 = R2(predicciones, test.data$Y.house.price.of.unit.area)
)
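# For a fair comparison, the plain linear model can be scored on the same
# train/test split (a minimal sketch; modelo_lineal is an illustrative name):
modelo_lineal <- lm(Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station,
                    data = train.data)
pred_lineal <- predict(modelo_lineal, test.data)
RMSE(pred_lineal, test.data$Y.house.price.of.unit.area)   # baseline RMSE to beat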

# For reference, a sparser alternative with only linear and cubic distance terms:
print(lm(Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station + I(X3.distance.to.the.nearest.MRT.station^3), data = train.data))
## 
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X3.distance.to.the.nearest.MRT.station + 
##     I(X3.distance.to.the.nearest.MRT.station^3), data = train.data)
## 
## Coefficients:
##                                 (Intercept)  
##                                   4.822e+01  
##      X3.distance.to.the.nearest.MRT.station  
##                                  -1.104e-02  
## I(X3.distance.to.the.nearest.MRT.station^3)  
##                                   1.738e-10
print(modelPerformance)
##       RMSE       R2
## 1 7.601113 0.622119
ggplot(train.data, aes(X3.distance.to.the.nearest.MRT.station, Y.house.price.of.unit.area)) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))

# The RMSE is 7.6: on average, predictions miss the true price per unit area by about 7.6 units, which is still a sizable error.
# The model improved in terms of R^2, reaching about 0.62 on the test set.
# With these results we can infer that, of the models tried on this specific data set, the polynomial regression predicts best.
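# Instead of fixing the polynomial degree at 5, each candidate degree can be
# scored on the held-out test set (a minimal sketch; the loop is illustrative
# and not part of the analysis above):
for (grado in 1:5) {
  ajuste <- lm(Y.house.price.of.unit.area ~ poly(X3.distance.to.the.nearest.MRT.station, grado, raw = TRUE),
               data = train.data)
  pred <- predict(ajuste, test.data)
  cat("degree", grado, "- test RMSE:", RMSE(pred, test.data$Y.house.price.of.unit.area), "\n")
}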