Step 1: Load the Data

# Read the real-estate data set from its CSV file into a data frame.
Real_estate <- read.csv(
  file = "C:/Users/mattv/Desktop/ADEC 7301 Assignments/Data Sets/Real estate.csv"
)

Step 2: Inspect the Data

# Preview the first six rows to check column names and value formats.
head(Real_estate, n = 6L)
##   No X1.transaction.date X2.house.age X3.distance.to.the.nearest.MRT.station
## 1  1            2012.917         32.0                               84.87882
## 2  2            2012.917         19.5                              306.59470
## 3  3            2013.583         13.3                              561.98450
## 4  4            2013.500         13.3                              561.98450
## 5  5            2012.833          5.0                              390.56840
## 6  6            2012.667          7.1                             2175.03000
##   X4.number.of.convenience.stores X5.latitude X6.longitude
## 1                              10    24.98298     121.5402
## 2                               9    24.98034     121.5395
## 3                               5    24.98746     121.5439
## 4                               5    24.98746     121.5439
## 5                               5    24.97937     121.5425
## 6                               3    24.96305     121.5125
##   Y.house.price.of.unit.area
## 1                       37.9
## 2                       42.2
## 3                       47.3
## 4                       54.8
## 5                       43.1
## 6                       32.1
# Per-column summary statistics (min, quartiles, mean, max) for every variable.
summary(Real_estate)
##        No        X1.transaction.date  X2.house.age   
##  Min.   :  1.0   Min.   :2013        Min.   : 0.000  
##  1st Qu.:104.2   1st Qu.:2013        1st Qu.: 9.025  
##  Median :207.5   Median :2013        Median :16.100  
##  Mean   :207.5   Mean   :2013        Mean   :17.713  
##  3rd Qu.:310.8   3rd Qu.:2013        3rd Qu.:28.150  
##  Max.   :414.0   Max.   :2014        Max.   :43.800  
##  X3.distance.to.the.nearest.MRT.station X4.number.of.convenience.stores
##  Min.   :  23.38                        Min.   : 0.000                 
##  1st Qu.: 289.32                        1st Qu.: 1.000                 
##  Median : 492.23                        Median : 4.000                 
##  Mean   :1083.89                        Mean   : 4.094                 
##  3rd Qu.:1454.28                        3rd Qu.: 6.000                 
##  Max.   :6488.02                        Max.   :10.000                 
##   X5.latitude     X6.longitude   Y.house.price.of.unit.area
##  Min.   :24.93   Min.   :121.5   Min.   :  7.60            
##  1st Qu.:24.96   1st Qu.:121.5   1st Qu.: 27.70            
##  Median :24.97   Median :121.5   Median : 38.45            
##  Mean   :24.97   Mean   :121.5   Mean   : 37.98            
##  3rd Qu.:24.98   3rd Qu.:121.5   3rd Qu.: 46.60            
##  Max.   :25.01   Max.   :121.6   Max.   :117.50
# Total number of missing cells: count NAs per column, then add them up.
sum(colSums(is.na(Real_estate)))
## [1] 0

Step 3: Calculate Correlation

# Pearson correlation between house age and price per unit area,
# reported to four decimal places.
correlation <- with(
  Real_estate,
  cor(X2.house.age, Y.house.price.of.unit.area)
)
correlation_rounded <- round(correlation, digits = 4)
print(correlation_rounded)
## [1] -0.2106

Step 4: Perform a Correlation Test

# Two-sided significance test of the Pearson correlation between
# house age and price per unit area.
cor_test <- cor.test(
  x = Real_estate$X2.house.age,
  y = Real_estate$Y.house.price.of.unit.area,
  method = "pearson"
)
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  Real_estate$X2.house.age and Real_estate$Y.house.price.of.unit.area
## t = -4.3721, df = 412, p-value = 1.56e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3008396 -0.1165546
## sample estimates:
##       cor 
## -0.210567

Step 5: Simple Linear Regression

# Simple linear regression of price per unit area on house age.
# The formula uses bare column names so that `data = Real_estate` actually
# supplies the variables. With `Real_estate$...` terms the `data` argument is
# ignored, and predict(model, newdata = ...) silently returns the training-set
# fits instead of predictions for the new rows.
model <- lm(Y.house.price.of.unit.area ~ X2.house.age, data = Real_estate)
summary(model)
## 
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X2.house.age, data = Real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.113 -10.738   1.626   8.199  77.781 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  42.43470    1.21098  35.042  < 2e-16 ***
## X2.house.age -0.25149    0.05752  -4.372 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared:  0.04434,    Adjusted R-squared:  0.04202 
## F-statistic: 19.11 on 1 and 412 DF,  p-value: 1.56e-05
# Scatter plot of price against house age, with the fitted line overlaid
with(
  Real_estate,
  plot(
    x = X2.house.age,
    y = Y.house.price.of.unit.area,
    xlab = "House Age",
    ylab = "House Price"
  )
)

# Draw the regression line from the fitted model in red.
abline(reg = model, col = "red")

\[ \hat{y} = 42.4347 - 0.2515 \cdot \text{House Age} \]

Step 6: Residual Plots

# Install (if missing) and load the car package, used below for qqPlot().
# install.packages() takes the package name as a character string; the bare
# symbol `car` is evaluated as a variable and fails with
# "object 'car' not found" exactly when the package needs installing.
if (!require(car)) {
  install.packages("car")
  library(car)
}
## Loading required package: car
## Loading required package: carData
# Residuals against fitted values: look for curvature or changing spread.
plot(
  x = fitted(model),
  y = resid(model),
  main = "Fitted vs Residuals",
  xlab = "Fitted values",
  ylab = "Residuals",
  col = "blue",
  pch = 20
)
# Dashed zero line as a reference for the residuals.
abline(h = 0, col = "red", lty = 2)

# Normal Q-Q Plot
# Compares the model residuals against a normal distribution to assess the
# normality assumption. qqPlot() comes from the car package loaded above.
# NOTE(review): the indices it prints are presumably the observations it
# flags as most extreme; confirm against the car documentation.
qqPlot(model$residuals, main = "Normal Q-Q Plot")

## [1] 271 221