## Warning: package 'readxl' was built under R version 4.3.3
# Load the real-estate valuation workbook and preview it.
# NOTE(review): file_path is an absolute path on one user's machine; replace it
# with the location of your own copy of the Excel file before running.
file_path <- "C:/Users/aleja/Desktop/Real estate valuation data set.xlsx"
# n_max = 5 limits the import to the first 5 data rows.
# NOTE(review): `data` is the same object passed to cor() and lm() further
# down, so those steps also see only these 5 rows -- confirm this is intended.
# read_excel() is presumably provided by readxl (see the load warning above).
data <- read_excel(file_path, n_max = 5)
# Print the imported rows; the captured console output follows.
print(data)
## # A tibble: 5 × 8
## No `X1 transaction date` `X2 house age` X3 distance to the nearest MRT st…¹
## <dbl> <dbl> <dbl> <dbl>
## 1 1 2013. 32 84.9
## 2 2 2013. 19.5 307.
## 3 3 2014. 13.3 562.
## 4 4 2014. 13.3 562.
## 5 5 2013. 5 391.
## # ℹ abbreviated name: ¹​`X3 distance to the nearest MRT station`
## # ℹ 4 more variables: `X4 number of convenience stores` <dbl>,
## # `X5 latitude` <dbl>, `X6 longitude` <dbl>,
## # `Y house price of unit area` <dbl>
Summary statistics
## No X1 transaction date X2 house age
## Min. :1 Min. :2013 Min. : 5.00
## 1st Qu.:2 1st Qu.:2013 1st Qu.:13.30
## Median :3 Median :2013 Median :13.30
## Mean :3 Mean :2013 Mean :16.62
## 3rd Qu.:4 3rd Qu.:2014 3rd Qu.:19.50
## Max. :5 Max. :2014 Max. :32.00
## X3 distance to the nearest MRT station X4 number of convenience stores
## Min. : 84.88 Min. : 5.0
## 1st Qu.:306.59 1st Qu.: 5.0
## Median :390.57 Median : 5.0
## Mean :381.20 Mean : 6.8
## 3rd Qu.:561.98 3rd Qu.: 9.0
## Max. :561.98 Max. :10.0
## X5 latitude X6 longitude Y house price of unit area
## Min. :24.98 Min. :121.5 Min. :37.90
## 1st Qu.:24.98 1st Qu.:121.5 1st Qu.:42.20
## Median :24.98 Median :121.5 Median :43.10
## Mean :24.98 Mean :121.5 Mean :45.06
## 3rd Qu.:24.99 3rd Qu.:121.5 3rd Qu.:47.30
## Max. :24.99 Max. :121.5 Max. :54.80
# Check for multicollinearity
# Pairwise correlations between every column of `data`.
# NOTE(review): this includes the `No` column and the response
# `Y house price of unit area`, not only the X predictors.
cor_matrix <- cor(data)
# Collect the (row, col) position of every cell with 0.7 < r < 1. The `< 1`
# bound excludes cells equal to 1, such as a column's correlation with itself.
# NOTE(review): the comparison is on the signed value, so strong negative
# correlations (r < -0.7) are not reported. The matrix is symmetric, so each
# qualifying pair shows up twice (once as (i, j) and once as (j, i)).
high_correlation <- which(cor_matrix > 0.7 & cor_matrix < 1, arr.ind = TRUE)
# Auto-print the index matrix; the captured console output follows.
high_correlation
## row col
## X3 distance to the nearest MRT station 4 2
## X5 latitude 6 2
## X6 longitude 7 2
## Y house price of unit area 8 2
## X4 number of convenience stores 5 3
## X1 transaction date 2 4
## X6 longitude 7 4
## Y house price of unit area 8 4
## X2 house age 3 5
## X1 transaction date 2 6
## X6 longitude 7 6
## X1 transaction date 2 7
## X3 distance to the nearest MRT station 4 7
## X5 latitude 6 7
## Y house price of unit area 8 7
## X1 transaction date 2 8
## X3 distance to the nearest MRT station 4 8
## X6 longitude 7 8
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Correlation heat map of all columns in `data`.
# Compute the correlation matrix once (it was previously computed twice in a
# row with identical calls).
correlation_matrix <- cor(data)
# Shrink the figure margins for this plot. par() returns the previous
# settings, which are put back after plotting so that later plots are not
# affected by this change to global graphics state.
old_par <- par(mar = c(1, 1, 1, 1))
# Upper triangle only, coloured cells, variables ordered by hierarchical
# clustering; small (0.7) black labels rotated 45 degrees.
corrplot(
  correlation_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.7
)
par(old_par)

Running a regression model
# Fit a simple linear regression of unit-area house price on a single
# predictor, the distance to the nearest MRT station.
# The column names contain spaces, so they are quoted with backticks.
# NOTE(review): the model is fitted on `data` as imported above (n_max = 5),
# which matches the 3 residual degrees of freedom in the output below.
model <- lm(`Y house price of unit area` ~ `X3 distance to the nearest MRT station`, data = data)
# Print the model summary; the captured console output follows.
summary(model)
##
## Call:
## lm(formula = `Y house price of unit area` ~ `X3 distance to the nearest MRT station`,
## data = data)
##
## Residuals:
## 1 2 3 4 5
## 1.2030 -0.7544 -2.8621 4.6379 -2.2243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.301523 3.691416 9.292 0.00264
## `X3 distance to the nearest MRT station` 0.028222 0.008772 3.217 0.04869
##
## (Intercept) **
## `X3 distance to the nearest MRT station` *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.496 on 3 degrees of freedom
## Multiple R-squared: 0.7753, Adjusted R-squared: 0.7004
## F-statistic: 10.35 on 1 and 3 DF, p-value: 0.04869
##
## Shapiro-Wilk normality test
##
## data: residuals
## W = 0.92129, p-value = 0.5383
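The results suggest that X3 distance to the nearest MRT station is a statistically significant predictor of Y house price of unit area at the 5% level (slope estimate 0.028, p = 0.049). The model explains a large share of the variability in house prices (R-squared = 0.775, adjusted R-squared = 0.700), and the Shapiro-Wilk test gives no evidence against normality of the residuals (W = 0.921, p = 0.538). Note, however, that the model was fitted to only the five rows read with n_max = 5 (3 residual degrees of freedom), so these conclusions are tentative and should be re-checked on the full data set.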