This dataset contains records for around 400 real estate units, with several attributes that can be used to predict their prices. The source of the data is the following link: Real estate price prediction.
The dataset's column names already indicate which are the independent variables (X1, X2, X3, X4, X5, X6) and which is the dependent variable (Y), so we will set up the problem accordingly.
# Load the dataset. The relative path means "Real estate.csv" must sit in the
# same directory as this rmd file.
real_estate <- read.csv(file = "Real estate.csv")
# Line graph: price per unit area against transaction date.
# x is mapped to X1.transaction.date so the data agrees with the plot title
# and the "Year" axis label (previously X2.house.age was plotted here).
# NOTE(review): X1.transaction.date presumably holds fractional years -
# confirm against the dataset description.
ggplot(data = real_estate, aes(x = X1.transaction.date, y = Y.house.price.of.unit.area)) +
  geom_line(color = "green", linewidth = 0.8) +
  labs(
    title = "Price per unit based on the transactions each year",
    x = "Year",
    y = "Price per unit"
  ) +
  theme_minimal()
# Boxplot: one box of price per unit area for each distinct count of
# convenience stores. The count is converted to a factor so it is treated as
# discrete groups, and the same factor drives the fill colour.
ggplot(
  real_estate,
  aes(
    x = factor(X4.number.of.convenience.stores),
    y = Y.house.price.of.unit.area,
    fill = factor(X4.number.of.convenience.stores)
  )
) +
  geom_boxplot(alpha = 0.8) +
  theme_minimal() +
  labs(
    title = "Distribution of price per unit per number of convenience stores",
    x = "Number of convenience stores",
    y = "Price per unit",
    fill = "Convenience stores"
  )
# Scatter plot: house age against price per unit area, with a fitted linear
# trend line and its confidence band (se = TRUE).
# x is mapped to X2.house.age so the data agrees with the plot title and the
# "Age of the house" axis label (previously X1.transaction.date was plotted).
ggplot(data = real_estate, aes(x = X2.house.age, y = Y.house.price.of.unit.area)) +
  geom_point(color = "orange", size = 2) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = "Age vs Price per unit of the house",
    x = "Age of the house",
    y = "Price per unit of the house"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# First model: regress price per unit area on all six predictors (X1-X6),
# then print the coefficient table and fit statistics.
m1 <- lm(
  formula = Y.house.price.of.unit.area ~
    X1.transaction.date +
    X2.house.age +
    X3.distance.to.the.nearest.MRT.station +
    X4.number.of.convenience.stores +
    X5.latitude +
    X6.longitude,
  data = real_estate
)
summary(m1)
##
## Call:
## lm(formula = Y.house.price.of.unit.area ~ X1.transaction.date +
## X2.house.age + X3.distance.to.the.nearest.MRT.station + X4.number.of.convenience.stores +
## X5.latitude + X6.longitude, data = real_estate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.664 -5.410 -0.966 4.217 75.193
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.444e+04 6.776e+03 -2.131 0.03371
## X1.transaction.date 5.146e+00 1.557e+00 3.305 0.00103
## X2.house.age -2.697e-01 3.853e-02 -7.000 1.06e-11
## X3.distance.to.the.nearest.MRT.station -4.488e-03 7.180e-04 -6.250 1.04e-09
## X4.number.of.convenience.stores 1.133e+00 1.882e-01 6.023 3.84e-09
## X5.latitude 2.255e+02 4.457e+01 5.059 6.38e-07
## X6.longitude -1.242e+01 4.858e+01 -0.256 0.79829
##
## (Intercept) *
## X1.transaction.date **
## X2.house.age ***
## X3.distance.to.the.nearest.MRT.station ***
## X4.number.of.convenience.stores ***
## X5.latitude ***
## X6.longitude
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.858 on 407 degrees of freedom
## Multiple R-squared: 0.5824, Adjusted R-squared: 0.5762
## F-statistic: 94.59 on 6 and 407 DF, p-value: < 2.2e-16
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) -1.444e+04 6.776e+03 -2.131 0.03371 *
#X1.transaction.date 5.146e+00 1.557e+00 3.305 0.00103 **
#X2.house.age -2.697e-01 3.853e-02 -7.000 1.06e-11 ***
#X3.distance.to.the.nearest.MRT.station -4.488e-03 7.180e-04 -6.250 1.04e-09 ***
#X4.number.of.convenience.stores 1.133e+00 1.882e-01 6.023 3.84e-09 ***
#X5.latitude 2.255e+02 4.457e+01 5.059 6.38e-07 ***
#X6.longitude -1.242e+01 4.858e+01 -0.256 0.79829
# Based on Pr(>|t|), the most significant predictors are X2, X3, X4 and X5, followed by X1. X6 has no significance stars (p = 0.798), so we omit it.
#2nd attempt