Our aim is to predict house values. Before we begin to do any analysis, we should always check whether the dataset has missing value or not, we do so by typing:
taiwan_real_estate <- read.csv("real_estates.csv",row.names=1)
attach(taiwan_real_estate)
any(is.na(taiwan_real_estate))
## [1] FALSE
Let’s take a look at structure of the data set:
glimpse(taiwan_real_estate)
## Rows: 414
## Columns: 6
## $ house.age <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, …
## $ distance.to.the.nearest.MRT.station <dbl> 84.87882, 306.59470, 561.98450, 56…
## $ number.of.convenience.stores <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, …
## $ latitude <dbl> 24.98298, 24.98034, 24.98746, 24.9…
## $ longitude <dbl> 121.5402, 121.5395, 121.5439, 121.…
## $ house.price.of.unit.area <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1…
Let’s simplify variables’ names:
taiwan_real_estate <- taiwan_real_estate %>%
rename(house_age_years = house.age, price_twd_msq = house.price.of.unit.area,
n_convenience = number.of.convenience.stores,
dist_to_mrt_m = distance.to.the.nearest.MRT.station)
We can also perform binning for “house_age_years”:
#perform binning with specific number of bins
taiwan_real_estate<-taiwan_real_estate %>% mutate(house_age_cat = cut(house_age_years, breaks=c(0,15,30,45),include.lowest = T,
right = F))
Prepare a heatmap with correlation coefficients on it:
non_numeric_cols <- sapply(taiwan_real_estate, function(x) !is.numeric(x))
correlation_matrix <- cor(taiwan_real_estate[!non_numeric_cols])
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
ggplot(data = melt(correlation_matrix), aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "blue", high = "red") +
theme_minimal() +
labs(title = "Correlation Heatmap")
Draw a scatter plot of n_convenience vs. price_twd_msq:
library(ggplot2)
ggplot(data = taiwan_real_estate, aes(x = n_convenience, y = price_twd_msq)) +
geom_point() +
labs(x = "Number of Convenience Stores", y = "Price (TWD/msq)", title = "Scatter Plot: n_convenience vs. price_twd_msq")
Draw a scatter plot of house_age_years vs. price_twd_msq:
library(ggplot2)
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
geom_point() +
labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Scatter Plot: house_age_years vs. price_twd_msq")
Draw a scatter plot of distance to nearest MRT station vs. price_twd_msq:
library(ggplot2)
ggplot(data = taiwan_real_estate, aes(x = dist_to_mrt_m, y = price_twd_msq)) +
geom_point() +
labs(x = "Distance to Nearest MRT Station (m)", y = "Price (TWD/msq)", title = "Scatter Plot: dist_to_mrt_m vs. price_twd_msq")
Plot a histogram of price_twd_msq with 10 bins, facet the plot so each house age group gets its own panel:
library(ggplot2)
ggplot(data = taiwan_real_estate, aes(x = price_twd_msq)) +
geom_histogram(bins = 10) +
facet_wrap(~ house_age_cat, nrow = 1) +
labs(x = "Price (TWD/msq)", y = "Frequency", title = "Histogram of Price/TWD per square meter by House Age Category")
Summarize to calculate the mean, sd, median etc. house price/area by house age:
library(dplyr)
summary_stats <- taiwan_real_estate %>%
group_by(house_age_cat) %>%
summarise(mean_price = mean(price_twd_msq),
sd_price = sd(price_twd_msq),
median_price = median(price_twd_msq),
min_price = min(price_twd_msq),
max_price = max(price_twd_msq))
print(summary_stats)
## # A tibble: 3 × 6
## house_age_cat mean_price sd_price median_price min_price max_price
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 [0,15) 41.8 14.2 42.6 7.6 118.
## 2 [15,30) 32.6 11.4 32.9 11.2 59.6
## 3 [30,45] 37.7 12.8 38.3 12.2 78.3
Run a linear regression of price_twd_msq vs. best, but only 1 predictor:
library(ggplot2)
lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + #linear regression line
labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'
We start by displaying the statistical summary of the model using the R function summary():
library(ggplot2)
lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)
summary(lm_model)
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.113 -10.738 1.626 8.199 77.781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.43470 1.21098 35.042 < 2e-16 ***
## house_age_years -0.25149 0.05752 -4.372 1.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared: 0.04434, Adjusted R-squared: 0.04202
## F-statistic: 19.11 on 1 and 412 DF, p-value: 1.56e-05
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + #linear regression line
labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'
You can access lots of different aspects of the regression object. To see what’s inside, use names():
library(ggplot2)
lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)
summary(lm_model)
##
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.113 -10.738 1.626 8.199 77.781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.43470 1.21098 35.042 < 2e-16 ***
## house_age_years -0.25149 0.05752 -4.372 1.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared: 0.04434, Adjusted R-squared: 0.04202
## F-statistic: 19.11 on 1 and 412 DF, p-value: 1.56e-05
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # linear regression line
labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'
names(lm_model)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
What do they mean?
#TBD
Discuss model accuracy:
#TBD
Model diagnostics:
par(mfrow = c(2, 2))
plot(lm_model)
The four plots show…
Create the diagnostic plots using ggfortify:
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.3
autoplot(lm_model) # ?your name of the model
Outliers and high levarage points:
plot(lm_model, 5) # ?your name of the model
Influential values:
# Cook's distance
plot(lm_model, 4) # ?your name of the model
or just plot all of diagnostic plots together:
autoplot(lm_model, which = 1:6, label.size = 3) # ?your name of the model
Discussion:
to be continued next week…