Introduction

Our aim is to predict house values. Before we begin to do any analysis, we should always check whether the dataset has missing value or not, we do so by typing:

taiwan_real_estate <- read.csv("real_estates.csv",row.names=1)
attach(taiwan_real_estate)
any(is.na(taiwan_real_estate))
## [1] FALSE

Let’s take a look at structure of the data set:

glimpse(taiwan_real_estate)
## Rows: 414
## Columns: 6
## $ house.age                           <dbl> 32.0, 19.5, 13.3, 13.3, 5.0, 7.1, …
## $ distance.to.the.nearest.MRT.station <dbl> 84.87882, 306.59470, 561.98450, 56…
## $ number.of.convenience.stores        <int> 10, 9, 5, 5, 5, 3, 7, 6, 1, 3, 1, …
## $ latitude                            <dbl> 24.98298, 24.98034, 24.98746, 24.9…
## $ longitude                           <dbl> 121.5402, 121.5395, 121.5439, 121.…
## $ house.price.of.unit.area            <dbl> 37.9, 42.2, 47.3, 54.8, 43.1, 32.1…

Let’s simplify variables’ names:

taiwan_real_estate <- taiwan_real_estate %>%
rename(house_age_years = house.age, price_twd_msq = house.price.of.unit.area,
       n_convenience = number.of.convenience.stores, 
       dist_to_mrt_m = distance.to.the.nearest.MRT.station)

We can also perform binning for “house_age_years”:

#perform binning with specific number of bins
taiwan_real_estate<-taiwan_real_estate %>% mutate(house_age_cat = cut(house_age_years, breaks=c(0,15,30,45),include.lowest = T,
                                                                        right = F))

Descriptive Statistics

Prepare a heatmap with correlation coefficients on it:

non_numeric_cols <- sapply(taiwan_real_estate, function(x) !is.numeric(x))

correlation_matrix <- cor(taiwan_real_estate[!non_numeric_cols])

library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
ggplot(data = melt(correlation_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "Correlation Heatmap")

Draw a scatter plot of n_convenience vs. price_twd_msq:

library(ggplot2)

ggplot(data = taiwan_real_estate, aes(x = n_convenience, y = price_twd_msq)) +
  geom_point() +
  labs(x = "Number of Convenience Stores", y = "Price (TWD/msq)", title = "Scatter Plot: n_convenience vs. price_twd_msq")

Draw a scatter plot of house_age_years vs. price_twd_msq:

library(ggplot2)

ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
  geom_point() +
  labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Scatter Plot: house_age_years vs. price_twd_msq")

Draw a scatter plot of distance to nearest MRT station vs. price_twd_msq:

library(ggplot2)

ggplot(data = taiwan_real_estate, aes(x = dist_to_mrt_m, y = price_twd_msq)) +
  geom_point() +
  labs(x = "Distance to Nearest MRT Station (m)", y = "Price (TWD/msq)", title = "Scatter Plot: dist_to_mrt_m vs. price_twd_msq")

Plot a histogram of price_twd_msq with 10 bins, facet the plot so each house age group gets its own panel:

library(ggplot2)

ggplot(data = taiwan_real_estate, aes(x = price_twd_msq)) +
  geom_histogram(bins = 10) +
  facet_wrap(~ house_age_cat, nrow = 1) + 
  labs(x = "Price (TWD/msq)", y = "Frequency", title = "Histogram of Price/TWD per square meter by House Age Category")

Summarize to calculate the mean, sd, median etc. house price/area by house age:

library(dplyr)

summary_stats <- taiwan_real_estate %>%
  group_by(house_age_cat) %>%
  summarise(mean_price = mean(price_twd_msq),
            sd_price = sd(price_twd_msq),
            median_price = median(price_twd_msq),
            min_price = min(price_twd_msq),
            max_price = max(price_twd_msq))

print(summary_stats)
## # A tibble: 3 × 6
##   house_age_cat mean_price sd_price median_price min_price max_price
##   <fct>              <dbl>    <dbl>        <dbl>     <dbl>     <dbl>
## 1 [0,15)              41.8     14.2         42.6       7.6     118. 
## 2 [15,30)             32.6     11.4         32.9      11.2      59.6
## 3 [30,45]             37.7     12.8         38.3      12.2      78.3

Simple model

Run a linear regression of price_twd_msq vs. best, but only 1 predictor:

library(ggplot2)

lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)

ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  #linear regression line
  labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'

We start by displaying the statistical summary of the model using the R function summary():

library(ggplot2)

lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)

summary(lm_model)
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.113 -10.738   1.626   8.199  77.781 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     42.43470    1.21098  35.042  < 2e-16 ***
## house_age_years -0.25149    0.05752  -4.372 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared:  0.04434,    Adjusted R-squared:  0.04202 
## F-statistic: 19.11 on 1 and 412 DF,  p-value: 1.56e-05
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  #linear regression line
  labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'

You can access lots of different aspects of the regression object. To see what’s inside, use names():

library(ggplot2)

lm_model <- lm(price_twd_msq ~ house_age_years, data = taiwan_real_estate)

summary(lm_model)
## 
## Call:
## lm(formula = price_twd_msq ~ house_age_years, data = taiwan_real_estate)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.113 -10.738   1.626   8.199  77.781 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     42.43470    1.21098  35.042  < 2e-16 ***
## house_age_years -0.25149    0.05752  -4.372 1.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.32 on 412 degrees of freedom
## Multiple R-squared:  0.04434,    Adjusted R-squared:  0.04202 
## F-statistic: 19.11 on 1 and 412 DF,  p-value: 1.56e-05
ggplot(data = taiwan_real_estate, aes(x = house_age_years, y = price_twd_msq)) +
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # linear regression line
  labs(x = "House Age (years)", y = "Price (TWD/msq)", title = "Linear Regression: price_twd_msq vs. house_age_years")
## `geom_smooth()` using formula = 'y ~ x'

names(lm_model)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

What do they mean?

#TBD

Discuss model accuracy:

#TBD

Model diagnostics:

par(mfrow = c(2, 2))
plot(lm_model)

The four plots show…

Create the diagnostic plots using ggfortify:

library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.3
autoplot(lm_model) # ?your name of the model

Outliers and high levarage points:

plot(lm_model, 5)  # ?your name of the model

Influential values:

# Cook's distance
plot(lm_model, 4) # ?your name of the model

or just plot all of diagnostic plots together:

autoplot(lm_model, which = 1:6, label.size = 3) # ?your name of the model

Discussion:

Multiple Regression Model

to be continued next week…