An analysis of global housing market trends

Focusing specifically on Australia, Canada, France, Singapore, UK and USA

Erik Yuan Rui Woon, S3905260

Oct 5, 2025

Introduction

Introduction cont.

Problem Statement

Data

Data cont.

# Load the raw purchase records and recode the categorical columns.
ds <- read.csv("global_house_purchase_dataset.csv")
ds$decision <- factor(ds$decision, levels = c(0, 1), labels = c("No", "Yes"))
ds$furnishing_status <- factor(
  ds$furnishing_status,
  levels = c("Unfurnished", "Semi-Furnished", "Fully-Furnished"),
  ordered = TRUE
)

# Countries under study, mapped to the currency used for conversion.
# NOTE(review): this assumes `price` and `customer_salary` are recorded in each
# country's local currency -- confirm against the dataset documentation.
country_currency <- c(
  "Australia" = "AUD", "Canada" = "CAD", "France" = "EUR",
  "Singapore" = "SGD", "UK" = "GBP", "USA" = "USD"
)

# Keep exactly the countries we have a currency for, so no row can end up
# without a conversion rate.
ds_filtered <- subset(ds, country %in% names(country_currency))

# Fetch the latest <currency>/USD quotes.
# NOTE(review): these are live quotes, so the USD figures change from run to
# run; pin the rates to a fixed date if the report must be reproducible.
currency_codes <- c("AUD", "CAD", "EUR", "SGD", "GBP")
currency_pairs <- paste0(currency_codes, "USD=X")

exchange_data <- getQuote(currency_pairs)

# Look each rate up by its quote symbol rather than trusting the row order of
# the returned data frame.
exchange_rates <- exchange_data$Last[
  match(currency_pairs, rownames(exchange_data))
]
names(exchange_rates) <- currency_codes
exchange_rates <- c("USD" = 1, exchange_rates)

if (anyNA(exchange_rates)) {
  stop(
    "Missing exchange rate for: ",
    paste(names(exchange_rates)[is.na(exchange_rates)], collapse = ", "),
    call. = FALSE
  )
}

# as.character() guards against `country` being a factor, which would index
# the lookup by integer code instead of by name. unname() keeps the lookup
# names out of the stored columns.
ds_filtered$currency <- unname(
  country_currency[as.character(ds_filtered$country)]
)
rate_to_usd <- unname(exchange_rates[ds_filtered$currency])
ds_filtered$price_usd <- ds_filtered$price * rate_to_usd
ds_filtered$salary_usd <- ds_filtered$customer_salary * rate_to_usd

Descriptive Statistics and Visualisations

# Per-country mean and median price plus mean salary, all in USD.
country_summary <- summarise(
  group_by(ds_filtered, country),
  avg_price_usd = mean(price_usd, na.rm = TRUE),
  median_price_usd = median(price_usd, na.rm = TRUE),
  avg_salary_usd = mean(salary_usd, na.rm = TRUE)
)

# Render the summary as a table on the slide.
knitr::kable(country_summary)
country avg_price_usd median_price_usd avg_salary_usd
Australia 665715.5 666912.2 35767.41
Canada 794184.8 793537.5 39405.12
France 1562263.2 1567218.1 63955.33
Singapore 1726977.8 1716941.5 42327.34
UK 1710936.8 1714401.0 73586.92
USA 1603145.7 1600616.0 54950.75

Descriptive Statistics and Visualisations cont.

# Boxplot: USD property price distribution for each country.
# The boxes are drawn white with black outlines, so no fill mapping (and hence
# no legend) is needed.
ggplot(ds_filtered, aes(x = country, y = price_usd)) +
  geom_boxplot(fill = "white", color = "black") +
  scale_y_continuous(labels = scales::dollar) +
  labs(
    title = "Boxplot of Property Prices by Country",
    x = "Country",
    y = "Property Price (USD)"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, size = 10),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    plot.title = element_text(size = 16)
  )

Descriptive Statistics and Visualisations cont.

# Side-by-side bars: number of "No" and "Yes" purchase decisions per country.
ggplot(data = ds_filtered, mapping = aes(x = country, fill = decision)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Decision to Buy by Country",
    x = "Country",
    y = "Count of Decisions",
    fill = "Decision"
  ) +
  theme_minimal()

Descriptive Statistics and Visualisations cont.

# Count properties per country, construction year and property type.
# .groups = "drop" returns an ungrouped result and silences the regrouping
# message summarise() would otherwise print.
property_counts <- ds_filtered %>%
  group_by(country, constructed_year, property_type) %>%
  summarise(count = n(), .groups = "drop")

# Only apartments and independent houses are compared in this chart.
filtered_counts <- property_counts %>%
  filter(property_type %in% c("Apartment", "Independent House"))

# One panel per country; one line per property type.
# The title names the two types actually plotted (the filter above selects
# "Independent House", not townhouses).
ggplot(
  filtered_counts,
  aes(x = constructed_year, y = count, colour = property_type)
) +
  geom_line(linewidth = 0.75) +
  facet_wrap(~ country) +
  labs(
    title = paste(
      "Number of apartments and independent houses",
      "built by year in each country"
    ),
    x = "Constructed Year",
    y = "Number of Properties",
    colour = "Property Type"
  ) +
  theme_minimal() +
  theme(
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 9),
    legend.position = "bottom"
  )

Hypothesis testing: Welch 2-sample t-test

# Split prices by purchase decision for the two-sample comparison.
# NOTE(review): this uses the raw `price` column, not `price_usd`. If `price`
# is in local currency, values from different countries are not on a common
# scale -- confirm whether `price_usd` was intended here.
yes_prices <- ds_filtered$price[ds_filtered$decision == "Yes"]
no_prices <- ds_filtered$price[ds_filtered$decision == "No"]

# Normality check: side-by-side Q-Q plots, each labelled with its group and
# given a reference line. The previous graphics settings are restored
# afterwards so later plots are not forced into the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2))
qqnorm(yes_prices, main = "Normal Q-Q Plot: decision = Yes")
qqline(yes_prices)
qqnorm(no_prices, main = "Normal Q-Q Plot: decision = No")
qqline(no_prices)
par(old_par)

Welch 2-sample t-test cont.

# Levene's test (median-centred) for equal price variance across decisions.
leveneTest(price ~ decision, data = ds_filtered, center = median)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     1  225.64 < 2.2e-16 ***
##       92441                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Welch two-sample t-test: unequal variances, two-sided alternative.
t.test(yes_prices, no_prices, alternative = "two.sided", var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  yes_prices and no_prices
## t = -23.743, df = 39214, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -160893.0 -136354.8
## sample estimates:
## mean of x mean of y 
##   1319175   1467799

Regression analysis: One-way ANOVA test

# aov() needs a factor predictor; as.factor() leaves an existing factor as is.
ds_filtered[["furnishing_status"]] <-
  as.factor(ds_filtered[["furnishing_status"]])

# One-way ANOVA of price across furnishing levels.
# NOTE(review): the response is the raw `price` column, not `price_usd` --
# confirm this is intended, given that prices span several countries.
anova_result <- aov(price ~ furnishing_status, data = ds_filtered)

# Show the model's diagnostic plots together on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(anova_result)

# ANOVA table.
summary(anova_result)
##                      Df    Sum Sq   Mean Sq F value Pr(>F)
## furnishing_status     2 2.453e+12 1.227e+12   1.698  0.183
## Residuals         92440 6.679e+16 7.225e+11

Categorical association

# Contingency table: furnishing status (rows) by purchase decision (columns).
furnishing_decision_table <- table(
  ds_filtered[["furnishing_status"]],
  ds_filtered[["decision"]]
)

# Pearson chi-squared test of independence on that table.
chi_result <- chisq.test(furnishing_decision_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  furnishing_decision_table
## X-squared = 4.81, df = 2, p-value = 0.09026

Categorical association cont.

# Contingency table: country (rows) by purchase decision (columns).
country_decision_table <- table(
  ds_filtered[["country"]],
  ds_filtered[["decision"]]
)

# Pearson chi-squared test of independence on that table.
chi_result <- chisq.test(country_decision_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  country_decision_table
## X-squared = 487.55, df = 5, p-value < 2.2e-16

Categorical association cont.

# Contingency table: property type (rows) by furnishing status (columns).
property_furnishing_table <- table(
  ds_filtered[["property_type"]],
  ds_filtered[["furnishing_status"]]
)

# Pearson chi-squared test of independence on that table.
chi_result <- chisq.test(property_furnishing_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  property_furnishing_table
## X-squared = 7.8341, df = 10, p-value = 0.645

Discussion

References