An analysis of global housing market trends

Focusing specifically on Australia, Canada, France, Singapore, UK and USA

Erik Yuan Rui Woon, S3905260

Oct 5, 2025

Introduction

Introduction cont.

Problem Statement

Data

Data cont.

# ---- Load the raw data and prepare the analysis data set ----
# Read the purchase records and turn the coded columns into factors:
# `decision` (0/1) becomes No/Yes, `furnishing_status` becomes an ordered factor.
ds <- read.csv("global_house_purchase_dataset.csv")
ds$decision <- factor(ds$decision, levels = c(0, 1), labels = c("No", "Yes"))
ds$furnishing_status <- factor(
  ds$furnishing_status,
  levels = c("Unfurnished", "Semi-Furnished", "Fully-Furnished"),
  ordered = TRUE
)

# Local currency of every country in scope. The names of this vector are also
# the list of countries kept, so the filter and the conversion cannot drift
# apart (a country without a currency entry would otherwise get NA prices).
country_currency <- c(
  "Australia" = "AUD", "Canada" = "CAD", "France" = "EUR",
  "Singapore" = "SGD", "UK" = "GBP", "USA" = "USD"
)
ds_filtered <- subset(ds, country %in% names(country_currency))

# Quote tickers for one unit of each foreign currency expressed in USD,
# listed in the same order as `currency_codes`.
currency_pairs <- c("AUDUSD=X", "CADUSD=X", "EURUSD=X", "SGDUSD=X", "GBPUSD=X")
currency_codes <- c("AUD", "CAD", "EUR", "SGD", "GBP")

# NOTE(review): rates are fetched live, so results change from run to run.
exchange_data <- getQuote(currency_pairs)

# Look each rate up by ticker rather than by row position.
# Assumes getQuote() labels rows with the requested tickers - TODO confirm.
exchange_rates <- exchange_data[currency_pairs, "Last"]

# Fail loudly on a missing quote; otherwise the NA would flow into the USD
# columns and be silently dropped by the na.rm = TRUE summaries further down.
stopifnot(
  is.numeric(exchange_rates),
  length(exchange_rates) == length(currency_codes),
  !anyNA(exchange_rates)
)
names(exchange_rates) <- currency_codes

# USD needs no conversion
exchange_rates <- c("USD" = 1, exchange_rates)

# as.character() guards against `country` being a factor (indexing a named
# vector with a factor uses the integer codes, not the labels); unname() keeps
# the new columns as plain vectors.
ds_filtered$currency <- unname(country_currency[as.character(ds_filtered$country)])
rate_per_row <- unname(exchange_rates[ds_filtered$currency])
ds_filtered$price_usd <- ds_filtered$price * rate_per_row
ds_filtered$salary_usd <- ds_filtered$customer_salary * rate_per_row

Descriptive Statistics and Visualisations

# Per-country mean and median property price and mean buyer salary (all in USD),
# rendered as a table.
country_summary <- summarise(
  group_by(ds_filtered, country),
  avg_price_usd = mean(price_usd, na.rm = TRUE),
  median_price_usd = median(price_usd, na.rm = TRUE),
  avg_salary_usd = mean(salary_usd, na.rm = TRUE)
)
knitr::kable(country_summary)
country avg_price_usd median_price_usd avg_salary_usd
Australia 676000.4 677215.5 36319.99
Canada 797665.1 797015.0 39577.80
France 1562081.5 1567035.8 63947.89
Singapore 1728150.8 1718107.7 42356.09
UK 1718025.9 1721504.5 73891.82
USA 1603145.7 1600616.0 54950.75

Descriptive Statistics and Visualisations cont.

# Histogram of property sizes in 100 sqft bins, with a centred title
ggplot(ds_filtered, aes(x = property_size_sqft)) +
  geom_histogram(binwidth = 100, colour = "white", fill = "steelblue") +
  ggtitle("Distribution of Property Size (sqft)") +
  xlab("Property Size (sqft)") +
  ylab("Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Descriptive Statistics and Visualisations cont.

# Boxplot of USD property prices, one box per country.
# The boxplot layer sets a constant white fill, so the `fill = country`
# mapping is not visible in the boxes themselves.
ggplot(ds_filtered, aes(x = country, y = price_usd, fill = country)) +
  geom_boxplot(colour = "black", fill = "white") +
  labs(
    title = "Boxplot of Property Prices by Country",
    x = "Country",
    y = "Property Price (USD)"
  ) +
  # Show prices with thousands separators rather than scientific notation
  scale_y_continuous(labels = scales::comma) +
  theme_classic() +
  theme(
    plot.title = element_text(size = 16),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(angle = 90, size = 10),
    legend.position = "none"
  )

Regression analysis

# Pearson correlation between the number of previous owners and the
# property price in USD
cor.test(
  x = ds_filtered$previous_owners,
  y = ds_filtered$price_usd,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  ds_filtered$previous_owners and ds_filtered$price_usd
## t = -0.17441, df = 92441, p-value = 0.8615
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.007019945  0.005872699
## sample estimates:
##           cor 
## -0.0005736468
# Linear regression model to analyse the relationship between previous owners
# and property price.
# NOTE(review): previous_owners is the response and price_usd the predictor
# here, while the salary model in this file uses price_usd as the response -
# confirm this direction is intended.
model1 <- lm(previous_owners ~ price_usd, data = ds_filtered)
model1 %>% summary()
## 
## Call:
## lm(formula = previous_owners ~ price_usd, data = ds_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.01115 -2.00845 -0.00961  1.98967  2.99319 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.011e+00  1.248e-02 241.286   <2e-16 ***
## price_usd   -1.373e-09  7.873e-09  -0.174    0.862    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2 on 92441 degrees of freedom
## Multiple R-squared:  3.291e-07,  Adjusted R-squared:  -1.049e-05 
## F-statistic: 0.03042 on 1 and 92441 DF,  p-value: 0.8615

Regression analysis cont.

# Pearson correlation between buyer salary and property price (both in USD)
cor.test(
  x = ds_filtered$salary_usd,
  y = ds_filtered$price_usd,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  ds_filtered$salary_usd and ds_filtered$price_usd
## t = 52.626, df = 92441, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1642857 0.1768033
## sample estimates:
##       cor 
## 0.1705514
# Simple linear regression of property price on buyer salary (both in USD)
model2 <- lm(formula = price_usd ~ salary_usd, data = ds_filtered)
summary(model2)
## 
## Call:
## lm(formula = price_usd ~ salary_usd, data = ds_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1524283  -670031  -162244   628289  2107550 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.091e+06  5.573e+03  195.72   <2e-16 ***
## salary_usd  4.941e+00  9.388e-02   52.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 823400 on 92441 degrees of freedom
## Multiple R-squared:  0.02909,    Adjusted R-squared:  0.02908 
## F-statistic:  2769 on 1 and 92441 DF,  p-value: < 2.2e-16

Regression analysis cont.

# One-way ANOVA of USD property price with country as the grouping factor
aov_model <- aov(formula = price_usd ~ country, data = ds_filtered)
aov_model %>% summary()
##                Df    Sum Sq   Mean Sq F value Pr(>F)    
## country         5 1.767e+16 3.533e+15    6967 <2e-16 ***
## Residuals   92437 4.688e+16 5.072e+11                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise country comparisons of mean price at the 95% family-wise level
TukeyHSD(aov_model, which = "country", conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = price_usd ~ country, data = ds_filtered)
## 
## $country
##                           diff        lwr        upr     p adj
## Canada-Australia     121664.72   98552.76  144776.68 0.0000000
## France-Australia     886081.12  863053.36  909108.89 0.0000000
## Singapore-Australia 1052150.47 1028991.98 1075308.97 0.0000000
## UK-Australia        1042025.53 1018918.08 1065132.98 0.0000000
## USA-Australia        927145.35  903988.00  950302.70 0.0000000
## France-Canada        764416.40  741373.22  787459.58 0.0000000
## Singapore-Canada     930485.75  907311.94  953659.57 0.0000000
## UK-Canada            920360.81  897238.00  943483.62 0.0000000
## USA-Canada           805480.63  782307.95  828653.30 0.0000000
## Singapore-France     166069.35  142979.50  189159.20 0.0000000
## UK-France            155944.41  132905.74  178983.07 0.0000000
## USA-France            41064.23   17975.52   64152.93 0.0000060
## UK-Singapore         -10124.94  -33294.27   13044.38 0.8144291
## USA-Singapore       -125005.12 -148224.21 -101786.04 0.0000000
## USA-UK              -114880.18 -138048.36  -91712.00 0.0000000

Categorical association

# Cross-tabulate furnishing status against the purchase decision and run
# Pearson's chi-squared test on the resulting contingency table
furnishing_decision_table <- table(
  ds_filtered$furnishing_status,
  ds_filtered$decision
)
chi_result <- chisq.test(furnishing_decision_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  furnishing_decision_table
## X-squared = 4.81, df = 2, p-value = 0.09026

Categorical association cont.

# Cross-tabulate country against the purchase decision and run Pearson's
# chi-squared test on the resulting contingency table
country_decision_table <- table(
  ds_filtered$country,
  ds_filtered$decision
)
chi_result <- chisq.test(country_decision_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  country_decision_table
## X-squared = 487.55, df = 5, p-value < 2.2e-16

Categorical association cont.

# Cross-tabulate property type against furnishing status and run Pearson's
# chi-squared test on the resulting contingency table
property_furnishing_table <- table(
  ds_filtered$property_type,
  ds_filtered$furnishing_status
)
chi_result <- chisq.test(property_furnishing_table)
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  property_furnishing_table
## X-squared = 7.8341, df = 10, p-value = 0.645

Discussion

References