Erik Yuan Rui Woon, S3905260
Oct 5, 2025
Publishing to RPubs is optional (not compulsory). If you like, you can publish your presentation to RPubs (see here) and add the link to your presentation below.
RPubs link: https://rpubs.com/s3905260/1354300
# Load the raw purchase records.
ds <- read.csv("global_house_purchase_dataset.csv")

# Recode the 0/1 purchase flag as a labelled No/Yes factor.
ds$decision <- factor(ds$decision, levels = c(0, 1), labels = c("No", "Yes"))

# Furnishing status is ordinal: Unfurnished < Semi-Furnished < Fully-Furnished.
furnishing_levels <- c("Unfurnished", "Semi-Furnished", "Fully-Furnished")
ds$furnishing_status <- factor(
  ds$furnishing_status,
  levels = furnishing_levels,
  ordered = TRUE
)

# Keep only the rows whose country is not in the exclusion list.
excluded_countries <- c(
  "Brazil", "China", "India", "Germany", "Japan", "South Africa", "UAE"
)
ds_filtered <- ds[!(ds$country %in% excluded_countries), , drop = FALSE]
# Convert local-currency prices and salaries to USD using FX quotes.
# NOTE(review): getQuote() is called at run time, so the USD figures presumably
# change between runs; cache the rates if results must be reproducible.
currency_codes <- c("AUD", "CAD", "EUR", "SGD", "GBP")
currency_pairs <- paste0(currency_codes, "USD=X")
exchange_data <- getQuote(currency_pairs)

# Look each rate up by its symbol (row name) instead of by row position, so a
# reordered or partial response cannot attach a rate to the wrong currency.
exchange_rates <- exchange_data[currency_pairs, "Last"]
names(exchange_rates) <- currency_codes
exchange_rates <- c("USD" = 1, exchange_rates)

# Fail loudly rather than letting a missing quote turn whole countries into NA.
if (anyNA(exchange_rates)) {
  stop(
    "Missing exchange rate for: ",
    paste(names(exchange_rates)[is.na(exchange_rates)], collapse = ", "),
    call. = FALSE
  )
}

country_currency <- c(
  "Australia" = "AUD", "Canada" = "CAD", "France" = "EUR",
  "Singapore" = "SGD", "UK" = "GBP", "USA" = "USD"
)

# as.character() guards against `country` being a factor (which would index by
# integer code, not by name); unname() keeps lookup names out of the columns.
ds_filtered$currency <- unname(
  country_currency[as.character(ds_filtered$country)]
)

# Surface countries with no currency mapping; their USD values will be NA.
unmapped_countries <- unique(ds_filtered$country[is.na(ds_filtered$currency)])
if (length(unmapped_countries) > 0) {
  warning(
    "No currency mapping for: ",
    paste(unmapped_countries, collapse = ", "),
    call. = FALSE
  )
}

# One rate per row, reused for both monetary columns.
row_rates <- unname(exchange_rates[ds_filtered$currency])
ds_filtered$price_usd <- ds_filtered$price * row_rates
ds_filtered$salary_usd <- ds_filtered$customer_salary * row_rates
# Per-country mean and median property price plus mean customer salary, all in
# USD; missing values are ignored in each aggregate.
country_summary <- summarise(
  group_by(ds_filtered, country),
  avg_price_usd = mean(price_usd, na.rm = TRUE),
  median_price_usd = median(price_usd, na.rm = TRUE),
  avg_salary_usd = mean(salary_usd, na.rm = TRUE)
)

# Render the summary as a table.
knitr::kable(country_summary)
country | avg_price_usd | median_price_usd | avg_salary_usd |
---|---|---|---|
Australia | 665715.5 | 666912.2 | 35767.41 |
Canada | 794184.8 | 793537.5 | 39405.12 |
France | 1562263.2 | 1567218.1 | 63955.33 |
Singapore | 1726977.8 | 1716941.5 | 42327.34 |
UK | 1710936.8 | 1714401.0 | 73586.92 |
USA | 1603145.7 | 1600616.0 | 54950.75 |
# Boxplot of USD property prices, one box per country.
# The `fill = country` mapping is overridden by the constant white fill set on
# the boxplot layer, and the legend is switched off in the theme.
ggplot(ds_filtered, aes(x = country, y = price_usd, fill = country)) +
  geom_boxplot(fill = "white", color = "black") +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Boxplot of Property Prices by Country",
    x = "Country",
    y = "Property Price (USD)"
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, size = 10),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    plot.title = element_text(size = 16)
  )
# Side-by-side bars counting "No" and "Yes" purchase decisions in each country.
ggplot(ds_filtered) +
  geom_bar(aes(x = country, fill = decision), position = "dodge") +
  labs(
    title = "Decision to Buy by Country",
    x = "Country",
    y = "Count of Decisions",
    fill = "Decision"
  ) +
  theme_minimal()
# Count properties per country, construction year and property type.
# `.groups = "drop"` returns an ungrouped result and silences the regrouping
# message that summarise() otherwise emits.
property_counts <- ds_filtered %>%
  group_by(country, constructed_year, property_type) %>%
  summarise(count = n(), .groups = "drop")

# Keep only the two property types compared in the plot.
filtered_counts <- property_counts %>%
  filter(property_type %in% c("Apartment", "Independent House"))

# One panel per country; one line per property type across construction years.
# The title names the property types actually plotted (it previously said
# "townhouses" although the filter selects "Independent House").
# `linewidth` replaces `size`, which is deprecated for line geoms.
ggplot(
  filtered_counts,
  aes(x = constructed_year, y = count, colour = property_type)
) +
  geom_line(linewidth = 0.75) +
  facet_wrap(~ country) +
  labs(
    title = "Number of apartments and independent houses built by year in each country",
    x = "Constructed Year",
    y = "Number of Properties",
    color = "Property Type"
  ) +
  theme_minimal() +
  theme(
    legend.text = element_text(size = 8),
    legend.title = element_text(size = 9),
    legend.position = "bottom"
  )
# Split prices by purchase decision for the normality check and the
# two-sample comparison that follows.
# NOTE(review): this uses the raw `price` column, which presumably mixes local
# currencies across countries; consider `price_usd` and re-running the tests.
yes_prices <- ds_filtered$price[ds_filtered$decision == "Yes"]
no_prices <- ds_filtered$price[ds_filtered$decision == "No"]

# Side-by-side normal Q-Q plots. Each panel is titled with its group and given
# a reference line so the two can be told apart and departures from normality
# are easier to judge. The previous graphics settings are restored afterwards.
old_par <- par(mfrow = c(1, 2))
qqnorm(yes_prices, main = "Normal Q-Q Plot: Decision = Yes")
qqline(yes_prices)
qqnorm(no_prices, main = "Normal Q-Q Plot: Decision = No")
qqline(no_prices)
par(old_par)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 225.64 < 2.2e-16 ***
## 92441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Welch Two Sample t-test
##
## data: yes_prices and no_prices
## t = -23.743, df = 39214, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -160893.0 -136354.8
## sample estimates:
## mean of x mean of y
## 1319175 1467799
# Make sure the grouping variable is a factor before modelling.
ds_filtered[["furnishing_status"]] <- as.factor(
  ds_filtered[["furnishing_status"]]
)

# One-way ANOVA of price across furnishing-status groups.
# NOTE(review): the response is the raw `price` column, not `price_usd` -
# confirm that comparing unconverted prices across countries is intended.
anova_result <- aov(price ~ furnishing_status, data = ds_filtered)

# Show the standard model diagnostic plots in a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## furnishing_status 2 2.453e+12 1.227e+12 1.698 0.183
## Residuals 92440 6.679e+16 7.225e+11
# Chi-squared test of association between furnishing status and the purchase
# decision, based on their cross-tabulated counts.
furnishing_decision_table <- table(
  ds_filtered[["furnishing_status"]],
  ds_filtered[["decision"]]
)
chi_result <- chisq.test(furnishing_decision_table)
chi_result
##
## Pearson's Chi-squared test
##
## data: furnishing_decision_table
## X-squared = 4.81, df = 2, p-value = 0.09026
# Chi-squared test of association between country and the purchase decision,
# based on their cross-tabulated counts.
country_decision_table <- table(
  ds_filtered[["country"]],
  ds_filtered[["decision"]]
)
chi_result <- chisq.test(country_decision_table)
chi_result
##
## Pearson's Chi-squared test
##
## data: country_decision_table
## X-squared = 487.55, df = 5, p-value < 2.2e-16
# Chi-squared test of association between property type and furnishing status,
# based on their cross-tabulated counts.
property_furnishing_table <- table(
  ds_filtered[["property_type"]],
  ds_filtered[["furnishing_status"]]
)
chi_result <- chisq.test(property_furnishing_table)
chi_result
##
## Pearson's Chi-squared test
##
## data: property_furnishing_table
## X-squared = 7.8341, df = 10, p-value = 0.645