Erik Yuan Rui Woon, S3905260
Oct 5, 2025
Publishing is optional, but if you like you can publish your presentation to RPubs (see here) and add the link to your presentation here.
RPubs link: https://rpubs.com/s3905260/1352844
# Load the raw purchase data set.
ds <- read.csv("global_house_purchase_dataset.csv")

# Recode the 0/1 decision flag as a labelled factor.
ds$decision <- factor(ds$decision, levels = c(0, 1), labels = c("No", "Yes"))

# Furnishing status is ordinal: Unfurnished < Semi-Furnished < Fully-Furnished.
furnishing_levels <- c("Unfurnished", "Semi-Furnished", "Fully-Furnished")
ds$furnishing_status <- factor(
  ds$furnishing_status,
  levels = furnishing_levels,
  ordered = TRUE
)

# Drop the countries that are excluded from the analysis; every other row is
# kept unchanged.
excluded_countries <- c(
  "Brazil", "China", "India", "Germany", "Japan", "South Africa", "UAE"
)
ds_filtered <- ds[!(ds$country %in% excluded_countries), , drop = FALSE]
# Convert local-currency prices and salaries to USD.
#
# NOTE: getQuote() fetches the latest market quote, so the USD columns (and
# every statistic derived from them) change from one run to the next.
currency_codes <- c("AUD", "CAD", "EUR", "SGD", "GBP")
currency_pairs <- paste0(currency_codes, "USD=X")
exchange_data <- getQuote(currency_pairs)

# Look each rate up by its ticker instead of relying on the row order of the
# response, so a reordered or partial reply cannot silently mislabel a rate.
# NOTE(review): assumes getQuote() uses the requested tickers as row names --
# confirm against the installed quantmod version.
exchange_rates <- exchange_data[currency_pairs, "Last"]
names(exchange_rates) <- currency_codes
if (anyNA(exchange_rates)) {
  stop(
    "Missing exchange rate for: ",
    paste(currency_codes[is.na(exchange_rates)], collapse = ", "),
    call. = FALSE
  )
}
exchange_rates <- c("USD" = 1, exchange_rates)

country_currency <- c(
  "Australia" = "AUD", "Canada" = "CAD", "France" = "EUR",
  "Singapore" = "SGD", "UK" = "GBP", "USA" = "USD"
)

# as.character() guards against `country` being a factor, in which case the
# lookup would use the integer level codes rather than the country names.
# unname() keeps the lookup names out of the data-frame columns.
ds_filtered$currency <- unname(
  country_currency[as.character(ds_filtered$country)]
)
if (anyNA(ds_filtered$currency)) {
  warning(
    "Some rows have a country with no currency mapping; ",
    "their USD values will be NA.",
    call. = FALSE
  )
}

rate_per_row <- unname(exchange_rates[ds_filtered$currency])
ds_filtered$price_usd <- ds_filtered$price * rate_per_row
ds_filtered$salary_usd <- ds_filtered$customer_salary * rate_per_row
# Per-country mean and median property price, and mean buyer salary, in USD.
# Missing values are ignored in each aggregate.
grouped_by_country <- group_by(ds_filtered, country)
country_summary <- summarise(
  grouped_by_country,
  avg_price_usd = mean(price_usd, na.rm = TRUE),
  median_price_usd = median(price_usd, na.rm = TRUE),
  avg_salary_usd = mean(salary_usd, na.rm = TRUE)
)
knitr::kable(country_summary)
country | avg_price_usd | median_price_usd | avg_salary_usd |
---|---|---|---|
Australia | 676000.4 | 677215.5 | 36319.99 |
Canada | 797665.1 | 797015.0 | 39577.80 |
France | 1562081.5 | 1567035.8 | 63947.89 |
Singapore | 1728150.8 | 1718107.7 | 42356.09 |
UK | 1718025.9 | 1721504.5 | 73891.82 |
USA | 1603145.7 | 1600616.0 | 54950.75 |
# Histogram: distribution of property size, in bins of 100 sqft.
property_size_histogram <- ggplot(ds_filtered, aes(x = property_size_sqft)) +
  geom_histogram(binwidth = 100, fill = "steelblue", color = "white") +
  ggtitle("Distribution of Property Size (sqft)") +
  xlab("Property Size (sqft)") +
  ylab("Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
property_size_histogram
# Boxplot: property price (USD) by country.
# The boxes are drawn with a constant white fill, so no fill aesthetic is
# mapped and there is no legend to suppress.
ggplot(ds_filtered, aes(x = country, y = price_usd)) +
  geom_boxplot(fill = "white", color = "black") +
  labs(
    title = "Boxplot of Property Prices by Country",
    x = "Country",
    y = "Property Price (USD)"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_classic() +
  theme(
    # hjust/vjust keep the rotated labels aligned with their tick marks.
    axis.text.x = element_text(angle = 90, size = 10, hjust = 1, vjust = 0.5),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    plot.title = element_text(size = 16)
  )
# Pearson correlation test: number of previous owners vs. property price (USD).
# The test settings below are the cor.test() defaults, spelled out explicitly.
cor.test(
  x = ds_filtered$previous_owners,
  y = ds_filtered$price_usd,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: ds_filtered$previous_owners and ds_filtered$price_usd
## t = -0.17441, df = 92441, p-value = 0.8615
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.007019945 0.005872699
## sample estimates:
## cor
## -0.0005736468
## function (object, parm, level = 0.95, ...)
## UseMethod("confint")
## <bytecode: 0x000001e9a3fb9118>
## <environment: namespace:stats>
# Simple linear regression between number of previous owners and property
# price (USD).
# NOTE(review): the formula uses previous_owners as the response and price_usd
# as the predictor, the reverse of the price-as-outcome form used for model2
# -- confirm this direction is intended.
model1 <- lm(formula = previous_owners ~ price_usd, data = ds_filtered)
summary(model1)
##
## Call:
## lm(formula = previous_owners ~ price_usd, data = ds_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.01115 -2.00845 -0.00961 1.98967 2.99319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.011e+00 1.248e-02 241.286 <2e-16 ***
## price_usd -1.373e-09 7.873e-09 -0.174 0.862
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2 on 92441 degrees of freedom
## Multiple R-squared: 3.291e-07, Adjusted R-squared: -1.049e-05
## F-statistic: 0.03042 on 1 and 92441 DF, p-value: 0.8615
# Pearson correlation test: buyer salary (USD) vs. property price (USD).
# The test settings below are the cor.test() defaults, spelled out explicitly.
cor.test(
  x = ds_filtered$salary_usd,
  y = ds_filtered$price_usd,
  alternative = "two.sided",
  method = "pearson",
  conf.level = 0.95
)
##
## Pearson's product-moment correlation
##
## data: ds_filtered$salary_usd and ds_filtered$price_usd
## t = 52.626, df = 92441, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1642857 0.1768033
## sample estimates:
## cor
## 0.1705514
# Simple linear regression of property price (USD) on buyer salary (USD).
model2 <- lm(formula = price_usd ~ salary_usd, data = ds_filtered)
summary(model2)
##
## Call:
## lm(formula = price_usd ~ salary_usd, data = ds_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1524283 -670031 -162244 628289 2107550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.091e+06 5.573e+03 195.72 <2e-16 ***
## salary_usd 4.941e+00 9.388e-02 52.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 823400 on 92441 degrees of freedom
## Multiple R-squared: 0.02909, Adjusted R-squared: 0.02908
## F-statistic: 2769 on 1 and 92441 DF, p-value: < 2.2e-16
## Df Sum Sq Mean Sq F value Pr(>F)
## country 5 1.767e+16 3.533e+15 6967 <2e-16 ***
## Residuals 92437 4.688e+16 5.072e+11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = price_usd ~ country, data = ds_filtered)
##
## $country
## diff lwr upr p adj
## Canada-Australia 121664.72 98552.76 144776.68 0.0000000
## France-Australia 886081.12 863053.36 909108.89 0.0000000
## Singapore-Australia 1052150.47 1028991.98 1075308.97 0.0000000
## UK-Australia 1042025.53 1018918.08 1065132.98 0.0000000
## USA-Australia 927145.35 903988.00 950302.70 0.0000000
## France-Canada 764416.40 741373.22 787459.58 0.0000000
## Singapore-Canada 930485.75 907311.94 953659.57 0.0000000
## UK-Canada 920360.81 897238.00 943483.62 0.0000000
## USA-Canada 805480.63 782307.95 828653.30 0.0000000
## Singapore-France 166069.35 142979.50 189159.20 0.0000000
## UK-France 155944.41 132905.74 178983.07 0.0000000
## USA-France 41064.23 17975.52 64152.93 0.0000060
## UK-Singapore -10124.94 -33294.27 13044.38 0.8144291
## USA-Singapore -125005.12 -148224.21 -101786.04 0.0000000
## USA-UK -114880.18 -138048.36 -91712.00 0.0000000
# Chi-squared test of independence: furnishing status vs. decision.
furnishing_decision_table <- table(
  ds_filtered[["furnishing_status"]],
  ds_filtered[["decision"]]
)
chi_result <- chisq.test(furnishing_decision_table)
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: furnishing_decision_table
## X-squared = 4.81, df = 2, p-value = 0.09026
# Chi-squared test of independence: country vs. decision.
country_decision_table <- table(
  ds_filtered[["country"]],
  ds_filtered[["decision"]]
)
chi_result <- chisq.test(country_decision_table)
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: country_decision_table
## X-squared = 487.55, df = 5, p-value < 2.2e-16
# Chi-squared test of independence: property type vs. furnishing status.
property_furnishing_table <- table(
  ds_filtered[["property_type"]],
  ds_filtered[["furnishing_status"]]
)
chi_result <- chisq.test(property_furnishing_table)
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: property_furnishing_table
## X-squared = 7.8341, df = 10, p-value = 0.645