Adriana Batrisyia binti Amran, U2101269
Danish Hazrin bin Saipul
Ekmal, U2100841
Muhammad Alif Daniel bin Mohd Nazri,
U2101945
Muhammad Asyraf Zulqarnain bin Sha’ari,
U2100944
Muhammad Azamuddin Nasution Bin Raduan, S2166723
2023-05-03
# Load the datasets for individual cities
ams_wd <- read_csv("amsterdam_weekdays.csv"); ath_wd <- read_csv("athens_weekdays.csv");
bcn_wd <- read_csv("barcelona_weekdays.csv"); ber_wd <- read_csv("berlin_weekdays.csv");
bud_wd <- read_csv("budapest_weekdays.csv"); lis_wd <- read_csv("lisbon_weekdays.csv");
lon_wd <- read_csv("london_weekdays.csv"); par_wd <- read_csv("paris_weekdays.csv");
rom_wd <- read_csv("rome_weekdays.csv"); vie_wd <- read_csv("vienna_weekdays.csv")
# Label the datasets with city name
ams_wd$city <- c(rep("amsterdam", times = nrow(ams_wd))); ath_wd$city <- c(rep("athens", times = nrow(ath_wd)));
bcn_wd$city <- c(rep("barcelona", times = nrow(bcn_wd))); ber_wd$city <- c(rep("berlin", times = nrow(ber_wd)));
bud_wd$city <- c(rep("budapest", times = nrow(bud_wd))); lis_wd$city <- c(rep("lisbon", times = nrow(lis_wd)));
lon_wd$city <- c(rep("london", times = nrow(lon_wd))); par_wd$city <- c(rep("paris", times = nrow(par_wd)));
rom_wd$city <- c(rep("rome", times = nrow(rom_wd))); vie_wd$city <- c(rep("vienna", times = nrow(vie_wd)))
# Combine all datasets and name it eu_cities_wd
eu_cities_wd <- rbind(ams_wd,ath_wd,bcn_wd,ber_wd,bud_wd,lis_wd,lon_wd,par_wd,rom_wd,vie_wd) Data
eu_cities_wd contains information about Airbnb prices
from 10 cities during weekdays.Method
eu_cities_wd.city to eu_cities_wd to
ensure that the combined dataset remains distinct.eu_cities_wd_medians <- eu_cities_wd %>% group_by(dist, city) %>%
summarize(median_price = median(realSum))
ggplot(eu_cities_wd_medians, aes(x = dist, y = median_price, color = city)) +
geom_point() + geom_smooth(method = "lm", color = "black") +
labs(x = "Distance from City Center (km)", y = "Median Airbnb Price (EUR)",
title = "Price and Distance from City Center") + theme_bw()## Analysis of Variance Table
##
## Response: realSum
## Df Sum Sq Mean Sq F value Pr(>F)
## dist 1 4745755 4745755 42.108 8.795e-11 ***
## city 9 301751167 33527907 297.487 < 2.2e-16 ***
## dist:city 9 22292412 2476935 21.977 < 2.2e-16 ***
## Residuals 25480 2871693843 112704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
relevant_intervals to filter
outliersintervals <- (cut(eu_cities_wd$realSum, breaks = seq(0, 30000, by = 500)))
relevant_intervals <- as.data.frame(table(intervals))
relevant_intervals$percentage <- (relevant_intervals$Freq/sum(relevant_intervals$Freq))*100
head(relevant_intervals)## intervals Freq percentage
## 1 (0,500] 23066 90.45490196
## 2 (500,1e+03] 2080 8.15686275
## 3 (1e+03,1.5e+03] 230 0.90196078
## 4 (1.5e+03,2e+03] 60 0.23529412
## 5 (2e+03,2.5e+03] 25 0.09803922
## 6 (2.5e+03,3e+03] 12 0.04705882
filtered_cities <- eu_cities_wd %>% filter(realSum <= 1000)
ggplot(filtered_cities, aes(x = dist, y = realSum, color = city)) +
geom_point() + geom_smooth(method = "lm", color = "black") +
labs(x = "Distance from City Center (km)", y = "Median Airbnb Price (EUR)",
title = "Price and Distance from City Center") +
theme_bw() + facet_wrap(~city, ncol = 5) guest_rating <- eu_cities_wd %>% group_by(guest_satisfaction_overall) %>%
summarise(mean_beds = mean(bedrooms), mean_clean = mean(cleanliness_rating)) %>% arrange(-guest_satisfaction_overall)
head(guest_rating)## # A tibble: 6 × 3
## guest_satisfaction_overall mean_beds mean_clean
## <dbl> <dbl> <dbl>
## 1 100 1.21 9.85
## 2 99 1.20 9.97
## 3 98 1.21 9.91
## 4 97 1.17 9.83
## 5 96 1.17 9.74
## 6 95 1.17 9.65
clean_model <- lm(guest_satisfaction_overall ~ cleanliness_rating, data = eu_cities_wd)
bedroom_model <- lm(guest_satisfaction_overall ~ bedrooms, data = eu_cities_wd)
summary(clean_model)$r.squared; summary(bedroom_model)$r.squared## [1] 0.5114414
## [1] 0.001842468
lm_model <- lm(guest_satisfaction_overall ~ cleanliness_rating + bedrooms, data = eu_cities_wd)
summary(lm_model)##
## Call:
## lm(formula = guest_satisfaction_overall ~ cleanliness_rating +
## bedrooms, data = eu_cities_wd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.944 -2.217 1.076 3.368 37.029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.24429 0.39238 74.531 < 2e-16 ***
## cleanliness_rating 6.70763 0.04109 163.239 < 2e-16 ***
## bedrooms 0.31154 0.06184 5.038 4.73e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.263 on 25497 degrees of freedom
## Multiple R-squared: 0.5119, Adjusted R-squared: 0.5119
## F-statistic: 1.337e+04 on 2 and 25497 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: guest_satisfaction_overall
## Df Sum Sq Mean Sq F value Pr(>F)
## cleanliness_rating 1 1048039 1048039 26717.791 < 2.2e-16 ***
## bedrooms 1 996 996 25.384 4.73e-07 ***
## Residuals 25497 1000152 39
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
a <- ggplot(eu_cities_wd, aes(x = cleanliness_rating, y = guest_satisfaction_overall)) +
geom_point(color="red") + geom_smooth(method = "lm", se = TRUE, color = "black") +
labs(x = "Cleanliness Rating", y = "Overall Satisfaction") +
ggtitle("Cleanliness Rating and Overall Satisfaction") +
theme_bw()
b <- ggplot(eu_cities_wd, aes(x = bedrooms, y = guest_satisfaction_overall)) +
geom_point(color="blue") + geom_smooth(method = "lm", se = TRUE, color = "black") +
labs(x = "No. of Bedrooms", y = "Overall Satisfaction") +
ggtitle("No. of Bedrooms and Overall Satisfaction") +
theme_bw()
grid.arrange(a,b, ncol = 2)eu_cities_ch <- eu_cities_wd %>% group_by(city) %>%
summarise(median_ratings = median(guest_satisfaction_overall),
mean_price = mean(realSum), median_price = median(realSum),
n = n(), percentage = n/nrow(eu_cities_wd)*100) %>%
arrange(desc(median_ratings), median_price)
head(eu_cities_ch)## # A tibble: 6 × 6
## city median_ratings mean_price median_price n percentage
## <chr> <dbl> <dbl> <dbl> <int> <dbl>
## 1 athens 97 156. 128. 2653 10.4
## 2 budapest 96 168. 147. 2074 8.13
## 3 berlin 96 240. 188. 1284 5.04
## 4 vienna 96 240. 205. 1738 6.82
## 5 amsterdam 96 545. 430. 1103 4.33
## 6 rome 95 202. 180. 4492 17.6
cheapest_cities featuring the 4 cheapest
citiescheapest_cities <- rbind(ath_wd, bud_wd, ber_wd, vie_wd) %>% filter(realSum < 250)
cheapest_cities$city <- as.factor(cheapest_cities$city)
cheapest_cities %>% group_by(city) %>%
summarise(mean_price = mean(realSum),median_price = median(realSum),
sd = sd(realSum), n=n(), percentage = n/nrow(cheapest_cities)*100)## # A tibble: 4 × 6
## city mean_price median_price sd n percentage
## <fct> <dbl> <dbl> <dbl> <int> <dbl>
## 1 athens 130. 124. 43.5 2421 37.7
## 2 berlin 163. 158. 41.4 892 13.9
## 3 budapest 146. 143. 41.5 1884 29.3
## 4 vienna 177. 180. 42.0 1226 19.1
## Analysis of Variance Table
##
## Response: realSum
## Df Sum Sq Mean Sq F value Pr(>F)
## city 3 2031454 677151 377.8 < 2.2e-16 ***
## Residuals 6419 11505201 1792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(cheapest_cities, aes(x = city, y = realSum, fill = city)) + geom_boxplot() +
labs(title = "Distribution of Airbnb prices 4 highest-rated cities",
y="Room Price for 2 people for 2 nights (EUR)",x="City") +
scale_color_brewer(type = "qual", palette = 2) + theme_minimal() + theme(legend.position = "none") +
annotate("text", x = 4, y = 55, size = 4,
label = "H_0: group means are the same\nH_a: group means are different") +
annotate("text", x = 4, y = 35, size = 4, label = "p = 2.2e-16 < 0.05, reject H_0")