SIT1003 Analysis of Data and Statistical Report Writing
Group 2, Assignment 1: Airbnb Prices in European Cities

Adriana Batrisyia binti Amran, U2101269
Danish Hazrin bin Saipul Ekmal, U2100841
Muhammad Alif Daniel bin Mohd Nazri, U2101945
Muhammad Asyraf Zulqarnain bin Sha’ari, U2100944
Muhammad Azamuddin Nasution Bin Raduan, S2166723

2023-05-03

Loading the data

# Load the datasets for individual cities
ams_wd <- read_csv("amsterdam_weekdays.csv"); ath_wd <- read_csv("athens_weekdays.csv");      
bcn_wd <- read_csv("barcelona_weekdays.csv"); ber_wd <- read_csv("berlin_weekdays.csv");      
bud_wd <- read_csv("budapest_weekdays.csv");  lis_wd <- read_csv("lisbon_weekdays.csv");      
lon_wd <- read_csv("london_weekdays.csv");    par_wd <- read_csv("paris_weekdays.csv");       
rom_wd <- read_csv("rome_weekdays.csv");      vie_wd <- read_csv("vienna_weekdays.csv")      

# Label the datasets with city name
ams_wd$city <- c(rep("amsterdam", times = nrow(ams_wd))); ath_wd$city <- c(rep("athens", times = nrow(ath_wd)));       
bcn_wd$city <- c(rep("barcelona", times = nrow(bcn_wd))); ber_wd$city <- c(rep("berlin", times = nrow(ber_wd)));
bud_wd$city <- c(rep("budapest", times = nrow(bud_wd)));  lis_wd$city <- c(rep("lisbon", times = nrow(lis_wd)));
lon_wd$city <- c(rep("london", times = nrow(lon_wd)));    par_wd$city <- c(rep("paris", times = nrow(par_wd)));
rom_wd$city <- c(rep("rome", times = nrow(rom_wd)));      vie_wd$city <- c(rep("vienna", times = nrow(vie_wd)))

# Combine all datasets and name it eu_cities_wd
eu_cities_wd <- rbind(ams_wd,ath_wd,bcn_wd,ber_wd,bud_wd,lis_wd,lon_wd,par_wd,rom_wd,vie_wd)    

Part 1: Explanation of the Data and Method

Data

Method

Part 2: Research questions

Research question 1: What is the relationship between price and distance from city center across different European cities?


Research question 2: How does the no.of bedrooms and cleanliness affect overall satisfaction?


Research question 3: If the budget is less than $250, which city is the most affordable to go based on the best satisfaction rating?

Research question 1: What is the relationship between price and distance from city center across different European cities?

Plot the general relationship between price and distance from city centre
eu_cities_wd_medians <- eu_cities_wd %>% group_by(dist, city) %>%
                        summarize(median_price = median(realSum))

ggplot(eu_cities_wd_medians, aes(x = dist, y = median_price, color = city)) + 
geom_point() + geom_smooth(method = "lm", color = "black") +
  labs(x = "Distance from City Center (km)", y = "Median Airbnb Price (EUR)", 
       title = "Price and Distance from City Center") + theme_bw()

Hypothesis Testing

model <- lm(realSum ~ dist * city, data = eu_cities_wd)
anova(model)
## Analysis of Variance Table
## 
## Response: realSum
##              Df     Sum Sq  Mean Sq F value    Pr(>F)    
## dist          1    4745755  4745755  42.108 8.795e-11 ***
## city          9  301751167 33527907 297.487 < 2.2e-16 ***
## dist:city     9   22292412  2476935  21.977 < 2.2e-16 ***
## Residuals 25480 2871693843   112704                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Create data frame relevant_intervals to filter outliers

intervals <- (cut(eu_cities_wd$realSum, breaks = seq(0, 30000, by = 500)))
relevant_intervals <- as.data.frame(table(intervals))
relevant_intervals$percentage <- (relevant_intervals$Freq/sum(relevant_intervals$Freq))*100
head(relevant_intervals)
##         intervals  Freq  percentage
## 1         (0,500] 23066 90.45490196
## 2     (500,1e+03]  2080  8.15686275
## 3 (1e+03,1.5e+03]   230  0.90196078
## 4 (1.5e+03,2e+03]    60  0.23529412
## 5 (2e+03,2.5e+03]    25  0.09803922
## 6 (2.5e+03,3e+03]    12  0.04705882

Plotting scatterplots with regression line for each city

filtered_cities <- eu_cities_wd %>% filter(realSum <= 1000)
ggplot(filtered_cities, aes(x = dist, y = realSum, color = city)) + 
geom_point() + geom_smooth(method = "lm", color = "black") + 
  labs(x = "Distance from City Center (km)", y = "Median Airbnb Price (EUR)", 
       title = "Price and Distance from City Center")  +
  theme_bw() + facet_wrap(~city, ncol = 5) 

Research question 2: How does the no.of of bedrooms and cleanliness affect overall satisfaction?

guest_rating <- eu_cities_wd %>% group_by(guest_satisfaction_overall) %>% 
  summarise(mean_beds = mean(bedrooms), mean_clean = mean(cleanliness_rating)) %>% arrange(-guest_satisfaction_overall)
head(guest_rating)
## # A tibble: 6 × 3
##   guest_satisfaction_overall mean_beds mean_clean
##                        <dbl>     <dbl>      <dbl>
## 1                        100      1.21       9.85
## 2                         99      1.20       9.97
## 3                         98      1.21       9.91
## 4                         97      1.17       9.83
## 5                         96      1.17       9.74
## 6                         95      1.17       9.65
clean_model <- lm(guest_satisfaction_overall ~ cleanliness_rating, data = eu_cities_wd)
bedroom_model <- lm(guest_satisfaction_overall ~ bedrooms, data = eu_cities_wd)
summary(clean_model)$r.squared; summary(bedroom_model)$r.squared
## [1] 0.5114414
## [1] 0.001842468

Summary Statistics

lm_model <- lm(guest_satisfaction_overall ~ cleanliness_rating + bedrooms, data = eu_cities_wd)
summary(lm_model)
## 
## Call:
## lm(formula = guest_satisfaction_overall ~ cleanliness_rating + 
##     bedrooms, data = eu_cities_wd)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.944  -2.217   1.076   3.368  37.029 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        29.24429    0.39238  74.531  < 2e-16 ***
## cleanliness_rating  6.70763    0.04109 163.239  < 2e-16 ***
## bedrooms            0.31154    0.06184   5.038 4.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.263 on 25497 degrees of freedom
## Multiple R-squared:  0.5119, Adjusted R-squared:  0.5119 
## F-statistic: 1.337e+04 on 2 and 25497 DF,  p-value: < 2.2e-16

Hypothesis Testing

anova(lm_model)
## Analysis of Variance Table
## 
## Response: guest_satisfaction_overall
##                       Df  Sum Sq Mean Sq   F value    Pr(>F)    
## cleanliness_rating     1 1048039 1048039 26717.791 < 2.2e-16 ***
## bedrooms               1     996     996    25.384  4.73e-07 ***
## Residuals          25497 1000152      39                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plotting visualisation

a <- ggplot(eu_cities_wd, aes(x = cleanliness_rating, y = guest_satisfaction_overall)) + 
  geom_point(color="red") + geom_smooth(method = "lm", se = TRUE, color = "black") +
  labs(x = "Cleanliness Rating", y = "Overall Satisfaction") +
  ggtitle("Cleanliness Rating and Overall Satisfaction") +
  theme_bw()
b <- ggplot(eu_cities_wd, aes(x = bedrooms, y = guest_satisfaction_overall)) + 
  geom_point(color="blue") + geom_smooth(method = "lm", se = TRUE, color = "black") +
  labs(x = "No. of Bedrooms", y = "Overall Satisfaction") +
  ggtitle("No. of Bedrooms and Overall Satisfaction") +
  theme_bw()
grid.arrange(a,b, ncol = 2)

Research question 3: If the budget is less than €250, which city is the most affordable to go based on the best satisfaction rating?

eu_cities_ch <- eu_cities_wd %>%  group_by(city) %>% 
  summarise(median_ratings = median(guest_satisfaction_overall),
            mean_price = mean(realSum), median_price = median(realSum),
            n = n(), percentage = n/nrow(eu_cities_wd)*100) %>% 
  arrange(desc(median_ratings), median_price)
head(eu_cities_ch)
## # A tibble: 6 × 6
##   city      median_ratings mean_price median_price     n percentage
##   <chr>              <dbl>      <dbl>        <dbl> <int>      <dbl>
## 1 athens                97       156.         128.  2653      10.4 
## 2 budapest              96       168.         147.  2074       8.13
## 3 berlin                96       240.         188.  1284       5.04
## 4 vienna                96       240.         205.  1738       6.82
## 5 amsterdam             96       545.         430.  1103       4.33
## 6 rome                  95       202.         180.  4492      17.6

Create dataset cheapest_cities featuring the 4 cheapest cities

cheapest_cities <- rbind(ath_wd, bud_wd, ber_wd, vie_wd) %>% filter(realSum < 250) 
cheapest_cities$city <- as.factor(cheapest_cities$city)
cheapest_cities %>% group_by(city) %>% 
  summarise(mean_price = mean(realSum),median_price = median(realSum),
            sd = sd(realSum), n=n(), percentage = n/nrow(cheapest_cities)*100)
## # A tibble: 4 × 6
##   city     mean_price median_price    sd     n percentage
##   <fct>         <dbl>        <dbl> <dbl> <int>      <dbl>
## 1 athens         130.         124.  43.5  2421       37.7
## 2 berlin         163.         158.  41.4   892       13.9
## 3 budapest       146.         143.  41.5  1884       29.3
## 4 vienna         177.         180.  42.0  1226       19.1

Hypothesis Test

model <- lm(realSum ~ city, data = cheapest_cities)
anova(model)
## Analysis of Variance Table
## 
## Response: realSum
##             Df   Sum Sq Mean Sq F value    Pr(>F)    
## city         3  2031454  677151   377.8 < 2.2e-16 ***
## Residuals 6419 11505201    1792                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Plotting boxplot

ggplot(cheapest_cities, aes(x = city, y = realSum, fill = city)) + geom_boxplot() + 
     labs(title = "Distribution of Airbnb prices 4 highest-rated cities", 
          y="Room Price for 2 people for 2 nights (EUR)",x="City") + 
  scale_color_brewer(type = "qual", palette = 2) + theme_minimal() + theme(legend.position = "none") + 
  annotate("text", x = 4, y = 55, size = 4,
           label = "H_0: group means are the same\nH_a: group means are different") +
  annotate("text", x = 4, y = 35, size = 4, label = "p = 2.2e-16 < 0.05, reject H_0") 

Plotting histogram

ggplot(data = cheapest_cities, aes(x = realSum, fill = city)) +
  geom_histogram(binwidth = 5) + 
  labs(title = "Distribution of Airbnb prices against the no.of rooms in 4 highest-rated cities",
       y = "No.of rooms", x = "Price")