library(tidyverse)
library(here)
library(rstatix)
library(gt)
listing_price= read_csv(here('airbnb_listing_price.csv'))
listing_price |> head()
## # A tibble: 6 × 4
## listing_id price minimum_nights maximum_nights
## <dbl> <dbl> <dbl> <dbl>
## 1 281420 53 2 1125
## 2 3705183 120 2 1125
## 3 4082273 89 2 1125
## 4 4797344 58 2 1125
## 5 4823489 60 2 1125
## 6 4898654 95 2 1125
dim(listing_price)
## [1] 279712 4
location_info=read_csv(here('airbnb_location_info.csv'))
location_info |> head()
## # A tibble: 6 × 7
## listing_id host_location neighbourhood district city latitude longitude
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 281420 Paris, Ile-de-Fran… Buttes-Montm… <NA> Paris 48.9 2.33
## 2 3705183 Paris, Ile-de-Fran… Buttes-Montm… <NA> Paris 48.9 2.35
## 3 4082273 Paris, Ile-de-Fran… Elysee <NA> Paris 48.9 2.32
## 4 4797344 Paris, Ile-de-Fran… Vaugirard <NA> Paris 48.8 2.31
## 5 4823489 Paris, Ile-de-Fran… Passy <NA> Paris 48.9 2.27
## 6 4898654 Paris, Ile-de-Fran… Temple <NA> Paris 48.9 2.35
dim(location_info)
## [1] 279712 7
property_info=read_csv(here('airbnb_property_info.csv'),locale = locale(encoding = 'utf8'))
property_info |> head()
## # A tibble: 6 × 8
## listing_id name property_type room_type accommodates bedrooms amenities
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 281420 Beautiful … Entire apart… Entire p… 2 1 "[\"Heat…
## 2 3705183 39 m² P… Entire apart… Entire p… 2 1 "[\"Sham…
## 3 4082273 Lovely apa… Entire apart… Entire p… 2 1 "[\"Heat…
## 4 4797344 Cosy studi… Entire apart… Entire p… 2 1 "[\"Heat…
## 5 4823489 Close to E… Entire apart… Entire p… 2 1 "[\"Heat…
## 6 4898654 NEW - Char… Entire apart… Entire p… 2 1 "[\"Heat…
## # ℹ 1 more variable: instant_bookable <lgl>
property_info |> dim()
## [1] 279712 8
location_info$country= location_info$host_location |> str_split_i(',',-1) |> str_trim()
I will exclude the names and amenities columns from the dataset. The
column names
doesn’t provide significant information
regarding pricing, and the amenities
column may have a
variable number of different values, making it challenging to use in the
analysis.
property_info|> head()
## # A tibble: 6 × 8
## listing_id name property_type room_type accommodates bedrooms amenities
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 281420 Beautiful … Entire apart… Entire p… 2 1 "[\"Heat…
## 2 3705183 39 m² P… Entire apart… Entire p… 2 1 "[\"Sham…
## 3 4082273 Lovely apa… Entire apart… Entire p… 2 1 "[\"Heat…
## 4 4797344 Cosy studi… Entire apart… Entire p… 2 1 "[\"Heat…
## 5 4823489 Close to E… Entire apart… Entire p… 2 1 "[\"Heat…
## 6 4898654 NEW - Char… Entire apart… Entire p… 2 1 "[\"Heat…
## # ℹ 1 more variable: instant_bookable <lgl>
property_info =property_info|> select(-c('name','amenities'))
property_info |> head()
## # A tibble: 6 × 6
## listing_id property_type room_type accommodates bedrooms instant_bookable
## <dbl> <chr> <chr> <dbl> <dbl> <lgl>
## 1 281420 Entire apartment Entire pla… 2 1 FALSE
## 2 3705183 Entire apartment Entire pla… 2 1 FALSE
## 3 4082273 Entire apartment Entire pla… 2 1 FALSE
## 4 4797344 Entire apartment Entire pla… 2 1 FALSE
## 5 4823489 Entire apartment Entire pla… 2 1 FALSE
## 6 4898654 Entire apartment Entire pla… 2 1 FALSE
All datasets share the same number of rows and a common column, listing_id. Therefore, we only need to merge the columns. To avoid errors, I will use a left join approach with listing_price as the main dataset because it contains the primary information. If a listing is not present in listing_price, it will not be included in the final dataset.
property_listing_price = left_join(listing_price,property_info)
property_listing_price |> head()
## # A tibble: 6 × 9
## listing_id price minimum_nights maximum_nights property_type room_type
## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 281420 53 2 1125 Entire apartment Entire place
## 2 3705183 120 2 1125 Entire apartment Entire place
## 3 4082273 89 2 1125 Entire apartment Entire place
## 4 4797344 58 2 1125 Entire apartment Entire place
## 5 4823489 60 2 1125 Entire apartment Entire place
## 6 4898654 95 2 1125 Entire apartment Entire place
## # ℹ 3 more variables: accommodates <dbl>, bedrooms <dbl>,
## # instant_bookable <lgl>
df=left_join(property_listing_price,location_info)
df |> dim()
## [1] 279712 16
#write.csv(df,here('ainb_full.csv'),row.names = FALSE)
df$instant_bookable =as.factor(df$instant_bookable)
levels(df$instant_bookable) <- c('Not Available', 'Available')
df |> group_by(instant_bookable=df$instant_bookable) |>
summarise(n=n()) |>
ggplot() +
geom_col(aes(x=instant_bookable,y=n, fill=instant_bookable))+
labs(title = 'Airbnb Places Available for Instant Booking',
x="",
y="") +
theme(
legend.position = 'none')
df |> group_by(room_type) |>
summarise(n=n()) |>
ggplot() +
geom_col(aes(x=room_type,y=n, fill=room_type))+
labs(title = 'Types of Rooms',
x="",
y="") +
theme(
legend.position = 'none')
df |> group_by(property_type) |>
summarise(n=n()) |> arrange(desc(n)) |>
top_n(5) |>
ggplot() +
geom_col(aes(x=reorder(property_type,n),y=n, fill=property_type))+
labs(title = 'The five most common property types on Airbnb.',
x="",
y="") +
theme(
legend.position = 'none') +
coord_flip()
The host is the person who makes a place available for rent, but does not necessarily live there. Which country has the highest number of hosts?
top_countries= df |> group_by(country) |>
summarise(n=n()) |> arrange(desc(n)) |>top_n(5)
top_countries |>
ggplot() +
geom_col(aes(x=reorder(country,n),y=n, fill=country))+
labs(title = 'Top 5 host countries',
x="",
y="") +
theme(
legend.position = 'none') +
coord_flip()
df |> group_by(city) |>
summarise(n=n()) |> arrange(desc(n)) |>
ggplot() +
geom_col(aes(x=reorder(city,n),y=n, fill=city))+
labs(title = 'Listings by Cities',
x="",
y="") +
theme(
legend.position = 'none') +
coord_flip()
df|> group_by(city) |>
summarise(mean_price=mean(price)) |> arrange(desc(mean_price)) |>
ggplot() +
geom_col(aes(x=reorder(city,mean_price),y=mean_price, fill=city))+
labs(title = 'Mean of price by cities',
x="",
y="") +
theme(
legend.position = 'none') +coord_flip()
df|> group_by(city) |>
summarise(mean_price=median(price)) |> arrange(desc(mean_price)) |>
ggplot() +
geom_col(aes(x=reorder(city,mean_price),y=mean_price, fill=city))+
labs(title = 'Median of price by cities',
x="",
y="") +
theme(
legend.position = 'none') + coord_flip()
# With outlier
df |> ggplot(aes(city,price)) +
geom_boxplot() +
labs(x='',y='',title='Variance of price')
# Without Outlier
df |> ggplot(aes(city,price,fill=city)) +
geom_boxplot(outlier.shape = NA) + scale_y_continuous(limits = quantile(df$price,
c(0.1, 0.9))) + theme(legend.position = "none") + labs(x='',y='',title='Variance of price without outliers') + coord_flip()
kruskal.test(price ~ city, df)
##
## Kruskal-Wallis rank sum test
##
## data: price by city
## Kruskal-Wallis chi-squared = 166361, df = 9, p-value < 2.2e-16
At least one group has a median different from the others. The Dunn test allows us to compare the medians between groups and identify where this difference occurs.
dunn_test(price ~ city, data = df, p.adjust.method = "bonferroni")
## # A tibble: 45 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 price Bangkok Cape … 19361 19086 -2.20 2.78e- 2 1 e+ 0 ns
## 2 price Bangkok Hong … 19361 7087 -40.2 0 0 ****
## 3 price Bangkok Istan… 19361 24519 -95.8 0 0 ****
## 4 price Bangkok Mexic… 19361 20065 -26.2 1.23e-151 5.53e-150 ****
## 5 price Bangkok New Y… 19361 37012 -206. 0 0 ****
## 6 price Bangkok Paris 19361 64690 -245. 0 0 ****
## 7 price Bangkok Rio d… 19361 26615 -88.1 0 0 ****
## 8 price Bangkok Rome 19361 27647 -236. 0 0 ****
## 9 price Bangkok Sydney 19361 33630 -173. 0 0 ****
## 10 price Cape Town Hong … 19086 7087 -38.6 0 0 ****
## # ℹ 35 more rows
df |>
group_by (City=city) |> filter(bedrooms==1) |>
summarise(`Median Price` = median(price)) |> arrange(desc(`Median Price`)) |> gt() |>
tab_header(
title = "Median rental price with 1 Bedroom by City")
Median rental price with 1 Bedroom by City | |
City | Median Price |
---|---|
Bangkok | 989 |
Cape Town | 732 |
Mexico City | 500 |
Hong Kong | 314 |
Istanbul | 220 |
Rio de Janeiro | 199 |
Sydney | 86 |
New York | 80 |
Paris | 79 |
Rome | 56 |
df |>
group_by (City=city) |> filter(bedrooms==2) |>
summarise(`Median Price` = median(price)) |> arrange(desc(`Median Price`)) |> gt() |>
tab_header(
title = 'Median rental price with 2 Bedrooms by City')
Median rental price with 2 Bedrooms by City | |
City | Median Price |
---|---|
Bangkok | 1999 |
Cape Town | 1300 |
Mexico City | 1000 |
Hong Kong | 965 |
Rio de Janeiro | 414 |
Istanbul | 371 |
Sydney | 180 |
New York | 152 |
Paris | 125 |
Rome | 84 |
df |>
group_by (City=city) |> filter(bedrooms>=3) |>
summarise(`Median Price` = median(price)) |> arrange(desc(`Median Price`)) |> gt() |>
tab_header(
title = "Median rental price with 3 or more Bedrooms by City")
Median rental price with 3 or more Bedrooms by City | |
City | Median Price |
---|---|
Bangkok | 3800.0 |
Cape Town | 3300.0 |
Mexico City | 1549.0 |
Hong Kong | 1100.0 |
Rio de Janeiro | 793.0 |
Istanbul | 600.0 |
Sydney | 410.0 |
New York | 230.5 |
Paris | 217.0 |
Rome | 131.0 |
df |>
group_by(bedrooms, city) |>
summarise(median_price = median(price)) |>
ggplot(aes(x=bedrooms, y=median_price, color=city)) +
geom_line(linewidth = 1) +
labs(x='Bedrooms',y='Median Price',title='Variance of Median Price for Bedrooms Quantity',color='City')
df |>group_by(city,accommodates) |>
summarise(mean_price=mean(price)) |>
ggplot(aes(x=accommodates,y=mean_price,color=city)) +
geom_line(linewidth = 1) +
labs(x='Accomodations',y='Mean Price',title='Variance of Mean Price for Accomodation Quantity',color='City')
Bangkok appears to be the city with the most expensive places to rent. Let’s explore this further
bang = df |> filter(city=='Bangkok')
bang |>
ggplot(aes(neighbourhood,price,fill=neighbourhood)) +
geom_boxplot(outlier.shape = NA) + scale_y_continuous(limits = quantile(bang$price,
c(0.1, 0.9))) +
labs(x='',y='',title='Variance of price by Neighbourhood') +
coord_flip() +
theme(axis.text.y = element_text(size = 6, hjust = 1),
legend.position = "none")
kruskal.test(price ~ neighbourhood, bang)
##
## Kruskal-Wallis rank sum test
##
## data: price by neighbourhood
## Kruskal-Wallis chi-squared = 1225.6, df = 49, p-value < 2.2e-16
The Dunn test allows us to compare the medians between groups and identify where this difference occurs.
dunn_test(price ~ neighbourhood, data=bang, p.adjust.method = "bonferroni")
## # A tibble: 1,225 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 price Bang Bon Bang Kapi 7 332 -1.22 0.223 1 ns
## 2 price Bang Bon Bang Khae 7 103 -0.242 0.809 1 ns
## 3 price Bang Bon Bang Khen 7 148 -0.894 0.371 1 ns
## 4 price Bang Bon Bang Kho laen 7 154 0.124 0.901 1 ns
## 5 price Bang Bon Bang Khun thain 7 28 0.0608 0.952 1 ns
## 6 price Bang Bon Bang Na 7 575 -0.954 0.340 1 ns
## 7 price Bang Bon Bang Phlat 7 255 -1.11 0.266 1 ns
## 8 price Bang Bon Bang Rak 7 1079 0.635 0.526 1 ns
## 9 price Bang Bon Bang Sue 7 285 -1.33 0.183 1 ns
## 10 price Bang Bon Bangkok Noi 7 177 -0.115 0.909 1 ns
## # ℹ 1,215 more rows
Few places in Bangkok actually have different rental prices considering the neighborhood
Paris has the highest number of places available for hosting, so let’s explore more about it.
paris = df |> filter(city=='Paris')
paris |>
ggplot(aes(neighbourhood,price,fill=neighbourhood)) +
geom_boxplot(outlier.shape = NA) + scale_y_continuous(limits = quantile(paris$price,
c(0.1, 0.9))) + theme(legend.position = "none") + labs(x='',y='',title='Variance of price by Neighbourhood') + coord_flip()
kruskal.test(price ~ neighbourhood, paris)
##
## Kruskal-Wallis rank sum test
##
## data: price by neighbourhood
## Kruskal-Wallis chi-squared = 7281.2, df = 19, p-value < 2.2e-16
Again, at least one group has a median different from the others. Let’s use the Dunn test again.
dunn_test(price ~ neighbourhood, data=paris, p.adjust.method = "bonferroni")
## # A tibble: 190 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 price Batignol… Bourse 4330 2188 18.0 8.37e- 73 1.59e- 70 ****
## 2 price Batignol… Butte… 4330 3728 -18.6 3.25e- 77 6.17e- 75 ****
## 3 price Batignol… Butte… 4330 7237 -12.4 3.29e- 35 6.24e- 33 ****
## 4 price Batignol… Elysee 4330 1768 25.6 1.02e-144 1.94e-142 ****
## 5 price Batignol… Enclo… 4330 4628 0.301 7.64e- 1 1 e+ 0 ns
## 6 price Batignol… Gobel… 4330 2278 -9.62 6.39e- 22 1.21e- 19 ****
## 7 price Batignol… Hotel… 4330 1972 21.6 2.16e-103 4.10e-101 ****
## 8 price Batignol… Louvre 4330 1408 21.5 7.50e-103 1.42e-100 ****
## 9 price Batignol… Luxem… 4330 1998 19.9 1.85e- 88 3.51e- 86 ****
## 10 price Batignol… Menil… 4330 3758 -19.8 1.29e- 87 2.46e- 85 ****
## # ℹ 180 more rows
Some places in Paris have price differences, while others do not.
After my analysis, Bangkok was the city that presented the highest costs for booking an Airbnb, both in terms of the number of bedrooms and accommodations. However, Paris and Rome were the cheapest, using this same criteria.