# Load the Austin Airbnb listings from a comma-delimited file in the
# working directory. No column types are supplied, so they are guessed
# (the guessed types are reported in the column specification message).
airbnb <- read_delim(
  file = "./airbnb_austin.csv",
  delim = ","
)
## Rows: 15244 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): name, host_name, room_type
## dbl (12): id, host_id, neighbourhood, latitude, longitude, price, minimum_n...
## lgl (2): neighbourhood_group, license
## date (1): last_review
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Group by `neighbourhood` and summarize `price`
# Average price and number of listings per neighbourhood, sorted from the
# highest to the lowest average price. Missing prices are ignored, so a
# neighbourhood whose listings all lack a price gets NaN as its average.
neighbourhood_price <- airbnb |>
  group_by(neighbourhood) |>
  summarise(
    avg_price = mean(price, na.rm = TRUE),
    number_listings = n()
  ) |>
  arrange(desc(avg_price))
neighbourhood_price
## # A tibble: 44 x 3
## neighbourhood avg_price number_listings
## <dbl> <dbl> <int>
## 1 78750 1433. 95
## 2 78732 1393. 76
## 3 78746 779. 285
## 4 78731 755. 175
## 5 78730 735. 43
## 6 78727 718. 177
## 7 78733 633. 99
## 8 78729 550. 166
## 9 78701 396. 1112
## 10 78737 390. 199
## # i 34 more rows
# Bar chart of the average price per neighbourhood; reorder() places the
# bars from the highest to the lowest average price.
neighbourhood_price |>
  ggplot(
    mapping = aes(
      x = reorder(neighbourhood, -avg_price),
      y = avg_price
    )
  ) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Average Price by Neighbourhood",
    x = "Neighbourhood",
    y = "Average Price"
  ) +
  # Tilt the x labels so the many neighbourhood codes do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 rows containing missing values (position_stack).
Group by `room_type` and summarize by `number_of_reviews`
# Total number of reviews and number of listings per room type, sorted
# from the most-reviewed to the least-reviewed room type.
roomtype_reviews <- airbnb |>
  group_by(room_type) |>
  summarise(
    total_reviews = sum(number_of_reviews, na.rm = TRUE),
    number_listings = n()
  ) |>
  arrange(desc(total_reviews))
roomtype_reviews
## # A tibble: 4 x 3
## room_type total_reviews number_listings
## <chr> <dbl> <int>
## 1 Entire home/apt 573618 12429
## 2 Private room 58173 2562
## 3 Shared room 1210 119
## 4 Hotel room 195 134
# Add each room type's share of all reviews, expressed in percent.
roomtype_reviews <- mutate(
  roomtype_reviews,
  percentage = (total_reviews / sum(total_reviews)) * 100
)

# Bar chart of total reviews per room type, largest first, with the
# percentage share printed just above each bar.
roomtype_reviews |>
  ggplot(
    mapping = aes(
      x = reorder(room_type, -total_reviews),
      y = total_reviews
    )
  ) +
  geom_col(fill = "salmon") +
  geom_text(
    aes(label = paste0(round(percentage, 2), "%")),
    vjust = -0.5,
    size = 3,
    fontface = "bold"
  ) +
  labs(
    title = "Total Reviews by Room Type",
    x = "Room Type",
    y = "Total Reviews"
  ) +
  theme_minimal()
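Popular room types such as `Entire home/apt` may dominate reviews due to
higher occupancy rates. Note that this room type also accounts for most
listings (12,429 of 15,244), so its total review count partly reflects
supply rather than popularity per listing.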
Group by `room_type` and summarize by `availability_365`
# Mean of availability_365 and number of listings per room type, sorted
# from the most to the least available room type. Missing availability
# values are ignored when averaging.
room_type_availability <- group_by(airbnb, room_type) |>
  summarise(
    avg_availability = mean(availability_365, na.rm = TRUE),
    number_listings = n()
  ) |>
  arrange(desc(avg_availability))
room_type_availability
## # A tibble: 4 x 3
## room_type avg_availability number_listings
## <chr> <dbl> <int>
## 1 Hotel room 287. 134
## 2 Shared room 187. 119
## 3 Entire home/apt 163. 12429
## 4 Private room 122. 2562
# Bar chart of average availability per room type; reorder() places the
# bars from the highest to the lowest average.
room_type_availability |>
  ggplot(
    mapping = aes(
      x = reorder(room_type, -avg_availability),
      y = avg_availability
    )
  ) +
  geom_col(fill = "purple") +
  labs(
    title = "Average Availability by Room Type",
    x = "Room Type",
    y = "Average Availability (Days)"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so the tilted labels are not reset.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
This shows which room types are the most and least available throughout
the year. `Hotel room` listings have the highest availability, which
might indicate oversupply or lower demand.
# Neighbourhood(s) with the smallest number of listings, labelled as the
# rarest group. Every neighbourhood tied on the minimum count is kept.
rarest_neighbourhood <- filter(
  neighbourhood_price,
  number_listings == min(number_listings)
) |>
  mutate(tag = "Rare: Low Probability Group")
rarest_neighbourhood
## # A tibble: 1 x 4
## neighbourhood avg_price number_listings tag
## <dbl> <dbl> <int> <chr>
## 1 78712 NaN 1 Rare: Low Probability Group
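A listing picked at random is less likely to belong to this group than
to any other. This might indicate a luxury neighbourhood or a remote
area. Its average price is `NaN` because its single listing has no
recorded price, so price cannot be used to tell these explanations apart.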
Hypothesis: Neighbourhoods with fewer listings have higher prices.
Test: Check whether `number_listings` predicts `avg_price`.
# Observed number of listings for every neighbourhood / room type pair
# that occurs in the data.
combinations <- airbnb |>
  count(neighbourhood, room_type, name = "count")

# Every possible neighbourhood / room type pair. stringsAsFactors = FALSE
# keeps room_type as character; by default expand.grid() would turn it
# into a factor, and the join below would then compare a factor key with
# a character key.
all_combinations <- expand.grid(
  neighbourhood = unique(airbnb$neighbourhood),
  room_type = unique(airbnb$room_type),
  stringsAsFactors = FALSE
)

# Pairs that are possible but never occur in the data.
missing_combinations <- all_combinations |>
  anti_join(combinations, by = c("neighbourhood", "room_type"))
missing_combinations
## neighbourhood room_type
## 1 78712 Entire home/apt
## 2 78736 Shared room
## 3 78733 Shared room
## 4 78757 Shared room
## 5 78759 Shared room
## 6 78727 Shared room
## 7 78752 Shared room
## 8 78754 Shared room
## 9 78724 Shared room
## 10 78721 Shared room
## 11 78734 Shared room
## 12 78737 Shared room
## 13 78738 Shared room
## 14 78735 Shared room
## 15 78726 Shared room
## 16 78717 Shared room
## 17 78732 Shared room
## 18 78730 Shared room
## 19 78728 Shared room
## 20 78747 Shared room
## 21 78739 Shared room
## 22 78712 Shared room
## 23 78719 Shared room
## 24 78742 Shared room
## 25 78729 Hotel room
## 26 78741 Hotel room
## 27 78703 Hotel room
## 28 78723 Hotel room
## 29 78756 Hotel room
## 30 78731 Hotel room
## 31 78736 Hotel room
## 32 78733 Hotel room
## 33 78757 Hotel room
## 34 78748 Hotel room
## 35 78758 Hotel room
## 36 78727 Hotel room
## 37 78752 Hotel room
## 38 78754 Hotel room
## 39 78724 Hotel room
## 40 78725 Hotel room
## 41 78721 Hotel room
## 42 78744 Hotel room
## 43 78737 Hotel room
## 44 78735 Hotel room
## 45 78749 Hotel room
## 46 78726 Hotel room
## 47 78717 Hotel room
## 48 78732 Hotel room
## 49 78730 Hotel room
## 50 78728 Hotel room
## 51 78750 Hotel room
## 52 78747 Hotel room
## 53 78739 Hotel room
## 54 78712 Hotel room
## 55 78719 Hotel room
## 56 78742 Hotel room
The absence of certain combinations may indicate low market demand or
zoning restrictions. For example, `Hotel room` listings might be absent
in a residential neighbourhood.
# Most and least frequent neighbourhood / room type pairs. Only the first
# row after sorting is kept, so pairs tied on count are not reported.
most_common <- combinations |>
  arrange(desc(count)) |>
  slice_head(n = 1)
least_common <- combinations |>
  arrange(count) |>
  slice_head(n = 1)
print(
  paste(
    "Most Common:", most_common$neighbourhood, "-", most_common$room_type
  )
)
## [1] "Most Common: 78704 - Entire home/apt"
print(
  paste(
    "Least Common:", least_common$neighbourhood, "-", least_common$room_type
  )
)
## [1] "Least Common: 78712 - Private room"
High-demand areas are dominated by `Entire home/apt` listings, while rare
combinations may reflect impracticality. For example, `Private room`
listings might be uncommon in family-oriented areas where renting out the
entire apartment is the norm.
# Heatmap of listing counts for each neighbourhood / room type pair.
# neighbourhood is stored as a number, so it is converted to a factor to
# get a discrete axis with one labelled column per neighbourhood rather
# than a continuous numeric scale. Pairs with no listings have no row in
# `combinations` and therefore show up as empty cells.
ggplot(
  combinations,
  aes(x = factor(neighbourhood), y = room_type, fill = count)
) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Count") +
  labs(
    title = "Listings by Neighbourhood and Room Type",
    x = "Neighbourhood",
    y = "Room Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
  )
This reveals hotspots of popular combinations, for example
`Entire home/apt` listings in tourist areas.
Do rare neighbourhoods have longer vacancy periods?
Are missing combinations due to host preferences or platform policies?