library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the listings from the comma-delimited file in the working directory.
data <- read_delim(file = "./AB_NYC_2019.csv", delim = ",")
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date (1): last_review
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
head(data, n = 6L)
## # A tibble: 6 × 16
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 2539 Clean & qu… 2787 John Brooklyn Kensington 40.6
## 2 2595 Skylit Mid… 2845 Jennifer Manhattan Midtown 40.8
## 3 3647 THE VILLAG… 4632 Elisabeth Manhattan Harlem 40.8
## 4 3831 Cozy Entir… 4869 LisaRoxa… Brooklyn Clinton Hill 40.7
## 5 5022 Entire Apt… 7192 Laura Manhattan East Harlem 40.8
## 6 5099 Large Cozy… 7322 Chris Manhattan Murray Hill 40.7
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## # minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## # reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## # availability_365 <dbl>
# Show the structure of the table: dimensions, column types and attributes.
utils::str(data)
## spc_tbl_ [48,895 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:48895] 2539 2595 3647 3831 5022 ...
## $ name : chr [1:48895] "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : num [1:48895] 2787 2845 4632 4869 7192 ...
## $ host_name : chr [1:48895] "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr [1:48895] "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr [1:48895] "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num [1:48895] 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num [1:48895] -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr [1:48895] "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : num [1:48895] 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : num [1:48895] 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : num [1:48895] 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : Date[1:48895], format: "2018-10-19" "2019-05-21" ...
## $ reviews_per_month : num [1:48895] 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: num [1:48895] 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : num [1:48895] 365 355 365 194 0 129 0 220 0 188 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. name = col_character(),
## .. host_id = col_double(),
## .. host_name = col_character(),
## .. neighbourhood_group = col_character(),
## .. neighbourhood = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. room_type = col_character(),
## .. price = col_double(),
## .. minimum_nights = col_double(),
## .. number_of_reviews = col_double(),
## .. last_review = col_date(format = ""),
## .. reviews_per_month = col_double(),
## .. calculated_host_listings_count = col_double(),
## .. availability_365 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
The dataset will be grouped by neighbourhood_group, and the mean price
for each group will be determined. We also examine the number of listings in
each neighbourhood group.
# For each neighbourhood group: mean price (missing prices ignored) and the
# number of listings.
neigh_price <- summarise(
  group_by(data, neighbourhood_group),
  mean_price = mean(price, na.rm = TRUE),
  count = n()
)
neigh_price
## # A tibble: 5 × 3
## neighbourhood_group mean_price count
## <chr> <dbl> <int>
## 1 Bronx 87.5 1091
## 2 Brooklyn 124. 20104
## 3 Manhattan 197. 21661
## 4 Queens 99.5 5666
## 5 Staten Island 115. 373
We visualize the mean price across different neighbourhood groups using a bar plot.
# Bar chart of the mean price per neighbourhood group, one fill colour per
# group. geom_col() draws bar heights straight from the y values.
ggplot(
  data = neigh_price,
  mapping = aes(
    x = neighbourhood_group,
    y = mean_price,
    fill = neighbourhood_group
  )
) +
  geom_col() +
  labs(
    title = "Mean Price by Neighbourhood Group",
    x = "Neighbourhood Group",
    y = "Mean Price"
  ) +
  theme_minimal()
Insights: Manhattan has the highest average price, while the Bronx has the lowest. The number of listings in Staten Island is significantly lower than in any other neighbourhood group.
Smallest Group: Staten Island is the smallest group (373 listings), so a listing chosen at random is least likely to come from there.
Testable Hypothesis: Staten Island’s lower listing count may be due to its limited transit options or its distance from Manhattan’s top tourist destinations.
Significance: Indicates the neighborhoods with the highest and lowest Airbnb prices. This assists landlords with setting reasonable rents and guides tenants toward affordable listings in New York City.
Further Questions: Are there other factors influencing prices besides the neighborhood, such as property type, size, or amenities?
Group the data by room_type
and calculate the mean
number of minimum nights required for each room type.
# For each room type: mean minimum-night requirement (missing values ignored)
# and the number of listings.
group_2 <- summarise(
  group_by(data, room_type),
  mean_minimum_nights = mean(minimum_nights, na.rm = TRUE),
  count = n()
)
# Display the summary table
group_2
## # A tibble: 3 × 3
## room_type mean_minimum_nights count
## <chr> <dbl> <int>
## 1 Entire home/apt 8.51 25409
## 2 Private room 5.38 22326
## 3 Shared room 6.48 1160
# Bar chart of the mean minimum-night requirement per room type, one fill
# colour per room type.
ggplot(
  data = group_2,
  mapping = aes(x = room_type, y = mean_minimum_nights, fill = room_type)
) +
  geom_col() +
  labs(
    title = "Mean Minimum Nights by Room Type",
    x = "Room Type",
    y = "Mean Minimum Nights"
  ) +
  theme_minimal()
Hypothesis: Entire homes may require a higher minimum stay due to cleaning logistics and higher management costs.
Smallest Group: “Shared room” is the smallest group (1,160 listings), so it is the room type least likely to be selected at random.
Significance: Knowing the typical minimum-stay requirement for each room type can help visitors in estimating the probability that they will locate a listing that meets their needs.
Further questions: Are minimum-night requirements
subject to seasonal variations? Do they depend on things like
cost or location?
Group the data by both neighbourhood_group and room_type to investigate the mean price for different room types across neighborhoods.
# Mean price (missing prices ignored) and listing count for every
# neighbourhood group / room type pair.
group_3 <- summarise(
  group_by(data, neighbourhood_group, room_type),
  mean_price = mean(price, na.rm = TRUE),
  count = n()
)
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
# Display the summary table
group_3
## # A tibble: 15 × 4
## # Groups: neighbourhood_group [5]
## neighbourhood_group room_type mean_price count
## <chr> <chr> <dbl> <int>
## 1 Bronx Entire home/apt 128. 379
## 2 Bronx Private room 66.8 652
## 3 Bronx Shared room 59.8 60
## 4 Brooklyn Entire home/apt 178. 9559
## 5 Brooklyn Private room 76.5 10132
## 6 Brooklyn Shared room 50.5 413
## 7 Manhattan Entire home/apt 249. 13199
## 8 Manhattan Private room 117. 7982
## 9 Manhattan Shared room 89.0 480
## 10 Queens Entire home/apt 147. 2096
## 11 Queens Private room 71.8 3372
## 12 Queens Shared room 69.0 198
## 13 Staten Island Entire home/apt 174. 176
## 14 Staten Island Private room 62.3 188
## 15 Staten Island Shared room 57.4 9
# Side-by-side bars: mean price per neighbourhood group, with one bar per
# room type placed next to each other instead of stacked.
ggplot(
  data = group_3,
  mapping = aes(x = neighbourhood_group, y = mean_price, fill = room_type)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Mean Price by Neighbourhood and Room Type",
    x = "Neighbourhood Group",
    y = "Mean Price"
  ) +
  theme_minimal()
Insights: This shows how prices vary by both room type and neighborhood. For example, Manhattan has the highest mean price for every room type, and its private rooms (mean price about 117) cost nearly as much as entire homes in the Bronx (about 128).
Smallest Group: Shared rooms in Staten Island are the smallest group (9 listings), so they have the lowest probability of being selected at random.
Testable Hypothesis: The lower count and price of shared rooms in areas like Staten Island may suggest that budget travelers tend to prefer more central locations even if shared room prices are lower.
Significance: Shows how costs vary according to room type and neighborhood. This explains how different accommodation types—such as private rooms versus entire homes—have varying prices based on their location.
Further Questions: Do these patterns hold across different time periods, such as peak tourist seasons?
Analyze combinations of neighbourhood_group and room_type. We will count the occurrences of each combination and check if there are any missing combinations.
# Distinct neighbourhood group / room type pairs that occur in the listings.
unique_ <- distinct(data, neighbourhood_group, room_type)
# Number of listings for each of those pairs.
comb_counts <- summarise(
  group_by(data, neighbourhood_group, room_type),
  count = n()
)
## `summarise()` has grouped output by 'neighbourhood_group'. You can override
## using the `.groups` argument.
comb_counts
## # A tibble: 15 × 3
## # Groups: neighbourhood_group [5]
## neighbourhood_group room_type count
## <chr> <chr> <int>
## 1 Bronx Entire home/apt 379
## 2 Bronx Private room 652
## 3 Bronx Shared room 60
## 4 Brooklyn Entire home/apt 9559
## 5 Brooklyn Private room 10132
## 6 Brooklyn Shared room 413
## 7 Manhattan Entire home/apt 13199
## 8 Manhattan Private room 7982
## 9 Manhattan Shared room 480
## 10 Queens Entire home/apt 2096
## 11 Queens Private room 3372
## 12 Queens Shared room 198
## 13 Staten Island Entire home/apt 176
## 14 Staten Island Private room 188
## 15 Staten Island Shared room 9
If there are missing combinations (e.g., no “Shared room” in certain neighborhoods), this could reflect specific market demand or zoning regulations.
# Find neighbourhood group / room type pairs that have no listings at all.
# The reference set must be every *possible* pairing (the cross product of the
# values seen in each column), not the pairs observed in the data: the observed
# pairs are exactly the groups counted in comb_counts, so anti-joining those
# against comb_counts could never return a row.
all_comb <- tidyr::expand(data, neighbourhood_group, room_type)
missing_comb <- anti_join(
  all_comb,
  comb_counts,
  by = c("neighbourhood_group", "room_type")
)
missing_comb
## # A tibble: 0 × 2
## # ℹ 2 variables: neighbourhood_group <chr>, room_type <chr>
Create a heatmap to visualize the distribution of different room types across neighbourhood groups.
# Heatmap: one tile per room type / neighbourhood group pair, shaded from
# white (few listings) to blue (many listings).
ggplot(
  data = comb_counts,
  mapping = aes(x = room_type, y = neighbourhood_group, fill = count)
) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(
    title = "Heatmap of Room Type and Neighbourhood Group Combinations",
    x = "Room Type",
    y = "Neighbourhood Group",
    fill = "Count"
  ) +
  theme_minimal()
Insights: In comparison to “Entire home/apt in Manhattan,” some combinations—like “Shared room in Staten Island”—are relatively rare.
Hypothesis: The demand for entire homes in Manhattan and Brooklyn drives the count for these combinations. The relatively small number of shared rooms in Manhattan (480 of 21,661 listings) may indicate lower demand for budget accommodations in affluent areas.
Significance: Visualizations are like pictures that help us understand the Airbnb market in NYC. They show us patterns and trends, like which neighborhoods are more expensive or how often different types of rooms are available. It’s like having a map that guides us through the data.
Further Questions: Is there a correlation between room type preferences and neighborhood demographics?