library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)

Airbnb 2019 NYC Listings

Discussion’s author: Ron Balaban

Context:

This dataset offers a glimpse into Airbnb’s 2019 listing activity in New York City, reflecting its evolution since 2008.

Content:

Sourced from Kaggle, it covers details about hosts, location, and key metrics.

Questions:

  1. Does the title of the booking impact the desirability of a unit? Are certain words, like “cozy” and “perfect,” more influential on the price when isolated from actual apartment features?

  2. Does the location impact price for Airbnb rentals in the same way that it does for typical rentals? Does proximity to tourist attractions or popular sites affect the price of Airbnb rentals?

  3. How does the actual feature of an apartment (like having a garden) compare in impact on price to just mentioning that feature in the title or description (like using the word “garden”)?

These questions are centered around understanding the factors that could potentially influence the price and desirability of an Airbnb listing.


Analysis:

# Location of the raw 2019 NYC Airbnb listings CSV (hosted on GitHub).
url <- "https://raw.githubusercontent.com/hbedros/data607_prj2/main/df1/AB_NYC_2019.csv"

# Download and parse the CSV into a tibble; readr guesses the column types.
airbnb_nyc_2019 <- read_csv(file = url)
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the data frame for missing data. The NA mask is built once and reused
# for both the overall total and the per-column breakdown.
missing_mask <- is.na(airbnb_nyc_2019)
sum(missing_mask) # There are over 20k missing values in total
## [1] 20141
# Per-column counts: nearly all missing values sit in last_review and
# reviews_per_month (10052 each); name (16) and host_name (21) have only a few.
colSums(missing_mask)
##                             id                           name 
##                              0                             16 
##                        host_id                      host_name 
##                              0                             21 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##                    last_review              reviews_per_month 
##                          10052                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0
# Drop only the rows in which 'name' and 'host_name' are BOTH missing; a row
# that still has at least one of the two is kept.
airbnb_nyc_2019_cln <- filter(
  airbnb_nyc_2019,
  !is.na(name) | !is.na(host_name)
)

# Host lookup table: one row per unique (host_id, host_name) pair.
host_df <- distinct(airbnb_nyc_2019_cln, host_id, host_name)

# Listing-level table: every column except host_name (host_id is kept).
listing_df <- select(airbnb_nyc_2019_cln, -host_name)

# Popular listings: those with more than 50 reviews. This is taken before
# availability_category is added below, so it does not carry that column.
popular_listings <- filter(listing_df, number_of_reviews > 50)

# Bucket listings by days available per year: above 300 is "High", above 150
# is "Medium", and everything else falls through to "Low".
listing_df <- mutate(
  listing_df,
  availability_category = case_when(
    availability_365 > 300 ~ "High",
    availability_365 > 150 ~ "Medium",
    TRUE ~ "Low"
  )
)

1. Does the title of the booking impact the desirability of a unit? Are certain words, like “cozy” and “perfect,” more influential on the price when isolated from actual apartment features?

# Word-frequency analysis of listing titles, to see which descriptors are
# common. Titles are lower-cased first so that "Cozy" and "cozy" are counted
# as the same word, consistent with the case-insensitive matching used for
# the price comparison below.
word_list <- str_extract_all(
  str_to_lower(airbnb_nyc_2019_cln$name),
  boundary("word")
)

# Flatten the per-title word lists and tabulate them into (Var1, Freq) rows
word_df <- as.data.frame(table(unlist(word_list)))

# Sort to get the most frequent words first
word_df <- word_df[order(-word_df$Freq), ]
head(word_df, 10)

# Descriptors whose relationship with price is examined
words <- c("cozy", "perfect", "luxury")

# One logical flag per descriptor: TRUE when the title contains the word
# (case-insensitive). The flag is NA for listings whose title is missing.
airbnb_nyc_2019_cln$has_cozy <- str_detect(
  airbnb_nyc_2019_cln$name, regex("cozy", ignore_case = TRUE)
)
airbnb_nyc_2019_cln$has_perfect <- str_detect(
  airbnb_nyc_2019_cln$name, regex("perfect", ignore_case = TRUE)
)
airbnb_nyc_2019_cln$has_luxury <- str_detect(
  airbnb_nyc_2019_cln$name, regex("luxury", ignore_case = TRUE)
)

# Average price per descriptor. which() keeps only the rows whose flag is
# TRUE, so NA flags (missing titles) never reach the price vector.
# Average price for listings with 'cozy' in the title
avg_cozy <- mean(
  airbnb_nyc_2019_cln$price[which(airbnb_nyc_2019_cln$has_cozy)],
  na.rm = TRUE
)

# Average price for listings with 'perfect' in the title
avg_perfect <- mean(
  airbnb_nyc_2019_cln$price[which(airbnb_nyc_2019_cln$has_perfect)],
  na.rm = TRUE
)

# Average price for listings with 'luxury' in the title
avg_luxury <- mean(
  airbnb_nyc_2019_cln$price[which(airbnb_nyc_2019_cln$has_luxury)],
  na.rm = TRUE
)

# Baseline: average price of all listings (about $153)
avg_price_all <- mean(airbnb_nyc_2019_cln$price, na.rm = TRUE)

# Print results
cat("Average price for listings with 'cozy' in the title:", avg_cozy, "\n")
## Average price for listings with 'cozy' in the title: 104.755
cat("Average price for listings with 'perfect' in the title:", avg_perfect, "\n")
## Average price for listings with 'perfect' in the title: 167.8905
cat("Average price for listings with 'luxury' in the title:", avg_luxury, "\n")
## Average price for listings with 'luxury' in the title: 271.5722
cat("Overall average price:", avg_price_all, "\n")
## Overall average price: 152.7207
# Visualizing: one bar per descriptor plus the overall baseline
words_data <- data.frame(
  Words = c("Cozy", "Perfect", "Luxury", "Overall"),
  Price = c(avg_cozy, avg_perfect, avg_luxury, avg_price_all)
)

# geom_col() draws bars whose heights are the values already in the data
ggplot(words_data, aes(x = Words, y = Price, fill = Words)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Impact of Title Descriptors on Price",
    y = "Average Price ($)",
    x = ""
  )

Conclusion:
The choice of words in the titles of Airbnb listings in NYC clearly correlates with their pricing. Listings described as “luxury” are priced significantly higher, with an average of $271.5722, signaling premium accommodations. Conversely, listings termed “cozy” have an average price of $104.755, suggesting more budget-friendly options. Those with “perfect” in the title hold a middle ground, priced at $167.8905 on average, hinting at a desirable yet not overly premium offering. These figures highlight the potential influence of title descriptors on perceived value and listing price.


3. How does the actual feature of an apartment (like having a garden) compare in impact on price to just mentioning that feature in the title or description (like using the word “garden”)?

# The only garden-related signal used here is the word "garden" in the
# listing title, matched case-insensitively as in the first question.
# The match is computed once and reused for both columns below.
garden_in_title <- str_detect(
  airbnb_nyc_2019_cln$name,
  regex("garden", ignore_case = TRUE)
)

# NOTE(review): has_garden is a title-based proxy, not an observed amenity.
# It is identical to mentions_garden, so the two averages below are equal by
# construction. Replace it with real amenity data if that becomes available.
airbnb_nyc_2019_cln$has_garden <- garden_in_title

# Average price for listings flagged by has_garden. which() keeps only the
# rows whose flag is TRUE, so NA flags (missing titles) are left out.
avg_price_garden <- mean(
  airbnb_nyc_2019_cln$price[which(airbnb_nyc_2019_cln$has_garden)],
  na.rm = TRUE
)

# Listings that mention "garden" in the title
airbnb_nyc_2019_cln$mentions_garden <- garden_in_title

# Average price for listings that mention "garden"
avg_price_mentions_garden <- mean(
  airbnb_nyc_2019_cln$price[which(airbnb_nyc_2019_cln$mentions_garden)],
  na.rm = TRUE
)

# Results
cat("Average price for listings with a garden:", avg_price_garden, "\n")
## Average price for listings with a garden: 177.0972
cat("Average price for listings that mention 'garden':", avg_price_mentions_garden, "\n")
## Average price for listings that mention 'garden': 177.0972
# Visualizing: the two bars have the same height because both groups are
# defined by the same title match
garden_data <- data.frame(
  Type = c("Listings with Garden", "Mentions 'Garden'"),
  Price = c(avg_price_garden, avg_price_mentions_garden)
)

# geom_col() draws bars whose heights are the values already in the data
ggplot(garden_data, aes(x = Type, y = Price, fill = Type)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Impact of Actual Garden vs Mention of 'Garden' on Price",
    y = "Average Price ($)",
    x = ""
  )

Conclusion:
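Listings flagged as having a garden and those mentioning “garden” in the title both have an average price of $177.0972. The two figures are identical by construction: the dataset contains no amenity information, so both groups are defined by the same test (the word “garden” appearing in the title). The result therefore cannot show whether merely mentioning a feature is worth as much as actually having it; answering that question would require listing-level amenity data.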