library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)

Airbnb 2019 NYC Listings

Discussion’s author: Ron Balaban

Context:

This dataset offers a glimpse into Airbnb’s 2019 listing activity in New York City, reflecting its evolution since 2008.

Content:

Sourced from Kaggle, it covers details about hosts, location, and key metrics.

Questions:

Does the title of the booking impact the desirability of a unit? Are certain words, like “cozy” and “perfect,” more influential on the price when isolated from actual apartment features?
Does the location impact price for Airbnb rentals in the same way that it does for typical rentals? Does proximity to tourist attractions or popular sites affect the price of Airbnb rentals?
How does the actual feature of an apartment (like having a garden) compare in impact on price to just mentioning that feature in the title or description (like using the word “garden”)?

These questions are centered around understanding the factors that could potentially influence the price and desirability of an Airbnb listing.

Analysis:

# Location of the raw 2019 NYC Airbnb listings CSV used throughout the analysis.
url <- "https://raw.githubusercontent.com/hbedros/data607_prj2/main/df1/AB_NYC_2019.csv"

# Download and parse the CSV; readr infers the column types from the data.
airbnb_nyc_2019 <- read_csv(file = url)

## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Build the missing-value mask once and reuse it for both summaries below.
na_mask <- is.na(airbnb_nyc_2019)

# Total number of missing cells across the whole data frame (just over 20k).
sum(na_mask)

## [1] 20141

# Missing cells per column: name, host_name, last_review and
# reviews_per_month are the columns that contain gaps.
colSums(na_mask)

##                             id                           name 
##                              0                             16 
##                        host_id                      host_name 
##                              0                             21 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##                    last_review              reviews_per_month 
##                          10052                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0

# Drop listings that have neither a title nor a host name, since they add
# nothing to the analysis. A row is kept as soon as at least one of the two
# fields is present.
airbnb_nyc_2019_cln <- filter(
  airbnb_nyc_2019,
  !is.na(name) | !is.na(host_name)
)

# Host lookup table: one row per unique (host_id, host_name) combination.
host_df <- distinct(airbnb_nyc_2019_cln, host_id, host_name)

# Listing table: every column of the cleaned data except host_name, which now
# lives in host_df (host_id is kept so the two tables can be linked).
listing_df <- select(airbnb_nyc_2019_cln, -host_name)

# "Popular" listings are those with more than 50 reviews. Note that this is a
# review count, not a rating score.
popular_listings <- filter(listing_df, number_of_reviews > 50)

# Bucket listings by availability_365 so availability can serve as a
# popularity signal later on:
#   above 300        -> "High"
#   above 150 to 300 -> "Medium"
#   anything else    -> "Low"
listing_df <- mutate(
  listing_df,
  availability_category = case_when(
    availability_365 > 300 ~ "High",
    availability_365 > 150 ~ "Medium",
    TRUE ~ "Low"
  )
)

1. Does the title of the booking impact the desirability of a unit? Are certain words, like “cozy” and “perfect,” more influential on the price when isolated from actual apartment features?

# Word-frequency analysis of the listing titles, used to explore the
# hypothesis in the question above.
#
# Titles are lower-cased before tokenising so that "Cozy", "COZY" and "cozy"
# are counted as one word. This keeps the counts consistent with the
# case-insensitive keyword matching used for the price comparison.
word_list <- str_extract_all(
  str_to_lower(airbnb_nyc_2019_cln$name),
  boundary("word")
)

# Flatten the per-title word vectors and tabulate them; table() drops the NA
# entries produced by listings without a title.
word_df <- as.data.frame(table(unlist(word_list)))

# Sort by descending frequency and show the ten most common words.
word_df <- word_df[order(-word_df$Freq), ]
head(word_df, 10)

# Analyse how selected title keywords relate to price.
words <- c("cozy", "perfect", "luxury")

# Add one logical column per keyword: has_cozy, has_perfect, has_luxury.
# Matching is case-insensitive and substring-based, so "perfectly" also counts
# as a hit for "perfect". Listings without a title get NA.
for (word in words) {
  airbnb_nyc_2019_cln[[paste0("has_", word)]] <- str_detect(
    airbnb_nyc_2019_cln$name,
    regex(word, ignore_case = TRUE)
  )
}

# Average price per keyword. NA flags select NA prices, which na.rm = TRUE
# then discards, so untitled listings do not affect the means.

# Average price for listings with 'cozy' in the title
avg_cozy <- mean(
  airbnb_nyc_2019_cln$price[airbnb_nyc_2019_cln$has_cozy],
  na.rm = TRUE
)

# Average price for listings with 'perfect' in the title
avg_perfect <- mean(
  airbnb_nyc_2019_cln$price[airbnb_nyc_2019_cln$has_perfect],
  na.rm = TRUE
)

# Average price for listings with 'luxury' in the title
avg_luxury <- mean(
  airbnb_nyc_2019_cln$price[airbnb_nyc_2019_cln$has_luxury],
  na.rm = TRUE
)

# Baseline for comparison: average price of all listings (about $153)
avg_price_all <- mean(airbnb_nyc_2019_cln$price, na.rm = TRUE)

# Print the per-keyword averages next to the overall average.
cat("Average price for listings with 'cozy' in the title:", avg_cozy, "\n")

## Average price for listings with 'cozy' in the title: 104.755

cat("Average price for listings with 'perfect' in the title:", avg_perfect, "\n")

## Average price for listings with 'perfect' in the title: 167.8905

# This line reports the 'luxury' average, so it is labelled 'luxury'.
cat("Average price for listings with 'luxury' in the title:", avg_luxury, "\n")

## Average price for listings with 'luxury' in the title: 271.5722

cat("Overall average price:", avg_price_all, "\n")

## Overall average price: 152.7207

# Collect the three keyword averages and the overall average into one table
# so they can be drawn side by side.
words_data <- data.frame(
  Words = c("Cozy", "Perfect", "Luxury", "Overall"),
  Price = c(avg_cozy, avg_perfect, avg_luxury, avg_price_all)
)

# One bar per group; bar height is the pre-computed average price.
ggplot(data = words_data, mapping = aes(x = Words, y = Price, fill = Words)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Impact of Title Descriptors on Price",
    y = "Average Price ($)",
    x = ""
  )

Conclusion:
The choice of words in the titles of Airbnb listings in NYC clearly correlates with their pricing. Listings described as “luxury” are priced significantly higher, with an average of $271.5722, signaling premium accommodations. Conversely, listings termed “cozy” have an average price of $104.755, suggesting more budget-friendly options. Those with “perfect” in the title hold a middle ground, priced at $167.8905 on average, hinting at a desirable yet not overly premium offering. These figures highlight the potential influence of title descriptors on perceived value and listing price.

2. Does the location impact price for Airbnb rentals in the same way that it does for typical rentals? Does proximity to tourist attractions or popular sites affect the price of Airbnb rentals?

# Average listing price per neighbourhood group (borough).
by_borough <- group_by(airbnb_nyc_2019_cln, neighbourhood_group)
location_price <- summarise(by_borough, avg_price = mean(price, na.rm = TRUE))

print(location_price)

## # A tibble: 5 × 2
##   neighbourhood_group avg_price
##   <chr>                   <dbl>
## 1 Bronx                    87.5
## 2 Brooklyn                124. 
## 3 Manhattan               197. 
## 4 Queens                   99.5
## 5 Staten Island           115.

# One bar per neighbourhood group; bar height is that group's average price.
ggplot(
  data = location_price,
  mapping = aes(
    x = neighbourhood_group,
    y = avg_price,
    fill = neighbourhood_group
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Price by Neighbourhood Group",
    y = "Average Price ($)",
    x = ""
  )

Conclusion:
Manhattan has the highest Airbnb prices at $196.88, followed by Brooklyn ($124.38), Staten Island ($114.81), Queens ($99.52), and the Bronx ($87.50). Location strongly affects price in NYC, likely due to Manhattan having more tourist attractions than the other boroughs of New York City.

3. How does the actual feature of an apartment (like having a garden) compare in impact on price to just mentioning that feature in the title or description (like using the word “garden”)?

# Same keyword approach as in the first question, applied to "garden".
#
# NOTE: the dataset has no amenity columns, so whether a listing actually has
# a garden cannot be observed. The only available signal is the word "garden"
# in the title. `has_garden` is therefore a proxy that is identical to
# `mentions_garden` by construction, and the two averages below are always
# equal. Separating the two effects needs a source with real amenity data.
garden_in_title <- str_detect(
  airbnb_nyc_2019_cln$name,
  regex("garden", ignore_case = TRUE)
)

# Both flags are kept because later code refers to each of them.
airbnb_nyc_2019_cln$has_garden <- garden_in_title
airbnb_nyc_2019_cln$mentions_garden <- garden_in_title

# Average price for listings whose title mentions "garden". NA flags from
# untitled listings select NA prices, which na.rm = TRUE discards.
avg_price_mentions_garden <- mean(
  airbnb_nyc_2019_cln$price[garden_in_title],
  na.rm = TRUE
)

# Proxy for "listings with a garden": the same value as above (see NOTE).
avg_price_garden <- avg_price_mentions_garden

# Report both garden averages. Both values come from the same title search
# for "garden", which is why the two printed numbers match.
cat(
  "Average price for listings with a garden:",
  avg_price_garden,
  "\n",
  sep = " "
)

## Average price for listings with a garden: 177.0972

cat(
  "Average price for listings that mention 'garden':",
  avg_price_mentions_garden,
  "\n",
  sep = " "
)

## Average price for listings that mention 'garden': 177.0972

# Put the two garden averages into one table for plotting.
garden_data <- data.frame(
  Type = c("Listings with Garden", "Mentions 'Garden'"),
  Price = c(avg_price_garden, avg_price_mentions_garden)
)

# One bar per group; bar height is the pre-computed average price.
ggplot(data = garden_data, mapping = aes(x = Type, y = Price, fill = Type)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Impact of Actual Garden vs Mention of 'Garden' on Price",
    y = "Average Price ($)",
    x = ""
  )

Conclusion:
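Listings with an actual garden and those simply mentioning “garden” in the title both have an average price of $177.0972. Taken at face value, this would indicate that the mere mention of a feature, like a garden, equates in value to actually having that feature in the context of Airbnb pricing in NYC. However, the dataset contains no amenity information, so both groups were identified with the same search for “garden” in the listing title, and the two averages are identical by construction. The data therefore cannot separate the effect of actually having a garden from the effect of merely mentioning one; answering this question would require a data source that records actual amenities.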

Project 2 - Part A

Haig Bedros

2023-10-07