Data Analysis of Key Variables Affecting Airbnb Prices in NYC

Applied Analytics Assignment 2

Rahul Reddy Bommireddy (s4125831)

Last updated: 01 June, 2025

Introduction & Problem Statement

Data

# Read the dataset.
# The data directory is kept in a variable and joined to the file name with
# file.path() rather than calling setwd(), so the session's working directory
# is not changed as a side effect of loading the data.
data_dir <- "C:/RMIT/Sem1/Applied Analytics/Assignment2"
airbnb_data <- read_excel(file.path(data_dir, "AB_NYC_2019.xlsx"))

# Cleaning data: keep only usable listings, then encode the two grouping
# columns as factors.
clean_airbnb <- airbnb_data %>%
  filter(
    price > 0,              # drop listings with a zero price
    minimum_nights <= 365,  # drop listings with extremely high minimum nights
    availability_365 > 0    # drop listings not available for booking
  ) %>%
  mutate(
    room_type = as.factor(room_type),
    neighbourhood_group = as.factor(neighbourhood_group)
  )

# Show the first six rows of the cleaned dataset
clean_airbnb %>%
  head(n = 6) %>%
  print()
## # A tibble: 6 × 16
##      id name        host_id host_name neighbourhood_group neighbourhood latitude
##   <dbl> <chr>         <dbl> <chr>     <fct>               <chr>            <dbl>
## 1  2539 Clean & qu…    2787 John      Brooklyn            Kensington        40.6
## 2  2595 Skylit Mid…    2845 Jennifer  Manhattan           Midtown           40.8
## 3  3647 THE VILLAG…    4632 Elisabeth Manhattan           Harlem            40.8
## 4  3831 Cozy Entir…    4869 LisaRoxa… Brooklyn            Clinton Hill      40.7
## 5  5099 Large Cozy…    7322 Chris     Manhattan           Murray Hill       40.7
## 6  5178 Large Furn…    8967 Shunichi  Manhattan           Hell's Kitch…     40.8
## # ℹ 9 more variables: longitude <dbl>, room_type <fct>, price <dbl>,
## #   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <dttm>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## #   availability_365 <dbl>

Descriptive Statistics

# Summary statistics of price grouped by room_type: minimum, quartiles,
# median, maximum, mean, standard deviation, row count and number of
# missing prices.
# The result is assigned with a leading `<-` so the target name is visible
# at the start of the pipeline rather than after its last step.
price_summary <- clean_airbnb %>%
  group_by(room_type) %>%
  summarise(
    Minimum = min(price, na.rm = TRUE),
    # names = FALSE: return a bare number instead of a vector named "25%"
    First_Quartile = quantile(price, 0.25, na.rm = TRUE, names = FALSE),
    Med_Price = median(price, na.rm = TRUE),
    Third_Quartile = quantile(price, 0.75, na.rm = TRUE, names = FALSE),
    Maximum = max(price, na.rm = TRUE),
    Average = mean(price, na.rm = TRUE),
    Std_Dev = sd(price, na.rm = TRUE),
    Count = n(),
    Miss_Values = sum(is.na(price))
  )

# Render the summary as a captioned table
knitr::kable(price_summary, caption = "Price Summary by Room Type")
Price Summary by Room Type
room_type Minimum First_Quartile Med_Price Third_Quartile Maximum Average Std_Dev Count Miss_Values
Entire home/apt 10 123 170 249 10000 224.63996 297.93586 16523 0
Private room 10 53 70 99 9999 93.98588 172.98513 13956 0
Shared room 11 32 43 70 1800 66.09524 97.91027 861 0

Visualisation

# Box plot of price for each room type.
# Outlier points are not drawn (outlier.shape = NA) and the y-axis view is
# limited to 0-500 with coord_cartesian(), which zooms the plot without
# dropping any rows from the box statistics.
ggplot(
  data = clean_airbnb,
  mapping = aes(x = room_type, y = price, fill = room_type)
) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = c(0, 500)) +
  theme_minimal() +
  labs(
    title = "Price Distribution by Room Type",
    x = "Room Type",
    y = "Price (USD)"
  )

# Mean listing price for each NYC region (neighbourhood_group)
avg_price_borough <- summarise(
  group_by(clean_airbnb, neighbourhood_group),
  Avg_Price = mean(price)
)

# Bar chart of the regional means, bars ordered from highest to lowest
ggplot(
  data = avg_price_borough,
  mapping = aes(
    x = reorder(neighbourhood_group, -Avg_Price),
    y = Avg_Price,
    fill = neighbourhood_group
  )
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Price by NYC Region",
    x = "Region",
    y = "Average Price (USD)"
  )

# Scatter plot of price against number of reviews, coloured by region.
# Points are drawn semi-transparent (alpha = 0.4) and the y-axis uses a
# log10 scale to compress the skew in price.
ggplot(
  data = clean_airbnb,
  mapping = aes(
    x = number_of_reviews,
    y = price,
    color = neighbourhood_group
  )
) +
  geom_point(alpha = 0.4) +
  scale_y_log10() +
  theme_minimal() +
  labs(
    title = "Price vs. Number of Reviews by NYC Region",
    x = "Number of Reviews",
    y = "Price (log scale)",
    color = "Region"
  )

Hypothesis Testing

Mean Price Comparison

\[H_0: \mu_{\text{Entire home/apt}} = \mu_{\text{Private room}}\]

\[H_A: \mu_{\text{Entire home/apt}} \ne \mu_{\text{Private room}}\]

# Keep only the two room types being compared
price_data <- filter(
  clean_airbnb,
  room_type %in% c("Entire home/apt", "Private room")
)

# Welch's two-sample t-test of price between the two room types
# (var.equal = FALSE, the default, does not assume equal variances)
t_test_result <- t.test(
  price ~ room_type,
  data = price_data,
  var.equal = FALSE
)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  price by room_type
## t = 47.656, df = 27210, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Entire home/apt and group Private room is not equal to 0
## 95 percent confidence interval:
##  125.2804 136.0278
## sample estimates:
## mean in group Entire home/apt    mean in group Private room 
##                     224.63996                      93.98588

Hypothesis Testing Cont.

Regression Analysis: What affects Airbnb price?

# Keep only rows where the response and all three predictors are present
airbnb <- clean_airbnb %>%
  filter(
    !(is.na(price) | is.na(room_type) | is.na(neighbourhood_group) |
      is.na(number_of_reviews))
  )

# Fit a linear model of price on room type, region and number of reviews
model <- lm(
  price ~ room_type + neighbourhood_group + number_of_reviews,
  data = airbnb
)

# Print coefficients, residual summary and fit statistics
print(summary(model))
## 
## Call:
## lm(formula = price ~ room_type + neighbourhood_group + number_of_reviews, 
##     data = airbnb)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -258.3  -69.2  -25.6   12.4 9849.5 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       176.2728     8.2703  21.314  < 2e-16 ***
## room_typePrivate room            -116.8403     2.8411 -41.125  < 2e-16 ***
## room_typeShared room             -151.7328     8.5048 -17.841  < 2e-16 ***
## neighbourhood_groupBrooklyn        27.3774     8.3371   3.284  0.00103 ** 
## neighbourhood_groupManhattan       91.9951     8.3320  11.041  < 2e-16 ***
## neighbourhood_groupQueens          10.2822     8.8438   1.163  0.24498    
## neighbourhood_groupStaten Island   12.2803    15.5715   0.789  0.43033    
## number_of_reviews                  -0.3136     0.0266 -11.790  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 242.6 on 31332 degrees of freedom
## Multiple R-squared:  0.09129,    Adjusted R-squared:  0.09109 
## F-statistic: 449.7 on 7 and 31332 DF,  p-value: < 2.2e-16

Results Explanation

Residuals: Q-Q Plot

# Normal Q-Q plot of the regression residuals,
# with a red reference line of width 2 added on top
qqnorm(y = residuals(model))
qqline(y = residuals(model), lwd = 2, col = "red")

Discussion

References