Analysis of New York Airbnb Dataset

DACS 601 - HW6 in distill format

Apoorva Saraswat
2022-08-08
# Echo the source code of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Set knitr's root directory for evaluating chunks.
# NOTE(review): this is an absolute, machine-specific path; the report will
# presumably not knit on another machine without editing it -- confirm.
knitr::opts_knit$set(root.dir = "C:/Users/apoor/OneDrive/Desktop/UMass/Summer 2022/DACS 601 - R Programming")
# Print the working directory in effect for this chunk.
getwd()
[1] "C:/Users/apoor/OneDrive/Desktop/UMass/Summer 2022/DACS 601 - R Programming/hw6"

Introduction

In this analysis we analyse the New York Airbnb Dataset, which includes information about Airbnbs in various neighbourhoods of New York City, USA. Airbnbs are houses/apartments that owners rent out to travellers looking for a place to stay. The Airbnbs can be private or shared. We examine the relationship between the price, location, room type, reviews, etc. of the Airbnbs and try to see if there is any trend being followed.

Data

The dataset has 48895 rows and 16 columns, which contain information about the Airbnbs in New York City.

Loading data

# Read the listings CSV into a tibble (path is relative to the working directory).
ab_nyc_data <- read_csv("datasets//AB_NYC_2019.csv")
# Label for the dimensions printed next.
print("Dimensions: ")
[1] "Dimensions: "
# Number of rows and columns in the loaded data.
dim(ab_nyc_data)
[1] 48895    16
# Label for the column names printed next.
print("Column names: ")
[1] "Column names: "
# Names of all columns in the loaded data.
colnames(ab_nyc_data)
 [1] "id"                             "name"                          
 [3] "host_id"                        "host_name"                     
 [5] "neighbourhood_group"            "neighbourhood"                 
 [7] "latitude"                       "longitude"                     
 [9] "room_type"                      "price"                         
[11] "minimum_nights"                 "number_of_reviews"             
[13] "last_review"                    "reviews_per_month"             
[15] "calculated_host_listings_count" "availability_365"              
print("Data types: ")
[1] "Data types: "
# Report one class name per column. vapply() pins the result to a named
# character vector; sapply() would silently return a list if any column
# carried more than one class (e.g. a date-time column).
vapply(ab_nyc_data, function(column) class(column)[1], character(1))
                            id                           name 
                     "numeric"                    "character" 
                       host_id                      host_name 
                     "numeric"                    "character" 
           neighbourhood_group                  neighbourhood 
                   "character"                    "character" 
                      latitude                      longitude 
                     "numeric"                      "numeric" 
                     room_type                          price 
                   "character"                      "numeric" 
                minimum_nights              number_of_reviews 
                     "numeric"                      "numeric" 
                   last_review              reviews_per_month 
                   "character"                      "numeric" 
calculated_host_listings_count               availability_365 
                     "numeric"                      "numeric" 
# Label for the preview printed next.
print("Data head: ")
[1] "Data head: "
# Preview the first six rows of the data.
head(ab_nyc_data)
# A tibble: 6 × 16
     id name   host_id host_…¹ neigh…² neigh…³ latit…⁴ longi…⁵ room_…⁶
  <dbl> <chr>    <dbl> <chr>   <chr>   <chr>     <dbl>   <dbl> <chr>  
1  2539 Clean…    2787 John    Brookl… Kensin…    40.6   -74.0 Privat…
2  2595 Skyli…    2845 Jennif… Manhat… Midtown    40.8   -74.0 Entire…
3  3647 THE V…    4632 Elisab… Manhat… Harlem     40.8   -73.9 Privat…
4  3831 Cozy …    4869 LisaRo… Brookl… Clinto…    40.7   -74.0 Entire…
5  5022 Entir…    7192 Laura   Manhat… East H…    40.8   -73.9 Entire…
6  5099 Large…    7322 Chris   Manhat… Murray…    40.7   -74.0 Entire…
# … with 7 more variables: price <dbl>, minimum_nights <dbl>,
#   number_of_reviews <dbl>, last_review <chr>,
#   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
#   availability_365 <dbl>, and abbreviated variable names
#   ¹​host_name, ²​neighbourhood_group, ³​neighbourhood, ⁴​latitude,
#   ⁵​longitude, ⁶​room_type
# ℹ Use `colnames()` to see all variable names

Exploring the dataset

1. Number of airbnbs in each neighbourhood group

# Tally the listings in each neighbourhood group.
ab_nyc_data %>%
  count(neighbourhood_group) %>%
  head()
# A tibble: 5 × 2
  neighbourhood_group     n
  <chr>               <int>
1 Bronx                1091
2 Brooklyn            20104
3 Manhattan           21661
4 Queens               5666
5 Staten Island         373

2. Average price of airbnb in each neighbourhood group

# Mean price per neighbourhood group, highest first.
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(avg_price = mean(price)) %>%
  arrange(desc(avg_price)) %>%
  head()
# A tibble: 5 × 2
  neighbourhood_group avg_price
  <chr>                   <dbl>
1 Manhattan               197. 
2 Brooklyn                124. 
3 Staten Island           115. 
4 Queens                   99.5
5 Bronx                    87.5

3. Median price of airbnb in each neighbourhood group

# Median price per neighbourhood group, highest first.
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(median_price = median(price)) %>%
  arrange(desc(median_price)) %>%
  head()
# A tibble: 5 × 2
  neighbourhood_group median_price
  <chr>                      <dbl>
1 Manhattan                    150
2 Brooklyn                      90
3 Queens                        75
4 Staten Island                 75
5 Bronx                         65

4. Standard deviation of the price of airbnbs in each neighbourhood group

# Standard deviation of price per neighbourhood group, most variable first.
# The value is reported in the same units as price (no rescaling), so it can
# be compared directly with the mean and median tables.
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(sd_price = sd(price)) %>%
  arrange(desc(sd_price)) %>%
  head()
# A tibble: 5 × 2
  neighbourhood_group sd_price
  <chr>                  <dbl>
1 Manhattan               291.
2 Staten Island           278.
3 Brooklyn                187.
4 Queens                  167.
5 Bronx                   107.

5. Hosts with the most listings

# Count the rows for each host and show the hosts with the most rows.
ab_nyc_data %>%
  count(host_id) %>%
  arrange(desc(n)) %>%
  head()
# A tibble: 6 × 2
    host_id     n
      <dbl> <int>
1 219517861   327
2 107434423   232
3  30283594   121
4 137358866   103
5  12243051    96
6  16098958    96

6. Airbnbs with the most reviews

# Total review count per listing (id + name), most reviewed first.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(reviews = sum(number_of_reviews)) %>%
  arrange(desc(reviews)) %>%
  head()
# A tibble: 6 × 3
# Groups:   id [6]
        id name                              reviews
     <dbl> <chr>                               <dbl>
1  9145202 Room near JFK Queen Bed               629
2   903972 Great Bedroom in Manhattan            607
3   903947 Beautiful Bedroom in Manhattan        597
4   891117 Private Bedroom in Manhattan          594
5 10101135 Room Near JFK Twin Beds               576
6  8168619 Steps away from Laguardia airport     543

7. Most available airbnb

# Mean availability_365 per listing (id + name), most available first.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365)) %>%
  arrange(desc(availability)) %>%
  head()
# A tibble: 6 × 3
# Groups:   id [6]
     id name                                availability
  <dbl> <chr>                                      <dbl>
1  2539 Clean & quiet apt home by the park           365
2  3647 THE VILLAGE OF HARLEM....NEW YORK !          365
3 11452 Clean and Quiet in Brooklyn                  365
4 11943 Country space in the city                    365
5 21644 Upper Manhattan, New York                    365
6 32037 Huge Private  Floor at The Waverly           365

8. Least available airbnb

# Mean availability_365 per listing (id + name), least available first.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365)) %>%
  arrange(availability) %>%
  head()
# A tibble: 6 × 3
# Groups:   id [6]
     id name                                              availability
  <dbl> <chr>                                                    <dbl>
1  5022 Entire Apt: Spacious Studio/Loft by central park             0
2  5121 BlissArtsSpace!                                              0
3  5203 Cozy Clean Guest Room - Family Apt                           0
4  6090 West Village Nest - Superhost                                0
5  7801 Sweet and Spacious Brooklyn Loft                             0
6  8700 Magnifique Suite au N de Manhattan - vue Cloitres            0

Visualisations

The following are the exploratory visualisations on the dataset where we try to figure out patterns in the data.

Univariate Analysis

Plot 1 - Price Distribution

# Histogram of price across all listings, in bins 15 wide.
ggplot(ab_nyc_data) +
  geom_histogram(aes(price), binwidth = 15) +
  labs(
    x = "Price (USD)",
    y = "Frequency",
    title = "Price Distribution"
  )

Since this plot is hard to read, we zoom in on x values from 0 to 500, where the majority of the distribution lies.

# Same histogram, zoomed to prices between 0 and 500.
# coord_cartesian() changes only the visible window; xlim() would instead
# drop every listing priced above 500 and the bins touching the limits before
# the histogram is computed, which removes bars and emits warnings.
ggplot(ab_nyc_data) +
  geom_histogram(aes(price), binwidth = 15) +
  xlab("Price (USD)") +
  ylab("Frequency") +
  ggtitle("Price Distribution") +
  coord_cartesian(xlim = c(0, 500))

Limitations

Bivariate Analysis

Plot 2 - Median price of airbnb for each neighbourhood group

# Per neighbourhood group: the median price plus the 25th and 75th
# percentiles. The quartiles replace the previous "sd / 10" error bars, whose
# scale factor was arbitrary and which paired a mean-based spread with a
# median.
ab_nyc_data2 <- ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(
    q25 = quantile(price, 0.25, names = FALSE),
    q75 = quantile(price, 0.75, names = FALSE),
    median = median(price)
  )

# Bars show the median; error bars span the interquartile range.
ggplot(ab_nyc_data2, aes(x = neighbourhood_group, y = median, fill = neighbourhood_group)) +
  geom_bar(stat = "identity") +
  xlab("Neighbourhood group") +
  ylab("Median Price (USD)") +
  ggtitle("Median Price by neighbourhood group",
          subtitle = "Error bars show the interquartile range") +
  geom_errorbar(aes(ymin = q25, ymax = q75), width = 0.6, colour = "black")

Limitations

Plot 3 - Median price of airbnb for each room type

# Per room type: the median price plus the 25th and 75th percentiles. The
# quartiles replace the previous "sd / 10" error bars, whose scale factor was
# arbitrary and which paired a mean-based spread with a median.
ab_nyc_data3 <- ab_nyc_data %>%
  group_by(room_type) %>%
  summarise(
    q25 = quantile(price, 0.25, names = FALSE),
    q75 = quantile(price, 0.75, names = FALSE),
    median = median(price)
  )

# Bars show the median; error bars span the interquartile range.
ggplot(ab_nyc_data3, aes(x = room_type, y = median, fill = room_type)) +
  geom_bar(stat = "identity") +
  xlab("Room type") +
  ylab("Median Price") +
  ggtitle("Median Price by room type",
          subtitle = "Error bars show the interquartile range") +
  geom_errorbar(aes(ymin = q25, ymax = q75), width = 0.6, colour = "black")

Limitations

Facetwrap Analysis

Plot 4 - Median price of airbnb for each room type by neighbourhood group

# Median price for every (neighbourhood group, room type) pair.
# The median has to be computed per group before plotting: calling
# median(price) inside aes() yields one global median repeated on every row,
# and stacking those rows draws bars proportional to listing counts rather
# than to price.
ab_nyc_data4 <- ab_nyc_data %>%
  group_by(neighbourhood_group, room_type) %>%
  summarise(median_price = median(price), .groups = "drop")

# One panel per neighbourhood group, one bar per room type.
ggplot(ab_nyc_data4, aes(x = room_type, y = median_price, fill = room_type)) +
  geom_bar(stat = "identity") +
  xlab("Room Type") +
  ylab("Median Price (USD)") +
  ggtitle("Median Price for each room type by neighbourhood group") +
  facet_wrap(vars(neighbourhood_group), scales = "free_y") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Limitations

Multivariate Analysis

Plot 5 - Relationship between number of reviews, price and minimum nights

# Scatter of price against number of reviews; point colour also encodes price
# and point size encodes minimum_nights.
# The title now names all three plotted variables (it previously omitted
# price, the y axis). The earlier theme() call set only empty element_text()
# values and had no visible effect, so it is dropped.
ggplot(ab_nyc_data, aes(x = number_of_reviews, y = price, col = price, size = minimum_nights)) +
    geom_point() +
    xlab("Number of reviews") +
    ylab("Price") +
    ggtitle("Relationship between number of reviews, price and minimum nights",
            subtitle = "The costlier airbnbs have lesser number of reviews")

Reflection

To be added

Describe your process/experience with the project, including the decisions you made, what was most challenging, and what you wish you would have known. You can also discuss what the next steps would be, were you to continue with the project.

Conclusion

The following are the conclusions made so far -

  1. Manhattan is the most expensive of the neighbourhood groups.
  2. Brooklyn can be the second best choice considering the price and number of reviews.
  3. Entire home/apts are much more expensive than private or shared rooms.

Final Questions

Bibliography