R Markdown

This is the R Markdown file for HW3 of DACS-601, Summer 2022. I’m using the New York City Airbnb CSV file from the Sample Datasets.

loading libraries

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)

loading data

print("Loading data...")
## [1] "Loading data..."
# Read the NYC Airbnb CSV into a tibble. The file location can be overridden
# with the AB_NYC_CSV environment variable so the document can be knit on
# another machine; when that variable is unset or empty, the original
# absolute path is used, so existing behaviour is unchanged.
ab_nyc_data <- read_csv(
  if (nzchar(Sys.getenv("AB_NYC_CSV"))) {
    Sys.getenv("AB_NYC_CSV")
  } else {
    "C:/Users/apoor/Desktop/UMass/Summer 2022/DACS 601 - R Programming/datasets/AB_NYC_2019.csv"
  }
)
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Confirm the import finished, then preview the first rows of the table.
print("Data loaded!")
## [1] "Data loaded!"
ab_nyc_data %>%
  head()
## # A tibble: 6 × 16
##      id name           host_id host_name neighbourhood_g… neighbourhood latitude
##   <dbl> <chr>            <dbl> <chr>     <chr>            <chr>            <dbl>
## 1  2539 Clean & quiet…    2787 John      Brooklyn         Kensington        40.6
## 2  2595 Skylit Midtow…    2845 Jennifer  Manhattan        Midtown           40.8
## 3  3647 THE VILLAGE O…    4632 Elisabeth Manhattan        Harlem            40.8
## 4  3831 Cozy Entire F…    4869 LisaRoxa… Brooklyn         Clinton Hill      40.7
## 5  5022 Entire Apt: S…    7192 Laura     Manhattan        East Harlem       40.8
## 6  5099 Large Cozy 1 …    7322 Chris     Manhattan        Murray Hill       40.7
## # … with 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## #   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## #   availability_365 <dbl>

printing dimensions

# Report the table size as a (rows, columns) pair.
print("Dimensions: ")
## [1] "Dimensions: "
c(nrow(ab_nyc_data), ncol(ab_nyc_data))
## [1] 48895    16

printing column names

# List every column header in the order it appears in the table.
print("Column names: ")
## [1] "Column names: "
names(ab_nyc_data)
##  [1] "id"                             "name"                          
##  [3] "host_id"                        "host_name"                     
##  [5] "neighbourhood_group"            "neighbourhood"                 
##  [7] "latitude"                       "longitude"                     
##  [9] "room_type"                      "price"                         
## [11] "minimum_nights"                 "number_of_reviews"             
## [13] "last_review"                    "reviews_per_month"             
## [15] "calculated_host_listings_count" "availability_365"

printing data types

# Show the class of every column as a named character vector.
# vapply() replaces sapply() so the result is always a character vector:
# sapply() would silently return a list if any column carried more than one
# class. Multi-class columns are collapsed into a single "a/b" label; columns
# with one class print exactly as before.
print("Data types: ")
## [1] "Data types: "
vapply(
  ab_nyc_data,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
##                             id                           name 
##                      "numeric"                    "character" 
##                        host_id                      host_name 
##                      "numeric"                    "character" 
##            neighbourhood_group                  neighbourhood 
##                    "character"                    "character" 
##                       latitude                      longitude 
##                      "numeric"                      "numeric" 
##                      room_type                          price 
##                    "character"                      "numeric" 
##                 minimum_nights              number_of_reviews 
##                      "numeric"                      "numeric" 
##                    last_review              reviews_per_month 
##                         "Date"                      "numeric" 
## calculated_host_listings_count               availability_365 
##                      "numeric"                      "numeric"

Potential questions

1. Host with the most stays

# Tally rows per host_id and show the six hosts with the highest counts.
# NOTE(review): this counts rows for each host, which looks like listings per
# host rather than stays — confirm the label matches the intent.
print("Host with the most stays - ")
## [1] "Host with the most stays - "
ab_nyc_data %>%
  count(host_id, sort = TRUE) %>%
  head()
## # A tibble: 6 × 2
##     host_id     n
##       <dbl> <int>
## 1 219517861   327
## 2 107434423   232
## 3  30283594   121
## 4 137358866   103
## 5  12243051    96
## 6  16098958    96

2. Airbnb with most number of reviews

# Total number of reviews per (id, name) pair, highest first, top six shown.
# `.groups = "drop"` makes summarise() return an ungrouped tibble, so no
# leftover grouping by `id` leaks into later steps and the regrouping message
# is no longer emitted. arrange() ignores grouping by default, so the row
# order is the same as before.
print("Airbnb with most number of reviews - ")
## [1] "Airbnb with most number of reviews - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(reviews = sum(number_of_reviews), .groups = "drop") %>%
  arrange(desc(reviews)) %>%
  head()
## # A tibble: 6 × 3
##         id name                              reviews
##      <dbl> <chr>                               <dbl>
## 1  9145202 Room near JFK Queen Bed               629
## 2   903972 Great Bedroom in Manhattan            607
## 3   903947 Beautiful Bedroom in Manhattan        597
## 4   891117 Private Bedroom in Manhattan          594
## 5 10101135 Room Near JFK Twin Beds               576
## 6  8168619 Steps away from Laguardia airport     543

3. Most available airbnb

# Mean availability_365 per (id, name) pair, highest first, top six shown.
# `.groups = "drop"` makes summarise() return an ungrouped tibble, so no
# leftover grouping by `id` leaks into later steps and the regrouping message
# is no longer emitted. arrange() ignores grouping by default, so the row
# order is the same as before.
print("Most available airbnb - ")
## [1] "Most available airbnb - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(desc(availability)) %>%
  head()
## # A tibble: 6 × 3
##      id name                                availability
##   <dbl> <chr>                                      <dbl>
## 1  2539 Clean & quiet apt home by the park           365
## 2  3647 THE VILLAGE OF HARLEM....NEW YORK !          365
## 3 11452 Clean and Quiet in Brooklyn                  365
## 4 11943 Country space in the city                    365
## 5 21644 Upper Manhattan, New York                    365
## 6 32037 Huge Private  Floor at The Waverly           365

4. Least available airbnb

# Mean availability_365 per (id, name) pair, lowest first, top six shown.
# `.groups = "drop"` makes summarise() return an ungrouped tibble, so no
# leftover grouping by `id` leaks into later steps and the regrouping message
# is no longer emitted. arrange() ignores grouping by default, so the row
# order is the same as before.
print("Least available airbnb - ")
## [1] "Least available airbnb - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(availability) %>%
  head()
## # A tibble: 6 × 3
##      id name                                              availability
##   <dbl> <chr>                                                    <dbl>
## 1  5022 Entire Apt: Spacious Studio/Loft by central park             0
## 2  5121 BlissArtsSpace!                                              0
## 3  5203 Cozy Clean Guest Room - Family Apt                           0
## 4  6090 West Village Nest - Superhost                                0
## 5  7801 Sweet and Spacious Brooklyn Loft                             0
## 6  8700 Magnifique Suite au N de Manhattan - vue Cloitres            0

5. Number of airbnbs in each neighbourhood group

# Count the rows that fall in each neighbourhood group.
print("Number of airbnbs in each neighbourhood group - ")
## [1] "Number of airbnbs in each neighbourhood group - "
ab_nyc_data %>%
  count(neighbourhood_group) %>%
  head()
## # A tibble: 5 × 2
##   neighbourhood_group     n
##   <chr>               <int>
## 1 Bronx                1091
## 2 Brooklyn            20104
## 3 Manhattan           21661
## 4 Queens               5666
## 5 Staten Island         373

6. Average price of airbnb in each neighbourhood group

# Mean price within each neighbourhood group, most expensive group first.
print("Average price of airbnb in each neighbourhood group - ")
## [1] "Average price of airbnb in each neighbourhood group - "
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(avg_price = mean(price)) %>%
  arrange(desc(avg_price)) %>%
  head()
## # A tibble: 5 × 2
##   neighbourhood_group avg_price
##   <chr>                   <dbl>
## 1 Manhattan               197. 
## 2 Brooklyn                124. 
## 3 Staten Island           115. 
## 4 Queens                   99.5
## 5 Bronx                    87.5