R Markdown

This is the R Markdown file for HW3 of DACS 601, Summer 2022. I’m using the New York City Airbnb CSV file (AB_NYC_2019.csv) from the Sample Datasets.

loading libraries

library(readr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)

loading data

# Status message so the reader can see when the CSV import starts.
print("Loading data...")

## [1] "Loading data..."

# Location of the NYC Airbnb listings CSV. The original absolute path stays
# the default; set the AB_NYC_DATA_PATH environment variable to knit this
# document on another machine without editing the code.
ab_nyc_path <- Sys.getenv(
  "AB_NYC_DATA_PATH",
  unset = "C:/Users/apoor/Desktop/UMass/Summer 2022/DACS 601 - R Programming/datasets/AB_NYC_2019.csv"
)

# Fail early with an actionable message if the file is missing.
if (!file.exists(ab_nyc_path)) {
  stop(
    "Airbnb data file not found: ", ab_nyc_path,
    ". Set AB_NYC_DATA_PATH to the location of AB_NYC_2019.csv.",
    call. = FALSE
  )
}

# read_csv() guesses the column types and returns a tibble.
ab_nyc_data <- read_csv(ab_nyc_path)

## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Confirm that the import finished.
print("Data loaded!")

## [1] "Data loaded!"

# Preview the first six rows (head()'s default, spelled out here).
head(ab_nyc_data, n = 6L)

## # A tibble: 6 × 16
##      id name           host_id host_name neighbourhood_g… neighbourhood latitude
##   <dbl> <chr>            <dbl> <chr>     <chr>            <chr>            <dbl>
## 1  2539 Clean & quiet…    2787 John      Brooklyn         Kensington        40.6
## 2  2595 Skylit Midtow…    2845 Jennifer  Manhattan        Midtown           40.8
## 3  3647 THE VILLAGE O…    4632 Elisabeth Manhattan        Harlem            40.8
## 4  3831 Cozy Entire F…    4869 LisaRoxa… Brooklyn         Clinton Hill      40.7
## 5  5022 Entire Apt: S…    7192 Laura     Manhattan        East Harlem       40.8
## 6  5099 Large Cozy 1 …    7322 Chris     Manhattan        Murray Hill       40.7
## # … with 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## #   minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## #   reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## #   availability_365 <dbl>

printing dimensions

# Label for the dataset size printed below.
print("Dimensions: ")

## [1] "Dimensions: "

# Row count followed by column count, the same pair dim() reports.
c(nrow(ab_nyc_data), ncol(ab_nyc_data))

## [1] 48895    16

printing column names

# Label for the column listing printed below.
print("Column names: ")

## [1] "Column names: "

# For a data frame / tibble, names() returns the column names.
names(ab_nyc_data)

##  [1] "id"                             "name"                          
##  [3] "host_id"                        "host_name"                     
##  [5] "neighbourhood_group"            "neighbourhood"                 
##  [7] "latitude"                       "longitude"                     
##  [9] "room_type"                      "price"                         
## [11] "minimum_nights"                 "number_of_reviews"             
## [13] "last_review"                    "reviews_per_month"             
## [15] "calculated_host_listings_count" "availability_365"

printing data types

# Label for the column-type listing printed below.
print("Data types: ")

## [1] "Data types: "

# Class of every column as a named character vector. vapply() fixes the
# result type (sapply() would return a list if any column had more than one
# class), and collapsing with "/" keeps such multi-class columns to a single
# string per column.
vapply(
  ab_nyc_data,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)

##                             id                           name 
##                      "numeric"                    "character" 
##                        host_id                      host_name 
##                      "numeric"                    "character" 
##            neighbourhood_group                  neighbourhood 
##                    "character"                    "character" 
##                       latitude                      longitude 
##                      "numeric"                      "numeric" 
##                      room_type                          price 
##                    "character"                      "numeric" 
##                 minimum_nights              number_of_reviews 
##                      "numeric"                      "numeric" 
##                    last_review              reviews_per_month 
##                         "Date"                      "numeric" 
## calculated_host_listings_count               availability_365 
##                      "numeric"                      "numeric"

Potential questions

1. Host with the most stays

# Label for the ranking printed below.
print("Host with the most stays - ")

## [1] "Host with the most stays - "

# The column list shows no per-stay field, so this counts rows per host
# (each row appears to be one listing) as a proxy. sort = TRUE orders by
# that count, largest first, and head() keeps the top six hosts.
ab_nyc_data %>%
  count(host_id, sort = TRUE) %>%
  head()

## # A tibble: 6 × 2
##     host_id     n
##       <dbl> <int>
## 1 219517861   327
## 2 107434423   232
## 3  30283594   121
## 4 137358866   103
## 5  12243051    96
## 6  16098958    96

2. Airbnb with most number of reviews

# Label for the ranking printed below.
print("Airbnb with most number of reviews - ")

## [1] "Airbnb with most number of reviews - "

# Total reviews per listing (id + name), highest first; head() keeps the
# top six. `.groups = "drop"` returns an ungrouped tibble, so no grouping
# leaks into later steps and summarise() does not emit its regrouping
# message.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(reviews = sum(number_of_reviews), .groups = "drop") %>%
  arrange(desc(reviews)) %>%
  head()

## # A tibble: 6 × 3
##         id name                              reviews
##      <dbl> <chr>                               <dbl>
## 1  9145202 Room near JFK Queen Bed               629
## 2   903972 Great Bedroom in Manhattan            607
## 3   903947 Beautiful Bedroom in Manhattan        597
## 4   891117 Private Bedroom in Manhattan          594
## 5 10101135 Room Near JFK Twin Beds               576
## 6  8168619 Steps away from Laguardia airport     543

3. Most available airbnb

# Label for the ranking printed below.
print("Most available airbnb - ")

## [1] "Most available airbnb - "

# Mean of availability_365 per listing (id + name), highest first; head()
# keeps six rows. Many listings tie at the maximum, so the six shown are
# simply the first of those ties. `.groups = "drop"` returns an ungrouped
# tibble and avoids summarise()'s regrouping message.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(desc(availability)) %>%
  head()

## # A tibble: 6 × 3
##      id name                                availability
##   <dbl> <chr>                                      <dbl>
## 1  2539 Clean & quiet apt home by the park           365
## 2  3647 THE VILLAGE OF HARLEM....NEW YORK !          365
## 3 11452 Clean and Quiet in Brooklyn                  365
## 4 11943 Country space in the city                    365
## 5 21644 Upper Manhattan, New York                    365
## 6 32037 Huge Private  Floor at The Waverly           365

4. Least available airbnb

# Label for the ranking printed below.
print("Least available airbnb - ")

## [1] "Least available airbnb - "

# Mean of availability_365 per listing (id + name), lowest first; head()
# keeps six rows. Many listings tie at zero, so the six shown are simply
# the first of those ties. `.groups = "drop"` returns an ungrouped tibble
# and avoids summarise()'s regrouping message.
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(availability) %>%
  head()

## # A tibble: 6 × 3
##      id name                                              availability
##   <dbl> <chr>                                                    <dbl>
## 1  5022 Entire Apt: Spacious Studio/Loft by central park             0
## 2  5121 BlissArtsSpace!                                              0
## 3  5203 Cozy Clean Guest Room - Family Apt                           0
## 4  6090 West Village Nest - Superhost                                0
## 5  7801 Sweet and Spacious Brooklyn Loft                             0
## 6  8700 Magnifique Suite au N de Manhattan - vue Cloitres            0

5. Number of airbnbs in each neighbourhood group

# Label for the table printed below.
print("Number of airbnbs in each neighbourhood group - ")

## [1] "Number of airbnbs in each neighbourhood group - "

# count() returns one row per neighbourhood group. The result is printed
# without head() so that a dataset with more than six groups is not
# silently cut off.
ab_nyc_data %>%
  count(neighbourhood_group)

## # A tibble: 5 × 2
##   neighbourhood_group     n
##   <chr>               <int>
## 1 Bronx                1091
## 2 Brooklyn            20104
## 3 Manhattan           21661
## 4 Queens               5666
## 5 Staten Island         373

6. Average price of airbnb in each neighbourhood group

# Label for the table printed below.
print("Average price of airbnb in each neighbourhood group - ")

## [1] "Average price of airbnb in each neighbourhood group - "

# Mean listing price per neighbourhood group, most expensive first.
# na.rm = TRUE keeps a single missing price from turning a whole group's
# average into NA. The result is printed without head() so that every
# group is shown even if there are more than six.
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(avg_price = mean(price, na.rm = TRUE)) %>%
  arrange(desc(avg_price))

## # A tibble: 5 × 2
##   neighbourhood_group avg_price
##   <chr>                   <dbl>
## 1 Manhattan               197. 
## 2 Brooklyn                124. 
## 3 Staten Island           115. 
## 4 Queens                   99.5
## 5 Bronx                    87.5

DACS 601 - HW3

Apoorva Saraswat

2022-06-24

R Markdown

loading libraries

loading data

printing dimensions

printing column names

printing data types

Potential questions

1. Host with the most stays

2. Airbnb with most number of reviews

3. Most available airbnb

4. Least available airbnb

5. Number of airbnbs in each neighbourhood group

6. Average price of airbnb in each neighbourhood group