This is the R Markdown file for HW3 of DACS-601, Summer 2022. I’m using the New York City Airbnb CSV file from the Sample Datasets.
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.9
## ✔ tibble 3.1.7 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
# Announce the load, then read the NYC Airbnb CSV into `ab_nyc_data`.
print("Loading data...")
## [1] "Loading data..."
# NOTE(review): absolute, machine-specific path -- confirm before running
# this on another machine.
ab_nyc_data <- read_csv(
  file = "C:/Users/apoor/Desktop/UMass/Summer 2022/DACS 601 - R Programming/datasets/AB_NYC_2019.csv"
)
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date (1): last_review
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Confirm the load finished and preview the first rows of the data.
print("Data loaded!")
## [1] "Data loaded!"
ab_nyc_data %>%
  head()
## # A tibble: 6 × 16
## id name host_id host_name neighbourhood_g… neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 2539 Clean & quiet… 2787 John Brooklyn Kensington 40.6
## 2 2595 Skylit Midtow… 2845 Jennifer Manhattan Midtown 40.8
## 3 3647 THE VILLAGE O… 4632 Elisabeth Manhattan Harlem 40.8
## 4 3831 Cozy Entire F… 4869 LisaRoxa… Brooklyn Clinton Hill 40.7
## 5 5022 Entire Apt: S… 7192 Laura Manhattan East Harlem 40.8
## 6 5099 Large Cozy 1 … 7322 Chris Manhattan Murray Hill 40.7
## # … with 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## # minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## # reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## # availability_365 <dbl>
# Report the size of the data as (rows, columns).
print("Dimensions: ")
## [1] "Dimensions: "
c(nrow(ab_nyc_data), ncol(ab_nyc_data))
## [1] 48895 16
# List the column names of the data.
print("Column names: ")
## [1] "Column names: "
names(ab_nyc_data)
## [1] "id" "name"
## [3] "host_id" "host_name"
## [5] "neighbourhood_group" "neighbourhood"
## [7] "latitude" "longitude"
## [9] "room_type" "price"
## [11] "minimum_nights" "number_of_reviews"
## [13] "last_review" "reviews_per_month"
## [15] "calculated_host_listings_count" "availability_365"
# Report the class of every column as a named character vector.
# vapply() guarantees exactly one string per column; sapply() would silently
# return a list as soon as any column carried more than one class. Multiple
# classes, if present, are joined with "/".
print("Data types: ")
## [1] "Data types: "
vapply(
  ab_nyc_data,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
## id name
## "numeric" "character"
## host_id host_name
## "numeric" "character"
## neighbourhood_group neighbourhood
## "character" "character"
## latitude longitude
## "numeric" "numeric"
## room_type price
## "character" "numeric"
## minimum_nights number_of_reviews
## "numeric" "numeric"
## last_review reviews_per_month
## "Date" "numeric"
## calculated_host_listings_count availability_365
## "numeric" "numeric"
# Rank hosts by the number of rows they own in the data, highest first.
# count() tallies rows per host_id -- presumably one row per listing, not per
# stay -- so the label says "listings" rather than "stays".
print("Hosts with the most listings - ")
## [1] "Hosts with the most listings - "
ab_nyc_data %>%
  count(host_id, sort = TRUE) %>%
  head()
## # A tibble: 6 × 2
## host_id n
## <dbl> <int>
## 1 219517861 327
## 2 107434423 232
## 3 30283594 121
## 4 137358866 103
## 5 12243051 96
## 6 16098958 96
# Total number of reviews per (id, name) pair, highest first.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# affected by leftover grouping and summarise() emits no regrouping message.
print("Airbnb with most number of reviews - ")
## [1] "Airbnb with most number of reviews - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(reviews = sum(number_of_reviews), .groups = "drop") %>%
  arrange(desc(reviews)) %>%
  head()
## # A tibble: 6 × 3
## id name reviews
## <dbl> <chr> <dbl>
## 1 9145202 Room near JFK Queen Bed 629
## 2 903972 Great Bedroom in Manhattan 607
## 3 903947 Beautiful Bedroom in Manhattan 597
## 4 891117 Private Bedroom in Manhattan 594
## 5 10101135 Room Near JFK Twin Beds 576
## 6 8168619 Steps away from Laguardia airport 543
# Mean of availability_365 per (id, name) pair, highest first.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# affected by leftover grouping and summarise() emits no regrouping message.
print("Most available airbnb - ")
## [1] "Most available airbnb - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(desc(availability)) %>%
  head()
## # A tibble: 6 × 3
## id name availability
## <dbl> <chr> <dbl>
## 1 2539 Clean & quiet apt home by the park 365
## 2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 365
## 3 11452 Clean and Quiet in Brooklyn 365
## 4 11943 Country space in the city 365
## 5 21644 Upper Manhattan, New York 365
## 6 32037 Huge Private Floor at The Waverly 365
# Mean of availability_365 per (id, name) pair, lowest first.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# affected by leftover grouping and summarise() emits no regrouping message.
print("Least available airbnb - ")
## [1] "Least available airbnb - "
ab_nyc_data %>%
  group_by(id, name) %>%
  summarise(availability = mean(availability_365), .groups = "drop") %>%
  arrange(availability) %>%
  head()
## # A tibble: 6 × 3
## id name availability
## <dbl> <chr> <dbl>
## 1 5022 Entire Apt: Spacious Studio/Loft by central park 0
## 2 5121 BlissArtsSpace! 0
## 3 5203 Cozy Clean Guest Room - Family Apt 0
## 4 6090 West Village Nest - Superhost 0
## 5 7801 Sweet and Spacious Brooklyn Loft 0
## 6 8700 Magnifique Suite au N de Manhattan - vue Cloitres 0
# Number of rows in each neighbourhood group.
# The result is printed in full: wrapping it in head() would silently cap a
# table that is meant to cover every group at six rows.
print("Number of airbnbs in each neighbourhood group - ")
## [1] "Number of airbnbs in each neighbourhood group - "
ab_nyc_data %>%
  count(neighbourhood_group)
## # A tibble: 5 × 2
## neighbourhood_group n
## <chr> <int>
## 1 Bronx 1091
## 2 Brooklyn 20104
## 3 Manhattan 21661
## 4 Queens 5666
## 5 Staten Island 373
# Mean price in each neighbourhood group, most expensive first.
# `na.rm = TRUE` keeps a single missing price from turning a whole group's
# average into NA. The result is printed in full rather than through head(),
# which would silently cap the table at six rows.
print("Average price of airbnb in each neighbourhood group - ")
## [1] "Average price of airbnb in each neighbourhood group - "
ab_nyc_data %>%
  group_by(neighbourhood_group) %>%
  summarise(avg_price = mean(price, na.rm = TRUE)) %>%
  arrange(desc(avg_price))
## # A tibble: 5 × 2
## neighbourhood_group avg_price
## <chr> <dbl>
## 1 Manhattan 197.
## 2 Brooklyn 124.
## 3 Staten Island 115.
## 4 Queens 99.5
## 5 Bronx 87.5