library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the NYC Airbnb listings.
# The data directory defaults to the original location but can be overridden
# with the AIRBNB_DATA_DIR environment variable. Building the full path with
# file.path() avoids changing the session's working directory via setwd().
data_dir <- Sys.getenv(
  "AIRBNB_DATA_DIR",
  unset = "/Users/mikea/Desktop/Datasets"
)
df <- read_csv(file.path(data_dir, "airbnb_nyc.csv"))
# Compact overview of every column's type and first few values.
str(df)
## spc_tbl_ [48,895 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:48895] 2539 2595 3647 3831 5022 ...
## $ name : chr [1:48895] "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : num [1:48895] 2787 2845 4632 4869 7192 ...
## $ host_name : chr [1:48895] "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr [1:48895] "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr [1:48895] "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num [1:48895] 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num [1:48895] -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr [1:48895] "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : num [1:48895] 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : num [1:48895] 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : num [1:48895] 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : Date[1:48895], format: "2018-10-19" "2019-05-21" ...
## $ reviews_per_month : num [1:48895] 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: num [1:48895] 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : num [1:48895] 365 355 365 194 0 129 0 220 0 188 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. name = col_character(),
## .. host_id = col_double(),
## .. host_name = col_character(),
## .. neighbourhood_group = col_character(),
## .. neighbourhood = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. room_type = col_character(),
## .. price = col_double(),
## .. minimum_nights = col_double(),
## .. number_of_reviews = col_double(),
## .. last_review = col_date(format = ""),
## .. reviews_per_month = col_double(),
## .. calculated_host_listings_count = col_double(),
## .. availability_365 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Number of rows and columns in the listings table.
df %>% dim()
## [1] 48895 16
# One line per column: name, type and leading values.
df %>% glimpse()
## Rows: 48,895
## Columns: 16
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 512…
## $ name <chr> "Clean & quiet apt home by the park", "…
## $ host_id <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 735…
## $ host_name <chr> "John", "Jennifer", "Elisabeth", "LisaR…
## $ neighbourhood_group <chr> "Brooklyn", "Manhattan", "Manhattan", "…
## $ neighbourhood <chr> "Kensington", "Midtown", "Harlem", "Cli…
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514,…
## $ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95…
## $ room_type <chr> "Private room", "Entire home/apt", "Pri…
## $ price <dbl> 149, 225, 150, 89, 80, 200, 60, 79, 79,…
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4…
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160…
## $ last_review <date> 2018-10-19, 2019-05-21, NA, 2019-07-05…
## $ reviews_per_month <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40,…
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, …
## $ availability_365 <dbl> 365, 355, 365, 194, 0, 129, 0, 220, 0, …
# Count missing values (NA) in each column.
df %>%
  is.na() %>%
  colSums()
## id name
## 0 16
## host_id host_name
## 0 21
## neighbourhood_group neighbourhood
## 0 0
## latitude longitude
## 0 0
## room_type price
## 0 0
## minimum_nights number_of_reviews
## 0 0
## last_review reviews_per_month
## 10052 10052
## calculated_host_listings_count availability_365
## 0 0
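# Missing values (NA) by column:
#   name:              16
#   host_name:         21
#   last_review:       10052
#   reviews_per_month: 10052
# The identical counts for last_review and reviews_per_month suggest these are
# the same rows, presumably listings that have never been reviewed (to confirm).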
# Cross-tabulate neighbourhood group against room type, then append the
# row and column totals ("Sum") to the table.
contingency_table <- table(df$neighbourhood_group, df$room_type) %>%
  addmargins()
contingency_table
##
## Entire home/apt Private room Shared room Sum
## Bronx 379 652 60 1091
## Brooklyn 9559 10132 413 20104
## Manhattan 13199 7982 480 21661
## Queens 2096 3372 198 5666
## Staten Island 176 188 9 373
## Sum 25409 22326 1160 48895
# I noticed that the neighbourhood groups are the five boroughs of New York City.
# Also, Manhattan is the only borough where entire homes/apartments (13,199)
# outnumber private rooms (7,982), which makes sense.
# Histogram of listing prices; breaks = 1000 requests roughly 1000 bins.
with(
  df,
  hist(price, breaks = 1000, main = "Histogram of Price", xlab = "Price")
)
# Min / quartiles / mean / max of price.
summary(df[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 69.0 106.0 152.7 175.0 10000.0
# Interquartile range of price (3rd quartile minus 1st quartile).
df %>%
  pull(price) %>%
  IQR()
## [1] 106
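# IQR() computes the interquartile range (3rd quartile - 1st quartile = 175 - 69), which is 106.
# The spread of the middle 50% of prices is not that high. However, the data is skewed to the
# right: the mean (152.7) is well above the median (106) and the maximum is 10,000.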
# Compare the price distribution across neighbourhood groups.
# Outliers are hidden (outline = FALSE) because a few extreme prices (up to
# 10,000) would otherwise compress the boxes. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
# The formula uses data = df so the axes are not labelled with "df$..." terms.
boxplot(
  price ~ neighbourhood_group,
  data = df,
  outline = FALSE,
  main = "Price by Neighbourhood Group",
  xlab = "Neighbourhood group",
  ylab = "Price"
)
# This shows that Manhattan has a greater spread of prices, and higher prices, than the other groups.
# For most of the other groups, the bulk of the distribution and the median typically fall below 100.
# Number of listings in each Brooklyn neighbourhood, largest first.
# count() groups by neighbourhood, tallies the rows into listing_count and
# sorts in descending order in one step.
df_brooklyn <- df %>%
  filter(neighbourhood_group == "Brooklyn") %>%
  count(neighbourhood, name = "listing_count", sort = TRUE)
df_brooklyn
## # A tibble: 47 × 2
## neighbourhood listing_count
## <chr> <int>
## 1 Williamsburg 3920
## 2 Bedford-Stuyvesant 3714
## 3 Bushwick 2465
## 4 Crown Heights 1564
## 5 Greenpoint 1115
## 6 Flatbush 621
## 7 Clinton Hill 572
## 8 Prospect-Lefferts Gardens 535
## 9 Park Slope 506
## 10 East Flatbush 500
## # ℹ 37 more rows
# Williamsburg has the most listings of any Brooklyn neighbourhood: 3,920.
# Formulate your question
# Read in your data
# Check the packaging
# Run str()
# Look at the top and the bottom of your data
# Check your “n”s
# Validate with at least one external data source
# Try the easy solution first
# Challenge your solution
# Follow up