library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Directory that holds the dataset. The path is kept in a variable and joined
# to the file name below instead of calling setwd(), so the script does not
# change the session's working directory as a side effect.
data_dir <- "/Users/mikea/Desktop/Datasets"

Dataset

# Read airbnb_nyc.csv into a tibble; column types are guessed by read_csv().
df <- read_csv(file.path(data_dir, "airbnb_nyc.csv"))

Problem 1

# Show the compact structure of df: for every column, its type and the first
# few values, followed by the column specification recorded by read_csv().
str(df)
## spc_tbl_ [48,895 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                            : num [1:48895] 2539 2595 3647 3831 5022 ...
##  $ name                          : chr [1:48895] "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
##  $ host_id                       : num [1:48895] 2787 2845 4632 4869 7192 ...
##  $ host_name                     : chr [1:48895] "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
##  $ neighbourhood_group           : chr [1:48895] "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
##  $ neighbourhood                 : chr [1:48895] "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
##  $ latitude                      : num [1:48895] 40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num [1:48895] -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : chr [1:48895] "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
##  $ price                         : num [1:48895] 149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : num [1:48895] 1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : num [1:48895] 9 45 0 270 9 74 49 430 118 160 ...
##  $ last_review                   : Date[1:48895], format: "2018-10-19" "2019-05-21" ...
##  $ reviews_per_month             : num [1:48895] 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: num [1:48895] 6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : num [1:48895] 365 355 365 194 0 129 0 220 0 188 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   name = col_character(),
##   ..   host_id = col_double(),
##   ..   host_name = col_character(),
##   ..   neighbourhood_group = col_character(),
##   ..   neighbourhood = col_character(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   room_type = col_character(),
##   ..   price = col_double(),
##   ..   minimum_nights = col_double(),
##   ..   number_of_reviews = col_double(),
##   ..   last_review = col_date(format = ""),
##   ..   reviews_per_month = col_double(),
##   ..   calculated_host_listings_count = col_double(),
##   ..   availability_365 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Dimensions of df, printed as (rows, columns).
dim(df)
## [1] 48895    16
# One line per column: name, type abbreviation, and the leading values.
glimpse(df)
## Rows: 48,895
## Columns: 16
## $ id                             <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 512…
## $ name                           <chr> "Clean & quiet apt home by the park", "…
## $ host_id                        <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 735…
## $ host_name                      <chr> "John", "Jennifer", "Elisabeth", "LisaR…
## $ neighbourhood_group            <chr> "Brooklyn", "Manhattan", "Manhattan", "…
## $ neighbourhood                  <chr> "Kensington", "Midtown", "Harlem", "Cli…
## $ latitude                       <dbl> 40.64749, 40.75362, 40.80902, 40.68514,…
## $ longitude                      <dbl> -73.97237, -73.98377, -73.94190, -73.95…
## $ room_type                      <chr> "Private room", "Entire home/apt", "Pri…
## $ price                          <dbl> 149, 225, 150, 89, 80, 200, 60, 79, 79,…
## $ minimum_nights                 <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4…
## $ number_of_reviews              <dbl> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160…
## $ last_review                    <date> 2018-10-19, 2019-05-21, NA, 2019-07-05…
## $ reviews_per_month              <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40,…
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, …
## $ availability_365               <dbl> 365, 355, 365, 194, 0, 129, 0, 220, 0, …
# Tally the NA entries in every column of df.
df %>%
  is.na() %>%
  colSums()
##                             id                           name 
##                              0                             16 
##                        host_id                      host_name 
##                              0                             21 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##                    last_review              reviews_per_month 
##                          10052                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0
# Columns with missing values (NA):
#   name:              16
#   host_name:         21
#   last_review:       10052
#   reviews_per_month: 10052

Problem 2

# Cross-tabulate neighbourhood group (rows) against room type (columns), then
# append the row and column totals that addmargins() labels "Sum".
contingency_table <- table(df[["neighbourhood_group"]], df[["room_type"]]) %>%
  addmargins()
contingency_table
##                
##                 Entire home/apt Private room Shared room   Sum
##   Bronx                     379          652          60  1091
##   Brooklyn                 9559        10132         413 20104
##   Manhattan               13199         7982         480 21661
##   Queens                   2096         3372         198  5666
##   Staten Island             176          188           9   373
##   Sum                     25409        22326        1160 48895
# I noticed that the neighbourhood groups appear to be the boroughs of New York City. Also, listings in Manhattan are mostly of type "Entire home/apt", which makes sense.

Problem 3

# Histogram of listing prices; a large number of breaks is requested so the
# shape near the low end of the price range stays visible.
hist(
  df[["price"]],
  breaks = 1000,
  main = "Histogram of Price",
  xlab = "Price"
)

# Five-number summary plus the mean of price.
summary(df[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    69.0   106.0   152.7   175.0 10000.0
# Interquartile range (3rd quartile minus 1st quartile) of price.
IQR(df[["price"]])
## [1] 106
# IQR() calculates the interquartile range, which is 106 here.
# The spread of price is not very high. However, the data look skewed to the
# right (the mean is above the median and the maximum is far above the 3rd
# quartile).

Problem 4

# Box plot of price for each neighbourhood group. outline = FALSE suppresses
# the drawing of outlier points so the boxes themselves remain readable.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
boxplot(df$price ~ df$neighbourhood_group, outline = FALSE)

# This shows that Manhattan has a greater spread of prices, and higher prices,
# than the other groups.
# For most of the other groups, the bulk of the distribution and the median
# typically fall below 100.

Problem 5

# Number of listings in each Brooklyn neighbourhood, ordered from the most
# listings to the fewest.
df_brooklyn <- df %>%
  filter(neighbourhood_group == "Brooklyn") %>%
  count(neighbourhood, name = "listing_count", sort = TRUE)
df_brooklyn
## # A tibble: 47 × 2
##    neighbourhood             listing_count
##    <chr>                             <int>
##  1 Williamsburg                       3920
##  2 Bedford-Stuyvesant                 3714
##  3 Bushwick                           2465
##  4 Crown Heights                      1564
##  5 Greenpoint                         1115
##  6 Flatbush                            621
##  7 Clinton Hill                        572
##  8 Prospect-Lefferts Gardens           535
##  9 Park Slope                          506
## 10 East Flatbush                       500
## # ℹ 37 more rows
# Williamsburg has the most listings of any Brooklyn neighbourhood: 3920.

Problem 6

# Formulate your question

# Read in your data

# Check the packaging

# Run str()

# Look at the top and the bottom of your data

# Check your “n”s

# Validate with at least one external data source

# Try the easy solution first

# Challenge your solution

# Follow up