Reading Airbnb data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the NYC Airbnb listings from CSV with readr.
# NOTE(review): absolute, machine-specific path - it resolves only where this
# exact folder layout exists.
airbnb <- read_csv("C:/DATA 101. MC/Data 101/airbnb_nyc.csv")
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a compact column-by-column overview (name, type, first values).
airbnb %>% glimpse()
## Rows: 48,895
## Columns: 16
## $ id                             <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 512…
## $ name                           <chr> "Clean & quiet apt home by the park", "…
## $ host_id                        <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 735…
## $ host_name                      <chr> "John", "Jennifer", "Elisabeth", "LisaR…
## $ neighbourhood_group            <chr> "Brooklyn", "Manhattan", "Manhattan", "…
## $ neighbourhood                  <chr> "Kensington", "Midtown", "Harlem", "Cli…
## $ latitude                       <dbl> 40.64749, 40.75362, 40.80902, 40.68514,…
## $ longitude                      <dbl> -73.97237, -73.98377, -73.94190, -73.95…
## $ room_type                      <chr> "Private room", "Entire home/apt", "Pri…
## $ price                          <dbl> 149, 225, 150, 89, 80, 200, 60, 79, 79,…
## $ minimum_nights                 <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4…
## $ number_of_reviews              <dbl> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160…
## $ last_review                    <chr> "10/19/2018", "5/21/2019", NA, "7/5/201…
## $ reviews_per_month              <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40,…
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, …
## $ availability_365               <dbl> 365, 355, 365, 194, 0, 129, 0, 220, 0, …

One-Proportion Test: is the proportion of listings located in Brooklyn equal to 0.40?

# x: number of "successes", i.e. listings whose borough is Brooklyn
x <- nrow(filter(airbnb, neighbourhood_group == "Brooklyn"))
# n: number of trials, i.e. every listing in the data set
n <- nrow(airbnb)
# One-sample proportion test of H0: p = 0.40 against a two-sided alternative
prop.test(x = x, n = n, p = 0.40, alternative = "two.sided", conf.level = 0.95)
## 
##  1-sample proportions test with continuity correction
## 
## data:  x out of n, null probability 0.4
## X-squared = 25.358, df = 1, p-value = 4.762e-07
## alternative hypothesis: true p is not equal to 0.4
## 95 percent confidence interval:
##  0.4068024 0.4155452
## sample estimates:
##         p 
## 0.4111668
## Single-mean test: `mu` is the hypothesized mean under the null hypothesis;
## the alternative here is that the true mean price is greater than 150
t.test(x = airbnb$price, alternative = "greater", mu = 150, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  airbnb$price
## t = 2.5051, df = 48894, p-value = 0.006123
## alternative hypothesis: true mean is greater than 150
## 95 percent confidence interval:
##  150.9342      Inf
## sample estimates:
## mean of x 
##  152.7207
### Confidence-interval method: two-sided 95% interval for the mean price
### (`mu` is left at its default, so only the interval is of interest here)
t.test(x = airbnb$price, conf.level = 0.95)
## 
##  One Sample t-test
## 
## data:  airbnb$price
## t = 140.62, df = 48894, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  150.5920 154.8494
## sample estimates:
## mean of x 
##  152.7207
# Side-by-side box plots of price per borough, with outlier points hidden.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
boxplot(price ~ neighbourhood_group, data = airbnb, outline = FALSE, col = "red")

Conduct a two-sample t-test comparing the mean price in Brooklyn with the mean price in Manhattan at the 95% confidence level (alpha = 0.05).

# One data frame per borough: keep only the rows whose neighbourhood_group
# matches the borough name
brooklyn <- airbnb %>% filter(neighbourhood_group == "Brooklyn")
manhattan <- airbnb %>% filter(neighbourhood_group == "Manhattan")

# Two-sample t-test on price; alternative: Brooklyn mean is less than
# Manhattan mean
t.test(
  x = brooklyn$price,
  y = manhattan$price,
  alternative = "less",
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  brooklyn$price and manhattan$price
## t = -30.48, df = 37233, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -68.58041
## sample estimates:
## mean of x mean of y 
##  124.3832  196.8758

Two Proportion Test

# Bar chart of the share of listings per room type:
# table() counts each room type, prop.table() turns counts into proportions
barplot(
  height = prop.table(table(airbnb$room_type)),
  col = "dark blue"
)

It appears that the proportion of entire homes/apartments available is more than the proportion of private rooms available.

Is this actually true, or is it due to random chance in our sample? Let’s find out.

# Number of listings for each of the two room types being compared
x1 <- nrow(filter(airbnb, room_type == "Entire home/apt"))
x2 <- nrow(filter(airbnb, room_type == "Private room"))

# Successes per group, and the denominator for each proportion
# (both proportions are taken out of all listings)
x <- c(x1, x2)
n <- rep(nrow(airbnb), times = 2)

# Two-sample proportion test at the 99% level; alternative: the first
# proportion (entire home/apt) is greater than the second (private room).
# NOTE(review): both counts come from the same set of listings, while the
# two-sample prop.test treats the groups as independent samples - confirm
# this is the intended test.
prop.test(x = x, n = n, alternative = "greater", conf.level = 0.99)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  x out of n
## X-squared = 388.75, df = 1, p-value < 2.2e-16
## alternative hypothesis: greater
## 99 percent confidence interval:
##  0.05561071 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.5196646 0.4566111

ANOVA Example

# Box plots of price by borough, zoomed to the 0-1000 price range.
# coord_cartesian() only zooms the view. ylim() sets scale limits instead,
# which discards listings priced above 1000 before the box statistics are
# computed (and emits a "Removed ... rows" warning).
p <- ggplot(data = airbnb)
p +
  geom_boxplot(
    mapping = aes(
      x = neighbourhood_group,
      y = price,
      fill = neighbourhood_group
    )
  ) +
  coord_cartesian(ylim = c(0, 1000)) +
  labs(title = "Airbnb NYC Borough Prices")

# One-way ANOVA: does mean price differ across boroughs?
# The formula refers to the columns directly, so no `data` argument is passed.
results <- aov(formula = airbnb$price ~ airbnb$neighbourhood_group)
# ANOVA table: degrees of freedom, sums of squares, F value and p-value
summary(results)
##                               Df    Sum Sq  Mean Sq F value Pr(>F)    
## airbnb$neighbourhood_group     4 7.959e+07 19897739     355 <2e-16 ***
## Residuals                  48890 2.740e+09    56051                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Chi-Square Test for Independence Example

# Tally listings for every room type / borough combination; count() stores
# the tally in a column named `n`
table_borough_room <- count(airbnb, room_type, neighbourhood_group)

# Heat map: one tile per combination, shaded by the number of listings
p <- ggplot(data = table_borough_room)
p +
  geom_tile(
    mapping = aes(x = room_type, y = neighbourhood_group, fill = n)
  )

### Build the contingency table: room types in rows, boroughs in columns

borough_room_table <- table(
  airbnb[["room_type"]],
  airbnb[["neighbourhood_group"]]
)
borough_room_table
##                  
##                   Bronx Brooklyn Manhattan Queens Staten Island
##   Entire home/apt   379     9559     13199   2096           176
##   Private room      652    10132      7982   3372           188
##   Shared room        60      413       480    198             9

The contingency table “borough_room_table” was created above.

# Chi-squared test of independence between room type and borough, run on the
# contingency table. The test is run once; the original repeated the identical
# call and output a second time.
result <- chisq.test(borough_room_table)
result
## 
##  Pearson's Chi-squared test
## 
## data:  borough_room_table
## X-squared = 1559.6, df = 8, p-value < 2.2e-16