airbnb

Reading Airbnb data

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Load the NYC Airbnb listings CSV into a tibble.
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# will only run where this exact file location exists.
airbnb <- read_csv("C:/DATA 101. MC/Data 101/airbnb_nyc.csv")

## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Compact overview of the data: one line per column with its type and the
# first few values.
airbnb %>%
  glimpse()

## Rows: 48,895
## Columns: 16
## $ id                             <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 512…
## $ name                           <chr> "Clean & quiet apt home by the park", "…
## $ host_id                        <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 735…
## $ host_name                      <chr> "John", "Jennifer", "Elisabeth", "LisaR…
## $ neighbourhood_group            <chr> "Brooklyn", "Manhattan", "Manhattan", "…
## $ neighbourhood                  <chr> "Kensington", "Midtown", "Harlem", "Cli…
## $ latitude                       <dbl> 40.64749, 40.75362, 40.80902, 40.68514,…
## $ longitude                      <dbl> -73.97237, -73.98377, -73.94190, -73.95…
## $ room_type                      <chr> "Private room", "Entire home/apt", "Pri…
## $ price                          <dbl> 149, 225, 150, 89, 80, 200, 60, 79, 79,…
## $ minimum_nights                 <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4…
## $ number_of_reviews              <dbl> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160…
## $ last_review                    <chr> "10/19/2018", "5/21/2019", NA, "7/5/201…
## $ reviews_per_month              <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40,…
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, …
## $ availability_365               <dbl> 365, 355, 365, 194, 0, 129, 0, 220, 0, …

Testing Proportions

# One-sample proportion test: is the share of listings located in Brooklyn
# different from 40%?

# n is the number of trials: the total number of Airbnb listings (rows).
n <- nrow(airbnb)
# x is the number of successes: listings whose borough is Brooklyn.
x <- sum(airbnb$neighbourhood_group == "Brooklyn", na.rm = TRUE)

# p is the proportion assumed under the null hypothesis.
prop.test(x, n, p = 0.40, alternative = "two.sided", conf.level = 0.95)

## 
##  1-sample proportions test with continuity correction
## 
## data:  x out of n, null probability 0.4
## X-squared = 25.358, df = 1, p-value = 4.762e-07
## alternative hypothesis: true p is not equal to 0.4
## 95 percent confidence interval:
##  0.4068024 0.4155452
## sample estimates:
##         p 
## 0.4111668

# One-sample t-test of the mean listing price.
# mu is the mean assumed under the null hypothesis (150); alternative =
# "greater" tests whether the true mean price is greater than that value.
t.test(airbnb$price, alternative = "greater", mu = 150, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  airbnb$price
## t = 2.5051, df = 48894, p-value = 0.006123
## alternative hypothesis: true mean is greater than 150
## 95 percent confidence interval:
##  150.9342      Inf
## sample estimates:
## mean of x 
##  152.7207

# Confidence-interval method: with no mu given, t.test() tests against a mean
# of 0, so only the two-sided 95% confidence interval for the mean price is of
# interest here.
t.test(x = airbnb$price, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  airbnb$price
## t = 140.62, df = 48894, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  150.5920 154.8494
## sample estimates:
## mean of x 
##  152.7207

# Side-by-side boxplots of price for each neighbourhood group.
# outline = FALSE hides the outlier points. It is spelled out as FALSE because
# F is an ordinary variable that can be reassigned, whereas FALSE is a
# reserved word.
boxplot(
  price ~ neighbourhood_group,
  data = airbnb,
  outline = FALSE,
  col = "red"
)

Conduct a two-sample t-test comparing the mean prices of Brooklyn and Manhattan listings at the 95% confidence level (alpha = 0.05).

# Split the listings into one data frame per borough so their prices can be
# compared directly.
brooklyn <- airbnb %>%
  filter(neighbourhood_group == "Brooklyn")
manhattan <- airbnb %>%
  filter(neighbourhood_group == "Manhattan")

# Welch two-sample t-test on the two new data frames; alternative = "less"
# tests whether the mean Brooklyn price is lower than the mean Manhattan price.
t.test(
  brooklyn$price, manhattan$price,
  alternative = "less", conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  brooklyn$price and manhattan$price
## t = -30.48, df = 37233, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -68.58041
## sample estimates:
## mean of x mean of y 
##  124.3832  196.8758

Two Proportion Test

# table() counts the listings per room type and prop.table() converts those
# counts into proportions; barplot() then draws one bar per room type.
room_type_share <- prop.table(table(airbnb$room_type))
barplot(room_type_share, col = "dark blue")

It appears that the proportion of entire homes/apartments available is more than the proportion of private rooms available.

Is this actually true or is it due to random chance of our sample? Let’s find out.

# Number of listings of each of the two room types being compared.
x1 <- sum(airbnb$room_type == "Entire home/apt", na.rm = TRUE)
x2 <- sum(airbnb$room_type == "Private room", na.rm = TRUE)

# x holds the success counts; n holds the denominators (x / n), which for both
# groups is the total number of listings.
x <- c(x1, x2)
n <- rep(nrow(airbnb), times = 2)

# Two-sample test for equality of proportions; alternative = "greater" tests
# whether the first proportion exceeds the second.
# NOTE(review): both counts come from the same set of listings (same n),
# whereas the two-sample prop.test() treats the groups as independent samples.
# Confirm this is the intended test.
prop.test(x, n, alternative = "greater", conf.level = 0.99)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  x out of n
## X-squared = 388.75, df = 1, p-value < 2.2e-16
## alternative hypothesis: greater
## 99 percent confidence interval:
##  0.05561071 1.00000000
## sample estimates:
##    prop 1    prop 2 
## 0.5196646 0.4566111

ANOVA Example

# Boxplots of price per borough, one fill colour per borough.
p <- ggplot(data = airbnb)
p +
  geom_boxplot(
    mapping = aes(
      x = neighbourhood_group,
      y = price,
      fill = neighbourhood_group
    )
  ) +
  # Limiting the y scale to 0-1000 drops listings priced outside that range
  # before the boxplot statistics are computed (hence the "Removed ... rows"
  # warning).
  scale_y_continuous(limits = c(0, 1000)) +
  ggtitle("Airbnb NYC Borough Prices")

## Warning: Removed 239 rows containing non-finite values (stat_boxplot).

# One-way ANOVA: does the mean price differ across the neighbourhood groups?
# The columns are referenced as airbnb$... inside the formula, so the term in
# the summary table is labelled "airbnb$neighbourhood_group".
results <- aov(airbnb$price ~ airbnb$neighbourhood_group)
# Print the ANOVA table (Df, Sum Sq, Mean Sq, F value, Pr(>F)).
summary(results)

##                               Df    Sum Sq  Mean Sq F value Pr(>F)    
## airbnb$neighbourhood_group     4 7.959e+07 19897739     355 <2e-16 ***
## Residuals                  48890 2.740e+09    56051                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Chi-Square Test for Independence Example

# One row per (room type, borough) combination; count() stores the number of
# listings for each combination in a column named n.
table_borough_room <- count(airbnb, room_type, neighbourhood_group)

# Heat map of those counts: room type on x, borough on y, fill shaded by n.
p <- ggplot(data = table_borough_room)
p +
  geom_tile(
    mapping = aes(x = room_type, y = neighbourhood_group, fill = n)
  )

# Build the contingency table of room type (rows) by borough (columns).
borough_room_table <- table(
  airbnb[["room_type"]],
  airbnb[["neighbourhood_group"]]
)
print(borough_room_table)

##                  
##                   Bronx Brooklyn Manhattan Queens Staten Island
##   Entire home/apt   379     9559     13199   2096           176
##   Private room      652    10132      7982   3372           188
##   Shared room        60      413       480    198             9

The contingency table “borough_room_table” (a table object, not a data frame) was created above.

# Pearson's chi-squared test of independence between room type and borough,
# run on the contingency table; the stored result is then printed.
result <- chisq.test(borough_room_table)
result

## 
##  Pearson's Chi-squared test
## 
## data:  borough_room_table
## X-squared = 1559.6, df = 8, p-value < 2.2e-16

# Pearson's chi-squared test of independence on the room type by borough
# contingency table; the stored result is then printed.
result <- chisq.test(borough_room_table)
result

## 
##  Pearson's Chi-squared test
## 
## data:  borough_room_table
## X-squared = 1559.6, df = 8, p-value < 2.2e-16

airbnb_nyc

Ami

2023-04-09

Testing Proportions

Conduct a two-sample t-test comparing the mean prices of Brooklyn and Manhattan listings at the 95% confidence level (alpha = 0.05).

Two Proportion Test

It appears that the proportion of entire homes/apartments available is more than the proportion of private rooms available.

Is this actually true or is it due to random chance of our sample? Let’s find out.

ANOVA Example

Chi-Square Test for Independence Example

The contingency table “borough_room_table” (a table object, not a data frame) was created above.