# Reading Airbnb data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the NYC Airbnb listings into a tibble.
# NOTE(review): absolute Windows path -- it only resolves on a machine that
# has this exact folder layout.
airbnb <- read_csv("C:/DATA 101. MC/Data 101/airbnb_nyc.csv")
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of the listings: one line per column with type and values.
glimpse(x = airbnb)
## Rows: 48,895
## Columns: 16
## $ id <dbl> 2539, 2595, 3647, 3831, 5022, 5099, 512…
## $ name <chr> "Clean & quiet apt home by the park", "…
## $ host_id <dbl> 2787, 2845, 4632, 4869, 7192, 7322, 735…
## $ host_name <chr> "John", "Jennifer", "Elisabeth", "LisaR…
## $ neighbourhood_group <chr> "Brooklyn", "Manhattan", "Manhattan", "…
## $ neighbourhood <chr> "Kensington", "Midtown", "Harlem", "Cli…
## $ latitude <dbl> 40.64749, 40.75362, 40.80902, 40.68514,…
## $ longitude <dbl> -73.97237, -73.98377, -73.94190, -73.95…
## $ room_type <chr> "Private room", "Entire home/apt", "Pri…
## $ price <dbl> 149, 225, 150, 89, 80, 200, 60, 79, 79,…
## $ minimum_nights <dbl> 1, 1, 3, 1, 10, 3, 45, 2, 2, 1, 5, 2, 4…
## $ number_of_reviews <dbl> 9, 45, 0, 270, 9, 74, 49, 430, 118, 160…
## $ last_review <chr> "10/19/2018", "5/21/2019", NA, "7/5/201…
## $ reviews_per_month <dbl> 0.21, 0.38, NA, 4.64, 0.10, 0.59, 0.40,…
## $ calculated_host_listings_count <dbl> 6, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, …
## $ availability_365 <dbl> 365, 355, 365, 194, 0, 129, 0, 220, 0, …
# One-sample proportion test: does Brooklyn's share of listings differ
# from 0.40?
# n: number of trials (every listing in the data set).
n <- nrow(airbnb)
# x: number of successes (listings located in Brooklyn). na.rm = TRUE skips
# rows with a missing borough, just as filter() would drop them.
x <- sum(airbnb$neighbourhood_group == "Brooklyn", na.rm = TRUE)
prop.test(x, n, p = 0.40, alternative = "two.sided", conf.level = 0.95)
##
## 1-sample proportions test with continuity correction
##
## data: x out of n, null probability 0.4
## X-squared = 25.358, df = 1, p-value = 4.762e-07
## alternative hypothesis: true p is not equal to 0.4
## 95 percent confidence interval:
## 0.4068024 0.4155452
## sample estimates:
## p
## 0.4111668
## Single Mean Test: mu is the hypothesized value of the population mean under the null hypothesis
# One-sided, one-sample t-test: is the mean listing price greater than 150?
t.test(
  x = airbnb$price,
  alternative = "greater",
  mu = 150,
  conf.level = 0.95
)
##
## One Sample t-test
##
## data: airbnb$price
## t = 2.5051, df = 48894, p-value = 0.006123
## alternative hypothesis: true mean is greater than 150
## 95 percent confidence interval:
## 150.9342 Inf
## sample estimates:
## mean of x
## 152.7207
### Using the Confidence Interval Method
# Two-sided 95% confidence interval for the mean price. No mu is supplied, so
# the reported test is against a mean of 0; only the interval is of interest.
t.test(x = airbnb$price, conf.level = 0.95)
##
## One Sample t-test
##
## data: airbnb$price
## t = 140.62, df = 48894, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 150.5920 154.8494
## sample estimates:
## mean of x
## 152.7207
# Base-graphics box plots of price for each borough. outline = FALSE hides the
# outlier points without removing them from the box statistics.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
boxplot(price ~ neighbourhood_group, data = airbnb,
        outline = FALSE, col = "red")
# Keep one data frame of listings per borough being compared.
manhattan <- airbnb %>%
  filter(neighbourhood_group == "Manhattan")
brooklyn <- airbnb %>%
  filter(neighbourhood_group == "Brooklyn")

# One-sided two-sample t-test: is the mean Brooklyn price lower than the mean
# Manhattan price?
t.test(
  x = brooklyn$price,
  y = manhattan$price,
  alternative = "less",
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: brooklyn$price and manhattan$price
## t = -30.48, df = 37233, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -68.58041
## sample estimates:
## mean of x mean of y
## 124.3832 196.8758
# Bar chart of the share of listings in each room type:
# table() counts the listings, prop.table() turns counts into proportions.
table(airbnb$room_type) %>%
  prop.table() %>%
  barplot(col = "dark blue")
# Number of listings of each of the two room types being compared.
# na.rm = TRUE skips rows with a missing room type, as filter() would.
x1 <- sum(airbnb$room_type == "Entire home/apt", na.rm = TRUE)
x2 <- sum(airbnb$room_type == "Private room", na.rm = TRUE)

# Successes per group, and the denominator for each proportion (all listings).
x <- c(x1, x2)
n <- rep(nrow(airbnb), times = 2)

# One-sided test at the 99% level: is the "Entire home/apt" proportion greater
# than the "Private room" proportion?
# NOTE(review): both counts come from the same set of listings, so the two
# proportions are not independent samples, which the two-sample prop.test
# assumes -- confirm this is the intended test.
prop.test(x, n, alternative = "greater", conf.level = 0.99)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: x out of n
## X-squared = 388.75, df = 1, p-value < 2.2e-16
## alternative hypothesis: greater
## 99 percent confidence interval:
## 0.05561071 1.00000000
## sample estimates:
## prop 1 prop 2
## 0.5196646 0.4566111
# Box plots of price per borough, with the y axis zoomed to 0-1000.
p <- ggplot(data = airbnb)
p + geom_boxplot(mapping = aes(x = neighbourhood_group, y = price,
                               fill = neighbourhood_group)) +
  # coord_cartesian() only zooms the view. ylim() would instead discard the
  # 239 listings priced outside 0-1000 before the medians and quartiles are
  # computed, distorting every box.
  coord_cartesian(ylim = c(0, 1000)) +
  labs(title = "Airbnb NYC Borough Prices")
# No rows are removed, so the "Removed 239 rows" warning no longer appears.
# One-way analysis of variance of listing price across boroughs.
# The formula refers to the columns as airbnb$..., which is why the term in the
# printed table is labelled "airbnb$neighbourhood_group".
results <- aov(airbnb$price ~ airbnb$neighbourhood_group)
# Print the ANOVA table (Df, sums of squares, F value and p-value).
summary(results)
## Df Sum Sq Mean Sq F value Pr(>F)
## airbnb$neighbourhood_group 4 7.959e+07 19897739 355 <2e-16 ***
## Residuals 48890 2.740e+09 56051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tally the listings for every room type / borough pair; count() stores the
# tally in column n.
table_borough_room <- count(airbnb, room_type, neighbourhood_group)

# Heat map of the tallies: one tile per pair, shaded by n.
p <- ggplot(table_borough_room)
p + geom_tile(aes(x = room_type, y = neighbourhood_group, fill = n))
### Contingency table: room types in rows, boroughs in columns
borough_room_table <- table(
  airbnb[["room_type"]],
  airbnb[["neighbourhood_group"]]
)
print(borough_room_table)
##
## Bronx Brooklyn Manhattan Queens Staten Island
## Entire home/apt 379 9559 13199 2096 176
## Private room 652 10132 7982 3372 188
## Shared room 60 413 480 198 9
# Pearson chi-squared test of independence between room type and borough,
# computed on the borough_room_table contingency table. The test is run and
# printed once; running the identical call a second time adds nothing.
result <- chisq.test(borough_room_table)
result
##
## Pearson's Chi-squared test
##
## data: borough_room_table
## X-squared = 1559.6, df = 8, p-value < 2.2e-16