library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the comma-delimited AB_NYC_2019 listings file into a tibble
data <- read_delim(file = "./AB_NYC_2019.csv", delim = ",")
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date (1): last_review
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the structure of the loaded data: column names, types and leading values
str(data)
## spc_tbl_ [48,895 × 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:48895] 2539 2595 3647 3831 5022 ...
## $ name : chr [1:48895] "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
## $ host_id : num [1:48895] 2787 2845 4632 4869 7192 ...
## $ host_name : chr [1:48895] "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
## $ neighbourhood_group : chr [1:48895] "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
## $ neighbourhood : chr [1:48895] "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
## $ latitude : num [1:48895] 40.6 40.8 40.8 40.7 40.8 ...
## $ longitude : num [1:48895] -74 -74 -73.9 -74 -73.9 ...
## $ room_type : chr [1:48895] "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
## $ price : num [1:48895] 149 225 150 89 80 200 60 79 79 150 ...
## $ minimum_nights : num [1:48895] 1 1 3 1 10 3 45 2 2 1 ...
## $ number_of_reviews : num [1:48895] 9 45 0 270 9 74 49 430 118 160 ...
## $ last_review : Date[1:48895], format: "2018-10-19" "2019-05-21" ...
## $ reviews_per_month : num [1:48895] 0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
## $ calculated_host_listings_count: num [1:48895] 6 2 1 1 1 1 1 1 1 4 ...
## $ availability_365 : num [1:48895] 365 355 365 194 0 129 0 220 0 188 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. name = col_character(),
## .. host_id = col_double(),
## .. host_name = col_character(),
## .. neighbourhood_group = col_character(),
## .. neighbourhood = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. room_type = col_character(),
## .. price = col_double(),
## .. minimum_nights = col_double(),
## .. number_of_reviews = col_double(),
## .. last_review = col_date(format = ""),
## .. reviews_per_month = col_double(),
## .. calculated_host_listings_count = col_double(),
## .. availability_365 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Create 5 random samples, each consisting of about 50% of the dataset. These samples should include both categorical and continuous variables.
# Fix the RNG seed so the five draws below are reproducible
set.seed(123)
# Row count of the full dataset, and the per-sample row count (half, rounded down)
total_size <- nrow(data)
smple_size <- floor(total_size * 0.5)
# Draw five subsamples of row indices with replacement, so a given row can
# appear more than once within a sample
df_1 <- data[sample.int(total_size, size = smple_size, replace = TRUE), ]
df_2 <- data[sample.int(total_size, size = smple_size, replace = TRUE), ]
df_3 <- data[sample.int(total_size, size = smple_size, replace = TRUE), ]
df_4 <- data[sample.int(total_size, size = smple_size, replace = TRUE), ]
df_5 <- data[sample.int(total_size, size = smple_size, replace = TRUE), ]
# Sanity check: print the row counts of the first two samples
cat("Sample 1 size:", nrow(df_1), "\nSample 2 size:", nrow(df_2), "\n")
## Sample 1 size: 24447
## Sample 2 size: 24447
# Calculate summary statistics for each subsample, starting with sample 1.
# summary() reports min/quartiles/mean/max (plus NA counts) for numeric and
# date columns, and length/class/mode for character columns.
summary(df_1)
## id name host_id host_name
## Min. : 2595 Length:24447 Min. : 2438 Length:24447
## 1st Qu.: 9485229 Class :character 1st Qu.: 7792684 Class :character
## Median :19745541 Mode :character Median : 31086628 Mode :character
## Mean :19030359 Mean : 68212626
## 3rd Qu.:29103275 3rd Qu.:107708708
## Max. :36484665 Max. :274311461
##
## neighbourhood_group neighbourhood latitude longitude
## Length:24447 Length:24447 Min. :40.51 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:24447 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 105.0 Median : 2.000 Median : 5.00
## Mean : 150.6 Mean : 6.849 Mean : 23.16
## 3rd Qu.: 175.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :999.000 Max. :576.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-03-28 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-02 1st Qu.: 0.180 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.710 Median : 1.000
## Mean :2018-09-29 Mean : 1.368 Mean : 7.038
## 3rd Qu.:2019-06-23 3rd Qu.: 2.010 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :27.950 Max. :327.000
## NA's :4975 NA's :4975
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 43.0
## Mean :111.7
## 3rd Qu.:224.0
## Max. :365.0
##
# Per-column summary statistics for subsample 2
summary(df_2)
## id name host_id host_name
## Min. : 2595 Length:24447 Min. : 2438 Length:24447
## 1st Qu.: 9495671 Class :character 1st Qu.: 7780845 Class :character
## Median :19678436 Mode :character Median : 31451485 Mode :character
## Mean :18981163 Mean : 67567314
## 3rd Qu.:29028668 3rd Qu.:107434423
## Max. :36485609 Max. :274273284
##
## neighbourhood_group neighbourhood latitude longitude
## Length:24447 Length:24447 Min. :40.51 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:24447 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 68.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 105.0 Median : 3.000 Median : 5.00
## Mean : 152.4 Mean : 7.104 Mean : 23.61
## 3rd Qu.: 175.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :9999.0 Max. :999.000 Max. :543.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-03-28 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-05 1st Qu.: 0.190 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.710 Median : 1.000
## Mean :2018-10-02 Mean : 1.373 Mean : 7.171
## 3rd Qu.:2019-06-23 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :58.500 Max. :327.000
## NA's :5054 NA's :5054
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 43.0
## Mean :113.2
## 3rd Qu.:231.0
## Max. :365.0
##
# Per-column summary statistics for subsample 3
summary(df_3)
## id name host_id host_name
## Min. : 2539 Length:24447 Min. : 2438 Length:24447
## 1st Qu.: 9367923 Class :character 1st Qu.: 7852646 Class :character
## Median :19670458 Mode :character Median : 31621291 Mode :character
## Mean :18963191 Mean : 68134291
## 3rd Qu.:29140876 3rd Qu.:107434423
## Max. :36484087 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:24447 Length:24447 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.90 Max. :-73.72
##
## room_type price minimum_nights number_of_reviews
## Length:24447 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 105.0 Median : 2.000 Median : 5.00
## Mean : 154.8 Mean : 7.063 Mean : 23.65
## 3rd Qu.: 177.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :999.000 Max. :597.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-05-12 Min. : 0.01 Min. : 1.000
## 1st Qu.:2018-07-13 1st Qu.: 0.19 1st Qu.: 1.000
## Median :2019-05-20 Median : 0.72 Median : 1.000
## Mean :2018-10-04 Mean : 1.38 Mean : 7.118
## 3rd Qu.:2019-06-23 3rd Qu.: 2.05 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :27.95 Max. :327.000
## NA's :4879 NA's :4879
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 46.0
## Mean :113.3
## 3rd Qu.:226.0
## Max. :365.0
##
# Per-column summary statistics for subsample 4
summary(df_4)
## id name host_id host_name
## Min. : 5121 Length:24447 Min. : 2787 Length:24447
## 1st Qu.: 9459134 Class :character 1st Qu.: 7836518 Class :character
## Median :19720565 Mode :character Median : 31489150 Mode :character
## Mean :19010719 Mean : 68409333
## 3rd Qu.:29183837 3rd Qu.:108397151
## Max. :36487245 Max. :274298453
##
## neighbourhood_group neighbourhood latitude longitude
## Length:24447 Length:24447 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.72
##
## room_type price minimum_nights number_of_reviews
## Length:24447 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 70.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 109.0 Median : 3.000 Median : 5.00
## Mean : 153.6 Mean : 7.055 Mean : 23.57
## 3rd Qu.: 179.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.000 Max. :597.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-04-25 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-14 1st Qu.: 0.190 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.730 Median : 1.000
## Mean :2018-10-04 Mean : 1.384 Mean : 7.408
## 3rd Qu.:2019-06-23 3rd Qu.: 2.060 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :58.500 Max. :327.000
## NA's :5086 NA's :5086
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 49.0
## Mean :114.7
## 3rd Qu.:234.0
## Max. :365.0
##
# Per-column summary statistics for subsample 5
summary(df_5)
## id name host_id host_name
## Min. : 2539 Length:24447 Min. : 2787 Length:24447
## 1st Qu.: 9325410 Class :character 1st Qu.: 7636846 Class :character
## Median :19596021 Mode :character Median : 30283594 Mode :character
## Mean :18870931 Mean : 66881845
## 3rd Qu.:28941216 3rd Qu.:106601922
## Max. :36485431 Max. :274307600
##
## neighbourhood_group neighbourhood latitude longitude
## Length:24447 Length:24447 Min. :40.51 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.72
##
## room_type price minimum_nights number_of_reviews
## Length:24447 Min. : 0.0 Min. : 1.000 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.000 1st Qu.: 1.00
## Mode :character Median : 105.0 Median : 3.000 Median : 5.00
## Mean : 151.9 Mean : 6.938 Mean : 23.19
## 3rd Qu.: 175.0 3rd Qu.: 5.000 3rd Qu.: 24.00
## Max. :10000.0 Max. :500.000 Max. :576.00
##
## last_review reviews_per_month calculated_host_listings_count
## Min. :2011-05-12 Min. : 0.010 Min. : 1.000
## 1st Qu.:2018-07-16 1st Qu.: 0.190 1st Qu.: 1.000
## Median :2019-05-19 Median : 0.720 Median : 1.000
## Mean :2018-10-03 Mean : 1.354 Mean : 7.213
## 3rd Qu.:2019-06-23 3rd Qu.: 2.000 3rd Qu.: 2.000
## Max. :2019-07-08 Max. :16.220 Max. :327.000
## NA's :5111 NA's :5111
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 45.0
## Mean :112.6
## 3rd Qu.:226.0
## Max. :365.0
##
# Preview the first rows of the full (unsampled) dataset for reference
head(data)
## # A tibble: 6 × 16
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 2539 Clean & qu… 2787 John Brooklyn Kensington 40.6
## 2 2595 Skylit Mid… 2845 Jennifer Manhattan Midtown 40.8
## 3 3647 THE VILLAG… 4632 Elisabeth Manhattan Harlem 40.8
## 4 3831 Cozy Entir… 4869 LisaRoxa… Brooklyn Clinton Hill 40.7
## 5 5022 Entire Apt… 7192 Laura Manhattan East Harlem 40.8
## 6 5099 Large Cozy… 7322 Chris Manhattan Murray Hill 40.7
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## # minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## # reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## # availability_365 <dbl>
Insight:
Collecting data from a population can be simulated by drawing five
random samples, each containing 50% as many rows as the original
dataset. Comparing several such subsets makes it possible to assess the
inherent variability in the dataset and to determine how stable or
variable the results may be.
Significance: This step shows how conclusions about a
population can change depending on which sample is chosen. Sampling
with replacement replicates the random variation that can occur in
real data collection.
Further Questions:
How much variation between samples should I expect before I can be
confident that the conclusions are stable?
Would a different sample size change this variability?
Group by neighbourhood_group and summarize the average price and number of reviews for each sample.
Compare the consistency of the subsamples, identifying differences and anomalies.
# Summary statistics for each sample
# Helper: mean price and mean number of reviews per neighbourhood_group.
#   df: a data frame with neighbourhood_group, price and number_of_reviews
#       columns.
#   Returns one row per neighbourhood_group with mean_price and mean_reviews;
#   missing values are dropped before averaging (na.rm = TRUE).
summarise_by_neighbourhood_group <- function(df) {
  df %>%
    group_by(neighbourhood_group) %>%
    summarise(
      mean_price = mean(price, na.rm = TRUE),
      mean_reviews = mean(number_of_reviews, na.rm = TRUE)
    )
}
# Apply the same summary to each of the five subsamples
df_1_summary <- summarise_by_neighbourhood_group(df_1)
df_2_summary <- summarise_by_neighbourhood_group(df_2)
df_3_summary <- summarise_by_neighbourhood_group(df_3)
df_4_summary <- summarise_by_neighbourhood_group(df_4)
df_5_summary <- summarise_by_neighbourhood_group(df_5)
# Show one of the summaries as a spot check
print(df_5_summary)
## # A tibble: 5 × 3
## neighbourhood_group mean_price mean_reviews
## <chr> <dbl> <dbl>
## 1 Bronx 88.0 28.9
## 2 Brooklyn 126. 23.5
## 3 Manhattan 193. 21.4
## 4 Queens 102. 27.2
## 5 Staten Island 109. 29.6
# Stack the five per-sample summaries into a single table; the added 'sample'
# column records which subsample each row came from
combined_summary <- rbind(
  mutate(df_1_summary, sample = "Sample 1"),
  mutate(df_2_summary, sample = "Sample 2"),
  mutate(df_3_summary, sample = "Sample 3"),
  mutate(df_4_summary, sample = "Sample 4"),
  mutate(df_5_summary, sample = "Sample 5")
)
Visualize the differences between samples using bar plots or other visualizations to detect potential anomalies and trends.
# Visualize the average price comparison across neighbourhood groups and samples.
# One cluster of bars per neighbourhood group, one bar per subsample.
# geom_col() draws bar heights straight from mean_price; it is the idiomatic
# equivalent of geom_bar(stat = "identity").
ggplot(
  combined_summary,
  aes(x = neighbourhood_group, y = mean_price, fill = sample)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Price by Neighbourhood Group Across Subsamples",
    x = "Neighbourhood Group",
    y = "Average Price (USD)"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")
# Visualize the average number of reviews comparison across neighbourhood groups and samples.
# One cluster of bars per neighbourhood group, one bar per subsample.
# geom_col() draws bar heights straight from mean_reviews; it is the idiomatic
# equivalent of geom_bar(stat = "identity").
ggplot(
  combined_summary,
  aes(x = neighbourhood_group, y = mean_reviews, fill = sample)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Number of Reviews by Neighbourhood Group Across Subsamples",
    x = "Neighbourhood Group",
    y = "Average Number of Reviews"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")
Insight: Comparing the subsamples reveals
variation in the summary statistics, such as the mean price and the mean
number of reviews within each neighbourhood group. Some
subsamples may contain more extreme values or anomalies that do not occur
consistently across all samples.
For example:
If the average price in one neighbourhood group is much higher in
Sample 1 than in Sample 3, that difference may point to an anomaly.
When a specific pattern, such as the distribution of review counts,
remains consistent across all subsamples, it indicates that the pattern
in question is robust.
Significance: This procedure helps in understanding which
characteristics of the data are stable and which vary with the
sample. It demonstrates how drawing conclusions from
a single sample can be misleading, particularly if anomalies or outliers have a
significant impact on that sample.
Further Questions:
Are there any elements (like rare listings) that have a disproportionate
impact on the results in certain samples?
How can I detect anomalies in my whole dataset if they only show up in
particular subsamples?
Monte Carlo simulations can be used to estimate the probability that the sample differences you detect are the result of chance. For instance, to determine how much variation occurs naturally, repeatedly simulate random sampling and calculate the average price of each simulated sample.
# Example Monte Carlo simulation of average price across 1000 samples
set.seed(123)
# Hoist the loop invariants: only the price column is needed, so extract it
# once instead of subsetting every column of the tibble in each iteration
listing_prices <- data$price
n_listings <- length(listing_prices)
# Each replicate draws smple_size prices with replacement and records their
# mean; the result is a numeric vector of 1000 simulated average prices
monte_carlo_sim <- replicate(1000, {
  resampled_prices <- listing_prices[
    sample.int(n_listings, size = smple_size, replace = TRUE)
  ]
  mean(resampled_prices, na.rm = TRUE)
})
# Distribution of the simulated averages
hist(monte_carlo_sim, breaks = 30, main = "Monte Carlo Simulation: Average Prices", xlab = "Average Price")
Insight: Monte Carlo simulations can be used to
determine the likelihood that variations in average price or reviews
are the result of pure chance. Running many simulations (1,000 here) and
computing the average price in each one, for example, shows how much the
average naturally fluctuates.
Significance: This method provides a range of plausible results, which
indicates the degree of confidence you can have in the inferences made
from a single sample. If the difference
between the random samples is within the simulation’s predicted range,
it suggests that the difference can be explained by sampling
variation alone.
Further Questions:
If the simulated averages form a wide distribution, what does that
indicate about the reliability of estimates from a single sample?
How can I include estimates of uncertainty in my analysis so that my
conclusions are more nuanced?