# Function to calculate statistical measures
calculate_stats <- function(data, variables) {
stats <- data %>%
summarise(across(all_of(variables), list(
mean = mean,
variance = var,
kurtosis = kurtosis,
skewness = skewness
), na.rm = TRUE))
return(stats)
}
# Function to create histogram and QQ plot for a variable
plot_distribution <- function(data, variable, method) {
# Histogram
hist_plot <- ggplot(data, aes_string(variable)) +
geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
labs(title = paste("Histogram of", variable, "(", method, " Sampling)"), x = variable, y = "Count") +
theme_minimal()
# QQ Plot
qq_plot <- ggplot(data, aes(sample = get(variable))) +
stat_qq() +
stat_qq_line() +
labs(title = paste("QQ Plot of", variable, "(", method, " Sampling)"), x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
list(hist = hist_plot, qq = qq_plot)
}
# Variables for statistical analysis
variables <- c("stars", "useful", "funny", "cool")
# Function to generate plots for all variables
generate_plots <- function(data, method) {
plots <- lapply(variables, function(var) plot_distribution(data, var, method))
names(plots) <- variables
return(plots)
}
##########################################################
# Step 1. Load the data
# set your current directory
setwd("/Users/whinton/src/rstudio/tim8521")
##########################################################
# Read the file as a csv file
yelp_data <- read.csv("yelp.csv", header = TRUE, sep= ",",stringsAsFactors = TRUE)
## 'data.frame': 10000 obs. of 10 variables:
## $ business_id: Factor w/ 4174 levels "_-9pMxBWtG_x8l4rHWBasg",..: 774 4138 565 2 566 135 4134 1773 3740 2543 ...
## $ date : Factor w/ 1995 levels "2005-04-18","2005-07-03",..: 1286 1468 1790 1045 1629 225 943 1818 1854 1119 ...
## $ review_id : Factor w/ 10000 levels "__esH_kgJZeS8k3i6HaG7Q",..: 3754 4520 4479 3792 632 5645 7375 4862 9307 4840 ...
## $ stars : int 5 5 4 5 5 4 5 4 4 5 ...
## $ text : Factor w/ 9998 levels "- the location is excellent\n- the food is mediocre, and milder than they advertise\n- the wait staff is polite"| __truncated__,..: 6005 3288 5506 6818 1793 6730 1321 5591 1199 6107 ...
## $ type : Factor w/ 1 level "review": 1 1 1 1 1 1 1 1 1 1 ...
## $ user_id : Factor w/ 6403 levels "__FXEOrWIjXMOElz2pGlBQ",..: 4693 215 244 5373 5568 4934 5658 362 5455 4959 ...
## $ cool : int 2 0 0 1 0 4 7 0 0 0 ...
## $ useful : int 5 0 1 2 0 3 7 1 0 1 ...
## $ funny : int 0 0 0 0 0 1 4 0 0 0 ...
## Rows: 10,000
## Columns: 10
## $ business_id <fct> 9yKzy9PApeiPPOUJEtnvkg, ZRJwVLyzEJq1VAihDhYiow, 6oRAC4uyJC…
## $ date <fct> 2011-01-26, 2011-07-27, 2012-06-14, 2010-05-27, 2012-01-05…
## $ review_id <fct> fWKvX83p0-ka4JS3dc6E5A, IjZ33sJrzXqU-0X6U8NwyA, IESLBzqUCL…
## $ stars <int> 5, 5, 4, 5, 5, 4, 5, 4, 4, 5, 5, 5, 5, 4, 4, 2, 3, 5, 3, 4…
## $ text <fct> "My wife took me here on my birthday for breakfast and it …
## $ type <fct> review, review, review, review, review, review, review, re…
## $ user_id <fct> rLtl8ZkDX5vH5nAx9C3q5Q, 0a2KyEL0d3Yb1V6aivbIuQ, 0hT2KtfLio…
## $ cool <int> 2, 0, 0, 1, 0, 4, 7, 0, 0, 0, 1, 1, 1, 1, 1, 0, 3, 0, 5, 1…
## $ useful <int> 5, 0, 1, 2, 0, 3, 7, 1, 0, 1, 3, 1, 2, 2, 1, 2, 4, 0, 6, 1…
## $ funny <int> 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 4, 1…
###########################################################
# Generate Full Data Set Statistics
###########################################################
# Full dataset statistics
full_stats <- calculate_stats(yelp_data, variables)
print("Full Dataset Statistics:")
## [1] "Full Dataset Statistics:"
print(full_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.7775 1.475341 2.840981 -0.8821582 1.4093
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 5.45992 130.6002 6.755662 0.7013 3.640242
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 133.3332 8.124305 0.8768 4.276049 222.4071
## cool_skewness
## 1 9.258388
###########################################################
# Simple Random Sampling
###########################################################
set.seed(123)
simple_sample <- yelp_data %>% sample_n(1000)
simple_stats <- calculate_stats(simple_sample, variables)
print("Simple Random Sampling Statistics:")
## [1] "Simple Random Sampling Statistics:"
print(simple_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.78 1.445045 2.94245 -0.913572 1.404
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 5.203988 44.8599 4.932241 0.708 3.582318
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 52.89832 5.941408 0.885 4.105881 46.02684
## cool_skewness
## 1 5.272453
###########################################################
# Stratified Sampling
###########################################################
set.seed(123)
strata_sample <- strata(yelp_data, stratanames = "type", size = c(333, 333, 334), method = "srswor")
strata_data <- getdata(yelp_data, strata_sample)
stratified_stats <- calculate_stats(strata_data, variables)
print("Stratified Sampling Statistics:")
## [1] "Stratified Sampling Statistics:"
print(stratified_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.828829 1.36519 3.222136 -1.003411 1.261261
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 3.753826 22.86791 3.513179 0.6336336 2.925612
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 54.8561 5.875073 0.7177177 2.546583 19.50137
## cool_skewness
## 1 3.633898
###########################################################
# Systematic Sampling
###########################################################
set.seed(123)
interval <- floor(nrow(yelp_data) / 1000)
systematic_sample <- yelp_data[seq(1, nrow(yelp_data), by = interval), ]
systematic_stats <- calculate_stats(systematic_sample, variables)
print("Systematic Sampling Statistics:")
## [1] "Systematic Sampling Statistics:"
print(systematic_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.773 1.456928 2.749176 -0.8395172 1.347
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 4.236828 40.09334 4.452215 0.679 3.599559
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 176.1742 10.14089 0.839 3.244323 66.77288
## cool_skewness
## 1 6.044068
###########################################################
# Cluster Sampling
###########################################################
### 4.
set.seed(123)
clusters <- sample(unique(yelp_data$business_id), 50) # Select 50 clusters
cluster_sample <- yelp_data %>% filter(business_id %in% clusters)
cluster_stats <- calculate_stats(cluster_sample, variables)
print("Cluster Sampling Statistics:")
## [1] "Cluster Sampling Statistics:"
print(cluster_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.865672 1.320166 2.928431 -0.9001491 1.686567
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 4.96869 8.32978 2.110691 0.8358209 2.484121
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 13.40474 2.944058 1.059701 2.748289 6.818735
## cool_skewness
## 1 2.02172
###########################################################
# Bootstrapping
###########################################################
set.seed(123)
bootstrap_sample <- yelp_data[sample(1:nrow(yelp_data), 1000, replace = TRUE), ]
bootstrap_stats <- calculate_stats(bootstrap_sample, variables)
print("Bootstrapping Sampling Statistics:")
## [1] "Bootstrapping Sampling Statistics:"
print(bootstrap_stats)
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.769 1.453092 2.889616 -0.8911918 1.413
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 5.397829 42.12017 4.77934 0.724 3.701526
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 50.19624 5.794951 0.903 4.313905 42.72485
## cool_skewness
## 1 5.092931
###########################################################
# Generate Plots
###########################################################
# Generate plots for each sampling method
simple_plots <- generate_plots(simple_sample, "Simple Random")
stratified_plots <- generate_plots(strata_data, "Stratified")
systematic_plots <- generate_plots(systematic_sample, "Systematic")
cluster_plots <- generate_plots(cluster_sample, "Cluster")
bootstrap_plots <- generate_plots(bootstrap_sample, "Bootstrap")
# Example: Print histograms and QQ plots for "stars" in each sampling method
print(simple_plots$stars$hist)
print(simple_plots$stars$qq)
print(stratified_plots$stars$hist)
print(stratified_plots$stars$qq)
print(systematic_plots$stars$hist)
print(systematic_plots$stars$qq)
print(cluster_plots$stars$hist)
print(cluster_plots$stars$qq)
print(bootstrap_plots$stars$hist)
print(bootstrap_plots$stars$qq)
# Example: Print histograms and QQ plots for "useful" in each sampling method
print(simple_plots$useful$hist)
print(simple_plots$useful$qq)
print(stratified_plots$useful$hist)
print(stratified_plots$useful$qq)
print(systematic_plots$useful$hist)
print(systematic_plots$useful$qq)
print(cluster_plots$useful$hist)
print(cluster_plots$useful$qq)
print(bootstrap_plots$useful$hist)
print(bootstrap_plots$useful$qq)
## [1] "Comparison of Sampling Methods:"
## stars_mean stars_variance stars_kurtosis stars_skewness useful_mean
## 1 3.777500 1.475341 2.840981 -0.8821582 1.409300
## 2 3.780000 1.445045 2.942450 -0.9135720 1.404000
## 3 3.828829 1.365190 3.222136 -1.0034108 1.261261
## 4 3.773000 1.456928 2.749176 -0.8395172 1.347000
## 5 3.865672 1.320166 2.928431 -0.9001491 1.686567
## 6 3.769000 1.453092 2.889616 -0.8911918 1.413000
## useful_variance useful_kurtosis useful_skewness funny_mean funny_variance
## 1 5.459920 130.60023 6.755662 0.7013000 3.640242
## 2 5.203988 44.85990 4.932241 0.7080000 3.582318
## 3 3.753826 22.86791 3.513179 0.6336336 2.925612
## 4 4.236828 40.09334 4.452215 0.6790000 3.599559
## 5 4.968690 8.32978 2.110691 0.8358209 2.484121
## 6 5.397829 42.12017 4.779340 0.7240000 3.701526
## funny_kurtosis funny_skewness cool_mean cool_variance cool_kurtosis
## 1 133.33317 8.124305 0.8768000 4.276049 222.407054
## 2 52.89832 5.941408 0.8850000 4.105881 46.026836
## 3 54.85610 5.875073 0.7177177 2.546583 19.501367
## 4 176.17423 10.140895 0.8390000 3.244323 66.772882
## 5 13.40474 2.944058 1.0597015 2.748289 6.818735
## 6 50.19624 5.794951 0.9030000 4.313905 42.724849
## cool_skewness Sample_Type
## 1 9.258388 Full
## 2 5.272453 Simple Random
## 3 3.633898 Stratified
## 4 6.044068 Systematic
## 5 2.021720 Cluster
## 6 5.092931 Bootstrap
Overview
This study investigates the effectiveness of various sampling techniques—simple random sampling, stratified sampling, systematic sampling, cluster sampling, and bootstrapping—in addressing challenges associated with high-dimensional datasets, exemplified by the Yelp Reviews Dataset. These challenges include computational complexity, data sparsity, and reduced interpretability. Through statistical analysis and visualization (e.g., histograms and QQ plots), the study evaluates the sampled subsets against the full dataset, focusing on key measures such as mean, variance, kurtosis, and skewness.
Objective
To determine the utility of different sampling methods in mitigating the curse of dimensionality and ensuring the representativeness of high-dimensional datasets, particularly text-based datasets like Yelp Reviews. The study aims to identify techniques that balance computational efficiency with statistical reliability.
Key Findings
Each sampling method produced subsets that captured varying levels of
similarity to the full dataset. * Simple Random Sampling: Provided
unbiased representations but required larger sample sizes to address
data sparsity in high-dimensional contexts.
* Stratified Sampling: Ensured proportional representation across
business categories, yielding the most consistent statistical measures
compared to the full dataset.
* Systematic Sampling: Delivered results comparable to random sampling
but exhibited potential biases depending on interval selection.
* Cluster Sampling: Introduced higher variability due to uneven cluster
representation, particularly affecting variance and skewness.
* Bootstrapping: Effectively replicated the dataset’s overall
distribution, with slight variability in higher-order moments (kurtosis
and skewness).
Key Takeaways
Future Steps
Hybrid Techniques: Explore combining sampling methods with
dimensionality reduction approaches (e.g., PCA or t-SNE) for improved
computational efficiency and representativeness.
Automated Interval Selection: Develop systematic sampling strategies
that minimize bias from interval choices in high-dimensional
contexts.
Advanced Clustering: Refine cluster sampling by incorporating
cluster-balancing algorithms to address representation gaps.
Extended Metrics: Integrate additional evaluation criteria, such as
predictive accuracy in downstream machine learning tasks, to further
assess sampling effectiveness.
Conclusion
This study underscores the value of strategic sampling in high-dimensional data analysis, with stratified sampling standing out for its proportional representation and robust statistical consistency. Bootstrapping demonstrated strong capabilities in replicating dataset distributions and accommodating variability. By tailoring sampling methods to dataset characteristics and analytical goals, researchers and analysts can efficiently handle high-dimensional data challenges, paving the way for more effective and scalable analyses. Future work should prioritize hybrid approaches and advanced optimization techniques to enhance these methods further.
.
This study performed by Will Hinton