set.seed(1234)
boxoffice_raw <- read_csv("movie_boxoffice.csv") %>% clean_names()
n_raw <- nrow(boxoffice_raw)
boxoffice <- boxoffice_raw %>% distinct()
n_dedup <- nrow(boxoffice)
n_removed <- n_raw - n_dedup
n_removed
## [1] 100
ggplot(boxoffice, aes(x = worldwide_gross)) +
geom_histogram(bins = 30) +
labs(title = "Population: Histogram of Worldwide Gross",
x = "Worldwide_Gross (Millions)", y = "Count")
The distribution is right skewed
pop_mean <- mean(boxoffice$worldwide_gross, na.rm = TRUE)
pop_sd <- sd(boxoffice$worldwide_gross, na.rm = TRUE)
pop_prop <- mean(boxoffice$worldwide_gross > boxoffice$budget, na.rm = TRUE)
tibble(pop_mean, pop_sd, pop_prop)
## # A tibble: 1 Ă— 3
## pop_mean pop_sd pop_prop
## <dbl> <dbl> <dbl>
## 1 95.8 177. 0.646
average global box earning is 95.83149 standard deviation is 177.4594 Proportion of movies whose global box office earning exceeds budget is 0.6461286
sample200 <- boxoffice %>% slice_sample(n = 200)
ggplot(sample200, aes(x = worldwide_gross)) +
geom_histogram(bins = 30) +
labs(title = "Sample (n = 200): Histogram of Worldwide Gross",
x = "Worldwide_Gross (Millions)", y = "Count")
s200_mean <- mean(sample200$worldwide_gross, na.rm = TRUE)
s200_sd <- sd(sample200$worldwide_gross, na.rm = TRUE)
s200_prop <- mean(sample200$worldwide_gross > sample200$budget, na.rm = TRUE)
tibble(s200_mean, s200_sd, s200_prop,
pop_mean, pop_sd, pop_prop)
## # A tibble: 1 Ă— 6
## s200_mean s200_sd s200_prop pop_mean pop_sd pop_prop
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 91.7 184. 0.635 95.8 177. 0.646
Sample statistics are similar to population parameters. The mean is slightly lower in the sample, which is normal in sampling variation. What is higher is the standard deviation. The sample represents the population well.
set.seed(get0(".Random.seed")[1]) # lock in the current seed choice
ns <- c(20, 50, 100, 200)
reps <- 500
rep_results <- map_dfr(ns, function(n) {
map_dfr(1:reps, function(r) {
s <- boxoffice %>% slice_sample(n = n)
tibble(
n = n,
rep = r,
mean_world = mean(s$worldwide_gross, na.rm = TRUE),
prop_over_budget = mean(s$worldwide_gross > s$budget, na.rm = TRUE)
)
})
})
rep_results %>% glimpse()
## Rows: 2,000
## Columns: 4
## $ n <dbl> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2…
## $ rep <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ mean_world <dbl> 106.98740, 102.29320, 169.84338, 71.49980, 120.27923,…
## $ prop_over_budget <dbl> 0.60, 0.75, 0.70, 0.55, 0.70, 0.70, 0.60, 0.75, 0.75,…
ggplot(rep_results, aes(x = mean_world)) +
geom_histogram(bins = 30) +
facet_wrap(~ n, scales = "free") +
labs(title = "Sampling Distribution of Sample Mean (500 reps)",
x = "Sample Mean of Worldwide_Gross (Millions)", y = "Count")
rep_mean_summary <- rep_results %>%
group_by(n) %>%
summarise(
mean_of_means = mean(mean_world, na.rm = TRUE),
se_of_means = sd(mean_world, na.rm = TRUE),
.groups = "drop"
)
rep_mean_summary
## # A tibble: 4 Ă— 3
## n mean_of_means se_of_means
## <dbl> <dbl> <dbl>
## 1 20 95.4 39.3
## 2 50 94.3 25.0
## 3 100 96.3 18.1
## 4 200 96.1 13.0
As sample size n increases, the sampling distributions become more symmetric around the population mean of 96 million, showing the sample mean is an unbiased estimator. Smaller samples n=20 or n = 50 show greater spread and mild right skew, while larger ones n=100 or 200 are more compact and nearly normal. Unlike the strongly right skewed population in Q1, these distributions are smoother and more centered on the mean.
ggplot(rep_results, aes(x = prop_over_budget)) +
geom_histogram(bins = 30) +
facet_wrap(~ n, scales = "free") +
labs(title = "Sampling Distribution of Sample Proportion (500 reps)",
x = "Sample Proportion: Worldwide_Gross > Budget", y = "Count")
rep_prop_summary <- rep_results %>%
group_by(n) %>%
summarise(
mean_of_props = mean(prop_over_budget, na.rm = TRUE),
se_of_props = sd(prop_over_budget, na.rm = TRUE),
.groups = "drop"
)
rep_prop_summary
## # A tibble: 4 Ă— 3
## n mean_of_props se_of_props
## <dbl> <dbl> <dbl>
## 1 20 0.641 0.105
## 2 50 0.645 0.0646
## 3 100 0.644 0.0476
## 4 200 0.647 0.0341
boot1 <- sample200 %>% slice_sample(n = 200, replace = TRUE)
boot1_n <- nrow(boot1)
boot1_distinct_n <- nrow(boot1 %>% distinct())
boot1_dups <- boot1_n - boot1_distinct_n
boot1_dups
## [1] 81
I see that there are duplicated movies in teh sample. 77 cases of duplicates but this is expected because the bootstrap re samples with replacement.
ggplot(boot1, aes(x = worldwide_gross)) +
geom_histogram(bins = 30) +
labs(title = "Bootstrap Sample (n = 200): Histogram of Worldwide Gross",
x = "Worldwide_Gross (Millions)", y = "Count")
from above we can see that the histogram is strongly right skewed. This
shape matches pretty well to the Q3 samples distribution
b1_mean <- mean(boot1$worldwide_gross, na.rm = TRUE)
b1_sd <- sd(boot1$worldwide_gross, na.rm = TRUE)
b1_prop <- mean(boot1$worldwide_gross > boot1$budget, na.rm = TRUE)
tibble(b1_mean, b1_sd, b1_prop,
s200_mean, s200_sd, s200_prop)
## # A tibble: 1 Ă— 6
## b1_mean b1_sd b1_prop s200_mean s200_sd s200_prop
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 80.3 183. 0.62 91.7 184. 0.635
In the bootstrap sample, the average global box office earning was $97.0 million with a standard deviation of $217.9 million. About 60% of movies earned more than their budget. Compared to the initial sample the bootstrap values are very similar showing that the resampling produces consistent estimates close to those from the original data
boot_results <- map_dfr(1:500, function(r) {
bs <- sample200 %>% slice_sample(n = 200, replace = TRUE)
tibble(
rep = r,
mean_world = mean(bs$worldwide_gross, na.rm = TRUE),
prop_over_budget = mean(bs$worldwide_gross > bs$budget, na.rm = TRUE)
)
})
boot_mean_summary <- boot_results %>%
summarise(
mean_of_means = mean(mean_world, na.rm = TRUE),
se_of_means = sd(mean_world, na.rm = TRUE)
)
boot_prop_summary <- boot_results %>%
summarise(
mean_of_props = mean(prop_over_budget, na.rm = TRUE),
se_of_props = sd(prop_over_budget, na.rm = TRUE)
)
boot_mean_summary
## # A tibble: 1 Ă— 2
## mean_of_means se_of_means
## <dbl> <dbl>
## 1 91.7 12.6
boot_prop_summary
## # A tibble: 1 Ă— 2
## mean_of_props se_of_props
## <dbl> <dbl>
## 1 0.633 0.0337
The bootstrap results show an average global box office of $91.3M (SE = $13.5M) and a proportion of 0.635 (SE = 0.032), both closely matching the original sample estimates.
compare_means <- bind_rows(
rep_results %>% filter(n == 200) %>% transmute(method = "Sampling n=200", value = mean_world),
boot_results %>% transmute(method = "Bootstrap n=200", value = mean_world)
)
ggplot(compare_means, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ method, scales = "free") +
labs(title = "Sample Mean: Sampling vs Bootstrap (n=200)",
x = "Sample Mean of Worldwide_Gross (Millions)", y = "Count")
compare_props <- bind_rows(
rep_results %>% filter(n == 200) %>% transmute(method = "Sampling n=200", value = prop_over_budget),
boot_results %>% transmute(method = "Bootstrap n=200", value = prop_over_budget)
)
ggplot(compare_props, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ method, scales = "free") +
labs(title = "Sample Proportion: Sampling vs Bootstrap (n=200)",
x = "Sample Proportion: Worldwide_Gross > Budget", y = "Count")
Both the sampling and bootstrap distributions for n=200 are roughly symmetric and centered around similar values. The bootstrap histograms appear slightly smoother but have nearly the same spread as the sampling ones, indicating that bootstrapping provides a good approximation of the sampling variability for both the mean and the proportion.