library(tidyverse) library(moderndive) library(ggplot2)
library(skimr) library(gapminder) library(infer)
tinytex::install_tinytex() install.packages(“xfun”)
set.seed(4869)
#bringing in dataframe, has 4969 entries with 7 columns
movie_boxoffice.1 <-
read.csv(“C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/2024
Fall Semester/Inference Data Science/HW 4 Lab/movie_boxoffice-1.csv”)
View(movie_boxoffice.1)
#cleaning up dataframe of duplicates, has 4869 entries with 7 columns
movie_boxoffice_distinct <- movie_boxoffice.1 %>% distinct()
view(movie_boxoffice_distinct)
#0. How many duplicated records were removed from boxoffice data?
What is your random seed number? #Answer: 100 dublicated
records(entries) were removed, and my random seed number is set to
4869
#************************************************************************************************
#Part I: Sampling #1.Treat the box office data as population. Have a
histogram of the global box office earning. Describe shape of
distribution.
ggplot(movie_boxoffice_distinct, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") +
labs(title = "Worldwide Box Office Earnings Distribution", x = "Worldwide Gross", y = "Count")

#Answer: Seems to be an exponential distribution with mode occuring
at 0.
#2. In population, what is average global box earning? What is the
std? What is the proportion of movies who global box earning exceeds
budget?
average_globalbox_earning <- mean(movie_boxoffice_distinct$Worldwide_Gross)
print(average_globalbox_earning)
[1] 95.83149
std_globalbox_earning <- sd(movie_boxoffice_distinct$Worldwide_Gross)
print(std_globalbox_earning)
[1] 177.4594
global_earning_exceeds_budget <- movie_boxoffice_distinct %>%
filter(Worldwide_Gross > Budget)
proportion_exceed <- nrow(global_earning_exceeds_budget) / nrow(movie_boxoffice_distinct)
print(proportion_exceed)
[1] 0.6461286
#Answer:Average global box earning is $95.83 million. Standard
deviation is $177.46 million. Proportion of global box earning exceeding
budget is 64.61%
#3. Take a random sample of 200 movies from the pop. Get histogram of
global box earning, describe shape of distribution. set.seed(4869)
movies_sample <- movie_boxoffice_distinct %>% sample_n(size =
200)
ggplot(movies_sample, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") +
labs(title = "Worldwide Box Office Earnings Distribution", x = "Worldwide Gross", y = "Count")

#Answer: Seems to be an exponential distribution with mode occuring
at 0. Also seems to be similar to the overall population histogram.
#4. In sample, what {is avg global box office earning? is std? is
proportion of global box movie earning exceeding budget?} Are these
summary stats from sample close to population paramets?
average_sample_earning <- mean(movies_sample$Worldwide_Gross)
print(average_sample_earning)
[1] 119.7947
std_sample_earning <- sd(movies_sample$Worldwide_Gross)
print(std_sample_earning)
[1] 201.7954
global_sample_earning_exceeds_budget <- movies_sample %>%
filter(Worldwide_Gross > Budget)
proportion_sample_exceed <- nrow(global_sample_earning_exceeds_budget) / nrow(movie_boxoffice_distinct)
print(proportion_sample_exceed)
[1] 0.02690491
#Answer:Average global box earning is $136.55 million. Standard
deviation is $231.55 million. Proportion of global box earning exceeding
budget is 2.81%. These summary stats differ quite a bit from the
population summary stats. Specifically about $40 million in avg, $54
million in std, and about 62% difference in proportion. This sample may
contain more movies with lesser global box earnings than their budgets
unlike the overall population explaining such low proportion.
#5.Take rand sample of n movies from the pop, calculate the avg
global box office earning, and proportion of global box earning
exceeding budget.
set.seed(4869)
sample_stats <- function(n, reps = 500) {
movie_boxoffice_distinct %>%
rep_sample_n(size = n, reps = reps) %>%
group_by(replicate) %>%
summarize(
gross_avg = mean(Worldwide_Gross),
proportion_ex = mean(Worldwide_Gross > Budget),
)
}
n_val <- sample(50:500, 1)
output_ex <- sample_stats(n_val)
print(output_ex)
overall_avg <- mean(output_ex$gross_avg)
cat(
"Sample Size =", n_val,
": AVG Global Box Office Earnings =", round(overall_avg, 2),
", Proportion Exceeding Budget =", round(mean(output_ex$proportion_ex, na.rm = TRUE), 4)
)
Sample Size = 78 : AVG Global Box Office Earnings = 95.06 , Proportion Exceeding Budget = 0.6478
#6 For each n=20, 50, 100, and 200, get histrogram of the avg global
box earning using face_wrap(), also get mean and std error
set.seed(4869)
get_repetition_stats <- function(n, reps = 500) {
movie_boxoffice_distinct %>%
rep_sample_n(size = n, reps = reps) %>%
group_by(replicate) %>%
summarize(
gross_mean = mean(Worldwide_Gross),
proportion_exceed = mean(Worldwide_Gross > Budget)
)
}
n_20 <- get_repetition_stats(20)
n_50 <- get_repetition_stats(50)
n_100 <- get_repetition_stats(100)
n_200 <- get_repetition_stats(200)
n_20$Sample_Size <- 20
n_50$Sample_Size <- 50
n_100$Sample_Size <- 100
n_200$Sample_Size <- 200
allSamplesDF <- bind_rows(n_20, n_50, n_100, n_200)
allSamplesDF$Sample_Size <- rep(c(20, 50, 100, 200), each = 500)
ggplot(allSamplesDF, aes(x = gross_mean)) + geom_histogram(fill = "lightblue", color = "black") + facet_wrap(~ Sample_Size, scales = "free") + labs(title = "Average Global Box Office Earnings Distribution by n Size", x = "Worldwide Gross Mean", y ="Count")

summary_stats <- allSamplesDF %>%
group_by(Sample_Size) %>%
summarize(
mean_allgross = mean(gross_mean),
std_err_allgross = sd(gross_mean) / sqrt(n())
)
print(summary_stats)
#7. Compare distributions among the different n values, and also
compare them to the distribution from the population in Q1 #Answer: For
distrubtions among the different n values, starting at n = 20, more
closely aligned with the distribution from the population in Q1, but
mode no longer being at 0, but a value around $70 million. Going up the
scale on the n values to 50, 100, and 200, the distribtuion was
approaching normality, especially with n being 200. Although mode count
decreased, the concisesness of it went up to around the value of $90
million.
#8. For each n = {20, 50, 100, 200}, get the histogram of the
proportion of movies whose global box office earning exceeds budget,
using face_wrap, also get mean and std error
set.seed(4869)
get_repetition_prop <- function(n, reps = 500) {
movie_boxoffice_distinct %>%
rep_sample_n(size = n, reps = reps) %>%
group_by(replicate) %>%
summarize(
proportion_exceed = mean(Worldwide_Gross > Budget)
)
}
pn_20 <- get_repetition_prop(20)
pn_50 <- get_repetition_prop(50)
pn_100 <- get_repetition_prop(100)
pn_200 <- get_repetition_prop(200)
pn_20$Sample_Size <- 20
pn_50$Sample_Size <- 50
pn_100$Sample_Size <- 100
pn_200$Sample_Size <- 200
allSamDF <- bind_rows(n_20, n_50, n_100, n_200)
allSamDF$Sample_Size <- rep(c(20, 50, 100, 200), each = 500)
ggplot(allSamDF, aes(x = proportion_exceed)) + geom_histogram(fill = "lightblue", color = "black") + facet_wrap(~ Sample_Size, scales = "free") + labs(title = "Proportion of Successful Movies by n Size", x = "Proportion of Worldwide Gross Exceeding Budget", y ="Count")

prop_summary_stats <- allSamDF %>%
group_by(Sample_Size) %>%
summarize(
mean_proportion = mean(proportion_exceed),
std_err_allgross = sd(proportion_exceed) / sqrt(n())
)
print(prop_summary_stats)
#9. Compare the distributions among different n values. #Answer: For
n = 20 describles a somewhat left-skewed distribtuion with mode being at
0.7. For n = 50, distribtuion appears to be normal, while at n = 100,
repeating similarities with n = 20, but now there are two modes making
it a bimodal left-skewed distribution. For N = 200, distribution appears
to be more normal similar to n = 50.
#************************************************************************************************
#Part II: Bootstrapping
#10. Use data from Q3 as the initial sample, use bootstrapping method
to resample once with 200 movies. Are there any duplicated movies,
expected or not?
set.seed(4869)
bootstrap_samp <- movies_sample %>%
rep_sample_n(size = 200, replace = TRUE) %>%
summarize(gross_avg = mean(Worldwide_Gross))
print(bootstrap_samp)
set.seed(4869)
duplicateNum <- movies_sample %>%
rep_sample_n(size = 200, replace = TRUE) %>%
count(Movie) %>%
filter(n > 1)
print(duplicateNum)
#Answer:Yes, there were duplicated movies, was this expected? Of
course it was expected because we set replace to be TRUE allowing picked
movies to be picked again.
#11.Get the histrogram of global box office earning in the bootstrap
sample. Describe the shape of distribution and compare it to Q3
bootstrap2_samp <- movies_sample %>%
rep_sample_n(size = 200, replace = TRUE)
ggplot(bootstrap2_samp, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Boostrap Sample Histogram of Worldwide Box Earnings", x = "Worldwide Gross", y ="Count")

#Answer: The shape of bootstrap sample histogram is closer to the
exponential distribution, with mode still occuring at 0, and more
variance with higher counts of outliers than as seen in Q3.
#12.In the bootstrap sample from Q10, what {is the avg global box
earning, is the std, is the proportion of succesful movies?} Are they
close enough to those in the intial sample?
avg_gross <- mean(bootstrap2_samp$Worldwide_Gross)
print(avg_gross)
[1] 127.9658
std_gross <- sd(bootstrap2_samp$Worldwide_Gross)
print(std_gross)
[1] 221.3411
prop_ex_bu <- mean(bootstrap2_samp$Worldwide_Gross > bootstrap2_samp$Budget)
print(prop_ex_bu)
[1] 0.69
#Answer: Average gross given is $127.97 million, Std dev is $221.34
million, and proportion of successful movies is 69%. These are pretty
close to the intial values of avg gross being $136.55 million, std dev
as $231.55 million, with the biggest defference being proportion from
2.81%. However, this is now closer to the overall population proportion
of 64.61.
#13. Get the boostrapping distr of the avg global box earning, and
proportion of movies whose global box earning exceeded budget, by
resampling 500 times with bootstrapping method. Get the mean and std
error of the average global box earning and mean and std error for
proportion.
set.seed(3223)
bootstrap_samples <- movies_sample %>%
rep_sample_n(size = nrow(movies_sample), reps = 2000, replace = TRUE)
bootstrap_stats <- bootstrap_samples %>%
summarize(
bootstrap_grossmean = mean(Worldwide_Gross),
bootstrap_prop = mean(Worldwide_Gross > Budget)
)
bootstrap_grossavg <- mean(bootstrap_stats$bootstrap_grossmean)
print(bootstrap_grossavg)
[1] 119.7433
bootstrap_std_grossavg <- sd(bootstrap_stats$bootstrap_grossmean)
print(bootstrap_std_grossavg)
[1] 14.21178
mean_prop <- mean(bootstrap_stats$bootstrap_prop)
print(mean_prop)
[1] 0.65501
se_prop <- sd(bootstrap_stats$bootstrap_prop) / sqrt(nrow(bootstrap_stats))
print(se_prop)
[1] 0.0007548658
ggplot(bootstrap_stats, aes(x = bootstrap_grossmean)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Average Global Box Office Earnings Distribution", x = "AVG Worldwide Gross", y = "Count")

ggplot(bootstrap_stats, aes(x = bootstrap_prop)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Proportion of Successful Movies", x = "Successful Movies", y = "Count")

#Answer: Given mean of Average global office earning is $120.30
million, with std dev being $14.59 million. Proportion of succesful
movies given is 65.44%, and the standard error for it is 0.16%. The
distribution for Average Global Box Earnings is right leaning with
occuring around $112 million, and for the poportion of succesful moves,
distribution is more normal with a slight left skew and mode occuring
around 67.5%.
#***************************************************************************************************************************************************************************
#Stat291 Homework 5 #1 The confidence interval of the average global box
office earning of all movies from 1980 to 2018
#setting up bootstrap distribution from given sample referenced in homework 4
set.seed(3333)
movie_stats <- movies_sample %>%
specify(response = Worldwide_Gross) %>%
generate(reps = 1500, type = "bootstrap") %>%
calculate(stat = "mean")
#visualization of the distribution
visualize(movie_stats)

ci_percentile <- movie_stats %>%
get_confidence_interval(level = 0.95, type = "percentile")
print(ci_percentile)
bootstrap_mean <- mean(movie_stats$stat)
bootstrap_se <- sd(movie_stats$stat)
ci_std_err <- c(
lower = bootstrap_mean - 1.96 * bootstrap_se,
upper = bootstrap_mean + 1.96 * bootstrap_se
)
print(ci_std_err)
lower upper
92.09295 146.59990
ci_theor <- c(
lower = bootstrap_mean - qnorm(0.975) * bootstrap_se,
upper = bootstrap_mean + qnorm(0.975) * bootstrap_se
)
print(ci_theor)
lower upper
92.09345 146.59940
#Answer: The lower ci and upper ci given are $93.40 million and
$148.49 million with 95% confidence, means that we can be 95% confident
that the average global box earnings for all movies released between
1980 and 2018 falls within this range. The standard error and
theoretical method were close but with values of $91 million and $146.6
million. The slight right skew observed in the bootstrap distribution
can help explain the slight difference between these two and the
percentile method.
#2 The confidence interval of the difference of average global box
earnings between movies in the summer (June, July, and August) and
movies in the rest of year
set.seed(4444)
movies_sample2 <- movies_sample %>%
mutate(summer = ifelse(Month %in% c("Jun", "Jul", "Aug"), "Summer", "Other"))
summer_stats <- movies_sample2 %>%
specify(response = Worldwide_Gross, explanatory = summer) %>%
generate(reps = 1750, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("Summer", "Other"))
visualize(summer_stats)

ci_summer <- summer_stats %>%
get_confidence_interval(level = 0.95, type = "percentile")
print(ci_summer)
summer_mean <- mean(summer_stats$stat)
summer_se <- sd(summer_stats$stat)
ci_sum_std_err <- c(
lower = summer_mean - 1.96 * summer_se,
upper = summer_mean + 1.96 * summer_se
)
print(ci_sum_std_err)
lower upper
-24.35445 140.63298
sum_ci_theor <- c(
lower = summer_mean - qnorm(0.975) * summer_se,
upper = summer_mean + qnorm(0.975) * summer_se
)
print(sum_ci_theor)
lower upper
-24.35293 140.63146
#Answer: We can be 95% confident that average global box earnings lie
between -16.47 million and 145.77 million dollars, and since the
interval includes zero, there cannot be a conclusion of a satistically
significant difference between summer and non-summer movies at this
level. From the theoretical and standard error method, the confidence
interval obtained was between -$24.35 million and $140.63 million that
can be explained by the distribution being slightly skewed right.
#3.The confidence interval of the difference of proportions of movies
whose global box office earnings exceeds budget
bootstrap_diff <- bootstrap_samples %>%
group_by(replicate) %>%
summarize(
prop_exc_budg = mean(Worldwide_Gross > Budget)
) %>%
mutate(
diff_prop = prop_exc_budg
)
mean_diff_prop <- mean(bootstrap_diff$diff_prop)
se_diff_prop <-sd(bootstrap_diff$diff_prop)
ggplot(bootstrap_diff, aes(x = diff_prop)) +
geom_histogram(binwidth = 0.01, color = "black", fill ="lightblue") +
labs(title = "Bootstrap Distribution of Difference in Proportions", x = "Difference in Proportions", y = "Count")

#c(0.025,0.975) captures center 95% while leaving 2.5% off both ends
ci_prop <- quantile(bootstrap_diff$diff_prop, probs = c(
0.025, 0.975
))
print(ci_prop)
2.5% 97.5%
0.59 0.72
ci_prop_se <- c(
lower = mean_diff_prop - 1.96 * se_diff_prop,
upper = mean_diff_prop + 1.96 * se_diff_prop
)
print(ci_prop_se)
lower upper
0.5888431 0.7211769
prop_ci_theor <- c(
lower = mean_diff_prop - qnorm(0.975) * se_diff_prop,
upper = mean_diff_prop + qnorm(0.975) * se_diff_prop
)
print(prop_ci_theor)
lower upper
0.5888443 0.7211757
#Answer: Seeing that the numbers range from about 0.59 to 0.72, means
that we can be 95% confident that the proportion for global box earnings
exceeding budget will fall within this interval. The distribution
appears to have a slight left-skew.
#4 The confidence interval, of the difference of proportions of
movies whose global box office earnings exceeds budget between movies
released from 1980 to 1999 and those released from 2000 and 2018 #using
data obtained in part 3 of this lab, I adjusted it to
prop_strap_diff <- bootstrap_samples %>%
group_by(replicate) %>%
summarize(
prop_exc_1980_1999 = mean(ifelse(Year >= 1980 & Year <= 1999 & !is.na(Worldwide_Gross), Worldwide_Gross > Budget, NA), na.rm = TRUE),
prop_exc_2000_2018 = mean(ifelse(Year >= 2000 & Year <= 2018 & !is.na(Worldwide_Gross), Worldwide_Gross > Budget, NA), na.rm = TRUE)
) %>%
mutate(
diff_prop2 = prop_exc_2000_2018 - prop_exc_1980_1999
)
mean_diff_prop2 <- mean(prop_strap_diff$diff_prop2)
se_diff_prop2 <- sd(prop_strap_diff$diff_prop2)
ggplot(prop_strap_diff, aes(x = diff_prop2)) +
geom_histogram(binwidth = 0.01, color = "black", fill = "lightblue") +
labs(title = "Bootstrap Distribution of Difference in Proportions between 1980-1999 and 2000-2018", x = "Difference in Proportions between 1980-1999 and 2000-2018", y = "Count")

#c(0.025,0.975) captures center 95% while leaving 2.5% off both ends
ci_prop2 <- quantile(prop_strap_diff$diff_prop2, probs = c(
0.025, 0.975
))
print(ci_prop2)
2.5% 97.5%
-0.04787311 0.31644719
ci_prop_se2 <- c(
lower = mean_diff_prop2 - 1.96 * se_diff_prop2,
upper = mean_diff_prop2 + 1.96 * se_diff_prop2
)
print(ci_prop_se2)
lower upper
-0.04577925 0.31359360
prop_ci_theor2 <- c(
lower = mean_diff_prop2 - qnorm(0.975) * se_diff_prop2,
upper = mean_diff_prop2 + qnorm(0.975) * se_diff_prop2
)
print(prop_ci_theor2)
lower upper
-0.04577595 0.31359030
#Answer: Observing the different confident interval values, ranging
from approx -0.046 to 0.314 or as in the percentile method: -0.048 to
0.316. Since zero is included within all intervals, at the 95%
confidence level, it can be stated that there is no statistically
significant difference between both time periods from (1980-1999 to
2000-2018).
---
title: "R Notebook"
output:
  html_notebook: default
  html_document:
    df_print: paged
  pdf_document: default
---
library(tidyverse)
library(moderndive)
library(ggplot2)
library(skimr)
library(gapminder)
library(infer)
tinytex::install_tinytex()
install.packages("xfun")


set.seed(4869)

#bringing in dataframe, has 4969 entries with 7 columns
movie_boxoffice.1 <- read.csv("C:/Users/zahir/OneDrive/Desktop/Rutgers Semester Folders/2024 Fall Semester/Inference Data Science/HW 4 Lab/movie_boxoffice-1.csv")
  View(movie_boxoffice.1)

#cleaning up dataframe of duplicates, has 4869 entries with 7 columns
movie_boxoffice_distinct <- movie_boxoffice.1 %>%
  distinct()
  view(movie_boxoffice_distinct)
  
#0. How many duplicated records were removed from boxoffice data? What is your random seed number?
#Answer: 100 dublicated records(entries) were removed, and my random seed number is set to 4869

#************************************************************************************************
#Part I: Sampling
#1.Treat the box office data as population. Have a histogram of the global box office earning. Describe shape of distribution.
```{r Histogram of the Global Box Earning}
ggplot(movie_boxoffice_distinct, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") + 
labs(title = "Worldwide Box Office Earnings Distribution", x = "Worldwide Gross", y = "Count")
```
#Answer: Seems to be an exponential distribution with mode occuring at 0.

#2. In population, what is average global box earning? What is the std? What is the proportion of movies who global box earning exceeds budget?
```{r Part 1 Step 2 Code}
average_globalbox_earning <- mean(movie_boxoffice_distinct$Worldwide_Gross)
print(average_globalbox_earning)

std_globalbox_earning <- sd(movie_boxoffice_distinct$Worldwide_Gross)
print(std_globalbox_earning)

global_earning_exceeds_budget <- movie_boxoffice_distinct %>%
  filter(Worldwide_Gross > Budget)
proportion_exceed <- nrow(global_earning_exceeds_budget) / nrow(movie_boxoffice_distinct)
print(proportion_exceed)
```
#Answer:Average global box earning is $95.83 million. Standard deviation is $177.46 million. Proportion of global box earning exceeding budget is 64.61%

#3. Take a random sample of 200 movies from the pop. Get histogram of global box earning, describe shape of distribution.
set.seed(4869)
movies_sample <- movie_boxoffice_distinct %>%
  sample_n(size = 200)
```{r Part 1 Step 3 Histogram}
ggplot(movies_sample, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") + 
labs(title = "Worldwide Box Office Earnings Distribution", x = "Worldwide Gross", y = "Count")
```
#Answer: Seems to be an exponential distribution with mode occuring at 0. Also seems to be similar to the overall population histogram.

#4. In sample, what {is avg global box office earning? is std? is proportion of global box movie earning exceeding budget?} Are these summary stats from sample close to population paramets?
```{r Part 1 Step 4 Code}
average_sample_earning <- mean(movies_sample$Worldwide_Gross)
print(average_sample_earning)

std_sample_earning <- sd(movies_sample$Worldwide_Gross)
print(std_sample_earning)

global_sample_earning_exceeds_budget <- movies_sample %>%
  filter(Worldwide_Gross > Budget)
proportion_sample_exceed <- nrow(global_sample_earning_exceeds_budget) / nrow(movie_boxoffice_distinct)
print(proportion_sample_exceed)
```
#Answer:Average global box earning is $136.55 million. Standard deviation is $231.55 million. Proportion of global box earning exceeding budget is 2.81%. These summary stats differ quite a bit from the population summary stats. Specifically about $40 million in avg, $54 million in std, and about 62% difference in proportion. This sample may contain more movies with lesser global box earnings than their budgets unlike the overall population explaining such low proportion.

#5.Take rand sample of n movies from the pop, calculate the avg global box office earning, and proportion of global box earning exceeding budget.
```{r 500 Iterations}
set.seed(4869)
sample_stats <- function(n, reps = 500) {
  movie_boxoffice_distinct %>%
    rep_sample_n(size = n, reps = reps) %>%
    group_by(replicate) %>%
    summarize(
      gross_avg = mean(Worldwide_Gross),
      proportion_ex = mean(Worldwide_Gross > Budget),
    )
}

n_val <- sample(50:500, 1)

output_ex <- sample_stats(n_val)
print(output_ex)

overall_avg <- mean(output_ex$gross_avg)
cat(
  "Sample Size =", n_val, 
  ": AVG Global Box Office Earnings =", round(overall_avg, 2),
  ", Proportion Exceeding Budget =", round(mean(output_ex$proportion_ex, na.rm = TRUE), 4)
  )
```

#6 For each n=20, 50, 100, and 200, get histrogram of the avg global box earning using face_wrap(), also get mean and std error

```{r Histogram by n Size}
set.seed(4869)
get_repetition_stats <- function(n, reps = 500) {
  movie_boxoffice_distinct %>%
    rep_sample_n(size = n, reps = reps) %>%
    group_by(replicate) %>%
      summarize(
        gross_mean = mean(Worldwide_Gross),
        proportion_exceed = mean(Worldwide_Gross > Budget)
      )
}

n_20 <- get_repetition_stats(20)
n_50 <- get_repetition_stats(50)
n_100 <- get_repetition_stats(100)
n_200 <- get_repetition_stats(200)

n_20$Sample_Size <- 20
n_50$Sample_Size <- 50
n_100$Sample_Size <- 100
n_200$Sample_Size <- 200
  
allSamplesDF <- bind_rows(n_20, n_50, n_100, n_200)

allSamplesDF$Sample_Size <- rep(c(20, 50, 100, 200), each = 500)

ggplot(allSamplesDF, aes(x = gross_mean)) + geom_histogram(fill = "lightblue", color = "black") + facet_wrap(~ Sample_Size, scales = "free") + labs(title = "Average Global Box Office Earnings Distribution by n Size", x = "Worldwide Gross Mean", y ="Count")
```
```{r Mean and std error of average global box earning}
summary_stats <- allSamplesDF %>%
  group_by(Sample_Size) %>%
  summarize(
    mean_allgross = mean(gross_mean),
    std_err_allgross = sd(gross_mean) / sqrt(n())
  )
print(summary_stats)
```

#7. Compare distributions among the different n values, and also compare them to the distribution from the population in Q1
#Answer: For distrubtions among the different n values, starting at n = 20, more closely aligned with the distribution from the population in Q1, but mode no longer being at 0, but a value around $70 million. Going up the scale on the n values to 50, 100, and 200, the distribtuion was approaching normality, especially with n being 200. Although mode count decreased, the concisesness of it went up to around the value of $90 million.

#8. For each n = {20, 50, 100, 200}, get the histogram of the proportion of movies whose global box office earning exceeds budget, using face_wrap, also get mean and std error
```{r Histograms of Proportion of Successful Movies by n Size}
set.seed(4869)
get_repetition_prop <- function(n, reps = 500) {
  movie_boxoffice_distinct %>%
    rep_sample_n(size = n, reps = reps) %>%
    group_by(replicate) %>%
      summarize(
        proportion_exceed = mean(Worldwide_Gross > Budget)
      )
}

pn_20 <- get_repetition_prop(20)
pn_50 <- get_repetition_prop(50)
pn_100 <- get_repetition_prop(100)
pn_200 <- get_repetition_prop(200)

pn_20$Sample_Size <- 20
pn_50$Sample_Size <- 50
pn_100$Sample_Size <- 100
pn_200$Sample_Size <- 200
  
allSamDF <- bind_rows(n_20, n_50, n_100, n_200)

allSamDF$Sample_Size <- rep(c(20, 50, 100, 200), each = 500)

ggplot(allSamDF, aes(x = proportion_exceed)) + geom_histogram(fill = "lightblue", color = "black") + facet_wrap(~ Sample_Size, scales = "free") + labs(title = "Proportion of Successful Movies by n Size", x = "Proportion of Worldwide Gross Exceeding Budget", y ="Count")
```
```{r}
prop_summary_stats <- allSamDF %>%
  group_by(Sample_Size) %>%
  summarize(
    mean_proportion = mean(proportion_exceed),
    std_err_allgross = sd(proportion_exceed) / sqrt(n())
  )
print(prop_summary_stats)
```

#9. Compare the distributions among different n values. 
#Answer: For n = 20 describles a somewhat left-skewed distribtuion with mode being at 0.7. For n = 50, distribtuion appears to be normal, while at n = 100, repeating similarities with n = 20, but now there are two modes making it a bimodal left-skewed distribution. For N = 200, distribution appears to be more normal similar to n = 50.

#************************************************************************************************
#Part II: Bootstrapping

#10. Use data from Q3 as the initial sample, use bootstrapping method to resample once with 200 movies. Are there any duplicated movies, expected or not?
```{r}
set.seed(4869)

bootstrap_samp <- movies_sample %>%
  rep_sample_n(size = 200, replace = TRUE) %>%
  summarize(gross_avg = mean(Worldwide_Gross))
print(bootstrap_samp)
```
```{r}
set.seed(4869)
duplicateNum <- movies_sample %>%
  rep_sample_n(size = 200, replace = TRUE) %>%
  count(Movie) %>%
  filter(n > 1)
print(duplicateNum)
```

#Answer:Yes, there were duplicated movies, was this expected? Of course it was expected because we set replace to be TRUE allowing picked movies to be picked again.

#11.Get the histrogram of global box office earning in the bootstrap sample. Describe the shape of distribution and compare it to Q3
```{r}
bootstrap2_samp <- movies_sample %>%
  rep_sample_n(size = 200, replace = TRUE)

ggplot(bootstrap2_samp, aes(x = Worldwide_Gross)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Boostrap Sample Histogram of Worldwide Box Earnings", x = "Worldwide Gross", y ="Count")
```
#Answer: The shape of bootstrap sample histogram is closer to the exponential distribution, with mode still occuring at 0, and more variance with higher counts of outliers than as seen in Q3.

#12.In the bootstrap sample from Q10, what {is the avg global box earning, is the std, is the proportion of succesful movies?} Are they close enough to those in the intial sample?
```{r}
avg_gross <- mean(bootstrap2_samp$Worldwide_Gross)
print(avg_gross)

std_gross <- sd(bootstrap2_samp$Worldwide_Gross)
print(std_gross)

prop_ex_bu <- mean(bootstrap2_samp$Worldwide_Gross > bootstrap2_samp$Budget)
print(prop_ex_bu)
```
#Answer: Average gross given is $127.97 million, Std dev is $221.34 million, and proportion of successful movies is 69%. These are pretty close to the intial values of avg gross being $136.55 million, std dev as $231.55 million, with the biggest defference being proportion from 2.81%. However, this is now closer to the overall population proportion of 64.61.

#13. Get the boostrapping distr of the avg global box earning, and proportion of movies whose global box earning exceeded budget, by resampling 500 times with bootstrapping method. Get the mean and std error of the average global box earning and mean and std error for proportion.
```{r}
set.seed(3223)
bootstrap_samples <- movies_sample %>%
  rep_sample_n(size = nrow(movies_sample), reps = 2000, replace = TRUE)

bootstrap_stats <- bootstrap_samples %>%
  summarize(
    bootstrap_grossmean = mean(Worldwide_Gross),
    bootstrap_prop = mean(Worldwide_Gross > Budget)
  )
bootstrap_grossavg <- mean(bootstrap_stats$bootstrap_grossmean)
print(bootstrap_grossavg)

bootstrap_std_grossavg <- sd(bootstrap_stats$bootstrap_grossmean)
print(bootstrap_std_grossavg)

mean_prop <- mean(bootstrap_stats$bootstrap_prop)
print(mean_prop)

se_prop <- sd(bootstrap_stats$bootstrap_prop) / sqrt(nrow(bootstrap_stats))
print(se_prop)
```
```{r Earnings Distribution }
ggplot(bootstrap_stats, aes(x = bootstrap_grossmean)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Average Global Box Office Earnings Distribution", x = "AVG Worldwide Gross", y = "Count")
```
```{r}
ggplot(bootstrap_stats, aes(x = bootstrap_prop)) + geom_histogram(fill = "lightblue", color = "black") + labs(title = "Proportion of Successful Movies", x = "Successful Movies", y = "Count")

```

#Answer: Given mean of Average global office earning is $120.30 million, with std dev being $14.59 million. Proportion of succesful movies given is 65.44%, and the standard error for it is 0.16%. The distribution for Average Global Box Earnings is right leaning with occuring around $112 million, and for the poportion of succesful moves, distribution is more normal with a slight left skew and mode occuring around 67.5%.

#***************************************************************************************************************************************************************************
#Stat291 Homework 5
#1 The confidence interval of the average global box office earning of all movies from 1980 to 2018
```{r Confidence Interval Bootstrap Distribution 1}
#setting up bootstrap distribution from given sample referenced in homework 4
set.seed(3333)

movie_stats <- movies_sample %>%
  specify(response = Worldwide_Gross) %>%
  generate(reps = 1500, type = "bootstrap") %>%
  calculate(stat = "mean")

#visualization of the distribution
visualize(movie_stats)
```
```{r Percentile method 1}
ci_percentile <- movie_stats %>%
  get_confidence_interval(level = 0.95, type = "percentile")

print(ci_percentile)
```
```{r Standard Error Method 1}
bootstrap_mean <- mean(movie_stats$stat)
bootstrap_se <- sd(movie_stats$stat)

ci_std_err <- c(
  lower = bootstrap_mean - 1.96 * bootstrap_se,
  upper = bootstrap_mean + 1.96 * bootstrap_se
)

print(ci_std_err)
```
```{r Theoretical Method 1}
ci_theor <- c(
  lower = bootstrap_mean - qnorm(0.975) * bootstrap_se,
  upper = bootstrap_mean + qnorm(0.975) * bootstrap_se
)

print(ci_theor)
```

#Answer: The lower ci and upper ci given are $93.40 million and $148.49 million with 95% confidence, means that we can be 95% confident that the average global box earnings for all movies released between 1980 and 2018 falls within this range. The standard error and theoretical method were close but with values of $91 million and $146.6 million. The slight right skew observed in the bootstrap distribution can help explain the slight difference between these two and the percentile method.

#2 The confidence interval of the difference of average global box earnings between movies in the summer (June, July, and August) and movies in the rest of year
```{r Confidence Interval Bootstrap Distribution 2}
set.seed(4444)

movies_sample2 <- movies_sample %>%
  mutate(summer = ifelse(Month %in% c("Jun", "Jul", "Aug"), "Summer", "Other"))

summer_stats <- movies_sample2 %>%
  specify(response = Worldwide_Gross, explanatory = summer) %>%
  generate(reps = 1750, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("Summer", "Other"))


visualize(summer_stats)
```
```{r Percentile Method 2}
ci_summer <- summer_stats %>%
  get_confidence_interval(level = 0.95, type = "percentile")

print(ci_summer)
```
```{r standard error method 2}
summer_mean <- mean(summer_stats$stat)
summer_se <- sd(summer_stats$stat)

ci_sum_std_err <- c(
  lower = summer_mean - 1.96 * summer_se,
  upper = summer_mean + 1.96 * summer_se
)

print(ci_sum_std_err)
```
```{r Theoretical Method 2}
sum_ci_theor <- c(
  lower = summer_mean - qnorm(0.975) * summer_se,
  upper = summer_mean + qnorm(0.975) * summer_se
)

print(sum_ci_theor)
```
#Answer: We can be 95% confident that average global box earnings lie between -16.47 million and 145.77 million dollars, and since the interval includes zero, there cannot be a conclusion of a satistically significant difference between summer and non-summer movies at this level. From the theoretical and standard error method, the confidence interval obtained was between -$24.35 million and $140.63 million that can be explained by the distribution being slightly skewed right.

#3.The confidence interval of the difference of proportions of movies whose global box office earnings exceeds budget
```{r Confidence Interval Bootstrap Distribution 3}
bootstrap_diff <- bootstrap_samples %>%
  group_by(replicate) %>%
  summarize(
    prop_exc_budg = mean(Worldwide_Gross > Budget)
  ) %>%
  mutate(
    diff_prop = prop_exc_budg
  )

mean_diff_prop <- mean(bootstrap_diff$diff_prop)

se_diff_prop <-sd(bootstrap_diff$diff_prop)

ggplot(bootstrap_diff, aes(x = diff_prop)) +
  geom_histogram(binwidth = 0.01, color = "black", fill ="lightblue") +
  labs(title = "Bootstrap Distribution of Difference in Proportions", x = "Difference in Proportions", y = "Count")
```
```{r Percentile method 3}
#c(0.025,0.975) captures center 95% while leaving 2.5% off both ends
ci_prop <- quantile(bootstrap_diff$diff_prop, probs = c(
  0.025, 0.975
  ))

print(ci_prop)
```
```{r standard error method 3}
ci_prop_se <- c(
  lower = mean_diff_prop - 1.96 * se_diff_prop,
  upper = mean_diff_prop + 1.96 * se_diff_prop
  )

print(ci_prop_se)
```
```{r Theoretical Method 3}
prop_ci_theor <- c(
  lower = mean_diff_prop - qnorm(0.975) * se_diff_prop,
  upper = mean_diff_prop + qnorm(0.975) * se_diff_prop
)

print(prop_ci_theor)
```
#Answer: Seeing that the numbers range from about 0.59 to 0.72, means that we can be 95% confident that the proportion for global box earnings exceeding budget will fall within this interval. The distribution appears to have a slight left-skew.

#4 The confidence interval, of the difference of proportions of movies whose global box office earnings exceeds budget between movies released from 1980 to 1999 and those released from 2000 and 2018
#using data obtained in part 3 of this lab, I adjusted it to 
```{r Confidence Interval Bootstrap Distribution 4}
prop_strap_diff <- bootstrap_samples %>%
  group_by(replicate) %>%
  summarize(
    prop_exc_1980_1999 = mean(ifelse(Year >= 1980 & Year <= 1999 & !is.na(Worldwide_Gross), Worldwide_Gross > Budget, NA), na.rm = TRUE),
    prop_exc_2000_2018 = mean(ifelse(Year >= 2000 & Year <= 2018 & !is.na(Worldwide_Gross), Worldwide_Gross > Budget, NA), na.rm = TRUE)
  ) %>%
  mutate(
    diff_prop2 = prop_exc_2000_2018 - prop_exc_1980_1999
  )

mean_diff_prop2 <- mean(prop_strap_diff$diff_prop2)
se_diff_prop2 <- sd(prop_strap_diff$diff_prop2)

ggplot(prop_strap_diff, aes(x = diff_prop2)) +
  geom_histogram(binwidth = 0.01, color = "black", fill = "lightblue") +
  labs(title = "Bootstrap Distribution of Difference in Proportions between 1980-1999 and 2000-2018", x = "Difference in Proportions between 1980-1999 and 2000-2018", y = "Count")
```
```{r Percentile method 4}
#c(0.025,0.975) captures center 95% while leaving 2.5% off both ends
ci_prop2 <- quantile(prop_strap_diff$diff_prop2, probs = c(
  0.025, 0.975
  ))

print(ci_prop2)

```
```{r standard error method 4}
ci_prop_se2 <- c(
  lower = mean_diff_prop2 - 1.96 * se_diff_prop2,
  upper = mean_diff_prop2 + 1.96 * se_diff_prop2
  )

print(ci_prop_se2)
```
```{r Theoretical Method 4}
prop_ci_theor2 <- c(
  lower = mean_diff_prop2 - qnorm(0.975) * se_diff_prop2,
  upper = mean_diff_prop2 + qnorm(0.975) * se_diff_prop2
)

print(prop_ci_theor2)
```
#Answer: Observing the different confident interval values, ranging from approx -0.046 to 0.314 or as in the percentile method: -0.048 to 0.316. Since zero is included within all intervals, at the 95% confidence level, it can be stated that there is no statistically significant difference between both time periods from (1980-1999 to 2000-2018).


