# Load the age-gap data set.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where that file exists -- consider a project-relative path.
data <- read.csv("C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV")

# Fix the RNG seed so the five subsamples below are reproducible.
set.seed(42)

# Every subsample holds half as many rows as the full data set. The size does
# not depend on the loop index, so it is computed once.
sample_size <- round(0.5 * nrow(data))

# Create df_1 ... df_5 in the current environment; later code refers to them
# by these names.
for (i in seq_len(5)) {
  # Row indices are drawn WITH replacement, so a row can occur more than once
  # within a subsample. seq_len() is the safe idiom here: 1:nrow(data) would
  # be c(1, 0) for an empty data frame.
  random_indices <- sample(
    seq_len(nrow(data)),
    size = sample_size,
    replace = TRUE
  )
  # drop = FALSE keeps the result a data frame even for single-column input.
  sample_df <- data[random_indices, , drop = FALSE]
  assign(paste0("df_", i), sample_df)
}
# head() returns six rows by default, so the label says six.
print("First 6 rows of df_1:")
## [1] "First 6 rows of df_1:"
head(df_1)
##                 movie_name release_year          director age_difference
## 561      Y tu mamá también         2001    Alfonso Cuarón              9
## 321  From Russia with Love         1963     Terence Young             14
## 1098              Slackers         2002       Dewey Nicks              1
## 634                  Click         2006      Frank Coraci              7
## 49            Café Society         2016       Woody Allen             28
## 1152        The Lake House         2006 Alejandro Agresti              0
##      couple_number   actor_1_name    actor_2_name character_1_gender
## 561              1  Maribel Verdú      Diego Luna              woman
## 321              1   Sean Connery       Aliza Gur                man
## 1098             1     Devon Sawa      Jaime King                man
## 634              1   Adam Sandler Kate Beckinsale                man
## 49               1   Steve Carell Kristen Stewart                man
## 1152             1 Sandra Bullock    Keanu Reeves              woman
##      character_2_gender actor_1_birthdate actor_2_birthdate actor_1_age
## 561                 man        02-10-1970        29-12-1979          31
## 321               woman        25-08-1930        01-04-1944          33
## 1098              woman        07-09-1978        23-04-1979          24
## 634               woman        09-09-1966        26-07-1973          40
## 49                woman        16-08-1962        09-04-1990          54
## 1152                man        26-07-1964        02-09-1964          42
##      actor_2_age
## 561           22
## 321           19
## 1098          23
## 634           33
## 49            26
## 1152          42
# head() returns six rows by default, so the label says six.
print("First 6 rows of df_2:")
## [1] "First 6 rows of df_2:"
head(df_2)
##                         movie_name release_year          director
## 298  Once Upon a Time in Hollywood         2019 Quentin Tarantino
## 17                        Shopgirl         2005      Anand Tucker
## 482             Husbands and Wives         1992       Woody Allen
## 532               It's Complicated         2009      Nancy Meyers
## 76               I Married a Witch         1942        René Clair
## 1112      The Theory of Everything         2014       James Marsh
##      age_difference couple_number      actor_1_name   actor_2_name
## 298              15             1 Leonardo DiCaprio   Lorenza Izzo
## 17               34             1      Steve Martin   Claire Danes
## 482              10             3       Woody Allen     Mia Farrow
## 532               9             2      Meryl Streep   Alec Baldwin
## 76               25             1     Fredric March  Veronica Lake
## 1112              1             1    Eddie Redmayne Felicity Jones
##      character_1_gender character_2_gender actor_1_birthdate actor_2_birthdate
## 298                 man              woman        11-11-1974        19-09-1989
## 17                  man              woman        14-08-1945        12-04-1979
## 482                 man              woman        01-12-1935        09-02-1945
## 532               woman                man        22-06-1949        03-04-1958
## 76                  man              woman        1897-08-31        14-11-1922
## 1112                man              woman        26-01-1982        17-10-1983
##      actor_1_age actor_2_age
## 298           45          30
## 17            60          26
## 482           57          47
## 532           60          51
## 76            45          20
## 1112          32          31
# head() returns six rows by default, so the label says six.
print("First 6 rows of df_3:")
## [1] "First 6 rows of df_3:"
head(df_3)
##                           movie_name release_year              director
## 956                       Trainwreck         2015           Judd Apatow
## 474                             EDtv         1999            Ron Howard
## 383                    Casino Royale         2006       Martin Campbell
## 353   How Stella Got Her Groove Back         1998 Kevin Rodney Sullivan
## 383.1                  Casino Royale         2006       Martin Campbell
## 155                        Get Smart         2008           Peter Segal
##       age_difference couple_number    actor_1_name  actor_2_name
## 956                3             2      Bill Hader   Amy Schumer
## 474               10             1 Woody Harrelson  Jenna Elfman
## 383               12             1    Daniel Craig     Eva Green
## 353               13             1  Angela Bassett    Taye Diggs
## 383.1             12             1    Daniel Craig     Eva Green
## 155               20             1    Steve Carell Anne Hathaway
##       character_1_gender character_2_gender actor_1_birthdate actor_2_birthdate
## 956                  man              woman        07-06-1978        01-06-1981
## 474                  man              woman        23-07-1961        30-09-1971
## 383                  man              woman        02-03-1968        06-07-1980
## 353                woman                man        16-08-1958        02-01-1971
## 383.1                man              woman        02-03-1968        06-07-1980
## 155                  man              woman        16-08-1962        12-11-1982
##       actor_1_age actor_2_age
## 956            37          34
## 474            38          28
## 383            38          26
## 353            40          27
## 383.1          38          26
## 155            46          26
# head() returns six rows by default, so the label says six.
print("First 6 rows of df_4:")
## [1] "First 6 rows of df_4:"
head(df_4)
##                    movie_name release_year         director age_difference
## 1153                  Tolkien         2019   Dome Karukoski              0
## 506  The Time Traveler's Wife         2009 Robert Schwentke             10
## 470              Daddy's Home         2015      Sean Anders             10
## 1149                The Crush         1993     Alan Shapiro              0
## 617                 Tommy Boy         1995      Peter Segal              8
## 932                   Savages         2007        undefined              3
##      couple_number   actor_1_name         actor_2_name character_1_gender
## 1153             1   Lily Collins       Nicholas Hoult              woman
## 506              1      Eric Bana       Rachel McAdams                man
## 470              1  Mark Wahlberg  Alessandra Ambrosio                man
## 1149             2 Jennifer Rubin           Cary Elwes              woman
## 617              2       Bo Derek             Rob Lowe              woman
## 932              2   Blake Lively Aaron Taylor-Johnson              woman
##      character_2_gender actor_1_birthdate actor_2_birthdate actor_1_age
## 1153                man        18-03-1989        07-12-1989          30
## 506               woman        09-08-1968        17-11-1978          41
## 470               woman        05-06-1971        11-04-1981          44
## 1149                man        03-04-1962        26-10-1962          31
## 617                 man        20-11-1956        17-03-1964          39
## 932                 man        25-08-1987        13-06-1990          20
##      actor_2_age
## 1153          30
## 506           31
## 470           34
## 1149          31
## 617           31
## 932           17
# head() returns six rows by default, so the label says six.
print("First 6 rows of df_5:")
## [1] "First 6 rows of df_5:"
head(df_5)
##             movie_name release_year            director age_difference
## 316  Because I Said So         2007     Michael Lehmann             14
## 294         Home Again         2017 Hallie Meyers-Shyer             15
## 276   The Family Stone         2007      Thomas Bezucha             16
## 760    Days of Thunder         1990          Tony Scott              5
## 1118              Troy         2004   Wolfgang Petersen              1
## 699    Chasing Liberty         2004         Andy Cadiff              6
##      couple_number      actor_1_name   actor_2_name character_1_gender
## 316              1 Tom Everett Scott    Mandy Moore                man
## 294              1 Reese Witherspoon Pico Alexander              woman
## 276              1   Dermot Mulroney   Claire Danes                man
## 760              1        Tom Cruise  Nicole Kidman                man
## 1118             2      Diane Kruger  Orlando Bloom              woman
## 699              1     Matthew Goode    Mandy Moore                man
##      character_2_gender actor_1_birthdate actor_2_birthdate actor_1_age
## 316               woman        07-09-1970        10-04-1984          37
## 294                 man        21-03-1976        03-06-1991          41
## 276               woman        31-10-1963        12-04-1979          44
## 760               woman        03-07-1962        20-06-1967          28
## 1118                man        15-07-1976        13-01-1977          28
## 699               woman        03-04-1978        10-04-1984          26
##      actor_2_age
## 316           23
## 294           26
## 276           28
## 760           23
## 1118          27
## 699           20

Insight Gained:

Significance:

Further Questions:

1. Do specific patterns or trends appear or disappear consistently across subsamples?

2. Are some observations regularly overrepresented or underrepresented in subsamples?

3. How does variability in subsamples affect summary statistics like means, standard deviations, and other critical metrics?

#' Summarise one subsample and bootstrap the mean of a numeric column
#'
#' Prints `summary()` of the subsample, prints a cross-tabulation of
#' `character_1_gender` against `character_2_gender`, and draws a histogram of
#' resampled means of one column.
#'
#' @param subsample Data frame containing `character_1_gender`,
#'   `character_2_gender` and the column named by `column_to_simulate`.
#' @param subsample_number Label printed in the header line.
#' @param column_to_simulate Name of the column whose mean is resampled.
#' @param num_simulations Number of resampling replicates.
#' @return The value of the final `hist()` call (a histogram object).
analyze_subsample <- function(subsample, subsample_number,
                              column_to_simulate = "age_difference",
                              num_simulations = 1000) {
  cat("Analysis for Subsample", subsample_number, "\n")

  summary_stats <- summary(subsample)
  print(summary_stats)

  # Count every combination of the two gender columns.
  anomalies <- as.data.frame(
    table(subsample$character_1_gender, subsample$character_2_gender)
  )
  colnames(anomalies) <- c("character_1_gender", "character_2_gender", "Count")

  cat("Anomalies in character_1_gender and character_2_gender:\n")
  print(anomalies)

  values <- subsample[[column_to_simulate]]
  if (is.null(values)) {
    stop(
      "Column '", column_to_simulate, "' not found in subsample.",
      call. = FALSE
    )
  }

  # Resample by index rather than calling sample(values, ...) directly:
  # sample() treats a single number x >= 1 as 1:x, so a one-row subsample
  # would otherwise draw from the wrong values. For longer vectors the
  # draws are the same as sample(values, replace = TRUE).
  n_values <- length(values)
  simulated_means <- replicate(
    num_simulations,
    mean(values[sample.int(n_values, replace = TRUE)])
  )

  cat("Monte Carlo Simulation for", column_to_simulate, "Mean:\n")
  hist(
    simulated_means,
    main = paste("Distribution of Simulated Means for", column_to_simulate)
  )
}
# Collect the five subsamples in one list so they can be analysed in a loop.
list_dfs <- list(df_1, df_2, df_3, df_4, df_5)

# Analyse each subsample in turn, labelling it by its position in the list.
for (i in seq_along(list_dfs)) {
  analyze_subsample(list_dfs[[i]], i)
}
## Analysis for Subsample 1 
##   movie_name         release_year    director         age_difference 
##  Length:578         Min.   :1939   Length:578         Min.   : 0.00  
##  Class :character   1st Qu.:1996   Class :character   1st Qu.: 3.00  
##  Mode  :character   Median :2003   Mode  :character   Median : 8.00  
##                     Mean   :2001                      Mean   :10.04  
##                     3rd Qu.:2012                      3rd Qu.:15.00  
##                     Max.   :2022                      Max.   :50.00  
##  couple_number   actor_1_name       actor_2_name       character_1_gender
##  Min.   :1.000   Length:578         Length:578         Length:578        
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.382                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :6.000                                                           
##  character_2_gender actor_1_birthdate  actor_2_birthdate   actor_1_age   
##  Length:578         Length:578         Length:578         Min.   :18.00  
##  Class :character   Class :character   Class :character   1st Qu.:32.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :38.00  
##                                                           Mean   :39.92  
##                                                           3rd Qu.:46.00  
##                                                           Max.   :79.00  
##   actor_2_age   
##  Min.   :17.00  
##  1st Qu.:25.00  
##  Median :29.00  
##  Mean   :29.88  
##  3rd Qu.:34.00  
##  Max.   :61.00  
## Anomalies in character_1_gender and character_2_gender:
##   character_1_gender character_2_gender Count
## 1                man                man     8
## 2              woman                man   107
## 3                man              woman   459
## 4              woman              woman     4
## Monte Carlo Simulation for age_difference Mean:

## Analysis for Subsample 2 
##   movie_name         release_year    director         age_difference
##  Length:578         Min.   :1935   Length:578         Min.   : 0.0  
##  Class :character   1st Qu.:1998   Class :character   1st Qu.: 3.0  
##  Mode  :character   Median :2005   Mode  :character   Median : 8.0  
##                     Mean   :2002                      Mean   :10.6  
##                     3rd Qu.:2012                      3rd Qu.:15.0  
##                     Max.   :2022                      Max.   :52.0  
##  couple_number actor_1_name       actor_2_name       character_1_gender
##  Min.   :1.0   Length:578         Length:578         Length:578        
##  1st Qu.:1.0   Class :character   Class :character   Class :character  
##  Median :1.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.4                                                           
##  3rd Qu.:2.0                                                           
##  Max.   :6.0                                                           
##  character_2_gender actor_1_birthdate  actor_2_birthdate   actor_1_age   
##  Length:578         Length:578         Length:578         Min.   :18.00  
##  Class :character   Class :character   Class :character   1st Qu.:32.25  
##  Mode  :character   Mode  :character   Mode  :character   Median :39.00  
##                                                           Mean   :40.84  
##                                                           3rd Qu.:48.00  
##                                                           Max.   :79.00  
##   actor_2_age   
##  Min.   :17.00  
##  1st Qu.:24.00  
##  Median :29.00  
##  Mean   :30.24  
##  3rd Qu.:34.00  
##  Max.   :60.00  
## Anomalies in character_1_gender and character_2_gender:
##   character_1_gender character_2_gender Count
## 1                man                man    11
## 2              woman                man   113
## 3                man              woman   446
## 4              woman              woman     8
## Monte Carlo Simulation for age_difference Mean:

## Analysis for Subsample 3 
##   movie_name         release_year    director         age_difference
##  Length:578         Min.   :1936   Length:578         Min.   : 0.0  
##  Class :character   1st Qu.:1996   Class :character   1st Qu.: 4.0  
##  Mode  :character   Median :2004   Mode  :character   Median : 9.0  
##                     Mean   :2000                      Mean   :10.5  
##                     3rd Qu.:2011                      3rd Qu.:15.0  
##                     Max.   :2021                      Max.   :52.0  
##  couple_number   actor_1_name       actor_2_name       character_1_gender
##  Min.   :1.000   Length:578         Length:578         Length:578        
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.394                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :6.000                                                           
##  character_2_gender actor_1_birthdate  actor_2_birthdate   actor_1_age   
##  Length:578         Length:578         Length:578         Min.   :19.00  
##  Class :character   Class :character   Class :character   1st Qu.:34.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :39.00  
##                                                           Mean   :40.58  
##                                                           3rd Qu.:47.00  
##                                                           Max.   :79.00  
##   actor_2_age   
##  Min.   :17.00  
##  1st Qu.:25.00  
##  Median :29.00  
##  Mean   :30.08  
##  3rd Qu.:34.00  
##  Max.   :58.00  
## Anomalies in character_1_gender and character_2_gender:
##   character_1_gender character_2_gender Count
## 1                man                man     7
## 2              woman                man   105
## 3                man              woman   463
## 4              woman              woman     3
## Monte Carlo Simulation for age_difference Mean:

## Analysis for Subsample 4 
##   movie_name         release_year    director         age_difference 
##  Length:578         Min.   :1935   Length:578         Min.   : 0.00  
##  Class :character   1st Qu.:1997   Class :character   1st Qu.: 3.00  
##  Mode  :character   Median :2004   Mode  :character   Median : 8.00  
##                     Mean   :2001                      Mean   :10.67  
##                     3rd Qu.:2011                      3rd Qu.:15.00  
##                     Max.   :2022                      Max.   :52.00  
##  couple_number   actor_1_name       actor_2_name       character_1_gender
##  Min.   :1.000   Length:578         Length:578         Length:578        
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.407                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :7.000                                                           
##  character_2_gender actor_1_birthdate  actor_2_birthdate   actor_1_age  
##  Length:578         Length:578         Length:578         Min.   :19.0  
##  Class :character   Class :character   Class :character   1st Qu.:34.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :39.0  
##                                                           Mean   :40.9  
##                                                           3rd Qu.:47.0  
##                                                           Max.   :79.0  
##   actor_2_age   
##  Min.   :17.00  
##  1st Qu.:25.00  
##  Median :29.00  
##  Mean   :30.23  
##  3rd Qu.:34.00  
##  Max.   :68.00  
## Anomalies in character_1_gender and character_2_gender:
##   character_1_gender character_2_gender Count
## 1                man                man     5
## 2              woman                man   108
## 3                man              woman   461
## 4              woman              woman     4
## Monte Carlo Simulation for age_difference Mean:

## Analysis for Subsample 5 
##   movie_name         release_year    director         age_difference 
##  Length:578         Min.   :1935   Length:578         Min.   : 0.00  
##  Class :character   1st Qu.:1995   Class :character   1st Qu.: 4.00  
##  Mode  :character   Median :2003   Mode  :character   Median : 8.50  
##                     Mean   :2000                      Mean   :10.67  
##                     3rd Qu.:2011                      3rd Qu.:16.00  
##                     Max.   :2022                      Max.   :50.00  
##  couple_number   actor_1_name       actor_2_name       character_1_gender
##  Min.   :1.000   Length:578         Length:578         Length:578        
##  1st Qu.:1.000   Class :character   Class :character   Class :character  
##  Median :1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.396                                                           
##  3rd Qu.:2.000                                                           
##  Max.   :7.000                                                           
##  character_2_gender actor_1_birthdate  actor_2_birthdate   actor_1_age  
##  Length:578         Length:578         Length:578         Min.   :19.0  
##  Class :character   Class :character   Class :character   1st Qu.:33.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :39.0  
##                                                           Mean   :40.4  
##                                                           3rd Qu.:47.0  
##                                                           Max.   :79.0  
##   actor_2_age   
##  Min.   :17.00  
##  1st Qu.:24.00  
##  Median :29.00  
##  Mean   :29.73  
##  3rd Qu.:34.00  
##  Max.   :68.00  
## Anomalies in character_1_gender and character_2_gender:
##   character_1_gender character_2_gender Count
## 1                man                man     5
## 2              woman                man    96
## 3                man              woman   474
## 4              woman              woman     3
## Monte Carlo Simulation for age_difference Mean:

Insights Gathered:

Significance:

Further Investigation:

  1. Analyze the factors that influence age differences across subsamples. Are some films or filmmakers associated with larger or smaller age differences?

  2. Understand the nature of anomalies and, if necessary, clean up the data. Are there any trends or commonalities among the misassignments?

  3. Examine the stability of mean age differences across Monte Carlo simulations. Is the mean stable from run to run, or highly variable?