# Load the age-gap records from a local CSV export.
# NOTE(review): this is an absolute, machine-specific path, so the script
# only runs where that exact file exists.
data <- read.csv(
  file = "C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV"
)

Grouping 1: Director’s Age

# Mean age difference per director. aggregate() names the result columns
# Group.1 (the director) and x (the mean).
group_by_director <- aggregate(
  x = data$age_difference,
  by = list(data$director),
  FUN = mean
)

# Every director tied for the smallest mean age difference.
smallest_mean_gap <- min(group_by_director$x)
lowest_probability_director <-
  group_by_director$Group.1[group_by_director$x == smallest_mean_gap]

# Tag each row: "Lowest Probability" for those directors, "Other" otherwise.
data[["director_group"]] <- "Other"
in_lowest_director_group <- data$director %in% lowest_probability_director
data$director_group[in_lowest_director_group] <- "Lowest Probability"

Grouping 2: Character Genders

# Mean age of each actor for every (character 1 gender, character 2 gender)
# pair. Result columns: Group.1, Group.2, V1 (actor 1) and V2 (actor 2).
actor_ages <- cbind(data$actor_1_age, data$actor_2_age)
gender_pairs <- list(data$character_1_gender, data$character_2_gender)
group_by_gender <- aggregate(actor_ages, by = gender_pairs, FUN = mean)

# The pair whose two mean ages add up to the smallest total.
combined_mean_age <- group_by_gender$V1 + group_by_gender$V2
lowest_probability_gender_combination <-
  group_by_gender[which.min(combined_mean_age), c("Group.1", "Group.2")]

# Tag each row: "Lowest Probability" when both genders match that pair,
# "Other" otherwise.
data[["gender_group"]] <- "Other"
matches_lowest_pair <-
  data$character_1_gender == lowest_probability_gender_combination$Group.1 &
  data$character_2_gender == lowest_probability_gender_combination$Group.2
data$gender_group[matches_lowest_pair] <- "Lowest Probability"

Grouping 3: Release Decades

# Bin release years into decades. cut() returns a factor and as.numeric()
# keeps its level index, so release_decade is a bin index, not a year:
# 1 = 1920s, 2 = 1930s, ..., 11 = 2020s.
# right = FALSE makes every bin [start, start + 10), so a decade-start year
# such as 2010 is counted in the decade it opens rather than the previous
# one, and 1920 itself is no longer turned into NA. (2030 now falls outside
# the bins.)
decade_breaks <- seq(1920, 2030, by = 10)
data$release_decade <- as.numeric(
  cut(data$release_year, breaks = decade_breaks, right = FALSE)
)

# Number of rows per decade index (columns: Group.1 = index, x = count).
# Only decades that occur in the data get a row.
group_by_decade <- aggregate(
  data$movie_name,
  by = list(data$release_decade),
  FUN = length
)

# Every decade index tied for the smallest row count.
fewest_rows <- min(group_by_decade$x)
lowest_probability_decade <-
  group_by_decade[group_by_decade$x == fewest_rows, "Group.1"]

# Tag each row: "Lowest Probability" for those decades, "Other" otherwise.
data$decade_group <- "Other"
in_lowest_decade <- data$release_decade %in% lowest_probability_decade
data$decade_group[in_lowest_decade] <- "Lowest Probability"

Printing the lowest probability group for every group:

# Director(s) tied for the smallest mean age difference.
print(lowest_probability_director)
## [1] "Alejandro Agresti" "Chen Kaige"        "Dome Karukoski"   
## [4] "Jan de Bont"       "Jim Field Smith"   "Tyler Perry"
# Character-gender pair with the smallest combined mean actor age.
print(lowest_probability_gender_combination)
##   Group.1 Group.2
## 2   woman     man
# Decade bin indices (positions from cut(), not calendar years) with the
# fewest rows.
print(lowest_probability_decade)
## [1] 2 3
# First rows of the data, including the added group and release_decade columns.
head(data)
##           movie_name release_year      director age_difference couple_number
## 1   Harold and Maude         1971     Hal Ashby             52             1
## 2              Venus         2006 Roger Michell             50             1
## 3 The Quiet American         2002 Phillip Noyce             49             1
## 4   The Big Lebowski         1998     Joel Coen             45             1
## 5          Beginners         2010    Mike Mills             43             1
## 6         Poison Ivy         1992     Katt Shea             42             1
##          actor_1_name    actor_2_name character_1_gender character_2_gender
## 1         Ruth Gordon        Bud Cort              woman                man
## 2       Peter O'Toole Jodie Whittaker                man              woman
## 3       Michael Caine  Do Thi Hai Yen                man              woman
## 4    David Huddleston       Tara Reid                man              woman
## 5 Christopher Plummer   Goran Visnjic                man                man
## 6        Tom Skerritt  Drew Barrymore                man              woman
##   actor_1_birthdate actor_2_birthdate actor_1_age actor_2_age director_group
## 1        1896-10-30        29-03-1948          75          23          Other
## 2        02-08-1932        03-06-1982          74          24          Other
## 3        14-03-1933        01-10-1982          69          20          Other
## 4        17-09-1930        08-11-1975          68          23          Other
## 5        13-12-1929        09-09-1972          81          38          Other
## 6        25-08-1933        22-02-1975          59          17          Other
##         gender_group release_decade decade_group
## 1 Lowest Probability              6        Other
## 2              Other              9        Other
## 3              Other              9        Other
## 4              Other              8        Other
## 5              Other              9        Other
## 6              Other              8        Other

Visualization for Grouping 1: Director’s Age

library(ggplot2)

# Bar chart with one bar per director; bar height is the mean age difference.
ggplot(data = group_by_director, mapping = aes(x = Group.1, y = x)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average Age Difference by Director",
    x = "Director",
    y = "Average Age Difference"
  )

Visualization for Grouping 2: Character Genders

# Mean age of actor 1 (V1) for each character-gender pair: x is character 1's
# gender (Group.1) and fill is character 2's gender (Group.2).
# Bars are dodged rather than stacked: stacking would add two group means
# together, producing a bar height that is not an average age of anything.
ggplot(group_by_gender, aes(x = Group.1, y = V1, fill = Group.2)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  labs(title = "Average Age of Actors by Character Genders",
       x = "Character 1 Gender",
       y = "Average Age",
       fill = "Character 2 Gender") +
  scale_fill_manual(values = c("man" = "skyblue", "woman" = "pink"))

Visualization for Grouping 3: Release Decades

# Line chart of the row count (x) for each decade index (Group.1).
ggplot(data = group_by_decade, mapping = aes(x = Group.1, y = x)) +
  geom_line(colour = "blue") +
  labs(
    x = "Release Decade",
    y = "Number of Movies",
    title = "Number of Movies Released Over Decades"
  )

Conclusions and Testable Hypotheses:

1. Lowest Probability Director Group:

  • Conclusion: Directors in the lowest probability group may take different approaches that are not widely recognized or understood by general audiences.

  • Testable Hypothesis: Compare the reviews and audience ratings of films directed by people in the lowest probability category with those of films made by mainstream directors. The hypothesis is that a particular audience rates these films more highly.

2. Lowest Probability Gender Combination Group:

  • Conclusion: Movies with the lowest probability gender combination question standard gender stereotypes, providing unusual character dynamics that may oppose standard expectations.

  • Testable Hypothesis: Analyze films with the lowest probability gender combination to find instances where established gender roles are subverted. The hypothesis is that these films are popular because of their unique character representations.

3. Lowest Probability Decade Group:

  • Conclusion: Movies released during a particular decade may have different historical or cultural settings that contribute to their poor representation.

  • Testable Hypothesis: Investigate the historical events and cultural shifts in the decade with the lowest probability. The hypothesis is that these external factors influenced movie production and reception, resulting in fewer releases at the time.

A combination that does not exist:

# Gender pairs that actually occur in the data, as "<gender 1> <gender 2>".
unique_combinations <- unique(
  paste(data$character_1_gender, data$character_2_gender)
)

# Every pair that can be formed from the values seen in each column.
gender_grid <- expand.grid(
  Var1 = unique(data$character_1_gender),
  Var2 = unique(data$character_2_gender)
)
all_combinations <- unique(paste(gender_grid$Var1, gender_grid$Var2))

# Possible pairs that never occur in the data.
missing_combinations <- setdiff(all_combinations, unique_combinations)

missing_combinations
## character(0)
print(missing_combinations)
## character(0)

Most/Least Common Combinations:

# Count how many rows carry each character-gender pair.
# The counts must come from the per-row labels: tabulating a de-duplicated
# vector of labels can only ever give a count of 1 for every pair.
gender_combinations <- paste(data$character_1_gender, data$character_2_gender)
frequency_table <- table(gender_combinations)

frequency_table
## (output not shown: re-run to print the per-pair counts; the earlier
## all-ones table came from counting the de-duplicated labels)

Observations:

We considered two columns, one holding the gender of each character, and each column contains only two values. As a result, there are only four possible combinations. Since all four combinations appear in the data set, there are no missing combinations for these two columns.

Visualizing Combinations:

library(ggplot2)

# One bar per gender pair, ordered from the highest frequency to the lowest.
combination_counts <- data.frame(
  combination = names(frequency_table),
  frequency = as.numeric(frequency_table)
)

ggplot(
  combination_counts,
  aes(x = reorder(combination, -frequency), y = frequency, fill = combination)
) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Frequency of Character Gender Combinations",
    x = "Character Gender Combinations",
    y = "Frequency"
  )

Grouping 1: Director’s Age

Insights: The code groups the data by director and computes the average age difference for each one. It then selects the director(s) with the smallest average age difference and labels them as the “Lowest Probability” group in the original data set.

Significance: The “Lowest Probability” director group may include directors whose films, on average, feature a unique age dynamic between characters. It could indicate a distinct storytelling style or general concentration in their films.

Further Questions:

  1. What aspects in the films of the “Lowest Probability” directors contribute to the smaller average age difference? Is it an intentional choice in storytelling or a recurring theme?

  2. How do audiences react to films by “Lowest Probability” directors? Is their appeal more limited, or does the distinct age dynamic attract a particular demographic?

Grouping 2: Character Genders

Insights: The code groups the data by character genders (character_1_gender and character_2_gender) and computes the average age of both actors for each gender combination. It then selects the gender combination with the lowest combined average age and labels it as the “Lowest Probability” group in the original data frame.

Significance: The “Lowest Probability” gender combination group may highlight a unique or less common representation of gender dynamics in films, as seen by the lower average age.

Further Questions:

  1. What character relationship dynamics are linked to the “Lowest Probability” gender combination? Does it represent a certain kind of relationship or storyline?

  2. Are certain genres more likely to include the “Lowest Probability” gender combination? How do gender representations change among movie genres?

Grouping 3: Release Decades

Insights: The code generates a new column called release_decade by categorizing the release years into decades. It then groups the data by release decade and counts the number of films in each decade. The release decade with the lowest count is labeled as the “Lowest Probability” group in the original data frame.

Significance: The “Lowest Probability” decade group indicates that there is a decade with fewer movies released than other decades. This could point to a less common or distinctive period for movie releases.

Further Questions:

  1. What historical or cultural factors influenced the number of films released during the “Lowest Probability” decade? Were there any important events or changes in the industry at that time?

  2. Does the “Lowest Probability” decade group correspond to particular genre trends? Were particular genres more or less popular during that time?