# Load the age-gap dataset from a local CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific; adjust before
# running elsewhere.
data <- read.csv(file = "C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV")
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
## 
##     index
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last

I am choosing the “release_year” column and converting it to a date (January 1 of each release year).

# Turn each release year into a Date pinned to 1 January of that year, so the
# column can serve as a time index.
data[["release_date"]] <- as.Date(paste0(data[["release_year"]], "-01-01"))

# The year is now carried by release_date, so drop the original column.
data[["release_year"]] <- NULL
head(data[["release_date"]], 20)
##  [1] "1971-01-01" "2006-01-01" "2002-01-01" "1998-01-01" "2010-01-01"
##  [6] "1992-01-01" "2009-01-01" "1999-01-01" "1992-01-01" "1999-01-01"
## [11] "1989-01-01" "1948-01-01" "1995-01-01" "2003-01-01" "2004-01-01"
## [16] "2003-01-01" "2005-01-01" "2010-01-01" "1981-01-01" "2002-01-01"

Now, I am sorting the data by release date.

# Order the rows chronologically by release date (earliest first) and preview
# the first few rows.
sorted_df <- arrange(data, release_date)
head(sorted_df)
##         movie_name           director age_difference couple_number
## 1 Star of Midnight    Stephen Roberts             19             1
## 2    Captain Blood     Michael Curtiz              7             1
## 3     Modern Times    Charlie Chaplin             21             1
## 4    Stella Dallas         King Vidor             12             1
## 5   A Star Is Born William A. Wellman              9             3
## 6    Stella Dallas         King Vidor              1             2
##      actor_1_name        actor_2_name character_1_gender character_2_gender
## 1  William Powell       Ginger Rogers                man              woman
## 2     Errol Flynn Olivia de Havilland                man              woman
## 3 Charlie Chaplin    Paulette Goddard                man              woman
## 4      John Boles    Barbara Stanwyck                man              woman
## 5   Fredric March        Janet Gaynor                man              woman
## 6    Anne Shirley            Tim Holt              woman                man
##   actor_1_birthdate actor_2_birthdate actor_1_age actor_2_age release_date
## 1        1892-06-29        16-07-1911          43          24   1935-01-01
## 2        20-06-1909        01-07-1916          26          19   1935-01-01
## 3        1889-04-16        03-06-1910          47          26   1936-01-01
## 4        1895-10-28        16-07-1907          42          30   1937-01-01
## 5        1897-08-31        06-10-1906          40          31   1937-01-01
## 6        17-04-1918        05-02-1919          19          18   1937-01-01

I am now analyzing the mean of the age differences by release date.

# Average the age difference over all couples that share a release date,
# ignoring missing values, giving one row per release date.
mean_age_difference <- summarize(
  group_by(sorted_df, release_date),
  mean_age_difference = mean(age_difference, na.rm = TRUE)
)
mean_age_difference
## # A tibble: 82 × 2
##    release_date mean_age_difference
##    <date>                     <dbl>
##  1 1935-01-01                 13   
##  2 1936-01-01                 21   
##  3 1937-01-01                  7.33
##  4 1939-01-01                 12   
##  5 1940-01-01                 11.3 
##  6 1942-01-01                 20.5 
##  7 1944-01-01                 25   
##  8 1946-01-01                 25   
##  9 1947-01-01                 25   
## 10 1948-01-01                 23.2 
## # ℹ 72 more rows
# Line chart of the mean age difference across the full range of release dates.
ggplot(data = mean_age_difference) +
  geom_line(mapping = aes(x = release_date, y = mean_age_difference)) +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )

As we can see from the graph above, there is no noticeable trend over the full date range. To look more closely, we take a subset of the data covering 1990–2018.

# Keep only the yearly means whose release date falls within 1990-2018
# (both bounds inclusive), then display the result.
filtered_mean_age_difference <- filter(
  mean_age_difference,
  release_date >= as.Date("1990-01-01"),
  release_date <= as.Date("2018-12-31")
)

print(filtered_mean_age_difference)
## # A tibble: 29 × 2
##    release_date mean_age_difference
##    <date>                     <dbl>
##  1 1990-01-01                 11   
##  2 1991-01-01                 13.1 
##  3 1992-01-01                 15.9 
##  4 1993-01-01                  9.13
##  5 1994-01-01                  8.14
##  6 1995-01-01                 12.3 
##  7 1996-01-01                  9.39
##  8 1997-01-01                  7.51
##  9 1998-01-01                 13.9 
## 10 1999-01-01                  8.8 
## # ℹ 19 more rows
# Line chart of the mean age difference restricted to the 1990-2018 subset.
ggplot(data = filtered_mean_age_difference) +
  geom_line(mapping = aes(x = release_date, y = mean_age_difference)) +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )

Overall, we can see a downward trend from 1990 to 2018, but there also seems to be a cyclic (seasonal-like) pattern in the data, with a repeating sequence of downward and upward movements every few years.

Linear Regression Model

# Overlay a straight-line least-squares fit (no confidence band) on the
# 1990-2018 series; the smooth layer is added first so the observed line is
# drawn on top of it.
ggplot(
  data = filtered_mean_age_difference,
  mapping = aes(x = release_date, y = mean_age_difference)
) +
  geom_smooth(se = FALSE, method = "lm") +
  geom_line() +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )
## `geom_smooth()` using formula = 'y ~ x'

As we previously suspected, there is a downward trend throughout the data. This is evident once the regression line is drawn.

library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Time-series diagnostics for the yearly mean age difference (1990-2018).
#
# The series is built from the observed values (mean age difference per
# release year) rather than from the release dates themselves: the
# autocorrelation of a date column taken in raw file order says nothing about
# how the age gap evolves over time. The object names are kept unchanged so
# that any later code referring to them still works.
release_years <- as.integer(
  format(filtered_mean_age_difference$release_date, "%Y")
)

# ts() assumes equally spaced observations, so insist on exactly one row per
# consecutive year before building the series.
stopifnot(
  length(release_years) > 1L,
  all(diff(release_years) == 1L)
)

ts_release_years <- ts(
  filtered_mean_age_difference$mean_age_difference,
  start = release_years[1],
  frequency = 1
)

# Autocorrelation and partial autocorrelation, computed without plotting so
# that each plot can be given its own title below.
acf_release_years <- acf(ts_release_years, plot = FALSE)
pacf_release_years <- pacf(ts_release_years, plot = FALSE)

plot(acf_release_years, main = "ACF for Mean Age Difference by Release Year")

plot(pacf_release_years, main = "PACF for Mean Age Difference by Release Year")

# Lowess smooth of the yearly means against the year, to show the underlying
# trend without the year-to-year noise.
smoothed_data <- lowess(time(ts_release_years), ts_release_years)

plot(
  smoothed_data,
  type = "l",
  col = "red",
  xlab = "Release Year",
  ylab = "Mean Age Difference",
  main = "Lowess Smoothed Mean Age Difference by Release Year"
)