# Load the age-gap dataset from its CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is on a machine where this file exists at this location.
data <- read.csv(
  file = "C:\\Users\\varsh\\OneDrive\\Desktop\\Gitstuff\\age_gaps.CSV"
)
library(tsibble)
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
##
## index
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Build a Date column from the release year (each row is pinned to 1 January
# of its year), then drop the now-redundant year column.
data[["release_date"]] <- as.Date(paste0(data[["release_year"]], "-01-01"))
data[["release_year"]] <- NULL
# Preview the first 20 converted dates.
head(data[["release_date"]], n = 20)
## [1] "1971-01-01" "2006-01-01" "2002-01-01" "1998-01-01" "2010-01-01"
## [6] "1992-01-01" "2009-01-01" "1999-01-01" "1992-01-01" "1999-01-01"
## [11] "1989-01-01" "1948-01-01" "1995-01-01" "2003-01-01" "2004-01-01"
## [16] "2003-01-01" "2005-01-01" "2010-01-01" "1981-01-01" "2002-01-01"
Next, I sort the data by release date.
# Order the rows by ascending release date and preview the first few.
sorted_df <- arrange(data, release_date)
head(sorted_df)
## movie_name director age_difference couple_number
## 1 Star of Midnight Stephen Roberts 19 1
## 2 Captain Blood Michael Curtiz 7 1
## 3 Modern Times Charlie Chaplin 21 1
## 4 Stella Dallas King Vidor 12 1
## 5 A Star Is Born William A. Wellman 9 3
## 6 Stella Dallas King Vidor 1 2
## actor_1_name actor_2_name character_1_gender character_2_gender
## 1 William Powell Ginger Rogers man woman
## 2 Errol Flynn Olivia de Havilland man woman
## 3 Charlie Chaplin Paulette Goddard man woman
## 4 John Boles Barbara Stanwyck man woman
## 5 Fredric March Janet Gaynor man woman
## 6 Anne Shirley Tim Holt woman man
## actor_1_birthdate actor_2_birthdate actor_1_age actor_2_age release_date
## 1 1892-06-29 16-07-1911 43 24 1935-01-01
## 2 20-06-1909 01-07-1916 26 19 1935-01-01
## 3 1889-04-16 03-06-1910 47 26 1936-01-01
## 4 1895-10-28 16-07-1907 42 30 1937-01-01
## 5 1897-08-31 06-10-1906 40 31 1937-01-01
## 6 17-04-1918 05-02-1919 19 18 1937-01-01
I now compute the mean age difference for each release date.
# Average the age difference within each release date, ignoring missing
# values, and display the resulting summary table.
mean_age_difference <- summarise(
  group_by(sorted_df, release_date),
  mean_age_difference = mean(age_difference, na.rm = TRUE)
)
mean_age_difference
## # A tibble: 82 × 2
## release_date mean_age_difference
## <date> <dbl>
## 1 1935-01-01 13
## 2 1936-01-01 21
## 3 1937-01-01 7.33
## 4 1939-01-01 12
## 5 1940-01-01 11.3
## 6 1942-01-01 20.5
## 7 1944-01-01 25
## 8 1946-01-01 25
## 9 1947-01-01 25
## 10 1948-01-01 23.2
## # ℹ 72 more rows
# Line chart of the per-date mean age difference across the whole dataset.
ggplot(
  data = mean_age_difference,
  mapping = aes(x = release_date, y = mean_age_difference)
) +
  geom_line() +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )
As the graph above shows, there is no noticeable trend over the full period. To look more closely, we take a subset of the data covering 1990 to 2018.
# Keep only the summary rows dated from 1990-01-01 through 2018-12-31
# (both bounds inclusive), then print the subset.
filtered_mean_age_difference <- filter(
  mean_age_difference,
  release_date >= as.Date("1990-01-01"),
  release_date <= as.Date("2018-12-31")
)
print(filtered_mean_age_difference)
## # A tibble: 29 × 2
## release_date mean_age_difference
## <date> <dbl>
## 1 1990-01-01 11
## 2 1991-01-01 13.1
## 3 1992-01-01 15.9
## 4 1993-01-01 9.13
## 5 1994-01-01 8.14
## 6 1995-01-01 12.3
## 7 1996-01-01 9.39
## 8 1997-01-01 7.51
## 9 1998-01-01 13.9
## 10 1999-01-01 8.8
## # ℹ 19 more rows
# Line chart of the per-date mean age difference for the filtered subset.
ggplot(
  data = filtered_mean_age_difference,
  mapping = aes(x = release_date, y = mean_age_difference)
) +
  geom_line() +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )
Overall, we can see a downward trend from 1990 to 2018, but there also appears to be a cyclical (seasonal-like) pattern in the data, with alternating downward and upward movements repeating every few years.
# Same filtered line chart, with a straight linear-regression line (no
# confidence band) added as the first layer, before the line itself.
ggplot(
  data = filtered_mean_age_difference,
  mapping = aes(x = release_date, y = mean_age_difference)
) +
  geom_smooth(se = FALSE, method = "lm") +
  geom_line() +
  labs(
    title = "Mean Age Difference Over Release Year",
    x = "Release Date",
    y = "Mean Age Difference"
  )
## `geom_smooth()` using formula = 'y ~ x'
As we previously suspected, there is a downward trend throughout the data. This is evident when the regression line is drawn.
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Autocorrelation diagnostics and a lowess smooth for the movie release years,
# taken in the row order of `data`.
#
# ts() replaces the Date class with "ts", leaving the underlying day counts
# since 1970-01-01. Extract the calendar year first so the series (and the
# y-axis of the lowess plot) is in years, matching the "Release Years" titles.
#
# NOTE(review): `data` appears to be in file order rather than chronological
# order, so these diagnostics describe the ordering of rows in the CSV, not an
# evolution over time -- confirm this is what the analysis intends.
ts_release_years <- ts(as.numeric(format(data$release_date, "%Y")))

# Compute the (partial) autocorrelation functions without plotting, then plot
# them with explicit titles.
acf_release_years <- acf(ts_release_years, plot = FALSE)
pacf_release_years <- pacf(ts_release_years, plot = FALSE)
plot(acf_release_years, main = "ACF for Movie Release Years")
plot(pacf_release_years, main = "PACF for Movie Release Years")

# Lowess smooth of release year against position in the series.
smoothed_data <- lowess(time(ts_release_years), ts_release_years)
plot(
  smoothed_data,
  type = "l",
  col = "red",
  main = "Lowess Smoothed Movie Release Years",
  xlab = "Row index",
  ylab = "Release year"
)
The years in which movies were released show no clear trend throughout time. There are fluctuations, but they do not have a consistent pattern over time.
The ACF and PACF plots show little and moderate autocorrelation, respectively.
This implies that the movies’ release years are not significantly associated with their past values.
The lowess smoothed plot confirms the absence of a clear trend in the release years, implying that the variations seen in the original plot were most likely caused by random variation rather than an ordered pattern.