Data Dive 12

About Dataset

The IMDB dataset contains information about movies, including their names, release dates, user ratings, genres, overviews, cast and crew members, original titles, production status, original languages, budgets, revenues, and countries of origin. This data can be used for various analyses, such as identifying trends in movie genres, exploring the relationship between budget and revenue, and predicting the success of future movies.

# Load the lubridate package
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(plyr)
library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(vcd)

## Loading required package: grid

library(stringr)
library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.2

## 
## Attaching package: 'tsibble'

## The following object is masked from 'package:lubridate':
## 
##     interval

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(car) # for VIF

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(repr)

## Warning: package 'repr' was built under R version 4.3.2

options(repr.plot.width=12, repr.plot.height=6)
library(car) # for VIF


# Read the raw IMDb export. The location lives in one named variable so the
# script can be pointed at another copy of the file by editing a single line.
movie_data_path <- "C:/Users/govin/OneDrive/Desktop/RStudio/Data/imdb_movieData.csv"
movieData <- read.csv(movie_data_path)

# Swap "/" for "-" and parse the result as month-day-year. gsub() and as.Date()
# are vectorised, so the whole column is handled in one call; this also keeps
# date_x an atomic vector when the file has no rows (sapply() would return a
# list there) and avoids the element names sapply() attaches.
movieData$date_x <- as.Date(
  gsub("/", "-", movieData$date_x, fixed = TRUE),
  format = "%m-%d-%Y"
)

# Let readr re-guess the types of the remaining character columns.
movieData <- type_convert(movieData)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   names = col_character(),
##   genre = col_character(),
##   overview = col_character(),
##   crew = col_character(),
##   orig_title = col_character(),
##   status = col_character(),
##   orig_lang = col_character(),
##   country = col_character()
## )

# Coerce date_x to Date (ISO year-month-day). The column was already parsed
# when the file was loaded, so this is expected to leave it as it is.
movieData[["date_x"]] <- as.Date(movieData[["date_x"]], format = "%Y-%m-%d")

Converting to a Date object: date_x was already parsed as a Date when the data was loaded, so we already have a compatible date object.

Choosing the response variable as: Score

# Collapse all movies released on the same day into one mean score per date
# (missing scores are ignored when averaging).
movie_aggregated <- dplyr::summarise(
  dplyr::group_by(movieData, date_x),
  average_score = mean(score, na.rm = TRUE)
)

# Wrap the per-date averages in a tsibble indexed by the release date.
movie_tsibble <- as_tsibble(movie_aggregated, index = date_x)

# Plot the per-date average score over time: a line with the individual
# per-date points overlaid. The plot is stored in movie_plot so it can be saved
# and printed separately.
movie_plot <- ggplot(movie_tsibble, aes(x = date_x, y = average_score)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and raises a warning.
  geom_line(color = "#1f77b4", linewidth = 1) +
  geom_point(color = "#ff7f0e", size = 1, alpha = 0.3) + # Reduced alpha for points
  theme_minimal(base_size = 15) + # Larger text in minimal theme
  labs(title = "Average Movie Ratings Over Time",
       subtitle = "Time Series of IMDb Average Movie Ratings",
       x = "Date",
       y = "Average Rating",
       caption = "Data Source: IMDb") +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 14),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    plot.caption = element_text(size = 10)
  ) +
  scale_x_date(date_breaks = "10 years", date_labels = "%Y") + # Sparser x-axis labels
  scale_y_continuous(breaks = seq(0, 100, by = 10))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Write the figure to disk at 12 x 8 inches, then draw it on the active device.
ggsave(
  filename = "movie_plot.png",
  plot = movie_plot,
  width = 12,
  height = 8,
  units = "in"
)

print(movie_plot)

There is a high density of data points in recent years, suggesting an increase in the number of movies being rated.

Running a simple linear regression using the entire dataset to see if there’s an overall trend:

# Fit score against release date across every movie to look for an overall
# trend. The formula is written inline so the printed Call shows it verbatim.
linear_model_full <- lm(formula = score ~ date_x, data = movieData)

# Coefficients, significance, and R-squared for the all-data fit
summary(linear_model_full)

## 
## Call:
## lm(formula = score ~ date_x, data = movieData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.716  -4.525   1.763   7.850  38.411 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.840e+01  3.590e-01   190.5   <2e-16 ***
## date_x      -3.491e-04  2.375e-05   -14.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 10176 degrees of freedom
## Multiple R-squared:  0.02079,    Adjusted R-squared:  0.02069 
## F-statistic:   216 on 1 and 10176 DF,  p-value: < 2.2e-16

Suspecting that the trend differs between time periods, we subset the data. Let's test whether the trend in movie ratings is different before and after the year 2000.

# Movies released strictly before 1 January 2000
movieData_pre2000 <- movieData %>%
  dplyr::filter(date_x < as.Date("2000-01-01"))

# Movies released on or after 1 January 2000
movieData_post2000 <- movieData %>%
  dplyr::filter(date_x >= as.Date("2000-01-01"))

# Trend of score over release date within the earlier period
linear_model_pre2000 <- lm(formula = score ~ date_x, data = movieData_pre2000)
summary(linear_model_pre2000)

## 
## Call:
## lm(formula = score ~ date_x, data = movieData_pre2000)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.183  -4.628   1.485   6.821  33.940 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.807e+01  3.307e-01  205.85   <2e-16 ***
## date_x      -4.529e-04  4.391e-05  -10.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.77 on 2258 degrees of freedom
## Multiple R-squared:  0.045,  Adjusted R-squared:  0.04457 
## F-statistic: 106.4 on 1 and 2258 DF,  p-value: < 2.2e-16

# Trend of score over release date within the later period (2000 onwards)
linear_model_post2000 <- lm(formula = score ~ date_x, data = movieData_post2000)
summary(linear_model_post2000)

## 
## Call:
## lm(formula = score ~ date_x, data = movieData_post2000)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.079  -4.619   1.852   8.108  39.232 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.454e+01  1.099e+00   67.84   <2e-16 ***
## date_x      -7.057e-04  6.595e-05  -10.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.02 on 7916 degrees of freedom
## Multiple R-squared:  0.01426,    Adjusted R-squared:  0.01414 
## F-statistic: 114.5 on 1 and 7916 DF,  p-value: < 2.2e-16

We compare the R-squared values, which indicate how much of the variance in movie ratings is explained by the release date. A higher R-squared value means a stronger linear trend.

Full Model (All Data):

The full linear regression model considers all the data. Its multiple R-squared (R^2) is 0.02079, indicating that the model explains only about 2% of the variance in movie ratings. The p-value for the date_x coefficient is less than 2.2e-16, suggesting that there is a statistically significant relationship between date_x (time) and movie ratings. However, the adjusted R-squared (0.02069) is also very low, indicating that the model is not a good fit.

Subset Models (Before and After 2000):

The data is split into two subsets: before and after the year 2000. For the “Before 2000” subset, the R^2 is 0.045, indicating a slightly better fit compared to the full model, but it’s still relatively low.

The p-value for date_x is highly significant, indicating a relationship between time and movie ratings for this subset. For the “After 2000” subset, the R^2 is 0.01426, which is lower than in both the full model and the “Before 2000” subset. As in the other models, there is a statistically significant relationship between time and movie ratings.

The low R-squared values in all models suggest that linear regression might not be the best model to capture the relationship between time and movie ratings.

The p-values indicate that there is a statistically significant relationship between time and movie ratings in all models, but the strength of the relationship is weak.

Using smoothing to detect seasonality

library(forecast)

## Warning: package 'forecast' was built under R version 4.3.2

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# Smooth the 'score' column with a centred 5-row moving average.
# rollmean() averages neighbouring ROWS, so the rows are put in chronological
# order first; otherwise each window would mix whatever movies happen to sit
# next to each other in the file rather than releases close together in time.
# The first and last two rows have no full window and are filled with NA.
movieData_smoothed <- movieData %>%
  dplyr::arrange(date_x) %>%
  dplyr::mutate(smoothed_score = zoo::rollmean(score, k = 5, fill = NA))

# Overlay the smoothed series (red) on the raw scores (blue).
# Line thickness uses `linewidth`; `size` for lines is deprecated since
# ggplot2 3.4.0.
ggplot() +
  geom_line(data = movieData, aes(x = date_x, y = score),
            color = "blue", alpha = 0.5, linewidth = 1) +
  geom_line(data = movieData_smoothed, aes(x = date_x, y = smoothed_score),
            color = "red", linewidth = 1) +
  labs(title = "Original vs. Smoothed Movie Scores",
       x = "Date",
       y = "Score") +
  theme_minimal()

Seasonality Detection and ACF/PACF

library(forecast)

# Build a genuine monthly series before decomposing. `frequency = 12` tells
# ts() that every 12 consecutive observations form one yearly cycle, so the
# input has to be one value per calendar month in chronological order -- not
# one row per movie in file order.
rated_movies <- movieData[!is.na(movieData$date_x) & !is.na(movieData$score), ]
release_month <- as.Date(format(rated_movies$date_x, "%Y-%m-01"))
monthly_mean <- tapply(rated_movies$score, format(release_month, "%Y-%m-%d"), mean)

# Lay the monthly means onto a complete month grid so that months without any
# release do not shift the calendar alignment, then linearly interpolate those
# gaps because stl() does not accept missing values. The grid starts and ends
# on months that have data, so no leading or trailing NA remains.
all_months <- seq(min(release_month), max(release_month), by = "month")
monthly_values <- as.numeric(monthly_mean[format(all_months, "%Y-%m-%d")])
monthly_values <- zoo::na.approx(monthly_values, na.rm = FALSE)

movieData_ts <- ts(
  monthly_values,
  start = c(
    as.integer(format(all_months[1], "%Y")),
    as.integer(format(all_months[1], "%m"))
  ),
  frequency = 12
)

# Seasonal-trend decomposition with a seasonal pattern held fixed across years
movieData_decomp <- stl(movieData_ts, s.window = "periodic")

plot(movieData_decomp)

# Autocorrelation and partial autocorrelation of the extracted seasonal part
acf(movieData_decomp$time.series[, "seasonal"])

pacf(movieData_decomp$time.series[, "seasonal"])

summary(movieData_decomp$time.series[, "seasonal"])

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.7272926 -0.2569826 -0.0488401 -0.0000006  0.2510333  0.8118548

# Reuse the series already built for the STL decomposition rather than
# rebuilding it, so both decompositions work from the same input.
movie_ts <- movieData_ts

# Classical decomposition into trend, seasonal, and random components
decomp <- decompose(movie_ts)

# Stack the ACF and PACF of the seasonal component in one figure. par()
# returns the previous settings, which are restored afterwards so later plots
# are not forced into the 2 x 1 layout.
old_par <- par(mfrow = c(2, 1))
acf(decomp$seasonal, main = "ACF of Seasonal Component")
pacf(decomp$seasonal, main = "PACF of Seasonal Component")
par(old_par)