The IMDB dataset contains information about movies, including their names, release dates, user ratings, genres, overviews, cast and crew members, original titles, production status, original languages, budgets, revenues, and countries of origin. This data can be used for various analyses, such as identifying trends in movie genres, exploring the relationship between budget and revenue, and predicting the success of future movies.
# Load the lubridate package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(plyr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(vcd)
## Loading required package: grid
library(stringr)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(car) # for VIF
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(repr)
## Warning: package 'repr' was built under R version 4.3.2
options(repr.plot.width=12, repr.plot.height=6)
library(car) # for VIF
# Load the IMDb movie data from the local CSV export.
movieData <- read.csv("C:/Users/govin/OneDrive/Desktop/RStudio/Data/imdb_movieData.csv")
# Parse the release dates in one vectorised step: swap "/" for "-" and read
# the result as month-day-year. gsub() and as.Date() already operate on whole
# vectors, so no sapply()/lapply() wrapper is needed (sapply() would also
# change its return type on empty input).
movieData$date_x <- as.Date(
  gsub("/", "-", movieData$date_x, fixed = TRUE),
  format = "%m-%d-%Y"
)
# Let readr re-guess the types of the columns that are still character.
movieData <- type_convert(movieData)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## names = col_character(),
## genre = col_character(),
## overview = col_character(),
## crew = col_character(),
## orig_title = col_character(),
## status = col_character(),
## orig_lang = col_character(),
## country = col_character()
## )
# date_x was already parsed into a Date above, so this call is expected to
# leave the column unchanged.
# NOTE(review): appears redundant — confirm before removing.
movieData$date_x <- as.Date(movieData$date_x, format = "%Y-%m-%d")
Converting to a Date object: `date_x` is already stored as a compatible Date object, so no further conversion is needed.
The response variable chosen for the analysis is `score`.
# Average the scores of all movies that share a release date
movie_aggregated <- summarize(
  group_by(movieData, date_x),
  average_score = mean(score, na.rm = TRUE)
)
# Wrap the per-date averages in a tsibble indexed by release date
movie_tsibble <- as_tsibble(movie_aggregated, index = date_x)
# Plot the per-date average score over time: a line for the overall shape,
# with faint points marking the individual dates.
movie_plot <- ggplot(movie_tsibble, aes(x = date_x, y = average_score)) +
  # Line thickness is set with `linewidth`; using `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_line(color = "#1f77b4", linewidth = 1) +
  geom_point(color = "#ff7f0e", size = 1, alpha = 0.3) + # Reduced alpha for points
  theme_minimal(base_size = 15) + # Larger text in minimal theme
  labs(
    title = "Average Movie Ratings Over Time",
    subtitle = "Time Series of IMDb Average Movie Ratings",
    x = "Date",
    y = "Average Rating",
    caption = "Data Source: IMDb"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 14),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    plot.caption = element_text(size = 10)
  ) +
  scale_x_date(date_breaks = "10 years", date_labels = "%Y") + # Sparser x-axis labels
  scale_y_continuous(breaks = seq(0, 100, by = 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Write the figure to disk at 12 x 8 inches, then render it in the document
ggsave(
  filename = "movie_plot.png",
  plot = movie_plot,
  width = 12,
  height = 8,
  units = "in"
)
print(movie_plot)
There is a high density of data points in recent years, suggesting an
increase in the number of movies being rated.
Running a simple linear regression using the entire dataset to see if there’s an overall trend:
# Fit score against release date on the complete dataset to look for an
# overall trend
linear_model_full <- lm(formula = score ~ date_x, data = movieData)
# Report the coefficients, their significance and the fit statistics
summary(linear_model_full)
##
## Call:
## lm(formula = score ~ date_x, data = movieData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.716 -4.525 1.763 7.850 38.411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.840e+01 3.590e-01 190.5 <2e-16 ***
## date_x -3.491e-04 2.375e-05 -14.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.4 on 10176 degrees of freedom
## Multiple R-squared: 0.02079, Adjusted R-squared: 0.02069
## F-statistic: 216 on 1 and 10176 DF, p-value: < 2.2e-16
Suspecting that the trend differs between time periods, we subset the data. Let's test whether the trend in movie ratings differs before and after the year 2000.
# Movies released strictly before 1 January 2000
movieData_pre2000 <- movieData %>%
  filter(date_x < as.Date("2000-01-01"))
# Movies released on or after 1 January 2000
movieData_post2000 <- movieData %>%
  filter(date_x >= as.Date("2000-01-01"))
# Score-versus-date trend within the pre-2000 subset
linear_model_pre2000 <- lm(formula = score ~ date_x, data = movieData_pre2000)
summary(linear_model_pre2000)
##
## Call:
## lm(formula = score ~ date_x, data = movieData_pre2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.183 -4.628 1.485 6.821 33.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.807e+01 3.307e-01 205.85 <2e-16 ***
## date_x -4.529e-04 4.391e-05 -10.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 2258 degrees of freedom
## Multiple R-squared: 0.045, Adjusted R-squared: 0.04457
## F-statistic: 106.4 on 1 and 2258 DF, p-value: < 2.2e-16
# Score-versus-date trend within the subset from 2000 onwards
linear_model_post2000 <- lm(formula = score ~ date_x, data = movieData_post2000)
summary(linear_model_post2000)
##
## Call:
## lm(formula = score ~ date_x, data = movieData_post2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.079 -4.619 1.852 8.108 39.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.454e+01 1.099e+00 67.84 <2e-16 ***
## date_x -7.057e-04 6.595e-05 -10.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.02 on 7916 degrees of freedom
## Multiple R-squared: 0.01426, Adjusted R-squared: 0.01414
## F-statistic: 114.5 on 1 and 7916 DF, p-value: < 2.2e-16
The R-squared value indicates how much of the variance in movie ratings is explained by the release date; a higher R-squared value means a stronger trend.
The full linear regression model considers all the data. Its multiple R-squared (R^2) is 0.02079, indicating that the model explains only a small portion of the variance in movie ratings. The p-value for the date_x coefficient is less than 2.2e-16, suggesting that there is a statistically significant relationship between date_x (time) and movie ratings. However, the adjusted R-squared (0.02069) is also very low, indicating that the model is not a good fit.
The data is split into two subsets: before the year 2000, and from 2000 onward. For the “Before 2000” subset, the R^2 is 0.045, indicating a slightly better fit than the full model, although it is still low.
The p-value for date_x is highly significant, indicating a relationship between time and movie ratings for this subset. For the “2000 onward” subset, the R^2 is 0.01426, which is lower than for both the full model and the “Before 2000” model. As with the other models, there is a statistically significant relationship between time and movie ratings.
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Apply smoothing (5-period moving average) to the 'score' column.
# The rows are first sorted by release date so that each window averages
# movies that are adjacent in time rather than adjacent in the CSV file.
# Windows that would run past either end of the data are filled with NA.
movieData_smoothed <- movieData %>%
  arrange(date_x) %>%
  mutate(smoothed_score = zoo::rollmean(score, k = 5, fill = NA))
# Create a plot to compare the smoothed data with the original data.
# Line thickness uses `linewidth`; `size` for lines has been deprecated
# since ggplot2 3.4.0.
ggplot() +
  geom_line(
    data = movieData,
    aes(x = date_x, y = score),
    color = "blue", alpha = 0.5, linewidth = 1
  ) +
  geom_line(
    data = movieData_smoothed,
    aes(x = date_x, y = smoothed_score),
    color = "red", linewidth = 1
  ) +
  labs(
    title = "Original vs. Smoothed Movie Scores",
    x = "Date",
    y = "Score"
  ) +
  theme_minimal()
library(forecast)
# Treat the scores, in the order the rows appear, as a series with 12
# observations per cycle.
# NOTE(review): this presumes consecutive rows are evenly spaced monthly
# observations in chronological order — confirm, since each row appears to
# be a single movie.
movieData_ts <- ts(data = movieData$score, frequency = 12)
# Seasonal-trend decomposition with a periodic seasonal window
movieData_decomp <- stl(x = movieData_ts, s.window = "periodic")
plot(movieData_decomp)
# Autocorrelation, partial autocorrelation and numeric summary of the
# extracted seasonal component
acf(movieData_decomp$time.series[, "seasonal"])
pacf(movieData_decomp$time.series[, "seasonal"])
summary(movieData_decomp$time.series[, "seasonal"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7272926 -0.2569826 -0.0488401 -0.0000006 0.2510333 0.8118548
# Create a time series object from the scores in row order, 12 observations
# per cycle.
# NOTE(review): presumes the rows form an evenly spaced monthly series — confirm.
movie_ts <- ts(movieData$score, frequency = 12)
# Decompose the time series to detect seasonality
decomp <- decompose(movie_ts)
# Plot the ACF and PACF stacked in two rows. par() returns the previous
# settings, which are kept so the layout can be restored afterwards instead
# of leaking into later plots.
old_par <- par(mfrow = c(2, 1))
acf(decomp$seasonal, main = "ACF of Seasonal Component")
pacf(decomp$seasonal, main = "PACF of Seasonal Component")
par(old_par)