The IMDB dataset contains information about movies, including their names, release dates, user ratings, genres, overviews, cast and crew members, original titles, production status, original languages, budgets, revenues, and countries of origin. This data can be used for various analyses, such as identifying trends in movie genres, exploring the relationship between budget and revenue, and predicting the success of future movies.
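A quick structural check (a minimal sketch reusing the CSV path from the loading code below; rawMovies is just a throwaway name for this check) confirms which of these columns are actually present and how they are typed:
# Quick look at the raw file before the full pipeline below.
rawMovies <- read.csv('C:/Users/govin/OneDrive/Desktop/RStudio/Data/imdb_movies.csv')
str(rawMovies) # column names, types, and a preview of the values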
# Load the lubridate package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(plyr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(car) # for VIF
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(repr)
## Warning: package 'repr' was built under R version 4.3.2
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
movieData <- read.csv('C:/Users/govin/OneDrive/Desktop/RStudio/Data/imdb_movies.csv')
movieData$date_x <- sapply(movieData$date_x, function(x) gsub("/", "-", x))
movieData[c('date_x')] <- lapply(movieData[c('date_x')], function(x) as.Date(x, format="%m-%d-%Y"))
movieData <- type_convert(movieData)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## names = col_character(),
## genre = col_character(),
## overview = col_character(),
## crew = col_character(),
## orig_title = col_character(),
## status = col_character(),
## orig_lang = col_character(),
## country = col_character()
## )
Revenue_country_wise <- aggregate(movieData$revenue, list(movieData$country), FUN=mean)
Revenue_country_wise <- Revenue_country_wise[order(Revenue_country_wise$x, decreasing = TRUE), ]
print(Revenue_country_wise)
## Group.1 x
## 36 MU 728608266
## 22 GT 655664752
## 5 BO 638332463
## 44 PR 545316308
## 35 LV 542233172
## 12 CO 534571540
## 37 MX 469644985
## 1 AR 456560785
## 54 TW 451275064
## 30 IS 443980387
## 43 PL 440802594
## 32 JP 408106272
## 16 DO 400066522
## 52 TH 381085420
## 10 CL 376139634
## 24 HU 370750873
## 39 NL 369452979
## 6 BR 367192509
## 41 PE 361959154
## 14 DE 358353693
## 31 IT 357659815
## 46 PY 356935817
## 34 KR 350030132
## 17 ES 345563529
## 53 TR 337170686
## 49 SG 333077714
## 42 PH 331340101
## 23 HK 330804782
## 4 BE 328309975
## 13 CZ 316265039
## 25 ID 313651704
## 48 SE 312184372
## 19 FR 305631854
## 9 CH 297093649
## 11 CN 296076152
## 20 GB 295877850
## 28 IN 285128163
## 26 IE 270712673
## 56 US 270571053
## 27 IL 254504167
## 8 CA 253273459
## 15 DK 239348748
## 47 RU 220233805
## 40 NO 209286725
## 60 ZA 208272848
## 58 VN 206250037
## 29 IR 193600405
## 3 AU 192945447
## 57 UY 179105223
## 51 SU 177365780
## 7 BY 175269999
## 33 KH 175269999
## 50 SK 175269999
## 21 GR 154849503
## 55 UA 127479530
## 2 AT 72282768
## 18 FI 66214266
## 59 XC 23146523
## 38 MY 22443973
## 45 PT 1240262
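These figures are plain means, so a country represented by only a handful of titles can land near the top of the list. A small optional sketch (not part of the original aggregation, using the already-loaded dplyr) that reports the number of movies behind each mean alongside it:
# Mean revenue per country together with how many movies back each mean;
# countries with very few titles should be read with caution.
revenue_by_country <- movieData %>%
group_by(country) %>%
summarise(n_movies = n(), mean_revenue = mean(revenue)) %>%
arrange(desc(mean_revenue))
head(revenue_by_country, 10)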
movieData$decade <- year(movieData$date_x)%/%10 * 10
ggplot(movieData, aes(x = decade, y = budget_x)) +
geom_point() +
geom_jitter() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
options(repr.plot.width = 10, repr.plot.height = 10)
ggplot(movieData, aes(x = budget_x, y = revenue, color = decade)) +
geom_point() +
geom_jitter() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## With only a few outliers, there is a clear positive relationship between a movie's budget and its revenue.
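Because a few very large budgets and revenues dominate this scatter, a rank-based correlation is a useful cross-check against those outliers (an optional sketch, not part of the original analysis):
# Spearman correlation is less sensitive to the extreme budget/revenue values
# than the Pearson correlation computed further below.
cor(movieData$budget_x, movieData$revenue, method = "spearman")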
options(repr.plot.width = 20, repr.plot.height = 100)
ggplot(movieData, aes(x = budget_x, y = country, color = decade)) +
geom_point() +
geom_jitter()
options(repr.plot.width = 10, repr.plot.height = 10)
ggplot(filter(movieData, orig_lang == "Korean" | orig_lang == "English"), aes(x = budget_x, y = revenue, color = orig_lang)) +
geom_point() +
geom_jitter() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
options(repr.plot.width = 16, repr.plot.height = 16)
movieData$year <- year(movieData$date_x)
ggplot(filter(movieData, orig_lang == "Korean"), aes(x = year)) +
geom_bar()
## Korean movies seem to be gaining in popularity
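To back this up with a count, a short sketch tallying Korean-language releases per decade (using the decade column created above):
# Number of Korean-language titles per decade; a rising count would support
# the comment that Korean movies are becoming more common in this dataset.
movieData %>%
filter(orig_lang == "Korean") %>%
count(decade)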
options(repr.plot.width = 16, repr.plot.height = 16)
ggplot(movieData, aes(x = score, y = revenue, color = decade)) +
geom_point() +
geom_jitter()
#### There is a good correlation between a movie's IMDb rating and its box-office revenue, but beyond a certain score threshold the relationship appears to flatten out.
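One way to probe the threshold idea is to compute the score-revenue correlation separately below and above a cutoff; the 70-point cutoff used here is purely illustrative, not something established by the data:
# Score-revenue correlation below and above an illustrative cutoff of 70;
# a much weaker correlation in the upper band would support the comment above.
cor(movieData$score[movieData$score < 70],
movieData$revenue[movieData$score < 70], use = "complete.obs")
cor(movieData$score[movieData$score >= 70],
movieData$revenue[movieData$score >= 70], use = "complete.obs")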
# Creating a histogram of revenue (density scale)
hist(movieData$revenue,
main = "Distribution of Revenue",
xlab = "Revenue", # X-axis label
ylab = "Density", # Y-axis label
col = "green", # Bar color
border = "black", # Border color
breaks = 20, # Number of bins
freq = FALSE) # Plot densities rather than counts so the density curve overlays correctly
# Overlaying the density curve
lines(density(movieData$revenue), col = "black", lwd = 2)
# Creating a histogram of budget (density scale)
hist(movieData$budget_x,
main = "Distribution of Budget",
xlab = "Budget", # X-axis label
ylab = "Density", # Y-axis label
col = "orange", # Bar color
border = "black", # Border color
breaks = 20, # Number of bins
freq = FALSE) # Plot densities rather than counts
# Overlaying the density curve (of budget, not revenue)
lines(density(movieData$budget_x), col = "black", lwd = 2)
# Creating a histogram of score (density scale)
hist(movieData$score,
main = "Distribution of Score",
xlab = "Score", # X-axis label
ylab = "Density", # Y-axis label
col = "violet", # Bar color
border = "black", # Border color
breaks = 20, # Number of bins
freq = FALSE) # Plot densities rather than counts
# Overlaying the density curve (of score, not revenue)
lines(density(movieData$score), col = "black", lwd = 2)
library(ggplot2)
library(reshape2)
numeric_data <- unlist(lapply(movieData, is.numeric))
data_num <- movieData[ , numeric_data] #Subsetting numeric columns of data
movieData_corr <- cor(data_num)
heatmap_plot <- ggplot(data = melt(movieData_corr), aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = round(value, 2)), vjust = 1) +
labs(title = "Correlation Heatmap for continuous variables", x = "Features", y = "Features", fill = "Correlation")
# Print the heatmap
print(heatmap_plot)
# country Vs revenue column
ggplot(movieData, aes(x = country, y = revenue)) +
geom_boxplot() +
labs(x = "Country", y = "Revenue") +
ggtitle("Box Plot of Country vs. Revenue") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
theme(plot.margin = margin(1, 4, 1, 1, "cm"))
movieData <- movieData %>%
mutate(profit = revenue - budget_x)
ggplot(data = movieData, mapping = aes(x = budget_x, y = revenue)) +
geom_point() +
geom_smooth(method="lm") +
labs(title="Budget vs Revenue",
x="Budget",
y="Revenue")
## `geom_smooth()` using formula = 'y ~ x'
cor(movieData$budget_x, movieData$revenue)
## [1] 0.6738296
ggplot(data = movieData, mapping = aes(x = budget_x, y = profit)) +
geom_point() +
geom_smooth(method="lm") +
labs(title="Budget vs Profit",
x="Budget",
y="Profit")
## `geom_smooth()` using formula = 'y ~ x'
revenue_mean <- mean(movieData$revenue)
revenue_mean
## [1] 253140093
revenue_sd <- sd(movieData$revenue)
revenue_sd
## [1] 277788049
margin_error <- qnorm(0.975) * (revenue_sd / sqrt(nrow(movieData)))
margin_error
## [1] 5396727
lower <- revenue_mean - margin_error
upper <- revenue_mean + margin_error
paste0("95% CI: [",lower, ", ", upper,"]")
## [1] "95% CI: [247743366.72094, 258536820.11667]"
# Calculate confidence intervals for movie ratings and revenues
rating_CI <- t.test(movieData$score)$conf.int
revenue_CI <- t.test(movieData$revenue)$conf.int
rating_CI
## [1] 63.23403 63.76007
## attr(,"conf.level")
## [1] 0.95
revenue_CI
## [1] 247742725 258537462
## attr(,"conf.level")
## [1] 0.95
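The hand-built normal-approximation interval and the t.test interval agree to within a few hundred dollars because the sample is so large; a small check using the t quantile in place of qnorm makes that explicit:
# t-based margin of error; with roughly 10,000 movies, qt(0.975, df) is
# essentially qnorm(0.975), so the two intervals above nearly coincide.
n <- nrow(movieData)
t_margin <- qt(0.975, df = n - 1) * revenue_sd / sqrt(n)
c(revenue_mean - t_margin, revenue_mean + t_margin)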
options(repr.plot.width=12, repr.plot.height=6)
library(car) # for VIF
movieData <- read.csv('C:/Users/govin/OneDrive/Desktop/RStudio/Data/imdb_movies.csv')
movieData$date_x <- sapply(movieData$date_x, function(x) gsub("/", "-", x))
movieData[c('date_x')] <- lapply(movieData[c('date_x')], function(x) as.Date(x, format="%m-%d-%Y"))
movieData <- type_convert(movieData)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## names = col_character(),
## genre = col_character(),
## overview = col_character(),
## crew = col_character(),
## orig_title = col_character(),
## status = col_character(),
## orig_lang = col_character(),
## country = col_character()
## )
movieData$date_x <- as.Date(movieData$date_x, format = "%Y-%m-%d")
# Aggregating data by date and calculating the average score
movie_aggregated <- movieData %>%
group_by(date_x) %>%
summarize(average_score = mean(score, na.rm = TRUE))
# Creating a tsibble object
movie_tsibble <- movie_aggregated %>%
as_tsibble(index = date_x)
# Plotting data over time
movie_plot <- ggplot(movie_tsibble, aes(x = date_x, y = average_score)) +
geom_line(color = "#1f77b4", size = 1) +
geom_point(color = "#ff7f0e", size = 1, alpha = 0.3) + # Reduced alpha for points
theme_minimal(base_size = 15) + # Larger text in minimal theme
labs(title = "Average Movie Ratings Over Time",
subtitle = "Time Series of IMDb Average Movie Ratings",
x = "Date",
y = "Average Rating",
caption = "Data Source: IMDb") +
theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 14),
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(size = 12),
plot.caption = element_text(size = 10)
) +
scale_x_date(date_breaks = "10 years", date_labels = "%Y") + # Sparser x-axis labels
scale_y_continuous(breaks = seq(0, 100, by = 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Adjusting plot size and aspect ratio
ggsave("movie_plot.png", movie_plot, width = 12, height = 8, units = "in") # Increase the size
print(movie_plot)
# Linear regression using score as the response variable and date_x as the predictor
linear_model_full <- lm(score ~ date_x, data = movieData)
# Summary of the full model to check for trends and strength
summary(linear_model_full)
##
## Call:
## lm(formula = score ~ date_x, data = movieData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.716 -4.525 1.763 7.850 38.411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.840e+01 3.590e-01 190.5 <2e-16 ***
## date_x -3.491e-04 2.375e-05 -14.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.4 on 10176 degrees of freedom
## Multiple R-squared: 0.02079, Adjusted R-squared: 0.02069
## F-statistic: 216 on 1 and 10176 DF, p-value: < 2.2e-16
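Since date_x is a Date, the slope is in rating points per day; converting it to points per decade makes it easier to interpret:
# Per-day slope converted to rating points per decade (365.25 days * 10 years):
# roughly -3.49e-04 * 3652.5, i.e. about -1.3 rating points per decade.
coef(linear_model_full)["date_x"] * 365.25 * 10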
# Subset data before the year 2000
movieData_pre2000 <- filter(movieData, date_x < as.Date("2000-01-01"))
# Subset data from the year 2000 onwards
movieData_post2000 <- filter(movieData, date_x >= as.Date("2000-01-01"))
# Linear regression for the first subset
linear_model_pre2000 <- lm(score ~ date_x, data = movieData_pre2000)
summary(linear_model_pre2000)
##
## Call:
## lm(formula = score ~ date_x, data = movieData_pre2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.183 -4.628 1.485 6.821 33.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.807e+01 3.307e-01 205.85 <2e-16 ***
## date_x -4.529e-04 4.391e-05 -10.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 2258 degrees of freedom
## Multiple R-squared: 0.045, Adjusted R-squared: 0.04457
## F-statistic: 106.4 on 1 and 2258 DF, p-value: < 2.2e-16
# Linear regression for the second subset
linear_model_post2000 <- lm(score ~ date_x, data = movieData_post2000)
summary(linear_model_post2000)
##
## Call:
## lm(formula = score ~ date_x, data = movieData_post2000)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.079 -4.619 1.852 8.108 39.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.454e+01 1.099e+00 67.84 <2e-16 ***
## date_x -7.057e-04 6.595e-05 -10.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.02 on 7916 degrees of freedom
## Multiple R-squared: 0.01426, Adjusted R-squared: 0.01414
## F-statistic: 114.5 on 1 and 7916 DF, p-value: < 2.2e-16
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Apply smoothing (e.g., 5-period moving average) to the 'score' column
movieData_smoothed <- movieData %>%
mutate(smoothed_score = zoo::rollmean(score, k = 5, fill = NA))
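One caveat: rollmean operates on row order, so the 5-movie moving average only makes sense if the rows are sorted by date first. A sketch of the sorted variant (named movieData_smoothed_sorted so it does not overwrite the object plotted below):
# Sort by release date first so the moving average is taken over
# chronologically adjacent titles rather than arbitrary row order.
movieData_smoothed_sorted <- movieData %>%
arrange(date_x) %>%
mutate(smoothed_score = zoo::rollmean(score, k = 5, fill = NA))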
# Create a plot to compare the smoothed data with the original data
ggplot() +
geom_line(data = movieData, aes(x = date_x, y = score), color = "blue", alpha = 0.5, size = 1) +
geom_line(data = movieData_smoothed, aes(x = date_x, y = smoothed_score), color = "red", size = 1) +
labs(title = "Original vs. Smoothed Movie Scores",
x = "Date",
y = "Score") +
theme_minimal()
library(forecast)
movieData_ts <- ts(movieData$score, frequency = 12)
movieData_decomp <- stl(movieData_ts, s.window = "periodic")
plot(movieData_decomp)
acf(movieData_decomp$time.series[, "seasonal"])
pacf(movieData_decomp$time.series[, "seasonal"])
summary(movieData_decomp$time.series[, "seasonal"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7272926 -0.2569826 -0.0488401 -0.0000006 0.2510333 0.8118548
# Create a time series object
movie_ts <- ts(movieData$score, frequency = 12)
# Decompose the time series to detect seasonality
decomp <- decompose(movie_ts)
# Plot the ACF and PACF
par(mfrow = c(2, 1))
acf(decomp$seasonal, main = "ACF of Seasonal Component")
pacf(decomp$seasonal, main = "PACF of Seasonal Component")
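Note that ts(movieData$score, frequency = 12) treats each movie, in row order, as one "monthly" observation, so the seasonal component found here reflects that assumption more than any real calendar effect. A sketch of a more conventional construction, aggregating to an actual monthly average score before decomposing (this assumes the resulting monthly series is reasonably gap-free):
# Alternative: build a genuine monthly series of average scores before decomposition.
monthly_scores <- movieData %>%
filter(!is.na(date_x), !is.na(score)) %>%
mutate(month = format(date_x, "%Y-%m")) %>% # calendar month of release
group_by(month) %>%
summarise(avg_score = mean(score)) %>%
arrange(month)
monthly_ts <- ts(monthly_scores$avg_score, frequency = 12)
plot(stl(monthly_ts, s.window = "periodic"))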
# Hypothesis test (Neyman-Pearson framework) for a difference in mean ratings between genres
# Comparing the Drama and Comedy genres with a Welch two-sample t-test
drama_ratings <- movieData$score[movieData$genre == "Drama"]
comedy_ratings <- movieData$score[movieData$genre == "Comedy"]
# Perform the test
np_test <- t.test(drama_ratings, comedy_ratings)
np_test
##
## Welch Two Sample t-test
##
## data: drama_ratings and comedy_ratings
## t = 3.8613, df = 911.36, p-value = 0.0001208
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.014544 6.179190
## sample estimates:
## mean of x mean of y
## 63.62770 59.53083
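The difference is statistically significant, but it amounts to about four points on a 0-100 scale; a rough standardized effect size (a hand-computed standardized mean difference using the average of the two group variances, so no extra package is needed) helps judge whether that is practically large:
# Rough standardized mean difference (Cohen's d-style) for Drama vs Comedy ratings.
pooled_sd <- sqrt((var(drama_ratings, na.rm = TRUE) + var(comedy_ratings, na.rm = TRUE)) / 2)
(mean(drama_ratings, na.rm = TRUE) - mean(comedy_ratings, na.rm = TRUE)) / pooled_sd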
# Pearson correlation test between budget and revenue
fisher_test <- cor.test(movieData$budget_x, movieData$revenue, method = "pearson")
fisher_test
##
## Pearson's product-moment correlation
##
## data: movieData$budget_x and movieData$revenue
## t = 91.994, df = 10176, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6630821 0.6842993
## sample estimates:
## cor
## 0.6738296
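The confidence interval that cor.test reports for a Pearson correlation is based on Fisher's z transformation (which is why the object above is called fisher_test); it can be reproduced by hand in a few lines:
# Fisher z interval for the correlation: atanh(r) is approximately normal
# with standard error 1/sqrt(n - 3); transform back with tanh.
r <- cor(movieData$budget_x, movieData$revenue)
n_obs <- nrow(movieData)
z <- atanh(r)
se_z <- 1 / sqrt(n_obs - 3)
tanh(c(z - qnorm(0.975) * se_z, z + qnorm(0.975) * se_z))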
# ANOVA to analyze variations in user ratings among genres or languages
anova_ratings <- aov(score ~ genre + orig_lang, data = movieData)
summary(anova_ratings)
## Df Sum Sq Mean Sq F value Pr(>F)
## genre 2302 462691 201.0 1.356 <2e-16 ***
## orig_lang 51 64667 1268.0 8.556 <2e-16 ***
## Residuals 7739 1146844 148.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 85 observations deleted due to missingness