Installing and loading the required packages and data files
# Install only the packages that are not already available, so re-running the
# script does not download and reinstall every dependency each time.
required_packages <- c(
  "tidyverse", "ggplot2", "openxlsx", "dplyr", "data.table"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggplot2")
library("openxlsx")
library("dplyr")
library("data.table")
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
# Load the two Excel workbooks; the paths are relative to the working
# directory.
financials <- openxlsx::read.xlsx(
  xlsxFile = "Movie Dataset_Financials.xlsx"
)
audience <- openxlsx::read.xlsx(
  xlsxFile = "Movie Dataset_General Audience.xlsx"
)
Merging the files together
# Preview the first six rows of each table.
head(financials, n = 6L)
head(audience, n = 6L)

# Give both tables the same key column name (`title`) and shorten the
# budget/revenue column names in the financial table.
financials <- rename(
  financials,
  title = "original_title",
  budget = "budget.(Millions)",
  revenue = "revenue.(Millions)"
)
audience <- rename(audience, title = "original_title")
# Normalise whitespace in the join key. str_squish() already strips leading and
# trailing whitespace and collapses internal runs to a single space, so a
# separate str_trim() call is not needed.
financials <- financials |>
  mutate(title = str_squish(title))
audience <- audience |>
  mutate(title = str_squish(title))
# Show every row whose title already appeared earlier in the same table.
filter(financials, duplicated(title))
filter(audience, duplicated(title))

# Look up one specific title in both tables.
filter(financials, title == "the Legend of Tarzan")
filter(audience, title == "the Legend of Tarzan")

# Keep only the first occurrence of each title in each table.
financials <- filter(financials, !duplicated(title))
audience <- filter(audience, !duplicated(title))
# Attach the financial columns to every audience row, matching on title.
df <- audience |>
  left_join(financials, by = "title")

# Show the rows that have at least one missing value after the join.
filter(df, !complete.cases(df))

# Drop the rows that have no budget value.
df <- filter(df, !is.na(budget))
head(df, n = 6L)
Data testing and plotting
# Histogram of IMDb ratings in half-point bins.
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "blue") +
  labs(
    x = "IMDb Rating",
    y = "Frequency",
    title = "Distribution of IMDb Ratings"
  )

# Keep only ratings of at most 10; filter() also drops rows where the
# condition is NA, i.e. rows with a missing rating.
df <- filter(df, imdb_rating <= 10)
# Scatter plot of IMDb rating against critics score, coloured by genre, with a
# single linear trend line. Every layer must end with `+`: without it, labs()
# is evaluated as a separate expression and the labels never reach the plot.
ggplot(df, aes(x = critics_score, y = imdb_rating)) +
  geom_point(aes(color = genre), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "IMDb Ratings vs Critic Score",
    x = "Critics Score",
    y = "IMDb Rating",
    color = "Genre"
  )
## `geom_smooth()` using formula = 'y ~ x'
### Comparing Budget to Revenue, Critics score, and Facebook likes
## Budget to Revenue with Critic Score
# Points are coloured by critics score, matching the section header and the
# legend title; the colour aesthetic was previously left empty.
ggplot(df, aes(x = budget, y = revenue)) +
  geom_point(aes(color = critics_score), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Budget vs Revenue",
    x = "Budget (In Millions)",
    y = "Revenue",
    color = "Critics Score"
  )
## `geom_smooth()` using formula = 'y ~ x'
### There was no correlation
# Budget against Facebook likes, coloured by critics score, with a linear
# trend line.
ggplot(data = df, mapping = aes(x = budget, y = Facebook_Likes)) +
  geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    x = "Budget (In Millions)",
    y = "Facebook Likes",
    colour = "Critics Score",
    title = "Budget vs Facebook Likes"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Revenue to Facebook Likes with Critic Score
# Revenue against Facebook likes, coloured by critics score, with a linear
# trend line.
ggplot(data = df, mapping = aes(x = revenue, y = Facebook_Likes)) +
  geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    x = "Revenue (In Millions)",
    y = "Facebook Likes",
    colour = "Critics Score",
    title = "Revenue vs Facebook Likes"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Revenue to Critic Score with Facebook Likes
# Revenue against critics score, coloured by Facebook likes, with a linear
# trend line.
ggplot(data = df, mapping = aes(x = revenue, y = critics_score)) +
  geom_point(mapping = aes(colour = Facebook_Likes), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    x = "Revenue (In Millions)",
    y = "Critic Score",
    colour = "Facebook Likes",
    title = "Revenue to Critic Score"
  )
## `geom_smooth()` using formula = 'y ~ x'
### Budget vs Revenue with Genres
# The colour mapping stays on the point layer only, so the trend line is fitted
# once over all films rather than once per genre.
ggplot(data = df, mapping = aes(x = budget, y = revenue)) +
  geom_point(mapping = aes(colour = genre), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  labs(
    x = "Budget (In Millions)",
    y = "Revenue",
    colour = "Genre",
    title = "Budget vs Revenue"
  )
## `geom_smooth()` using formula = 'y ~ x'
### Facebook likes vs Revenue with Genre
# Revenue against Facebook likes, coloured by genre (no trend line).
ggplot(data = df, mapping = aes(x = revenue, y = Facebook_Likes)) +
  geom_point(mapping = aes(colour = genre), alpha = 0.7) +
  labs(
    x = "Revenue (In Millions)",
    y = "Facebook Likes",
    colour = "Genre",
    title = "Revenue vs Facebook Likes"
  )
### Revenue vs Genre with Critic Score
# One row of points per genre, positioned by revenue and coloured by critics
# score.
ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
  geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
  labs(
    x = "Revenue (In Millions)",
    y = "Genre",
    colour = "Critics Score",
    title = "Revenue vs Genre"
  )
### Revenue vs Genre with Budget
# One row of points per genre, positioned by revenue and coloured by budget.
ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
  geom_point(mapping = aes(colour = budget), alpha = 0.7) +
  labs(
    x = "Revenue (In Millions)",
    y = "Genre",
    colour = "Budget (In Millions)",
    title = "Revenue vs Genre"
  )
### Revenue vs Genre with Facebook Likes
# One row of points per genre, positioned by revenue and coloured by Facebook
# likes.
ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
  geom_point(mapping = aes(colour = Facebook_Likes), alpha = 0.7) +
  labs(
    x = "Revenue (In Millions)",
    y = "Genre",
    colour = "Facebook Likes",
    title = "Revenue vs Genre"
  )
### Critics score vs genre
# Points are coloured by the same critics score shown on the x axis; the
# legend title is deliberately blank.
ggplot(data = df, mapping = aes(x = critics_score, y = genre)) +
  geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
  labs(
    x = "critics score",
    y = "Genre",
    colour = "",
    title = "Critic Score vs Genre"
  )
# Critics score vs genre without a colour mapping. The empty `aes(color = )`
# placeholder and the colour label for a non-existent scale were dead code and
# have been removed; the rendered plot is unchanged.
ggplot(df, aes(x = critics_score, y = genre)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Critic Score vs Genre",
    x = "critics score",
    y = "Genre"
  )
### Critics score vs revenue
# Uncoloured scatter with a linear trend line. The empty `aes(color = )`
# placeholder and the colour label for a non-existent scale were dead code and
# have been removed; the rendered plot is unchanged.
ggplot(df, aes(x = critics_score, y = revenue)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Critic Score vs Revenue",
    x = "critics score",
    y = "Revenue"
  )
## `geom_smooth()` using formula = 'y ~ x'
### counting genres
df |> count(genre)
### comparing genres and rev
# Summarise revenue per genre. Counting (genre, revenue) pairs only yields one
# row per distinct pair, which does not compare genres.
df |>
  summarise(
    films = n(),
    mean_revenue = mean(revenue, na.rm = TRUE),
    median_revenue = median(revenue, na.rm = TRUE),
    .by = genre
  ) |>
  arrange(desc(mean_revenue))
# Mean critics score per genre. Genre is converted to a factor once so the
# aggregated means and the boxes share the same ordering.
genre_groups <- factor(df$genre)
mean_values2 <- aggregate(
  df$critics_score,
  by = list(genre_groups),
  FUN = mean,
  na.rm = TRUE
)
# Box plot of critics score by genre.
boxplot(df$critics_score ~ genre_groups,
  main = "Avg Critic Score",
  xlab = "Genre", ylab = "Critics Score",
  col = "lightblue"
)
# Boxes are drawn at x = 1..k in factor-level order. Passing the genre names as
# x coerced them to NA, so no mean marker was drawn; use the level index.
points(
  match(mean_values2$Group.1, levels(genre_groups)),
  mean_values2$x,
  col = "red", pch = 19
)
#### Finding p-values of different variables
# Revenue modelled on budget and critics score together.
pvalueGC <- lm(formula = revenue ~ budget + critics_score, data = df)
summary(pvalueGC)
##
## Call:
## lm(formula = revenue ~ budget + critics_score, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -442.56 -59.57 -13.76 29.49 1443.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -74.9864 15.1206 -4.959 9.08e-07 ***
## budget 2.7930 0.1336 20.900 < 2e-16 ***
## critics_score 1.7652 0.2662 6.630 7.13e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 148.2 on 639 degrees of freedom
## Multiple R-squared: 0.4966, Adjusted R-squared: 0.495
## F-statistic: 315.2 on 2 and 639 DF, p-value: < 2.2e-16
# Revenue modelled on budget plus genre (genre enters as dummy variables).
pvalueBG <- lm(formula = revenue ~ budget + genre, data = df)
summary(pvalueBG)
##
## Call:
## lm(formula = revenue ~ budget + genre, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -417.45 -59.02 -21.58 22.71 1440.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0522 19.8042 0.154 0.878
## budget 3.0758 0.1321 23.284 <2e-16 ***
## genreAnimation 32.8906 54.7177 0.601 0.548
## genreArt House & International 17.1758 45.2598 0.379 0.704
## genreComedy 9.1369 25.2459 0.362 0.718
## genreDocumentary 13.9818 28.9046 0.484 0.629
## genreDrama 9.5740 21.0062 0.456 0.649
## genreHorror 52.1654 37.2634 1.400 0.162
## genreMusical & Performing Arts 3.1220 48.2556 0.065 0.948
## genreMystery & Suspense -21.2248 27.6372 -0.768 0.443
## genreOther 27.3519 45.2633 0.604 0.546
## genreScience Fiction & Fantasy -53.9768 54.6250 -0.988 0.323
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 153.6 on 630 degrees of freedom
## Multiple R-squared: 0.4673, Adjusted R-squared: 0.458
## F-statistic: 50.24 on 11 and 630 DF, p-value: < 2.2e-16
# Critics score modelled on budget alone.
pvalueCB <- lm(formula = critics_score ~ budget, data = df)
summary(pvalueCB)
##
## Call:
## lm(formula = critics_score ~ budget, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.217 -15.778 -1.213 16.136 51.157
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.52745 1.16659 41.598 < 2e-16 ***
## budget 0.15793 0.01884 8.385 3.24e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.01 on 640 degrees of freedom
## Multiple R-squared: 0.09898, Adjusted R-squared: 0.09757
## F-statistic: 70.3 on 1 and 640 DF, p-value: 3.238e-16
# Revenue modelled on budget alone.
pvalueRB <- lm(formula = revenue ~ budget, data = df)
summary(pvalueRB)
##
## Call:
## lm(formula = revenue ~ budget, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.24 -57.07 -22.37 18.72 1443.15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.672 8.116 1.315 0.189
## budget 3.072 0.131 23.441 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 153.1 on 640 degrees of freedom
## Multiple R-squared: 0.462, Adjusted R-squared: 0.4611
## F-statistic: 549.5 on 1 and 640 DF, p-value: < 2.2e-16
# Facebook likes modelled on budget alone.
pvalueFB <- lm(formula = Facebook_Likes ~ budget, data = df)
summary(pvalueFB)
##
## Call:
## lm(formula = Facebook_Likes ~ budget, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -210319 -37660 -19354 26994 447331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22502.62 4312.14 5.218 2.44e-07 ***
## budget 1009.12 69.62 14.494 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81360 on 640 degrees of freedom
## Multiple R-squared: 0.2471, Adjusted R-squared: 0.246
## F-statistic: 210.1 on 1 and 640 DF, p-value: < 2.2e-16
# Revenue modelled on genre alone.
pvalueRG <- lm(formula = revenue ~ genre, data = df)
summary(pvalueRG)
##
## Call:
## lm(formula = revenue ~ genre, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -207.02 -115.07 -71.36 16.13 1928.59
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 129.1192 25.9637 4.973 8.51e-07 ***
## genreAnimation 107.4334 74.4495 1.443 0.150
## genreArt House & International -2.3810 61.6761 -0.039 0.969
## genreComedy -0.8763 34.4038 -0.025 0.980
## genreDocumentary -7.1856 39.3759 -0.182 0.855
## genreDrama 10.4656 28.6302 0.366 0.715
## genreHorror 59.6278 50.7861 1.174 0.241
## genreMusical & Performing Arts 8.2887 65.7691 0.126 0.900
## genreMystery & Suspense 3.4954 37.6402 0.093 0.926
## genreOther 50.8561 61.6761 0.825 0.410
## genreScience Fiction & Fantasy -61.9164 74.4495 -0.832 0.406
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.3 on 631 degrees of freedom
## Multiple R-squared: 0.008848, Adjusted R-squared: -0.00686
## F-statistic: 0.5633 on 10 and 631 DF, p-value: 0.8443
# Revenue modelled on critics score alone.
pvalueRC <- lm(data = df, revenue ~ critics_score)
# Summarise the model fitted on the line above. The script previously
# summarised pvalueRG here, so the critics-score model was never inspected.
summary(pvalueRC)
## NOTE: the output pasted here belonged to the genre model (pvalueRG) and has
## been removed; re-run the script to regenerate the summary for pvalueRC.
## Testing other boxplots
# Mean budget for each distinct revenue value.
mean_values3 <- aggregate(df$budget, by = list(df$revenue), FUN = mean)
# Box plot of budget grouped by revenue. The title and axis labels now name the
# variables actually plotted (they were copied from the critics-score plot).
# NOTE(review): revenue is the grouping variable, so this draws one box per
# distinct revenue value; a scatter plot or binned revenue may be what was
# intended.
boxplot(df$budget ~ df$revenue,
  main = "Budget by Revenue",
  xlab = "Revenue (In Millions)", ylab = "Budget (In Millions)",
  col = "lightblue"
)
# Boxes are drawn at x = 1..k in increasing revenue order, the same order
# aggregate() returns its groups in, so place the means by position rather
# than at the revenue values themselves.
points(
  seq_len(nrow(mean_values3)),
  mean_values3$x,
  col = "red", pch = 19
)
# Revenue modelled on budget. The response must not also appear as a
# predictor: lm() dropped it with warnings, so it is left out of the formula.
# The fitted coefficients are the same as before.
valueBR <- lm(revenue ~ budget, data = df)