Installing and loading the required packages and data files

# Install only the packages that are not available yet, so that re-running the
# script does not reinstall everything on every run.
required_pkgs <- c("tidyverse", "ggplot2", "openxlsx", "dplyr", "data.table")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggplot2")
library("openxlsx")
library("dplyr")
library("data.table")
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
# Load the two source workbooks (paths are relative to the working directory).
financials <- openxlsx::read.xlsx(
  xlsxFile = "Movie Dataset_Financials.xlsx"
)
audience <- openxlsx::read.xlsx(
  xlsxFile = "Movie Dataset_General Audience.xlsx"
)

Merging the files together

# Preview the first rows of each table.
financials |> head()
audience |> head()
# Shorten the column names; both tables end up with a common `title` column.
financials <- rename(
  financials,
  title = "original_title",
  budget = "budget.(Millions)",
  revenue = "revenue.(Millions)"
)

audience <- rename(audience, title = "original_title")

# Normalise titles: strip surrounding whitespace, then collapse repeated
# internal whitespace to a single space.
financials <- financials |>
  mutate(title = title |> str_trim() |> str_squish())

audience <- audience |>
  mutate(title = title |> str_trim() |> str_squish())

# List the rows whose title repeats an earlier row of the same table.
filter(financials, duplicated(title))
filter(audience, duplicated(title))

# Inspect one specific title in both tables.
filter(financials, title == "the Legend of Tarzan")
filter(audience, title == "the Legend of Tarzan")

# Keep only the first row seen for each title.
financials <- filter(financials, !duplicated(title))

audience <- filter(audience, !duplicated(title))

# Attach the financial columns to every audience row, matching on title.
# Audience rows without a financial match are kept, with NA in those columns.
df <- audience |>
  left_join(financials, by = "title")

# Show the rows that have at least one missing value.
filter(df, !complete.cases(df))

# Drop the rows that have no budget.
df <- filter(df, !is.na(budget))

df |> head()

Data testing and plotting

# Histogram of IMDb ratings in half-point bins.
ggplot(data = df, mapping = aes(x = imdb_rating)) +
  geom_histogram(binwidth = 0.5, fill = "blue", colour = "black") +
  labs(
    title = "Distribution of IMDb Ratings",
    x = "IMDb Rating",
    y = "Frequency"
  )

# Keep only rows whose rating is at most 10; filter() also drops rows where
# the rating is missing.
df <- filter(df, imdb_rating <= 10)

# Critics score against IMDb rating, coloured by genre, with one overall linear
# fit. Every component must be chained with `+`: the chain used to stop after
# geom_smooth(), so labs() was evaluated on its own and printed to the console
# instead of labelling the plot.
ggplot(df, aes(x = critics_score, y = imdb_rating)) +
  geom_point(aes(color = genre), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "IMDb Ratings vs Critic Score",
    x = "Critics Score",
    y = "IMDb Rating",
    color = "Genre"
  )
## `geom_smooth()` using formula = 'y ~ x'
### Comparing Budget to Revenue, Critics score, and Facebook likes
  ## Budget to Revenue with Critic Score
  # Points are coloured by critics_score so that the "Critics Score" legend
  # declared in labs() has a mapped aesthetic behind it; the colour mapping was
  # empty before, which left every point the same colour and hid the legend.
  ggplot(df, aes(x = budget, y = revenue)) +
    geom_point(aes(color = critics_score), alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, color = "red") +
    labs(
      title = "Budget vs Revenue",
      x = "Budget (In Millions)",
      y = "Revenue",
      color = "Critics Score"
    )
## `geom_smooth()` using formula = 'y ~ x'

      ### There was no correlation
  # Budget against Facebook likes, coloured by critics score, with one linear
  # fit drawn in red.
  ggplot(data = df, mapping = aes(x = budget, y = Facebook_Likes)) +
    geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, colour = "red") +
    labs(
      title = "Budget vs Facebook Likes",
      x = "Budget (In Millions)",
      y = "Facebook Likes",
      colour = "Critics Score"
    )
## `geom_smooth()` using formula = 'y ~ x'

  ## Revenue against Facebook likes, coloured by critics score
  ggplot(data = df, mapping = aes(x = revenue, y = Facebook_Likes)) +
    geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, colour = "red") +
    labs(
      title = "Revenue vs Facebook Likes",
      x = "Revenue (In Millions)",
      y = "Facebook Likes",
      colour = "Critics Score"
    )
## `geom_smooth()` using formula = 'y ~ x'

  ## Revenue against critics score, coloured by Facebook likes
  ggplot(data = df, mapping = aes(x = revenue, y = critics_score)) +
    geom_point(mapping = aes(colour = Facebook_Likes), alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, colour = "red") +
    labs(
      title = "Revenue to Critic Score",
      x = "Revenue (In Millions)",
      y = "Critic Score",
      colour = "Facebook Likes"
    )
## `geom_smooth()` using formula = 'y ~ x'

  ### Budget against revenue, coloured by genre
  ggplot(data = df, mapping = aes(x = budget, y = revenue)) +
    geom_point(mapping = aes(colour = genre), alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, colour = "red") +
    labs(
      title = "Budget vs Revenue",
      x = "Budget (In Millions)",
      y = "Revenue",
      colour = "Genre"
    )
## `geom_smooth()` using formula = 'y ~ x'

  ### Revenue against Facebook likes, coloured by genre (no fitted line)
  ggplot(data = df, mapping = aes(x = revenue, y = Facebook_Likes)) +
    geom_point(mapping = aes(colour = genre), alpha = 0.7) +
    labs(
      title = "Revenue vs Facebook Likes",
      x = "Revenue (In Millions)",
      y = "Facebook Likes",
      colour = "Genre"
    )

  ### Revenue per genre, coloured by critics score
  ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
    geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
    labs(
      title = "Revenue vs Genre",
      x = "Revenue (In Millions)",
      y = "Genre",
      colour = "Critics Score"
    )

  ### Revenue per genre, coloured by budget
  ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
    geom_point(mapping = aes(colour = budget), alpha = 0.7) +
    labs(
      title = "Revenue vs Genre",
      x = "Revenue (In Millions)",
      y = "Genre",
      colour = "Budget (In Millions)"
    )

  ### Revenue per genre, coloured by Facebook likes
  ggplot(data = df, mapping = aes(x = revenue, y = genre)) +
    geom_point(mapping = aes(colour = Facebook_Likes), alpha = 0.7) +
    labs(
      title = "Revenue vs Genre",
      x = "Revenue (In Millions)",
      y = "Genre",
      colour = "Facebook Likes"
    )

  ### Critics score per genre, coloured by the score itself
  # The colour legend title is deliberately left empty.
  ggplot(data = df, mapping = aes(x = critics_score, y = genre)) +
    geom_point(mapping = aes(colour = critics_score), alpha = 0.7) +
    labs(
      title = "Critic Score vs Genre",
      x = "critics score",
      y = "Genre",
      colour = ""
    )

  # Same critics-score-per-genre plot without a colour mapping. The original
  # empty `aes(color = )` mapped nothing, so the layer simply has no mapping of
  # its own here.
  ggplot(data = df, mapping = aes(x = critics_score, y = genre)) +
    geom_point(alpha = 0.7) +
    labs(
      title = "Critic Score vs Genre",
      x = "critics score",
      y = "Genre",
      colour = ""
    )

  ### Critics score against revenue with one linear fit
  # No colour mapping: the original empty `aes(color = )` mapped nothing.
  ggplot(data = df, mapping = aes(x = critics_score, y = revenue)) +
    geom_point(alpha = 0.7) +
    geom_smooth(method = "lm", se = FALSE, colour = "red") +
    labs(
      title = "Critic Score vs Revenue",
      x = "critics score",
      y = "Revenue",
      colour = ""
    )
## `geom_smooth()` using formula = 'y ~ x'

  ### Number of rows per genre
  count(df, genre)
  ### Number of rows per genre and revenue combination
  count(df, genre, revenue)
  # Mean critics score per genre. The genre is converted to a factor once so
  # that the aggregate rows and the boxplot boxes share the same level order.
  genre_levels <- factor(df$genre)
  mean_values2 <- aggregate(
    df$critics_score,
    by = list(genre_levels),
    FUN = mean,
    na.rm = TRUE
  )

  # Box plot of critics score by genre with the group means overlaid in red.
  boxplot(df$critics_score ~ genre_levels,
          main = "Avg Critic Score",
          xlab = "Genre", ylab = "Critics Score",
          col = "lightblue")
  # boxplot() draws the boxes at x = 1, 2, ..., so each mean is placed at the
  # integer code of its genre level. Passing the genre names themselves as x
  # coerced them to NA (with a warning) and no mean point was drawn.
  points(
    as.integer(mean_values2$Group.1),
    mean_values2$x,
    col = "red",
    pch = 19
  )

  #### Finding p-values of different variables
  # Revenue modelled on budget and critics score.
  pvalueGC <- lm(revenue ~ budget + critics_score, data = df)
  pvalueGC |> summary()
## 
## Call:
## lm(formula = revenue ~ budget + critics_score, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442.56  -59.57  -13.76   29.49 1443.35 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -74.9864    15.1206  -4.959 9.08e-07 ***
## budget          2.7930     0.1336  20.900  < 2e-16 ***
## critics_score   1.7652     0.2662   6.630 7.13e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 148.2 on 639 degrees of freedom
## Multiple R-squared:  0.4966, Adjusted R-squared:  0.495 
## F-statistic: 315.2 on 2 and 639 DF,  p-value: < 2.2e-16
  # Revenue modelled on budget and genre.
  pvalueBG <- lm(revenue ~ budget + genre, data = df)
  pvalueBG |> summary()
## 
## Call:
## lm(formula = revenue ~ budget + genre, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -417.45  -59.02  -21.58   22.71 1440.39 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.0522    19.8042   0.154    0.878    
## budget                           3.0758     0.1321  23.284   <2e-16 ***
## genreAnimation                  32.8906    54.7177   0.601    0.548    
## genreArt House & International  17.1758    45.2598   0.379    0.704    
## genreComedy                      9.1369    25.2459   0.362    0.718    
## genreDocumentary                13.9818    28.9046   0.484    0.629    
## genreDrama                       9.5740    21.0062   0.456    0.649    
## genreHorror                     52.1654    37.2634   1.400    0.162    
## genreMusical & Performing Arts   3.1220    48.2556   0.065    0.948    
## genreMystery & Suspense        -21.2248    27.6372  -0.768    0.443    
## genreOther                      27.3519    45.2633   0.604    0.546    
## genreScience Fiction & Fantasy -53.9768    54.6250  -0.988    0.323    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 153.6 on 630 degrees of freedom
## Multiple R-squared:  0.4673, Adjusted R-squared:  0.458 
## F-statistic: 50.24 on 11 and 630 DF,  p-value: < 2.2e-16
  # Critics score modelled on budget.
  pvalueCB <- lm(critics_score ~ budget, data = df)
  pvalueCB |> summary()
## 
## Call:
## lm(formula = critics_score ~ budget, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.217 -15.778  -1.213  16.136  51.157 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.52745    1.16659  41.598  < 2e-16 ***
## budget       0.15793    0.01884   8.385 3.24e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.01 on 640 degrees of freedom
## Multiple R-squared:  0.09898,    Adjusted R-squared:  0.09757 
## F-statistic:  70.3 on 1 and 640 DF,  p-value: 3.238e-16
  # Revenue modelled on budget.
  pvalueRB <- lm(revenue ~ budget, data = df)
  pvalueRB |> summary()
## 
## Call:
## lm(formula = revenue ~ budget, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -410.24  -57.07  -22.37   18.72 1443.15 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   10.672      8.116   1.315    0.189    
## budget         3.072      0.131  23.441   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 153.1 on 640 degrees of freedom
## Multiple R-squared:  0.462,  Adjusted R-squared:  0.4611 
## F-statistic: 549.5 on 1 and 640 DF,  p-value: < 2.2e-16
  # Facebook likes modelled on budget.
  pvalueFB <- lm(Facebook_Likes ~ budget, data = df)
  pvalueFB |> summary()
## 
## Call:
## lm(formula = Facebook_Likes ~ budget, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -210319  -37660  -19354   26994  447331 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22502.62    4312.14   5.218 2.44e-07 ***
## budget       1009.12      69.62  14.494  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81360 on 640 degrees of freedom
## Multiple R-squared:  0.2471, Adjusted R-squared:  0.246 
## F-statistic: 210.1 on 1 and 640 DF,  p-value: < 2.2e-16
  # Revenue modelled on genre.
  pvalueRG <- lm(revenue ~ genre, data = df)
  pvalueRG |> summary()
## 
## Call:
## lm(formula = revenue ~ genre, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -207.02 -115.07  -71.36   16.13 1928.59 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    129.1192    25.9637   4.973 8.51e-07 ***
## genreAnimation                 107.4334    74.4495   1.443    0.150    
## genreArt House & International  -2.3810    61.6761  -0.039    0.969    
## genreComedy                     -0.8763    34.4038  -0.025    0.980    
## genreDocumentary                -7.1856    39.3759  -0.182    0.855    
## genreDrama                      10.4656    28.6302   0.366    0.715    
## genreHorror                     59.6278    50.7861   1.174    0.241    
## genreMusical & Performing Arts   8.2887    65.7691   0.126    0.900    
## genreMystery & Suspense          3.4954    37.6402   0.093    0.926    
## genreOther                      50.8561    61.6761   0.825    0.410    
## genreScience Fiction & Fantasy -61.9164    74.4495  -0.832    0.406    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.3 on 631 degrees of freedom
## Multiple R-squared:  0.008848,   Adjusted R-squared:  -0.00686 
## F-statistic: 0.5633 on 10 and 631 DF,  p-value: 0.8443
  # Revenue modelled on critics score alone.
  pvalueRC <- lm(data = df, revenue ~ critics_score)
  # Summarise the model fitted on the line above. The earlier version called
  # summary(pvalueRG) here, so the revenue ~ critics_score results were never
  # shown and the revenue ~ genre summary was printed a second time.
  summary(pvalueRC)
## NOTE(review): re-run this chunk to regenerate its output. The output that
## was rendered here before was a repeat of the revenue ~ genre summary.
  ##Testing other boxplots
  # Mean budget for each distinct revenue value. The revenue is converted to a
  # factor once so the aggregate rows and the boxes share the same level order.
  # NOTE(review): revenue looks continuous, so most groups presumably hold a
  # single row -- confirm that a box plot is the intended view here.
  revenue_levels <- factor(df$revenue)
  mean_values3 <- aggregate(df$budget, by = list(revenue_levels), FUN = mean)
  # Box plot of budget by revenue with the group means overlaid in red. The
  # title and axis labels used to be copied from the critics-score/genre plot
  # and described the wrong variables.
  boxplot(df$budget ~ revenue_levels,
          main = "Budget by Revenue",
          xlab = "Revenue (In Millions)", ylab = "Budget (In Millions)",
          col = "lightblue")
  # Boxes sit at x = 1, 2, ..., so the means are placed by level index rather
  # than at the raw revenue values, which fall on a different x scale.
  points(
    as.integer(mean_values3$Group.1),
    mean_values3$x,
    col = "red",
    pch = 19
  )

  # Revenue modelled on budget. The response must not also be listed as a
  # predictor: lm() dropped the extra `revenue` term with warnings, so the
  # formula now states the model that was actually being fitted.
  # NOTE(review): this is the same model as pvalueRB -- confirm whether a
  # different predictor was intended.
  valueBR <- lm(data = df, revenue ~ budget)