#load the data libraries - remove or add as needed
library(tidyverse) #tools form data science, included ggplot2, dplyr, tidyr, readr, tibble, stringr, and forcats as core libraries.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(scales) #loaded to address viz issues, including currency issues
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
options(scipen=999) #disable scientific notation since high values are used
library(broom)
library(lindia)
#clean up the work space
rm(list = ls())
#load the adjusted version of the csv from the local desktop
t_box_office <- read_delim("C:/Users/danjh/Grad School/H510 Stats for DS/Datasets/box_office_data_2000_24_adj.csv", delim = ",")
## Rows: 5000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Release Group, Genres, Rating, Original_Language, Production_Count...
## dbl (10): Rank, $Worldwide, $Domestic, Domestic %, $Foreign, Foreign %, Year...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#create a copy of the data set for this activity
movies <- t_box_office
#cat(colnames(movies),sep = ", ")
#cleanup of column names to avoid constant issues with special chars and innaccurate names
# Rename specific columns
colnames(movies)[which(colnames(movies) == "Release Group")] <- "MovieName"
colnames(movies)[which(colnames(movies) == "$Worldwide")] <- "WorldwideRevenue"
colnames(movies)[which(colnames(movies) == "$Domestic")] <- "DomesticRevenue"
colnames(movies)[which(colnames(movies) == "$Foreign")] <- "ForeignRevenue"
colnames(movies)[which(colnames(movies) == "Domestic %")] <- "DomesticPercentage"
colnames(movies)[which(colnames(movies) == "Foreign %")] <- "ForeignPercentage"
colnames(movies)[which(colnames(movies) == "Rank")] <- "RankForYear"
#create a version of the data set which excludes worldwide revenue outliers
# Calculate IQR for WorldwideRevenue
Q1 <- quantile(movies$WorldwideRevenue, 0.25) # First quartile
Q3 <- quantile(movies$WorldwideRevenue, 0.75) # Third quartile
IQR <- Q3 - Q1
# Define lower and upper bounds
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Filter out outliers
movies_clean <- movies[movies$WorldwideRevenue >= lower_bound & movies$WorldwideRevenue <= upper_bound, ]
# View updated column names
cat(colnames(movies_clean), sep = ", ") #list column names for reference
## RankForYear, MovieName, WorldwideRevenue, DomesticRevenue, DomesticPercentage, ForeignRevenue, ForeignPercentage, Year, Genres, Rating, Vote_Count, Original_Language, Production_Countries, Prime_Genre, Prime_Production_Country, Rating_scale, Rating_of_10
The purpose of this week’s data dive is for you to think critically about interpreting and diagnosing regression models.
Your RMarkdown notebook for this data dive should contain the following:
Refer to the simple linear regression model you built last week. Include 1-3 more variables into your regression model.
Try out either an interaction term or a binary term to start.
Consider adding other integer or continuous variables.
For each new variable you try, explain why you should include it, or not. E.g., are there any issues with multicollinearity?
Your model for this data dive should have 2-4 terms.
Evaluate this model.
At the very least, use the 5 diagnostic plots discussed in class to identify any issues with your model.
For each plot, point out any indications of issues with the model. Otherwise, explain how the plot supports the claim that an assumption is met.
Try to measure the severity of any issues as well as the level of confidence you have in an assumption being met.
For each of the above tasks, you must explain to the reader what insight was gathered, its significance, and any further questions you have which might need to be further investigated.
Select additional 1-3 variables to add to model. Explain why it should or should not be included
Thoughts about the data - We already have vote_count, which gives us a general idea of movie’s popularity. I think it would be interesting to add the movie’s score (rating_of_10), how the movie did in foreign markets, (ForeignRevenue) and whether the movie being released post-covid (IsPostCovid) makes any difference.
Why to include them:
vote_count : Helps indicate the popularity of
a film
rating_of_10 : Helps establish how well
recieved the film was
ForeignRevenue: Allows us to look at
International performance
IsPostCovid: Might help us determine if there
is any market change after COVID
#add the IsPostCovid column
movies_clean$IsPostCovid <- ifelse(movies_clean$Year >= 2020, 1, 0)
Previous analysis showed there are NA vlaues in the data set, further analysis shows these are all in the same rows and not scattered through the data. Cleaning this at the table level should be sufficient.
#Create a new dataset that excludes NA values
movies_clean_without_na <- na.omit(movies_clean)
#look at a correlation matrix to check on correlation
#choose the columns of interest
selected_columns <- movies_clean[, c("WorldwideRevenue", "Vote_Count", "Rating_of_10", "ForeignRevenue", "IsPostCovid")]
#Create the matrix
cor_matrix <- cor(selected_columns, use = "complete.obs")
#print the matrix
print(cor_matrix)
## WorldwideRevenue Vote_Count Rating_of_10 ForeignRevenue
## WorldwideRevenue 1.00000000 0.5400855 0.07987056 0.86226554
## Vote_Count 0.54008545 1.0000000 0.26597540 0.38367526
## Rating_of_10 0.07987056 0.2659754 1.00000000 0.07984933
## ForeignRevenue 0.86226554 0.3836753 0.07984933 1.00000000
## IsPostCovid -0.18346073 -0.1527189 0.09787130 -0.11959521
## IsPostCovid
## WorldwideRevenue -0.1834607
## Vote_Count -0.1527189
## Rating_of_10 0.0978713
## ForeignRevenue -0.1195952
## IsPostCovid 1.0000000
regressionModel <- lm(WorldwideRevenue ~ Rating_of_10 + ForeignRevenue + IsPostCovid + Vote_Count,
data = movies_clean_without_na)
summary(regressionModel)
##
## Call:
## lm(formula = WorldwideRevenue ~ Rating_of_10 + ForeignRevenue +
## IsPostCovid + Vote_Count, data = movies_clean_without_na)
##
## Residuals:
## Min 1Q Median 3Q Max
## -149751102 -16060574 -5618675 10727564 167770666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30934916.71715 2930593.64369 10.556 < 0.0000000000000002 ***
## Rating_of_10 -3067606.42596 460380.95186 -6.663 0.0000000000302 ***
## ForeignRevenue 1.14326 0.01116 102.446 < 0.0000000000000002 ***
## IsPostCovid -6932194.99351 1029671.68315 -6.732 0.0000000000189 ***
## Vote_Count 6490.88045 200.80165 32.325 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26140000 on 4242 degrees of freedom
## Multiple R-squared: 0.7998, Adjusted R-squared: 0.7996
## F-statistic: 4237 on 4 and 4242 DF, p-value: < 0.00000000000000022
#create the Residuls vs. Fitted plot
gg_resfitted(regressionModel)
Does the plot identify issues with the model or supports the claim that an assumption is met?
How severe is the issue or validity of the assumption?
gg_resX(regressionModel)
vs. Rating of 10 - there is no distinct pattern and the spread of the residual is not significantly spread out supporting the assumption of linearity.
vs.ForeignRevenue - there is no distinct pattern but the spread of the residuals is widely spread suggesting mild heroscedacity which is a moderate issue.
vs. IsPostCovid - Since this is a binary variable it only has two clusters and there is no pattern
vs. Vote_count - there is no distinct pattern and the spread of the residual is not significantly spread out supporting the assumption of linearity.
gg_reshist(regressionModel)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gg_qqplot(regressionModel)
gg_cooksd(regressionModel)