#loading libraries and data into the file
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)

# Read the movies CSV from the local data folder into a tibble.
bechdel_data_movies <- read_csv(
  file = "C:/Users/Lauren/Documents/Stats Data/movies.csv"
)
## Rows: 1794 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (24): imdb, title, test, clean_test, binary, domgross, intgross, code, d...
## dbl  (7): year, budget, budget_2013, period_code, decade_code, metascore, im...
## num  (1): imdb_votes
## lgl  (2): response, error
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the inflation-adjusted money columns to numeric. The two gross
# columns are read as character, so entries that are not valid numbers become
# NA here (hence the coercion warnings). as.numeric() has no `na.rm` argument,
# so none is passed: the NAs are kept in the data and have to be handled by
# the downstream steps.
bechdel_data_movies$domgross_2013 <- as.numeric(bechdel_data_movies$domgross_2013)
## Warning: NAs introduced by coercion
bechdel_data_movies$intgross_2013 <- as.numeric(bechdel_data_movies$intgross_2013)
## Warning: NAs introduced by coercion
# budget_2013 is already parsed as a double by read_csv(); this is a safeguard.
bechdel_data_movies$budget_2013 <- as.numeric(bechdel_data_movies$budget_2013)


# Derive profitability (2013 USD): domestic plus international gross, minus
# the budget. Rows with an NA in any of the three inputs get NA.
# NOTE(review): if intgross_2013 already includes the domestic gross, this sum
# double-counts it -- confirm against the data dictionary.
bechdel_data_movies <- mutate(
  bechdel_data_movies,
  profitability = domgross_2013 + intgross_2013 - budget_2013
)

The purpose of this week’s data dive is for you to get experience running ANOVA tests and building regression models.

Your RMarkdown notebook for this data dive should contain the following:

# Check the ANOVA assumptions: side-by-side boxplots of profitability for each
# level of `binary`, to compare the spread of the groups.
profitability_chart <- ggplot(
  data = bechdel_data_movies,
  mapping = aes(x = binary, y = profitability, color = binary)
) +
  geom_boxplot() +
  labs(
    title = "Verifying Homogeneity",
    x = "Passes Bechdel Test",
    y = "Profitability in 2013 USD"
  )
profitability_chart
## Warning: Removed 18 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The boxplots show a comparable spread for both categories, so the equal-variance assumption looks reasonable and we will continue with the ANOVA test.

# One-way ANOVA: does mean profitability differ between the `binary` groups?
# var.equal = TRUE requests the classic equal-variance F test rather than the
# Welch correction.
anova_1_bechdel <- oneway.test(
  formula = profitability ~ binary,
  data = bechdel_data_movies,
  var.equal = TRUE
)

anova_1_bechdel
## 
##  One-way analysis of means
## 
## data:  profitability and binary
## F = 13.903, num df = 1, denom df = 1774, p-value = 0.0001985
# Checking for linearity: plot the response (profitability) against the
# predictor (budget_2013) on untransformed axes, matching the model
# profitability ~ budget_2013 that is fit afterwards. A log-scaled axis would
# bend a straight-line relationship and hide what this check is looking for.
linearity_check <- ggplot(
  bechdel_data_movies,
  aes(x = budget_2013, y = profitability, color = binary)
) +
  geom_point() +
  labs(
    title = "Movie Profitability and Budget",
    x = "Budget (2013 USD)",
    y = "Profitability (2013 USD)"
  )
linearity_check
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

The relationship is only roughly linear, but it is close enough to move ahead with a linear regression.

# Simple linear regression of profitability on the 2013-adjusted budget;
# printing the fit shows the call and the estimated coefficients.
linreg <- lm(
  formula = profitability ~ budget_2013,
  data = bechdel_data_movies
)
linreg
## 
## Call:
## lm(formula = profitability ~ budget_2013, data = bechdel_data_movies)
## 
## Coefficients:
## (Intercept)  budget_2013  
##   6.367e+07    3.117e+00
# Overlay the fitted regression line on a scatter plot of the data.
# The line from `linreg` is profitability = intercept + slope * budget_2013,
# so it is only meaningful on a plot with budget_2013 on x and profitability
# on y, both on linear scales. The plot is built here on those axes, and the
# coefficients are read from the model instead of being copied (rounded) from
# the printed output.
linreg_coefs <- coef(linreg)
checking_lin <- ggplot(
  bechdel_data_movies,
  aes(x = budget_2013, y = profitability, color = binary)
) +
  geom_point() +
  geom_abline(
    intercept = linreg_coefs[["(Intercept)"]],
    slope = linreg_coefs[["budget_2013"]],
    color = "purple"
  ) +
  labs(
    title = "Movie Profitability and Budget",
    x = "Budget (2013 USD)",
    y = "Profitability (2013 USD)"
  )
checking_lin
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).

For each of the above tasks, you must explain to the reader what insight was gathered, its significance, and any further questions you have which might need to be further investigated.