Setting Up the Project

library(readr)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v dplyr   1.0.2
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(naniar)



# Attach the package that provides the `movies` data set and load the data
# set into the workspace.
library(ggplot2movies)
data(movies)
# Open the help page for the data set (this starts the local help server, as
# the output below shows).
?movies
## starting httpd help server ...
##  done
# Print a compact, column-by-column overview of the data set.
glimpse(movies)
## Rows: 58,788
## Columns: 24
## $ title       <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month", "$4...
## $ year        <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 1987, 1...
## $ length      <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10, 10,...
## $ budget      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ rating      <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, 5.4, ...
## $ votes       <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44, 11,...
## $ r1          <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2          <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0, 0.0,...
## $ r3          <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4          <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4.5, 14...
## $ r5          <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, 4.5, ...
## $ r6          <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0, 44.5...
## $ r7          <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, 14.5,...
## $ r8          <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4.5, 4...
## $ r9          <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4.5, 14...
## $ r10         <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24.5, 4....
## $ mpaa        <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "", ""...
## $ Action      <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1...
## $ Animation   <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Comedy      <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1...
## $ Drama       <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
## $ Romance     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Short       <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0...

QUESTION: What is the growth pattern of action movies? - Trial 1

# Keep only the rows flagged as action movies.
action_movies <- filter(movies, Action == 1)

# Bar chart of the number of action movies per year. The count ramps up and
# looks like it really takes off in the mid-1980s, but this still needs to be
# seen relative to the total number of movies.
ggplot(data = action_movies, mapping = aes(x = year)) +
  geom_bar()

# Add the columns needed to express action movies as a share of all movies:
#   movie_count           - constant 1 on every row (summed in later steps)
#   total_movies_per_year - number of rows that share the row's `year`
# Rows are then sorted so the years with the most movies come first.
#
# The year grouping is only needed to compute the per-year total, so it is
# dropped again with ungroup(). Without it `all_movies` (and everything
# derived from it with filter()) would stay grouped by year and later verbs
# would silently operate per year.
all_movies <- movies %>%
  mutate(movie_count = 1) %>%
  group_by(year) %>%
  mutate(total_movies_per_year = sum(movie_count)) %>%
  ungroup() %>%
  arrange(desc(total_movies_per_year))

# Action movies as a share of all movies: within each year, the number of
# action rows divided by that year's total. mutate() keeps one row per movie,
# so the yearly share is repeated on every action-movie row of that year.
action_movies <- mutate(
  group_by(filter(all_movies, Action == 1), year),
  percent_action = sum(movie_count) / total_movies_per_year
)

# Smoothed share of action movies over time. It shows that the share actually
# tapers off quite a bit in the mid-1980s, so overall movie production must
# have increased in the late 1980s. Need to show this.
ggplot(data = action_movies, mapping = aes(x = year, y = percent_action)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

QUESTION: What is the growth pattern of action movies? - Trial 2

# Build a smaller data set at the level of analysis needed here (one row per
# year), just to make sure the formulas are right.

# Total number of movies per year.
movies_grouped_year <- summarize(
  group_by(all_movies, year),
  year_movie_count = sum(movie_count)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Number of action movies per year.
action_movies_grouped_year <- summarize(
  group_by(filter(all_movies, Action == 1), year),
  year_movie_count_action = sum(movie_count)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Attach the action counts to the yearly totals. Years with no action movie
# have no match in the join, so their missing count is replaced by 0 before
# the share is computed.
movies_grouped_year_joined <- movies_grouped_year %>%
  left_join(action_movies_grouped_year, by = "year") %>%
  replace_na(list(year_movie_count_action = 0)) %>%
  mutate(percent_action = year_movie_count_action / year_movie_count)
  


# Number of action movies per year, now drawn from the one-row-per-year table.
ggplot(
  data = movies_grouped_year_joined,
  mapping = aes(x = year, y = year_movie_count_action)
) +
  geom_line()

# Share of action movies per year, smoothed.
ggplot(
  data = movies_grouped_year_joined,
  mapping = aes(x = year, y = percent_action)
) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Trying out different ggplot geoms without modifying the data first (the
# data may not be tidy enough for these).

# Yearly movie total as a line, coloured by the Action flag.
ggplot(
  data = all_movies,
  mapping = aes(x = year, y = total_movies_per_year, color = factor(Action))
) +
  geom_line()

# Same line plot, drawn semi-transparently.
ggplot(
  data = all_movies,
  mapping = aes(x = year, y = total_movies_per_year, color = factor(Action))
) +
  geom_line(alpha = 0.6)

# Movie counts per year with action / non-action bars side by side.
ggplot(data = all_movies, mapping = aes(x = year, fill = factor(Action))) +
  geom_bar(position = "dodge")

# Movie counts per year with bars stretched to full height, i.e. the
# action / non-action split as a proportion.
ggplot(data = all_movies, mapping = aes(x = year, fill = factor(Action))) +
  geom_bar(position = "fill")

# The same proportional view as a histogram with one-year bins.
ggplot(data = all_movies, mapping = aes(x = year, fill = factor(Action))) +
  geom_histogram(binwidth = 1, position = "fill", alpha = 0.8)

# Zoom in on the period where the volume ramps up: keep 1960 and later.
current_movies <- filter(all_movies, year >= 1960)

# Proportional action / non-action split per year for that period.
ggplot(
  data = current_movies,
  mapping = aes(x = year, fill = factor(Action))
) +
  geom_histogram(binwidth = 1, position = "fill", alpha = 0.8)

Let’s look at ratings of action movies

# Action movies only, taken from `all_movies` so the per-year columns added
# there are carried along.
all_action_movies <- filter(all_movies, Action == 1)

# Rating of every action movie against its release year, plus a smoothed
# trend line.
ggplot(data = all_action_movies, mapping = aes(x = year, y = rating)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# The plot above shows that the ratings trailed off in the 90s but are coming
# back up. I wonder if ratings are related to movie budget.

# Budget against rating with a straight-line fit.
ggplot(data = all_action_movies, mapping = aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).
## Warning: Removed 3850 rows containing missing values (geom_point).

# Same plot with the default smoother instead of a straight line.
ggplot(data = all_action_movies, mapping = aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).

## Warning: Removed 3850 rows containing missing values (geom_point).

# Ratings look somewhat related to budget for middle-of-the-pack ratings, but
# the exceptional movies actually have smaller budgets.

# Maybe ratings are better when it is an action + drama movie: colour the
# points and the smoother by the Drama flag.
ggplot(
  data = all_action_movies,
  mapping = aes(x = rating, y = budget, color = factor(Drama))
) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).

## Warning: Removed 3850 rows containing missing values (geom_point).

#glimpse(all_action_movies)

Zeroing in on “Current Action Movies”

# Restrict to action movies released in 1990 or later (year >= 1990).
current_action_movies <- filter(all_action_movies, year >= 1990)

# Rating against release year for this subset, plus a smoothed trend line.
ggplot(data = current_action_movies, mapping = aes(x = year, y = rating)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# The plot above shows that the ratings trailed off in the 90s but are coming
# back up. I wonder if ratings are related to movie budget.

# Budget against rating with a straight-line fit.
ggplot(data = current_action_movies, mapping = aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).
## Warning: Removed 1717 rows containing missing values (geom_point).

# Same plot with the default smoother instead of a straight line.
ggplot(data = current_action_movies, mapping = aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).

## Warning: Removed 1717 rows containing missing values (geom_point).

# Ratings look somewhat related to budget for middle-of-the-pack ratings, but
# the exceptional movies actually have smaller budgets.

# Maybe ratings are better when it is an action + drama movie: colour the
# points and the smoother by the Drama flag.
ggplot(
  data = current_action_movies,
  mapping = aes(x = rating, y = budget, color = factor(Drama))
) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).

## Warning: Removed 1717 rows containing missing values (geom_point).

# Let's look at this a different way: for each rating value, the proportion of
# action movies that are also dramas.
# (A geom_point(alpha=0.2) layer was tried here and is left disabled.)
ggplot(
  data = current_action_movies,
  mapping = aes(x = rating, fill = factor(Drama))
) +
  geom_bar(position = "fill")

#### Appendix - stuff I didn’t use

# All movies sorted by release year.
movies_ordered <- movies %>%
  arrange(year)
#view(movies_ordered)

# Table for action movies against total movies: number of movies per year,
# split by the Action flag.
movies_grouped_action <- movies %>%
  group_by(year, Action)
movies_sum_action <- movies_grouped_action %>%
  summarise(n_movies = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
#view(movies_sum_action)

# Table for drama movies against total movies: number of movies per year,
# split by the Drama flag.
movies_grouped_drama <- movies %>%
  group_by(year, Drama)
movies_sum_drama <- movies_grouped_drama %>%
  summarise(n_movies = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
#view(movies_sum_drama)

# Stack the action and drama tables and sort by year to compare them.
# NOTE(review): the action table has an Action column and the drama table a
# Drama column, so the stacked rows presumably carry NA in the column they
# lack - likely why the result "does not look right" below.
movies_sum_bind <- arrange(
  bind_rows(movies_sum_action, movies_sum_drama),
  year
)
#view(movies_sum_bind)

# Data does not look right, so playing around with the grouping: count movies
# per year and per Action/Drama combination instead.
movies_grouped_multi <- movies %>%
  group_by(year, Action, Drama)
movies_sum_multi <- movies_grouped_multi %>%
  summarise(n_movies = n())
## `summarise()` regrouping output by 'year', 'Action' (override with `.groups` argument)
#view(movies_sum_multi)