# Setup ----
# Load the packages used by this script, load the `movies` dataset and inspect
# it. The `##` lines look like console output captured when the script was
# rendered; they are kept as comments, each on its own line, so they can no
# longer swallow the statement that follows them (previously `library(naniar)`
# and `glimpse(movies)` were hidden inside such comments and never ran).
library(readr)
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v dplyr   1.0.2
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(naniar)
library(ggplot2movies)

data(movies)

# Open the dataset's help page (interactive use).
?movies
## starting httpd help server ...
##  done

# Column overview of the dataset.
glimpse(movies)
## Rows: 58,788
## Columns: 24
## $ title       <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month", "$4...
## $ year        <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 1987, 1...
## $ length      <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10, 10,...
## $ budget      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ rating      <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, 5.4, ...
## $ votes       <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44, 11,...
## $ r1          <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2          <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0, 0.0,...
## $ r3          <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4          <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4.5, 14...
## $ r5          <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, 4.5, ...
## $ r6          <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0, 44.5...
## $ r7          <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, 14.5,...
## $ r8          <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4.5, 4...
## $ r9          <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4.5, 14...
## $ r10         <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24.5, 4....
## $ mpaa        <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "", ""...
## $ Action      <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1...
## $ Animation   <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Comedy      <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1...
## $ Drama       <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
## $ Romance     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ Short       <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0...

# Filter to action movies only
# Keep only the rows whose Action flag is set.
action_movies <- filter(movies, Action == 1)

# Basic plot showing the ramp up. Looks like they really take off in the
# mid 80's.... but need to see relative to total.
ggplot(data = action_movies, mapping = aes(x = year)) +
  geom_bar()

# Making total movie count so I can normalize percentage
# Yearly totals ----
# Tag every movie with a count of 1 and attach the number of movies released in
# the same year, so genre counts can be expressed as a share of that total.
# The grouping is dropped once the per-year total has been computed so the
# result is a plain (ungrouped) table; values and row order are unchanged.
all_movies <- movies %>%
  mutate(movie_count = 1) %>%
  group_by(year) %>%
  mutate(total_movies_per_year = sum(movie_count)) %>%
  ungroup() %>%
  arrange(desc(total_movies_per_year))

# Normalizing action movies as percent of total: within each year, the number
# of action movies divided by that year's total movie count. Every action-movie
# row of a given year carries the same `percent_action` value.
action_movies <- all_movies %>%
  filter(Action == 1) %>%
  group_by(year) %>%
  mutate(percent_action = sum(movie_count) / total_movies_per_year) %>%
  ungroup()

# Plotting percent action movie.... shows that it actually tapers off quite a
# bit in the mid 80s. So movie production overall must have just increased in
# the late 80s. Need to show this.
ggplot(action_movies, aes(year, percent_action)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Let's get a smaller data set at the level of analysis I need for this just to
# make sure these formulas are right
# One-row-per-year tables ----
# Each statement sits on its own lines: previously the assignments for
# `action_movies_grouped_year` and `movies_grouped_year_joined` were fused onto
# the end of `##` output comments, so those objects were never created and the
# continuation lines ran as orphaned calls.

# Total number of movies per year.
movies_grouped_year <- all_movies %>%
  group_by(year) %>%
  summarize(year_movie_count = sum(movie_count))
## `summarise()` ungrouping output (override with `.groups` argument)

# Number of action movies per year (years with none are absent here).
action_movies_grouped_year <- all_movies %>%
  filter(Action == 1) %>%
  group_by(year) %>%
  summarize(year_movie_count_action = sum(movie_count))
## `summarise()` ungrouping output (override with `.groups` argument)

# Join the two tables; years without any action movie get a count of 0 before
# the share is computed.
movies_grouped_year_joined <- movies_grouped_year %>%
  left_join(action_movies_grouped_year, by = c("year")) %>%
  replace_na(list(year_movie_count_action = 0)) %>%
  mutate(percent_action = year_movie_count_action / year_movie_count)
  
# Yearly number of action movies, drawn as a line.
ggplot(
  data = movies_grouped_year_joined,
  mapping = aes(x = year, y = year_movie_count_action)
) +
  geom_line()

# Yearly share of action movies, smoothed.
ggplot(
  data = movies_grouped_year_joined,
  mapping = aes(x = year, y = percent_action)
) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# playing around ggplot types without modifying the data... maybe my data
# isnt tidy
# Plot-type experiments on `all_movies` ----
# Five separate plots. They were previously fused end-to-end on shared lines
# (e.g. `geom_line()ggplot(...)`), which is a syntax error; each plot is now
# its own statement.

# Movies per year as lines, colored by the Action flag.
ggplot(all_movies, aes(year, total_movies_per_year, color = factor(Action))) +
  geom_line()

# Same plot with semi-transparent lines.
ggplot(all_movies, aes(year, total_movies_per_year, color = factor(Action))) +
  geom_line(alpha = 0.6)

# Counts per year, action vs. non-action side by side.
ggplot(all_movies, aes(x = year, fill = factor(Action))) +
  geom_bar(position = "dodge")

# Proportion of action vs. non-action per year.
ggplot(all_movies, aes(x = year, fill = factor(Action))) +
  geom_bar(position = "fill")

# Same proportions using one-year histogram bins.
ggplot(all_movies, aes(x = year, fill = factor(Action))) +
  geom_histogram(binwidth = 1, position = "fill", alpha = 0.8)

# let's zoom in on where the volume ramps up
# Movies from 1960 onward ----
current_movies <- all_movies %>%
  filter(year >= 1960)

# Proportion of action vs. non-action per year, 1960 onward.
ggplot(current_movies, aes(x = year, fill = factor(Action))) +
  geom_histogram(binwidth = 1, position = "fill", alpha = 0.8)

# All action movies, with the per-year totals from `all_movies` attached.
# (This assignment used to be fused onto the end of the histogram call above,
# which made the line a syntax error.)
all_action_movies <- all_movies %>%
  filter(Action == 1)

# Rating of every action movie by release year, with a smoothed trend.
ggplot(all_action_movies, aes(x = year, y = rating)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# shows that the ratings trailed off in the 90s but coming back up....
# I wonder if ratings are related to movie budget
# Budget vs. rating, all action movies ----
# Three separate plots. The second `ggplot()` call used to be fused onto the
# end of a `## Warning` comment, so its layers ran without a plot object.
# Rows with a missing budget are dropped by ggplot2 (see the warnings below).

# Linear fit.
ggplot(all_action_movies, aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).
## Warning: Removed 3850 rows containing missing values (geom_point).

# Default smoother.
ggplot(all_action_movies, aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).
## Warning: Removed 3850 rows containing missing values (geom_point).

# ratings look somewhat related to budget for middle of the pack ratings but
# the exception movies actually have smaller budgets.
# Maybe ratings are better when it is an action + drama movie...
ggplot(all_action_movies, aes(x = rating, y = budget, color = factor(Drama))) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 3850 rows containing non-finite values (stat_smooth).
## Warning: Removed 3850 rows containing missing values (geom_point).

# glimpse(all_action_movies)
# Filter to year >= 1990
# Action movies from 1990 onward ----
# Repeats the rating/budget plots on the recent subset. The second budget plot
# used to have its `ggplot()` call fused onto the end of a `## Warning`
# comment, so its layers ran without a plot object; it is restored here.
current_action_movies <- all_action_movies %>%
  filter(year >= 1990)

# Rating by release year with a smoothed trend.
ggplot(current_action_movies, aes(x = year, y = rating)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# shows that the ratings trailed off in the 90s but coming back up....
# I wonder if ratings are related to movie budget
ggplot(current_action_movies, aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).
## Warning: Removed 1717 rows containing missing values (geom_point).

# Same scatter with the default smoother.
ggplot(current_action_movies, aes(x = rating, y = budget)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).
## Warning: Removed 1717 rows containing missing values (geom_point).

# ratings look somewhat related to budget for middle of the pack ratings but
# the exception movies actually have smaller budgets.
# Maybe ratings are better when it is an action + drama movie...
ggplot(
  current_action_movies,
  aes(x = rating, y = budget, color = factor(Drama))
) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1717 rows containing non-finite values (stat_smooth).
## Warning: Removed 1717 rows containing missing values (geom_point).

# Let's look at this a different way
# Proportion of drama vs. non-drama among the recent action movies at each
# rating value.
# Disabled layer kept for reference:
#   geom_point(alpha=0.2)
ggplot(
  data = current_action_movies,
  mapping = aes(x = rating, fill = factor(Drama))
) +
  geom_bar(position = "fill")

#### Appendix - stuff I didn’t use
# Movies sorted by release year.
movies_ordered <- movies %>% arrange(year)
# view(movies_ordered)

# Table of action vs. non-action movie counts per year.
movies_grouped_action <- movies %>% group_by(year, Action)
movies_sum_action <- movies_grouped_action %>% summarise(n_movies = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# view(movies_sum_action)

# Table of drama vs. non-drama movie counts per year.
movies_grouped_drama <- movies %>% group_by(year, Drama)
movies_sum_drama <- movies_grouped_drama %>% summarise(n_movies = n())
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# view(movies_sum_drama)

# Stack the two tables and sort by year to compare them.
movies_sum_bind <- arrange(bind_rows(movies_sum_action, movies_sum_drama), year)
# view(movies_sum_bind)

# Data does not look right, so playing around with the grouping: count movies
# for every year / Action / Drama combination instead.
movies_grouped_multi <- movies %>% group_by(year, Action, Drama)
movies_sum_multi <- movies_grouped_multi %>% summarise(n_movies = n())
## `summarise()` regrouping output by 'year', 'Action' (override with `.groups` argument)
# view(movies_sum_multi)