#install.packages("ggplot2movies")
library(ggplot2movies)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data(movies)
?movies
str(movies)
## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : chr [1:58788] "" "" "" "" ...
## $ Action : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...
We see that movies is a data frame with 58788 rows/observations and 24 variables. The variables include title, year (of release), budget, length (in minutes), rating (averyage IMDB user rating), votes (number of IMDB users who rated it), r1-10 (multiplied to the nearest 10% of users who rated the movie a 1, 2, etc.), mpaa (MPAA rating), as well as binary variables for action, animation, comedy, drama, documentary, romance, and short genres to denote whether the movie belongs to that genre.
First I want to explore: how does movie budget vary by genre? I have some hypotheses, such as newer/action movies having higher budgets and older/shorts having smaller budgets, but let’s see what the data says. Note: there a lot of NAs in the budget variable! So first, I’ll subset my data for observations that have their budgets recorded in this data frame.
moviesB<- movies%>%
na.omit(movies$budget)
str(moviesB)
## tibble [5,215 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:5215] "'G' Men" "'Manos' the Hands of Fate" "'Til There Was You" ".com for Murder" ...
## $ year : int [1:5215] 1935 1966 1997 2002 1999 2002 1997 1989 2001 2000 ...
## $ length : int [1:5215] 85 74 113 96 97 98 94 117 103 100 ...
## $ budget : int [1:5215] 450000 19000 23000000 5000000 16000000 1100000 140000 200000 200000 85000000 ...
## $ rating : num [1:5215] 7.2 1.6 4.8 3.7 6.7 5.6 3.3 7.8 5.8 4.7 ...
## $ votes : int [1:5215] 281 7996 799 271 19095 181 19 299 7 1987 ...
## $ r1 : num [1:5215] 0 74.5 4.5 64.5 4.5 4.5 14.5 4.5 0 4.5 ...
## $ r2 : num [1:5215] 4.5 4.5 4.5 4.5 4.5 4.5 14.5 0 0 4.5 ...
## $ r3 : num [1:5215] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 14.5 ...
## $ r4 : num [1:5215] 4.5 4.5 14.5 4.5 4.5 4.5 14.5 4.5 0 14.5 ...
## $ r5 : num [1:5215] 4.5 4.5 14.5 4.5 4.5 14.5 14.5 4.5 0 24.5 ...
## $ r6 : num [1:5215] 14.5 4.5 14.5 4.5 14.5 24.5 14.5 4.5 44.5 14.5 ...
## $ r7 : num [1:5215] 34.5 4.5 14.5 4.5 24.5 14.5 14.5 4.5 0 14.5 ...
## $ r8 : num [1:5215] 34.5 4.5 4.5 4.5 14.5 14.5 0 14.5 14.5 4.5 ...
## $ r9 : num [1:5215] 4.5 4.5 4.5 4.5 14.5 4.5 0 14.5 0 4.5 ...
## $ r10 : num [1:5215] 4.5 14.5 14.5 4.5 14.5 14.5 24.5 45.5 24.5 4.5 ...
## $ mpaa : chr [1:5215] "" "" "PG-13" "" ...
## $ Action : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
## $ Animation : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:5215] 0 0 1 0 1 1 0 0 0 1 ...
## $ Drama : int [1:5215] 1 0 0 0 0 0 1 0 1 0 ...
## $ Documentary: int [1:5215] 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int [1:5215] 0 0 1 0 1 0 0 0 0 0 ...
## $ Short : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
ggplot(moviesB, aes(x=year, y=budget, color=Action))+
geom_point()
#clarify Action is binary, change point size
ggplot(moviesB, aes(x=year, y=budget, color=as.factor(Action)))+
geom_point(size=0.5)
ggplot(moviesB, aes(x=year, y=budget, shape=as.factor(Action)))+
geom_point(size=0.8)
#let's look at movies from "new hollywood" era, post-1960
moviesBB<- moviesB%>%
filter(year>1960)
str(moviesBB)
## tibble [4,407 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:4407] "'Manos' the Hands of Fate" "'Til There Was You" ".com for Murder" "10 Things I Hate About You" ...
## $ year : int [1:4407] 1966 1997 2002 1999 2002 1997 1989 2001 2000 2003 ...
## $ length : int [1:4407] 74 113 96 97 98 94 117 103 100 95 ...
## $ budget : int [1:4407] 19000 23000000 5000000 16000000 1100000 140000 200000 200000 85000000 6000000 ...
## $ rating : num [1:4407] 1.6 4.8 3.7 6.7 5.6 3.3 7.8 5.8 4.7 7.1 ...
## $ votes : int [1:4407] 7996 799 271 19095 181 19 299 7 1987 605 ...
## $ r1 : num [1:4407] 74.5 4.5 64.5 4.5 4.5 14.5 4.5 0 4.5 4.5 ...
## $ r2 : num [1:4407] 4.5 4.5 4.5 4.5 4.5 14.5 0 0 4.5 4.5 ...
## $ r3 : num [1:4407] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 14.5 4.5 ...
## $ r4 : num [1:4407] 4.5 14.5 4.5 4.5 4.5 14.5 4.5 0 14.5 4.5 ...
## $ r5 : num [1:4407] 4.5 14.5 4.5 4.5 14.5 14.5 4.5 0 24.5 4.5 ...
## $ r6 : num [1:4407] 4.5 14.5 4.5 14.5 24.5 14.5 4.5 44.5 14.5 14.5 ...
## $ r7 : num [1:4407] 4.5 14.5 4.5 24.5 14.5 14.5 4.5 0 14.5 24.5 ...
## $ r8 : num [1:4407] 4.5 4.5 4.5 14.5 14.5 0 14.5 14.5 4.5 24.5 ...
## $ r9 : num [1:4407] 4.5 4.5 4.5 14.5 4.5 0 14.5 0 4.5 4.5 ...
## $ r10 : num [1:4407] 14.5 14.5 4.5 14.5 14.5 24.5 45.5 24.5 4.5 14.5 ...
## $ mpaa : chr [1:4407] "" "PG-13" "" "PG-13" ...
## $ Action : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
## $ Animation : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:4407] 0 1 0 1 1 0 0 0 1 1 ...
## $ Drama : int [1:4407] 0 0 0 0 0 1 0 1 0 1 ...
## $ Documentary: int [1:4407] 0 0 0 0 0 0 1 0 0 0 ...
## $ Romance : int [1:4407] 0 1 0 1 0 0 0 0 0 0 ...
## $ Short : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
#polish up with new data subset
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Action)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","blue"))+
ggtitle("Action movie budgets from New Hollywood to the modern day")
#now look at animation movies
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Animation)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","magenta"))+
ggtitle("Animation movie budgets from New Hollywood to the modern day")
#now comedies
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Comedy)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","darkgreen"))+
ggtitle("Comedy movie budgets from New Hollywood to the modern day")
#now dramas
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Drama)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","purple"))+
ggtitle("Drama movie budgets from New Hollywood to the modern day")
#now documentaries
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Documentary)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","black"))+
ggtitle("Documentary budgets from New Hollywood to the modern day")
#now romances
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Romance)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","darkred"))+
ggtitle("Romance movie budgets from New Hollywood to the modern day")
#now shorts
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Short)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","orange"))+
ggtitle("Short movie budgets from New Hollywood to the modern day")
## my favorite story comes from the action movie graph, so let's polish it more
options(scipen = 999) #found this online to disable scientific notation of budget
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Action)))+
geom_jitter(size=0.5, alpha=0.6)+
scale_color_manual(values=c("grey","blue"))+
ggtitle("Action movie budgets from New Hollywood to the modern day")+
labs(x="Year of Release", y="Budget ($)",
subtitle="Blue points denote action films", subtitle.color="blue", caption="Based on data from IMDB")+
theme(legend.position="none"
, plot.subtitle=element_text(color="blue", face="italic")
)
ggsave("actionmoviehw4.pdf", width=8, height=4)
meanbudget<- mean(moviesB$budget)
meanbudget
## [1] 13412513
medianbudget<- median(moviesB$budget)
medianbudget
## [1] 3000000
movieshighB<- movies %>%
filter(budget>meanbudget)
movieslowB<- movies %>%
filter(budget<meanbudget)
ggplot(movieshighB, aes(x=1, fill=as.factor(Action)))+
geom_bar()+
coord_polar(theta="y")+
scale_fill_brewer()+
ggtitle("Proportion of action movies above mean budget")
ggplot(movieslowB, aes(x=1, fill=as.factor(Action)))+
geom_bar()+
coord_polar(theta="y")+
scale_fill_brewer()+
ggtitle("Proportion of action movies below mean budget")
Do higher budgets actually result in “better” movies? Out of our variables, I think IMDB user “rating” (with levels 1 to 10) might be the best indicator.
#movies$mpaa
ggplot(movies, aes(rating, budget, shape=mpaa))+
geom_point(alpha=0.5)
## Warning: Removed 53573 rows containing missing values (geom_point).
ggplot(movies, aes(rating, budget))+
geom_bin_2d()
## Warning: Removed 53573 rows containing non-finite values (stat_bin2d).
ggplot(movies, aes(rating, budget, color=mpaa))+
geom_point(alpha=0.5)+
geom_hline(yintercept=medianbudget)+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 53573 rows containing missing values (geom_point).
ggplot(movies, aes(rating, budget, color=mpaa))+
geom_point(alpha=0.5)+
geom_hline(yintercept=meanbudget)+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 53573 rows containing missing values (geom_point).
ggplot(movies, aes(rating, budget, color=year))+
geom_point(alpha=0.3)+
geom_hline(yintercept=meanbudget)+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 53573 rows containing missing values (geom_point).
ggplot(movies, aes(rating, budget))+
geom_bin_2d()+
geom_hline(yintercept=meanbudget)+
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 53605 rows containing non-finite values (stat_bin2d).
#looks like most movies under mean budget are unrated
ggplot(movies, aes(mpaa))+
geom_bar()
#let's just look at movies WITH ratings
moviesR<- moviesB%>%
filter(mpaa!="")
str(moviesR)
## tibble [1,813 × 24] (S3: tbl_df/tbl/data.frame)
## $ title : chr [1:1813] "'Til There Was You" "10 Things I Hate About You" "100 Mile Rule" "13 Going On 30" ...
## $ year : int [1:1813] 1997 1999 2002 2004 1999 2001 1972 2003 1998 1999 ...
## $ length : int [1:1813] 113 97 98 98 102 120 180 107 87 101 ...
## $ budget : int [1:1813] 23000000 16000000 1100000 37000000 85000000 42000000 4000000 76000000 60000 6000000 ...
## $ rating : num [1:1813] 4.8 6.7 5.6 6.4 6.1 6.1 7.3 5.1 5.4 5.4 ...
## $ votes : int [1:1813] 799 19095 181 7859 14344 10866 1754 9556 841 4514 ...
## $ r1 : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 4.5 4.5 ...
## $ r2 : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r3 : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num [1:1813] 14.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r5 : num [1:1813] 14.5 4.5 14.5 4.5 14.5 14.5 4.5 14.5 14.5 14.5 ...
## $ r6 : num [1:1813] 14.5 14.5 24.5 14.5 14.5 24.5 4.5 14.5 14.5 24.5 ...
## $ r7 : num [1:1813] 14.5 24.5 14.5 24.5 24.5 24.5 14.5 14.5 14.5 14.5 ...
## $ r8 : num [1:1813] 4.5 14.5 14.5 14.5 14.5 14.5 14.5 4.5 14.5 14.5 ...
## $ r9 : num [1:1813] 4.5 14.5 4.5 4.5 4.5 4.5 14.5 4.5 4.5 4.5 ...
## $ r10 : num [1:1813] 14.5 14.5 14.5 14.5 4.5 4.5 34.5 4.5 4.5 4.5 ...
## $ mpaa : chr [1:1813] "PG-13" "PG-13" "R" "PG-13" ...
## $ Action : int [1:1813] 0 0 0 0 1 0 0 1 0 0 ...
## $ Animation : int [1:1813] 0 0 0 0 0 0 0 0 0 0 ...
## $ Comedy : int [1:1813] 1 1 1 1 0 0 0 0 1 1 ...
## $ Drama : int [1:1813] 0 0 0 1 0 1 1 0 0 1 ...
## $ Documentary: int [1:1813] 0 0 0 0 0 0 0 0 1 0 ...
## $ Romance : int [1:1813] 1 1 0 1 0 0 0 0 1 0 ...
## $ Short : int [1:1813] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
ggplot(moviesR, aes(mpaa))+
geom_bar()
ggplot(moviesR, aes(rating,budget))+
geom_point()+
scale_y_log10()+
facet_wrap(.~mpaa)
ggplot(moviesR, aes(budget,rating))+
geom_jitter(size=.8, alpha=0.5)+
scale_x_log10()+
geom_smooth(se=FALSE, color="red")+
facet_wrap(.~mpaa)+
labs(title="Do higher budgets result in higher ratings?",
subtitle="By MPAA ratings",
x="movie budget ($)*",
y="average IMDB user rating",
caption="*log scale")+
theme(axis.title=element_text(color="black", family="serif"),
axis.ticks=element_line(color="grey"),
axis.text = element_text(color="grey", family="serif"),
plot.title=element_text(family="serif"),
plot.subtitle = element_text(family="serif"),
plot.caption = element_text(family="serif"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
ggsave("budgetratinghw4.pdf",width=8,height=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
lotr<- movies%>%
filter(title=="Lord of the Rings: The Return of the King, The" | title=="Lord of the Rings: The Two Towers, The" | title=="Lord of the Rings: The Fellowship of the Ring, The")%>%
arrange(year)%>%
mutate(nickname= c("Fellowship of the Ring", "Two Towers","Return of the King"))
lotr
## # A tibble: 3 × 25
## title year length budget rating votes r1 r2 r3 r4 r5 r6
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lord of… 2001 208 9.3e7 8.8 157608 4.5 4.5 4.5 4.5 4.5 4.5
## 2 Lord of… 2002 223 9.4e7 8.8 114797 4.5 4.5 4.5 4.5 4.5 4.5
## 3 Lord of… 2003 251 9.4e7 9 103631 4.5 4.5 4.5 4.5 4.5 4.5
## # … with 13 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>, nickname <chr>
ggplot(lotr, aes(nickname,rating))+
geom_col(fill="purple")
ggplot(lotr, aes(rating))+
geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
ggplot(lotr, aes(year,budget, label=title))+
geom_text()+
ylim(0,100000000)
# i like this text geom but i want more observations to use it on! so i am abandoning LOTR
#idea: votes = size, y=rating, label=title,
movies18<- movies%>%
filter(year<1900)%>%
filter(votes>10)%>%
arrange(year)
movies18
## # A tibble: 42 × 24
## title year length budget rating votes r1 r2 r3 r4 r5 r6
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Blacksm… 1893 1 NA 7 90 4.5 0 4.5 4.5 14.5 4.5
## 2 Bucking… 1894 1 NA 4.6 30 0 0 24.5 24.5 14.5 14.5
## 3 Buffalo… 1894 1 NA 5 35 0 0 14.5 14.5 14.5 14.5
## 4 Caicedo… 1894 1 NA 5.8 41 4.5 0 0 4.5 24.5 14.5
## 5 Glenroy… 1894 1 NA 4.2 15 0 4.5 24.5 34.5 24.5 0
## 6 Glenroy… 1894 1 NA 5.4 97 4.5 4.5 4.5 14.5 24.5 24.5
## 7 Hadj Ch… 1894 1 NA 4.1 14 0 4.5 44.5 14.5 4.5 14.5
## 8 Leonard… 1894 1 NA 4.4 20 4.5 0 14.5 14.5 24.5 14.5
## 9 Luis Ma… 1894 1 NA 6.1 38 0 0 4.5 4.5 14.5 24.5
## 10 Sioux G… 1894 1 NA 4.4 18 0 4.5 14.5 34.5 34.5 14.5
## # … with 32 more rows, and 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>,
## # r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## # Drama <int>, Documentary <int>, Romance <int>, Short <int>
ggplot(movies18, aes(year,rating, label=title))+
geom_jitter(alpha=0.4)+
geom_text(size=2, angle=45)+
xlim(1893,1900)+
ylim(1,10)
ggplot(movies18, aes(year,rating, label=title))+
geom_label(size=2,alpha=0.5, aes(fill=votes))+
scale_fill_gradient(low="yellow",high="red")+
xlim(1892,1900)+
ylim(1,10)+
theme_light()+
labs(title="IMDB User Ratings for 19th Century Films")+
theme(plot.title=element_text(family="Courier"),
axis.title=element_text(family="Courier"),
axis.text=element_text(family="Courier"),
legend.text = element_text(family="Courier"),
legend.title = element_text(family="Courier"))
ggsave("18centurymovies.pdf",width=8,height=4)