Movies dataset

Setup

#install.packages("ggplot2movies") 
library(ggplot2movies) 
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data(movies)
?movies
str(movies)
## tibble [58,788 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:58788] "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##  $ year       : int [1:58788] 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##  $ length     : int [1:58788] 121 71 7 70 71 91 93 25 97 61 ...
##  $ budget     : int [1:58788] NA NA NA NA NA NA NA NA NA NA ...
##  $ rating     : num [1:58788] 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##  $ votes      : int [1:58788] 348 20 5 6 17 45 200 24 18 51 ...
##  $ r1         : num [1:58788] 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r2         : num [1:58788] 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##  $ r3         : num [1:58788] 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num [1:58788] 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##  $ r5         : num [1:58788] 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##  $ r6         : num [1:58788] 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##  $ r7         : num [1:58788] 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##  $ r8         : num [1:58788] 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##  $ r9         : num [1:58788] 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##  $ r10        : num [1:58788] 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##  $ mpaa       : chr [1:58788] "" "" "" "" ...
##  $ Action     : int [1:58788] 0 0 0 0 0 0 1 0 0 0 ...
##  $ Animation  : int [1:58788] 0 0 1 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:58788] 1 1 0 1 0 0 0 0 0 0 ...
##  $ Drama      : int [1:58788] 1 0 0 0 0 1 1 0 1 0 ...
##  $ Documentary: int [1:58788] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int [1:58788] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Short      : int [1:58788] 0 0 1 0 0 0 0 1 0 0 ...

We see that movies is a data frame with 58788 rows/observations and 24 variables. The variables include title, year (of release), budget, length (in minutes), rating (averyage IMDB user rating), votes (number of IMDB users who rated it), r1-10 (multiplied to the nearest 10% of users who rated the movie a 1, 2, etc.), mpaa (MPAA rating), as well as binary variables for action, animation, comedy, drama, documentary, romance, and short genres to denote whether the movie belongs to that genre.

Exploration of budgets

First I want to explore: how does movie budget vary by genre? I have some hypotheses, such as newer/action movies having higher budgets and older/shorts having smaller budgets, but let’s see what the data says. Note: there a lot of NAs in the budget variable! So first, I’ll subset my data for observations that have their budgets recorded in this data frame.

moviesB<- movies%>%
  na.omit(movies$budget)
str(moviesB)
## tibble [5,215 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:5215] "'G' Men" "'Manos' the Hands of Fate" "'Til There Was You" ".com for Murder" ...
##  $ year       : int [1:5215] 1935 1966 1997 2002 1999 2002 1997 1989 2001 2000 ...
##  $ length     : int [1:5215] 85 74 113 96 97 98 94 117 103 100 ...
##  $ budget     : int [1:5215] 450000 19000 23000000 5000000 16000000 1100000 140000 200000 200000 85000000 ...
##  $ rating     : num [1:5215] 7.2 1.6 4.8 3.7 6.7 5.6 3.3 7.8 5.8 4.7 ...
##  $ votes      : int [1:5215] 281 7996 799 271 19095 181 19 299 7 1987 ...
##  $ r1         : num [1:5215] 0 74.5 4.5 64.5 4.5 4.5 14.5 4.5 0 4.5 ...
##  $ r2         : num [1:5215] 4.5 4.5 4.5 4.5 4.5 4.5 14.5 0 0 4.5 ...
##  $ r3         : num [1:5215] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 14.5 ...
##  $ r4         : num [1:5215] 4.5 4.5 14.5 4.5 4.5 4.5 14.5 4.5 0 14.5 ...
##  $ r5         : num [1:5215] 4.5 4.5 14.5 4.5 4.5 14.5 14.5 4.5 0 24.5 ...
##  $ r6         : num [1:5215] 14.5 4.5 14.5 4.5 14.5 24.5 14.5 4.5 44.5 14.5 ...
##  $ r7         : num [1:5215] 34.5 4.5 14.5 4.5 24.5 14.5 14.5 4.5 0 14.5 ...
##  $ r8         : num [1:5215] 34.5 4.5 4.5 4.5 14.5 14.5 0 14.5 14.5 4.5 ...
##  $ r9         : num [1:5215] 4.5 4.5 4.5 4.5 14.5 4.5 0 14.5 0 4.5 ...
##  $ r10        : num [1:5215] 4.5 14.5 14.5 4.5 14.5 14.5 24.5 45.5 24.5 4.5 ...
##  $ mpaa       : chr [1:5215] "" "" "PG-13" "" ...
##  $ Action     : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Animation  : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:5215] 0 0 1 0 1 1 0 0 0 1 ...
##  $ Drama      : int [1:5215] 1 0 0 0 0 0 1 0 1 0 ...
##  $ Documentary: int [1:5215] 0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int [1:5215] 0 0 1 0 1 0 0 0 0 0 ...
##  $ Short      : int [1:5215] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
ggplot(moviesB, aes(x=year, y=budget, color=Action))+
  geom_point()

#clarify Action is binary, change point size
ggplot(moviesB, aes(x=year, y=budget, color=as.factor(Action)))+
  geom_point(size=0.5)

ggplot(moviesB, aes(x=year, y=budget, shape=as.factor(Action)))+
  geom_point(size=0.8)

#let's look at movies from "new hollywood" era, post-1960
moviesBB<- moviesB%>%
  filter(year>1960)
str(moviesBB)
## tibble [4,407 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:4407] "'Manos' the Hands of Fate" "'Til There Was You" ".com for Murder" "10 Things I Hate About You" ...
##  $ year       : int [1:4407] 1966 1997 2002 1999 2002 1997 1989 2001 2000 2003 ...
##  $ length     : int [1:4407] 74 113 96 97 98 94 117 103 100 95 ...
##  $ budget     : int [1:4407] 19000 23000000 5000000 16000000 1100000 140000 200000 200000 85000000 6000000 ...
##  $ rating     : num [1:4407] 1.6 4.8 3.7 6.7 5.6 3.3 7.8 5.8 4.7 7.1 ...
##  $ votes      : int [1:4407] 7996 799 271 19095 181 19 299 7 1987 605 ...
##  $ r1         : num [1:4407] 74.5 4.5 64.5 4.5 4.5 14.5 4.5 0 4.5 4.5 ...
##  $ r2         : num [1:4407] 4.5 4.5 4.5 4.5 4.5 14.5 0 0 4.5 4.5 ...
##  $ r3         : num [1:4407] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 14.5 4.5 ...
##  $ r4         : num [1:4407] 4.5 14.5 4.5 4.5 4.5 14.5 4.5 0 14.5 4.5 ...
##  $ r5         : num [1:4407] 4.5 14.5 4.5 4.5 14.5 14.5 4.5 0 24.5 4.5 ...
##  $ r6         : num [1:4407] 4.5 14.5 4.5 14.5 24.5 14.5 4.5 44.5 14.5 14.5 ...
##  $ r7         : num [1:4407] 4.5 14.5 4.5 24.5 14.5 14.5 4.5 0 14.5 24.5 ...
##  $ r8         : num [1:4407] 4.5 4.5 4.5 14.5 14.5 0 14.5 14.5 4.5 24.5 ...
##  $ r9         : num [1:4407] 4.5 4.5 4.5 14.5 4.5 0 14.5 0 4.5 4.5 ...
##  $ r10        : num [1:4407] 14.5 14.5 4.5 14.5 14.5 24.5 45.5 24.5 4.5 14.5 ...
##  $ mpaa       : chr [1:4407] "" "PG-13" "" "PG-13" ...
##  $ Action     : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Animation  : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:4407] 0 1 0 1 1 0 0 0 1 1 ...
##  $ Drama      : int [1:4407] 0 0 0 0 0 1 0 1 0 1 ...
##  $ Documentary: int [1:4407] 0 0 0 0 0 0 1 0 0 0 ...
##  $ Romance    : int [1:4407] 0 1 0 1 0 0 0 0 0 0 ...
##  $ Short      : int [1:4407] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
#polish up with new data subset 
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Action)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","blue"))+
  ggtitle("Action movie budgets from New Hollywood to the modern day")

#now look at animation movies
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Animation)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","magenta"))+
  ggtitle("Animation movie budgets from New Hollywood to the modern day")

#now comedies
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Comedy)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","darkgreen"))+
  ggtitle("Comedy movie budgets from New Hollywood to the modern day")

#now dramas 
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Drama)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","purple"))+
  ggtitle("Drama movie budgets from New Hollywood to the modern day")

#now documentaries
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Documentary)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","black"))+
  ggtitle("Documentary budgets from New Hollywood to the modern day")

#now romances
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Romance)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","darkred"))+
  ggtitle("Romance movie budgets from New Hollywood to the modern day")

#now shorts 
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Short)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","orange"))+
  ggtitle("Short movie budgets from New Hollywood to the modern day")

## my favorite story comes from the action movie graph, so let's polish it more

options(scipen = 999) #found this online to disable scientific notation of budget 
ggplot(moviesBB, aes(x=year, y=budget, color=as.factor(Action)))+
  geom_jitter(size=0.5, alpha=0.6)+
  scale_color_manual(values=c("grey","blue"))+
  ggtitle("Action movie budgets from New Hollywood to the modern day")+
  labs(x="Year of Release", y="Budget ($)", 
       subtitle="Blue points denote action films", subtitle.color="blue", caption="Based on data from IMDB")+
  theme(legend.position="none"
        , plot.subtitle=element_text(color="blue", face="italic")
        )

ggsave("actionmoviehw4.pdf", width=8, height=4)

meanbudget<- mean(moviesB$budget)
meanbudget
## [1] 13412513
medianbudget<- median(moviesB$budget)
medianbudget
## [1] 3000000
movieshighB<- movies %>%
  filter(budget>meanbudget)

movieslowB<- movies %>%
  filter(budget<meanbudget)

ggplot(movieshighB, aes(x=1, fill=as.factor(Action)))+
  geom_bar()+
  coord_polar(theta="y")+
  scale_fill_brewer()+
  ggtitle("Proportion of action movies above mean budget")

ggplot(movieslowB, aes(x=1, fill=as.factor(Action)))+
  geom_bar()+
  coord_polar(theta="y")+
  scale_fill_brewer()+
  ggtitle("Proportion of action movies below mean budget")

Exploration of ratings

Do higher budgets actually result in “better” movies? Out of our variables, I think IMDB user “rating” (with levels 1 to 10) might be the best indicator.

#movies$mpaa

ggplot(movies, aes(rating, budget, shape=mpaa))+
  geom_point(alpha=0.5)
## Warning: Removed 53573 rows containing missing values (geom_point).

ggplot(movies, aes(rating, budget))+
  geom_bin_2d()
## Warning: Removed 53573 rows containing non-finite values (stat_bin2d).

ggplot(movies, aes(rating, budget, color=mpaa))+
  geom_point(alpha=0.5)+
  geom_hline(yintercept=medianbudget)+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 53573 rows containing missing values (geom_point).

ggplot(movies, aes(rating, budget, color=mpaa))+
  geom_point(alpha=0.5)+
  geom_hline(yintercept=meanbudget)+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 53573 rows containing missing values (geom_point).

ggplot(movies, aes(rating, budget, color=year))+
  geom_point(alpha=0.3)+
  geom_hline(yintercept=meanbudget)+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 53573 rows containing missing values (geom_point).

ggplot(movies, aes(rating, budget))+
  geom_bin_2d()+
  geom_hline(yintercept=meanbudget)+
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 53605 rows containing non-finite values (stat_bin2d).

#looks like most movies under mean budget are unrated

ggplot(movies, aes(mpaa))+
  geom_bar()

#let's just look at movies WITH ratings
moviesR<- moviesB%>%
  filter(mpaa!="")
str(moviesR)
## tibble [1,813 × 24] (S3: tbl_df/tbl/data.frame)
##  $ title      : chr [1:1813] "'Til There Was You" "10 Things I Hate About You" "100 Mile Rule" "13 Going On 30" ...
##  $ year       : int [1:1813] 1997 1999 2002 2004 1999 2001 1972 2003 1998 1999 ...
##  $ length     : int [1:1813] 113 97 98 98 102 120 180 107 87 101 ...
##  $ budget     : int [1:1813] 23000000 16000000 1100000 37000000 85000000 42000000 4000000 76000000 60000 6000000 ...
##  $ rating     : num [1:1813] 4.8 6.7 5.6 6.4 6.1 6.1 7.3 5.1 5.4 5.4 ...
##  $ votes      : int [1:1813] 799 19095 181 7859 14344 10866 1754 9556 841 4514 ...
##  $ r1         : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 4.5 4.5 ...
##  $ r2         : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r3         : num [1:1813] 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num [1:1813] 14.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r5         : num [1:1813] 14.5 4.5 14.5 4.5 14.5 14.5 4.5 14.5 14.5 14.5 ...
##  $ r6         : num [1:1813] 14.5 14.5 24.5 14.5 14.5 24.5 4.5 14.5 14.5 24.5 ...
##  $ r7         : num [1:1813] 14.5 24.5 14.5 24.5 24.5 24.5 14.5 14.5 14.5 14.5 ...
##  $ r8         : num [1:1813] 4.5 14.5 14.5 14.5 14.5 14.5 14.5 4.5 14.5 14.5 ...
##  $ r9         : num [1:1813] 4.5 14.5 4.5 4.5 4.5 4.5 14.5 4.5 4.5 4.5 ...
##  $ r10        : num [1:1813] 14.5 14.5 14.5 14.5 4.5 4.5 34.5 4.5 4.5 4.5 ...
##  $ mpaa       : chr [1:1813] "PG-13" "PG-13" "R" "PG-13" ...
##  $ Action     : int [1:1813] 0 0 0 0 1 0 0 1 0 0 ...
##  $ Animation  : int [1:1813] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Comedy     : int [1:1813] 1 1 1 1 0 0 0 0 1 1 ...
##  $ Drama      : int [1:1813] 0 0 0 1 0 1 1 0 0 1 ...
##  $ Documentary: int [1:1813] 0 0 0 0 0 0 0 0 1 0 ...
##  $ Romance    : int [1:1813] 1 1 0 1 0 0 0 0 1 0 ...
##  $ Short      : int [1:1813] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:53573] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:53573] "1" "2" "3" "4" ...
ggplot(moviesR, aes(mpaa))+
  geom_bar()

ggplot(moviesR, aes(rating,budget))+
  geom_point()+
  scale_y_log10()+
  facet_wrap(.~mpaa)

ggplot(moviesR, aes(budget,rating))+
  geom_jitter(size=.8, alpha=0.5)+
  scale_x_log10()+
  geom_smooth(se=FALSE, color="red")+
  facet_wrap(.~mpaa)+
  labs(title="Do higher budgets result in higher ratings?",
       subtitle="By MPAA ratings",
       x="movie budget ($)*",
       y="average IMDB user rating",
       caption="*log scale")+
  theme(axis.title=element_text(color="black", family="serif"),
        axis.ticks=element_line(color="grey"),
        axis.text = element_text(color="grey", family="serif"), 
        plot.title=element_text(family="serif"),
        plot.subtitle = element_text(family="serif"),
        plot.caption = element_text(family="serif"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

ggsave("budgetratinghw4.pdf",width=8,height=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.

Exploration of specific movies

lotr<- movies%>%
  filter(title=="Lord of the Rings: The Return of the King, The" | title=="Lord of the Rings: The Two Towers, The" | title=="Lord of the Rings: The Fellowship of the Ring, The")%>%
  arrange(year)%>%
  mutate(nickname= c("Fellowship of the Ring", "Two Towers","Return of the King"))
lotr
## # A tibble: 3 × 25
##   title     year length budget rating  votes    r1    r2    r3    r4    r5    r6
##   <chr>    <int>  <int>  <int>  <dbl>  <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lord of…  2001    208  9.3e7    8.8 157608   4.5   4.5   4.5   4.5   4.5   4.5
## 2 Lord of…  2002    223  9.4e7    8.8 114797   4.5   4.5   4.5   4.5   4.5   4.5
## 3 Lord of…  2003    251  9.4e7    9   103631   4.5   4.5   4.5   4.5   4.5   4.5
## # … with 13 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>, nickname <chr>
ggplot(lotr, aes(nickname,rating))+
  geom_col(fill="purple")

ggplot(lotr, aes(rating))+
  geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

ggplot(lotr, aes(year,budget, label=title))+
  geom_text()+
  ylim(0,100000000)

# i like this text geom but i want more observations to use it on! so i am abandoning LOTR

#idea: votes = size, y=rating, label=title, 

movies18<- movies%>%
  filter(year<1900)%>%
  filter(votes>10)%>%
  arrange(year)

movies18
## # A tibble: 42 × 24
##    title     year length budget rating votes    r1    r2    r3    r4    r5    r6
##    <chr>    <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Blacksm…  1893      1     NA    7      90   4.5   0     4.5   4.5  14.5   4.5
##  2 Bucking…  1894      1     NA    4.6    30   0     0    24.5  24.5  14.5  14.5
##  3 Buffalo…  1894      1     NA    5      35   0     0    14.5  14.5  14.5  14.5
##  4 Caicedo…  1894      1     NA    5.8    41   4.5   0     0     4.5  24.5  14.5
##  5 Glenroy…  1894      1     NA    4.2    15   0     4.5  24.5  34.5  24.5   0  
##  6 Glenroy…  1894      1     NA    5.4    97   4.5   4.5   4.5  14.5  24.5  24.5
##  7 Hadj Ch…  1894      1     NA    4.1    14   0     4.5  44.5  14.5   4.5  14.5
##  8 Leonard…  1894      1     NA    4.4    20   4.5   0    14.5  14.5  24.5  14.5
##  9 Luis Ma…  1894      1     NA    6.1    38   0     0     4.5   4.5  14.5  24.5
## 10 Sioux G…  1894      1     NA    4.4    18   0     4.5  14.5  34.5  34.5  14.5
## # … with 32 more rows, and 12 more variables: r7 <dbl>, r8 <dbl>, r9 <dbl>,
## #   r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>, Comedy <int>,
## #   Drama <int>, Documentary <int>, Romance <int>, Short <int>
ggplot(movies18, aes(year,rating, label=title))+
  geom_jitter(alpha=0.4)+
  geom_text(size=2, angle=45)+
  xlim(1893,1900)+
  ylim(1,10)

ggplot(movies18, aes(year,rating, label=title))+
  geom_label(size=2,alpha=0.5, aes(fill=votes))+
  scale_fill_gradient(low="yellow",high="red")+
  xlim(1892,1900)+
  ylim(1,10)+
  theme_light()+
  labs(title="IMDB User Ratings for 19th Century Films")+
  theme(plot.title=element_text(family="Courier"),
        axis.title=element_text(family="Courier"),
        axis.text=element_text(family="Courier"),
        legend.text = element_text(family="Courier"),
        legend.title = element_text(family="Courier"))

ggsave("18centurymovies.pdf",width=8,height=4)