# Load the movie data from the Excel workbook. The sheet's first column has no
# header, so it is auto-named `...1` (see the message below).
Movies <- read_excel(path = "../00_data/MyData.xlsx")
## New names:
## • `` -> `...1`
print(Movies)
## # A tibble: 3,401 × 9
##     ...1 release_date movie     production_budget domestic_gross worldwide_gross
##    <dbl> <chr>        <chr>                 <dbl>          <dbl>           <dbl>
##  1     1 6/22/2007    Evan Alm…         175000000      100289690       174131329
##  2     2 7/28/1995    Waterwor…         175000000       88246220       264246220
##  3     3 5/12/2017    King Art…         175000000       39175066       139950708
##  4     4 12/25/2013   47 Ronin          175000000       38362475       151716815
##  5     5 6/22/2018    Jurassic…         170000000      416769345      1304866322
##  6     6 8/1/2014     Guardian…         170000000      333172112       771051335
##  7     7 5/7/2010     Iron Man…         170000000      312433331       621156389
##  8     8 4/4/2014     Captain …         170000000      259746958       714401889
##  9     9 7/11/2014    Dawn of …         170000000      208545589       710644566
## 10    10 11/10/2004   The Pola…         170000000      186493587       310634169
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>

Introduction

Questions

Variation

# Bar chart: number of rows (movies) in each genre.
ggplot(Movies, aes(x = genre)) +
  geom_bar()

Visualizing distributions

# Frequency polygons of production budget, one line per genre.
# The budgets shown in the data preview are on the order of 1e8, so a bin width
# of 100,000 splits the axis into well over a thousand mostly-empty bins and
# the lines degenerate into noise. Use 10,000,000, the same bin width as the
# other binned plots in this analysis.
ggplot(data = Movies, mapping = aes(x = production_budget, colour = genre)) +
  geom_freqpoly(binwidth = 10000000)

Typical values

# Histogram of domestic gross using bins 10,000,000 wide.
Movies %>%
  ggplot(aes(x = domestic_gross)) +
  geom_histogram(binwidth = 1e7)

Unusual values

# Histogram of worldwide gross using bins 10,000,000 wide.
ggplot(data = Movies, mapping = aes(x = worldwide_gross)) +
  geom_histogram(binwidth = 1e7)

# Same histogram, with the y axis zoomed to counts 0-50 so that sparsely
# populated bins become visible. coord_cartesian() zooms the view without
# discarding any data.
ggplot(data = Movies, mapping = aes(x = worldwide_gross)) +
  geom_histogram(binwidth = 1e7) +
  coord_cartesian(ylim = c(0, 50))

Missing Values

# Copy worldwide_gross into a new column `y`, replacing unusual values (below
# 100,000 or above 1,000,000,000) with NA. The original worldwide_gross column
# is left untouched.
Movies2 <- Movies %>%
  mutate(
    y = ifelse(
      worldwide_gross < 100000 | worldwide_gross > 1000000000,
      NA,
      worldwide_gross
    )
  )

# Plot the cleaned column `y`. Mapping worldwide_gross here (as before) would
# ignore the NA replacement entirely and show every original value.
# ggplot2 drops the NA rows and warns about how many were removed.
ggplot(data = Movies2, mapping = aes(x = production_budget, y = y)) +
  geom_point() +
  labs(y = "worldwide_gross (unusual values set to NA)")

Covariation

A categorical and continuous variable

# Frequency polygons of domestic gross (bins 10,000,000 wide), one coloured
# line per genre, showing raw counts.
ggplot(Movies, aes(x = domestic_gross, colour = genre)) +
  geom_freqpoly(binwidth = 1e7)

# Same comparison, but with density on the y axis instead of raw counts, so
# genres with very different numbers of movies can be compared by shape.
# after_stat(density) replaces the dot-dot form `..density..`, which was
# deprecated in ggplot2 3.4.0.
ggplot(
  data = Movies,
  mapping = aes(x = domestic_gross, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(colour = genre), binwidth = 10000000)

# Boxplot of domestic gross for each genre.
Movies %>%
  ggplot(aes(x = genre, y = domestic_gross)) +
  geom_boxplot()

# Same boxplot, with genres ordered along the x axis by their median
# domestic gross.
ggplot(
  data = Movies,
  mapping = aes(
    x = reorder(genre, domestic_gross, FUN = median),
    y = domestic_gross
  )
) +
  geom_boxplot()

Two categorical variables

# Count plot: point size shows how many movies fall in each
# genre / MPAA rating combination.
ggplot(Movies, aes(x = genre, y = mpaa_rating)) +
  geom_count()

Two continuous variables

# Scatterplot of budget against worldwide gross; points are 10% opaque so
# that overplotted regions appear darker.
ggplot(Movies2, aes(x = production_budget, y = worldwide_gross)) +
  geom_point(alpha = 0.1)

# 2-D rectangular binning of budget against domestic gross; fill colour
# encodes the number of movies in each bin.
ggplot(Movies, aes(x = production_budget, y = domestic_gross)) +
  geom_bin2d()

# Hexagonal binning of budget against domestic gross. geom_hex() needs the
# hexbin package; install it once if it is missing:
# install.packages("hexbin")
ggplot(Movies, aes(x = production_budget, y = domestic_gross)) +
  geom_hex()

Patterns and models

# Scatterplot of budget against the sum of domestic and worldwide gross, with
# a fitted linear-regression line.
# NOTE(review): in the data preview worldwide_gross is always larger than
# domestic_gross, so it may already include the domestic figure; if so, this
# sum double-counts domestic gross. Confirm against the data source.
ggplot(
  data = Movies,
  mapping = aes(x = production_budget, y = domestic_gross + worldwide_gross)
) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Boxplot, per genre, of the sum of domestic and worldwide gross.
ggplot(Movies2, aes(x = genre, y = domestic_gross + worldwide_gross)) +
  geom_boxplot()