# Import the movies workbook; the path is relative to the working directory.
Movies <- read_excel(path = "../00_data/MyData.xlsx")
## New names:
## • `` -> `...1`
# The workbook's first column has no header, so it is renamed to `...1`.
# Print the tibble to preview it.
Movies
## # A tibble: 3,401 × 9
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 6/22/2007 Evan Alm… 175000000 100289690 174131329
## 2 2 7/28/1995 Waterwor… 175000000 88246220 264246220
## 3 3 5/12/2017 King Art… 175000000 39175066 139950708
## 4 4 12/25/2013 47 Ronin 175000000 38362475 151716815
## 5 5 6/22/2018 Jurassic… 170000000 416769345 1304866322
## 6 6 8/1/2014 Guardian… 170000000 333172112 771051335
## 7 7 5/7/2010 Iron Man… 170000000 312433331 621156389
## 8 8 4/4/2014 Captain … 170000000 259746958 714401889
## 9 9 7/11/2014 Dawn of … 170000000 208545589 710644566
## 10 10 11/10/2004 The Pola… 170000000 186493587 310634169
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>
Introduction
Questions
Variation
# Bar chart: number of rows (movies) in each genre.
ggplot(Movies, aes(x = genre)) +
  geom_bar()

Visualizing distributions
# Frequency polygons of production budget, one coloured line per genre,
# using bins 100,000 wide.
ggplot(Movies) +
  geom_freqpoly(
    aes(x = production_budget, colour = genre),
    binwidth = 1e5
  )

Typical values
# Histogram of domestic gross with bins 10,000,000 wide.
ggplot(Movies) +
  geom_histogram(aes(x = domestic_gross), binwidth = 1e7)

Unusual values
# Histogram of worldwide gross with bins 10,000,000 wide.
ggplot(data = Movies, mapping = aes(x = worldwide_gross)) +
  geom_histogram(binwidth = 1e7)

# Same histogram, zoomed to counts of 0-50 so sparsely populated bins become
# visible. coord_cartesian() zooms the view without discarding any data.
ggplot(data = Movies, mapping = aes(x = worldwide_gross)) +
  geom_histogram(binwidth = 1e7) +
  coord_cartesian(ylim = c(0, 50))

Missing Values
# Replace unusual worldwide grosses (below 100,000 or above 1,000,000,000)
# with NA instead of dropping the rows, so the other columns of those movies
# are kept. The cleaned values overwrite `worldwide_gross` itself so that every
# plot built from Movies2 actually uses them.
Movies2 <- Movies %>%
  mutate(
    worldwide_gross = ifelse(
      worldwide_gross < 100000 | worldwide_gross > 1000000000,
      NA,
      worldwide_gross
    )
  )

# Scatterplot of budget against the cleaned worldwide gross; ggplot2 leaves
# rows with NA out of the plot and warns that they were removed.
ggplot(data = Movies2, mapping = aes(x = production_budget, y = worldwide_gross)) +
  geom_point()

Covariation
A categorical and continuous variable
# Frequency polygons of domestic gross (counts), one coloured line per genre,
# with bins 10,000,000 wide.
ggplot(Movies, aes(x = domestic_gross, colour = genre)) +
  geom_freqpoly(binwidth = 1e7)

# Plot density instead of count on the y axis so that genres with different
# numbers of movies can be compared by the shape of their distributions.
# `after_stat(density)` replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
ggplot(data = Movies, mapping = aes(x = domestic_gross, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = genre), binwidth = 10000000)

# Boxplots of domestic gross, one box per genre.
ggplot(Movies) +
  geom_boxplot(aes(x = genre, y = domestic_gross))

# Same boxplots, with the genres ordered along the x axis by their median
# domestic gross.
ggplot(
  Movies,
  aes(
    x = reorder(genre, domestic_gross, FUN = median),
    y = domestic_gross
  )
) +
  geom_boxplot()

Two categorical variables
# Count plot: point size shows how many movies fall in each
# genre / MPAA rating combination.
ggplot(Movies, aes(x = genre, y = mpaa_rating)) +
  geom_count()

Two continuous variables
# Scatterplot of budget against worldwide gross from Movies2; points are drawn
# at 10% opacity so densely overplotted regions appear darker.
ggplot(Movies2, aes(x = production_budget, y = worldwide_gross)) +
  geom_point(alpha = 0.1)

# Rectangular 2D bins of budget against domestic gross; fill colour shows how
# many movies fall in each bin.
ggplot(Movies, aes(x = production_budget, y = domestic_gross)) +
  geom_bin2d()

# geom_hex() needs the hexbin package; install it once if it is missing:
# install.packages("hexbin")
# Hexagonal 2D bins of budget against domestic gross.
ggplot(Movies, aes(x = production_budget, y = domestic_gross)) +
  geom_hex()

Patterns and models
# Scatterplot of budget against the sum of domestic and worldwide gross, with
# a fitted straight line (linear model) overlaid.
# NOTE(review): if worldwide_gross already includes domestic_gross, this sum
# double-counts domestic revenue - confirm against the data source.
ggplot(
  data = Movies,
  mapping = aes(
    x = production_budget,
    y = domestic_gross + worldwide_gross
  )
) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

# Boxplots, one per genre, of the sum of domestic and worldwide gross from
# Movies2.
ggplot(Movies2, aes(x = genre, y = domestic_gross + worldwide_gross)) +
  geom_boxplot()
