Import data
# Load the workbook "Apply_1.xlsx" into `data`; every chunk below reads it.
data <- read_excel(path = "Apply_1.xlsx")
Introduction
Questions
Variation
# Bar chart of the number of rows per release_year, with the aesthetic
# mapping supplied on the geom layer.
ggplot(data) +
  geom_bar(aes(x = release_year))

Visualizing distributions
# Same bar chart of release_year counts, written as a pipeline with the
# mapping declared once at the plot level.
data %>%
  ggplot(mapping = aes(x = release_year)) +
  geom_bar()

# Histogram of release_year using bins that are 0.5 wide.
data %>%
  ggplot(aes(x = release_year)) +
  geom_histogram(binwidth = 0.5)

Typical values
# Zoom in on recent films: keep only the rows whose release_year is
# greater than 2005, then draw the same 0.5-wide histogram of release_year.
data %>%
  filter(release_year > 2005) %>%
  ggplot(mapping = aes(x = release_year)) +
  geom_histogram(binwidth = 0.5)

Unusual values
# Histogram of age_difference with the default binning (no binwidth or bins
# supplied, so ggplot2 reports that it fell back to 30 bins).
data %>%
  ggplot(mapping = aes(x = age_difference)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same default-binned histogram of age_difference, but with the visible
# y range limited to 0-50 so that sparsely populated bins become readable.
# coord_cartesian() only changes the viewport; no rows are dropped.
data %>%
  ggplot(mapping = aes(x = age_difference)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values
# Replace age_difference values outside the closed range [35, 55] with NA
# (rows are kept, only the value is blanked), then scatter age_difference
# against release_year. geom_point() drops the NA rows with a warning.
# NOTE(review): the rendered warning reports 1141 rows removed, so these
# bounds blank out most of the data -- confirm the intended thresholds.
data %>%
  # Row-dropping alternative, left disabled:
  # filter(age_difference < 35 | age_difference > 55) %>%
  mutate(
    age_difference = ifelse(
      age_difference >= 35 & age_difference <= 55,
      age_difference,
      NA
    )
  ) %>%
  ggplot(mapping = aes(x = release_year, y = age_difference)) +
  geom_point()
## Warning: Removed 1141 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation
A categorical and continuous variable
# Box plots of age_difference, one box per release_year.
# release_year is mapped to a continuous x axis, so without an explicit
# group geom_boxplot() pools every row into a single box and warns
# "Continuous x aesthetic - did you forget aes(group = ...)?".
# Grouping by release_year gives one box per year on the numeric axis.
data %>%
  ggplot(aes(x = release_year, y = age_difference, group = release_year)) +
  geom_boxplot()

Two categorical variables
# Count rows for every (character_1_gender, character_2_gender) pair and
# draw the counts as a heat map: one tile per pair, filled by the count `n`.
data %>%
  count(character_1_gender, character_2_gender) %>%
  ggplot(
    mapping = aes(
      x = character_1_gender,
      y = character_2_gender,
      fill = n
    )
  ) +
  geom_tile()

Two continuous variables
library(hexbin)
# Hexagonal 2-D binning of age_difference against release_year; each
# hexagon is filled according to how many rows fall inside it.
data %>%
  ggplot(mapping = aes(x = release_year, y = age_difference)) +
  geom_hex()

Patterns and models
library(modelr)
# Fit a log-log linear model of actor_1_age on actor_2_age.
mod <- lm(log(actor_1_age) ~ log(actor_2_age), data = data)

# Attach the model residuals as `resid`, then exponentiate them to undo the
# log transform applied to the response in the model formula.
data4 <- mutate(
  modelr::add_residuals(data, mod),
  resid = exp(resid)
)

# Scatter the back-transformed residuals against actor_2_age.
ggplot(data4, aes(x = actor_2_age, y = resid)) +
  geom_point()
