Introduction

Questions

Variation

Visualizing distributions

# Bar chart: one bar per distinct average rating, height = number of movies.
ggplot(Summer_Movies, aes(x = average_rating)) +
    geom_bar()

# Tabulate how many movies share each average rating.
count(Summer_Movies, average_rating)
## # A tibble: 70 × 2
##    average_rating     n
##             <dbl> <int>
##  1            2.3     2
##  2            2.4     2
##  3            2.5     1
##  4            2.7     4
##  5            2.9     2
##  6            3       1
##  7            3.1     2
##  8            3.2     2
##  9            3.3     4
## 10            3.4     4
## # ℹ 60 more rows
# Histogram of release year using half-year-wide bins.
ggplot(Summer_Movies, aes(x = year)) +
    geom_histogram(binwidth = 0.5)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

# Frequency polygons of release year, one line per rating band.
# stat_bin() can only keep a colour mapping that defines discrete groups, so
# the continuous average_rating is binned with cut_width() instead of being
# mapped directly (which caused the colour aesthetic to be dropped).
# binwidth is set explicitly rather than relying on the default of 30 bins.
ggplot(
    data = Summer_Movies,
    mapping = aes(x = year, colour = cut_width(average_rating, 2))
) +
    geom_freqpoly(binwidth = 5) +
    labs(colour = "average_rating (binned)")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Typical values

Summer_Movies %>%

    # Keep only the movies released after 2020
    filter(year > 2020) %>%

    # Plot: one bin per year (year is presumably whole-numbered, so a
    # binwidth of 0.01 would draw bars too thin to see)
    ggplot(aes(x = year)) +
    geom_histogram(binwidth = 1)

# Histogram of average rating in quarter-point bins.
# The data is supplied once, through the pipe; also passing `data =` pushed
# the piped tibble into `...` as a stray extra argument.
Summer_Movies %>%
    ggplot(mapping = aes(x = average_rating)) +
    geom_histogram(binwidth = 0.25)

Unusual values

# Histogram of average rating with the default binning.
ggplot(Summer_Movies, aes(x = average_rating)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, zoomed to counts between 0 and 50 so small bars are visible.
# coord_cartesian() zooms the view without discarding any data.
ggplot(Summer_Movies, aes(x = average_rating)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 50))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values

# Scatter plot of average rating against release year.
# na.rm = TRUE drops rows with missing values without emitting a warning.
ggplot(Summer_Movies, aes(x = year, y = average_rating)) +
    geom_point(na.rm = TRUE)

Covariation

# Distribution of average rating per genre, one row per (movie, genre) pair.
Summer_Movies %>%
    # genres appears to hold a comma-separated list per movie (TODO confirm).
    # Split on commas only: the default separator splits on every
    # non-alphanumeric character, which would break a hyphenated genre such
    # as "Sci-Fi" into two separate rows.
    separate_rows(genres, sep = ",") %>%
    ggplot(aes(x = genres, y = average_rating)) +
    geom_boxplot()

A categorical and continuous variable

# Heat map: number of movies (n) for each title type / average rating pair.
count(Summer_Movies, title_type, average_rating) %>%
    ggplot(aes(x = title_type, y = average_rating)) +
    geom_tile(aes(fill = n))

Two categorical variables

# Heat map: number of movies (n) for each release year / average rating pair.
count(Summer_Movies, year, average_rating) %>%
    ggplot(aes(x = year, y = average_rating)) +
    geom_tile(aes(fill = n))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_tile()`).

Two continuous variables

# geom_hex() relies on the hexbin package for the hexagonal binning.
library(hexbin)

# Hexagonal 2-D bin counts of average rating against release year.
ggplot(Summer_Movies, aes(x = year, y = average_rating)) +
    geom_hex()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_binhex()`).

# Boxplots of release year within bands of average rating, for movies rated
# below 5.
Summer_Movies %>%
    filter(average_rating < 5) %>%
    ggplot(aes(x = average_rating, y = year)) +
    # Group by bins of the continuous x variable so each box summarises the
    # years of movies in one rating band; grouping on the y variable (year)
    # does not produce one box per x interval.
    geom_boxplot(aes(group = cut_width(average_rating, 0.5)))

Patterns and models

library(modelr)

# Linear model of log vote count on log release year.
mod <- lm(log(num_votes) ~ log(year), data = Summer_Movies)

# Attach the model residuals, then exponentiate them to undo the log
# transform applied to the response.
Summerfilm <- mutate(
    modelr::add_residuals(Summer_Movies, mod),
    resid = exp(resid)
)

# Back-transformed residuals plotted against release year.
ggplot(Summerfilm, aes(x = year, y = resid)) +
    geom_point()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Distribution of back-transformed residuals across bands of vote count.
Summerfilm %>%
    ggplot(aes(num_votes, resid)) +
    # A boxplot needs discrete groups: with a continuous x and no grouping,
    # ggplot2 draws a single box and warns about the continuous x aesthetic.
    # Bin num_votes into half-decade (log10) bands, one box per band.
    geom_boxplot(aes(group = cut_width(log10(num_votes), 0.5))) +
    # Log x axis so the log-spaced bands are drawn with comparable widths.
    scale_x_log10()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).