Import Data

# Read the Excel workbook from the shared data folder into `data`.
# NOTE(review): the reader warns that dates before 1900 are replaced with NA
# (presumably in the birthdate columns) — confirm before relying on them.
data <- readxl::read_excel(path = "../00_data/MyData.xlsx")
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
# Print the imported table; the console output lists its dimensions,
# column names and column types.
data
## # A tibble: 1,155 × 13
##    movie_name    release_year director age_difference couple_number actor_1_name
##    <chr>                <dbl> <chr>             <dbl>         <dbl> <chr>       
##  1 Venus                 2006 Roger M…             50             1 Peter O'Too…
##  2 The Quiet Am…         2002 Phillip…             49             1 Michael Cai…
##  3 The Big Lebo…         1998 Joel Co…             45             1 David Huddl…
##  4 Poison Ivy            1992 Katt Sh…             42             1 Tom Skerritt
##  5 Whatever Wor…         2009 Woody A…             40             1 Larry David 
##  6 Entrapment            1999 Jon Ami…             39             1 Sean Connery
##  7 Husbands and…         1992 Woody A…             38             1 Woody Allen 
##  8 Magnolia              1999 Paul Th…             38             1 Jason Robar…
##  9 Indiana Jone…         1989 Steven …             36             1 Sean Connery
## 10 Mr. Peabody …         1948 Irving …             36             1 William Pow…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## #   character_2_gender <chr>, actor_1_birthdate <dttm>,
## #   actor_2_birthdate <dttm>, actor_1_age <dbl>, actor_2_age <dbl>

Introduction

Questions

Variation

# Bar chart: number of rows for each value of `director`.
ggplot(data, aes(x = director)) +
  geom_bar()

Visualizing distributions

# Histogram of `age_difference`.
# The values look like whole numbers (assumption — confirm), so a bin width
# of 1 gives one bar per value. The previous width of 0.8 did not line up
# with the data and left periodic empty bins that look like gaps in the
# distribution.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = age_difference), binwidth = 1)

Typical values

# Histogram of `release_year`.
# Years are whole numbers, so a bin width of 1 gives one bar per year. The
# previous width of 0.25 left three of every four bins empty and drew the
# counts as hairline bars.
ggplot(data = data, mapping = aes(x = release_year)) +
  geom_histogram(binwidth = 1)

Unusual values

# Histogram of `couple_number`.
# The values look like whole numbers (assumption — confirm), so a bin width
# of 1 gives one bar per value; the previous width of 0.5 left every other
# bin empty.
ggplot(data) +
  geom_histogram(mapping = aes(x = couple_number), binwidth = 1)

Missing Values

# Box plots of `age_difference` across `release_year`.
# `release_year` is continuous, so without a group aesthetic ggplot2 draws a
# single box for all rows and warns "Continuous x aesthetic ... did you
# forget `aes(group = ...)`?". Binning the years into 10-year-wide groups
# with cut_width() gives one box per bin and removes the warning.
ggplot(data = data, mapping = aes(x = release_year, y = age_difference)) +
  geom_boxplot(mapping = aes(group = cut_width(release_year, 10)))

Covariation

# Count plot: point size shows how many rows share each
# (release_year, age_difference) combination.
ggplot(data, aes(x = release_year, y = age_difference)) +
  geom_count()

A categorical and continuous variable

# Scatter plot with one point per row: release year on x, director on y.
ggplot(data, aes(x = release_year, y = director)) +
  geom_point()

Two categorical variables

# Count plot for two categorical variables: point size shows how many rows
# fall into each combination of the two character-gender columns.
# The previous version plotted actor_1_age against actor_2_age, which are
# numeric columns and do not fit a two-categorical comparison.
ggplot(data = data) +
  geom_count(mapping = aes(x = character_1_gender, y = character_2_gender))

Two continuous variables

# Scatter plot of `age_difference` against `release_year`.
# The previous `alpha = 50 / 50` evaluates to 1 (fully opaque), so it had no
# effect. A fractional alpha lets overlapping points accumulate darkness and
# shows where observations pile up.
ggplot(data = data) +
  geom_point(
    mapping = aes(x = release_year, y = age_difference),
    alpha = 1 / 5
  )

Patterns and models

# Scatter plot with one point per row: age difference on x, couple number
# on y.
ggplot(data, aes(x = age_difference, y = couple_number)) +
  geom_point()