Import Data

# Read the Excel workbook into `scooby`, the data frame the plots below use.
scooby <- readxl::read_excel(path = "../00_data/MyData.xlsx")

Introduction

Questions

Some of my categorical columns contain long combinations of values (for example: male, male, female, male, female, etc.). Can I replace those values with the word "various", effectively creating a miscellaneous category? That would give me more categorical data to work with than just monster_real.

Variation

Visualizing distributions

# Bar chart: number of rows at each value of culprit_amount.
scooby %>%
  ggplot(aes(x = culprit_amount)) +
  geom_bar()

# Histogram of monster_amount using bins one unit wide.
scooby %>%
  ggplot(aes(x = monster_amount)) +
  geom_histogram(binwidth = 1)

# Frequency polygons of monster_amount, one line per monster_real value.
# binwidth is set explicitly (the same width the histograms of this variable
# use) instead of relying on stat_bin()'s default of 30 bins, which ggplot2
# flags with a "Pick better value with `binwidth`" message.
ggplot(
  data = scooby,
  mapping = aes(x = monster_amount, colour = monster_real)
) +
  geom_freqpoly(binwidth = 1)

Typical values

# Histogram of monster_amount restricted to rows where it is below 5.1,
# again with bins one unit wide.
ggplot(
  data = filter(scooby, monster_amount < 5.1),
  mapping = aes(x = monster_amount)
) +
  geom_histogram(binwidth = 1)

# Distribution of culprit_amount in unit-wide bins.
ggplot(data = scooby, mapping = aes(x = culprit_amount)) +
  geom_histogram(binwidth = 1)

Unusual values

# Distribution of suspects_amount in unit-wide bins, full y range.
ggplot(data = scooby, mapping = aes(x = suspects_amount)) +
  geom_histogram(binwidth = 1)

# Same suspects_amount histogram, with the y axis zoomed to 0-100.
# coord_cartesian() only changes the visible window; no rows are dropped.
ggplot(data = scooby, mapping = aes(x = suspects_amount)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(ylim = c(0, 100))

Missing Values

# Keep engagement only when it lies in 10-200; anything outside that range
# becomes NA. Then plot engagement against imdb (geom_point() leaves out the
# rows with missing values and reports how many it removed).
scooby %>%
  mutate(
    engagement = ifelse(
      engagement >= 10 & engagement <= 200,
      engagement,
      NA
    )
  ) %>%
  ggplot(mapping = aes(x = engagement, y = imdb)) +
  geom_point()
## Warning: Removed 128 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Box plots of monster_amount for each value of monster_real.
ggplot(data = scooby, mapping = aes(x = monster_real, y = monster_amount)) +
  geom_boxplot()

Two categorical variables

# Count rows for each monster_real / setting_terrain pair, then draw the
# counts (column n) as the fill colour of a tile grid.
ggplot(
  data = count(scooby, monster_real, setting_terrain),
  mapping = aes(x = monster_real, y = setting_terrain, fill = n)
) +
  geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal 2-D bins of monster_amount against imdb.
ggplot(data = scooby, mapping = aes(x = monster_amount, y = imdb)) +
  geom_hex()
## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_binhex()`).

Patterns and models

library(modelr)

# Linear fit of log(imdb) on log(engagement).
mod <- lm(log(imdb) ~ log(engagement), data = scooby)

# Attach the model residuals as `resid`, then exponentiate them to undo the
# log scale used in the fit.
scooby4 <- mutate(
  modelr::add_residuals(scooby, mod),
  resid = exp(resid)
)

# Scatter plot of the exponentiated residuals against monster_amount.
ggplot(data = scooby4, mapping = aes(x = monster_amount, y = resid)) +
  geom_point()
## Warning: Removed 15 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Box plots of the exponentiated residuals for each value of monster_real.
ggplot(data = scooby4, mapping = aes(x = monster_real, y = resid)) +
  geom_boxplot()
## Warning: Removed 15 rows containing non-finite outside the scale range
## (`stat_boxplot()`).