Introduction

My data consists of various albums and how they have placed on the Billboard top 500. The data includes variables such as weeks on the Billboard chart, peak position, artist gender, and member count.

## Questions

Variation

Visualizing distributions

# Bar chart showing how many rows fall under each value of `type`.
data %>%
    ggplot(aes(x = type)) +
    geom_bar()

# Tally rows per distinct weeks_on_billboard value. The column is stored as
# character (<chr>), so the result is ordered as text ("1", "10", "100", ...)
# rather than numerically.
count(data, weeks_on_billboard)
## # A tibble: 169 × 2
##    weeks_on_billboard     n
##    <chr>              <int>
##  1 1                     12
##  2 10                     2
##  3 100                    4
##  4 101                    2
##  5 102                    2
##  6 103                    4
##  7 104                    4
##  8 105                    2
##  9 106                    2
## 10 107                    2
## # ℹ 159 more rows
# Histogram of release_year using the default binning (30 bins).
data %>%
    ggplot(aes(x = release_year)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of release_year, one coloured line per `type`,
# using the default binning (30 bins).
data %>%
    ggplot(aes(colour = type, x = release_year)) +
    geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

data %>%

    # Keep only albums that peaked at position 5 or better. `<=` is used so
    # that position 5 itself is retained; a strict `<` would drop it even
    # though only positions greater than 5 are meant to be filtered out.
    filter(peak_billboard_position <= 5) %>%

    # Histogram of the remaining peak positions, with bins 0.5 wide
    ggplot(aes(x = peak_billboard_position)) +
    geom_histogram(binwidth = 0.5)

Unusual values

Missing Values

Covariation

A categorical and continuous variable

# Box plots of artist_member_count for each `type`. Rows with a non-finite
# artist_member_count are dropped by stat_boxplot() with a warning.
ggplot(data, aes(y = artist_member_count, x = type)) +
    geom_boxplot()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Two categorical variables

# Heat map of type against genre: count the rows for every (type, genre)
# pair, then shade each tile by that count `n`.
ggplot(count(data, type, genre)) +
    geom_tile(aes(x = type, y = genre, fill = n))

Two continuous variables

# Hex-binned plot of peak_billboard_position against release_year.
ggplot(data, aes(x = release_year, y = peak_billboard_position)) +
    geom_hex()

# Box plots of peak_billboard_position for albums released before 1980,
# grouped into release_year bins of width 0.1 via cut_width().
# NOTE(review): if release_year holds whole years, this yields one box per
# year -- confirm that is the intent.
data %>%
    filter(release_year < 1980) %>%
    ggplot(aes(
        x = release_year,
        y = peak_billboard_position,
        group = cut_width(release_year, 0.1)
    )) +
    geom_boxplot()

## Patterns and models