Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows in myData for each artist_gender value.
myData %>%
  ggplot(aes(x = artist_gender)) +
  geom_bar()

count(myData, artist_gender)
## # A tibble: 4 × 2
##   artist_gender     n
##   <chr>         <int>
## 1 Female           93
## 2 Male            358
## 3 Male/Female      47
## 4 <NA>              2
# Shows the spread of album release years.
# binwidth = 1 gives every release year its own bar. The default of 30 bins
# produces a bin width that does not line up with whole years, so some bars
# would cover more years than others.
ggplot(data = myData, mapping = aes(x = release_year)) +
  geom_histogram(binwidth = 1)

Typical values

# Shows the count of album releases per year, one line per artist gender.
# binwidth = 1 makes each point an actual per-year count; the default of 30
# bins would group an uneven number of years into each point.
ggplot(
  data = myData,
  mapping = aes(x = release_year, colour = artist_gender)
) +
  geom_freqpoly(binwidth = 1)

Unusual values

# Histogram of weeks_on_billboard: how long albums stayed on the Billboard
# charts. Uses ggplot2's default binning.
myData %>%
  ggplot(aes(x = weeks_on_billboard)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 78 rows containing non-finite outside the scale range
## (`stat_bin()`).

Missing Values

# Scatter plot of weeks on the Billboard chart against artist member count,
# restricted to albums with 1-200 weeks on the chart and 1-5 artist members.
# Values outside those ranges are replaced with NA (rather than dropping the
# rows), so the plot simply omits them.
myData2 <- mutate(
  myData,
  weeks_on_billboard = ifelse(
    weeks_on_billboard >= 1 & weeks_on_billboard <= 200,
    weeks_on_billboard,
    NA
  ),
  artist_member_count = ifelse(
    artist_member_count >= 1 & artist_member_count <= 5,
    artist_member_count,
    NA
  )
)


ggplot(myData2, aes(x = weeks_on_billboard, y = artist_member_count)) +
  geom_point()
## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).

Covariation

A categorical and continuous variable

# Artist gender vs peak Billboard position: one box plot of
# peak_billboard_position per artist_gender value.
myData %>%
  ggplot(aes(x = artist_gender, y = peak_billboard_position)) +
  geom_boxplot()

Two categorical variables

# Artist Member Count vs Peak Billboard Position
# Bin both variables and draw a heat map of how many rows fall in each cell.
# The top member-count break is rounded up to the next multiple of 2 so the
# largest value still lands in a bin. Ending the sequence at the raw maximum
# leaves an odd maximum above the last break, where cut() turns it into NA,
# and a maximum of 1 leaves a single break, which cut() rejects.
myData %>%
  count(
    artist_member_bin = cut(
      artist_member_count,
      breaks = seq(
        0,
        2 * ceiling(max(artist_member_count, na.rm = TRUE) / 2),
        by = 2
      )
    ),
    peak_billboard_bin = cut(
      peak_billboard_position,
      breaks = c(0, 50, 100, 200)
    )
  ) %>%
  ggplot(aes(x = artist_member_bin, y = peak_billboard_bin)) +
  geom_tile(aes(fill = n)) +
  labs(
    x = "Artist Member Count",
    y = "Peak Billboard Position",
    title = "Artist Member Count vs Peak Billboard Position"
  )

Two continuous variables

# Genre vs rank in 2020: one box plot of rank_2020 per genre, with the x-axis
# labels rotated 45 degrees.
myData %>%
  ggplot(aes(x = genre, y = rank_2020)) +
  geom_boxplot() +
  labs(x = "Genre", y = "Rank in 2020", title = "Genre vs Rank in 2020") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Patterns and models

library(modelr)

# Linear model: log Spotify popularity explained by log 2020 rank plus the
# artist member count.
mod_member_count <- lm(
  log(spotify_popularity) ~ log(rank_2020) + artist_member_count,
  data = myData
)

# Attach the model residuals as a `resid` column, then exponentiate them
# (the response was modelled on the log scale).
myData2 <- add_residuals(myData, mod_member_count)
myData2 <- mutate(myData2, resid = exp(resid))

# Scatter plot of the exponentiated residuals against 2020 rank, with one
# colour per artist member count (treated as a factor).
ggplot(
  myData2,
  aes(x = rank_2020, y = resid, color = as.factor(artist_member_count))
) +
  geom_point() +
  labs(
    x = "Rank in 2020",
    y = "Residuals (Spotify Popularity)",
    title = "Residuals of Model: Spotify Popularity vs Rank 2020 (by Artist Member Count)"
  ) +
  theme_minimal()
## Warning: Removed 23 rows containing missing values or values outside the scale range
## (`geom_point()`).