# Bar chart of how many rows fall into each artist_gender category,
# followed by the same tally as a table.
ggplot(myData, aes(x = artist_gender)) +
  geom_bar()
count(myData, artist_gender)
## # A tibble: 4 × 2
## artist_gender n
## <chr> <int>
## 1 Female 93
## 2 Male 358
## 3 Male/Female 47
## 4 <NA> 2
# Distribution of album release years (histogram with the default binning).
ggplot(myData, aes(x = release_year)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of album releases over time, one frequency polygon per artist gender.
ggplot(myData) +
  geom_freqpoly(aes(x = release_year, colour = artist_gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of the number of weeks albums spent on the Billboard charts.
ggplot(myData, aes(x = weeks_on_billboard)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 78 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Weeks on the Billboard charts vs. artist member count, restricted to albums
# with 1-200 weeks on the charts and artists with 1-5 members.
# Values outside those ranges are recoded to NA rather than dropping the rows,
# so geom_point() leaves them out of the scatter plot (with a warning).
myData2 <- mutate(
  myData,
  weeks_on_billboard = ifelse(
    weeks_on_billboard >= 1 & weeks_on_billboard <= 200,
    weeks_on_billboard,
    NA
  ),
  artist_member_count = ifelse(
    artist_member_count >= 1 & artist_member_count <= 5,
    artist_member_count,
    NA
  )
)
ggplot(myData2, aes(x = weeks_on_billboard, y = artist_member_count)) +
  geom_point()
## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Artist gender vs. peak Billboard position: one box plot per gender category.
ggplot(myData) +
  geom_boxplot(aes(x = artist_gender, y = peak_billboard_position))
### Two categorical variables
# Artist Member Count vs Peak Billboard Position
# Heat map of row counts over binned member count (bins of width 2) and binned
# peak Billboard position.
# The top member-count break is rounded up to the next multiple of 2. Ending the
# sequence at the raw maximum would push the largest groups into NA whenever
# that maximum is odd (e.g. max 9 gives breaks 0, 2, ..., 8, so 9 has no bin).
myData %>%
  count(
    artist_member_bin = cut(
      artist_member_count,
      breaks = seq(
        0,
        # max(2, ...) guarantees at least one interval when no count exceeds 1.
        max(2, 2 * ceiling(max(artist_member_count, na.rm = TRUE) / 2)),
        by = 2
      )
    ),
    peak_billboard_bin = cut(peak_billboard_position, breaks = c(0, 50, 100, 200))
  ) %>%
  ggplot(aes(x = artist_member_bin, y = peak_billboard_bin)) +
  geom_tile(aes(fill = n)) +
  labs(
    x = "Artist Member Count",
    y = "Peak Billboard Position",
    title = "Artist Member Count vs Peak Billboard Position"
  )
# Genre vs. 2020 rank: one box plot per genre.
# The x-axis labels are rotated 45 degrees and right-aligned.
ggplot(data = myData) +
  geom_boxplot(mapping = aes(x = genre, y = rank_2020)) +
  labs(
    title = "Genre vs Rank in 2020",
    x = "Genre",
    y = "Rank in 2020"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(modelr)
# Linear model on the log scale: log Spotify popularity explained by
# log 2020 rank plus artist member count.
mod_member_count <- lm(
  formula = log(spotify_popularity) ~ log(rank_2020) + artist_member_count,
  data = myData
)
# Attach the model residuals to the data and exponentiate them, undoing the
# log applied to the response.
myData2 <- mutate(
  add_residuals(myData, mod_member_count),
  resid = exp(resid)
)
# Scatter plot of the exponentiated residuals against 2020 rank, with points
# coloured by artist member count treated as a discrete factor.
ggplot(
  myData2,
  aes(x = rank_2020, y = resid, color = as.factor(artist_member_count))
) +
  geom_point() +
  labs(
    title = "Residuals of Model: Spotify Popularity vs Rank 2020 (by Artist Member Count)",
    x = "Rank in 2020",
    y = "Residuals (Spotify Popularity)"
  ) +
  theme_minimal()
## Warning: Removed 23 rows containing missing values or values outside the scale range
## (`geom_point()`).