library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(readxl)

DATASET 1 - Rolling Stones

rs <- read_csv("/Users/aaliyahmjh/Downloads/rolling_stone.csv", show_col_types = FALSE)

This dataset has a wide structure with duplicate columns such as name and 3 separate rank columns that can be combined and referenced with a “year” column.

tidy_rs <- rs %>%
  pivot_longer(
    cols = starts_with("rank_"),    
    names_to = "year",               
    values_to = "rank",             
    names_pattern = "rank_(\\d+)" 
  )

tidy_rs
## # A tibble: 2,073 × 20
##    sort_name      clean_name    album      differential release_year genre type 
##    <chr>          <chr>         <chr>             <dbl>        <dbl> <chr> <chr>
##  1 Sinatra, Frank Frank Sinatra In the We…         -182         1955 Big … Stud…
##  2 Sinatra, Frank Frank Sinatra In the We…         -182         1955 Big … Stud…
##  3 Sinatra, Frank Frank Sinatra In the We…         -182         1955 Big … Stud…
##  4 Diddley, Bo    Bo Diddley    Bo Diddle…         -241         1955 Rock… Stud…
##  5 Diddley, Bo    Bo Diddley    Bo Diddle…         -241         1955 Rock… Stud…
##  6 Diddley, Bo    Bo Diddley    Bo Diddle…         -241         1955 Rock… Stud…
##  7 Presley, Elvis Elvis Presley Elvis Pre…         -277         1956 Rock… Stud…
##  8 Presley, Elvis Elvis Presley Elvis Pre…         -277         1956 Rock… Stud…
##  9 Presley, Elvis Elvis Presley Elvis Pre…         -277         1956 Rock… Stud…
## 10 Sinatra, Frank Frank Sinatra Songs for…         -195         1956 Big … Stud…
## # ℹ 2,063 more rows
## # ℹ 13 more variables: weeks_on_billboard <dbl>, peak_billboard_position <dbl>,
## #   spotify_popularity <dbl>, spotify_url <chr>, artist_member_count <dbl>,
## #   artist_gender <chr>, artist_birth_year_sum <dbl>,
## #   debut_album_release_year <dbl>, ave_age_at_top_500 <dbl>,
## #   years_between <dbl>, album_id <chr>, year <chr>, rank <dbl>

Filtered the dataset to only keep the columns that I’m interested in for my analysis.

filter_rs <- tidy_rs %>%
  select(clean_name, album, genre, year, rank, spotify_popularity)
head(filter_rs)
## # A tibble: 6 × 6
##   clean_name    album                      genre  year   rank spotify_popularity
##   <chr>         <chr>                      <chr>  <chr> <dbl>              <dbl>
## 1 Frank Sinatra In the Wee Small Hours     Big B… 2003    100                 48
## 2 Frank Sinatra In the Wee Small Hours     Big B… 2012    101                 48
## 3 Frank Sinatra In the Wee Small Hours     Big B… 2020    282                 48
## 4 Bo Diddley    Bo Diddley / Go Bo Diddley Rock … 2003    214                 50
## 5 Bo Diddley    Bo Diddley / Go Bo Diddley Rock … 2012    216                 50
## 6 Bo Diddley    Bo Diddley / Go Bo Diddley Rock … 2020    455                 50

Fix typo issue between Blues/Blues ROck and Blues/Blues Rock

filter_rs <- filter_rs %>%
    mutate(genre = case_when(
    genre %in% c("Blues/Blues ROck", "Blues/Blues Rock-") ~ "Blues/Blues Rock",
    TRUE ~ genre
  ))

EDA

Which Genres Are Consistently Ranked the Highest?

Since a lower rank number indicates a higher ranking (rank 1 being the best), I calculated the average rank for each genre- a lower average rank would suggest that a genre was consistently ranked higher. It’s also important to note that every album was not ranked which was another reason I decided to take the average, trying to minimize the biases as much as possible.

Reggae was consistently rated the highest with Afrobeats being consistently rated the lowest.

genreIncluded <- filter_rs %>%
  filter(!is.na(genre))

avgRankByGenre <- genreIncluded %>%
  group_by(genre) %>%
  summarize(
    avg_rank = mean(rank, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(avg_rank)

print(avgRankByGenre)
## # A tibble: 15 × 3
##    genre                               avg_rank count
##    <chr>                                  <dbl> <int>
##  1 Reggae                                  182.    21
##  2 Rock n' Roll/Rhythm & Blues             189.    30
##  3 Big Band/Jazz                           216.    42
##  4 Country/Folk/Country Rock/Folk Rock     216.   147
##  5 Funk/Disco                              222.    66
##  6 Hard Rock/Metal                         230.    81
##  7 Blues/Blues Rock                        241.   198
##  8 Soul/Gospel/R&B                         242.   225
##  9 Hip-Hop/Rap                             249.   183
## 10 Singer-Songwriter/Heartland Rock        261.    84
## 11 Punk/Post-Punk/New Wave/Power Pop       280.   252
## 12 Indie/Alternative Rock                  285.   180
## 13 Electronic                              337.    42
## 14 Latin                                   427.    24
## 15 Afrobeat                                434.     6
ggplot(avgRankByGenre, aes(x = reorder(genre, avg_rank), y = avg_rank)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Average Rank by Genre",
    x = "Genre",
    y = "Average Rank (Lower = Better)"
  )

## 2. Which Artists Have the Most Albums on the List? To see which artists appear most frequently, I just counted the number of albums per artist.

The Beatles and Bob Dylan are tied for most albums on the list at 11 each.

artistAlbumCounts <- filter_rs %>%
  distinct(clean_name, album) %>%   
  count(clean_name, sort = TRUE)
head(artistAlbumCounts,20)
## # A tibble: 20 × 2
##    clean_name                       n
##    <chr>                        <int>
##  1 Bob Dylan                       11
##  2 The Beatles                     11
##  3 Rolling Stones                  10
##  4 Bruce Springsteen                9
##  5 David Bowie                      7
##  6 The Who                          7
##  7 Elton John                       6
##  8 Kanye West                       6
##  9 Neil Young                       6
## 10 Bob Marley                       5
## 11 James Brown                      5
## 12 Led Zeppelin                     5
## 13 Otis Redding                     5
## 14 Radiohead                        5
## 15 The Beach Boys                   5
## 16 The Byrds                        5
## 17 U2                               5
## 18 Various Artists                  5
## 19 Aretha Franklin                  4
## 20 Creedence Clearwater Revival     4

I wanted to create visualization showing the top 10 artists with the most albums but the chart showed 18 artists due to multiple ties.

top10_artists <- artistAlbumCounts %>%
  slice_max(n, n = 10)

ggplot(top10_artists, aes(x = reorder(clean_name, n), y = n)) +
  geom_bar(stat = "identity", fill = "salmon") +
  coord_flip() +
  labs(
    title = "Number of Albums per Artist (Top 10)",
    x = "Artist",
    y = "Album Count"
  )

3. Do Highly Ranked Albums Also Have High Spotify Popularity Scores?

I decided to utilize a scatterplot here to identify the correlation, if any, between high ranks and high Spotify popularity scores. From the scatterplot, it seems like there is a very weak negative correlation — the trend line is sloping downward, but the points on the plot are quite scattered.

This suggests that albums with lower rank (better ranking) might have SLIGHTLY higher Spotify popularity on average, but this relationship is very weak.

# Create a scatter plot to visualize the correlation
ggplot(filter_rs, aes(x = rank, y = spotify_popularity)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, col = "red") +
  labs(
    title = "Relationship Between Album Rank and Spotify Popularity",
    x = "Album Rank (Lower = Better)",
    y = "Spotify Popularity"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 659 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 659 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Calculate the correlation between rank and spotify_popularity
correlation <- cor(filter_rs$rank, filter_rs$spotify_popularity, use = "complete.obs")
print(correlation)
## [1] -0.185627

Dataset 2 - Sales

untidy_sales <- read_csv("/Users/aaliyahmjh/Downloads/sales.csv", show_col_types = FALSE)

“The data is messy because it’s in wide format, with many columns representing individual months, and each row represents a different product across the regions” I decided to pivot it to a long table to remove the redundancy and make the table easier to work with, analyze and update in the future.

t_sales <- untidy_sales %>%
  pivot_longer(
    cols = c("Jan Sales", "Feb Sales", "Mar Sales", 
             "Apr Sales", "May Sales", "Jun Sales"),
    names_to = "Month",
    values_to = "Sales"
  ) %>%
  mutate(
    Month = str_replace(Month, " Sales", ""),
    Sales = parse_number(as.character(Sales))
  )

t_sales
## # A tibble: 54 × 4
##    `Product Name` Region Month Sales
##    <chr>          <chr>  <chr> <dbl>
##  1 Product A      North  Jan     100
##  2 Product A      North  Feb     110
##  3 Product A      North  Mar     120
##  4 Product A      North  Apr     130
##  5 Product A      North  May     140
##  6 Product A      North  Jun     150
##  7 Product A      South  Jan     200
##  8 Product A      South  Feb     210
##  9 Product A      South  Mar     220
## 10 Product A      South  Apr     230
## # ℹ 44 more rows

Analyze the sales growth for each product across different months and regions. Identify products with consistent growth or seasonal variations.

All products show consistent growth (upward trajectory in sales) from Jan to June in all regions. Product B consistently has the highest sales in all regions, followed by Product A then Product C - this is the trend among each region.

This chart does not show any obvious seasonal fluctuations - there are no clear dips or spikes in certain months, rather just consistent growth.

t_sales <- t_sales %>%
  mutate(Month = factor(Month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun"), ordered = TRUE))

ggplot(t_sales, aes(x = Month, y = Sales, group = `Product Name`, color = `Product Name`)) +
  geom_line() +
  facet_wrap(~ Region) +
  labs(title = "Monthly Sales Trends by Product and Region")

Product Sales Distribution: Assess which product is performing best in terms of overall sales across all regions. This analysis could be extended to compare performance across months.

As expected based on the analysis above, product B is performing the best in terms of overall sales across all regions when we calculate and plot their total sales.

product_sales <- t_sales %>%
  group_by(`Product Name`) %>%
  summarise(Total_Sales = sum(Sales, na.rm = TRUE))

t_sales <- t_sales %>%
  mutate(`Product Name` = factor(`Product Name`, levels = c("Product A", "Product B", "Product C")))

ggplot(product_sales, aes(x = `Product Name`, y = Total_Sales)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Total Sales by Product", x = "Product", y = "Total Sales")

This graph shows the monthly sales distribution for each product by plotting their highest sales among regions for each month.

Once again, it is evident that Product C is severely under performing, as we notice that its maximum sales overall (~175) is still lower than the minimum sales for both Product A (~300) and Product B (~350)

ggplot(t_sales, aes(x = Month, y = Sales, fill = `Product Name`)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Monthly Sales Distribution by Product", x = "Month", y = "Sales")

DATASET 3 - Weather

weather <- read_csv("/Users/aaliyahmjh/Downloads/weather.csv", show_col_types = FALSE)

“It is untidy because: - Months are represented as separate columns instead of being in a single”Month” column. - Temperature and humidity are stored in separate columns for each month, rather than having a single “Measure” column with corresponding values. - Each row represents a city, but instead of a single observation per row, there are multiple values in different columns.”

tidy_weather <- weather %>%
  pivot_longer(
    cols = -City,
    names_to = c("variable", "month"),
    names_sep = "_"  # Separates the names at the underscore
  ) %>%
  # Pivot wider so that I can store temperature and humidity separately since they represent different things
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  rename(
    temperature = Temp,
    humidity = Humid
  )

head(tidy_weather, 10)
## # A tibble: 9 × 4
##   City        month temperature humidity
##   <chr>       <chr> <chr>       <chr>   
## 1 New York    Jan   32°F        75%     
## 2 New York    Feb   35°F        72%     
## 3 New York    Mar   42°F        68%     
## 4 Los Angeles Jan   58°F        65%     
## 5 Los Angeles Feb   60°F        63%     
## 6 Los Angeles Mar   65°F        60%     
## 7 Chicago     Jan   28°F        80%     
## 8 Chicago     Feb   30°F        78%     
## 9 Chicago     Mar   40°F        75%