library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(readxl)
rs <- read_csv("/Users/aaliyahmjh/Downloads/rolling_stone.csv", show_col_types = FALSE)
This dataset has a wide structure with duplicate columns such as name and 3 separate rank columns that can be combined and referenced with a “year” column.
tidy_rs <- rs %>%
pivot_longer(
cols = starts_with("rank_"),
names_to = "year",
values_to = "rank",
names_pattern = "rank_(\\d+)"
)
tidy_rs
## # A tibble: 2,073 × 20
## sort_name clean_name album differential release_year genre type
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 Sinatra, Frank Frank Sinatra In the We… -182 1955 Big … Stud…
## 2 Sinatra, Frank Frank Sinatra In the We… -182 1955 Big … Stud…
## 3 Sinatra, Frank Frank Sinatra In the We… -182 1955 Big … Stud…
## 4 Diddley, Bo Bo Diddley Bo Diddle… -241 1955 Rock… Stud…
## 5 Diddley, Bo Bo Diddley Bo Diddle… -241 1955 Rock… Stud…
## 6 Diddley, Bo Bo Diddley Bo Diddle… -241 1955 Rock… Stud…
## 7 Presley, Elvis Elvis Presley Elvis Pre… -277 1956 Rock… Stud…
## 8 Presley, Elvis Elvis Presley Elvis Pre… -277 1956 Rock… Stud…
## 9 Presley, Elvis Elvis Presley Elvis Pre… -277 1956 Rock… Stud…
## 10 Sinatra, Frank Frank Sinatra Songs for… -195 1956 Big … Stud…
## # ℹ 2,063 more rows
## # ℹ 13 more variables: weeks_on_billboard <dbl>, peak_billboard_position <dbl>,
## # spotify_popularity <dbl>, spotify_url <chr>, artist_member_count <dbl>,
## # artist_gender <chr>, artist_birth_year_sum <dbl>,
## # debut_album_release_year <dbl>, ave_age_at_top_500 <dbl>,
## # years_between <dbl>, album_id <chr>, year <chr>, rank <dbl>
Filtered the dataset to only keep the columns that I’m interested in for my analysis.
filter_rs <- tidy_rs %>%
select(clean_name, album, genre, year, rank, spotify_popularity)
head(filter_rs)
## # A tibble: 6 × 6
## clean_name album genre year rank spotify_popularity
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 Frank Sinatra In the Wee Small Hours Big B… 2003 100 48
## 2 Frank Sinatra In the Wee Small Hours Big B… 2012 101 48
## 3 Frank Sinatra In the Wee Small Hours Big B… 2020 282 48
## 4 Bo Diddley Bo Diddley / Go Bo Diddley Rock … 2003 214 50
## 5 Bo Diddley Bo Diddley / Go Bo Diddley Rock … 2012 216 50
## 6 Bo Diddley Bo Diddley / Go Bo Diddley Rock … 2020 455 50
Fix typo issue between Blues/Blues ROck and Blues/Blues Rock
filter_rs <- filter_rs %>%
mutate(genre = case_when(
genre %in% c("Blues/Blues ROck", "Blues/Blues Rock-") ~ "Blues/Blues Rock",
TRUE ~ genre
))
Since a lower rank number indicates a higher ranking (rank 1 being the best), I calculated the average rank for each genre- a lower average rank would suggest that a genre was consistently ranked higher. It’s also important to note that every album was not ranked which was another reason I decided to take the average, trying to minimize the biases as much as possible.
Reggae was consistently rated the highest with Afrobeats being consistently rated the lowest.
genreIncluded <- filter_rs %>%
filter(!is.na(genre))
avgRankByGenre <- genreIncluded %>%
group_by(genre) %>%
summarize(
avg_rank = mean(rank, na.rm = TRUE),
count = n()
) %>%
arrange(avg_rank)
print(avgRankByGenre)
## # A tibble: 15 × 3
## genre avg_rank count
## <chr> <dbl> <int>
## 1 Reggae 182. 21
## 2 Rock n' Roll/Rhythm & Blues 189. 30
## 3 Big Band/Jazz 216. 42
## 4 Country/Folk/Country Rock/Folk Rock 216. 147
## 5 Funk/Disco 222. 66
## 6 Hard Rock/Metal 230. 81
## 7 Blues/Blues Rock 241. 198
## 8 Soul/Gospel/R&B 242. 225
## 9 Hip-Hop/Rap 249. 183
## 10 Singer-Songwriter/Heartland Rock 261. 84
## 11 Punk/Post-Punk/New Wave/Power Pop 280. 252
## 12 Indie/Alternative Rock 285. 180
## 13 Electronic 337. 42
## 14 Latin 427. 24
## 15 Afrobeat 434. 6
ggplot(avgRankByGenre, aes(x = reorder(genre, avg_rank), y = avg_rank)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(
title = "Average Rank by Genre",
x = "Genre",
y = "Average Rank (Lower = Better)"
)
## 2. Which Artists Have the Most Albums on the List? To see which
artists appear most frequently, I just counted the number of albums per
artist.
The Beatles and Bob Dylan are tied for most albums on the list at 11 each.
artistAlbumCounts <- filter_rs %>%
distinct(clean_name, album) %>%
count(clean_name, sort = TRUE)
head(artistAlbumCounts,20)
## # A tibble: 20 × 2
## clean_name n
## <chr> <int>
## 1 Bob Dylan 11
## 2 The Beatles 11
## 3 Rolling Stones 10
## 4 Bruce Springsteen 9
## 5 David Bowie 7
## 6 The Who 7
## 7 Elton John 6
## 8 Kanye West 6
## 9 Neil Young 6
## 10 Bob Marley 5
## 11 James Brown 5
## 12 Led Zeppelin 5
## 13 Otis Redding 5
## 14 Radiohead 5
## 15 The Beach Boys 5
## 16 The Byrds 5
## 17 U2 5
## 18 Various Artists 5
## 19 Aretha Franklin 4
## 20 Creedence Clearwater Revival 4
I wanted to create visualization showing the top 10 artists with the most albums but the chart showed 18 artists due to multiple ties.
top10_artists <- artistAlbumCounts %>%
slice_max(n, n = 10)
ggplot(top10_artists, aes(x = reorder(clean_name, n), y = n)) +
geom_bar(stat = "identity", fill = "salmon") +
coord_flip() +
labs(
title = "Number of Albums per Artist (Top 10)",
x = "Artist",
y = "Album Count"
)
I decided to utilize a scatterplot here to identify the correlation, if any, between high ranks and high Spotify popularity scores. From the scatterplot, it seems like there is a very weak negative correlation — the trend line is sloping downward, but the points on the plot are quite scattered.
This suggests that albums with lower rank (better ranking) might have SLIGHTLY higher Spotify popularity on average, but this relationship is very weak.
# Create a scatter plot to visualize the correlation
ggplot(filter_rs, aes(x = rank, y = spotify_popularity)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, col = "red") +
labs(
title = "Relationship Between Album Rank and Spotify Popularity",
x = "Album Rank (Lower = Better)",
y = "Spotify Popularity"
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 659 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 659 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Calculate the correlation between rank and spotify_popularity
correlation <- cor(filter_rs$rank, filter_rs$spotify_popularity, use = "complete.obs")
print(correlation)
## [1] -0.185627
untidy_sales <- read_csv("/Users/aaliyahmjh/Downloads/sales.csv", show_col_types = FALSE)
“The data is messy because it’s in wide format, with many columns representing individual months, and each row represents a different product across the regions” I decided to pivot it to a long table to remove the redundancy and make the table easier to work with, analyze and update in the future.
t_sales <- untidy_sales %>%
pivot_longer(
cols = c("Jan Sales", "Feb Sales", "Mar Sales",
"Apr Sales", "May Sales", "Jun Sales"),
names_to = "Month",
values_to = "Sales"
) %>%
mutate(
Month = str_replace(Month, " Sales", ""),
Sales = parse_number(as.character(Sales))
)
t_sales
## # A tibble: 54 × 4
## `Product Name` Region Month Sales
## <chr> <chr> <chr> <dbl>
## 1 Product A North Jan 100
## 2 Product A North Feb 110
## 3 Product A North Mar 120
## 4 Product A North Apr 130
## 5 Product A North May 140
## 6 Product A North Jun 150
## 7 Product A South Jan 200
## 8 Product A South Feb 210
## 9 Product A South Mar 220
## 10 Product A South Apr 230
## # ℹ 44 more rows
All products show consistent growth (upward trajectory in sales) from Jan to June in all regions. Product B consistently has the highest sales in all regions, followed by Product A then Product C - this is the trend among each region.
This chart does not show any obvious seasonal fluctuations - there are no clear dips or spikes in certain months, rather just consistent growth.
t_sales <- t_sales %>%
mutate(Month = factor(Month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun"), ordered = TRUE))
ggplot(t_sales, aes(x = Month, y = Sales, group = `Product Name`, color = `Product Name`)) +
geom_line() +
facet_wrap(~ Region) +
labs(title = "Monthly Sales Trends by Product and Region")
As expected based on the analysis above, product B is performing the best in terms of overall sales across all regions when we calculate and plot their total sales.
product_sales <- t_sales %>%
group_by(`Product Name`) %>%
summarise(Total_Sales = sum(Sales, na.rm = TRUE))
t_sales <- t_sales %>%
mutate(`Product Name` = factor(`Product Name`, levels = c("Product A", "Product B", "Product C")))
ggplot(product_sales, aes(x = `Product Name`, y = Total_Sales)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Total Sales by Product", x = "Product", y = "Total Sales")
This graph shows the monthly sales distribution for each product by
plotting their highest sales among regions for each month.
Once again, it is evident that Product C is severely under performing, as we notice that its maximum sales overall (~175) is still lower than the minimum sales for both Product A (~300) and Product B (~350)
ggplot(t_sales, aes(x = Month, y = Sales, fill = `Product Name`)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Monthly Sales Distribution by Product", x = "Month", y = "Sales")
weather <- read_csv("/Users/aaliyahmjh/Downloads/weather.csv", show_col_types = FALSE)
“It is untidy because: - Months are represented as separate columns instead of being in a single”Month” column. - Temperature and humidity are stored in separate columns for each month, rather than having a single “Measure” column with corresponding values. - Each row represents a city, but instead of a single observation per row, there are multiple values in different columns.”
tidy_weather <- weather %>%
pivot_longer(
cols = -City,
names_to = c("variable", "month"),
names_sep = "_" # Separates the names at the underscore
) %>%
# Pivot wider so that I can store temperature and humidity separately since they represent different things
pivot_wider(
names_from = variable,
values_from = value
) %>%
rename(
temperature = Temp,
humidity = Humid
)
head(tidy_weather, 10)
## # A tibble: 9 × 4
## City month temperature humidity
## <chr> <chr> <chr> <chr>
## 1 New York Jan 32°F 75%
## 2 New York Feb 35°F 72%
## 3 New York Mar 42°F 68%
## 4 Los Angeles Jan 58°F 65%
## 5 Los Angeles Feb 60°F 63%
## 6 Los Angeles Mar 65°F 60%
## 7 Chicago Jan 28°F 80%
## 8 Chicago Feb 30°F 78%
## 9 Chicago Mar 40°F 75%
I decided to plot the temperature trends throughout the first 3 months of the year (Winter) for all cities and I found that: - Los Angeles has the highest temperatures in the first three months of the year and consistently becomes warmer (rising from 58 degrees to 65 degrees). - Chicago has the coldest temperatures and shows a sharp increase between February to March (rising from 30 degrees to 40 degrees). - New York follows a similar pattern to Chicago although a bit warmer and recorded a (less sharp) increase between Feb and Mar from 35 degrees to 42 degrees.
tidy_weather <- tidy_weather %>%
mutate(month = factor(month, levels = c("Jan", "Feb", "Mar")))
ggplot(tidy_weather, aes(x = month, y = temperature, group = City, color = City)) +
geom_line() +
geom_point() +
labs(title = "Seasonal Temperature Trends",
x = "Month",
y = "Temperature") +
theme_minimal()
I, then, did the same thing to examine the humidity trends in these 3 cities for the same 3 months
The graph below shows that humidity is inversely proportional to the temperature since the city with the lowest temperature (Chicago) has the highest humidity. The opposite is also true with LA having the highest temp and lowest humidity. The negative slope, however, is consistent across the cities - suggesting that as winter ends, the humidity usually declines in all 3 locations.
ggplot(tidy_weather, aes(x = month, y = humidity, group = City, color = City)) +
geom_line() +
geom_point() +
labs(title = "Seasonal Humidity Trends",
x = "Month",
y = "Humidity") +
theme_minimal()