This is the Rmd. script with reproducible code used to perform an analysis which was the base for the report In the search of music’s golden age. The code consists of all necessary data transformations and graphs presented in the report as well as some additional graphs enabling a slightly more expanded insight into the nature of data. The database is retrieved from RIAA site and presents recorded music sales and revenues divided by format within the time range of 1973-2019. The database is available here.
library(tidyverse)
library(car)
library(scales)
library(ggalluvial)
library(RColorBrewer)
library(data.table)
library(ggiraph)
library(ggridges)
library(viridis)
library(ggpubr)
library(ggforce)
library(gganimate)
library(forcats)
library(magick)
We began with importing the sales part of the database, consisting of the number of sales of music recordings divided by formats..
sales <- read.csv2('data/Sales.csv', dec =',',
encoding = 'UTF-8')
The dataset had to be cleaned. Useful viariables were selected, renamed and transformed into more convenient form.
sales1 <- sales %>% dplyr::select(Year, Format, Value..Actual., Total.Value.For.Year) %>%
rename(Sales = Value..Actual., TotalYearSales = Total.Value.For.Year) %>%
separate(TotalYearSales, into = c("tr1", "SalesForYear", "tr2"),
sep = c(2,-1), remove = FALSE) %>%
select(Year, Format, Sales, SalesForYear) %>%
mutate(SalesForYear = as.numeric(gsub("," ,".", SalesForYear)))
sales1[sales1$Year == 2013 & sales1$Format == "DVD Audio",]$Sales
## [1] -0.05473517
The number of sales for one observation of the DVD Audio format for a year 2013 was negative and made no sense. We replaced this value with the mean of DVD Audio sales for 2012 and 2014 to obtain the likely value for this year and to keep the continuous character of the records. Next we ordered the data according to the time of occurence and created the first plot showing the music recordings sold in US between 1973 and 2019 divided by music format.
# missing proper data for DVD Audio in 2013
sales1[sales1$Year == 2013 & sales1$Format == "DVD Audio",]$Sales <- mean(c(sales1[sales1$Year %in% c(2012,2014) & sales1$Format == "DVD Audio",]$Sales))
#ordering Format category by the time of occurence
sall <- sales1 %>% arrange(Year)
orderedFormat <- sall[!duplicated(sall$Format), 'Format']
sales1 <- sales1 %>% mutate(Format= fct_relevel(Format, orderedFormat))
ggplot(data = sales1, aes(x = Year, y = Sales, fill = Format)) +
geom_col(position = "stack", width=1)+
labs(y = 'Number of records sold', x = 'Year',
title = 'Changes in the formats of recorded music',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
scale_x_continuous(name="Year", breaks = seq(1973, 2019, 5))+
scale_y_continuous(breaks = seq(0,2000, 250), labels = label_number(suffix= "M"))+
scale_fill_manual(values=colorRampPalette(brewer.pal(8, "Paired"))(18))+
theme_minimal()+
theme(legend.title = element_blank())
Next, we created additional variable presenting the type of each format (whether the format is a digital or physical one).
sales2 <- sales1 %>% mutate(Type = ifelse(Format %in%
c('Download Album', 'Download Music Video',
'Download Single', 'Ringtones & Ringbacks',
'Kiosk'),
'Digital', 'Physical'))
Now we were able to create the plot with the comparison of those two types in the analysed time period.
mypalette <- brewer.pal(3,"Dark2") #to ensure a proper color order
ggplot(data = sales2, aes(x = Year, y = Sales, fill = Type)) +
geom_col(position = "stack", width=1)+
labs(y = 'Number of records sold', x = 'Year',
title = 'Digital transformation in the music industry',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
scale_x_continuous(name="Year", breaks = seq(1973, 2019, 3))+
scale_y_continuous(breaks = seq(0,2000, 250), labels = label_number(suffix= "M"))+
annotate(geom="text", x=1993, y=300, label="Physical", color="black", fontface =2, size =7)+
annotate(geom="text", x=2010, y=1000, label="Digital", color="black", fontface =2, size =7)+
scale_fill_manual(values=mypalette[2:1]) +
theme_minimal() +
theme(legend.position = "none")
For the purpose of creating a plot with the shares of digital recordings the data had to be aggregated with respect to the year and type of observation.
sales_agg <- sales2 %>% group_by(Year, Type) %>% summarise(TotSales = sum(Sales), SalesForYear2 = round(mean(SalesForYear),2))
sales_agg %>% tail()
## # A tibble: 6 x 4
## # Groups: Year [3]
## Year Type TotSales SalesForYear2
## <int> <chr> <dbl> <dbl>
## 1 2017 Digital 637. 743.
## 2 2017 Physical 106. 743.
## 3 2018 Digital 462. 532.
## 4 2018 Physical 70.5 532.
## 5 2019 Digital 386. 453.
## 6 2019 Physical 67.3 453.
sales_agg2 <- sales_agg %>% filter(Type == "Digital") %>% mutate(Share = TotSales/SalesForYear2*100)
sales_agg2
## # A tibble: 16 x 5
## # Groups: Year [16]
## Year Type TotSales SalesForYear2 Share
## <int> <chr> <dbl> <dbl> <dbl>
## 1 2004 Digital 144 958. 15.0
## 2 2005 Digital 553. 1302. 42.5
## 3 2006 Digital 940. 1588. 59.2
## 4 2007 Digital 1319 1852. 71.2
## 5 2008 Digital 1534. 1920. 79.9
## 6 2009 Digital 1515. 1828. 82.9
## 7 2010 Digital 1472. 1740. 84.6
## 8 2011 Digital 1569. 1825 86.0
## 9 2012 Digital 1591. 1803. 88.2
## 10 2013 Digital 1502. 1691. 88.8
## 11 2014 Digital 1304. 1458. 89.4
## 12 2015 Digital 1120. 1256. 89.2
## 13 2016 Digital 855. 970. 88.1
## 14 2017 Digital 637. 743. 85.8
## 15 2018 Digital 462. 532. 86.7
## 16 2019 Digital 386. 453. 85.1
We imported and and prepared the data with the revenues part of the database in the same manner as the previous one.
rev <- read.csv2('data/Revenues.csv', dec =',',
encoding = 'UTF-8')
rev1 <- rev %>% select(Year, Format, Value..Actual., Total.Value.For.Year) %>%
rename(Revenues = Value..Actual., TotalYearRev = Total.Value.For.Year) %>%
separate(TotalYearRev, into = c("tr1", "RevForYear", "tr2"), sep = c(2,-1), remove = FALSE) %>%
select(Year, Format, Revenues, RevForYear) %>%
mutate(RevForYear = as.numeric(gsub("," ,".", RevForYear)))
Again we recoded the data to obtain an information about the type of formats. This time beside digital and physical types, formats of steraming are also present in the data.
rev2 <- rev1 %>% mutate(Type = ifelse(Format %in%
c('8 - Track', 'Cassette',
'Cassette Single', 'CD',
'CD Single', 'DVD Audio',
'LP/EP', 'Music Video (Physical)',
'Other Tapes', 'Vinyl Single',
'SACD'),
'Physical', ifelse(Format %in%
c('Download Album', 'Download Music Video',
'Download Single', 'Kiosk',
'Ringtones & Ringbacks', 'Other Digital'),
'Digital', 'Streaming')))
When the revenues part was ready we were able to merge both datasets and obtain a file with full information provided from the source.
data_all <- merge(sales2, rev2, by = c('Year', 'Format', 'Type'), all = T)
One more thing is the previously mentioned problem with the data for DVD Audio for year 2013. In case of revenues it is just a lack of data for this observation. Again we replaced it with a mean of respective adjacent observations.
data_all[data_all$Year == 2013 & data_all$Format == "DVD Audio",]$Revenues <- mean(c(data_all[data_all$Year %in% c(2012,2014) & data_all$Format == "DVD Audio",]$Revenues))
To simplify the analysis it may be usefull to reduce the number of main formats to some merged categories with particular respect to the technology used in the creation of the recording.
data_all$Category <-
recode(
data_all$Format,
"c('8 - Track', 'Cassette', 'Cassette Single', 'Other Tapes') = 'Tapes';
c('CD', 'CD Single', 'SACD') = 'CD';
c('Download Album', 'Download Music Video', 'Download Single', 'Kiosk', 'Ringtones & Ringbacks', 'Other Digital') = 'Download';
c('Limited Tier Paid Subscription', 'On-Demand Streaming (Ad-Supported)', 'Other Ad-Supported Streaming', 'Paid Subscription', 'SoundExchange Distributions') = 'Streaming';
c('LP/EP', 'Vinyl Single') = 'Vinyl';"
)
Having the fully prepared data we could aggregate it by new categories.
data_agg <- data_all %>% group_by(Year, Category) %>% summarise(Revenues = sum(Revenues, na.rm = T), Sales = sum(Sales, na.rm = T))
ggplot(data = data_agg,
aes(x = Year, y = Revenues, alluvium = Category)) +
geom_alluvium(aes(fill = Category, colour = Category),
alpha = .75, decreasing = FALSE) +
scale_x_continuous(breaks = seq(1973, 2019, 5)) +
scale_y_continuous(breaks = seq(0, 15000, 5000), labels = label_number(suffix= "M")) +
labs(y = 'Revenues from the records sold', x = 'Year',
title = 'Changes in the revenues',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
theme_minimal() +
scale_fill_brewer(type = "qual", palette = "Paired") +
scale_color_brewer(type = "qual", palette = "Paired") +
theme(legend.title = element_blank())
The revenue structures with respect to the new obtained categories can be seen above. Next, we recoded categories once again to combine the least popular formats into one ‘other’ level. We again aggregated the data based on the new categories (Category2.
# recoding: merged formats (2nd version with the rest combined to other)
data_all$Category2 <- ifelse(data_all$Category %in%
c('Synchronization', 'Music Video (Physical)', 'DVD Audio'),
'Other', as.character(data_all$Category))
# grouping and aggregating data by 2nd Categories
data_agg2 <- data_all %>% group_by(Year, Category2) %>% summarise(Revenues = sum(Revenues, na.rm = T), Sales = sum(Sales, na.rm = T))
On the next plot we presented the changes in time of the physical formats based on the reduced data.
# physical formats only
data_phys2 <- data_agg2[!data_agg2$Category2 %in% c("Download", "Streaming"),] %>%
transform(Category2 = factor(Category2, levels=c("Vinyl", "Tapes", "CD", "Other")))
year_max <- as.data.table(data_phys2)
year_max <- year_max[year_max[, .I[Revenues == max(Revenues)], by=Category2]$V1]
year_max <- year_max$Year
# plot 5
ggplot(data_phys2, aes(x = Year, y = Revenues, fill = Category2)) +
geom_col(data = data_phys2[-2], fill = "grey", alpha = .5) +
geom_col(colour = "darkgrey") +
scale_fill_manual(values = brewer.pal(12,'Set3')[c(1,8,3,12)]) +
scale_x_continuous(breaks = seq(1973, 2019, 6)) +
scale_y_continuous(breaks = seq(0, 15000, 5000), labels = label_number(suffix= "M")) +
labs(y = 'Revenues', x = 'Year',
title = 'Revenues from physical formats',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
theme_minimal() +
geom_vline(xintercept = year_max[2], colour = brewer.pal(8,'Dark2')[1], linetype = 'F1', size = 0.8) +
geom_text(aes(x = year_max[2], label = "Vinyl annual max", y = 8500), colour = brewer.pal(8,'Dark2')[1], angle = 90, vjust = -0.6) +
geom_vline(xintercept = year_max[1], colour = brewer.pal(8,'Dark2')[4], linetype = 'F1', size = 0.8) +
geom_text(aes(x = year_max[1], label = "Tapes annual max", y = 8500), colour = brewer.pal(8,'Dark2')[4], angle = 90, vjust = -0.6) +
geom_vline(xintercept = year_max[3], colour = brewer.pal(8,'Dark2')[3], linetype = 'F1', size = 0.8) +
geom_text(aes(x = year_max[3], label = "CD annual max", y = 8500), colour = brewer.pal(8,'Dark2')[3], angle = 90, vjust = -0.6) +
facet_wrap(~ Category2) + guides(fill = FALSE)
6th plot presents the lifecycle of two dependent formats until 2003.
# data chosen to be showed on the plot
data_seg <- data_agg %>%
filter(Category %in% c("Vinyl", "Tapes")) %>%
filter(Year<2004) %>%
select(Year, Category, Revenues) %>%
spread(key = Category, value=Revenues, -1)
tmp_date <- data_seg[data_seg$Year %in% c(seq(1973, 2003, 5)),]
# plot
ggplot(data = data_seg, aes(x = Vinyl, y = Tapes, label = Year)) +
geom_point(color="#BC243C") +
geom_segment(aes(xend = c(tail(Vinyl, n = -1), NA),
yend = c(tail(Tapes, n = -1), NA)),
arrow = arrow(length = unit(0.2,"cm"), type = 'closed'),
color="#E15D44"
) +
theme_minimal() +
geom_text(data=tmp_date,
color = "#5B5EA6",
size = 4.8) +
scale_x_continuous(breaks = seq(0, 4000, 1000), labels = label_number(suffix= "M")) +
scale_y_continuous(breaks = seq(0, 4000, 1000), labels = label_number(suffix= "M")) +
labs(title = "Lifecycle example of revenues from two consecutive major formats",
x = "Vinyl", y = "Tapes",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/")
Additionally we created the plot which shows the ranking of the total revenues obtained in years 1973-2019 from different categories of formats with respect to their types.
# calculating of total revenues
data_total <- data_all %>%
group_by(Format, Category, Type) %>%
summarise(TotalRev = sum(Revenues, na.rm = T))
# plot
ggdotchart(data_total, x = "Format", y = "TotalRev", color = "Type",
palette = c("#E47A2E", "#79C753", "#CD212A"),
sorting = "descending",
rotate = TRUE,
dot.size = 5,
y.text.col = TRUE) +
theme_cleveland() +
scale_y_continuous(breaks = seq(0, 200000, 50000), labels = label_number(suffix= "M")) +
labs(title = "Total revenues by formats",
y = "Total revenues",
color = "Format type",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/")
In the report we not only focused on the amount of sales and revenues but also calculated the appropriate average revenues per single music recording of the certain format for every year and analysed their relations with the changes on the music market. Below we presented the changes in unitary revenues with the special focus on the years from 2004.
# calculating revenues
data_agg <- data_agg %>%
mutate(UnitRev = ifelse(Revenues == 0 | Sales == 0, NA, Revenues/Sales))
# plot
ggplot(data = data_agg[!data_agg$Category %in% c("Streaming", "Synchronization"),], aes(x = Year, y = UnitRev, group = Category)) +
geom_line(aes(color = Category), linetype = 'F1', size = 1.3) +
facet_zoom(x = Year > 2004) +
scale_color_brewer(palette = "Dark2") +
labs(title = "Changes of unitary revenues and impact of digital era since 2004",
x = "Year", y = "Unitary revenues",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/")
Emphasized period caused us to make a closer look at the size of changes of the unitary revenues in the recent years, presented on the 10th plot.
# data selection and transformation
data_slope <- data_agg[!data_agg$Category %in% c("Streaming", "Synchronization", "Tapes"),] %>%
filter(Year %in% c(2004, 2019)) %>%
select(Year, Category, UnitRev) %>%
spread(key = Year, value=UnitRev, 0)
left_label <- paste(data_slope$Category, round(data_slope$`2004`, 2),sep=", ")
right_label <- paste(data_slope$Category, round(data_slope$`2019`, 2),sep=", ")
data_slope$class <- ifelse((data_slope$`2019` - data_slope$`2004`) < 0, "red", "green")
# plot
ggplot(data_slope) +
geom_segment(aes(x = 1, xend = 2, y = `2004`, yend = `2019`, col = class),
size = .75, show.legend = F) +
geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
scale_color_manual(labels = c("Up", "Down"),
values = c("green" = "#00A170", "red" = "#BC243C")) +
xlim(.5, 2.5) + ylim(0, (1.1*(max(data_slope$`2004`, data_slope$`2019`)))) +
geom_text(label = left_label, y=data_slope$`2004`, x=rep(1, NROW(data_slope)), hjust=1.1, size=3.5) +
geom_text(label= right_label, y=data_slope$`2019`, x=rep(2, NROW(data_slope)), hjust=-0.1, size=3.5) +
geom_text(label="2004", x=1, y=1.1*(max(data_slope$`2004`, data_slope$`2019`)), hjust=1.2, size=5) +
geom_text(label="2019", x=2, y=1.1*(max(data_slope$`2004`, data_slope$`2019`)), hjust=-0.1, size=5) +
labs(title = "Changes in unitary revenues between years 2004 and 2019",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
theme(plot.title = element_text(size = 18)) +
theme_void()
The name of the format with the highest unitary revenues was changing much more dynamically than in case of number of the sales or the revenues. Below we presented the rank with the information about format’s place in this rank with the appropriate values of unitary and total revenues available after hovering the cursor over selected line and year.
# calculating unitary revenues for the whole, not-aggregated data
data_all <- data_all %>%
mutate(UnitRev = ifelse(Revenues == 0 | Sales == 0, NA, Revenues/Sales))
# ranking data
data_rank <- data_all %>%
group_by(Year) %>% drop_na(UnitRev) %>%
mutate(rank = order(order(UnitRev, decreasing=TRUE)))
# information to display
data_rank$tooltip <- paste(
"Format: ", data_rank$Format,"<br/>",
"Unit revenues: ", round(data_rank$UnitRev, 2), "<br/>",
"Total revenues: ", data_rank$Revenues, " mln<br/>",
sep="") %>%
sapply(htmltools::HTML)
# plot
g_int <- ggplot(data = data_rank, aes(x = Year, y = rank, group = Format)) +
geom_line_interactive(aes(color = Format, alpha = -rank, tooltip = tooltip, data_id = Format), size = 2) +
scale_y_reverse(breaks = seq(1, 13)) +
scale_fill_manual(values=colorRampPalette(brewer.pal(8, "Paired"))(18)) +
labs(title = "Ranking of unit revenues for different formats",
x = "Year", y = "Ranking (the higher, the bigger unit revenues)",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
theme(panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none"
)
ggiraph(code = print(g_int), hover_css = "cursor:pointer;stroke:red;")
Different behaviour of the different formats can be clearly seen on the plot below.
ggplot(data_agg[!data_agg$Category %in% c("Download", "Streaming", "Synchronization"),], aes(x = UnitRev, y = Category, fill = ..x..)) +
geom_density_ridges_gradient(scale = 2, rel_min_height = 0.0005) +
scale_fill_viridis(name = "Unitary revenues", option = "D") +
theme_minimal() +
labs(title = "Unitary revenue dispertion for different types of formats",
x = "Unitary revenues", y = "Category",
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
theme(legend.position = 'none'
)
Next we came back to the yearly revenues topic in order to present the changes in the revenues including the streaming formats, which could not be taken into consideration in the unitary revenues analysis. Despite that, many aspects should be considered with the awareness of streaming formats entering the stage in the recent years.
rev2 %>% group_by(Year, Type) %>% summarise(Sum = sum(Revenues)) %>%
mutate(Type= fct_relevel(Type,
"Digital", "Streaming", "Physical")) %>%
ggplot(aes(x = Year, y = Sum, fill = Type)) +
geom_col(position = "stack", width=1)+
labs(y = 'Revenue generated', x = 'Year',
title = 'Sources of revenue by recorded music type',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/") +
scale_x_continuous(name="Year", breaks = seq(1973, 2019, 3))+
scale_y_continuous(breaks = seq(0,15000, 2500), labels = label_number(prefix = "$",suffix= "M"))+
scale_fill_manual(values=mypalette) +
theme_minimal()
Shares of different format types in the total amount of yearly revenues since 2005 show how the changes of music eras proceeded and leave no doubts about the huge role the streaming has nowadays.
# aggregating data by types
merged <- data_all %>% group_by(Year, Type) %>%
summarize(Sales = sum(Sales, na.rm = T), SalesYear = max(SalesForYear, na.rm = T),
Revenues = sum(Revenues, na.rm = T), RevYear = max(RevForYear, na.rm = T)) %>%
filter(Year > 2004) %>%
mutate(Percentage = Revenues/RevYear) %>%
mutate(Type= fct_relevel(Type,
"Streaming", "Digital", "Physical"))
ggplot(merged, aes(x = Year, y = Percentage, fill = Type), show.legend = F)+
geom_area(alpha = 0.6, size =1, colour = 'black')+
labs(x = 'Year', y = "Percentage of revenue",
title = 'Changes in the revenue structure (since 2005)',
caption = "Data source: RIAA https://www.riaa.com/u-s-sales-database/")+
scale_y_continuous(labels = percent)+
scale_x_continuous(name="Year", breaks = seq(2005, 2019, 1),limits=c(2005, 2019))+
annotate(geom="text", x=2008, y=0.35, label="Physical", color="black", fontface =2, size =7)+
annotate(geom="text", x=2012, y=0.65, label="Digital", color="black", fontface =2, size =7)+
annotate(geom="text", x=2017, y=0.85, label="Streaming", color="black", fontface =2, size =7)+
scale_fill_manual(values=mypalette[3:1])+
theme_minimal() +
theme(legend.position = "none")
Finally, we created an animated multiplot of number of sales divided by different categories of formats with respect to the total and unitary revenues to have a comprehensive view on the changes in relations of our variables.
# part 1: number of sales vs. revenues
p1 <- ggplot(data_agg[!data_agg$Category %in% c("Streaming", "Synchronization"),], aes(y = Revenues, x = Sales)) +
geom_point(aes(color = Category), size = 3, show.legend = FALSE) +
ggtitle("Year:") +
scale_x_continuous(breaks = seq(0, 2000, 500), labels = label_number(suffix= "M")) +
scale_y_continuous(breaks = seq(0, 15000, 5000), labels = label_number(suffix= "M")) +
labs(y = "Revenues", x = "Number of Sales") +
theme_minimal() +
theme(plot.title = element_text(size = 22, hjust = 1),
axis.title = element_text(size = 16),
text = element_text(size = 14)) +
scale_color_brewer(palette = "Dark2") +
transition_states(Year) +
shadow_mark(size = 2, past = T, future = T, alpha = 0.5)
p1_animate <- animate(p1, fps = 5)
anim_save('p1.animate')
# part 2: number of sales vs. unitary revenues
p2 <- ggplot(data_agg[!data_agg$Category %in% c("Streaming", "Synchronization"),],
aes(x = Sales, y = UnitRev)) +
geom_point(aes(color = Category), size = 3) +
ggtitle("{closest_state}") +
scale_x_continuous(breaks = seq(0, 2000, 500), labels = label_number(suffix= "M")) +
labs(y = "Unitary revenues", x = "Number of Sales") +
theme_minimal() +
theme(plot.title = element_text(size = 22, hjust = -0.1),
axis.title = element_text(size = 16),
text = element_text(size = 14)) +
scale_color_brewer(palette = "Dark2") +
transition_states(Year) +
shadow_mark(size = 2, past = T, future = T, alpha = 0.5)
p2_animate <- animate(p2, fps = 5, width = 650)
anim_save('p2.animate')
# loading and merging parts
p1_animated <- image_read('p1.animate')
p2_animated <- image_read('p2.animate')
anim_plot <- image_append(c(p1_animated[1], p2_animated[1]))
for(i in 2:100){
combined <- image_append(c(p1_animated[i], p2_animated[i]))
anim_plot <- c(anim_plot, combined)
}
# plot
anim_plot