# Recreate a data visualization from FiveThirtyEight
# (https://projects.fivethirtyeight.com/college-fight-song-lyrics/).
# Data source: https://github.com/fivethirtyeight/data/tree/master/fight-songs/
# Load the tidyverse
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.1 v stringr 1.4.1
## v readr 2.1.2 v forcats 0.5.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Import data
# Read the fight-song data from the course repository. The URL is assembled
# with paste0() only to keep the source lines short; it must be one unbroken
# string (a line break inside the literal would become part of the URL).
fight.songs <- read.csv(
  paste0(
    "https://raw.githubusercontent.com/kitadasmalley/DATA502/main/",
    "FALL2022/Data/fight_songs_HKS.csv"
  ),
  stringsAsFactors = FALSE
)
# Store trope_count as a factor rather than the type read.csv() inferred.
fight.songs$trope_count <- as.factor(fight.songs$trope_count)
#summary(fight.songs)
# Split the data by school: the single "Oregon State" entry goes into `ors`
ors <- filter(fight.songs, school == "Oregon State")
# ... and every school that is not "Oregon State" goes into `others`
others <- filter(fight.songs, school != "Oregon State")
# Scatter plot of song duration (x) against tempo (y) with Oregon State
# highlighted. Relies on `others` and `ors` being row subsets of `fight.songs`.
fight.songs %>%
  ggplot(aes(x = sec_duration, y = bpm)) +
  # Base layer: every school, default colour, nearly transparent.
  geom_point(size = 5, alpha = 0.1) +
  # All other schools, coloured by the hex value stored in their `hex` column.
  geom_point(data = others, size = 5, alpha = 0.2, color = others$hex) +
  # Oregon State: filled circle (shape 21) with a black outline.
  geom_point(
    data = ors, size = 5, shape = 21, fill = ors$hex,
    color = "black", stroke = 1.2
  ) +
  # Reference lines labelled "Average" below. The values are hard-coded;
  # presumably they are the dataset means -- TODO confirm against the data.
  # No `data` argument: it is ignored (with a warning) when a fixed
  # intercept is supplied.
  geom_hline(yintercept = 130, linetype = "dashed") +
  geom_vline(xintercept = 71, linetype = "dashed") +
  ggtitle("How Oregon State’s fight song stacks up") +
  annotate(
    "text", x = 65, y = 144, label = "Oregon State",
    size = 4, fontface = "bold"
  ) +
  # Quadrant captions.
  annotate(
    "text", x = c(30, 140), y = 190,
    label = c("Fast and short", "Fast but long")
  ) +
  annotate(
    "text", x = c(30, 140), y = 50,
    label = c("Slow but short", "Slow and long")
  ) +
  # Captions for the two reference lines.
  annotate("text", x = 160, y = 131, label = "Average", size = 3) +
  annotate("text", x = 70, y = 105, label = "Average", size = 3, angle = 90) +
  # Both variables are numeric, so use continuous scales with explicit breaks
  # and labels (a discrete scale with numeric limits triggers warnings).
  # minor_breaks = NULL keeps the grid to the labelled positions only.
  scale_x_continuous(
    name = "Duration",
    breaks = seq(0, 180, by = 20),
    labels = c(
      "0 sec", "20", "40", "60", "80", "100", "120", "140", "160", "180"
    ),
    minor_breaks = NULL
  ) +
  scale_y_continuous(
    name = "Beats per minute",
    breaks = seq(40, 200, by = 20),
    # The lowest break (40) is intentionally left unlabelled.
    labels = c("", "60", "80", "100", "120", "140", "160", "180", "200bpm"),
    minor_breaks = NULL
  ) +
  # One second on x occupies the same length as one bpm on y.
  coord_fixed(ratio = 1, xlim = c(0, 180), ylim = c(40, 200)) +
  theme(
    axis.ticks = element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_line(colour = "lightgrey"),
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
# Create an alternative data visualization using the original dataset
# Density of song duration, drawn in one panel per `student_writer` value.
ggplot(
  data = fight.songs,
  mapping = aes(x = sec_duration, fill = student_writer)
) +
  geom_density(adjust = 1, alpha = 0.5) +
  # Vertical guide segments at 50 s (blue, dotted) and 100 s (red, dashed).
  annotate(
    "segment",
    x = 50, xend = 50, y = 0, yend = 0.02,
    colour = "blue", linetype = "dotted"
  ) +
  annotate(
    "segment",
    x = 100, xend = 100, y = 0, yend = 0.02,
    colour = "red", linetype = "dashed"
  ) +
  # Fixed fill colour for each writer category.
  scale_fill_manual(
    values = c(No = "#440154", Unknown = "#22908B", Yes = "#FDE725")
  ) +
  # Side-by-side panels, one column per writer category.
  facet_grid(cols = vars(student_writer)) +
  labs(
    x = "Duration (in seconds)",
    y = "Density",
    title = "Who Wrote Longer College Fight Songs? \nStudent Writers or Not-Student Writers?",
    subtitle = "Student writers write longer fight songs.Some songs are longer than 150 seconds.",
    caption = "The songs written by student writers are between 50 to 125 seconds. Some songs go beyond 150.\nThe songs by not-student writers mostly fall between 50 and 100 seconds and stay below 150.\nThe songs written by unknown writers are less than 100 seconds."
  ) +
  theme_minimal() +
  # The facet strips already name each group, so the legend is dropped;
  # grid lines are removed and the caption is left-aligned.
  theme(
    legend.position = "none",
    plot.caption = element_text(hjust = 0),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )