# Load the raw Grammy nominations table. The CSV is read from the current
# working directory.
grammy_data <- read.csv(
  "Grammy Award Nominees and Winners 1958-2024.csv",
  header = TRUE
)

# Build the cleaned `awards` table used by the rest of the script:
#   1. lower-case and normalise the column names,
#   2. keep only year / award_name / nominee / winner (in that order, because
#      later code addresses award_name and nominee by position 2 and 3),
#   3. drop rows with a missing award, nominee or winner flag,
#   4. strip surrounding whitespace from the award and nominee names.
#
# award_name is trimmed in place: the downstream code reads column 2, so
# trimming only into a separate `category` column would leave names that
# differ only by stray whitespace counted as distinct awards. `category` is
# still provided (as column 5) for any code that relies on it.
#
# read.csv() leaves blank character fields as "" rather than NA, so the
# is.na() filter does not catch them; blank names are removed explicitly after
# trimming because an empty string cannot be used as a matrix row/column name
# subscript later on.
awards <- grammy_data %>%
  rename_all(tolower) %>%
  clean_names() %>%
  select(year, award_name, nominee, winner) %>%
  filter(!is.na(award_name), !is.na(nominee), !is.na(winner)) %>%
  mutate(
    award_name = str_trim(award_name),
    nominee = str_trim(nominee),
    category = award_name
  ) %>%
  filter(award_name != "", nominee != "")
# Collect the distinct award names (column 2 of `awards`) and nominee names
# (column 3), in order of first appearance. unique() yields the same ordering
# as scanning the rows and checking each value against everything before it,
# but in a single pass and without misbehaving when `awards` has fewer than
# two rows. as.character() guards against the columns being factors.
awardNames <- unique(as.character(awards[[2]]))
nomineeNames <- unique(as.character(awards[[3]]))

# Number of distinct awards / nominees; these give the matrix dimensions.
awardct <- length(awardNames)
nomineect <- length(nomineeNames)

# Zero-filled award x nominee matrix: one row per award, one column per
# nominee, labelled with the names collected above. It will be very sparse
# once filled, since each nominee appears under only a few awards.
awardsAndNoms <- matrix(
  0,
  nrow = awardct,
  ncol = nomineect,
  dimnames = list(awardNames, nomineeNames)
)
# Tally nominations for every (award, nominee) pair and add them to the
# matrix (which will be very sparse).
#
# The award name is column 2 of `awards` and the nominee name is column 3.
# Fixing the factor levels to the matrix's row/column names makes table()
# return counts in exactly the matrix's layout, so the whole tally can be
# added in one step instead of incrementing one cell per data-frame row. This
# also avoids looking cells up by character subscript, which can never match
# an empty-string name.
award_of_row <- factor(
  as.character(awards[[2]]),
  levels = rownames(awardsAndNoms)
)
nominee_of_row <- factor(
  as.character(awards[[3]]),
  levels = colnames(awardsAndNoms)
)
pair_counts <- table(award_of_row, nominee_of_row)
# as.vector() drops the table class; the counts are in column-major order,
# matching the matrix's own storage order.
awardsAndNoms <- awardsAndNoms + as.vector(pair_counts)

# Adjust for award frequency: scale each award's row by that award's share of
# all nomination rows. A vector of length nrow() recycles down the columns, so
# element [i, j] is multiplied by the weight of row i.
# NOTE(review): the weight is constant within a row, so it cancels out of any
# later row-wise normalisation (e.g. cosine similarity between rows) - confirm
# this weighting is doing what was intended.
award_weight <- rowSums(awardsAndNoms) / nrow(awards)
awardsAndNoms <- awardsAndNoms * award_weight
# Cosine similarity between all pairs of award rows: scale every row to unit
# Euclidean length, then take the dot product of each pair of rows.
row_norms <- sqrt(rowSums(awardsAndNoms^2))
if (any(row_norms == 0)) {
  # An all-zero row has no direction; dividing by its norm would yield NaN
  # and make hclust() fail with an obscure error further down.
  stop(
    "Cannot compute cosine similarity: at least one award has no nominations.",
    call. = FALSE
  )
}
# A vector of length nrow() recycles down the columns, so row i is divided by
# its own norm.
unit_rows <- awardsAndNoms / row_norms

# Convert similarity to dissimilarity (1 - cosine) in the `dist` form that
# hclust() expects.
awardDissim <- as.dist(1 - unit_rows %*% t(unit_rows))

# Complete-linkage hierarchical clustering (hclust()'s default, spelled out).
hc_complete <- hclust(awardDissim, method = "complete")

# Write the dendrogram to a PDF. The page needs to be this large to
# accommodate all of the award labels. tryCatch(finally = ) closes the PDF
# device even if plotting fails, so a failed run does not leave it open.
pdf("Grammy Award Dendrogram.pdf", width = 120, height = 40)
invisible(tryCatch(
  plot(hc_complete, main = "Grammy Award Dendrogram"),
  finally = dev.off()
))
## png
## 2