# Load the raw Grammy nominations table and reduce it to the four columns used
# by the rest of the script, with normalised column names and trimmed text.
grammy_data <- read.csv(
  "Grammy Award Nominees and Winners 1958-2024.csv",
  header = TRUE
)

awards <- grammy_data %>%
  # Lower-case the column names first, then run clean_names() on the result.
  rename_with(tolower) %>%
  clean_names() %>%
  select(year, award_name, nominee, winner) %>%
  # Drop rows where any field needed for counting a nomination is missing.
  filter(!is.na(award_name), !is.na(nominee), !is.na(winner)) %>%
  mutate(
    # Trim award_name in place: later steps read this column (column 2), so
    # trimming only into `category` would leave whitespace variants of the
    # same award counted as different awards.
    award_name = str_trim(award_name),
    # Kept for compatibility: same trimmed value, appended as the last column.
    category = award_name,
    nominee = str_trim(nominee)
  )
# Build the award-by-nominee count matrix: one row per distinct award, one
# column per distinct nominee, each cell holding how many rows of `awards`
# pair that award with that nominee (the matrix will be incredibly sparse).

# Distinct names in order of first appearance; this order fixes the row and
# column order of the matrix.
awardNames <- unique(as.character(awards$award_name))
nomineeNames <- unique(as.character(awards$nominee))
awardct <- length(awardNames)
nomineect <- length(nomineeNames)

# Cross-tabulate in a single pass instead of scanning all earlier rows for
# every row. Explicit factor levels keep the first-appearance order and,
# unlike indexing a matrix by name, also work when a name is an empty string.
nominationCounts <- table(
  factor(awards$award_name, levels = awardNames),
  factor(awards$nominee, levels = nomineeNames)
)

# Store the counts as a plain numeric matrix with awards as row names and
# nominees as column names.
awardsAndNoms <- matrix(
  as.numeric(nominationCounts),
  nrow = awardct,
  ncol = nomineect,
  dimnames = list(awardNames, nomineeNames)
)

# Adjust for award frequency: weight each award's row by that award's share of
# all nomination rows.
# NOTE(review): every row is divided by its own Euclidean norm just below, so
# this per-row rescaling cancels out and does not change the similarities.
# Confirm whether a different weighting (e.g. per nominee) was intended.
awardShare <- rowSums(awardsAndNoms) / nrow(awards)
# A length-nrow vector recycles down the columns, so row i is scaled by
# awardShare[i].
awardsAndNoms <- awardsAndNoms * awardShare

# Cosine similarity between all rows: scale each row to unit length, then take
# every pairwise dot product.
rowNorms <- sqrt(rowSums(awardsAndNoms * awardsAndNoms))
awardDissim <- awardsAndNoms / rowNorms
awardDissim <- awardDissim %*% t(awardDissim)

# Convert to dissimilarity. Rounding can push a similarity marginally above 1,
# so clamp at 0 to keep every distance non-negative.
awardDissim <- as.dist(pmax(1 - awardDissim, 0))

# Complete linkage is hclust()'s default; spelled out to match the name.
hc_complete <- hclust(awardDissim, method = "complete")

# Create a PDF with the dendrogram. The page needs to be this large to
# accommodate all of the awards.
pdf("Grammy Award Dendrogram.pdf", width = 120, height = 40)
tryCatch(
  plot(hc_complete, main = "Grammy Award Dendrogram"),
  error = function(e) {
    # Close the PDF device before re-raising, so a failed plot does not leave
    # the device open.
    dev.off()
    stop(e)
  }
)
dev.off()
# Console output captured from the dev.off() call above:
## png 
##   2