library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Load the raw nominee/winner table; no column types are given, so readr
# guesses them from the file contents.
grammy_data <- readr::read_csv(
  file = "Grammy Award Nominees and Winners 1958-2024.csv"
)
## New names:
## Rows: 25305 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Award Type, Award Name, Work, Nominee dbl (4): ...1, Year, Ceremony, Award
## ID lgl (1): Winner
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Per-column summary of the raw table.
summary(object = grammy_data)
##       ...1            Year         Ceremony        Award ID     
##  Min.   :    0   Min.   :1958   Min.   : 1.00   Min.   : 584.0  
##  1st Qu.: 6326   1st Qu.:1982   1st Qu.:25.00   1st Qu.: 614.0  
##  Median :12652   Median :1998   Median :41.00   Median : 661.0  
##  Mean   :12652   Mean   :1996   Mean   :38.79   Mean   : 672.3  
##  3rd Qu.:18978   3rd Qu.:2010   3rd Qu.:53.00   3rd Qu.: 701.0  
##  Max.   :25304   Max.   :2024   Max.   :67.00   Max.   :2432.0  
##   Award Type         Award Name            Work             Nominee         
##  Length:25305       Length:25305       Length:25305       Length:25305      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Winner       
##  Mode :logical  
##  FALSE:20435    
##  TRUE :4870     
##                 
##                 
## 
# Count the missing values in every column of the raw table.
grammy_data %>%
  is.na() %>%
  colSums()
##       ...1       Year   Ceremony   Award ID Award Type Award Name       Work 
##          0          0          0          0          0          0        893 
##    Nominee     Winner 
##          0          0
# Standardise the column names to snake_case. janitor::clean_names() already
# lower-cases every name, so no separate lower-casing pass is needed.
grammy <- clean_names(grammy_data)

# Show the cleaned column names.
names(grammy)
## [1] "x1"         "year"       "ceremony"   "award_id"   "award_type"
## [6] "award_name" "work"       "nominee"    "winner"
# Keep only the fields used below, drop rows where any of the key fields is
# missing, and strip surrounding whitespace from the text fields.
# `category` is a trimmed copy of `award_name`.
grammy <- grammy %>%
  select(year, award_name, nominee, winner) %>%
  filter(!(is.na(award_name) | is.na(nominee) | is.na(winner))) %>%
  mutate(
    category = str_trim(award_name),
    nominee = str_trim(nominee)
  )

# Persist the cleaned table, then reload it from disk for the analysis steps.
write_csv(grammy, file = "grammy_clean.csv")
grammy_clean <- read_csv(file = "grammy_clean.csv")
## Rows: 25305 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): award_name, nominee, category
## dbl (1): year
## lgl (1): winner
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build one row of features per award: nomination and win counts, the win
# rate, and the number of distinct nominees and distinct years.
award_features <- grammy_clean %>%
  group_by(award_name) %>%
  summarise(
    total_nominations = n(),
    # Summing a logical column counts its TRUE values.
    total_wins = sum(winner, na.rm = TRUE),
    win_rate = total_wins / total_nominations,
    unique_nominees = n_distinct(nominee),
    years_active = n_distinct(year)
  ) %>%
  drop_na()

# Display the feature table.
print(award_features)
# Standardise the numeric award features (mean 0, sd 1 per column) so that
# features measured on larger scales do not dominate the components.
award_scaled <- award_features %>%
  select(-award_name) %>%
  scale()

# `award_scaled` is already centred and scaled, so prcomp() is told not to
# standardise it a second time.
pca_result <- prcomp(award_scaled, center = FALSE, scale. = FALSE)

# Standard deviation and share of variance for each principal component.
summary(pca_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5
## Standard deviation     1.9851 1.0005 0.23780 0.04562 0.00374
## Proportion of Variance 0.7881 0.2002 0.01131 0.00042 0.00000
## Cumulative Proportion  0.7881 0.9883 0.99958 1.00000 1.00000
# Scree plot: component variances drawn as a line.
plot(pca_result, type = "lines", main = "Scree Plot")

# Biplot of the PCA result.
biplot(pca_result, scale = 0, cex = 0.6)

# Component scores as a data frame, labelled with the award each row describes.
pca_scores <- as.data.frame(pca_result[["x"]])
pca_scores[["award_name"]] <- award_features[["award_name"]]

# Distribution of the scores on each component.
summary(object = pca_scores)
##       PC1               PC2                 PC3                 PC4           
##  Min.   :-1.1531   Min.   :-13.64876   Min.   :-1.632906   Min.   :-0.185664  
##  1st Qu.:-1.1139   1st Qu.: -0.34945   1st Qu.:-0.052812   1st Qu.:-0.013876  
##  Median :-0.8869   Median : -0.34149   Median :-0.029185   Median : 0.001241  
##  Mean   : 0.0000   Mean   :  0.00000   Mean   : 0.000000   Mean   : 0.000000  
##  3rd Qu.: 0.3259   3rd Qu.:  0.08956   3rd Qu.:-0.006724   3rd Qu.: 0.006399  
##  Max.   :14.1091   Max.   :  4.09548   Max.   : 1.505706   Max.   : 0.418841  
##       PC5              award_name       
##  Min.   :-7.607e-02   Length:784        
##  1st Qu.:-1.387e-04   Class :character  
##  Median :-9.261e-05   Mode  :character  
##  Mean   : 0.000e+00                     
##  3rd Qu.: 1.100e-04                     
##  Max.   : 9.340e-03
# Standard deviations of the principal components at full precision.
pca_result[["sdev"]]
## [1] 1.985050372 1.000465436 0.237798936 0.045624711 0.003739684
# Importance of each component (standard deviation and variance shares).
summary(object = pca_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5
## Standard deviation     1.9851 1.0005 0.23780 0.04562 0.00374
## Proportion of Variance 0.7881 0.2002 0.01131 0.00042 0.00000
## Cumulative Proportion  0.7881 0.9883 0.99958 1.00000 1.00000
# Cluster the awards on their first three principal-component scores.
pca_subset <- pca_scores %>%
  select(PC1, PC2, PC3)

# Euclidean distances between awards in PC space. The scores are used as they
# are: a `1 - x` transform turns a similarity (such as a correlation) into a
# dissimilarity, but it leaves Euclidean distances between score vectors
# unchanged, so it has no place here.
d <- dist(pca_subset)

# Agglomerative hierarchical clustering with complete linkage.
hc.complete <- hclust(d, method = "complete")

# Draw the dendrogram into a PDF file, labelling each leaf with its award
# name. The page is very wide, presumably so the many leaf labels stay
# readable.
pdf(file = "Grammy Award Dendrogram 3.pdf", width = 120, height = 40)
plot(
  hc.complete,
  labels = pca_scores[["award_name"]],
  main = "Grammy Award Dendrogram"
)
# Close the PDF device so the file is written out.
dev.off()
## quartz_off_screen 
##                 2
# Cut the dendrogram into k groups and record each award's group on the
# scores table as a factor.
k <- 5
clusters <- cutree(tree = hc.complete, k = k)

pca_scores[["cluster"]] <- as.factor(clusters)

library(ggplot2)

# Scatter plot of the awards on PC1 and PC2, coloured by cluster and labelled
# with the award name.
ggplot(
  data = pca_scores,
  mapping = aes(x = PC1, y = PC2, colour = cluster, label = award_name)
) +
  geom_point(size = 3) +
  # check_overlap = TRUE skips labels that would overlap ones already drawn.
  geom_text(size = 2.5, hjust = 1, vjust = 1, check_overlap = TRUE) +
  labs(
    title = "Grammy Award Categories Clustered in PCA Space",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal()

# Mean PC1/PC2 position of each cluster. The grouping variable is the
# `cluster` column of `pca_scores`, so the groups always stay row-aligned
# with the scores being averaged.
aggregate(cbind(PC1, PC2) ~ cluster, data = pca_scores, FUN = mean)