library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.2 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Import the raw nominee/winner table from the CSV in the working directory;
# readr guesses the column types.
grammy_data <- read_csv(
  file = "Grammy Award Nominees and Winners 1958-2024.csv"
)
## New names:
## Rows: 25305 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): Award Type, Award Name, Work, Nominee dbl (4): ...1, Year, Ceremony, Award
## ID lgl (1): Winner
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Per-column overview of the raw table.
grammy_data %>%
  summary()
## ...1 Year Ceremony Award ID
## Min. : 0 Min. :1958 Min. : 1.00 Min. : 584.0
## 1st Qu.: 6326 1st Qu.:1982 1st Qu.:25.00 1st Qu.: 614.0
## Median :12652 Median :1998 Median :41.00 Median : 661.0
## Mean :12652 Mean :1996 Mean :38.79 Mean : 672.3
## 3rd Qu.:18978 3rd Qu.:2010 3rd Qu.:53.00 3rd Qu.: 701.0
## Max. :25304 Max. :2024 Max. :67.00 Max. :2432.0
## Award Type Award Name Work Nominee
## Length:25305 Length:25305 Length:25305 Length:25305
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Winner
## Mode :logical
## FALSE:20435
## TRUE :4870
##
##
##
# Number of missing values in each column of the raw table.
grammy_data %>%
  is.na() %>%
  colSums()
## ...1 Year Ceremony Award ID Award Type Award Name Work
## 0 0 0 0 0 0 893
## Nominee Winner
## 0 0
# Standardise the column names. janitor::clean_names() lower-cases and
# snake_cases in a single step (e.g. "Award ID" -> "award_id", `...1` -> "x1"),
# so no separate lower-casing pass is needed.
grammy <- grammy_data %>%
  clean_names()
names(grammy)
## [1] "x1" "year" "ceremony" "award_id" "award_type"
## [6] "award_name" "work" "nominee" "winner"
# Keep only the fields used downstream, drop rows missing any of them, and
# strip surrounding whitespace. `category` is a trimmed copy of `award_name`.
grammy <- grammy %>%
  select(year, award_name, nominee, winner) %>%
  filter(!(is.na(award_name) | is.na(nominee) | is.na(winner))) %>%
  mutate(
    category = str_trim(award_name),
    nominee = str_trim(nominee)
  )

# Persist the cleaned table, then read it back as the working data set.
write_csv(x = grammy, file = "grammy_clean.csv")
grammy_clean <- read_csv(file = "grammy_clean.csv")
## Rows: 25305 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): award_name, nominee, category
## dbl (1): year
## lgl (1): winner
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# One row per award name with the features fed to the PCA:
#   total_nominations - number of rows for the award
#   total_wins        - number of rows flagged as winner
#   win_rate          - total_wins / total_nominations
#   unique_nominees   - count of distinct nominee values
#   years_active      - count of distinct years in which the award appears
# Rows with any missing feature are discarded afterwards.
award_features <- grammy_clean %>%
  group_by(award_name) %>%
  summarise(
    total_nominations = n(),
    total_wins = sum(winner, na.rm = TRUE),
    win_rate = total_wins / total_nominations,
    unique_nominees = length(unique(nominee)),
    years_active = length(unique(year)),
    .groups = "drop"
  ) %>%
  drop_na()
award_features
# Standardise the numeric award features (zero mean, unit variance per column).
award_scaled <- award_features %>%
  select(-award_name) %>%
  scale()

# PCA on the standardised matrix. Centering and scaling are switched off here
# because `award_scaled` has already been standardised by scale(); doing it a
# second time inside prcomp() would be a redundant no-op.
pca_result <- prcomp(award_scaled, center = FALSE, scale. = FALSE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.9851 1.0005 0.23780 0.04562 0.00374
## Proportion of Variance 0.7881 0.2002 0.01131 0.00042 0.00000
## Cumulative Proportion 0.7881 0.9883 0.99958 1.00000 1.00000
# Scree plot: variance of each principal component drawn as a line.
screeplot(pca_result, type = "lines", main = "Scree Plot")

# Biplot of the award scores together with the feature loadings.
biplot(x = pca_result, scale = 0, cex = 0.6)

# Component scores as a data frame, with the award name attached to each row
# (row order follows `award_features`).
pca_scores <- as.data.frame(pca_result[["x"]])
pca_scores[["award_name"]] <- award_features[["award_name"]]
summary(pca_scores)
## PC1 PC2 PC3 PC4
## Min. :-1.1531 Min. :-13.64876 Min. :-1.632906 Min. :-0.185664
## 1st Qu.:-1.1139 1st Qu.: -0.34945 1st Qu.:-0.052812 1st Qu.:-0.013876
## Median :-0.8869 Median : -0.34149 Median :-0.029185 Median : 0.001241
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000 Mean : 0.000000
## 3rd Qu.: 0.3259 3rd Qu.: 0.08956 3rd Qu.:-0.006724 3rd Qu.: 0.006399
## Max. :14.1091 Max. : 4.09548 Max. : 1.505706 Max. : 0.418841
## PC5 award_name
## Min. :-7.607e-02 Length:784
## 1st Qu.:-1.387e-04 Class :character
## Median :-9.261e-05 Mode :character
## Mean : 0.000e+00
## 3rd Qu.: 1.100e-04
## Max. : 9.340e-03
# Standard deviation of each principal component.
pca_result[["sdev"]]
## [1] 1.985050372 1.000465436 0.237798936 0.045624711 0.003739684
# Proportion of variance explained per component.
pca_result %>%
  summary()
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.9851 1.0005 0.23780 0.04562 0.00374
## Proportion of Variance 0.7881 0.2002 0.01131 0.00042 0.00000
## Cumulative Proportion 0.7881 0.9883 0.99958 1.00000 1.00000
# Cluster the awards on the first three principal components only.
pca_subset <- pca_scores %>%
  select(PC1, PC2, PC3)

# Pairwise Euclidean distances between awards in PC space. The scores are used
# directly: Euclidean distance is unchanged by reflecting or shifting the
# coordinates, so no transformation of the scores is needed.
d <- dist(pca_subset)

# Complete-linkage agglomerative clustering on those distances.
hc.complete <- hclust(d, method = "complete")

# Write the dendrogram to a very wide PDF page, labelling leaves by award name.
pdf("Grammy Award Dendrogram 3.pdf", width = 120, height = 40)
plot(
  hc.complete,
  labels = pca_scores$award_name,
  main = "Grammy Award Dendrogram"
)
dev.off()
## quartz_off_screen
## 2
# Cut the dendrogram into k groups and record each award's group as a factor
# column on the score table.
k <- 5
clusters <- cutree(tree = hc.complete, k = k)
pca_scores[["cluster"]] <- as.factor(clusters)
library(ggplot2)
# Awards plotted on the first two principal components, coloured by cluster.
# Name labels that would overlap an earlier label are skipped.
ggplot(
  data = pca_scores,
  mapping = aes(PC1, PC2, colour = cluster, label = award_name)
) +
  geom_point(size = 3) +
  geom_text(check_overlap = TRUE, hjust = 1, vjust = 1, size = 2.5) +
  labs(
    x = "Principal Component 1",
    y = "Principal Component 2",
    title = "Grammy Award Categories Clustered in PCA Space"
  ) +
  theme_minimal()

# Mean PC1/PC2 position of each cluster. Grouping uses the `cluster` column of
# `pca_scores` itself, so the labels always stay aligned with the rows being
# averaged.
aggregate(cbind(PC1, PC2) ~ cluster, data = pca_scores, FUN = mean)