Remove Objects from Environment
# List the names of the objects currently defined in the environment.
ls()
## character(0)
# Remove every object that ls() reports so the analysis starts from a clean slate.
# NOTE(review): this clears objects only; attached packages and options() are
# left as they were, so restarting the R session is the more reliable reset.
rm(list = ls())
Load packages
library(tidyverse) # data manipulation
library(stats) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
# Read the serialized song-level data set.
# NOTE(review): the path is relative, so the script presumably has to be run
# from the folder that contains the .rds file -- confirm.
sentiment <- readRDS("top100songsSENTIMENT.rds")
# Preview the first rows; the recorded output below shows the columns
# Artist, Track, TotalStreams and Sentiment.
head(sentiment)
## Artist Track TotalStreams Sentiment
## 1 Drake Gods Plan 453226629 -24
## 2 XXXTENTACION SAD! 332633597 -40
## 3 Post Malone Psycho 306877012 -18
## 4 Juice WRLD Lucid Dreams 299907223 -36
## 5 BlocBoy JB Look Alive 266861797 -82
## 6 Drake Nice For What 263455062 -14
Prior to clustering data, you may want to remove or estimate missing data and rescale variables for comparability.
# Build the modelling table from the raw data:
#   * drop rows with any missing value,
#   * create a readable "Track by Artist" label for plotting,
#   * add standardized (mean 0, sd 1) copies of the two clustering variables so
#     that neither one dominates the distance calculation because of its units.
sentiment1 <- sentiment %>%
  na.omit() %>% # listwise deletion of missing
  mutate(
    Track_Artist = paste0(Track, " by ", Artist),
    # scale() returns a one-column matrix carrying centering/scaling attributes;
    # as.numeric() flattens it so the new columns are plain numeric vectors
    # instead of matrix columns embedded in the data frame.
    TotalStreamsScld = as.numeric(scale(TotalStreams)), # standardize variables
    SentimentScld = as.numeric(scale(Sentiment)) # standardize variables
  ) %>%
  select(Track_Artist, TotalStreams, Sentiment, TotalStreamsScld, SentimentScld)
A plot of the within-groups sum of squares against the number of clusters extracted can help determine the appropriate number of clusters. The analyst looks for a bend (an "elbow") in the plot, similar to a scree test in factor analysis. Source: https://www.statmethods.net/about/books.html
# Keep only the standardized variables; these are the clustering inputs.
sentiment2 <- sentiment1 %>%
  select(TotalStreamsScld, SentimentScld)

# Determine number of clusters
# k-means starts from randomly chosen centers, so fix the seed to make the
# elbow plot reproducible from run to run.
set.seed(123)
max_clusters <- 15

# Total within-cluster sum of squares for k = 1, ..., max_clusters.
# nstart = 25 keeps the best of 25 random starts for each k, which avoids the
# jagged curve that a single unlucky start can produce. For k = 1 the value is
# the total sum of squares of the data.
wss <- vapply(
  seq_len(max_clusters),
  function(k) kmeans(sentiment2, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

plot(
  seq_len(max_clusters), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)
# Hierarchical clustering of the songs on the two standardized variables.
# Pairwise Euclidean distances between the rows of sentiment2.
dist_songs <- dist(sentiment2, method = "euclidean")
# Agglomerative clustering with complete linkage.
hc_songs <- hclust(dist_songs, method = "complete")
# Cut the tree into 8 groups; the result holds one cluster id per row.
cluster_assignments <- cutree(hc_songs, k = 8)
# Wrap the ids in a one-column data frame. The column is named after the
# variable passed in ("cluster_assignments"), which the plot later refers to.
cluster_assignments2 <- as.data.frame(cluster_assignments)
# Attach the cluster ids to the song table. This relies on sentiment2 having
# the same row order as sentiment1 (it was derived from it by select() only).
sentiment3 <- cbind(sentiment1, cluster_assignments2)
# Express stream counts in millions for a more readable plot axis.
sentiment4 <- sentiment3 %>%
mutate(TotalStreamsMillions = TotalStreams/1000000)
You were able to summarize 73,000 rows of data in two dimensions. Pretty cool, right?
Which cluster will you pick to advertise? Or which cluster will you play at the next house party? :)
# Attach ggplot2 before its functions are first used. (It is also attached by
# tidyverse, so this call is a harmless safeguard.)
library(ggplot2)

# Shared axis typography: larger titles and tick labels for readability.
My_Theme <- theme(
  axis.title.x = element_text(size = 16),
  axis.text.x = element_text(size = 14),
  axis.title.y = element_text(size = 16),
  axis.text.y = element_text(size = 14)
)

# Scatter plot of sentiment against total streams (in millions). Points and
# labels are colored by cluster; factor() makes the cluster id a discrete
# color scale. check_overlap = TRUE drops labels that would overlap one
# already drawn, so not every song is labelled.
ggplot(
  sentiment4,
  aes(
    x = Sentiment,
    y = TotalStreamsMillions,
    color = factor(cluster_assignments),
    label = Track_Artist
  )
) +
  geom_point() +
  geom_text(check_overlap = TRUE, size = 5) +
  My_Theme