Setup and Data Cleanup
library(tidyverse)
library(tokenizers)
library(tidytext)
library(caret)
hot100 <- read_csv("C:/Users/admin/Downloads/Hot Stuff.csv")
# Lowercase all song titles so word matching is case-insensitive
hot100$Song <- tolower(hot100$Song)
# Tokenize titles into individual words, then drop common stop words
Hot_Words <- hot100 %>% unnest_tokens(word, Song)
Hot_Words2 <- Hot_Words %>% anti_join(stop_words)
head(Hot_Words2 %>% count(word, sort = TRUE), 20)
## # A tibble: 20 x 2
##    word         n
##    <chr>    <int>
##  1 love     24918
##  2 baby      4615
##  3 time      4574
##  4 girl      4556
##  5 heart     4191
##  6 night     3842
##  7 life      2854
##  8 song      2725
##  9 world     2480
## 10 woman     2349
## 11 wanna     2166
## 12 tonight   2119
## 13 day       2066
## 14 home      2044
## 15 rock      2018
## 16 gonna     1985
## 17 dance     1947
## 18 eyes      1819
## 19 bad       1689
## 20 boy       1682
# Keep only the word column of the top 20; the counts are no longer needed
Top20 <- head(Hot_Words2 %>% count(word, sort = TRUE), 20)
Top20 <- Top20 %>%
  select(-n)
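# Side sketch (my addition, not part of the original pipeline): a quick bar
# chart of these counts, drawn before Hot_Words2 is removed below, makes the
# skew toward "love" obvious.
Hot_Words2 %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Appearances in Hot 100 song titles")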
rm(Hot_Words, Hot_Words2)
Feature Engineering
# TopSpot = the best (lowest) chart position a song ever reached; 1 means it hit number one
hot100 <- hot100 %>%
  group_by(SongID) %>%
  mutate(TopSpot = min(`Peak Position`)) %>%
  ungroup()
# countwords = number of words in the song title
hot100 <- hot100 %>%
  rowwise() %>%
  mutate(countwords = lengths(strsplit(Song, " ")))
# topwordcount = how many of the top-20 words appear somewhere in the title
hot100 <- hot100 %>%
  rowwise() %>%
  mutate(topwordcount = sum(str_detect(Song, Top20[[1]])))
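# Caveat: str_detect() with a bare word matches substrings, so "love" also
# fires on "lover" or "glove". A stricter count (a sketch with a hypothetical
# topwordcount_strict column; the rest of the analysis keeps the looser match)
# wraps each word in regex boundaries:
hot100 %>%
  rowwise() %>%
  mutate(topwordcount_strict = sum(str_detect(Song, paste0("\\b", Top20[[1]], "\\b")))) %>%
  select(Song, topwordcount, topwordcount_strict)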
# Flag titles containing any of the top 5 / top 10 words, and "love" on its own
Top5 <- c('love', 'baby', 'time', 'girl', 'heart')
Top10 <- c('love', 'baby', 'time', 'girl', 'heart', 'night', 'life', 'song', 'world', 'woman')
hot100 <- hot100 %>%
  rowwise() %>%
  mutate(Top5 = any(sapply(Top5, function(x) str_detect(Song, x))),
         Top10 = any(sapply(Top10, function(x) str_detect(Song, x)))) %>%
  mutate(love = str_detect(Song, "love"))
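# Sanity check (my addition): the flags should nest, i.e. every title flagged
# for love is also Top5, and every Top5 title is also Top10.
hot100 %>%
  ungroup() %>%
  count(love, Top5, Top10)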
# Subset of songs that actually reached number one
TopHot <- hot100 %>%
  filter(TopSpot == 1)
ggplot(hot100, aes(love)) +
  geom_bar()

ggplot(TopHot, aes(love)) +
  geom_bar()
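# The two bar charts sit on very different scales, so a direct share
# comparison (my addition) is easier to read:
mean(hot100$love)  # share of all chart entries whose title mentions "love"
mean(TopHot$love)  # the same share among songs that reached number one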

# The weekly chart data repeats a SongID once for every week it charts, so collapse to one row per song while keeping the engineered features
hot100Created <- hot100 %>%
  select(SongID, Song, TopSpot, topwordcount, countwords, love, Top5, Top10)
Distinct_hot <- hot100Created %>%
  distinct(SongID, Song, TopSpot, topwordcount, countwords, love, Top5, Top10)
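# Quick check (my addition) that the collapse really left one row per song:
nrow(Distinct_hot)
n_distinct(Distinct_hot$SongID)  # should match the row count above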
# Unnest titles to words, remove stop words, and join the tidytext sentiment lexicon
Distinct_hot <- Distinct_hot %>% unnest_tokens(word, Song)
Distinct_hot2 <- Distinct_hot %>% anti_join(stop_words)
Distinct_hot2 <- Distinct_hot2 %>% left_join(sentiments, by = "word")
Distinct_hot2$positive <- 0
Distinct_hot2$negative <- 0
# Convert the sentiment labels into binary positive/negative indicator columns
Distinct_hot2 <- Distinct_hot2 %>%
  mutate(positive = replace(positive, sentiment == "positive", 1)) %>%
  mutate(negative = replace(negative, sentiment == "negative", 1))
# Summarize to one row per song, totaling the positive and negative words
Distinct_hot2 <- Distinct_hot2 %>%
  group_by(SongID, topwordcount, countwords, TopSpot, love, Top5, Top10) %>%
  summarize(pos_words = sum(positive), neg_words = sum(negative))
# Net sentiment = positive words minus negative words
Distinct_hot2 <- Distinct_hot2 %>%
  mutate(sentiment = pos_words - neg_words)
ggplot(Distinct_hot2, aes(sentiment)) +
  geom_bar()
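# One more side check before modeling (my addition): average sentiment for
# songs that hit number one versus everything else.
Distinct_hot2 %>%
  ungroup() %>%
  mutate(hit_no1 = TopSpot == 1) %>%
  group_by(hit_no1) %>%
  summarize(mean_sentiment = mean(sentiment))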

Create a Model
Hot_Final <- Distinct_hot2 %>%
  ungroup() %>%
  select(TopSpot, topwordcount, countwords, love, Top5, Top10, sentiment, pos_words, neg_words)
ModelLN <- lm(TopSpot ~ ., data = Hot_Final)
summary(ModelLN)
## 
## Call:
## lm(formula = TopSpot ~ ., data = Hot_Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.776 -28.461  -0.648  26.816  56.201 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   46.4620     0.4000 116.162   <2e-16 ***
## topwordcount  -1.2420     0.6075  -2.044   0.0409 *  
## countwords     0.1928     0.1057   1.825   0.0681 .  
## loveTRUE       0.4165     1.0798   0.386   0.6997    
## Top5TRUE       1.0806     1.0724   1.008   0.3136    
## Top10TRUE     -0.5052     1.0765  -0.469   0.6389    
## sentiment      0.5688     0.4936   1.152   0.2492    
## pos_words     -1.4618     0.7815  -1.871   0.0614 .  
## neg_words          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.61 on 25159 degrees of freedom
## Multiple R-squared:  0.0005785, Adjusted R-squared:  0.0003004 
## F-statistic:  2.08 on 7 and 25159 DF,  p-value: 0.04207
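# The neg_words row is NA because sentiment = pos_words - neg_words, so the
# three predictors are perfectly collinear and lm() silently drops one.
# Refitting without the redundant column (my addition) returns the same
# fitted values with a cleaner coefficient table:
ModelLN2 <- lm(TopSpot ~ . - sentiment, data = Hot_Final)
summary(ModelLN2)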
set.seed(713)
hot_index <- createDataPartition(Hot_Final$TopSpot, p = 0.60, list = FALSE)
hot_train <- Hot_Final[hot_index, ]
hot_test <- Hot_Final[-hot_index, ]
ModelLN <- train(TopSpot ~ .,
                 data = hot_train,
                 method = "lm")
Importance
importance <- varImp(ModelLN, scale = TRUE)
plot(importance)

Summary of Findings
In my data exploration, I tried sentiment analysis, counting the top words used, binning the top words into Top 5 and Top 10 groups, counting title length, and flagging whether a title used any top words at all. While songs generally carried a smidge more positive sentiment, in the end sentiment had no particular influence on the TopSpot position.
Additionally, we saw that "love" occurred more often than any other word, but its appearance rate was only marginally higher when we filtered to songs whose Top Spot was number one.
Since I did not initially find a lot of success, I decided to build a basic model, and then another with the caret package so I could also use its variable importance feature. With this we can clearly see that the count of all Top 20 words carried the greatest weight on the Top Spot.
If I were writing a song, I'd at least include some top words, possibly lean it more positive, and toss "love" out there for the marginal number of people to whom it might appeal more, because I'm not musical and I need all the help I can get.