Set Up and Data Clean Up

library(tidyverse)
library(tokenizers)
library(tidytext)
library(caret)
hot100 <- read_csv("C:/Users/admin/Downloads/Hot Stuff.csv")
# Lowercase the song titles so word matching is case-insensitive
hot100$Song <- tolower(hot100$Song)
# Tokenize song titles into one word per row, then drop common stop words
Hot_Words <- hot100 %>% unnest_tokens(word, Song)
Hot_Words2 <- Hot_Words %>% anti_join(stop_words)
head(Hot_Words2 %>% count(word, sort = TRUE), 20)
## # A tibble: 20 x 2
##    word        n
##    <chr>   <int>
##  1 love    24918
##  2 baby     4615
##  3 time     4574
##  4 girl     4556
##  5 heart    4191
##  6 night    3842
##  7 life     2854
##  8 song     2725
##  9 world    2480
## 10 woman    2349
## 11 wanna    2166
## 12 tonight  2119
## 13 day      2066
## 14 home     2044
## 15 rock     2018
## 16 gonna    1985
## 17 dance    1947
## 18 eyes     1819
## 19 bad      1689
## 20 boy      1682
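Since "love" dwarfs everything else, a quick bar chart of these counts makes the ranking easier to take in. A minimal sketch, assuming Hot_Words2 is still in memory (it gets removed a few lines below):

# Bar chart of the 20 most frequent title words (run before the rm() below)
Hot_Words2 %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "count")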
# Keep just the 20 most frequent title words (the counts themselves aren't needed)
Top20 <- head(Hot_Words2 %>% count(word, sort = TRUE), 20)

Top20 <- Top20 %>%
  select(-n)
rm(Hot_Words, Hot_Words2)

Feature Engineering

# TopSpot = best (lowest) chart position a song ever reached
hot100 <- hot100 %>%
  group_by(SongID) %>%
  mutate(TopSpot = min(`Peak Position`))
# countwords = number of words in the title
hot100 <- hot100 %>%
  rowwise() %>%
  mutate(countwords = lengths(strsplit(Song, " ")))
# topwordcount = how many of the Top 20 words appear in the title
hot100 <- hot100 %>%
  rowwise() %>%
  mutate(topwordcount = sum(str_detect(Song, Top20[[1]])))
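Because str_detect() recycles the single title across the 20-word pattern vector, the sum is the number of Top 20 words that show up in that title. It is plain substring matching, though, so nested words count too; a quick check with a made-up title:

# Hypothetical title: matches "love", "tonight", and also "night" (a substring of "tonight")
sum(str_detect("love me tonight", Top20[[1]]))  # should return 3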
# Flags for titles containing any of the five / ten most frequent words, and "love" specifically
Top5 <- c('love','baby','time','girl','heart')
Top10 <- c('love','baby','time','girl','heart','night','life','song','world','woman')

hot100 <- hot100 %>%
  rowwise() %>%
  mutate(Top5 = any(sapply(Top5, function(x){str_detect(Song, x)})),
         Top10 = any(sapply(Top10, function(x){str_detect(Song, x)}))) %>%
  mutate(love = str_detect(Song, "love"))
# Songs that reached the #1 spot
TopHot <- hot100 %>%
  filter(TopSpot == 1)

# Distribution of "love" in titles: all chart entries vs. #1 songs
ggplot(hot100, aes(love)) +
  geom_bar()

ggplot(TopHot, aes(love)) +
  geom_bar()
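The bar charts above use raw counts, so the much larger pool of all chart entries makes them hard to compare directly. A proportion comparison is a simple complement; a minimal sketch on the same objects:

# Share of titles containing "love": all chart entries vs. songs that reached #1
mean(hot100$love)
mean(TopHot$love)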

# Reduce to distinct SongIDs (the weekly chart rows make SongID non-unique) while keeping the created features
hot100Created <- hot100 %>%
  select(SongID, Song, TopSpot, topwordcount, countwords, love, Top5, Top10)

Distinct_hot <- hot100Created %>%
  distinct(SongID, Song, TopSpot, topwordcount, countwords, love, Top5, Top10)

# Unnest titles to words, remove stop words, and join the sentiment lexicon
Distinct_hot <- Distinct_hot %>% unnest_tokens(word, Song)
Distinct_hot2 <- Distinct_hot %>% anti_join(stop_words)
Distinct_hot2 <- Distinct_hot2 %>% left_join(sentiments, by = "word")
Distinct_hot2$positive <- 0
Distinct_hot2$negative <- 0

# Create binary indicators for positive / negative words
Distinct_hot2 <- Distinct_hot2 %>%
  mutate(positive = replace(positive, sentiment == "positive", 1)) %>%
  mutate(negative = replace(negative, sentiment == "negative", 1))

# Summarize back to one row per song with positive and negative word counts
Distinct_hot2 <- Distinct_hot2 %>%
  group_by(SongID, topwordcount, countwords, TopSpot, love, Top5, Top10) %>%
  summarize(pos_words = sum(positive), neg_words = sum(negative))

# Net sentiment = positive word count minus negative word count
Distinct_hot2 <- Distinct_hot2 %>%
  mutate(sentiment = pos_words - neg_words)
ggplot(Distinct_hot2, aes(sentiment)) +
  geom_bar()
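To check whether net sentiment lines up with reaching the top of the chart at all, the summarized table can be split into #1 songs versus everything else; a minimal sketch (the reached_no1 column name is just illustrative):

# Average net sentiment for #1 songs vs. all other charting songs
Distinct_hot2 %>%
  ungroup() %>%
  mutate(reached_no1 = TopSpot == 1) %>%
  group_by(reached_no1) %>%
  summarize(mean_sentiment = mean(sentiment), songs = n())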

Create a Model

Hot_Final <- Distinct_hot2 %>%
  ungroup() %>%
  select(TopSpot, topwordcount, countwords, love, Top5, Top10, sentiment, pos_words, neg_words)

# Baseline linear model: predict a song's best chart position from the title features
ModelLN <- lm(TopSpot ~ ., data = Hot_Final)

summary(ModelLN)
## 
## Call:
## lm(formula = TopSpot ~ ., data = Hot_Final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.776 -28.461  -0.648  26.816  56.201 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   46.4620     0.4000 116.162   <2e-16 ***
## topwordcount  -1.2420     0.6075  -2.044   0.0409 *  
## countwords     0.1928     0.1057   1.825   0.0681 .  
## loveTRUE       0.4165     1.0798   0.386   0.6997    
## Top5TRUE       1.0806     1.0724   1.008   0.3136    
## Top10TRUE     -0.5052     1.0765  -0.469   0.6389    
## sentiment      0.5688     0.4936   1.152   0.2492    
## pos_words     -1.4618     0.7815  -1.871   0.0614 .  
## neg_words          NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.61 on 25159 degrees of freedom
## Multiple R-squared:  0.0005785,  Adjusted R-squared:  0.0003004 
## F-statistic:  2.08 on 7 and 25159 DF,  p-value: 0.04207
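The neg_words coefficient comes back NA because sentiment = pos_words - neg_words, so the three columns are perfectly collinear and lm() drops one of them (the "1 not defined because of singularities" note). One way to avoid the singularity is to leave the redundant column out of the formula; a minimal sketch:

# Refit without the collinear neg_words column
ModelLN_noneg <- lm(TopSpot ~ . - neg_words, data = Hot_Final)
summary(ModelLN_noneg)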
# 60/40 train/test split, then refit the same linear model with caret
set.seed(713)
hot_index <- createDataPartition(Hot_Final$TopSpot, p = 0.60, list = FALSE)
hot_train <- Hot_Final[hot_index, ]
hot_test <- Hot_Final[-hot_index, ]

ModelLN <- train(TopSpot ~ .,
                 data = hot_train,
                 method = "lm")

Importance

# Scaled variable importance from the caret model
importance <- varImp(ModelLN, scale = TRUE)
plot(importance)

Summary of Findings

In my data exploration, I tried sentiment analysis, identifying the top words used, binning the top words, counting words, and flagging whether a title used any top words at all. While song titles generally leaned a smidge positive in sentiment, in the end sentiment had no particular influence on the TopSpot position.

Additionally, we saw that "love" was by far the most frequent title word, but its share of titles was only marginally higher when we filtered to songs whose Top Spot was one.

Since I did not initially find a lot of success, I decided to build a basic model, and then another with the caret package so I could also use its variable importance feature. With this we can clearly see that the count of Top 20 words in a title carried the greatest weight in predicting the Top Spot.

If I were writing a song, I'd at least include some top words, possibly lean it more positive, and toss "love" in there for the marginal share of people it might appeal to, because I'm not musical and I need all the help I can get.