library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(SnowballC)
library(topicmodels)
library(stm)
## stm v1.3.6 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
library(ldatuning)
library(knitr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(BTM)
library(textplot)
library(concaveman)
library(udpipe)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(stopwords)
##
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
##
## stopwords
# (dplyr, readr, tidyr, and ggplot2 are already attached by tidyverse above)
library(readxl)
library(writexl)
library(textdata)
library(wordcloud2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
projects_raw <- read_xlsx("data/datascience_projects_tutorials.xlsx")
# Select the columns needed for analysis
projects_raw <- select(projects_raw, comment_ID, comments, video_type)
# Combine the comment text and its ID into one column (used later for udpipe annotation)
projects_raw$text <- paste(projects_raw$comments, projects_raw$comment_ID, sep = " ")
# Rename the ID column
colnames(projects_raw)[1] <- "doc_id"
projects_raw
## # A tibble: 615 x 4
## doc_id comments video_type text
## <chr> <chr> <chr> <chr>
## 1 Ugxax9wg3rCw8BPIHut4AaABAg "a job simp ? eh" Data Analy~ "a job simp ? eh~
## 2 UgxsPZj3fphkrKn1ypZ4AaABAg "Way to go! This wa~ Data Analy~ "Way to go! This~
## 3 UgwXQLs5mfce95rFEMh4AaABAg "Thank you Ken! I l~ Data Analy~ "Thank you Ken! ~
## 4 UgwAXAOt8KGbNNER-7l4AaABAg "Hey Ken Jee, thank~ Data Analy~ "Hey Ken Jee, th~
## 5 UgySrmKyJHFW_WCzzf14AaABAg "Very informative p~ Data Analy~ "Very informativ~
## 6 UgzZC4VQZYdabgEfzYl4AaABAg "can anyone help me~ Data Analy~ "can anyone help~
## 7 UgyndZ-CYBzC819u_V14AaABAg "I have a coding te~ Data Analy~ "I have a coding~
## 8 UgwViqMr1ECmqkOCvXp4AaABAg "can You provide Us~ Data Analy~ "can You provide~
## 9 Ugz8cVruhbFKc9045HR4AaABAg "Amazing job ken th~ Data Analy~ "Amazing job ken~
## 10 Ugw5EP72lW_NftTJZZx4AaABAg "Should I use Excel~ Data Analy~ "Should I use Ex~
## # ... with 605 more rows
projects_raw %>%
ggplot(aes(x = video_type)) +
geom_bar(show.legend = FALSE) +
labs(y = "Count",
x = "Video Type",
title = "Project Tutorials",
subtitle = "Categorized by Video Types")
#tokenizing text
projects_tidy <- projects_raw %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
# Removing customized words and saving in a new data frame
# ("ðŸ" is a mis-encoded emoji fragment left over from the comment export)
my_stopwords <- c("ken", "science", "thank", "video", "videos", "hey", "data", "ðŸ", "series", "projects", "project")
projects_tidy2 <-
projects_tidy %>%
filter(!word %in% my_stopwords)
# Remove numbers (filter() is safer than negative grep() indexing, which drops every row when nothing matches)
projects_tidy2 <- filter(projects_tidy2, !grepl("\\b\\d+\\b", word))
projects_top_tokens <- projects_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(projects_top_tokens)
# Frequent words across all tutorial videos combined
projects_top_tokens %>%
filter(n > 4) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
coord_flip() +
labs(x = "Word \n", y = "\n Count ", title = "Frequent Words on Project Tutorials \n") +
geom_text(aes(label = n), hjust = 3, colour = "white", fontface = "bold") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
axis.title.y = element_text(face="bold", colour="darkblue", size = 12))
projects_tidy2 %>%
group_by(video_type) %>%
count(word, sort = TRUE) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder_within(word, n, video_type)) %>%
ggplot(aes(x = word, y = n, fill = word)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ video_type, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(expand = c(0,0)) +
labs(y = "Count",
x = "Unique words",
title = "Most frequent words by video name",
subtitle = "Stop words removed from the list")
## Selecting by n
# Data frames for each video type
datacollection <- projects_raw %>% filter(video_type == "Data Collection")
datacleaning <- projects_raw %>% filter(video_type == "Data Cleaning")
dataanalysis <- projects_raw %>% filter(video_type == "Data Analysis")
projectplanning <- projects_raw %>% filter(video_type == "Project Planning")
documentation <- projects_raw %>% filter(video_type == "Documentation")
modelbuilding <- projects_raw %>% filter(video_type == "Model Building")
modelproduction <- projects_raw %>% filter(video_type == "Model Production")
#tokenizing text
datacollection_tidy <- datacollection %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
datacleaning_tidy <- datacleaning %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
dataanalysis_tidy <- dataanalysis %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
projectplanning_tidy <- projectplanning %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
documentation_tidy <- documentation %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
modelbuilding_tidy <- modelbuilding %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
modelproduction_tidy <- modelproduction %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
# Removing customized words from each per-type data frame
my_stopwords <- c("ken", "science", "thank", "video", "videos", "hey", "data", "series","projects", "project")
datacollection_tidy2 <-
datacollection_tidy %>%
filter(!word %in% my_stopwords)
dataanalysis_tidy2 <-
dataanalysis_tidy %>%
filter(!word %in% my_stopwords)
projectplanning_tidy2 <-
projectplanning_tidy %>%
filter(!word %in% my_stopwords)
datacleaning_tidy2 <-
datacleaning_tidy %>%
filter(!word %in% my_stopwords)
documentation_tidy2 <-
documentation_tidy %>%
filter(!word %in% my_stopwords)
modelbuilding_tidy2 <-
modelbuilding_tidy %>%
filter(!word %in% my_stopwords)
modelproduction_tidy2 <-
modelproduction_tidy %>%
filter(!word %in% my_stopwords)
# Remove numbers (same filter-based approach as above; negative grep() indexing drops every row when there are no matches)
datacollection_tidy2 <- filter(datacollection_tidy2, !grepl("\\b\\d+\\b", word))
dataanalysis_tidy2 <- filter(dataanalysis_tidy2, !grepl("\\b\\d+\\b", word))
projectplanning_tidy2 <- filter(projectplanning_tidy2, !grepl("\\b\\d+\\b", word))
datacleaning_tidy2 <- filter(datacleaning_tidy2, !grepl("\\b\\d+\\b", word))
documentation_tidy2 <- filter(documentation_tidy2, !grepl("\\b\\d+\\b", word))
modelbuilding_tidy2 <- filter(modelbuilding_tidy2, !grepl("\\b\\d+\\b", word))
modelproduction_tidy2 <- filter(modelproduction_tidy2, !grepl("\\b\\d+\\b", word))
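The seven pipelines above repeat identical steps. Since projects_tidy2 already holds the cleaned tokens for every comment, an equivalent shortcut is to split it by video type (a sketch; note the first my_stopwords vector also contained the mojibake token, so counts may differ by one term):
# Sketch: one named list of per-type token data frames
tidy_by_type <- split(projects_tidy2, projects_tidy2$video_type)
# e.g. tidy_by_type[["Data Cleaning"]] corresponds to datacleaning_tidy2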
### Project Planning WordCloud
projectplanning_top_tokens <- projectplanning_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(projectplanning_top_tokens)
### Data Collection WordCloud
datacollection_top_tokens <- datacollection_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(datacollection_top_tokens)
### Data Cleaning WordCloud
datacleaning_top_tokens <- datacleaning_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(datacleaning_top_tokens)
### Data Analysis WordCloud
dataanalysis_top_tokens <- dataanalysis_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(dataanalysis_top_tokens)
### Model Building WordCloud
modelbuilding_top_tokens <- modelbuilding_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(modelbuilding_top_tokens)
### Model Production WordCloud
modelproduction_top_tokens <- modelproduction_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(modelproduction_top_tokens)
### Documentation WordCloud
documentation_top_tokens <- documentation_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(documentation_top_tokens)
projectplanning <-
select(projectplanning,comments, video_type) %>%
mutate(videotype = "projectplanning") %>%
relocate(videotype)
datacollection <-
select(datacollection,comments, video_type) %>%
mutate(videotype = "datacollection") %>%
relocate(videotype)
datacleaning <-
select(datacleaning,comments, video_type) %>%
mutate(videotype = "datacleaning") %>%
relocate(videotype)
dataanalysis <-
select(dataanalysis,comments, video_type) %>%
mutate(videotype = "dataanalysis") %>%
relocate(videotype)
documentation <-
select(documentation,comments, video_type) %>%
mutate(videotype = "documentation") %>%
relocate(videotype)
modelbuilding <-
select(modelbuilding,comments, video_type) %>%
mutate(videotype = "modelbuilding") %>%
relocate(videotype)
modelproduction <-
select(modelproduction,comments,video_type) %>%
mutate(videotype = "modelproduction") %>%
relocate(videotype)
#binding rows
projectbasedyoutubecomments <- bind_rows(projectplanning, datacollection, datacleaning, dataanalysis, modelbuilding, modelproduction, documentation)
tail(projectbasedyoutubecomments)
## # A tibble: 6 x 3
## videotype comments video_type
## <chr> <chr> <chr>
## 1 documentation Thanks for creating this series. Watched and liked ~ Documentat~
## 2 documentation thanks so much for these indepth walkthroughs. Documentat~
## 3 documentation Really a super awesome series ken. It helped a lot ~ Documentat~
## 4 documentation loved this series Ken! Documentat~
## 5 documentation keep doing great work man Documentat~
## 6 documentation Hi Everyone! Thank you for watching this series. It~ Documentat~
#tokenizing text
projectbasedyoutube_tokens <-
projectbasedyoutubecomments %>%
unnest_tokens(output = word,
input = comments)
# Removing stop words, then counting common words
tidy_comments <-
projectbasedyoutube_tokens %>%
anti_join(stop_words, by = "word")
count(tidy_comments, word, sort = T)
## # A tibble: 2,407 x 2
## word n
## <chr> <int>
## 1 data 248
## 2 ken 212
## 3 series 142
## 4 error 119
## 5 video 110
## 6 project 92
## 7 code 84
## 8 science 72
## 9 element 70
## 10 job 69
## # ... with 2,397 more rows
# Loading the AFINN lexicon
afinn <- get_sentiments("afinn")
afinn
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
# The Bing lexicon
bing <- get_sentiments("bing")
bing
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,875 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,865 more rows
loughran <- get_sentiments("loughran")
loughran
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
# Joining tidy comments with the AFINN lexicon
sentiment_afinn <- inner_join(tidy_comments, afinn, by = "word")
sentiment_afinn
## # A tibble: 770 x 4
## videotype video_type word value
## <chr> <chr> <chr> <dbl>
## 1 projectplanning Project Planning god 1
## 2 projectplanning Project Planning bless 2
## 3 projectplanning Project Planning love 3
## 4 projectplanning Project Planning diamond 1
## 5 projectplanning Project Planning robust 2
## 6 projectplanning Project Planning amazing 4
## 7 projectplanning Project Planning glad 3
## 8 projectplanning Project Planning woo 3
## 9 projectplanning Project Planning fan 3
## 10 projectplanning Project Planning wonderful 4
## # ... with 760 more rows
sentiment_loughran <- inner_join(tidy_comments, loughran, by = "word")
sentiment_loughran
## # A tibble: 503 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning appealing litigious
## 2 projectplanning Project Planning probability uncertainty
## 3 projectplanning Project Planning stopped negative
## 4 projectplanning Project Planning probability uncertainty
## 5 projectplanning Project Planning perfect positive
## 6 projectplanning Project Planning risk uncertainty
## 7 projectplanning Project Planning slow negative
## 8 projectplanning Project Planning favorite positive
## 9 projectplanning Project Planning perfect positive
## 10 projectplanning Project Planning informative positive
## # ... with 493 more rows
sentiment_bing <- inner_join(tidy_comments, bing, by = "word")
sentiment_bing
## # A tibble: 851 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning gem positive
## 2 projectplanning Project Planning appealing positive
## 3 projectplanning Project Planning bless positive
## 4 projectplanning Project Planning love positive
## 5 projectplanning Project Planning scratch negative
## 6 projectplanning Project Planning robust positive
## 7 projectplanning Project Planning amazing positive
## 8 projectplanning Project Planning overwhelmed negative
## 9 projectplanning Project Planning glad positive
## 10 projectplanning Project Planning woo positive
## # ... with 841 more rows
sentiment_nrc <- inner_join(tidy_comments, nrc, by = "word")
sentiment_nrc
## # A tibble: 3,528 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning absolute positive
## 2 projectplanning Project Planning gem joy
## 3 projectplanning Project Planning gem positive
## 4 projectplanning Project Planning ken positive
## 5 projectplanning Project Planning effort positive
## 6 projectplanning Project Planning god anticipation
## 7 projectplanning Project Planning god fear
## 8 projectplanning Project Planning god joy
## 9 projectplanning Project Planning god positive
## 10 projectplanning Project Planning god trust
## # ... with 3,518 more rows
summary_bing <- count(sentiment_bing, sentiment, sort = TRUE)
summary_bing
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 positive 473
## 2 negative 378
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment)
summary_bing
## # A tibble: 14 x 3
## # Groups: videotype [7]
## videotype sentiment n
## <chr> <chr> <int>
## 1 dataanalysis negative 35
## 2 dataanalysis positive 49
## 3 datacleaning negative 68
## 4 datacleaning positive 135
## 5 datacollection negative 170
## 6 datacollection positive 104
## 7 documentation negative 12
## 8 documentation positive 55
## 9 modelbuilding negative 50
## 10 modelbuilding positive 44
## 11 modelproduction negative 29
## 12 modelproduction positive 33
## 13 projectplanning negative 14
## 14 projectplanning positive 53
# Untidy the data: spread sentiment counts into separate columns
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
spread(sentiment, n)
summary_bing
## # A tibble: 7 x 3
## # Groups: videotype [7]
## videotype negative positive
## <chr> <int> <int>
## 1 dataanalysis 35 49
## 2 datacleaning 68 135
## 3 datacollection 170 104
## 4 documentation 12 55
## 5 modelbuilding 50 44
## 6 modelproduction 29 33
## 7 projectplanning 14 53
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
spread(sentiment, n) %>%
mutate(sentiment = positive - negative) %>%
mutate(lexicon = "bing") %>%
relocate(lexicon)
summary_bing
## # A tibble: 7 x 5
## # Groups: videotype [7]
## lexicon videotype negative positive sentiment
## <chr> <chr> <int> <int> <int>
## 1 bing dataanalysis 35 49 14
## 2 bing datacleaning 68 135 67
## 3 bing datacollection 170 104 -66
## 4 bing documentation 12 55 43
## 5 bing modelbuilding 50 44 -6
## 6 bing modelproduction 29 33 4
## 7 bing projectplanning 14 53 39
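spread() still works but is superseded in current tidyr; a pivot_wider() equivalent of the step above would be (a sketch producing the same columns):
summary_bing_alt <- sentiment_bing %>%
count(videotype, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)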
summary_afinn <- sentiment_afinn %>%
group_by(videotype) %>%
summarise(sentiment = sum(value)) %>%
mutate(lexicon = "AFINN") %>%
relocate(lexicon)
summary_afinn
## # A tibble: 7 x 3
## lexicon videotype sentiment
## <chr> <chr> <dbl>
## 1 AFINN dataanalysis 80
## 2 AFINN datacleaning 180
## 3 AFINN datacollection 75
## 4 AFINN documentation 79
## 5 AFINN modelbuilding 56
## 6 AFINN modelproduction 26
## 7 AFINN projectplanning 108
summary_loughran <- sentiment_loughran %>%
group_by(videotype) %>%
mutate(lexicon = "loughran") %>%
relocate(lexicon)
summary_loughran
## # A tibble: 503 x 5
## # Groups: videotype [7]
## lexicon videotype video_type word sentiment
## <chr> <chr> <chr> <chr> <chr>
## 1 loughran projectplanning Project Planning appealing litigious
## 2 loughran projectplanning Project Planning probability uncertainty
## 3 loughran projectplanning Project Planning stopped negative
## 4 loughran projectplanning Project Planning probability uncertainty
## 5 loughran projectplanning Project Planning perfect positive
## 6 loughran projectplanning Project Planning risk uncertainty
## 7 loughran projectplanning Project Planning slow negative
## 8 loughran projectplanning Project Planning favorite positive
## 9 loughran projectplanning Project Planning perfect positive
## 10 loughran projectplanning Project Planning informative positive
## # ... with 493 more rows
summary_nrc <- sentiment_nrc %>%
group_by(videotype) %>%
filter(sentiment %in% c("positive", "negative")) %>%
summarise(sentiment) %>%
mutate(lexicon = "NRC") %>%
relocate(lexicon)
## `summarise()` has grouped output by 'videotype'. You can override using the `.groups` argument.
summary_nrc
## # A tibble: 1,544 x 3
## # Groups: videotype [7]
## lexicon videotype sentiment
## <chr> <chr> <chr>
## 1 NRC dataanalysis positive
## 2 NRC dataanalysis positive
## 3 NRC dataanalysis positive
## 4 NRC dataanalysis positive
## 5 NRC dataanalysis negative
## 6 NRC dataanalysis positive
## 7 NRC dataanalysis positive
## 8 NRC dataanalysis negative
## 9 NRC dataanalysis negative
## 10 NRC dataanalysis positive
## # ... with 1,534 more rows
sentiment_afinn <- projectbasedyoutubecomments %>%
unnest_tokens(output = word,
input = comments) %>%
anti_join(stop_words, by = "word") %>%
filter(!word == "amp") %>%
inner_join(afinn, by = "word")
sentiment_afinn
## # A tibble: 770 x 4
## videotype video_type word value
## <chr> <chr> <chr> <dbl>
## 1 projectplanning Project Planning god 1
## 2 projectplanning Project Planning bless 2
## 3 projectplanning Project Planning love 3
## 4 projectplanning Project Planning diamond 1
## 5 projectplanning Project Planning robust 2
## 6 projectplanning Project Planning amazing 4
## 7 projectplanning Project Planning glad 3
## 8 projectplanning Project Planning woo 3
## 9 projectplanning Project Planning fan 3
## 10 projectplanning Project Planning wonderful 4
## # ... with 760 more rows
afinn_score <- sentiment_afinn %>%
group_by(videotype) %>%
summarise(value = sum(value))
afinn_score
## # A tibble: 7 x 2
## videotype value
## <chr> <dbl>
## 1 dataanalysis 80
## 2 datacleaning 180
## 3 datacollection 75
## 4 documentation 79
## 5 modelbuilding 56
## 6 modelproduction 26
## 7 projectplanning 108
# Adding a flag for whether each video type's net AFINN score is positive or negative
afinn_sentiment <- afinn_score %>%
filter(value != 0) %>%
mutate(sentiment = if_else(value < 0, "negative", "positive"))
afinn_sentiment
## # A tibble: 7 x 3
## videotype value sentiment
## <chr> <dbl> <chr>
## 1 dataanalysis 80 positive
## 2 datacleaning 180 positive
## 3 datacollection 75 positive
## 4 documentation 79 positive
## 5 modelbuilding 56 positive
## 6 modelproduction 26 positive
## 7 projectplanning 108 positive
summary_afinn2 <- sentiment_afinn %>%
group_by(videotype) %>%
filter(value != 0) %>%
mutate(sentiment = if_else(value < 0, "negative", "positive")) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "AFINN")
summary_bing2 <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "bing")
summary_nrc2 <- sentiment_nrc %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "nrc")
summary_loughran2 <- sentiment_loughran %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "loughran")
summary_sentiment <- bind_rows(summary_afinn2,
summary_bing2,
summary_nrc2,
summary_loughran2) %>%
arrange(method, videotype) %>%
relocate(method)
summary_sentiment
## # A tibble: 56 x 4
## # Groups: videotype [7]
## method videotype sentiment n
## <chr> <chr> <chr> <int>
## 1 AFINN dataanalysis positive 58
## 2 AFINN dataanalysis negative 26
## 3 AFINN datacleaning positive 122
## 4 AFINN datacleaning negative 52
## 5 AFINN datacollection positive 148
## 6 AFINN datacollection negative 122
## 7 AFINN documentation positive 42
## 8 AFINN documentation negative 10
## 9 AFINN modelbuilding positive 50
## 10 AFINN modelbuilding negative 34
## # ... with 46 more rows
total_counts <- summary_sentiment %>%
group_by(method, videotype) %>%
summarise(total = sum(n))
## `summarise()` has grouped output by 'method'. You can override using the `.groups` argument.
sentiment_counts <- left_join(summary_sentiment, total_counts)
## Joining, by = c("method", "videotype")
sentiment_counts
## # A tibble: 56 x 5
## # Groups: videotype [7]
## method videotype sentiment n total
## <chr> <chr> <chr> <int> <int>
## 1 AFINN dataanalysis positive 58 84
## 2 AFINN dataanalysis negative 26 84
## 3 AFINN datacleaning positive 122 174
## 4 AFINN datacleaning negative 52 174
## 5 AFINN datacollection positive 148 270
## 6 AFINN datacollection negative 122 270
## 7 AFINN documentation positive 42 52
## 8 AFINN documentation negative 10 52
## 9 AFINN modelbuilding positive 50 84
## 10 AFINN modelbuilding negative 34 84
## # ... with 46 more rows
# New column that calculates the percentage
sentiment_percents <- sentiment_counts %>%
mutate(percent = n/total * 100)
sentiment_percents
## # A tibble: 56 x 6
## # Groups: videotype [7]
## method videotype sentiment n total percent
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 AFINN dataanalysis positive 58 84 69.0
## 2 AFINN dataanalysis negative 26 84 31.0
## 3 AFINN datacleaning positive 122 174 70.1
## 4 AFINN datacleaning negative 52 174 29.9
## 5 AFINN datacollection positive 148 270 54.8
## 6 AFINN datacollection negative 122 270 45.2
## 7 AFINN documentation positive 42 52 80.8
## 8 AFINN documentation negative 10 52 19.2
## 9 AFINN modelbuilding positive 50 84 59.5
## 10 AFINN modelbuilding negative 34 84 40.5
## # ... with 46 more rows
sentiment_percents %>%
ggplot(aes(x = videotype, y = percent, fill = sentiment)) +
geom_col(width = .8) +
facet_wrap(~method, ncol = 1) +
coord_flip() +
labs(title = "Public Sentiment on Project-Based YouTube Videos",
subtitle = "Across all seven stages, from Project Planning to Documentation",
x = "Video Type",
y = "Percentage of Words")
Stemming: reducing inflected words to a common root so that variants such as "models", "modeling", and "modeled" are counted together.
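A quick illustration with SnowballC's wordStem():
wordStem(c("models", "modeling", "modeled"))
# all three collapse to the stem "model", so they are counted as one token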
# Stemming the tokens with SnowballC's wordStem()
projects_tidy2 <- projects_tidy2 %>%
mutate(word = wordStem(word))
Cast a Document-Term Matrix
projects_tds_DTM <- projects_tidy2 %>%
count(video_type, word) %>%
cast_dtm(video_type, word, n)
projects_tds_DTM
## <<DocumentTermMatrix (documents: 7, terms: 1815)>>
## Non-/sparse entries: 3199/9506
## Sparsity : 75%
## Maximal term length: 55
## Weighting : term frequency (tf)
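As an optional aside (a sketch, not part of the original pipeline), tf-idf highlights terms distinctive to one video type rather than merely frequent overall; bind_tf_idf() works directly on the counts used to cast the DTM:
projects_tfidf <- projects_tidy2 %>%
count(video_type, word) %>%
bind_tf_idf(term = word, document = video_type, n = n)
projects_tfidf %>% arrange(desc(tf_idf))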
projects_bigrams <- projects_raw %>%
unnest_tokens(output = bigram, input = comments, token = "ngrams", n = 2)
projects_bigrams <- projects_bigrams %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
mutate(word1 = wordStem(word1)) %>%
mutate(word2 = wordStem(word2)) %>%
unite(bigram, c(word1, word2), sep = " ")
bigram_top_tokens <- projects_bigrams %>%
count(bigram, sort = TRUE) %>%
top_n(20)
## Selecting by n
bigram_top_tokens
## # A tibble: 20 x 2
## bigram n
## <chr> <int>
## 1 data scienc 71
## 2 hei ken 37
## 3 data scientist 30
## 4 info chrome 21
## 5 session info 21
## 6 css selector 18
## 7 element method 17
## 8 locat element 17
## 9 method css 17
## 10 selector selector 17
## 11 lib site 16
## 12 selector select 16
## 13 site packag 16
## 14 element unabl 14
## 15 data clean 13
## 16 df job_stat 13
## 17 ken jee 13
## 18 anaconda3 lib 12
## 19 appli lambda 12
## 20 salari estim 12
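A common follow-up is to draw the frequent bigrams as a network (a sketch; it assumes the igraph and ggraph packages are installed, since neither is attached above):
library(igraph)
library(ggraph)
# Re-split the united bigrams into an edge list and keep the frequent ones
bigram_graph <- projects_bigrams %>%
count(bigram, sort = TRUE) %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(n > 10) %>%
graph_from_data_frame()
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)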
# Evaluate candidate numbers of topics (k); CaoJuan2009 and Arun2010 should be minimized, Griffiths2004 and Deveaud2014 maximized
k_metrics <- FindTopicsNumber(
projects_tds_DTM,
topics = seq(5, 50, by = 5),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = NA,
return_models = FALSE,
verbose = FALSE,
libpath = NULL
)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Fit a 4-topic LDA model (k chosen from the diagnostics above)
projects_lda <- LDA(projects_tds_DTM,
k = 4,
control = list(seed = 588))
terms(projects_lda, 5)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "request" "error" "error" "salari"
## [2,] "error" "element" "model" "help"
## [3,] "file" "code" "code" "df"
## [4,] "line" "job" "learn" "learn"
## [5,] "model" "click" "watch" "job"
tidy_lda <- tidy(projects_lda)
tidy_lda
## # A tibble: 7,260 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 _imagingft 1.63e-178
## 2 2 _imagingft 1.05e-189
## 3 3 _imagingft 7.17e- 4
## 4 4 _imagingft 1.63e-176
## 5 1 8hr 2.87e-178
## 6 2 8hr 1.64e-189
## 7 3 8hr 7.17e- 4
## 8 4 8hr 1.41e-176
## 9 1 abrevi 3.39e-178
## 10 2 abrevi 1.73e-189
## # ... with 7,250 more rows
top_terms_lda <- tidy_lda %>%
group_by(topic) %>%
slice_max(beta, n = 5, with_ties = FALSE) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_lda %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(beta, term, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
labs(title = "Top 5 terms in each LDA topic",
x = expression(beta), y = NULL) +
facet_wrap(~ topic, ncol = 4, scales = "free")
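The beta matrix plotted above describes each topic by its terms; the companion gamma matrix gives each document's topic mix. Here a "document" is a video type, because the DTM was cast at that level. A sketch:
lda_gamma <- tidy(projects_lda, matrix = "gamma")
lda_gamma %>% arrange(document, desc(gamma))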
# BTM: Biterm Topic Model, designed for short texts such as comments
dl <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to C:/Users/doreen/Documents/Youtube-SentimentAnalysis/ProjectBased/english-ewt-ud-2.5-191206.udpipe
## - This model has been trained on version 2.5 of data from https://universaldependencies.org
## - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
## - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
## - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
## Downloading finished, model stored at 'C:/Users/doreen/Documents/Youtube-SentimentAnalysis/ProjectBased/english-ewt-ud-2.5-191206.udpipe'
udmodel_english <- udpipe_load_model(file = dl$file_model)
s <- udpipe_annotate(udmodel_english, projects_raw$text)
x <- as.data.frame(s)
str(x)
## 'data.frame': 26701 obs. of 14 variables:
## $ doc_id : chr "doc1" "doc1" "doc1" "doc1" ...
## $ paragraph_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sentence_id : int 1 1 1 1 2 2 1 1 1 1 ...
## $ sentence : chr "a job simp ?" "a job simp ?" "a job simp ?" "a job simp ?" ...
## $ token_id : chr "1" "2" "3" "4" ...
## $ token : chr "a" "job" "simp" "?" ...
## $ lemma : chr "a" "job" "simp" "?" ...
## $ upos : chr "DET" "NOUN" "NOUN" "PUNCT" ...
## $ xpos : chr "DT" "NN" "NN" "." ...
## $ feats : chr "Definite=Ind|PronType=Art" "Number=Sing" "Number=Sing" NA ...
## $ head_token_id: chr "3" "3" "0" "3" ...
## $ dep_rel : chr "det" "compound" "root" "punct" ...
## $ deps : chr NA NA NA NA ...
## $ misc : chr NA NA NA NA ...
table(x$upos)
##
## ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT
## 1396 1714 1396 1422 583 1811 251 5194 418 665 2516 1003 4146
## SCONJ SYM VERB X
## 429 183 3026 548
anno <- udpipe(projects_raw, "english", trace = 10)
## 2022-10-14 13:43:40 Annotating text fragment 1/615
## ...
## 2022-10-14 13:44:15 Annotating text fragment 611/615
biterms <- as.data.table(anno)
# Biterms: co-occurrences of nouns, adjectives, and proper nouns within a 5-token skipgram window, per document
biterms <- biterms[, cooccurrence(x = lemma,
relevant = upos %in% c("NOUN",
"ADJ",
"PROPN"),
skipgram = 5),
by = list(doc_id)]
# Build BTM
set.seed(588)
traindata <- subset(anno, upos %in% c("NOUN", "ADJ", "PROPN"))
traindata <- traindata[, c("doc_id", "lemma")]
model <- BTM(traindata, k = 10,
beta = 0.01,
iter = 500,
biterms = biterms,
trace = 100)
## 2022-10-14 13:44:36 Start Gibbs sampling iteration 1/500
## 2022-10-14 13:44:37 Start Gibbs sampling iteration 101/500
## 2022-10-14 13:44:38 Start Gibbs sampling iteration 201/500
## 2022-10-14 13:44:38 Start Gibbs sampling iteration 301/500
## 2022-10-14 13:44:39 Start Gibbs sampling iteration 401/500
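Because the network plot below is left commented out for knitting, the fitted topics can also be inspected as plain tables (a sketch, assuming the BTM package's terms() method):
terms(model, top_n = 5)
# returns a list of data frames with the top 5 tokens and their probabilities per topic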
#Plot Model Results (do not run when knitting)
#library(textplot)
#library(ggraph)
#library(concaveman)
#plot(model, top_n = 10,title = "BTM model", subtitle = "K = 10 , 500 Training Iterations", labels = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"))