library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(SnowballC)
library(topicmodels)
library(stm)
## stm v1.3.6 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
library(ldatuning)
library(knitr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(BTM)
library(textplot)
library(concaveman)
library(udpipe)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(stopwords)
##
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
##
## stopwords
# (dplyr, readr, tidyr, and ggplot2 are already attached by tidyverse above)
library(readxl)
library(writexl)
library(textdata)
library(wordcloud2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
projects_raw <- read_xlsx("data/datascience_projects_tutorials.xlsx")
# Select the columns needed for analysis
projects_raw <- select(projects_raw, comment_ID, comments, video_type)
# Combine the comment text and its ID into one column (used later for udpipe annotation)
projects_raw$text <- paste(projects_raw$comments, projects_raw$comment_ID, sep = " ")
# Rename the ID column
colnames(projects_raw)[1] <- "doc_id"
projects_raw
## # A tibble: 615 x 4
## doc_id comments video_type text
## <chr> <chr> <chr> <chr>
## 1 Ugxax9wg3rCw8BPIHut4AaABAg "a job simp ? eh" Data Analy~ "a job simp ? eh~
## 2 UgxsPZj3fphkrKn1ypZ4AaABAg "Way to go! This wa~ Data Analy~ "Way to go! This~
## 3 UgwXQLs5mfce95rFEMh4AaABAg "Thank you Ken! I l~ Data Analy~ "Thank you Ken! ~
## 4 UgwAXAOt8KGbNNER-7l4AaABAg "Hey Ken Jee, thank~ Data Analy~ "Hey Ken Jee, th~
## 5 UgySrmKyJHFW_WCzzf14AaABAg "Very informative p~ Data Analy~ "Very informativ~
## 6 UgzZC4VQZYdabgEfzYl4AaABAg "can anyone help me~ Data Analy~ "can anyone help~
## 7 UgyndZ-CYBzC819u_V14AaABAg "I have a coding te~ Data Analy~ "I have a coding~
## 8 UgwViqMr1ECmqkOCvXp4AaABAg "can You provide Us~ Data Analy~ "can You provide~
## 9 Ugz8cVruhbFKc9045HR4AaABAg "Amazing job ken th~ Data Analy~ "Amazing job ken~
## 10 Ugw5EP72lW_NftTJZZx4AaABAg "Should I use Excel~ Data Analy~ "Should I use Ex~
## # ... with 605 more rows
projects_raw %>%
ggplot(aes(x = video_type)) +
geom_bar(show.legend = FALSE) +
labs(y = "Count",
x = "Video Type",
title = "Project Tutorials",
subtitle = "Categorized by Video Types")
#tokenizing text
projects_tidy <- projects_raw %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
# Removing customized words and saving in a new data frame
# ("ðŸ" is a mis-encoded emoji fragment left over from the comment export)
my_stopwords <- c("ken", "science", "thank", "video", "videos", "hey", "data", "ðŸ", "series", "projects", "project")
projects_tidy2 <-
projects_tidy %>%
filter(!word %in% my_stopwords)
# Remove numbers (filter() is safer than negative grep() indexing, which drops every row when nothing matches)
projects_tidy2 <- filter(projects_tidy2, !grepl("\\b\\d+\\b", word))
projects_top_tokens <- projects_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(projects_top_tokens)
# Frequent words across all tutorial videos combined
projects_top_tokens %>%
filter(n > 4) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
coord_flip() +
labs(x = "Word \n", y = "\n Count ", title = "Frequent Words on Project Tutorials \n") +
geom_text(aes(label = n), hjust = 3, colour = "white", fontface = "bold") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
axis.title.y = element_text(face="bold", colour="darkblue", size = 12))
projects_tidy2 %>%
group_by(video_type) %>%
count(word, sort = TRUE) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder_within(word, n, video_type)) %>%
ggplot(aes(x = word, y = n, fill = word)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ video_type, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(expand = c(0,0)) +
labs(y = "Count",
x = "Unique words",
title = "Most frequent words by video name",
subtitle = "Stop words removed from the list")
## Selecting by n
# Data frames for each video type
datacollection <- projects_raw %>% filter(video_type == "Data Collection")
datacleaning <- projects_raw %>% filter(video_type == "Data Cleaning")
dataanalysis <- projects_raw %>% filter(video_type == "Data Analysis")
projectplanning <- projects_raw %>% filter(video_type == "Project Planning")
documentation <- projects_raw %>% filter(video_type == "Documentation")
modelbuilding <- projects_raw %>% filter(video_type == "Model Building")
modelproduction <- projects_raw %>% filter(video_type == "Model Production")
#tokenizing text
datacollection_tidy <- datacollection %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
datacleaning_tidy <- datacleaning %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
dataanalysis_tidy <- dataanalysis %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
projectplanning_tidy <- projectplanning %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
documentation_tidy <- documentation %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
modelbuilding_tidy <- modelbuilding %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
modelproduction_tidy <- modelproduction %>%
unnest_tokens(output = word, input = comments) %>%
anti_join(stop_words, by = "word")
# Removing customized words from each per-type data frame
my_stopwords <- c("ken", "science", "thank", "video", "videos", "hey", "data", "series","projects", "project")
datacollection_tidy2 <-
datacollection_tidy %>%
filter(!word %in% my_stopwords)
dataanalysis_tidy2 <-
dataanalysis_tidy %>%
filter(!word %in% my_stopwords)
projectplanning_tidy2 <-
projectplanning_tidy %>%
filter(!word %in% my_stopwords)
datacleaning_tidy2 <-
datacleaning_tidy %>%
filter(!word %in% my_stopwords)
documentation_tidy2 <-
documentation_tidy %>%
filter(!word %in% my_stopwords)
modelbuilding_tidy2 <-
modelbuilding_tidy %>%
filter(!word %in% my_stopwords)
modelproduction_tidy2 <-
modelproduction_tidy %>%
filter(!word %in% my_stopwords)
# Remove numbers (same filter-based approach as above; negative grep() indexing drops every row when there are no matches)
datacollection_tidy2 <- filter(datacollection_tidy2, !grepl("\\b\\d+\\b", word))
dataanalysis_tidy2 <- filter(dataanalysis_tidy2, !grepl("\\b\\d+\\b", word))
projectplanning_tidy2 <- filter(projectplanning_tidy2, !grepl("\\b\\d+\\b", word))
datacleaning_tidy2 <- filter(datacleaning_tidy2, !grepl("\\b\\d+\\b", word))
documentation_tidy2 <- filter(documentation_tidy2, !grepl("\\b\\d+\\b", word))
modelbuilding_tidy2 <- filter(modelbuilding_tidy2, !grepl("\\b\\d+\\b", word))
modelproduction_tidy2 <- filter(modelproduction_tidy2, !grepl("\\b\\d+\\b", word))
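The seven pipelines above repeat identical steps. Since projects_tidy2 already holds the cleaned tokens for every comment, an equivalent shortcut is to split it by video type (a sketch; note the first my_stopwords vector also contained the mojibake token, so counts may differ by one term):
# Sketch: one named list of per-type token data frames
tidy_by_type <- split(projects_tidy2, projects_tidy2$video_type)
# e.g. tidy_by_type[["Data Cleaning"]] corresponds to datacleaning_tidy2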
### Project Planning WordCloud
projectplanning_top_tokens <- projectplanning_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(projectplanning_top_tokens)
### Data Collection WordCloud
datacollection_top_tokens <- datacollection_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(datacollection_top_tokens)
### Data Cleaning WordCloud
datacleaning_top_tokens <- datacleaning_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(datacleaning_top_tokens)
### Data Analysis WordCloud
dataanalysis_top_tokens <- dataanalysis_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(dataanalysis_top_tokens)
### Model Building WordCloud
modelbuilding_top_tokens <- modelbuilding_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(modelbuilding_top_tokens)
### Model Production WordCloud
modelproduction_top_tokens <- modelproduction_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(modelproduction_top_tokens)
### Documentation WordCloud
documentation_top_tokens <- documentation_tidy2 %>%
count(word, sort = TRUE) %>%
top_n(30)
## Selecting by n
wordcloud2(documentation_top_tokens)
projectplanning <-
select(projectplanning,comments, video_type) %>%
mutate(videotype = "projectplanning") %>%
relocate(videotype)
datacollection <-
select(datacollection,comments, video_type) %>%
mutate(videotype = "datacollection") %>%
relocate(videotype)
datacleaning <-
select(datacleaning,comments, video_type) %>%
mutate(videotype = "datacleaning") %>%
relocate(videotype)
dataanalysis <-
select(dataanalysis,comments, video_type) %>%
mutate(videotype = "dataanalysis") %>%
relocate(videotype)
documentation <-
select(documentation,comments, video_type) %>%
mutate(videotype = "documentation") %>%
relocate(videotype)
modelbuilding <-
select(modelbuilding,comments, video_type) %>%
mutate(videotype = "modelbuilding") %>%
relocate(videotype)
modelproduction <-
select(modelproduction,comments,video_type) %>%
mutate(videotype = "modelproduction") %>%
relocate(videotype)
#binding rows
projectbasedyoutubecomments <- bind_rows(projectplanning, datacollection, datacleaning, dataanalysis, modelbuilding, modelproduction, documentation)
tail(projectbasedyoutubecomments)
## # A tibble: 6 x 3
## videotype comments video_type
## <chr> <chr> <chr>
## 1 documentation Thanks for creating this series. Watched and liked ~ Documentat~
## 2 documentation thanks so much for these indepth walkthroughs. Documentat~
## 3 documentation Really a super awesome series ken. It helped a lot ~ Documentat~
## 4 documentation loved this series Ken! Documentat~
## 5 documentation keep doing great work man Documentat~
## 6 documentation Hi Everyone! Thank you for watching this series. It~ Documentat~
#tokenizing text
projectbasedyoutube_tokens <-
projectbasedyoutubecomments %>%
unnest_tokens(output = word,
input = comments)
# Removing stop words, then counting common words
tidy_comments <-
projectbasedyoutube_tokens %>%
anti_join(stop_words, by = "word")
count(tidy_comments, word, sort = T)
## # A tibble: 2,407 x 2
## word n
## <chr> <int>
## 1 data 248
## 2 ken 212
## 3 series 142
## 4 error 119
## 5 video 110
## 6 project 92
## 7 code 84
## 8 science 72
## 9 element 70
## 10 job 69
## # ... with 2,397 more rows
# Loading the AFINN lexicon
afinn <- get_sentiments("afinn")
afinn
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
# The Bing lexicon
bing <- get_sentiments("bing")
bing
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,875 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,865 more rows
loughran <- get_sentiments("loughran")
loughran
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
# Joining tidy comments with the AFINN lexicon
sentiment_afinn <- inner_join(tidy_comments, afinn, by = "word")
sentiment_afinn
## # A tibble: 770 x 4
## videotype video_type word value
## <chr> <chr> <chr> <dbl>
## 1 projectplanning Project Planning god 1
## 2 projectplanning Project Planning bless 2
## 3 projectplanning Project Planning love 3
## 4 projectplanning Project Planning diamond 1
## 5 projectplanning Project Planning robust 2
## 6 projectplanning Project Planning amazing 4
## 7 projectplanning Project Planning glad 3
## 8 projectplanning Project Planning woo 3
## 9 projectplanning Project Planning fan 3
## 10 projectplanning Project Planning wonderful 4
## # ... with 760 more rows
sentiment_loughran <- inner_join(tidy_comments, loughran, by = "word")
sentiment_loughran
## # A tibble: 503 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning appealing litigious
## 2 projectplanning Project Planning probability uncertainty
## 3 projectplanning Project Planning stopped negative
## 4 projectplanning Project Planning probability uncertainty
## 5 projectplanning Project Planning perfect positive
## 6 projectplanning Project Planning risk uncertainty
## 7 projectplanning Project Planning slow negative
## 8 projectplanning Project Planning favorite positive
## 9 projectplanning Project Planning perfect positive
## 10 projectplanning Project Planning informative positive
## # ... with 493 more rows
sentiment_bing <- inner_join(tidy_comments, bing, by = "word")
sentiment_bing
## # A tibble: 851 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning gem positive
## 2 projectplanning Project Planning appealing positive
## 3 projectplanning Project Planning bless positive
## 4 projectplanning Project Planning love positive
## 5 projectplanning Project Planning scratch negative
## 6 projectplanning Project Planning robust positive
## 7 projectplanning Project Planning amazing positive
## 8 projectplanning Project Planning overwhelmed negative
## 9 projectplanning Project Planning glad positive
## 10 projectplanning Project Planning woo positive
## # ... with 841 more rows
sentiment_nrc <- inner_join(tidy_comments, nrc, by = "word")
sentiment_nrc
## # A tibble: 3,528 x 4
## videotype video_type word sentiment
## <chr> <chr> <chr> <chr>
## 1 projectplanning Project Planning absolute positive
## 2 projectplanning Project Planning gem joy
## 3 projectplanning Project Planning gem positive
## 4 projectplanning Project Planning ken positive
## 5 projectplanning Project Planning effort positive
## 6 projectplanning Project Planning god anticipation
## 7 projectplanning Project Planning god fear
## 8 projectplanning Project Planning god joy
## 9 projectplanning Project Planning god positive
## 10 projectplanning Project Planning god trust
## # ... with 3,518 more rows
summary_bing <- count(sentiment_bing, sentiment, sort = TRUE)
summary_bing
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 positive 473
## 2 negative 378
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment)
summary_bing
## # A tibble: 14 x 3
## # Groups: videotype [7]
## videotype sentiment n
## <chr> <chr> <int>
## 1 dataanalysis negative 35
## 2 dataanalysis positive 49
## 3 datacleaning negative 68
## 4 datacleaning positive 135
## 5 datacollection negative 170
## 6 datacollection positive 104
## 7 documentation negative 12
## 8 documentation positive 55
## 9 modelbuilding negative 50
## 10 modelbuilding positive 44
## 11 modelproduction negative 29
## 12 modelproduction positive 33
## 13 projectplanning negative 14
## 14 projectplanning positive 53
# Untidy the data: spread sentiment counts into separate columns
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
spread(sentiment, n)
summary_bing
## # A tibble: 7 x 3
## # Groups: videotype [7]
## videotype negative positive
## <chr> <int> <int>
## 1 dataanalysis 35 49
## 2 datacleaning 68 135
## 3 datacollection 170 104
## 4 documentation 12 55
## 5 modelbuilding 50 44
## 6 modelproduction 29 33
## 7 projectplanning 14 53
summary_bing <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
spread(sentiment, n) %>%
mutate(sentiment = positive - negative) %>%
mutate(lexicon = "bing") %>%
relocate(lexicon)
summary_bing
## # A tibble: 7 x 5
## # Groups: videotype [7]
## lexicon videotype negative positive sentiment
## <chr> <chr> <int> <int> <int>
## 1 bing dataanalysis 35 49 14
## 2 bing datacleaning 68 135 67
## 3 bing datacollection 170 104 -66
## 4 bing documentation 12 55 43
## 5 bing modelbuilding 50 44 -6
## 6 bing modelproduction 29 33 4
## 7 bing projectplanning 14 53 39
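spread() still works but is superseded in current tidyr; a pivot_wider() equivalent of the step above would be (a sketch producing the same columns):
summary_bing_alt <- sentiment_bing %>%
count(videotype, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)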
summary_afinn <- sentiment_afinn %>%
group_by(videotype) %>%
summarise(sentiment = sum(value)) %>%
mutate(lexicon = "AFINN") %>%
relocate(lexicon)
summary_afinn
## # A tibble: 7 x 3
## lexicon videotype sentiment
## <chr> <chr> <dbl>
## 1 AFINN dataanalysis 80
## 2 AFINN datacleaning 180
## 3 AFINN datacollection 75
## 4 AFINN documentation 79
## 5 AFINN modelbuilding 56
## 6 AFINN modelproduction 26
## 7 AFINN projectplanning 108
summary_loughran <- sentiment_loughran %>%
group_by(videotype) %>%
mutate(lexicon = "loughran") %>%
relocate(lexicon)
summary_loughran
## # A tibble: 503 x 5
## # Groups: videotype [7]
## lexicon videotype video_type word sentiment
## <chr> <chr> <chr> <chr> <chr>
## 1 loughran projectplanning Project Planning appealing litigious
## 2 loughran projectplanning Project Planning probability uncertainty
## 3 loughran projectplanning Project Planning stopped negative
## 4 loughran projectplanning Project Planning probability uncertainty
## 5 loughran projectplanning Project Planning perfect positive
## 6 loughran projectplanning Project Planning risk uncertainty
## 7 loughran projectplanning Project Planning slow negative
## 8 loughran projectplanning Project Planning favorite positive
## 9 loughran projectplanning Project Planning perfect positive
## 10 loughran projectplanning Project Planning informative positive
## # ... with 493 more rows
summary_nrc <- sentiment_nrc %>%
group_by(videotype) %>%
filter(sentiment %in% c("positive", "negative")) %>%
summarise(sentiment) %>%
mutate(lexicon = "NRC") %>%
relocate(lexicon)
## `summarise()` has grouped output by 'videotype'. You can override using the `.groups` argument.
summary_nrc
## # A tibble: 1,544 x 3
## # Groups: videotype [7]
## lexicon videotype sentiment
## <chr> <chr> <chr>
## 1 NRC dataanalysis positive
## 2 NRC dataanalysis positive
## 3 NRC dataanalysis positive
## 4 NRC dataanalysis positive
## 5 NRC dataanalysis negative
## 6 NRC dataanalysis positive
## 7 NRC dataanalysis positive
## 8 NRC dataanalysis negative
## 9 NRC dataanalysis negative
## 10 NRC dataanalysis positive
## # ... with 1,534 more rows
sentiment_afinn <- projectbasedyoutubecomments %>%
unnest_tokens(output = word,
input = comments) %>%
anti_join(stop_words, by = "word") %>%
filter(!word == "amp") %>%
inner_join(afinn, by = "word")
sentiment_afinn
## # A tibble: 770 x 4
## videotype video_type word value
## <chr> <chr> <chr> <dbl>
## 1 projectplanning Project Planning god 1
## 2 projectplanning Project Planning bless 2
## 3 projectplanning Project Planning love 3
## 4 projectplanning Project Planning diamond 1
## 5 projectplanning Project Planning robust 2
## 6 projectplanning Project Planning amazing 4
## 7 projectplanning Project Planning glad 3
## 8 projectplanning Project Planning woo 3
## 9 projectplanning Project Planning fan 3
## 10 projectplanning Project Planning wonderful 4
## # ... with 760 more rows
afinn_score <- sentiment_afinn %>%
group_by(videotype) %>%
summarise(value = sum(value))
afinn_score
## # A tibble: 7 x 2
## videotype value
## <chr> <dbl>
## 1 dataanalysis 80
## 2 datacleaning 180
## 3 datacollection 75
## 4 documentation 79
## 5 modelbuilding 56
## 6 modelproduction 26
## 7 projectplanning 108
# Adding a flag for whether each video type's net AFINN score is positive or negative
afinn_sentiment <- afinn_score %>%
filter(value != 0) %>%
mutate(sentiment = if_else(value < 0, "negative", "positive"))
afinn_sentiment
## # A tibble: 7 x 3
## videotype value sentiment
## <chr> <dbl> <chr>
## 1 dataanalysis 80 positive
## 2 datacleaning 180 positive
## 3 datacollection 75 positive
## 4 documentation 79 positive
## 5 modelbuilding 56 positive
## 6 modelproduction 26 positive
## 7 projectplanning 108 positive
summary_afinn2 <- sentiment_afinn %>%
group_by(videotype) %>%
filter(value != 0) %>%
mutate(sentiment = if_else(value < 0, "negative", "positive")) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "AFINN")
summary_bing2 <- sentiment_bing %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "bing")
summary_nrc2 <- sentiment_nrc %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "nrc")
summary_loughran2 <- sentiment_loughran %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(videotype) %>%
count(sentiment, sort = TRUE) %>%
mutate(method = "loughran")
summary_sentiment <- bind_rows(summary_afinn2,
summary_bing2,
summary_nrc2,
summary_loughran2) %>%
arrange(method, videotype) %>%
relocate(method)
summary_sentiment
## # A tibble: 56 x 4
## # Groups: videotype [7]
## method videotype sentiment n
## <chr> <chr> <chr> <int>
## 1 AFINN dataanalysis positive 58
## 2 AFINN dataanalysis negative 26
## 3 AFINN datacleaning positive 122
## 4 AFINN datacleaning negative 52
## 5 AFINN datacollection positive 148
## 6 AFINN datacollection negative 122
## 7 AFINN documentation positive 42
## 8 AFINN documentation negative 10
## 9 AFINN modelbuilding positive 50
## 10 AFINN modelbuilding negative 34
## # ... with 46 more rows
total_counts <- summary_sentiment %>%
group_by(method, videotype) %>%
summarise(total = sum(n))
## `summarise()` has grouped output by 'method'. You can override using the `.groups` argument.
sentiment_counts <- left_join(summary_sentiment, total_counts)
## Joining, by = c("method", "videotype")
sentiment_counts
## # A tibble: 56 x 5
## # Groups: videotype [7]
## method videotype sentiment n total
## <chr> <chr> <chr> <int> <int>
## 1 AFINN dataanalysis positive 58 84
## 2 AFINN dataanalysis negative 26 84
## 3 AFINN datacleaning positive 122 174
## 4 AFINN datacleaning negative 52 174
## 5 AFINN datacollection positive 148 270
## 6 AFINN datacollection negative 122 270
## 7 AFINN documentation positive 42 52
## 8 AFINN documentation negative 10 52
## 9 AFINN modelbuilding positive 50 84
## 10 AFINN modelbuilding negative 34 84
## # ... with 46 more rows
# New column that calculates the percentage
sentiment_percents <- sentiment_counts %>%
mutate(percent = n/total * 100)
sentiment_percents
## # A tibble: 56 x 6
## # Groups: videotype [7]
## method videotype sentiment n total percent
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 AFINN dataanalysis positive 58 84 69.0
## 2 AFINN dataanalysis negative 26 84 31.0
## 3 AFINN datacleaning positive 122 174 70.1
## 4 AFINN datacleaning negative 52 174 29.9
## 5 AFINN datacollection positive 148 270 54.8
## 6 AFINN datacollection negative 122 270 45.2
## 7 AFINN documentation positive 42 52 80.8
## 8 AFINN documentation negative 10 52 19.2
## 9 AFINN modelbuilding positive 50 84 59.5
## 10 AFINN modelbuilding negative 34 84 40.5
## # ... with 46 more rows
sentiment_percents %>%
ggplot(aes(x = videotype, y = percent, fill = sentiment)) +
geom_col(width = .8) +
facet_wrap(~method, ncol = 1) +
coord_flip() +
labs(title = "Public Sentiment on Project-Based YouTube Videos",
subtitle = "Across all seven stages, from Project Planning to Documentation",
x = "Video Type",
y = "Percentage of Words")
Stemming: reducing inflected words to a common root so that variants such as "models", "modeling", and "modeled" are counted together.
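A quick illustration with SnowballC's wordStem():
wordStem(c("models", "modeling", "modeled"))
# all three collapse to the stem "model", so they are counted as one token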
# Stemming the tokens with SnowballC's wordStem()
projects_tidy2 <- projects_tidy2 %>%
mutate(word = wordStem(word))
Cast a Document-Term Matrix
projects_tds_DTM <- projects_tidy2 %>%
count(video_type, word) %>%
cast_dtm(video_type, word, n)
projects_tds_DTM
## <<DocumentTermMatrix (documents: 7, terms: 1815)>>
## Non-/sparse entries: 3199/9506
## Sparsity : 75%
## Maximal term length: 55
## Weighting : term frequency (tf)
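As an optional aside (a sketch, not part of the original pipeline), tf-idf highlights terms distinctive to one video type rather than merely frequent overall; bind_tf_idf() works directly on the counts used to cast the DTM:
projects_tfidf <- projects_tidy2 %>%
count(video_type, word) %>%
bind_tf_idf(term = word, document = video_type, n = n)
projects_tfidf %>% arrange(desc(tf_idf))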
projects_bigrams <- projects_raw %>%
unnest_tokens(output = bigram, input = comments, token = "ngrams", n = 2)
projects_bigrams <- projects_bigrams %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
mutate(word1 = wordStem(word1)) %>%
mutate(word2 = wordStem(word2)) %>%
unite(bigram, c(word1, word2), sep = " ")
bigram_top_tokens <- projects_bigrams %>%
count(bigram, sort = TRUE) %>%
top_n(20)
## Selecting by n
bigram_top_tokens
## # A tibble: 20 x 2
## bigram n
## <chr> <int>
## 1 data scienc 71
## 2 hei ken 37
## 3 data scientist 30
## 4 info chrome 21
## 5 session info 21
## 6 css selector 18
## 7 element method 17
## 8 locat element 17
## 9 method css 17
## 10 selector selector 17
## 11 lib site 16
## 12 selector select 16
## 13 site packag 16
## 14 element unabl 14
## 15 data clean 13
## 16 df job_stat 13
## 17 ken jee 13
## 18 anaconda3 lib 12
## 19 appli lambda 12
## 20 salari estim 12
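A common follow-up is to draw the frequent bigrams as a network (a sketch; it assumes the igraph and ggraph packages are installed, since neither is attached above):
library(igraph)
library(ggraph)
# Re-split the united bigrams into an edge list and keep the frequent ones
bigram_graph <- projects_bigrams %>%
count(bigram, sort = TRUE) %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
filter(n > 10) %>%
graph_from_data_frame()
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)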
# Evaluate candidate numbers of topics (k); CaoJuan2009 and Arun2010 should be minimized, Griffiths2004 and Deveaud2014 maximized
k_metrics <- FindTopicsNumber(
projects_tds_DTM,
topics = seq(5, 50, by = 5),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = NA,
return_models = FALSE,
verbose = FALSE,
libpath = NULL
)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Fit a 4-topic LDA model (k chosen from the diagnostics above)
projects_lda <- LDA(projects_tds_DTM,
k = 4,
control = list(seed = 588))
terms(projects_lda, 5)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "request" "error" "error" "salari"
## [2,] "error" "element" "model" "help"
## [3,] "file" "code" "code" "df"
## [4,] "line" "job" "learn" "learn"
## [5,] "model" "click" "watch" "job"
tidy_lda <- tidy(projects_lda)
tidy_lda
## # A tibble: 7,260 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 _imagingft 1.63e-178
## 2 2 _imagingft 1.05e-189
## 3 3 _imagingft 7.17e- 4
## 4 4 _imagingft 1.63e-176
## 5 1 8hr 2.87e-178
## 6 2 8hr 1.64e-189
## 7 3 8hr 7.17e- 4
## 8 4 8hr 1.41e-176
## 9 1 abrevi 3.39e-178
## 10 2 abrevi 1.73e-189
## # ... with 7,250 more rows
top_terms_lda <- tidy_lda %>%
group_by(topic) %>%
slice_max(beta, n = 5, with_ties = FALSE) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_lda %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(beta, term, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
labs(title = "Top 5 terms in each LDA topic",
x = expression(beta), y = NULL) +
facet_wrap(~ topic, ncol = 4, scales = "free")
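The beta matrix plotted above describes each topic by its terms; the companion gamma matrix gives each document's topic mix. Here a "document" is a video type, because the DTM was cast at that level. A sketch:
lda_gamma <- tidy(projects_lda, matrix = "gamma")
lda_gamma %>% arrange(document, desc(gamma))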
# BTM: Biterm Topic Model, designed for short texts such as comments
dl <- udpipe_download_model(language = "english")
## Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to C:/Users/doreen/Documents/Youtube-SentimentAnalysis/ProjectBased/english-ewt-ud-2.5-191206.udpipe
## - This model has been trained on version 2.5 of data from https://universaldependencies.org
## - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
## - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
## - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
## Downloading finished, model stored at 'C:/Users/doreen/Documents/Youtube-SentimentAnalysis/ProjectBased/english-ewt-ud-2.5-191206.udpipe'
udmodel_english <- udpipe_load_model(file = dl$file_model)
s <- udpipe_annotate(udmodel_english, projects_raw$text)
x <- as.data.frame(s)
str(x)
## 'data.frame': 26701 obs. of 14 variables:
## $ doc_id : chr "doc1" "doc1" "doc1" "doc1" ...
## $ paragraph_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sentence_id : int 1 1 1 1 2 2 1 1 1 1 ...
## $ sentence : chr "a job simp ?" "a job simp ?" "a job simp ?" "a job simp ?" ...
## $ token_id : chr "1" "2" "3" "4" ...
## $ token : chr "a" "job" "simp" "?" ...
## $ lemma : chr "a" "job" "simp" "?" ...
## $ upos : chr "DET" "NOUN" "NOUN" "PUNCT" ...
## $ xpos : chr "DT" "NN" "NN" "." ...
## $ feats : chr "Definite=Ind|PronType=Art" "Number=Sing" "Number=Sing" NA ...
## $ head_token_id: chr "3" "3" "0" "3" ...
## $ dep_rel : chr "det" "compound" "root" "punct" ...
## $ deps : chr NA NA NA NA ...
## $ misc : chr NA NA NA NA ...
table(x$upos)
##
## ADJ ADP ADV AUX CCONJ DET INTJ NOUN NUM PART PRON PROPN PUNCT
## 1396 1714 1396 1422 583 1811 251 5194 418 665 2516 1003 4146
## SCONJ SYM VERB X
## 429 183 3026 548
anno <- udpipe(projects_raw, "english", trace = 10)
## 2022-10-14 13:43:40 Annotating text fragment 1/615
## ...
## 2022-10-14 13:44:15 Annotating text fragment 611/615
biterms <- as.data.table(anno)
# Biterms: co-occurrences of nouns, adjectives, and proper nouns within a 5-token skipgram window, per document
biterms <- biterms[, cooccurrence(x = lemma,
relevant = upos %in% c("NOUN",
"ADJ",
"PROPN"),
skipgram = 5),
by = list(doc_id)]
# Build BTM
set.seed(588)
traindata <- subset(anno, upos %in% c("NOUN", "ADJ", "PROPN"))
traindata <- traindata[, c("doc_id", "lemma")]
model <- BTM(traindata, k = 10,
beta = 0.01,
iter = 500,
biterms = biterms,
trace = 100)
## 2022-10-14 13:44:36 Start Gibbs sampling iteration 1/500
## 2022-10-14 13:44:37 Start Gibbs sampling iteration 101/500
## 2022-10-14 13:44:38 Start Gibbs sampling iteration 201/500
## 2022-10-14 13:44:38 Start Gibbs sampling iteration 301/500
## 2022-10-14 13:44:39 Start Gibbs sampling iteration 401/500
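Because the network plot below is left commented out for knitting, the fitted topics can also be inspected as plain tables (a sketch, assuming the BTM package's terms() method):
terms(model, top_n = 5)
# returns a list of data frames with the top 5 tokens and their probabilities per topic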
#Plot Model Results (do not run when knitting)
#library(textplot)
#library(ggraph)
#library(concaveman)
#plot(model, top_n = 10,title = "BTM model", subtitle = "K = 10 , 500 Training Iterations", labels = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"))