#loading libraries
library(dplyr)
library(readr)
library(tidyr)
library(rtweet)
library(writexl)
library(readxl)
library(tidytext)
library(textdata)
library(ggplot2)
library(textdata)
library(scales)
# Read one workbook of YouTube comments, keep only the comment text and the
# video identifier, and tag every row with the type of video it came from.
#   path  - path to the .xlsx workbook
#   label - value stored in the new `videotype` column
# Returns a table with columns videotype, comments, video_ID (in that order).
read_labelled_comments <- function(path, label) {
  read_xlsx(path) %>%
    select(comments, video_ID) %>%
    mutate(videotype = label) %>%
    relocate(videotype)
}

# Load the three workbooks; the column selection happens exactly once per file.
learning_datascience <- read_labelled_comments(
  "data/Learning_Datascience.xlsx", "learning"
)
projects <- read_labelled_comments("data/Project_Tutorials.xlsx", "projects")
motivation <- read_labelled_comments("data/Motivation.xlsx", "motivation")
# Stack the three labelled tables into one data set and show its last rows
youtubecomments <- list(learning_datascience, projects, motivation) %>%
  bind_rows()
tail(youtubecomments)
## # A tibble: 6 × 3
## videotype comments video…¹
## <chr> <chr> <chr>
## 1 motivation "Awesome video! This really helped as I am a undergrad phy… xjhW1r…
## 2 motivation "Thank you for this video! I have a nice video suggestion.… xjhW1r…
## 3 motivation "I am curious as to your opinion on these 3 month bootcamp… xjhW1r…
## 4 motivation "one thing which makes me really enjoy your content is how… xjhW1r…
## 5 motivation "Thank you Ken, I am now a business student in UK, inspire… xjhW1r…
## 6 motivation "Hi Ken, I want to pursue career in Data Science and I'm s… xjhW1r…
## # … with abbreviated variable name ¹​video_ID
# Tokenize: split each comment into one word per row. The `comments` column
# is replaced by a `word` column.
video_tokens <- unnest_tokens(
  youtubecomments,
  output = word,
  input = comments
)
# Remove every token that appears in the `stop_words` table
tidy_comments <- anti_join(video_tokens, stop_words, by = "word")
# Most frequent remaining words. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
count(tidy_comments, word, sort = TRUE)
## # A tibble: 5,441 × 2
## word n
## <chr> <int>
## 1 data 1273
## 2 science 749
## 3 ken 503
## 4 video 449
## 5 learning 330
## 6 learn 275
## 7 python 233
## 8 project 220
## 9 projects 215
## 10 videos 198
## # … with 5,431 more rows
# AFINN lexicon: one row per word with a numeric score in `value`
afinn <- get_sentiments(lexicon = "afinn")
afinn
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
# Bing lexicon: one row per word with a label in `sentiment`
bing <- get_sentiments(lexicon = "bing")
bing
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
# NRC lexicon: a word can appear on several rows, once per `sentiment` label
nrc <- get_sentiments(lexicon = "nrc")
nrc
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
# Loughran lexicon: one `sentiment` label per row
loughran <- get_sentiments(lexicon = "loughran")
loughran
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # … with 4,140 more rows
# Attach AFINN scores to the cleaned tokens; the inner join keeps only words
# that are present in the lexicon
sentiment_afinn <- tidy_comments %>%
  inner_join(afinn, by = "word")
sentiment_afinn
## # A tibble: 2,263 × 4
## videotype video_ID word value
## <chr> <chr> <chr> <dbl>
## 1 learning 4OZip0cgOho lost -3
## 2 learning 4OZip0cgOho helpful 2
## 3 learning 4OZip0cgOho doubt -1
## 4 learning 4OZip0cgOho improve 2
## 5 learning 4OZip0cgOho limited -1
## 6 learning 4OZip0cgOho confident 2
## 7 learning 4OZip0cgOho nice 3
## 8 learning 4OZip0cgOho love 3
## 9 learning 4OZip0cgOho hope 2
## 10 learning 4OZip0cgOho helpful 2
## # … with 2,253 more rows
# Attach Loughran labels; a word with several labels yields several rows
sentiment_loughran <- tidy_comments %>%
  inner_join(loughran, by = "word")
sentiment_loughran
## # A tibble: 1,270 × 4
## videotype video_ID word sentiment
## <chr> <chr> <chr> <chr>
## 1 learning 4OZip0cgOho lost negative
## 2 learning 4OZip0cgOho doubt negative
## 3 learning 4OZip0cgOho doubt uncertainty
## 4 learning 4OZip0cgOho improve positive
## 5 learning 4OZip0cgOho exposure uncertainty
## 6 learning 4OZip0cgOho confident positive
## 7 learning 4OZip0cgOho depending uncertainty
## 8 learning 4OZip0cgOho depending constraining
## 9 learning 4OZip0cgOho success positive
## 10 learning 4OZip0cgOho question negative
## # … with 1,260 more rows
# Attach Bing labels to the cleaned tokens
sentiment_bing <- tidy_comments %>%
  inner_join(bing, by = "word")
sentiment_bing
## # A tibble: 2,538 × 4
## videotype video_ID word sentiment
## <chr> <chr> <chr> <chr>
## 1 learning 4OZip0cgOho scratch negative
## 2 learning 4OZip0cgOho afford positive
## 3 learning 4OZip0cgOho lost negative
## 4 learning 4OZip0cgOho helpful positive
## 5 learning 4OZip0cgOho doubt negative
## 6 learning 4OZip0cgOho improve positive
## 7 learning 4OZip0cgOho limited negative
## 8 learning 4OZip0cgOho confident positive
## 9 learning 4OZip0cgOho cloud negative
## 10 learning 4OZip0cgOho stranger negative
## # … with 2,528 more rows
# Attach NRC labels; a word with several labels yields several rows
sentiment_nrc <- tidy_comments %>%
  inner_join(nrc, by = "word")
sentiment_nrc
## # A tibble: 12,379 × 4
## videotype video_ID word sentiment
## <chr> <chr> <chr> <chr>
## 1 learning 4OZip0cgOho providing anticipation
## 2 learning 4OZip0cgOho providing joy
## 3 learning 4OZip0cgOho providing positive
## 4 learning 4OZip0cgOho providing trust
## 5 learning 4OZip0cgOho result anticipation
## 6 learning 4OZip0cgOho afford positive
## 7 learning 4OZip0cgOho time anticipation
## 8 learning 4OZip0cgOho learn positive
## 9 learning 4OZip0cgOho time anticipation
## 10 learning 4OZip0cgOho lost negative
## # … with 12,369 more rows
# Overall number of positive and negative words according to Bing
summary_bing <- sentiment_bing %>%
  count(sentiment, sort = TRUE)
summary_bing
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 positive 1650
## 2 negative 888
# Same count, now broken down by video type (result stays grouped by videotype)
summary_bing <- count(group_by(sentiment_bing, videotype), sentiment)
summary_bing
## # A tibble: 6 × 3
## # Groups: videotype [3]
## videotype sentiment n
## <chr> <chr> <int>
## 1 learning negative 426
## 2 learning positive 898
## 3 motivation negative 31
## 4 motivation positive 89
## 5 projects negative 431
## 6 projects positive 663
# Reshape to wide format: one row per video type, one column per polarity.
# pivot_wider() replaces the superseded spread(). values_fill gives a video
# type with no words of one polarity a count of 0 instead of NA, and
# names_sort keeps the columns in alphabetical order (negative, positive).
summary_bing <- sentiment_bing %>%
  group_by(videotype) %>%
  count(sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  )
summary_bing
## # A tibble: 3 × 3
## # Groups: videotype [3]
## videotype negative positive
## <chr> <int> <int>
## 1 learning 426 898
## 2 motivation 31 89
## 3 projects 431 663
# Net Bing sentiment per video type: positive word count minus negative word
# count, tagged with the lexicon name.
# pivot_wider() replaces the superseded spread(). values_fill = 0L keeps the
# subtraction from returning NA when a video type has no words of one polarity.
summary_bing <- sentiment_bing %>%
  group_by(videotype) %>%
  count(sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  ) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(lexicon = "bing") %>%
  relocate(lexicon)
summary_bing
## # A tibble: 3 × 5
## # Groups: videotype [3]
## lexicon videotype negative positive sentiment
## <chr> <chr> <int> <int> <int>
## 1 bing learning 426 898 472
## 2 bing motivation 31 89 58
## 3 bing projects 431 663 232
# First six AFINN-scored words
head(sentiment_afinn, n = 6)
## # A tibble: 6 × 4
## videotype video_ID word value
## <chr> <chr> <chr> <dbl>
## 1 learning 4OZip0cgOho lost -3
## 2 learning 4OZip0cgOho helpful 2
## 3 learning 4OZip0cgOho doubt -1
## 4 learning 4OZip0cgOho improve 2
## 5 learning 4OZip0cgOho limited -1
## 6 learning 4OZip0cgOho confident 2
# Net AFINN sentiment per video type: the sum of all word scores, with the
# lexicon name placed in the first column
summary_afinn <- sentiment_afinn %>%
  group_by(videotype) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN", .before = 1)
summary_afinn
## # A tibble: 3 × 3
## lexicon videotype sentiment
## <chr> <chr> <dbl>
## 1 AFINN learning 1192
## 2 AFINN motivation 154
## 3 AFINN projects 964
# Tag the Loughran matches with the lexicon name and group them by video type.
# NOTE(review): nothing is aggregated here; the table keeps one row per
# matched word and label.
summary_loughran <- sentiment_loughran %>%
  mutate(lexicon = "loughran", .before = 1) %>%
  group_by(videotype)
summary_loughran
## # A tibble: 1,270 × 5
## # Groups: videotype [3]
## lexicon videotype video_ID word sentiment
## <chr> <chr> <chr> <chr> <chr>
## 1 loughran learning 4OZip0cgOho lost negative
## 2 loughran learning 4OZip0cgOho doubt negative
## 3 loughran learning 4OZip0cgOho doubt uncertainty
## 4 loughran learning 4OZip0cgOho improve positive
## 5 loughran learning 4OZip0cgOho exposure uncertainty
## 6 loughran learning 4OZip0cgOho confident positive
## 7 loughran learning 4OZip0cgOho depending uncertainty
## 8 loughran learning 4OZip0cgOho depending constraining
## 9 loughran learning 4OZip0cgOho success positive
## 10 loughran learning 4OZip0cgOho question negative
## # … with 1,260 more rows
# Keep only the positive/negative NRC labels, one row per matched word, tagged
# with the lexicon name and grouped by video type.
# The previous summarise(sentiment) returned many rows per group, which dplyr
# 1.1.0 deprecates. select() + arrange() + group_by() yields the same table:
# rows ordered by video type, original order kept within each type.
# NOTE(review): nothing is aggregated here; the table keeps one row per word.
summary_nrc <- sentiment_nrc %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  select(videotype, sentiment) %>%
  arrange(videotype) %>%
  group_by(videotype) %>%
  mutate(lexicon = "NRC") %>%
  relocate(lexicon)
summary_nrc
## # A tibble: 5,594 × 3
## # Groups: videotype [3]
## lexicon videotype sentiment
## <chr> <chr> <chr>
## 1 NRC learning positive
## 2 NRC learning positive
## 3 NRC learning positive
## 4 NRC learning negative
## 5 NRC learning positive
## 6 NRC learning positive
## 7 NRC learning negative
## 8 NRC learning positive
## 9 NRC learning positive
## 10 NRC learning positive
## # … with 5,584 more rows
# Rebuild the AFINN-scored tokens from the raw comments in a single pipeline,
# additionally discarding the token "amp"
sentiment_afinn <- youtubecomments %>%
  unnest_tokens(word, comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "amp") %>%
  inner_join(afinn, by = "word")
sentiment_afinn
## # A tibble: 2,263 × 4
## videotype video_ID word value
## <chr> <chr> <chr> <dbl>
## 1 learning 4OZip0cgOho lost -3
## 2 learning 4OZip0cgOho helpful 2
## 3 learning 4OZip0cgOho doubt -1
## 4 learning 4OZip0cgOho improve 2
## 5 learning 4OZip0cgOho limited -1
## 6 learning 4OZip0cgOho confident 2
## 7 learning 4OZip0cgOho nice 3
## 8 learning 4OZip0cgOho love 3
## 9 learning 4OZip0cgOho hope 2
## 10 learning 4OZip0cgOho helpful 2
## # … with 2,253 more rows
# Total AFINN score per video type
afinn_score <- summarise(
  group_by(sentiment_afinn, videotype),
  value = sum(value)
)
afinn_score
## # A tibble: 3 × 2
## videotype value
## <chr> <dbl>
## 1 learning 1192
## 2 motivation 154
## 3 projects 964
# Flag each video type's total AFINN score as positive or negative.
# Totals of exactly zero are dropped first, so the remaining scores are
# either above or below zero.
afinn_sentiment <- mutate(
  filter(afinn_score, value != 0),
  sentiment = if_else(value > 0, "positive", "negative")
)
afinn_sentiment
## # A tibble: 3 × 3
## videotype value sentiment
## <chr> <dbl> <chr>
## 1 learning 1192 positive
## 2 motivation 154 positive
## 3 projects 964 positive
# Plot the proportion of positive and negative comments for learning videos.
# afinn_sentiment holds a single total score per video type, so counting its
# rows can only ever give one slice with n = 1. To get a real proportion,
# score every comment separately and count the comments by polarity.
# NOTE(review): assumes each row of youtubecomments is one comment - confirm.
afinn_counts <- youtubecomments %>%
  filter(videotype == "learning") %>%
  mutate(comment_id = row_number()) %>%
  unnest_tokens(output = word, input = comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "amp") %>%
  inner_join(afinn, by = "word") %>%
  # one net score per comment
  group_by(videotype, comment_id) %>%
  summarise(value = sum(value), .groups = "drop") %>%
  # comments whose scores cancel out are neither positive nor negative
  filter(value != 0) %>%
  mutate(sentiment = if_else(value < 0, "negative", "positive")) %>%
  count(videotype, sentiment)
afinn_counts %>%
  ggplot(aes(x = "", y = n, fill = sentiment)) +
  geom_bar(width = .6, stat = "identity") +
  labs(title = "Learning Videos",
       subtitle = "Proportion of Positive & Negative Comments") +
  # polar coordinates turn the single stacked bar into a pie chart
  coord_polar(theta = "y") +
  theme_void()

# Positive/negative word counts per video type, one table per lexicon.
# AFINN has numeric scores, so derive the label from the sign of the score
# after dropping zero scores.
summary_afinn2 <- sentiment_afinn %>%
  filter(value != 0) %>%
  mutate(sentiment = if_else(value > 0, "positive", "negative")) %>%
  group_by(videotype) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "AFINN")
summary_bing2 <- group_by(sentiment_bing, videotype) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "bing")
# NRC and Loughran carry other labels too; keep only the two polarities
summary_nrc2 <- sentiment_nrc %>%
  group_by(videotype) %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "nrc")
summary_loughran2 <- sentiment_loughran %>%
  group_by(videotype) %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "loughran")
# Stack the four tables, order by lexicon and video type, lexicon column first
summary_sentiment <- list(
  summary_afinn2,
  summary_bing2,
  summary_nrc2,
  summary_loughran2
) %>%
  bind_rows() %>%
  arrange(method, videotype) %>%
  select(method, everything())
summary_sentiment
## # A tibble: 24 × 4
## # Groups: videotype [3]
## method videotype sentiment n
## <chr> <chr> <chr> <int>
## 1 AFINN learning positive 858
## 2 AFINN learning negative 315
## 3 AFINN motivation positive 85
## 4 AFINN motivation negative 21
## 5 AFINN projects positive 685
## 6 AFINN projects negative 299
## 7 bing learning positive 898
## 8 bing learning negative 426
## 9 bing motivation positive 89
## 10 bing motivation negative 31
## # … with 14 more rows
# Total number of polarity words per lexicon and video type.
# .groups = "drop" returns an ungrouped table instead of relying on
# summarise()'s implicit regrouping.
total_counts <- summary_sentiment %>%
  group_by(method, videotype) %>%
  summarise(total = sum(n), .groups = "drop")
# Attach the totals to each count row. The join keys are spelled out so the
# join does not depend on whichever column names the two tables share.
sentiment_counts <- left_join(
  summary_sentiment,
  total_counts,
  by = c("method", "videotype")
)
sentiment_counts
## # A tibble: 24 × 5
## # Groups: videotype [3]
## method videotype sentiment n total
## <chr> <chr> <chr> <int> <int>
## 1 AFINN learning positive 858 1173
## 2 AFINN learning negative 315 1173
## 3 AFINN motivation positive 85 106
## 4 AFINN motivation negative 21 106
## 5 AFINN projects positive 685 984
## 6 AFINN projects negative 299 984
## 7 bing learning positive 898 1324
## 8 bing learning negative 426 1324
## 9 bing motivation positive 89 120
## 10 bing motivation negative 31 120
## # … with 14 more rows
# Add a column with each polarity's share of the total, as a percentage
sentiment_percents <- mutate(sentiment_counts, percent = n / total * 100)
sentiment_percents
## # A tibble: 24 × 6
## # Groups: videotype [3]
## method videotype sentiment n total percent
## <chr> <chr> <chr> <int> <int> <dbl>
## 1 AFINN learning positive 858 1173 73.1
## 2 AFINN learning negative 315 1173 26.9
## 3 AFINN motivation positive 85 106 80.2
## 4 AFINN motivation negative 21 106 19.8
## 5 AFINN projects positive 685 984 69.6
## 6 AFINN projects negative 299 984 30.4
## 7 bing learning positive 898 1324 67.8
## 8 bing learning negative 426 1324 32.2
## 9 bing motivation positive 89 120 74.2
## 10 bing motivation negative 31 120 25.8
## # … with 14 more rows
# Horizontal stacked bars: polarity share per video type, one panel per lexicon
ggplot(sentiment_percents, aes(x = videotype, y = percent, fill = sentiment)) +
  geom_col(width = .8) +
  facet_wrap(~method, ncol = 1) +
  coord_flip() +
  labs(
    title = "Public Sentiment on Data Science Youtube Videos",
    subtitle = "From Learning, Motivation and Project based Videos",
    x = "Video Type",
    y = "Percentage of Words"
  )
