#loading libraries 
library(dplyr)
library(readr)
library(tidyr)
library(rtweet)
library(writexl)
library(readxl)
library(tidytext)
library(textdata)
library(ggplot2)
library(textdata)
library(scales)
#reading the excel files  
# Read one exported comment sheet, keep only the comment text and the video
# id, and tag every row with the kind of video it came from. The tag is moved
# to the front so all three tables share the layout
# videotype, comments, video_ID.
read_comments <- function(path, label) {
  read_xlsx(path) %>%
    select(comments, video_ID) %>%
    mutate(videotype = label) %>%
    relocate(videotype)
}

#selecting columns for analysis and labelling each source
learning_datascience <- read_comments(
  "data/Learning_Datascience.xlsx",
  "learning"
)

projects <- read_comments(
  "data/Project_Tutorials.xlsx",
  "projects"
)

motivation <- read_comments(
  "data/Motivation.xlsx",
  "motivation"
)
# Stack the three labelled tables into one comment table, then show its
# last rows as a quick sanity check.
youtubecomments <- learning_datascience %>%
  bind_rows(projects, motivation)
youtubecomments %>% tail()
## # A tibble: 6 × 3
##   videotype  comments                                                    video…¹
##   <chr>      <chr>                                                       <chr>  
## 1 motivation "Awesome video! This really helped as I am a undergrad phy… xjhW1r…
## 2 motivation "Thank you for this video! I have a nice video suggestion.… xjhW1r…
## 3 motivation "I am curious as to your opinion on these 3 month bootcamp… xjhW1r…
## 4 motivation "one thing which makes me really enjoy your content is how… xjhW1r…
## 5 motivation "Thank you Ken, I am now a business student in UK, inspire… xjhW1r…
## 6 motivation "Hi Ken, I want to pursue career in Data Science and I'm s… xjhW1r…
## # … with abbreviated variable name ¹​video_ID
#tokenizing text: split every comment into one row per word
video_tokens <-
  youtubecomments %>%
  unnest_tokens(output = word,
                input = comments)
#removing stopwords: drop every token that appears in tidytext's stop_words
tidy_comments <-
  video_tokens %>%
  anti_join(stop_words, by = "word")
# Most frequent remaining words first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
count(tidy_comments, word, sort = TRUE)
## # A tibble: 5,441 × 2
##    word         n
##    <chr>    <int>
##  1 data      1273
##  2 science    749
##  3 ken        503
##  4 video      449
##  5 learning   330
##  6 learn      275
##  7 python     233
##  8 project    220
##  9 projects   215
## 10 videos     198
## # … with 5,431 more rows
# AFINN lexicon: each word carries a numeric score in the `value` column.
afinn <- get_sentiments(lexicon = "afinn")

afinn
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
# Bing lexicon: each word carries a label in the `sentiment` column.
bing <- get_sentiments(lexicon = "bing")

bing
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
# NRC lexicon: words are paired with labels such as trust, fear or negative,
# and the same word can appear under several labels.
nrc <- get_sentiments(lexicon = "nrc")

nrc
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,862 more rows
# Loughran lexicon: each word carries a label in the `sentiment` column.
loughran <- get_sentiments(lexicon = "loughran")

loughran
## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # … with 4,140 more rows
# Keep only the comment words that AFINN scores, attaching each word's value.
sentiment_afinn <- tidy_comments %>%
  inner_join(afinn, by = "word")

sentiment_afinn
## # A tibble: 2,263 × 4
##    videotype video_ID    word      value
##    <chr>     <chr>       <chr>     <dbl>
##  1 learning  4OZip0cgOho lost         -3
##  2 learning  4OZip0cgOho helpful       2
##  3 learning  4OZip0cgOho doubt        -1
##  4 learning  4OZip0cgOho improve       2
##  5 learning  4OZip0cgOho limited      -1
##  6 learning  4OZip0cgOho confident     2
##  7 learning  4OZip0cgOho nice          3
##  8 learning  4OZip0cgOho love          3
##  9 learning  4OZip0cgOho hope          2
## 10 learning  4OZip0cgOho helpful       2
## # … with 2,253 more rows
# Keep only the comment words found in the Loughran lexicon. A word listed
# under several labels yields one row per label.
sentiment_loughran <- tidy_comments %>%
  inner_join(loughran, by = "word")

sentiment_loughran
## # A tibble: 1,270 × 4
##    videotype video_ID    word      sentiment   
##    <chr>     <chr>       <chr>     <chr>       
##  1 learning  4OZip0cgOho lost      negative    
##  2 learning  4OZip0cgOho doubt     negative    
##  3 learning  4OZip0cgOho doubt     uncertainty 
##  4 learning  4OZip0cgOho improve   positive    
##  5 learning  4OZip0cgOho exposure  uncertainty 
##  6 learning  4OZip0cgOho confident positive    
##  7 learning  4OZip0cgOho depending uncertainty 
##  8 learning  4OZip0cgOho depending constraining
##  9 learning  4OZip0cgOho success   positive    
## 10 learning  4OZip0cgOho question  negative    
## # … with 1,260 more rows
# Keep only the comment words found in the bing lexicon, with their label.
sentiment_bing <- tidy_comments %>%
  inner_join(bing, by = "word")

sentiment_bing
## # A tibble: 2,538 × 4
##    videotype video_ID    word      sentiment
##    <chr>     <chr>       <chr>     <chr>    
##  1 learning  4OZip0cgOho scratch   negative 
##  2 learning  4OZip0cgOho afford    positive 
##  3 learning  4OZip0cgOho lost      negative 
##  4 learning  4OZip0cgOho helpful   positive 
##  5 learning  4OZip0cgOho doubt     negative 
##  6 learning  4OZip0cgOho improve   positive 
##  7 learning  4OZip0cgOho limited   negative 
##  8 learning  4OZip0cgOho confident positive 
##  9 learning  4OZip0cgOho cloud     negative 
## 10 learning  4OZip0cgOho stranger  negative 
## # … with 2,528 more rows
# Keep only the comment words found in the NRC lexicon. A word listed under
# several labels yields one row per label.
sentiment_nrc <- tidy_comments %>%
  inner_join(nrc, by = "word")

sentiment_nrc
## # A tibble: 12,379 × 4
##    videotype video_ID    word      sentiment   
##    <chr>     <chr>       <chr>     <chr>       
##  1 learning  4OZip0cgOho providing anticipation
##  2 learning  4OZip0cgOho providing joy         
##  3 learning  4OZip0cgOho providing positive    
##  4 learning  4OZip0cgOho providing trust       
##  5 learning  4OZip0cgOho result    anticipation
##  6 learning  4OZip0cgOho afford    positive    
##  7 learning  4OZip0cgOho time      anticipation
##  8 learning  4OZip0cgOho learn     positive    
##  9 learning  4OZip0cgOho time      anticipation
## 10 learning  4OZip0cgOho lost      negative    
## # … with 12,369 more rows
# Overall tally of bing labels across all comments, largest count first.
summary_bing <- sentiment_bing %>%
  count(sentiment, sort = TRUE)
summary_bing
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 positive   1650
## 2 negative    888
# Same tally, now split by video type; the result stays grouped by videotype.
summary_bing <- count(group_by(sentiment_bing, videotype), sentiment)

summary_bing
## # A tibble: 6 × 3
## # Groups:   videotype [3]
##   videotype  sentiment     n
##   <chr>      <chr>     <int>
## 1 learning   negative    426
## 2 learning   positive    898
## 3 motivation negative     31
## 4 motivation positive     89
## 5 projects   negative    431
## 6 projects   positive    663
#untidy the data: one row per video type, one column per polarity
# pivot_wider() replaces the superseded spread(). names_sort keeps the
# alphabetical column order spread() produced, and a polarity that never
# occurs for a video type is reported as 0 instead of NA. Sorting the counts
# first is unnecessary because the reshape fixes the row order anyway.
summary_bing <- sentiment_bing %>%
  group_by(videotype) %>%
  count(sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = 0L, names_sort = TRUE)

summary_bing
## # A tibble: 3 × 3
## # Groups:   videotype [3]
##   videotype  negative positive
##   <chr>         <int>    <int>
## 1 learning        426      898
## 2 motivation       31       89
## 3 projects        431      663
# Net bing sentiment per video type: positive word count minus negative word
# count, labelled with the lexicon name in the first column.
# pivot_wider() replaces the superseded spread(). Filling absent polarities
# with 0 keeps the subtraction from turning into NA.
summary_bing <- sentiment_bing %>%
  group_by(videotype) %>%
  count(sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = 0L, names_sort = TRUE) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(lexicon = "bing") %>%
  relocate(lexicon)

summary_bing
## # A tibble: 3 × 5
## # Groups:   videotype [3]
##   lexicon videotype  negative positive sentiment
##   <chr>   <chr>         <int>    <int>     <int>
## 1 bing    learning        426      898       472
## 2 bing    motivation       31       89        58
## 3 bing    projects        431      663       232
sentiment_afinn %>% head()
## # A tibble: 6 × 4
##   videotype video_ID    word      value
##   <chr>     <chr>       <chr>     <dbl>
## 1 learning  4OZip0cgOho lost         -3
## 2 learning  4OZip0cgOho helpful       2
## 3 learning  4OZip0cgOho doubt        -1
## 4 learning  4OZip0cgOho improve       2
## 5 learning  4OZip0cgOho limited      -1
## 6 learning  4OZip0cgOho confident     2
# Net AFINN score per video type: add up the word values within each type and
# put the lexicon name in the first column.
summary_afinn <- sentiment_afinn %>%
  group_by(videotype) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN", .before = 1)

summary_afinn
## # A tibble: 3 × 3
##   lexicon videotype  sentiment
##   <chr>   <chr>          <dbl>
## 1 AFINN   learning        1192
## 2 AFINN   motivation       154
## 3 AFINN   projects         964
# Net Loughran sentiment per video type, in the same layout as the bing
# summary: count positive and negative words, put them side by side, and
# subtract. Other Loughran labels (e.g. uncertainty, constraining) are neither
# positive nor negative and are left out of the score.
summary_loughran <- sentiment_loughran %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(videotype, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = 0L, names_sort = TRUE) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(lexicon = "loughran") %>%
  relocate(lexicon)

summary_loughran
## (pasted output removed: it showed the unsummarised 1,270-row token table.
##  The result now has one row per videotype with the columns
##  lexicon, videotype, negative, positive, sentiment. Re-run to refresh.)
# Net NRC sentiment per video type, in the same layout as the bing summary.
# Only the positive/negative labels are scored; the emotion labels are
# dropped. The previous summarise(sentiment) returned one row per token
# instead of aggregating.
summary_nrc <- sentiment_nrc %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(videotype, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = 0L, names_sort = TRUE) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(lexicon = "NRC") %>%
  relocate(lexicon)

summary_nrc
## (pasted output removed: it showed 5,594 unaggregated token rows.
##  The result now has one row per videotype with the columns
##  lexicon, videotype, negative, positive, sentiment. Re-run to refresh.)
# Rebuild the AFINN token table in a single pipeline: tokenise, drop stop
# words and the literal token "amp", then keep the words AFINN scores.
sentiment_afinn <- youtubecomments %>%
  unnest_tokens(word, comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "amp") %>%
  inner_join(afinn, by = "word")

sentiment_afinn
## # A tibble: 2,263 × 4
##    videotype video_ID    word      value
##    <chr>     <chr>       <chr>     <dbl>
##  1 learning  4OZip0cgOho lost         -3
##  2 learning  4OZip0cgOho helpful       2
##  3 learning  4OZip0cgOho doubt        -1
##  4 learning  4OZip0cgOho improve       2
##  5 learning  4OZip0cgOho limited      -1
##  6 learning  4OZip0cgOho confident     2
##  7 learning  4OZip0cgOho nice          3
##  8 learning  4OZip0cgOho love          3
##  9 learning  4OZip0cgOho hope          2
## 10 learning  4OZip0cgOho helpful       2
## # … with 2,253 more rows
# Score every comment, not every video type. sentiment_afinn has no
# per-comment key, so the comments are numbered before tokenising and the
# AFINN values are summed within each comment. Summing per videotype (as
# before) left only three rows, so the "proportion of comments" chart below
# could never show more than a single slice.
afinn_score <- youtubecomments %>%
  mutate(comment_id = row_number()) %>%
  unnest_tokens(output = word,
                input = comments) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "amp") %>%
  inner_join(afinn, by = "word") %>%
  group_by(videotype, comment_id) %>%
  summarise(value = sum(value), .groups = "drop")

afinn_score
## (pasted output removed: it showed one total per videotype. The result now
##  has one row per scored comment with the columns videotype, comment_id,
##  value. Re-run to refresh.)
#Adding a flag for whether a comment is positive or negative
# Comments whose word values cancel out to exactly 0 are dropped.
afinn_sentiment <- afinn_score %>%
  filter(value != 0) %>%
  mutate(sentiment = if_else(value < 0, "negative", "positive"))

afinn_sentiment
## (pasted output removed: same rows as afinn_score without the zero-score
##  comments, plus the sentiment flag. Re-run to refresh.)
#plotting proportion of positive and negative comments
# Number of positive and negative comments for the learning videos only.
afinn_counts <- afinn_sentiment %>%
  group_by(videotype) %>%
  count(sentiment) %>%
  filter(videotype == "learning")

# A single stacked bar bent into polar coordinates gives a pie chart.
afinn_counts %>%
ggplot(aes(x="", y=n, fill=sentiment)) +
  geom_bar(width = .6, stat = "identity") +
  labs(title = "Learning Videos",
       subtitle = "Proportion of Positive & Negative Comments") +
  coord_polar(theta = "y") +
  theme_void()

# Positive/negative word counts per video type for each lexicon, each tagged
# with the lexicon name in `method` so the four tables can be stacked.

# AFINN has numeric values rather than labels: drop zero-valued words and
# derive the label from the sign.
summary_afinn2 <- sentiment_afinn %>%
  filter(value != 0) %>%
  mutate(sentiment = if_else(value < 0, "negative", "positive")) %>%
  group_by(videotype) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "AFINN")

summary_bing2 <-
  count(group_by(sentiment_bing, videotype), sentiment, sort = TRUE) %>%
  mutate(method = "bing")

# NRC and Loughran carry extra labels; only the two polarities are counted.
summary_nrc2 <- sentiment_nrc %>%
  group_by(videotype) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "nrc")

summary_loughran2 <- sentiment_loughran %>%
  group_by(videotype) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment, sort = TRUE) %>%
  mutate(method = "loughran")
# Stack the four per-lexicon count tables, move `method` to the first column
# and order the rows by lexicon, then video type.
summary_sentiment <- summary_afinn2 %>%
  bind_rows(summary_bing2, summary_nrc2, summary_loughran2) %>%
  relocate(method) %>%
  arrange(method, videotype)

summary_sentiment
## # A tibble: 24 × 4
## # Groups:   videotype [3]
##    method videotype  sentiment     n
##    <chr>  <chr>      <chr>     <int>
##  1 AFINN  learning   positive    858
##  2 AFINN  learning   negative    315
##  3 AFINN  motivation positive     85
##  4 AFINN  motivation negative     21
##  5 AFINN  projects   positive    685
##  6 AFINN  projects   negative    299
##  7 bing   learning   positive    898
##  8 bing   learning   negative    426
##  9 bing   motivation positive     89
## 10 bing   motivation negative     31
## # … with 14 more rows
# Total number of counted words per lexicon and video type. The grouping is
# dropped explicitly so summarise() does not leave an implicit grouping (and
# the accompanying message) behind.
total_counts <- summary_sentiment %>%
  group_by(method, videotype) %>%
  summarise(total = sum(n), .groups = "drop")
# Attach each total to its count rows. The keys are spelled out so the join
# does not depend on whichever columns the two tables happen to share.
sentiment_counts <- left_join(summary_sentiment, total_counts,
                              by = c("method", "videotype"))
sentiment_counts
## # A tibble: 24 × 5
## # Groups:   videotype [3]
##    method videotype  sentiment     n total
##    <chr>  <chr>      <chr>     <int> <int>
##  1 AFINN  learning   positive    858  1173
##  2 AFINN  learning   negative    315  1173
##  3 AFINN  motivation positive     85   106
##  4 AFINN  motivation negative     21   106
##  5 AFINN  projects   positive    685   984
##  6 AFINN  projects   negative    299   984
##  7 bing   learning   positive    898  1324
##  8 bing   learning   negative    426  1324
##  9 bing   motivation positive     89   120
## 10 bing   motivation negative     31   120
## # … with 14 more rows
#new column that expresses each count as a percentage of its total
sentiment_percents <- mutate(sentiment_counts, percent = n / total * 100)

sentiment_percents
## # A tibble: 24 × 6
## # Groups:   videotype [3]
##    method videotype  sentiment     n total percent
##    <chr>  <chr>      <chr>     <int> <int>   <dbl>
##  1 AFINN  learning   positive    858  1173    73.1
##  2 AFINN  learning   negative    315  1173    26.9
##  3 AFINN  motivation positive     85   106    80.2
##  4 AFINN  motivation negative     21   106    19.8
##  5 AFINN  projects   positive    685   984    69.6
##  6 AFINN  projects   negative    299   984    30.4
##  7 bing   learning   positive    898  1324    67.8
##  8 bing   learning   negative    426  1324    32.2
##  9 bing   motivation positive     89   120    74.2
## 10 bing   motivation negative     31   120    25.8
## # … with 14 more rows
# Horizontal stacked bars of the positive/negative share per video type, with
# one panel per lexicon stacked in a single column.
ggplot(sentiment_percents,
       aes(x = videotype, y = percent, fill = sentiment)) +
  geom_bar(stat = "identity", width = .8) +
  coord_flip() +
  facet_wrap(~method, ncol = 1) +
  labs(x = "Video Type",
       y = "Percentage of Words",
       title = "Public Sentiment on Data Science Youtube Videos", 
       subtitle = "From Learning, Motivation and Project based Videos")