Establishing a connection.
#install.packages("vosonSML")
library(vosonSML)
apikey <- "YOUR_API_KEY"  # actual key redacted; insert your own YouTube Data API key
youtubeAuth <- Authenticate("youtube", apiKey = apikey)
Collecting the links of all chosen YouTube videos and building a dataframe of comments for each video.
library(dplyr)
youtubeVideoIds <- GetYoutubeVideoIDs(c("https://youtu.be/exSKCByyLns", "https://youtu.be/w9It9PEK-kM", "https://youtu.be/Oe7U5rcaq1o", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/T80MK6EKJT4", "https://youtu.be/7h4ZeIxc1pI", "https://youtu.be/prb9d6fsnHE", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/b_1Hn6wwXpA", "https://youtu.be/ARUI9VhxNA0", "https://youtu.be/EZTByPrgfLI", "https://youtu.be/GlpiBkS4SJA", "https://youtu.be/Qaeqc5ikxeY", "https://youtu.be/v5uqya_nJj0", "https://youtu.be/nUdQIQWZGVA", "https://youtu.be/iT9QDUhS0dU", "https://youtu.be/1tE8HEgDmoc", "https://youtu.be/DIz0VX72-Bc", "https://youtu.be/Yl793FoyJJE", "https://youtu.be/uKby_huf-aA", "https://youtu.be/nCuZLsgPXuM", "https://youtu.be/52TT8o9vW8A", "https://youtu.be/QqZ0GLUF6yo", "https://youtu.be/plAdaVWzglc", "https://youtu.be/vCGuJ5z8LYY", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/7wwkItW2Gjw", "https://youtu.be/1VmVw0ALm_o", "https://youtu.be/Lik5G-gFQk4", "https://youtu.be/5YhoSIH3lPQ", "https://youtu.be/kIHTQlfBOJo", "https://youtu.be/F8ECpUKi5r4", "https://youtu.be/-7RqgG_NsI4", "https://youtu.be/Lls-ch1k7xg", "https://youtu.be/JSR1rwjZlX4", "https://youtu.be/xhFS89hgp_c", "https://youtu.be/36ayp2jckS0", "https://youtu.be/kk4aEC-YppA", "https://youtu.be/hRTIBqaOlus", "https://youtu.be/_NXEMh-Fxf4", "https://youtu.be/LTh0DnN6FDE", "https://youtu.be/ACwKesRVfUk", "https://youtu.be/xJpYCte7JcE", "https://youtu.be/e-henB30X4c", "https://youtu.be/DIaw4UV1nA4", "https://youtu.be/1AX_u2poAN4", "https://youtu.be/ONEtXBfx7oo", "https://youtu.be/r2FGVMvbWwE", "https://youtu.be/BRE8ibnX4MQ", "https://youtu.be/a_HkKhyPaSM", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/IjpWeQUqMWM", "https://youtu.be/rwowDSLDzFI", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/065kBCEtPoU", "https://youtu.be/NANnST5vxCo", "https://youtu.be/VJptzBWLCLo", "https://youtu.be/PE416IH8wws", "https://youtu.be/M4ZUU553AEA", "https://youtu.be/FZo2_AXGcD4", "https://youtu.be/uEW6CWgAMMM", "https://youtu.be/6fLBa58QgFU", "https://youtu.be/JaaZuyzNUi4", "https://youtu.be/CMr3q8u1ymg", "https://youtu.be/32ZMAZr1z_E", "https://youtu.be/wDWt3Ss9rNs", "https://youtu.be/ta6iiWcT_W0", "https://youtu.be/EJJrDbAmHOU", "https://youtu.be/aNKJpR0bBEE", "https://youtu.be/LqVY71LEAPI", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/g-jTu62gDr0", "https://youtu.be/EpVzqul5Arg", "https://youtu.be/P1nI5jnVaQM", "https://youtu.be/S2JdrBvVGSo", "https://youtu.be/yKRX2gNjtv4", "https://youtu.be/rPXB_uF-2cI", "https://youtu.be/U70aqExQkUA", "https://youtu.be/JJ0LQsAvnB8", "https://youtu.be/XnF8husX3Ew", "https://youtu.be/o3zNPtYlGSQ", "https://youtu.be/r-WbRBmURf4", "https://youtu.be/wG-DbNbMBkw", "https://youtu.be/aSepCt7Dx-s", "https://youtu.be/0Tz-RF9IxJE", "https://youtu.be/ErlV9j2BmyU", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/NANnST5vxCo", "https://youtu.be/UF8HZRk7DaY", "https://youtu.be/xIrvh6BNkh8", "https://youtu.be/SkAZdW_GQ0s", "https://youtu.be/R_P8X40oNvA", "https://youtu.be/MtInFBn33M0", "https://youtu.be/xP3XMDJMT-k", "https://youtu.be/ZJiYz2UYSXI", "https://youtu.be/3Af4st1kRqA", "https://youtu.be/krHqPzS3nVo", "https://youtu.be/01SdKgQePsI", "https://youtu.be/oo46uKNP3gc"))
youtubeData <- youtubeAuth %>%
  Collect(videoIDs = youtubeVideoIds,
          maxComments = 3000,
          verbose = FALSE)
Collecting the links of each YouTube channel and each video.
link <- c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
youtubevideo <- data.frame(link)
id <- c("UCpm6vCgiehSKHtMFFS5ukiA", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCQzIUO1m_Z--Qulo7IQKHvg", "UCR-j4NxVnwhfbZiClmOMibA", "UCWI528UIXxO7Kl-6c0he5Cg", "UCMLqsgXvM6Vs0aiegOdrtLQ", "UCLuuuRofmjK-ypp5Hftw7tw", "UCPW0TwizKn5bCr-8PcrP8MA", "UCz8LT_kp2rIs0CMv2AUivNg", "UCoF61Ju7c9lMJpRPhQ7PBxg", "UCQaA1h-xc_nDcuEdZIbT2qg", "UCOL8_f6Rjv338p8aY0C2WvQ", "UC643CRm6UKUl1HbdCD9Kidw", "UCndyQS3Q8UNVTFYbqEO22gg", "UCvcipRqgQ8DqI1wTJka2_dQ", "UCyLCXE5tnJGdyImQlfxucfw", "UCvd_X-p3MEFQKoDGQjb3OiQ", "UCngo17sgSHQuUaHNd9lYIMg", "UCfDJfdU0aCiMTluHXKDahYg", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCLF8x8A6v05Ic_vwlRjPTRA", "UCEMyt3qCrwa4wETvsheB95w", "UCfC2j7U61CLa-6kQFcb2mGg", "UC4N283P0CQMu3a8e3yKMABQ", "UClUnHqJSCpMU1V12LIq_9ug", "UCNQsav6N0dp7ApDOkVwPqrA", "UCZ17fVB7aFpWNsK9fHclJuA", "UC25wAC_5htqBdaXNC6O93jQ", "UCvNeBu5uPbxv7FbBBCi317g", "UCxz6gPUpL6l5WSTUObsH9MA", "UCt8YxNuB2b-gSUX8OHmfjTA", "UCn9N1nnsX0xtl6NgZIRnhaw", "UCuZRVn4d0BVTA-DIT-PkzAA", "UCZVvxHrmbxsfkWm-g2Ol0eA", "UCYq3aZxqNWh3qoZN-ky4QJw", "UCo5q67UHoJfrJ2-l1enH4WA", "UCSZ3D8HTjvg53fsTuI2NdVw", "UCKv4EjBucT40QPMnDRMFb2w", "UC6U7Dj_6gkQ0KJEuVUuSwkA", "UCsqfijrYd-p6eFoB9cBEWxA", "UCmKrv7DPHi20M_0enX6_MKA", "UCeTpKZFMQAa73F9Y0CFivDQ", "UCJ-qYJvBz_KcgrvpZxWITyQ", "UCWPKFs6u4_wS9AXZzeZwBlQ", "UCWI528UIXxO7Kl-6c0he5Cg", "UCeikULWFtSVQhudNbF5urRw", "UCp300rt4ULrhF44D9woCSOw", "UCpwFVuATxG2m4A4vSqWCvDA", "UCwyK7L9OBSyoNbE4vtaNd6g", "UCngo17sgSHQuUaHNd9lYIMg", "UCb8xXRRUvFvCZGWMX1y_pjg", "UC_bd3osClZOTuEh6WPlv-6g", "UCsGBpBK9AjGyYeJb1SzG2-g", "UCgjZ1k7hNgrPmOv4WK6q09g", "UC0V_cBwQGN5QxOJ6u7fgOtQ", "UCuZRVn4d0BVTA-DIT-PkzAA", "UC-iUoRqQDcWfAovTsbrk_GA", "UCCFfoTZvZ9EXCX32sdEPhmw", "UCNU9oR7HzWD02LShddSOoeg", "UChBKPr6rRi2HY5d92sUxh2A", "UCHRWh66zFu6-SxsQq2Mv-_g", "UCKmxzTroWG1RJeAR8JiehRQ", "UClTqP6q-NeCZbDqsQJa5bUg", "UCo5q67UHoJfrJ2-l1enH4WA", "UCf0s3NCSzyvGpFkNuTNL2EA", "UC5ilFCORThdWlrhCER_fjfA", "UCYYdahdZ7J6rkiLpz97Nc-Q", "UCTsNlOo-TcXVs8I-pKOxb2A", "UCM5aAMJrEaY1iTxlqPOe8Rg", "UCBzoZTFClGzOq1x2wFFrURg", "UCiiAIfdVLaJsD1vEiMclBoA", "UC5pFR27s1fTBocMKSwM_Svw")
channelid <- data.frame(id)
Making two functions for getting data about each YouTube channel and each video.
API_key <- apikey  # the same (redacted) YouTube Data API key as above
getstats_video <- function(video_id, API_key){
  url <- paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=", video_id, "&key=", API_key)
  result <- fromJSON(txt = url)
  # One row per video: channel name, statistics (views, likes, dislikes,
  # comments), title, publication date and description
  return(data.frame(name = result$items$snippet$channelTitle,
                    result$items$statistics,
                    title = result$items$snippet$title,
                    date = result$items$snippet$publishedAt,
                    descrip = result$items$snippet$description))
}
get_playlist_canal <- function(id, API_key, topn = 15){
  # Fetch up to `topn` items from a channel's playlist
  url <- paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=', id, '&key=', API_key, '&maxResults=', topn)
  result <- fromJSON(txt = url)
  return(data.frame(result$items$contentDetails))
}
getstats_canal <- function(id, API_key){
  # Channel-level statistics plus the ids of the channel's related playlists
  url <- paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=', id, '&key=', API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$title,
                    result$items$statistics,
                    pl_list_id = result$items$contentDetails$relatedPlaylists))
}
getall_channels <- function(ids, API_key, topn = 5){
  # For each channel id: list its `topn` playlist items, then fetch
  # per-video statistics and bind everything into one dataframe
  videos <- lapply(ids, FUN = get_playlist_canal, API_key = API_key, topn = topn) %>% bind_rows()
  stats <- lapply(videos[, 1], FUN = getstats_video, API_key = API_key)
  stats <- bind_rows(stats)
  stats$vid_id <- videos[, 1]
  return(stats)
}
Getting information about each YouTube channel and each video into tables.
require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)
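As a quick sanity check, each helper can first be tried on a single ID before the batch calls (a sketch; the IDs are simply the first entries of the lists above).
one_video <- getstats_video("exSKCByyLns", API_key)                 # stats for one video
one_channel <- getstats_canal("UCpm6vCgiehSKHtMFFS5ukiA", API_key)  # stats for one channel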
can_st <- lapply(youtubevideo$link, FUN = getstats_video, API_key = API_key)
can_st <- bind_rows(can_st)
can_sa <- lapply(channelid$id, FUN = getstats_canal, API_key = API_key)
can_sa <- bind_rows(can_sa)
Renaming the tables for readability.
channels <- can_sa
videos <- can_st
comments <- youtubeData
Saving the data tables.
write.csv(channels, file = "channels.csv")
write.csv(videos, file = "videos.csv")
write.csv(comments, file = "comments.csv")
Opening the tables.
comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
videos <- read.csv("videos.csv", encoding="IBM866", stringsAsFactors=FALSE)
channel <- read.csv("channels.csv", encoding="IBM866", stringsAsFactors=FALSE)
Statistics on YouTube videos.
videos$viewCount = as.numeric(videos$viewCount)
videos$likeCount = as.numeric(videos$likeCount)
videos$dislikeCount = as.numeric(videos$dislikeCount)
videos$commentCount = as.numeric(videos$commentCount)
videos2 = videos
library(boot)
library(htmlTable)
library(dplyr)
library(table1)
label(videos2$viewCount) <- "Number of views"
label(videos2$likeCount) <- "Number of likes"
label(videos2$dislikeCount) <- "Number of dislikes"
label(videos2$commentCount) <- "Number of comments"
table1(~ viewCount + likeCount + dislikeCount + commentCount, data=videos2, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                    | Total (N=100)        |
|--------------------|----------------------|
| Number of views    |                      |
| Mean (SD)          | 95900 (139000)       |
| Median [Min, Max]  | 44900 [2430, 731000] |
| Number of likes    |                      |
| Mean (SD)          | 5070 (6790)          |
| Median [Min, Max]  | 2870 [201, 35700]    |
| Missing            | 2 (2.0%)             |
| Number of dislikes |                      |
| Mean (SD)          | 172 (201)            |
| Median [Min, Max]  | 81.0 [1.00, 874]     |
| Missing            | 2 (2.0%)             |
| Number of comments |                      |
| Mean (SD)          | 486 (640)            |
| Median [Min, Max]  | 261 [28.0, 4020]     |
In the table above, you can see the main statistics collected for all the videos. On average, the selected vlogs gain 95,900 views, with a minimum of 2,430 and a maximum of 731,000. Videos with more than one million views (where they existed) were deliberately not selected, to avoid a bias towards popular videos; such videos would otherwise be outliers. Likes and dislikes each have 2% missing data, meaning that no user rated those videos; this is typical of less popular videos. In general, videos tend to receive more likes than dislikes, which is discussed later. On average, each video has 486 comments, which makes it possible to evaluate the distribution of positive and negative reviews in each one. There are no videos without comments (no missing data), since only videos with more than 50 comments were manually selected (some vlogs had 5-10 comments, which could later distort the results of the text analysis).
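For reproducibility, the same screen could also be expressed in code (a hypothetical sketch; the actual selection was done manually).
videos_screened <- videos %>%
  filter(viewCount < 1e6,    # exclude videos with 1M+ views
         commentCount > 50)  # keep only videos with more than 50 comments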
Statistics on YouTube channels.
channel$viewCount = as.numeric(channel$viewCount)
channel$subscriberCount = as.numeric(channel$subscriberCount)
channel$videoCount = as.numeric(channel$videoCount)
label(channel$viewCount) <- "Number of views"
label(channel$subscriberCount) <- "Number of subscribers"
label(channel$videoCount) <- "Number of videos"
table1(~ viewCount + subscriberCount + videoCount, data=channel, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                       | Total (N=72)               |
|-----------------------|----------------------------|
| Number of views       |                            |
| Mean (SD)             | 76500000 (173000000)       |
| Median [Min, Max]     | 20300000 [8680, 981000000] |
| Number of subscribers |                            |
| Mean (SD)             | 438000 (942000)            |
| Median [Min, Max]     | 142000 [729, 6750000]      |
| Missing               | 1 (1.4%)                   |
| Number of videos      |                            |
| Mean (SD)             | 451 (480)                  |
| Median [Min, Max]     | 317 [6.00, 2790]           |
Among the selected videos, some authors appeared 2-3 times (occasionally several videos meeting the selection criteria came from the same author; such videos were not excluded), so there are 72 unique authors. The total number of views is not a particularly informative indicator, but the number of subscribers does measure a channel's popularity. On average, the selected channels have 438,000 subscribers; both large channels (more than a million subscribers) and very small ones are represented.
Statistics on YouTube comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
clean <- final
library(dplyr)
class(clean) <- 'data.frame'
clean$Comment = as.character(clean$Comment)
clean <- clean %>%
mutate(new_col = sapply(strsplit(clean$Comment, " "), length))
label(clean$ReplyCount) <- "Number of replies"
label(clean$LikeCount) <- "Number of likes"
label(clean$new_col) <- "Length of the comment"
table1(~ ReplyCount + LikeCount + new_col, data=clean, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                       | Total (N=44211) |
|-----------------------|-----------------|
| Number of replies     |                 |
| Mean (SD)             | 0.292 (1.77)    |
| Median [Min, Max]     | 0 [0, 151]      |
| Number of likes       |                 |
| Mean (SD)             | 7.20 (79.4)     |
| Median [Min, Max]     | 0 [0, 5980]     |
| Length of the comment |                 |
| Mean (SD)             | 11.3 (14.7)     |
| Median [Min, Max]     | 7.00 [0, 489]   |
The table shows that users do not actively respond to each other's comments, although one comment received 151 replies. On average, a comment gets 7 likes. Comment length varies from 0 to 489 words, with an average of 11 words, so comments are usually short. The single comment with zero length will be removed later.
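That removal can be done with a simple filter (a sketch, assuming new_col holds the word counts computed above).
clean <- clean %>% filter(new_col > 0)  # drop the single zero-length comment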
Statistics on YouTube comments per user.
clean1 <- clean[, c(1,3,4,12)]
clean1$Comment = as.factor(clean1$Comment)
clean1 <- clean1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
clean1$AuthorDisplayName = as.character(clean1$AuthorDisplayName)
clean2 = clean1
new_df <- clean2[-which(duplicated(clean2$AuthorDisplayName)), ]
label(new_df$count) <- "Number of comments per user"
table1(~ count, data=new_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                             | Total (N=28189)  |
|-----------------------------|------------------|
| Number of comments per user |                  |
| Mean (SD)                   | 1.57 (4.08)      |
| Median [Min, Max]           | 1.00 [1.00, 348] |
As we can see, there are only 28,189 unique users, although there are almost twice as many comments, so some users comment more than once. On average, each person leaves one comment, but the maximum number of comments from a single user is 348. This is likely related to the repeated video authors: they may have regular subscribers who support the channel and comment often.
Loading required packages.
#install.packages("corpus")
library(tidyverse)
library(tidytext)
library(stringr)
library(corpus)
Loading data.
df <- read.csv(file = 'comments.csv')
comments <- df[, 2]
# Also load the dictionary of sentiment-labelled words for Russian,
# on the basis of which the comments will be 'coloured'
russian_dictionary <- read.csv(file = 'russian.csv', encoding='UTF-8')
# Adding pairs to the dictionary to 'classify' emojis
# To do this, download the source dictionary
emo_dictionary <- read.csv(file = 'emo.csv', encoding='UTF-8')
# Rename the columns we need
# to match the format of 'russian.csv'
names(emo_dictionary)[names(emo_dictionary) == 'Sentiment.score..1....1.'] <- 'score'
names(emo_dictionary)[names(emo_dictionary) == 'Char'] <- 'word'
# Select only the two columns we need - 'word' and 'score'
emo_dictionary <- emo_dictionary[, c(2, 10)]
emo_dictionary$word <- gsub('<U\\+', '', emo_dictionary$word)
emo_dictionary$word <- gsub('>', '', emo_dictionary$word)
russian_dictionary$word <- gsub('<U\\+', '', russian_dictionary$word)
russian_dictionary$word <- gsub('>', '', russian_dictionary$word)
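To illustrate the cleanup above (a sketch with one emoji code): the escaped representation is reduced to the bare code point.
gsub('>', '', gsub('<U\\+', '', '<U+0001F44D>'))  # yields "0001F44D"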
emo_dictionary <- lapply(emo_dictionary, iconv, "UTF-8", "UTF-8",sub='00000')
emo_dictionary <- as.data.frame(emo_dictionary)
ps <- data.frame(names=emo_dictionary$word, chr=apply(emo_dictionary,2,nchar)[,1])
emo_dictionary$size <- ps$chr
emo_dictionary <- emo_dictionary[(emo_dictionary$size == 8),]
emo_dictionary$size <- NULL
# Append the emoji dictionary to the Russian one
russian_dictionary <- rbind(russian_dictionary, emo_dictionary)
Tokenization
tokens <- tibble(line = 1:length(comments), text = as.character(comments)) %>% unnest_tokens(word, text)
Stemming
tokens$word <- text_tokens(tokens$word, stemmer = 'rus')
russian_dictionary$word <- text_tokens(russian_dictionary$word, stemmer = 'rus')
Performing sentiment analysis: getting the emotional coloring of each comment as a number.
# In this case, we use the dictionary to calculate the number of words
# With some emotional connotation in each comment
# And then we average the numerical values of the found words from the source dictionary for each comment
result <- tokens %>%
  inner_join(russian_dictionary, by = 'word') %>%
  group_by(line)
result$score <- as.numeric(result$score)
result <- result %>% group_by(line) %>%
  summarise(sentiment = mean(score))
Creating the resulting dataset. After "coloring", each comment comes out as "bad" or "good"; comments in which not a single word from the dictionary was found are treated as "neutral".
# First, for each comment we get the resulting sentiment value:
# a number that characterizes its polarity
df <- rename(df, line = X)
final_df <- df %>%
left_join(result, by = 'line')
final_df[is.na(final_df)] <- 0
# Then we label positive sentiment values as positive comments and
# negative values as negative comments; zero is treated as neutral.
map_sentiment_to_label <- function(row) {
  # Column 14 holds the sentiment score computed above
  if (as.numeric(row[14]) > 0) {
    return('positive')
  } else if (as.numeric(row[14]) < 0) {
    return('negative')
  } else {
    return('neutral')
  }
}
final_df$label <- apply(final_df, 1, map_sentiment_to_label)
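An equivalent and somewhat more robust formulation (a sketch) references the sentiment column by name rather than by position:
final_df <- final_df %>%
  mutate(label = case_when(
    sentiment > 0 ~ 'positive',
    sentiment < 0 ~ 'negative',
    TRUE ~ 'neutral'
  ))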
# Saving data
write.csv(final_df, 'final_df.csv')
Viewing the results. Let us take a few comments and see how the algorithm labelled them.
COUNT <- 3
print('[Positive comments]:')
## [1] "[Positive comments]:"
paste(final_df[final_df$label == 'positive', ][, 2][100:(101+COUNT)])
## [1] "За 7 миллионов намного круче и современней"
## [2] "Это просто готовый бизнес. Гостиница для отдыхающих."
## [3] "Выпей аспирин,он убирет немного боли. И 2 дня отдыхай. А то нарвешь мускулы. Осторожненька,Катя"
## [4] "Дом, в котором сейчас вы живёте намного лучше этих за 5 и 7 миллионов"
## [5] "Второй дом <U+0001F44D><U+0001F44D><U+0001F44D>"
print('')
## [1] ""
print('[Negative comments]:')
## [1] "[Negative comments]:"
paste(final_df[final_df$label == 'negative', ][, 2][100:(101+COUNT)])
## [1] "У Ванички прям голос прорезался если он что то хочет <U+0001F92D>очень его балуют не есть хорошо"
## [2] "Катюша лук не обрезай сажай как есть"
## [3] "Первый дом на скале- это не для проживания. Это больше подходит под гостиницу."
## [4] "Ребята, ну тут только вид и расположение чего стоят. А выбирать по принципу красивой мебели внутри...можно выкинуть старую и купить свою новую. В том доме просто определенный стиль, который вам не близок и это не значит, что он плохой:))"
## [5] "Вид конечно огонь <U+0001F525><U+0001F44D>не понимаю зачем вам 3 им большой дом<U+0001F937><U+200D>+<U+FE0F>"
print('')
## [1] ""
print('[Neutral comments]:')
## [1] "[Neutral comments]:"
paste(final_df[final_df$label == 'neutral', ][, 2][100:(101+COUNT)])
## [1] "*очень дорого*"
## [2] "Да, полностью согласна"
## [3] "5 миллионов\210 это в российских рублях около 450 миллионов"
## [4] "*скорее всего так*"
## [5] "*очень дорого! очень!*"
It is noticeable that the algorithm treats words such as "very" as markers of positive comments. Some comments carry a double meaning ("I like it, but…", etc.), and the algorithm cannot capture such subtleties. Overall, positive comments were for the most part classified correctly.
Comments that contained no emotionally coloured words, or that simply posed a question to the author of the video, were correctly classified as neutral.
Negative comments were identified more reliably than positive ones: while some positive comments were occasionally marked as negative, negative comments were not marked as positive. Above are five examples of negative comments that were identified correctly.
Statistics on sentiment of the comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
final_df <- final[, c(3,16)]
final_df$label = as.factor(final_df$label)
label(final_df$label) <- "Sentiment"
table1(~ label, data=final_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Total (N=44211) |
|-----------|-----------------|
| Sentiment |                 |
| negative  | 11416 (25.8%)   |
| neutral   | 10763 (24.3%)   |
| positive  | 22032 (49.8%)   |
There are more positive comments than neutral or negative ones. However, the method is not perfect and cannot classify every comment correctly, because of users' grammatical errors and other subtleties of the Russian language. Initially only 7,654 comments were classified as negative, but that version of the algorithm ignored the negation particle ("не") before positive words; after the algorithm was refined to account for it, the results changed. It is still worth noting that the algorithm cannot detect sarcasm or other features of Russian speech.
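One minimal way to handle such negation (a sketch of the general idea, not the exact refinement used) is to flip the dictionary score of a word whenever it is preceded by "не", using bigrams and a non-stemmed copy of the dictionary.
bigrams <- tibble(line = 1:length(comments), text = as.character(comments)) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
negated <- bigrams %>%
  filter(word1 == "не") %>%
  inner_join(russian_dictionary, by = c("word2" = "word")) %>%
  mutate(score = -as.numeric(score))  # invert the polarity after negation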
Counting the number of words in each name.
library(dplyr)
#comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
comments <- final
comments <- comments[!grepl("спам",comments$Comment),]
comments <- comments[-c(35669), ]
class(comments) <- 'data.frame'
comments$AuthorDisplayName = as.character(comments$AuthorDisplayName)
comments <- comments %>%
mutate(new_col = sapply(strsplit(comments$AuthorDisplayName, " "), length))
#comments$new_col = ifelse(comments$"new_col">2,"Anonymous","Full name")
Looking at the capital letters in the first names.
comments_1 <- comments[, c(3, 4, 5, 8, 9, 14, 16, 17)]
#comments_1 <- comments[, c(3, 14)]
library(stringr)
comments_1<- comments_1 %>%
  mutate(letter = str_detect(comments_1$AuthorDisplayName,"[[:upper:]]"))
Determining if a string has a dot.
comments_1 <- comments_1 %>%
mutate(dot = letter)
comments_1$dot = str_detect(comments_1$AuthorDisplayName, "\\.")
Searching for duplicated words in names (e.g. Lena Lena).
comments_1 <- comments_1 %>%
mutate(NameUnique = AuthorDisplayName)
comments_1$NameUnique <- sapply(comments_1$NameUnique, function(x) paste(unique(unlist(str_split(x," "))), collapse = " "))
Counting the number of words in names again, based on the previous step.
comments_1$new_col <- sapply(strsplit(comments_1$NameUnique, " "), length)
Dividing names into separate columns to count the number of letters in the first and last name.
comments_1 = comments_1 %>%
mutate(DividedName = AuthorDisplayName)
out <- strsplit(as.character(comments_1$DividedName),' ')
comments_1 <- data.frame(comments_1, do.call(rbind, out))
comments_1 <- comments_1[, c(1:14)]
Counting the number of letters in each string.
comments_1 <- comments_1 %>%
mutate(count = str_count(comments_1$X1, ""))
comments_1 <- comments_1 %>%
  mutate(count1 = str_count(comments_1$X2, ""))
Looking for names containing symbols, numbers and other special characters.
comments_1 <- comments_1 %>%
mutate(unrecog = str_detect(comments_1$AuthorDisplayName, "\\_|\\*|\\-|\\#|\\[|\\]|\\>|\\<|\\?"))
comments_1 <- comments_1 %>%
  mutate(numbers = grepl("[0-9]", comments_1$AuthorDisplayName))
Looking for names that have capital letters in the first and last name.
library(stringr)
comments_1<- comments_1 %>%
mutate(letter1 = str_detect(comments_1$X1,"[[:upper:]]"))
comments_1<- comments_1 %>%
  mutate(letter2 = str_detect(comments_1$X2,"[[:upper:]]"))
Making a condition for identifying anonymous users.
comments_1 <- comments_1 %>%
mutate(Anonymity = AuthorDisplayName)
# A name counts as anonymous if it is not exactly two capitalised words,
# if either part is very short, or if it contains dots, digits or special characters
comments_1$Anonymity = ifelse((comments_1$new_col > 2) | (comments_1$new_col == 1) |
                                (comments_1$letter1 == "FALSE") | (comments_1$letter2 == "FALSE") |
                                (comments_1$dot == "TRUE") |
                                (comments_1$count < 3) | (comments_1$count1 < 3) |
                                (comments_1$unrecog == "TRUE") | (comments_1$numbers == "TRUE"),
                              "Anonymous", "Full name")
anonymous <- comments_1
anonymous <- anonymous[, c(1,2,3,4,5,6,7,21)]
write.csv(anonymous, file = "anonymity.csv")
Statistics based on names.
anon <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
anon$Anonymity = as.factor(anon$Anonymity)
label(anon$Anonymity) <- "Name"
table1(~Anonymity, data = anon, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Total (N=44075) |
|-----------|-----------------|
| Name      |                 |
| Anonymous | 16380 (37.2%)   |
| Full name | 27695 (62.8%)   |
After deleting empty observations and spam, 44,075 unique comments remain. Overall, the data contain more non-anonymous comments (62.8%) than anonymous ones (37.2%). Note, however, that the number of unique comments does not reflect the number of unique users, since some users left several messages.
Identifying unique users.
anon1 <- anon
anon1 <- anon1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
anon1$AuthorDisplayName = as.character(anon1$AuthorDisplayName)
anon2 = anon1
new_df2 <- anon2[-which(duplicated(anon2$AuthorDisplayName)), ]
label(new_df2$count) <- "Authors of comments"
table1(~ Anonymity, data=new_df2, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Overall (N=28186) |
|-----------|-------------------|
| Name      |                   |
| Anonymous | 10092 (35.8%)     |
| Full name | 18094 (64.2%)     |
The table shows the distribution of anonymous and non-anonymous users. Notably, among all unique users (N=28186), the split between anonymous users and users with a full name (35.8% and 64.2%, respectively) is almost the same as the split over the total number of comments.
anonymity <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
# Create the barplot
library(ggplot2)
ggplot(data=anonymity, aes(x=label, fill=Anonymity)) +
  geom_bar()+
  theme_minimal()
label(anonymity$label) <- "Comment"
table1(~label|Anonymity, data = anonymity, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|          | Anonymous (N=16380) | Full name (N=27695) | Overall (N=44075) |
|----------|---------------------|---------------------|-------------------|
| Comment  |                     |                     |                   |
| negative | 4346 (26.5%)        | 7067 (25.5%)        | 11413 (25.9%)     |
| neutral  | 4213 (25.7%)        | 6547 (23.6%)        | 10760 (24.4%)     |
| positive | 7821 (47.7%)        | 14081 (50.8%)       | 21902 (49.7%)     |
With sentiment and anonymity both determined, we can look at the distribution of comment sentiment by user name type. Judging by the percentages, negative comments are somewhat more common among anonymous users, while positive comments are more often published by users who give a first and last name.
library(dplyr)
popularity <- videos %>%
mutate(VideoID = c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
)
write.csv(popularity, "popularity.csv")
Loading libraries.
library(readxl)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(stringr)
library(tidyr)
Loading data.
anonymityfinal <- read_excel("anonymityfinal1.xlsx")
popularity <- read.csv(file="popularity.csv")
Checking for duplicate records.
paste("Number of videos:", length(popularity$VideoID))## [1] "Number of videos: 100"
paste("Number of unique videos:", length(unique(popularity$VideoID)))## [1] "Number of unique videos: 95"
Selecting duplicates and deleting them from the data
duplicates <- subset(popularity,duplicated(popularity$VideoID))
popularity <- popularity[!(popularity$X %in% duplicates$X),]
Creating a merged dataset. At this stage, a dataset is created by attaching information about each video to its comments; note that with all = TRUE, merge() performs a full outer join rather than an inner join.
df <- merge(x = anonymityfinal, y = popularity, by = "VideoID", all = TRUE)
Checking relationships. Before conducting the tests themselves, we agree to use a significance level of 0.05.
Let us assume that anonymity lets users "feel free to act", so there should be noticeably more negative comments from anonymous users than from non-anonymous ones.
H0: there is no relationship between comment sentiment and anonymity. H1: such a relationship exists.
Graphical representation of the distribution
df %>%
  ggplot(aes(label, fill = Anonymity))+
  geom_bar(position="fill") + labs(title="Label distribution across anonymity") + theme_bw()
We see that there are fewer anonymous comments than non-anonymous ones, but within each group the distribution of sentiment looks roughly the same: positive comments are the most frequent, while negative and neutral comments occur at about the same rate.
Contingency table
selected_table_df <- df[, c(7, 8)]
knitr::kable(table(selected_table_df))

|          | Anonymous | Full name |
|----------|-----------|-----------|
| negative | 4357      | 7006      |
| neutral  | 4255      | 6554      |
| positive | 7847      | 14056     |
Contingency table (as a percentage)
knitr::kable(prop.table(table(selected_table_df)))

|          | Anonymous | Full name |
|----------|-----------|-----------|
| negative | 0.0988542 | 0.1589563 |
| neutral  | 0.0965400 | 0.1487011 |
| positive | 0.1780374 | 0.3189109 |
The tables support the visual impression: positive comments dominate, and most of them come from non-anonymous users.
Chi-square test
x_test <- chisq.test(table(selected_table_df))
x_test
##
##  Pearson's Chi-squared test
##
## data:  table(selected_table_df)
## X-squared = 45.296, df = 2, p-value = 1.459e-10
In our case the variables are statistically significantly related: the p-value is close to zero, far below the chosen significance level, so we reject the null hypothesis of independence and accept the alternative.
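To see which cells drive the association, the standardized residuals of the test can be inspected (a sketch; values beyond roughly ±2 indicate cells deviating from independence):
x_test$stdres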
Observed frequencies
x_test$observed
##            Anonymity
## label       Anonymous Full name
##   negative       4357      7006
##   neutral        4255      6554
##   positive       7847     14056
Conclusions: the commenter's anonymity is statistically associated with the type of comment, but the effect is small: with a sample this large even minor differences in proportions reach significance, and non-anonymous comments are simply more numerous (so a positive comment most likely comes from a user who did not hide their name). The distribution of comment types within each group is roughly the same, so our initial assumption turned out to be incorrect.
We assume that long comments are typically left by anonymous users, since anonymity may "liberate" a person, allowing them to write more and in more detail.
df$CommentLength <- str_count(df$Comment)
df <- df %>% drop_na()
Getting average values.
result_anonymity <- df %>%
group_by(Anonymity) %>%
  dplyr::summarize(Mean = mean(CommentLength, na.rm=TRUE))
Logistic regression model.
df$Anonymity <- as.factor(df$Anonymity)
model <- glm(Anonymity ~ CommentLength, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Anonymity ~ CommentLength, family = "binomial",
##     data = df)
##
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max
## -1.4762 -1.4018  0.9672  0.9684  0.9691
##
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)   5.120e-01  1.216e-02  42.119   <2e-16 ***
## CommentLength 4.298e-05  8.481e-05   0.507    0.612
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 58073  on 43928  degrees of freedom
## Residual deviance: 58073  on 43927  degrees of freedom
## AIC: 58077
##
## Number of Fisher Scoring iterations: 4
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "Full name", "Anonymous")
paste("Accuracy:", round(mean(predicted.classes == df$Anonymity), 2)*100, "%")## [1] "Accuracy: 63 %"
Running the one-way ANOVA test
summary(aov(CommentLength ~ Anonymity, data = df))
##                Df    Sum Sq Mean Sq F value Pr(>F)
## Anonymity       1      3519    3519   0.257  0.612
## Residuals   43927 601552646   13694
Since the p-value (0.612) is well above the 0.05 significance level, we conclude that there is no statistically significant difference in mean comment length between anonymous and non-anonymous users.
Conclusions: the model predicts the user's anonymity type from comment length with 63% accuracy on the training data, which is modest. Moreover, the statistical test shows that the mean comment lengths of anonymous and non-anonymous users do not differ significantly. We can therefore conclude that anonymity does not affect the number of characters in a comment, and our initial assumption was not confirmed.
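For context, 63% is essentially the majority-class baseline: since the CommentLength coefficient is negligible, the model predicts "Full name" for almost everyone, and always doing so achieves about the same accuracy (a sketch).
paste("Baseline accuracy:", round(mean(df$Anonymity == "Full name"), 2)*100, "%")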
Difference between the average number of likes of anonymous / non-anonymous comments
In order to test the hypothesis, the average number of likes for anonymous and non-anonymous comments was first obtained.
anony = anon
library(tidyverse)
library(ggpubr)
library(rstatix)
anon1 <- anon %>% filter(Anonymity == "Full name")
anon2 <- anon %>% filter(Anonymity == "Anonymous")
mean(anon1$LikeCount)
## [1] 6.56855
mean(anon2$LikeCount)
## [1] 8.33956
Statistics show that, on average, anonymous comments collect about 8 likes while non-anonymous comments collect about 6.5. Despite large outliers (comments under popular videos can gather more than 1,000 likes), the average number of likes appears to differ by anonymity.
res.aov <- anony %>% anova_test(LikeCount ~ Anonymity)
res.aov

| Effect    | DFn | DFd   | F     | p     | p<.05 | ges      |
|-----------|-----|-------|-------|-------|-------|----------|
| Anonymity | 1   | 44073 | 5.106 | 0.024 | *     | 0.000116 |
pwc <- anony %>% tukey_hsd(LikeCount ~ Anonymity)
pwc

| term      | group1    | group2    | null.value | estimate | conf.low  | conf.high  | p.adj  | p.adj.signif |
|-----------|-----------|-----------|------------|----------|-----------|------------|--------|--------------|
| Anonymity | Anonymous | Full name | 0          | -1.77101 | -3.307146 | -0.2348742 | 0.0238 | *            |
A one-way ANOVA test was also conducted. The results show significant differences between anonymous and non-anonymous comments, F(1, 44073) = 5.106, p = 0.024. A Tukey post-hoc test was then conducted for pairwise comparison between the groups; it likewise shows a significant difference between the two groups (adjusted p-value = 0.024).
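Because LikeCount is heavily right-skewed, a nonparametric check is a sensible complement to the ANOVA (a sketch, not part of the original analysis).
wilcox.test(LikeCount ~ Anonymity, data = anony)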