Establishing a connection.
#install.packages("vosonSML")
library(vosonSML)
apikey <- "YOUR_API_KEY"  # actual key redacted; insert your own YouTube Data API key
youtubeAuth <- Authenticate("youtube", apiKey = apikey)
Collecting the links of all chosen YouTube videos and building a dataframe of comments for each video.
library(dplyr)
youtubeVideoIds <- GetYoutubeVideoIDs(c("https://youtu.be/exSKCByyLns", "https://youtu.be/w9It9PEK-kM", "https://youtu.be/Oe7U5rcaq1o", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/T80MK6EKJT4", "https://youtu.be/7h4ZeIxc1pI", "https://youtu.be/prb9d6fsnHE", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/b_1Hn6wwXpA", "https://youtu.be/ARUI9VhxNA0", "https://youtu.be/EZTByPrgfLI", "https://youtu.be/GlpiBkS4SJA", "https://youtu.be/Qaeqc5ikxeY", "https://youtu.be/v5uqya_nJj0", "https://youtu.be/nUdQIQWZGVA", "https://youtu.be/iT9QDUhS0dU", "https://youtu.be/1tE8HEgDmoc", "https://youtu.be/DIz0VX72-Bc", "https://youtu.be/Yl793FoyJJE", "https://youtu.be/uKby_huf-aA", "https://youtu.be/nCuZLsgPXuM", "https://youtu.be/52TT8o9vW8A", "https://youtu.be/QqZ0GLUF6yo", "https://youtu.be/plAdaVWzglc", "https://youtu.be/vCGuJ5z8LYY", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/7wwkItW2Gjw", "https://youtu.be/1VmVw0ALm_o", "https://youtu.be/Lik5G-gFQk4", "https://youtu.be/5YhoSIH3lPQ", "https://youtu.be/kIHTQlfBOJo", "https://youtu.be/F8ECpUKi5r4", "https://youtu.be/-7RqgG_NsI4", "https://youtu.be/Lls-ch1k7xg", "https://youtu.be/JSR1rwjZlX4", "https://youtu.be/xhFS89hgp_c", "https://youtu.be/36ayp2jckS0", "https://youtu.be/kk4aEC-YppA", "https://youtu.be/hRTIBqaOlus", "https://youtu.be/_NXEMh-Fxf4", "https://youtu.be/LTh0DnN6FDE", "https://youtu.be/ACwKesRVfUk", "https://youtu.be/xJpYCte7JcE", "https://youtu.be/e-henB30X4c", "https://youtu.be/DIaw4UV1nA4", "https://youtu.be/1AX_u2poAN4", "https://youtu.be/ONEtXBfx7oo", "https://youtu.be/r2FGVMvbWwE", "https://youtu.be/BRE8ibnX4MQ", "https://youtu.be/a_HkKhyPaSM", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/IjpWeQUqMWM", "https://youtu.be/rwowDSLDzFI", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/065kBCEtPoU", "https://youtu.be/NANnST5vxCo", "https://youtu.be/VJptzBWLCLo", "https://youtu.be/PE416IH8wws", "https://youtu.be/M4ZUU553AEA", "https://youtu.be/FZo2_AXGcD4", "https://youtu.be/uEW6CWgAMMM", "https://youtu.be/6fLBa58QgFU", "https://youtu.be/JaaZuyzNUi4", "https://youtu.be/CMr3q8u1ymg", "https://youtu.be/32ZMAZr1z_E", "https://youtu.be/wDWt3Ss9rNs", "https://youtu.be/ta6iiWcT_W0", "https://youtu.be/EJJrDbAmHOU", "https://youtu.be/aNKJpR0bBEE", "https://youtu.be/LqVY71LEAPI", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/g-jTu62gDr0", "https://youtu.be/EpVzqul5Arg", "https://youtu.be/P1nI5jnVaQM", "https://youtu.be/S2JdrBvVGSo", "https://youtu.be/yKRX2gNjtv4", "https://youtu.be/rPXB_uF-2cI", "https://youtu.be/U70aqExQkUA", "https://youtu.be/JJ0LQsAvnB8", "https://youtu.be/XnF8husX3Ew", "https://youtu.be/o3zNPtYlGSQ", "https://youtu.be/r-WbRBmURf4", "https://youtu.be/wG-DbNbMBkw", "https://youtu.be/aSepCt7Dx-s", "https://youtu.be/0Tz-RF9IxJE", "https://youtu.be/ErlV9j2BmyU", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/NANnST5vxCo", "https://youtu.be/UF8HZRk7DaY", "https://youtu.be/xIrvh6BNkh8", "https://youtu.be/SkAZdW_GQ0s", "https://youtu.be/R_P8X40oNvA", "https://youtu.be/MtInFBn33M0", "https://youtu.be/xP3XMDJMT-k", "https://youtu.be/ZJiYz2UYSXI", "https://youtu.be/3Af4st1kRqA", "https://youtu.be/krHqPzS3nVo", "https://youtu.be/01SdKgQePsI", "https://youtu.be/oo46uKNP3gc"))
youtubeData <- youtubeAuth %>%
  Collect(videoIDs = youtubeVideoIds,
          maxComments = 3000,
          verbose = FALSE)
Collecting the links of each YouTube channel and each video.
link <- c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
youtubevideo <- data.frame(link)
id <- c("UCpm6vCgiehSKHtMFFS5ukiA", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCQzIUO1m_Z--Qulo7IQKHvg", "UCR-j4NxVnwhfbZiClmOMibA", "UCWI528UIXxO7Kl-6c0he5Cg", "UCMLqsgXvM6Vs0aiegOdrtLQ", "UCLuuuRofmjK-ypp5Hftw7tw", "UCPW0TwizKn5bCr-8PcrP8MA", "UCz8LT_kp2rIs0CMv2AUivNg", "UCoF61Ju7c9lMJpRPhQ7PBxg", "UCQaA1h-xc_nDcuEdZIbT2qg", "UCOL8_f6Rjv338p8aY0C2WvQ", "UC643CRm6UKUl1HbdCD9Kidw", "UCndyQS3Q8UNVTFYbqEO22gg", "UCvcipRqgQ8DqI1wTJka2_dQ", "UCyLCXE5tnJGdyImQlfxucfw", "UCvd_X-p3MEFQKoDGQjb3OiQ", "UCngo17sgSHQuUaHNd9lYIMg", "UCfDJfdU0aCiMTluHXKDahYg", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCLF8x8A6v05Ic_vwlRjPTRA", "UCEMyt3qCrwa4wETvsheB95w", "UCfC2j7U61CLa-6kQFcb2mGg", "UC4N283P0CQMu3a8e3yKMABQ", "UClUnHqJSCpMU1V12LIq_9ug", "UCNQsav6N0dp7ApDOkVwPqrA", "UCZ17fVB7aFpWNsK9fHclJuA", "UC25wAC_5htqBdaXNC6O93jQ", "UCvNeBu5uPbxv7FbBBCi317g", "UCxz6gPUpL6l5WSTUObsH9MA", "UCt8YxNuB2b-gSUX8OHmfjTA", "UCn9N1nnsX0xtl6NgZIRnhaw", "UCuZRVn4d0BVTA-DIT-PkzAA", "UCZVvxHrmbxsfkWm-g2Ol0eA", "UCYq3aZxqNWh3qoZN-ky4QJw", "UCo5q67UHoJfrJ2-l1enH4WA", "UCSZ3D8HTjvg53fsTuI2NdVw", "UCKv4EjBucT40QPMnDRMFb2w", "UC6U7Dj_6gkQ0KJEuVUuSwkA", "UCsqfijrYd-p6eFoB9cBEWxA", "UCmKrv7DPHi20M_0enX6_MKA", "UCeTpKZFMQAa73F9Y0CFivDQ", "UCJ-qYJvBz_KcgrvpZxWITyQ", "UCWPKFs6u4_wS9AXZzeZwBlQ", "UCWI528UIXxO7Kl-6c0he5Cg", "UCeikULWFtSVQhudNbF5urRw", "UCp300rt4ULrhF44D9woCSOw", "UCpwFVuATxG2m4A4vSqWCvDA", "UCwyK7L9OBSyoNbE4vtaNd6g", "UCngo17sgSHQuUaHNd9lYIMg", "UCb8xXRRUvFvCZGWMX1y_pjg", "UC_bd3osClZOTuEh6WPlv-6g", "UCsGBpBK9AjGyYeJb1SzG2-g", "UCgjZ1k7hNgrPmOv4WK6q09g", "UC0V_cBwQGN5QxOJ6u7fgOtQ", "UCuZRVn4d0BVTA-DIT-PkzAA", "UC-iUoRqQDcWfAovTsbrk_GA", "UCCFfoTZvZ9EXCX32sdEPhmw", "UCNU9oR7HzWD02LShddSOoeg", "UChBKPr6rRi2HY5d92sUxh2A", "UCHRWh66zFu6-SxsQq2Mv-_g", "UCKmxzTroWG1RJeAR8JiehRQ", "UClTqP6q-NeCZbDqsQJa5bUg", "UCo5q67UHoJfrJ2-l1enH4WA", "UCf0s3NCSzyvGpFkNuTNL2EA", "UC5ilFCORThdWlrhCER_fjfA", "UCYYdahdZ7J6rkiLpz97Nc-Q", "UCTsNlOo-TcXVs8I-pKOxb2A", "UCM5aAMJrEaY1iTxlqPOe8Rg", "UCBzoZTFClGzOq1x2wFFrURg", "UCiiAIfdVLaJsD1vEiMclBoA", "UC5pFR27s1fTBocMKSwM_Svw")
channelid <- data.frame(id)
Making two functions for getting data about each YouTube channel and each video.
API_key <- apikey  # the same (redacted) YouTube Data API key as above
getstats_video <- function(video_id, API_key){
  url <- paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=", video_id, "&key=", API_key)
  result <- fromJSON(txt = url)
  # One row per video: channel name, statistics (views, likes, dislikes,
  # comments), title, publication date and description
  return(data.frame(name = result$items$snippet$channelTitle,
                    result$items$statistics,
                    title = result$items$snippet$title,
                    date = result$items$snippet$publishedAt,
                    descrip = result$items$snippet$description))
}
get_playlist_canal <- function(id, API_key, topn = 15){
  # Fetch up to `topn` items from a channel's playlist
  url <- paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=', id, '&key=', API_key, '&maxResults=', topn)
  result <- fromJSON(txt = url)
  return(data.frame(result$items$contentDetails))
}
getstats_canal <- function(id, API_key){
  # Channel-level statistics plus the ids of the channel's related playlists
  url <- paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=', id, '&key=', API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$title,
                    result$items$statistics,
                    pl_list_id = result$items$contentDetails$relatedPlaylists))
}
getall_channels <- function(ids, API_key, topn = 5){
  # For each channel id: list its `topn` playlist items, then fetch
  # per-video statistics and bind everything into one dataframe
  videos <- lapply(ids, FUN = get_playlist_canal, API_key = API_key, topn = topn) %>% bind_rows()
  stats <- lapply(videos[, 1], FUN = getstats_video, API_key = API_key)
  stats <- bind_rows(stats)
  stats$vid_id <- videos[, 1]
  return(stats)
}
Getting information about each YouTube channel and each video into tables.
require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)
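As a quick sanity check, each helper can first be tried on a single ID before the batch calls (a sketch; the IDs are simply the first entries of the lists above).
one_video <- getstats_video("exSKCByyLns", API_key)                 # stats for one video
one_channel <- getstats_canal("UCpm6vCgiehSKHtMFFS5ukiA", API_key)  # stats for one channel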
can_st <- lapply(youtubevideo$link, FUN = getstats_video, API_key = API_key)
can_st <- bind_rows(can_st)
can_sa <- lapply(channelid$id, FUN = getstats_canal, API_key = API_key)
can_sa <- bind_rows(can_sa)
Renaming the tables for readability.
channels <- can_sa
videos <- can_st
comments <- youtubeData
Saving the data tables.
write.csv(channels, file = "channels.csv")
write.csv(videos, file = "videos.csv")
write.csv(comments, file = "comments.csv")
Opening the tables.
comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
videos <- read.csv("videos.csv", encoding="IBM866", stringsAsFactors=FALSE)
channel <- read.csv("channels.csv", encoding="IBM866", stringsAsFactors=FALSE)
Statistics on YouTube videos.
videos$viewCount = as.numeric(videos$viewCount)
videos$likeCount = as.numeric(videos$likeCount)
videos$dislikeCount = as.numeric(videos$dislikeCount)
videos$commentCount = as.numeric(videos$commentCount)
videos2 = videos
library(boot)
library(htmlTable)
library(dplyr)
library(table1)
label(videos2$viewCount) <- "Number of views"
label(videos2$likeCount) <- "Number of likes"
label(videos2$dislikeCount) <- "Number of dislikes"
label(videos2$commentCount) <- "Number of comments"
table1(~ viewCount + likeCount + dislikeCount + commentCount, data=videos2, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                    | Total (N=100)        |
|--------------------|----------------------|
| Number of views    |                      |
| Mean (SD)          | 95900 (139000)       |
| Median [Min, Max]  | 44900 [2430, 731000] |
| Number of likes    |                      |
| Mean (SD)          | 5070 (6790)          |
| Median [Min, Max]  | 2870 [201, 35700]    |
| Missing            | 2 (2.0%)             |
| Number of dislikes |                      |
| Mean (SD)          | 172 (201)            |
| Median [Min, Max]  | 81.0 [1.00, 874]     |
| Missing            | 2 (2.0%)             |
| Number of comments |                      |
| Mean (SD)          | 486 (640)            |
| Median [Min, Max]  | 261 [28.0, 4020]     |
In the table above, you can see the main statistics collected for all the videos. On average, the selected vlogs gain 95,900 views, with a minimum of 2,430 and a maximum of 731,000. Videos with more than one million views (where they existed) were deliberately not selected, to avoid a bias towards popular videos; such videos would otherwise be outliers. Likes and dislikes each have 2% missing data, meaning that no user rated those videos; this is typical of less popular videos. In general, videos tend to receive more likes than dislikes, which is discussed later. On average, each video has 486 comments, which makes it possible to evaluate the distribution of positive and negative reviews in each one. There are no videos without comments (no missing data), since only videos with more than 50 comments were manually selected (some vlogs had 5-10 comments, which could later distort the results of the text analysis).
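For reproducibility, the same screen could also be expressed in code (a hypothetical sketch; the actual selection was done manually).
videos_screened <- videos %>%
  filter(viewCount < 1e6,    # exclude videos with 1M+ views
         commentCount > 50)  # keep only videos with more than 50 comments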
Statistics on YouTube channels.
channel$viewCount = as.numeric(channel$viewCount)
channel$subscriberCount = as.numeric(channel$subscriberCount)
channel$videoCount = as.numeric(channel$videoCount)
label(channel$viewCount) <- "Number of views"
label(channel$subscriberCount) <- "Number of subscribers"
label(channel$videoCount) <- "Number of videos"
table1(~ viewCount + subscriberCount + videoCount, data=channel, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                       | Total (N=72)               |
|-----------------------|----------------------------|
| Number of views       |                            |
| Mean (SD)             | 76500000 (173000000)       |
| Median [Min, Max]     | 20300000 [8680, 981000000] |
| Number of subscribers |                            |
| Mean (SD)             | 438000 (942000)            |
| Median [Min, Max]     | 142000 [729, 6750000]      |
| Missing               | 1 (1.4%)                   |
| Number of videos      |                            |
| Mean (SD)             | 451 (480)                  |
| Median [Min, Max]     | 317 [6.00, 2790]           |
Among the selected videos, some authors appeared 2-3 times (occasionally several videos meeting the selection criteria came from the same author; such videos were not excluded), so there are 72 unique authors. The total number of views is not a particularly informative indicator, but the number of subscribers does measure a channel's popularity. On average, the selected channels have 438,000 subscribers; both large channels (more than a million subscribers) and very small ones are represented.
Statistics on YouTube comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
clean <- final
library(dplyr)
class(clean) <- 'data.frame'
clean$Comment = as.character(clean$Comment)
clean <- clean %>%
mutate(new_col = sapply(strsplit(clean$Comment, " "), length))
label(clean$ReplyCount) <- "Number of replies"
label(clean$LikeCount) <- "Number of likes"
label(clean$new_col) <- "Length of the comment"
table1(~ ReplyCount + LikeCount + new_col, data=clean, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                       | Total (N=44211) |
|-----------------------|-----------------|
| Number of replies     |                 |
| Mean (SD)             | 0.292 (1.77)    |
| Median [Min, Max]     | 0 [0, 151]      |
| Number of likes       |                 |
| Mean (SD)             | 7.20 (79.4)     |
| Median [Min, Max]     | 0 [0, 5980]     |
| Length of the comment |                 |
| Mean (SD)             | 11.3 (14.7)     |
| Median [Min, Max]     | 7.00 [0, 489]   |
The table shows that users do not actively respond to each other's comments, although one comment received 151 replies. On average, a comment gets 7 likes. Comment length varies from 0 to 489 words, with an average of 11 words, so comments are usually short. The single comment with zero length will be removed later.
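That removal can be done with a simple filter (a sketch, assuming new_col holds the word counts computed above).
clean <- clean %>% filter(new_col > 0)  # drop the single zero-length comment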
Statistics on YouTube comments per user.
clean1 <- clean[, c(1,3,4,12)]
clean1$Comment = as.factor(clean1$Comment)
clean1 <- clean1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
clean1$AuthorDisplayName = as.character(clean1$AuthorDisplayName)
clean2 = clean1
new_df <- clean2[-which(duplicated(clean2$AuthorDisplayName)), ]
label(new_df$count) <- "Number of comments per user"
table1(~ count, data=new_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|                             | Total (N=28189)  |
|-----------------------------|------------------|
| Number of comments per user |                  |
| Mean (SD)                   | 1.57 (4.08)      |
| Median [Min, Max]           | 1.00 [1.00, 348] |
As we can see, there are only 28,189 unique users, although there are almost twice as many comments, so some users comment more than once. On average, each person leaves one comment, but the maximum number of comments from a single user is 348. This is likely related to the repeated video authors: they may have regular subscribers who support the channel and comment often.
Loading required packages.
#install.packages("corpus")
library(tidyverse)
library(tidytext)
library(stringr)
library(corpus)
Loading data.
df <- read.csv(file = 'comments.csv')
comments <- df[, 2]
# Also load the dictionary of sentiment-labelled words for Russian,
# on the basis of which the comments will be 'coloured'
russian_dictionary <- read.csv(file = 'russian.csv', encoding='UTF-8')
# Adding pairs to the dictionary to 'classify' emojis
# To do this, download the source dictionary
emo_dictionary <- read.csv(file = 'emo.csv', encoding='UTF-8')
# Rename the columns we need
# to match the format of 'russian.csv'
names(emo_dictionary)[names(emo_dictionary) == 'Sentiment.score..1....1.'] <- 'score'
names(emo_dictionary)[names(emo_dictionary) == 'Char'] <- 'word'
# Select only the two columns we need - 'word' and 'score'
emo_dictionary <- emo_dictionary[, c(2, 10)]
emo_dictionary$word <- gsub('<U\\+', '', emo_dictionary$word)
emo_dictionary$word <- gsub('>', '', emo_dictionary$word)
russian_dictionary$word <- gsub('<U\\+', '', russian_dictionary$word)
russian_dictionary$word <- gsub('>', '', russian_dictionary$word)
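To illustrate the cleanup above (a sketch with one emoji code): the escaped representation is reduced to the bare code point.
gsub('>', '', gsub('<U\\+', '', '<U+0001F44D>'))  # yields "0001F44D"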
emo_dictionary <- lapply(emo_dictionary, iconv, "UTF-8", "UTF-8",sub='00000')
emo_dictionary <- as.data.frame(emo_dictionary)
ps <- data.frame(names=emo_dictionary$word, chr=apply(emo_dictionary,2,nchar)[,1])
emo_dictionary$size <- ps$chr
emo_dictionary <- emo_dictionary[(emo_dictionary$size == 8),]
emo_dictionary$size <- NULL
# Append the emoji dictionary to the Russian one
russian_dictionary <- rbind(russian_dictionary, emo_dictionary)
Tokenization
tokens <- tibble(line = 1:length(comments), text = as.character(comments)) %>% unnest_tokens(word, text)
Stemming
tokens$word <- text_tokens(tokens$word, stemmer = 'rus')
russian_dictionary$word <- text_tokens(russian_dictionary$word, stemmer = 'rus')
Performing sentiment analysis: getting the emotional coloring of each comment as a number.
# In this case, we use the dictionary to calculate the number of words
# With some emotional connotation in each comment
# And then we average the numerical values of the found words from the source dictionary for each comment
result <- tokens %>%
  inner_join(russian_dictionary, by = 'word') %>%
  group_by(line)
result$score <- as.numeric(result$score)
result <- result %>% group_by(line) %>%
  summarise(sentiment = mean(score))
Creating the resulting dataset. After "coloring", each comment comes out as "bad" or "good"; comments in which not a single word from the dictionary was found are treated as "neutral".
# First, for each comment we get the resulting sentiment value:
# a number that characterizes its polarity
df <- rename(df, line = X)
final_df <- df %>%
left_join(result, by = 'line')
final_df[is.na(final_df)] <- 0
# Then we label positive sentiment values as positive comments and
# negative values as negative comments; zero is treated as neutral.
map_sentiment_to_label <- function(row) {
  # Column 14 holds the sentiment score computed above
  if (as.numeric(row[14]) > 0) {
    return('positive')
  } else if (as.numeric(row[14]) < 0) {
    return('negative')
  } else {
    return('neutral')
  }
}
final_df$label <- apply(final_df, 1, map_sentiment_to_label)
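An equivalent and somewhat more robust formulation (a sketch) references the sentiment column by name rather than by position:
final_df <- final_df %>%
  mutate(label = case_when(
    sentiment > 0 ~ 'positive',
    sentiment < 0 ~ 'negative',
    TRUE ~ 'neutral'
  ))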
# Saving data
write.csv(final_df, 'final_df.csv')
Viewing the results. Let us take a few comments and see how the algorithm labelled them.
COUNT <- 3
print('[Positive comments]:')
## [1] "[Positive comments]:"
paste(final_df[final_df$label == 'positive', ][, 2][100:(101+COUNT)])
## [1] "За 7 миллионов намного круче и современней"
## [2] "Это просто готовый бизнес. Гостиница для отдыхающих."
## [3] "Выпей аспирин,он убирет немного боли. И 2 дня отдыхай. А то нарвешь мускулы. Осторожненька,Катя"
## [4] "Дом, в котором сейчас вы живёте намного лучше этих за 5 и 7 миллионов"
## [5] "Второй дом <U+0001F44D><U+0001F44D><U+0001F44D>"
print('')
## [1] ""
print('[Negative comments]:')
## [1] "[Negative comments]:"
paste(final_df[final_df$label == 'negative', ][, 2][100:(101+COUNT)])
## [1] "У Ванички прям голос прорезался если он что то хочет <U+0001F92D>очень его балуют не есть хорошо"
## [2] "Катюша лук не обрезай сажай как есть"
## [3] "Первый дом на скале- это не для проживания. Это больше подходит под гостиницу."
## [4] "Ребята, ну тут только вид и расположение чего стоят. А выбирать по принципу красивой мебели внутри...можно выкинуть старую и купить свою новую. В том доме просто определенный стиль, который вам не близок и это не значит, что он плохой:))"
## [5] "Вид конечно огонь <U+0001F525><U+0001F44D>не понимаю зачем вам 3 им большой дом<U+0001F937><U+200D>+<U+FE0F>"
print('')
## [1] ""
print('[Neutral comments]:')
## [1] "[Neutral comments]:"
paste(final_df[final_df$label == 'neutral', ][, 2][100:(101+COUNT)])
## [1] "*очень дорого*"
## [2] "Да, полностью согласна"
## [3] "5 миллионов\210 это в российских рублях около 450 миллионов"
## [4] "*скорее всего так*"
## [5] "*очень дорого! очень!*"
It is noticeable that the algorithm treats words such as "very" as markers of positive comments. Some comments carry a double meaning ("I like it, but…", etc.), and the algorithm cannot capture such subtleties. Overall, positive comments were for the most part classified correctly.
Comments that contained no emotionally coloured words, or that simply posed a question to the author of the video, were correctly classified as neutral.
Negative comments were identified more reliably than positive ones: while some positive comments were occasionally marked as negative, negative comments were not marked as positive. Above are five examples of negative comments that were identified correctly.
Statistics on sentiment of the comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
final_df <- final[, c(3,16)]
final_df$label = as.factor(final_df$label)
label(final_df$label) <- "Sentiment"
table1(~ label, data=final_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Total (N=44211) |
|-----------|-----------------|
| Sentiment |                 |
| negative  | 11416 (25.8%)   |
| neutral   | 10763 (24.3%)   |
| positive  | 22032 (49.8%)   |
There are more positive comments than neutral or negative ones. However, the method is not perfect and cannot classify every comment correctly, because of users' grammatical errors and other subtleties of the Russian language. Initially only 7,654 comments were classified as negative, but that version of the algorithm ignored the negation particle ("не") before positive words; after the algorithm was refined to account for it, the results changed. It is still worth noting that the algorithm cannot detect sarcasm or other features of Russian speech.
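One minimal way to handle such negation (a sketch of the general idea, not the exact refinement used) is to flip the dictionary score of a word whenever it is preceded by "не", using bigrams and a non-stemmed copy of the dictionary.
bigrams <- tibble(line = 1:length(comments), text = as.character(comments)) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ")
negated <- bigrams %>%
  filter(word1 == "не") %>%
  inner_join(russian_dictionary, by = c("word2" = "word")) %>%
  mutate(score = -as.numeric(score))  # invert the polarity after negation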
Counting the number of words in each name.
library(dplyr)
#comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
comments <- final
comments <- comments[!grepl("спам",comments$Comment),]
comments <- comments[-c(35669), ]
class(comments) <- 'data.frame'
comments$AuthorDisplayName = as.character(comments$AuthorDisplayName)
comments <- comments %>%
mutate(new_col = sapply(strsplit(comments$AuthorDisplayName, " "), length))
#comments$new_col = ifelse(comments$"new_col">2,"Anonymous","Full name")
Looking at the capital letters in the first names.
comments_1 <- comments[, c(3, 4, 5, 8, 9, 14, 16, 17)]
#comments_1 <- comments[, c(3, 14)]
library(stringr)
comments_1<- comments_1 %>%
  mutate(letter = str_detect(comments_1$AuthorDisplayName,"[[:upper:]]"))
Determining if a string has a dot.
comments_1 <- comments_1 %>%
mutate(dot = letter)
comments_1$dot = str_detect(comments_1$AuthorDisplayName, "\\.")
Searching for duplicated words in names (e.g. Lena Lena).
comments_1 <- comments_1 %>%
mutate(NameUnique = AuthorDisplayName)
comments_1$NameUnique <- sapply(comments_1$NameUnique, function(x) paste(unique(unlist(str_split(x," "))), collapse = " "))
Counting the number of words in names again, based on the previous step.
comments_1$new_col <- sapply(strsplit(comments_1$NameUnique, " "), length)
Dividing names into separate columns to count the number of letters in the first and last name.
comments_1 = comments_1 %>%
mutate(DividedName = AuthorDisplayName)
out <- strsplit(as.character(comments_1$DividedName),' ')
comments_1 <- data.frame(comments_1, do.call(rbind, out))
comments_1 <- comments_1[, c(1:14)]
Counting the number of letters in each string.
comments_1 <- comments_1 %>%
mutate(count = str_count(comments_1$X1, ""))
comments_1 <- comments_1 %>%
  mutate(count1 = str_count(comments_1$X2, ""))
Looking for names containing symbols, numbers and other special characters.
comments_1 <- comments_1 %>%
mutate(unrecog = str_detect(comments_1$AuthorDisplayName, "\\_|\\*|\\-|\\#|\\[|\\]|\\>|\\<|\\?"))
comments_1 <- comments_1 %>%
  mutate(numbers = grepl("[0-9]", comments_1$AuthorDisplayName))
Looking for names that have capital letters in the first and last name.
library(stringr)
comments_1<- comments_1 %>%
mutate(letter1 = str_detect(comments_1$X1,"[[:upper:]]"))
comments_1<- comments_1 %>%
  mutate(letter2 = str_detect(comments_1$X2,"[[:upper:]]"))
Making a condition for identifying anonymous users.
comments_1 <- comments_1 %>%
mutate(Anonymity = AuthorDisplayName)
# A name counts as anonymous if it is not exactly two capitalised words,
# if either part is very short, or if it contains dots, digits or special characters
comments_1$Anonymity = ifelse((comments_1$new_col > 2) | (comments_1$new_col == 1) |
                                (comments_1$letter1 == "FALSE") | (comments_1$letter2 == "FALSE") |
                                (comments_1$dot == "TRUE") |
                                (comments_1$count < 3) | (comments_1$count1 < 3) |
                                (comments_1$unrecog == "TRUE") | (comments_1$numbers == "TRUE"),
                              "Anonymous", "Full name")
anonymous <- comments_1
anonymous <- anonymous[, c(1,2,3,4,5,6,7,21)]
write.csv(anonymous, file = "anonymity.csv")
Statistics based on names.
anon <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
anon$Anonymity = as.factor(anon$Anonymity)
label(anon$Anonymity) <- "Name"
table1(~Anonymity, data = anon, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Total (N=44075) |
|-----------|-----------------|
| Name      |                 |
| Anonymous | 16380 (37.2%)   |
| Full name | 27695 (62.8%)   |
After deleting empty observations and spam, 44,075 unique comments remain. Overall, the data contain more non-anonymous comments (62.8%) than anonymous ones (37.2%). Note, however, that the number of unique comments does not reflect the number of unique users, since some users left several messages.
Identifying unique users.
anon1 <- anon
anon1 <- anon1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
anon1$AuthorDisplayName = as.character(anon1$AuthorDisplayName)
anon2 = anon1
new_df2 <- anon2[-which(duplicated(anon2$AuthorDisplayName)), ]
label(new_df2$count) <- "Authors of comments"
table1(~ Anonymity, data=new_df2, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|           | Overall (N=28186) |
|-----------|-------------------|
| Name      |                   |
| Anonymous | 10092 (35.8%)     |
| Full name | 18094 (64.2%)     |
The table shows the distribution of anonymous and non-anonymous users. Notably, among all unique users (N=28186), the split between anonymous users and users with a full name (35.8% and 64.2%, respectively) is almost the same as the split over the total number of comments.
anonymity <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
# Create the barplot
library(ggplot2)
ggplot(data=anonymity, aes(x=label, fill=Anonymity)) +
  geom_bar()+
  theme_minimal()
label(anonymity$label) <- "Comment"
table1(~label|Anonymity, data = anonymity, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|          | Anonymous (N=16380) | Full name (N=27695) | Overall (N=44075) |
|----------|---------------------|---------------------|-------------------|
| Comment  |                     |                     |                   |
| negative | 4346 (26.5%)        | 7067 (25.5%)        | 11413 (25.9%)     |
| neutral  | 4213 (25.7%)        | 6547 (23.6%)        | 10760 (24.4%)     |
| positive | 7821 (47.7%)        | 14081 (50.8%)       | 21902 (49.7%)     |
With sentiment and anonymity both determined, we can look at the distribution of comment sentiment by user name type. Judging by the percentages, negative comments are somewhat more common among anonymous users, while positive comments are more often published by users who give a first and last name.
library(dplyr)
popularity <- videos %>%
mutate(VideoID = c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
)
write.csv(popularity, "popularity.csv")
Loading libraries.
library(readxl)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(stringr)
library(tidyr)
Loading data.
anonymityfinal <- read_excel("anonymityfinal1.xlsx")
popularity <- read.csv(file="popularity.csv")
Checking for duplicate records.
paste("Number of videos:", length(popularity$VideoID))## [1] "Number of videos: 100"
paste("Number of unique videos:", length(unique(popularity$VideoID)))## [1] "Number of unique videos: 95"
Selecting duplicates and deleting them from the data
duplicates <- subset(popularity,duplicated(popularity$VideoID))
popularity <- popularity[!(popularity$X %in% duplicates$X),]
Creating a merged dataset. At this stage, a dataset is created by attaching information about each video to its comments; note that with all = TRUE, merge() performs a full outer join rather than an inner join.
df <- merge(x = anonymityfinal, y = popularity, by = "VideoID", all = TRUE)
Checking relationships. Before conducting the tests themselves, we agree to use a significance level of 0.05.
Let us assume that anonymity lets users "feel free to act", so there should be noticeably more negative comments from anonymous users than from non-anonymous ones.
H0: there is no relationship between comment sentiment and anonymity. H1: such a relationship exists.
Graphical representation of the distribution
df %>%
  ggplot(aes(label, fill = Anonymity))+
  geom_bar(position="fill") + labs(title="Label distribution across anonymity") + theme_bw()
We see that there are fewer anonymous comments than non-anonymous ones, but within each group the distribution of sentiment looks roughly the same: positive comments are the most frequent, while negative and neutral comments occur at about the same rate.
Contingency table
selected_table_df <- df[, c(7, 8)]
knitr::kable(table(selected_table_df))

|          | Anonymous | Full name |
|----------|-----------|-----------|
| negative | 4357      | 7006      |
| neutral  | 4255      | 6554      |
| positive | 7847      | 14056     |
Contingency table (as a percentage)
knitr::kable(prop.table(table(selected_table_df)))

|          | Anonymous | Full name |
|----------|-----------|-----------|
| negative | 0.0988542 | 0.1589563 |
| neutral  | 0.0965400 | 0.1487011 |
| positive | 0.1780374 | 0.3189109 |
The tables support the visual impression: positive comments dominate, and most of them come from non-anonymous users.
Chi-square test
x_test <- chisq.test(table(selected_table_df))
x_test
##
##  Pearson's Chi-squared test
##
## data:  table(selected_table_df)
## X-squared = 45.296, df = 2, p-value = 1.459e-10
In our case the variables are statistically significantly related: the p-value is close to zero, far below the chosen significance level, so we reject the null hypothesis of independence and accept the alternative.
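To see which cells drive the association, the standardized residuals of the test can be inspected (a sketch; values beyond roughly ±2 indicate cells deviating from independence):
x_test$stdres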
Observed frequencies
x_test$observed
##            Anonymity
## label       Anonymous Full name
##   negative       4357      7006
##   neutral        4255      6554
##   positive       7847     14056
Conclusions: the commenter's anonymity is statistically associated with the type of comment, but the effect is small: with a sample this large even minor differences in proportions reach significance, and non-anonymous comments are simply more numerous (so a positive comment most likely comes from a user who did not hide their name). The distribution of comment types within each group is roughly the same, so our initial assumption turned out to be incorrect.
We assume that long comments are typically left by anonymous users, since anonymity may "liberate" a person, allowing them to write more and in more detail.
df$CommentLength <- str_count(df$Comment)
df <- df %>% drop_na()
Getting average values.
result_anonymity <- df %>%
group_by(Anonymity) %>%
  dplyr::summarize(Mean = mean(CommentLength, na.rm=TRUE))
Logistic regression model.
df$Anonymity <- as.factor(df$Anonymity)
model <- glm(Anonymity ~ CommentLength, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Anonymity ~ CommentLength, family = "binomial",
##     data = df)
##
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max
## -1.4762 -1.4018  0.9672  0.9684  0.9691
##
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)   5.120e-01  1.216e-02  42.119   <2e-16 ***
## CommentLength 4.298e-05  8.481e-05   0.507    0.612
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 58073  on 43928  degrees of freedom
## Residual deviance: 58073  on 43927  degrees of freedom
## AIC: 58077
##
## Number of Fisher Scoring iterations: 4
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "Full name", "Anonymous")
paste("Accuracy:", round(mean(predicted.classes == df$Anonymity), 2)*100, "%")## [1] "Accuracy: 63 %"
Running the one-way ANOVA test
summary(aov(CommentLength ~ Anonymity, data = df))
##                Df    Sum Sq Mean Sq F value Pr(>F)
## Anonymity       1      3519    3519   0.257  0.612
## Residuals   43927 601552646   13694
Since the p-value (0.612) is well above the 0.05 significance level, we conclude that there is no statistically significant difference in mean comment length between anonymous and non-anonymous users.
Conclusions: the model predicts the user's anonymity type from comment length with 63% accuracy on the training data, which is modest. Moreover, the statistical test shows that the mean comment lengths of anonymous and non-anonymous users do not differ significantly. We can therefore conclude that anonymity does not affect the number of characters in a comment, and our initial assumption was not confirmed.
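For context, 63% is essentially the majority-class baseline: since the CommentLength coefficient is negligible, the model predicts "Full name" for almost everyone, and always doing so achieves about the same accuracy (a sketch).
paste("Baseline accuracy:", round(mean(df$Anonymity == "Full name"), 2)*100, "%")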
Difference between the average number of likes of anonymous / non-anonymous comments
In order to test the hypothesis, the average number of likes for anonymous and non-anonymous comments was first obtained.
anony = anon
library(tidyverse)
library(ggpubr)
library(rstatix)
anon1 <- anon %>% filter(Anonymity == "Full name")
anon2 <- anon %>% filter(Anonymity == "Anonymous")
mean(anon1$LikeCount)
## [1] 6.56855
mean(anon2$LikeCount)
## [1] 8.33956
Statistics show that, on average, anonymous comments collect about 8 likes while non-anonymous comments collect about 6.5. Despite large outliers (comments under popular videos can gather more than 1,000 likes), the average number of likes appears to differ by anonymity.
res.aov <- anony %>% anova_test(LikeCount ~ Anonymity)
res.aov

| Effect    | DFn | DFd   | F     | p     | p<.05 | ges      |
|-----------|-----|-------|-------|-------|-------|----------|
| Anonymity | 1   | 44073 | 5.106 | 0.024 | *     | 0.000116 |
pwc <- anony %>% tukey_hsd(LikeCount ~ Anonymity)
pwc

| term      | group1    | group2    | null.value | estimate | conf.low  | conf.high  | p.adj  | p.adj.signif |
|-----------|-----------|-----------|------------|----------|-----------|------------|--------|--------------|
| Anonymity | Anonymous | Full name | 0          | -1.77101 | -3.307146 | -0.2348742 | 0.0238 | *            |
A one-way ANOVA test was also conducted. The results show significant differences between anonymous and non-anonymous comments, F(1, 44073) = 5.106, p = 0.024. A Tukey post-hoc test was then conducted for pairwise comparison between the groups; it likewise shows a significant difference between the two groups (adjusted p-value = 0.024).
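Because LikeCount is heavily right-skewed, a nonparametric check is a sensible complement to the ANOVA (a sketch, not part of the original analysis).
wilcox.test(LikeCount ~ Anonymity, data = anony)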