Establishing a connection.
#install.packages("vosonSML")
library(vosonSML)
apikey <- "AIzaSyDvuqqCGRD9DzsedaqtIw5Hb4Go_DsNBuw"
youtubeAuth <- Authenticate("youtube", apiKey = "AIzaSyDvuqqCGRD9DzsedaqtIw5Hb4Go_DsNBuw")Collecting links of all chosen YouTube videos and making a dataframe of comments of each video.
library(dplyr)
youtubeVideoIds <- GetYoutubeVideoIDs(c("https://youtu.be/exSKCByyLns", "https://youtu.be/w9It9PEK-kM", "https://youtu.be/Oe7U5rcaq1o", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/T80MK6EKJT4", "https://youtu.be/7h4ZeIxc1pI", "https://youtu.be/prb9d6fsnHE", "https://youtu.be/FLkkmjQ7WX0", "https://youtu.be/b_1Hn6wwXpA", "https://youtu.be/ARUI9VhxNA0", "https://youtu.be/EZTByPrgfLI", "https://youtu.be/GlpiBkS4SJA", "https://youtu.be/Qaeqc5ikxeY", "https://youtu.be/v5uqya_nJj0", "https://youtu.be/nUdQIQWZGVA", "https://youtu.be/iT9QDUhS0dU", "https://youtu.be/1tE8HEgDmoc", "https://youtu.be/DIz0VX72-Bc", "https://youtu.be/Yl793FoyJJE", "https://youtu.be/uKby_huf-aA", "https://youtu.be/nCuZLsgPXuM", "https://youtu.be/52TT8o9vW8A", "https://youtu.be/QqZ0GLUF6yo", "https://youtu.be/plAdaVWzglc", "https://youtu.be/vCGuJ5z8LYY", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/7wwkItW2Gjw", "https://youtu.be/1VmVw0ALm_o", "https://youtu.be/Lik5G-gFQk4", "https://youtu.be/5YhoSIH3lPQ", "https://youtu.be/kIHTQlfBOJo", "https://youtu.be/F8ECpUKi5r4", "https://youtu.be/-7RqgG_NsI4", "https://youtu.be/Lls-ch1k7xg", "https://youtu.be/JSR1rwjZlX4", "https://youtu.be/xhFS89hgp_c", "https://youtu.be/36ayp2jckS0", "https://youtu.be/kk4aEC-YppA", "https://youtu.be/hRTIBqaOlus", "https://youtu.be/_NXEMh-Fxf4", "https://youtu.be/LTh0DnN6FDE", "https://youtu.be/ACwKesRVfUk", "https://youtu.be/xJpYCte7JcE", "https://youtu.be/e-henB30X4c", "https://youtu.be/DIaw4UV1nA4", "https://youtu.be/1AX_u2poAN4", "https://youtu.be/ONEtXBfx7oo", "https://youtu.be/r2FGVMvbWwE", "https://youtu.be/BRE8ibnX4MQ", "https://youtu.be/a_HkKhyPaSM", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/IjpWeQUqMWM", "https://youtu.be/rwowDSLDzFI", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/065kBCEtPoU", "https://youtu.be/NANnST5vxCo", "https://youtu.be/VJptzBWLCLo", "https://youtu.be/PE416IH8wws", "https://youtu.be/M4ZUU553AEA", "https://youtu.be/FZo2_AXGcD4", "https://youtu.be/uEW6CWgAMMM", "https://youtu.be/6fLBa58QgFU", "https://youtu.be/JaaZuyzNUi4", "https://youtu.be/CMr3q8u1ymg", "https://youtu.be/32ZMAZr1z_E", "https://youtu.be/wDWt3Ss9rNs", "https://youtu.be/ta6iiWcT_W0", "https://youtu.be/EJJrDbAmHOU", "https://youtu.be/aNKJpR0bBEE", "https://youtu.be/LqVY71LEAPI", "https://youtu.be/xRD2VdNUmxI", "https://youtu.be/g-jTu62gDr0", "https://youtu.be/EpVzqul5Arg", "https://youtu.be/P1nI5jnVaQM", "https://youtu.be/S2JdrBvVGSo", "https://youtu.be/yKRX2gNjtv4", "https://youtu.be/rPXB_uF-2cI", "https://youtu.be/U70aqExQkUA", "https://youtu.be/JJ0LQsAvnB8", "https://youtu.be/XnF8husX3Ew", "https://youtu.be/o3zNPtYlGSQ", "https://youtu.be/r-WbRBmURf4", "https://youtu.be/wG-DbNbMBkw", "https://youtu.be/aSepCt7Dx-s", "https://youtu.be/0Tz-RF9IxJE", "https://youtu.be/ErlV9j2BmyU", "https://youtu.be/eVjZ1lyoAOs", "https://youtu.be/qlXp8Ppw-gs", "https://youtu.be/NANnST5vxCo", "https://youtu.be/UF8HZRk7DaY", "https://youtu.be/xIrvh6BNkh8", "https://youtu.be/SkAZdW_GQ0s", "https://youtu.be/R_P8X40oNvA", "https://youtu.be/MtInFBn33M0", "https://youtu.be/xP3XMDJMT-k", "https://youtu.be/ZJiYz2UYSXI", "https://youtu.be/3Af4st1kRqA", "https://youtu.be/krHqPzS3nVo", "https://youtu.be/01SdKgQePsI", "https://youtu.be/oo46uKNP3gc"))
youtubeData <- youtubeAuth %>%
  Collect(videoIDs = youtubeVideoIds,
          maxComments = 3000,
          verbose = FALSE)
Collecting the IDs of each YouTube video and channel.
link <- c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
youtubevideo <- data.frame(link)
id <- c("UCpm6vCgiehSKHtMFFS5ukiA", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCQzIUO1m_Z--Qulo7IQKHvg", "UCR-j4NxVnwhfbZiClmOMibA", "UCWI528UIXxO7Kl-6c0he5Cg", "UCMLqsgXvM6Vs0aiegOdrtLQ", "UCLuuuRofmjK-ypp5Hftw7tw", "UCPW0TwizKn5bCr-8PcrP8MA", "UCz8LT_kp2rIs0CMv2AUivNg", "UCoF61Ju7c9lMJpRPhQ7PBxg", "UCQaA1h-xc_nDcuEdZIbT2qg", "UCOL8_f6Rjv338p8aY0C2WvQ", "UC643CRm6UKUl1HbdCD9Kidw", "UCndyQS3Q8UNVTFYbqEO22gg", "UCvcipRqgQ8DqI1wTJka2_dQ", "UCyLCXE5tnJGdyImQlfxucfw", "UCvd_X-p3MEFQKoDGQjb3OiQ", "UCngo17sgSHQuUaHNd9lYIMg", "UCfDJfdU0aCiMTluHXKDahYg", "UCgjP6Yi2ciSqn5qJ8BEXOkA", "UCLF8x8A6v05Ic_vwlRjPTRA", "UCEMyt3qCrwa4wETvsheB95w", "UCfC2j7U61CLa-6kQFcb2mGg", "UC4N283P0CQMu3a8e3yKMABQ", "UClUnHqJSCpMU1V12LIq_9ug", "UCNQsav6N0dp7ApDOkVwPqrA", "UCZ17fVB7aFpWNsK9fHclJuA", "UC25wAC_5htqBdaXNC6O93jQ", "UCvNeBu5uPbxv7FbBBCi317g", "UCxz6gPUpL6l5WSTUObsH9MA", "UCt8YxNuB2b-gSUX8OHmfjTA", "UCn9N1nnsX0xtl6NgZIRnhaw", "UCuZRVn4d0BVTA-DIT-PkzAA", "UCZVvxHrmbxsfkWm-g2Ol0eA", "UCYq3aZxqNWh3qoZN-ky4QJw", "UCo5q67UHoJfrJ2-l1enH4WA", "UCSZ3D8HTjvg53fsTuI2NdVw", "UCKv4EjBucT40QPMnDRMFb2w", "UC6U7Dj_6gkQ0KJEuVUuSwkA", "UCsqfijrYd-p6eFoB9cBEWxA", "UCmKrv7DPHi20M_0enX6_MKA", "UCeTpKZFMQAa73F9Y0CFivDQ", "UCJ-qYJvBz_KcgrvpZxWITyQ", "UCWPKFs6u4_wS9AXZzeZwBlQ", "UCWI528UIXxO7Kl-6c0he5Cg", "UCeikULWFtSVQhudNbF5urRw", "UCp300rt4ULrhF44D9woCSOw", "UCpwFVuATxG2m4A4vSqWCvDA", "UCwyK7L9OBSyoNbE4vtaNd6g", "UCngo17sgSHQuUaHNd9lYIMg", "UCb8xXRRUvFvCZGWMX1y_pjg", "UC_bd3osClZOTuEh6WPlv-6g", "UCsGBpBK9AjGyYeJb1SzG2-g", "UCgjZ1k7hNgrPmOv4WK6q09g", "UC0V_cBwQGN5QxOJ6u7fgOtQ", "UCuZRVn4d0BVTA-DIT-PkzAA", "UC-iUoRqQDcWfAovTsbrk_GA", "UCCFfoTZvZ9EXCX32sdEPhmw", "UCNU9oR7HzWD02LShddSOoeg", "UChBKPr6rRi2HY5d92sUxh2A", "UCHRWh66zFu6-SxsQq2Mv-_g", "UCKmxzTroWG1RJeAR8JiehRQ", "UClTqP6q-NeCZbDqsQJa5bUg", "UCo5q67UHoJfrJ2-l1enH4WA", "UCf0s3NCSzyvGpFkNuTNL2EA", "UC5ilFCORThdWlrhCER_fjfA", "UCYYdahdZ7J6rkiLpz97Nc-Q", "UCTsNlOo-TcXVs8I-pKOxb2A", "UCM5aAMJrEaY1iTxlqPOe8Rg", "UCBzoZTFClGzOq1x2wFFrURg", "UCiiAIfdVLaJsD1vEiMclBoA", "UC5pFR27s1fTBocMKSwM_Svw")
channelid <- data.frame(id)
Defining helper functions for retrieving data about each YouTube video and channel.
API_key="AIzaSyDvuqqCGRD9DzsedaqtIw5Hb4Go_DsNBuw"
getstats_video<-function(video_id,API_key){
url=paste0("https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id=",video_id,"&key=",API_key)
result <- fromJSON(txt=url)
salida=list()
return(data.frame(name=result$items$snippet$channelTitle, result$items$statistics,title=result$items$snippet$title,date=result$items$snippet$publishedAt,descrip=result$items$snippet$description))
}
get_playlist_canal <- function(id, API_key, topn = 15){
  # Retrieve up to `topn` items from a channel's playlist
  url <- paste0('https://www.googleapis.com/youtube/v3/playlistItems?part=contentDetails&playlistId=', id, '&key=', API_key, '&maxResults=', topn)
  result <- fromJSON(txt = url)
  return(data.frame(result$items$contentDetails))
}
getstats_canal <- function(id, API_key){
  # Query the API for a channel's snippet, content details and statistics
  url <- paste0('https://www.googleapis.com/youtube/v3/channels?part=snippet%2CcontentDetails%2Cstatistics&id=', id, '&key=', API_key)
  result <- fromJSON(txt = url)
  return(data.frame(name = result$items$snippet$title, result$items$statistics, pl_list_id = result$items$contentDetails$relatedPlaylists))
}
getall_channels <- function(ids, API_key, topn = 5){
  # For each channel ID, list its videos and collect their statistics
  videos <- lapply(ids, FUN = get_playlist_canal, API_key = API_key, topn = topn) %>% bind_rows()
  stats <- lapply(videos[, 1], FUN = getstats_video, API_key = API_key)
  stats <- bind_rows(stats)
  stats$vid_id <- videos[, 1]
  return(stats)
}
Getting information about each YouTube channel and each video into tables.
require(curl)
require(jsonlite)
library(kableExtra)
library(dplyr)
library(ggplot2)
library(plotly)
library(reshape2)
can_st <- lapply(youtubevideo$link, FUN = getstats_video, API_key = API_key)
can_st <- bind_rows(can_st)
can_sa <- lapply(channelid$id, FUN = getstats_canal, API_key = API_key)
can_sa <- bind_rows(can_sa)
Renaming the tables for readability.
channels <- can_sa
videos <- can_st
comments <- youtubeData  # the comment table collected earlier with vosonSML
Saving the data tables.
write.csv(channels, file = "channels.csv")
write.csv(videos, file = "videos.csv")
write.csv(comments, file = "comments.csv")
Opening the tables.
comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
videos <- read.csv("videos.csv", encoding="IBM866", stringsAsFactors=FALSE)
channel <- read.csv("channels.csv", encoding="IBM866", stringsAsFactors=FALSE)
Statistics on YouTube videos.
videos$viewCount = as.numeric(videos$viewCount)
videos$likeCount = as.numeric(videos$likeCount)
videos$dislikeCount = as.numeric(videos$dislikeCount)
videos$commentCount = as.numeric(videos$commentCount)
videos2 = videos
library(boot)
library(htmlTable)
library(dplyr)
library(table1)
label(videos2$viewCount) <- "Number of views"
label(videos2$likeCount) <- "Number of likes"
label(videos2$dislikeCount) <- "Number of dislikes"
label(videos2$commentCount) <- "Number of comments"
table1(~ viewCount + likeCount + dislikeCount + commentCount, data=videos2, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=100) |
|---|---|
| Number of views | |
| Mean (SD) | 95900 (139000) |
| Median [Min, Max] | 44900 [2430, 731000] |
| Number of likes | |
| Mean (SD) | 5070 (6790) |
| Median [Min, Max] | 2870 [201, 35700] |
| Missing | 2 (2.0%) |
| Number of dislikes | |
| Mean (SD) | 172 (201) |
| Median [Min, Max] | 81.0 [1.00, 874] |
| Missing | 2 (2.0%) |
| Number of comments | |
| Mean (SD) | 486 (640) |
| Median [Min, Max] | 261 [28.0, 4020] |
The table above summarizes the main statistics collected for all the videos. On average, the selected vlog videos gain 95,900 views, with a minimum of 2,430 and a maximum of 731,000. Videos with more than one million views were deliberately excluded at the selection stage so that the sample is not biased towards popular videos; such videos would otherwise be outliers. Likes and dislikes each have 2% missing data, meaning that no user rated those videos; this is typical of less popular videos. In general, videos tend to receive more likes than dislikes, which is discussed later. Each video has 486 comments on average, which allows us to evaluate the distribution of positive and negative reviews within each one. There are no videos without comments (no missing data), since only videos with more than 50 comments were selected manually (some vlogs had only 5-10 comments, which could have distorted the results of the text analysis).
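As an illustration only (the actual screening was done by hand), the stated criteria could be expressed as a dplyr filter; `shortlist` is a hypothetical name and the thresholds are the ones named in the text.
# Hypothetical sketch of the selection criteria described above:
# fewer than one million views and more than 50 comments
shortlist <- videos %>%
  filter(viewCount < 1e6, commentCount > 50)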
Statistics on YouTube channels.
channel$viewCount = as.numeric(channel$viewCount)
channel$subscriberCount = as.numeric(channel$subscriberCount)
channel$videoCount = as.numeric(channel$videoCount)
label(channel$viewCount) <- "Number of views"
label(channel$subscriberCount) <- "Number of subscribers"
label(channel$videoCount) <- "Number of videos"
table1(~ viewCount + subscriberCount + videoCount, data=channel, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=72) |
|---|---|
| Number of views | |
| Mean (SD) | 76500000 (173000000) |
| Median [Min, Max] | 20300000 [8680, 981000000] |
| Number of subscribers | |
| Mean (SD) | 438000 (942000) |
| Median [Min, Max] | 142000 [729, 6750000] |
| Missing | 1 (1.4%) |
| Number of videos | |
| Mean (SD) | 451 (480) |
| Median [Min, Max] | 317 [6.00, 2790] |
Among all the selected videos, some authors appeared 2-3 times (occasionally several videos that fit the selection criteria came from the same author; such videos were not excluded), so there are 72 unique authors. The total view count is not an especially informative indicator, but the number of subscribers does measure a channel's popularity. On average, the selected channels have 438,000 subscribers, ranging from very small channels to large ones with more than a million subscribers.
Statistics on YouTube comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
clean <- final
library(dplyr)
class(clean) <- 'data.frame'
clean$Comment = as.character(clean$Comment)
clean <- clean %>%
mutate(new_col = sapply(strsplit(clean$Comment, " "), length))
label(clean$ReplyCount) <- "Number of replies"
label(clean$LikeCount) <- "Number of likes"
label(clean$new_col) <- "Length of the comment"
table1(~ ReplyCount + LikeCount + new_col, data=clean, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=44211) |
|---|---|
| Number of replies | |
| Mean (SD) | 0.292 (1.77) |
| Median [Min, Max] | 0 [0, 151] |
| Number of likes | |
| Mean (SD) | 7.20 (79.4) |
| Median [Min, Max] | 0 [0, 5980] |
| Length of the comment | |
| Mean (SD) | 11.3 (14.7) |
| Median [Min, Max] | 7.00 [0, 489] |
The table shows that users do not actively reply to each other's comments, although one comment received 151 replies. On average, a comment has 7 likes. Comment length varies from 0 to 489 words, with an average of 11 words; comments are therefore usually short. The single comment with zero words will be removed later.
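A minimal sketch of that cleanup step, assuming the `new_col` word count computed above:
# Drop the single comment with zero words (new_col holds the word count)
clean <- clean %>% filter(new_col > 0)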
Statistics on YouTube comments per user
clean1 <- clean[, c(1,3,4,12)]
clean1$Comment = as.factor(clean1$Comment)
clean1 <- clean1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
clean1$AuthorDisplayName = as.character(clean1$AuthorDisplayName)
clean2 = clean1
new_df <- clean2[-which(duplicated(clean2$AuthorDisplayName)), ]
label(new_df$count) <- "Number of comments per user"
table1(~ count, data=new_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=28189) |
|---|---|
| Number of comments per user | |
| Mean (SD) | 1.57 (4.08) |
| Median [Min, Max] | 1.00 [1.00, 348] |
As we can see, there are only 28,189 unique users, although there are almost twice as many comments. This means that some users comment more than once: on average, each person leaves about one and a half comments, but the maximum from a single user is 348. This is likely explained by the repeated video authors: they may have regular subscribers who support the channel and comment frequently.
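To check this, a quick sketch listing the most active commenters (assumes `new_df` from the previous chunk, one row per unique AuthorDisplayName):
# The top authors by number of comments left
new_df %>%
  arrange(desc(count)) %>%
  select(AuthorDisplayName, count) %>%
  head(5)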
Loading required packages.
#install.packages("corpus")
library(tidyverse)
library(tidytext)
library(stringr)
library(corpus)
Loading data.
df <- read.csv(file = 'comments.csv')
comments <- df[, 2]
# Also load the dictionary of sentiment-scored words for Russian,
# on the basis of which the comments will be classified
russian_dictionary <- read.csv(file = 'russian.csv', encoding='UTF-8')
# Adding pairs to the dictionary to 'classify' emojis
# To do this, download the source dictionary
emo_dictionary <- read.csv(file = 'emo.csv', encoding='UTF-8')
# Rename the columns we need to match the format of 'russian.csv'
names(emo_dictionary)[names(emo_dictionary) == 'Sentiment.score..1....1.'] <- 'score'
names(emo_dictionary)[names(emo_dictionary) == 'Char'] <- 'word'
# Select only the two columns we need - 'word' and 'score'
emo_dictionary <- emo_dictionary[, c(2, 10)]
emo_dictionary$word <- gsub('<U\\+', '', emo_dictionary$word)
emo_dictionary$word <- gsub('>', '', emo_dictionary$word)
russian_dictionary$word <- gsub('<U\\+', '', russian_dictionary$word)
russian_dictionary$word <- gsub('>', '', russian_dictionary$word)
emo_dictionary <- lapply(emo_dictionary, iconv, "UTF-8", "UTF-8", sub='00000')
emo_dictionary <- as.data.frame(emo_dictionary)
# Keep only entries whose cleaned code is exactly 8 characters long
ps <- data.frame(names=emo_dictionary$word, chr=apply(emo_dictionary,2,nchar)[,1])
emo_dictionary$size <- ps$chr
emo_dictionary <- emo_dictionary[(emo_dictionary$size == 8),]
emo_dictionary$size <- NULL
# And append the emoji dictionary to the original one
russian_dictionary <- rbind(russian_dictionary, emo_dictionary)
Tokenization
tokens <- tibble(line = 1:length(comments), text = as.character(comments)) %>% unnest_tokens(word, text)
Stemming
tokens$word <- text_tokens(tokens$word, stemmer = 'rus')
russian_dictionary$word <- text_tokens(russian_dictionary$word, stemmer = 'rus')
Performing sentiment analysis: getting the emotional coloring of each comment as a single number.
# We use the dictionary to find the words with an emotional connotation
# in each comment, and then average the scores of the matched words
# to obtain a single value per comment
#result <- tokens %>%
# inner_join(russian_dictionary, by = 'word') %>%
# group_by(line) %>%
# summarise(sentiment = mean(score))
result <- tokens %>%
inner_join(russian_dictionary, by = 'word') %>%
group_by(line)
result$score <- as.numeric(result$score)
result <- result %>% group_by(line) %>%
summarise(sentiment = mean(score))
Creating the resulting dataset. After "coloring" each comment, we get "bad" and "good" comments; comments in which not a single word from the dictionary was found are treated as "neutral".
# First, for each comment we get the resulting sentiment value, a number that characterizes its polarity
df <- rename(df, line = X)
final_df <- df %>%
left_join(result, by = 'line')
final_df[is.na(final_df)] <- 0
# Then we classify positive sentiment as positive comments, negative sentiment as negative, and zero as neutral
map_sentiment_to_label <- function(row) {
  # Column 14 holds the sentiment score produced by the join above
  if (as.numeric(row[14]) > 0) {
    return('positive')
  } else if (as.numeric(row[14]) < 0) {
    return('negative')
  } else {
    return('neutral')
  }
}
final_df$label <- apply(final_df, 1, map_sentiment_to_label)
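# A vectorized alternative (a sketch; assumes the 'sentiment' column created
# by the join above is what row index 14 points to). It produces the same
# labels without apply() and without the fragile column index:
final_df$label <- ifelse(final_df$sentiment > 0, 'positive',
                         ifelse(final_df$sentiment < 0, 'negative', 'neutral'))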
# Saving data
write.csv(final_df, 'final_df.csv')
Viewing results. Let us take a few comments and see how the algorithm marked them up.
COUNT <- 3
print('[Positive comments]:')
## [1] "[Positive comments]:"
paste(final_df[final_df$label == 'positive', ][, 2][100:(101+COUNT)])
## [1] "За 7 миллионов намного круче и современней"
## [2] "Это просто готовый бизнес. Гостиница для отдыхающих."
## [3] "Выпей аспирин,он убирет немного боли. И 2 дня отдыхай. А то нарвешь мускулы. Осторожненька,Катя"
## [4] "Дом, в котором сейчас вы живёте намного лучше этих за 5 и 7 миллионов"
## [5] "Второй дом <U+0001F44D><U+0001F44D><U+0001F44D>"
print('')
## [1] ""
print('[Negative comments]:')
## [1] "[Negative comments]:"
paste(final_df[final_df$label == 'negative', ][, 2][100:(101+COUNT)])
## [1] "У Ванички прям голос прорезался если он что то хочет <U+0001F92D>очень его балуют не есть хорошо"
## [2] "Катюша лук не обрезай сажай как есть"
## [3] "Первый дом на скале- это не для проживания. Это больше подходит под гостиницу."
## [4] "Ребята, ну тут только вид и расположение чего стоят. А выбирать по принципу красивой мебели внутри...можно выкинуть старую и купить свою новую. В том доме просто определенный стиль, который вам не близок и это не значит, что он плохой:))"
## [5] "Вид конечно огонь <U+0001F525><U+0001F44D>не понимаю зачем вам 3 им большой дом<U+0001F937><U+200D>+<U+FE0F>"
print('')
## [1] ""
print('[Neutral comments]:')
## [1] "[Neutral comments]:"
paste(final_df[final_df$label == 'neutral', ][, 2][100:(101+COUNT)])
## [1] "*очень дорого*"
## [2] "Да, полностью согласна"
## [3] "5 миллионов\210 это в российских рублях около 450 миллионов"
## [4] "*скорее всего так*"
## [5] "*очень дорого! очень!*"
It is noticeable that the algorithm relies on words such as "very" when coloring positive comments. Some comments carry a double meaning ("I like it, but…" etc.), and the algorithm cannot account for such subtleties. Overall, positive comments were for the most part classified correctly.
Comments that contained no emotionally charged words, or that posed a question to the video's author, were correctly classified as neutral.
Negative comments were identified more reliably than positive ones: while some positive comments were occasionally marked as negative, negative comments were not marked as positive. Above are five examples of negative comments that were identified correctly.
Statistics on sentiment of the comments.
final <- read.csv("final_df.csv", encoding="IBM866", stringsAsFactors=FALSE)
final_df <- final[, c(3,16)]
final_df$label = as.factor(final_df$label)
label(final_df$label) <- "Sentiment"
table1(~ label, data=final_df, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=44211) |
|---|---|
| Sentiment | |
| negative | 11416 (25.8%) |
| neutral | 10763 (24.3%) |
| positive | 22032 (49.8%) |
There are more positive comments than neutral or negative ones. However, the method used is not perfect and cannot classify every comment correctly, due to users' grammatical errors and other subtleties of the Russian language. Initially, only 7,654 comments were labelled negative, because the first version of the algorithm ignored the particle "not" before positive words. After the algorithm was refined to handle negation, the results changed accordingly. It is still worth noting that the algorithm cannot detect sarcasm or other features of Russian speech.
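A minimal sketch of the negation refinement mentioned above (an assumed implementation, not necessarily the exact one used; it also assumes `word` is a plain character column): flip a word's score when the preceding token in the same comment is the particle "не" ("not").
# Mark tokens preceded by the negation particle within the same comment
tokens_neg <- tokens %>%
  group_by(line) %>%
  mutate(negated = dplyr::lag(word, default = "") == "не") %>%
  ungroup()
# Invert the dictionary score of negated words before averaging per comment
result_neg <- tokens_neg %>%
  inner_join(russian_dictionary, by = 'word') %>%
  mutate(score = ifelse(negated, -as.numeric(score), as.numeric(score))) %>%
  group_by(line) %>%
  summarise(sentiment = mean(score))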
Counting the number of words in each name.
library(dplyr)
#comments <- read.csv("comments.csv", encoding="IBM866", stringsAsFactors=FALSE)
comments <- final
comments <- comments[!grepl("спам",comments$Comment),]  # drop comments flagged as spam
comments <- comments[-c(35669), ]  # drop the single zero-length comment found earlier
class(comments) <- 'data.frame'
comments$AuthorDisplayName = as.character(comments$AuthorDisplayName)
comments <- comments %>%
mutate(new_col = sapply(strsplit(comments$AuthorDisplayName, " "), length))
#comments$new_col = ifelse(comments$"new_col">2,"Anonymous","Full name")
Looking at the capital letters in the first names.
comments_1 <- comments[, c(3, 4, 5, 8, 9, 14, 16, 17)]
#comments_1 <- comments[, c(3, 14)]
library(stringr)
comments_1<- comments_1 %>%
mutate(letter = str_detect(comments_1$AuthorDisplayName,"[[:upper:]]"))
Determining if a string has a dot.
comments_1 <- comments_1 %>%
mutate(dot = letter)
comments_1$dot = str_detect(comments_1$AuthorDisplayName, "\\.")
Searching for duplicated words in names (e.g. Lena Lena).
comments_1 <- comments_1 %>%
mutate(NameUnique = AuthorDisplayName)
comments_1$NameUnique <- sapply(comments_1$NameUnique, function(x) paste(unique(unlist(str_split(x," "))), collapse = " "))
Counting the number of words in each name again, based on the previous step.
comments_1$new_col <- sapply(strsplit(comments_1$NameUnique, " "), length)
Dividing names into separate columns to count the number of letters in the first and last name.
comments_1 = comments_1 %>%
mutate(DividedName = AuthorDisplayName)
out <- strsplit(as.character(comments_1$DividedName),' ')
# Note: do.call(rbind, ...) recycles shorter name vectors to the longest length
comments_1 <- data.frame(comments_1, do.call(rbind, out))
comments_1 <- comments_1[, c(1:14)]
Counting the number of letters in each string.
comments_1 <- comments_1 %>%
mutate(count = str_count(comments_1$X1, ""))
comments_1 <- comments_1 %>%
mutate(count1 = str_count(comments_1$X2, ""))
Looking for names which have symbols, numbers and other characters.
comments_1 <- comments_1 %>%
mutate(unrecog = str_detect(comments_1$AuthorDisplayName, "\\_|\\*|\\-|\\#|\\[|\\]|\\>|\\<|\\?"))
comments_1 <- comments_1 %>%
mutate(numbers = grepl("[0-9]", comments_1$AuthorDisplayName))
Looking for names that have capital letters in first and last names.
library(stringr)
comments_1<- comments_1 %>%
mutate(letter1 = str_detect(comments_1$X1,"[[:upper:]]"))
comments_1<- comments_1 %>%
mutate(letter2 = str_detect(comments_1$X2,"[[:upper:]]"))
Making a condition for identifying anonymous users.
comments_1 <- comments_1 %>%
  mutate(Anonymity = AuthorDisplayName)
# A name counts as anonymous if it has one word or more than two, lacks a
# capital letter in either part, contains a dot, has a very short part,
# or contains digits or special characters
comments_1$Anonymity <- ifelse(
  (comments_1$new_col > 2) | (comments_1$new_col == 1) |
  (comments_1$letter1 == "FALSE") | (comments_1$letter2 == "FALSE") |
  (comments_1$dot == "TRUE") |
  (comments_1$count < 3) | (comments_1$count1 < 3) |
  (comments_1$unrecog == "TRUE") | (comments_1$numbers == "TRUE"),
  "Anonymous", "Full name")
anonymous <- comments_1
anonymous <- anonymous[, c(1,2,3,4,5,6,7,21)]
write.csv(anonymous, file = "anonymity.csv")
Statistics based on names.
anon <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
anon$Anonymity = as.factor(anon$Anonymity)
label(anon$Anonymity) <- "Name"
table1(~Anonymity, data = anon, overall="Total", topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Total (N=44075) |
|---|---|
| Name | |
| Anonymous | 16380 (37.2%) |
| Full name | 27695 (62.8%) |
After deleting empty observations and spam, 44,075 unique comments remain. Overall, the data contain more non-anonymous comments (62.8%) than anonymous ones (37.2%). However, unique comments do not reflect the number of unique users, since some users left several messages.
Identifying unique users
anon1 <- anon
anon1 <- anon1 %>% group_by(AuthorDisplayName) %>% mutate(count = n())
anon1$AuthorDisplayName = as.character(anon1$AuthorDisplayName)
anon2 = anon1
new_df2 <- anon2[-which(duplicated(anon2$AuthorDisplayName)), ]
label(new_df2$count) <- "Authors of comments"
table1(~ Anonymity, data=new_df2, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Overall (N=28186) |
|---|---|
| Name | |
| Anonymous | 10092 (35.8%) |
| Full name | 18094 (64.2%) |
The table shows the distribution of anonymous and non-anonymous users. Notably, among all unique users (N=28186), the split between anonymous users and users with a full name (35.8% and 64.2%, respectively) is almost the same as in the total number of comments.
anonymity <- read.csv("anonymity.csv", encoding="IBM866", stringsAsFactors=FALSE)
# Create the barplot
library(ggplot2)
ggplot(data=anonymity, aes(x=label, fill=Anonymity)) +
  geom_bar() +
  theme_minimal()
label(anonymity$label) <- "Comment"
table1(~label|Anonymity, data = anonymity, topclass="Rtable1-grid Rtable1-shade Rtable1-times")

|  | Anonymous (N=16380) | Full name (N=27695) | Overall (N=44075) |
|---|---|---|---|
| Comment | | | |
| negative | 4346 (26.5%) | 7067 (25.5%) | 11413 (25.9%) |
| neutral | 4213 (25.7%) | 6547 (23.6%) | 10760 (24.4%) |
| positive | 7821 (47.7%) | 14081 (50.8%) | 21902 (49.7%) |
With the sentiment analysis and anonymity classification done, we can look at the distribution of comment sentiment by user-name type. Judging by the percentages, negative comments are published somewhat more often by anonymous users, while positive comments come more often from users who give a first and last name.
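That claim can be checked directly with within-group shares (a sketch; `margin = 2` gives proportions within each anonymity column):
# Sentiment shares within the Anonymous and Full name groups
round(prop.table(table(anonymity$label, anonymity$Anonymity), margin = 2), 3)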
library(dplyr)
popularity <- videos %>%
mutate(VideoID = c("exSKCByyLns", "w9It9PEK-kM", "Oe7U5rcaq1o", "FLkkmjQ7WX0", "T80MK6EKJT4", "7h4ZeIxc1pI", "prb9d6fsnHE", "FLkkmjQ7WX0", "b_1Hn6wwXpA", "ARUI9VhxNA0", "EZTByPrgfLI", "GlpiBkS4SJA", "Qaeqc5ikxeY", "v5uqya_nJj0", "nUdQIQWZGVA", "iT9QDUhS0dU", "1tE8HEgDmoc", "DIz0VX72-Bc", "Yl793FoyJJE", "uKby_huf-aA", "nCuZLsgPXuM", "52TT8o9vW8A", "QqZ0GLUF6yo", "plAdaVWzglc", "vCGuJ5z8LYY", "xRD2VdNUmxI", "7wwkItW2Gjw", "1VmVw0ALm_o", "Lik5G-gFQk4", "5YhoSIH3lPQ", "kIHTQlfBOJo", "F8ECpUKi5r4", "-7RqgG_NsI4", "Lls-ch1k7xg", "JSR1rwjZlX4", "xhFS89hgp_c", "36ayp2jckS0", "kk4aEC-YppA", "hRTIBqaOlus", "_NXEMh-Fxf4", "LTh0DnN6FDE", "ACwKesRVfUk", "xJpYCte7JcE", "e-henB30X4c", "DIaw4UV1nA4", "1AX_u2poAN4", "ONEtXBfx7oo", "r2FGVMvbWwE", "BRE8ibnX4MQ", "a_HkKhyPaSM", "eVjZ1lyoAOs", "IjpWeQUqMWM", "rwowDSLDzFI", "qlXp8Ppw-gs", "065kBCEtPoU", "NANnST5vxCo", "VJptzBWLCLo", "PE416IH8wws", "M4ZUU553AEA", "FZo2_AXGcD4", "uEW6CWgAMMM", "6fLBa58QgFU", "JaaZuyzNUi4", "CMr3q8u1ymg", "32ZMAZr1z_E", "wDWt3Ss9rNs", "ta6iiWcT_W0", "EJJrDbAmHOU", "aNKJpR0bBEE", "LqVY71LEAPI", "xRD2VdNUmxI", "g-jTu62gDr0", "EpVzqul5Arg", "P1nI5jnVaQM", "S2JdrBvVGSo", "yKRX2gNjtv4", "rPXB_uF-2cI", "U70aqExQkUA", "JJ0LQsAvnB8", "XnF8husX3Ew", "o3zNPtYlGSQ", "r-WbRBmURf4", "wG-DbNbMBkw", "aSepCt7Dx-s", "0Tz-RF9IxJE", "ErlV9j2BmyU", "eVjZ1lyoAOs", "qlXp8Ppw-gs", "NANnST5vxCo", "UF8HZRk7DaY", "xIrvh6BNkh8", "SkAZdW_GQ0s", "R_P8X40oNvA", "MtInFBn33M0", "xP3XMDJMT-k", "ZJiYz2UYSXI", "3Af4st1kRqA", "krHqPzS3nVo", "01SdKgQePsI", "oo46uKNP3gc")
)
write.csv(popularity, "popularity.csv")
Loading libraries
library(readxl)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(stringr)
library(tidyr)
Loading data
anonymityfinal <- read_excel("anonymityfinal1.xlsx")
popularity <- read.csv(file="popularity.csv")
Checking for duplicated video records.
paste("Number of videos:", length(popularity$VideoID))## [1] "Number of videos: 100"
paste("Number of unique videos:", length(unique(popularity$VideoID)))## [1] "Number of unique videos: 95"
Selecting duplicates and deleting them from the data
duplicates <- subset(popularity,duplicated(popularity$VideoID))
popularity <- popularity[!(popularity$X %in% duplicates$X),]
Creating a merged dataset. At this stage, the video-level information is added to the comments by joining on VideoID (note that `all = TRUE` in merge() performs a full outer join, not an inner join).
df <- merge(x = anonymityfinal, y = popularity, by = "VideoID", all = TRUE)
Checking relationships. Before running the tests, we fix the significance level at alpha = 0.05.
Let us assume that anonymity gives users a sense of "freedom of action", so there should be noticeably more negative anonymous comments than non-anonymous negative ones.
H0: There is no relationship between comment sentiment and anonymity. H1: The relationship exists.
Graphical representation of the distribution
df %>%
ggplot(aes(Anonymity, fill = label))+
geom_bar() + labs(title="Label distribution across anonymity")
We see that there are fewer anonymous comments than non-anonymous ones, but the distribution of sentiment within each group looks visually similar: positive comments are the most frequent, with negative and neutral occurring at roughly equal rates.
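Because the groups differ in size, a proportion version of the same plot (a sketch) makes the shape comparison easier:
df %>%
  ggplot(aes(Anonymity, fill = label)) +
  geom_bar(position = "fill") +  # stack to 100% within each group
  labs(y = "proportion", title = "Label shares within each group")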
Contingency table
selected_table_df <- df[, c(7, 8)]
knitr::kable(table(selected_table_df))

|  | Anonymous | Full name |
|---|---|---|
| negative | 4357 | 7006 |
| neutral | 4255 | 6554 |
| positive | 7847 | 14056 |
Contingency table (as a percentage)
knitr::kable(prop.table(table(selected_table_df)))

|  | Anonymous | Full name |
|---|---|---|
| negative | 0.0988542 | 0.1589563 |
| neutral | 0.0965400 | 0.1487011 |
| positive | 0.1780374 | 0.3189109 |
The tables generally support the visual analysis: positive comments dominate in both groups, and most of them come from non-anonymous users.
Chi-square test
x_test <- chisq.test(table(selected_table_df))
x_test
##
## Pearson's Chi-squared test
##
## data: table(selected_table_df)
## X-squared = 45.296, df = 2, p-value = 1.459e-10
In our example, the variables are statistically significantly related (the p-value is close to zero, far below the chosen level), so we reject the null hypothesis of independence in favour of the alternative.
Observed frequencies
x_test$observed
## Anonymity
## label Anonymous Full name
## negative 4357 7006
## neutral 4255 6554
## positive 7847 14056
Conclusions: the commenter's anonymity is statistically related to the type of comment, but largely because there are simply more non-anonymous comments than anonymous ones (if we encounter a positive comment, the user most likely did not hide their name). The distribution of comment types within each group is roughly the same, so our initial assumption turned out to be incorrect.
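One way to see which cells drive the significant result is the standardized residuals stored in the test object (a sketch; values far from zero mark over- or under-represented cells):
# Standardized Pearson residuals of the chi-square test
round(x_test$stdres, 2)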
We assume that the more views, the more negative comments.
Visual representation of the data
ggboxplot(df, x = "label", y = "viewCount",
color = "label", palette = c("#00AFBB", "#E7B800", "#FC4E07"))Logistic regression model
df$is_positive_label <- factor(df$label == 'positive')
model <- glm(is_positive_label ~ viewCount, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = is_positive_label ~ viewCount, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.218 -1.187 -1.061 1.166 1.299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.547e-02 1.355e-02 7.046 1.83e-12 ***
## viewCount -5.143e-07 4.609e-08 -11.159 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 61099 on 44074 degrees of freedom
## Residual deviance: 60974 on 44073 degrees of freedom
## AIC: 60978
##
## Number of Fisher Scoring iterations: 3
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "TRUE", "FALSE")
paste("Accuracy:", round(mean(predicted.classes == df$is_positive_label), 2)*100, "%")## [1] "Accuracy: 54 %"
Running the one-way ANOVA test
summary(aov(viewCount ~ label, data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## label 2 1.362e+13 6.811e+12 158.6 <2e-16 ***
## Residuals 44072 1.893e+15 4.295e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value is below the significance level of 0.05, we conclude that mean view counts differ significantly across comment types.
Conclusions: the average number of views of a video is statistically related to the types of comments users leave; on the most-viewed videos, comments are often neutral, so our original hypothesis was not confirmed. In addition, the data could not be usefully split into "positive" and "non-positive" comments: the accuracy was only slightly above 50 percent, comparable to random guessing, i.e. flipping a coin.
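For reference, a sketch of the majority-class baseline the 54% accuracy should be compared against:
# Accuracy of always predicting the more frequent class
paste("Baseline:", round(max(prop.table(table(df$is_positive_label))), 2) * 100, "%")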
We assume that the more comments, the more negative comments.
Visual representation of the data
ggboxplot(df, x = "label", y = "commentCount",
color = "label", palette = c("#00AFBB", "#E7B800", "#FC4E07"))Logistic regression model
df$is_positive_label <- factor(df$label == 'positive')
model <- glm(is_positive_label ~ commentCount, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = is_positive_label ~ commentCount, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.275 -1.156 -1.125 1.197 1.232
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.311e-01 1.463e-02 -8.959 <2e-16 ***
## commentCount 8.888e-05 8.305e-06 10.702 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 61099 on 44074 degrees of freedom
## Residual deviance: 60984 on 44073 degrees of freedom
## AIC: 60988
##
## Number of Fisher Scoring iterations: 3
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "TRUE", "FALSE")
paste("Accuracy:", round(mean(predicted.classes == df$is_positive_label), 2)*100, "%")## [1] "Accuracy: 52 %"
Running the one-way ANOVA test
summary(aov(commentCount ~ label, data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## label 2 4.898e+08 244922056 186.3 <2e-16 ***
## Residuals 44072 5.793e+10 1314394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value is below the significance level of 0.05, we conclude that the mean number of comments per video differs significantly across comment types.
Conclusions: the average number of comments left on a video is statistically related to the types of comments users leave. However, it turns out that on popular videos comments are most often neutral rather than positive, so again our assumption was not confirmed.
We assume that long comments usually carry a strong emotional charge, while neutral comments are usually quite short.
First we calculate the length of each comment
df$CommentLength <- str_count(df$Comment)
df <- df %>% drop_na()
Visual representation of the data
ggboxplot(df, x = "label", y = "CommentLength",
color = "label", palette = c("#00AFBB", "#E7B800", "#FC4E07"))Getting average values
result <- df %>%
group_by(label) %>%
dplyr::summarize(Mean = mean(CommentLength, na.rm=TRUE))
Logistic regression model
Let’s try to predict by the length of the comment whether it is positive/negative
df$is_positive_label <- factor(df$label == 'positive')
model <- glm(is_positive_label ~ CommentLength, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = is_positive_label ~ CommentLength, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.332 -1.119 -1.064 1.204 1.297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2795116 0.0133591 -20.92 <2e-16 ***
## CommentLength 0.0032937 0.0001204 27.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60896 on 43928 degrees of freedom
## Residual deviance: 59958 on 43927 degrees of freedom
## AIC: 59962
##
## Number of Fisher Scoring iterations: 4
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "TRUE", "FALSE")
paste("Accuracy:", round(mean(predicted.classes == df$is_positive_label), 2)*100, "%")## [1] "Accuracy: 57 %"
Running the one-way ANOVA test
summary(aov(CommentLength ~ label, data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## label 2 35680087 17840043 1385 <2e-16 ***
## Residuals 43926 565876078 12882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value is below the significance level of 0.05, we conclude that average comment length differs significantly across comment types.
Conclusions: even visually, the plot shows that, first, the data contain many outliers (exceptionally long comments exist in every category), and these outliers are longest for positive comments, slightly shorter for neutral ones, and shorter still for negative ones. The averages differ too: a neutral comment is about 33 characters long on average, while positive and negative comments average around 100 characters, and these means are statistically distinguishable across the groups. We can therefore conclude that our initial assumption is correct: emotionally charged comments really are longer than neutral ones.
We assume that long comments are left, as a rule, by anonymous users, since anonymity may "liberate" a person, allowing them to write more and in more detail.
Visual representation of the data
ggboxplot(df, x = "Anonymity", y = "CommentLength",
color = "Anonymity", palette = c("#00AFBB", "#E7B800", "#FC4E07"))Getting average values
result_anonymity <- df %>%
group_by(Anonymity) %>%
dplyr::summarize(Mean = mean(CommentLength, na.rm=TRUE))
Logistic regression model
df$Anonymity <- as.factor(df$Anonymity)
model <- glm(Anonymity ~ CommentLength, data = df, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Anonymity ~ CommentLength, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4762 -1.4018 0.9672 0.9684 0.9691
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.120e-01 1.216e-02 42.119 <2e-16 ***
## CommentLength 4.298e-05 8.481e-05 0.507 0.612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 58073 on 43928 degrees of freedom
## Residual deviance: 58073 on 43927 degrees of freedom
## AIC: 58077
##
## Number of Fisher Scoring iterations: 4
probabilities <- model %>% predict(df, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "Full name", "Anonymous")
paste("Accuracy:", round(mean(predicted.classes == df$Anonymity), 2)*100, "%")## [1] "Accuracy: 63 %"
Running the one-way ANOVA test
summary(aov(CommentLength ~ Anonymity, data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## Anonymity 1 3519 3519 0.257 0.612
## Residuals 43927 601552646 13694
Since the p-value (0.612) is well above the significance level of 0.05, we conclude that average comment length does not differ significantly between anonymous and non-anonymous users.
Conclusions: although the model predicted the user's anonymity type from comment length with 63% accuracy on the training data, this is almost exactly the share of "Full name" users in the sample, so the model is essentially predicting the majority class. Moreover, the statistical test shows that the average comment lengths of anonymous and non-anonymous users do not differ. We can therefore conclude that anonymity does not affect the number of characters in a comment, and our initial assumption was not confirmed.
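A sketch supporting that reading: the share of the majority class is itself close to the model's 63% accuracy, so the model adds almost nothing over always guessing "Full name".
# Majority-class baseline for the anonymity model
paste("Baseline:", round(max(prop.table(table(df$Anonymity))), 2) * 100, "%")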