Comments were collected mid May 2015 using the PRAW Python library, by downloading all comments for the latest one thousand topics available (Reddit limit) for /r/belgium, /r/Denmark, /r/germany, /r/italy, /r/Switzerland and /r/thenetherlands.

I did not really validate the data, results, etc., so interpret at your own risk.

1 Setup

library(plyr) # for mapvalues()
library(ggplot2)
library(ineq)

1.1 Import and clean comment data

# Load the raw comment export. Strings are read as factors explicitly, so that
# summary() keeps printing per-level counts regardless of the R version (the
# read.csv() default for stringsAsFactors changed to FALSE in R 4.0.0).
comments.raw <- read.csv(
  '/home/mhermans/comments_full.csv',
  stringsAsFactors = TRUE)

comments <- comments.raw
comments$X <- NULL # drop useless row id nrs

#length(comments$comment_id)
#length(unique(comments$comment_id)) # duplicates? accidental re-crawl?
comments <- comments[!duplicated(comments$comment_id), ] # drop duplicates

# epoch time to Date object; the time zone is pinned to UTC in both steps so
# the resulting calendar day does not depend on the local time zone or on the
# R version's as.Date() default
comments$date <- as.Date(
  as.POSIXct(comments$date, origin = "1970-01-01", tz = "UTC"),
  tz = "UTC")
# flag comments with a score of 0 or lower as "negative" (1), all others as 0
comments$negative <- ifelse(comments$score <= 0, 1, 0)

head(comments)

##   comment_id comment_len       date gilded score subreddit_url
## 1    cq7lgbs         264 2015-04-10      0    24   /r/belgium/
## 2    cqidwu0         534 2015-04-20      0     3   /r/belgium/
## 3    cr05xwu          92 2015-05-06      0     1   /r/belgium/
## 4    cq7lhcx         120 2015-04-10      0     3   /r/belgium/
## 5    cq7mig8          31 2015-04-10      0    10   /r/belgium/
## 6    cqidxlm          46 2015-04-20      0     5   /r/belgium/
##      user_name negative
## 1    Chimiel82        0
## 2  dj-shortcut        0
## 3 Ask_The_Dust        0
## 4         inxi        0
## 5        SK2P1        0
## 6  dj-shortcut        0

1.2 Construct user-level dataset

# One row per user, one column per subreddit; cells hold the number of
# comments that user posted in that subreddit.
users <- as.data.frame.matrix(table(comments$user_name, comments$subreddit_url))
subreddits <- colnames(users)

# Subreddit in which each user commented most (first column wins on ties).
# Indexing `subreddits` by the position of the row maximum works for any
# number of subreddits, instead of assuming exactly six column positions.
users$top_subreddit <- subreddits[
  max.col(as.matrix(users[, subreddits, drop = FALSE]), ties.method = "first")]
# drop = FALSE keeps a data frame even if there is only a single subreddit
users$comments_total <- rowSums(users[, subreddits, drop = FALSE])
users$user_name <- rownames(users)
rownames(users) <- NULL

# number of negative comments per user
downvotes_user <- aggregate(
  negative ~ user_name,
  data = comments, FUN = sum)

users <- merge(users, downvotes_user, by = 'user_name')
rm(downvotes_user)

# rename the merged count and derive the share of negative comments per user
users$comments_neg_frq <- users$negative
users$negative <- NULL
users$comments_neg_prop <- users$comments_neg_frq / users$comments_total

# per-user mean comment score and mean comment length
meanscore_user <- aggregate(
  comments[, c('score', 'comment_len')],
  by = list(comments$user_name),
  FUN = mean)
names(meanscore_user) <- c('user_name', 'comment_score_mean', 'comment_len_mean')
users <- merge(users, meanscore_user, by = 'user_name')
rm(meanscore_user)

# bin the negative share; intervals are right-closed, so the -1 lower bound
# puts users with a share of exactly 0 into the first bin, (-1,0]
users$comments_neg_prop_cat <- cut(
  users$comments_neg_prop,
  breaks = c(-1, 0, 0.1, 0.25, 0.5, 0.75, 1))

head(users)

##             user_name /r/belgium/ /r/Denmark/ /r/germany/ /r/italy/
## 1    ______-__-______           0           0           3         0
## 2 ---______________--           0           0           3         0
## 3  011101010111001101          14           0           0         0
## 4            01314150           0          35           0         0
## 5               01938           0           0           4         0
## 6                 02S           0           0           0         0
##   /r/Switzerland/ /r/thenetherlands/      top_subreddit comments_total
## 1               0                  0        /r/germany/              3
## 2               0                  0        /r/germany/              3
## 3               0                  0        /r/belgium/             14
## 4               0                  0        /r/Denmark/             35
## 5               0                  0        /r/germany/              4
## 6               0                  2 /r/thenetherlands/              2
##   comments_neg_frq comments_neg_prop comment_score_mean comment_len_mean
## 1                0            0.0000              3.000            499.7
## 2                0            0.0000              3.000            263.0
## 3                0            0.0000              1.286            183.8
## 4                4            0.1143              2.029            303.8
## 5                1            0.2500             31.250            332.2
## 6                0            0.0000              6.500            125.5
##   comments_neg_prop_cat
## 1                (-1,0]
## 2                (-1,0]
## 3                (-1,0]
## 4            (0.1,0.25]
## 5            (0.1,0.25]
## 6                (-1,0]

2 Basic descriptives

nrow(comments) # total nr. of unique comments

## [1] 127772

# per-column summary of the comment data (quantiles for numeric columns,
# most frequent levels for factor columns)
summary(comments)

##    comment_id      comment_len        date                gilded     
##  cjsli6d:     1   Min.   :   0   Min.   :2014-08-17   Min.   :0e+00  
##  cjsmafs:     1   1st Qu.:  55   1st Qu.:2015-03-29   1st Qu.:0e+00  
##  cjsq5x7:     1   Median : 122   Median :2015-04-21   Median :0e+00  
##  cjsuhpd:     1   Mean   : 229   Mean   :2015-04-04   Mean   :4e-04  
##  cjswgr0:     1   3rd Qu.: 262   3rd Qu.:2015-05-04   3rd Qu.:0e+00  
##  cjsy2pg:     1   Max.   :9994   Max.   :2015-05-15   Max.   :2e+00  
##  (Other):127766                                                      
##      score                    subreddit_url            user_name     
##  Min.   :-122.00   /r/belgium/       :25852   [deleted]     :  2295  
##  1st Qu.:   1.00   /r/Denmark/       :19765   Urgullibl     :  1171  
##  Median :   2.00   /r/germany/       :15630   historicusXIII:  1024  
##  Mean   :   3.53   /r/italy/         :20939   JebusGobson   :   685  
##  3rd Qu.:   4.00   /r/Switzerland/   :15675   Lucky-o-jelly :   659  
##  Max.   : 251.00   /r/thenetherlands/:29911   Inquatitis    :   584  
##                                               (Other)       :121354  
##     negative     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0948  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
##

#table(comments$subreddit_url) # nr. of comments per country subreddit

# distribution of comment dates, i.e. the date range covered by the scrape
summary(comments$date) # scrape date-range

##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2014-08-17" "2015-03-29" "2015-04-21" "2015-04-04" "2015-05-04" 
##         Max. 
## "2015-05-15"

# per-subreddit averages of date, comment length, score and gilded
aggregate(
  x = comments[, c("date", "comment_len", "score", "gilded")],
  by = list(subreddit = comments$subreddit_url),
  FUN = mean)

##            subreddit       date comment_len score    gilded
## 1        /r/belgium/ 2015-03-26       220.7 2.523 0.0006576
## 2        /r/Denmark/ 2015-04-28       229.4 3.434 0.0005059
## 3        /r/germany/ 2015-04-10       272.0 3.893 0.0001280
## 4          /r/italy/ 2015-05-02       210.9 3.596 0.0003343
## 5    /r/Switzerland/ 2015-01-01       241.3 2.943 0.0002552
## 6 /r/thenetherlands/ 2015-04-21       220.0 4.547 0.0002675

3 Comment downvote patterns

Comments are coded as “negative” if their score is 0 or lower (Reddit stopped returning raw comment scores?).

3.1 Occurrence of comment downvotes

table(comments$subreddit_url, comments$negative) # frequencies of non-negative (0) and negative (1) comments per subreddit

##                     
##                          0     1
##   /r/belgium/        23339  2513
##   /r/Denmark/        17286  2479
##   /r/germany/        14189  1441
##   /r/italy/          18634  2305
##   /r/Switzerland/    14569  1106
##   /r/thenetherlands/ 27645  2266

# row-wise share of non-negative (0) vs. negative (1) comments per subreddit
round(
  prop.table(table(comments$subreddit_url, comments$negative), margin = 1),
  digits = 2)

##                     
##                         0    1
##   /r/belgium/        0.90 0.10
##   /r/Denmark/        0.87 0.13
##   /r/germany/        0.91 0.09
##   /r/italy/          0.89 0.11
##   /r/Switzerland/    0.93 0.07
##   /r/thenetherlands/ 0.92 0.08

# there are significant differences between countries in the proportion of
# negative comments: chi-squared test on the subreddit x negative table
chisq.test(table(
  comments$subreddit_url, 
  comments$negative))

## 
##  Pearson's Chi-squared test
## 
## data:  table(comments$subreddit_url, comments$negative)
## X-squared = 509.8, df = 5, p-value < 2.2e-16

3.2 Comment length and downvotes

The general expectation is a negative relation between downvotes and comment length. Short, low-effort comments are (in principle) more likely to be downvoted; long comments indicate effort/contributions and are (in principle) more likely to be upvoted. A smaller difference between the average length of downvoted and non-downvoted comments might indicate “downvotes for disagreement”.

# average comment length, split by downvote status within each subreddit
aggregate(comment_len ~ negative + subreddit_url, data = comments, FUN = mean)

##    negative      subreddit_url comment_len
## 1         0        /r/belgium/       218.6
## 2         1        /r/belgium/       240.5
## 3         0        /r/Denmark/       228.2
## 4         1        /r/Denmark/       237.7
## 5         0        /r/germany/       273.9
## 6         1        /r/germany/       253.2
## 7         0          /r/italy/       210.6
## 8         1          /r/italy/       213.5
## 9         0    /r/Switzerland/       240.6
## 10        1    /r/Switzerland/       250.5
## 11        0 /r/thenetherlands/       219.4
## 12        1 /r/thenetherlands/       226.4

3.3 Downvote concentration within users

We can calculate a Gini-coefficient for the number of downvoted comments in a country subreddit. A value closer to 1 means that the downvoted comments are concentrated with one or a few users, most likely trolls. Lower values might indicate a more indiscriminate downvoting pattern, e.g. “downvotes for opinion”.

# Gini coefficient of users' downvoted-comment counts, per top subreddit
tapply(
  X = users$comments_neg_frq,
  INDEX = users$top_subreddit,
  FUN = Gini)

##        /r/belgium/        /r/Denmark/        /r/germany/ 
##             0.9282             0.8913             0.8934 
##          /r/italy/    /r/Switzerland/ /r/thenetherlands/ 
##             0.9031             0.9322             0.9322

# Gini coefficient of users' downvoted-comment proportions, per top subreddit
# (not sure if this is meaningful; weighting?)
tapply(
  X = users$comments_neg_prop,
  INDEX = users$top_subreddit,
  FUN = Gini)

##        /r/belgium/        /r/Denmark/        /r/germany/ 
##             0.8807             0.8520             0.8912 
##          /r/italy/    /r/Switzerland/ /r/thenetherlands/ 
##             0.8553             0.9194             0.9037

# shared plot base: users grouped and coloured by top subreddit, legend
# hidden, x-axis ticks at 0, 0.5 and 1
p <- ggplot(users, aes(group = top_subreddit, color = top_subreddit)) +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = c(0, 0.5, 1))

# mean comment score (log10 y-axis) against the proportion of downvoted
# comments, one panel per top subreddit
p +
  geom_point(aes(x = comments_neg_prop, y = comment_score_mean)) +
  facet_grid(. ~ top_subreddit) +
  scale_y_log10() +
  labs(
    x = "Proportion of downvoted comments",
    y = "Logged mean comment score",
    title = "User-distribution based on mean score, proportion downvoted comments")

plot of chunk unnamed-chunk-8

#p + geom_point(aes(x=comments_neg_prop_cat, y=comment_score_mean)) + facet_grid(.~top_subreddit) + scale_y_log10() + theme(legend.position="none")
# mean comment length (log10 y-axis) against the proportion of downvoted
# comments, one panel per top subreddit
p +
  geom_point(aes(x = comments_neg_prop, y = comment_len_mean)) +
  facet_grid(. ~ top_subreddit) +
  scale_y_log10() +
  labs(
    x = "Proportion of downvoted comments",
    y = "Logged mean comment length",
    title = "User-distribution based on mean comment length, proportion downvoted comments")