Loading, setting up

library(lme4)
library(tidyverse)
library(sjPlot)
library(stringr)
library(lubridate)

# Read the scraped tweets and the LIWC output that gets merged onto them.
tweets <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/2_tweets.csv")
liwc_results <- read_csv("liwc_results.csv")

# Drop the first LIWC row and rename `Source (A)` so it can serve as the join
# key against the tweets.
liwc_results <- liwc_results[-1, ] %>%
    rename(tweet_link = `Source (A)`)

# Attach the LIWC columns, convert timestamps to the "EST" zone, then derive the
# hour of day and a four-hour bucket label from it.
# NOTE(review): "EST" is presumably a fixed offset without daylight saving --
# confirm that this, rather than a regional zone, is what is intended.
tweets <- tweets %>%
    left_join(liwc_results, by = "tweet_link") %>%
    mutate(
        date = with_tz(date, "EST"),
        hour = hour(date),
        hour_chunk = case_when(
            hour %in% 0:3 ~ "12-3 am",
            hour %in% 4:7 ~ "4-7 am",
            hour %in% 8:11 ~ "8-11 am",
            hour %in% 12:15 ~ "12-3 pm",
            hour %in% 16:19 ~ "4-7 pm",
            hour %in% 20:23 ~ "8-11 pm"
        )
    )

# The LIWC table is not needed once it has been merged.
rm(liwc_results)

# Load the user profiles, add num_following as the number of "*" characters in
# `following` (presumably one per followed account -- confirm), and rename the
# handle column so it matches the key used for joining onto the tweets.
profiles <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/3_profiles.csv") %>%
    mutate(num_following = str_count(following, "\\*")) %>%
    rename(screen_name = screenName)

# Build the analysis data frame: one row per tweet with its author's profile
# columns attached, plus three derived tweet-level variables:
#   interactions - favorites + retweets + replies
#   n_chars      - number of characters in the tweet text
#   n_mentions   - number of "@" characters in the tweet text
df <- tweets %>%
    left_join(profiles, by = "screen_name") %>%
    mutate(
        interactions = scraped_num_favorites + scraped_num_retweets + scraped_num_replies,
        n_chars = nchar(text),
        n_mentions = str_count(text, "@")
    )

# The source tables are no longer needed after the join.
rm(profiles, tweets)

A few statistics

# Number of tweets contributed by each user.
count(df, user_id)
# Mean and standard deviation of the per-user tweet counts.
summarize(count(df, user_id), mean_n = mean(n), sd_n = sd(n))

Filtering data frame to include only original tweets

# Tweet counts before and after restricting the data to original tweets.
df %>% nrow() # every tweet type: originals, quotes, replies, retweets
## [1] 89943
df %>% filter(type == "ORIG") %>% nrow() # original tweets only
## [1] 37291
df %>% filter(type != "ORIG" | is.na(type)) %>% nrow() # tweets that get removed
## [1] 52652
df <- df %>% filter(type == "ORIG") # keep only originals from here on

What distribution should we use?

My understanding is that we can use a Poisson distribution, which ranges from 0 to positive infinity and is for “the discrete number of events”. Our data appear to contain more 0’s than we would expect under a Poisson distribution, so I think we could consider a zero-inflated Poisson model (https://en.wikipedia.org/wiki/Zero-inflated_model). Another way to address this, I think, is to run two models: one for whether the outcome is 0 or greater than 0 (a logistic model), and one for the value of the outcome given that it is greater than 0 (a Poisson model).

# Outcomes for the two-part approach:
#   interactions_binary   - 0 when a tweet has no interactions, otherwise 1
#   interactions_non_zero - the interaction count when it is positive, else NA
df <- mutate(df,
             interactions_binary = ifelse(interactions == 0, 0, 1),
             interactions_non_zero = ifelse(interactions > 0, interactions, NA))

I will add these two-part models to the ones here after the models using a Poisson distribution have been finalized.

Examining outcome (interactions: number of retweets, favorites, and replies)

The histograms below suggest that we should possibly use logged interactions as the outcome. There also seem to be very many 0s, which could suggest using a count data model. We stick with a continuous outcome to start.

# Apply theme_bw() through set_theme().
set_theme(theme_bw())

# Histogram of the raw interaction counts, using 50 bins.
df %>%
    ggplot(aes(interactions)) +
    geom_histogram(bins = 50)

#' Natural log that returns 0 instead of -Inf for zero input
#'
#' Elements equal to 0 are mapped to 0; every other element is passed through
#' `log()`. Because `log(1)` is also 0, inputs of 0 and 1 are indistinguishable
#' in the result.
#'
#' @param x A numeric vector.
#' @return A vector the same length as `x`.
safe_log <- function(x) {
    zero_mask <- x == 0
    ifelse(test = zero_mask, yes = 0, no = log(x))
}

# Add the logged outcome (zeros stay at 0 via safe_log).
df$interactions_log <- safe_log(df$interactions)

# Histogram of the logged interaction counts, using 35 bins.
df %>%
    ggplot(aes(interactions_log)) +
    geom_histogram(bins = 35)

Number of URLs x Interactions

# Mean interactions and number of tweets at each value of num_urls.
summarize(group_by(df, num_urls),
          mean_interactions = mean(interactions),
          n = n())
# The bar chart of these means (disabled below) is a bit misleading, given the
# number of tweets with three or four URLs.
# The relationship does look very approximately linear.

# df %>% 
#     group_by(num_urls) %>% 
#     summarize(mean_interactions = mean(interactions)) %>% 
#     ggplot(aes(x = num_urls, y = mean_interactions)) +
#     geom_col()

Model

Using linear model:

# Label each tweet as posted on a weekend (Saturday or Sunday) or a weekday.
df <- mutate(df,
             weekend = ifelse(day == "Saturday" | day == "Sunday", "weekend", "weekday"))

# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Unconditional (intercept-only) linear mixed model: no fixed-effect
# predictors, only random intercepts for day, screen_name and hour. It shows
# how the variance in interactions is split across those grouping factors.
# Fit with lme4's lmer(), like the other linear models in this file: lme4 is
# attached at the top of the file, whereas lmerTest is never loaded.
m0 <- lmer(interactions ~
               (1 | day) +
               (1 | screen_name) +
               (1 | hour),
           data = df)

summary(m0)
## Linear mixed model fit by REML ['lmerMod']
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
##    Data: df
## 
## REML criterion at convergence: 216829.3
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -7.973 -0.456 -0.165  0.189 45.823 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  screen_name (Intercept)  6.43049 2.5358  
##  hour        (Intercept)  0.05247 0.2291  
##  day         (Intercept)  0.08286 0.2879  
##  Residual                18.59875 4.3126  
## Number of obs: 37291, groups:  screen_name, 1766; hour, 24; day, 7
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   3.0668     0.1477   20.76
# Intraclass correlation for each random-intercept grouping factor in m0.
sjstats::icc(m0)
## Linear mixed model
##  Family: gaussian (identity)
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
## 
##   ICC (screen_name): 0.255538
##          ICC (hour): 0.002085
##           ICC (day): 0.003293
# Linear mixed model of interactions on profile-level predictors (each divided
# by 1000) and tweet-level predictors (mentions, URLs, hashtags, and tweet
# length divided by 10), with random intercepts for day, hour and screen_name.
# NOTE(review): num_following (derived earlier by counting "*" in `following`)
# and friendsCount may measure the same quantity; their recorded estimates are
# nearly equal and opposite with very wide intervals -- check for collinearity.
m1 <- lmer(interactions ~ 
               I(followersCount / 1000) +
               I(num_following / 1000) +
               I(statusesCount / 1000) +
               I(favoritesCount / 1000) +
               I(friendsCount / 1000) +
               
               n_mentions +
               num_urls + 
               num_hashtags + 
               I(n_chars / 10) +
               
               (1 | day) +
               (1 | hour) + 
               (1 | screen_name),
           data = df)

# Summary table for m1.
# NOTE(review): p.kr = FALSE presumably turns off the Kenward-Roger
# approximation for p-values -- confirm against the sjPlot documentation.
sjt.lmer(m1, p.kr = FALSE)
    interactions
    B CI p
Fixed Parts
(Intercept)   1.00 0.58 – 1.42 <.001
I(followersCount/1000)   0.07 0.05 – 0.09 <.001
I(num_following/1000)   0.33 -5.78 – 6.43 .917
I(statusesCount/1000)   -0.00 -0.01 – 0.00 .347
I(favoritesCount/1000)   0.05 0.02 – 0.07 .002
I(friendsCount/1000)   -0.32 -6.44 – 5.80 .918
n_mentions   0.21 0.16 – 0.26 <.001
num_urls   -1.33 -1.44 – -1.22 <.001
num_hashtags   0.13 0.09 – 0.18 <.001
I(n_chars/10)   0.17 0.15 – 0.19 <.001
Random Parts
σ2   18.263
τ00, screen_name   6.062
τ00, hour   0.068
τ00, day   0.122
Nscreen_name   1716
Nhour   24
Nday   7
ICCscreen_name   0.247
ICChour   0.003
ICCday   0.005
Observations   36718
R2 / Ω02   .228 / .225

With Poisson distribution (the table reports incidence rate ratios, i.e., exponentiated coefficients from the log link, not log odds):

# Poisson mixed model with the same predictors and random intercepts as m1.
# The control settings raise the optimizer's function-evaluation limit to
# 100000 and switch off the derivative calculation (calc.derivs = FALSE).
m1i <- glmer(interactions ~ 
                 I(followersCount / 1000) +
                 I(num_following / 1000) +
                 I(statusesCount / 1000) +
                 I(favoritesCount / 1000) +
                 I(friendsCount / 1000) +
                 
                 n_mentions +
                 num_urls + 
                 num_hashtags + 
                 I(n_chars / 10) +
                 
                 (1 | day) +
                 (1 | hour) + 
                 (1 | screen_name),
             family = poisson,
             control = glmerControl(optCtrl=list(maxfun=100000), calc.derivs = FALSE),
             data = df)

# Summary table for m1i; the recorded table reports incidence rate ratios (IRR).
sjt.glmer(m1i)
    interactions
    IRR CI p
Fixed Parts
(Intercept)   1.04 0.92 – 1.17 .528
I(followersCount/1000)   1.01 1.01 – 1.02 <.001
I(num_following/1000)   0.55 0.11 – 2.76 .468
I(statusesCount/1000)   1.00 0.99 – 1.00 <.001
I(favoritesCount/1000)   1.01 1.01 – 1.02 <.001
I(friendsCount/1000)   1.83 0.36 – 9.19 .463
n_mentions   1.05 1.05 – 1.06 <.001
num_urls   0.64 0.63 – 0.65 <.001
num_hashtags   1.05 1.04 – 1.06 <.001
I(n_chars/10)   1.06 1.06 – 1.07 <.001
Random Parts
τ00, screen_name   0.680
τ00, hour   0.018
τ00, day   0.012
Nscreen_name   1716
Nhour   24
Nday   7
ICCscreen_name   0.398
ICChour   0.011
ICCday   0.007
Observations   36718
Deviance   112907.831

Adding some LIWC variables

Linear model:

# Linear mixed model with the m1 predictors plus five LIWC variables (cogproc,
# social, work, posemo, negemo), each standardized with scale(), and the same
# random intercepts for day, hour and screen_name.
m2 <- lmer(interactions ~ 
               I(followersCount / 1000) +
               I(num_following / 1000) +
               I(statusesCount / 1000) +
               I(favoritesCount / 1000) +
               I(friendsCount / 1000) +
               
               n_mentions +
               num_urls + 
               num_hashtags + 
               I(n_chars / 10) +
               
               scale(cogproc) +
               scale(social) +
               scale(work) + 
               scale(posemo) +
               scale(negemo) +
               
               (1 | day) +
               (1 | hour) + 
               (1 | screen_name),
           data = df)

# Summary table for m2.
# NOTE(review): p.kr = FALSE presumably turns off the Kenward-Roger
# approximation for p-values -- confirm against the sjPlot documentation.
sjt.lmer(m2, p.kr = FALSE)
    interactions
    B CI p
Fixed Parts
(Intercept)   1.00 0.58 – 1.42 <.001
I(followersCount/1000)   0.07 0.05 – 0.09 <.001
I(num_following/1000)   0.35 -5.74 – 6.44 .911
I(statusesCount/1000)   -0.00 -0.01 – 0.00 .316
I(favoritesCount/1000)   0.05 0.02 – 0.07 .002
I(friendsCount/1000)   -0.35 -6.45 – 5.76 .911
n_mentions   0.20 0.14 – 0.25 <.001
num_urls   -1.39 -1.51 – -1.27 <.001
num_hashtags   0.12 0.07 – 0.17 <.001
I(n_chars/10)   0.18 0.16 – 0.20 <.001
scale(cogproc)   -0.06 -0.11 – -0.01 .013
scale(social)   -0.05 -0.10 – -0.01 .031
scale(work)   -0.04 -0.09 – 0.01 .095
scale(posemo)   -0.04 -0.09 – 0.00 .077
scale(negemo)   0.10 0.06 – 0.15 <.001
Random Parts
σ2   18.251
τ00, screen_name   6.012
τ00, hour   0.068
τ00, day   0.122
Nscreen_name   1716
Nhour   24
Nday   7
ICCscreen_name   0.246
ICChour   0.003
ICCday   0.005
Observations   36718
R2 / Ω02   .228 / .226

With Poisson (the table reports incidence rate ratios, not log odds):

# Poisson mixed model with the same predictors and random intercepts as m2
# (profile, tweet-level and standardized LIWC variables).
# The control settings raise the optimizer's function-evaluation limit to
# 100000 and switch off the derivative calculation (calc.derivs = FALSE).
m2i <- glmer(interactions ~ 
                 I(followersCount / 1000) +
                 I(num_following / 1000) +
                 I(statusesCount / 1000) +
                 I(favoritesCount / 1000) +
                 I(friendsCount / 1000) +
                 
                 n_mentions +
                 num_urls + 
                 num_hashtags + 
                 I(n_chars / 10) +
                 
                 scale(cogproc) +
                 scale(social) +
                 scale(work) + 
                 scale(posemo) +
                 scale(negemo) +
                 
                 (1 | day) +
                 (1 | hour) + 
                 (1 | screen_name),
             family = poisson,
             control = glmerControl(optCtrl=list(maxfun=100000), calc.derivs = FALSE),
             data = df)

# Summary table for m2i; the recorded table reports incidence rate ratios (IRR).
sjt.glmer(m2i)
    interactions
    IRR CI p
Fixed Parts
(Intercept)   1.04 0.92 – 1.17 .537
I(followersCount/1000)   1.01 1.01 – 1.02 <.001
I(num_following/1000)   0.56 0.11 – 2.78 .476
I(statusesCount/1000)   1.00 0.99 – 1.00 <.001
I(favoritesCount/1000)   1.01 1.01 – 1.02 <.001
I(friendsCount/1000)   1.81 0.36 – 9.03 .471
n_mentions   1.05 1.04 – 1.06 <.001
num_urls   0.63 0.62 – 0.64 <.001
num_hashtags   1.05 1.04 – 1.05 <.001
I(n_chars/10)   1.06 1.06 – 1.07 <.001
scale(cogproc)   0.98 0.97 – 0.99 <.001
scale(social)   0.98 0.98 – 0.99 <.001
scale(work)   0.99 0.98 – 1.00 .002
scale(posemo)   0.99 0.98 – 0.99 <.001
scale(negemo)   1.02 1.02 – 1.03 <.001
Random Parts
τ00, screen_name   0.675
τ00, hour   0.019
τ00, day   0.012
Nscreen_name   1716
Nhour   24
Nday   7
ICCscreen_name   0.396
ICChour   0.011
ICCday   0.007
Observations   36718
Deviance   112772.209