Loading, setting up

library(lme4)
library(tidyverse)
library(sjPlot)
library(stringr)
library(lubridate)

# Raw tweets for the hashtag.
tweets <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/2_tweets.csv")

# LIWC output: the first data row is discarded and the `Source (A)` column is
# renamed so it can act as the join key.
liwc_results <- read_csv("liwc_results.csv")
liwc_results <- liwc_results[-1, ] %>% 
    rename(tweet_link = `Source (A)`)

# Attach the LIWC columns to each tweet, then derive the hour of day and a
# four-hour time-of-day label.
# NOTE(review): "EST" looks like a fixed-offset zone (no daylight saving) --
# confirm whether "America/New_York" was intended.
tweets <- tweets %>% 
    left_join(liwc_results, by = "tweet_link") %>% 
    mutate(date = with_tz(date, "EST"),
           hour = hour(date),
           hour_chunk = case_when(
               hour %in% 0:3 ~ "12-3 am",
               hour %in% 4:7 ~ "4-7 am",
               hour %in% 8:11 ~ "8-11 am",
               hour %in% 12:15 ~ "12-3 pm",
               hour %in% 16:19 ~ "4-7 pm",
               hour %in% 20:23 ~ "8-11 pm"
           ))

# The LIWC table is no longer needed once it has been joined.
rm(liwc_results)

# User profiles: count the "*" characters in `following` as num_following and
# rename screenName so it matches the key used in the tweets table.
profiles <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/3_profiles.csv") %>% 
    mutate(num_following = stringr::str_count(following, "\\*")) %>% 
    rename(screen_name = screenName)

# One row per tweet, with the author's profile columns attached.
df <- tweets %>% 
    left_join(profiles, by = "screen_name")

# Drop the intermediate tables.
rm(profiles, tweets)

# Tweet-level features:
#   interactions - sum of scraped favorites, retweets and replies
#   n_chars      - number of characters in the tweet text
#   n_mentions   - number of "@" characters in the tweet text
df <- df %>% 
    mutate(
        interactions = scraped_num_favorites + scraped_num_retweets + scraped_num_replies,
        n_chars = nchar(text),
        n_mentions = str_count(text, "@")
    )

A few statistics

# Number of tweets per user.
count(df, user_id)

# Mean and standard deviation of the per-user tweet counts.
summarize(count(df, user_id), mean_n = mean(n), sd_n = sd(n))

Filtering data frame to include only original tweets

# Row counts before and after restricting the data to original tweets.
df %>% nrow() # total number of tweets - originals, quotes, replies, retweets

## [1] 89943

df %>% filter(type == "ORIG") %>% nrow() # number of original tweets

## [1] 37291

nrow(df) - (df %>% filter(type == "ORIG") %>% nrow()) # removed this many tweets

## [1] 52652

df <- df %>% filter(type == "ORIG") # filter data

What distribution should we use?

My understanding is that we can use a Poisson distribution, which ranges from 0 to positive infinity and is for “the discrete number of events”. Based on our data, I think there may be more 0s than we would expect under a Poisson distribution, so we could consider a zero-inflated Poisson model (https://en.wikipedia.org/wiki/Zero-inflated_model). Another way to address this, I think, is to run two models: one for whether the outcome is 0 or greater than 0 (a logistic model), and one for the value of the outcome given that it is greater than 0 (a Poisson model).

# Outcomes for the two-part approach: a 0/1 indicator of any interaction, and
# the interaction count with zeros replaced by NA.
df <- mutate(df,
             interactions_binary = ifelse(interactions == 0, 0, 1),
             interactions_non_zero = ifelse(interactions > 0, interactions, NA))

I will add these two models (the logistic model and the non-zero Poisson model) to the ones reported here once the Poisson models below have been finalized.

Examining outcome (interactions: number of retweets, favorites, and replies)

The histograms below suggest that we should possibly use logged interactions as the outcome. There seem to be very many 0s, which could suggest using a count-data model. We stick with a continuous outcome to start.

# Pass theme_bw() to set_theme() before drawing the plots.
set_theme(theme_bw())

# Histogram of the raw interaction counts.
df %>% 
    ggplot(aes(x = interactions)) +
    geom_histogram(bins = 50)

# Natural log that returns 0 instead of -Inf where x is exactly 0.
#
# x: numeric vector.
# Returns: log(x) element-wise, with 0 substituted where x == 0.
# NOTE(review): x == 1 also maps to 0, so 0 and 1 are indistinguishable after
# this transform -- confirm that is acceptable (log1p() would keep them apart).
safe_log <- function(x) {
    is_zero <- x == 0
    ifelse(is_zero, 0, log(x))
}

# Log-transformed outcome (zeros stay at 0 via safe_log).
df <- df %>% 
    mutate(interactions_log = safe_log(interactions))

# Histogram of the logged interaction counts.
df %>% 
    ggplot(aes(x = interactions_log)) +
    geom_histogram(bins = 35)

Number of URLs x Interactions

# Mean interactions and number of tweets at each value of num_urls.
summarize(group_by(df, num_urls),
          mean_interactions = mean(interactions),
          n = n())

# This plot is a bit misleading, given the number of tweets with three or four URLs
# It does look very approximately linear

# df %>% 
#     group_by(num_urls) %>% 
#     summarize(mean_interactions = mean(interactions)) %>% 
#     ggplot(aes(x = num_urls, y = mean_interactions)) +
#     geom_col()

Model

Using linear model:

# Label each tweet as posted on a weekend (Saturday/Sunday) or a weekday.
df <- mutate(df,
             weekend = ifelse(day == "Saturday" | day == "Sunday", "weekend", "weekday"))

# Large scipen penalty so numbers print in fixed rather than scientific notation.
options(scipen = 999)

# Null (intercept-only) linear mixed model: no fixed-effect predictors, with
# random intercepts for day, screen_name and hour. It serves as the baseline
# for partitioning variance in interactions across those grouping factors.
m0 <- lmerTest::lmer(interactions ~ 
                         
                         (1 | day) +
                         (1 | screen_name) +
                         (1 | hour),
                     data = df)

# Print the fitted variance components and the intercept estimate.
summary(m0)

## Linear mixed model fit by REML ['lmerMod']
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
##    Data: df
## 
## REML criterion at convergence: 216829.3
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -7.973 -0.456 -0.165  0.189 45.823 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  screen_name (Intercept)  6.43049 2.5358  
##  hour        (Intercept)  0.05247 0.2291  
##  day         (Intercept)  0.08286 0.2879  
##  Residual                18.59875 4.3126  
## Number of obs: 37291, groups:  screen_name, 1766; hour, 24; day, 7
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   3.0668     0.1477   20.76

# Intraclass correlation for each grouping factor of the null model m0.
sjstats::icc(m0)

## Linear mixed model
##  Family: gaussian (identity)
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
## 
##   ICC (screen_name): 0.255538
##          ICC (hour): 0.002085
##           ICC (day): 0.003293

# Linear mixed model of interactions on profile-level counts (divided by 1000)
# and tweet-level features, with random intercepts for day, hour and
# screen_name.
# NOTE(review): in the table reported below, the estimates for num_following
# and friendsCount are nearly equal and opposite with very wide intervals,
# which suggests the two predictors are close to collinear -- consider keeping
# only one of them.
m1 <- lmer(interactions ~ 
               # profile-level predictors, in thousands
               I(followersCount / 1000) +
               I(num_following / 1000) +
               I(statusesCount / 1000) +
               I(favoritesCount / 1000) +
               I(friendsCount / 1000) +
               
               # tweet-level predictors; n_chars is in units of 10 characters
               n_mentions +
               num_urls + 
               num_hashtags + 
               I(n_chars / 10) +
               
               # random intercepts
               (1 | day) +
               (1 | hour) + 
               (1 | screen_name),
           data = df)

# Tabulate the fit; p.kr = FALSE presumably turns off the Kenward-Roger
# approximation for p-values -- confirm against the sjPlot documentation.
sjt.lmer(m1, p.kr = FALSE)

	interactions
	B	CI	p
Fixed Parts
(Intercept)	1.00	0.58 – 1.42	<.001
I(followersCount/1000)	0.07	0.05 – 0.09	<.001
I(num_following/1000)	0.33	-5.78 – 6.43	.917
I(statusesCount/1000)	-0.00	-0.01 – 0.00	.347
I(favoritesCount/1000)	0.05	0.02 – 0.07	.002
I(friendsCount/1000)	-0.32	-6.44 – 5.80	.918
n_mentions	0.21	0.16 – 0.26	<.001
num_urls	-1.33	-1.44 – -1.22	<.001
num_hashtags	0.13	0.09 – 0.18	<.001
I(n_chars/10)	0.17	0.15 – 0.19	<.001
Random Parts
σ²	18.263
τ_{00, screen_name}	6.062
τ_{00, hour}	0.068
τ_{00, day}	0.122
N_{screen_name}	1716
N_hour	24
N_day	7
ICC_{screen_name}	0.247
ICC_hour	0.003
ICC_day	0.005
Observations	36718
R² / Ω₀²	.228 / .225

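With Poisson distribution (the table reports incidence rate ratios, i.e. exponentiated coefficients; the raw coefficients are on the log-count scale, not log odds):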

# Poisson mixed model with the same fixed effects and random intercepts as m1.
# NOTE(review): a plain Poisson family assumes the variance equals the mean;
# given the many zeros noted earlier, check for overdispersion / zero inflation.
m1i <- glmer(interactions ~ 
                 # profile-level predictors, in thousands
                 I(followersCount / 1000) +
                 I(num_following / 1000) +
                 I(statusesCount / 1000) +
                 I(favoritesCount / 1000) +
                 I(friendsCount / 1000) +
                 
                 # tweet-level predictors; n_chars is in units of 10 characters
                 n_mentions +
                 num_urls + 
                 num_hashtags + 
                 I(n_chars / 10) +
                 
                 # random intercepts
                 (1 | day) +
                 (1 | hour) + 
                 (1 | screen_name),
             family = poisson,
             # maxfun = 100000 is passed through to the optimizer; calc.derivs = FALSE
             # presumably skips the post-fit derivative calculation to save time --
             # confirm against the lme4 documentation.
             control = glmerControl(optCtrl=list(maxfun=100000), calc.derivs = FALSE),
             data = df)

# Tabulate the fit; the table below is headed IRR.
sjt.glmer(m1i)

	interactions
	IRR	CI	p
Fixed Parts
(Intercept)	1.04	0.92 – 1.17	.528
I(followersCount/1000)	1.01	1.01 – 1.02	<.001
I(num_following/1000)	0.55	0.11 – 2.76	.468
I(statusesCount/1000)	1.00	0.99 – 1.00	<.001
I(favoritesCount/1000)	1.01	1.01 – 1.02	<.001
I(friendsCount/1000)	1.83	0.36 – 9.19	.463
n_mentions	1.05	1.05 – 1.06	<.001
num_urls	0.64	0.63 – 0.65	<.001
num_hashtags	1.05	1.04 – 1.06	<.001
I(n_chars/10)	1.06	1.06 – 1.07	<.001
Random Parts
τ_{00, screen_name}	0.680
τ_{00, hour}	0.018
τ_{00, day}	0.012
N_{screen_name}	1716
N_hour	24
N_day	7
ICC_{screen_name}	0.398
ICC_hour	0.011
ICC_day	0.007
Observations	36718
Deviance	112907.831

Adding some LIWC variables

Linear model:

# Linear mixed model: the m1 specification plus five standardized LIWC
# variables (cogproc, social, work, posemo, negemo).
# NOTE(review): as in m1, num_following and friendsCount show near-equal and
# opposite estimates with very wide intervals in the table below -- likely
# collinear; consider keeping only one.
m2 <- lmer(interactions ~ 
               # profile-level predictors, in thousands
               I(followersCount / 1000) +
               I(num_following / 1000) +
               I(statusesCount / 1000) +
               I(favoritesCount / 1000) +
               I(friendsCount / 1000) +
               
               # tweet-level predictors; n_chars is in units of 10 characters
               n_mentions +
               num_urls + 
               num_hashtags + 
               I(n_chars / 10) +
               
               # LIWC variables, each centered and scaled with scale()
               scale(cogproc) +
               scale(social) +
               scale(work) + 
               scale(posemo) +
               scale(negemo) +
               
               # random intercepts
               (1 | day) +
               (1 | hour) + 
               (1 | screen_name),
           data = df)

# Tabulate the fit; p.kr = FALSE presumably turns off the Kenward-Roger
# approximation for p-values -- confirm against the sjPlot documentation.
sjt.lmer(m2, p.kr = FALSE)

	interactions
	B	CI	p
Fixed Parts
(Intercept)	1.00	0.58 – 1.42	<.001
I(followersCount/1000)	0.07	0.05 – 0.09	<.001
I(num_following/1000)	0.35	-5.74 – 6.44	.911
I(statusesCount/1000)	-0.00	-0.01 – 0.00	.316
I(favoritesCount/1000)	0.05	0.02 – 0.07	.002
I(friendsCount/1000)	-0.35	-6.45 – 5.76	.911
n_mentions	0.20	0.14 – 0.25	<.001
num_urls	-1.39	-1.51 – -1.27	<.001
num_hashtags	0.12	0.07 – 0.17	<.001
I(n_chars/10)	0.18	0.16 – 0.20	<.001
scale(cogproc)	-0.06	-0.11 – -0.01	.013
scale(social)	-0.05	-0.10 – -0.01	.031
scale(work)	-0.04	-0.09 – 0.01	.095
scale(posemo)	-0.04	-0.09 – 0.00	.077
scale(negemo)	0.10	0.06 – 0.15	<.001
Random Parts
σ²	18.251
τ_{00, screen_name}	6.012
τ_{00, hour}	0.068
τ_{00, day}	0.122
N_{screen_name}	1716
N_hour	24
N_day	7
ICC_{screen_name}	0.246
ICC_hour	0.003
ICC_day	0.005
Observations	36718
R² / Ω₀²	.228 / .226

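With Poisson (the table reports incidence rate ratios, i.e. exponentiated coefficients; the raw coefficients are on the log-count scale, not log odds):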

# Poisson mixed model with the same fixed effects and random intercepts as m2
# (profile counts, tweet features and standardized LIWC variables).
# NOTE(review): a plain Poisson family assumes the variance equals the mean;
# given the many zeros noted earlier, check for overdispersion / zero inflation.
m2i <- glmer(interactions ~ 
                 # profile-level predictors, in thousands
                 I(followersCount / 1000) +
                 I(num_following / 1000) +
                 I(statusesCount / 1000) +
                 I(favoritesCount / 1000) +
                 I(friendsCount / 1000) +
                 
                 # tweet-level predictors; n_chars is in units of 10 characters
                 n_mentions +
                 num_urls + 
                 num_hashtags + 
                 I(n_chars / 10) +
                 
                 # LIWC variables, each centered and scaled with scale()
                 scale(cogproc) +
                 scale(social) +
                 scale(work) + 
                 scale(posemo) +
                 scale(negemo) +
                 
                 # random intercepts
                 (1 | day) +
                 (1 | hour) + 
                 (1 | screen_name),
             family = poisson,
             # maxfun = 100000 is passed through to the optimizer; calc.derivs = FALSE
             # presumably skips the post-fit derivative calculation to save time --
             # confirm against the lme4 documentation.
             control = glmerControl(optCtrl=list(maxfun=100000), calc.derivs = FALSE),
             data = df)

# Tabulate the fit; the table below is headed IRR.
sjt.glmer(m2i)

	interactions
	IRR	CI	p
Fixed Parts
(Intercept)	1.04	0.92 – 1.17	.537
I(followersCount/1000)	1.01	1.01 – 1.02	<.001
I(num_following/1000)	0.56	0.11 – 2.78	.476
I(statusesCount/1000)	1.00	0.99 – 1.00	<.001
I(favoritesCount/1000)	1.01	1.01 – 1.02	<.001
I(friendsCount/1000)	1.81	0.36 – 9.03	.471
n_mentions	1.05	1.04 – 1.06	<.001
num_urls	0.63	0.62 – 0.64	<.001
num_hashtags	1.05	1.04 – 1.05	<.001
I(n_chars/10)	1.06	1.06 – 1.07	<.001
scale(cogproc)	0.98	0.97 – 0.99	<.001
scale(social)	0.98	0.98 – 0.99	<.001
scale(work)	0.99	0.98 – 1.00	.002
scale(posemo)	0.99	0.98 – 0.99	<.001
scale(negemo)	1.02	1.02 – 1.03	<.001
Random Parts
τ_{00, screen_name}	0.675
τ_{00, hour}	0.019
τ_{00, day}	0.012
N_{screen_name}	1716
N_hour	24
N_day	7
ICC_{screen_name}	0.396
ICC_hour	0.011
ICC_day	0.007
Observations	36718
Deviance	112772.209

Edchat HLM

Matthew Koehler and Joshua Rosenberg