library(lme4)
library(tidyverse)
library(sjPlot)
library(stringr)
library(lubridate)
# Load the tweets and the LIWC scores, which are keyed on each tweet's link.
tweets <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/2_tweets.csv")
liwc_results <- read_csv("liwc_results.csv") %>%
  slice(-1) %>% # the first data row is discarded before joining
  rename(tweet_link = `Source (A)`)

# Attach the LIWC columns, then derive the posting hour and a four-hour
# time-of-day label from the timestamp.
# NOTE(review): "EST" is a fixed UTC-5 zone without daylight saving time;
# confirm this is intended rather than a DST-aware zone.
tweets <- tweets %>%
  left_join(liwc_results, by = "tweet_link") %>%
  mutate(
    date = with_tz(date, "EST"),
    hour = hour(date),
    hour_chunk = case_when(
      hour %in% 0:3 ~ "12-3 am",
      hour %in% 4:7 ~ "4-7 am",
      hour %in% 8:11 ~ "8-11 am",
      hour %in% 12:15 ~ "12-3 pm",
      hour %in% 16:19 ~ "4-7 pm",
      hour %in% 20:23 ~ "8-11 pm"
    )
  )
rm(liwc_results)
# Load the account profiles. `num_following` is the number of literal "*"
# characters in `following` (presumably a "*"-delimited list of followed
# accounts -- TODO confirm against the scraper output).
profiles <- read_csv("~/Dropbox/1_Research/Twitter_Data_Suite/hashtag/miched3/3_profiles.csv") %>%
  mutate(num_following = str_count(following, "\\*")) %>%
  rename(screen_name = screenName)

# One row per tweet with the author's profile attached, plus tweet-level
# measures: total interactions, character length, and number of "@" symbols.
df <- tweets %>%
  left_join(profiles, by = "screen_name") %>%
  mutate(
    interactions = scraped_num_favorites + scraped_num_retweets + scraped_num_replies,
    n_chars = nchar(text),
    n_mentions = str_count(text, "@")
  )
rm(profiles, tweets)
A few statistics
# Tweets per user, followed by the mean and SD of that per-user count.
count(df, user_id)
summarize(count(df, user_id), mean_n = mean(n), sd_n = sd(n))
nrow(df) # total number of tweets - originals, quotes, replies, retweets
## [1] 89943
sum(df$type == "ORIG", na.rm = TRUE) # number of original tweets
## [1] 37291
nrow(df) - sum(df$type == "ORIG", na.rm = TRUE) # removed this many tweets
## [1] 52652
# Everything after this point uses original tweets only.
df <- df %>% filter(type == "ORIG") # filter data
My understanding is that we can use a Poisson distribution, which ranges from 0 to positive infinity and is for “the discrete number of events”. Based on our data, I think there may be more 0’s than we would expect under a Poisson distribution, so we could consider a zero-inflated Poisson model (https://en.wikipedia.org/wiki/Zero-inflated_model). Another way to address this, I think, is to run two models: one for whether the outcome is 0 or greater than 0 (a logistic model), and one for the value of the outcome given that it is greater than 0 (a Poisson model).
# Outcomes for the two-part approach: an any-interaction indicator (0/1), and
# the interaction count with zeros set to NA.
df <- mutate(
  df,
  interactions_binary = ifelse(interactions == 0, 0, 1),
  interactions_non_zero = ifelse(interactions > 0, interactions, NA)
)
I will add these models to those here, after getting these models finalized using a Poisson distribution.
The histogram below suggests that we should possibly use logged interactions as the outcome. There also seem to be very many 0s, which could instead suggest a count data model. Sticking with a continuous outcome to start.
# Distribution of the raw interaction counts.
set_theme(theme_bw())
df %>%
  ggplot(aes(x = interactions)) +
  geom_histogram(bins = 50)
# Natural log that returns 0 instead of -Inf for zero inputs.
#
# x: numeric vector.
# Returns a vector the same length as `x`: 0 where `x` is 0, log(x) elsewhere,
# and NA where `x` is NA. Note that an input of 1 also maps to 0, so zeros and
# ones are indistinguishable on this scale.
safe_log <- function(x) {
  is_zero <- x == 0
  ifelse(is_zero, 0, log(x))
}
# Log-scale version of the outcome (zeros stay at 0) and its distribution.
df$interactions_log <- safe_log(df$interactions)
df %>%
  ggplot(aes(x = interactions_log)) +
  geom_histogram(bins = 35)

# Mean interactions and number of tweets at each URL count.
df %>%
  group_by(num_urls) %>%
  summarize(
    mean_interactions = mean(interactions),
    n = n()
  )
# This plot is a bit misleading, given the number of tweets with three or four URLs
# It does look very approximately linear
# df %>%
# group_by(num_urls) %>%
# summarize(mean_interactions = mean(interactions)) %>%
# ggplot(aes(x = num_urls, y = mean_interactions)) +
# geom_col()
Using linear model:
# Flag weekend tweets; a missing `day` yields NA rather than "weekday".
df$weekend <- ifelse(df$day == "Saturday" | df$day == "Sunday", "weekend", "weekday")
# Print numbers in fixed rather than scientific notation.
options(scipen = 999)
# Null (intercept-only) model with separate random intercepts for day, account,
# and hour. It is fitted with lme4's lmer(), the same fitting function used for
# m1 and m2, which also matches the 'lmerMod' summary recorded below.
m0 <- lmer(
  interactions ~
    (1 | day) +
    (1 | screen_name) +
    (1 | hour),
  data = df
)
summary(m0)
## Linear mixed model fit by REML ['lmerMod']
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
## Data: df
##
## REML criterion at convergence: 216829.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.973 -0.456 -0.165 0.189 45.823
##
## Random effects:
## Groups Name Variance Std.Dev.
## screen_name (Intercept) 6.43049 2.5358
## hour (Intercept) 0.05247 0.2291
## day (Intercept) 0.08286 0.2879
## Residual 18.59875 4.3126
## Number of obs: 37291, groups: screen_name, 1766; hour, 24; day, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 3.0668 0.1477 20.76
# Intraclass correlation for each grouping factor of the null model.
sjstats::icc(m0)
## Linear mixed model
## Family: gaussian (identity)
## Formula: interactions ~ (1 | day) + (1 | screen_name) + (1 | hour)
##
## ICC (screen_name): 0.255538
## ICC (hour): 0.002085
## ICC (day): 0.003293
# Linear mixed model: account-level and tweet-level predictors with random
# intercepts for day, hour, and account. Counts are rescaled (per 1,000
# accounts/statuses/favorites, per 10 characters) inside the formula.
# The order of terms is kept as-is because it fixes the row order of the table.
# NOTE(review): num_following and friendsCount get near-equal, opposite
# estimates with very wide intervals in the table below, which suggests the two
# are close to collinear -- confirm, and consider keeping only one.
# NOTE(review): the table reports 36718 observations versus 37291 for m0, so
# rows with missing predictors appear to be dropped -- confirm.
m1 <- lmer(
  interactions ~
    I(followersCount / 1000) +
    I(num_following / 1000) +
    I(statusesCount / 1000) +
    I(favoritesCount / 1000) +
    I(friendsCount / 1000) +
    n_mentions +
    num_urls +
    num_hashtags +
    I(n_chars / 10) +
    (1 | day) +
    (1 | hour) +
    (1 | screen_name),
  data = df,
  REML = TRUE
)
sjt.lmer(m1, p.kr = FALSE)
interactions | ||||
B | CI | p | ||
Fixed Parts | ||||
(Intercept) | 1.00 | 0.58 – 1.42 | <.001 | |
I(followersCount/1000) | 0.07 | 0.05 – 0.09 | <.001 | |
I(num_following/1000) | 0.33 | -5.78 – 6.43 | .917 | |
I(statusesCount/1000) | -0.00 | -0.01 – 0.00 | .347 | |
I(favoritesCount/1000) | 0.05 | 0.02 – 0.07 | .002 | |
I(friendsCount/1000) | -0.32 | -6.44 – 5.80 | .918 | |
n_mentions | 0.21 | 0.16 – 0.26 | <.001 | |
num_urls | -1.33 | -1.44 – -1.22 | <.001 | |
num_hashtags | 0.13 | 0.09 – 0.18 | <.001 | |
I(n_chars/10) | 0.17 | 0.15 – 0.19 | <.001 | |
Random Parts | ||||
σ2 | 18.263 | |||
τ00, screen_name | 6.062 | |||
τ00, hour | 0.068 | |||
τ00, day | 0.122 | |||
Nscreen_name | 1716 | |||
Nhour | 24 | |||
Nday | 7 | |||
ICCscreen_name | 0.247 | |||
ICChour | 0.003 | |||
ICCday | 0.005 | |||
Observations | 36718 | |||
R2 / Ω02 | .228 / .225 |
With Poisson distribution (coefficients are on the log-count scale; the table reports them exponentiated, as incidence rate ratios):
# Poisson (log-link) counterpart of m1 with the same fixed and random terms.
# The optimizer is allowed up to 100000 function evaluations, and the
# derivative calculation after fitting is switched off via calc.derivs.
# The table printed by sjt.glmer() shows incidence rate ratios (IRR).
m1i <- glmer(
  interactions ~
    I(followersCount / 1000) +
    I(num_following / 1000) +
    I(statusesCount / 1000) +
    I(favoritesCount / 1000) +
    I(friendsCount / 1000) +
    n_mentions +
    num_urls +
    num_hashtags +
    I(n_chars / 10) +
    (1 | day) +
    (1 | hour) +
    (1 | screen_name),
  data = df,
  family = poisson(link = "log"),
  control = glmerControl(
    optCtrl = list(maxfun = 100000),
    calc.derivs = FALSE
  )
)
sjt.glmer(m1i)
interactions | ||||
IRR | CI | p | ||
Fixed Parts | ||||
(Intercept) | 1.04 | 0.92 – 1.17 | .528 | |
I(followersCount/1000) | 1.01 | 1.01 – 1.02 | <.001 | |
I(num_following/1000) | 0.55 | 0.11 – 2.76 | .468 | |
I(statusesCount/1000) | 1.00 | 0.99 – 1.00 | <.001 | |
I(favoritesCount/1000) | 1.01 | 1.01 – 1.02 | <.001 | |
I(friendsCount/1000) | 1.83 | 0.36 – 9.19 | .463 | |
n_mentions | 1.05 | 1.05 – 1.06 | <.001 | |
num_urls | 0.64 | 0.63 – 0.65 | <.001 | |
num_hashtags | 1.05 | 1.04 – 1.06 | <.001 | |
I(n_chars/10) | 1.06 | 1.06 – 1.07 | <.001 | |
Random Parts | ||||
τ00, screen_name | 0.680 | |||
τ00, hour | 0.018 | |||
τ00, day | 0.012 | |||
Nscreen_name | 1716 | |||
Nhour | 24 | |||
Nday | 7 | |||
ICCscreen_name | 0.398 | |||
ICChour | 0.011 | |||
ICCday | 0.007 | |||
Observations | 36718 | |||
Deviance | 112907.831 |
Linear model:
# Linear mixed model: the m1 predictors plus five standardized LIWC scores
# (cogproc, social, work, posemo, negemo), with random intercepts for day,
# hour, and account. The order of terms is kept as-is because it fixes the row
# order of the table.
m2 <- lmer(
  interactions ~
    I(followersCount / 1000) +
    I(num_following / 1000) +
    I(statusesCount / 1000) +
    I(favoritesCount / 1000) +
    I(friendsCount / 1000) +
    n_mentions +
    num_urls +
    num_hashtags +
    I(n_chars / 10) +
    scale(cogproc) +
    scale(social) +
    scale(work) +
    scale(posemo) +
    scale(negemo) +
    (1 | day) +
    (1 | hour) +
    (1 | screen_name),
  data = df,
  REML = TRUE
)
sjt.lmer(m2, p.kr = FALSE)
interactions | ||||
B | CI | p | ||
Fixed Parts | ||||
(Intercept) | 1.00 | 0.58 – 1.42 | <.001 | |
I(followersCount/1000) | 0.07 | 0.05 – 0.09 | <.001 | |
I(num_following/1000) | 0.35 | -5.74 – 6.44 | .911 | |
I(statusesCount/1000) | -0.00 | -0.01 – 0.00 | .316 | |
I(favoritesCount/1000) | 0.05 | 0.02 – 0.07 | .002 | |
I(friendsCount/1000) | -0.35 | -6.45 – 5.76 | .911 | |
n_mentions | 0.20 | 0.14 – 0.25 | <.001 | |
num_urls | -1.39 | -1.51 – -1.27 | <.001 | |
num_hashtags | 0.12 | 0.07 – 0.17 | <.001 | |
I(n_chars/10) | 0.18 | 0.16 – 0.20 | <.001 | |
scale(cogproc) | -0.06 | -0.11 – -0.01 | .013 | |
scale(social) | -0.05 | -0.10 – -0.01 | .031 | |
scale(work) | -0.04 | -0.09 – 0.01 | .095 | |
scale(posemo) | -0.04 | -0.09 – 0.00 | .077 | |
scale(negemo) | 0.10 | 0.06 – 0.15 | <.001 | |
Random Parts | ||||
σ2 | 18.251 | |||
τ00, screen_name | 6.012 | |||
τ00, hour | 0.068 | |||
τ00, day | 0.122 | |||
Nscreen_name | 1716 | |||
Nhour | 24 | |||
Nday | 7 | |||
ICCscreen_name | 0.246 | |||
ICChour | 0.003 | |||
ICCday | 0.005 | |||
Observations | 36718 | |||
R2 / Ω02 | .228 / .226 |
With Poisson (coefficients are on the log-count scale; the table reports them exponentiated, as incidence rate ratios):
# Poisson (log-link) counterpart of m2: the m1i predictors plus the five
# standardized LIWC scores, with the same random intercepts and the same
# optimizer settings as m1i. The table printed by sjt.glmer() shows incidence
# rate ratios (IRR).
m2i <- glmer(
  interactions ~
    I(followersCount / 1000) +
    I(num_following / 1000) +
    I(statusesCount / 1000) +
    I(favoritesCount / 1000) +
    I(friendsCount / 1000) +
    n_mentions +
    num_urls +
    num_hashtags +
    I(n_chars / 10) +
    scale(cogproc) +
    scale(social) +
    scale(work) +
    scale(posemo) +
    scale(negemo) +
    (1 | day) +
    (1 | hour) +
    (1 | screen_name),
  data = df,
  family = poisson(link = "log"),
  control = glmerControl(
    optCtrl = list(maxfun = 100000),
    calc.derivs = FALSE
  )
)
sjt.glmer(m2i)
interactions | ||||
IRR | CI | p | ||
Fixed Parts | ||||
(Intercept) | 1.04 | 0.92 – 1.17 | .537 | |
I(followersCount/1000) | 1.01 | 1.01 – 1.02 | <.001 | |
I(num_following/1000) | 0.56 | 0.11 – 2.78 | .476 | |
I(statusesCount/1000) | 1.00 | 0.99 – 1.00 | <.001 | |
I(favoritesCount/1000) | 1.01 | 1.01 – 1.02 | <.001 | |
I(friendsCount/1000) | 1.81 | 0.36 – 9.03 | .471 | |
n_mentions | 1.05 | 1.04 – 1.06 | <.001 | |
num_urls | 0.63 | 0.62 – 0.64 | <.001 | |
num_hashtags | 1.05 | 1.04 – 1.05 | <.001 | |
I(n_chars/10) | 1.06 | 1.06 – 1.07 | <.001 | |
scale(cogproc) | 0.98 | 0.97 – 0.99 | <.001 | |
scale(social) | 0.98 | 0.98 – 0.99 | <.001 | |
scale(work) | 0.99 | 0.98 – 1.00 | .002 | |
scale(posemo) | 0.99 | 0.98 – 0.99 | <.001 | |
scale(negemo) | 1.02 | 1.02 – 1.03 | <.001 | |
Random Parts | ||||
τ00, screen_name | 0.675 | |||
τ00, hour | 0.019 | |||
τ00, day | 0.012 | |||
Nscreen_name | 1716 | |||
Nhour | 24 | |||
Nday | 7 | |||
ICCscreen_name | 0.396 | |||
ICChour | 0.011 | |||
ICCday | 0.007 | |||
Observations | 36718 | |||
Deviance | 112772.209 |