Loading, setting up
library(lme4)
library(tidyverse)
library(sjPlot)
# Paths of the two input CSV files (relative, so they resolve against the
# current working directory).
# Plain `<-` is used: at top level it already creates the variables in the
# global environment, whereas `<<-` is meant for modifying a variable in an
# enclosing function scope.
g_tweet_file <- "data/tweet_annotated.csv"
g_profile_file <- "data/profile_data.csv"
# Natural log that maps zeros to 0 instead of -Inf.
#
# x: numeric (or integer/logical) vector.
# Returns a double vector of the same length as `x`: 0 where `x == 0`,
# log(x) elsewhere. NA inputs stay NA; negative inputs yield NaN with the
# usual warning from log().
#
# The result is always double, including for empty or all-NA input (an
# ifelse()-based version returns a logical vector in those cases).
safe_log <- function(x) {
  out <- log(x)
  # which() drops NA comparisons, so missing values are left untouched.
  out[which(x == 0)] <- 0
  out
}
# Replace missing values in an integer vector with 0.
#
# x: any vector (in this file: one data-frame column at a time via lapply()).
# Returns `x` with every NA replaced by 0L when `x` is an integer vector;
# vectors of any other type are returned unchanged.
#
# The replacement is element-wise. A scalar test such as
# `is.na(x) && is.integer(x)` is not valid for a whole column: it errors for
# length > 1 conditions in R >= 4.3, and in older versions it only looked at
# the first element and then replaced the entire column with a single 0.
integer_NAs_to_zeros <- function(x) {
  if (is.integer(x)) {
    x[is.na(x)] <- 0L
  }
  x
}
# Read one source CSV file into a data.frame.
#
# f: path of the CSV file to read.
# Returns a data.frame in which every column has been passed through
# integer_NAs_to_zeros().
readData <- function(f) {
  message("reading source file at ", f)
  # Base read.csv() is used here (not readr::read_csv()). Strings are kept as
  # character, and "NA", "NaN" and a single space are all read as missing.
  myDF <- read.csv(
    f,
    stringsAsFactors = FALSE,
    na.strings = c("NA", "NaN", " ")
  )
  # `myDF[] <-` assigns the processed columns back in place, so the result
  # is still a data.frame rather than a plain list.
  myDF[] <- lapply(myDF, integer_NAs_to_zeros)
  myDF
}
# Read the tweet and profile tables only when they are not already present
# in the workspace, so re-running the script skips the file reads.
if (!exists("TweetDF")) {
  TweetDF <- readData(g_tweet_file)
}
if (!exists("ProfileDF")) {
  ProfileDF <- readData(g_profile_file)
}

# Join tweets to profiles on the account screen name (merge() keeps only
# rows that match in both tables by default).
MergedDF <- merge(
  x = TweetDF,
  y = ProfileDF,
  by.x = "Screen_Name",
  by.y = "screen_name"
)

# Log-transformed copies of three count columns; safe_log() is used so that
# zero counts do not become -Inf.
MergedDF$impressions_l <- safe_log(MergedDF$impressions)
MergedDF$profile_follower_count_l <- safe_log(MergedDF$profile_follower_count)
MergedDF$tweets_l <- safe_log(MergedDF$tweets)

# this seems to be giving some problems
# MergedDF$hour_category<-recode(MergedDF$hour_of_day,"0:3='12-3am';4:7='4-7am';8:11='8-11am';12:15='12-3pm';16:19='4-7pm';20:23='8-11pm'")

# Convert to a tibble and drop fully duplicated rows
# (this removes a few hundred observations).
MergedDF <- distinct(as_tibble(MergedDF))

# LIWC annotations: keep the tweet URL (stored as `Source (P)`, renamed to
# Tweet_URL) and the columns from WC through OtherP.
LIWC <- read_csv("data/tweet_annotated_liwc.csv") %>%
  select(Tweet_URL = `Source (P)`, WC:OtherP)

# Final analysis table: every merged tweet row, with LIWC columns attached
# where the tweet URL matches.
df <- left_join(MergedDF, LIWC, by = "Tweet_URL")
Models
# Model 1: intercept-only model with a random intercept per Screen_Name.
# Its ICC gives the share of variance in Total_Impressions that lies
# between accounts.
m1 <- lmer(Total_Impressions ~ 1 + (1 | Screen_Name), data = df)
sjstats::icc(m1) # 76.42 % variability at tweeter level
## Linear mixed model
## Family: gaussian (identity)
## Formula: Total_Impressions ~ 1 + (1 | Screen_Name)
##
## ICC (Screen_Name): 0.764196
# Model 2: add the tweeter (account-level) variables to the intercept-only
# model. Each predictor is standardized with scale(); the random intercept
# per Screen_Name is kept.
m2 <- lmer(
  Total_Impressions ~
    1 +
    scale(profile_follower_count) +
    scale(profile_following_count) +
    scale(tweets) +
    scale(retweets) +
    (1 | Screen_Name),
  data = df
)
# Results table; called with the sjPlot:: prefix like the later models, and
# with FALSE spelled out instead of the reassignable shorthand F.
sjPlot::sjt.lmer(
  m2,
  show.icc = TRUE,
  show.r2 = TRUE,
  p.kr = FALSE,
  show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
Dependent variable: Total_Impressions

| Predictor | B | CI | p |
|---|---|---|---|
| **Fixed Parts** | | | |
| (Intercept) | 7564.41 | 7144.88 – 7983.95 | <.001 |
| scale(profile_follower_count) | 31197.69 | 31097.91 – 31297.46 | <.001 |
| scale(profile_following_count) | -975.34 | -1066.41 – -884.27 | <.001 |
| scale(tweets) | -691.92 | -2052.50 – 668.66 | .319 |
| scale(retweets) | 4908.14 | 4704.89 – 5111.40 | <.001 |
| **Random Parts** | | | |
| N (Screen_Name) | 54027 | | |
| ICC (Screen_Name) | 0.096 | | |
| Observations | 191794 | | |
| R² / Ω₀² | .895 / .895 | | |
# Model 3: keep the tweeter variables from model 2 and add tweet-level
# variables (tweet length, tweet type, day of week, hour of day).
m3 <- lmer(
  Total_Impressions ~
    1 +
    scale(profile_follower_count) +
    scale(profile_following_count) +
    scale(tweets) +
    scale(retweets) +
    scale(n_chars) +
    Type + # consider re-leveling these
    day_of_week +
    as.factor(hour_of_day) + # need to recode this
    (1 | Screen_Name),
  data = df
)
# Results table; FALSE is spelled out instead of the reassignable shorthand F.
sjPlot::sjt.lmer(
  m3,
  show.icc = TRUE,
  show.r2 = TRUE,
  p.kr = FALSE,
  show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
Dependent variable: Total_Impressions

| Predictor | B | CI | p |
|---|---|---|---|
| **Fixed Parts** | | | |
| (Intercept) | 7356.51 | 6714.17 – 7998.85 | <.001 |
| scale(profile_follower_count) | 31116.95 | 31017.93 – 31215.98 | <.001 |
| scale(profile_following_count) | -1069.88 | -1160.28 – -979.48 | <.001 |
| scale(tweets) | -2702.43 | -4058.58 – -1346.27 | <.001 |
| scale(retweets) | 4497.96 | 4295.33 – 4700.60 | <.001 |
| scale(n_chars) | 698.57 | 638.53 – 758.61 | <.001 |
| Type (Regular) | 3330.99 | 3023.66 – 3638.32 | <.001 |
| Type (reply) | -6498.92 | -7206.24 – -5791.60 | <.001 |
| Type (retweet) | -1614.46 | -1902.44 – -1326.47 | <.001 |
| day_of_week (Monday) | 332.62 | 133.80 – 531.44 | .001 |
| day_of_week (Saturday) | 48.05 | -149.64 – 245.73 | .634 |
| day_of_week (Sunday) | 381.77 | 182.38 – 581.16 | <.001 |
| day_of_week (Thursday) | -115.22 | -301.20 – 70.76 | .225 |
| day_of_week (Tuesday) | -124.53 | -314.16 – 65.10 | .198 |
| day_of_week (Wednesday) | -20.82 | -200.81 – 159.16 | .821 |
| as.factor(hour_of_day) (1) | -251.43 | -785.88 – 283.02 | .356 |
| as.factor(hour_of_day) (2) | -43.69 | -560.43 – 473.06 | .868 |
| as.factor(hour_of_day) (3) | -103.80 | -585.94 – 378.34 | .673 |
| as.factor(hour_of_day) (4) | 20.33 | -434.65 – 475.31 | .930 |
| as.factor(hour_of_day) (5) | 236.32 | -210.17 – 682.81 | .300 |
| as.factor(hour_of_day) (6) | -87.78 | -529.70 – 354.14 | .697 |
| as.factor(hour_of_day) (7) | -73.34 | -518.06 – 371.39 | .747 |
| as.factor(hour_of_day) (8) | -84.52 | -529.88 – 360.83 | .710 |
| as.factor(hour_of_day) (9) | -324.28 | -768.95 – 120.39 | .153 |
| as.factor(hour_of_day) (10) | -272.61 | -721.28 – 176.06 | .234 |
| as.factor(hour_of_day) (11) | -487.27 | -940.98 – -33.56 | .035 |
| as.factor(hour_of_day) (12) | -319.42 | -773.15 – 134.30 | .168 |
| as.factor(hour_of_day) (13) | 86.89 | -367.69 – 541.47 | .708 |
| as.factor(hour_of_day) (14) | -42.53 | -499.44 – 414.37 | .855 |
| as.factor(hour_of_day) (15) | 45.58 | -410.06 – 501.23 | .845 |
| as.factor(hour_of_day) (16) | 65.01 | -384.93 – 514.94 | .777 |
| as.factor(hour_of_day) (17) | -38.47 | -491.90 – 414.96 | .868 |
| as.factor(hour_of_day) (18) | 34.90 | -417.59 – 487.40 | .880 |
| as.factor(hour_of_day) (19) | 238.65 | -218.14 – 695.43 | .306 |
| as.factor(hour_of_day) (20) | 282.73 | -185.93 – 751.39 | .237 |
| as.factor(hour_of_day) (21) | 192.13 | -305.12 – 689.38 | .449 |
| as.factor(hour_of_day) (22) | -12.39 | -535.05 – 510.27 | .963 |
| as.factor(hour_of_day) (23) | 148.56 | -379.35 – 676.46 | .581 |
| **Random Parts** | | | |
| N (Screen_Name) | 54027 | | |
| ICC (Screen_Name) | 0.097 | | |
| Observations | 191794 | | |
| R² / Ω₀² | .897 / .897 | | |
# Model 4: model 3 plus a few standardized LIWC variables
# (cogproc, social, affiliation, posemo).
m4 <- lmer(
  Total_Impressions ~
    1 +
    scale(profile_follower_count) +
    scale(profile_following_count) +
    scale(tweets) +
    scale(retweets) +
    scale(n_chars) +
    Type + # consider re-leveling these
    day_of_week +
    as.factor(hour_of_day) + # need to recode this
    scale(cogproc) +
    scale(social) +
    scale(affiliation) +
    scale(posemo) +
    (1 | Screen_Name),
  data = df
)
# Results table; FALSE is spelled out instead of the reassignable shorthand F.
sjPlot::sjt.lmer(
  m4,
  show.icc = TRUE,
  show.r2 = TRUE,
  p.kr = FALSE,
  show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
Dependent variable: Total_Impressions

| Predictor | B | CI | p |
|---|---|---|---|
| **Fixed Parts** | | | |
| (Intercept) | 7380.10 | 6737.71 – 8022.49 | <.001 |
| scale(profile_follower_count) | 31115.74 | 31016.75 – 31214.74 | <.001 |
| scale(profile_following_count) | -1067.86 | -1158.25 – -977.48 | <.001 |
| scale(tweets) | -2689.40 | -4044.92 – -1333.89 | <.001 |
| scale(retweets) | 4491.12 | 4288.57 – 4693.68 | <.001 |
| scale(n_chars) | 688.20 | 627.67 – 748.72 | <.001 |
| Type (Regular) | 3390.32 | 3079.63 – 3701.01 | <.001 |
| Type (reply) | -6460.37 | -7168.47 – -5752.27 | <.001 |
| Type (retweet) | -1692.06 | -1981.89 – -1402.24 | <.001 |
| day_of_week (Monday) | 337.45 | 138.60 – 536.30 | <.001 |
| day_of_week (Saturday) | 56.43 | -141.26 – 254.12 | .576 |
| day_of_week (Sunday) | 386.39 | 186.82 – 585.96 | <.001 |
| day_of_week (Thursday) | -114.26 | -300.25 – 71.74 | .229 |
| day_of_week (Tuesday) | -114.43 | -304.08 – 75.23 | .237 |
| day_of_week (Wednesday) | -19.98 | -199.99 – 160.04 | .828 |
| as.factor(hour_of_day) (1) | -231.32 | -765.76 – 303.12 | .396 |
| as.factor(hour_of_day) (2) | -35.63 | -552.35 – 481.08 | .892 |
| as.factor(hour_of_day) (3) | -102.45 | -584.56 – 379.65 | .677 |
| as.factor(hour_of_day) (4) | 18.34 | -436.62 – 473.31 | .937 |
| as.factor(hour_of_day) (5) | 234.73 | -211.72 – 681.19 | .303 |
| as.factor(hour_of_day) (6) | -90.47 | -532.37 – 351.42 | .688 |
| as.factor(hour_of_day) (7) | -77.37 | -522.07 – 367.32 | .733 |
| as.factor(hour_of_day) (8) | -91.13 | -536.47 – 354.20 | .688 |
| as.factor(hour_of_day) (9) | -326.50 | -771.13 – 118.13 | .150 |
| as.factor(hour_of_day) (10) | -278.45 | -727.08 – 170.19 | .224 |
| as.factor(hour_of_day) (11) | -492.70 | -946.37 – -39.03 | .033 |
| as.factor(hour_of_day) (12) | -322.16 | -775.85 – 131.53 | .164 |
| as.factor(hour_of_day) (13) | 88.73 | -365.81 – 543.27 | .702 |
| as.factor(hour_of_day) (14) | -45.99 | -502.87 – 410.90 | .844 |
| as.factor(hour_of_day) (15) | 37.62 | -418.00 – 493.23 | .871 |
| as.factor(hour_of_day) (16) | 55.75 | -394.16 – 505.67 | .808 |
| as.factor(hour_of_day) (17) | -44.55 | -497.98 – 408.89 | .847 |
| as.factor(hour_of_day) (18) | 29.98 | -422.50 – 482.46 | .897 |
| as.factor(hour_of_day) (19) | 229.03 | -227.76 – 685.81 | .326 |
| as.factor(hour_of_day) (20) | 272.44 | -196.22 – 741.09 | .255 |
| as.factor(hour_of_day) (21) | 185.45 | -311.77 – 682.67 | .465 |
| as.factor(hour_of_day) (22) | -9.29 | -531.91 – 513.33 | .972 |
| as.factor(hour_of_day) (23) | 155.94 | -371.94 – 683.83 | .563 |
| scale(cogproc) | 44.09 | -11.20 – 99.39 | .118 |
| scale(social) | 94.30 | 19.87 – 168.73 | .013 |
| scale(affiliation) | 37.85 | -24.96 – 100.66 | .238 |
| scale(posemo) | -120.74 | -173.27 – -68.22 | <.001 |
| **Random Parts** | | | |
| N (Screen_Name) | 54027 | | |
| ICC (Screen_Name) | 0.097 | | |
| Observations | 191794 | | |
| R² / Ω₀² | .897 / .897 | | |