# Loading, setting up

library(lme4)
library(tidyverse)
library(sjPlot)

# Paths to the input CSV files, relative to the working directory.
# Plain `<-` is used: at top level `<<-` first searches enclosing (package)
# environments and only falls back to the global environment, which is
# never what is wanted for a simple script-level constant.
g_tweet_file <- "data/tweet_annotated.csv"
g_profile_file <- "data/profile_data.csv"


#' Natural log that maps zero to zero
#'
#' Element-wise `log(x)`, except that entries equal to 0 yield 0 instead of
#' `-Inf`. Note that this makes 0 and 1 indistinguishable after the
#' transform (both become 0).
#'
#' @param x A numeric vector.
#' @return A vector the same length as `x`.
safe_log <- function(x) {
    is_zero <- x == 0
    ifelse(is_zero, 0, log(x))
}


#' Replace missing values in an integer vector with zero
#'
#' Intended to be applied column-by-column (e.g. via `lapply()` over a data
#' frame). Only vectors of integer type are touched; doubles, characters,
#' logicals and factors are returned unchanged.
#'
#' The replacement is vectorised: each `NA` element becomes `0L` and all
#' other elements are kept. (A scalar `if (is.na(x) && ...)` test on a whole
#' column is an error in R >= 4.3 and, in older versions, inspected only the
#' first element and replaced the entire column with a single 0.)
#'
#' @param x A vector (typically one data-frame column).
#' @return `x`, with `NA`s set to `0L` when `x` is an integer vector.
integer_NAs_to_zeros <- function(x) {
    if (is.integer(x)) {
        x[is.na(x)] <- 0L
    }
    x
}

#' Read a CSV source file into a data frame
#'
#' Reads `f` with base `read.csv()` (not `readr::read_csv()`), keeping strings
#' as character vectors and treating "NA", "NaN" and a single space as
#' missing. Every column is then passed through `integer_NAs_to_zeros()`.
#'
#' @param f Path to the CSV file.
#' @return A data frame with the file's contents.
readData <- function(f) {
    message("reading source  file at ", f)
    myDF <- read.csv(
        f,
        stringsAsFactors = FALSE,
        na.strings = c("NA", "NaN", " ")
    )

    # `myDF[] <-` keeps the data-frame structure while replacing each column.
    myDF[] <- lapply(myDF, integer_NAs_to_zeros)
    myDF
}

# Read each source table only when it is not already present in the
# workspace, so re-running the script skips the file reads.
if (!exists("TweetDF")) {
    TweetDF <- readData(g_tweet_file)
}

if (!exists("ProfileDF")) {
    ProfileDF <- readData(g_profile_file)
}

# Inner join of tweets and profiles on the tweeter's screen name (the key is
# spelled differently in the two tables).
MergedDF <- merge(
    TweetDF,
    ProfileDF,
    by.x = "Screen_Name",
    by.y = "screen_name"
)

# Log-transformed copies of the count variables (zeros stay zero).
MergedDF$tweets_l <- safe_log(MergedDF$tweets)
MergedDF$profile_follower_count_l <- safe_log(MergedDF$profile_follower_count)
MergedDF$impressions_l <- safe_log(MergedDF$impressions)

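# The recode below is disabled because it was giving problems. The
# "0:3='...';4:7='...'" string spec is car::recode() syntax; with tidyverse
# attached, `recode` resolves to dplyr::recode(), which does not accept it --
# presumably that is the cause (TODO confirm).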

# MergedDF$hour_category<-recode(MergedDF$hour_of_day,"0:3='12-3am';4:7='4-7am';8:11='8-11am';12:15='12-3pm';16:19='4-7pm';20:23='8-11pm'")

# Convert to a tibble and drop exact duplicate rows
# (this removes a few hundred observations).
MergedDF <- distinct(as_tibble(MergedDF))

# LIWC annotations: keep the `Source (P)` column (renamed to Tweet_URL so it
# can serve as the join key) plus the columns from WC through OtherP.
LIWC <- read_csv("data/tweet_annotated_liwc.csv") %>%
    select(Tweet_URL = `Source (P)`, WC:OtherP)

# Attach the LIWC columns to every merged tweet row.
df <- MergedDF %>%
    left_join(LIWC, by = "Tweet_URL")

# Models

# Null model: intercept only, with a random intercept per tweeter
# (Screen_Name). Used to see how much of the variance in Total_Impressions
# sits at the tweeter level.
m1 <- lmer(Total_Impressions ~ 1 + (1 | Screen_Name), data = df)

sjstats::icc(m1) # 76.42 % variability at tweeter level
## Linear mixed model
##  Family: gaussian (identity)
## Formula: Total_Impressions ~ 1 + (1 | Screen_Name)
## 
##   ICC (Screen_Name): 0.764196
# Let's add tweeter vars

# Model 2: null model plus standardized tweeter-level (profile) predictors.
m2 <- lmer(
    Total_Impressions ~
        1 +
        scale(profile_follower_count) +
        scale(profile_following_count) +
        scale(tweets) +
        scale(retweets) +
        (1 | Screen_Name),
    data = df
)

# Namespace-qualified like the later model tables; `FALSE` spelled out
# because `F` is an ordinary, reassignable variable.
sjPlot::sjt.lmer(
    m2,
    show.icc = TRUE,
    show.r2 = TRUE,
    p.kr = FALSE,
    show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
    Total_Impressions
    B CI p
Fixed Parts
(Intercept)   7564.41 7144.88 – 7983.95 <.001
scale(profile_follower_count)   31197.69 31097.91 – 31297.46 <.001
scale(profile_following_count)   -975.34 -1066.41 – -884.27 <.001
scale(tweets)   -691.92 -2052.50 – 668.66 .319
scale(retweets)   4908.14 4704.89 – 5111.40 <.001
Random Parts
NScreen_Name   54027
ICCScreen_Name   0.096
Observations   191794
R2 / Ω02   .895 / .895
# Let's add tweet vars

# Model 3: tweeter-level predictors plus tweet-level predictors
# (length, type, day of week, hour of day).
m3 <- lmer(
    Total_Impressions ~
        1 +
        # tweeter-level
        scale(profile_follower_count) +
        scale(profile_following_count) +
        scale(tweets) +
        scale(retweets) +
        # tweet-level
        scale(n_chars) +
        Type + # consider re-leveling these
        day_of_week +
        as.factor(hour_of_day) + # need to recode this
        (1 | Screen_Name),
    data = df
)

# `FALSE` spelled out because `F` is an ordinary, reassignable variable.
sjPlot::sjt.lmer(
    m3,
    show.icc = TRUE,
    show.r2 = TRUE,
    p.kr = FALSE,
    show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
    Total_Impressions
    B CI p
Fixed Parts
(Intercept)   7356.51 6714.17 – 7998.85 <.001
scale(profile_follower_count)   31116.95 31017.93 – 31215.98 <.001
scale(profile_following_count)   -1069.88 -1160.28 – -979.48 <.001
scale(tweets)   -2702.43 -4058.58 – -1346.27 <.001
scale(retweets)   4497.96 4295.33 – 4700.60 <.001
scale(n_chars)   698.57 638.53 – 758.61 <.001
Type (Regular)   3330.99 3023.66 – 3638.32 <.001
Type (reply)   -6498.92 -7206.24 – -5791.60 <.001
Type (retweet)   -1614.46 -1902.44 – -1326.47 <.001
day_of_week (Monday)   332.62 133.80 – 531.44 .001
day_of_week (Saturday)   48.05 -149.64 – 245.73 .634
day_of_week (Sunday)   381.77 182.38 – 581.16 <.001
day_of_week (Thursday)   -115.22 -301.20 – 70.76 .225
day_of_week (Tuesday)   -124.53 -314.16 – 65.10 .198
day_of_week (Wednesday)   -20.82 -200.81 – 159.16 .821
as.factor(hour_of_day) (1)   -251.43 -785.88 – 283.02 .356
as.factor(hour_of_day) (2)   -43.69 -560.43 – 473.06 .868
as.factor(hour_of_day) (3)   -103.80 -585.94 – 378.34 .673
as.factor(hour_of_day) (4)   20.33 -434.65 – 475.31 .930
as.factor(hour_of_day) (5)   236.32 -210.17 – 682.81 .300
as.factor(hour_of_day) (6)   -87.78 -529.70 – 354.14 .697
as.factor(hour_of_day) (7)   -73.34 -518.06 – 371.39 .747
as.factor(hour_of_day) (8)   -84.52 -529.88 – 360.83 .710
as.factor(hour_of_day) (9)   -324.28 -768.95 – 120.39 .153
as.factor(hour_of_day) (10)   -272.61 -721.28 – 176.06 .234
as.factor(hour_of_day) (11)   -487.27 -940.98 – -33.56 .035
as.factor(hour_of_day) (12)   -319.42 -773.15 – 134.30 .168
as.factor(hour_of_day) (13)   86.89 -367.69 – 541.47 .708
as.factor(hour_of_day) (14)   -42.53 -499.44 – 414.37 .855
as.factor(hour_of_day) (15)   45.58 -410.06 – 501.23 .845
as.factor(hour_of_day) (16)   65.01 -384.93 – 514.94 .777
as.factor(hour_of_day) (17)   -38.47 -491.90 – 414.96 .868
as.factor(hour_of_day) (18)   34.90 -417.59 – 487.40 .880
as.factor(hour_of_day) (19)   238.65 -218.14 – 695.43 .306
as.factor(hour_of_day) (20)   282.73 -185.93 – 751.39 .237
as.factor(hour_of_day) (21)   192.13 -305.12 – 689.38 .449
as.factor(hour_of_day) (22)   -12.39 -535.05 – 510.27 .963
as.factor(hour_of_day) (23)   148.56 -379.35 – 676.46 .581
Random Parts
NScreen_Name   54027
ICCScreen_Name   0.097
Observations   191794
R2 / Ω02   .897 / .897
# adding a few LIWC variables

# Model 4: model 3 plus four standardized LIWC variables
# (cogproc, social, affiliation, posemo).
m4 <- lmer(
    Total_Impressions ~
        1 +
        # tweeter-level
        scale(profile_follower_count) +
        scale(profile_following_count) +
        scale(tweets) +
        scale(retweets) +
        # tweet-level
        scale(n_chars) +
        Type + # consider re-leveling these
        day_of_week +
        as.factor(hour_of_day) + # need to recode this
        # LIWC
        scale(cogproc) +
        scale(social) +
        scale(affiliation) +
        scale(posemo) +
        (1 | Screen_Name),
    data = df
)

# `FALSE` spelled out because `F` is an ordinary, reassignable variable.
sjPlot::sjt.lmer(
    m4,
    show.icc = TRUE,
    show.r2 = TRUE,
    p.kr = FALSE,
    show.re.var = FALSE
)
## Computing p-values via Wald-statistics approximation (treating t as Wald z).
    Total_Impressions
    B CI p
Fixed Parts
(Intercept)   7380.10 6737.71 – 8022.49 <.001
scale(profile_follower_count)   31115.74 31016.75 – 31214.74 <.001
scale(profile_following_count)   -1067.86 -1158.25 – -977.48 <.001
scale(tweets)   -2689.40 -4044.92 – -1333.89 <.001
scale(retweets)   4491.12 4288.57 – 4693.68 <.001
scale(n_chars)   688.20 627.67 – 748.72 <.001
Type (Regular)   3390.32 3079.63 – 3701.01 <.001
Type (reply)   -6460.37 -7168.47 – -5752.27 <.001
Type (retweet)   -1692.06 -1981.89 – -1402.24 <.001
day_of_week (Monday)   337.45 138.60 – 536.30 <.001
day_of_week (Saturday)   56.43 -141.26 – 254.12 .576
day_of_week (Sunday)   386.39 186.82 – 585.96 <.001
day_of_week (Thursday)   -114.26 -300.25 – 71.74 .229
day_of_week (Tuesday)   -114.43 -304.08 – 75.23 .237
day_of_week (Wednesday)   -19.98 -199.99 – 160.04 .828
as.factor(hour_of_day) (1)   -231.32 -765.76 – 303.12 .396
as.factor(hour_of_day) (2)   -35.63 -552.35 – 481.08 .892
as.factor(hour_of_day) (3)   -102.45 -584.56 – 379.65 .677
as.factor(hour_of_day) (4)   18.34 -436.62 – 473.31 .937
as.factor(hour_of_day) (5)   234.73 -211.72 – 681.19 .303
as.factor(hour_of_day) (6)   -90.47 -532.37 – 351.42 .688
as.factor(hour_of_day) (7)   -77.37 -522.07 – 367.32 .733
as.factor(hour_of_day) (8)   -91.13 -536.47 – 354.20 .688
as.factor(hour_of_day) (9)   -326.50 -771.13 – 118.13 .150
as.factor(hour_of_day) (10)   -278.45 -727.08 – 170.19 .224
as.factor(hour_of_day) (11)   -492.70 -946.37 – -39.03 .033
as.factor(hour_of_day) (12)   -322.16 -775.85 – 131.53 .164
as.factor(hour_of_day) (13)   88.73 -365.81 – 543.27 .702
as.factor(hour_of_day) (14)   -45.99 -502.87 – 410.90 .844
as.factor(hour_of_day) (15)   37.62 -418.00 – 493.23 .871
as.factor(hour_of_day) (16)   55.75 -394.16 – 505.67 .808
as.factor(hour_of_day) (17)   -44.55 -497.98 – 408.89 .847
as.factor(hour_of_day) (18)   29.98 -422.50 – 482.46 .897
as.factor(hour_of_day) (19)   229.03 -227.76 – 685.81 .326
as.factor(hour_of_day) (20)   272.44 -196.22 – 741.09 .255
as.factor(hour_of_day) (21)   185.45 -311.77 – 682.67 .465
as.factor(hour_of_day) (22)   -9.29 -531.91 – 513.33 .972
as.factor(hour_of_day) (23)   155.94 -371.94 – 683.83 .563
scale(cogproc)   44.09 -11.20 – 99.39 .118
scale(social)   94.30 19.87 – 168.73 .013
scale(affiliation)   37.85 -24.96 – 100.66 .238
scale(posemo)   -120.74 -173.27 – -68.22 <.001
Random Parts
NScreen_Name   54027
ICCScreen_Name   0.097
Observations   191794
R2 / Ω02   .897 / .897