loading data, setting up
library(tidyverse)
library(lme4)
rm(list = ls())
df <- read_rds("dc_16_no_essays.rds")
df$prop_requested <- df$`Project Donation Total Amount` /
df$`Project Total Price Excluding Optional Support`
df$funded <- ifelse(df$prop_requested >= 1, 1, 0)
tweets <- readr::read_csv("processed_dc.csv")
names(tweets) <- "url"
the_bool <- stringr::str_detect(tweets$url, "donorschoose")
out_ss <- as.vector(tweets$url[the_bool])
the_split <- stringr::str_split(out_ss, "/")
the_ID <- as.numeric(sapply(the_split, function(x) x[6]))
the_split1 <- stringr::str_split(df$`Project Url`, "/")
df$url <- as.numeric(sapply(the_split1, function(x) x[5]))
df$tweeted <- df$url %in% the_ID
url_freqs <- as.data.frame(table(the_ID))
names(url_freqs) <- c('url', "number_of_tweets")
url_freqs$url <- as.numeric(url_freqs$url)
df <- left_join(df, url_freqs, by = "url")
# filtering
df$`Project Posted Date` <- lubridate::ymd(df$`Project Posted Date`)
aft <- lubridate::ymd("2015-01-01")
bef <- lubridate::ymd("2016-12-31")
df_ss <- filter(df, `Project Posted Date` >= aft & `Project Posted Date` <= bef)
# final processing
df_ss$tweeted <- as.numeric(df_ss$tweeted)
df_ss$number_of_tweets <- ifelse(is.na(df_ss$number_of_tweets), 0, df_ss$number_of_tweets)
With a dummy code for whether the URL was tweeted.
m1 <- glm(funded ~
tweeted,
data = df_ss,
family = binomial(link = 'logit'))
arm::display(m1)
## glm(formula = funded ~ tweeted, family = binomial(link = "logit"),
## data = df_ss)
## coef.est coef.se
## (Intercept) 0.60 0.00
## tweeted 0.15 0.02
## ---
## n = 467813, k = 2
## residual deviance = 606868.7, null deviance = 606948.4 (difference = 79.7)
jmRtools::convert_log_odds(coef(m1))
## (Intercept) tweeted
## 1.830669 1.157345
With project variables.
m2 <- glm(funded ~
tweeted +
`Project Subject Category` +
`Project Grade Level`,
data = df_ss,
family = binomial(link = 'logit'))
arm::display(m2)
## glm(formula = funded ~ tweeted + `Project Subject Category` +
## `Project Grade Level`, family = binomial(link = "logit"),
## data = df_ss)
## coef.est coef.se
## (Intercept) 0.53 0.01
## tweeted 0.15 0.02
## `Project Subject Category`Food, Warmth & Care -0.56 0.06
## `Project Subject Category`Health & Sports -0.15 0.02
## `Project Subject Category`History & Civics 0.11 0.02
## `Project Subject Category`Literacy & Language 0.09 0.01
## `Project Subject Category`Math & Science 0.00 0.01
## `Project Subject Category`Music & The Arts 0.14 0.02
## `Project Subject Category`Special Needs 0.11 0.02
## `Project Grade Level`Grades 6-8 0.00 0.01
## `Project Grade Level`Grades 9-12 0.11 0.01
## `Project Grade Level`Grades PreK-2 0.04 0.01
## ---
## n = 467779, k = 12
## residual deviance = 606012.6, null deviance = 606903.0 (difference = 890.4)
With school variables.
m3 <- glm(funded ~
tweeted +
`Project Subject Category` +
`Project Grade Level` +
`School Poverty Level` +
`School Metro Area`,
data = df_ss,
family = binomial(link = 'logit'))
arm::display(m3)
## glm(formula = funded ~ tweeted + `Project Subject Category` +
## `Project Grade Level` + `School Poverty Level` + `School Metro Area`,
## family = binomial(link = "logit"), data = df_ss)
## coef.est coef.se
## (Intercept) 0.36 0.02
## tweeted 0.17 0.02
## `Project Subject Category`Food, Warmth & Care -0.56 0.07
## `Project Subject Category`Health & Sports -0.16 0.02
## `Project Subject Category`History & Civics 0.10 0.02
## `Project Subject Category`Literacy & Language 0.08 0.02
## `Project Subject Category`Math & Science 0.01 0.02
## `Project Subject Category`Music & The Arts 0.13 0.02
## `Project Subject Category`Special Needs 0.11 0.02
## `Project Grade Level`Grades 6-8 -0.01 0.01
## `Project Grade Level`Grades 9-12 0.10 0.01
## `Project Grade Level`Grades PreK-2 0.03 0.01
## `School Poverty Level`upper income 0.00 0.01
## `School Metro Area`suburban 0.12 0.01
## `School Metro Area`urban 0.28 0.01
## ---
## n = 407531, k = 15
## residual deviance = 525060.9, null deviance = 526849.9 (difference = 1789.1)
With teacher variable
m4 <- glm(funded ~
tweeted +
`Project Subject Category` +
`Project Grade Level` +
`School Poverty Level` +
`School Metro Area` +
`Teacher Lifetime Donations` +
`Teacher Years with Projects`,
data = df_ss,
family = binomial(link = 'logit'))
arm::display(m4)
## glm(formula = funded ~ tweeted + `Project Subject Category` +
## `Project Grade Level` + `School Poverty Level` + `School Metro Area` +
## `Teacher Lifetime Donations` + `Teacher Years with Projects`,
## family = binomial(link = "logit"), data = df_ss)
## coef.est coef.se
## (Intercept) 0.38 0.02
## tweeted 0.15 0.02
## `Project Subject Category`Food, Warmth & Care -0.65 0.07
## `Project Subject Category`Health & Sports -0.18 0.02
## `Project Subject Category`History & Civics 0.09 0.02
## `Project Subject Category`Literacy & Language 0.09 0.02
## `Project Subject Category`Math & Science 0.02 0.02
## `Project Subject Category`Music & The Arts 0.11 0.02
## `Project Subject Category`Special Needs 0.12 0.02
## `Project Grade Level`Grades 6-8 -0.01 0.01
## `Project Grade Level`Grades 9-12 0.11 0.01
## `Project Grade Level`Grades PreK-2 0.03 0.01
## `School Poverty Level`upper income 0.01 0.01
## `School Metro Area`suburban 0.13 0.01
## `School Metro Area`urban 0.30 0.01
## `Teacher Lifetime Donations` 0.00 0.00
## `Teacher Years with Projects` -0.04 0.00
## ---
## n = 401127, k = 17
## residual deviance = 516321.9, null deviance = 519267.4 (difference = 2945.5)
With a dummy code for whether the URL was tweeted.
model1 <- lme4::glmer(funded ~
tweeted +
(1 | `School State`),
family = "binomial",
data = df_ss)
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: funded ~ tweeted + (1 | `School State`)
## Data: df_ss
##
## AIC BIC logLik deviance df.resid
## 603505.7 603538.8 -301749.8 603499.7 467810
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.9283 -1.2530 0.6795 0.7461 0.8628
##
## Random effects:
## Groups Name Variance Std.Dev.
## School State (Intercept) 0.04727 0.2174
## Number of obs: 467813, groups: School State, 51
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.65658 0.02724 24.10 <2e-16 ***
## tweeted 0.15372 0.01623 9.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## tweeted -0.015
sjstats::icc(model1)
## Generalized linear mixed model
## Family: binomial (logit)
## Formula: funded ~ tweeted + (1 | `School State`)
##
## ICC (School State): 0.014165