# 0. Loading and Setup

# Clear every object from the calling environment before starting.
# NOTE(review): this wipes the user's workspace when the script is sourced
# interactively; consider running in a fresh session instead.
remove(list = ls())

library(tidyverse)
library(stringr)
library(lubridate)
library(ggthemes)
library(cowplot)
library(igraph)
library(GGally)

# NOTE(review): the input paths below are machine-specific (home-directory and
# absolute paths); the script only runs where those files exist.

# data for cohort 2 processed with the Twitter Data Suite
# (used in the Processing section to build the interaction edgelists)
df_c2 <- read_csv("~/Google Drive/1_Research/Twitter_Data_Suite/hashtag/urbanstem_cohort2/2_tweets.csv")

# raw data collected with Agarwal's Twitter tracker;
# keep only rows whose `hashtag` column equals "#msurbanstem" (case-insensitive)
# NOTE(review): no `date` column is created for df_t1 here, yet the time-series
# section reads df_all$date -- presumably this file already ships a parsed
# `date` column; confirm.
df_t1 <- read_csv("/Users/joshuarosenberg/Google Drive/6_Data/MAET/maet_data_1-2014-7-2015.csv")
df_t1 <- filter(df_t1, tolower(hashtag) == "#msurbanstem")

# raw data collected with TAGS
df_t2 <- read_csv("/Users/joshuarosenberg/Google Drive/6_Data/MAET/maet_data_7_2015-12-2016.csv")
# Parse the `created_at` strings into a date-time column named `date`.
# NOTE(review): the format string looks like weekday, month, day, time, UTC
# offset, year -- confirm against the raw `created_at` values.
df_t2$date <- lubridate::parse_date_time(df_t2$created_at, "%a %b! %d! %H! %M! %S! %z! %y!")
# keep only rows whose `entities_str` mentions "msurbanstem" (case-insensitive)
df_t2 <- filter(df_t2, stringr::str_detect(tolower(df_t2$entities_str), "msurbanstem"))

# user-to-group lookup table; joined onto the edgelist in the Processing section
membership <- read_csv("~/Dropbox/1_Research/MSU Urban STEM SNA/membership.csv")

# stack both raw collections into one table for the time series
df_all <- bind_rows(df_t1, df_t2)

# 1. Time Series

# Count tweets per calendar day.
# floor_date() is used instead of round_date(): rounding to the nearest day
# pushes every tweet posted after noon into the following day, which distorts
# a "tweets per day" count.
df_all$date_day <- floor_date(df_all$date, "day")
to_plot <- count(df_all, date_day)

# x-axis window for the plot
the_limits <- as.POSIXct(c(ymd("2014-06-01"), ymd("2017-07-30")))

# start, end and label position for each cohort's shaded band
first_cohort_begins <- as.POSIXct(ymd("2014-07-15"))
first_cohort_ends <- as.POSIXct(ymd("2015-06-30"))
first_cohort_annotation <- as.POSIXct(ymd("2014-12-25"))

second_cohort_begins <- as.POSIXct(ymd("2015-07-15"))
second_cohort_ends <- as.POSIXct(ymd("2016-06-30"))
second_cohort_annotation <- as.POSIXct(ymd("2015-12-25"))

third_cohort_begins <- as.POSIXct(ymd("2016-07-15"))
third_cohort_ends <- as.POSIXct(ymd("2017-06-30"))
third_cohort_annotation <- as.POSIXct(ymd("2016-12-25"))

# Opacity of the cohort bands. The bands are drawn with annotate("rect"), which
# adds exactly one rectangle per band. geom_rect(aes(<constants>)) would draw
# one rectangle per row of `to_plot`, so the visible opacity would depend on
# how many days of data there are.
cohort_band_alpha <- 0.3

timeseries_plot <- ggplot(to_plot, aes(x = date_day, y = n)) +
    # bands first so the line is drawn on top of them
    annotate("rect",
             xmin = first_cohort_begins,
             xmax = first_cohort_ends,
             ymin = -Inf,
             ymax = Inf,
             fill = "#d8b365",
             alpha = cohort_band_alpha) +
    annotate("rect",
             xmin = second_cohort_begins,
             xmax = second_cohort_ends,
             ymin = -Inf,
             ymax = Inf,
             fill = "#f5f5f5",
             alpha = cohort_band_alpha) +
    annotate("rect",
             xmin = third_cohort_begins,
             xmax = third_cohort_ends,
             ymin = -Inf,
             ymax = Inf,
             fill = "#5ab4ac",
             alpha = cohort_band_alpha) +
    geom_line() +
    scale_x_datetime(limits = the_limits) +
    annotate("text", x = first_cohort_annotation, y = 175, label = "1st Cohort") +
    annotate("text", x = second_cohort_annotation, y = 175, label = "2nd Cohort") +
    annotate("text", x = third_cohort_annotation, y = 175, label = "3rd Cohort") +
    theme_minimal() +
    ylab("Number of Tweets Per Day") +
    xlab(NULL) +
    ggtitle(NULL) +
    theme(text = element_text(size = 16, family = "Times"))

timeseries_plot

# pass the plot explicitly rather than relying on ggplot2's "last plot" state
ggsave("msurbanstem_timeseries.png", plot = timeseries_plot, width = 10, height = 9)

# 2. Processing

# Build a long-format edgelist from a tweet-level data frame.
#
# Only rows whose `type` is "ORIG" or "REPLY" are used. For each such row the
# value in `sender_col` is split on "*" and one output row is produced per
# piece, paired with that row's `receiver_col` value.
#
# Arguments:
#   df           data frame with a `type` column plus the two columns named by
#                `sender_col` and `receiver_col`
#   sender_col   name (string) of the column holding "*"-separated user names
#   receiver_col name (string) of the column holding a single user name
#
# Returns a tibble with character columns `receiver`, `sender` (both
# lower-cased) and `var` (always equal to `sender_col`). Rows with a missing
# sender are dropped. Zero matching rows yield a zero-row tibble.
create_the_edgelist <- function(df, sender_col, receiver_col){

    # fail early with a clear message instead of a NULL column further down
    missing_cols <- setdiff(c("type", sender_col, receiver_col), names(df))
    if (length(missing_cols) > 0) {
        stop("create_the_edgelist: column(s) not found in `df`: ",
             paste(missing_cols, collapse = ", "),
             call. = FALSE)
    }

    df_ss <- dplyr::filter(df, type == "ORIG" | type == "REPLY")

    receiver <- as.character(df_ss[[receiver_col]])

    # one character vector of sender names per tweet (NA stays a single NA)
    sender <- stringr::str_split(df_ss[[sender_col]], "\\*")

    # repeat each receiver once per sender found in the same tweet
    edges <- dplyr::tibble(
        receiver = tolower(rep(receiver, lengths(sender))),
        sender = tolower(unlist(sender)),
        var = sender_col
    )

    dplyr::filter(edges, !is.na(sender))

}

# One edgelist per interaction type, all read from the cohort-2 tweet table.
# In every case the tweet's `screen_name` is the receiver column.
favorites <- create_the_edgelist(df = df_c2, sender_col = "favNames", receiver_col = "screen_name")
mentions <- create_the_edgelist(df = df_c2, sender_col = "non_reply_mentions", receiver_col = "screen_name")
retweets <- create_the_edgelist(df = df_c2, sender_col = "rtNames", receiver_col = "screen_name")
replies <- create_the_edgelist(df = df_c2, sender_col = "reply_user_sn", receiver_col = "screen_name")

all_df <- bind_rows(favorites, mentions, retweets, replies)

# Replace the source column names stored in `var` with display labels.
# Any value without an entry in the lookup becomes NA.
interaction_labels <- c(
    favNames = "Favorites",
    non_reply_mentions = "Mentions",
    rtNames = "Retweets",
    reply_user_sn = "Replies"
)
all_df$var <- unname(interaction_labels[all_df$var])

# Joining membership data
#
# Collapse the numeric membership codes into four labelled groups in three
# recode passes. Per car::recode's documented behaviour, each input value is
# matched against the specifications left to right and the first match wins.
# Net effect of the three passes on the codes listed below:
#   10 -> "1st Cohort", 9 -> "2nd Cohort", 11 -> "Instructional Team",
#   1 through 8 -> "Other".
# NOTE(review): codes outside 1-11 (and NA) are not mentioned in any
# specification -- confirm none occur in membership.csv.

membership$membership <- car::recode(membership$membership, "c(2, 3) = 6; 4 = 7; c(1, 7, 6) = 4; 5 = 5; 10 = 1; 9 = 2; 11 = 3; 8 = 8")
membership$membership <- car::recode(membership$membership, "c(4, 5, 6, 7, 8) = 4")
membership$membership <- car::recode(membership$membership, "1 = '1st Cohort'; 2 = '2nd Cohort'; 3 = 'Instructional Team'; 4 = 'Other'")

# Duplicate the group label and the user name under sender_* and receiver_*
# names so the same lookup table can be joined once per edge endpoint.
membership <- rename(membership, receiver_membership = membership)
membership <- mutate(membership, sender_membership = receiver_membership)

membership <- rename(membership, sender = users)
membership <- mutate(membership, receiver = sender)

# Attach each endpoint's group to every edge.
# NOTE(review): the edgelist user names are lower-cased; this join assumes
# `users` in membership.csv is lower-case too, and has one row per user
# (duplicates would multiply edges) -- confirm.
all_df_ss <- left_join(all_df, select(membership, sender, sender_membership), by = "sender")
all_df_ss <- left_join(all_df_ss, select(membership, receiver, receiver_membership), by = "receiver")

# Column order matters: later code takes the first two columns as the edge
# endpoints.
all_df_ss <- select(all_df_ss, sender, receiver, sender_membership, receiver_membership, var)
# users with no row in the membership table are labelled "Other"
all_df_ss[is.na(all_df_ss$sender_membership), "sender_membership"] <- "Other"
all_df_ss[is.na(all_df_ss$receiver_membership), "receiver_membership"] <- "Other"

# filtering others
# keep only edges whose sender also appears somewhere in the receiver column

all_df_filtered <- filter(all_df_ss, sender %in% receiver)

# adding 1st and 2nd cohort and instructors with no ties
# (not implemented; the exploratory query below is left disabled)

# all_df_filtered %>% 
#     filter(receiver_membership == "1st Cohort" |
#                receiver_membership == "2nd Cohort" |
#                    receiver_membership == "Instructional Team") %>% 
#     group_by(var, sender) %>% 
#     summarize(n = n()) %>% View()

# 3. Degree Plots

# Histogram of vertex degree for one interaction type, filled by group.
#
# Arguments:
#   edgelist     data frame with columns `sender`, `receiver`,
#                `sender_membership`, `receiver_membership` and `var`
#   selected_var value of `var` selecting the interaction type to plot
#   mode         degree mode passed to igraph::degree() ("in", "out", ...)
#
# Returns a ggplot object. Its data has one row per user that appears in the
# selected edges, with that user's degree (capped at 25) and that user's own
# group.
#
# Degree is computed once on the graph of all selected edges and each vertex
# is labelled with its own membership. Building a separate graph per group
# from edges filtered on one endpoint's membership would also pull in the
# other endpoint's vertices, and those would be counted under the wrong group
# with a degree of 0.
create_density_plots <- function(edgelist, selected_var, mode){

    edgelist_ss <- dplyr::filter(edgelist, var == selected_var)

    # one row per user, carrying the group recorded for that user
    vertex_df <- dplyr::bind_rows(
        dplyr::select(edgelist_ss, username = sender, group = sender_membership),
        dplyr::select(edgelist_ss, username = receiver, group = receiver_membership)
    )
    vertex_df <- dplyr::distinct(vertex_df, username, .keep_all = TRUE)

    g <- igraph::graph_from_data_frame(
        dplyr::select(edgelist_ss, sender, receiver),
        directed = TRUE,
        vertices = vertex_df
    )

    # igraph returns degrees in vertex order, which is the row order of
    # `vertex_df`; values above 25 are collapsed into the last bin
    df <- dplyr::tibble(
        degree = pmin(unname(igraph::degree(g, mode = mode)), 25),
        group = vertex_df$group
    )

    ggplot2::ggplot(df, ggplot2::aes(x = degree, fill = group)) +
        ggplot2::geom_histogram(binwidth = 1) +
        ggplot2::ylab("Number of Participants") +
        ggplot2::xlab(paste0(Hmisc::capitalize(mode), "-degree")) +
        ggplot2::ggtitle(selected_var) +
        ggplot2::theme(legend.title = ggplot2::element_text(size = 16),
                       text = ggplot2::element_text(family = "Times")) +
        ggplot2::scale_fill_discrete("") +
        ggplot2::xlim(-.5, 25.5)

}

# Arrange one degree histogram per interaction type in a two-row grid with a
# single shared legend on top.
#
# Arguments:
#   df                 edgelist passed on to create_density_plots()
#   interaction_vector character vector of `var` values, one panel each
#   mode               degree mode passed on to create_density_plots()
#
# Returns the combined cowplot grid.
create_plot_grid <- function(df, interaction_vector, mode){

    # use the `df` argument, not a global, so the caller controls the data
    panels <- lapply(interaction_vector, function(interaction_type) {
        create_density_plots(df, interaction_type, mode)
    })

    # take the legend from the first panel, then strip it from every panel
    legend_b <- cowplot::get_legend(panels[[1]] + ggplot2::theme(legend.position = "bottom"))

    panels <- lapply(panels, function(panel) {
        panel + ggplot2::theme(legend.position = "none")
    })

    prow <- cowplot::plot_grid(plotlist = panels, nrow = 2)

    # the leading NULL is an empty slot reserved for an optional title row
    title <- NULL

    cowplot::plot_grid(title, legend_b, prow, nrow = 3, rel_heights = c(.01, .075, 1))
}

# Interaction types shown in the degree grids, in panel order.
degree_interactions <- c("Mentions",
                         "Replies",
                         "Favorites",
                         "Retweets")

in_plot <- create_plot_grid(all_df_filtered,
                            degree_interactions,
                            "in")

in_plot

# pass the plot explicitly rather than relying on ggplot2's "last plot" state
ggsave("indegree_plot.png", plot = in_plot, width = 6, height = 6)

out_plot <- create_plot_grid(all_df_filtered,
                             degree_interactions,
                             "out")

out_plot

ggsave("outdegree_plot.png", plot = out_plot, width = 6, height = 6)

# ggsave("p.png", width = 10, height = 10)

# 4. Sociograms

# Draw a sociogram (directed network plot) for one interaction type.
#
# Arguments:
#   df          edgelist with columns `sender`, `receiver`,
#               `sender_membership`, `receiver_membership` and `var`
#   interaction value of `var` selecting the interaction type to draw
#
# Returns a ggplot object produced by GGally::ggnet2(). Vertices are coloured
# by membership and sized by total degree; repeated edges between the same
# pair are merged and drawn thicker.
make_the_graph <- function(df, interaction){

    # selected interaction only, without self-ties
    df_ss <- dplyr::filter(df, var == interaction, sender != receiver)

    # one row per user with the group recorded for that user
    vertex_df <- dplyr::bind_rows(
        dplyr::select(df_ss, username = sender, membership = sender_membership),
        dplyr::select(df_ss, username = receiver, membership = receiver_membership)
    )
    vertex_df <- dplyr::distinct(vertex_df, username, .keep_all = TRUE)

    g <- igraph::graph_from_data_frame(
        dplyr::select(df_ss, sender, receiver),
        vertices = vertex_df,
        directed = TRUE
    )

    # give every edge weight 1, then merge parallel edges by summing, so the
    # weight becomes the number of interactions between a pair.
    # igraph::simplify is namespaced because purrr also exports `simplify`.
    g <- igraph::set_edge_attr(g, "weight", value = 1)
    g <- igraph::simplify(g,
                          remove.multiple = TRUE,
                          remove.loops = TRUE,
                          edge.attr.comb = list(weight = "sum"))

    # vertex size: total degree of the simplified graph, scaled down
    g <- igraph::set_vertex_attr(g, "degree_s",
                                 value = igraph::degree(g, mode = "all") / 3)
    g <- igraph::set_vertex_attr(g, "degree", value = igraph::degree(g, mode = "all"))

    # edge width: weight capped at 25, scaled down. A cap below the threshold
    # would draw the heaviest ties thinner than moderately heavy ones.
    edge_weight <- igraph::edge_attr(g, "weight")
    g <- igraph::set_edge_attr(g, "weight_s", value = pmin(edge_weight, 25) / 5)

    GGally::ggnet2(g,
                   color = "membership",
                   size = "degree_s",
                   palette = "Set1",
                   directed = TRUE,
                   arrow.size = 3,
                   edge.size = "weight_s",
                   label = FALSE,
                   arrow.gap = .05) +
        ggplot2::theme(legend.title = ggplot2::element_blank(),
                       text = ggplot2::element_text(size = 25, family = "Times")) +
        # hide the size legend
        ggplot2::guides(size = "none") +
        ggplot2::ggtitle(Hmisc::capitalize(interaction))

}

# Arrange one sociogram per interaction type in a two-row grid with a single
# shared legend on top.
#
# Arguments:
#   df                 edgelist passed on to make_the_graph()
#   interaction_vector character vector of `var` values, one panel each
#   mode               unused; kept (now with a default) so existing calls
#                      that pass or omit it keep working
#
# Returns the combined cowplot grid.
create_soc_grid <- function(df, interaction_vector, mode = NULL){

    sociograms <- lapply(interaction_vector, function(interaction_type) {
        make_the_graph(df, interaction_type)
    })

    # take the legend from the first panel, then strip it from every panel
    legend_b <- cowplot::get_legend(sociograms[[1]] + ggplot2::theme(legend.position = "bottom"))

    sociograms <- lapply(sociograms, function(panel) {
        panel + ggplot2::theme(legend.position = "none")
    })

    prow <- cowplot::plot_grid(plotlist = sociograms, nrow = 2)

    # the leading NULL is an empty slot reserved for an optional title row
    title <- NULL

    cowplot::plot_grid(title, legend_b, prow, nrow = 3, rel_heights = c(.01, .075, 1))
}
       
# Build the four-panel sociogram grid, show it, and save it.
sociogram_plot <- create_soc_grid(all_df_filtered,
                                  c("Mentions",
                                    "Replies",
                                    "Favorites",
                                    "Retweets"))

sociogram_plot

# pass the plot explicitly rather than relying on ggplot2's "last plot" state
ggsave("sociogram.png", plot = sociogram_plot, width = 7, height = 7)