Introduction

I used TAGS to archive tweets from the NARST conference and became interested in comparing them with tweets from the NSTA conference: NARST consists primarily of presentations by researchers, while NSTA consists primarily of presentations by teachers. I used Tweet Archivist to access the NSTA tweets. I wondered whether comparing the two could reveal what teachers and researchers discuss on Twitter at science education conferences, and whether inferences could be drawn about what is important to each group.

Method

First, set the working directory, load the packages, and load the data frame.

setwd("~/Dropbox/research/sci_tech_tags/")

library(dplyr)
library(tidyr)
library(SnowballC)
library(tm)
library(lsa)
library(ggplot2)
library(ppls)

# Loads the data frame and samples tweets for analysis

data <- readRDS("nsta_narst_df.rds")
sampled_data <- sample_n(data, 1254)  # random sample of 1254 tweets
text_vector <- sampled_data$text

Next, set some preliminary variables.

# Number of clusters

n_clusters <- 8

# Standard stopwords

standard_stopwords <- c("a", "an", "the", "to", "of", "and", "for", "by", "on",
    "is", "I", "all", "this", "with", "it", "at", "from", "or", "you", "as",
    "your", "are", "be", "that", "not", "have", "was", "we", "what", "which",
    "there", "they", "he", "she", "his", "hers", "had", "word", "our", "about",
    "but")

# Additional stopwords

additional_stopwords <- c("amp", "http", "him", "httpt", "nsta", "narst")

# Combine standard and additional stopwords
custom_stopwords <- append(standard_stopwords, additional_stopwords)
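
As an aside, tm also ships a standard English stopword list; an equivalent approach (not used here, to keep the hand-built list small) would combine it with the additional terms:

# Alternative: tm's built-in English stopword list plus the additional terms
# custom_stopwords <- c(stopwords("english"), additional_stopwords)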

Process text.

# Modified stemCompletion function, because the stemCompletion included with
# the tm package was not working

stemCompletionMod <- function(x, dictionary) {
    x <- unlist(strsplit(as.character(x), " "))  # split the document into words
    x <- x[x != ""]  # drop empty strings
    x <- stemCompletion(x, dictionary = dictionary)  # complete each stem against the dictionary
    x <- paste(x, sep = "", collapse = " ")  # reassemble into a single string
    PlainTextDocument(stripWhitespace(x))
}

# Removes usernames (@-mentions)

text_vector <- gsub("@\\w+", "", text_vector)

# Removes links (matches the whole URL through the next whitespace)

text_vector <- gsub("(f|ht)tps?://\\S+", "", text_vector)
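
As a quick illustration on a made-up tweet (not from the data), both the mention and the link are removed:

# Illustrative example only
example <- "@sciteacher loved this session! https://t.co/abc123"
gsub("(f|ht)tps?://\\S+", "", gsub("@\\w+", "", example))  # " loved this session! "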

# Creating and processing corpus

myCorpus <- Corpus(VectorSource(text_vector))
myCorpus <- tm_map(myCorpus, content_transformer(tolower), mc.cores = 1)  # makes text lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, mc.cores = 1)  # removes punctuation
myCorpus <- tm_map(myCorpus, removeNumbers, mc.cores = 1)  # removes numbers
myCorpus <- tm_map(myCorpus, removeWords, custom_stopwords, mc.cores = 1)  # removes stopwords
myCorpus_copy <- myCorpus  # makes a copy of the corpus for stemCompletionMod to use as a completion dictionary
myCorpus <- tm_map(myCorpus, stemDocument, mc.cores = 1)  # stems text
myCorpus <- tm_map(myCorpus, stemCompletionMod, myCorpus_copy)  # completes stemmed text
myCorpus <- tm_map(myCorpus, stripWhitespace, mc.cores = 1)  # removes whitespace
myCorpus <- tm_map(myCorpus, PlainTextDocument)

print(paste("Processed ", length(text_vector), " documents", sep = ""))
## [1] "Processed 1254 documents"

Create a term-document matrix from the processed text and filter the terms by frequency.

TDM <- TermDocumentMatrix(myCorpus, control = list(weighting = function(x) weightSMART(x, 
    spec = "nnn")))  # "nnn" = raw term counts: no weighting and no normalization
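
For intuition, a term-document matrix counts how often each term appears in each document. On two toy documents (illustrative only; note that tm's default word-length filter drops very short words like "is"):

# Toy term-document matrix, not part of the analysis
toy_tdm <- TermDocumentMatrix(Corpus(VectorSource(c("science is fun", "science education"))))
as.matrix(toy_tdm)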

# Tags each document with its source hashtag

for (i in 1:length(myCorpus)) {
    meta(myCorpus[[i]], "hashtag") <- sampled_data$group[i]  # group refers to the hashtag
}

# Creates logicals for whether a tweet belongs to one of the two groups and
# creates a list of them

index_narst <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "narst")
index_nsta <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "nsta")
doc_list <- list(index_narst, index_nsta)

# Identifies terms that appear at least once under each hashtag

ht1 <- TDM[, index_narst]
m1 <- as.matrix(ht1)
in1 <- rowSums(m1) >= 1  # term appears in at least one #narst tweet

ht2 <- TDM[, index_nsta]
m2 <- as.matrix(ht2)
in2 <- rowSums(m2) >= 1  # term appears in at least one #nsta tweet

new_index <- in1 & in2  # TRUE for terms occurring at least once under both hashtags

# Filtering by minimum frequency

term_sums <- rowSums(as.matrix(TDM))
term_logical <- term_sums >= 5  # minimum total frequency for a term to stay in the vocabulary

TDM_common <- TDM[term_logical & new_index, ]  # keeps terms that meet the minimum frequency and occur under both hashtags

# Finds and removes documents left with no terms after filtering

empty_docs_bool <- colSums(as.matrix(TDM_common)) < 1
doc_outliers <- empty_docs_bool
TDM_cleaned <- TDM_common[, !doc_outliers]

# Filters matrix by boolean vectors to remove term outliers resulting from
# removing document outliers

term_outliers <- rowSums(as.matrix(TDM_cleaned)) == 0
TDM_cleaned <- TDM_cleaned[!term_outliers, ]

# Filters doc_list to match the cleaned set of documents

doc_list_cleaned <- list()
for (i in seq(doc_list)) {
    doc_list_cleaned[[i]] <- doc_list[[i]] & !empty_docs_bool
    doc_list_cleaned[[i]] <- doc_list_cleaned[[i]][!doc_outliers]
}

Create deviation vectors for each document. Each document vector is normalized, its projection onto the normalized mean direction of all documents is subtracted, and the residual is renormalized; clustering these residuals groups tweets by how they differ from the corpus average rather than by what they share with it.

# Converts the term-document matrix to an ordinary matrix

mat <- as.matrix(TDM_cleaned)

# Splits the document (column) vectors into a list

mat_list <- apply(mat, 2, list)

# Functions for deviation vectors

vect_project <- function(a, b){
  # projects vector a onto vector b (b is assumed to be unit length)
  as.numeric(crossprod(a, b)) * b
}

dev_vector <- function(vect_list){
  norm_vects <- lapply(vect_list, normalize.vector)       # normalize each document vector
  sum_vect <- colSums(do.call(rbind, norm_vects))         # sum the normalized vectors
  norm_sum <- normalize.vector(sum_vect)                  # normalized mean direction
  projects <- lapply(norm_vects, vect_project, norm_sum)  # project each vector onto the mean
  difference <- mapply('-', norm_vects, projects)         # residuals orthogonal to the mean
  dev_vects <- apply(difference, MARGIN = 2, FUN = normalize.vector)  # renormalize residuals
  dev_vects
}

# Calculating deviation vectors

mat_vec <- lapply(mat_list, unlist)
mat_dev <- dev_vector(mat_vec)
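
As a sanity check on made-up vectors (not corpus data), each deviation vector should be orthogonal to the normalized mean direction:

# Toy check: deviation vectors are orthogonal to the mean direction
toy <- list(c(1, 0, 1), c(0, 1, 1), c(1, 1, 0))
toy_dev <- dev_vector(toy)
toy_mean <- normalize.vector(colSums(do.call(rbind, lapply(toy, normalize.vector))))
round(crossprod(toy_dev, toy_mean), 10)  # all entries are (numerically) zero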

Cluster the documents.

# Transposes the matrix so that rows are documents

mat_dev_t <- t(mat_dev)

# Estimates a reasonable number of clusters with a within-cluster sum of squares (elbow) plot

wss <- (nrow(mat_dev_t) - 1) * sum(apply(mat_dev_t, 2, var))
for (i in 2:18) wss[i] <- sum(kmeans(mat_dev_t, centers = i)$withinss)
plot(1:18, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Fits Ward's hierarchical algorithm

distance <- dist(mat_dev_t, method = "euclidean")
mat_dev_t_clust <- hclust(distance, method = "ward.D2")  # Ward's method
hclust_cut <- cutree(mat_dev_t_clust, n_clusters)

# Computes the centroid of each hierarchical cluster to use as k-means starting values

clusters1 <- list()
for (i in seq(n_clusters)){
  clusters1[[i]] <- mat_dev_t[hclust_cut == i, ]
}

cluster_freqs1 <- list()
for (i in seq(length(clusters1))){
  cluster_freqs1[[i]] <- colSums(as.matrix(clusters1[[i]])) / nrow(clusters1[[i]])
}

# Fits the k-means algorithm, seeded with the hierarchical centroids

start <- do.call(rbind, cluster_freqs1)  # one row per starting centroid
kfit <- kmeans(mat_dev_t, start)
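
Seeding k-means with the hierarchical centroids makes the fit deterministic; a common alternative (not used here) is random initialization with a fixed seed and multiple restarts:

# Alternative: random starts with a seed, for reproducibility without hclust seeding
# set.seed(42)
# kfit <- kmeans(mat_dev_t, centers = n_clusters, nstart = 25)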

# Creates a list of the deviation-vector rows belonging to each cluster

clusters <- list()
for (i in seq(n_clusters)){
  clusters[[i]] <- mat_dev_t[kfit$cluster == i, ]
}

# Finds the 10 highest-weighted terms in each cluster centroid

ordered_clusters <- list()
cluster_freqs <- list()
clusters_df <- matrix(nrow = 10, ncol = n_clusters)

for (i in seq(length(clusters))){
  cluster_freqs[[i]] <- colSums(as.matrix(clusters[[i]])) / nrow(clusters[[i]])  # cluster centroid
  ordered_clusters[[i]] <- names(sort(cluster_freqs[[i]], decreasing = TRUE))    # terms ranked by weight
  clusters_df[, i] <- ordered_clusters[[i]][1:10]
}

clusters_df <- as.data.frame(clusters_df)
colnames(clusters_df) <- paste("Cluster ", c(seq(n_clusters)), sep="")

Calculate similarities between each hashtag's tweets and each cluster.
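
The similarity measure is the cosine of the angle between two term-weight vectors, computed with cosine() from the lsa package. For intuition:

# Quick illustration of lsa::cosine on toy vectors
cosine(c(1, 0), c(0, 1))  # orthogonal vectors: similarity 0
cosine(c(1, 1), c(2, 2))  # parallel vectors: similarity 1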

# Creates groups using group booleans 

TDM_group <- list()
for (i in seq(doc_list_cleaned)){
  TDM_group[[i]] <- mat_dev_t[doc_list_cleaned[[i]], ]
}

# Calculates the mean deviation vector (centroid) for each group

group_freqs <- list()
for (i in seq(doc_list)){
  group_freqs[[i]] <- colSums(as.matrix(TDM_group[[i]])) / nrow(TDM_group[[i]])
}

# Computes the cosine similarity between each group centroid and each cluster centroid

cosines_list <- list()
for (i in seq(length(TDM_group))){
  cosines_list[[i]] <- vector()
  for (j in seq(length(clusters))){
    cosines_list[[i]] <- append(cosines_list[[i]], cosine(group_freqs[[i]], cluster_freqs[[j]]))
  }
}

# Creating data frame and scaled data frame

cosines_df <- as.data.frame(do.call(rbind, cosines_list))
cos_plot <- gather(cosines_df, Cluster, cosines)
cos_plot$group <- rep(1:length(doc_list), length(cosines_df))
cos_plot$group <- as.factor(cos_plot$group)
levels(cos_plot$group) <- c("NARST", "NSTA")
names(cos_plot) <- c("Topic", "Cosines", "Group")

cos_plot$Topic <- as.factor(cos_plot$Topic)
for (i in seq(n_clusters)){
  levels(cos_plot$Topic)[i] <- paste("Topic ", i, sep = "") 
}

plot <- ggplot(data = cos_plot, aes(x = Group, y = Cosines, fill = Topic)) +
    geom_bar(position = "dodge", stat = "identity", width = .75) +
    xlab("Hashtag") +
    ylab("Cosines") + 
    ggtitle("Cosine similarity between hashtags and topics")

Results

Here are the number of tweets in each cluster and the top 10 words for each:

table(kfit$cluster)
## 
##   1   2   3   4   5   6   7   8 
## 666  35 140  55 108  22 107  79
print(clusters_df)
##    Cluster 1 Cluster 2  Cluster 3  Cluster 4 Cluster 5   Cluster 6
## 1    chicago      just    science   research     learn       place
## 2       ngss       saw       like    practic  progress       great
## 3      great       won        fun       stem    museum       items
## 4      thank       now curriculum       educ  aquarium inspiration
## 5    session     whats        kid     beyond     shedd     chicago
## 6        day   awesome     edtech    embrace      come       excel
## 7      award  pathways      tower experience       via         fun
## 8       time  evidence       also  challenge      meet     student
## 9        see  progress       educ leadership     share       human
## 10       out    almost  challenge      think       set        host
##     Cluster 7        Cluster 8
## 1        stem          teacher
## 2        next responsibilities
## 3  generation      sensemaking
## 4    standard             move
## 5         wow             both
## 6    interest      preparation
## 7     science             idea
## 8       fight         resource
## 9        into         triggers
## 10      teach          instead

Here is a plot of the cosine similarity between each hashtag and each topic:

print(plot)