I used TAGS to archive tweets from the NARST conference and became interested in comparing them to tweets from the NSTA conference: NARST consists primarily of presentations from researchers, while NSTA consists primarily of presentations from teachers. I used Tweet Archivist to access tweets from the NSTA conference. I wondered whether comparing the two could show what teachers and researchers discuss on Twitter at science education conferences, and whether inferences about what is important to each group can be drawn from them.
First, set the working directory, load packages, and load the data frame.
setwd("~/Dropbox/research/sci_tech_tags/")
library(dplyr)
library(tidyr)
library(SnowballC)
library(tm)
library(lsa)
library(ggplot2)
library(ppls)
# Loading data frame for analysis
data <- readRDS("nsta_narst_df.rds")
sampled_data <- sample_n(data, 1254)
text_vector <- sampled_data$text
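Note that sample_n() draws a random sample, so the exact tweets (and therefore the clusters reported below) will vary between runs. Setting a seed immediately before the sample_n() call makes the sample reproducible; the value here is arbitrary and was not part of the original analysis.
set.seed(2015)  # arbitrary seed; place before sample_n() so the same 1254 tweets are drawn each run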
Next, set some preliminary variables.
# Number of clusters
n_clusters <- 8
# Standard stopwords
standard_stopwords <- c("a", "an", "the", "to", "of", "and", "for", "by", "on",
"is", "I", "all", "this", "with", "it", "at", "from", "or", "you", "as",
"your", "are", "be", "that", "not", "have", "was", "we", "what", "which",
"there", "they", "he", "she", "his", "hers", "had", "word", "our", "you",
"about", "that", "this", "but", "not", "what")
# Additional stopwords
additional_stopwords <- c("amp", "http", "him", "httpt", "nsta", "narst")
# Combine standard and additional stopwords
custom_stopwords <- append(standard_stopwords, additional_stopwords)
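As an aside, an alternative to maintaining the standard list by hand is to start from tm's built-in English stopword list and append the extra terms; this alternative is shown below but is not used in the rest of the analysis.
custom_stopwords_alt <- c(stopwords("english"), additional_stopwords)  # alternative list; not used below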
Process text.
# Modified stemCompletion function, since the stemCompletion included with the
# tm package was not working here
stemCompletionMod <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))      # split the document into tokens
  x <- x[x != ""]                                  # drop empty strings
  x <- stemCompletion(x, dictionary = dictionary)  # complete each stem against the dictionary
  x <- paste(x, sep = "", collapse = " ")          # rejoin the tokens into one string
  PlainTextDocument(stripWhitespace(x))
}
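# Quick illustration (made-up tokens, not from the data set) of what the inner
# stemCompletion() call does: each stem is completed against a dictionary
stemCompletion(c("scienc", "educ"), dictionary = c("science", "education"))  # returns "science" and "education"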
# Removes usernames (@mentions)
text_vector <- gsub("@\\w+", "", text_vector)
# Remove links
text_vector <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", text_vector)
# Creating and processing corpus
myCorpus <- Corpus(VectorSource(text_vector))
myCorpus <- tm_map(myCorpus, content_transformer(tolower), mc.cores = 1) # makes text lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, mc.cores = 1) # removes punctuation
myCorpus <- tm_map(myCorpus, removeNumbers, mc.cores = 1) # removes numbers
myCorpus <- tm_map(myCorpus, removeWords, custom_stopwords, mc.cores = 1) # removes stopwords
myCorpus_copy <- myCorpus # keeps a copy of the corpus as the dictionary for stemCompletionMod
myCorpus <- tm_map(myCorpus, stemDocument, mc.cores = 1) # stems text
myCorpus <- tm_map(myCorpus, stemCompletionMod, myCorpus_copy) # completes stemmed text
myCorpus <- tm_map(myCorpus, stripWhitespace, mc.cores = 1) # removes whitespace
myCorpus <- tm_map(myCorpus, PlainTextDocument)
print(paste("Processed ", length(text_vector), " documents", sep = ""))
## [1] "Processed 1254 documents"
Then, create a term-document matrix from the processed text and filter its terms and documents by frequency.
TDM <- TermDocumentMatrix(myCorpus, control = list(weighting = function(x) weightSMART(x,
spec = "nnn"))) # this makes sure there is no weighting
# Attach each document's hashtag (group) as metadata
for (i in 1:length(myCorpus)) {
meta(myCorpus[[i]], "hashtag") <- sampled_data$group[i] # group refers to the hashtag
}
# Creates logicals for whether a tweet belongs to one of the two groups and
# creates a list of them
index_narst <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "narst")
index_nsta <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "nsta")
doc_list <- list(index_narst, index_nsta)
# Finds out whether each term used under one hashtag is also used at least once under the other
ht1 <- TDM[, index_narst]
m1 <- as.matrix(ht1)
in1 <- rowSums(m1) >= 1
ht2 <- TDM[, index_nsta]
m2 <- as.matrix(ht2)
in2 <- rowSums(m2) >= 1
new_index <- in1 & in2 # TRUE if a term occurs at least once under both hashtags
# Filtering
term_sums <- rowSums(as.matrix(TDM))
term_logical <- term_sums >= 5 # minimum number of times a term must appear to be kept in the vocabulary (adjust as needed)
TDM_common <- TDM[term_logical & new_index, ] # keeps terms that meet the minimum frequency and occur at least once under both hashtags
# Finds and removes documents with no terms
adjminusCI_bool <- colSums(as.matrix(TDM_common)) < 1 # TRUE for documents left with no terms after filtering
doc_outliers <- adjminusCI_bool
TDM_cleaned <- TDM_common[, !doc_outliers]
# Filters matrix by boolean vectors to remove term outliers resulting from
# removing document outliers
term_outliers <- rowSums(as.matrix(TDM_cleaned)) == 0
TDM_cleaned <- TDM_cleaned[!term_outliers, ]
# Filters doc_list
adjminusCI <- !adjminusCI_bool
doc_list_cleaned <- list()
for (i in seq(doc_list)) {
doc_list_cleaned[[i]] <- doc_list[[i]] & adjminusCI
doc_list_cleaned[[i]] <- doc_list_cleaned[[i]][!doc_outliers]
}
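As a quick sanity check (not part of the original write-up), the two cleaned group indicators should together account for every remaining document exactly once:
sum(doc_list_cleaned[[1]]) + sum(doc_list_cleaned[[2]]) == ncol(TDM_cleaned) # expected to be TRUE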
Next, create deviation vectors for each document: each document's term vector is normalized, the component shared with the overall (mean) direction of the corpus is removed, and the remainder is renormalized, so the deviation vector captures how a document differs from what is typical.
# Converts the cleaned term-document matrix into a regular matrix (terms x documents)
mat <- as.matrix(TDM_cleaned)
# Splits the matrix into a list of document vectors (one per column)
mat_list <- apply(mat, 2, list)
# Functions for deviation vectors
vect_project <- function(a, b){
  # orthogonal projection of a onto the unit vector b
  project <- as.numeric(crossprod(a, b)) * b
  project
}
dev_vector <- function(vect_list){
  norm_vects <- lapply(vect_list, normalize.vector)       # normalize each document vector
  sum_vect <- colSums(do.call(rbind, norm_vects))         # sum the normalized vectors
  norm_sum <- normalize.vector(sum_vect)                  # overall (mean) direction of the corpus
  projects <- lapply(norm_vects, vect_project, norm_sum)  # project each document onto that direction
  difference <- mapply('-', norm_vects, projects)         # keep the component orthogonal to it
  dev_vects <- apply(difference, MARGIN = 2, FUN = normalize.vector)  # renormalize
  dev_vects
}
# Calculating deviation vectors
mat_vec <- lapply(mat_list, unlist)
mat_dev <- dev_vector(mat_vec)
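As an illustrative check of what dev_vector() produces (a toy example with made-up vectors, not part of the analysis), with only two documents the deviation vectors point in exactly opposite directions and each has unit length:
toy <- list(c(2, 0, 1), c(1, 1, 0)) # two made-up "documents" over a three-term vocabulary
toy_dev <- dev_vector(toy)
round(crossprod(toy_dev[, 1], toy_dev[, 2])) # -1: the two deviation vectors are opposite
colSums(toy_dev^2) # 1 1: each deviation vector has unit length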
Now cluster the documents.
# Transposes the deviation-vector matrix so documents are rows (the units to be clustered)
mat_dev_t <- t(mat_dev)
# Exploratory check on the number of clusters using within-cluster sum of squares
wss <- (nrow(mat_dev_t) - 1) * sum(apply(mat_dev_t, 2, var))
for (i in 2:18) wss[i] <- sum(kmeans(mat_dev_t,
    centers = i)$withinss)
plot(1:18, wss, type = "b", xlab = "Number of Clusters",
    ylab = "Within groups sum of squares")
# Fits Ward's hierarchical algorithm
distance <- dist(mat_dev_t, method = "euclidean")
mat_dev_t_clust <- hclust(distance, method = "ward.D2") # hclust() defaults to complete linkage, so Ward's method is requested explicitly
hclust_cut <- cutree(mat_dev_t_clust, n_clusters)
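# Optional check (not in the original write-up): how the hierarchical solution
# distributes documents across the clusters; counts vary with the random sample
table(hclust_cut)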
# Extracts the hierarchical clusters and computes their mean vectors as start values for kmeans
clusters1 <- list()
for (i in seq(n_clusters)){
clusters1[[i]] <- mat_dev_t[hclust_cut == i,]
}
ordered_clusters1 <- list()
cluster_freqs1 <- list()
for (i in seq(length(clusters1))){
ordered_clusters1[[i]] <- colSums(as.matrix(clusters1[[i]]) / nrow(clusters1[[i]]))
cluster_freqs1[[i]] <- ordered_clusters1[[i]]
}
# Fits the kmeans algorithm, seeded with the hierarchical cluster means
start <- do.call(rbind, cluster_freqs1) # one row per cluster, one column per term
kfit <- kmeans(mat_dev_t, start)
# Creating a list of deviation-vector matrices, one per kmeans cluster
clusters <- list()
for (i in seq(n_clusters)){
clusters[[i]] <- mat_dev_t[kfit$cluster == i, ]
}
# Computes each cluster's mean term profile and its top 10 terms
ordered_clusters <- list()
cluster_freqs <- list()
clusters_df <- matrix(nrow = 10, ncol = n_clusters)
for (i in seq(length(clusters))){
ordered_clusters[[i]] <- colSums(as.matrix(clusters[[i]]) / nrow(clusters[[i]]))
cluster_freqs[[i]] <- ordered_clusters[[i]]
ordered_clusters[[i]] <- names(sort(ordered_clusters[[i]], decreasing = TRUE))
clusters_df[, i] <- ordered_clusters[[i]][1:10]
}
clusters_df <- as.data.frame(clusters_df)
colnames(clusters_df) <- paste("Cluster ", c(seq(n_clusters)), sep="")
Finally, calculate the cosine similarity between each hashtag group's average term profile and each cluster's profile.
# Splits the deviation-vector matrix into the two hashtag groups
TDM_group <- list()
for (i in seq(doc_list_cleaned)){
TDM_group[[i]] <- mat_dev_t[doc_list_cleaned[[i]], ]
}
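# Optional check (not in the original write-up): number of tweets per hashtag group after cleaning
sapply(TDM_group, nrow)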
# Re-creates the per-cluster matrices (same as above) using the kmeans assignments
clusters <- list()
for (i in seq(n_clusters)){
clusters[[i]] <- mat_dev_t[kfit$cluster == i, ]
}
# Calculates the average term profile for each hashtag group
group_freqs <- list()
for (i in seq_along(TDM_group)){
  group_freqs[[i]] <- colSums(as.matrix(TDM_group[[i]])) / nrow(TDM_group[[i]])
}
# Computes cosine similarities between each group profile and each cluster profile
cosines <- list()
cosines_list <- list()
for (i in seq_along(TDM_group)){
  cosines[[i]] <- vector()
  for (j in seq_along(clusters)){
    cosines[[i]] <- append(cosines[[i]], cosine(group_freqs[[i]], cluster_freqs[[j]]))
  }
  cosines_list[[i]] <- cosines[[i]]
}
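# For intuition (illustrative vectors, not from the data): cosine() from the lsa
# package is 1 for vectors pointing in the same direction and 0 for orthogonal ones
cosine(c(1, 2, 0), c(2, 4, 0)) # 1
cosine(c(1, 0, 0), c(0, 1, 0)) # 0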
# Creates a long-format data frame of cosines for plotting
cosines_df <- as.data.frame(do.call(rbind, cosines_list))
cos_plot <- gather(cosines_df, Cluster, cosines)
cos_plot$group <- rep(1:length(doc_list), length(cosines_df))
cos_plot$group <- as.factor(cos_plot$group)
levels(cos_plot$group) <- c("NARST", "NSTA")
names(cos_plot) <- c("Topic", "Cosines", "Group")
cos_plot$Topic <- as.factor(cos_plot$Topic)
for (i in seq(n_clusters)){
levels(cos_plot$Topic)[i] <- paste("Topic ", i, sep = "")
}
plot <- ggplot(data = cos_plot, aes(x = Group, y = Cosines, fill = Topic)) +
geom_bar(position = "dodge", stat = "identity", width = .75) +
xlab("Hashtag") +
ylab("Cosines") +
ggtitle("Cosine similarity between topics and clusters")
Here is the number of tweets in each cluster, along with the top 10 words in each:
table(kfit$cluster)
##
## 1 2 3 4 5 6 7 8
## 666 35 140 55 108 22 107 79
print(clusters_df)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 chicago just science research learn place
## 2 ngss saw like practic progress great
## 3 great won fun stem museum items
## 4 thank now curriculum educ aquarium inspiration
## 5 session whats kid beyond shedd chicago
## 6 day awesome edtech embrace come excel
## 7 award pathways tower experience via fun
## 8 time evidence also challenge meet student
## 9 see progress educ leadership share human
## 10 out almost challenge think set host
## Cluster 7 Cluster 8
## 1 stem teacher
## 2 next responsibilities
## 3 generation sensemaking
## 4 standard move
## 5 wow both
## 6 interest preparation
## 7 science idea
## 8 fight resource
## 9 into triggers
## 10 teach instead
Here is a plot of the cosine similarity between topics and clusters:
print(plot)