I used TAGS to archive tweets from the NARST conference and became interested in comparing them to tweets from the NSTA conference: NARST consists primarily of presentations from researchers, while NSTA consists primarily of presentations from teachers. I used Tweet Archivist to access tweets from the NSTA conference. I wondered whether comparing the two could show what teachers and researchers discuss on Twitter at science education conferences, and whether inferences about what is important to each group can be drawn from them.
First, set the working directory, load packages, and load the data frame.
setwd("~/Dropbox/research/sci_tech_tags/")
library(dplyr)
library(tidyr)
library(SnowballC)
library(tm)
library(lsa)
library(ggplot2)
library(ppls)
# Loading data frame for analysis
data <- readRDS("nsta_narst_df.rds")
sampled_data <- sample_n(data, 1254)
text_vector <- sampled_data$text
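Note that sample_n() draws a random sample, so the exact tweets (and therefore the clusters reported below) will vary between runs. Setting a seed immediately before the sample_n() call makes the sample reproducible; the value here is arbitrary and was not part of the original analysis.
set.seed(2015)  # arbitrary seed; place before sample_n() so the same 1254 tweets are drawn each run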
Next, set some preliminary variables.
# Number of clusters
n_clusters <- 8
# Standard stopwords
standard_stopwords <- c("a", "an", "the", "to", "of", "and", "for", "by", "on",
"is", "I", "all", "this", "with", "it", "at", "from", "or", "you", "as",
"your", "are", "be", "that", "not", "have", "was", "we", "what", "which",
"there", "they", "he", "she", "his", "hers", "had", "word", "our", "you",
"about", "that", "this", "but", "not", "what")
# Additional stopwords
additional_stopwords <- c("amp", "http", "him", "httpt", "nsta", "narst")
# Combine standard and additional stopwords
custom_stopwords <- append(standard_stopwords, additional_stopwords)
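As an aside, an alternative to maintaining the standard list by hand is to start from tm's built-in English stopword list and append the extra terms; this alternative is shown below but is not used in the rest of the analysis.
custom_stopwords_alt <- c(stopwords("english"), additional_stopwords)  # alternative list; not used below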
Process text.
# Modified stemCompletion function, since the stemCompletion included with the
# tm package was not working here
stemCompletionMod <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))      # split the document into tokens
  x <- x[x != ""]                                  # drop empty strings
  x <- stemCompletion(x, dictionary = dictionary)  # complete each stem against the dictionary
  x <- paste(x, sep = "", collapse = " ")          # rejoin the tokens into one string
  PlainTextDocument(stripWhitespace(x))
}
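# Quick illustration (made-up tokens, not from the data set) of what the inner
# stemCompletion() call does: each stem is completed against a dictionary
stemCompletion(c("scienc", "educ"), dictionary = c("science", "education"))  # returns "science" and "education"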
# Removes usernames (@mentions)
text_vector <- gsub("@\\w+", "", text_vector)
# Remove links
text_vector <- gsub("(f|ht)tp(s?)://(.*)[.][a-z]+", "", text_vector)
# Creating and processing corpus
myCorpus <- Corpus(VectorSource(text_vector))
myCorpus <- tm_map(myCorpus, content_transformer(tolower), mc.cores = 1) # makes text lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, mc.cores = 1) # removes punctuation
myCorpus <- tm_map(myCorpus, removeNumbers, mc.cores = 1) # removes numbers
myCorpus <- tm_map(myCorpus, removeWords, custom_stopwords, mc.cores = 1) # removes stopwords
myCorpus_copy <- myCorpus # keeps a copy of the corpus as the dictionary for stemCompletionMod
myCorpus <- tm_map(myCorpus, stemDocument, mc.cores = 1) # stems text
myCorpus <- tm_map(myCorpus, stemCompletionMod, myCorpus_copy) # completes stemmed text
myCorpus <- tm_map(myCorpus, stripWhitespace, mc.cores = 1) # removes whitespace
myCorpus <- tm_map(myCorpus, PlainTextDocument)
print(paste("Processed ", length(text_vector), " documents", sep = ""))
## [1] "Processed 1254 documents"
Then, create a term-document matrix from the processed text and filter its terms and documents by frequency.
TDM <- TermDocumentMatrix(myCorpus, control = list(weighting = function(x) weightSMART(x,
spec = "nnn"))) # this makes sure there is no weighting
# Attach each document's hashtag (group) as metadata
for (i in 1:length(myCorpus)) {
meta(myCorpus[[i]], "hashtag") <- sampled_data$group[i] # group refers to the hashtag
}
# Creates logicals for whether a tweet belongs to one of the two groups and
# creates a list of them
index_narst <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "narst")
index_nsta <- tm_index(myCorpus, function(x) meta(x, "hashtag") == "nsta")
doc_list <- list(index_narst, index_nsta)
# Finds out whether each term used under one hashtag is also used at least once under the other
ht1 <- TDM[, index_narst]
m1 <- as.matrix(ht1)
in1 <- rowSums(m1) >= 1
ht2 <- TDM[, index_nsta]
m2 <- as.matrix(ht2)
in2 <- rowSums(m2) >= 1
new_index <- in1 & in2 # TRUE if a term occurs at least once under both hashtags
# Filtering
term_sums <- rowSums(as.matrix(TDM))
term_logical <- term_sums >= 5 # minimum number of times a term must appear to be kept in the vocabulary (adjust as needed)
TDM_common <- TDM[term_logical & new_index, ] # keeps terms that meet the minimum frequency and occur at least once under both hashtags
# Finds and removes documents with no terms
adjminusCI_bool <- colSums(as.matrix(TDM_common)) < 1 # TRUE for documents left with no terms after filtering
doc_outliers <- adjminusCI_bool
TDM_cleaned <- TDM_common[, !doc_outliers]
# Filters matrix by boolean vectors to remove term outliers resulting from
# removing document outliers
term_outliers <- rowSums(as.matrix(TDM_cleaned)) == 0
TDM_cleaned <- TDM_cleaned[!term_outliers, ]
# Filters doc_list
adjminusCI <- !adjminusCI_bool
doc_list_cleaned <- list()
for (i in seq(doc_list)) {
doc_list_cleaned[[i]] <- doc_list[[i]] & adjminusCI
doc_list_cleaned[[i]] <- doc_list_cleaned[[i]][!doc_outliers]
}
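As a quick sanity check (not part of the original write-up), the two cleaned group indicators should together account for every remaining document exactly once:
sum(doc_list_cleaned[[1]]) + sum(doc_list_cleaned[[2]]) == ncol(TDM_cleaned) # expected to be TRUE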
Next, create deviation vectors for each document: each document's term vector is normalized, the component shared with the overall (mean) direction of the corpus is removed, and the remainder is renormalized, so the deviation vector captures how a document differs from what is typical.
# Converts the cleaned term-document matrix into a regular matrix (terms x documents)
mat <- as.matrix(TDM_cleaned)
# Splits the matrix into a list of document vectors (one per column)
mat_list <- apply(mat, 2, list)
# Functions for deviation vectors
vect_project <- function(a, b){
  # orthogonal projection of a onto the unit vector b
  project <- as.numeric(crossprod(a, b)) * b
  project
}
dev_vector <- function(vect_list){
  norm_vects <- lapply(vect_list, normalize.vector)       # normalize each document vector
  sum_vect <- colSums(do.call(rbind, norm_vects))         # sum the normalized vectors
  norm_sum <- normalize.vector(sum_vect)                  # overall (mean) direction of the corpus
  projects <- lapply(norm_vects, vect_project, norm_sum)  # project each document onto that direction
  difference <- mapply('-', norm_vects, projects)         # keep the component orthogonal to it
  dev_vects <- apply(difference, MARGIN = 2, FUN = normalize.vector)  # renormalize
  dev_vects
}
# Calculating deviation vectors
mat_vec <- lapply(mat_list, unlist)
mat_dev <- dev_vector(mat_vec)
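As an illustrative check of what dev_vector() produces (a toy example with made-up vectors, not part of the analysis), with only two documents the deviation vectors point in exactly opposite directions and each has unit length:
toy <- list(c(2, 0, 1), c(1, 1, 0)) # two made-up "documents" over a three-term vocabulary
toy_dev <- dev_vector(toy)
round(crossprod(toy_dev[, 1], toy_dev[, 2])) # -1: the two deviation vectors are opposite
colSums(toy_dev^2) # 1 1: each deviation vector has unit length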
Now cluster the documents.
# Transposes the deviation-vector matrix so documents are rows (the units to be clustered)
mat_dev_t <- t(mat_dev)
# Exploratory check on the number of clusters using within-cluster sum of squares
wss <- (nrow(mat_dev_t) - 1) * sum(apply(mat_dev_t, 2, var))
for (i in 2:18) wss[i] <- sum(kmeans(mat_dev_t,
    centers = i)$withinss)
plot(1:18, wss, type = "b", xlab = "Number of Clusters",
    ylab = "Within groups sum of squares")
# Fits Ward's hierarchical algorithm
distance <- dist(mat_dev_t, method = "euclidean")
mat_dev_t_clust <- hclust(distance, method = "ward.D2") # hclust() defaults to complete linkage, so Ward's method is requested explicitly
hclust_cut <- cutree(mat_dev_t_clust, n_clusters)
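# Optional check (not in the original write-up): how the hierarchical solution
# distributes documents across the clusters; counts vary with the random sample
table(hclust_cut)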
# Extracts the hierarchical clusters and computes their mean vectors as start values for kmeans
clusters1 <- list()
for (i in seq(n_clusters)){
clusters1[[i]] <- mat_dev_t[hclust_cut == i,]
}
ordered_clusters1 <- list()
cluster_freqs1 <- list()
for (i in seq(length(clusters1))){
ordered_clusters1[[i]] <- colSums(as.matrix(clusters1[[i]]) / nrow(clusters1[[i]]))
cluster_freqs1[[i]] <- ordered_clusters1[[i]]
}
# Fits the kmeans algorithm, seeded with the hierarchical cluster means
start <- do.call(rbind, cluster_freqs1) # one row per cluster, one column per term
kfit <- kmeans(mat_dev_t, start)
# Creating a list of deviation-vector matrices, one per kmeans cluster
clusters <- list()
for (i in seq(n_clusters)){
clusters[[i]] <- mat_dev_t[kfit$cluster == i, ]
}
# Computes each cluster's mean term profile and its top 10 terms
ordered_clusters <- list()
cluster_freqs <- list()
clusters_df <- matrix(nrow = 10, ncol = n_clusters)
for (i in seq(length(clusters))){
ordered_clusters[[i]] <- colSums(as.matrix(clusters[[i]]) / nrow(clusters[[i]]))
cluster_freqs[[i]] <- ordered_clusters[[i]]
ordered_clusters[[i]] <- names(sort(ordered_clusters[[i]], decreasing = TRUE))
clusters_df[, i] <- ordered_clusters[[i]][1:10]
}
clusters_df <- as.data.frame(clusters_df)
colnames(clusters_df) <- paste("Cluster ", c(seq(n_clusters)), sep="")
Finally, calculate the cosine similarity between each hashtag group's average term profile and each cluster's profile.
# Splits the deviation-vector matrix into the two hashtag groups
TDM_group <- list()
for (i in seq(doc_list_cleaned)){
TDM_group[[i]] <- mat_dev_t[doc_list_cleaned[[i]], ]
}
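# Optional check (not in the original write-up): number of tweets per hashtag group after cleaning
sapply(TDM_group, nrow)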
# Re-creates the per-cluster matrices (same as above) using the kmeans assignments
clusters <- list()
for (i in seq(n_clusters)){
clusters[[i]] <- mat_dev_t[kfit$cluster == i, ]
}
# Calculates the average term profile for each hashtag group
group_freqs <- list()
for (i in seq_along(TDM_group)){
  group_freqs[[i]] <- colSums(as.matrix(TDM_group[[i]])) / nrow(TDM_group[[i]])
}
# Computes cosine similarities between each group profile and each cluster profile
cosines <- list()
cosines_list <- list()
for (i in seq_along(TDM_group)){
  cosines[[i]] <- vector()
  for (j in seq_along(clusters)){
    cosines[[i]] <- append(cosines[[i]], cosine(group_freqs[[i]], cluster_freqs[[j]]))
  }
  cosines_list[[i]] <- cosines[[i]]
}
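# For intuition (illustrative vectors, not from the data): cosine() from the lsa
# package is 1 for vectors pointing in the same direction and 0 for orthogonal ones
cosine(c(1, 2, 0), c(2, 4, 0)) # 1
cosine(c(1, 0, 0), c(0, 1, 0)) # 0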
# Creates a long-format data frame of cosines for plotting
cosines_df <- as.data.frame(do.call(rbind, cosines_list))
cos_plot <- gather(cosines_df, Cluster, cosines)
cos_plot$group <- rep(1:length(doc_list), length(cosines_df))
cos_plot$group <- as.factor(cos_plot$group)
levels(cos_plot$group) <- c("NARST", "NSTA")
names(cos_plot) <- c("Topic", "Cosines", "Group")
cos_plot$Topic <- as.factor(cos_plot$Topic)
for (i in seq(n_clusters)){
levels(cos_plot$Topic)[i] <- paste("Topic ", i, sep = "")
}
plot <- ggplot(data = cos_plot, aes(x = Group, y = Cosines, fill = Topic)) +
geom_bar(position = "dodge", stat = "identity", width = .75) +
xlab("Hashtag") +
ylab("Cosines") +
ggtitle("Cosine similarity between topics and clusters")
Here is the number of tweets in each cluster, along with the top 10 words in each:
table(kfit$cluster)
##
## 1 2 3 4 5 6 7 8
## 666 35 140 55 108 22 107 79
print(clusters_df)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 chicago just science research learn place
## 2 ngss saw like practic progress great
## 3 great won fun stem museum items
## 4 thank now curriculum educ aquarium inspiration
## 5 session whats kid beyond shedd chicago
## 6 day awesome edtech embrace come excel
## 7 award pathways tower experience via fun
## 8 time evidence also challenge meet student
## 9 see progress educ leadership share human
## 10 out almost challenge think set host
## Cluster 7 Cluster 8
## 1 stem teacher
## 2 next responsibilities
## 3 generation sensemaking
## 4 standard move
## 5 wow both
## 6 interest preparation
## 7 science idea
## 8 fight resource
## 9 into triggers
## 10 teach instead
Here is a plot of the cosine similarity between topics and clusters:
print(plot)