Data Manipulation & Basic Visualizations

Blog post 4, describing the data manipulation and some basic visualizations, as part of the course “Text as Data”.

Rahul Gundeti (Graduate student, Data Analytics & Computational Social Sciences (DACSS), UMass Amherst)
2022-05-03

Loading required libraries
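
The chunk with the library() calls is not shown in the rendered output; the set below is a reconstruction based on the functions used later in this post, so the exact list in the original may differ.

# Packages used throughout this post (reconstructed from the calls below)
library(tm)                  # VCorpus, tm_map, DocumentTermMatrix
library(quanteda)            # corpus, tokens, dfm, fcm, topfeatures
library(quanteda.textplots)  # textplot_network
library(wordcloud)           # wordcloud
library(RColorBrewer)        # brewer.pal
library(dplyr)               # pipes, count, mutate, select
library(tidytext)            # get_sentiments("nrc")
library(radarchart)          # chartJSRadar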

Creating the VCorpus

hony_unclean <- VCorpus(VectorSource(readLines("C:/Users/gunde/Documents/hony.txt")))
hony_unclean
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 1607

Inspecting a document from the corpus

writeLines(head(strwrap(hony_unclean[[4]]), 7))
A few months after Eduardos case I went to a music festival.
It wasnt normally my kind of scene. It was on the Jersey
Shore. There were a lot of glowsticks and temporary tattoos.
But I was twenty-six. I had to do something on the weekends.
Everyone in my group seemed to know each other except for me
and a girl named Kristen. We were the bring-alongs, so we
kinda got stuck together. Kristens only 53. And shes

Pre-processing the corpus

# Clean text file and pre-process for word cloud
# Convert to lowercase
hony_clean_corpus <- tm_map(hony_unclean, content_transformer(tolower))
# Remove numbers
hony_clean_corpus <- tm_map(hony_clean_corpus, removeNumbers)
# Remove standard English stopwords ("and", "the", "of", ...) plus custom leftovers (contractions stripped of apostrophes, stems, etc.)
hony_clean_corpus <- tm_map(hony_clean_corpus, removeWords, c(stopwords("english"), "im", "didnt", "couldnt","wasnt", "id", "ive", "everi", "tri", "hed", "hes", "everyth", "wed", "someth", "togeth", "noth", "rememb", "cri", "â", "anoth", "marri", "eventu", "especi", "emot", "isnt", "dont", "mother"))
# Remove words like "you'll", "will", "anyways", etc.
hony_clean_corpus <- tm_map(hony_clean_corpus, removeWords, stopwords("SMART"))
# Remove commas, periods, etc.
hony_clean_corpus <- tm_map(hony_clean_corpus, removePunctuation)
# Strip unnecessary whitespace
hony_clean_corpus <- tm_map(hony_clean_corpus, stripWhitespace)
class(hony_clean_corpus)
[1] "VCorpus" "Corpus" 
inspect(hony_clean_corpus[3])
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 1

[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 1047

Inspecting the cleaned corpus after pre-processing

writeLines(head(strwrap(hony_clean_corpus[[7]])))
john makes fun mom humility put letter billboard put initials
license plate porsche doesnt understand real estate works good
humility mouse hiding rug talks mouse rug list house mouse
list house boss lady began ten years ago million sales past
years number real estate agent augusta finally paid house
bought houses put kids private school paid sports activities

Creating the DFM

# Convert the cleaned tm corpus into a quanteda corpus and tokenize it
docs1 <- c(hony_clean_corpus)
doc_corpus <- corpus(docs1)
docs_tokens <- tokens(doc_corpus)
docs_tokens
Tokens consisting of 1,607 documents and 7 docvars.
text1 :
[1] "hony"    "stories" "dataset"

text2 :
 [1] "early"   "days"    "kristen" "write"   "single"  "email"  
 [7] "type"    "hit"     "send"    "late"    "night"   "glass"  
[ ... and 123 more ]

text3 :
 [1] "quit"          "jobs"          "nervewracking" "remember"     
 [5] "day"           "wore"          "suit"          "meeting"      
 [9] "coffee"        "shop"          "wore"          "suit"         
[ ... and 141 more ]

text4 :
 [1] "months"     "eduardos"   "case"       "music"      "festival"  
 [6] "kind"       "scene"      "jersey"     "shore"      "lot"       
[11] "glowsticks" "temporary" 
[ ... and 150 more ]

text5 :
 [1] "eduardo"     "nervous"     "office"      "barely"     
 [5] "spoke"       "english"     "told"        "story"      
 [9] "interpreter" "explained"   "hometown"    "colombia"   
[ ... and 133 more ]

text6 :
 [1] "tripp"     "prison"    "sat"       "kids"      "told"     
 [6] "loved"     "chosen"    "conceived" "sperm"     "donor"    
[11] "thought"   "hard"     
[ ... and 121 more ]

[ reached max_ndoc ... 1,601 more documents ]
# Stem the tokens and build the document-feature matrix
docs_dfm <- docs_tokens %>%
  tokens_wordstem() %>%
  dfm()
docs_dfm
Document-feature matrix of: 1,607 documents, 8,725 features (99.28% sparse) and 7 docvars.
       features
docs    honi stori dataset earli day kristen write singl email type
  text1    1     1       1     0   0       0     0     0     0    0
  text2    0     0       0     1   1       3     1     1     1    1
  text3    0     0       0     0   1       2     0     0     0    0
  text4    0     1       0     0   0       5     0     0     1    0
  text5    0     1       0     0   1       0     0     0     0    0
  text6    0     0       0     0   0       0     0     0     0    0
[ reached max_ndoc ... 1,601 more documents, reached max_nfeat ... 8,715 more features ]
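
As a quick sanity check on the DFM (not part of the original pipeline), quanteda's topfeatures() lists the most frequent stems and should roughly agree with the frequency table built below.

# Quick check: the 20 most frequent stems in the DFM
topfeatures(docs_dfm, 20)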

Creating the DTM

# Build a document-term matrix from the cleaned tm corpus
dtm <- DocumentTermMatrix(hony_clean_corpus)
dtm
<<DocumentTermMatrix (documents: 1607, terms: 13070)>>
Non-/sparse entries: 103798/20899692
Sparsity           : 100%
Maximal term length: 36
Weighting          : term frequency (tf)

Creating a word-frequency data frame

# Create data frame with words and frequency of occurrence
tdm <- TermDocumentMatrix(docs1)
tdm2 <- as.matrix(tdm)
# Sum each term's frequency across documents and sort in decreasing order
words <- sort(rowSums(tdm2), decreasing = TRUE)
df <- data.frame(word = names(words), freq = words)
dim(df)
[1] 13070     2

Top 50 words by frequency

# Word frequency table
head(df, 50)
             word freq
time         time 1311
people     people 1101
years       years  875
day           day  831
back         back  795
life         life  786
told         told  776
things     things  640
wanted     wanted  586
started   started  551
school     school  548
home         home  541
lot           lot  535
work         work  502
make         make  467
night       night  467
family     family  462
feel         feel  449
thing       thing  445
love         love  434
knew         knew  432
felt         felt  420
good         good  419
made         made  415
thought   thought  409
mom           mom  404
shes         shes  370
house       house  362
money       money  358
friends   friends  348
shed         shed  344
kids         kids  340
hard         hard  329
job           job  328
father     father  325
world       world  303
long         long  292
year         year  291
asked       asked  288
called     called  288
left         left  278
dad           dad  275
remember remember  271
finally   finally  268
gave         gave  268
youre       youre  264
working   working  262
ill           ill  261
man           man  258
entire     entire  253

Final word cloud after cleaning

# Create word cloud
set.seed(5000)
wordcloud(docs1
    , scale = c(2, 0.5)      # range of word sizes (largest to smallest)
    , max.words = 300        # maximum number of words to plot
    , random.order = FALSE   # plot the most frequent words first
    , rot.per = 0.20         # proportion of words rotated 90 degrees
    , use.r.layout = FALSE   # use C++ code for collision detection
    , colors = brewer.pal(8, "Set2"))
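
Since df (built above) already holds the word frequencies, the same cloud can also be drawn from those precomputed counts instead of re-processing the corpus; a minimal alternative with the same settings:

# Alternative: draw the cloud from the precomputed frequency table
set.seed(5000)
wordcloud(words = df$word, freq = df$freq,
          scale = c(2, 0.5), max.words = 300,
          random.order = FALSE, rot.per = 0.20,
          colors = brewer.pal(8, "Set2"))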

Barplot of Top 50 Most Frequent Words

# Plot of most frequently used words
barplot(df[1:50,]$freq, las=2, names.arg = df[1:50,]$word,
        col="red", main="Top 50 Most Frequent Words",
        ylab="Word frequencies")

Plotting an NRC radar chart to visualize sentiment

df %>%
  # implement sentiment analysis using the "nrc" lexicon
  inner_join(get_sentiments("nrc")) %>%
  # remove "positive/negative" sentiments
  filter(!sentiment %in% c("positive", "negative", "neutral")) %>%
  # get the frequencies of sentiments
  count(sentiment, sort = TRUE) %>% 
  # calculate the proportion of each sentiment
  mutate(percent = 100 * n / sum(n)) %>%
  select(sentiment, percent) %>%
  # plot the result as a radar chart
  chartJSRadar(showToolTipLabel = TRUE, main = "NRC Radar")
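
Note that count(sentiment) above counts each distinct word once, no matter how often it appears in the corpus. If a frequency-weighted view is wanted instead, a sketch using the same df would weight each matched word by its freq column:

# Sketch: weight each matched word by its corpus frequency
df %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(sentiment, wt = freq, sort = TRUE) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  select(sentiment, percent) %>%
  chartJSRadar(showToolTipLabel = TRUE, main = "NRC Radar (frequency-weighted)")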

Creating the FCM

# Create a feature co-occurrence matrix (FCM) from the DFM
smaller_fcm <- fcm(docs_dfm)

# check the dimensions (i.e., the number of rows and the number of columns)
# of the matrix we created
dim(smaller_fcm)
[1] 8725 8725

Creating a smaller FCM to plot a text network

# pull the top features
myFeatures <- names(topfeatures(smaller_fcm, 25))

# retain only those top features as part of our matrix
even_smaller_fcm <- fcm_select(smaller_fcm, pattern = myFeatures, selection = "keep")

# check dimensions
dim(even_smaller_fcm)
[1] 25 25
# compute a size weight for vertices in the network
size <- log(rowSums(even_smaller_fcm))

# create plot
textplot_network(even_smaller_fcm, 
                 min_freq = 5, 
                 edge_alpha = 0.5, 
                 edge_size = 1,
                 edge_color = "blue",
                 vertex_labelsize = size * 0.75)