Blog post 4: data manipulation and some basic visualizations, as part of the course “Text as Data”
Loading required libraries
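The original post does not show the exact library() calls, so the set below is inferred from the functions used later in this post; the split between quanteda and quanteda.textplots is an assumption (textplot_network moved to the latter in recent quanteda versions).
# Assumed package set, inferred from the functions used below
library(tm)                  # VCorpus, tm_map, DocumentTermMatrix, TermDocumentMatrix
library(quanteda)            # corpus, tokens, dfm, fcm, topfeatures, fcm_select
library(quanteda.textplots)  # textplot_network (assumed; bundled with older quanteda)
library(wordcloud)           # wordcloud
library(RColorBrewer)        # brewer.pal
library(dplyr)               # %>%, inner_join, filter, count, mutate, select
library(tidytext)            # get_sentiments
library(radarchart)          # chartJSRadar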
Creation of VCorpus
# Read the HONY text file; each line becomes one document in the corpus
hony_unclean <- VCorpus(VectorSource(readLines("C:/Users/gunde/Documents/hony.txt")))
hony_unclean
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 1607
Viewing a text from the corpus
writeLines(head(strwrap(hony_unclean[[4]]), 7))
A few months after Eduardos case I went to a music festival.
It wasnt normally my kind of scene. It was on the Jersey
Shore. There were a lot of glowsticks and temporary tattoos.
But I was twenty-six. I had to do something on the weekends.
Everyone in my group seemed to know each other except for me
and a girl named Kristen. We were the bring-alongs, so we
kinda got stuck together. Kristens only 53. And shes
Pre-processing the corpus
# Clean text file and pre-process for word cloud
# Convert to lowercase
hony_clean_corpus <- tm_map(hony_unclean, content_transformer(tolower))
# Remove numbers
hony_clean_corpus <- tm_map(hony_clean_corpus, removeNumbers)
# Remove English stopwords ("and", "the", "of", ...) and custom noise words
hony_clean_corpus <- tm_map(hony_clean_corpus, removeWords, c(stopwords("english"), "im", "didnt", "couldnt","wasnt", "id", "ive", "everi", "tri", "hed", "hes", "everyth", "wed", "someth", "togeth", "noth", "rememb", "cri", "â", "anoth", "marri", "eventu", "especi", "emot", "isnt", "dont", "mother"))
# Remove words like "you'll", "will", "anyways", etc.
hony_clean_corpus <- tm_map(hony_clean_corpus, removeWords, stopwords("SMART"))
# Remove commas, periods, etc.
hony_clean_corpus <- tm_map(hony_clean_corpus, removePunctuation)
# Strip unnecessary whitespace
hony_clean_corpus <- tm_map(hony_clean_corpus, stripWhitespace)
class(hony_clean_corpus)
[1] "VCorpus" "Corpus"
inspect(hony_clean_corpus[3])
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 1
[[1]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 1047
Viewing the cleaned corpus after pre-processing
writeLines(head(strwrap(hony_clean_corpus[[7]])))
john makes fun mom humility put letter billboard put initials
license plate porsche doesnt understand real estate works good
humility mouse hiding rug talks mouse rug list house mouse
list house boss lady began ten years ago million sales past
years number real estate agent augusta finally paid house
bought houses put kids private school paid sports activities
Creation of DFM
# Convert the cleaned tm corpus into a quanteda corpus and tokenize it
docs1 <- c(hony_clean_corpus)
doc_corpus <- corpus(docs1)
docs_tokens <- tokens(doc_corpus)
docs_tokens
Tokens consisting of 1,607 documents and 7 docvars.
text1 :
[1] "hony" "stories" "dataset"
text2 :
[1] "early" "days" "kristen" "write" "single" "email"
[7] "type" "hit" "send" "late" "night" "glass"
[ ... and 123 more ]
text3 :
[1] "quit" "jobs" "nervewracking" "remember"
[5] "day" "wore" "suit" "meeting"
[9] "coffee" "shop" "wore" "suit"
[ ... and 141 more ]
text4 :
[1] "months" "eduardos" "case" "music" "festival"
[6] "kind" "scene" "jersey" "shore" "lot"
[11] "glowsticks" "temporary"
[ ... and 150 more ]
text5 :
[1] "eduardo" "nervous" "office" "barely"
[5] "spoke" "english" "told" "story"
[9] "interpreter" "explained" "hometown" "colombia"
[ ... and 133 more ]
text6 :
[1] "tripp" "prison" "sat" "kids" "told"
[6] "loved" "chosen" "conceived" "sperm" "donor"
[11] "thought" "hard"
[ ... and 121 more ]
[ reached max_ndoc ... 1,601 more documents ]
# Stem the tokens and build the document-feature matrix
docs_dfm <- docs_tokens %>%
  tokens_wordstem() %>%
  dfm()
docs_dfm
Document-feature matrix of: 1,607 documents, 8,725 features (99.28% sparse) and 7 docvars.
features
docs honi stori dataset earli day kristen write singl email type
text1 1 1 1 0 0 0 0 0 0 0
text2 0 0 0 1 1 3 1 1 1 1
text3 0 0 0 0 1 2 0 0 0 0
text4 0 1 0 0 0 5 0 0 1 0
text5 0 1 0 0 1 0 0 0 0 0
text6 0 0 0 0 0 0 0 0 0 0
[ reached max_ndoc ... 1,601 more documents, reached max_nfeat ... 8,715 more features ]
Creating DTM
dtm = DocumentTermMatrix(hony_clean_corpus)
dtm
<<DocumentTermMatrix (documents: 1607, terms: 13070)>>
Non-/sparse entries: 103798/20899692
Sparsity : 100%
Maximal term length: 36
Weighting : term frequency (tf)
Creating a data frame of word frequencies
# Create data frame with words and frequency of occurrence
tdm = TermDocumentMatrix(docs1)
tdm2 = as.matrix(tdm)
words = sort(rowSums(tdm2), decreasing = TRUE)
df = data.frame(word = names(words), freq = words)
dim(df)
[1] 13070 2
Top 50 most frequent words
# Word frequency table
head(df, 50)
word freq
time time 1311
people people 1101
years years 875
day day 831
back back 795
life life 786
told told 776
things things 640
wanted wanted 586
started started 551
school school 548
home home 541
lot lot 535
work work 502
make make 467
night night 467
family family 462
feel feel 449
thing thing 445
love love 434
knew knew 432
felt felt 420
good good 419
made made 415
thought thought 409
mom mom 404
shes shes 370
house house 362
money money 358
friends friends 348
shed shed 344
kids kids 340
hard hard 329
job job 328
father father 325
world world 303
long long 292
year year 291
asked asked 288
called called 288
left left 278
dad dad 275
remember remember 271
finally finally 268
gave gave 268
youre youre 264
working working 262
ill ill 261
man man 258
entire entire 253
Final word cloud after cleaning
# Create word cloud
set.seed(5000)
wordcloud(docs1,
          scale = c(2, 0.5),
          max.words = 300,
          random.order = FALSE,
          rot.per = 0.20,
          use.r.layout = FALSE,
          colors = brewer.pal(8, "Set2"))
Barplot of Top 50 Most Frequent Words
# Plot of most frequently used words
barplot(df[1:50,]$freq, las=2, names.arg = df[1:50,]$word,
col="red", main="Top 50 Most Frequent Words",
ylab="Word frequencies")
Plotting a radar chart of sentiment using the NRC lexicon
df %>%
  # match words against the "nrc" sentiment lexicon
  inner_join(get_sentiments("nrc")) %>%
  # drop the "positive"/"negative" categories, keeping only the emotions
  filter(!sentiment %in% c("positive", "negative", "neutral")) %>%
  # count the number of distinct words associated with each sentiment
  count(sentiment, sort = TRUE) %>%
  # calculate the proportion
  mutate(percent = 100 * n / sum(n)) %>%
  select(sentiment, percent) %>%
  # plot the result as a radar chart
  chartJSRadar(showToolTipLabel = TRUE, main = "NRC Radar")
Creating FCM
# create fcm from dfm
smaller_fcm <- fcm(docs_dfm)
# check the dimensions (i.e., the number of rows and the number of columns)
# of the matrix we created
dim(smaller_fcm)
[1] 8725 8725
Creating a smaller FCM to plot a text network
# pull the top features
myFeatures <- names(topfeatures(smaller_fcm, 25))
# retain only those top features as part of our matrix
even_smaller_fcm <- fcm_select(smaller_fcm, pattern = myFeatures, selection = "keep")
# check dimensions
dim(even_smaller_fcm)
[1] 25 25
# compute size weights for the vertices in the network
size <- log(colSums(even_smaller_fcm))
# create the plot, scaling vertices by their co-occurrence counts
textplot_network(even_smaller_fcm,
                 min_freq = 5,
                 edge_alpha = 0.5,
                 edge_size = 1,
                 edge_color = "blue",
                 vertex_size = size,
                 vertex_labelsize = log(rowSums(even_smaller_fcm)) * 0.75)