# Enter your name here: Gil Raitses
# 1. I did this homework by myself, with help from the book and the professor.
Text mining plays an important role in many
industries because of the prevalence of text in the interactions between
customers and company representatives. Even when the customer
interaction happens by speech rather than by chat or email,
speech-to-text algorithms have become good enough that transcriptions
of these spoken-word interactions are often available. To an increasing
extent, a data scientist needs to be able to wield tools that turn a
body of text into actionable insights. In this homework, we explore a
real City of Syracuse dataset using the quanteda and
quanteda.textplots packages. Make sure to install the
quanteda and quanteda.textplots
packages before following the steps below:
# The article covers a competition organized by the City of Syracuse to name its snowplows. It details the submitted names, the selected winners, and the criteria used for choosing the names.
#install.packages("quanteda")
#install.packages("quanteda.textplots")
library(quanteda)
## Package version: 4.0.2
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library(quanteda.textplots)
# Read the data from the URL into a dataframe called df
df <- read.csv("https://intro-datascience.s3.us-east-2.amazonaws.com/snowplownames.csv")
Hint: Make sure you have loaded quanteda with library()
# Inspect the df dataframe
head(df)
## submission_number submitter_name_anonymized snowplow_name
## 1 1 kjlt9cua rudolph
## 2 2 KXKaabXN salt life
## 3 3 kjlt9cua blizzard
## 4 4 Rv9sODqp butter
## 5 5 zzcc5FDn santa's 10 reindeer
## 6 6 wOrKO7XI plowy mcplowface
## meaning
## 1 The red nose cuts through any storm.
## 2 We may not be near the ocean like everyone else with the stickers that say Salt Life, but we have plenty of salt!
## 3 This plow can handle any storm.
## 4 It's amazing how the snow plows through snow like butter!
## 5 They can deliver through the bad weather and snow.
## 6 It would be a great name
## winning_name
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
str(df)
## 'data.frame': 1907 obs. of 5 variables:
## $ submission_number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ submitter_name_anonymized: chr "kjlt9cua" "KXKaabXN" "kjlt9cua" "Rv9sODqp" ...
## $ snowplow_name : chr "rudolph" "salt life" "blizzard" "butter" ...
## $ meaning : chr "The red nose cuts through any storm." "We may not be near the ocean like everyone else with the stickers that say Salt Life, but we have plenty of salt!" "This plow can handle any storm." "It's amazing how the snow plows through snow like butter!" ...
## $ winning_name : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# The column named 'meaning' is the one that contains explanations of the snowplow names
# Transform the 'meaning' column into a document-feature matrix
corpus_df <- corpus(df, text_field = "meaning") # Create a corpus from the 'meaning' column
## Warning: NA is replaced by empty string
tokens_df <- tokens(corpus_df) # Tokenize the corpus
tokens_df <- tokens_select(tokens_df, pattern = stopwords("en"), selection = "remove") # Remove stop words
dfm_df <- dfm(tokens_df) # Create a document-feature matrix
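As an optional sanity check (not part of the assignment), quanteda's ndoc() and nfeat() accessors confirm that the dfm covers every submission and show how large the vocabulary is after stop word removal:
# Optional check: one document per submission, plus vocabulary size
ndoc(dfm_df) # number of documents; should equal nrow(df), i.e. 1907
nfeat(dfm_df) # number of unique tokens remaining after stop word removal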
Hint: Make sure you have loaded (and installed if needed) quanteda.textplots
# Plot a word cloud where a word is only represented if it appears at least 2 times
textplot_wordcloud(dfm_df, min_count = 2)
# Increase the minimum count to 10 and plot the word cloud
textplot_wordcloud(dfm_df, min_count = 10)
# When the minimum count is increased to 10, the word cloud keeps only the words that appear at least 10 times, filtering out many of the words shown in the first plot.
# The main words in the word cloud are:
# - 'snow'
# - 'syracuse'
# - 'plow'
# - 'salt'
# - 'name'
# - 'columbus'
# - 'city'
# - 'roads'
# These words highlight common themes in the snowplow name explanations: snow and salting, the plows' territory, local street names, and the city's history of mining and exporting salt. An optional styling sketch follows.
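For a cleaner visual, textplot_wordcloud() also accepts max_words and color arguments; a quick sketch with illustrative values (the cap of 50 words and the palette are not assignment-specified):
# Optional styling: cap the cloud at 50 words and use a two-color palette
textplot_wordcloud(dfm_df, min_count = 10, max_words = 50,
                   color = c("steelblue", "darkorange"))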
Output the 10 most frequent words (each word and its count).
Hint: use textstat_frequency() from the quanteda.textstats package.
#install.packages("quanteda.textstats")
library(quanteda.textstats)
# Create a frequency table of words
word_freq <- textstat_frequency(dfm_df)
# Output the 10 most frequent words (their word count and the word)
top_10_words <- head(word_freq, 10)
top_10_words
## feature frequency rank docfreq group
## 1 . 674 1 515 all
## 2 ¿ 452 2 148 all
## 3 ½ 432 3 143 all
## 4 ï 336 4 147 all
## 5 snow 321 5 292 all
## 6 , 319 6 236 all
## 7 ! 215 7 166 all
## 8 syracuse 174 8 164 all
## 9 name 143 9 137 all
## 10 plow 140 10 130 all
# Create a named list of the top 10 words and their counts
named_list_top_10_words <- setNames(as.list(top_10_words$frequency), top_10_words$feature)
named_list_top_10_words
## $.
## [1] 674
##
## $`¿`
## [1] 452
##
## $`½`
## [1] 432
##
## $ï
## [1] 336
##
## $snow
## [1] 321
##
## $`,`
## [1] 319
##
## $`!`
## [1] 215
##
## $syracuse
## [1] 174
##
## $name
## [1] 143
##
## $plow
## [1] 140
# Half of the top-10 entries are punctuation or stray characters rather than words:
# - '.'
# - '¿'
# - '½'
# - 'ï'
# - ','
# The '¿', '½', and 'ï' tokens look like fragments of the mojibake sequence 'ï¿½'
# (the Unicode replacement character read in the wrong encoding), which points to
# a text-encoding issue; a cleanup sketch follows the next comment block.
# Frequently occurring words are also prominent:
# - 'snow'
# - 'syracuse'
# - 'name'
# - 'plow'
# These highlight relevant themes in the descriptions.
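One way to address the stray tokens, assuming they are punctuation and mojibake rather than meaningful words, is to strip punctuation and symbols at tokenization time; remove_punct and remove_symbols are standard arguments to tokens():
# Optional cleanup sketch: rebuild the dfm without punctuation or symbol tokens
tokens_clean <- tokens(corpus_df, remove_punct = TRUE, remove_symbols = TRUE)
tokens_clean <- tokens_select(tokens_clean, pattern = stopwords("en"), selection = "remove")
dfm_clean <- dfm(tokens_clean)
# Letter-like mojibake such as 'ï' survives these filters and may still need
# an explicit tokens_remove() pattern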
There should be 2006 positive words and 4783 negative words, so you may need to clean up these lists a bit.
# Read in the list of positive words
posWords <- scan("https://intro-datascience.s3.us-east-2.amazonaws.com/positive-words.txt", what = "character", comment.char = ";")
# Output the first 5 positive words
head(posWords, 5)
## [1] "a+" "abound" "abounds" "abundance" "abundant"
# Read in the list of negative words
negWords <- scan("https://intro-datascience.s3.us-east-2.amazonaws.com/negative-words.txt", what = "character", comment.char = ";")
# Output the first 5 negative words
head(negWords, 5)
## [1] "2-faced" "2-faces" "abnormal" "abolish" "abominable"
Then pass this new dfm to the textstat_frequency() function to see the positive words in our corpus, and how many times each word was mentioned.
# Match the words in the dfm with the positive words list
dfm_pos <- dfm_match(dfm_df, features = posWords)
# Analyze the frequency of positive words in the corpus
pos_freq <- textstat_frequency(dfm_pos)
# Output the positive words and their frequencies
pos_freq
## feature frequency rank docfreq group
## 1 like 88 1 85 all
## 2 honor 47 2 47 all
## 3 great 43 3 43 all
## 4 good 28 4 28 all
## 5 fun 27 5 24 all
## 6 strong 25 6 25 all
## 7 best 23 7 22 all
## 8 love 21 8 21 all
## 9 work 21 8 21 all
## 10 clear 19 10 19 all
## 11 famous 16 11 16 all
## 12 pride 16 11 16 all
## 13 safe 16 11 16 all
## 14 tough 15 14 15 all
## 15 well 15 14 15 all
## 16 clean 13 16 13 all
## 17 favorite 13 16 13 all
## 18 amazing 12 18 12 all
## 19 cute 10 19 10 all
## 20 beloved 9 20 9 all
## 21 right 9 20 9 all
## 22 better 8 22 8 all
## 23 honoring 8 22 8 all
## 24 powerful 8 22 8 all
## 25 cool 7 25 7 all
## 26 homage 7 25 7 all
## 27 respect 7 25 7 all
## 28 appropriate 6 28 6 all
## 29 classic 6 28 6 all
## 30 golden 6 28 6 all
## 31 pretty 6 28 6 all
## 32 clears 5 32 5 all
## 33 enough 5 32 5 all
## 34 greatest 5 32 5 all
## 35 loves 5 32 5 all
## 36 magic 5 32 5 all
## 37 mighty 5 32 5 all
## 38 proud 5 32 5 all
## 39 support 5 32 5 all
## 40 works 5 32 5 all
## 41 award 4 41 4 all
## 42 cleared 4 41 4 all
## 43 dedicated 4 41 4 all
## 44 hero 4 41 4 all
## 45 humor 4 41 4 all
## 46 loved 4 41 4 all
## 47 popular 4 41 4 all
## 48 smile 4 41 4 all
## 49 super 4 41 4 all
## 50 top 4 41 4 all
## 51 winner 4 41 4 all
## 52 won 4 41 4 all
## 53 awesome 3 53 3 all
## 54 catchy 3 53 3 all
## 55 celebrate 3 53 3 all
## 56 courage 3 53 3 all
## 57 excellent 3 53 3 all
## 58 happy 3 53 3 all
## 59 hilarious 3 53 3 all
## 60 important 3 53 3 all
## 61 lead 3 53 3 all
## 62 liked 3 53 3 all
## 63 positive 3 53 3 all
## 64 safely 3 53 3 all
## 65 saint 3 53 3 all
## 66 uplifting 3 53 2 all
## 67 win 3 53 3 all
## 68 worked 3 53 3 all
## 69 autonomous 2 69 2 all
## 70 awesomeness 2 69 2 all
## 71 beautiful 2 69 2 all
## 72 benefit 2 69 2 all
## 73 boom 2 69 1 all
## 74 bright 2 69 2 all
## 75 easy 2 69 2 all
## 76 free 2 69 2 all
## 77 freedom 2 69 2 all
## 78 gold 2 69 2 all
## 79 holy 2 69 2 all
## 80 honored 2 69 2 all
## 81 loving 2 69 2 all
## 82 neat 2 69 2 all
## 83 nice 2 69 2 all
## 84 noble 2 69 2 all
## 85 perseverance 2 69 2 all
## 86 prosperity 2 69 2 all
## 87 protection 2 69 2 all
## 88 ready 2 69 2 all
## 89 recovery 2 69 1 all
## 90 rich 2 69 2 all
## 91 trophy 2 69 2 all
## 92 trust 2 69 2 all
## 93 warm 2 69 2 all
## 94 winning 2 69 2 all
## 95 wins 2 69 2 all
## 96 abounds 1 96 1 all
## 97 accolades 1 96 1 all
## 98 accomplish 1 96 1 all
## 99 accomplishments 1 96 1 all
## 100 accurate 1 96 1 all
## 101 achievement 1 96 1 all
## 102 achievements 1 96 1 all
## 103 angel 1 96 1 all
## 104 appeal 1 96 1 all
## 105 awards 1 96 1 all
## 106 awesomely 1 96 1 all
## 107 backbone 1 96 1 all
## 108 beauty 1 96 1 all
## 109 blossom 1 96 1 all
## 110 brave 1 96 1 all
## 111 brighten 1 96 1 all
## 112 capability 1 96 1 all
## 113 capable 1 96 1 all
## 114 cheer 1 96 1 all
## 115 clearer 1 96 1 all
## 116 clever 1 96 1 all
## 117 consistent 1 96 1 all
## 118 continuity 1 96 1 all
## 119 coolest 1 96 1 all
## 120 correctly 1 96 1 all
## 121 courageous 1 96 1 all
## 122 crisp 1 96 1 all
## 123 darling 1 96 1 all
## 124 dawn 1 96 1 all
## 125 decent 1 96 1 all
## 126 dignity 1 96 1 all
## 127 easier 1 96 1 all
## 128 elite 1 96 1 all
## 129 encourage 1 96 1 all
## 130 enjoy 1 96 1 all
## 131 envy 1 96 1 all
## 132 everlasting 1 96 1 all
## 133 excellence 1 96 1 all
## 134 excited 1 96 1 all
## 135 fair 1 96 1 all
## 136 faith 1 96 1 all
## 137 fame 1 96 1 all
## 138 fantastic 1 96 1 all
## 139 fastest 1 96 1 all
## 140 fav 1 96 1 all
## 141 fidelity 1 96 1 all
## 142 finest 1 96 1 all
## 143 freedoms 1 96 1 all
## 144 fresh 1 96 1 all
## 145 friendly 1 96 1 all
## 146 genius 1 96 1 all
## 147 gifted 1 96 1 all
## 148 glory 1 96 1 all
## 149 glow 1 96 1 all
## 150 grace 1 96 1 all
## 151 grateful 1 96 1 all
## 152 hail 1 96 1 all
## 153 hardy 1 96 1 all
## 154 helped 1 96 1 all
## 155 helping 1 96 1 all
## 156 heroine 1 96 1 all
## 157 honest 1 96 1 all
## 158 humorous 1 96 1 all
## 159 incredible 1 96 1 all
## 160 innovation 1 96 1 all
## 161 inspiring 1 96 1 all
## 162 instantly 1 96 1 all
## 163 instrumental 1 96 1 all
## 164 jolly 1 96 1 all
## 165 legendary 1 96 1 all
## 166 likes 1 96 1 all
## 167 logical 1 96 1 all
## 168 lovable 1 96 1 all
## 169 loyal 1 96 1 all
## 170 lucky 1 96 1 all
## 171 magical 1 96 1 all
## 172 merit 1 96 1 all
## 173 miracle 1 96 1 all
## 174 modern 1 96 1 all
## 175 motivated 1 96 1 all
## 176 patriot 1 96 1 all
## 177 persevere 1 96 1 all
## 178 pleasant 1 96 1 all
## 179 prefer 1 96 1 all
## 180 proactive 1 96 1 all
## 181 protect 1 96 1 all
## 182 recover 1 96 1 all
## 183 respectful 1 96 1 all
## 184 respectfully 1 96 1 all
## 185 satisfy 1 96 1 all
## 186 savings 1 96 1 all
## 187 savior 1 96 1 all
## 188 sensation 1 96 1 all
## 189 shiny 1 96 1 all
## 190 significant 1 96 1 all
## 191 smart 1 96 1 all
## 192 smiles 1 96 1 all
## 193 smooth 1 96 1 all
## 194 spirited 1 96 1 all
## 195 steady 1 96 1 all
## 196 strongest 1 96 1 all
## 197 sturdy 1 96 1 all
## 198 success 1 96 1 all
## 199 supported 1 96 1 all
## 200 sweet 1 96 1 all
## 201 talented 1 96 1 all
## 202 tenacity 1 96 1 all
## 203 thrilled 1 96 1 all
## 204 trusting 1 96 1 all
## 205 unforgettable 1 96 1 all
## 206 unlimited 1 96 1 all
## 207 unparalleled 1 96 1 all
## 208 winners 1 96 1 all
## 209 wonderful 1 96 1 all
## 210 worth 1 96 1 all
## 211 wow 1 96 1 all
C. Sum all the positive words
# Sum all the positive words
total_pos_words <- sum(colSums(dfm_pos))
total_pos_words
## [1] 866
D. Do a similar analysis for the negative words - show the 10 most frequent negative words and then sum the negative words in the document.
# Match the words in the dfm with the negative words list
dfm_neg <- dfm_match(dfm_df, features = negWords)
# Analyze the frequency of negative words in the corpus
neg_freq <- textstat_frequency(dfm_neg)
# Output the 10 most frequent negative words
top_10_neg_words <- head(neg_freq, 10)
top_10_neg_words
## feature frequency rank docfreq group
## 1 funny 25 1 25 all
## 2 cold 8 2 8 all
## 3 twist 8 2 8 all
## 4 hard 7 4 7 all
## 5 abominable 6 5 6 all
## 6 problem 6 5 6 all
## 7 bad 5 7 5 all
## 8 destroy 5 7 5 all
## 9 died 5 7 5 all
## 10 bust 4 10 4 all
# Sum all the negative words in the document
total_neg_words <- sum(colSums(dfm_neg))
total_neg_words
## [1] 255
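One compact way to summarize the two totals (a sketch, not part of the assignment) is the positive share of all matched sentiment words:
# Positive share of matched sentiment words: 866 / (866 + 255), about 0.77
total_pos_words / (total_pos_words + total_neg_words)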
# After matching positive and negative words, positive words are clearly more common in this dataset (866 vs. 255).
# The most frequently used positive words include:
# - 'like'
# - 'honor'
# - 'great'
# - 'good'
# - 'fun'
# Some negative words might not actually be used in a negative context, such as:
# - 'funny'
# - 'twist'
# - 'cold'
# Dictionary matching ignores context, so verifying how flagged words are actually used is critical for accurate sentiment analysis; a kwic() sketch follows.
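quanteda's kwic() (keyword-in-context) function is a quick way to spot-check such words; for example, to see how 'funny' is actually used in the submissions:
# Inspect the context around 'funny' to judge whether it is really negative
head(kwic(tokens_df, pattern = "funny"), 5)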