setwd("~/Desktop/Rstudio")
library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textplots)
library(quanteda.textstats)
library(corpustools)
library(SentimentAnalysis) #Sentiment dictionary from R
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
x <- read.csv('womens_clothing.csv')
x$no <- as.character(x$no)
corp <- corpus(x, docid_field = 'no', text_field = 'review_text')
#create dtm
dtm <- corp |>
tokens() |>
tokens_tolower() |>
dfm()
dtm
## Document-feature matrix of: 23,486 documents, 17,808 features (99.74% sparse) and 9 docvars.
## features
## docs absolutely wonderful - silky and sexy comfortable love this dress
## 0 1 1 1 1 2 1 1 0 0 0
## 1 0 0 0 0 2 0 0 2 1 1
## 2 0 0 1 0 3 0 1 0 2 1
## 3 0 0 0 0 1 0 0 3 1 0
## 4 0 0 0 0 1 0 0 1 2 0
## 5 0 0 0 0 4 0 0 2 4 3
## [ reached max_ndoc ... 23,480 more documents, reached max_nfeat ... 17,798 more features ]
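The pipeline above only lowercases the tokens. If we wanted stricter preprocessing, we could also drop punctuation and stopwords; a minimal sketch (not used in the rest of this analysis, since stopword removal can also delete some dictionary terms):
#optional stricter variant: remove punctuation and English stopwords
dtm_clean <- corp |>
  tokens(remove_punct = TRUE) |>
  tokens_tolower() |>
  tokens_remove(stopwords("en")) |>
  dfm()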
Creating a dictionary
GI_dict = dictionary(DictionaryGI)
#the dictionary() function converts the dictionary from the SentimentAnalysis package into a format that quanteda can process
#applying the dictionary and converting df into a tibble
result = dtm %>%
dfm_lookup(GI_dict) %>%
convert(to = "data.frame") %>%
as_tibble
result
## # A tibble: 23,486 × 3
## doc_id negative positive
## <chr> <dbl> <dbl>
## 1 0 0 2
## 2 1 0 6
## 3 2 2 4
## 4 3 3 6
## 5 4 2 4
## 6 5 1 5
## 7 6 1 2
## 8 7 4 3
## 9 8 1 2
## 10 9 3 4
## # ℹ 23,476 more rows
Improving our text analysis
This adds a new variable to our data frame showing how many words each document contains in total, which puts more context into our analysis. For example, text 1 has 6 positive terms but a total length of 70 words, which may indicate that the text is not as positive as it first seems.
result <- result |>
mutate(length = ntoken(dtm))
result
## # A tibble: 23,486 × 4
## doc_id negative positive length
## <chr> <dbl> <dbl> <int>
## 1 0 0 2 8
## 2 1 0 6 70
## 3 2 2 4 111
## 4 3 3 6 30
## 5 4 2 4 41
## 6 5 1 5 107
## 7 6 1 2 121
## 8 7 4 3 117
## 9 8 1 2 37
## 10 9 3 4 88
## # ℹ 23,476 more rows
We can make this easier to interpret by creating a single sentiment score for each document in the document-term matrix. The options are below:
result <- result |>
mutate(sentiment1=(positive - negative) / (positive + negative), # ranges from -1 to 1: -1 means purely negative, 1 purely positive
sentiment2=(positive - negative) / length, # also ranges from -1 to 1, but a score of 1 means every term in the document is positive, and vice versa
subjectivity=(positive + negative) / length) # measures subjectivity instead, though this metric is somewhat less reliable
result
## # A tibble: 23,486 × 7
## doc_id negative positive length sentiment1 sentiment2 subjectivity
## <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0 0 2 8 1 0.25 0.25
## 2 1 0 6 70 1 0.0857 0.0857
## 3 2 2 4 111 0.333 0.0180 0.0541
## 4 3 3 6 30 0.333 0.1 0.3
## 5 4 2 4 41 0.333 0.0488 0.146
## 6 5 1 5 107 0.667 0.0374 0.0561
## 7 6 1 2 121 0.333 0.00826 0.0248
## 8 7 4 3 117 -0.143 -0.00855 0.0598
## 9 8 1 2 37 0.333 0.0270 0.0811
## 10 9 3 4 88 0.143 0.0114 0.0795
## # ℹ 23,476 more rows
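Note that sentiment1 is undefined (NaN) for documents with no dictionary hits, since we then divide by zero. With that in mind, we can summarise the overall tone of the corpus; a quick sketch using the columns created above:
#average scores across all reviews, skipping documents without dictionary hits
result |>
  summarise(mean_sent1 = mean(sentiment1, na.rm = TRUE),
            mean_sent2 = mean(sentiment2, na.rm = TRUE),
            mean_subj = mean(subjectivity, na.rm = TRUE))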
Sometimes the dictionary misclassifies a word as positive or negative; an example can be seen below:
freqs <- textstat_frequency(dtm)
freqs |>
as_tibble() |>
filter(feature %in% GI_dict$positive)
## # A tibble: 682 × 5
## feature frequency rank docfreq group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 love 8940 24 7415 all
## 2 fit 7280 29 6170 all
## 3 like 7007 32 5732 all
## 4 great 6094 37 5180 all
## 5 just 5598 43 4720 all
## 6 perfect 3747 60 3361 all
## 7 well 3231 68 2922 all
## 8 back 3211 69 2841 all
## 9 comfortable 3046 74 2941 all
## 10 cute 3028 75 2792 all
## # ℹ 672 more rows
Let's test whether words are correctly classified as negative or positive. "Just" can be used as a negative or a positive word. By using keyword-in-context (KWIC), we can see whether it is only used in a positive light.
head(kwic(tokens(corp), "just", window = 4))
## Keyword-in-context with 6 matches.
## [1, 51] length on me- hits | just | a little below the
## [2, 59] medium, which was | just | ok. overall,
## [5, 19] petite. i am | just | under 5 feet tall
## [5, 96] the style but it | just | did not work on
## [10, 37] very cheap that even | just | pulling on it will
## [12, 91] very form-fitting. falls | just | above the knee and
As can be seen, "just" can be used negatively, positively, or even neutrally. For example, the second row uses "just" in an almost positive light ("just ok"), while the third row uses the term to describe the reviewer's height, so counting it as positive there is inaccurate, and the fourth and fifth rows use the term in a negative manner. Given this, we can remove the term "just" from the list of positive words.
positive.cleaned <- setdiff(GI_dict$positive, c("just"))
GI_dict2 <- dictionary(list(positive = positive.cleaned, negative = GI_dict$negative))
#double check
freqs |>
filter(feature %in% GI_dict2$positive) |>
as_tibble()
## # A tibble: 681 × 5
## feature frequency rank docfreq group
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 love 8940 24 7415 all
## 2 fit 7280 29 6170 all
## 3 like 7007 32 5732 all
## 4 great 6094 37 5180 all
## 5 perfect 3747 60 3361 all
## 6 well 3231 68 2922 all
## 7 back 3211 69 2841 all
## 8 comfortable 3046 74 2941 all
## 9 cute 3028 75 2792 all
## 10 nice 3016 76 2682 all
## # ℹ 671 more rows
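With the cleaned dictionary in hand, the document-level scores can be recomputed exactly as before; a sketch reusing the earlier pipeline:
#re-apply the lookup with GI_dict2 and rebuild the main sentiment score
result2 <- dtm |>
  dfm_lookup(GI_dict2) |>
  convert(to = "data.frame") |>
  as_tibble() |>
  mutate(length = ntoken(dtm),
         sentiment1 = (positive - negative) / (positive + negative))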
Finally, corpustools helps visualise the full text with annotations and highlights, illustrating which words are positive or negative.
library(corpustools)
#creating corpus
t = create_tcorpus(x, doc_column = "no", text_column = "review_text")
#Applying dictionary
t$code_dictionary(GI_dict2, column = 'lsd15') #code the tokens in this corpus with our cleaned GI_dict2 dictionary; 'lsd15' is just the column label
t$set('sentiment', 1, subset = lsd15 %in% c('positive','neg_negative')) #positive words get the value 1
t$set('sentiment', -1, subset = lsd15 %in% c('negative','neg_positive')) #negative words get the value -1 (the neg_* categories do not occur in the GI dictionary, so those conditions simply never match)
#this tool tokenises the data into a token list instead of creating a dtm
t$tokens
## doc_id token_id token lsd15_id lsd15 sentiment
## 1: 0 1 Absolutely NA <NA> NA
## 2: 0 2 wonderful 1 positive 1
## 3: 0 3 - NA <NA> NA
## 4: 0 4 silky NA <NA> NA
## 5: 0 5 and NA <NA> NA
## ---
## 1567274: 23485 19 too 152139 negative -1
## 1567275: 23485 20 ! NA <NA> NA
## 1567276: 23485 21 highly NA <NA> NA
## 1567277: 23485 22 recommend NA <NA> NA
## 1567278: 23485 23 ! NA <NA> NA
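Because sentiment now lives at the token level, we can also aggregate it back into one score per document; a minimal sketch, assuming t$tokens is a data.table (as the printed output above suggests):
#sum token-level sentiment per document; NA means the token matched no dictionary entry
doc_sent <- t$tokens[, .(sentiment = sum(sentiment, na.rm = TRUE)), by = doc_id]
head(doc_sent)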
What we can do with this tokenised list is create a full-text browser that highlights positive and negative words:
browse_texts(t, scale = 'sentiment')