Pre-processing

setwd("~/Desktop/Rstudio")

library(quanteda)
## Package version: 3.3.1
## Unicode version: 14.0
## ICU version: 71.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textplots)
library(quanteda.textstats)
library(corpustools)
library(SentimentAnalysis) #Sentiment dictionary from R
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
## 
##     write
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
x <- read.csv('womens_clothing.csv')
x$no <- as.character(x$no) #convert the review number to character so it can serve as the document id

corp <- corpus(x, docid_field = 'no', text_field = 'review_text')
#create dtm
dtm <- corp |>
  tokens() |>
  tokens_tolower() |>
  dfm()
dtm
## Document-feature matrix of: 23,486 documents, 17,808 features (99.74% sparse) and 9 docvars.
##     features
## docs absolutely wonderful - silky and sexy comfortable love this dress
##    0          1         1 1     1   2    1           1    0    0     0
##    1          0         0 0     0   2    0           0    2    1     1
##    2          0         0 1     0   3    0           1    0    2     1
##    3          0         0 0     0   1    0           0    3    1     0
##    4          0         0 0     0   1    0           0    1    2     0
##    5          0         0 0     0   4    0           0    2    4     3
## [ reached max_ndoc ... 23,480 more documents, reached max_nfeat ... 17,798 more features ]
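
Note that the dtm above still contains punctuation (see the "-" feature) and stopwords. The analysis below keeps this raw dtm so the printed results stay as shown, but if a leaner matrix is wanted, a sketch like the following would strip punctuation, numbers, and English stopwords. One caveat: stopword lists include negations such as "not", which can matter for sentiment analysis, so apply this step with care.

#optional extra cleaning, not used below so the printed outputs stay as shown
dtm_clean <- corp |>
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  tokens_tolower() |>
  tokens_remove(stopwords("en")) |>
  dfm()
dtm_clean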

Basic Sentiment Analysis

Creating a dictionary

GI_dict = dictionary(DictionaryGI)
#the dictionary() function converts the dictionary from the SentimentAnalysis package into a format that quanteda can process

#applying the dictionary and converting df into a tibble
result <- dtm |>
  dfm_lookup(GI_dict) |>
  convert(to = "data.frame") |>
  as_tibble()

result
## # A tibble: 23,486 × 3
##    doc_id negative positive
##    <chr>     <dbl>    <dbl>
##  1 0             0        2
##  2 1             0        6
##  3 2             2        4
##  4 3             3        6
##  5 4             2        4
##  6 5             1        5
##  7 6             1        2
##  8 7             4        3
##  9 8             1        2
## 10 9             3        4
## # ℹ 23,476 more rows

Improving our text analysis

This adds a new variable to our data frame recording the total number of words in each document, which puts more context into the analysis. For example, document 1 contains 6 positive terms but is 70 words long, which may indicate that the text is not as positive as the raw count suggests.

result <- result |> 
  mutate(length = ntoken(dtm))

result
## # A tibble: 23,486 × 4
##    doc_id negative positive length
##    <chr>     <dbl>    <dbl>  <int>
##  1 0             0        2      8
##  2 1             0        6     70
##  3 2             2        4    111
##  4 3             3        6     30
##  5 4             2        4     41
##  6 5             1        5    107
##  7 6             1        2    121
##  8 7             4        3    117
##  9 8             1        2     37
## 10 9             3        4     88
## # ℹ 23,476 more rows

We can make this easier to interpret by computing a single sentiment score per document across the whole document-term matrix. Here are the options:

result <- result |> 
  mutate(sentiment1=(positive - negative) / (positive + negative), # ranges from -1 (purely negative) to 1 (purely positive); note this is NaN for documents with no dictionary hits
         sentiment2=(positive - negative) / length, # also ranges from -1 to 1, but a score of 1 means every token in the document is positive, and vice versa
         subjectivity=(positive + negative) / length) #the share of sentiment-bearing terms, a rough measure of subjectivity that can be a bit 'shaky'
result
## # A tibble: 23,486 × 7
##    doc_id negative positive length sentiment1 sentiment2 subjectivity
##    <chr>     <dbl>    <dbl>  <int>      <dbl>      <dbl>        <dbl>
##  1 0             0        2      8      1        0.25          0.25  
##  2 1             0        6     70      1        0.0857        0.0857
##  3 2             2        4    111      0.333    0.0180        0.0541
##  4 3             3        6     30      0.333    0.1           0.3   
##  5 4             2        4     41      0.333    0.0488        0.146 
##  6 5             1        5    107      0.667    0.0374        0.0561
##  7 6             1        2    121      0.333    0.00826       0.0248
##  8 7             4        3    117     -0.143   -0.00855       0.0598
##  9 8             1        2     37      0.333    0.0270        0.0811
## 10 9             3        4     88      0.143    0.0114        0.0795
## # ℹ 23,476 more rows
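
As a quick sanity check, we can compare these scores against the star ratings stored in the dtm's docvars. A minimal sketch, assuming the csv's rating column is named rating; run names(docvars(dtm)) first to confirm the actual name:

#average sentiment per star rating ('rating' is an assumed docvar name)
result |>
  mutate(rating = docvars(dtm)$rating) |>
  group_by(rating) |>
  summarise(mean_sentiment = mean(sentiment2, na.rm = TRUE), n = n())

If the dictionary is doing its job, mean_sentiment should increase with the rating.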

Improving our Dictionary

Sometimes the dictionary misclassifies a word as positive or negative; an example can be seen below:

freqs <- textstat_frequency(dtm)
freqs |> 
  as_tibble() |> 
  filter(feature %in% GI_dict$positive)
## # A tibble: 682 × 5
##    feature     frequency  rank docfreq group
##    <chr>           <dbl> <dbl>   <dbl> <chr>
##  1 love             8940    24    7415 all  
##  2 fit              7280    29    6170 all  
##  3 like             7007    32    5732 all  
##  4 great            6094    37    5180 all  
##  5 just             5598    43    4720 all  
##  6 perfect          3747    60    3361 all  
##  7 well             3231    68    2922 all  
##  8 back             3211    69    2841 all  
##  9 comfortable      3046    74    2941 all  
## 10 cute             3028    75    2792 all  
## # ℹ 672 more rows

Let's test whether words are correctly classified as negative or positive. "Just" can be used as a negative or a positive word. Using keyword-in-context (KWIC), we can check whether it is actually used only in a positive light.

head(kwic(tokens(corp), "just", window = 4))
## Keyword-in-context with 6 matches.                                                              
##   [1, 51]       length on me- hits | just | a little below the
##   [2, 59]        medium, which was | just | ok. overall,      
##   [5, 19]             petite. i am | just | under 5 feet tall 
##   [5, 96]         the style but it | just | did not work on   
##  [10, 37]     very cheap that even | just | pulling on it will
##  [12, 91] very form-fitting. falls | just | above the knee and

As can be seen, "just" can be negative, positive, or even neutral. For example, the second match uses it in an almost positive light ("just ok"), the third uses it to describe the reviewer's height, so counting it as positive there would be inaccurate, and the fourth and fifth use it in a negative manner. Given this, we can remove the term "just" from the list of positive words.

positive.cleaned <- setdiff(GI_dict$positive, c("just"))
GI_dict2 <- dictionary(list(positive = positive.cleaned, negative = GI_dict$negative))

#double check
freqs |> 
  filter(feature %in% GI_dict2$positive) |>
  as_tibble()
## # A tibble: 681 × 5
##    feature     frequency  rank docfreq group
##    <chr>           <dbl> <dbl>   <dbl> <chr>
##  1 love             8940    24    7415 all  
##  2 fit              7280    29    6170 all  
##  3 like             7007    32    5732 all  
##  4 great            6094    37    5180 all  
##  5 perfect          3747    60    3361 all  
##  6 well             3231    68    2922 all  
##  7 back             3211    69    2841 all  
##  8 comfortable      3046    74    2941 all  
##  9 cute             3028    75    2792 all  
## 10 nice             3016    76    2682 all  
## # ℹ 671 more rows
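
"Just" is gone from the positive list. To make the change count, we re-run the lookup with the cleaned dictionary and recompute the scores, mirroring the pipeline from earlier:

#re-apply the cleaned dictionary and recompute the per-document scores
result2 <- dtm |>
  dfm_lookup(GI_dict2) |>
  convert(to = "data.frame") |>
  as_tibble() |>
  mutate(length = ntoken(dtm),
         sentiment2 = (positive - negative) / length)
result2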

Data Visualisation: Exploring Corpustools

The corpustools package helps visualise the full text with annotations and highlights, illustrating which words are positive and which are negative.

library(corpustools)
#creating corpus
t = create_tcorpus(x, doc_column = "no", text_column = "review_text")
#Applying dictionary
t$code_dictionary(GI_dict2, column = 'lsd15') #annotate the tokens with our cleaned GI dictionary
t$set('sentiment', 1, subset = lsd15 %in% c('positive','neg_negative')) #positive hits get the value 1
t$set('sentiment', -1, subset = lsd15 %in% c('negative','neg_positive')) #negative hits get the value -1

#rather than building a dtm, corpustools tokenises the data into a token list
t$tokens
##          doc_id token_id      token lsd15_id    lsd15 sentiment
##       1:      0        1 Absolutely       NA     <NA>        NA
##       2:      0        2  wonderful        1 positive         1
##       3:      0        3          -       NA     <NA>        NA
##       4:      0        4      silky       NA     <NA>        NA
##       5:      0        5        and       NA     <NA>        NA
##      ---                                                       
## 1567274:  23485       19        too   152139 negative        -1
## 1567275:  23485       20          !       NA     <NA>        NA
## 1567276:  23485       21     highly       NA     <NA>        NA
## 1567277:  23485       22  recommend       NA     <NA>        NA
## 1567278:  23485       23          !       NA     <NA>        NA

With this tokenised list we can create a full-text browser that highlights positive and negative words:

browse_texts(t, scale = 'sentiment')
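
Beyond browsing, the token-level sentiment column can be rolled up into one score per document. A minimal sketch using dplyr on t$tokens (a data.table, coerced to a tibble first):

#aggregate the token-level scores into a net sentiment per document
doc_sent <- t$tokens |>
  as_tibble() |>
  group_by(doc_id) |>
  summarise(net_sentiment = sum(sentiment, na.rm = TRUE),
            n_tokens = n())
head(doc_sent)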