Sentiment Analysis: Llama 3.1 (8B parameters) vs. Traditional Sentiment Model

Applying LLM Capabilities to Sentiment Evaluation

Libraries

# Keep messages and warnings out of the rendered chunks.
# The spelled-out FALSE is used because the shorthand `F` is an ordinary
# variable that can be reassigned.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE
)

pacman::p_load(
  "tidyverse",
  "caret"
)

source("main.R")

Test dataset

# Load the test set, capitalise the two columns used throughout the analysis,
# and drop rows that have no reference sentiment.
dfSocial <- read_csv("data/test.csv") |>
  rename(Sentiment = sentiment, Text = text) |>
  filter(!is.na(Sentiment))

# Class balance of the reference labels.
table(dfSocial$Sentiment)


negative  neutral positive 
    1001     1430     1103

Llama 3.1 (8B) Sentiment Evaluation (10 iterations)

# Classifies the test set with Llama 3.1 ten times and writes the results of
# every run to iteration<i>/ (NotClassified.csv, Accuracy.csv,
# ConfusionMatrix.csv, IDs.csv).
# The block is wrapped in `if (FALSE)`, so it is skipped when the document is
# rendered; the evaluation sections below read the CSV files from disk
# (presumably written by an earlier manual run -- TODO confirm).
if (FALSE) {
  for (i in 1:10) {
    ### Process the response with llama3.1
    # parallelSentiment() is not defined in this file (presumably it comes
    # from main.R -- TODO confirm).
    llamaSentiment <- parallelSentiment(dfSocial$Text)


    ### Unify the text format
    # Lower-case once instead of once per pattern. The first matching keyword
    # wins; a reply containing none of the keywords is recorded as "na".
    llamaLower <- tolower(llamaSentiment)
    llamaSentimentClean <- case_when(
      grepl(llamaLower, pattern = "positive") ~ "positive",
      grepl(llamaLower, pattern = "negative") ~ "negative",
      grepl(llamaLower, pattern = "other") ~ "neutral",
      TRUE ~ "na"
    )
    #  llamaSentimentClean |> table()
    dfSocial$llama <- llamaSentimentClean


    ### Save cases/errors (not classified by llama)
    # showWarnings = FALSE: re-running an iteration whose directory already
    # exists should not raise a warning.
    dir.create(glue("iteration{i}"), showWarnings = FALSE)
    dfSocial[llamaSentimentClean == "na", ] |>
      write_csv2(glue("iteration{i}/NotClassified.csv"))


    ### Cross-tabulate reference sentiment (rows) against llama label (columns)
    # Built once and reused for both the accuracy table and the confusion
    # matrix. `values_fn = length` counts the rows in each cell and
    # `values_fill = 0L` gives an explicit zero to combinations that never occur.
    dfCounts <- dfSocial |>
      select(llama, Sentiment) |>
      mutate(n = 1) |>
      pivot_wider(
        names_from = "llama",
        values_from = "n",
        values_fn = length,
        values_fill = 0L
      )


    ### Save accuracy by sentiment
    # reframe() is used because the expression yields one row per llama label
    # within each Sentiment group; only the row where the label equals the
    # reference has a non-zero accuracy and survives the filter. A class whose
    # accuracy rounds to 0 is therefore dropped from the file as well.
    dfCounts |>
      pivot_longer(values_to = "n", names_to = "llama", cols = -1) |>
      group_by(Sentiment) |>
      reframe(
        count = sum(n),
        accuracy = round(if_else(llama == Sentiment, n, 0L) / sum(n), 2)
      ) |>
      filter(accuracy != 0) |>
      write_csv2(glue("iteration{i}/Accuracy.csv"))

    ### Save confusion matrix
    # The "na" column (unclassified replies) is intentionally left out.
    dfCounts |>
      select(Sentiment, neutral, positive, negative) |>
      write_csv2(glue("iteration{i}/ConfusionMatrix.csv"))

    ### Save the per-tweet labels of this iteration
    dfSocial |>
      select(textID, llama, Sentiment) |>
      write_csv2(glue("iteration{i}/IDs.csv"))
  }
}

Results Evaluation

Iteration-wise comparison

# Stack the ten per-iteration accuracy tables into one data frame; the
# `iteration` column identifies the run each row was read from.
dfAccuracy <- map_df(
  lapply(1:10, \(i) read_csv2(glue("iteration{i}/Accuracy.csv"))),
  data.frame,
  .id = "iteration"
)

# Deviation of each class's accuracy from that class's mean accuracy over all
# iterations (in percentage points), drawn as one bar per class in one panel
# per iteration.
asSVG((
 dfAccuracy |>
  # The row index only serves to give every bar its own x position.
  mutate(jitter = row_number()) |>
  group_by(Sentiment) |>
  mutate(accuracy = (accuracy - mean(accuracy)) * 100) |>
  ggplot(aes(
    color = Sentiment,
    y = accuracy,
    x = jitter,
    group = Sentiment
  )) +
  # `linewidth` is the line-thickness argument since ggplot2 3.4.0; `size` is
  # deprecated for line geoms.
  geom_segment(aes(
    xend = jitter,
    yend = 0
  ), linewidth = 15) +
  facet_wrap(~factor(iteration, levels = 1:10, ordered = TRUE), scales = "free_x") +
  scale_x_continuous(expand = c(.2, .2)) +
  theme(
    axis.text.x = element_blank(),
    text = element_text(family = "Segoe UI", size = 14)
  ) +
 scale_color_manual(values = c("#272635", "#7f7f7f", "#5662f6")) +
  labs(
    x = "",
    y = "accuracy deviation (in percent points)",
    title = str_wrap("
                     The fluctuations in the accuracy of data classification between iterations are small (not bigger than 1.1% point).
                     No clear pattern is visible, it might indicate randomization as a key factor of fluctiations.
                     ")
  ) 
))

High variability of classification between model iterations indicates uncertainty in the result of any single sentence evaluation. This property of LLMs may mean that each sentence needs to be evaluated multiple times, so that the final result is standardized on the basis of several observations.

# Stack the ten per-iteration confusion matrices into one data frame; the
# `iteration` column identifies the run each row was read from.
dfConfusion <- map_df(
  lapply(1:10, \(i) read_csv2(glue("iteration{i}/ConfusionMatrix.csv"))),
  data.frame,
  .id = "iteration"
)

# Heat map of the standard deviation, across iterations, of every cell of the
# confusion matrix (reference sentiment on x, llama label on y).
asSVG((
  dfConfusion |>
    group_by(Sentiment) |>
    summarise(
      across(c(neutral, positive, negative), sd, .names = "sd_{.col}")
    ) |>
    pivot_longer(cols = -Sentiment, names_to = "Llama", values_to = "sd") |>
    mutate(sd = round(sd, 1)) |>
    ggplot(aes(x = Sentiment, y = Llama)) +
    geom_tile(aes(fill = sd)) +
    # The rounded value is printed on top of its tile.
    geom_text(
      aes(label = sd),
      color = "white", size = 8, fontface = "bold"
    ) +
    scale_fill_gradient2(
      low = "#fec286", mid = "#b83779", high = "#010210",
      midpoint = 6
    ) +
    theme_minimal() +
    labs(
      title = str_wrap("
                       A large standard deviation in the case of a sentiment classified by LLama as neutral in the data
                       set may indicate a certain type of model searching for a so-called second bottom.
                       The smallest deviation between models occurs in the case of positive opinions classified
                       as negative - it occured relatively hardly ever.
                       ")
    )
), width = 8, height = 6)

After averaging the classification results of both models and applying a cutoff function based on the hyperbolic tangent, it is observed that the results deviate significantly more from the outcomes of individual iterations. This indicates a high degree of variability (instability) in the model. Additionally, an extremely small number of cases were assigned to the neutral class.

### Good case
#dfSocial |>
#  filter(textID == "0e8aa10a4e") |> pull("Text") |> cat()

# Stack the per-tweet labels of all ten runs in long form; the `iteration`
# column identifies the run each row was read from.
dfFlow <- map_df(
  lapply(1:10, \(i) read_csv2(glue("iteration{i}/IDs.csv"))),
  data.frame,
  .id = "iteration"
)

  # One row per tweet: spread the ten iteration labels into columns `1`..`10`,
  # encode them as 1 / 0 / -1 (anything else becomes NA), and average them
  # into a consensus score.
  # The iteration columns are addressed by name rather than by position or by
  # type, so the result does not depend on the column order of dfFlow or on
  # how the remaining columns happen to be parsed.
  # The result is deliberately left rowwise, as in the original pipeline.
  dfFlowS <- dfFlow |>
    pivot_wider(names_from = "iteration", values_from = "llama") |>
    mutate(across(`1`:`10`, ~case_when(
      .x == "positive" ~ 1,
      .x == "neutral" ~ 0,
      .x == "negative" ~ -1,
      TRUE ~ NA_real_
    ))) |>
    rowwise() |>
    mutate(
      # Mean over the classified iterations only; unclassified ones are NA.
      avgResult = round(mean(c_across(`1`:`10`), na.rm = TRUE), 2),
      # Cut-offs at tanh(-1) and tanh(1). A tweet with no classified iteration
      # has a NaN mean and falls through to "neutral".
      avgSentiment = case_when(
        avgResult <= tanh(-1) ~ "negative",
        avgResult >= tanh(1) ~ "positive",
        TRUE ~ "neutral"
      )
    )

# Bar chart: tweets on which all iterations agreed on positive or negative
# versus tweets with any other average, split by whether the averaged score
# still passes the tanh(1) cut-off.
asSVG((
  dfFlowS |>
    mutate(
      # Computed first, while avgResult is still numeric.
      avgCertainty = if_else(
        abs(avgResult) >= tanh(1),
        "class assgined",
        "neutral"
      ),
      # An average of exactly -1 / 1 gets the class name; a missing average
      # stays missing; everything else is labelled as uncertain.
      avgResult = case_when(
        avgResult == -1 ~ "negative",
        avgResult == 1 ~ "positive",
        is.na(avgResult) ~ NA_character_,
        TRUE ~ "At least partial uncertanity"
      )
    ) |>
    group_by(avgResult, avgCertainty) |>
    summarise(n = n(), .groups = "drop_last") |>
    na.omit() |>
    ggplot(aes(x = n, y = factor(avgResult))) +
    geom_col(aes(
      fill = factor(avgCertainty, levels = c("neutral", "class assgined"))
    )) +
    geom_text(aes(label = n), color = "white", size = 5.5, hjust = 1) +
    scale_fill_manual(values = c("#cccccc", "#5662f6")) +
    theme_minimal() +
    labs(
      x = "# of observations",
      y = "",
      fill = "Classification",
      title = "The vast majority of cases (over 2000) were classified with complete confidence (agreement over 10 iterations). Of the uncertain cases, about 30% were assigned an extreme value (positive or negative)."
    )
), width = 8, height = 4)

# Density of the averaged score; the dotted lines mark the tanh(-1) and
# tanh(1) cut-offs used to assign avgSentiment.
# `linewidth` is the line-thickness argument since ggplot2 3.4.0; `size` is
# deprecated for line geoms.
asSVG((
  dfFlowS |>
    ggplot(aes(x = avgResult)) +
    geom_density(linewidth = 2, color = "#5662f6") +
    geom_vline(
      xintercept = tanh(c(-1, 1)),
      color = "#6a6a6a", linetype = "dotted", linewidth = 1
    ) +
    theme_minimal()
), width = 9, height = 5)

TEXT PLACE HOLDER

# For every averaged sentiment, count the tweets whose ten labels contain both
# a positive (1) and a negative (-1) vote ("extreme") versus the rest
# ("centric").
asSVG((
  dfFlowS |>
    rowwise() |>
    mutate(
      resultVariance = if_else(
        any(c_across(`1`:`10`) == 1) && any(c_across(`1`:`10`) == -1),
        "extreme",
        "centric"
      ),
      sentimentVariance = paste0(avgSentiment, " - ", resultVariance)
    ) |>
    group_by(sentimentVariance, avgSentiment, resultVariance) |>
    summarise(n = n(), .groups = "drop_last") |>
    # Rows whose resultVariance could not be determined (NA) are dropped here.
    na.omit() |>
    ggplot(aes(x = resultVariance, y = n)) +
    geom_col(aes(fill = resultVariance)) +
    facet_wrap(vars(avgSentiment), scales = "free_x") +
    scale_fill_manual(values = c("#272635", "#B1E5F2")) +
    theme_minimal() +
    theme(
      strip.background = element_rect(fill = "#eeeeee", color = NA),
      legend.position = "top"
    ) +
    labs(
      x = "",
      y = "# of cases",
      fill = "Varianace between iterations"
    )
), width = 9, height = 5)

Quantitative Comparison

The overall accuracy of the models is 57.75%, which suggests a moderate level of agreement between the two models. However, the Kappa statistic is 0.3782, indicating only a fair level of agreement, far from perfect. The Kappa score suggests that while there is some overlap in how both models classify sentiment, there is also a significant amount of disagreement. The McNemar’s test P-Value is less than 2.2e-16, which strongly indicates that the discrepancies between the models are statistically significant.

The analysis reveals that the two models show a reasonable level of agreement in classifying negative and positive sentiments, with the highest concordance for negative sentiments. However, there is substantial divergence when it comes to neutral sentiments, suggesting that this category is more challenging for the models to consistently agree upon. The data suggests that while the models can be somewhat reliable in distinguishing clear positive or negative sentiments, they are less effective with neutral sentiments, which could be a focus for further model refinement.

# Cross-tabulate the reference sentiment (rows) against the averaged Llama
# sentiment (columns) and hand the count matrix to caret.
# `values_fill = 0L` keeps the matrix free of NA when a combination of classes
# never occurs.
# NOTE(review): for a table/matrix input caret treats rows as predictions and
# columns as the reference, so the per-class statistics below use the averaged
# Llama label as the reference (the printed prevalences equal the column
# shares). Accuracy and Kappa do not depend on the orientation. Confirm this
# is the intended reading.
dfFlowS |>
  group_by(Sentiment, avgSentiment) |>
  summarise(n = n(), .groups = "drop") |>
  pivot_wider(
    names_from = "avgSentiment",
    values_from = "n",
    values_fill = 0L
  ) |>
  column_to_rownames(var = "Sentiment") |>
  as.matrix() |>
  confusionMatrix()

Confusion Matrix and Statistics

         negative neutral positive
negative      721     163      117
neutral       409     400      621
positive       55     128      920

Overall Statistics
                                         
               Accuracy : 0.5775         
                 95% CI : (0.561, 0.5939)
    No Information Rate : 0.4692         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.3782         
                                         
 Mcnemar's Test P-Value : < 2.2e-16      

Statistics by Class:

                     Class: negative Class: neutral Class: positive
Sensitivity                   0.6084         0.5789          0.5549
Specificity                   0.8808         0.6377          0.9025
Pos Pred Value                0.7203         0.2797          0.8341
Neg Pred Value                0.8168         0.8617          0.6964
Prevalence                    0.3353         0.1955          0.4692
Detection Rate                0.2040         0.1132          0.2603
Detection Prevalence          0.2832         0.4046          0.3121
Balanced Accuracy             0.7446         0.6083          0.7287

Irony often expresses a negative sentiment using seemingly positive words, or vice versa. The analysis has shown that neutral sentiment has the lowest effectiveness in the models (with low precision and recall). Irony can be classified as neutral when models fail to recognize its true tone, indicating that irony in such contexts is more challenging to detect. Llama 3.1, trained on large datasets that may include contexts where irony occurs, has the potential to better recognize subtle signals of irony, such as tone, sentence structure, or the use of specific words. However, there is a risk of overinterpretation: despite the potential benefits, LLMs may be prone to overinterpreting data. This could lead to the erroneous attribution of irony where none exists, as these models try to find subtle meanings or intentions even if the text is straightforward and literal. With LLMs, there is a risk that they may attribute irony where it is absent, especially when they attempt to overinterpret simple statements as complex ones. This phenomenon may stem from the fact that LLMs are highly sensitive to context and may identify nuances that aren’t actually present.

Qualitative Comparison

# Draw ten tweets on which the reference sentiment and the averaged Llama
# sentiment disagree, and show their text next to both results.
# The seed is fixed so that the draw is repeatable.
set.seed(5)
dfFlowS |>
  filter(Sentiment != avgSentiment) |>
  # dfSocial supplies the tweet text; both tables carry a Sentiment column,
  # hence the `.x` suffix below.
  inner_join(dfSocial, join_by(textID)) |>
  # NOTE(review): the printed result is rowwise, so this condition (including
  # the sample() call) may be evaluated once per row rather than once overall
  # -- confirm that this is intended.
  filter(
    textID %in% sample(
      pull(filter(dfFlowS, Sentiment != avgSentiment), "textID"),
      10
    )
  ) |>
  select(Text, StdModel = Sentiment.x, avgResult)

# A tibble: 10 × 3
# Rowwise: 
   Text                                                       StdModel avgResult
   <chr>                                                      <chr>        <dbl>
 1 haaaw..well i get out of class at 10:50..i hope i make it  positive      -0.2
 2 ****  lol....where u headed to?                            neutral        1  
 3 Missed all your tweets again Joe, **** that time differen… neutral       -1  
 4 http://twitpic.com/4ja8r - Tell me, how can you not love … neutral        1  
 5 Good morning. haha I just read that the swine flu probabl… neutral        1  
 6 Planing on going on a little ride on the mtb on monday  m… neutral        1  
 7 Kyle is Cody`s wee bro!                                    neutral        1  
 8 Not found: the thing I was looking for                     neutral       -1  
 9 Ha! I think I really have lost it. Time to get ready for … negative      -0.1
10 look up our youtube JessAndNicoleMusic, we are new, leave… neutral        1

# Passing NULL re-initialises the random number generator as if no seed had
# been set, so code after this point is no longer tied to the fixed seed used
# for the sample above.
set.seed(NULL)

Manual assessment of random cases

haaaw..well i get out of class at 10:50..i hope i make it

“haaaw..”: This expression seems to convey a sense of weariness, frustration, or exasperation. It suggests that the speaker is feeling a bit overwhelmed or tired.
“well i get out of class at 10:50..”: This part is neutral, simply stating a fact about when the speaker will finish class. There is no strong emotion attached to this statement.
“i hope i make it”: This phrase expresses concern or uncertainty about whether the speaker will achieve something on time. The use of “hope” indicates a desire for a positive outcome, but also suggests some doubt or worry.

Overall, the sentence carries a slightly anxious tone due to the concern about making it on time, combined with the weariness indicated at the beginning. However, there’s also a sense of determination to try and succeed.

Standard Model	Llama 3.1	Subjective
positive	neutral (-.2)	neutral

—- lol….where u headed to?

“—-”: While the exact word is censored, it’s often used to emphasize something casually or humorously, and without the context, it’s hard to determine the exact tone. However, in combination with “lol,” it suggests a light-hearted or teasing vibe rather than something negative.
“lol”: This abbreviation stands for “laugh out loud,” indicating that the speaker finds something amusing or is in a good mood.
“where u headed to?”: This is a neutral, casual question asking where the other person is going. It carries no strong emotional weight on its own.

Overall, the sentence has a playful and friendly sentiment, with “lol” and the casual phrasing contributing to an informal, light-hearted tone.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

Missed all your tweets again Joe, —- that time difference!!! (UK) Dont 4get about your fave supporters in the UK!!! —-

“Missed all your tweets again Joe, —- that time difference!!! (UK)”: This part expresses frustration about the time difference, which is preventing the speaker from seeing Joe’s tweets. The frustration is intensified by the use of strong language (represented by “—-”).

“Dont 4get about your fave supporters in the UK!!!”: This shifts to a more positive and playful tone, reminding Joe not to forget his UK supporters. The use of “fave” (short for favorite) adds a friendly and supportive sentiment.

“—-”: This repetition of the strong language reinforces the frustration, but given the context, it seems more like a playful expression of annoyance rather than serious anger.

Overall, the sentence reflects a mix of frustration due to the time difference, but it’s tempered with a supportive and playful tone, emphasizing the speaker’s dedication as a fan despite the inconvenience.

Standard Model	Llama 3.1	Subjective
neutral	negative (-1)	positive

http://twitpic.com/4ja8r - Tell me, how can you not love someone like this?

“Tell me, how can you not love someone like this?”: This rhetorical question strongly implies that the speaker finds the person in the image lovable or endearing. The tone is admiring and affectionate, suggesting a deep appreciation or fondness for the person in the picture.
Link to an image: Although we can’t see the image itself, the context suggests that it depicts someone the speaker cares about, further enhancing the positive sentiment.

Overall, the sentence expresses a strong positive emotion, likely admiration, love, or deep affection toward the person in the image.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

Good morning. haha I just read that the swine flu probably not come`s from pigs at all, but from people! Poor pigs who got all the blame!

“Good morning. haha”: The greeting and the “haha” set a cheerful and casual tone, suggesting the speaker is in a good mood and finds the information amusing.
“I just read that the swine flu probably not come`s from pigs at all, but from people!”: This part conveys surprise or interest in new information. The exclamation mark adds a sense of intrigue or mild shock.
“Poor pigs who got all the blame!”: This expresses sympathy for the pigs, humorously pointing out the irony that they were wrongly blamed. The tone here is empathetic, but still playful.

Overall, the sentence combines light-hearted amusement with a touch of sympathy, creating a generally positive and upbeat sentiment.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

Planing on going on a little ride on the mtb on monday may be in over to the tower…

“Planning on going on a little ride on the mtb on monday”: This part conveys anticipation and a sense of casual excitement about a planned activity. “MTB” likely refers to a mountain bike, suggesting an outdoor adventure.
“may be in over to the tower…”: This adds to the positive tone, indicating that the speaker has a specific destination in mind, which likely holds some significance or interest.

Overall, the sentence expresses a laid-back and positive sentiment, reflecting the speaker’s enjoyment of outdoor activities and the anticipation of a pleasant bike ride.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

Kyle is Cody’s wee bro!

“Kyle is Cody’s wee bro!”: The use of “wee” (a term often used in Scottish or Irish dialects meaning “small” or “little”) adds a warm, affectionate tone to the sentence. It indicates that Kyle is Cody’s younger brother, and the phrasing suggests a close, caring relationship.

Overall, the sentence conveys a positive, affectionate sentiment, reflecting a fondness or endearment towards Kyle as Cody’s younger sibling.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

Not found: the thing I was looking for

“Not found”: This phrase indicates that something was missing or not located, which typically implies a sense of loss or failure.
“the thing I was looking for”: This adds to the sense of disappointment, as the speaker was searching for something specific but did not succeed in finding it.

Overall, the sentence conveys a tone of mild disappointment or frustration due to the inability to find what was being sought.

Standard Model	Llama 3.1	Subjective
neutral	negative (-1)	negative

Ha! I think I really have lost it. Time to get ready for work.

“Ha!”: This interjection suggests a moment of realization or self-reflection. It could indicate amusement, irony, or even resignation.
“I think I really have lost it.”: This phrase suggests some level of frustration, confusion, or self-deprecation. The speaker might be acknowledging that they are feeling overwhelmed, disorganized, or out of sorts, which introduces a negative sentiment.
“Time to get ready for work.”: This part is more neutral, indicating a routine action that needs to be taken. It suggests a sense of acceptance or determination to move forward despite the earlier statement.

Overall, the sentence reflects a moment of self-reflection where the speaker recognizes feeling a bit off but is still prepared to move on with their day.

Standard Model	Llama 3.1	Subjective
negative	neutral (-.1)	neutral

look up our youtube JessAndNicoleMusic, we are new, leave comments and subscribe thankyou x

“look up our YouTube JessAndNicoleMusic”: This is an enthusiastic invitation to check out their new YouTube channel. It shows eagerness to share their content with others.
“we are new”: This provides context that the channel is just starting out, which can elicit a sense of support and encouragement from potential viewers.
“leave comments and subscribe”: This is a polite and friendly call to action, asking viewers to engage with the content and support the channel.
“thank you x”: The use of “thank you” combined with “x” (which often represents a kiss or affectionate gesture) adds a warm, appreciative tone, making the request feel personal and sincere.

Overall, the sentence is aimed at generating interest and support for the new YouTube channel, conveyed in a positive and engaging manner.

Standard Model	Llama 3.1	Subjective
neutral	positive (1)	positive

# Heat map of the ten manually assessed cases: one column per case, one row
# per rater (standard model, subjective reading, Llama 3.1), coloured by the
# sentiment each assigned (-1 negative, 0 neutral, 1 positive).
tibble(
  id = 1:10,
  `Standard Model` = c(1, 0, 0, 0, 0, 0, 0, 0, -1, 0),
  `Llama 3.1` = c(0, 1, -1, 1, 1, 1, 1, -1, 0, 1),
  Subjective = c(0, 1, 1, 1, 1, 1, 1, -1, 0, 1)
) |>
  pivot_longer(cols = -id, names_to = "model", values_to = "value") |>
  ggplot(aes(
    x = factor(id),
    # Fixed row order, with the subjective reading between the two models.
    y = factor(model, levels = c("Standard Model", "Subjective", "Llama 3.1")),
    fill = factor(value)
  )) +
  geom_tile(color = "white", linewidth = 2) +
  scale_fill_manual(values = c(
    "-1" = "#272635",
    "1" = "#5662f6",
    "0" = "#cccccc"
  )) +
  theme_minimal() +
  labs(x = "", y = "", fill = "Sentiment")