# Text mining

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

# Toy corpus: five short documents about plastic pollution, one per row.
# `id` is an integer document key; `text` holds the raw, untokenised string.
# NOTE: the misspelling "enviornmentally" in document 5 is kept on purpose so
# the results shown further down stay in sync with the data.
text_data <- tribble(
  ~id, ~text,
  1L, "Global Plastic Pollution.",
  2L, "Plastic Footprint.",
  3L, "Big companies  CocaColaCompany, Pepsi, Nestle polluting environment.",
  4L, "Waste",
  5L, "microplastics, sustainable packaging, enviornmentally friendly, sustainability, reuse, reduce, recycle."
)

# Show the corpus
print(text_data)

## # A tibble: 5 × 2
##      id text                                                                    
##   <int> <chr>                                                                   
## 1     1 Global Plastic Pollution.                                               
## 2     2 Plastic Footprint.                                                      
## 3     3 Big companies  CocaColaCompany, Pepsi, Nestle polluting environment.    
## 4     4 Waste                                                                   
## 5     5 microplastics, sustainable packaging, enviornmentally friendly, sustain…

# Break every document into single-word tokens: the `text` column is replaced
# by a `word` column with one row per token, and `id` is carried along so each
# word can still be traced back to its document.
tokens <- unnest_tokens(text_data, output = word, input = text)

# Show the token table
print(tokens)

## # A tibble: 22 × 2
##       id word           
##    <int> <chr>          
##  1     1 global         
##  2     1 plastic        
##  3     1 pollution      
##  4     2 plastic        
##  5     2 footprint      
##  6     3 big            
##  7     3 companies      
##  8     3 cocacolacompany
##  9     3 pepsi          
## 10     3 nestle         
## # ℹ 12 more rows

# Drop stop words: load the `stop_words` dataset into the workspace, then keep
# only the tokens whose `word` has no match in it.
data("stop_words")
clean_tokens <- anti_join(tokens, stop_words, by = join_by(word))

# Show the tokens that survived the filter
print(clean_tokens)

## # A tibble: 21 × 2
##       id word           
##    <int> <chr>          
##  1     1 global         
##  2     1 plastic        
##  3     1 pollution      
##  4     2 plastic        
##  5     2 footprint      
##  6     3 companies      
##  7     3 cocacolacompany
##  8     3 pepsi          
##  9     3 nestle         
## 10     3 polluting      
## # ℹ 11 more rows

# Tally how often each remaining word occurs across the whole corpus;
# the result has one row per distinct word, most frequent first, with the
# tally stored in column `n`.
word_freq <- count(clean_tokens, word, sort = TRUE)

# Show the frequency table
print(word_freq)

## # A tibble: 20 × 2
##    word                n
##    <chr>           <int>
##  1 plastic             2
##  2 cocacolacompany     1
##  3 companies           1
##  4 enviornmentally     1
##  5 environment         1
##  6 footprint           1
##  7 friendly            1
##  8 global              1
##  9 microplastics       1
## 10 nestle              1
## 11 packaging           1
## 12 pepsi               1
## 13 polluting           1
## 14 pollution           1
## 15 recycle             1
## 16 reduce              1
## 17 reuse               1
## 18 sustainability      1
## 19 sustainable         1
## 20 waste               1

# Fetch the "bing" lexicon, which labels words as positive or negative
sentiment_lexicon <- get_sentiments(lexicon = "bing")

# Score the corpus: keep only tokens that appear in the lexicon, then count
# each (word, sentiment) pair, most frequent first. Words missing from the
# lexicon are dropped by the inner join.
sentiment_analysis <- clean_tokens %>%
  inner_join(sentiment_lexicon, by = join_by(word)) %>%
  count(word, sentiment, sort = TRUE)

# Show the sentiment table
print(sentiment_analysis)

## # A tibble: 4 × 3
##   word           sentiment     n
##   <chr>          <chr>     <int>
## 1 friendly       positive      1
## 2 sustainability positive      1
## 3 sustainable    positive      1
## 4 waste          negative      1

# Load wordcloud library
library(wordcloud)

## Loading required package: RColorBrewer

# Draw a word cloud of the cleaned vocabulary, sized by frequency.
# wordcloud() positions words randomly (and random.order = TRUE also plots
# them in random order), so the RNG is seeded first to make the figure
# reproducible from run to run.
set.seed(1234)
wordcloud(
  words = word_freq$word,
  freq = word_freq$n,
  min.freq = 1,                    # keep every word, including singletons
  random.order = TRUE,
  colors = brewer.pal(5, "Dark2")  # 5-colour qualitative palette
)

# Text mining

# 2025-02-25