# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
# Sample corpus: five short text snippets, each tagged with an integer id.
# NOTE(review): "enviornmentally" in the last snippet is misspelled; it is
# kept verbatim here so downstream results do not change -- confirm whether
# the typo is intentional.
text_data <- tibble(
  id = seq_len(5L),
  text = c(
    "Global Plastic Pollution.",
    "Plastic Footprint.",
    "Big companies CocaColaCompany, Pepsi, Nestle polluting environment.",
    "Waste",
    "microplastics, sustainable packaging, enviornmentally friendly, sustainability, reuse, reduce, recycle."
  )
)

# Show the raw input table
print(text_data)
## # A tibble: 5 × 2
## id text
## <int> <chr>
## 1 1 Global Plastic Pollution.
## 2 2 Plastic Footprint.
## 3 3 Big companies CocaColaCompany, Pepsi, Nestle polluting environment.
## 4 4 Waste
## 5 5 microplastics, sustainable packaging, enviornmentally friendly, sustain…
# Break the `text` column into one token per row, stored in a new `word`
# column; the `id` of the originating snippet is carried along.
tokens <- unnest_tokens(text_data, output = word, input = text)

# Show the tokenized table
print(tokens)
## # A tibble: 22 × 2
## id word
## <int> <chr>
## 1 1 global
## 2 1 plastic
## 3 1 pollution
## 4 2 plastic
## 5 2 footprint
## 6 3 big
## 7 3 companies
## 8 3 cocacolacompany
## 9 3 pepsi
## 10 3 nestle
## # ℹ 12 more rows
# Load the stop_words dataset, then keep only the tokens whose `word` has no
# match in it.
data("stop_words")
clean_tokens <- anti_join(tokens, stop_words, by = "word")

# Show the tokens that survived stop-word removal
print(clean_tokens)
## # A tibble: 21 × 2
## id word
## <int> <chr>
## 1 1 global
## 2 1 plastic
## 3 1 pollution
## 4 2 plastic
## 5 2 footprint
## 6 3 companies
## 7 3 cocacolacompany
## 8 3 pepsi
## 9 3 nestle
## 10 3 polluting
## # ℹ 11 more rows
# Tally how often each remaining word occurs, most frequent first
# (column `n` holds the count).
word_freq <- count(clean_tokens, word, sort = TRUE)

# Show the frequency table
print(word_freq)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 plastic 2
## 2 cocacolacompany 1
## 3 companies 1
## 4 enviornmentally 1
## 5 environment 1
## 6 footprint 1
## 7 friendly 1
## 8 global 1
## 9 microplastics 1
## 10 nestle 1
## 11 packaging 1
## 12 pepsi 1
## 13 polluting 1
## 14 pollution 1
## 15 recycle 1
## 16 reduce 1
## 17 reuse 1
## 18 sustainability 1
## 19 sustainable 1
## 20 waste 1
# Fetch the "bing" sentiment lexicon
sentiment_lexicon <- get_sentiments("bing")

# Keep only the tokens that appear in the lexicon, then count each
# (word, sentiment) pair, most frequent first.
sentiment_analysis <- count(
  inner_join(clean_tokens, sentiment_lexicon, by = "word"),
  word, sentiment,
  sort = TRUE
)

# Show the per-word sentiment counts
print(sentiment_analysis)
## # A tibble: 4 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 friendly positive 1
## 2 sustainability positive 1
## 3 sustainable positive 1
## 4 waste negative 1
# Load wordcloud library
library(wordcloud)
## Loading required package: RColorBrewer

# Draw a word cloud of the cleaned word frequencies.
# The layout is randomised (random.order = TRUE plus random placement), so
# fix the RNG seed first; otherwise every run produces a different figure.
set.seed(1234)
wordcloud(
  words = word_freq$word,
  freq = word_freq$n,
  min.freq = 1, # include every word, even those seen only once
  random.order = TRUE,
  colors = brewer.pal(5, "Dark2")
)
