# loading Libraries
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.3.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
## Warning: package 'tibble' was built under R version 4.3.3
library(textstem)
## Warning: package 'textstem' was built under R version 4.3.3
## Loading required package: koRpus.lang.en
## Warning: package 'koRpus.lang.en' was built under R version 4.3.3
## Loading required package: koRpus
## Warning: package 'koRpus' was built under R version 4.3.3
## Loading required package: sylly
## Warning: package 'sylly' was built under R version 4.3.3
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:tm':
##
## readTagged
library(textdata)
## Warning: package 'textdata' was built under R version 4.3.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(RColorBrewer)
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 4.3.3
# Import the review data ----
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# only runs where the file exists at exactly this location.
reviews <- "G:\\2025 DS FOLDER\\Redmi SA - DATA\\redmi6\\redmi6.csv"
# as_tibble() replaces as.tibble(), which was deprecated in tibble 2.0.0.
df <- as_tibble(read.csv(reviews, stringsAsFactors = FALSE))
# Summary of the data by column
summary(df)
## Review.Title Customer.name Rating Date
## Length:280 Length:280 Min. :1.000 Length:280
## Class :character Class :character 1st Qu.:3.000 Class :character
## Mode :character Mode :character Median :5.000 Mode :character
## Mean :3.918
## 3rd Qu.:5.000
## Max. :5.000
## Category Comments Useful
## Length:280 Length:280 Length:280
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
#
# Create the output directory for plots when it is missing.
# The existence check and the creation now refer to the same directory.
# Previously "plots" was checked while "plots_2" was created, so the guard
# never reflected what had been made and re-runs warned that "plots_2"
# already exists. "plots_2" is kept because it is the directory the script
# actually produced.
plots_dir <- "plots_2"
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}
#
# Selecting data ----
# Keep the first seven columns.
df <- df[, 1:7]
# Remove rows containing missing values.
df <- na.omit(df)
# Add an identifier column so tokens can be traced back to their review.
# seq_len() stays correct when no rows remain (1:0 would yield c(1, 0)).
df$review_no <- seq_len(nrow(df))

# Sampling data ----
# Keep every review that belongs to one of (up to) five randomly selected
# categories. The seed makes the selection reproducible.
set.seed(280)
all_categories <- unique(df$Category)
# Cap the sample size so sample() does not fail when fewer than five
# categories are present; with five or more the draw is unchanged.
sample_index <- sample(length(all_categories), min(5, length(all_categories)))
sampled_Category <- all_categories[sample_index]
# Retain only rows whose Category is among the sampled ones.
df <- df %>%
  filter(Category %in% sampled_Category)
print(summary(df))
## Review.Title Customer.name Rating Date
## Length:280 Length:280 Min. :1.000 Length:280
## Class :character Class :character 1st Qu.:3.000 Class :character
## Mode :character Mode :character Median :5.000 Mode :character
## Mean :3.918
## 3rd Qu.:5.000
## Max. :5.000
## Category Comments Useful review_no
## Length:280 Length:280 Length:280 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.: 70.75
## Mode :character Mode :character Mode :character Median :140.50
## Mean :140.50
## 3rd Qu.:210.25
## Max. :280.00
# Quick look at the filtered data
head(df)
view(df)

# Tokenisation ----
# One row per lower-cased word of each comment.
word_tokenized_data <- unnest_tokens(
  df,
  output = word,
  input = Comments,
  token = "words",
  to_lower = TRUE
)

# One row per pair of adjacent lower-cased words (bigrams).
bigram_tokenized_data <- unnest_tokens(
  df,
  output = bigram,
  input = Comments,
  token = "ngrams",
  n = 2,
  to_lower = TRUE
)
# Word frequencies (stop words still included) ----
# Tally every token, most frequent first.
word_counts <- count(word_tokenized_data, word, sort = TRUE)

# Bar chart of the first 12 rows of the frequency table. The bars are ordered
# by count and the coordinates are flipped so the words read horizontally.
ggplot(
  data = word_counts[1:12, ],
  mapping = aes(x = reorder(word, n), y = n)
) +
  geom_col(fill = "green") +
  coord_flip() +
  labs(x = "words", y = "Frequency") +
  theme_classic()

# Word cloud of the raw tokens (stop words included) ----
# Fix the seed so the layout and the random colours are reproducible.
set.seed(280)
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  scale = c(3, 0.5),               # largest and smallest word size
  min.freq = 5,                    # skip words seen fewer than 5 times
  max.words = 2000,                # upper bound on the number of words drawn
  random.order = FALSE,            # plot words in decreasing frequency
  random.color = TRUE,             # pick colours at random from the palette
  rot.per = 0.5,                   # proportion of words rotated
  colors = brewer.pal(8, "Dark2"),
  use.r.layout = FALSE
)

# Note: the bar chart and word cloud above still contain stop words.
# Data cleaning ----
# Steps, in order:
#   1. drop tokens found in the stop_words lexicon,
#   2. strip every character that is not a letter (digits, punctuation),
#   3. turn tokens that became empty into NA,
#   4. lemmatise the remaining words,
#   5. drop rows that contain NA.
# NOTE(review): stop words are removed before characters are stripped and
# words are lemmatised, so a token that only becomes a stop word afterwards
# is not filtered out. Confirm whether a second stop-word pass is wanted.
clean_tokens <- word_tokenized_data %>%
  anti_join(stop_words, by = "word") %>%
  mutate(
    word = gsub("[^a-zA-Z]", "", word),
    word = na_if(word, ""),
    word = lemmatize_words(word)
  ) %>%
  na.omit()
# Word frequencies without stop words ----
# Build a two-column table (word, frequency) from the cleaned tokens. It
# feeds the word cloud and the top-word listing below.
word_data <- count(clean_tokens, word, name = "frequency")
# Show the new table
print(word_data)
## # A tibble: 780 × 2
## word frequency
## <chr> <int>
## 1 a 2
## 2 abnoxiously 1
## 3 accha 1
## 4 ache 1
## 5 ad 14
## 6 adapter 3
## 7 adapteratleast 2
## 8 add 1
## 9 additional 2
## 10 advance 2
## # ℹ 770 more rows
# Word cloud of the cleaned words (stop words removed) ----
wordcloud(
  words = word_data$word,
  freq = word_data$frequency,
  scale = c(3, 0.5),
  min.freq = 5,
  max.words = 2000,
  random.order = FALSE,
  random.color = TRUE,
  rot.per = 0.6,
  colors = brewer.pal(8, "Dark2"),
  use.r.layout = FALSE
)

# Top 50 cleaned words ----
# Sort by frequency, highest first, and keep the first 50 rows.
top_words <- word_data %>%
  arrange(desc(frequency)) %>%
  slice_head(n = 50)
# Print all 50 rows rather than the default truncated view.
print(top_words, n = 50)
## # A tibble: 50 × 2
## word frequency
## <chr> <int>
## 1 phone 147
## 2 camera 112
## 3 pro 72
## 4 redmi 68
## 5 quality 57
## 6 battery 56
## 7 nice 49
## 8 mi 46
## 9 product 40
## 10 display 39
## 11 mobile 39
## 12 front 36
## 13 buy 34
## 14 price 34
## 15 xiaomi 34
## 16 notch 29
## 17 bad 28
## 18 screen 25
## 19 note 24
## 20 amazon 23
## 21 design 23
## 22 money 23
## 23 video 22
## 24 app 21
## 25 awesome 21
## 26 issue 21
## 27 low 21
## 28 performance 21
## 29 sound 20
## 30 dual 18
## 31 light 18
## 32 con 17
## 33 charge 16
## 34 sim 16
## 35 bite 15
## 36 day 15
## 37 poor 15
## 38 ad 14
## 39 device 13
## 40 fast 13
## 41 feature 13
## 42 life 13
## 43 time 13
## 44 average 12
## 45 budget 12
## 46 game 12
## 47 heat 12
## 48 love 12
## 49 mp 12
## 50 range 12
# Grouped plot: word frequency by Category ----
# Take the 30 most frequent *cleaned* words, most frequent first. The ranking
# previously came from word_counts, which still holds stop words and
# un-lemmatised tokens, so stop words in that ranking could never match
# anything in clean_tokens.
top_words <- word_data %>%
  slice_max(frequency, n = 30, with_ties = FALSE) %>%
  pull(word)

# Count each word per Category and keep only the top words.
grouped_count <- clean_tokens %>%
  count(Category, word) %>%
  filter(word %in% top_words)

# Reverse the levels so that, after coord_flip(), the most frequent word is
# drawn at the top of the chart.
grouped_count$word <- factor(grouped_count$word, levels = rev(top_words))

# Dodged bars: one bar per Category next to each other for every word.
ggplot(data = grouped_count, aes(x = word, y = n, fill = Category)) +
  geom_col(position = "dodge") +
  labs(x = "words", y = "Frequency", fill = "Category") +
  coord_flip() +
  theme_classic()

# Sentiment analysis with the Bing lexicon ----
# The Bing lexicon maps words to a "positive" or "negative" sentiment label.
bing_sentiment <- get_sentiments("bing")
# Plain data.frame copy of the cleaned tokens.
clean_tokens2 <- as.data.frame(clean_tokens)
# Keep only the tokens that appear in the lexicon, tagged with their label.
sentiment_data <- inner_join(clean_tokens2, bing_sentiment, by = "word")

# Per-review score: number of positive words minus number of negative words.
sentiment_score <- sentiment_data %>%
  group_by(review_no) %>%
  summarize(
    bing_sentiment = sum(sentiment == "positive") -
      sum(sentiment == "negative"),
    .groups = "drop"
  )

# Attach the score to the reviews. Reviews without any lexicon word have no
# score and are therefore dropped by the inner join.
df_with_sentiment <- inner_join(df, sentiment_score, by = "review_no")
# Inspect the worst and best reviews by Bing score ----
# Lowest score first: the first index of the ascending order.
worst_review <- df_with_sentiment[
  with(df_with_sentiment, order(bing_sentiment))[1],
  "Comments"
]
print(worst_review)
## # A tibble: 1 × 1
## Comments
## <chr>
## 1 "Please don't buy!!\n\nHorrible phone from Xiaomi!!\n\nThe phone is just for …
# Highest score first: the first index of the descending order.
best_review <- df_with_sentiment[
  with(df_with_sentiment, order(bing_sentiment, decreasing = TRUE))[1],
  "Comments"
]
print(best_review)
## # A tibble: 1 × 1
## Comments
## <chr>
## 1 "Hey guys this is TECHBORED! and this is unboxing and also will adding some c…
# Histogram of Bing sentiment scores ----
# Bar colour: burnt orange, given as a hexadecimal value.
burnt_orange <- "#CC5500"
# One bin per integer score.
ggplot(data = df_with_sentiment, mapping = aes(x = bing_sentiment)) +
  geom_histogram(fill = burnt_orange, binwidth = 1) +
  theme_minimal() +
  labs(title = "Histogram of Bing Sentiment", x = "Bing Sentiment", y = "Count")

# Average Bing sentiment by Category ----
# Mean of the per-review Bing scores within each Category.
Category_sentiment <- df_with_sentiment %>%
  group_by(Category) %>%
  summarise(average_bing_sentiment = mean(bing_sentiment))

# Horizontal bars ordered by the average score.
# The axis title is passed as lowercase `x`; the previous capital `X` is not
# an aesthetic name, so the "Category" title was never applied.
ggplot(
  Category_sentiment,
  aes(
    x = reorder(Category, average_bing_sentiment),
    y = average_bing_sentiment,
    fill = Category
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Average Sentiment Score by Category",
    x = "Category",
    y = "Average Sentiment Score"
  )

###
# Box plot: Bing sentiment vs Rating ----
# Rating is converted to a factor so that each rating value gets its own set
# of boxes, one per Category via `fill`. With a numeric x and
# `group = Category`, all ratings of a category were collapsed into a single
# box.
# scale_fill_brewer() is now chained onto the plot with `+`; before, it was a
# detached statement that never affected the plot. The palette name is
# spelled "Set3", since "set 3" is not a known palette.
ggplot(
  df_with_sentiment,
  aes(x = factor(Rating), y = bing_sentiment, fill = Category)
) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Bing Sentiment Score vs. Rating",
    x = "Rating/Points",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3")
# Sentiment analysis with the AFINN lexicon ----
# Keep the cleaned tokens found in the lexicon; the join adds the `value`
# column that is summed below.
sentiment_data_AFL <- inner_join(
  clean_tokens,
  get_sentiments("afinn"),
  by = "word"
)

# Per-review score: sum of the lexicon values of its words.
sentiment_score_AFL <- sentiment_data_AFL %>%
  group_by(review_no) %>%
  summarise(afinn_sentiment = sum(value))

# Add the AFINN score next to the Bing score. Reviews lacking either score
# are dropped by the inner join.
Afl_BL_sentiment_df <- inner_join(
  df_with_sentiment,
  sentiment_score_AFL,
  by = "review_no"
)
# Inspect the worst and best reviews by AFINN score ----
# Lowest score first: the first index of the ascending order.
worst_review_AFL <- Afl_BL_sentiment_df[
  with(Afl_BL_sentiment_df, order(afinn_sentiment))[1],
  "Comments"
]
print(worst_review_AFL)
## # A tibble: 1 × 1
## Comments
## <chr>
## 1 "Please don't buy!!\n\nHorrible phone from Xiaomi!!\n\nThe phone is just for …
# Highest score first: the first index of the descending order.
best_review_AFL <- Afl_BL_sentiment_df[
  with(Afl_BL_sentiment_df, order(afinn_sentiment, decreasing = TRUE))[1],
  "Comments"
]
print(best_review_AFL)
## # A tibble: 1 × 1
## Comments
## <chr>
## 1 "Hey guys this is TECHBORED! and this is unboxing and also will adding some c…
# Histogram of AFINN sentiment scores ----
# Bar colour: pure blue, given as a hexadecimal value.
blue <- "#0000FF"
# One bin per integer score.
ggplot(data = Afl_BL_sentiment_df, mapping = aes(x = afinn_sentiment)) +
  geom_histogram(fill = blue, binwidth = 1) +
  theme_minimal() +
  labs(
    title = "Histogram of Afinn Sentiment",
    x = "Afinn Sentiment",
    y = "Count"
  )

# Average AFINN sentiment by Category ----
# Mean of the per-review AFINN scores within each Category.
Category_sentiment_Afl <- Afl_BL_sentiment_df %>%
  group_by(Category) %>%
  summarise(average_afinn_sentiment = mean(afinn_sentiment))

# Horizontal bars ordered by the average score.
# The axis title is passed as lowercase `x`; the previous capital `X` is not
# an aesthetic name, so the "Category" title was never applied.
ggplot(
  Category_sentiment_Afl,
  aes(
    x = reorder(Category, average_afinn_sentiment),
    y = average_afinn_sentiment,
    fill = Category
  )
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Average Sentiment Score by Category Based on Afinn Lexicon",
    x = "Category",
    y = "Average Sentiment Score"
  )

###
# Scatter plot of Bing vs. AFINN sentiment ----
# One point per review: Bing score on x, AFINN score on y.
ggplot(
  data = Afl_BL_sentiment_df,
  mapping = aes(x = bing_sentiment, y = afinn_sentiment)
) +
  geom_point() +
  labs(
    title = "Scatter Plot of Bing vs. AFINN Sentiment Scores",
    x = "Bing Sentiment Score",
    y = "Afinn Sentiment Score"
  )

# Sentiment analysis with the NRC lexicon ----
# A word can carry several NRC labels and can occur in several reviews, so
# the join is many-to-many by design. Declaring the relationship states that
# intent and avoids the "unexpected many-to-many relationship" warning.
nrc_emotion_data <- clean_tokens %>%
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  )

# Number of words per review and NRC label. count() with both columns
# returns an ungrouped table, so no grouping lingers in later steps.
nrc_sentiment_count <- nrc_emotion_data %>%
  count(review_no, sentiment)

# One column per NRC label; labels absent from a review are filled with 0.
# Note that nrc_emotion_data is reused here for the wide table.
nrc_emotion_data <- nrc_sentiment_count %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  )

# Add the NRC columns to the table holding the Bing and AFINN scores.
nrc_afl_bl_sentiment_df <- Afl_BL_sentiment_df %>%
  inner_join(nrc_emotion_data, by = "review_no")
# Average NRC label intensity by Category ----
# Reshape to one row per review and NRC label. any_of() keeps the reshape
# working when one of the labels never occurs in the data and therefore has
# no column; naming a missing column directly would stop with an error.
nrc_long_df <- nrc_afl_bl_sentiment_df %>%
  pivot_longer(
    cols = any_of(c(
      "joy", "positive", "trust", "anticipation", "surprise",
      "sadness", "negative", "anger", "disgust", "fear"
    )),
    names_to = "Emotion",
    values_to = "Intensity"
  )

# Mean word count per Category and label. `.groups = "drop"` returns an
# ungrouped table and avoids the "grouped output" message.
nrc_emotion_scores <- nrc_long_df %>%
  group_by(Category, Emotion) %>%
  summarise(average_intensity = mean(Intensity), .groups = "drop")
# Heatmap of average NRC label intensity by Category ----
# The averages are means of word counts and so never negative. A sequential
# two-colour gradient is used: the previous diverging scale
# (scale_fill_gradient2) is centred on 0 by default, so its `low` colour was
# never reached.
ggplot(
  nrc_emotion_scores,
  aes(x = Category, y = Emotion, fill = average_intensity)
) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "green") +
  labs(x = "Category", y = "Emotion", fill = "Intensity") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
