REDMI 6 - REVIEW DATASET

# loading Libraries

library(tm)

## Warning: package 'tm' was built under R version 4.3.3

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 4.3.3

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.3.3

## Loading required package: RColorBrewer

library(syuzhet)

## Warning: package 'syuzhet' was built under R version 4.3.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tibble)

## Warning: package 'tibble' was built under R version 4.3.3

library(textstem)

## Warning: package 'textstem' was built under R version 4.3.3

## Loading required package: koRpus.lang.en

## Warning: package 'koRpus.lang.en' was built under R version 4.3.3

## Loading required package: koRpus

## Warning: package 'koRpus' was built under R version 4.3.3

## Loading required package: sylly

## Warning: package 'sylly' was built under R version 4.3.3

## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()

## 
## Attaching package: 'koRpus'

## The following object is masked from 'package:tm':
## 
##     readTagged

library(textdata)

## Warning: package 'textdata' was built under R version 4.3.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.3

library(RColorBrewer)
library(topicmodels)

## Warning: package 'topicmodels' was built under R version 4.3.3

# Importing the data file
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
reviews <- "G:\\2025 DS FOLDER\\Redmi SA - DATA\\redmi6\\redmi6.csv"
# as_tibble() replaces as.tibble(), which was deprecated in tibble 2.0.0.
df <- as_tibble(read.csv(reviews, stringsAsFactors = FALSE))

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Summary of Data by Column 
summary(df)

##  Review.Title       Customer.name          Rating          Date          
##  Length:280         Length:280         Min.   :1.000   Length:280        
##  Class :character   Class :character   1st Qu.:3.000   Class :character  
##  Mode  :character   Mode  :character   Median :5.000   Mode  :character  
##                                        Mean   :3.918                     
##                                        3rd Qu.:5.000                     
##                                        Max.   :5.000                     
##    Category           Comments            Useful         
##  Length:280         Length:280         Length:280        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##

#
# Make sure the output directory for plots exists.
# The original checked "plots" but created "plots_2"; both now use one name.
# NOTE(review): "plots_2" is kept because it is the directory the original
# actually created - confirm this is the intended folder.
plot_dir <- "plots_2"
if (!dir.exists(plot_dir)) {
  dir.create(plot_dir)
}
#

# Selecting data
# Keep only the first seven columns of the imported file.
df <- df[, 1:7]
df <- na.omit(df) # remove rows containing missing values
# seq_len() yields an empty sequence for a 0-row frame, whereas 1:nrow(df)
# would yield c(1, 0) and make the assignment fail.
df$review_no <- seq_len(nrow(df)) # identifier column for each review

# Sampling data
# Keep every review that belongs to (at most) five randomly chosen categories.

set.seed(280) # fixed seed so the same categories are drawn on every run
all_categories <- unique(df$Category)
# Never request more categories than exist; sample() would otherwise error.
n_sampled <- min(5, length(all_categories))
sample_index <- sample.int(length(all_categories), n_sampled)
sampled_Category <- all_categories[sample_index] # categories at the drawn positions

# Retain only the rows whose Category is among the sampled categories.
df <- df %>% filter(Category %in% sampled_Category)

# Column-wise summary of the filtered data
df %>% summary() %>% print()

##  Review.Title       Customer.name          Rating          Date          
##  Length:280         Length:280         Min.   :1.000   Length:280        
##  Class :character   Class :character   1st Qu.:3.000   Class :character  
##  Mode  :character   Mode  :character   Median :5.000   Mode  :character  
##                                        Mean   :3.918                     
##                                        3rd Qu.:5.000                     
##                                        Max.   :5.000                     
##    Category           Comments            Useful            review_no     
##  Length:280         Length:280         Length:280         Min.   :  1.00  
##  Class :character   Class :character   Class :character   1st Qu.: 70.75  
##  Mode  :character   Mode  :character   Mode  :character   Median :140.50  
##                                                           Mean   :140.50  
##                                                           3rd Qu.:210.25  
##                                                           Max.   :280.00

# Show the first six rows of the data
head(df, n = 6L)

# Open the data in the viewer (tibble's view() helper)
tibble::view(df)

# Tokenizing data: one lower-cased word of each comment per row
word_tokenized_data <- unnest_tokens(
  df,
  output = word,
  input = "Comments",
  token = "words",
  to_lower = TRUE
)

# Bigram tokenization: one lower-cased pair of consecutive words per row
bigram_tokenized_data <- unnest_tokens(
  df,
  output = bigram,
  input = "Comments",
  token = "ngrams",
  n = 2,
  to_lower = TRUE
)

# Creating a plot of the most frequent words (stop words still included)
word_counts <- word_tokenized_data %>%
  count(word, sort = TRUE) # occurrences of each word, most frequent first

# slice_head() returns at most 12 rows; indexing with [1:12, ] would pad the
# data with NA rows whenever fewer than 12 distinct words exist.
word_counts %>%
  slice_head(n = 12) %>%
  ggplot(aes(x = reorder(word, n), y = n)) + # words ordered by their count
  geom_col(fill = "green") + # green bars
  labs(x = "words", y = "Frequency") + # axis labels
  coord_flip() + # words on the vertical axis for readability
  theme_classic()

# Word cloud of the raw token counts.
# Fixed seed so the cloud is drawn the same way on every run.
set.seed(280)
with(
  word_counts,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 5,                    # minimum frequency threshold
    max.words = 2000,                # upper limit on the number of words
    scale = c(3, 0.5),               # word size scale
    random.order = FALSE,
    random.color = TRUE,             # random colours from the palette
    colors = brewer.pal(8, "Dark2"), # eight colours of the "Dark2" palette
    rot.per = 0.5,                   # rotation setting
    use.r.layout = FALSE             # R layout routine switched off
  )
)

# It's important to note that this word frequency bar chart and word cloud
# still contain stop words.

# Data cleaning
# Aim: remove stop words, special characters and digits, then lemmatize.

# First stop-word pass, done while tokens still carry their apostrophes.
clean_tokens <- word_tokenized_data %>% anti_join(stop_words, by = "word")

clean_tokens$word <- gsub("[^a-zA-Z]", "", clean_tokens$word) %>% # keep ASCII letters only
  na_if("") %>% # tokens that became empty turn into NA
  lemmatize_words() # lemmatize every remaining token

# Second stop-word pass: stripping characters and lemmatizing can turn a token
# into a stop word (the printed word list contained "a"), so filter again.
clean_tokens <- anti_join(clean_tokens, stop_words, by = "word")

clean_tokens <- na.omit(clean_tokens) # drop rows with missing values

## Word cloud without stop words
# Build a two-column table (word, frequency) from the "word" column of
# clean_tokens; it feeds the word cloud created afterwards.
word_data <- count(clean_tokens, word, name = "frequency")

# Show the frequency table
print(word_data)

## # A tibble: 780 × 2
##    word           frequency
##    <chr>              <int>
##  1 a                      2
##  2 abnoxiously            1
##  3 accha                  1
##  4 ache                   1
##  5 ad                    14
##  6 adapter                3
##  7 adapteratleast         2
##  8 add                    1
##  9 additional             2
## 10 advance                2
## # ℹ 770 more rows

# Word cloud without stop words, built from word_data.
# Seeded like the first word cloud so this one is reproducible as well.
set.seed(280)
wordcloud(words = word_data$word,
          freq = word_data$frequency,
          min.freq = 5,
          max.words = 2000,
          scale = c(3, 0.5),
          random.order = FALSE,
          random.color = TRUE,
          colors = brewer.pal(8, "Dark2"),
          rot.per = 0.6,
          use.r.layout = FALSE)

## The 50 most frequent words of the cleaned data
# Sort rows by decreasing frequency (ties keep their existing order) and keep
# the first 50.
top_words <- head(word_data[order(-word_data$frequency), ], 50)

# Print all 50 rows
print(top_words, n = 50)

## # A tibble: 50 × 2
##    word        frequency
##    <chr>           <int>
##  1 phone             147
##  2 camera            112
##  3 pro                72
##  4 redmi              68
##  5 quality            57
##  6 battery            56
##  7 nice               49
##  8 mi                 46
##  9 product            40
## 10 display            39
## 11 mobile             39
## 12 front              36
## 13 buy                34
## 14 price              34
## 15 xiaomi             34
## 16 notch              29
## 17 bad                28
## 18 screen             25
## 19 note               24
## 20 amazon             23
## 21 design             23
## 22 money              23
## 23 video              22
## 24 app                21
## 25 awesome            21
## 26 issue              21
## 27 low                21
## 28 performance        21
## 29 sound              20
## 30 dual               18
## 31 light              18
## 32 con                17
## 33 charge             16
## 34 sim                16
## 35 bite               15
## 36 day                15
## 37 poor               15
## 38 ad                 14
## 39 device             13
## 40 fast               13
## 41 feature            13
## 42 life               13
## 43 time               13
## 44 average            12
## 45 budget             12
## 46 game               12
## 47 heat               12
## 48 love               12
## 49 mp                 12
## 50 range              12

# Creating a grouped plot: frequency of the top words, split by Category.

# Vector of the 30 most frequent cleaned words. It is taken from clean_tokens
# (not word_counts) so that stop words cannot occupy any of the 30 slots.
top_words <- clean_tokens %>%
  count(word, sort = TRUE) %>%
  head(30) %>%
  pull(word)

# Count each word per Category and keep only the top words.
grouped_count <- group_by(clean_tokens, Category) %>%
  count(word) %>%
  filter(word %in% top_words)

# Order the factor levels by overall frequency; reversed so that, after
# coord_flip(), the most frequent word is drawn at the top.
grouped_count$word <- factor(grouped_count$word, levels = rev(top_words))

ggplot(data = grouped_count, aes(x = word, y = n, fill = Category)) + # one fill colour per Category
  geom_col(position = "dodge") + # bars of the same word placed side by side
  labs(x = "words", y = "Frequency", fill = "Category") +
  coord_flip() +
  theme_classic()

#### SENTIMENT ANALYSIS BY BING LEXICON
# The Bing lexicon is loaded as a table that pairs words with a sentiment.
bing_sentiment <- get_sentiments("bing")

# Plain data.frame copy of the cleaned tokens
clean_tokens2 <- as.data.frame(clean_tokens)

# Keep only the tokens that also occur in the lexicon
sentiment_data <- inner_join(clean_tokens2, bing_sentiment, by = "word")

# Score per review: number of positive words minus number of negative words
sentiment_score <- summarize(
  group_by(sentiment_data, review_no),
  bing_sentiment = sum(sentiment == "positive") - sum(sentiment == "negative"),
  .groups = 'drop'
)

# Attach the score to the reviews; reviews without any score row are dropped
# by the inner join.
df_with_sentiment <- inner_join(df, sentiment_score, by = "review_no")


## Inspecting the best and worst reviews
# Worst review: the row that comes first when sorting by ascending Bing score
worst_review <- df_with_sentiment[
  with(df_with_sentiment, order(bing_sentiment))[1],
  "Comments"
]
print(worst_review)

## # A tibble: 1 × 1
##   Comments                                                                      
##   <chr>                                                                         
## 1 "Please don't buy!!\n\nHorrible phone from Xiaomi!!\n\nThe phone is just for …

# Best review: the row that comes first when sorting by descending Bing score
best_review <- df_with_sentiment[
  with(df_with_sentiment, order(bing_sentiment, decreasing = TRUE))[1],
  "Comments"
]
print(best_review)

## # A tibble: 1 × 1
##   Comments                                                                      
##   <chr>                                                                         
## 1 "Hey guys this is TECHBORED! and this is unboxing and also will adding some c…

### SENTIMENT HISTOGRAM (BING)
# Fill colour of the bars: burnt orange as a hexadecimal value
burnt_orange <- "#CC5500"

# Histogram of the per-review Bing score, one bin per score unit
ggplot(df_with_sentiment) +
  geom_histogram(aes(x = bing_sentiment), binwidth = 1, fill = burnt_orange) +
  labs(title = "Histogram of Bing Sentiment", x = "Bing Sentiment", y = "Count") +
  theme_minimal()

### AVERAGE SENTIMENT BY CATEGORY (BING)
# Mean Bing score of the reviews in each Category
Category_sentiment <- df_with_sentiment %>%
  group_by(Category) %>%
  summarise(average_bing_sentiment = mean(bing_sentiment))

# Bar chart with categories ordered by their average score.
# The axis label must be the lower-case `x`; the original `X = "Category"`
# did not match the x aesthetic, so the label was never applied.
ggplot(Category_sentiment,
       aes(x = reorder(Category, average_bing_sentiment),
           y = average_bing_sentiment,
           fill = Category)) +
  geom_col() +
  coord_flip() +
  labs(title = "Average Sentiment Score by Category",
       x = "Category",
       y = "Average Sentiment Score")

###


### BOX PLOT - SENTIMENT vs RATING
# One box per Category (group = Category), filled by Category.
# NOTE(review): Rating is mapped to x as a numeric value while boxes are
# grouped by Category - confirm whether x = factor(Rating) was intended.
ggplot(df_with_sentiment, aes(group = Category, x = Rating, y = bing_sentiment, fill = Category)) +
  geom_boxplot() +
  labs(title = "Box Plot of Bing Sentiment Score vs. Rating",
       x = "Rating/Points",
       y = "Sentiment Score") +
  theme_minimal() +
  # Previously a detached statement (missing `+`) with the invalid palette
  # name "set 3", so the scale was never applied to the plot.
  scale_fill_brewer(palette = "Set3")

## Warning: Unknown palette: "set 3"

## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: fill
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     get_transformation: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

#### SENTIMENT ANALYSIS BY AFINN LEXICON
# Keep the cleaned tokens that occur in the AFINN lexicon
sentiment_data_AFL <- inner_join(
  clean_tokens,
  get_sentiments("afinn"),
  by = "word"
)

# Score per review: sum of the lexicon values of its words
sentiment_score_AFL <- summarise(
  group_by(sentiment_data_AFL, review_no),
  afinn_sentiment = sum(value)
)

# Add the AFINN score next to the Bing score; reviews without an AFINN score
# row are dropped by the inner join.
Afl_BL_sentiment_df <- inner_join(
  df_with_sentiment,
  sentiment_score_AFL,
  by = "review_no"
)


## Inspecting the best and worst reviews based on the AFINN lexicon
# Worst review: the row that comes first when sorting by ascending AFINN score
worst_review_AFL <- Afl_BL_sentiment_df[
  with(Afl_BL_sentiment_df, order(afinn_sentiment))[1],
  "Comments"
]
print(worst_review_AFL)

## # A tibble: 1 × 1
##   Comments                                                                      
##   <chr>                                                                         
## 1 "Please don't buy!!\n\nHorrible phone from Xiaomi!!\n\nThe phone is just for …

# Best review: the row that comes first when sorting by descending AFINN score
best_review_AFL <- Afl_BL_sentiment_df[
  with(Afl_BL_sentiment_df, order(afinn_sentiment, decreasing = TRUE))[1],
  "Comments"
]
print(best_review_AFL)

## # A tibble: 1 × 1
##   Comments                                                                      
##   <chr>                                                                         
## 1 "Hey guys this is TECHBORED! and this is unboxing and also will adding some c…

# Fill colour of the bars: pure blue as a hexadecimal value
blue <- "#0000FF"

# Histogram of the per-review AFINN score, one bin per score unit
ggplot(Afl_BL_sentiment_df) +
  geom_histogram(aes(x = afinn_sentiment), binwidth = 1, fill = blue) +
  labs(title = "Histogram of Afinn Sentiment", x = "Afinn Sentiment", y = "Count") +
  theme_minimal()

### AVERAGE SENTIMENT BY CATEGORY (AFINN)
# Mean AFINN score of the reviews in each Category
Category_sentiment_Afl <- Afl_BL_sentiment_df %>%
  group_by(Category) %>%
  summarise(average_afinn_sentiment = mean(afinn_sentiment))

# Bar chart with categories ordered by their average score.
# The axis label must be the lower-case `x`; the original `X = "Category"`
# did not match the x aesthetic, so the label was never applied.
ggplot(Category_sentiment_Afl,
       aes(x = reorder(Category, average_afinn_sentiment),
           y = average_afinn_sentiment,
           fill = Category)) +
  geom_col() +
  coord_flip() +
  labs(title = "Average Sentiment Score by Category Based on Afinn Lexicon",
       x = "Category",
       y = "Average Sentiment Score")

###

### Scatter plot of Bing vs. AFINN sentiment
# One point per review: Bing score on x, AFINN score on y
ggplot(Afl_BL_sentiment_df) +
  geom_point(aes(x = bing_sentiment, y = afinn_sentiment)) +
  labs(
    title = "Scatter Plot of Bing vs. AFINN Sentiment Scores",
    x = "Bing Sentiment Score",
    y = "Afinn Sentiment Score"
  )

### SENTIMENT ANALYSIS BY NRC LEXICON
# Keep the cleaned tokens that occur in the NRC lexicon.
# A word can match several lexicon rows and can occur in several token rows,
# so the many-to-many relationship is expected; declaring it documents the
# intent and silences the dplyr warning.
nrc_emotion_data <- clean_tokens %>%
  inner_join(get_sentiments("nrc"), by = "word",
             relationship = "many-to-many")

## Warning in inner_join(., get_sentiments("nrc"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 22 of `x` matches multiple rows in `y`.
## ℹ Row 1288 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# NRC sentiment score: how often each sentiment label occurs in each review
nrc_sentiment_count <- count(
  group_by(nrc_emotion_data, review_no),
  sentiment
)

# Wide layout: one column per sentiment label, 0 where a review has none.
# Note that this overwrites the token-level nrc_emotion_data built earlier.
nrc_emotion_data <- pivot_wider(
  nrc_sentiment_count,
  names_from = sentiment,
  values_from = n,
  values_fill = list(n= 0)
)

# Add the NRC columns to the table that holds the Bing and AFINN scores
nrc_afl_bl_sentiment_df <- inner_join(
  Afl_BL_sentiment_df,
  nrc_emotion_data,
  by = "review_no"
)

## HEATMAP DATA BASED ON NRC LEXICON / EMOTION BY CATEGORY

# Emotion columns expected after the pivot to wide format.
nrc_emotions <- c("joy", "positive", "trust", "anticipation", "surprise",
                  "sadness", "negative", "anger", "disgust", "fear")

# Long layout: one row per review and emotion.
# any_of() tolerates an emotion that never occurred in the data (and so has
# no column), where naming the columns directly would raise an error.
nrc_long_df <- nrc_afl_bl_sentiment_df %>%
  pivot_longer(cols = any_of(nrc_emotions),
               names_to = "Emotion",
               values_to = "Intensity")

# Average intensity per Category and Emotion.
# .groups = "drop_last" is what summarise() already did implicitly (result
# stays grouped by Category); stating it removes the informational message.
nrc_emotion_scores <- nrc_long_df %>%
  group_by(Category, Emotion) %>%
  summarise(average_intensity = mean(Intensity), .groups = "drop_last")

## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.

## HEATMAP
# Tiles of average emotion intensity per Category.
# scale_fill_gradient() maps the smallest value to `low` and the largest to
# `high`. The diverging scale_fill_gradient2() used before has its midpoint at
# 0 by default.
# NOTE(review): the intensities are presumably never negative (averages of
# per-review counts), in which case `low = "blue"` was never reached - confirm.
ggplot(nrc_emotion_scores, aes(x = Category, y = Emotion, fill = average_intensity)) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "green") +
  labs(x = "Category", y = "Emotion", fill = "Intensity") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) # slanted category labels

REDMI 6 - REVIEW DATASET

JOSEPH ATUNDE

2025-02-14