# Load necessary libraries
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)

# Task A ----

# Read dataset
wine_data <- read.csv(
  "E:/Data Base/Fiverr project/Ijaj vai/MS4S09_CW_Reviews.csv"
)

# View dataset structure
str(wine_data)
summary(wine_data)

# Check for missing values (count of NA per column)
colSums(is.na(wine_data))

# Handle missing values: na.omit() drops every row that has an NA in any
# column, so the cleaned data can be much smaller than the raw data.
wine_data_clean <- na.omit(wine_data)

# Convert points column to numeric (ensure no coercion issues).
# Going through as.character() keeps the printed values if the column was
# read as a factor, instead of returning the factor's internal codes.
wine_data_clean$points <- as.numeric(as.character(wine_data_clean$points))

# Text Preprocessing

# Fix encoding issues first: tolower() stops with an "invalid multibyte
# string" error on malformed input, so the text is converted to UTF-8
# before any other string operation touches it.
wine_data_clean$description <- iconv(wine_data_clean$description, to = "UTF-8")
wine_data_clean$description <- tolower(wine_data_clean$description)
wine_data_clean$description <- removePunctuation(wine_data_clean$description)
wine_data_clean$description <- removeNumbers(wine_data_clean$description)

# Remove stopwords (English list shipped with tm)
stopwords_list <- stopwords("en")
wine_data_clean$description <- removeWords(
  wine_data_clean$description,
  stopwords_list
)

# Plot Histogram of Wine Ratings (Points)
ggplot(wine_data_clean, aes(x = points)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Distribution of Wine Ratings",
    x = "Wine Rating (Points)",
    y = "Count"
  )

# Task B ----

library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
# wordcloud() and brewer.pal() are used at the end of this task, so the
# package has to be attached here rather than later in the script.
library(wordcloud)

# Perform Sentiment Analysis
wine_data_clean$sentiment_score <- get_sentiment(
  wine_data_clean$description,
  method = "syuzhet"
)

# Add a new column to classify sentiment into positive, negative, and neutral
wine_data_clean$sentiment_label <- ifelse(
  wine_data_clean$sentiment_score > 0,
  "Positive",
  ifelse(wine_data_clean$sentiment_score < 0, "Negative", "Neutral")
)

# Visualize Sentiment Distribution
# Colours are bound to labels by name. A bare fill vector is applied in bar
# order (alphabetical: Negative, Neutral, Positive), which mislabels the bars,
# and it fails outright when one of the three classes is absent.
sentiment_colors <- c(Positive = "blue", Negative = "red", Neutral = "gray")
ggplot(wine_data_clean, aes(x = sentiment_label, fill = sentiment_label)) +
  geom_bar(color = "black", alpha = 0.7, show.legend = FALSE) +
  scale_fill_manual(values = sentiment_colors) +
  theme_minimal() +
  labs(title = "Sentiment Distribution", x = "Sentiment", y = "Count")

# Visualize Sentiment vs Wine Points
ggplot(wine_data_clean, aes(x = points, fill = sentiment_label)) +
  geom_bar(position = "stack", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Sentiment Distribution Across Wine Points",
    x = "Wine Rating (Points)",
    y = "Count"
  )

# Sentiment Score Distribution (Histogram)
ggplot(wine_data_clean, aes(x = sentiment_score)) +
  geom_histogram(
    binwidth = 0.1,
    fill = "purple",
    color = "black",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Sentiment Score Distribution",
    x = "Sentiment Score",
    y = "Count"
  )

# Create Wordcloud for Description
wordcloud(
  wine_data_clean$description,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Task C ----

# Load necessary libraries
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidytext)

# Create a Document-Term Matrix (DTM) for the reviews
corpus <- Corpus(VectorSource(wine_data_clean$description))
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Define the number of topics
# You can change this based on the number of topics you want to identify
num_topics <- 5
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))

# Step 4: View the top terms for each topic
# "beta" is the per-topic, per-term probability matrix in tidy (long) form.
terms_per_topic <- tidy(lda_model, matrix = "beta")

# Show the top 10 terms for each topic
terms_per_topic %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  ggplot(aes(reorder(term, beta), beta, fill = factor(topic))) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~topic, scales = "free_y") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Terms per Topic", x = "Terms", y = "Beta Value")

# Step 5: Visualizing Topic Distribution Across Reviews

# Get the topic probabilities for each document
# "gamma" is the per-document, per-topic probability matrix in tidy form.
topic_probabilities <- tidy(lda_model, matrix = "gamma")

# Plot the topic distribution across all reviews
# With stat = "identity" and stacking, each bar is the sum of gamma over all
# documents for that topic.
ggplot(topic_probabilities, aes(x = topic, y = gamma, fill = factor(topic))) +
  geom_bar(stat = "identity", position = "stack") +
  theme_minimal() +
  labs(
    title = "Topic Distribution Across Reviews",
    x = "Topic",
    y = "Proportion of Topic in Reviews"
  )

# Step 6: Assign the most probable topic to each review
wine_data_clean$topic <- topics(lda_model)

# Step 7: Visualize Sentiment Distribution by Topic
ggplot(wine_data_clean, aes(x = factor(topic), fill = sentiment_label)) +
  geom_bar(position = "stack", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Sentiment Distribution Across Topics",
    x = "Topic",
    y = "Count"
  )

# Step 8: Insightful Topic Visualization
# (Visualizing the Sentiment Score by Topic)
ggplot(
  wine_data_clean,
  aes(x = factor(topic), y = sentiment_score, color = factor(topic))
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Sentiment Score Distribution by Topic",
    x = "Topic",
    y = "Sentiment Score"
  )

# Task D ----

# Load necessary libraries for Task D
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidytext)
library(cluster)

# Step 1: Create a Document-Term Matrix (DTM) for the reviews
corpus <- Corpus(VectorSource(wine_data_clean$description))
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Step 2: Remove sparse terms
# (e.g., terms that appear in fewer than 5% of the documents)
# Keeps terms that appear in at least 5% of the documents
dtm_sparse <- removeSparseTerms(dtm, 0.95)

# Step 3: Convert the DTM to a matrix and calculate word frequencies
# The dense conversion is done on the reduced matrix, not the full DTM.
dtm_matrix <- as.matrix(dtm_sparse)
word_freq <- colSums(dtm_matrix)

# Step 4: Sort word frequencies in decreasing order
word_freq <- sort(word_freq, decreasing = TRUE)

# Step 5: Convert word frequencies to a data frame for plotting
word_freq_df <- data.frame(word = names(word_freq), freq = word_freq)

# Step 6: Take the top 20 most frequent words
top_words <- head(word_freq_df, 20)

# Step 7: Visualize the top 20 most frequent words in wine descriptions
ggplot(top_words, aes(reorder(word, freq), freq)) +
  geom_bar(
    stat = "identity",
    fill = "steelblue",
    color = "black",
    alpha = 0.7
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Words in Wine Descriptions",
    x = "Words",
    y = "Frequency"
  )

# Step 8: Visualizing Word Cloud (Optional)
library(wordcloud)
wordcloud(
  words = word_freq_df$word,
  freq = word_freq_df$freq,
  min.freq = 3,
  scale = c(3, 0.5),
  colors = brewer.pal(8, "Dark2")
)

# Step 9: Perform a regression model to explore the relationship between
# rating and sentiment score
# We'll use linear regression as an example to explore how sentiment scores
# relate to wine ratings
regression_model <- lm(points ~ sentiment_score, data = wine_data_clean)
summary(regression_model)

# Step 10: Visualize the regression result
ggplot(wine_data_clean, aes(x = sentiment_score, y = points)) +
  geom_point(color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Regression of Wine Rating vs Sentiment Score",
    x = "Sentiment Score",
    y = "Wine Rating (Points)"
  )

# Step 11: Perform clustering (K-means) to identify groups of wines based on
# their sentiment score and rating
# NOTE(review): the two columns are clustered on their raw scales, so the one
# with the larger spread dominates the distance -- confirm this is intended.
set.seed(1234)  # Set seed for reproducibility
clustering_model <- kmeans(
  wine_data_clean[, c("points", "sentiment_score")],
  centers = 3
)
wine_data_clean$cluster <- clustering_model$cluster

# Step 12: Visualize the clustering result
ggplot(
  wine_data_clean,
  aes(x = points, y = sentiment_score, color = as.factor(cluster))
) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "K-Means Clustering of Wines",
    x = "Wine Rating (Points)",
    y = "Sentiment Score"
  )

# Step 13: Investigating the clusters
cluster_summary <- wine_data_clean %>%
  group_by(cluster) %>%
  summarize(
    average_points = mean(points),
    average_sentiment = mean(sentiment_score),
    count = n()
  )
print(cluster_summary)

# Store each review's most probable topic before exporting.
# topics() must be called on the fitted model: assigning the bare function
# object `topics` to a data-frame column is an error.
wine_data_clean$topic <- topics(lda_model)

write.csv(wine_data_clean, "cleaned_wine_data.csv")