library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)

# Task A ----
# Load the wine reviews, drop incomplete rows, normalise the review text and
# plot the distribution of ratings.

# Read dataset
reviews_path <- file.path(
  "E:/Data Base/Fiverr project/Ijaj vai",
  "MS4S09_CW_Reviews.csv"
)
wine_data <- read.csv(reviews_path)

# Quick structural overview and per-column missing-value counts
str(wine_data)
summary(wine_data)
colSums(is.na(wine_data))

# Keep only rows without any missing value
wine_data_clean <- na.omit(wine_data)

# Coerce ratings to numeric; values that cannot be parsed become NA and are
# dropped so that later means, regression and clustering do not see them.
wine_data_clean$points <- as.numeric(as.character(wine_data_clean$points))
wine_data_clean <- wine_data_clean[!is.na(wine_data_clean$points), ]

# Fix encoding issues first: tolower() errors on invalid multibyte strings.
# sub = "" drops unconvertible bytes instead of turning the whole text to NA.
wine_data_clean$description <- iconv(
  wine_data_clean$description,
  to = "UTF-8",
  sub = ""
)
wine_data_clean$description <- tolower(wine_data_clean$description)

# Remove stop words before punctuation so that contractions such as "don't"
# still match their entries in the stop-word list.
stopwords_list <- stopwords("en")
wine_data_clean$description <- removeWords(
  wine_data_clean$description,
  stopwords_list
)
wine_data_clean$description <- removePunctuation(wine_data_clean$description)
wine_data_clean$description <- removeNumbers(wine_data_clean$description)

# Plot Histogram of Wine Ratings (Points)
ggplot(wine_data_clean, aes(x = points)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Distribution of Wine Ratings",
    x = "Wine Rating (Points)",
    y = "Count"
  )
# Task B ----
# Score each review's sentiment, label it, and visualise the results.
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
# wordcloud() is used below; the package also attaches RColorBrewer, which
# provides brewer.pal().
library(wordcloud)

# Perform Sentiment Analysis
wine_data_clean$sentiment_score <- get_sentiment(
  wine_data_clean$description,
  method = "syuzhet"
)

# Add a new column to classify sentiment into positive, negative, and neutral
wine_data_clean$sentiment_label <- ifelse(
  wine_data_clean$sentiment_score > 0,
  "Positive",
  ifelse(wine_data_clean$sentiment_score < 0, "Negative", "Neutral")
)

# Named palette: colours are tied to labels rather than to bar position, and
# the plot still works when one of the three classes is absent.
sentiment_colours <- c(Positive = "blue", Negative = "red", Neutral = "gray")

# Visualize Sentiment Distribution
ggplot(wine_data_clean, aes(x = sentiment_label, fill = sentiment_label)) +
  geom_bar(color = "black", alpha = 0.7, show.legend = FALSE) +
  scale_fill_manual(values = sentiment_colours) +
  theme_minimal() +
  labs(title = "Sentiment Distribution", x = "Sentiment", y = "Count")

# Visualize Sentiment vs Wine Points
ggplot(wine_data_clean, aes(x = points, fill = sentiment_label)) +
  geom_bar(position = "stack", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Sentiment Distribution Across Wine Points",
    x = "Wine Rating (Points)",
    y = "Count"
  )

# Sentiment Score Distribution (Histogram)
ggplot(wine_data_clean, aes(x = sentiment_score)) +
  geom_histogram(
    binwidth = 0.1,
    fill = "purple",
    color = "black",
    alpha = 0.7
  ) +
  theme_minimal() +
  labs(
    title = "Sentiment Score Distribution",
    x = "Sentiment Score",
    y = "Count"
  )

# Create Wordcloud for Description
wordcloud(
  wine_data_clean$description,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Task C ----
# Load necessary libraries
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidytext)

# Create a Document-Term Matrix (DTM) for the reviews
corpus <- Corpus(VectorSource(wine_data_clean$description))
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    tolower = TRUE
  )
)
# Fit an LDA topic model on the review DTM and relate topics to sentiment.

# You can change this based on the number of topics you want to identify
num_topics <- 5

# LDA() requires every document to contain at least one term. Reviews that
# became empty after cleaning are excluded from the fit and keep topic = NA.
doc_has_terms <- slam::row_sums(dtm) > 0
dtm_nonempty <- dtm[doc_has_terms, ]
lda_model <- LDA(dtm_nonempty, k = num_topics, control = list(seed = 1234))

# Step 4: View the top terms for each topic
terms_per_topic <- tidy(lda_model, matrix = "beta")

# Show the top 10 terms for each topic.
# reorder_within()/scale_x_reordered() sort terms inside each facet; a plain
# reorder() would rank a term shared by several topics by its pooled beta.
terms_per_topic %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  ggplot(
    aes(reorder_within(term, beta, topic), beta, fill = factor(topic))
  ) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  theme_minimal() +
  labs(title = "Top Terms per Topic", x = "Terms", y = "Beta Value")

# Per-document topic probabilities
topic_probabilities <- tidy(lda_model, matrix = "gamma")

# Plot the topic distribution across all reviews.
# Gamma is averaged per topic so the bar height is a proportion, as the axis
# label states, instead of a sum over every document.
topic_probabilities %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  ggplot(aes(x = topic, y = gamma, fill = factor(topic))) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Topic Distribution Across Reviews",
    x = "Topic",
    y = "Proportion of Topic in Reviews"
  )

# Attach the most likely topic to each review that took part in the fit
wine_data_clean$topic <- NA_integer_
wine_data_clean$topic[doc_has_terms] <- topics(lda_model)
reviews_with_topic <- wine_data_clean[!is.na(wine_data_clean$topic), ]

# Step 7: Visualize Sentiment Distribution by Topic
ggplot(reviews_with_topic, aes(x = factor(topic), fill = sentiment_label)) +
  geom_bar(position = "stack", color = "black", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Sentiment Distribution Across Topics",
    x = "Topic",
    y = "Count"
  )

# Sentiment score spread within each topic
ggplot(
  reviews_with_topic,
  aes(x = factor(topic), y = sentiment_score, color = factor(topic))
) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Sentiment Score Distribution by Topic",
    x = "Topic",
    y = "Sentiment Score"
  )
# Task D ----
# Word frequencies, a rating ~ sentiment regression, k-means clustering and
# export of the enriched data set.

# Load necessary libraries for Task D
library(tm)
library(syuzhet)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidytext)
library(cluster)
library(wordcloud)

# Step 1: Build the Document-Term Matrix for the reviews
corpus <- Corpus(VectorSource(wine_data_clean$description))
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stopwords = TRUE,
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

# Step 2: Remove sparse terms (e.g., terms that appear in fewer than 5% of
# the documents). Keeps terms that appear in at least 5% of the documents.
dtm_sparse <- removeSparseTerms(dtm, 0.95)

# Step 3: Convert the DTM to a matrix and calculate word frequencies
dtm_matrix <- as.matrix(dtm_sparse)
word_freq <- colSums(dtm_matrix)
word_freq <- sort(word_freq, decreasing = TRUE)
word_freq_df <- data.frame(
  word = names(word_freq),
  freq = word_freq,
  stringsAsFactors = FALSE
)
top_words <- head(word_freq_df, 20)

# Bar chart of the 20 most frequent words
ggplot(top_words, aes(reorder(word, freq), freq)) +
  geom_col(fill = "steelblue", color = "black", alpha = 0.7) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 20 Most Frequent Words in Wine Descriptions",
    x = "Words",
    y = "Frequency"
  )

# Word cloud of the retained terms
wordcloud(
  words = word_freq_df$word,
  freq = word_freq_df$freq,
  min.freq = 3,
  scale = c(3, 0.5),
  colors = brewer.pal(8, "Dark2")
)

# Linear regression of rating on sentiment score
regression_model <- lm(points ~ sentiment_score, data = wine_data_clean)
summary(regression_model)

ggplot(wine_data_clean, aes(x = sentiment_score, y = points)) +
  geom_point(color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", alpha = 0.5) +
  theme_minimal() +
  labs(
    title = "Regression of Wine Rating vs Sentiment Score",
    x = "Sentiment Score",
    y = "Wine Rating (Points)"
  )

# K-means clustering on rating and sentiment.
# kmeans() rejects missing values, so only complete rows are clustered and
# the remaining rows keep cluster = NA.
# NOTE(review): the two features are used unscaled, so the one with the
# larger spread dominates the distance; consider scale() if that is unwanted.
set.seed(1234) # Set seed for reproducibility
cluster_features <- wine_data_clean[, c("points", "sentiment_score")]
complete_rows <- complete.cases(cluster_features)
clustering_model <- kmeans(cluster_features[complete_rows, ], centers = 3)
wine_data_clean$cluster <- NA_integer_
wine_data_clean$cluster[complete_rows] <- clustering_model$cluster

ggplot(
  wine_data_clean[complete_rows, ],
  aes(x = points, y = sentiment_score, color = as.factor(cluster))
) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "K-Means Clustering of Wines",
    x = "Wine Rating (Points)",
    y = "Sentiment Score"
  )

# Per-cluster averages and sizes
cluster_summary <- wine_data_clean %>%
  filter(!is.na(cluster)) %>%
  group_by(cluster) %>%
  summarize(
    average_points = mean(points),
    average_sentiment = mean(sentiment_score),
    count = n()
  )
print(cluster_summary)

# The `topic` column is already set in Task C. The former statement
# `wine_data_clean$topic <- topics` assigned the function object itself
# (not its result) to the column, which is an error, so it was removed.

write.csv(wine_data_clean, "cleaned_wine_data.csv")