## Task A: Text Mining
We begin by preparing the wine descriptions for analysis by cleaning the text. This includes converting to lowercase, removing punctuation, numbers, and stopwords, and performing other standard text cleaning steps.
load_library <- function(pkg) {
suppressWarnings(library(pkg, character.only = TRUE))
}
load_library("readr")
load_library("tm")
load_library("tidytext")
load_library("wordcloud")
load_library("RColorBrewer")
load_library("dplyr")
wines_dataset <- read_csv("C:/Users/SA1335/OneDrive - Canterbury Christ Church University/Desktop/R Assignment/wines_dataset.csv")
## New names:
## Rows: 129971 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (11): country, description, designation, province, region_1, region_2, t... dbl
## (3): ...1, points, price
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
head(wines_dataset)
## # A tibble: 6 × 14
## ...1 country description designation points price province region_1 region_2
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 0 Italy Aromas inc… Vulkà Bian… 87 NA Sicily … Etna <NA>
## 2 1 Portugal This is ri… Avidagos 87 15 Douro <NA> <NA>
## 3 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 4 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 5 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 6 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## # ℹ 5 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, winery <chr>
wines_dataset$clean_description <- tolower(wines_dataset$description) # Convert to lowercase
wines_dataset$clean_description <- removePunctuation(wines_dataset$clean_description) # Remove punctuation
wines_dataset$clean_description <- removeNumbers(wines_dataset$clean_description) # Remove numbers
wines_dataset$clean_description <- removeWords(wines_dataset$clean_description, stopwords("en")) # Remove stopwords
head(wines_dataset$clean_description)
## [1] "aromas include tropical fruit broom brimstone dried herb palate isnt overly expressive offering unripened apple citrus dried sage alongside brisk acidity"
## [2] " ripe fruity wine smooth still structured firm tannins filled juicy red berry fruits freshened acidity already drinkable although will certainly better "
## [3] "tart snappy flavors lime flesh rind dominate green pineapple pokes crisp acidity underscoring flavors wine stainlesssteel fermented"
## [4] "pineapple rind lemon pith orange blossom start aromas palate bit opulent notes honeydrizzled guava mango giving way slightly astringent semidry finish"
## [5] "much like regular bottling comes across rather rough tannic rustic earthy herbal characteristics nonetheless think pleasantly unfussy country wine good companion hearty winter stew"
## [6] "blackberry raspberry aromas show typical navarran whiff green herbs case horseradish mouth fairly full bodied tomatoey acidity spicy herbal flavors complement dark plum fruit finish fresh grabby"
Now that we’ve cleaned the text, we’ll generate a word cloud to visualize the most frequent terms in the descriptions.
clean_text <- tolower(wines_dataset$description) # Convert to lowercase
clean_text <- removePunctuation(clean_text) # Remove punctuation
clean_text <- removeWords(clean_text, stopwords("en")) # Remove stopwords
clean_text <- stripWhitespace(clean_text) # Remove extra whitespace
clean_text <- clean_text[!is.na(clean_text) & clean_text != ""]
wordcloud(clean_text, max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
We’ll also extract the frequency of each word and plot a bar chart of the most common words in the wine descriptions.
word_freq <- wines_dataset %>%
unnest_tokens(word, clean_description) %>%
count(word, sort = TRUE)
head(word_freq, 10)
## # A tibble: 10 × 2
## word n
## <chr> <int>
## 1 wine 78215
## 2 flavors 62782
## 3 fruit 45152
## 4 aromas 39638
## 5 palate 38105
## 6 acidity 34984
## 7 finish 34968
## 8 tannins 30875
## 9 drink 29982
## 10 cherry 27403
## Task B: Sentiment Analysis
We will use the Bing lexicon for sentiment analysis to classify words as either positive or negative. Then, we’ll aggregate sentiment scores for each wine description. # Load sentiment lexicon
library(tidytext)
sentiments <- get_sentiments("bing")
sentiment_scores <- wines_dataset %>%
unnest_tokens(word, clean_description) %>%
inner_join(sentiments, by = "word") %>%
count(sentiment)
sentiment_scores
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 103822
## 2 positive 383924
Let’s visualize the distribution of sentiment in the wine descriptions using a bar plot.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(sentiment_scores, aes(x = sentiment, y = n, fill = sentiment)) +
geom_bar(stat = "identity") +
labs(title = "Sentiment Distribution in Wine Descriptions") +
theme_minimal()
We can break down sentiment scores by wine variety to see if certain types of wines are associated with more positive or negative sentiments.
library(dplyr) library(tidyr) library(ggplot2)
sentiment_by_variety <- wines_dataset %>%
unnest_tokens(word, clean_description) %>%
inner_join(sentiments, by = "word") %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(variety, sentiment) %>%
summarise(count = n(), .groups = 'drop')
print(head(sentiment_by_variety))
## # A tibble: 6 × 3
## variety sentiment count
## <chr> <chr> <int>
## 1 Abouriou negative 1
## 2 Abouriou positive 5
## 3 Agiorgitiko negative 37
## 4 Agiorgitiko positive 231
## 5 Aglianico negative 417
## 6 Aglianico positive 868
sentiment_by_variety <- sentiment_by_variety %>%
mutate(
positive_count = ifelse(sentiment == "positive", count, 0),
negative_count = ifelse(sentiment == "negative", count, 0)
) %>%
group_by(variety) %>%
summarise(
positive = sum(positive_count),
negative = sum(negative_count)
) %>%
mutate(sentiment_difference = positive - negative)
ggplot(sentiment_by_variety, aes(x = reorder(variety, sentiment_difference), y = sentiment_difference, fill = variety)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(
title = "Sentiment Difference by Wine Variety",
x = "Wine Variety",
y = "Sentiment Difference"
) +
theme_minimal()
# Step 4: Top Positive and Negative Words Visualize the top words
contributing to positive and negative sentiments separately.
positive_words <- wines_dataset %>%
unnest_tokens(word, clean_description) %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "positive"), by = "word") %>%
count(word, sort = TRUE) %>%
top_n(10)
## Selecting by n
negative_words <- wines_dataset %>%
unnest_tokens(word, clean_description) %>%
inner_join(get_sentiments("bing") %>% filter(sentiment == "negative"), by = "word") %>%
count(word, sort = TRUE) %>%
top_n(10)
## Selecting by n
ggplot(positive_words, aes(x = reorder(word, n), y = n)) +
geom_bar(stat = "identity", fill = "green") +
coord_flip() +
labs(title = "Top Positive Words in Wine Descriptions", x = "Word", y = "Frequency") +
theme_minimal()
# Plot top negative words
ggplot(negative_words, aes(x = reorder(word, n), y = n)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(title = "Top Negative Words in Wine Descriptions", x = "Word", y = "Frequency") +
theme_minimal()
## Task C: Topic Modelling
We’ll create a Document-Term Matrix (DTM) from the cleaned descriptions for topic modelling.
install.packages(“topicmodels”) library(topicmodels)
dtm <- DocumentTermMatrix(Corpus(VectorSource(wines_dataset$clean_description)))
dtm
## <<DocumentTermMatrix (documents: 129971, terms: 44253)>>
## Non-/sparse entries: 3144614/5748462049
## Sparsity : 100%
## Maximal term length: 47
## Weighting : term frequency (tf)
We’ll apply LDA to find the hidden topics in the wine descriptions.
load_library <- function(pkg) {
suppressWarnings(library(pkg, character.only = TRUE)) # Suppresses warnings
}
# Load required libraries
load_library("ggplot2")
load_library("dplyr")
load_library("tm")
load_library("topicmodels")
load_library("tidytext")
load_library("reshape2")
load_library("tibble")
load_library("tidyr")
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
load_library("purrr")
load_library("stringr")
load_library("forcats")
load_library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
lda <- LDA(dtm, k = 5, control = list(seed = 1234)) # k = number of topics
Extract the beta matrix, which contains the probability of terms in each topic, and display the top terms for each topic.
lda_topics <- tidy(lda, matrix = "beta")
print(head(lda_topics))
## # A tibble: 6 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 acidity 0.0130
## 2 2 acidity 0.0221
## 3 3 acidity 0.00220
## 4 4 acidity 0.00651
## 5 5 acidity 0.0103
## 6 1 alongside 0.00149
We will visualize the top terms for each topic using ggplot2. # Visualize top words for each topic
lda_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>% # Select top 10 words per topic
ungroup() %>%
ggplot(aes(reorder_within(term, beta, topic), beta, fill = factor(topic))) +
geom_bar(stat = "identity", show.legend = FALSE) +
coord_flip() +
scale_x_reordered() +
facet_wrap(~ topic, scales = "free", ncol = 2) +
labs(
title = "Top Words for Each Topic",
x = "Terms",
y = "Beta"
) +
theme_minimal()
## Task D: Further Exploration
We can explore the relationship between wine price and rating points to identify trends. # Plot Price vs. Points
ggplot(wines_dataset, aes(x = price, y = points, color = country)) +
geom_point() +
labs(title = "Price vs. Points by Country") +
theme_minimal()
## Warning: Removed 8996 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Step 2: Predicting Price with Linear Regression We’ll apply a simple
linear regression model to predict wine price based on points. # Apply
linear regression
model <- lm(price ~ points, data = wines_dataset)
summary(model)
##
## Call:
## lm(formula = price ~ points, data = wines_dataset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.9 -15.0 -5.4 7.2 3267.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -460.46243 3.11657 -147.7 <2e-16 ***
## points 5.60750 0.03523 159.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37.3 on 120973 degrees of freedom
## (8996 observations deleted due to missingness)
## Multiple R-squared: 0.1732, Adjusted R-squared: 0.1732
## F-statistic: 2.534e+04 on 1 and 120973 DF, p-value: < 2.2e-16
ggplot(wines_dataset, aes(x = points, y = price)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
labs(title = "Price Prediction Based on Points") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8996 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8996 rows containing missing values or values outside the scale range
## (`geom_point()`).
We can also explore other relationships, like the distribution of prices by wine variety. # Plot price by wine variety
ggplot(wines_dataset, aes(x = variety, y = price, fill = variety)) +
geom_boxplot() +
labs(title = "Price Distribution by Wine Variety") +
theme_minimal()
## Warning: Removed 8996 rows containing non-finite outside the scale range
## (`stat_boxplot()`).