## Task A: Text Mining

Step 1: Data Preprocessing (Text Cleaning)

We begin by preparing the wine descriptions for analysis by cleaning the text. This includes converting to lowercase, removing punctuation, numbers, and stopwords, and performing other standard text cleaning steps.

Load necessary libraries

load_library <- function(pkg) {
  suppressWarnings(library(pkg, character.only = TRUE))
}

load_library("readr")
load_library("tm")
load_library("tidytext")
load_library("wordcloud")
load_library("RColorBrewer")
load_library("dplyr")

Load the wines dataset

wines_dataset <- read_csv("C:/Users/SA1335/OneDrive - Canterbury Christ Church University/Desktop/R Assignment/wines_dataset.csv")
## New names:
## Rows: 129971 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (11): country, description, designation, province, region_1, region_2, t... dbl
## (3): ...1, points, price
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Check the first few rows of the dataset

head(wines_dataset)  
## # A tibble: 6 × 14
##    ...1 country  description designation points price province region_1 region_2
##   <dbl> <chr>    <chr>       <chr>        <dbl> <dbl> <chr>    <chr>    <chr>   
## 1     0 Italy    Aromas inc… Vulkà Bian…     87    NA Sicily … Etna     <NA>    
## 2     1 Portugal This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
## 3     2 US       Tart and s… <NA>            87    14 Oregon   Willame… Willame…
## 4     3 US       Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
## 5     4 US       Much like … Vintner's …     87    65 Oregon   Willame… Willame…
## 6     5 Spain    Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
## # ℹ 5 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## #   title <chr>, variety <chr>, winery <chr>

Clean the description column

wines_dataset$clean_description <- tolower(wines_dataset$description)  # Convert to lowercase
wines_dataset$clean_description <- removePunctuation(wines_dataset$clean_description)  # Remove punctuation
wines_dataset$clean_description <- removeNumbers(wines_dataset$clean_description)  # Remove numbers
wines_dataset$clean_description <- removeWords(wines_dataset$clean_description, stopwords("en"))  # Remove stopwords

View cleaned descriptions

head(wines_dataset$clean_description)
## [1] "aromas include tropical fruit broom brimstone  dried herb  palate isnt overly expressive offering unripened apple citrus  dried sage alongside brisk acidity"                                                     
## [2] "  ripe  fruity  wine   smooth  still structured firm tannins  filled   juicy red berry fruits  freshened  acidity   already drinkable although  will certainly  better  "                                         
## [3] "tart  snappy  flavors  lime flesh  rind dominate  green pineapple pokes   crisp acidity underscoring  flavors  wine   stainlesssteel fermented"                                                                   
## [4] "pineapple rind lemon pith  orange blossom start   aromas  palate   bit  opulent  notes  honeydrizzled guava  mango giving way   slightly astringent semidry finish"                                               
## [5] "much like  regular bottling    comes across  rather rough  tannic  rustic earthy herbal characteristics nonetheless   think     pleasantly unfussy country wine   good companion   hearty winter stew"            
## [6] "blackberry  raspberry aromas show  typical navarran whiff  green herbs    case horseradish   mouth   fairly full bodied  tomatoey acidity spicy herbal flavors complement dark plum fruit   finish  fresh  grabby"

Step 2: Word Cloud of Most Frequent Words

Now that we’ve cleaned the text, we’ll generate a word cloud to visualize the most frequent terms in the descriptions.

Preprocess the text for the word cloud

clean_text <- tolower(wines_dataset$description)               # Convert to lowercase
clean_text <- removePunctuation(clean_text)                   # Remove punctuation
clean_text <- removeWords(clean_text, stopwords("en"))        # Remove stopwords
clean_text <- stripWhitespace(clean_text)                     # Remove extra whitespace

Remove missing or empty values

clean_text <- clean_text[!is.na(clean_text) & clean_text != ""]

Generate the word cloud

wordcloud(clean_text, max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Step 3: Word Frequency Distribution

We’ll also extract the frequency of each word and plot a bar chart of the most common words in the wine descriptions.

Unnest the cleaned descriptions into words and count frequency

word_freq <- wines_dataset %>%
  unnest_tokens(word, clean_description) %>%
  count(word, sort = TRUE)

Display the top 10 most frequent words

head(word_freq, 10)
## # A tibble: 10 × 2
##    word        n
##    <chr>   <int>
##  1 wine    78215
##  2 flavors 62782
##  3 fruit   45152
##  4 aromas  39638
##  5 palate  38105
##  6 acidity 34984
##  7 finish  34968
##  8 tannins 30875
##  9 drink   29982
## 10 cherry  27403
                                    ## Task B: Sentiment Analysis

Step 1: Sentiment Classification

We will use the Bing lexicon for sentiment analysis to classify words as either positive or negative. Then, we’ll aggregate sentiment scores for each wine description. # Load sentiment lexicon

library(tidytext)
sentiments <- get_sentiments("bing")

Perform sentiment analysis on the cleaned descriptions

sentiment_scores <- wines_dataset %>%
  unnest_tokens(word, clean_description) %>%
  inner_join(sentiments, by = "word") %>%
  count(sentiment)

Display sentiment counts

sentiment_scores
## # A tibble: 2 × 2
##   sentiment      n
##   <chr>      <int>
## 1 negative  103822
## 2 positive  383924

Step 2: Visualize Sentiment Distribution

Let’s visualize the distribution of sentiment in the wine descriptions using a bar plot.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

Plot sentiment distribution

ggplot(sentiment_scores, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(title = "Sentiment Distribution in Wine Descriptions") +
  theme_minimal()

Step 3: Sentiment by Wine Varieties

We can break down sentiment scores by wine variety to see if certain types of wines are associated with more positive or negative sentiments.

library(dplyr) library(tidyr) library(ggplot2)

Sentiment by wine variety (manual reshaping)

sentiment_by_variety <- wines_dataset %>%
  unnest_tokens(word, clean_description) %>%
  inner_join(sentiments, by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  group_by(variety, sentiment) %>%
  summarise(count = n(), .groups = 'drop')

Create separate columns for positive and negative counts

sentiment_by_variety <- sentiment_by_variety %>%
  mutate(
    positive_count = ifelse(sentiment == "positive", count, 0),
    negative_count = ifelse(sentiment == "negative", count, 0)
  ) %>%
  group_by(variety) %>%
  summarise(
    positive = sum(positive_count),
    negative = sum(negative_count)
  ) %>%
  mutate(sentiment_difference = positive - negative)

Plot sentiment by wine variety

ggplot(sentiment_by_variety, aes(x = reorder(variety, sentiment_difference), y = sentiment_difference, fill = variety)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Sentiment Difference by Wine Variety",
    x = "Wine Variety",
    y = "Sentiment Difference"
  ) +
  theme_minimal()

# Step 4: Top Positive and Negative Words Visualize the top words contributing to positive and negative sentiments separately.

Top positive words

positive_words <- wines_dataset %>%
  unnest_tokens(word, clean_description) %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "positive"), by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## Selecting by n

Top negative words

negative_words <- wines_dataset %>%
  unnest_tokens(word, clean_description) %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "negative"), by = "word") %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## Selecting by n

Plot top positive words

ggplot(positive_words, aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity", fill = "green") +
  coord_flip() +
  labs(title = "Top Positive Words in Wine Descriptions", x = "Word", y = "Frequency") +
  theme_minimal()

# Plot top negative words

ggplot(negative_words, aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(title = "Top Negative Words in Wine Descriptions", x = "Word", y = "Frequency") +
  theme_minimal()

                                                  ## Task C: Topic Modelling

Step 1: Prepare Document-Term Matrix (DTM)

We’ll create a Document-Term Matrix (DTM) from the cleaned descriptions for topic modelling.

install.packages(“topicmodels”) library(topicmodels)

Create Document-Term Matrix (DTM)

dtm <- DocumentTermMatrix(Corpus(VectorSource(wines_dataset$clean_description)))

Check DTM structure

dtm
## <<DocumentTermMatrix (documents: 129971, terms: 44253)>>
## Non-/sparse entries: 3144614/5748462049
## Sparsity           : 100%
## Maximal term length: 47
## Weighting          : term frequency (tf)

Step 2: Apply Latent Dirichlet Allocation (LDA)

We’ll apply LDA to find the hidden topics in the wine descriptions.

Load necessary libraries

load_library <- function(pkg) {
  suppressWarnings(library(pkg, character.only = TRUE))  # Suppresses warnings
}

# Load required libraries
load_library("ggplot2")
load_library("dplyr")
load_library("tm")
load_library("topicmodels")
load_library("tidytext")
load_library("reshape2")
load_library("tibble")
load_library("tidyr")
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
load_library("purrr")
load_library("stringr")
load_library("forcats")
load_library("lubridate")
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Apply LDA

lda <- LDA(dtm, k = 5, control = list(seed = 1234))  # k = number of topics

Step 3: Extract and Display Topics

Extract the beta matrix, which contains the probability of terms in each topic, and display the top terms for each topic.

Extract topics using tidytext

lda_topics <- tidy(lda, matrix = "beta")

View topics data

print(head(lda_topics))
## # A tibble: 6 × 3
##   topic term         beta
##   <int> <chr>       <dbl>
## 1     1 acidity   0.0130 
## 2     2 acidity   0.0221 
## 3     3 acidity   0.00220
## 4     4 acidity   0.00651
## 5     5 acidity   0.0103 
## 6     1 alongside 0.00149

Step 4: Visualize Topics

We will visualize the top terms for each topic using ggplot2. # Visualize top words for each topic

lda_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%  # Select top 10 words per topic
  ungroup() %>%
  ggplot(aes(reorder_within(term, beta, topic), beta, fill = factor(topic))) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~ topic, scales = "free", ncol = 2) +
  labs(
    title = "Top Words for Each Topic",
    x = "Terms",
    y = "Beta"
  ) +
  theme_minimal()

                                            ## Task D: Further Exploration

Step 1: Price vs. Points

We can explore the relationship between wine price and rating points to identify trends. # Plot Price vs. Points

ggplot(wines_dataset, aes(x = price, y = points, color = country)) +
  geom_point() +
  labs(title = "Price vs. Points by Country") +
  theme_minimal()
## Warning: Removed 8996 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Step 2: Predicting Price with Linear Regression We’ll apply a simple linear regression model to predict wine price based on points. # Apply linear regression

model <- lm(price ~ points, data = wines_dataset)
summary(model)
## 
## Call:
## lm(formula = price ~ points, data = wines_dataset)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -57.9  -15.0   -5.4    7.2 3267.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -460.46243    3.11657  -147.7   <2e-16 ***
## points         5.60750    0.03523   159.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.3 on 120973 degrees of freedom
##   (8996 observations deleted due to missingness)
## Multiple R-squared:  0.1732, Adjusted R-squared:  0.1732 
## F-statistic: 2.534e+04 on 1 and 120973 DF,  p-value: < 2.2e-16

Plot predictions

ggplot(wines_dataset, aes(x = points, y = price)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  labs(title = "Price Prediction Based on Points") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8996 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8996 rows containing missing values or values outside the scale range
## (`geom_point()`).

Step 3: Further Exploration of Other Variables

We can also explore other relationships, like the distribution of prices by wine variety. # Plot price by wine variety

ggplot(wines_dataset, aes(x = variety, y = price, fill = variety)) +
  geom_boxplot() +
  labs(title = "Price Distribution by Wine Variety") +
  theme_minimal()
## Warning: Removed 8996 rows containing non-finite outside the scale range
## (`stat_boxplot()`).