library(dplyr)
library(ggplot2)
library(ggjoy)
library(gridExtra)
library(ggridges)
full_data <- read.csv("winemag-data-130k-v2.csv")

# Remove any NAs
full_data <- full_data[!is.na(full_data$price), ]

#Exploratory Data Analysis

str(full_data)
## 'data.frame':    120975 obs. of  14 variables:
##  $ X                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ country              : Factor w/ 44 levels "","Argentina",..: 33 44 44 44 39 24 17 19 17 44 ...
##  $ description          : Factor w/ 119955 levels "\"Chremisa,\" the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has t"| __truncated__,..: 99131 75516 60556 55752 22180 43058 89270 67633 91169 71422 ...
##  $ designation          : Factor w/ 37980 levels "","'61 Rosé",..: 2404 1 28205 36717 2049 3107 1 30993 20073 23124 ...
##  $ points               : int  87 87 87 87 87 87 87 87 87 87 ...
##  $ price                : num  15 14 13 65 15 16 24 12 27 19 ...
##  $ province             : Factor w/ 426 levels "","Österreichischer Perlwein",..: 115 274 225 274 268 340 17 314 17 57 ...
##  $ region_1             : Factor w/ 1230 levels "","Abruzzo","Adelaida District",..: 1 1219 552 1219 759 1206 23 1 23 755 ...
##  $ region_2             : Factor w/ 18 levels "","California Other",..: 1 18 1 18 1 1 1 1 1 8 ...
##  $ taster_name          : Factor w/ 20 levels "","Alexander Peartree",..: 17 16 2 16 14 11 17 3 17 20 ...
##  $ taster_twitter_handle: Factor w/ 16 levels "","@AnneInVino",..: 12 9 1 9 14 6 12 1 12 11 ...
##  $ title                : Factor w/ 118840 levels ":Nota Bene 2005 Una Notte Red (Washington)",..: 89460 89984 101062 102995 103761 105815 108734 54697 59245 62596 ...
##  $ variety              : Factor w/ 708 levels "","Çalkarası",..: 454 441 483 445 593 191 213 213 441 88 ...
##  $ winery               : Factor w/ 16757 levels ":Nota Bene","1+1=3",..: 12989 13063 14433 14665 14742 15048 15436 8443 9000 9342 ...
# Where is the incomplete data (NAs)?
print(colSums(is.na(full_data)))
##                     X               country           description 
##                     0                     0                     0 
##           designation                points                 price 
##                     0                     0                     0 
##              province              region_1              region_2 
##                     0                     0                     0 
##           taster_name taster_twitter_handle                 title 
##                     0                     0                     0 
##               variety                winery 
##                     0                     0
      # There are nearly 9,000 prices missing

# Now we will remove these NAs
full_data <- full_data[!is.na(full_data$price), ]

# Let's inspect the amount of unique observations we have in each variable
unique_vals <- lapply(full_data, unique)
sapply(unique_vals, length)
##                     X               country           description 
##                120975                    43                111567 
##           designation                points                 price 
##                 35777                    21                   390 
##              province              region_1              region_2 
##                   423                  1205                    18 
##           taster_name taster_twitter_handle                 title 
##                    20                    16                110638 
##               variety                winery 
##                   698                 15855

Some notes on the dataset:

There were 20 unique tasters providing the reviews                    
The wines in the list come from 43 different countries and 423 different provinces                  
There were 698 unique varieties and 15,855 wineries represented                       
                       

Numerical Variable Analysis

Points awarded

summary(full_data$points)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   80.00   86.00   88.00   88.42   91.00  100.00

How many wines score a perfect score?

sum(full_data$points == 100)
## [1] 19
ggplot(data = full_data, aes(x= points, colour = I('black'), fill = I('#099DD9')))+
  geom_histogram(binwidth = 1)+
  labs(x = "Points", y= "Frequency", title = "Distribution of points")

The points given to the wines reviewed ranged from 80 - 100 points, with a mean of 88.42 and median of 88. There are 19 wines in the list that have scored a perfect score of 100. The points given to the wines are normally distributed.

Wine prices

summary(full_data$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   17.00   25.00   35.36   42.00 3300.00
ggplot(data = full_data, aes(x= price, colour = I('black'), fill = I('#099DD9')))+
  geom_histogram()+
  labs(x = "Price", y= "Frequency", title = "Distribution of prices") #Strongly right skewed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Log
ggplot(data = full_data, aes(x= log(price), colour = I('black'), fill = I('#099DD9')))+
  geom_histogram()+
  labs(x = "log(Price)", y= "Frequency", title = "Distribution of log(prices)") #slightly right skewed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Wine prices are certainly not normally distributed! Taking a log of the price goes a long way to addressing this.

Top 30 varieties reviewed

top_30df <- full_data %>%
  group_by(variety) %>%
  summarise(count = n())%>%
  arrange(desc(count))

top_30df <- top_30df[1:30,1:2]

top_30df
## # A tibble: 30 x 2
##    variety                  count
##    <fct>                    <int>
##  1 Pinot Noir               12787
##  2 Chardonnay               11080
##  3 Cabernet Sauvignon        9386
##  4 Red Blend                 8476
##  5 Bordeaux-style Red Blend  5340
##  6 Riesling                  4972
##  7 Sauvignon Blanc           4783
##  8 Syrah                     4086
##  9 Rosé                     3262
## 10 Merlot                    3062
## # ... with 20 more rows
top_30 <- top_30df$variety  

new_data <- subset(full_data, variety %in% top_30)

The most reviewd 30 varieties are displayed above. I will limit the dataset that I will perform the rest of the EDA on to these varieties only.

# Create a new variable to indicate whether wine is white or red
new_data$wine_type <- ifelse(new_data$variety == "Chardonnay" | new_data$variety == "Riesling" | new_data$variety == "Sauvignon Blanc" | new_data$variety == "White Blend" | new_data$variety == "Sparkling Blend" | new_data$variety == "Pinot Gris" | new_data$variety == "Champagne Blend" | new_data$variety == "Grüner Veltliner" | new_data$variety == "Pinot Grigio" | new_data$variety == "Portuguese White" | new_data$variety == "Viognier" | new_data$variety == "Gewürztraminer" | new_data$variety == "Gewürztraminer", "White Wine", "Red Wine")

I will create a new variable to use in my analysis - is the wine white or red? Here I have included Sparkling Blend and Champagne Blend as White Wine, however this will probably be better let out as it is fairly different. Additionally, Rose has been included as a Red Wine.

new_data %>%
  group_by(variety, wine_type) %>%
  summarise(n=n(),
            avg_score = mean(points),
            avg_price = mean(price)) %>%
  ggplot(aes(x=avg_price, y= avg_score, size = n, colour = wine_type))+
  geom_point()+
  scale_color_manual(values = c("#CC3300", "#FFCC00"))

Distribution of points awarded for the most reviewed

How are the points a wine receives distributed for each of the varieties?

p1 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=points, y=variety))+
  geom_joy2(bandwidth = 0.539, fill = "#CC3300")

p2 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=points, y=variety))+
  geom_joy2(bandwidth = 0.539, fill = "#FFCC00")

grid.arrange(p1, p2, nrow = 1)

Distribution of price awarded for the most reviewed

How does the price of a wine vary given its variety?

p3 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=log(price), y=variety))+
  geom_joy2(bandwidth = 0.103, fill = "#CC3300")

p4 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=log(price), y=variety))+
  geom_joy2(bandwidth = 0.103, fill = "#FFCC00")

grid.arrange(p3, p4, nrow=1)

Is there a relationship between the points given and the wine’s price?

p5 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=points, y= price))+
  geom_point(colour="#CC3300")+
  scale_y_log10()+
  geom_smooth()

p6 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=points, y= price))+
  geom_point(colour="#FFCC00")+
  scale_y_log10()+
  geom_smooth()

grid.arrange(p5, p6, nrow=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Correlation

cor(log(new_data$price), new_data$points)
## [1] 0.6149412

There is a fairly positive correlation between log(price) and points with a correlation of 0.6149.

Price_points_reg <- lm(log(price) ~ points + wine_type, data = new_data)
summary(Price_points_reg)
## 
## Call:
## lm(formula = log(price) ~ points + wine_type, data = new_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8023 -0.3629 -0.0421  0.3181  4.7430 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -8.0680575  0.0468514 -172.21   <2e-16 ***
## points               0.1298493  0.0005282  245.84   <2e-16 ***
## wine_typeWhite Wine -0.2255564  0.0034664  -65.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5113 on 98236 degrees of freedom
## Multiple R-squared:  0.4038, Adjusted R-squared:  0.4038 
## F-statistic: 3.327e+04 on 2 and 98236 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(Price_points_reg)

Using lm to run a multiple regression model to predict a wines price, using points and wine_type as predictor variables yields an adjusted R-squared of 0.4058. The higher the points awarded, the higher the predicted price of the wine, while white wines are valued lower than their red counterparts.

#Words Analysis Points Prediction

top_30 <- top_30df$variety  

new_data <- subset(full_data, variety %in% top_30)

new_data$wine_type <- ifelse(new_data$variety == "Chardonnay" | new_data$variety == "Riesling" | new_data$variety == "Sauvignon Blanc" | new_data$variety == "White Blend" | new_data$variety == "Sparkling Blend" | new_data$variety == "Pinot Gris" | new_data$variety == "Champagne Blend" | new_data$variety == "Grüner Veltliner" | new_data$variety == "Pinot Grigio" | new_data$variety == "Portuguese White" | new_data$variety == "Viognier" | new_data$variety == "Gewürztraminer" | new_data$variety == "Gewürztraminer", "White Wine", "Red Wine")

Word count analysis on how many words are in each review

new_data$wordcount <- sapply(gregexpr("\\S+", new_data$description), length)

summary(new_data$wordcount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   33.00   40.00   40.75   48.00  135.00

Description with the highest wordcount

new_data$description[which(new_data$wordcount == 135)]
## [1] This very fine Cabernet wants a little time in the cellar. Right now, it's tight in tannins, with some acidic bitterness in the finish. The flavors are of black currants and smoky new oak. The Morisoli Vineyard has been home to very good, ageable bottlings from the likes of Sequoia Grove and Elyse, but in the last few years, Meander has expressed its terroir best. Try after 2012.This very fine Cabernet wants a little time in the cellar. Right now, it's tight in tannins, with some acidic bitterness in the finish. The flavors are of black currants and smoky new oak. The Morisoli Vineyard has been home to very good, ageable bottlings from the likes of Sequoia Grove and Elyse, but in the last few years, Meander has expressed its terroir best. Try after 2012.
## 119955 Levels: "Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has tight, tangy apple-driven acidity, with a bright, light, citrusy character. Not for aging. ...

Description with the lowest wordcount

new_data$description[which(new_data$wordcount == 3)]
## [1] Imported by Kobrand.
## 119955 Levels: "Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has tight, tangy apple-driven acidity, with a bright, light, citrusy character. Not for aging. ...

Wordcount distribution

ggplot(data = new_data, aes(x= wordcount))+
  geom_histogram(binwidth = 3)+
  labs(x = "Word Count", y= "Frequency", title = "Distribution of word count of description")

ggplot(data = new_data, aes(x= wordcount, y= wine_type, fill = wine_type))+
  geom_density_ridges ()+
  labs(x = "Word Count", title = "Distribution of word count of description")+
  scale_fill_cyclical(values = c("#CC3300", "#FFCC00"))
## Picking joint bandwidth of 1.09

ggplot(data = new_data, aes(x=variety, y=wordcount))+
  geom_boxplot()+
  coord_flip()+
  labs(title = "Wordcount Distribution by Variety", x= "Variety", y= "Word Count")

There is a fair bit of variability between the different varieties and the description word count. Pinot Grigio has the lowest median of the white wines, while Rose has the lowest of the red wines.

Is there a correlation between word count and the score?

ggplot(data = new_data, aes(x=wordcount, y=points))+
  geom_point()

cor(new_data$points, new_data$wordcount)
## [1] 0.5450367

There is a positive correlation between the number of words used in the description and the wines score (0.5450). Word Analysis

Performing analysis on the below three wine varieties show us the most commonly used words to describe the varieties. Certain words were removed from the analysis. The name of the wine was removed from the analysis, as were the words “wine” “flavors” and “drink”.
Pinot Noir

# Load
library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library("RColorBrewer")


#---------- Pinot Noir Word cloud ----------#

pinot <- subset(new_data, variety == "Pinot Noir")

descriptors <- Corpus(VectorSource(pinot$description))
head(descriptors)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 6
# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "pinot", "noir", "drink", "flavors")) 
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine", "pinot", :
## transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)

# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
##                word freq
## cherry       cherry 5416
## fruit         fruit 5010
## acidity     acidity 3335
## tannins     tannins 2939
## finish       finish 2922
## red             red 2740
## palate       palate 2664
## black         black 2571
## raspberry raspberry 2408
## oak             oak 2182
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Cabernet Sauvignon Words

#---------- Cabernet Sauvignon Word cloud ----------#

cabsav <- subset(new_data, variety == "Cabernet Sauvignon")

descriptors <- Corpus(VectorSource(cabsav$description))


# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "drink", "cabernet", "flavors")) 
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine", "drink", :
## transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)

# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
##                  word freq
## tannins       tannins 3610
## black           black 3368
## fruit           fruit 3139
## cherry         cherry 2846
## finish         finish 2711
## aromas         aromas 2564
## oak               oak 2527
## blackberry blackberry 2389
## palate         palate 2104
## cassis         cassis 1678
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : structured could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : toasty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : finishes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : bodied could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pretty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : spices could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : develop could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : vineyards could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : drinking could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : powerful could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : way could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : wood could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : easy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : medium could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : baked could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : layers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : followed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : enough could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : needs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : hints could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : body could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : seems could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : velvety could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : anise could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : cinnamon could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : integrated could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : scents could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : malbec could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : roasted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pencil could not be fit on page. It will not be plotted.

Chardonnay Words

#---------- Chardonnay Noir Word cloud ----------#

chardy <- subset(new_data, variety == "Chardonnay")

descriptors <- Corpus(VectorSource(chardy$description))

# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "chardonnay", "drink", "flavors")) 
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine",
## "chardonnay", : transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)

# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
##            word freq
## acidity acidity 4000
## apple     apple 3577
## fruit     fruit 3513
## finish   finish 2721
## oak         oak 2710
## palate   palate 2527
## ripe       ripe 2487
## aromas   aromas 2363
## pear       pear 2066
## lemon     lemon 2009
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : concentrated could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pineapples could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : steely could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : peaches could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : complexity could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : nectarine could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : zesty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : layers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : aftertaste could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : makes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : banana could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : popcorn could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : honeysuckle could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : offering could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : enough could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : citrusy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : blossom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : color could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : structured could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pretty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : brings could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : concentration could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : opulent could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : scents could not be fit on page. It will not be plotted.

#Regression model to predict the score a wine receives Model 1:

Using wordcount to predict points

word_reg <- lm(points ~ wordcount, data = new_data)
summary(word_reg)
## 
## Call:
## lm(formula = points ~ wordcount, data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.2893  -1.7557  -0.0108   1.8182  11.3078 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.244e+01  3.092e-02  2666.1   <2e-16 ***
## wordcount   1.490e-01  7.311e-04   203.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.593 on 98237 degrees of freedom
## Multiple R-squared:  0.2971, Adjusted R-squared:  0.2971 
## F-statistic: 4.152e+04 on 1 and 98237 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(word_reg)

With a significant p-value, wordcount is a significant predictor of points. The linear model is significant, and has an adjusted-R^2 of 0.2971. Model 2:

Using wordcount and price to predict points

full_reg <- lm(points ~ wordcount + log(price), data = new_data)
summary(full_reg)
## 
## Call:
## lm(formula = points ~ wordcount + log(price), data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.7600  -1.4012   0.1117   1.5298   8.2316 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.677e+01  3.839e-02  1999.5   <2e-16 ***
## wordcount   1.026e-01  6.574e-04   156.1   <2e-16 ***
## log(price)  2.256e+00  1.123e-02   200.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.183 on 98236 degrees of freedom
## Multiple R-squared:  0.5017, Adjusted R-squared:  0.5017 
## F-statistic: 4.945e+04 on 2 and 98236 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(full_reg)

Model 3

Using wordcount, price and variety to predict points

full_reg_varieties <- lm(points ~ wordcount + log(price) + variety, data = new_data)
summary(full_reg_varieties)
## 
## Call:
## lm(formula = points ~ wordcount + log(price) + variety, data = new_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3248  -1.3523   0.1048   1.4742   7.4753 
## 
## Coefficients:
##                                Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                   76.093964   0.050856 1496.262  < 2e-16 ***
## wordcount                      0.102021   0.000645  158.176  < 2e-16 ***
## log(price)                     2.403076   0.012008  200.126  < 2e-16 ***
## varietyCabernet Franc         -0.351370   0.065415   -5.371 7.83e-08 ***
## varietyCabernet Sauvignon     -0.364018   0.036325  -10.021  < 2e-16 ***
## varietyChampagne Blend        -0.509277   0.067728   -7.519 5.55e-14 ***
## varietyChardonnay              0.358511   0.035394   10.129  < 2e-16 ***
## varietyGamay                   1.122329   0.079020   14.203  < 2e-16 ***
## varietyGewürztraminer         0.794618   0.074495   10.667  < 2e-16 ***
## varietyGrüner Veltliner       2.252202   0.069081   32.603  < 2e-16 ***
## varietyMalbec                 -0.059026   0.050848   -1.161 0.245712    
## varietyMerlot                 -0.353973   0.048187   -7.346 2.06e-13 ***
## varietyNebbiolo                0.194600   0.053006    3.671 0.000241 ***
## varietyPetite Sirah           -0.164478   0.081759   -2.012 0.044249 *  
## varietyPinot Grigio            0.162049   0.073476    2.205 0.027423 *  
## varietyPinot Gris              1.110725   0.063974   17.362  < 2e-16 ***
## varietyPinot Noir              0.109912   0.034584    3.178 0.001483 ** 
## varietyPortuguese Red          1.579470   0.054048   29.223  < 2e-16 ***
## varietyPortuguese White        0.956523   0.074101   12.908  < 2e-16 ***
## varietyRed Blend              -0.192582   0.037046   -5.198 2.01e-07 ***
## varietyRhône-style Red Blend  0.309599   0.063559    4.871 1.11e-06 ***
## varietyRiesling                1.431708   0.041869   34.195  < 2e-16 ***
## varietyRosé                   0.316142   0.047689    6.629 3.39e-11 ***
## varietySangiovese             -0.141215   0.052263   -2.702 0.006893 ** 
## varietySauvignon Blanc         0.423687   0.042674    9.928  < 2e-16 ***
## varietyShiraz                  0.573246   0.079389    7.221 5.21e-13 ***
## varietySparkling Blend         0.179057   0.055345    3.235 0.001216 ** 
## varietySyrah                   0.375248   0.044027    8.523  < 2e-16 ***
## varietyTempranillo            -0.515842   0.058007   -8.893  < 2e-16 ***
## varietyViognier                0.220253   0.073566    2.994 0.002755 ** 
## varietyWhite Blend            -0.047446   0.054211   -0.875 0.381461    
## varietyZinfandel              -0.121830   0.050044   -2.434 0.014916 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.118 on 98207 degrees of freedom
## Multiple R-squared:  0.5311, Adjusted R-squared:  0.531 
## F-statistic:  3589 on 31 and 98207 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(full_reg)

Including the wine’s variety in model three provides the highest R-squared at 0.531. While some of the coefficients in variety aren’t statistically significant (Malbec and White Blend p-value >0.05), the rest of the varieties are significant.