library(dplyr)
library(ggplot2)
library(ggjoy)
library(gridExtra)
library(ggridges)
full_data <- read.csv("winemag-data-130k-v2.csv")
# Remove any NAs
full_data <- full_data[!is.na(full_data$price), ]
#Exploratory Data Analysis
str(full_data)
## 'data.frame': 120975 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ country : Factor w/ 44 levels "","Argentina",..: 33 44 44 44 39 24 17 19 17 44 ...
## $ description : Factor w/ 119955 levels "\"Chremisa,\" the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has t"| __truncated__,..: 99131 75516 60556 55752 22180 43058 89270 67633 91169 71422 ...
## $ designation : Factor w/ 37980 levels "","'61 Rosé",..: 2404 1 28205 36717 2049 3107 1 30993 20073 23124 ...
## $ points : int 87 87 87 87 87 87 87 87 87 87 ...
## $ price : num 15 14 13 65 15 16 24 12 27 19 ...
## $ province : Factor w/ 426 levels "","Österreichischer Perlwein",..: 115 274 225 274 268 340 17 314 17 57 ...
## $ region_1 : Factor w/ 1230 levels "","Abruzzo","Adelaida District",..: 1 1219 552 1219 759 1206 23 1 23 755 ...
## $ region_2 : Factor w/ 18 levels "","California Other",..: 1 18 1 18 1 1 1 1 1 8 ...
## $ taster_name : Factor w/ 20 levels "","Alexander Peartree",..: 17 16 2 16 14 11 17 3 17 20 ...
## $ taster_twitter_handle: Factor w/ 16 levels "","@AnneInVino",..: 12 9 1 9 14 6 12 1 12 11 ...
## $ title : Factor w/ 118840 levels ":Nota Bene 2005 Una Notte Red (Washington)",..: 89460 89984 101062 102995 103761 105815 108734 54697 59245 62596 ...
## $ variety : Factor w/ 708 levels "","Çalkarası",..: 454 441 483 445 593 191 213 213 441 88 ...
## $ winery : Factor w/ 16757 levels ":Nota Bene","1+1=3",..: 12989 13063 14433 14665 14742 15048 15436 8443 9000 9342 ...
# Where is the incomplete data (NAs)?
print(colSums(is.na(full_data)))
## X country description
## 0 0 0
## designation points price
## 0 0 0
## province region_1 region_2
## 0 0 0
## taster_name taster_twitter_handle title
## 0 0 0
## variety winery
## 0 0
# There are nearly 9,000 prices missing
# Now we will remove these NAs
full_data <- full_data[!is.na(full_data$price), ]
# Let's inspect the amount of unique observations we have in each variable
unique_vals <- lapply(full_data, unique)
sapply(unique_vals, length)
## X country description
## 120975 43 111567
## designation points price
## 35777 21 390
## province region_1 region_2
## 423 1205 18
## taster_name taster_twitter_handle title
## 20 16 110638
## variety winery
## 698 15855
Some notes on the dataset:
There were 20 unique tasters providing the reviews
The wines in the list come from 43 different countries and 423 different provinces
There were 698 unique varieties and 15,855 wineries represented
Numerical Variable Analysis
Points awarded
summary(full_data$points)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 80.00 86.00 88.00 88.42 91.00 100.00
How many wines score a perfect score?
sum(full_data$points == 100)
## [1] 19
ggplot(data = full_data, aes(x= points, colour = I('black'), fill = I('#099DD9')))+
geom_histogram(binwidth = 1)+
labs(x = "Points", y= "Frequency", title = "Distribution of points")
The points given to the wines reviewed ranged from 80 - 100 points, with a mean of 88.42 and median of 88. There are 19 wines in the list that have scored a perfect score of 100. The points given to the wines are normally distributed.
Wine prices
summary(full_data$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 17.00 25.00 35.36 42.00 3300.00
ggplot(data = full_data, aes(x= price, colour = I('black'), fill = I('#099DD9')))+
geom_histogram()+
labs(x = "Price", y= "Frequency", title = "Distribution of prices") #Strongly right skewed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Log
ggplot(data = full_data, aes(x= log(price), colour = I('black'), fill = I('#099DD9')))+
geom_histogram()+
labs(x = "log(Price)", y= "Frequency", title = "Distribution of log(prices)") #slightly right skewed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Wine prices are certainly not normally distributed! Taking a log of the price goes a long way to addressing this.
Top 30 varieties reviewed
top_30df <- full_data %>%
group_by(variety) %>%
summarise(count = n())%>%
arrange(desc(count))
top_30df <- top_30df[1:30,1:2]
top_30df
## # A tibble: 30 x 2
## variety count
## <fct> <int>
## 1 Pinot Noir 12787
## 2 Chardonnay 11080
## 3 Cabernet Sauvignon 9386
## 4 Red Blend 8476
## 5 Bordeaux-style Red Blend 5340
## 6 Riesling 4972
## 7 Sauvignon Blanc 4783
## 8 Syrah 4086
## 9 Rosé 3262
## 10 Merlot 3062
## # ... with 20 more rows
top_30 <- top_30df$variety
new_data <- subset(full_data, variety %in% top_30)
The most reviewd 30 varieties are displayed above. I will limit the dataset that I will perform the rest of the EDA on to these varieties only.
# Create a new variable to indicate whether wine is white or red
new_data$wine_type <- ifelse(new_data$variety == "Chardonnay" | new_data$variety == "Riesling" | new_data$variety == "Sauvignon Blanc" | new_data$variety == "White Blend" | new_data$variety == "Sparkling Blend" | new_data$variety == "Pinot Gris" | new_data$variety == "Champagne Blend" | new_data$variety == "Grüner Veltliner" | new_data$variety == "Pinot Grigio" | new_data$variety == "Portuguese White" | new_data$variety == "Viognier" | new_data$variety == "Gewürztraminer" | new_data$variety == "Gewürztraminer", "White Wine", "Red Wine")
I will create a new variable to use in my analysis - is the wine white or red? Here I have included Sparkling Blend and Champagne Blend as White Wine, however this will probably be better let out as it is fairly different. Additionally, Rose has been included as a Red Wine.
new_data %>%
group_by(variety, wine_type) %>%
summarise(n=n(),
avg_score = mean(points),
avg_price = mean(price)) %>%
ggplot(aes(x=avg_price, y= avg_score, size = n, colour = wine_type))+
geom_point()+
scale_color_manual(values = c("#CC3300", "#FFCC00"))
Distribution of points awarded for the most reviewed
How are the points a wine receives distributed for each of the varieties?
p1 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=points, y=variety))+
geom_joy2(bandwidth = 0.539, fill = "#CC3300")
p2 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=points, y=variety))+
geom_joy2(bandwidth = 0.539, fill = "#FFCC00")
grid.arrange(p1, p2, nrow = 1)
Distribution of price awarded for the most reviewed
How does the price of a wine vary given its variety?
p3 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=log(price), y=variety))+
geom_joy2(bandwidth = 0.103, fill = "#CC3300")
p4 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=log(price), y=variety))+
geom_joy2(bandwidth = 0.103, fill = "#FFCC00")
grid.arrange(p3, p4, nrow=1)
Is there a relationship between the points given and the wine’s price?
p5 <- ggplot(data = subset(new_data, wine_type == "Red Wine"), aes(x=points, y= price))+
geom_point(colour="#CC3300")+
scale_y_log10()+
geom_smooth()
p6 <- ggplot(data = subset(new_data, wine_type == "White Wine"), aes(x=points, y= price))+
geom_point(colour="#FFCC00")+
scale_y_log10()+
geom_smooth()
grid.arrange(p5, p6, nrow=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Correlation
cor(log(new_data$price), new_data$points)
## [1] 0.6149412
There is a fairly positive correlation between log(price) and points with a correlation of 0.6149.
Price_points_reg <- lm(log(price) ~ points + wine_type, data = new_data)
summary(Price_points_reg)
##
## Call:
## lm(formula = log(price) ~ points + wine_type, data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8023 -0.3629 -0.0421 0.3181 4.7430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0680575 0.0468514 -172.21 <2e-16 ***
## points 0.1298493 0.0005282 245.84 <2e-16 ***
## wine_typeWhite Wine -0.2255564 0.0034664 -65.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5113 on 98236 degrees of freedom
## Multiple R-squared: 0.4038, Adjusted R-squared: 0.4038
## F-statistic: 3.327e+04 on 2 and 98236 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(Price_points_reg)
Using lm to run a multiple regression model to predict a wines price, using points and wine_type as predictor variables yields an adjusted R-squared of 0.4058. The higher the points awarded, the higher the predicted price of the wine, while white wines are valued lower than their red counterparts.
#Words Analysis Points Prediction
top_30 <- top_30df$variety
new_data <- subset(full_data, variety %in% top_30)
new_data$wine_type <- ifelse(new_data$variety == "Chardonnay" | new_data$variety == "Riesling" | new_data$variety == "Sauvignon Blanc" | new_data$variety == "White Blend" | new_data$variety == "Sparkling Blend" | new_data$variety == "Pinot Gris" | new_data$variety == "Champagne Blend" | new_data$variety == "Grüner Veltliner" | new_data$variety == "Pinot Grigio" | new_data$variety == "Portuguese White" | new_data$variety == "Viognier" | new_data$variety == "Gewürztraminer" | new_data$variety == "Gewürztraminer", "White Wine", "Red Wine")
Word count analysis on how many words are in each review
new_data$wordcount <- sapply(gregexpr("\\S+", new_data$description), length)
summary(new_data$wordcount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 33.00 40.00 40.75 48.00 135.00
Description with the highest wordcount
new_data$description[which(new_data$wordcount == 135)]
## [1] This very fine Cabernet wants a little time in the cellar. Right now, it's tight in tannins, with some acidic bitterness in the finish. The flavors are of black currants and smoky new oak. The Morisoli Vineyard has been home to very good, ageable bottlings from the likes of Sequoia Grove and Elyse, but in the last few years, Meander has expressed its terroir best. Try after 2012.This very fine Cabernet wants a little time in the cellar. Right now, it's tight in tannins, with some acidic bitterness in the finish. The flavors are of black currants and smoky new oak. The Morisoli Vineyard has been home to very good, ageable bottlings from the likes of Sequoia Grove and Elyse, but in the last few years, Meander has expressed its terroir best. Try after 2012.
## 119955 Levels: "Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has tight, tangy apple-driven acidity, with a bright, light, citrusy character. Not for aging. ...
Description with the lowest wordcount
new_data$description[which(new_data$wordcount == 3)]
## [1] Imported by Kobrand.
## 119955 Levels: "Chremisa," the ancient name of Krems, is commemorated in this wine that comes from Krems vineyards. It has tight, tangy apple-driven acidity, with a bright, light, citrusy character. Not for aging. ...
Wordcount distribution
ggplot(data = new_data, aes(x= wordcount))+
geom_histogram(binwidth = 3)+
labs(x = "Word Count", y= "Frequency", title = "Distribution of word count of description")
ggplot(data = new_data, aes(x= wordcount, y= wine_type, fill = wine_type))+
geom_density_ridges ()+
labs(x = "Word Count", title = "Distribution of word count of description")+
scale_fill_cyclical(values = c("#CC3300", "#FFCC00"))
## Picking joint bandwidth of 1.09
ggplot(data = new_data, aes(x=variety, y=wordcount))+
geom_boxplot()+
coord_flip()+
labs(title = "Wordcount Distribution by Variety", x= "Variety", y= "Word Count")
There is a fair bit of variability between the different varieties and the description word count. Pinot Grigio has the lowest median of the white wines, while Rose has the lowest of the red wines.
Is there a correlation between word count and the score?
ggplot(data = new_data, aes(x=wordcount, y=points))+
geom_point()
cor(new_data$points, new_data$wordcount)
## [1] 0.5450367
There is a positive correlation between the number of words used in the description and the wines score (0.5450). Word Analysis
Performing analysis on the below three wine varieties show us the most commonly used words to describe the varieties. Certain words were removed from the analysis. The name of the wine was removed from the analysis, as were the words “wine” “flavors” and “drink”.
Pinot Noir
# Load
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC")
library("wordcloud")
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library("RColorBrewer")
#---------- Pinot Noir Word cloud ----------#
pinot <- subset(new_data, variety == "Pinot Noir")
descriptors <- Corpus(VectorSource(pinot$description))
head(descriptors)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 6
# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "pinot", "noir", "drink", "flavors"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine", "pinot", :
## transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)
# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## cherry cherry 5416
## fruit fruit 5010
## acidity acidity 3335
## tannins tannins 2939
## finish finish 2922
## red red 2740
## palate palate 2664
## black black 2571
## raspberry raspberry 2408
## oak oak 2182
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
Cabernet Sauvignon Words
#---------- Cabernet Sauvignon Word cloud ----------#
cabsav <- subset(new_data, variety == "Cabernet Sauvignon")
descriptors <- Corpus(VectorSource(cabsav$description))
# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "drink", "cabernet", "flavors"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine", "drink", :
## transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)
# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## tannins tannins 3610
## black black 3368
## fruit fruit 3139
## cherry cherry 2846
## finish finish 2711
## aromas aromas 2564
## oak oak 2527
## blackberry blackberry 2389
## palate palate 2104
## cassis cassis 1678
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : structured could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : great could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : toasty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : finishes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : bodied could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pretty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : spices could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : develop could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : vineyards could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : drinking could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : powerful could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : way could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : wood could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : easy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : medium could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : baked could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : layers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : followed could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : enough could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : needs could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : hints could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : body could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : seems could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : velvety could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : anise could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : cinnamon could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : integrated could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : scents could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : malbec could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : roasted could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pencil could not be fit on page. It will not be plotted.
Chardonnay Words
#---------- Chardonnay Noir Word cloud ----------#
chardy <- subset(new_data, variety == "Chardonnay")
descriptors <- Corpus(VectorSource(chardy$description))
# Convert the text to lower case
descriptors <- tm_map(descriptors, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(descriptors, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
descriptors <- tm_map(descriptors, removeNumbers)
## Warning in tm_map.SimpleCorpus(descriptors, removeNumbers): transformation drops
## documents
# Remove english common stopwords
descriptors <- tm_map(descriptors, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
descriptors <- tm_map(descriptors, removeWords, c("wine", "chardonnay", "drink", "flavors"))
## Warning in tm_map.SimpleCorpus(descriptors, removeWords, c("wine",
## "chardonnay", : transformation drops documents
# Remove punctuations
descriptors <- tm_map(descriptors, removePunctuation)
## Warning in tm_map.SimpleCorpus(descriptors, removePunctuation): transformation
## drops documents
# Eliminate extra white spaces
descriptors <- tm_map(descriptors, stripWhitespace)
## Warning in tm_map.SimpleCorpus(descriptors, stripWhitespace): transformation
## drops documents
# Text stemming
#descriptors <- tm_map(descriptors, stemDocument)
# Build a term-document matrix
dtm <- TermDocumentMatrix(descriptors)
dtm_mat <- as.matrix(dtm)
v <- sort(rowSums(dtm_mat),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
## word freq
## acidity acidity 4000
## apple apple 3577
## fruit fruit 3513
## finish finish 2721
## oak oak 2710
## palate palate 2527
## ripe ripe 2487
## aromas aromas 2363
## pear pear 2066
## lemon lemon 2009
# Generate the Word cloud
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : concentrated could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pineapples could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : steely could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : peaches could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : complexity could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : nectarine could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : zesty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : layers could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : aftertaste could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : makes could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : banana could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : popcorn could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : honeysuckle could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : offering could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : enough could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : citrusy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : blossom could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : color could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : structured could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : pretty could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : brings could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : concentration could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : opulent could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words =
## 200, : scents could not be fit on page. It will not be plotted.
#Regression model to predict the score a wine receives Model 1:
Using wordcount to predict points
word_reg <- lm(points ~ wordcount, data = new_data)
summary(word_reg)
##
## Call:
## lm(formula = points ~ wordcount, data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2893 -1.7557 -0.0108 1.8182 11.3078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.244e+01 3.092e-02 2666.1 <2e-16 ***
## wordcount 1.490e-01 7.311e-04 203.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.593 on 98237 degrees of freedom
## Multiple R-squared: 0.2971, Adjusted R-squared: 0.2971
## F-statistic: 4.152e+04 on 1 and 98237 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(word_reg)
With a significant p-value, wordcount is a significant predictor of points. The linear model is significant, and has an adjusted-R^2 of 0.2971. Model 2:
Using wordcount and price to predict points
full_reg <- lm(points ~ wordcount + log(price), data = new_data)
summary(full_reg)
##
## Call:
## lm(formula = points ~ wordcount + log(price), data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.7600 -1.4012 0.1117 1.5298 8.2316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.677e+01 3.839e-02 1999.5 <2e-16 ***
## wordcount 1.026e-01 6.574e-04 156.1 <2e-16 ***
## log(price) 2.256e+00 1.123e-02 200.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.183 on 98236 degrees of freedom
## Multiple R-squared: 0.5017, Adjusted R-squared: 0.5017
## F-statistic: 4.945e+04 on 2 and 98236 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(full_reg)
Model 3
Using wordcount, price and variety to predict points
full_reg_varieties <- lm(points ~ wordcount + log(price) + variety, data = new_data)
summary(full_reg_varieties)
##
## Call:
## lm(formula = points ~ wordcount + log(price) + variety, data = new_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3248 -1.3523 0.1048 1.4742 7.4753
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.093964 0.050856 1496.262 < 2e-16 ***
## wordcount 0.102021 0.000645 158.176 < 2e-16 ***
## log(price) 2.403076 0.012008 200.126 < 2e-16 ***
## varietyCabernet Franc -0.351370 0.065415 -5.371 7.83e-08 ***
## varietyCabernet Sauvignon -0.364018 0.036325 -10.021 < 2e-16 ***
## varietyChampagne Blend -0.509277 0.067728 -7.519 5.55e-14 ***
## varietyChardonnay 0.358511 0.035394 10.129 < 2e-16 ***
## varietyGamay 1.122329 0.079020 14.203 < 2e-16 ***
## varietyGewürztraminer 0.794618 0.074495 10.667 < 2e-16 ***
## varietyGrüner Veltliner 2.252202 0.069081 32.603 < 2e-16 ***
## varietyMalbec -0.059026 0.050848 -1.161 0.245712
## varietyMerlot -0.353973 0.048187 -7.346 2.06e-13 ***
## varietyNebbiolo 0.194600 0.053006 3.671 0.000241 ***
## varietyPetite Sirah -0.164478 0.081759 -2.012 0.044249 *
## varietyPinot Grigio 0.162049 0.073476 2.205 0.027423 *
## varietyPinot Gris 1.110725 0.063974 17.362 < 2e-16 ***
## varietyPinot Noir 0.109912 0.034584 3.178 0.001483 **
## varietyPortuguese Red 1.579470 0.054048 29.223 < 2e-16 ***
## varietyPortuguese White 0.956523 0.074101 12.908 < 2e-16 ***
## varietyRed Blend -0.192582 0.037046 -5.198 2.01e-07 ***
## varietyRhône-style Red Blend 0.309599 0.063559 4.871 1.11e-06 ***
## varietyRiesling 1.431708 0.041869 34.195 < 2e-16 ***
## varietyRosé 0.316142 0.047689 6.629 3.39e-11 ***
## varietySangiovese -0.141215 0.052263 -2.702 0.006893 **
## varietySauvignon Blanc 0.423687 0.042674 9.928 < 2e-16 ***
## varietyShiraz 0.573246 0.079389 7.221 5.21e-13 ***
## varietySparkling Blend 0.179057 0.055345 3.235 0.001216 **
## varietySyrah 0.375248 0.044027 8.523 < 2e-16 ***
## varietyTempranillo -0.515842 0.058007 -8.893 < 2e-16 ***
## varietyViognier 0.220253 0.073566 2.994 0.002755 **
## varietyWhite Blend -0.047446 0.054211 -0.875 0.381461
## varietyZinfandel -0.121830 0.050044 -2.434 0.014916 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.118 on 98207 degrees of freedom
## Multiple R-squared: 0.5311, Adjusted R-squared: 0.531
## F-statistic: 3589 on 31 and 98207 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(full_reg)
Including the wine’s variety in model three provides the highest R-squared at 0.531. While some of the coefficients in variety aren’t statistically significant (Malbec and White Blend p-value >0.05), the rest of the varieties are significant.