This is a dataset containing of many observations of wine scraped from the WineEnthusiast. I downloaded the dataset from Kaggle
The scope of the project shall involve solely on Italian wines.
I uploaded the necessary libraries
library(tidyverse)
## -- Attaching packages ------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.2
## -- Conflicts ---------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(effects)
## Warning: package 'effects' was built under R version 3.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.5.2
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(httr)
## Warning: package 'httr' was built under R version 3.5.3
library(tm)
## Warning: package 'tm' was built under R version 3.5.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.5.2
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.5.2
library(car)
## Warning: package 'car' was built under R version 3.5.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.2
Uploading the data
library(readxl)
wine <- read_excel("D:/Working Directory/winemag_data_first150k.xlsx")
glimpse(wine)
## Observations: 150,930
## Variables: 11
## $ X__1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ country <chr> "US", "Spain", "US", "US", "France", "Spain", "Spa...
## $ description <chr> "This tremendous 100% varietal wine hails from Oak...
## $ designation <chr> "Martha's Vineyard", "Carodorum Selección Especia...
## $ points <dbl> 96, 96, 96, 96, 95, 95, 95, 95, 95, 95, 95, 95, 95...
## $ price <dbl> 235, 110, 90, 65, 66, 73, 65, 110, 65, 60, 80, 48,...
## $ province <chr> "California", "Northern Spain", "California", "Ore...
## $ region_1 <chr> "Napa Valley", "Toro", "Knights Valley", "Willamet...
## $ region_2 <chr> "Napa", NA, "Sonoma", "Willamette Valley", NA, NA,...
## $ variety <chr> "Cabernet Sauvignon", "Tinta de Toro", "Sauvignon ...
## $ winery <chr> "Heitz", "Bodega Carmen RodrÃguez", "Macauley", "...
wine %>%
mutate(price = price * 0.89)
Given i indicated the scope of this project to detail only wines from Italy, i will filter the wines to include only those from Italy.
wines_Italia <- wine %>%
filter(country == "Italy") %>%
filter(!is.na(price))
summary(wines_Italia)
## X__1 country description designation
## Min. : 10 Length:18784 Length:18784 Length:18784
## 1st Qu.: 31646 Class :character Class :character Class :character
## Median : 71596 Mode :character Mode :character Mode :character
## Mean : 72285
## 3rd Qu.:108981
## Max. :150929
## points price province region_1
## Min. : 80.00 Min. : 5.00 Length:18784 Length:18784
## 1st Qu.: 87.00 1st Qu.: 17.00 Class :character Class :character
## Median : 88.00 Median : 25.00 Mode :character Mode :character
## Mean : 88.45 Mean : 37.55
## 3rd Qu.: 90.00 3rd Qu.: 48.00
## Max. :100.00 Max. :900.00
## region_2 variety winery
## Length:18784 Length:18784 Length:18784
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
I would like to ensure that i am working with clean data with no duplicate values. Given that “Description” consists of free text in which each taster who rates and reviews the wine writes a description using his/her own language, it would be a great place to start.
wines_Italia$description %>%
glimpse()
## chr [1:18784] "Elegance, complexity and structure come together in this drop-dead gorgeous winethat ranks among Italy's greate"| __truncated__ ...
The next step is to ensure that we will not be deleting empty descriptions that might be mistaken for duplicates
## Empty spaces: 0
## NAs: 0
Thankfully, no empty spaces and NAs exist, hence i would like to get rid of duplicates
wines_Italia %>%
mutate(duplicate = duplicated(description)) %>%
filter(duplicate == FALSE) %>%
select(-duplicate)
glimpse(wines_Italia)
## Observations: 18,784
## Variables: 11
## $ X__1 <dbl> 10, 35, 37, 38, 39, 43, 45, 46, 48, 50, 59, 79, 83...
## $ country <chr> "Italy", "Italy", "Italy", "Italy", "Italy", "Ital...
## $ description <chr> "Elegance, complexity and structure come together ...
## $ designation <chr> "Ronco della Chiesa", "Riserva", NA, NA, "Riserva"...
## $ points <dbl> 95, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 91, 91...
## $ price <dbl> 80, 135, 29, 23, 29, 39, 30, 90, 50, 100, 60, 75, ...
## $ province <chr> "Northeastern Italy", "Tuscany", "Tuscany", "Tusca...
## $ region_1 <chr> "Collio", "Brunello di Montalcino", "Vino Nobile d...
## $ region_2 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ variety <chr> "Friulano", "Sangiovese", "Sangiovese", "Sangioves...
## $ winery <chr> "Borgo del Tiglio", "Carillon", "Avignonesi", "Cas...
Let’s create a table and a graph to analyze the distribution of Italian wines per province. It would be wise to keep it to the top 5 provinces
wines_Italia_province <- wines_Italia %>%
group_by(province) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt))
print(wines_Italia_province)
## # A tibble: 10 x 4
## province total pcnt accum
## <chr> <int> <dbl> <dbl>
## 1 Tuscany 5961 31.7 31.7
## 2 Veneto 3103 16.5 48.3
## 3 Piedmont 2909 15.5 63.7
## 4 Sicily & Sardinia 2032 10.8 74.6
## 5 Northeastern Italy 1852 9.86 84.4
## 6 Central Italy 1257 6.69 91.1
## 7 Southern Italy 1167 6.21 97.3
## 8 Lombardy 444 2.36 99.7
## 9 Italy Other 52 0.277 100.0
## 10 Northwestern Italy 7 0.0373 100.0
wines_Italia_province %>%
head(5) %>%
ggplot(aes(x = reorder(province, -pcnt), y = total, FILL = province)) +
geom_bar(aes(fill = province), stat = "identity") +
labs(x = "\n Vini Italiani (per provinzia), Top 5 \n") + labs(y = "\n Total \n") +
labs(title = "\n Rassegna di Vini Italiani") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized()
Tuscany has around most of the wines on the list with 31%. They have always been a historically wine-producing region. Piedmont second, albeit far behind from Tuscany and so on.
Wine reviews are generally about quality over quantity. Hence it’s a test to see which Italian province has the best-rated wines.
Rating <- wines_Italia %>%
group_by(province) %>%
summarise_at(vars(points), funs(points = mean(., na.rm = T))) %>%
arrange(desc(points)) %>%
head(5)
Rating %>%
head(5) %>%
ggplot(aes(x = reorder(province, - points), y = points)) +
geom_bar(fill = "yellowgreen", stat = "identity") +
coord_cartesian(ylim = c(50, 99)) +
labs(x = "Provinzia Italiani") + labs(y = "Valutazione") +
geom_text(aes(label = sprintf("%.1f %%", points))) +
theme_solarized()
It would, however, be an unfair comparison to compare Tuscan with Lombardy wines given the huge disparity of wines reviewed (7281 vs 580)
I have previously made an rpubs project albeit on Australian wines. On that project i filtered wines in three categories:
Excellent <- wines_Italia %>%
filter(points >= 90) %>%
group_by(province) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 3))
Excellent
Excellent %>%
head(5) %>%
ggplot(aes(x = reorder(province, - total), y = total)) +
geom_bar(fill = "yellowgreen", stat = "identity") +
coord_cartesian(ylim = c(400, 4000)) +
labs(x = "Provinzia Italiani") + labs(y = "Total") +
labs(title = "\n Il Vino Italiano classificati da 90 in superiore (top 5)") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized()
So, almost half of the wines rated excellent come from Tuscany. Piedmont comes at 22.5%.
Okay <- wines_Italia %>%
filter(points <= 89) %>%
group_by(province) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 3))
Okay
Okay %>%
head(5) %>%
ggplot(aes(x = reorder(province, - total), y = total)) +
geom_bar(fill = "yellowgreen", stat = "identity") +
coord_cartesian(ylim = c(400, 4000)) +
labs(x = "Provinzia Italiani") + labs(y = "Total") +
labs(title = "\n Il Vino Italiano classificati da 89 e sotto (top 5)") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized()
Ariel, Moreno, Orsini-Rosberg, Saggese et al on a similar project whose code on this same dataset i have been inspired to replicate to some extent, employed a Welch Two Sample t-test on the dataset to see if even though a max price of wine on a certain country is higher than that of another, the average price would still be the same.
However, i will try to determine the max rating of a province’s wine and see if the average rating of the two highest-rated wines are the same.
max_rating <- wines_Italia %>%
select(province, points) %>%
group_by(province) %>%
summarise(maxpoints = max(points, na.rm = TRUE)) %>%
arrange(desc(maxpoints)) %>%
head(5)
max_rating
t.test(wines_Italia$points[wines_Italia$province == "Tuscany"], wines_Italia$points[wines_Italia$province == "Piedmont"], paired = FALSE)
##
## Welch Two Sample t-test
##
## data: wines_Italia$points[wines_Italia$province == "Tuscany"] and wines_Italia$points[wines_Italia$province == "Piedmont"]
## t = 8.5037, df = 6186.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4215666 0.6741644
## sample estimates:
## mean of x mean of y
## 89.39524 88.84737
Given the p-value is less than 0.05, then we can say (and we proved earlier) that the average rating difference is significant.
So how much are the most expensive wines per province?.
wines_Italia %>%
select(province, price) %>%
group_by(province) %>%
summarise(maxprice = max(price, na.rm = TRUE)) %>%
arrange(desc(maxprice)) %>%
head(10)
Given Tuscany and Veneto are 1st and second amongst the priciest wine list, we want to know if the disparity has an effect on the average.
t.test(wines_Italia$price[wines_Italia$province == "Tuscany"], wines_Italia$price[wines_Italia$province == "Veneto"], paired = FALSE)
##
## Welch Two Sample t-test
##
## data: wines_Italia$price[wines_Italia$province == "Tuscany"] and wines_Italia$price[wines_Italia$province == "Veneto"]
## t = 18.52, df = 8388.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 13.86257 17.14454
## sample estimates:
## mean of x mean of y
## 48.12263 32.61908
Given p-value is less than 0.05, then we can conclude with 95% confidence that there is a significant difference between prices of wines at Tuscany and at Veneto.
Let’s create a graph detailing the distribution of varieties in this dataset. For legibility sake, we will restrict ourselves to the top 5.
wine_varieties <- wines_Italia %>%
group_by(variety) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt))
print(wine_varieties)
## # A tibble: 175 x 4
## variety total pcnt accum
## <chr> <int> <dbl> <dbl>
## 1 Red Blend 3143 16.7 16.7
## 2 Sangiovese 2258 12.0 28.8
## 3 Nebbiolo 1479 7.87 36.6
## 4 Corvina, Rondinella, Molinara 1292 6.88 43.5
## 5 Sangiovese Grosso 937 4.99 48.5
## 6 White Blend 797 4.24 52.7
## 7 Barbera 706 3.76 56.5
## 8 Pinot Grigio 682 3.63 60.1
## 9 Prosecco 515 2.74 62.9
## 10 Glera 479 2.55 65.4
## # ... with 165 more rows
wine_varieties %>%
head(5) %>%
ggplot(aes(x = reorder(variety, -pcnt), y = total)) +
geom_bar(fill = "orchid", stat = "identity") +
labs(x = "\n Varieta di Vino, Top 5 \n") + labs(y = "\n Total \n") +
labs(title = "\n Rassegna di Vini Italiani") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
The red blend is the most prominent wine in the dataset whereas the Sangiovese variety (wine common in most of Central Italy frorm the Emilia-Romagna region down to Lazio, Campania and Sicily that tastes oaky with a strawberry fruitiness and some mild spiciness) comes second.
We want to know the average rating per wine variety present. As always, we limit ourselves to the top 5 for legibility sake. Like before i will filter based on the category. We will use the same category
We want to know, however, the average rating per variety.
Rating2 <- wines_Italia %>%
group_by(variety) %>%
summarise_at(vars(points), funs(points = mean(., na.rm = T))) %>%
arrange(desc(points)) %>%
head(10)
Rating2 %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -points), y = points)) +
geom_bar(fill = "orchid", stat = "identity") +
coord_cartesian(ylim = c(50, 99)) +
labs(x = "Varieta di Vino (top 10)") + labs(y = "Per cento") +
geom_text(aes(label = sprintf("%.1f %%", points))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
As before we will sort out wines rated 90 and higher and everything else.
Excellent_variety <- wines_Italia %>%
filter(points >= 90) %>%
group_by(variety) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 3))
Excellent_variety
Excellent_variety %>%
head(5) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "orchid", stat = "identity") +
coord_cartesian(ylim = c(400, 2000)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italiano classificati da 90 in superiore (top 10)") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
So, it’s kind of apparent that the Red Blend has more wines rated 90 and above in spite of not being in the top 10 in terms of general rating.
Okay_variety <- wines_Italia %>%
filter(points <= 89) %>%
group_by(variety) %>%
summarise(total = n()) %>%
arrange(desc(total)) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 3))
Okay_variety
Okay_variety %>%
head(5) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "orchid", stat = "identity") +
coord_cartesian(ylim = c(400, 2500)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italiano classificati da 89 e sotto (top 5)") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
The Red Blend also dominates here, having 13.6% of all wines reviewed here.
There are 10 wine-producing provinces indicated in this dataset. It is my interest to determine which wine varieties are widely produced per province and that i believe it is manageable.
Filtering the wine varieties for the Tuscany province.
wines_Toscania <- wines_Italia %>%
filter(province == "Tuscany") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_Toscania
wines_Toscania %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 2700)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Toscania") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
Through our exploratory data analysis, the Sangiovese is the wine most prominently produced here, followed by the Red Blend and the Sangiovese Grosso.
Filtering the wine varieties for the Piedmont province
wines_Piemonte <- wines_Italia %>%
filter(province == "Piedmont") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_Piemonte
wines_Piemonte %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 2500)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Piemonte") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
The Nebbiolo variety of wine is most prominently produced here, followed by the Barbera and Dolcetto.
Filtering the wine varieties for the Veneto province
wines_Venetto <- wines_Italia %>%
filter(province == "Veneto") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_Venetto
wines_Venetto %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 2500)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Venetto") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
The Corvina (a wine known for being light in variety), Rondinella (wine from blackish berries that appears in wines blended with Corvina) and Molinara (variety that is commonly found in the corvina and Rondinella that gives those wines acidity) varieties are the most prominent here.
wines_SS <- wines_Italia %>%
filter(province == "Sicily & Sardinia") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_SS
wines_SS %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 500)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Sicilia e Sardinia") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
The Red Blend is prominent here, followed by the Nero d’Aviola (a wine native to the Avola in Southern Sicily known to have sweet tannins and plum or peppery flavors) and the White Blend.
wines_Lombardia <- wines_Italia %>%
filter(province == "Sicily & Sardinia") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_Lombardia
wines_Lombardia %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 500)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Lombardia") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
wines_NE <- wines_Italia %>%
filter(province == "Northeastern Italy") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_NE
wines_NE %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 700)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italia di Nord-Orientale") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
wines_CI <- wines_Italia %>%
filter(province == "Central Italy") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_CI
wines_CI %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 400)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italia Centrale") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
The piniot grigio variety (a white wine variety) is common here.
wines_S <- wines_Italia %>%
filter(province == "Southern Italy") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_S
wines_S %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(50, 300)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italia Sud") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
The Aglianico (a wine variety known for being full-bodied with firm tannins and high acidity with a rich lavor) is most prominent here.
wines_NI <- wines_Italia %>%
filter(province == "Northwestern Italy") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_NI
wines_NI %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(0, 5)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino Italia Verso Nord") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
wines_Rest <- wines_Italia %>%
filter(province == "Italy Other") %>%
group_by(variety) %>%
summarise(total = n()) %>%
mutate(pcnt = round(100*(total/sum(total)), digits = 5), accum = cumsum(pcnt)) %>%
arrange(desc(total))
wines_Rest
wines_Rest %>%
head(10) %>%
ggplot(aes(x = reorder(variety, -total), y = total)) +
geom_bar(fill = "wheat", stat = "identity") +
coord_cartesian(ylim = c(0, 20)) +
labs (x = "\n Varieta di Vino \n") + labs(y = "Total") +
labs(title = "\n Il varieta di vino per il resto d'Italia") +
geom_text(aes(label = sprintf("%.1f %%", pcnt))) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.6, hjust=1.025))
For reference, i will use course notes i got from William Surles’course notes from Datacamp course “Text Mining: Bag of Words”. While Ariel, Moreno, Orsini-Rosberg, Pi and et al focused more on the reviews, my focus here will be more on the description.
# Creating vector from descriptions
description <- wines_Italia$description
#Interpreting review vector with tm
review <- VectorSource(description)
# Creating VCorpus object
rev <- VCorpus(review)
# Removing punctuation from reviews
rev <- tm_map(rev, removePunctuation)
# Converting to lowercase to make future cleaning easier
rev <- tm_map(rev, content_transformer(tolower))
# Removing Numbers
rev <- tm_map(rev, removeNumbers)
Let’s remove all excess whitespaces and stop words, making use of tm’s package list of stopwords
#Removing stop words
rev <- tm_map(rev, removeWords, stopwords("en"))
#Remove excess white spaces
rev <- tm_map(rev, stripWhitespace)
We need to do word stemming in order to unify the natural language of our descriptions
rev <- tm_map(rev, stemDocument)
At the same time we also want to figure out what words are commonly used in the descriptions more often and generate a list of stems that appear at least 1000 times
# Creating a DTM matrix
rev_dtm <- DocumentTermMatrix(rev)
# Finding the most popular stems
rev_freq <- findFreqTerms(rev_dtm, lowfreq = 1000)
rev_freq
## [1] "acid" "age" "almond" "alongsid" "appl"
## [6] "aroma" "back" "beauti" "berri" "black"
## [11] "blackberri" "blend" "bright" "cabernet" "cherri"
## [16] "chocol" "citrus" "clean" "close" "cola"
## [21] "color" "concentr" "creami" "crisp" "dark"
## [26] "deliv" "dri" "drink" "easi" "eleg"
## [31] "exot" "express" "finish" "firm" "flavor"
## [36] "flower" "follow" "fresh" "fruit" "good"
## [41] "grape" "herb" "here" "honey" "import"
## [46] "intens" "leather" "licoric" "light" "long"
## [51] "made" "make" "matur" "meat" "merlot"
## [56] "miner" "mouth" "mouthfeel" "nose" "note"
## [61] "oak" "offer" "open" "pair" "palat"
## [66] "peach" "pepper" "plum" "raspberri" "red"
## [71] "rich" "ripe" "sangioves" "sauvignon" "show"
## [76] "slight" "smooth" "soft" "spice" "stone"
## [81] "structur" "sweet" "tannin" "textur" "there"
## [86] "thick" "tight" "toast" "tobacco" "tone"
## [91] "touch" "vanilla" "well" "white" "wild"
## [96] "wine" "year"
I want to retain words that are relevant to wines, hence i will remove words that are not relevant here.
new_stopwords <- c("age", "aroma", "alongsid", "drink", "easi", "express", "flavor","follow", "good", "grape", "here", "import", "intens", "long", "made", "make", "matur", "meat", "mouth", "mouthfeel", "nose", "note", "offer", "open", "palat", "pair", "show", "textur", "there", "tone", "touch","wine", "year")
rev_new <- tm_map(rev, removeWords, new_stopwords)
Let us create a second matrix to find the top word stems in wine reviews by only choosing the most relevant ones.
I will try to set the benchmark to a minimum of 2500 times
# Creating second DTM
rev_dtm2 <- DocumentTermMatrix(rev_new)
# Finding top stems in reviews
top_stems <- findFreqTerms(rev_dtm2, lowfreq = 2500)
top_stems
## [1] "acid" "berri" "black" "blend" "bright" "cherri" "dri"
## [8] "finish" "fresh" "fruit" "ripe" "spice" "sweet" "tannin"
## [15] "white"
I will try to create the corpus function for this:
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, new_stopwords)
corpus <-tm_map(corpus, stripWhitespace)
return(corpus)
}
italian_wines_corpus <- clean_corpus(rev)
italian_wines_corpus_dm <- TermDocumentMatrix(italian_wines_corpus)
# Creating matrix from TDM
italian_wines_corpus_cdm <- as.matrix(italian_wines_corpus_dm)
italianf <- rowSums(italian_wines_corpus_cdm)
italianf <- sort(italianf, decreasing = T)
# Creating the data frame
iwf <- data.frame(term=names(italianf), num = italianf)
# Creating a plot of 10 top stems
ggplot(data = head(iwf, 10), aes(x = factor(term, levels = iwf$term[order(-iwf$num)]), y = num)) +
geom_col(fill = "orchid") +
labs(x = "Word Stems", y = "Count", title = "Most common stems in Italian Wine Reviews") +
scale_y_continuous(expand = c(0, 0), breaks=seq(0,10000,500))
# Creating a wordcloud
wordcloud(iwf$term, iwf$num, max.words = 25, color = "orchid")
We can see here that the most common words described here are related to berries or those related to the texture or spice of the wine.