Caricamento pacchetti

library(dplyr)
library(ggplot2)
library(knitr)

Parte 1 : Dati

I seguenti dataset sono stati prelevati dal sito WineEnthusiast (https://www.winemag.com/?s=&drink_type=wine) nella settimana a partire dal 15 giugno 2017 e poi di nuovo il 22 novembre 2017, tramite la tecnica dello scraping (o raschiamento). I 2 dataset, scaricabili da https://www.kaggle.com/datasets/zynicide/wine-reviews?datasetId=1442&sortBy=voteCount&select=winemag-data_first150k.csv, una volta uniti ed eliminate le osservazioni duplicate contengono 170.520 record. Le variabili che si prendono in considerazione nell’analisi sono: country, description, points, price, province, region_1, region_2, variety e winery.

Caricamento dati

# Keep only the columns used in the analysis; both raw exports are reduced
# with the same selection.
seleziona_colonne <- function(dati) {
  select(
    dati,
    country, description, points, price, province,
    region_1, region_2, variety, winery
  )
}

# Read the two raw CSV exports from the working directory.
winemag.data.130k.v2 <- read.csv("winemag-data-130k-v2.csv")
winemag.data_first150k <- read.csv("winemag-data_first150k.csv")

# Stack the rows of the 150k export on top of those of the 130k export.
df2 <- seleziona_colonne(winemag.data.130k.v2)
df1 <- rbind(seleziona_colonne(winemag.data_first150k), df2)

Parte 2 : Esplorazione dei dati

Si eliminano le osservazioni duplicate e si nota che la variabile price ha 12838 valori mancanti:

# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
df1 <- df1[!duplicated(df1), , drop = FALSE]
# Number of missing values in every column.
colSums(is.na(df1))
##     country description      points       price    province    region_1 
##           0           0           0       12838           0           0 
##    region_2     variety      winery 
##           0           0           0

La variabile points ha una distribuzione approssimativamente “normale” (simile alla curva di Gauss), per cui la mediana (88) è quasi uguale alla media (88,24), mentre la variabile price presenta una forte asimmetria positiva (valori concentrati a sinistra e lunga coda a destra): il 50% dei vini ha un prezzo non superiore alla mediana, pari a 25 dollari, che è inferiore alla media di 34,66 dollari. La correlazione tra le 2 variabili è pari a 0,4268 (circa 42,68%): come ci si aspetta, il prezzo del vino è correlato positivamente al suo punteggio:

# Histogram of the review scores, split into 50 bins.
ggplot(df1, aes(x = points)) +
  geom_histogram(bins = 50, fill = "red")

# Histogram of the prices, split into 50 bins.
# Rows with a missing price are dropped silently through `na.rm = TRUE` on the
# layer, instead of switching off every warning for the rest of the session
# with options(warn = -1).
df1 %>%
  ggplot(aes(price)) +
  geom_histogram(bins = 50, fill = "orange", na.rm = TRUE)

# Keep only the rows with a known price. A logical mask is used rather than
# `-which(...)`: with no missing price, `-which(...)` would be an empty index
# and the subset would contain zero rows.
df3 <- df1[!is.na(df1$price), ]
# Correlation between score and price on the rows that have both.
cor(df3$points, df3$price)
## [1] 0.4268176
# Quartiles, mean and range of the review scores.
summary(df1[["points"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   80.00   86.00   88.00   88.24   90.00  100.00
# Quartiles, mean and range of the prices, plus the count of missing values.
summary(df1[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    4.00   16.00   25.00   34.66   40.00 3300.00   12838

VINI DEL MONDO PARAGONATI ALL’ITALIA

Gli stati con il più alto numero di recensioni sono: USA, Italia, Francia, Spagna, Portogallo ecc.:

# Number of reviews per country, with countries ordered by that count.
recensioni_per_stato <- df1 %>%
  count(country, name = "Totale") %>%
  mutate(country = reorder(country, Totale))

# Horizontal bars labelled with the count; the fill legend is hidden.
ggplot(recensioni_per_stato, aes(x = country, y = Totale, fill = country)) +
  geom_col() +
  geom_text(aes(label = Totale), hjust = 0, size = 2) +
  coord_flip() +
  guides(fill = "none") +
  ggtitle("Recensioni totali dei vini per Stato")

Gli stati con la media o la mediana dei punteggi dei vini più alta sono: Regno Unito, Austria, Germania, Francia, Marocco, Italia.

# Box plot of the scores for each country. reorder() sorts the countries by
# their mean score (its default summary function).
ggplot(
  mutate(df1, country = reorder(country, points)),
  aes(x = points, y = country, fill = country)
) +
  geom_boxplot() +
  guides(fill = "none")

# Mean score per country, with countries ordered by that mean.
media_per_stato <- df1 %>%
  group_by(country) %>%
  summarise(media_point = mean(points)) %>%
  mutate(country = reorder(country, media_point))

# Horizontal bars labelled with the mean rounded to two decimals; the fill
# legend is hidden and the title is left empty.
ggplot(media_per_stato, aes(x = country, y = media_point, fill = country)) +
  geom_col() +
  geom_text(aes(label = round(media_point, 2)), hjust = 0, size = 2) +
  coord_flip() +
  guides(fill = "none") +
  ggtitle("")

VINI ITALIANI

Le regioni italiane con il più alto numero di recensioni sono Toscana, Piemonte e Veneto:

# From here on df1 holds only the wines whose country is Italy.
df1 <- filter(df1, country == "Italy")

# Convert the categorical columns to factors, so their levels reflect only the
# values present in the Italian subset.
colonne_fattore <- c("province", "region_1", "region_2", "variety", "winery")
df1[colonne_fattore] <- lapply(df1[colonne_fattore], as.factor)

# Number of reviews per Italian territory, with territories ordered by count.
recensioni_per_territorio <- df1 %>%
  count(province, name = "Totale") %>%
  mutate(province = reorder(province, Totale))

# Horizontal bars labelled with the count (the fill legend is kept).
ggplot(
  recensioni_per_territorio,
  aes(x = province, y = Totale, fill = province)
) +
  geom_col() +
  geom_text(aes(label = Totale), hjust = 0, size = 2) +
  coord_flip() +
  ggtitle("Recensioni totali dei vini per Territorio")

Le regioni italiane con la media o la mediana dei punteggi più alta sono Piemonte, Toscana e Lombardia:

# Box plot of the scores for each Italian territory. reorder() sorts the
# territories by their mean score (its default summary function).
ggplot(
  mutate(df1, province = reorder(province, points)),
  aes(x = points, y = province, fill = province)
) +
  geom_boxplot()

# Mean score of each variety within each territory, computed only on the
# reviews scoring at least 92 points.
# Bars are dodged instead of stacked: stacking would add up the means of the
# different territories for the same variety, so the bar length would no
# longer be a score.
df1 %>%
  filter(points >= 92) %>%
  group_by(province, variety) %>%
  summarise(media_punti = mean(points), .groups = "drop") %>%
  ggplot(aes(variety, media_punti, fill = province)) +
  geom_col(position = "dodge") +
  coord_flip() +
  ggtitle("Varietà di vini per territorio",subtitle =  " con un punteggio in media superiore a 92")

# Scatter plot of price against score, coloured by territory.
# Rows with a missing price are dropped explicitly (`na.rm = TRUE`) so the
# plot does not depend on warnings being globally suppressed.
df1 %>%
  ggplot(aes(points, price, colour = province)) +
  geom_point(na.rm = TRUE)

# Italian wines scoring above the 75th percentile of points while priced below
# the 25th percentile of price, highest scores first.
# `probs` is named in full rather than relying on partial argument matching
# (`p =`). Rows with a missing price are dropped by filter(), because the
# price comparison evaluates to NA for them.
df3 <- df1 %>%
  filter(
    points > quantile(points, probs = 0.75),
    price < quantile(price, probs = 0.25, na.rm = TRUE)
  ) %>%
  select(variety, points, price, province) %>%
  arrange(desc(points))

kable(df3, caption = "Vini italiani migliori a prezzo più basso")
Vini italiani migliori a prezzo più basso
variety points price province
Red Blend 93 16 Tuscany
Sangiovese 92 13 Tuscany
Sauvignon 92 17 Northeastern Italy
Sangiovese 92 16 Tuscany
Verduzzo 92 14 Veneto
Red Blend 92 15 Veneto
Vermentino 92 17 Sicily & Sardinia
Carignano 92 15 Sicily & Sardinia
Moscato 92 16 Piedmont
Moscato 92 17 Piedmont
Red Blend 92 16 Piedmont
Turbiana 91 17 Lombardy
Malvasia 91 16 Northeastern Italy
Red Blend 91 15 Tuscany
Sangiovese 91 15 Tuscany
Sangiovese 91 16 Tuscany
Sangiovese Grosso 91 15 Tuscany
Red Blend 91 16 Tuscany
Red Blend 91 12 Tuscany
Red Blend 91 17 Sicily & Sardinia
Sangiovese 91 12 Tuscany
Merlot 91 17 Northeastern Italy
Red Blend 91 17 Southern Italy
Lambrusco di Sorbara 91 16 Central Italy
Glera 91 15 Northeastern Italy
Pinot Grigio 91 13 Northeastern Italy
Verdicchio 91 15 Central Italy
Sangiovese 91 12 Tuscany
Albana 91 15 Central Italy
White Blend 91 17 Veneto
Glera 91 13 Veneto
Pinot Bianco 91 13 Northeastern Italy
Glera 91 15 Northeastern Italy
Verdicchio 91 17 Central Italy
Red Blend 91 16 Tuscany
Turbiana 91 17 Lombardy
Verdicchio 91 17 Central Italy
Pinot Bianco 91 16 Northeastern Italy
Sauvignon Blanc 91 15 Northeastern Italy
# Median price of every variety / territory / score combination among the
# wines scoring 92 points or more, ordered by score (highest first), then by
# territory and variety.
# Missing prices are ignored when taking the median, so a combination shows NA
# only when none of its reviews has a price. `.groups = "drop"` returns an
# ungrouped table with one row per combination (no de-duplication needed) and
# avoids the grouping message printed by summarise().
df2 <- df1 %>%
  filter(points >= 92) %>%
  group_by(variety, province, points) %>%
  summarise(median_price = median(price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(points), province, variety)

kable(df2, caption = "Vini con il punteggio maggiore da 92 fino a 100")
Vini con il punteggio maggiore da 92 fino a 100
variety province points median_price
Merlot Tuscany 100 460.0
Prugnolo Gentile Tuscany 100 210.0
Red Blend Tuscany 100 195.0
Sangiovese Tuscany 100 550.0
Sangiovese Grosso Tuscany 100 270.0
Nebbiolo Piedmont 99 440.0
Merlot Tuscany 99 285.0
Prugnolo Gentile Tuscany 99 237.0
Red Blend Tuscany 99 217.5
Sangiovese Tuscany 99 200.0
Nebbiolo Piedmont 98 117.0
Cabernet Franc Tuscany 98 127.5
Merlot Tuscany 98 227.5
Red Blend Tuscany 98 NA
Sangiovese Tuscany 98 120.0
Sangiovese Grosso Tuscany 98 245.0
Syrah Tuscany 98 225.0
Picolit Northeastern Italy 97 90.0
Nebbiolo Piedmont 97 NA
Nerello Mascalese Sicily & Sardinia 97 60.0
Aglianico Southern Italy 97 90.0
Cabernet Franc Tuscany 97 130.0
Merlot Tuscany 97 277.5
Red Blend Tuscany 97 217.5
Sangiovese Tuscany 97 130.0
Sangiovese Grosso Tuscany 97 79.5
Syrah Tuscany 97 160.0
White Blend Tuscany 97 NA
Sagrantino Central Italy 96 80.0
Nebbiolo Lombardy 96 105.0
Picolit Northeastern Italy 96 90.0
Nebbiolo Piedmont 96 NA
Merlot Southern Italy 96 105.0
Cabernet Franc Tuscany 96 105.0
Merlot Tuscany 96 315.0
Red Blend Tuscany 96 112.5
Sangiovese Tuscany 96 NA
Sangiovese Grosso Tuscany 96 75.0
White Blend Tuscany 96 62.5
Corvina, Rondinella, Molinara Veneto 96 NA
Montepulciano Central Italy 95 95.0
Sagrantino Central Italy 95 70.0
Verdicchio Central Italy 95 57.0
White Blend Central Italy 95 NA
Nebbiolo Lombardy 95 82.0
Friulano Northeastern Italy 95 80.0
Kerner Northeastern Italy 95 25.0
Picolit Northeastern Italy 95 100.0
Pinot Bianco Northeastern Italy 95 165.0
Sauvignon Northeastern Italy 95 40.0
Teroldego Northeastern Italy 95 50.0
White Blend Northeastern Italy 95 88.0
Nebbiolo Piedmont 95 NA
Red Blend Piedmont 95 350.0
Carricante Sicily & Sardinia 95 45.0
Nerello Mascalese Sicily & Sardinia 95 60.0
Red Blend Sicily & Sardinia 95 30.0
Zibibbo Sicily & Sardinia 95 40.0
Aglianico Southern Italy 95 NA
Cabernet Franc Tuscany 95 100.0
Cabernet Sauvignon Tuscany 95 75.0
Cabernet Sauvignon-Merlot Tuscany 95 50.0
Merlot Tuscany 95 130.0
Red Blend Tuscany 95 NA
Sangiovese Tuscany 95 NA
Sangiovese Grosso Tuscany 95 NA
Syrah Tuscany 95 65.0
White Blend Tuscany 95 NA
Corvina, Rondinella, Molinara Veneto 95 127.5
Garganega Veneto 95 31.0
Albana Central Italy 94 NA
Montepulciano Central Italy 94 100.0
Sagrantino Central Italy 94 55.0
Verdicchio Central Italy 94 56.0
White Blend Central Italy 94 35.0
Nebbiolo Italy Other 94 NA
Pinot Nero Lombardy 94 NA
Sparkling Blend Lombardy 94 75.0
Turbiana Lombardy 94 24.0
Cabernet Sauvignon Northeastern Italy 94 92.0
Chardonnay Northeastern Italy 94 125.0
Friulano Northeastern Italy 94 35.0
Gewürztraminer Northeastern Italy 94 50.0
Nosiola Northeastern Italy 94 47.0
Picolit Northeastern Italy 94 140.0
Ribolla Gialla Northeastern Italy 94 NA
Sauvignon Northeastern Italy 94 40.0
Sparkling Blend Northeastern Italy 94 67.0
Teroldego Northeastern Italy 94 50.0
Verduzzo Northeastern Italy 94 NA
White Blend Northeastern Italy 94 88.0
Barbera Piedmont 94 NA
Cabernet Sauvignon Piedmont 94 224.0
Nebbiolo Piedmont 94 NA
Pinot Nero Piedmont 94 45.0
Red Blend Piedmont 94 NA
Refosco Piedmont 94 200.0
Sparkling Blend Piedmont 94 34.0
Carignano Sicily & Sardinia 94 91.0
Nerello Mascalese Sicily & Sardinia 94 67.5
Nero d’Avola Sicily & Sardinia 94 51.0
Red Blend Sicily & Sardinia 94 49.0
Vermentino Sicily & Sardinia 94 113.0
White Blend Sicily & Sardinia 94 65.5
Zibibbo Sicily & Sardinia 94 40.0
Aglianico Southern Italy 94 NA
Falanghina Southern Italy 94 NA
Fiano Southern Italy 94 35.0
Greco Southern Italy 94 60.0
Moscato Southern Italy 94 29.0
Red Blend Southern Italy 94 NA
Sirica Southern Italy 94 50.0
Bordeaux-style Red Blend Tuscany 94 145.0
Cabernet Franc Tuscany 94 77.5
Cabernet Sauvignon Tuscany 94 159.5
Merlot Tuscany 94 250.0
Moscadello Tuscany 94 55.0
Red Blend Tuscany 94 NA
Sangiovese Tuscany 94 NA
Sangiovese Grosso Tuscany 94 NA
Syrah Tuscany 94 96.0
Vernaccia Tuscany 94 NA
White Blend Tuscany 94 NA
Cabernet Sauvignon Veneto 94 80.0
Corvina Veneto 94 87.0
Corvina, Rondinella, Molinara Veneto 94 NA
Garganega Veneto 94 48.0
Glera Veneto 94 NA
Red Blend Veneto 94 85.0
White Blend Veneto 94 31.0
Albana Central Italy 93 27.5
Cabernet Blend Central Italy 93 50.0
Cabernet Sauvignon Central Italy 93 52.0
Chardonnay Central Italy 93 60.0
Lambrusco di Sorbara Central Italy 93 20.0
Moscato Central Italy 93 45.0
Red Blend Central Italy 93 49.0
Sagrantino Central Italy 93 50.0
Trebbiano Central Italy 93 50.0
Verdicchio Central Italy 93 57.0
White Blend Central Italy 93 NA
Glera Italy Other 93 49.0
Nerello Mascalese Italy Other 93 144.5
Chardonnay Lombardy 93 79.0
Nebbiolo Lombardy 93 60.0
Pinot Nero Lombardy 93 NA
Sparkling Blend Lombardy 93 NA
Turbiana Lombardy 93 24.0
Cabernet Sauvignon Northeastern Italy 93 65.0
Chardonnay Northeastern Italy 93 90.0
Friulano Northeastern Italy 93 60.0
Kerner Northeastern Italy 93 30.0
Merlot Northeastern Italy 93 NA
Moscato Giallo Northeastern Italy 93 NA
Müller-Thurgau Northeastern Italy 93 40.0
Picolit Northeastern Italy 93 68.0
Pinot Nero Northeastern Italy 93 50.0
Red Blend Northeastern Italy 93 57.5
Ribolla Gialla Northeastern Italy 93 117.5
Sauvignon Northeastern Italy 93 NA
Sylvaner Northeastern Italy 93 25.0
Teroldego Northeastern Italy 93 48.0
Tocai Northeastern Italy 93 35.0
White Blend Northeastern Italy 93 NA
Barbera Piedmont 93 31.0
Moscato Piedmont 93 NA
Nebbiolo Piedmont 93 NA
Pinot Nero Piedmont 93 36.0
Red Blend Piedmont 93 NA
Sparkling Blend Piedmont 93 NA
Cabernet Sauvignon Sicily & Sardinia 93 65.0
Carricante Sicily & Sardinia 93 30.0
Grillo Sicily & Sardinia 93 30.0
Malvasia Sicily & Sardinia 93 40.0
Nerello Mascalese Sicily & Sardinia 93 50.5
Nero d’Avola Sicily & Sardinia 93 80.0
Red Blend Sicily & Sardinia 93 NA
Syrah Sicily & Sardinia 93 57.5
Vermentino Sicily & Sardinia 93 NA
White Blend Sicily & Sardinia 93 NA
Zibibbo Sicily & Sardinia 93 42.0
Aglianico Southern Italy 93 NA
Fiano Southern Italy 93 35.0
Greco Southern Italy 93 60.0
Merlot Southern Italy 93 112.5
Moscato Southern Italy 93 NA
Red Blend Southern Italy 93 80.0
Roviello Southern Italy 93 28.0
Sirica Southern Italy 93 23.0
Susumaniello Southern Italy 93 NA
White Blend Southern Italy 93 19.0
Cabernet Franc Tuscany 93 90.0
Cabernet Sauvignon Tuscany 93 NA
Cabernet Sauvignon-Merlot Tuscany 93 69.0
Malvasia Tuscany 93 50.0
Merlot Tuscany 93 75.0
Moscadello Tuscany 93 NA
Petit Verdot Tuscany 93 39.0
Red Blend Tuscany 93 NA
Sangiovese Tuscany 93 NA
Sangiovese Grosso Tuscany 93 NA
Syrah Tuscany 93 NA
Vernaccia Tuscany 93 23.0
White Blend Tuscany 93 NA
Corvina Veneto 93 92.5
Corvina, Rondinella, Molinara Veneto 93 NA
Garganega Veneto 93 NA
Glera Veneto 93 29.0
Red Blend Veneto 93 80.0
White Blend Veneto 93 31.0
Albana Central Italy 92 47.5
Aleatico Central Italy 92 50.0
Cabernet Sauvignon Central Italy 92 60.0
Chardonnay Central Italy 92 55.0
Lambrusco di Sorbara Central Italy 92 24.0
Moscato Central Italy 92 42.0
Passerina Central Italy 92 NA
Red Blend Central Italy 92 57.0
Sagrantino Central Italy 92 NA
Sangiovese Central Italy 92 40.0
Trebbiano Central Italy 92 50.0
Verdicchio Central Italy 92 50.0
White Blend Central Italy 92 43.5
Aglianico Italy Other 92 43.0
Centesimino Italy Other 92 36.0
Falanghina Italy Other 92 39.0
Nebbiolo Italy Other 92 NA
Nerello Mascalese Italy Other 92 26.0
Red Blend Italy Other 92 34.0
Sangiovese Italy Other 92 50.0
Sparkling Blend Italy Other 92 NA
White Blend Italy Other 92 40.0
Chardonnay Lombardy 92 NA
Nebbiolo Lombardy 92 NA
Pinot Nero Lombardy 92 NA
Sparkling Blend Lombardy 92 NA
Turbiana Lombardy 92 22.5
Chardonnay Northeastern Italy 92 38.0
Friulano Northeastern Italy 92 23.0
Gewürztraminer Northeastern Italy 92 39.0
Kerner Northeastern Italy 92 25.0
Müller-Thurgau Northeastern Italy 92 40.0
Picolit Northeastern Italy 92 77.5
Pignolo Northeastern Italy 92 70.0
Pinot Bianco Northeastern Italy 92 35.0
Pinot Grigio Northeastern Italy 92 35.0
Pinot Nero Northeastern Italy 92 44.0
Pinot Noir Northeastern Italy 92 50.0
Red Blend Northeastern Italy 92 61.0
Riesling Northeastern Italy 92 29.0
Sauvignon Northeastern Italy 92 33.0
Sauvignon Blanc Northeastern Italy 92 NA
Sparkling Blend Northeastern Italy 92 60.0
Teroldego Northeastern Italy 92 53.5
Veltliner Northeastern Italy 92 40.0
Verduzzo Northeastern Italy 92 18.0
Verduzzo Friulano Northeastern Italy 92 49.0
White Blend Northeastern Italy 92 NA
Barbera Piedmont 92 NA
Dolcetto Piedmont 92 NA
Moscato Piedmont 92 NA
Nebbiolo Piedmont 92 NA
Pinot Nero Piedmont 92 52.0
Red Blend Piedmont 92 38.0
Timorasso Piedmont 92 60.0
Cabernet Sauvignon Sicily & Sardinia 92 67.5
Cannonau Sicily & Sardinia 92 27.0
Carignano Sicily & Sardinia 92 NA
Carricante Sicily & Sardinia 92 33.5
Chardonnay Sicily & Sardinia 92 36.5
Fiano Sicily & Sardinia 92 43.0
Grecanico Sicily & Sardinia 92 44.0
Moscato Sicily & Sardinia 92 40.0
Moscato di Noto Sicily & Sardinia 92 40.0
Nasco Sicily & Sardinia 92 NA
Nerello Cappuccio Sicily & Sardinia 92 37.5
Nerello Mascalese Sicily & Sardinia 92 41.0
Nero d’Avola Sicily & Sardinia 92 42.0
Perricone Sicily & Sardinia 92 26.0
Red Blend Sicily & Sardinia 92 NA
Syrah Sicily & Sardinia 92 48.5
Vermentino Sicily & Sardinia 92 20.0
White Blend Sicily & Sardinia 92 NA
Zibibbo Sicily & Sardinia 92 NA
Aglianico Southern Italy 92 NA
Caprettone Southern Italy 92 19.0
Falanghina Southern Italy 92 22.0
Fiano Southern Italy 92 37.0
Greco Southern Italy 92 38.0
Nero di Troia Southern Italy 92 65.0
Primitivo Southern Italy 92 NA
Red Blend Southern Italy 92 NA
Roviello Southern Italy 92 NA
Sirica Southern Italy 92 23.0
Susumaniello Southern Italy 92 50.0
Uva di Troia Southern Italy 92 30.0
White Blend Southern Italy 92 NA
Cabernet Blend Tuscany 92 55.0
Cabernet Franc Tuscany 92 110.0
Cabernet Sauvignon Tuscany 92 NA
Cabernet Sauvignon-Merlot Tuscany 92 61.5
Chardonnay Tuscany 92 120.0
Merlot Tuscany 92 NA
Moscadello Tuscany 92 39.5
Petit Verdot Tuscany 92 35.0
Pinot Nero Tuscany 92 NA
Prugnolo Gentile Tuscany 92 65.0
Red Blend Tuscany 92 NA
Sangiovese Tuscany 92 NA
Sangiovese Grosso Tuscany 92 NA
Syrah Tuscany 92 NA
Trebbiano-Malvasia Tuscany 92 25.0
Vermentino Tuscany 92 68.0
Vernaccia Tuscany 92 22.0
White Blend Tuscany 92 NA
Cabernet Sauvignon Veneto 92 60.0
Chardonnay Veneto 92 37.0
Corvina Veneto 92 85.0
Corvina, Rondinella, Molinara Veneto 92 NA
Durella Veneto 92 40.0
Garganega Veneto 92 NA
Glera Veneto 92 NA
Red Blend Veneto 92 NA
Sparkling Blend Veneto 92 NA
Verduzzo Veneto 92 14.0
Vespaiolo Veneto 92 40.0
# One bar chart per Italian territory showing the mean score of each variety.
# The fill is a numeric colour code that starts at 2 and advances by 2 for
# every territory.
colore <- 2

for (territorio in levels(df1$province)) {
  # Mean score per variety within the current territory, ordered by that mean.
  medie_varieta <- df1 %>%
    filter(province == territorio) %>%
    group_by(variety) %>%
    summarise(media_point = mean(points)) %>%
    mutate(variety = reorder(variety, media_point))

  grafico <- ggplot(medie_varieta, aes(x = variety, y = media_point)) +
    geom_col(fill = colore) +
    geom_text(aes(label = round(media_point, 2)), hjust = 0, size = 2) +
    coord_flip() +
    ggtitle(paste("Vini con punteggio più alto in", territorio))

  # Inside a for loop the plot must be printed explicitly to be drawn.
  print(grafico)
  colore <- colore + 2
}