Wine Data Explore

#Setup ##Load Needed Packages

library(dplyr)
library(tidyverse)
library(knitr)
library(ggmap)
library(tidytext)
library(tm)
library(readr)

##Import Data I removed the first column which was just the sequencial ID number

setwd("F:\\R_project")
wine_data = read_csv("WINE1.csv", 
    col_types = cols(X1 = col_skip()))

#Understanding the Missing Data A simple bar plot shows us that some features are in every single data point, such as points given, while other features are often left out, such as the sub region

missing_data = wine_data %>%
    map_df(function(x) sum(is.na(x))) %>%
    gather(feature, num_nulls)%>%
    arrange(desc(num_nulls))%>%
    mutate(percent_missing = num_nulls/nrow(wine_data)*100)

ggplot(missing_data, aes(reorder(feature,percent_missing),percent_missing)) +geom_bar(stat = "identity")  + coord_flip() + labs(title = "Feature By Percent of Data Missing", y = "Percent of Observations with Feature Missing", x = "Feature") +scale_y_continuous(limits = c(0, 100))

kable(missing_data, digits = c(0,0,0))

feature	num_nulls	percent_missing
region_2	79460	61
designation	37465	29
taster_twitter_handle	31213	24
taster_name	26244	20
region_1	21247	16
price	8996	7
country	63	0
province	63	0
variety	1	0
description	0	0
points	0	0
title	0	0
winery	0	0

#Most Common Wine Locations

countries = wine_data %>%
  group_by(country) %>%
  count() 

top_countries = countries %>%
  filter(n>500)

ggplot(top_countries, aes(reorder(country,n),n)) +geom_bar(stat = "identity")  + coord_flip() + labs(title = "Countries with at Least 500 Reviews by Number of Reviews", y = "Number of Reviews", x = "Country")

#Histograms ##Price As is shown in the initial (barely recognizable) boxplot, the data is heavily concentrated below $100 dollars.However due to the size of the data, there are a decent number of outliers, including some over a thousand dollars. The truncated histogram gives a better idea of the actual distribution of prices.

ggplot(wine_data, aes(x = 1 ,y=price)) + geom_boxplot() + 
  labs(title ="Price Boxplot", y = "Price in Dollars", x = "All Wines") +
  coord_flip()

ggplot(wine_data, aes(x=price)) + geom_histogram(binwidth = 5, color= 'white') + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

##Points Given

ggplot(wine_data, aes(x=points)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

#A Closer Look at the Data ##Countries with the Best and Worst Wine

Best_Wines = wine_data %>%
  filter(country %in% top_countries$country) %>%
  select(country,points)

Best_Wines %>%
  group_by(country) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()

country	Mean_Score
Austria	90.10135
Germany	89.85173
France	88.84511
Australia	88.58051
US	88.56372
Italy	88.56223
Israel	88.47129
New Zealand	88.30303
Portugal	88.25022
South Africa	88.05639
Spain	87.28834
Argentina	86.71026
Chile	86.49352

Best_Worst = Best_Wines %>%
  filter(country %in% c("Austria","Chile"))

ggplot(Best_Worst, aes(points, colour = country, fill = country)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Countries with at Least 500 Reviews", x = "Points Given", y = "Density")

##Top and Bottom Wineries

wineries = wine_data %>%
  group_by(winery) %>%
  count() 

Biggest_Wineries = wineries %>%
  filter(n>100)


Best_Wineries = wine_data %>%
  filter(winery %in% Biggest_Wineries$winery) %>%
  select(winery,points)

Best_Wineries %>%
  group_by(winery) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()

winery	Mean_Score
Lynmar	92.80508
Williams Selyem	92.74408
Domaine Zind-Humbrecht	92.52475
Foxen	91.67619
Louis Jadot	91.62500
Gary Farrell	91.61600
Bründlmayer	91.52475
Iron Horse	91.18868
Chanson Père et Fils	90.80734
Testarossa	90.73853
Albert Bichot	90.63248
Louis Latour	90.53769
Siduri	90.45238
Feudi di San Gregorio	90.01961
Fess Parker	89.62037
Chehalem	89.31373
Jean-Luc and Paul Aegerter	88.91150
Chateau Ste. Michelle	88.72680
Kendall-Jackson	88.63846
V. Sattui	88.53271
Robert Mondavi	88.41964
Columbia Crest	87.88050
Kunde	87.67327
Montes	87.62393
Concha y Toro	87.60976
Wines & Winemakers	87.59910
Trapiche	87.40708
Georges Duboeuf	87.40306
Casa Santos Lima	87.23894
Undurraga	87.09735
DFJ Vinhos	86.66977
Maryhill	86.30392
Santa Ema	86.16071

Best_Worst = Best_Wineries %>%
  filter(winery %in% c("Lynmar","Santa Ema"))

ggplot(Best_Worst, aes(points, colour = winery, fill = winery)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Wineries with at least 100 Reviews", x = "Points Given", y = "Density")

##Top and Bottom Varieties of Wine

varieties = wine_data %>%
  group_by(variety) %>%
  count() 

Popular_Varieties = varieties %>%
  filter(n>500)


Variety_Data = wine_data %>%
  filter(variety %in% Popular_Varieties$variety) %>%
  select(variety,points)

Variety_Data %>%
  group_by(variety) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()

variety	Mean_Score
Sangiovese Grosso	90.52996
Nebbiolo	90.25107
Grüner Veltliner	89.98067
Port	89.73353
Champagne Blend	89.66332
Riesling	89.45018
Pinot Noir	89.41147
Syrah	89.28658
Rhône-style Red Blend	89.15364
Bordeaux-style Red Blend	89.10644
Shiraz	89.05861
Grenache	88.96313
Portuguese Red	88.81062
Bordeaux-style White Blend	88.69043
Cabernet Sauvignon	88.60758
Gewürztraminer	88.59091
Chenin Blanc	88.57530
Sangiovese	88.55079
Pinot Gris	88.49622
Red Blend	88.38028
Chardonnay	88.34008
Tempranillo Blend	88.25510
Corvina, Rondinella, Molinara	88.24394
Petite Sirah	88.20390
Cabernet Franc	88.15078
Sparkling Blend	88.04505
Gamay	88.03317
Malbec	87.98303
Zinfandel	87.82867
Barbera	87.78363
Viognier	87.77108
Tempranillo	87.51436
Sauvignon Blanc	87.42964
White Blend	87.35297
Glera	87.24401
Merlot	87.20858
Portuguese White	86.93098
Rosé	86.84624
Carmenère	86.58957
Pinot Grigio	86.23764

Best_Worst = Variety_Data %>%
  filter(variety %in% c("Sangiovese Grosso","Pinot Grigio"))

ggplot(Best_Worst, aes(points, colour = variety, fill = variety)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Wine Varieties with at least 500 Reviews", x = "Points Given", y = "Density")

##Tasters with Highest and Lowests Standards

reviewers = wine_data %>%
  group_by(taster_name) %>%
  count() 

 frequent_reviewers= reviewers %>%
  filter(n>100)


Reviewer_Data = wine_data %>%
  filter(taster_name %in% frequent_reviewers$taster_name) %>%
  select(taster_name,points)

Reviewer_Data %>%
  group_by(taster_name) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()

taster_name	Mean_Score
Anne Krebiehl MW	90.56255
Matt Kettmann	90.00869
Virginie Boone	89.21338
Mike DeSimone	89.10117
Paul Gregutt	89.08256
Kerin O’Keefe	88.86795
Sean P. Sullivan	88.75574
Roger Voss	88.70800
Jim Gordon	88.62629
Joe Czerwinski	88.53623
Anna Lee C. Iijima	88.41563
Jeff Jenssen	88.31976
NA	87.79512
Lauren Buzzeo	87.73951
Michael Schachner	86.90749
Susan Kostrzewa	86.60922
Carrie Dykes	86.39568
Alexander Peartree	85.85542

Best_Worst = Reviewer_Data %>%
  filter(taster_name %in% c("Alexander Peartree","Matt Kettmann"))

ggplot(Best_Worst, aes(points, colour = taster_name, fill = taster_name)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Most and Least Critical Reviewrs with at least 100 Reviews", x = "Points Given", y = "Density")

#Correlations ##Do the Most Reviewed Wineries Have the Best Wines? It is possible that the better a winery is the more it’s wine is reviewed. On the other hand it could turn out that the number of reviews of a winery’s wine is mostly a function of it’s volume produced and the smaller the winery the better the wine. As it turns out there is barely any correlation

wineries = wine_data %>%
  group_by(winery) %>%
  count() 

Winery_Mean_Points = wine_data %>%
  select(winery,points) %>%
  group_by(winery) %>%
  summarise(Mean_Score = mean(points))

Reviews_Mean = wineries %>%
  inner_join(Winery_Mean_Points, by = "winery") %>%
   filter(n>25)

ggplot(Reviews_Mean, aes(x=n, y=Mean_Score)) +    geom_point(shape=1) +  geom_smooth(method=lm , color="red", se=FALSE) + 
   labs(title ="Mean Winery Score vs. Number of Wines Reviewed from that Winery", x = "Number of Wines Reviewed", y = "Mean Score Given")

##What is the Relationship Between Price and Points Given? My father likes to say that a buying a bottle of wine that costs more than $20 is a waste of money because after that threshold wine does not improve. According to this data set he is wrong. It appears that the correlation between wine price and points given levels off around $40! Additinally, it is clear from the first larger plot that spending thousands of dollars on a bottle of wine is usually a poor decision and a comparable bottle can be had for much less

I_Wines = wine_data %>%
  select(price, points) %>%
  filter(!is.na(price)) %>%
  filter(!is.na(points))

ggplot(I_Wines, aes(x=price, y=points)) +    geom_jitter(shape=1)    + coord_cartesian(xlim = c(0, 4000), ylim = c(75, 100)) +
   labs(title ="Score vs Price", x = "Price", y = "Score")

Reason_Wines = I_Wines %>%
  filter(price < 50 )


ggplot(Reason_Wines, aes(x=price, y=points)) +    geom_jitter(width = 1,height = 1, alpha = .1)  + 
  stat_smooth(method= "lm" , formula = y~poly(x,2),color="red", se=FALSE)  + 
  coord_cartesian(xlim = c(0, 50), ylim = c(75, 100)) +
  labs(title ="Score vs Price for Wines Under 20 Dollars", x = "Price", y = "Score")

#Data Enineering ## Classify Reviews as Bad, Average or Good and by the Standard of the Reviewer The bins are somewhat arbitrary, I attempted to use dplyr’s ntile function to devide the scores into 3 buckets. However, then I eneded up with two scores that were devided into two different classifications. So my bin decision was based on that but expicitly specified to solve this problem

Wine_Eng = wine_data %>%
  mutate(grade = ifelse(points > 91,"Good",ifelse(points >86,"Average","Bad")))

Wine_Eng$grade = as.factor(Wine_Eng$grade)
Wine_Eng$grade = factor(Wine_Eng$grade,levels(Wine_Eng$grade)[c(3,1,2)])

ggplot(Wine_Eng, aes(x=points, fill = grade)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram Showing How Wines have been Catagorized by Grade", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

ggplot(Wine_Eng, aes(x=price, fill = grade)) + geom_histogram(binwidth = 5, color= 'white') + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

reviewers = wine_data %>%
  group_by(taster_name) %>%
  count() 

 frequent_reviewers= reviewers %>%
  filter(n>5)


Reviewer_Data = wine_data %>%
  filter(taster_name %in% frequent_reviewers$taster_name) %>%
  select(taster_name,points)

Reviewer_Standards = Reviewer_Data %>%
  group_by(taster_name) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score)) %>%
  mutate(standard = ntile(Mean_Score,3))

Reviewer_Standards$standard = as.factor(as.character(Reviewer_Standards$standard))
levels(Reviewer_Standards$standard) = c("High","Medium", "Low")

Wine_Eng = left_join(Wine_Eng,Reviewer_Standards, by = "taster_name")

filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

filter(Wine_Eng, !is.na(grade))%>%
ggplot(aes(x=price, fill = grade)) + geom_bar( position = "fill") + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Distribution of Wine Grade by Price", x = "Price of Wine in Dollars", y = "Proportion") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_bar( position = "fill") + 
  coord_cartesian(xlim = c(80, 100))+
  labs(title ="Distribution of Reviewer Standard by Points Given", x = "Score Given by Reviewr", y = "Proportion") +
  scale_x_continuous(breaks=seq(80,100, by = 1))

##Explotation of Most Common Words Here I will use td_idf function to determine the words the seperate the reviews for different grades of wine. This algarithm is designed to find the words that are used most frequently in one catagory reletive to the others. I first found the words most common in the entire data set, that is with common meaninless words removed. Unsuprisingly the most common word is ‘wine’ followed by other wine and flavor related words. The most used words for the good and bad wines reletive to the other catagories also make sense. I also looked at the corresponding words for the wines catagorized as average but they were fairly random.

Wine_Words = select(Wine_Eng,description, grade)%>%
              unnest_tokens(word, description)%>%
              anti_join(stop_words)%>%
              filter(!str_detect(word, "[0-9]"))

## Joining, by = "word"

Wine_Words %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  labs(title ="Top 20 Significant Words From All Comments", x = "Total Number of Occurances", y = "Word") +
  coord_flip()

## Selecting by n

comment_words = Wine_Words %>%
  count(grade, word, sort = TRUE) %>%
  ungroup()

total_words = comment_words %>% 
  group_by(grade) %>% 
  summarize(total = sum(n))

grade_words = left_join(comment_words, total_words)

## Joining, by = "grade"

grade_words = grade_words %>%
  bind_tf_idf(word, grade, n)


grade_words %>% 
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(grade) %>% 
  top_n(10) %>% 
  ungroup %>%
  filter(grade == "Good")%>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE, fill = 'red') +
  labs(title ="Top 10 Significant Words from Good Bottles", x = NULL, y = "tf-idf") +
  coord_flip()

## Selecting by tf_idf

grade_words %>% 
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(grade) %>% 
  top_n(10) %>% 
  ungroup %>%
  filter(grade == "Bad")%>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE, fill = 'Blue') +
  labs(title ="Top 10 Significant Words from Bad Bottles", x = NULL, y = "tf-idf") +
  coord_flip()

## Selecting by tf_idf

Wine Data Explore

Suman Ghorai

January 23, 2019