#Setup ##Load Needed Packages

library(dplyr)
library(tidyverse)
library(knitr)
library(ggmap)
library(tidytext)
library(tm)
library(readr)

##Import Data I removed the first column which was just the sequencial ID number

setwd("F:\\R_project")
wine_data = read_csv("WINE1.csv", 
    col_types = cols(X1 = col_skip()))

#Understanding the Missing Data A simple bar plot shows us that some features are in every single data point, such as points given, while other features are often left out, such as the sub region

missing_data = wine_data %>%
    map_df(function(x) sum(is.na(x))) %>%
    gather(feature, num_nulls)%>%
    arrange(desc(num_nulls))%>%
    mutate(percent_missing = num_nulls/nrow(wine_data)*100)

ggplot(missing_data, aes(reorder(feature,percent_missing),percent_missing)) +geom_bar(stat = "identity")  + coord_flip() + labs(title = "Feature By Percent of Data Missing", y = "Percent of Observations with Feature Missing", x = "Feature") +scale_y_continuous(limits = c(0, 100))

kable(missing_data, digits = c(0,0,0))
feature num_nulls percent_missing
region_2 79460 61
designation 37465 29
taster_twitter_handle 31213 24
taster_name 26244 20
region_1 21247 16
price 8996 7
country 63 0
province 63 0
variety 1 0
description 0 0
points 0 0
title 0 0
winery 0 0

#Most Common Wine Locations

countries = wine_data %>%
  group_by(country) %>%
  count() 

top_countries = countries %>%
  filter(n>500)

ggplot(top_countries, aes(reorder(country,n),n)) +geom_bar(stat = "identity")  + coord_flip() + labs(title = "Countries with at Least 500 Reviews by Number of Reviews", y = "Number of Reviews", x = "Country")

#Histograms ##Price As is shown in the initial (barely recognizable) boxplot, the data is heavily concentrated below $100 dollars.However due to the size of the data, there are a decent number of outliers, including some over a thousand dollars. The truncated histogram gives a better idea of the actual distribution of prices.

ggplot(wine_data, aes(x = 1 ,y=price)) + geom_boxplot() + 
  labs(title ="Price Boxplot", y = "Price in Dollars", x = "All Wines") +
  coord_flip()

ggplot(wine_data, aes(x=price)) + geom_histogram(binwidth = 5, color= 'white') + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

##Points Given

ggplot(wine_data, aes(x=points)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

#A Closer Look at the Data ##Countries with the Best and Worst Wine

Best_Wines = wine_data %>%
  filter(country %in% top_countries$country) %>%
  select(country,points)

Best_Wines %>%
  group_by(country) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()
country Mean_Score
Austria 90.10135
Germany 89.85173
France 88.84511
Australia 88.58051
US 88.56372
Italy 88.56223
Israel 88.47129
New Zealand 88.30303
Portugal 88.25022
South Africa 88.05639
Spain 87.28834
Argentina 86.71026
Chile 86.49352
Best_Worst = Best_Wines %>%
  filter(country %in% c("Austria","Chile"))

ggplot(Best_Worst, aes(points, colour = country, fill = country)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Countries with at Least 500 Reviews", x = "Points Given", y = "Density") 

##Top and Bottom Wineries

wineries = wine_data %>%
  group_by(winery) %>%
  count() 

Biggest_Wineries = wineries %>%
  filter(n>100)


Best_Wineries = wine_data %>%
  filter(winery %in% Biggest_Wineries$winery) %>%
  select(winery,points)

Best_Wineries %>%
  group_by(winery) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()
winery Mean_Score
Lynmar 92.80508
Williams Selyem 92.74408
Domaine Zind-Humbrecht 92.52475
Foxen 91.67619
Louis Jadot 91.62500
Gary Farrell 91.61600
Bründlmayer 91.52475
Iron Horse 91.18868
Chanson Père et Fils 90.80734
Testarossa 90.73853
Albert Bichot 90.63248
Louis Latour 90.53769
Siduri 90.45238
Feudi di San Gregorio 90.01961
Fess Parker 89.62037
Chehalem 89.31373
Jean-Luc and Paul Aegerter 88.91150
Chateau Ste. Michelle 88.72680
Kendall-Jackson 88.63846
V. Sattui 88.53271
Robert Mondavi 88.41964
Columbia Crest 87.88050
Kunde 87.67327
Montes 87.62393
Concha y Toro 87.60976
Wines & Winemakers 87.59910
Trapiche 87.40708
Georges Duboeuf 87.40306
Casa Santos Lima 87.23894
Undurraga 87.09735
DFJ Vinhos 86.66977
Maryhill 86.30392
Santa Ema 86.16071
Best_Worst = Best_Wineries %>%
  filter(winery %in% c("Lynmar","Santa Ema"))

ggplot(Best_Worst, aes(points, colour = winery, fill = winery)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Wineries with at least 100 Reviews", x = "Points Given", y = "Density") 

##Top and Bottom Varieties of Wine

varieties = wine_data %>%
  group_by(variety) %>%
  count() 

Popular_Varieties = varieties %>%
  filter(n>500)


Variety_Data = wine_data %>%
  filter(variety %in% Popular_Varieties$variety) %>%
  select(variety,points)

Variety_Data %>%
  group_by(variety) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()
variety Mean_Score
Sangiovese Grosso 90.52996
Nebbiolo 90.25107
Grüner Veltliner 89.98067
Port 89.73353
Champagne Blend 89.66332
Riesling 89.45018
Pinot Noir 89.41147
Syrah 89.28658
Rhône-style Red Blend 89.15364
Bordeaux-style Red Blend 89.10644
Shiraz 89.05861
Grenache 88.96313
Portuguese Red 88.81062
Bordeaux-style White Blend 88.69043
Cabernet Sauvignon 88.60758
Gewürztraminer 88.59091
Chenin Blanc 88.57530
Sangiovese 88.55079
Pinot Gris 88.49622
Red Blend 88.38028
Chardonnay 88.34008
Tempranillo Blend 88.25510
Corvina, Rondinella, Molinara 88.24394
Petite Sirah 88.20390
Cabernet Franc 88.15078
Sparkling Blend 88.04505
Gamay 88.03317
Malbec 87.98303
Zinfandel 87.82867
Barbera 87.78363
Viognier 87.77108
Tempranillo 87.51436
Sauvignon Blanc 87.42964
White Blend 87.35297
Glera 87.24401
Merlot 87.20858
Portuguese White 86.93098
Rosé 86.84624
Carmenère 86.58957
Pinot Grigio 86.23764
Best_Worst = Variety_Data %>%
  filter(variety %in% c("Sangiovese Grosso","Pinot Grigio"))

ggplot(Best_Worst, aes(points, colour = variety, fill = variety)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Top and Bottom Wine Varieties with at least 500 Reviews", x = "Points Given", y = "Density") 

##Tasters with Highest and Lowests Standards

reviewers = wine_data %>%
  group_by(taster_name) %>%
  count() 

 frequent_reviewers= reviewers %>%
  filter(n>100)


Reviewer_Data = wine_data %>%
  filter(taster_name %in% frequent_reviewers$taster_name) %>%
  select(taster_name,points)

Reviewer_Data %>%
  group_by(taster_name) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score))%>%
  kable()
taster_name Mean_Score
Anne Krebiehl MW 90.56255
Matt Kettmann 90.00869
Virginie Boone 89.21338
Mike DeSimone 89.10117
Paul Gregutt 89.08256
Kerin O’Keefe 88.86795
Sean P. Sullivan 88.75574
Roger Voss 88.70800
Jim Gordon 88.62629
Joe Czerwinski 88.53623
Anna Lee C. Iijima 88.41563
Jeff Jenssen 88.31976
NA 87.79512
Lauren Buzzeo 87.73951
Michael Schachner 86.90749
Susan Kostrzewa 86.60922
Carrie Dykes 86.39568
Alexander Peartree 85.85542
Best_Worst = Reviewer_Data %>%
  filter(taster_name %in% c("Alexander Peartree","Matt Kettmann"))

ggplot(Best_Worst, aes(points, colour = taster_name, fill = taster_name)) + geom_density(alpha = .4) +
  labs(title ="Point Densities of Most and Least Critical Reviewrs with at least 100 Reviews", x = "Points Given", y = "Density") 

#Correlations ##Do the Most Reviewed Wineries Have the Best Wines? It is possible that the better a winery is the more it’s wine is reviewed. On the other hand it could turn out that the number of reviews of a winery’s wine is mostly a function of it’s volume produced and the smaller the winery the better the wine. As it turns out there is barely any correlation

wineries = wine_data %>%
  group_by(winery) %>%
  count() 

Winery_Mean_Points = wine_data %>%
  select(winery,points) %>%
  group_by(winery) %>%
  summarise(Mean_Score = mean(points))

Reviews_Mean = wineries %>%
  inner_join(Winery_Mean_Points, by = "winery") %>%
   filter(n>25)

ggplot(Reviews_Mean, aes(x=n, y=Mean_Score)) +    geom_point(shape=1) +  geom_smooth(method=lm , color="red", se=FALSE) + 
   labs(title ="Mean Winery Score vs. Number of Wines Reviewed from that Winery", x = "Number of Wines Reviewed", y = "Mean Score Given")

##What is the Relationship Between Price and Points Given? My father likes to say that a buying a bottle of wine that costs more than $20 is a waste of money because after that threshold wine does not improve. According to this data set he is wrong. It appears that the correlation between wine price and points given levels off around $40! Additinally, it is clear from the first larger plot that spending thousands of dollars on a bottle of wine is usually a poor decision and a comparable bottle can be had for much less

I_Wines = wine_data %>%
  select(price, points) %>%
  filter(!is.na(price)) %>%
  filter(!is.na(points))

ggplot(I_Wines, aes(x=price, y=points)) +    geom_jitter(shape=1)    + coord_cartesian(xlim = c(0, 4000), ylim = c(75, 100)) +
   labs(title ="Score vs Price", x = "Price", y = "Score")

Reason_Wines = I_Wines %>%
  filter(price < 50 )


ggplot(Reason_Wines, aes(x=price, y=points)) +    geom_jitter(width = 1,height = 1, alpha = .1)  + 
  stat_smooth(method= "lm" , formula = y~poly(x,2),color="red", se=FALSE)  + 
  coord_cartesian(xlim = c(0, 50), ylim = c(75, 100)) +
  labs(title ="Score vs Price for Wines Under 20 Dollars", x = "Price", y = "Score")

#Data Enineering ## Classify Reviews as Bad, Average or Good and by the Standard of the Reviewer The bins are somewhat arbitrary, I attempted to use dplyr’s ntile function to devide the scores into 3 buckets. However, then I eneded up with two scores that were devided into two different classifications. So my bin decision was based on that but expicitly specified to solve this problem

Wine_Eng = wine_data %>%
  mutate(grade = ifelse(points > 91,"Good",ifelse(points >86,"Average","Bad")))

Wine_Eng$grade = as.factor(Wine_Eng$grade)
Wine_Eng$grade = factor(Wine_Eng$grade,levels(Wine_Eng$grade)[c(3,1,2)])

ggplot(Wine_Eng, aes(x=points, fill = grade)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram Showing How Wines have been Catagorized by Grade", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

ggplot(Wine_Eng, aes(x=price, fill = grade)) + geom_histogram(binwidth = 5, color= 'white') + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

reviewers = wine_data %>%
  group_by(taster_name) %>%
  count() 

 frequent_reviewers= reviewers %>%
  filter(n>5)


Reviewer_Data = wine_data %>%
  filter(taster_name %in% frequent_reviewers$taster_name) %>%
  select(taster_name,points)

Reviewer_Standards = Reviewer_Data %>%
  group_by(taster_name) %>%
  summarise(Mean_Score = mean(points)) %>%
  arrange(desc(Mean_Score)) %>%
  mutate(standard = ntile(Mean_Score,3))

Reviewer_Standards$standard = as.factor(as.character(Reviewer_Standards$standard))
levels(Reviewer_Standards$standard) = c("High","Medium", "Low")

Wine_Eng = left_join(Wine_Eng,Reviewer_Standards, by = "taster_name")

filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_histogram(binwidth = 1, color= 'white') + 
  coord_cartesian(xlim = c(75, 100))+
  labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number  of Reviews") +
  scale_x_continuous(breaks=seq(75,100, by = 1))

filter(Wine_Eng, !is.na(grade))%>%
ggplot(aes(x=price, fill = grade)) + geom_bar( position = "fill") + 
  coord_cartesian(xlim = c(0, 100))+
  labs(title ="Distribution of Wine Grade by Price", x = "Price of Wine in Dollars", y = "Proportion") +
  scale_x_continuous(breaks=seq(0,100, by = 10))

filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_bar( position = "fill") + 
  coord_cartesian(xlim = c(80, 100))+
  labs(title ="Distribution of Reviewer Standard by Points Given", x = "Score Given by Reviewr", y = "Proportion") +
  scale_x_continuous(breaks=seq(80,100, by = 1))

##Explotation of Most Common Words Here I will use td_idf function to determine the words the seperate the reviews for different grades of wine. This algarithm is designed to find the words that are used most frequently in one catagory reletive to the others. I first found the words most common in the entire data set, that is with common meaninless words removed. Unsuprisingly the most common word is ‘wine’ followed by other wine and flavor related words. The most used words for the good and bad wines reletive to the other catagories also make sense. I also looked at the corresponding words for the wines catagorized as average but they were fairly random.

Wine_Words = select(Wine_Eng,description, grade)%>%
              unnest_tokens(word, description)%>%
              anti_join(stop_words)%>%
              filter(!str_detect(word, "[0-9]"))
## Joining, by = "word"
Wine_Words %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  labs(title ="Top 20 Significant Words From All Comments", x = "Total Number of Occurances", y = "Word") +
  coord_flip()
## Selecting by n

comment_words = Wine_Words %>%
  count(grade, word, sort = TRUE) %>%
  ungroup()

total_words = comment_words %>% 
  group_by(grade) %>% 
  summarize(total = sum(n))

grade_words = left_join(comment_words, total_words)
## Joining, by = "grade"
grade_words = grade_words %>%
  bind_tf_idf(word, grade, n)


grade_words %>% 
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(grade) %>% 
  top_n(10) %>% 
  ungroup %>%
  filter(grade == "Good")%>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE, fill = 'red') +
  labs(title ="Top 10 Significant Words from Good Bottles", x = NULL, y = "tf-idf") +
  coord_flip()
## Selecting by tf_idf

grade_words %>% 
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(grade) %>% 
  top_n(10) %>% 
  ungroup %>%
  filter(grade == "Bad")%>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE, fill = 'Blue') +
  labs(title ="Top 10 Significant Words from Bad Bottles", x = NULL, y = "tf-idf") +
  coord_flip()
## Selecting by tf_idf