#Setup ##Load Needed Packages
library(dplyr)
library(tidyverse)
library(knitr)
library(ggmap)
library(tidytext)
library(tm)
library(readr)
##Import Data I removed the first column which was just the sequencial ID number
setwd("F:\\R_project")
wine_data = read_csv("WINE1.csv",
col_types = cols(X1 = col_skip()))
#Understanding the Missing Data A simple bar plot shows us that some features are in every single data point, such as points given, while other features are often left out, such as the sub region
missing_data = wine_data %>%
map_df(function(x) sum(is.na(x))) %>%
gather(feature, num_nulls)%>%
arrange(desc(num_nulls))%>%
mutate(percent_missing = num_nulls/nrow(wine_data)*100)
ggplot(missing_data, aes(reorder(feature,percent_missing),percent_missing)) +geom_bar(stat = "identity") + coord_flip() + labs(title = "Feature By Percent of Data Missing", y = "Percent of Observations with Feature Missing", x = "Feature") +scale_y_continuous(limits = c(0, 100))
kable(missing_data, digits = c(0,0,0))
| feature | num_nulls | percent_missing |
|---|---|---|
| region_2 | 79460 | 61 |
| designation | 37465 | 29 |
| taster_twitter_handle | 31213 | 24 |
| taster_name | 26244 | 20 |
| region_1 | 21247 | 16 |
| price | 8996 | 7 |
| country | 63 | 0 |
| province | 63 | 0 |
| variety | 1 | 0 |
| description | 0 | 0 |
| points | 0 | 0 |
| title | 0 | 0 |
| winery | 0 | 0 |
#Most Common Wine Locations
countries = wine_data %>%
group_by(country) %>%
count()
top_countries = countries %>%
filter(n>500)
ggplot(top_countries, aes(reorder(country,n),n)) +geom_bar(stat = "identity") + coord_flip() + labs(title = "Countries with at Least 500 Reviews by Number of Reviews", y = "Number of Reviews", x = "Country")
#Histograms ##Price As is shown in the initial (barely recognizable) boxplot, the data is heavily concentrated below $100 dollars.However due to the size of the data, there are a decent number of outliers, including some over a thousand dollars. The truncated histogram gives a better idea of the actual distribution of prices.
ggplot(wine_data, aes(x = 1 ,y=price)) + geom_boxplot() +
labs(title ="Price Boxplot", y = "Price in Dollars", x = "All Wines") +
coord_flip()
ggplot(wine_data, aes(x=price)) + geom_histogram(binwidth = 5, color= 'white') +
coord_cartesian(xlim = c(0, 100))+
labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number of Reviews") +
scale_x_continuous(breaks=seq(0,100, by = 10))
##Points Given
ggplot(wine_data, aes(x=points)) + geom_histogram(binwidth = 1, color= 'white') +
coord_cartesian(xlim = c(75, 100))+
labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number of Reviews") +
scale_x_continuous(breaks=seq(75,100, by = 1))
#A Closer Look at the Data ##Countries with the Best and Worst Wine
Best_Wines = wine_data %>%
filter(country %in% top_countries$country) %>%
select(country,points)
Best_Wines %>%
group_by(country) %>%
summarise(Mean_Score = mean(points)) %>%
arrange(desc(Mean_Score))%>%
kable()
| country | Mean_Score |
|---|---|
| Austria | 90.10135 |
| Germany | 89.85173 |
| France | 88.84511 |
| Australia | 88.58051 |
| US | 88.56372 |
| Italy | 88.56223 |
| Israel | 88.47129 |
| New Zealand | 88.30303 |
| Portugal | 88.25022 |
| South Africa | 88.05639 |
| Spain | 87.28834 |
| Argentina | 86.71026 |
| Chile | 86.49352 |
Best_Worst = Best_Wines %>%
filter(country %in% c("Austria","Chile"))
ggplot(Best_Worst, aes(points, colour = country, fill = country)) + geom_density(alpha = .4) +
labs(title ="Point Densities of Top and Bottom Countries with at Least 500 Reviews", x = "Points Given", y = "Density")
##Top and Bottom Wineries
wineries = wine_data %>%
group_by(winery) %>%
count()
Biggest_Wineries = wineries %>%
filter(n>100)
Best_Wineries = wine_data %>%
filter(winery %in% Biggest_Wineries$winery) %>%
select(winery,points)
Best_Wineries %>%
group_by(winery) %>%
summarise(Mean_Score = mean(points)) %>%
arrange(desc(Mean_Score))%>%
kable()
| winery | Mean_Score |
|---|---|
| Lynmar | 92.80508 |
| Williams Selyem | 92.74408 |
| Domaine Zind-Humbrecht | 92.52475 |
| Foxen | 91.67619 |
| Louis Jadot | 91.62500 |
| Gary Farrell | 91.61600 |
| Bründlmayer | 91.52475 |
| Iron Horse | 91.18868 |
| Chanson Père et Fils | 90.80734 |
| Testarossa | 90.73853 |
| Albert Bichot | 90.63248 |
| Louis Latour | 90.53769 |
| Siduri | 90.45238 |
| Feudi di San Gregorio | 90.01961 |
| Fess Parker | 89.62037 |
| Chehalem | 89.31373 |
| Jean-Luc and Paul Aegerter | 88.91150 |
| Chateau Ste. Michelle | 88.72680 |
| Kendall-Jackson | 88.63846 |
| V. Sattui | 88.53271 |
| Robert Mondavi | 88.41964 |
| Columbia Crest | 87.88050 |
| Kunde | 87.67327 |
| Montes | 87.62393 |
| Concha y Toro | 87.60976 |
| Wines & Winemakers | 87.59910 |
| Trapiche | 87.40708 |
| Georges Duboeuf | 87.40306 |
| Casa Santos Lima | 87.23894 |
| Undurraga | 87.09735 |
| DFJ Vinhos | 86.66977 |
| Maryhill | 86.30392 |
| Santa Ema | 86.16071 |
Best_Worst = Best_Wineries %>%
filter(winery %in% c("Lynmar","Santa Ema"))
ggplot(Best_Worst, aes(points, colour = winery, fill = winery)) + geom_density(alpha = .4) +
labs(title ="Point Densities of Top and Bottom Wineries with at least 100 Reviews", x = "Points Given", y = "Density")
##Top and Bottom Varieties of Wine
varieties = wine_data %>%
group_by(variety) %>%
count()
Popular_Varieties = varieties %>%
filter(n>500)
Variety_Data = wine_data %>%
filter(variety %in% Popular_Varieties$variety) %>%
select(variety,points)
Variety_Data %>%
group_by(variety) %>%
summarise(Mean_Score = mean(points)) %>%
arrange(desc(Mean_Score))%>%
kable()
| variety | Mean_Score |
|---|---|
| Sangiovese Grosso | 90.52996 |
| Nebbiolo | 90.25107 |
| Grüner Veltliner | 89.98067 |
| Port | 89.73353 |
| Champagne Blend | 89.66332 |
| Riesling | 89.45018 |
| Pinot Noir | 89.41147 |
| Syrah | 89.28658 |
| Rhône-style Red Blend | 89.15364 |
| Bordeaux-style Red Blend | 89.10644 |
| Shiraz | 89.05861 |
| Grenache | 88.96313 |
| Portuguese Red | 88.81062 |
| Bordeaux-style White Blend | 88.69043 |
| Cabernet Sauvignon | 88.60758 |
| Gewürztraminer | 88.59091 |
| Chenin Blanc | 88.57530 |
| Sangiovese | 88.55079 |
| Pinot Gris | 88.49622 |
| Red Blend | 88.38028 |
| Chardonnay | 88.34008 |
| Tempranillo Blend | 88.25510 |
| Corvina, Rondinella, Molinara | 88.24394 |
| Petite Sirah | 88.20390 |
| Cabernet Franc | 88.15078 |
| Sparkling Blend | 88.04505 |
| Gamay | 88.03317 |
| Malbec | 87.98303 |
| Zinfandel | 87.82867 |
| Barbera | 87.78363 |
| Viognier | 87.77108 |
| Tempranillo | 87.51436 |
| Sauvignon Blanc | 87.42964 |
| White Blend | 87.35297 |
| Glera | 87.24401 |
| Merlot | 87.20858 |
| Portuguese White | 86.93098 |
| Rosé | 86.84624 |
| Carmenère | 86.58957 |
| Pinot Grigio | 86.23764 |
Best_Worst = Variety_Data %>%
filter(variety %in% c("Sangiovese Grosso","Pinot Grigio"))
ggplot(Best_Worst, aes(points, colour = variety, fill = variety)) + geom_density(alpha = .4) +
labs(title ="Point Densities of Top and Bottom Wine Varieties with at least 500 Reviews", x = "Points Given", y = "Density")
##Tasters with Highest and Lowests Standards
reviewers = wine_data %>%
group_by(taster_name) %>%
count()
frequent_reviewers= reviewers %>%
filter(n>100)
Reviewer_Data = wine_data %>%
filter(taster_name %in% frequent_reviewers$taster_name) %>%
select(taster_name,points)
Reviewer_Data %>%
group_by(taster_name) %>%
summarise(Mean_Score = mean(points)) %>%
arrange(desc(Mean_Score))%>%
kable()
| taster_name | Mean_Score |
|---|---|
| Anne Krebiehl MW | 90.56255 |
| Matt Kettmann | 90.00869 |
| Virginie Boone | 89.21338 |
| Mike DeSimone | 89.10117 |
| Paul Gregutt | 89.08256 |
| Kerin O’Keefe | 88.86795 |
| Sean P. Sullivan | 88.75574 |
| Roger Voss | 88.70800 |
| Jim Gordon | 88.62629 |
| Joe Czerwinski | 88.53623 |
| Anna Lee C. Iijima | 88.41563 |
| Jeff Jenssen | 88.31976 |
| NA | 87.79512 |
| Lauren Buzzeo | 87.73951 |
| Michael Schachner | 86.90749 |
| Susan Kostrzewa | 86.60922 |
| Carrie Dykes | 86.39568 |
| Alexander Peartree | 85.85542 |
Best_Worst = Reviewer_Data %>%
filter(taster_name %in% c("Alexander Peartree","Matt Kettmann"))
ggplot(Best_Worst, aes(points, colour = taster_name, fill = taster_name)) + geom_density(alpha = .4) +
labs(title ="Point Densities of Most and Least Critical Reviewrs with at least 100 Reviews", x = "Points Given", y = "Density")
#Correlations ##Do the Most Reviewed Wineries Have the Best Wines? It is possible that the better a winery is the more it’s wine is reviewed. On the other hand it could turn out that the number of reviews of a winery’s wine is mostly a function of it’s volume produced and the smaller the winery the better the wine. As it turns out there is barely any correlation
wineries = wine_data %>%
group_by(winery) %>%
count()
Winery_Mean_Points = wine_data %>%
select(winery,points) %>%
group_by(winery) %>%
summarise(Mean_Score = mean(points))
Reviews_Mean = wineries %>%
inner_join(Winery_Mean_Points, by = "winery") %>%
filter(n>25)
ggplot(Reviews_Mean, aes(x=n, y=Mean_Score)) + geom_point(shape=1) + geom_smooth(method=lm , color="red", se=FALSE) +
labs(title ="Mean Winery Score vs. Number of Wines Reviewed from that Winery", x = "Number of Wines Reviewed", y = "Mean Score Given")
##What is the Relationship Between Price and Points Given? My father likes to say that a buying a bottle of wine that costs more than $20 is a waste of money because after that threshold wine does not improve. According to this data set he is wrong. It appears that the correlation between wine price and points given levels off around $40! Additinally, it is clear from the first larger plot that spending thousands of dollars on a bottle of wine is usually a poor decision and a comparable bottle can be had for much less
I_Wines = wine_data %>%
select(price, points) %>%
filter(!is.na(price)) %>%
filter(!is.na(points))
ggplot(I_Wines, aes(x=price, y=points)) + geom_jitter(shape=1) + coord_cartesian(xlim = c(0, 4000), ylim = c(75, 100)) +
labs(title ="Score vs Price", x = "Price", y = "Score")
Reason_Wines = I_Wines %>%
filter(price < 50 )
ggplot(Reason_Wines, aes(x=price, y=points)) + geom_jitter(width = 1,height = 1, alpha = .1) +
stat_smooth(method= "lm" , formula = y~poly(x,2),color="red", se=FALSE) +
coord_cartesian(xlim = c(0, 50), ylim = c(75, 100)) +
labs(title ="Score vs Price for Wines Under 20 Dollars", x = "Price", y = "Score")
#Data Enineering ## Classify Reviews as Bad, Average or Good and by the Standard of the Reviewer The bins are somewhat arbitrary, I attempted to use dplyr’s ntile function to devide the scores into 3 buckets. However, then I eneded up with two scores that were devided into two different classifications. So my bin decision was based on that but expicitly specified to solve this problem
Wine_Eng = wine_data %>%
mutate(grade = ifelse(points > 91,"Good",ifelse(points >86,"Average","Bad")))
Wine_Eng$grade = as.factor(Wine_Eng$grade)
Wine_Eng$grade = factor(Wine_Eng$grade,levels(Wine_Eng$grade)[c(3,1,2)])
ggplot(Wine_Eng, aes(x=points, fill = grade)) + geom_histogram(binwidth = 1, color= 'white') +
coord_cartesian(xlim = c(75, 100))+
labs(title ="Points Histogram Showing How Wines have been Catagorized by Grade", x = "Points Given by Reviewer", y = "Number of Reviews") +
scale_x_continuous(breaks=seq(75,100, by = 1))
ggplot(Wine_Eng, aes(x=price, fill = grade)) + geom_histogram(binwidth = 5, color= 'white') +
coord_cartesian(xlim = c(0, 100))+
labs(title ="Price Histogram for WInes Under $100", x = "Price in Dollars", y = "Number of Reviews") +
scale_x_continuous(breaks=seq(0,100, by = 10))
reviewers = wine_data %>%
group_by(taster_name) %>%
count()
frequent_reviewers= reviewers %>%
filter(n>5)
Reviewer_Data = wine_data %>%
filter(taster_name %in% frequent_reviewers$taster_name) %>%
select(taster_name,points)
Reviewer_Standards = Reviewer_Data %>%
group_by(taster_name) %>%
summarise(Mean_Score = mean(points)) %>%
arrange(desc(Mean_Score)) %>%
mutate(standard = ntile(Mean_Score,3))
Reviewer_Standards$standard = as.factor(as.character(Reviewer_Standards$standard))
levels(Reviewer_Standards$standard) = c("High","Medium", "Low")
Wine_Eng = left_join(Wine_Eng,Reviewer_Standards, by = "taster_name")
filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_histogram(binwidth = 1, color= 'white') +
coord_cartesian(xlim = c(75, 100))+
labs(title ="Points Histogram", x = "Points Given by Reviewer", y = "Number of Reviews") +
scale_x_continuous(breaks=seq(75,100, by = 1))
filter(Wine_Eng, !is.na(grade))%>%
ggplot(aes(x=price, fill = grade)) + geom_bar( position = "fill") +
coord_cartesian(xlim = c(0, 100))+
labs(title ="Distribution of Wine Grade by Price", x = "Price of Wine in Dollars", y = "Proportion") +
scale_x_continuous(breaks=seq(0,100, by = 10))
filter(Wine_Eng, !is.na(taster_name))%>%
ggplot(aes(x=points, fill = standard)) + geom_bar( position = "fill") +
coord_cartesian(xlim = c(80, 100))+
labs(title ="Distribution of Reviewer Standard by Points Given", x = "Score Given by Reviewr", y = "Proportion") +
scale_x_continuous(breaks=seq(80,100, by = 1))
##Explotation of Most Common Words Here I will use td_idf function to determine the words the seperate the reviews for different grades of wine. This algarithm is designed to find the words that are used most frequently in one catagory reletive to the others. I first found the words most common in the entire data set, that is with common meaninless words removed. Unsuprisingly the most common word is ‘wine’ followed by other wine and flavor related words. The most used words for the good and bad wines reletive to the other catagories also make sense. I also looked at the corresponding words for the wines catagorized as average but they were fairly random.
Wine_Words = select(Wine_Eng,description, grade)%>%
unnest_tokens(word, description)%>%
anti_join(stop_words)%>%
filter(!str_detect(word, "[0-9]"))
## Joining, by = "word"
Wine_Words %>%
count(word, sort = TRUE) %>%
top_n(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
labs(title ="Top 20 Significant Words From All Comments", x = "Total Number of Occurances", y = "Word") +
coord_flip()
## Selecting by n
comment_words = Wine_Words %>%
count(grade, word, sort = TRUE) %>%
ungroup()
total_words = comment_words %>%
group_by(grade) %>%
summarize(total = sum(n))
grade_words = left_join(comment_words, total_words)
## Joining, by = "grade"
grade_words = grade_words %>%
bind_tf_idf(word, grade, n)
grade_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(grade) %>%
top_n(10) %>%
ungroup %>%
filter(grade == "Good")%>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend = FALSE, fill = 'red') +
labs(title ="Top 10 Significant Words from Good Bottles", x = NULL, y = "tf-idf") +
coord_flip()
## Selecting by tf_idf
grade_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(grade) %>%
top_n(10) %>%
ungroup %>%
filter(grade == "Bad")%>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend = FALSE, fill = 'Blue') +
labs(title ="Top 10 Significant Words from Bad Bottles", x = NULL, y = "tf-idf") +
coord_flip()
## Selecting by tf_idf