library(tidyverse)
library(moderndive)
library(caret)
library(wesanderson)
wine <- read_rds("/Users/Rose/Downloads/wine.rds")
head(wine)
## # A tibble: 6 x 15
## id country description designation points price province region_1 region_2
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 Portug… This is ri… Avidagos 87 15 Douro <NA> <NA>
## 2 2 US Tart and s… <NA> 87 14 Oregon Willame… Willame…
## 3 3 US Pineapple … Reserve La… 87 13 Michigan Lake Mi… <NA>
## 4 4 US Much like … Vintner's … 87 65 Oregon Willame… Willame…
## 5 5 Spain Blackberry… Ars In Vit… 87 15 Norther… Navarra <NA>
## 6 6 Italy Here's a b… Belsito 87 16 Sicily … Vittoria <NA>
## # … with 6 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## # title <chr>, variety <chr>, winery <chr>, year <dbl>
# Create dummy var for Roger Voss
voss_wine <- wine %>%
mutate(voss = ifelse(taster_name == "Roger Voss", 1,0))
# How many times did he rate a wine?
table(voss_wine$voss) # did not = 71k, did = 17k
##
## 0 1
## 71965 17591
# Which wines does Roger dislike compared to other wines?
voss <- voss_wine %>%
filter(voss == 1) %>%
group_by(variety) %>%
summarize(avg_points_voss = mean(points)) %>%
arrange(avg_points_voss)
voss
## # A tibble: 193 x 2
## variety avg_points_voss
## <chr> <dbl>
## 1 Gewürztraminer-Riesling 82
## 2 Alsace white blend 84
## 3 Grenache 84
## 4 Portuguese Rosé 84.7
## 5 Cabernet 85
## 6 Cabernet Blend 85
## 7 Cabernet Merlot 85
## 8 Chardonnay-Pinot Gris 85
## 9 Folle Blanche 85
## 10 Malbec-Cabernet Franc 85
## # … with 183 more rows
other_wines <- voss_wine %>%
filter(voss == 0) %>%
group_by(variety) %>%
summarise(avg_points_other = mean(points)) %>%
arrange(avg_points_other)
other_wines
## # A tibble: 566 x 2
## variety avg_points_other
## <chr> <dbl>
## 1 Airen 81.7
## 2 Aidani 82
## 3 Picapoll 82
## 4 Shiraz-Tempranillo 82
## 5 Arinto 83
## 6 Athiri 83
## 7 Forcallà 83
## 8 País 83
## 9 Premsal 83
## 10 Alvarinho 83.5
## # … with 556 more rows
altoghether <- voss %>%
left_join(other_wines, by = "variety") %>%
mutate(diff = avg_points_voss - avg_points_other) %>%
arrange(diff)
# these are the wines he rated and liked more
altoghether %>% arrange(desc(diff))
## # A tibble: 193 x 4
## variety avg_points_voss avg_points_other diff
## <chr> <dbl> <dbl> <dbl>
## 1 Petit Manseng 92.2 85.8 6.40
## 2 Sauvignon Blanc-Chardonnay 90 84.5 5.5
## 3 Alvarinho 88.6 83.5 5.11
## 4 Malbec-Tannat 92.3 87.2 5.07
## 5 Jaen 90.6 86 4.60
## 6 Pinot Meunier 92.5 88.2 4.30
## 7 Arinto 87.3 83 4.26
## 8 Tannat 91.6 87.6 4.03
## 9 Abouriou 89 85 4
## 10 Tocai 90.7 87 3.67
## # … with 183 more rows
# these are the wines he disliked more
altoghether %>% arrange(diff)
## # A tibble: 193 x 4
## variety avg_points_voss avg_points_other diff
## <chr> <dbl> <dbl> <dbl>
## 1 Alsace white blend 84 90.8 -6.78
## 2 Grenache 84 89.5 -5.52
## 3 Gewürztraminer-Riesling 82 87.3 -5.33
## 4 Pinot Noir-Gamay 85.6 90 -4.4
## 5 Tinta Barroca 86.7 91 -4.33
## 6 Cabernet Blend 85 88.8 -3.82
## 7 Malbec-Cabernet Franc 85 88.7 -3.7
## 8 Roter Veltliner 87 90.5 -3.5
## 9 Sylvaner 85.9 89 -3.14
## 10 Pinot Auxerrois 85 88.1 -3.12
## # … with 183 more rows
head(table(wine$variety), 20)
##
## Abouriou Agiorgitiko Aglianico
## 3 63 175
## Aidani Airen Albana
## 1 3 13
## Albanello Albariño Aleatico
## 1 398 2
## Alfrocheiro Alicante Alicante Bouschet
## 15 2 44
## Aligoté Alsace white blend Altesse
## 26 42 5
## Alvarelhão Alvarinho Alvarinho-Chardonnay
## 2 98 4
## Antão Vaz Aragonês
## 15 7
# a lot of wines that are just 1-5, maybe would have to factor collapse into another category
wine %>%
mutate(roger=taster_name=="Roger Voss") %>%
mutate(pinot_gris=variety=="Pinot Gris") %>%
drop_na(roger) %>%
group_by(roger, pinot_gris) %>%
summarise(points = mean(points)) %>%
ggplot() +
aes(x = pinot_gris, y = points, color = roger) +
geom_line(aes(group = roger)) +
geom_point()
wine %>%
filter(province=="Oregon") %>%
group_by(year) %>%
summarise(price=mean(price)) %>%
ggplot(aes(year,price))+
geom_line()+
labs(title = "Oregon wine over the years")
Looking at this time series, you can see that there are clear ups and downs, but they are all relative to one another in other words, there is a baseline effect
library(fastDummies)
wine %>%
select(taster_name) %>%
dummy_cols() %>%
select(1:4) %>%
head()
## # A tibble: 6 x 4
## taster_name `taster_name_Roger … `taster_name_Paul … `taster_name_Alexande…
## <chr> <int> <int> <int>
## 1 Roger Voss 1 0 0
## 2 Paul Gregutt 0 1 0
## 3 Alexander Pea… 0 0 1
## 4 Paul Gregutt 0 1 0
## 5 Michael Schac… 0 0 0
## 6 Kerin O’Keefe 0 0 0
wine %>%
select(variety) %>%
mutate(variety=fct_lump(variety,4)) %>%
# Taking 4 of the highest frequency varieties, then other are put in a different "other" category
dummy_cols() %>%
head()
## # A tibble: 6 x 6
## variety `variety_Cabern… variety_Chardon… `variety_Pinot … `variety_Red Bl…
## <fct> <int> <int> <int> <int>
## 1 Other 0 0 0 0
## 2 Other 0 0 0 0
## 3 Other 0 0 0 0
## 4 Pinot … 0 0 1 0
## 5 Other 0 0 0 0
## 6 Other 0 0 0 0
## # … with 1 more variable: variety_Other <int>
wine %>%
ggplot(aes(price))+
geom_histogram()
wine %>%
ggplot(aes(log(price)))+
geom_histogram()
…allows for common scale across variables. Also helps reduce bias when interactions are included (i.e. eliminates variance inflation).
And there are many other transformations that you can read about.
This chapter has a good overview of interactions.
library(caret)
wino <- wine %>%
mutate(fr=(country=="France")) %>% # Mutates to a T/F for both fr & cab
mutate(cab=str_detect(variety,"Cabernet")) %>%
mutate(lprice=log(price)) %>%
drop_na(fr, cab) %>% # Caret Library is extremely particular about NAs
select(lprice, points, fr, cab)
# Seperate into Train & Test
wine_index <- createDataPartition(wino$lprice, p = 0.8, list = FALSE) # partitioned on basis of log price
wino_tr <- wino[ wine_index, ] # Training set 80%
wino_te <- wino[-wine_index, ] # Testin set 20%
control <- trainControl(method="repeatedcv", number=5, repeats=3) # cv = cross validation
# v-fold cross validation
## I want 5 folds
## & I want to repeat the process: 3 times
## Meaning we are essentially running the model 15 times
### Then validating it against a sub sample of the training data each of those 15 times as well
m1 <- train(lprice ~ ., # Calling train function: on this data, "i am going to give it the method - LM"
data = wino_tr,
method = "lm", # could be GLM or decision tree, etc. (read below first)
trControl = control)
# Running a linear regression with sub sampling
# This means for the training data, it can be trained seperate from a model chosen.. we can do above or...
## m1 <- train(lprice ~ .,
# data = wino_tr,
# method = "glm",
# trControl = control)
## m1 <- train(lprice ~ .,
# data = wino_tr,
# method = "decision tree", # Etc.
# trControl = control)
# Can do exact same everything (same data, same subsampling method, same framework), but change the type of model being used in the end
Follow this link for the full documentation on caret.
# Results
m1
## Linear Regression
##
## 71603 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 57282, 57281, 57284, 57282, 57283, 57283, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.5106586 0.3966542 0.402223
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
## No pre processing
## Different sample sizes around 57k
## RMSE = .51
wine_pred <- predict(m1, wino_te) # Using model created and on the data we want to run it on
postResample(pred=wine_pred,
obs = wino_te$lprice)
## RMSE Rsquared MAE
## 0.5109516 0.3913061 0.4034114
wino <- wine %>%
mutate(country=fct_lump(country,4)) %>% # top 4 countries based on frequency
mutate(variety=fct_lump(variety,4)) %>% # top 4 varieties based on frequency
mutate(lprice=log(price)) %>%
select(lprice, points, country, variety) %>%
drop_na(.)
# Creates DF with variables country and variety, but only shows the top 4 in frequency, all others are named "other"
head(wino)
## # A tibble: 6 x 4
## lprice points country variety
## <dbl> <dbl> <fct> <fct>
## 1 2.71 87 Other Other
## 2 2.64 87 US Other
## 3 2.56 87 US Other
## 4 4.17 87 US Pinot Noir
## 5 2.71 87 Spain Other
## 6 2.77 87 Italy Other
# Additional data cleaning
wino <- dummy_cols(wino, remove_selected_columns = T) %>%
select(-country_Other, -variety_Other) %>% # gets rid of "other" dummy variable
rename_all(funs(tolower(.))) %>% # turning everything lowercase
rename_all(funs(str_replace_all(., "-", "_"))) %>% # replacing all dashes with underscores
rename_all(funs(str_replace_all(., " ", "_")))
head(wino) %>%
select(1:7)
## # A tibble: 6 x 7
## lprice points country_france country_italy country_spain country_us
## <dbl> <dbl> <int> <int> <int> <int>
## 1 2.71 87 0 0 0 0
## 2 2.64 87 0 0 0 1
## 3 2.56 87 0 0 0 1
## 4 4.17 87 0 0 0 1
## 5 2.71 87 0 0 1 0
## 6 2.77 87 0 1 0 0
## # … with 1 more variable: variety_cabernet_sauvignon <int>
wine_index <- createDataPartition(wino$lprice, p = 0.8, list = FALSE)
wino_tr <- wino[ wine_index, ]
wino_te <- wino[-wine_index, ]
m2 <- train(lprice ~ .,
data = wino_tr,
method = "lm",
trControl = control)
m2
## Linear Regression
##
## 71603 samples
## 9 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 57284, 57282, 57282, 57282, 57282, 57281, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4887842 0.4464716 0.3797216
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# RMSE = .488
# Rsq = .447
# MAE = .379
wine_pred <- predict(m2, wino_te)
postResample(pred=wine_pred,
obs = wino_te$lprice)
## RMSE Rsquared MAE
## 0.4888447 0.4457318 0.3809720
# estimate variable importance
importance <- varImp(m2, scale=TRUE)
# plot importance
plot(importance)
x <- select(wino_tr,-lprice)
y <- wino_tr$lprice
control <- rfeControl(functions=rfFuncs, method="cv", number=2)
# run the RFE algorithm
results <- rfe(x, y, sizes=c(1:9), rfeControl=control)
# summarize the results
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (2 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 0.5053 0.4083 0.3986 0.0033685 0.0046801 0.0031303
## 2 0.5269 0.3952 0.4170 0.0057977 0.0061587 0.0045627
## 3 0.5364 0.3991 0.4217 0.0024455 0.0032882 0.0013382
## 4 0.5371 0.4219 0.4198 0.0021017 0.0032352 0.0009839
## 5 0.5477 0.4261 0.4295 0.0034949 0.0027620 0.0033462
## 6 0.4887 0.4726 0.3806 0.0016965 0.0039018 0.0011442
## 7 0.4929 0.4704 0.3836 0.0005725 0.0008272 0.0008715
## 8 0.4938 0.4788 0.3848 0.0004989 0.0024974 0.0003169
## 9 0.4722 0.4939 0.3666 0.0024039 0.0018672 0.0017644 *
##
## The top 5 variables (out of 9):
## points, country_italy, variety_pinot_noir, country_us, variety_cabernet_sauvignon
# list the chosen features
predictors(results)
## [1] "points" "country_italy"
## [3] "variety_pinot_noir" "country_us"
## [5] "variety_cabernet_sauvignon" "variety_red_blend"
## [7] "variety_chardonnay" "country_france"
## [9] "country_spain"
# plot the results
plot(results, type=c("g", "o"))