library(tidyverse)
library(moderndive)
library(caret)
library(wesanderson)

wine <- read_rds("/Users/Rose/Downloads/wine.rds")

head(wine)

## # A tibble: 6 x 15
##      id country description designation points price province region_1 region_2
##   <dbl> <chr>   <chr>       <chr>        <dbl> <dbl> <chr>    <chr>    <chr>   
## 1     1 Portug… This is ri… Avidagos        87    15 Douro    <NA>     <NA>    
## 2     2 US      Tart and s… <NA>            87    14 Oregon   Willame… Willame…
## 3     3 US      Pineapple … Reserve La…     87    13 Michigan Lake Mi… <NA>    
## 4     4 US      Much like … Vintner's …     87    65 Oregon   Willame… Willame…
## 5     5 Spain   Blackberry… Ars In Vit…     87    15 Norther… Navarra  <NA>    
## 6     6 Italy   Here's a b… Belsito         87    16 Sicily … Vittoria <NA>    
## # … with 6 more variables: taster_name <chr>, taster_twitter_handle <chr>,
## #   title <chr>, variety <chr>, winery <chr>, year <dbl>

Dummy Variable Exercise

# Create dummy var for Roger Voss
voss_wine <- wine %>%
  mutate(voss = ifelse(taster_name == "Roger Voss", 1,0))

# How many times did he rate a wine?
table(voss_wine$voss) # did not = 71k, did = 17k

## 
##     0     1 
## 71965 17591

# Which wines does Roger dislike compared to other wines?
voss <- voss_wine %>%
  filter(voss == 1) %>%
  group_by(variety) %>%
  summarize(avg_points_voss = mean(points)) %>%
  arrange(avg_points_voss)

voss

## # A tibble: 193 x 2
##    variety                 avg_points_voss
##    <chr>                             <dbl>
##  1 Gewürztraminer-Riesling            82  
##  2 Alsace white blend                 84  
##  3 Grenache                           84  
##  4 Portuguese Rosé                    84.7
##  5 Cabernet                           85  
##  6 Cabernet Blend                     85  
##  7 Cabernet Merlot                    85  
##  8 Chardonnay-Pinot Gris              85  
##  9 Folle Blanche                      85  
## 10 Malbec-Cabernet Franc              85  
## # … with 183 more rows

other_wines <- voss_wine %>%
  filter(voss == 0) %>%
  group_by(variety) %>%
  summarise(avg_points_other = mean(points)) %>%
  arrange(avg_points_other)

other_wines

## # A tibble: 566 x 2
##    variety            avg_points_other
##    <chr>                         <dbl>
##  1 Airen                          81.7
##  2 Aidani                         82  
##  3 Picapoll                       82  
##  4 Shiraz-Tempranillo             82  
##  5 Arinto                         83  
##  6 Athiri                         83  
##  7 Forcallà                       83  
##  8 País                           83  
##  9 Premsal                        83  
## 10 Alvarinho                      83.5
## # … with 556 more rows

altoghether <- voss %>%
  left_join(other_wines, by = "variety") %>% 
  mutate(diff = avg_points_voss - avg_points_other) %>%
  arrange(diff)


# these are the wines he rated and liked more
altoghether %>% arrange(desc(diff))

## # A tibble: 193 x 4
##    variety                    avg_points_voss avg_points_other  diff
##    <chr>                                <dbl>            <dbl> <dbl>
##  1 Petit Manseng                         92.2             85.8  6.40
##  2 Sauvignon Blanc-Chardonnay            90               84.5  5.5 
##  3 Alvarinho                             88.6             83.5  5.11
##  4 Malbec-Tannat                         92.3             87.2  5.07
##  5 Jaen                                  90.6             86    4.60
##  6 Pinot Meunier                         92.5             88.2  4.30
##  7 Arinto                                87.3             83    4.26
##  8 Tannat                                91.6             87.6  4.03
##  9 Abouriou                              89               85    4   
## 10 Tocai                                 90.7             87    3.67
## # … with 183 more rows

# these are the wines he disliked more
altoghether %>% arrange(diff)

## # A tibble: 193 x 4
##    variety                 avg_points_voss avg_points_other  diff
##    <chr>                             <dbl>            <dbl> <dbl>
##  1 Alsace white blend                 84               90.8 -6.78
##  2 Grenache                           84               89.5 -5.52
##  3 Gewürztraminer-Riesling            82               87.3 -5.33
##  4 Pinot Noir-Gamay                   85.6             90   -4.4 
##  5 Tinta Barroca                      86.7             91   -4.33
##  6 Cabernet Blend                     85               88.8 -3.82
##  7 Malbec-Cabernet Franc              85               88.7 -3.7 
##  8 Roter Veltliner                    87               90.5 -3.5 
##  9 Sylvaner                           85.9             89   -3.14
## 10 Pinot Auxerrois                    85               88.1 -3.12
## # … with 183 more rows

head(table(wine$variety), 20)

## 
##             Abouriou          Agiorgitiko            Aglianico 
##                    3                   63                  175 
##               Aidani                Airen               Albana 
##                    1                    3                   13 
##            Albanello             Albariño             Aleatico 
##                    1                  398                    2 
##          Alfrocheiro             Alicante    Alicante Bouschet 
##                   15                    2                   44 
##              Aligoté   Alsace white blend              Altesse 
##                   26                   42                    5 
##           Alvarelhão            Alvarinho Alvarinho-Chardonnay 
##                    2                   98                    4 
##            Antão Vaz             Aragonês 
##                   15                    7

# a lot of wines that are just 1-5, maybe would have to factor collapse into another category

Categorical vs. Continuous Variables

What is a categorical variable?
What is a continuous variable?
Why do we need to “look” at the data before modeling it?

Categorical Example 1

wine %>% 
  mutate(roger=taster_name=="Roger Voss") %>% 
  mutate(pinot_gris=variety=="Pinot Gris") %>% 
  drop_na(roger) %>% 
  group_by(roger, pinot_gris) %>% 
  summarise(points = mean(points)) %>% 
  ggplot() +
  aes(x = pinot_gris, y = points, color = roger) +
  geom_line(aes(group = roger)) +
  geom_point()

In general, ppl rate Pinot Gris generally higher than Roger Voss

Categorical Example 2

wine %>% 
  filter(province=="Oregon") %>% 
  group_by(year) %>% 
  summarise(price=mean(price)) %>% 
  ggplot(aes(year,price))+
  geom_line()+
  labs(title = "Oregon wine over the years")

Looking at this time series, you can see that there are clear ups and downs, but they are all relative to one another in other words, there is a baseline effect
Something worth looking at would be creating a dummy variable for year is it 2001? (1 yes/ 0 no) to better understand
- by doing this you is adding an intercept to the y axis for each year independently
- You will get a more accurate representation of the relationship between other variables becasue it wipes out the baseline

Encoding categorical features: few dummies

library(fastDummies)
wine %>% 
  select(taster_name) %>% 
  dummy_cols() %>% 
  select(1:4) %>% 
  head()

## # A tibble: 6 x 4
##   taster_name    `taster_name_Roger … `taster_name_Paul … `taster_name_Alexande…
##   <chr>                         <int>               <int>                  <int>
## 1 Roger Voss                        1                   0                      0
## 2 Paul Gregutt                      0                   1                      0
## 3 Alexander Pea…                    0                   0                      1
## 4 Paul Gregutt                      0                   1                      0
## 5 Michael Schac…                    0                   0                      0
## 6 Kerin O’Keefe                     0                   0                      0

Encoding categorical features: many dummies

wine %>% 
  select(variety) %>%
  mutate(variety=fct_lump(variety,4)) %>% 
# Taking 4 of the highest frequency varieties, then other are put in a different "other" category
  dummy_cols() %>% 
  head()

## # A tibble: 6 x 6
##   variety `variety_Cabern… variety_Chardon… `variety_Pinot … `variety_Red Bl…
##   <fct>              <int>            <int>            <int>            <int>
## 1 Other                  0                0                0                0
## 2 Other                  0                0                0                0
## 3 Other                  0                0                0                0
## 4 Pinot …                0                0                1                0
## 5 Other                  0                0                0                0
## 6 Other                  0                0                0                0
## # … with 1 more variable: variety_Other <int>

Other types of engineered categorical features…

Words or phrases in text
A given time period
An arbitrary numerical cut-off
Demographic variables
Etc.

What about numerical features?

wine %>%
  ggplot(aes(price))+
    geom_histogram()

looking at this, we may want to take the natural log of price

Take the natural log

wine %>%
  ggplot(aes(log(price)))+
    geom_histogram()

The outliers are balanced out

Engineering numeric features: Standardizing

mean-centering \(x-\bar{x}\)
scaling: \(x/std(x)\)

…allows for common scale across variables. Also helps reduce bias when interactions are included (i.e. eliminates variance inflation).

And there are many other transformations that you can read about.

Interaction effects

This chapter has a good overview of interactions.

start with domain knowledge
use visualizations
3-way interactions exist, but are rare and sometimes hard to explain

The ‘caret’ package

Philosophy

Types of resampling

V-fold Cross-Validation
- If you have enough data, this is the #1 option
- No overalap (no replacement)
- each fold is unique & has its own test/train
Monte Carlo Cross-Validation
- 2nd option
- fuzzy boundaries meaning there is often duplicated data (“with replacement”)
- each subsample has a test/train
The Bootstrap
- 3rd option
- But this option is faster and can be used on smaller datasets more effectively
- chance that you duplicate data (fuzzy boundaries)
- it is tested on itself

Typical setup

library(caret)
wino <- wine %>%
  mutate(fr=(country=="France")) %>% # Mutates to a T/F for both fr & cab
  mutate(cab=str_detect(variety,"Cabernet")) %>% 
  mutate(lprice=log(price)) %>% 
  drop_na(fr, cab) %>% # Caret Library is extremely particular about NAs
  select(lprice, points, fr, cab)

# Seperate into Train & Test
wine_index <- createDataPartition(wino$lprice, p = 0.8, list = FALSE) # partitioned on basis of log price
wino_tr <- wino[ wine_index, ] # Training set 80%
wino_te <- wino[-wine_index, ] # Testin set 20%

control <- trainControl(method="repeatedcv", number=5, repeats=3) # cv = cross validation
# v-fold cross validation
## I want 5 folds
## & I want to repeat the process: 3 times
## Meaning we are essentially running the model 15 times
### Then validating it against a sub sample of the training data each of those 15 times as well

m1 <- train(lprice ~ ., # Calling train function: on this data, "i am going to give it the method - LM"
                data = wino_tr, 
                method = "lm", # could be GLM or decision tree, etc. (read below first)
                trControl = control)
# Running a linear regression with sub sampling



# This means for the training data, it can be trained seperate from a model chosen.. we can do above or...
## m1 <- train(lprice ~ ., 
  #              data = wino_tr, 
   #             method = "glm", 
    #            trControl = control)
## m1 <- train(lprice ~ ., 
  #              data = wino_tr, 
   #             method = "decision tree", # Etc. 
    #            trControl = control)


# Can do exact same everything (same data, same subsampling method, same framework), but change the type of model being used in the end

Follow this link for the full documentation on caret.

Train vs. test

# Results

m1

## Linear Regression 
## 
## 71603 samples
##     3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 57282, 57281, 57284, 57282, 57283, 57283, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE     
##   0.5106586  0.3966542  0.402223
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

## No pre processing
## Different sample sizes around 57k
## RMSE = .51

Train vs. test

wine_pred <- predict(m1, wino_te) # Using model created and on the data we want to run it on

postResample(pred=wine_pred,
             obs = wino_te$lprice)

##      RMSE  Rsquared       MAE 
## 0.5109516 0.3913061 0.4034114

Parameter selection

Engineer 9 features

wino <- wine %>% 
  mutate(country=fct_lump(country,4)) %>% # top 4 countries based on frequency
  mutate(variety=fct_lump(variety,4)) %>% # top 4 varieties based on frequency
  mutate(lprice=log(price)) %>%
  select(lprice, points, country, variety) %>% 
  drop_na(.)

# Creates DF with variables country and variety, but only shows the top 4 in frequency, all others are named "other"
head(wino)

## # A tibble: 6 x 4
##   lprice points country variety   
##    <dbl>  <dbl> <fct>   <fct>     
## 1   2.71     87 Other   Other     
## 2   2.64     87 US      Other     
## 3   2.56     87 US      Other     
## 4   4.17     87 US      Pinot Noir
## 5   2.71     87 Spain   Other     
## 6   2.77     87 Italy   Other

# Additional data cleaning
wino <- dummy_cols(wino, remove_selected_columns = T) %>% 
  select(-country_Other, -variety_Other) %>% # gets rid of "other" dummy variable
  rename_all(funs(tolower(.))) %>% # turning everything lowercase 
  rename_all(funs(str_replace_all(., "-", "_"))) %>% # replacing all dashes with underscores
  rename_all(funs(str_replace_all(., " ", "_")))

head(wino) %>% 
  select(1:7)

## # A tibble: 6 x 7
##   lprice points country_france country_italy country_spain country_us
##    <dbl>  <dbl>          <int>         <int>         <int>      <int>
## 1   2.71     87              0             0             0          0
## 2   2.64     87              0             0             0          1
## 3   2.56     87              0             0             0          1
## 4   4.17     87              0             0             0          1
## 5   2.71     87              0             0             1          0
## 6   2.77     87              0             1             0          0
## # … with 1 more variable: variety_cabernet_sauvignon <int>

Basic Model

wine_index <- createDataPartition(wino$lprice, p = 0.8, list = FALSE)
wino_tr <- wino[ wine_index, ]
wino_te <- wino[-wine_index, ]

m2 <- train(lprice ~ .,
                data = wino_tr, 
                method = "lm",
                trControl = control)

Results (train)

m2

## Linear Regression 
## 
## 71603 samples
##     9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 57284, 57282, 57282, 57282, 57282, 57281, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.4887842  0.4464716  0.3797216
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

# RMSE = .488
# Rsq = .447
# MAE = .379

Results (test)

wine_pred <- predict(m2, wino_te)

postResample(pred=wine_pred, 
             obs = wino_te$lprice)

##      RMSE  Rsquared       MAE 
## 0.4888447 0.4457318 0.3809720

Variable Importance (depends on model used)

# estimate variable importance
importance <- varImp(m2, scale=TRUE)

# plot importance
plot(importance)

Recursive feature elimination

Using recursive feature elimination in caret

x <- select(wino_tr,-lprice)
y <- wino_tr$lprice

control <- rfeControl(functions=rfFuncs, method="cv", number=2)

# run the RFE algorithm
results <- rfe(x, y, sizes=c(1:9), rfeControl=control)

# summarize the results
print(results)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (2 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE    RMSESD RsquaredSD     MAESD Selected
##          1 0.5053   0.4083 0.3986 0.0033685  0.0046801 0.0031303         
##          2 0.5269   0.3952 0.4170 0.0057977  0.0061587 0.0045627         
##          3 0.5364   0.3991 0.4217 0.0024455  0.0032882 0.0013382         
##          4 0.5371   0.4219 0.4198 0.0021017  0.0032352 0.0009839         
##          5 0.5477   0.4261 0.4295 0.0034949  0.0027620 0.0033462         
##          6 0.4887   0.4726 0.3806 0.0016965  0.0039018 0.0011442         
##          7 0.4929   0.4704 0.3836 0.0005725  0.0008272 0.0008715         
##          8 0.4938   0.4788 0.3848 0.0004989  0.0024974 0.0003169         
##          9 0.4722   0.4939 0.3666 0.0024039  0.0018672 0.0017644        *
## 
## The top 5 variables (out of 9):
##    points, country_italy, variety_pinot_noir, country_us, variety_cabernet_sauvignon

# list the chosen features
predictors(results)

## [1] "points"                     "country_italy"             
## [3] "variety_pinot_noir"         "country_us"                
## [5] "variety_cabernet_sauvignon" "variety_red_blend"         
## [7] "variety_chardonnay"         "country_france"            
## [9] "country_spain"

# plot the results
plot(results, type=c("g", "o"))

Vocabulary

Key Terms

Feature Engineering
Categorical Feature
Continuous Feature
Dummy
Interaction
Caret
Model
Resampling
Training Data vs. Test Data
Variable Importance

Wk 2 - Feature Engineering & Variable Selection

Matt

1/28/2021

Dummy Variable Exercise

Categorical vs. Continuous Variables

Categorical Example 1

Categorical Example 2

Encoding categorical features: few dummies

Encoding categorical features: many dummies

Other types of engineered categorical features…

What about numerical features?

Take the natural log

Engineering numeric features: Standardizing

Interaction effects

The ‘caret’ package

Philosophy

Types of resampling

Typical setup

Train vs. test

Train vs. test

Parameter selection

Engineer 9 features

Basic Model

Results (train)

Results (test)

Variable Importance (depends on model used)

Recursive feature elimination

Using recursive feature elimination in caret

Vocabulary

Key Terms