Introduction

The model below is built from a data set of 8,380 bottles of Pinot Noir wine. The data set includes the following columns:

- id (int): numerical id of the row
- province (char): where the wine is made (Burgundy, California, Casablanca Valley, Marlborough, New York, or Oregon)
- price (dbl): cost of the bottle
- points (dbl): the wine's rating, out of a maximum of 100 points
- year (dbl): when the wine was produced
- description (char): the reviewer's tasting notes

The goal is to produce a model that predicts a wine's province from features generated from price, points, year, and description. We focus on mining the description column for key words that distinguish wines from each province.
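Before engineering features, it is worth checking the class balance, since some provinces are far rarer than others. A minimal sketch (assuming the tidyverse is loaded and pinot.rds sits in the working directory):

library(tidyverse)
wine <- read_rds("pinot.rds")
glimpse(wine)                          # column names, types, and sample values
wine %>% count(province, sort = TRUE)  # class balance across the six provinces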

Setup

Feature Engineering

library(tidyverse)  # readr, dplyr, tidyr, stringr
library(caret)      # createDataPartition, preProcess, train

wine = read_rds("pinot.rds")

# function that builds a document-term matrix of commonly used words in the wine descriptions
wine_words <- function(df, j = 1000, stem = FALSE){ 
  library(tidytext)
  library(SnowballC)
  data(stop_words)
  words <- df %>%
    unnest_tokens(word, description) %>%
    anti_join(stop_words) %>% # get rid of stop words
    filter(!(word %in% c("wine","pinot","vineyard", "price", "points")))
  
  if(stem){
    words <- words %>% 
      mutate(word = wordStem(word))
  }
  
  words <- words %>% 
    count(id, word) %>% 
    group_by(id) %>% 
    mutate(exists = (n>0)) %>% 
    ungroup %>% 
    group_by(word) %>% 
    mutate(total = sum(n)) %>% 
    filter(total > j) %>% 
    pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = list(exists=0)) %>% 
    right_join(select(df,id,province)) %>% 
    mutate(across(-province, ~replace_na(.x, F)))
}
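# The stem argument is left off throughout; when TRUE it collapses inflected
# forms (e.g. "cherries", "cherry") to a common stem via SnowballC's wordStem().
# A usage sketch, not run here:
# wino_stemmed <- wine_words(wine, j = 200, stem = TRUE)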

wino <- wine_words(wine, j = 200, stem = FALSE)
## Joining, by = "word"
## Joining, by = "id"
# bring the numerical features from the original data set back into wino:
wino = wino %>% left_join(select(wine, id, price, points, year), by = "id") 

# center and scale the points column
wino = wino %>% select(points) %>% preProcess(method = c("center", "scale")) %>% predict(wino)
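# sanity check that points is now standardized (a sketch; output not shown):
c(mean = mean(wino$points), sd = sd(wino$points))  # expect mean ~ 0, sd ~ 1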
  

wino = wino %>% 
  mutate(price_f = case_when(
    price < 16 ~ "low",
    price >= 16 & price < 41 ~ "med",
    price >= 41 ~ "high"
  ), 
   year_f = case_when(
    year < 2005 ~ "old",
    year >= 2005 & year < 2011 ~ "recent",
    year >= 2011 ~ "current"
  ))
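# quick sanity check on the engineered bins (a sketch; output not shown):
table(wino$price_f, useNA = "ifany")
table(wino$year_f, useNA = "ifany")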

wino = wino %>% select(-price,-year)

# An elastic net model on the wino dataset generated by wine_words() with j = 200
# reached a kappa in the .80s, but New York was predicted poorly. So we fit a
# separate logistic (glm) model with province == "New_York" as the response to
# gather more words that are influential for predicting New York:

# recode province as a binary factor: 1 = New_York, 0 = everything else
wine2 <- read_rds("pinot.rds") %>% 
  mutate(province = as.factor(as.numeric(province == "New_York")))

# new document-term dataset with a lower count threshold, because New York is a rarer province in the original pinot dataset
wino2 <- wine_words(wine2, j = 50)
## Joining, by = "word"
## Joining, by = "id"
wine_index <- createDataPartition(wino2$province, p = 0.80, list = FALSE)
train <- wino2[ wine_index, ]
test <- wino2[-wine_index, ]

control <- trainControl(method = "cv", number = 5)

fit.ny <- train(province ~ .,
             data = train, 
             trControl = control,
             method = "glm",
             family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (this pair of warnings repeats for each of the 5 CV folds and the final fit)
# The warnings indicate (quasi-)complete separation: with hundreds of binary word
# features, some words perfectly identify New York reviews. That is acceptable here,
# since this model is only used to rank words by coefficient, not to predict.
# show the odds ratios for the top coefficients
odds_ratio <- exp(coef(fit.ny$finalModel))

ny_words = data.frame(name = names(odds_ratio), odds_ratio = round(odds_ratio,2)) %>%  
  arrange(desc(odds_ratio))

# clean up the term names: drop the TRUE suffix and the backticks that train() adds
ny_words = ny_words %>% 
  mutate(name = gsub("TRUE$", "", name)) %>% 
  mutate(name = gsub("`", "", name))
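# a quick peek at the strongest New York indicators (a sketch; output omitted):
head(ny_words, 10)   # terms with the largest odds ratios for New_York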

features = colnames(wino)

# take the 185 terms with the largest odds ratios for New York
nyc = ny_words$name[1:185]

# keep only the words that do not already appear as features in wino
nyc = setdiff(nyc, features)

# grab the top 40 of these remaining New York words:
nyc = nyc[1:40]

#transforming these 40 words into features 
nyc_words <- wine %>%
    unnest_tokens(word, description) %>%
    anti_join(stop_words) %>% 
    count(id, word) %>% 
    group_by(id) %>% 
    mutate(exists = (n>0)) %>% 
    ungroup %>% 
    group_by(word) %>% 
    mutate(total = sum(n)) %>% 
    filter(word %in% nyc) %>% 
    pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = list(exists=0)) %>% 
    right_join(select(wine,id,province)) %>% 
    mutate(across(-province, ~replace_na(.x, F)))
## Joining, by = "word"
## Joining, by = "id"
nyc_words = nyc_words %>% arrange(id) %>% select(-province)

#joining new features to wino, then removing id column: 
wino = wino %>% inner_join(nyc_words, by = 'id') %>% select(-id)

Specification

# Model: elastic net regression (glmnet)
# Tuning parameters: 
#   1. lambda - regularization penalty strength, tuned to minimize prediction error
#   2. alpha  - mixing parameter between 0 and 1, where alpha = 0 is ridge
#               regression and alpha = 1 is lasso regression
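# For reference, glmnet minimizes (a sketch of the objective):
#   -(1/n) * loglik(beta) + lambda * [ (1 - alpha)/2 * ||beta||_2^2 + alpha * ||beta||_1 ]
# so lambda sets the overall penalty strength and alpha blends the ridge (L2)
# and lasso (L1) terms.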

set.seed(600) 

ctrl <- trainControl(method = "cv", number = 5)

wine_index <- createDataPartition(wino$province, p = 0.80, list = FALSE)
train <- wino[ wine_index, ]
test <- wino[-wine_index, ]

#training model on a sequence of values of alpha and lambda to find optimal regularization
fit <- train(province ~ .,
             data = train, 
             method = "glmnet",
             tuneGrid = expand.grid(alpha=seq(0,1,length=10), 
                                    lambda = seq(0.0001,0.2,
                                    length=20)),
             trControl = ctrl,
             metric = "Kappa")

confusionMatrix(predict(fit, test),factor(test$province))
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Burgundy California Casablanca_Valley Marlborough New_York
##   Burgundy               221          4                 0           0        0
##   California               6        755                 3           4        9
##   Casablanca_Valley        0          0                18           0        0
##   Marlborough              0          0                 0          29        3
##   New_York                 0          0                 0           0       14
##   Oregon                  11         32                 5          12        0
##                    Reference
## Prediction          Oregon
##   Burgundy               5
##   California            28
##   Casablanca_Valley      0
##   Marlborough            1
##   New_York               0
##   Oregon               513
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9265          
##                  95% CI : (0.9129, 0.9385)
##     No Information Rate : 0.4728          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8851          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity                   0.9286            0.9545                  0.69231
## Specificity                   0.9937            0.9433                  1.00000
## Pos Pred Value                0.9609            0.9379                  1.00000
## Neg Pred Value                0.9882            0.9585                  0.99517
## Prevalence                    0.1423            0.4728                  0.01554
## Detection Rate                0.1321            0.4513                  0.01076
## Detection Prevalence          0.1375            0.4812                  0.01076
## Balanced Accuracy             0.9611            0.9489                  0.84615
##                      Class: Marlborough Class: New_York Class: Oregon
## Sensitivity                     0.64444        0.538462        0.9378
## Specificity                     0.99754        1.000000        0.9467
## Pos Pred Value                  0.87879        1.000000        0.8953
## Neg Pred Value                  0.99024        0.992767        0.9691
## Prevalence                      0.02690        0.015541        0.3270
## Detection Rate                  0.01733        0.008368        0.3066
## Detection Prevalence            0.01973        0.008368        0.3425
## Balanced Accuracy               0.82099        0.769231        0.9423

Best model

# Inspect the best model and its optimal hyperparameters.
print(fit)
## glmnet 
## 
## 6707 samples
##  222 predictor
##    6 classes: 'Burgundy', 'California', 'Casablanca_Valley', 'Marlborough', 'New_York', 'Oregon' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5365, 5366, 5366, 5366, 5365 
## Resampling results across tuning parameters:
## 
##   alpha      lambda      Accuracy   Kappa      
##   0.0000000  0.00010000  0.9281337  0.887889773
##   0.0000000  0.01062105  0.9281337  0.887889773
##   0.0000000  0.02114211  0.9281337  0.887821577
##   0.0000000  0.03166316  0.9248532  0.882461152
##   0.0000000  0.04218421  0.9211252  0.876341539
##   0.0000000  0.05270526  0.9187400  0.872475384
##   0.0000000  0.06322632  0.9162054  0.868226024
##   0.0000000  0.07374737  0.9142671  0.865021193
##   0.0000000  0.08426842  0.9120310  0.861359470
##   0.0000000  0.09478947  0.9087509  0.855894522
##   0.0000000  0.10531053  0.9066637  0.852419290
##   0.0000000  0.11583158  0.9050233  0.849657211
##   0.0000000  0.12635263  0.9027868  0.845856823
##   0.0000000  0.13687368  0.8993578  0.840136860
##   0.0000000  0.14739474  0.8978669  0.837644345
##   0.0000000  0.15791579  0.8963759  0.835107618
##   0.0000000  0.16843684  0.8951832  0.833127895
##   0.0000000  0.17895789  0.8935433  0.830323207
##   0.0000000  0.18947895  0.8926488  0.828746941
##   0.0000000  0.20000000  0.8914561  0.826712724
##   0.1111111  0.00010000  0.9263440  0.886370829
##   0.1111111  0.01062105  0.9269410  0.886097802
##   0.1111111  0.02114211  0.9179950  0.871477819
##   0.1111111  0.03166316  0.9117322  0.861144211
##   0.1111111  0.04218421  0.9051721  0.850234034
##   0.1111111  0.05270526  0.9002514  0.842056362
##   0.1111111  0.06322632  0.8926478  0.829296565
##   0.1111111  0.07374737  0.8853424  0.817107141
##   0.1111111  0.08426842  0.8825094  0.812140970
##   0.1111111  0.09478947  0.8796763  0.807344155
##   0.1111111  0.10531053  0.8752032  0.799832507
##   0.1111111  0.11583158  0.8723702  0.794988740
##   0.1111111  0.12635263  0.8704317  0.791691692
##   0.1111111  0.13687368  0.8680457  0.787545119
##   0.1111111  0.14739474  0.8643183  0.781225438
##   0.1111111  0.15791579  0.8625291  0.778157351
##   0.1111111  0.16843684  0.8601440  0.774009575
##   0.1111111  0.17895789  0.8567145  0.768108070
##   0.1111111  0.18947895  0.8544779  0.764259755
##   0.1111111  0.20000000  0.8516453  0.759427540
##   0.2222222  0.00010000  0.9248531  0.884129199
##   0.2222222  0.01062105  0.9209770  0.876487981
##   0.2222222  0.02114211  0.9103910  0.859043212
##   0.2222222  0.03166316  0.9009975  0.843532497
##   0.2222222  0.04218421  0.8869829  0.820240750
##   0.2222222  0.05270526  0.8783345  0.805701127
##   0.2222222  0.06322632  0.8729668  0.796566208
##   0.2222222  0.07374737  0.8677481  0.787577876
##   0.2222222  0.08426842  0.8629768  0.779358093
##   0.2222222  0.09478947  0.8571621  0.769418759
##   0.2222222  0.10531053  0.8513470  0.759462992
##   0.2222222  0.11583158  0.8434453  0.746102648
##   0.2222222  0.12635263  0.8347981  0.731510813
##   0.2222222  0.13687368  0.8295797  0.722327406
##   0.2222222  0.14739474  0.8237649  0.712261509
##   0.2222222  0.15791579  0.8178007  0.701870390
##   0.2222222  0.16843684  0.8122839  0.692235567
##   0.2222222  0.17895789  0.8042328  0.678078877
##   0.2222222  0.18947895  0.7972254  0.665902628
##   0.2222222  0.20000000  0.7902172  0.653513488
##   0.3333333  0.00010000  0.9250023  0.884373095
##   0.3333333  0.01062105  0.9157586  0.868069533
##   0.3333333  0.02114211  0.9026382  0.846490460
##   0.3333333  0.03166316  0.8857899  0.818562555
##   0.3333333  0.04218421  0.8738614  0.798514703
##   0.3333333  0.05270526  0.8659593  0.784952875
##   0.3333333  0.06322632  0.8561188  0.768230244
##   0.3333333  0.07374737  0.8473219  0.753236695
##   0.3333333  0.08426842  0.8343503  0.731167995
##   0.3333333  0.09478947  0.8262993  0.717229001
##   0.3333333  0.10531053  0.8170552  0.701298637
##   0.3333333  0.11583158  0.8100474  0.689227366
##   0.3333333  0.12635263  0.7973742  0.667169756
##   0.3333333  0.13687368  0.7893231  0.652892845
##   0.3333333  0.14739474  0.7797809  0.635799043
##   0.3333333  0.15791579  0.7672573  0.613515279
##   0.3333333  0.16843684  0.7532443  0.588137336
##   0.3333333  0.17895789  0.7386334  0.561673098
##   0.3333333  0.18947895  0.7216353  0.530321457
##   0.3333333  0.20000000  0.7038918  0.496917702
##   0.4444444  0.00010000  0.9255988  0.885275405
##   0.4444444  0.01062105  0.9123291  0.862467862
##   0.4444444  0.02114211  0.8930957  0.830906210
##   0.4444444  0.03166316  0.8749051  0.800428895
##   0.4444444  0.04218421  0.8628280  0.779857692
##   0.4444444  0.05270526  0.8485146  0.755672186
##   0.4444444  0.06322632  0.8339035  0.730862249
##   0.4444444  0.07374737  0.8206339  0.708047810
##   0.4444444  0.08426842  0.8116877  0.692551073
##   0.4444444  0.09478947  0.7982686  0.669364371
##   0.4444444  0.10531053  0.7867877  0.649028837
##   0.4444444  0.11583158  0.7726241  0.624055225
##   0.4444444  0.12635263  0.7553300  0.593223888
##   0.4444444  0.13687368  0.7380361  0.561740445
##   0.4444444  0.14739474  0.7179071  0.524682787
##   0.4444444  0.15791579  0.6968838  0.485160850
##   0.4444444  0.16843684  0.6761585  0.446156482
##   0.4444444  0.17895789  0.6608022  0.416237275
##   0.4444444  0.18947895  0.6435071  0.381706331
##   0.4444444  0.20000000  0.6265115  0.347897513
##   0.5555556  0.00010000  0.9251515  0.884635154
##   0.5555556  0.01062105  0.9083034  0.856009947
##   0.5555556  0.02114211  0.8837024  0.815439020
##   0.5555556  0.03166316  0.8668535  0.786826204
##   0.5555556  0.04218421  0.8474712  0.754227973
##   0.5555556  0.05270526  0.8304742  0.725324512
##   0.5555556  0.06322632  0.8166085  0.701265018
##   0.5555556  0.07374737  0.8009524  0.674392699
##   0.5555556  0.08426842  0.7873847  0.650649662
##   0.5555556  0.09478947  0.7678534  0.616355631
##   0.5555556  0.10531053  0.7483234  0.581269680
##   0.5555556  0.11583158  0.7244681  0.538050491
##   0.5555556  0.12635263  0.6995676  0.491352664
##   0.5555556  0.13687368  0.6751134  0.445319626
##   0.5555556  0.14739474  0.6578206  0.410743697
##   0.5555556  0.15791579  0.6367992  0.369273460
##   0.5555556  0.16843684  0.6209932  0.338629788
##   0.5555556  0.17895789  0.5931122  0.281137405
##   0.5555556  0.18947895  0.5762639  0.245074491
##   0.5555556  0.20000000  0.5446540  0.173514167
##   0.6666667  0.00010000  0.9250024  0.884395692
##   0.6666667  0.01062105  0.9045764  0.849870135
##   0.6666667  0.02114211  0.8757994  0.802179437
##   0.6666667  0.03166316  0.8543293  0.766001372
##   0.6666667  0.04218421  0.8315180  0.727392571
##   0.6666667  0.05270526  0.8142228  0.697747408
##   0.6666667  0.06322632  0.7978213  0.669294531
##   0.6666667  0.07374737  0.7806754  0.639364967
##   0.6666667  0.08426842  0.7593562  0.601398626
##   0.6666667  0.09478947  0.7338620  0.555307480
##   0.6666667  0.10531053  0.7044876  0.501253287
##   0.6666667  0.11583158  0.6748156  0.445669471
##   0.6666667  0.12635263  0.6545398  0.405202826
##   0.6666667  0.13687368  0.6309841  0.358610911
##   0.6666667  0.14739474  0.6099593  0.317257564
##   0.6666667  0.15791579  0.5831232  0.260095052
##   0.6666667  0.16843684  0.5625472  0.214485657
##   0.6666667  0.17895789  0.5445051  0.173202225
##   0.6666667  0.18947895  0.5424174  0.168463487
##   0.6666667  0.20000000  0.4748767  0.009373736
##   0.7777778  0.00010000  0.9251516  0.884649362
##   0.7777778  0.01062105  0.8995076  0.841744295
##   0.7777778  0.02114211  0.8702828  0.792848016
##   0.7777778  0.03166316  0.8403144  0.742565709
##   0.7777778  0.04218421  0.8185465  0.705475651
##   0.7777778  0.05270526  0.8000578  0.673427996
##   0.7777778  0.06322632  0.7791844  0.637160317
##   0.7777778  0.07374737  0.7541384  0.592666218
##   0.7777778  0.08426842  0.7225302  0.535172888
##   0.7777778  0.09478947  0.6925599  0.479407805
##   0.7777778  0.10531053  0.6654243  0.427487099
##   0.7777778  0.11583158  0.6348607  0.367014354
##   0.7777778  0.12635263  0.6114501  0.320802071
##   0.7777778  0.13687368  0.5910243  0.278354415
##   0.7777778  0.14739474  0.5623981  0.214256892
##   0.7777778  0.15791579  0.5445051  0.173286578
##   0.7777778  0.16843684  0.5424174  0.168463487
##   0.7777778  0.17895789  0.4747276  0.008396347
##   0.7777778  0.18947895  0.4723423  0.000000000
##   0.7777778  0.20000000  0.4723423  0.000000000
##   0.8888889  0.00010000  0.9254495  0.885082798
##   0.8888889  0.01062105  0.8951835  0.834679057
##   0.8888889  0.02114211  0.8617834  0.778752739
##   0.8888889  0.03166316  0.8309208  0.726791525
##   0.8888889  0.04218421  0.8079603  0.687356478
##   0.8888889  0.05270526  0.7848496  0.647438239
##   0.8888889  0.06322632  0.7589085  0.601470788
##   0.8888889  0.07374737  0.7238719  0.538030267
##   0.8888889  0.08426842  0.6843592  0.464918763
##   0.8888889  0.09478947  0.6576708  0.412196911
##   0.8888889  0.10531053  0.6317292  0.361248044
##   0.8888889  0.11583158  0.5966897  0.290166037
##   0.8888889  0.12635263  0.5881910  0.272940861
##   0.8888889  0.13687368  0.5446540  0.173679630
##   0.8888889  0.14739474  0.5424174  0.168463487
##   0.8888889  0.15791579  0.4747276  0.008396347
##   0.8888889  0.16843684  0.4723423  0.000000000
##   0.8888889  0.17895789  0.4723423  0.000000000
##   0.8888889  0.18947895  0.4723423  0.000000000
##   0.8888889  0.20000000  0.4723423  0.000000000
##   1.0000000  0.00010000  0.9253004  0.884847590
##   1.0000000  0.01062105  0.8910086  0.827841415
##   1.0000000  0.02114211  0.8522421  0.762756297
##   1.0000000  0.03166316  0.8203357  0.708885262
##   1.0000000  0.04218421  0.7937953  0.663275461
##   1.0000000  0.05270526  0.7672576  0.616733754
##   1.0000000  0.06322632  0.7353535  0.559025857
##   1.0000000  0.07374737  0.6939007  0.482905804
##   1.0000000  0.08426842  0.6590123  0.415237781
##   1.0000000  0.09478947  0.6309837  0.360081837
##   1.0000000  0.10531053  0.5934096  0.284300537
##   1.0000000  0.11583158  0.5875946  0.272165240
##   1.0000000  0.12635263  0.5446540  0.173679630
##   1.0000000  0.13687368  0.5424174  0.168463487
##   1.0000000  0.14739474  0.4723423  0.000000000
##   1.0000000  0.15791579  0.4723423  0.000000000
##   1.0000000  0.16843684  0.4723423  0.000000000
##   1.0000000  0.17895789  0.4723423  0.000000000
##   1.0000000  0.18947895  0.4723423  0.000000000
##   1.0000000  0.20000000  0.4723423  0.000000000
## 
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.01062105.
print(fit$bestTune)
##   alpha     lambda
## 2     0 0.01062105
# Best values: alpha = 0, lambda = 0.0106.
# alpha = 0 means the optimal model is pure ridge regression: many word features
# contribute small, similar-sized coefficients, so shrinking all of them (rather
# than zeroing some out, as lasso would) retains the most predictive signal.
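# With ridge, every feature keeps a (shrunken) coefficient. A sketch for peeking
# at the largest-magnitude word coefficients for one class (object names as above;
# glmnet's multinomial fit returns one coefficient vector per class):
coefs <- coef(fit$finalModel, s = fit$bestTune$lambda)
head(sort(abs(coefs$New_York[, 1]), decreasing = TRUE), 10)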

Re-fit and evaluation

set.seed(504)

wine_index <- createDataPartition(wino$province, p = 0.80, list = FALSE)
train <- wino[ wine_index, ]
test <- wino[-wine_index, ]

# re-fit glmnet on a fresh 80/20 split using the best alpha/lambda found above
fit_final <- train(province ~ .,
             data = train, 
             method = "glmnet",
             tuneGrid=fit$bestTune) 


confusionMatrix(predict(fit_final, test),factor(test$province))
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Burgundy California Casablanca_Valley Marlborough New_York
##   Burgundy               222          2                 1           0        0
##   California               2        756                 7           6        5
##   Casablanca_Valley        0          0                14           0        0
##   Marlborough              0          1                 0          32        4
##   New_York                 0          0                 0           0       14
##   Oregon                  14         32                 4           7        3
##                    Reference
## Prediction          Oregon
##   Burgundy               2
##   California            32
##   Casablanca_Valley      1
##   Marlborough            1
##   New_York               0
##   Oregon               511
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9259         
##                  95% CI : (0.9123, 0.938)
##     No Information Rate : 0.4728         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8842         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity                   0.9328            0.9558                 0.538462
## Specificity                   0.9965            0.9410                 0.999393
## Pos Pred Value                0.9780            0.9356                 0.933333
## Neg Pred Value                0.9889            0.9595                 0.992762
## Prevalence                    0.1423            0.4728                 0.015541
## Detection Rate                0.1327            0.4519                 0.008368
## Detection Prevalence          0.1357            0.4830                 0.008966
## Balanced Accuracy             0.9646            0.9484                 0.768927
##                      Class: Marlborough Class: New_York Class: Oregon
## Sensitivity                     0.71111        0.538462        0.9342
## Specificity                     0.99631        1.000000        0.9467
## Pos Pred Value                  0.84211        1.000000        0.8949
## Neg Pred Value                  0.99205        0.992767        0.9673
## Prevalence                      0.02690        0.015541        0.3270
## Detection Rate                  0.01913        0.008368        0.3054
## Detection Prevalence            0.02271        0.008368        0.3413
## Balanced Accuracy               0.85371        0.769231        0.9405
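
As a compact check on the re-fit, caret's postResample() computes hold-out Accuracy and Kappa directly, which makes comparing models easy. A sketch using the objects above:

postResample(predict(fit_final, test), factor(test$province))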