The data used to produce this model is a data set of 8,380 bottles of Pinot Noir wine. The data set includes the following columns:
- id (int): numerical id of the row
- province (char): where the wine is made - Burgundy, California, Oregon, Casablanca Valley, Marlborough, or New York
- price (dbl): cost of the bottle
- points (dbl): the wine's rating, out of a maximum of 100 points
- year (dbl): the year the wine was produced
- description (char): the reviewer's tasting notes
The goal is to produce a model that predicts a wine's province from features generated from price, points, year, and description. We focus on the description column, detecting key words that reviewers use when describing wines from each province.
wine = read_rds("pinot.rds")
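# Quick structural check (an optional sketch, assuming the tidyverse is already loaded,
# as the pipes below require): confirm the columns and the class balance across provinces.
glimpse(wine)
wine %>% count(province, sort = TRUE)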
# function that builds a document-term matrix of commonly used words in the wine descriptions
wine_words <- function(df, j = 1000, stem = FALSE){
  library(tidytext)
  library(SnowballC)
  data(stop_words)

  words <- df %>%
    unnest_tokens(word, description) %>%
    anti_join(stop_words) %>% # get rid of stop words
    filter(!(word %in% c("wine", "pinot", "vineyard", "price", "points")))

  if(stem){
    words <- words %>%
      mutate(word = wordStem(word))
  }

  words %>%
    count(id, word) %>%
    group_by(id) %>%
    mutate(exists = (n > 0)) %>%
    ungroup() %>%
    group_by(word) %>%
    mutate(total = sum(n)) %>%
    filter(total > j) %>% # keep only words used more than j times across all reviews
    pivot_wider(id_cols = id, names_from = word, values_from = exists,
                values_fill = list(exists = 0)) %>%
    right_join(select(df, id, province)) %>%
    mutate(across(-province, ~replace_na(.x, FALSE)))
}
wino <- wine_words(wine, j=200, stem=F)
## Joining, by = "word"
## Joining, by = "id"
# bring the numerical features from the original data set back into wino:
wino = wino %>% left_join(select(wine, id, price, points, year), by = "id")
# center and scale points
wino = wino %>% select(points) %>% preProcess(method = c("center", "scale")) %>% predict(wino)
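# Sanity check (an optional sketch, not part of the original pipeline): after centering
# and scaling, points should have mean ~0 and standard deviation ~1.
c(mean(wino$points), sd(wino$points))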
wino = wino %>%
mutate(price_f = case_when(
price < 16 ~ "low",
price >= 16 & price < 41 ~ "med",
price >= 41 ~ "high"
),
year_f = case_when(
year < 2005 ~ "old",
year >= 2005 & year < 2011 ~ "recent",
year >= 2011 ~ "current"
))
wino = wino %>% select(-price,-year)
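# Optional check (a sketch): see how the bottles fall into the new price and year bins.
wino %>% count(price_f)
wino %>% count(year_f)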
# An elastic net fit on the wino data set above (j = 200) reached a kappa in the .80s,
# but New York was predicted poorly. To improve that class, we fit a separate binomial
# glm with province == "New_York" as the response, in order to harvest additional words
# that are influential for predicting New York:
wine2 <- read_rds("pinot.rds") %>%
  mutate(province = as.factor(as.numeric(province == "New_York")))
# new data set with a lower word-count threshold (New York is a rarer province in the
# original pinot data, so its characteristic words appear less often overall)
wino2 <- wine_words(wine2, j = 50)
## Joining, by = "word"
## Joining, by = "id"
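# Optional check (a sketch, not in the original code): confirm how imbalanced the
# New York indicator is before fitting the word-ranking glm (1 = New York).
wino2 %>% count(province)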
wine_index <- createDataPartition(wino2$province, p = 0.80, list = FALSE)
train <- wino2[ wine_index, ]
test <- wino2[-wine_index, ]
control <- trainControl(method = "cv", number = 5)
fit.ny <- train(province ~ .,
                data = train,
                trControl = control,
                method = "glm",
                family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
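# Note: these glm.fit warnings most likely reflect (near-)perfect separation -- some words
# appear almost exclusively in New York reviews, so their fitted probabilities hit 0 or 1.
# That is acceptable here because this model is only used to rank candidate words, not to predict.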
#show the odds ratios for top coefficients
odds_ratio <- exp(coef(fit.ny$finalModel))
ny_words = data.frame(name = names(odds_ratio), odds_ratio = round(odds_ratio,2)) %>%
arrange(desc(odds_ratio))
# clean up the word names (drop the TRUE suffix and the backticks/backslashes caret adds to term names)
ny_words = ny_words %>%
  mutate(name = gsub("TRUE", "", name),
         name = gsub("[`\\\\]", "", name))
features = colnames(wino)
#take the words with the highest odds ratios for New York
nyc = ny_words$name[1:185]
#selecting words that do not show up in features from wino
nyc = setdiff(nyc, features)
#grabbing the top 40 of these New York words:
nyc = nyc[1:40]
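# Optional sketch: inspect the candidate New York words before turning them into features.
head(nyc, 10)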
#transforming these 40 words into features
nyc_words <- wine %>%
unnest_tokens(word, description) %>%
anti_join(stop_words) %>%
count(id, word) %>%
group_by(id) %>%
mutate(exists = (n>0)) %>%
ungroup %>%
group_by(word) %>%
mutate(total = sum(n)) %>%
filter(word %in% nyc) %>%
pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = list(exists=0)) %>%
right_join(select(wine,id,province)) %>%
mutate(across(-province, ~replace_na(.x, F)))
## Joining, by = "word"
## Joining, by = "id"
nyc_words = nyc_words %>% arrange(id) %>% select(-province)
#joining new features to wino, then removing id column:
wino = wino %>% inner_join(nyc_words, by = 'id') %>% select(-id)
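# Optional check (a sketch): the join should keep all rows of wino and add the 40 new word columns.
dim(wino)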
# Model: ELASTIC NET REGRESSION
# Tuning parameters:
#  1. lambda - strength of the regularization penalty, which shrinks coefficients to reduce prediction error
#  2. alpha - mixing parameter between 0 and 1, where alpha = 0 is ridge regression and alpha = 1 is lasso regression
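# For reference, glmnet minimizes (roughly)
#   -logLik/n + lambda * ( (1 - alpha)/2 * sum(beta^2) + alpha * sum(|beta|) ),
# so lambda sets the overall penalty strength and alpha blends the ridge (L2) and lasso (L1) terms.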
set.seed(600)
ctrl <- trainControl(method = "cv", number = 5)
wine_index <- createDataPartition(wino$province, p = 0.80, list = FALSE)
train <- wino[ wine_index, ]
test <- wino[-wine_index, ]
#training model on a sequence of values of alpha and lambda to find optimal regularization
fit <- train(province ~ .,
             data = train,
             method = "glmnet",
             tuneGrid = expand.grid(alpha = seq(0, 1, length = 10),
                                    lambda = seq(0.0001, 0.2, length = 20)),
             trControl = ctrl,
             metric = "Kappa")
confusionMatrix(predict(fit, test),factor(test$province))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Burgundy California Casablanca_Valley Marlborough New_York
## Burgundy 221 4 0 0 0
## California 6 755 3 4 9
## Casablanca_Valley 0 0 18 0 0
## Marlborough 0 0 0 29 3
## New_York 0 0 0 0 14
## Oregon 11 32 5 12 0
## Reference
## Prediction Oregon
## Burgundy 5
## California 28
## Casablanca_Valley 0
## Marlborough 1
## New_York 0
## Oregon 513
##
## Overall Statistics
##
## Accuracy : 0.9265
## 95% CI : (0.9129, 0.9385)
## No Information Rate : 0.4728
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8851
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity 0.9286 0.9545 0.69231
## Specificity 0.9937 0.9433 1.00000
## Pos Pred Value 0.9609 0.9379 1.00000
## Neg Pred Value 0.9882 0.9585 0.99517
## Prevalence 0.1423 0.4728 0.01554
## Detection Rate 0.1321 0.4513 0.01076
## Detection Prevalence 0.1375 0.4812 0.01076
## Balanced Accuracy 0.9611 0.9489 0.84615
## Class: Marlborough Class: New_York Class: Oregon
## Sensitivity 0.64444 0.538462 0.9378
## Specificity 0.99754 1.000000 0.9467
## Pos Pred Value 0.87879 1.000000 0.8953
## Neg Pred Value 0.99024 0.992767 0.9691
## Prevalence 0.02690 0.015541 0.3270
## Detection Rate 0.01733 0.008368 0.3066
## Detection Prevalence 0.01973 0.008368 0.3425
## Balanced Accuracy 0.82099 0.769231 0.9423
# Inspect the best model and its optimal hyperparameters:
print(fit)
## glmnet
##
## 6707 samples
## 222 predictor
## 6 classes: 'Burgundy', 'California', 'Casablanca_Valley', 'Marlborough', 'New_York', 'Oregon'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 5365, 5366, 5366, 5366, 5365
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.0000000 0.00010000 0.9281337 0.887889773
## 0.0000000 0.01062105 0.9281337 0.887889773
## 0.0000000 0.02114211 0.9281337 0.887821577
## 0.0000000 0.03166316 0.9248532 0.882461152
## 0.0000000 0.04218421 0.9211252 0.876341539
## 0.0000000 0.05270526 0.9187400 0.872475384
## 0.0000000 0.06322632 0.9162054 0.868226024
## 0.0000000 0.07374737 0.9142671 0.865021193
## 0.0000000 0.08426842 0.9120310 0.861359470
## 0.0000000 0.09478947 0.9087509 0.855894522
## 0.0000000 0.10531053 0.9066637 0.852419290
## 0.0000000 0.11583158 0.9050233 0.849657211
## 0.0000000 0.12635263 0.9027868 0.845856823
## 0.0000000 0.13687368 0.8993578 0.840136860
## 0.0000000 0.14739474 0.8978669 0.837644345
## 0.0000000 0.15791579 0.8963759 0.835107618
## 0.0000000 0.16843684 0.8951832 0.833127895
## 0.0000000 0.17895789 0.8935433 0.830323207
## 0.0000000 0.18947895 0.8926488 0.828746941
## 0.0000000 0.20000000 0.8914561 0.826712724
## 0.1111111 0.00010000 0.9263440 0.886370829
## 0.1111111 0.01062105 0.9269410 0.886097802
## 0.1111111 0.02114211 0.9179950 0.871477819
## 0.1111111 0.03166316 0.9117322 0.861144211
## 0.1111111 0.04218421 0.9051721 0.850234034
## 0.1111111 0.05270526 0.9002514 0.842056362
## 0.1111111 0.06322632 0.8926478 0.829296565
## 0.1111111 0.07374737 0.8853424 0.817107141
## 0.1111111 0.08426842 0.8825094 0.812140970
## 0.1111111 0.09478947 0.8796763 0.807344155
## 0.1111111 0.10531053 0.8752032 0.799832507
## 0.1111111 0.11583158 0.8723702 0.794988740
## 0.1111111 0.12635263 0.8704317 0.791691692
## 0.1111111 0.13687368 0.8680457 0.787545119
## 0.1111111 0.14739474 0.8643183 0.781225438
## 0.1111111 0.15791579 0.8625291 0.778157351
## 0.1111111 0.16843684 0.8601440 0.774009575
## 0.1111111 0.17895789 0.8567145 0.768108070
## 0.1111111 0.18947895 0.8544779 0.764259755
## 0.1111111 0.20000000 0.8516453 0.759427540
## 0.2222222 0.00010000 0.9248531 0.884129199
## 0.2222222 0.01062105 0.9209770 0.876487981
## 0.2222222 0.02114211 0.9103910 0.859043212
## 0.2222222 0.03166316 0.9009975 0.843532497
## 0.2222222 0.04218421 0.8869829 0.820240750
## 0.2222222 0.05270526 0.8783345 0.805701127
## 0.2222222 0.06322632 0.8729668 0.796566208
## 0.2222222 0.07374737 0.8677481 0.787577876
## 0.2222222 0.08426842 0.8629768 0.779358093
## 0.2222222 0.09478947 0.8571621 0.769418759
## 0.2222222 0.10531053 0.8513470 0.759462992
## 0.2222222 0.11583158 0.8434453 0.746102648
## 0.2222222 0.12635263 0.8347981 0.731510813
## 0.2222222 0.13687368 0.8295797 0.722327406
## 0.2222222 0.14739474 0.8237649 0.712261509
## 0.2222222 0.15791579 0.8178007 0.701870390
## 0.2222222 0.16843684 0.8122839 0.692235567
## 0.2222222 0.17895789 0.8042328 0.678078877
## 0.2222222 0.18947895 0.7972254 0.665902628
## 0.2222222 0.20000000 0.7902172 0.653513488
## 0.3333333 0.00010000 0.9250023 0.884373095
## 0.3333333 0.01062105 0.9157586 0.868069533
## 0.3333333 0.02114211 0.9026382 0.846490460
## 0.3333333 0.03166316 0.8857899 0.818562555
## 0.3333333 0.04218421 0.8738614 0.798514703
## 0.3333333 0.05270526 0.8659593 0.784952875
## 0.3333333 0.06322632 0.8561188 0.768230244
## 0.3333333 0.07374737 0.8473219 0.753236695
## 0.3333333 0.08426842 0.8343503 0.731167995
## 0.3333333 0.09478947 0.8262993 0.717229001
## 0.3333333 0.10531053 0.8170552 0.701298637
## 0.3333333 0.11583158 0.8100474 0.689227366
## 0.3333333 0.12635263 0.7973742 0.667169756
## 0.3333333 0.13687368 0.7893231 0.652892845
## 0.3333333 0.14739474 0.7797809 0.635799043
## 0.3333333 0.15791579 0.7672573 0.613515279
## 0.3333333 0.16843684 0.7532443 0.588137336
## 0.3333333 0.17895789 0.7386334 0.561673098
## 0.3333333 0.18947895 0.7216353 0.530321457
## 0.3333333 0.20000000 0.7038918 0.496917702
## 0.4444444 0.00010000 0.9255988 0.885275405
## 0.4444444 0.01062105 0.9123291 0.862467862
## 0.4444444 0.02114211 0.8930957 0.830906210
## 0.4444444 0.03166316 0.8749051 0.800428895
## 0.4444444 0.04218421 0.8628280 0.779857692
## 0.4444444 0.05270526 0.8485146 0.755672186
## 0.4444444 0.06322632 0.8339035 0.730862249
## 0.4444444 0.07374737 0.8206339 0.708047810
## 0.4444444 0.08426842 0.8116877 0.692551073
## 0.4444444 0.09478947 0.7982686 0.669364371
## 0.4444444 0.10531053 0.7867877 0.649028837
## 0.4444444 0.11583158 0.7726241 0.624055225
## 0.4444444 0.12635263 0.7553300 0.593223888
## 0.4444444 0.13687368 0.7380361 0.561740445
## 0.4444444 0.14739474 0.7179071 0.524682787
## 0.4444444 0.15791579 0.6968838 0.485160850
## 0.4444444 0.16843684 0.6761585 0.446156482
## 0.4444444 0.17895789 0.6608022 0.416237275
## 0.4444444 0.18947895 0.6435071 0.381706331
## 0.4444444 0.20000000 0.6265115 0.347897513
## 0.5555556 0.00010000 0.9251515 0.884635154
## 0.5555556 0.01062105 0.9083034 0.856009947
## 0.5555556 0.02114211 0.8837024 0.815439020
## 0.5555556 0.03166316 0.8668535 0.786826204
## 0.5555556 0.04218421 0.8474712 0.754227973
## 0.5555556 0.05270526 0.8304742 0.725324512
## 0.5555556 0.06322632 0.8166085 0.701265018
## 0.5555556 0.07374737 0.8009524 0.674392699
## 0.5555556 0.08426842 0.7873847 0.650649662
## 0.5555556 0.09478947 0.7678534 0.616355631
## 0.5555556 0.10531053 0.7483234 0.581269680
## 0.5555556 0.11583158 0.7244681 0.538050491
## 0.5555556 0.12635263 0.6995676 0.491352664
## 0.5555556 0.13687368 0.6751134 0.445319626
## 0.5555556 0.14739474 0.6578206 0.410743697
## 0.5555556 0.15791579 0.6367992 0.369273460
## 0.5555556 0.16843684 0.6209932 0.338629788
## 0.5555556 0.17895789 0.5931122 0.281137405
## 0.5555556 0.18947895 0.5762639 0.245074491
## 0.5555556 0.20000000 0.5446540 0.173514167
## 0.6666667 0.00010000 0.9250024 0.884395692
## 0.6666667 0.01062105 0.9045764 0.849870135
## 0.6666667 0.02114211 0.8757994 0.802179437
## 0.6666667 0.03166316 0.8543293 0.766001372
## 0.6666667 0.04218421 0.8315180 0.727392571
## 0.6666667 0.05270526 0.8142228 0.697747408
## 0.6666667 0.06322632 0.7978213 0.669294531
## 0.6666667 0.07374737 0.7806754 0.639364967
## 0.6666667 0.08426842 0.7593562 0.601398626
## 0.6666667 0.09478947 0.7338620 0.555307480
## 0.6666667 0.10531053 0.7044876 0.501253287
## 0.6666667 0.11583158 0.6748156 0.445669471
## 0.6666667 0.12635263 0.6545398 0.405202826
## 0.6666667 0.13687368 0.6309841 0.358610911
## 0.6666667 0.14739474 0.6099593 0.317257564
## 0.6666667 0.15791579 0.5831232 0.260095052
## 0.6666667 0.16843684 0.5625472 0.214485657
## 0.6666667 0.17895789 0.5445051 0.173202225
## 0.6666667 0.18947895 0.5424174 0.168463487
## 0.6666667 0.20000000 0.4748767 0.009373736
## 0.7777778 0.00010000 0.9251516 0.884649362
## 0.7777778 0.01062105 0.8995076 0.841744295
## 0.7777778 0.02114211 0.8702828 0.792848016
## 0.7777778 0.03166316 0.8403144 0.742565709
## 0.7777778 0.04218421 0.8185465 0.705475651
## 0.7777778 0.05270526 0.8000578 0.673427996
## 0.7777778 0.06322632 0.7791844 0.637160317
## 0.7777778 0.07374737 0.7541384 0.592666218
## 0.7777778 0.08426842 0.7225302 0.535172888
## 0.7777778 0.09478947 0.6925599 0.479407805
## 0.7777778 0.10531053 0.6654243 0.427487099
## 0.7777778 0.11583158 0.6348607 0.367014354
## 0.7777778 0.12635263 0.6114501 0.320802071
## 0.7777778 0.13687368 0.5910243 0.278354415
## 0.7777778 0.14739474 0.5623981 0.214256892
## 0.7777778 0.15791579 0.5445051 0.173286578
## 0.7777778 0.16843684 0.5424174 0.168463487
## 0.7777778 0.17895789 0.4747276 0.008396347
## 0.7777778 0.18947895 0.4723423 0.000000000
## 0.7777778 0.20000000 0.4723423 0.000000000
## 0.8888889 0.00010000 0.9254495 0.885082798
## 0.8888889 0.01062105 0.8951835 0.834679057
## 0.8888889 0.02114211 0.8617834 0.778752739
## 0.8888889 0.03166316 0.8309208 0.726791525
## 0.8888889 0.04218421 0.8079603 0.687356478
## 0.8888889 0.05270526 0.7848496 0.647438239
## 0.8888889 0.06322632 0.7589085 0.601470788
## 0.8888889 0.07374737 0.7238719 0.538030267
## 0.8888889 0.08426842 0.6843592 0.464918763
## 0.8888889 0.09478947 0.6576708 0.412196911
## 0.8888889 0.10531053 0.6317292 0.361248044
## 0.8888889 0.11583158 0.5966897 0.290166037
## 0.8888889 0.12635263 0.5881910 0.272940861
## 0.8888889 0.13687368 0.5446540 0.173679630
## 0.8888889 0.14739474 0.5424174 0.168463487
## 0.8888889 0.15791579 0.4747276 0.008396347
## 0.8888889 0.16843684 0.4723423 0.000000000
## 0.8888889 0.17895789 0.4723423 0.000000000
## 0.8888889 0.18947895 0.4723423 0.000000000
## 0.8888889 0.20000000 0.4723423 0.000000000
## 1.0000000 0.00010000 0.9253004 0.884847590
## 1.0000000 0.01062105 0.8910086 0.827841415
## 1.0000000 0.02114211 0.8522421 0.762756297
## 1.0000000 0.03166316 0.8203357 0.708885262
## 1.0000000 0.04218421 0.7937953 0.663275461
## 1.0000000 0.05270526 0.7672576 0.616733754
## 1.0000000 0.06322632 0.7353535 0.559025857
## 1.0000000 0.07374737 0.6939007 0.482905804
## 1.0000000 0.08426842 0.6590123 0.415237781
## 1.0000000 0.09478947 0.6309837 0.360081837
## 1.0000000 0.10531053 0.5934096 0.284300537
## 1.0000000 0.11583158 0.5875946 0.272165240
## 1.0000000 0.12635263 0.5446540 0.173679630
## 1.0000000 0.13687368 0.5424174 0.168463487
## 1.0000000 0.14739474 0.4723423 0.000000000
## 1.0000000 0.15791579 0.4723423 0.000000000
## 1.0000000 0.16843684 0.4723423 0.000000000
## 1.0000000 0.17895789 0.4723423 0.000000000
## 1.0000000 0.18947895 0.4723423 0.000000000
## 1.0000000 0.20000000 0.4723423 0.000000000
##
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.01062105.
print(fit$bestTune)
## alpha lambda
## 2 0 0.01062105
# Best values: alpha = 0, lambda = 0.0106
# alpha = 0 means the optimal model is pure ridge regression: rather than dropping predictors the
# way lasso would, it keeps every word feature and shrinks its coefficient. This suggests that many
# words each carry a small amount of information about province, so retaining all of them (with
# shrinkage) predicts better than a sparse model.
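# Optional sketch (hedged: for a multinomial glmnet, coef() returns one sparse matrix per class):
# pull the largest ridge coefficients for New_York at the chosen lambda to see which features
# drive that class.
ny_coefs <- coef(fit$finalModel, s = fit$bestTune$lambda)$New_York
head(sort(ny_coefs[, 1], decreasing = TRUE), 10)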
set.seed(504)
wine_index <- createDataPartition(wino$province, p = 0.80, list = FALSE)
train <- wino[ wine_index, ]
test <- wino[-wine_index, ]
# refit glmnet at the selected alpha and lambda
fit_final <- train(province ~ .,
                   data = train,
                   method = "glmnet",
                   tuneGrid = fit$bestTune)
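# Note (an aside, not in the original code): even with a single tuneGrid row, caret's default is to
# bootstrap-resample; passing trControl = trainControl(method = "none") would skip that resampling
# and simply fit the model once on the training set.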
confusionMatrix(predict(fit_final, test),factor(test$province))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Burgundy California Casablanca_Valley Marlborough New_York
## Burgundy 222 2 1 0 0
## California 2 756 7 6 5
## Casablanca_Valley 0 0 14 0 0
## Marlborough 0 1 0 32 4
## New_York 0 0 0 0 14
## Oregon 14 32 4 7 3
## Reference
## Prediction Oregon
## Burgundy 2
## California 32
## Casablanca_Valley 1
## Marlborough 1
## New_York 0
## Oregon 511
##
## Overall Statistics
##
## Accuracy : 0.9259
## 95% CI : (0.9123, 0.938)
## No Information Rate : 0.4728
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8842
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity 0.9328 0.9558 0.538462
## Specificity 0.9965 0.9410 0.999393
## Pos Pred Value 0.9780 0.9356 0.933333
## Neg Pred Value 0.9889 0.9595 0.992762
## Prevalence 0.1423 0.4728 0.015541
## Detection Rate 0.1327 0.4519 0.008368
## Detection Prevalence 0.1357 0.4830 0.008966
## Balanced Accuracy 0.9646 0.9484 0.768927
## Class: Marlborough Class: New_York Class: Oregon
## Sensitivity 0.71111 0.538462 0.9342
## Specificity 0.99631 1.000000 0.9467
## Pos Pred Value 0.84211 1.000000 0.8949
## Neg Pred Value 0.99205 0.992767 0.9673
## Prevalence 0.02690 0.015541 0.3270
## Detection Rate 0.01913 0.008368 0.3054
## Detection Prevalence 0.02271 0.008368 0.3413
## Balanced Accuracy 0.85371 0.769231 0.9405