Setup

# add your libraries
library(tidyverse)
library(caret)
wine = read_rds("/Users/rochellerafn/RStudio Files/pinot.rds") 

Feature Engineering

This was not as instructive as hoped.
# create some cool features. Make sure you add comments so I know what you are trying to accomplish!

# wines<- wine %>% 
#   rename_all(funs(tolower(.))) %>% 
#   rename_all(funs(str_replace_all(., "-", "_"))) %>% 
#   rename_all(funs(str_replace_all(., " ", "_"))) %>% 
#   mutate(first = str_detect(description,"fruit")) %>% 
#   mutate(second = str_detect(description,"intense")) %>%
#   mutate(eighth = str_detect(description,"currant")) %>%
#   select(-description)

Specification

So this initially was what we went through - looking at three words using KNN. The kappa was… not great! (kappa 0.2179) We left this in just to show one of the steps wer worked through.
# specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
# 
# ctrl <- trainControl(method = "cv", number = 5)
# set.seed(504) 
# 
# wine_index <- createDataPartition(wines$province, p = 0.80, list = FALSE)
# train <- wines[ wine_index, ]
# test <- wines[-wine_index, ]
# 
# # example spec for knn
# fit <- train(province ~ .,
#              data = train, 
#              method = "knn",
#              tuneLength = 15,
#              metric = "Kappa",
#              trControl = ctrl)
# 
# confusionMatrix(predict(fit, test),factor(test$province))
# # 
# tree_model = rpart(province ~ .,
#                     control = rpart.control(minsplit = 2, minbucket = 1, cp = 0, xval = 10),
#                     data = train)
# 
# variable_importance = varImp(tree_model, scale=FALSE)
# 
# print(variable_importance)
Random forest on 3 features: fruit, currant, intense. This was slightly better than KNN (kappa 0.46439)
# ctrl <- trainControl(method = "cv", number=3)
# 
# fit <- train(province ~ .,
#              data = train, 
#              method = "rf",
#              verbose=FALSE,
#              trControl = ctrl)
# 
# fit

This is our model to present.

We’ve selected a series of feature words and trained a random forest model. We spent time manipulating and adjusting our filter! word selection with numerous combinations, differing words and more and less words. This is the best combination of word feaatures that we came up with.
wine<- wine%>%
  rename_all(funs(tolower(.))) %>% 
  rename_all(funs(str_replace_all(., "-", "_"))) %>% 
  rename_all(funs(str_replace_all(., " ", "_"))) %>% 
  mutate(fruit = str_detect(description,"fruit")) %>% 
  mutate(intensity = str_detect(description,"palate"))%>%
  mutate(currant = str_detect(description,"nose"))


wine_words <- function(df, j = 100, stem=F){ 
  library(tidytext)
  library(SnowballC)
  data(stop_words)

  words <- df %>%
    unnest_tokens(word, description) %>%
    anti_join(stop_words) %>% # get rid of stop words
    filter(!(word %in% c("wine","pinot","vineyard", "structure", "drink", "tannins")))
  
  if(stem){
    words <- words %>% 
      mutate(word = wordStem(word))
  }
  
  words <- words %>% 
    count(id, word) %>% 
    group_by(id) %>% 
    mutate(exists = (n>0)) %>% 
    ungroup %>% 
    group_by(word) %>% 
    mutate(total = sum(n)) %>% 
    filter(total > j) %>% 
    pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = list(exists=0)) %>% 
    right_join(select(df,id,province)) %>% 
    drop_na() %>% 
    select(-id)
}


winot <- wine_words(wine, j=500)
wine_index <- createDataPartition(winot$province, p = 0.80, list = FALSE)
train <- winot[ wine_index, ]
test <- winot[-wine_index, ]

table(train$province)
## 
##          Burgundy        California Casablanca_Valley       Marlborough 
##               953              3161               105               184 
##          New_York            Oregon 
##               105              2184
ctrl <- trainControl(method = "cv", number=3)

fit <- train(province ~ .,
             data = train, 
             method = "rf",
             verbose=FALSE,
             trControl = ctrl)

confusionMatrix(predict(fit, test),factor(test$province))
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Burgundy California Casablanca_Valley Marlborough New_York
##   Burgundy               202         14                 1           2        0
##   California              14        715                17           7       11
##   Casablanca_Valley        0          1                 5           0        0
##   Marlborough              0          5                 0          20        1
##   New_York                 0          1                 0           0        8
##   Oregon                  22         54                 3          16        6
##                    Reference
## Prediction          Oregon
##   Burgundy              23
##   California            89
##   Casablanca_Valley      0
##   Marlborough            6
##   New_York               0
##   Oregon               427
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8246          
##                  95% CI : (0.8054, 0.8425)
##     No Information Rate : 0.4731          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7232          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity                   0.8487            0.9051                 0.192308
## Specificity                   0.9721            0.8432                 0.999392
## Pos Pred Value                0.8347            0.8382                 0.833333
## Neg Pred Value                0.9748            0.9082                 0.987380
## Prevalence                    0.1425            0.4731                 0.015569
## Detection Rate                0.1210            0.4281                 0.002994
## Detection Prevalence          0.1449            0.5108                 0.003593
## Balanced Accuracy             0.9104            0.8741                 0.595850
##                      Class: Marlborough Class: New_York Class: Oregon
## Sensitivity                     0.44444        0.307692        0.7835
## Specificity                     0.99262        0.999392        0.9102
## Pos Pred Value                  0.62500        0.888889        0.8087
## Neg Pred Value                  0.98474        0.989163        0.8967
## Prevalence                      0.02695        0.015569        0.3263
## Detection Rate                  0.01198        0.004790        0.2557
## Detection Prevalence            0.01916        0.005389        0.3162
## Balanced Accuracy               0.71853        0.653542        0.8469

Best model

# Here are a few lines to inspect your best model. Add some comments about optimal hyperparameters.
print(fit)
## Random Forest 
## 
## 6692 samples
##   51 predictor
##    6 classes: 'Burgundy', 'California', 'Casablanca_Valley', 'Marlborough', 'New_York', 'Oregon' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 4462, 4461, 4461 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8120134  0.6961685
##   26    0.8203825  0.7176561
##   51    0.8042434  0.6937743
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
print(fit$bestTune)
##   mtry
## 2   26

Re-fit and evaluation

# the "method" below should match the one you chose above. 

set.seed(504) # I will choose a different seed for evaluation

wine_index <- createDataPartition(winot$province, p = 0.80, list = FALSE)
train <- winot[ wine_index, ]
test <- winot[-wine_index, ]

# example spec for knn
fit_final <- train(province ~ .,
             data = train, 
             method = "rf",
             tuneGrid=fit$bestTune) 

# The last line means we will fit a model using the best tune parameters your CV found above.
confusionMatrix(predict(fit_final, test),factor(test$province))
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          Burgundy California Casablanca_Valley Marlborough New_York
##   Burgundy               203          6                 0           1        1
##   California              20        713                 9          16       13
##   Casablanca_Valley        0          1                13           0        0
##   Marlborough              1          4                 0          15        1
##   New_York                 0          1                 0           2        6
##   Oregon                  14         65                 4          11        5
##                    Reference
## Prediction          Oregon
##   Burgundy              28
##   California            76
##   Casablanca_Valley      0
##   Marlborough            9
##   New_York               0
##   Oregon               432
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8275          
##                  95% CI : (0.8086, 0.8454)
##     No Information Rate : 0.4731          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7285          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity                   0.8529            0.9025                 0.500000
## Specificity                   0.9749            0.8477                 0.999392
## Pos Pred Value                0.8494            0.8418                 0.928571
## Neg Pred Value                0.9755            0.9064                 0.992150
## Prevalence                    0.1425            0.4731                 0.015569
## Detection Rate                0.1216            0.4269                 0.007784
## Detection Prevalence          0.1431            0.5072                 0.008383
## Balanced Accuracy             0.9139            0.8751                 0.749696
##                      Class: Marlborough Class: New_York Class: Oregon
## Sensitivity                    0.333333        0.230769        0.7927
## Specificity                    0.990769        0.998175        0.9120
## Pos Pred Value                 0.500000        0.666667        0.8136
## Neg Pred Value                 0.981707        0.987959        0.9008
## Prevalence                     0.026946        0.015569        0.3263
## Detection Rate                 0.008982        0.003593        0.2587
## Detection Prevalence           0.017964        0.005389        0.3180
## Balanced Accuracy              0.662051        0.614472        0.8523