# add your libraries
library(tidyverse)
library(caret)
wine = read_rds("/Users/rochellerafn/RStudio Files/pinot.rds")
# create some cool features. Make sure you add comments so I know what you are trying to accomplish!
# wines<- wine %>%
# rename_all(funs(tolower(.))) %>%
# rename_all(funs(str_replace_all(., "-", "_"))) %>%
# rename_all(funs(str_replace_all(., " ", "_"))) %>%
# mutate(first = str_detect(description,"fruit")) %>%
# mutate(second = str_detect(description,"intense")) %>%
# mutate(eighth = str_detect(description,"currant")) %>%
# select(-description)
# specify the model to be used (i.e. KNN, Naive Bayes, decision tree, random forest, bagged trees) and the tuning parameters used
#
# ctrl <- trainControl(method = "cv", number = 5)
# set.seed(504)
#
# wine_index <- createDataPartition(wines$province, p = 0.80, list = FALSE)
# train <- wines[ wine_index, ]
# test <- wines[-wine_index, ]
#
# # example spec for knn
# fit <- train(province ~ .,
# data = train,
# method = "knn",
# tuneLength = 15,
# metric = "Kappa",
# trControl = ctrl)
#
# confusionMatrix(predict(fit, test),factor(test$province))
# #
# tree_model = rpart(province ~ .,
# control = rpart.control(minsplit = 2, minbucket = 1, cp = 0, xval = 10),
# data = train)
#
# variable_importance = varImp(tree_model, scale=FALSE)
#
# print(variable_importance)
# ctrl <- trainControl(method = "cv", number=3)
#
# fit <- train(province ~ .,
# data = train,
# method = "rf",
# verbose=FALSE,
# trControl = ctrl)
#
# fit
wine<- wine%>%
rename_all(funs(tolower(.))) %>%
rename_all(funs(str_replace_all(., "-", "_"))) %>%
rename_all(funs(str_replace_all(., " ", "_"))) %>%
mutate(fruit = str_detect(description,"fruit")) %>%
mutate(intensity = str_detect(description,"palate"))%>%
mutate(currant = str_detect(description,"nose"))
wine_words <- function(df, j = 100, stem=F){
library(tidytext)
library(SnowballC)
data(stop_words)
words <- df %>%
unnest_tokens(word, description) %>%
anti_join(stop_words) %>% # get rid of stop words
filter(!(word %in% c("wine","pinot","vineyard", "structure", "drink", "tannins")))
if(stem){
words <- words %>%
mutate(word = wordStem(word))
}
words <- words %>%
count(id, word) %>%
group_by(id) %>%
mutate(exists = (n>0)) %>%
ungroup %>%
group_by(word) %>%
mutate(total = sum(n)) %>%
filter(total > j) %>%
pivot_wider(id_cols = id, names_from = word, values_from = exists, values_fill = list(exists=0)) %>%
right_join(select(df,id,province)) %>%
drop_na() %>%
select(-id)
}
winot <- wine_words(wine, j=500)
wine_index <- createDataPartition(winot$province, p = 0.80, list = FALSE)
train <- winot[ wine_index, ]
test <- winot[-wine_index, ]
table(train$province)
##
## Burgundy California Casablanca_Valley Marlborough
## 953 3161 105 184
## New_York Oregon
## 105 2184
ctrl <- trainControl(method = "cv", number=3)
fit <- train(province ~ .,
data = train,
method = "rf",
verbose=FALSE,
trControl = ctrl)
confusionMatrix(predict(fit, test),factor(test$province))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Burgundy California Casablanca_Valley Marlborough New_York
## Burgundy 202 14 1 2 0
## California 14 715 17 7 11
## Casablanca_Valley 0 1 5 0 0
## Marlborough 0 5 0 20 1
## New_York 0 1 0 0 8
## Oregon 22 54 3 16 6
## Reference
## Prediction Oregon
## Burgundy 23
## California 89
## Casablanca_Valley 0
## Marlborough 6
## New_York 0
## Oregon 427
##
## Overall Statistics
##
## Accuracy : 0.8246
## 95% CI : (0.8054, 0.8425)
## No Information Rate : 0.4731
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7232
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity 0.8487 0.9051 0.192308
## Specificity 0.9721 0.8432 0.999392
## Pos Pred Value 0.8347 0.8382 0.833333
## Neg Pred Value 0.9748 0.9082 0.987380
## Prevalence 0.1425 0.4731 0.015569
## Detection Rate 0.1210 0.4281 0.002994
## Detection Prevalence 0.1449 0.5108 0.003593
## Balanced Accuracy 0.9104 0.8741 0.595850
## Class: Marlborough Class: New_York Class: Oregon
## Sensitivity 0.44444 0.307692 0.7835
## Specificity 0.99262 0.999392 0.9102
## Pos Pred Value 0.62500 0.888889 0.8087
## Neg Pred Value 0.98474 0.989163 0.8967
## Prevalence 0.02695 0.015569 0.3263
## Detection Rate 0.01198 0.004790 0.2557
## Detection Prevalence 0.01916 0.005389 0.3162
## Balanced Accuracy 0.71853 0.653542 0.8469
# Here are a few lines to inspect your best model. Add some comments about optimal hyperparameters.
print(fit)
## Random Forest
##
## 6692 samples
## 51 predictor
## 6 classes: 'Burgundy', 'California', 'Casablanca_Valley', 'Marlborough', 'New_York', 'Oregon'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 4462, 4461, 4461
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8120134 0.6961685
## 26 0.8203825 0.7176561
## 51 0.8042434 0.6937743
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
print(fit$bestTune)
## mtry
## 2 26
# the "method" below should match the one you chose above.
set.seed(504) # I will choose a different seed for evaluation
wine_index <- createDataPartition(winot$province, p = 0.80, list = FALSE)
train <- winot[ wine_index, ]
test <- winot[-wine_index, ]
# example spec for knn
fit_final <- train(province ~ .,
data = train,
method = "rf",
tuneGrid=fit$bestTune)
# The last line means we will fit a model using the best tune parameters your CV found above.
confusionMatrix(predict(fit_final, test),factor(test$province))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Burgundy California Casablanca_Valley Marlborough New_York
## Burgundy 203 6 0 1 1
## California 20 713 9 16 13
## Casablanca_Valley 0 1 13 0 0
## Marlborough 1 4 0 15 1
## New_York 0 1 0 2 6
## Oregon 14 65 4 11 5
## Reference
## Prediction Oregon
## Burgundy 28
## California 76
## Casablanca_Valley 0
## Marlborough 9
## New_York 0
## Oregon 432
##
## Overall Statistics
##
## Accuracy : 0.8275
## 95% CI : (0.8086, 0.8454)
## No Information Rate : 0.4731
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7285
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Burgundy Class: California Class: Casablanca_Valley
## Sensitivity 0.8529 0.9025 0.500000
## Specificity 0.9749 0.8477 0.999392
## Pos Pred Value 0.8494 0.8418 0.928571
## Neg Pred Value 0.9755 0.9064 0.992150
## Prevalence 0.1425 0.4731 0.015569
## Detection Rate 0.1216 0.4269 0.007784
## Detection Prevalence 0.1431 0.5072 0.008383
## Balanced Accuracy 0.9139 0.8751 0.749696
## Class: Marlborough Class: New_York Class: Oregon
## Sensitivity 0.333333 0.230769 0.7927
## Specificity 0.990769 0.998175 0.9120
## Pos Pred Value 0.500000 0.666667 0.8136
## Neg Pred Value 0.981707 0.987959 0.9008
## Prevalence 0.026946 0.015569 0.3263
## Detection Rate 0.008982 0.003593 0.2587
## Detection Prevalence 0.017964 0.005389 0.3180
## Balanced Accuracy 0.662051 0.614472 0.8523