Load in the datasets

wineDf <- read.csv("G:/Documents/DATA622_HW1/winemag-data_first150k.csv")
ramenDf <- read.csv("G:/Documents/DATA622_HW1/ramen-ratings.csv")

Remove the review number (an unnecessary key) and cast the star rating as numeric

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ramenDf <- ramenDf %>% select(-c('Review..')) %>% mutate(Stars = as.numeric(Stars)) %>% relocate(Stars)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Stars = as.numeric(Stars)`.
## Caused by warning:
## ! NAs introduced by coercion
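
The warning means a few Stars entries are not numeric and become NA after coercion. An optional check of the raw column shows which values are affected; this is a minimal sketch assuming the same CSV path as above, with raw as a hypothetical throwaway name:

# re-read the raw file and list the Stars values that fail numeric coercion
raw <- read.csv("G:/Documents/DATA622_HW1/ramen-ratings.csv")
unique(raw$Stars[is.na(suppressWarnings(as.numeric(raw$Stars)))])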

Here I create some dummy variables for country of origin and ramen style

ramenDf <- ramenDf %>%
  mutate(japan  = ifelse(Country == 'Japan', 1, 0),
         pack   = ifelse(Style == 'Pack', 1, 0),
         usa    = ifelse(Country == 'USA', 1, 0),
         cup    = ifelse(Style == 'Cup', 1, 0),
         korea  = ifelse(Country == 'South Korea', 1, 0),
         bowl   = ifelse(Style == 'Bowl', 1, 0),
         taiwan = ifelse(Country == 'Taiwan', 1, 0),
         tray   = ifelse(Style == 'Tray', 1, 0),
         nissin = ifelse(Brand == 'Nissin', 1, 0))
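
As an aside, indicator columns like these can also be generated in one call with model.matrix(); a minimal sketch for comparison only (not used below; country_dummies is a hypothetical name):

# one 0/1 column per country level; names look like CountryJapan, CountryUSA, ...
country_dummies <- model.matrix(~ Country - 1, data = ramenDf)
colnames(country_dummies)[1:5]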

Here I use pointblank::scan_data(), a data-exploration function that reports missingness and correlations

pointblank::scan_data(ramenDf)

Overview of ramenDf (scan_data report, condensed)

Table overview: 15 columns, 2,580 rows; 3 NAs (0.01%); 4 duplicate rows (0.16%); 10 numeric and 5 character columns.

Reproducibility: scan built 2024-03-13 22:51:57 with pointblank 0.11.4 under R 4.3.2 (2023-10-31 ucrt) on x86_64-w64-mingw32.

Variable summaries, in column order: Stars has 43 distinct values, the table's 3 NAs, mean 3.65, and range 0 to 5. Brand has 355 distinct values, Variety 2,413, Style 8, Country 38, and Top.Ten 39, all with no NAs. The nine 0/1 dummies have means of japan 0.14, pack 0.59, usa 0.13, cup 0.17, korea 0.12, bowl 0.19, taiwan 0.09, tray 0.04, and nissin 0.15. No column contains Inf/-Inf values.

The full report also includes interaction, correlation, and missing-value views plus a sample of the first and last rows of ramenDf.

Here I split the ramen dataset into training and test sets for the random forest

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(datasets)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
ramenDf_cat <- ramenDf %>% filter(!is.na(Stars)) %>% select(c("Stars",   "Brand",   "Variety", "Style",   "Country"))
set.seed(222)
ind <- sample(2, nrow(ramenDf_cat), replace = TRUE, prob = c(0.7, 0.3))
train <- ramenDf_cat[ind==1,]
test <- ramenDf_cat[ind==2,]
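
For reference, caret (loaded above) offers a stratified alternative to this sample()-based split; a sketch, not used here (idx is a hypothetical name):

# createDataPartition stratifies on the outcome, keeping the Stars distribution similar in both sets
idx <- caret::createDataPartition(ramenDf_cat$Stars, p = 0.7, list = FALSE)
# train <- ramenDf_cat[idx, ]; test <- ramenDf_cat[-idx, ]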

Here I train my random forest model for the ramen dataset

rf <- randomForest(Stars~., data=train,
                       keep.forest=TRUE, importance=TRUE)
print(rf)
## 
## Call:
##  randomForest(formula = Stars ~ ., data = train, keep.forest = TRUE,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.8697774
##                     % Var explained: 18.53
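
Since the forest was grown with importance = TRUE, the relative influence of each predictor can be inspected directly; a quick sketch:

# permutation importance (%IncMSE) and node-purity importance per predictor
randomForest::importance(rf)
randomForest::varImpPlot(rf)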

Here I test the accuracy by rounding predictions up to the nearest quarter star and comparing them to the true ratings in a confusion matrix

library(plyr)
## Warning: package 'plyr' was built under R version 4.3.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
y_predL = predict(rf, newdata = test, predict.all=TRUE)

caret::confusionMatrix(as.factor(round_any(as.numeric(unlist(y_predL[[1]])), 0.25, f = ceiling)), as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in
## confusionMatrix.default(as.factor(round_any(as.numeric(unlist(y_predL[[1]])), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0 0.1 0.25 0.5  1 1.25 1.5 1.75 1.8  2 2.25 2.3 2.5 2.75 2.85  3
##      0      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.1    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.75   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.8    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.25   0   0    0   0  0    0   0    1   0  0    0   0   0    0    0  0
##      2.3    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  1
##      2.75   1   0    0   1  0    0   0    0   0  0    0   0   0    0    0  1
##      2.85   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      3      0   0    0   0  0    0   0    0   0  0    1   0   1    0    0  2
##      3.25   0   0    0   0  0    1   1    1   0  2    1   1   0    2    0  8
##      3.5    0   0    1   0  0    1   2    0   0  1    1   0   5    1    0  8
##      3.6    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      3.75   3   0    1   1  6    1   9    2   1  7    0   0   8   10    1 18
##      4      1   1    1   0  2    1   2    3   0  5    2   0  10    7    0 18
##      4.125  0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      4.25   0   0    0   0  1    0   1    0   0  1    0   0   1    2    0  2
##      4.3    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      4.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      4.75   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      5      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##           Reference
## Prediction 3.25 3.5 3.6 3.75  4 4.125 4.25 4.3 4.5 4.75  5
##      0        0   0   0    0  0     0    0   0   0    0  0
##      0.1      0   0   0    0  0     0    0   0   0    0  0
##      0.25     0   0   0    0  0     0    0   0   0    0  0
##      0.5      0   0   0    0  0     0    0   0   0    0  0
##      1        0   0   0    0  0     0    0   0   0    0  0
##      1.25     0   0   0    0  0     0    0   0   0    0  0
##      1.5      0   0   0    0  0     0    0   0   0    0  0
##      1.75     0   0   0    0  0     0    0   0   0    0  0
##      1.8      0   0   0    0  0     0    0   0   0    0  0
##      2        0   0   0    0  0     0    0   0   0    0  0
##      2.25     0   0   0    2  0     0    0   0   1    0  0
##      2.3      0   0   0    0  0     0    0   0   0    0  0
##      2.5      0   0   0    0  0     0    0   0   0    0  0
##      2.75     1   1   0    3  1     0    1   0   1    0  0
##      2.85     0   0   0    0  0     0    0   0   0    0  0
##      3        0   2   0    7  0     0    0   0   0    1  1
##      3.25     2   7   0    4  4     0    1   0   0    0  4
##      3.5     10  12   0   15 17     0   10   0   4    2  9
##      3.6      0   0   0    0  0     0    0   0   0    0  0
##      3.75    18  39   0   51 44     0   22   1   9    8 40
##      4       23  41   1   33 49     0   12   1  10    9 44
##      4.125    0   0   0    0  0     0    0   0   0    0  0
##      4.25     2   2   0    2  5     1    3   0   5    3 13
##      4.3      0   0   0    0  0     0    0   0   0    0  0
##      4.5      0   0   0    0  1     0    0   0   0    0  1
##      4.75     0   0   0    0  0     0    0   0   0    0  0
##      5        0   0   0    0  0     0    0   0   0    0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1504          
##                  95% CI : (0.1262, 0.1773)
##     No Information Rate : 0.153           
##     P-Value [Acc > NIR] : 0.5933          
##                                           
##                   Kappa : 0.0181          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 0.1 Class: 0.25 Class: 0.5 Class: 1
## Sensitivity          0.000000   0.000000    0.000000   0.000000  0.00000
## Specificity          1.000000   1.000000    1.000000   1.000000  1.00000
## Pos Pred Value            NaN        NaN         NaN        NaN      NaN
## Neg Pred Value       0.993679   0.998736    0.996207   0.997472  0.98862
## Prevalence           0.006321   0.001264    0.003793   0.002528  0.01138
## Detection Rate       0.000000   0.000000    0.000000   0.000000  0.00000
## Detection Prevalence 0.000000   0.000000    0.000000   0.000000  0.00000
## Balanced Accuracy    0.500000   0.500000    0.500000   0.500000  0.50000
##                      Class: 1.25 Class: 1.5 Class: 1.75 Class: 1.8 Class: 2
## Sensitivity             0.000000    0.00000     0.00000   0.000000  0.00000
## Specificity             1.000000    1.00000     1.00000   1.000000  1.00000
## Pos Pred Value               NaN        NaN         NaN        NaN      NaN
## Neg Pred Value          0.994943    0.98104     0.99115   0.998736  0.97977
## Prevalence              0.005057    0.01896     0.00885   0.001264  0.02023
## Detection Rate          0.000000    0.00000     0.00000   0.000000  0.00000
## Detection Prevalence    0.000000    0.00000     0.00000   0.000000  0.00000
## Balanced Accuracy       0.500000    0.50000     0.50000   0.500000  0.50000
##                      Class: 2.25 Class: 2.3 Class: 2.5 Class: 2.75 Class: 2.85
## Sensitivity             0.000000   0.000000   0.000000     0.00000    0.000000
## Specificity             0.994911   1.000000   0.998695     0.98570    1.000000
## Pos Pred Value          0.000000        NaN   0.000000     0.00000         NaN
## Neg Pred Value          0.993647   0.998736   0.968354     0.97179    0.998736
## Prevalence              0.006321   0.001264   0.031606     0.02781    0.001264
## Detection Rate          0.000000   0.000000   0.000000     0.00000    0.000000
## Detection Prevalence    0.005057   0.000000   0.001264     0.01391    0.000000
## Balanced Accuracy       0.497455   0.500000   0.499347     0.49285    0.500000
##                      Class: 3 Class: 3.25 Class: 3.5 Class: 3.6 Class: 3.75
## Sensitivity          0.034483    0.035714    0.11538   0.000000     0.43590
## Specificity          0.982265    0.949660    0.87336   1.000000     0.63056
## Pos Pred Value       0.133333    0.051282    0.12121        NaN     0.17000
## Neg Pred Value       0.927835    0.928191    0.86705   0.998736     0.86558
## Prevalence           0.073325    0.070796    0.13148   0.001264     0.14791
## Detection Rate       0.002528    0.002528    0.01517   0.000000     0.06448
## Detection Prevalence 0.018963    0.049305    0.12516   0.000000     0.37927
## Balanced Accuracy    0.508374    0.492687    0.49437   0.500000     0.53323
##                      Class: 4 Class: 4.125 Class: 4.25 Class: 4.3 Class: 4.5
## Sensitivity           0.40496     0.000000    0.061224   0.000000   0.000000
## Specificity           0.66119     1.000000    0.944744   1.000000   0.997372
## Pos Pred Value        0.17754          NaN    0.068182        NaN   0.000000
## Neg Pred Value        0.86019     0.998736    0.938420   0.997472   0.961977
## Prevalence            0.15297     0.001264    0.061947   0.002528   0.037927
## Detection Rate        0.06195     0.000000    0.003793   0.000000   0.000000
## Detection Prevalence  0.34893     0.000000    0.055626   0.000000   0.002528
## Balanced Accuracy     0.53308     0.500000    0.502984   0.500000   0.498686
##                      Class: 4.75 Class: 5
## Sensitivity              0.00000   0.0000
## Specificity              1.00000   1.0000
## Pos Pred Value               NaN      NaN
## Neg Pred Value           0.97092   0.8584
## Prevalence               0.02908   0.1416
## Detection Rate           0.00000   0.0000
## Detection Prevalence     0.00000   0.0000
## Balanced Accuracy        0.50000   0.5000
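
Exact-match accuracy is a harsh yardstick for a regression forest, since predicting 3.74 against a true 3.75 counts as a miss. A continuous error metric is a useful complement; a minimal sketch reusing the predictions above (pred_rf is a hypothetical name):

# the aggregate component holds the ensemble-mean predictions from predict.all = TRUE
pred_rf <- y_predL$aggregate
sqrt(mean((pred_rf - test$Stars)^2))  # test-set RMSE
mean(abs(pred_rf - test$Stars))       # test-set MAE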

Here I select my target and dummy variables for linear regression

ramenDf_dum <- ramenDf %>% select(c("Stars","japan","pack","usa", "cup","korea","bowl","taiwan","tray","nissin" )) %>% filter(!is.na(Stars))

Now I split my data into training and test sets and run my linear regression. Note that the model below is fit on the full ramenDf_dum rather than on the training subset, so the test-set evaluation that follows is not strictly out-of-sample.

set.seed(222)
ind <- sample(2, nrow(ramenDf_dum), replace = TRUE, prob = c(0.7, 0.3))
train <- ramenDf_dum[ind==1,]
test <- ramenDf_dum[ind==2,]
lin_model <- lm(Stars ~ japan + usa + cup + korea + bowl + tray + nissin + pack + taiwan, ramenDf_dum)
#
summary(lin_model)
## 
## Call:
## lm(formula = Stars ~ japan + usa + cup + korea + bowl + tray + 
##     nissin + pack + taiwan, data = ramenDf_dum)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8626 -0.4298  0.1374  0.6411  1.8482 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.07521    0.31522  12.928  < 2e-16 ***
## japan        0.32834    0.06195   5.300 1.25e-07 ***
## usa         -0.16180    0.06439  -2.513   0.0120 *  
## cup         -0.76165    0.31875  -2.390   0.0169 *  
## korea        0.25379    0.06352   3.996 6.64e-05 ***
## bowl        -0.56973    0.31766  -1.794   0.0730 .  
## tray        -0.64919    0.32888  -1.974   0.0485 *  
## nissin       0.41307    0.06026   6.854 8.93e-12 ***
## pack        -0.46635    0.31556  -1.478   0.1396    
## taiwan       0.07663    0.07248   1.057   0.2905    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.993 on 2567 degrees of freedom
## Multiple R-squared:  0.04692,    Adjusted R-squared:  0.04358 
## F-statistic: 14.04 on 9 and 2567 DF,  p-value: < 2.2e-16

Here I test the accuracy of my linear regression model for my ramen dataset

library(plyr)
y_predL = predict(lin_model, newdata = test)  # predict.lm returns a numeric vector; it has no predict.all argument

caret::confusionMatrix(as.factor(round_any(as.numeric(unlist(y_predL[])), 0.25, f = ceiling)), as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in
## confusionMatrix.default(as.factor(round_any(as.numeric(unlist(y_predL[])), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0 0.1 0.25 0.5  1 1.25 1.5 1.75 1.8  2 2.25 2.3 2.5 2.75 2.85  3
##      0      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.1    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      0.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.75   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      1.8    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.3    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.5    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.75   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      2.85   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      3      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      3.25   0   0    0   0  0    0   0    1   0  0    0   0   0    0    0  2
##      3.5    2   0    0   0  1    3   6    1   0  3    2   0   5    6    0 11
##      3.6    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      3.75   3   1    3   2  6    1   8    5   0 11    2   1  15   14    1 32
##      4      0   0    0   0  2    0   1    0   1  1    1   0   3    2    0 10
##      4.125  0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      4.25   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  3
##      4.3    0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      4.5    0   0    0   0  0    0   0    0   0  1    0   0   2    0    0  0
##      4.75   0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##      5      0   0    0   0  0    0   0    0   0  0    0   0   0    0    0  0
##           Reference
## Prediction 3.25 3.5 3.6 3.75  4 4.125 4.25 4.3 4.5 4.75  5
##      0        0   0   0    0  0     0    0   0   0    0  0
##      0.1      0   0   0    0  0     0    0   0   0    0  0
##      0.25     0   0   0    0  0     0    0   0   0    0  0
##      0.5      0   0   0    0  0     0    0   0   0    0  0
##      1        0   0   0    0  0     0    0   0   0    0  0
##      1.25     0   0   0    0  0     0    0   0   0    0  0
##      1.5      0   0   0    0  0     0    0   0   0    0  0
##      1.75     0   0   0    0  0     0    0   0   0    0  0
##      1.8      0   0   0    0  0     0    0   0   0    0  0
##      2        0   0   0    0  0     0    0   0   0    0  0
##      2.25     0   0   0    0  0     0    0   0   0    0  0
##      2.3      0   0   0    0  0     0    0   0   0    0  0
##      2.5      0   0   0    0  0     0    0   0   0    0  0
##      2.75     0   0   0    0  0     0    0   0   0    0  0
##      2.85     0   0   0    0  0     0    0   0   0    0  0
##      3        0   0   0    0  0     0    0   0   0    0  0
##      3.25     0   4   0    1  1     0    0   0   0    0  3
##      3.5      8  17   0   21 15     0   10   0   4    1  3
##      3.6      0   0   0    0  0     0    0   0   0    0  0
##      3.75    32  59   1   65 64     0   23   2  12   10 61
##      4       14  20   0   20 30     1   12   0   9    9 28
##      4.125    0   0   0    0  0     0    0   0   0    0  0
##      4.25     2   3   0    9  7     0    4   0   4    3 14
##      4.3      0   0   0    0  0     0    0   0   0    0  0
##      4.5      0   1   0    1  4     0    0   0   1    0  3
##      4.75     0   0   0    0  0     0    0   0   0    0  0
##      5        0   0   0    0  0     0    0   0   0    0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1479          
##                  95% CI : (0.1239, 0.1746)
##     No Information Rate : 0.153           
##     P-Value [Acc > NIR] : 0.6683          
##                                           
##                   Kappa : 0.0113          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 0.1 Class: 0.25 Class: 0.5 Class: 1
## Sensitivity          0.000000   0.000000    0.000000   0.000000  0.00000
## Specificity          1.000000   1.000000    1.000000   1.000000  1.00000
## Pos Pred Value            NaN        NaN         NaN        NaN      NaN
## Neg Pred Value       0.993679   0.998736    0.996207   0.997472  0.98862
## Prevalence           0.006321   0.001264    0.003793   0.002528  0.01138
## Detection Rate       0.000000   0.000000    0.000000   0.000000  0.00000
## Detection Prevalence 0.000000   0.000000    0.000000   0.000000  0.00000
## Balanced Accuracy    0.500000   0.500000    0.500000   0.500000  0.50000
##                      Class: 1.25 Class: 1.5 Class: 1.75 Class: 1.8 Class: 2
## Sensitivity             0.000000    0.00000     0.00000   0.000000  0.00000
## Specificity             1.000000    1.00000     1.00000   1.000000  1.00000
## Pos Pred Value               NaN        NaN         NaN        NaN      NaN
## Neg Pred Value          0.994943    0.98104     0.99115   0.998736  0.97977
## Prevalence              0.005057    0.01896     0.00885   0.001264  0.02023
## Detection Rate          0.000000    0.00000     0.00000   0.000000  0.00000
## Detection Prevalence    0.000000    0.00000     0.00000   0.000000  0.00000
## Balanced Accuracy       0.500000    0.50000     0.50000   0.500000  0.50000
##                      Class: 2.25 Class: 2.3 Class: 2.5 Class: 2.75 Class: 2.85
## Sensitivity             0.000000   0.000000    0.00000     0.00000    0.000000
## Specificity             1.000000   1.000000    1.00000     1.00000    1.000000
## Pos Pred Value               NaN        NaN        NaN         NaN         NaN
## Neg Pred Value          0.993679   0.998736    0.96839     0.97219    0.998736
## Prevalence              0.006321   0.001264    0.03161     0.02781    0.001264
## Detection Rate          0.000000   0.000000    0.00000     0.00000    0.000000
## Detection Prevalence    0.000000   0.000000    0.00000     0.00000    0.000000
## Balanced Accuracy       0.500000   0.500000    0.50000     0.50000    0.500000
##                      Class: 3 Class: 3.25 Class: 3.5 Class: 3.6 Class: 3.75
## Sensitivity           0.00000     0.00000    0.16346   0.000000     0.55556
## Specificity           1.00000     0.98367    0.85153   1.000000     0.45252
## Pos Pred Value            NaN     0.00000    0.14286        NaN     0.14977
## Neg Pred Value        0.92668     0.92811    0.87054   0.998736     0.85434
## Prevalence            0.07332     0.07080    0.13148   0.001264     0.14791
## Detection Rate        0.00000     0.00000    0.02149   0.000000     0.08217
## Detection Prevalence  0.00000     0.01517    0.15044   0.000000     0.54867
## Balanced Accuracy     0.50000     0.49184    0.50749   0.500000     0.50404
##                      Class: 4 Class: 4.125 Class: 4.25 Class: 4.3 Class: 4.5
## Sensitivity           0.24793     0.000000    0.081633   0.000000   0.033333
## Specificity           0.80000     1.000000    0.939353   1.000000   0.984231
## Pos Pred Value        0.18293          NaN    0.081633        NaN   0.076923
## Neg Pred Value        0.85486     0.998736    0.939353   0.997472   0.962725
## Prevalence            0.15297     0.001264    0.061947   0.002528   0.037927
## Detection Rate        0.03793     0.000000    0.005057   0.000000   0.001264
## Detection Prevalence  0.20733     0.000000    0.061947   0.000000   0.016435
## Balanced Accuracy     0.52397     0.500000    0.510493   0.500000   0.508782
##                      Class: 4.75 Class: 5
## Sensitivity              0.00000   0.0000
## Specificity              1.00000   1.0000
## Pos Pred Value               NaN      NaN
## Neg Pred Value           0.97092   0.8584
## Prevalence               0.02908   0.1416
## Detection Rate           0.00000   0.0000
## Detection Prevalence     0.00000   0.0000
## Balanced Accuracy        0.50000   0.5000
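
The same continuous check applies to the linear model; a short sketch using the predictions above:

sqrt(mean((as.numeric(y_predL) - test$Stars)^2))  # test-set RMSE for the linear model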

# Wine dataset

Here I drop the free-text columns I won't process

library(dplyr)
wineDf <- wineDf %>% select(-c("X", "description", "designation"))

Here I create some dummy variables for my linear regression

wineDf <- wineDf %>% relocate(points)
wineDf <- wineDf %>% mutate(points = as.numeric(points), usa = as.numeric(country == 'US'), france = as.numeric(country == 'France'), italy = as.numeric(country == 'Italy'), spain = as.numeric(country == 'Spain'))
wineDf <- wineDf %>% mutate(chard = as.numeric(variety == 'Chardonnay'), pinot = as.numeric(variety == 'Pinot Noir'), cab = as.numeric(variety == 'Cabernet Sauvignon'), rblend = as.numeric(variety == 'Red Blend'), bordeaux = as.numeric(variety == 'Bordeaux-style Red Blend'))

reduced_wineDf <- wineDf %>% select(points, price, usa, france, italy, spain, chard, pinot, cab, rblend, bordeaux) %>% filter(!is.na(points))

test1 <- wineDf[1:5000,]

Here I explore the data for missingness and correlations

pointblank::scan_data(test1)

Overview of test1 (scan_data report, condensed)

Table overview: 17 columns, 5,000 rows; 269 NAs (0.32%); 337 duplicate rows (6.74%); 11 numeric and 6 character columns.

Reproducibility: scan built 2024-03-13 22:53:52 with pointblank 0.11.4 under R 4.3.2 (2023-10-31 ucrt) on x86_64-w64-mingw32.

Variable summaries, in column order: points has 20 distinct values, no NAs, mean 89.26, and range 81 to 100. country has 31 distinct values; price has 137 distinct values, all 269 of the table's NAs, mean 37.32, and range 4 to 848; province has 167, region_1 397, region_2 19, variety 242, and winery 2,711 distinct values, all with no NAs. The nine 0/1 dummies have means of usa 0.43, france 0.18, italy 0.15, spain 0.06, chard 0.08, pinot 0.11, cab 0.07, rblend 0.07, and bordeaux 0.06. No column contains Inf/-Inf values.

The full report also includes interaction, correlation, and missing-value views plus a sample of the first and last rows of test1.

Here I create the training and test sets for the wine linear regression and fit my model

set.seed(222)
ind <- sample(2, nrow(reduced_wineDf), replace = TRUE, prob = c(0.7, 0.3))
train <- reduced_wineDf[ind==1,]
test <- reduced_wineDf[ind==2,]

lin_model <- lm(points ~ price + usa + france + italy + spain + chard + pinot + cab + rblend + bordeaux, train)
#
summary(lin_model)
## 
## Call:
## lm(formula = points ~ price + usa + france + italy + spain + 
##     chard + pinot + cab + rblend + bordeaux, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.567  -1.867   0.005   2.026  11.132 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 86.2731004  0.0203899 4231.176  < 2e-16 ***
## price        0.0379975  0.0002552  148.874  < 2e-16 ***
## usa          0.1249285  0.0236931    5.273 1.35e-07 ***
## france       0.4696567  0.0350231   13.410  < 2e-16 ***
## italy        0.7431201  0.0318046   23.365  < 2e-16 ***
## spain       -0.6469356  0.0421324  -15.355  < 2e-16 ***
## chard        0.0131372  0.0320928    0.409   0.6823    
## pinot        0.5616782  0.0328073   17.121  < 2e-16 ***
## cab          0.1076149  0.0336863    3.195   0.0014 ** 
## rblend       0.1602617  0.0372419    4.303 1.68e-05 ***
## bordeaux     0.4298177  0.0534644    8.039 9.14e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.848 on 96015 degrees of freedom
##   (9601 observations deleted due to missingness)
## Multiple R-squared:  0.2201, Adjusted R-squared:   0.22 
## F-statistic:  2710 on 10 and 96015 DF,  p-value: < 2.2e-16
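
The summary notes that 9,601 observations were dropped due to missingness, which traces back to the NA prices seen in the scan above. Making that explicit keeps the effective training size transparent; a sketch:

# lm() applies na.omit by default; this counts the rows it silently dropped
sum(is.na(train$price))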

Now I test the accuracy of my linear model

library(plyr)
y_predL = predict(lin_model, newdata = test)  # predict.lm returns a numeric vector; it has no predict.all argument
temp <- as.data.frame(round_any(as.numeric(unlist(y_predL[])), 1 , f = ceiling))
names(temp) <- c('prediction')
temp <- temp %>% mutate(prediction = as.factor(ifelse(prediction > 100, 100, prediction)))
caret::confusionMatrix(temp$prediction, as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(temp$prediction, as.factor(test[, 1])):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   80   81   82   83   84   85   86   87   88   89   90   91   92
##        80     0    0    0    0    0    0    0    0    0    0    0    0    0
##        81     0    0    0    0    0    0    0    0    0    0    0    0    0
##        82     0    0    0    0    0    0    0    0    0    0    0    0    0
##        83     0    0    0    0    0    0    0    0    0    0    0    0    0
##        84     0    0    0    0    0    0    0    0    0    0    0    0    0
##        85     0    0    0    0    0    0    0    0    0    0    0    0    0
##        86    14   12   15   31   44   33   33   25    9    2    1    2    0
##        87   144  225  549  882 1322 1370 1405 1386 1028  565  452  147   58
##        88    99  163  571  763 1522 1793 2345 3379 2558 1728 2099 1216  857
##        89     9   18   52   72  205  265  424  787  920  869 1200  875  818
##        90     1    1   10    8   34   40   62  139  211  193  394  383  453
##        91     0    0    2    2    2    9   19   23   38   52   87  101  121
##        92     0    0    1    1    3    4    3   14   21   19   30   29   47
##        93     0    0    0    0    0    0    2    8    0    4    5   17   25
##        94     0    0    0    1    3    0    0    0    1    0   10    9   10
##        95     0    0    0    1    0    0    0    0    1    2    5    1   11
##        96     0    0    0    0    0    0    0    1    0    0    4    1    1
##        97     0    0    0    0    0    0    0    0    0    1    1    0    2
##        98     0    0    0    0    0    0    2    0    1    2    0    1    3
##        99     0    0    0    0    0    0    0    0    0    0    1    0    0
##        100    0    0    0    0    0    0    0    1    0    2    1    3    3
##           Reference
## Prediction   93   94   95   96   97   98   99  100
##        80     0    0    0    0    0    0    0    0
##        81     0    0    0    0    0    0    0    0
##        82     0    0    0    0    0    0    0    0
##        83     0    0    0    0    0    0    0    0
##        84     0    0    0    0    0    0    0    0
##        85     0    0    0    0    0    0    0    0
##        86     0    0    0    0    0    0    0    0
##        87    10    2    0    0    0    0    0    0
##        88   370  113   26   11    0    0    0    0
##        89   476  309   96   26    7    3    1    2
##        90   364  219  116   48   18    6    0    0
##        91   140  121   45   28   20    5    1    0
##        92    63   55   40   14   10    4    0    0
##        93    34   30   15    9    4    0    0    0
##        94    19   13   15   12    4    4    0    0
##        95    12   10    6    5    3    1    1    2
##        96    10   11    8    2    5    1    1    1
##        97     6    4   13    1    7    0    1    0
##        98     9    8    1    3    3    2    1    1
##        99     2    1    2    0    3    1    0    0
##        100    4    8   22   14   12    5    2    1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1323          
##                  95% CI : (0.1291, 0.1356)
##     No Information Rate : 0.1398          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0227          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 80 Class: 81 Class: 82 Class: 83 Class: 84
## Sensitivity           0.000000   0.00000   0.00000   0.00000   0.00000
## Specificity           1.000000   1.00000   1.00000   1.00000   1.00000
## Pos Pred Value             NaN       NaN       NaN       NaN       NaN
## Neg Pred Value        0.993521   0.98983   0.97088   0.95727   0.92392
## Prevalence            0.006479   0.01017   0.02912   0.04273   0.07608
## Detection Rate        0.000000   0.00000   0.00000   0.00000   0.00000
## Detection Prevalence  0.000000   0.00000   0.00000   0.00000   0.00000
## Balanced Accuracy     0.500000   0.50000   0.50000   0.50000   0.50000
##                      Class: 85 Class: 86 Class: 87 Class: 88 Class: 89
## Sensitivity            0.00000 0.0076834   0.24050   0.53425   0.25269
## Specificity            1.00000 0.9949071   0.76982   0.53173   0.82618
## Pos Pred Value             NaN 0.1493213   0.14521   0.13042   0.11690
## Neg Pred Value         0.91473 0.8960183   0.86177   0.89674   0.92391
## Prevalence             0.08527 0.1042248   0.13985   0.11619   0.08345
## Detection Rate         0.00000 0.0008008   0.03363   0.06207   0.02109
## Detection Prevalence   0.00000 0.0053629   0.23162   0.47594   0.18040
## Balanced Accuracy      0.50000 0.5012952   0.50516   0.53299   0.53944
##                      Class: 90 Class: 91 Class: 92 Class: 93 Class: 94
## Sensitivity           0.091841  0.036266  0.019510 0.0223831 0.0143805
## Specificity           0.937539  0.981392  0.991985 0.9970018 0.9978166
## Pos Pred Value        0.145926  0.123775  0.131285 0.2222222 0.1287129
## Neg Pred Value        0.898829  0.933553  0.942180 0.9638299 0.9783254
## Prevalence            0.104103  0.067582  0.058458 0.0368609 0.0219370
## Detection Rate        0.009561  0.002451  0.001141 0.0008251 0.0003155
## Detection Prevalence  0.065520  0.019801  0.008687 0.0037128 0.0024509
## Balanced Accuracy     0.514690  0.508829  0.505747 0.5096925 0.5060986
##                      Class: 95 Class: 96 Class: 97 Class: 98 Class: 99
## Sensitivity          0.0148148 1.156e-02 0.0729167 6.250e-02 0.0000000
## Specificity          0.9986521 9.989e-01 0.9992946 9.992e-01 0.9997573
## Pos Pred Value       0.0983607 4.348e-02 0.1944444 5.405e-02 0.0000000
## Neg Pred Value       0.9903033 9.958e-01 0.9978384 9.993e-01 0.9998058
## Prevalence           0.0098280 4.198e-03 0.0023296 7.765e-04 0.0001941
## Detection Rate       0.0001456 4.853e-05 0.0001699 4.853e-05 0.0000000
## Detection Prevalence 0.0014803 1.116e-03 0.0008736 8.979e-04 0.0002427
## Balanced Accuracy    0.5067335 5.052e-01 0.5361056 5.308e-01 0.4998786
##                      Class: 100
## Sensitivity           1.429e-01
## Specificity           9.981e-01
## Pos Pred Value        1.282e-02
## Neg Pred Value        9.999e-01
## Prevalence            1.699e-04
## Detection Rate        2.427e-05
## Detection Prevalence  1.893e-03
## Balanced Accuracy     5.705e-01
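
Again, a continuous metric is a fairer summary for a regression fit; a sketch (na.rm skips rows whose missing price yields an NA prediction):

sqrt(mean((as.numeric(y_predL) - test$points)^2, na.rm = TRUE))  # test-set RMSE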

Here I create my training and test datasets for the wine random forest

wineDf_cat <- wineDf %>% filter(!is.na(points)) %>% select(c("points", "country","price","province","region_1","region_2","variety","winery")) %>% na.omit()
set.seed(222)
#wineDf_cat <- wineDf_cat[1:50000,]
ind <- sample(2, nrow(wineDf_cat), replace = TRUE, prob = c(0.7, 0.3))
train <- wineDf_cat[ind==1,]
test <- wineDf_cat[ind==2,]

Here I build my wine random forest model

rf <- randomForest::randomForest(points~., data=train, ntree = 10,
                       keep.forest=TRUE, importance=TRUE)
print(rf)
## 
## Call:
##  randomForest(formula = points ~ ., data = train, ntree = 10,      keep.forest = TRUE, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 10
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 4.997418
##                     % Var explained: 51.95
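
Ten trees is a small forest, and out-of-bag error typically keeps falling as trees are added. A sketch of the same call with more trees, runtime permitting (rf_more is a hypothetical name; runtime grows roughly linearly with ntree):

rf_more <- randomForest::randomForest(points ~ ., data = train, ntree = 100,
                                      keep.forest = TRUE, importance = TRUE)
print(rf_more)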

Here I test the accuracy of my random forest model

library(plyr)
y_predL = predict(rf, newdata = test, predict.all=TRUE)
temp <- as.data.frame(round_any(as.numeric(unlist(y_predL[1])), 1 , f = ceiling))
names(temp) <- c('prediction')
temp <- temp %>% mutate(prediction = as.factor(ifelse(prediction > 100, 100, prediction)))
caret::confusionMatrix(temp$prediction, as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(temp$prediction, as.factor(test[, 1])):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   80   81   82   83   84   85   86   87   88   89   90   91   92
##        80     0    0    0    0    0    0    0    0    0    0    0    0    0
##        81     0    0    0    0    0    0    0    0    0    0    0    0    0
##        82     0    0    0    0    0    0    0    0    0    0    0    0    0
##        83     1    1    6    3    6    0    2    0    0    1    0    0    0
##        84    20   21   70  107   99   86   59   54   18   11    4    6    0
##        85    65  109  242  383  562  504  440  359  200   88   68   23   12
##        86    68  122  295  508  851  934 1098 1044  653  303  294  100   52
##        87    39   91  266  386  761  854 1105 1529 1070  619  626  275  161
##        88    25   48  166  212  400  496  789 1302 1199  732  827  434  303
##        89    15   33   85  113  214  311  464  719  873  762  988  632  548
##        90     3   10   27   50  114  168  279  456  521  587  780  656  628
##        91     2    5   11   12   38   52  100  167  268  256  445  455  466
##        92     0    1    3    5    7   13   18   38   57   82  144  151  205
##        93     0    1    0    0    3    5    7   15   12   20   28   37   79
##        94     0    0    0    2    0    2    0    2    7    8    9    9   17
##        95     0    0    0    0    0    0    0    0    0    1    2    0    4
##        96     0    0    0    0    0    0    0    0    0    0    0    0    0
##        97     0    0    0    0    0    0    0    0    0    0    0    0    1
##        98     0    0    0    0    0    0    0    0    0    0    0    0    0
##        99     0    0    0    0    0    0    0    0    0    0    0    0    0
##        100    0    0    0    0    0    0    0    0    0    0    0    0    0
##           Reference
## Prediction   93   94   95   96   97   98   99  100
##        80     0    0    0    0    0    0    0    0
##        81     0    0    0    0    0    0    0    0
##        82     0    0    0    0    0    0    0    0
##        83     0    0    0    0    0    0    0    0
##        84     0    0    0    0    0    0    0    0
##        85     2    3    0    0    0    0    0    0
##        86     8    1    3    1    0    0    0    0
##        87    51   16    2    0    0    0    0    0
##        88   119   55    6    2    0    0    0    0
##        89   250  124   52    9    1    3    1    0
##        90   381  172   56   19    5    0    0    0
##        91   376  252  111   46   19    2    1    0
##        92   201  162   83   32   21    9    1    2
##        93    86   78   47   29   16    4    5    1
##        94    25   38   26   21   16    4    2    2
##        95    15   19   22   10    7    5    2    2
##        96     2    2    2    3    7    0    1    1
##        97     2    0    1    0    0    3    0    0
##        98     0    0    0    0    0    0    0    0
##        99     0    0    0    0    0    0    0    0
##        100    0    0    0    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1649          
##                  95% CI : (0.1613, 0.1685)
##     No Information Rate : 0.1382          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.0692          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 80 Class: 81 Class: 82 Class: 83 Class: 84
## Sensitivity           0.000000   0.00000   0.00000 1.684e-03  0.032406
## Specificity           1.000000   1.00000   1.00000 9.996e-01  0.988027
## Pos Pred Value             NaN       NaN       NaN 1.500e-01  0.178378
## Neg Pred Value        0.994215   0.98926   0.97154 9.568e-01  0.927167
## Prevalence            0.005785   0.01074   0.02846 4.329e-02  0.074257
## Detection Rate        0.000000   0.00000   0.00000 7.292e-05  0.002406
## Detection Prevalence  0.000000   0.00000   0.00000 4.861e-04  0.013490
## Balanced Accuracy     0.500000   0.50000   0.50000 5.006e-01  0.510216
##                      Class: 85 Class: 86 Class: 87 Class: 88 Class: 89
## Sensitivity            0.14715   0.25178   0.26895   0.24580   0.21960
## Specificity            0.93223   0.85761   0.82169   0.83686   0.85572
## Pos Pred Value         0.16471   0.17332   0.19475   0.16852   0.12296
## Neg Pred Value         0.92330   0.90625   0.87516   0.89188   0.92250
## Prevalence             0.08325   0.10600   0.13818   0.11857   0.08434
## Detection Rate         0.01225   0.02669   0.03716   0.02914   0.01852
## Detection Prevalence   0.07438   0.15398   0.19083   0.17294   0.15063
## Balanced Accuracy      0.53969   0.55469   0.54532   0.54133   0.53766
##                      Class: 90 Class: 91 Class: 92 Class: 93 Class: 94
## Sensitivity            0.18505   0.16379  0.082795   0.05665 0.0412148
## Specificity            0.88810   0.93147  0.973361   0.99023 0.9962207
## Pos Pred Value         0.15879   0.14754  0.165992   0.18182 0.2000000
## Neg Pred Value         0.90519   0.93896  0.943091   0.96479 0.9784132
## Prevalence             0.10245   0.06752  0.060183   0.03690 0.0224107
## Detection Rate         0.01896   0.01106  0.004983   0.00209 0.0009237
## Detection Prevalence   0.11939   0.07496  0.030019   0.01150 0.0046183
## Balanced Accuracy      0.53658   0.54763  0.528078   0.52344 0.5187177
##                      Class: 95 Class: 96 Class: 97 Class: 98 Class: 99
## Sensitivity          0.0535280 1.744e-02 0.0000000 0.0000000  0.000000
## Specificity          0.9983550 9.996e-01 0.9998295 1.0000000  1.000000
## Pos Pred Value       0.2471910 1.667e-01 0.0000000       NaN       NaN
## Neg Pred Value       0.9905242 9.959e-01 0.9977634 0.9992708  0.999684
## Prevalence           0.0099900 4.181e-03 0.0022362 0.0007292  0.000316
## Detection Rate       0.0005347 7.292e-05 0.0000000 0.0000000  0.000000
## Detection Prevalence 0.0021633 4.375e-04 0.0001701 0.0000000  0.000000
## Balanced Accuracy    0.5259415 5.085e-01 0.4999147 0.5000000  0.500000
##                      Class: 100
## Sensitivity           0.0000000
## Specificity           1.0000000
## Pos Pred Value              NaN
## Neg Pred Value        0.9998055
## Prevalence            0.0001945
## Detection Rate        0.0000000
## Detection Prevalence  0.0000000
## Balanced Accuracy     0.5000000
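
As with the ramen forest, RMSE over the aggregate predictions gives a more informative read than exact-match accuracy; a sketch:

pred_rf <- y_predL$aggregate
sqrt(mean((pred_rf - test$points)^2))  # test-set RMSE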
