Load in the datasets
wineDf <- read.csv("G:/Documents/DATA622_HW1/winemag-data_first150k.csv")
ramenDf <- read.csv("G:/Documents/DATA622_HW1/ramen-ratings.csv")
Remove the review number (an unnecessary key) and cast the star rating as numeric
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ramenDf <- ramenDf %>% select(-c('Review..')) %>% mutate(Stars = as.numeric(Stars)) %>% relocate(Stars)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Stars = as.numeric(Stars)`.
## Caused by warning:
## ! NAs introduced by coercion
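To see exactly which values failed to parse, we can re-read the raw Stars column and list the offenders (a quick sketch; rawStars is just a scratch name):
rawStars <- read.csv("G:/Documents/DATA622_HW1/ramen-ratings.csv")$Stars
unique(rawStars[is.na(suppressWarnings(as.numeric(rawStars)))])  # e.g. the "Unrated" entries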
Here I create some dummy variables for country of origin and ramen style
ramenDf <- ramenDf %>%
  mutate(japan  = ifelse(Country == 'Japan', 1, 0),
         usa    = ifelse(Country == 'USA', 1, 0),
         korea  = ifelse(Country == 'South Korea', 1, 0),
         taiwan = ifelse(Country == 'Taiwan', 1, 0),
         pack   = ifelse(Style == 'Pack', 1, 0),
         cup    = ifelse(Style == 'Cup', 1, 0),
         bowl   = ifelse(Style == 'Bowl', 1, 0),
         tray   = ifelse(Style == 'Tray', 1, 0),
         nissin = ifelse(Brand == 'Nissin', 1, 0))
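A more scalable alternative is to generate all dummies at once (a sketch only, assuming the fastDummies package is available; the results below use the hand-built dummies above):
library(fastDummies)
ramenDf_alt <- dummy_cols(ramenDf, select_columns = c("Country", "Style"))  # one 0/1 column per level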
Here we use a data exploration function, pointblank::scan_data(), that lets us look at missingness and correlation
pointblank::scan_data(ramenDf)
(scan_data report) Table Overview: 15 columns, 2,580 rows; 3 missing cells (0.01%) and 4 duplicate rows (0.16%). The interactive report also previews the first and last five rows of the table and lists reproducibility information (R version 4.3.2 (2023-10-31 ucrt)).
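The headline numbers can also be reproduced without the interactive report (a minimal sketch):
colSums(is.na(ramenDf))   # per-column NA counts; should match the 3 missing cells above
sum(duplicated(ramenDf))  # duplicate rows; should match the 4 reported above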
Here we split the ramen dataset into a training and test set for our random forest
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(datasets)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
ramenDf_cat <- ramenDf %>% filter(!is.na(Stars)) %>% select(c("Stars", "Brand", "Variety", "Style", "Country"))
set.seed(222)
ind <- sample(2, nrow(ramenDf_cat), replace = TRUE, prob = c(0.7, 0.3))
train <- ramenDf_cat[ind==1,]
test <- ramenDf_cat[ind==2,]
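caret offers a stratified alternative to the coin-flip split above (a sketch; this is not what generated the results below):
idx <- caret::createDataPartition(ramenDf_cat$Stars, p = 0.7, list = FALSE)
train2 <- ramenDf_cat[idx, ]   # scratch names, to avoid clobbering train/test
test2  <- ramenDf_cat[-idx, ]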
Here I train my random forest model for the ramen dataset
rf <- randomForest(Stars~., data=train,
keep.forest=TRUE, importance=TRUE)
print(rf)
##
## Call:
## randomForest(formula = Stars ~ ., data = train, keep.forest = TRUE, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.8697774
## % Var explained: 18.53
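Since the model was fit with importance=TRUE, we can check which predictors drive the 18.5% of variance explained:
randomForest::importance(rf)  # %IncMSE and IncNodePurity per predictor
randomForest::varImpPlot(rf)  # the same information as a plot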
Here I test the accuracy by rounding the predictions up to the nearest quarter star and comparing them to the true ratings in a confusion matrix
library(plyr)
## Warning: package 'plyr' was built under R version 4.3.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
y_predL = predict(rf, newdata = test, predict.all=TRUE)
caret::confusionMatrix(as.factor(round_any(as.numeric(unlist(y_predL[[1]])), 0.25, f = ceiling)), as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in
## confusionMatrix.default(as.factor(round_any(as.numeric(unlist(y_predL[[1]])), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 0.1 0.25 0.5 1 1.25 1.5 1.75 1.8 2 2.25 2.3 2.5 2.75 2.85 3
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.25 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 2.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 2.75 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
## 2.85 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2
## 3.25 0 0 0 0 0 1 1 1 0 2 1 1 0 2 0 8
## 3.5 0 0 1 0 0 1 2 0 0 1 1 0 5 1 0 8
## 3.6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3.75 3 0 1 1 6 1 9 2 1 7 0 0 8 10 1 18
## 4 1 1 1 0 2 1 2 3 0 5 2 0 10 7 0 18
## 4.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4.25 0 0 0 0 1 0 1 0 0 1 0 0 1 2 0 2
## 4.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4.75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction 3.25 3.5 3.6 3.75 4 4.125 4.25 4.3 4.5 4.75 5
## 0 0 0 0 0 0 0 0 0 0 0 0
## 0.1 0 0 0 0 0 0 0 0 0 0 0
## 0.25 0 0 0 0 0 0 0 0 0 0 0
## 0.5 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0
## 1.25 0 0 0 0 0 0 0 0 0 0 0
## 1.5 0 0 0 0 0 0 0 0 0 0 0
## 1.75 0 0 0 0 0 0 0 0 0 0 0
## 1.8 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 2.25 0 0 0 2 0 0 0 0 1 0 0
## 2.3 0 0 0 0 0 0 0 0 0 0 0
## 2.5 0 0 0 0 0 0 0 0 0 0 0
## 2.75 1 1 0 3 1 0 1 0 1 0 0
## 2.85 0 0 0 0 0 0 0 0 0 0 0
## 3 0 2 0 7 0 0 0 0 0 1 1
## 3.25 2 7 0 4 4 0 1 0 0 0 4
## 3.5 10 12 0 15 17 0 10 0 4 2 9
## 3.6 0 0 0 0 0 0 0 0 0 0 0
## 3.75 18 39 0 51 44 0 22 1 9 8 40
## 4 23 41 1 33 49 0 12 1 10 9 44
## 4.125 0 0 0 0 0 0 0 0 0 0 0
## 4.25 2 2 0 2 5 1 3 0 5 3 13
## 4.3 0 0 0 0 0 0 0 0 0 0 0
## 4.5 0 0 0 0 1 0 0 0 0 0 1
## 4.75 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.1504
## 95% CI : (0.1262, 0.1773)
## No Information Rate : 0.153
## P-Value [Acc > NIR] : 0.5933
##
## Kappa : 0.0181
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 0.1 Class: 0.25 Class: 0.5 Class: 1
## Sensitivity 0.000000 0.000000 0.000000 0.000000 0.00000
## Specificity 1.000000 1.000000 1.000000 1.000000 1.00000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.993679 0.998736 0.996207 0.997472 0.98862
## Prevalence 0.006321 0.001264 0.003793 0.002528 0.01138
## Detection Rate 0.000000 0.000000 0.000000 0.000000 0.00000
## Detection Prevalence 0.000000 0.000000 0.000000 0.000000 0.00000
## Balanced Accuracy 0.500000 0.500000 0.500000 0.500000 0.50000
## Class: 1.25 Class: 1.5 Class: 1.75 Class: 1.8 Class: 2
## Sensitivity 0.000000 0.00000 0.00000 0.000000 0.00000
## Specificity 1.000000 1.00000 1.00000 1.000000 1.00000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.994943 0.98104 0.99115 0.998736 0.97977
## Prevalence 0.005057 0.01896 0.00885 0.001264 0.02023
## Detection Rate 0.000000 0.00000 0.00000 0.000000 0.00000
## Detection Prevalence 0.000000 0.00000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.500000 0.50000 0.50000 0.500000 0.50000
## Class: 2.25 Class: 2.3 Class: 2.5 Class: 2.75 Class: 2.85
## Sensitivity 0.000000 0.000000 0.000000 0.00000 0.000000
## Specificity 0.994911 1.000000 0.998695 0.98570 1.000000
## Pos Pred Value 0.000000 NaN 0.000000 0.00000 NaN
## Neg Pred Value 0.993647 0.998736 0.968354 0.97179 0.998736
## Prevalence 0.006321 0.001264 0.031606 0.02781 0.001264
## Detection Rate 0.000000 0.000000 0.000000 0.00000 0.000000
## Detection Prevalence 0.005057 0.000000 0.001264 0.01391 0.000000
## Balanced Accuracy 0.497455 0.500000 0.499347 0.49285 0.500000
## Class: 3 Class: 3.25 Class: 3.5 Class: 3.6 Class: 3.75
## Sensitivity 0.034483 0.035714 0.11538 0.000000 0.43590
## Specificity 0.982265 0.949660 0.87336 1.000000 0.63056
## Pos Pred Value 0.133333 0.051282 0.12121 NaN 0.17000
## Neg Pred Value 0.927835 0.928191 0.86705 0.998736 0.86558
## Prevalence 0.073325 0.070796 0.13148 0.001264 0.14791
## Detection Rate 0.002528 0.002528 0.01517 0.000000 0.06448
## Detection Prevalence 0.018963 0.049305 0.12516 0.000000 0.37927
## Balanced Accuracy 0.508374 0.492687 0.49437 0.500000 0.53323
## Class: 4 Class: 4.125 Class: 4.25 Class: 4.3 Class: 4.5
## Sensitivity 0.40496 0.000000 0.061224 0.000000 0.000000
## Specificity 0.66119 1.000000 0.944744 1.000000 0.997372
## Pos Pred Value 0.17754 NaN 0.068182 NaN 0.000000
## Neg Pred Value 0.86019 0.998736 0.938420 0.997472 0.961977
## Prevalence 0.15297 0.001264 0.061947 0.002528 0.037927
## Detection Rate 0.06195 0.000000 0.003793 0.000000 0.000000
## Detection Prevalence 0.34893 0.000000 0.055626 0.000000 0.002528
## Balanced Accuracy 0.53308 0.500000 0.502984 0.500000 0.498686
## Class: 4.75 Class: 5
## Sensitivity 0.00000 0.0000
## Specificity 1.00000 1.0000
## Pos Pred Value NaN NaN
## Neg Pred Value 0.97092 0.8584
## Prevalence 0.02908 0.1416
## Detection Rate 0.00000 0.0000
## Detection Prevalence 0.00000 0.0000
## Balanced Accuracy 0.50000 0.5000
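Because Stars is numeric, a regression metric is arguably a fairer summary than the 15% exact-match accuracy above; a minimal sketch:
y_hat <- predict(rf, newdata = test)  # aggregate predictions only
sqrt(mean((y_hat - test$Stars)^2))    # RMSE, in stars
mean(abs(y_hat - test$Stars))         # MAE, in stars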
Here I select my target and dummy variables for linear regression
ramenDf_dum <- ramenDf %>% select(c("Stars","japan","pack","usa", "cup","korea","bowl","taiwan","tray","nissin" )) %>% filter(!is.na(Stars))
Now I split up my training and test data and run my linear regression. Note that the model below is fit on the full ramenDf_dum dataset rather than on the training split, so the test set is not truly held out.
set.seed(222)
ind <- sample(2, nrow(ramenDf_dum), replace = TRUE, prob = c(0.7, 0.3))
train <- ramenDf_dum[ind==1,]
test <- ramenDf_dum[ind==2,]
lin_model <- lm(Stars ~ japan + usa + cup + korea + bowl + tray + nissin + pack + taiwan, ramenDf_dum)
summary(lin_model)
##
## Call:
## lm(formula = Stars ~ japan + usa + cup + korea + bowl + tray +
## nissin + pack + taiwan, data = ramenDf_dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8626 -0.4298 0.1374 0.6411 1.8482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.07521 0.31522 12.928 < 2e-16 ***
## japan 0.32834 0.06195 5.300 1.25e-07 ***
## usa -0.16180 0.06439 -2.513 0.0120 *
## cup -0.76165 0.31875 -2.390 0.0169 *
## korea 0.25379 0.06352 3.996 6.64e-05 ***
## bowl -0.56973 0.31766 -1.794 0.0730 .
## tray -0.64919 0.32888 -1.974 0.0485 *
## nissin 0.41307 0.06026 6.854 8.93e-12 ***
## pack -0.46635 0.31556 -1.478 0.1396
## taiwan 0.07663 0.07248 1.057 0.2905
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.993 on 2567 degrees of freedom
## Multiple R-squared: 0.04692, Adjusted R-squared: 0.04358
## F-statistic: 14.04 on 9 and 2567 DF, p-value: < 2.2e-16
Here I test the accuracy of my linear regression model for my ramen dataset
library(plyr)
y_predL = predict(lin_model, newdata = test)  # predict.lm has no predict.all argument; it was silently ignored
caret::confusionMatrix(as.factor(round_any(as.numeric(unlist(y_predL[])), 0.25, f = ceiling)), as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in
## confusionMatrix.default(as.factor(round_any(as.numeric(unlist(y_predL[])), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 0.1 0.25 0.5 1 1.25 1.5 1.75 1.8 2 2.25 2.3 2.5 2.75 2.85 3
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 0.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2.85 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3.25 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2
## 3.5 2 0 0 0 1 3 6 1 0 3 2 0 5 6 0 11
## 3.6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3.75 3 1 3 2 6 1 8 5 0 11 2 1 15 14 1 32
## 4 0 0 0 0 2 0 1 0 1 1 1 0 3 2 0 10
## 4.125 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4.25 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
## 4.3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4.5 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0
## 4.75 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction 3.25 3.5 3.6 3.75 4 4.125 4.25 4.3 4.5 4.75 5
## 0 0 0 0 0 0 0 0 0 0 0 0
## 0.1 0 0 0 0 0 0 0 0 0 0 0
## 0.25 0 0 0 0 0 0 0 0 0 0 0
## 0.5 0 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0 0 0
## 1.25 0 0 0 0 0 0 0 0 0 0 0
## 1.5 0 0 0 0 0 0 0 0 0 0 0
## 1.75 0 0 0 0 0 0 0 0 0 0 0
## 1.8 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 2.25 0 0 0 0 0 0 0 0 0 0 0
## 2.3 0 0 0 0 0 0 0 0 0 0 0
## 2.5 0 0 0 0 0 0 0 0 0 0 0
## 2.75 0 0 0 0 0 0 0 0 0 0 0
## 2.85 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 3.25 0 4 0 1 1 0 0 0 0 0 3
## 3.5 8 17 0 21 15 0 10 0 4 1 3
## 3.6 0 0 0 0 0 0 0 0 0 0 0
## 3.75 32 59 1 65 64 0 23 2 12 10 61
## 4 14 20 0 20 30 1 12 0 9 9 28
## 4.125 0 0 0 0 0 0 0 0 0 0 0
## 4.25 2 3 0 9 7 0 4 0 4 3 14
## 4.3 0 0 0 0 0 0 0 0 0 0 0
## 4.5 0 1 0 1 4 0 0 0 1 0 3
## 4.75 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.1479
## 95% CI : (0.1239, 0.1746)
## No Information Rate : 0.153
## P-Value [Acc > NIR] : 0.6683
##
## Kappa : 0.0113
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 0.1 Class: 0.25 Class: 0.5 Class: 1
## Sensitivity 0.000000 0.000000 0.000000 0.000000 0.00000
## Specificity 1.000000 1.000000 1.000000 1.000000 1.00000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.993679 0.998736 0.996207 0.997472 0.98862
## Prevalence 0.006321 0.001264 0.003793 0.002528 0.01138
## Detection Rate 0.000000 0.000000 0.000000 0.000000 0.00000
## Detection Prevalence 0.000000 0.000000 0.000000 0.000000 0.00000
## Balanced Accuracy 0.500000 0.500000 0.500000 0.500000 0.50000
## Class: 1.25 Class: 1.5 Class: 1.75 Class: 1.8 Class: 2
## Sensitivity 0.000000 0.00000 0.00000 0.000000 0.00000
## Specificity 1.000000 1.00000 1.00000 1.000000 1.00000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.994943 0.98104 0.99115 0.998736 0.97977
## Prevalence 0.005057 0.01896 0.00885 0.001264 0.02023
## Detection Rate 0.000000 0.00000 0.00000 0.000000 0.00000
## Detection Prevalence 0.000000 0.00000 0.00000 0.000000 0.00000
## Balanced Accuracy 0.500000 0.50000 0.50000 0.500000 0.50000
## Class: 2.25 Class: 2.3 Class: 2.5 Class: 2.75 Class: 2.85
## Sensitivity 0.000000 0.000000 0.00000 0.00000 0.000000
## Specificity 1.000000 1.000000 1.00000 1.00000 1.000000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.993679 0.998736 0.96839 0.97219 0.998736
## Prevalence 0.006321 0.001264 0.03161 0.02781 0.001264
## Detection Rate 0.000000 0.000000 0.00000 0.00000 0.000000
## Detection Prevalence 0.000000 0.000000 0.00000 0.00000 0.000000
## Balanced Accuracy 0.500000 0.500000 0.50000 0.50000 0.500000
## Class: 3 Class: 3.25 Class: 3.5 Class: 3.6 Class: 3.75
## Sensitivity 0.00000 0.00000 0.16346 0.000000 0.55556
## Specificity 1.00000 0.98367 0.85153 1.000000 0.45252
## Pos Pred Value NaN 0.00000 0.14286 NaN 0.14977
## Neg Pred Value 0.92668 0.92811 0.87054 0.998736 0.85434
## Prevalence 0.07332 0.07080 0.13148 0.001264 0.14791
## Detection Rate 0.00000 0.00000 0.02149 0.000000 0.08217
## Detection Prevalence 0.00000 0.01517 0.15044 0.000000 0.54867
## Balanced Accuracy 0.50000 0.49184 0.50749 0.500000 0.50404
## Class: 4 Class: 4.125 Class: 4.25 Class: 4.3 Class: 4.5
## Sensitivity 0.24793 0.000000 0.081633 0.000000 0.033333
## Specificity 0.80000 1.000000 0.939353 1.000000 0.984231
## Pos Pred Value 0.18293 NaN 0.081633 NaN 0.076923
## Neg Pred Value 0.85486 0.998736 0.939353 0.997472 0.962725
## Prevalence 0.15297 0.001264 0.061947 0.002528 0.037927
## Detection Rate 0.03793 0.000000 0.005057 0.000000 0.001264
## Detection Prevalence 0.20733 0.000000 0.061947 0.000000 0.016435
## Balanced Accuracy 0.52397 0.500000 0.510493 0.500000 0.508782
## Class: 4.75 Class: 5
## Sensitivity 0.00000 0.0000
## Specificity 1.00000 1.0000
## Pos Pred Value NaN NaN
## Neg Pred Value 0.97092 0.8584
## Prevalence 0.02908 0.1416
## Detection Rate 0.00000 0.0000
## Detection Prevalence 0.00000 0.0000
## Balanced Accuracy 0.50000 0.5000
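An alternative to exact class agreement is accuracy within a tolerance, say half a star (a sketch):
y_hat <- predict(lin_model, newdata = test)
mean(abs(y_hat - test$Stars) <= 0.5)  # share of predictions within 0.5 stars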
# Wine dataset
Here I drop the free-text columns I won't process
library(dplyr)
wineDf <- wineDf %>% select(-c("X", "description", "designation"))
Here I create some dummy variables for my linear regression
wineDf <- wineDf %>% relocate(points)
wineDf <- wineDf %>% mutate(points = as.numeric(points), usa = as.numeric(country == 'US'), france = as.numeric(country == 'France'), italy = as.numeric(country == 'Italy'), spain = as.numeric(country == 'Spain'))
wineDf <- wineDf %>% mutate(chard = as.numeric(variety == 'Chardonnay'), pinot = as.numeric(variety == 'Pinot Noir'), cab = as.numeric(variety == 'Cabernet Sauvignon'), rblend = as.numeric(variety == 'Red Blend'), bordeaux = as.numeric(variety == 'Bordeaux-style Red Blend'))
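One caveat with these comparisons: in R, NA == 'US' evaluates to NA, so any missing country or variety would yield an NA dummy rather than 0. If that mattered, the NAs could be zeroed out (a sketch; wineDf_clean is a hypothetical name, and this step is not applied below):
wineDf_clean <- wineDf %>%
  mutate(across(c(usa, france, italy, spain, chard, pinot, cab, rblend, bordeaux),
                ~ ifelse(is.na(.x), 0, .x)))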
reduced_wineDf <- wineDf %>% select(points, price, usa, france, italy, spain, chard, pinot, cab, rblend, bordeaux) %>% filter(!is.na(points))
test1 <- wineDf[1:5000,]
Here I explore the data for missingness and correlations
pointblank::scan_data(test1)
(scan_data report) Table Overview: 17 columns, 5,000 rows; 269 missing cells (0.32%) and 337 duplicate rows (6.74%). The interactive report also previews the first and last five rows of the table and lists reproducibility information (R version 4.3.2 (2023-10-31 ucrt)).
Here I create the training and test sets for the wine dataset linear regression and fit my model
set.seed(222)
ind <- sample(2, nrow(reduced_wineDf), replace = TRUE, prob = c(0.7, 0.3))
train <- reduced_wineDf[ind==1,]
test <- reduced_wineDf[ind==2,]
lin_model <- lm(points ~ price + usa + france + italy + spain + chard + pinot + cab + rblend + bordeaux, train)
summary(lin_model)
##
## Call:
## lm(formula = points ~ price + usa + france + italy + spain +
## chard + pinot + cab + rblend + bordeaux, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.567 -1.867 0.005 2.026 11.132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.2731004 0.0203899 4231.176 < 2e-16 ***
## price 0.0379975 0.0002552 148.874 < 2e-16 ***
## usa 0.1249285 0.0236931 5.273 1.35e-07 ***
## france 0.4696567 0.0350231 13.410 < 2e-16 ***
## italy 0.7431201 0.0318046 23.365 < 2e-16 ***
## spain -0.6469356 0.0421324 -15.355 < 2e-16 ***
## chard 0.0131372 0.0320928 0.409 0.6823
## pinot 0.5616782 0.0328073 17.121 < 2e-16 ***
## cab 0.1076149 0.0336863 3.195 0.0014 **
## rblend 0.1602617 0.0372419 4.303 1.68e-05 ***
## bordeaux 0.4298177 0.0534644 8.039 9.14e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.848 on 96015 degrees of freedom
## (9601 observations deleted due to missingness)
## Multiple R-squared: 0.2201, Adjusted R-squared: 0.22
## F-statistic: 2710 on 10 and 96015 DF, p-value: < 2.2e-16
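The 9,601 observations dropped for missingness most likely come from missing price, the only predictor with NAs in reduced_wineDf; a quick check:
sum(is.na(train$price))  # should match the count lm() reports dropping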
Now I test the accuracy of my linear model
library(plyr)
y_predL = predict(lin_model, newdata = test)  # predict.lm has no predict.all argument; it was silently ignored
temp <- as.data.frame(round_any(as.numeric(unlist(y_predL[])), 1 , f = ceiling))
names(temp) <- c('prediction')
temp <- temp %>% mutate(prediction = as.factor(ifelse(prediction > 100, 100, prediction)))
caret::confusionMatrix(temp$prediction, as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(temp$prediction, as.factor(test[, 1])):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 80 81 82 83 84 85 86 87 88 89 90 91 92
## 80 0 0 0 0 0 0 0 0 0 0 0 0 0
## 81 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82 0 0 0 0 0 0 0 0 0 0 0 0 0
## 83 0 0 0 0 0 0 0 0 0 0 0 0 0
## 84 0 0 0 0 0 0 0 0 0 0 0 0 0
## 85 0 0 0 0 0 0 0 0 0 0 0 0 0
## 86 14 12 15 31 44 33 33 25 9 2 1 2 0
## 87 144 225 549 882 1322 1370 1405 1386 1028 565 452 147 58
## 88 99 163 571 763 1522 1793 2345 3379 2558 1728 2099 1216 857
## 89 9 18 52 72 205 265 424 787 920 869 1200 875 818
## 90 1 1 10 8 34 40 62 139 211 193 394 383 453
## 91 0 0 2 2 2 9 19 23 38 52 87 101 121
## 92 0 0 1 1 3 4 3 14 21 19 30 29 47
## 93 0 0 0 0 0 0 2 8 0 4 5 17 25
## 94 0 0 0 1 3 0 0 0 1 0 10 9 10
## 95 0 0 0 1 0 0 0 0 1 2 5 1 11
## 96 0 0 0 0 0 0 0 1 0 0 4 1 1
## 97 0 0 0 0 0 0 0 0 0 1 1 0 2
## 98 0 0 0 0 0 0 2 0 1 2 0 1 3
## 99 0 0 0 0 0 0 0 0 0 0 1 0 0
## 100 0 0 0 0 0 0 0 1 0 2 1 3 3
## Reference
## Prediction 93 94 95 96 97 98 99 100
## 80 0 0 0 0 0 0 0 0
## 81 0 0 0 0 0 0 0 0
## 82 0 0 0 0 0 0 0 0
## 83 0 0 0 0 0 0 0 0
## 84 0 0 0 0 0 0 0 0
## 85 0 0 0 0 0 0 0 0
## 86 0 0 0 0 0 0 0 0
## 87 10 2 0 0 0 0 0 0
## 88 370 113 26 11 0 0 0 0
## 89 476 309 96 26 7 3 1 2
## 90 364 219 116 48 18 6 0 0
## 91 140 121 45 28 20 5 1 0
## 92 63 55 40 14 10 4 0 0
## 93 34 30 15 9 4 0 0 0
## 94 19 13 15 12 4 4 0 0
## 95 12 10 6 5 3 1 1 2
## 96 10 11 8 2 5 1 1 1
## 97 6 4 13 1 7 0 1 0
## 98 9 8 1 3 3 2 1 1
## 99 2 1 2 0 3 1 0 0
## 100 4 8 22 14 12 5 2 1
##
## Overall Statistics
##
## Accuracy : 0.1323
## 95% CI : (0.1291, 0.1356)
## No Information Rate : 0.1398
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0227
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 80 Class: 81 Class: 82 Class: 83 Class: 84
## Sensitivity 0.000000 0.00000 0.00000 0.00000 0.00000
## Specificity 1.000000 1.00000 1.00000 1.00000 1.00000
## Pos Pred Value NaN NaN NaN NaN NaN
## Neg Pred Value 0.993521 0.98983 0.97088 0.95727 0.92392
## Prevalence 0.006479 0.01017 0.02912 0.04273 0.07608
## Detection Rate 0.000000 0.00000 0.00000 0.00000 0.00000
## Detection Prevalence 0.000000 0.00000 0.00000 0.00000 0.00000
## Balanced Accuracy 0.500000 0.50000 0.50000 0.50000 0.50000
## Class: 85 Class: 86 Class: 87 Class: 88 Class: 89
## Sensitivity 0.00000 0.0076834 0.24050 0.53425 0.25269
## Specificity 1.00000 0.9949071 0.76982 0.53173 0.82618
## Pos Pred Value NaN 0.1493213 0.14521 0.13042 0.11690
## Neg Pred Value 0.91473 0.8960183 0.86177 0.89674 0.92391
## Prevalence 0.08527 0.1042248 0.13985 0.11619 0.08345
## Detection Rate 0.00000 0.0008008 0.03363 0.06207 0.02109
## Detection Prevalence 0.00000 0.0053629 0.23162 0.47594 0.18040
## Balanced Accuracy 0.50000 0.5012952 0.50516 0.53299 0.53944
## Class: 90 Class: 91 Class: 92 Class: 93 Class: 94
## Sensitivity 0.091841 0.036266 0.019510 0.0223831 0.0143805
## Specificity 0.937539 0.981392 0.991985 0.9970018 0.9978166
## Pos Pred Value 0.145926 0.123775 0.131285 0.2222222 0.1287129
## Neg Pred Value 0.898829 0.933553 0.942180 0.9638299 0.9783254
## Prevalence 0.104103 0.067582 0.058458 0.0368609 0.0219370
## Detection Rate 0.009561 0.002451 0.001141 0.0008251 0.0003155
## Detection Prevalence 0.065520 0.019801 0.008687 0.0037128 0.0024509
## Balanced Accuracy 0.514690 0.508829 0.505747 0.5096925 0.5060986
## Class: 95 Class: 96 Class: 97 Class: 98 Class: 99
## Sensitivity 0.0148148 1.156e-02 0.0729167 6.250e-02 0.0000000
## Specificity 0.9986521 9.989e-01 0.9992946 9.992e-01 0.9997573
## Pos Pred Value 0.0983607 4.348e-02 0.1944444 5.405e-02 0.0000000
## Neg Pred Value 0.9903033 9.958e-01 0.9978384 9.993e-01 0.9998058
## Prevalence 0.0098280 4.198e-03 0.0023296 7.765e-04 0.0001941
## Detection Rate 0.0001456 4.853e-05 0.0001699 4.853e-05 0.0000000
## Detection Prevalence 0.0014803 1.116e-03 0.0008736 8.979e-04 0.0002427
## Balanced Accuracy 0.5067335 5.052e-01 0.5361056 5.308e-01 0.4998786
## Class: 100
## Sensitivity 1.429e-01
## Specificity 9.981e-01
## Pos Pred Value 1.282e-02
## Neg Pred Value 9.999e-01
## Prevalence 1.699e-04
## Detection Rate 2.427e-05
## Detection Prevalence 1.893e-03
## Balanced Accuracy 5.705e-01
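As with the ramen models, a regression metric reads more directly than the rounded confusion matrix (a sketch; na.rm guards against rows with missing price):
y_hat <- predict(lin_model, newdata = test)
sqrt(mean((y_hat - test$points)^2, na.rm = TRUE))  # RMSE, in points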
Here I create my training and test datasets for the wine random forest
wineDf_cat <- wineDf %>% filter(!is.na(points)) %>% select(c("points", "country","price","province","region_1","region_2","variety","winery")) %>% na.omit()
set.seed(222)
#wineDf_cat <- wineDf_cat[1:50000,]
ind <- sample(2, nrow(wineDf_cat), replace = TRUE, prob = c(0.7, 0.3))
train <- wineDf_cat[ind==1,]
test <- wineDf_cat[ind==2,]
Here I build my wine random forest model
rf <- randomForest::randomForest(points~., data=train, ntree = 10,
keep.forest=TRUE, importance=TRUE)
print(rf)
##
## Call:
## randomForest(formula = points ~ ., data = train, ntree = 10, keep.forest = TRUE, importance = TRUE)
## Type of random forest: regression
## Number of trees: 10
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 4.997418
## % Var explained: 51.95
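With only 10 trees the out-of-bag estimate is noisy; plotting the error against the number of trees shows whether growing more would help:
plot(rf)  # OOB mean squared error as a function of the number of trees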
Here I test the accuracy of my random forest model
library(plyr)
y_predL = predict(rf, newdata = test, predict.all=TRUE)
temp <- as.data.frame(round_any(as.numeric(unlist(y_predL[1])), 1 , f = ceiling))
names(temp) <- c('prediction')
temp <- temp %>% mutate(prediction = as.factor(ifelse(prediction > 100, 100, prediction)))
caret::confusionMatrix(temp$prediction, as.factor(test[,1]))
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(temp$prediction, as.factor(test[, 1])):
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 80 81 82 83 84 85 86 87 88 89 90 91 92
## 80 0 0 0 0 0 0 0 0 0 0 0 0 0
## 81 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82 0 0 0 0 0 0 0 0 0 0 0 0 0
## 83 1 1 6 3 6 0 2 0 0 1 0 0 0
## 84 20 21 70 107 99 86 59 54 18 11 4 6 0
## 85 65 109 242 383 562 504 440 359 200 88 68 23 12
## 86 68 122 295 508 851 934 1098 1044 653 303 294 100 52
## 87 39 91 266 386 761 854 1105 1529 1070 619 626 275 161
## 88 25 48 166 212 400 496 789 1302 1199 732 827 434 303
## 89 15 33 85 113 214 311 464 719 873 762 988 632 548
## 90 3 10 27 50 114 168 279 456 521 587 780 656 628
## 91 2 5 11 12 38 52 100 167 268 256 445 455 466
## 92 0 1 3 5 7 13 18 38 57 82 144 151 205
## 93 0 1 0 0 3 5 7 15 12 20 28 37 79
## 94 0 0 0 2 0 2 0 2 7 8 9 9 17
## 95 0 0 0 0 0 0 0 0 0 1 2 0 4
## 96 0 0 0 0 0 0 0 0 0 0 0 0 0
## 97 0 0 0 0 0 0 0 0 0 0 0 0 1
## 98 0 0 0 0 0 0 0 0 0 0 0 0 0
## 99 0 0 0 0 0 0 0 0 0 0 0 0 0
## 100 0 0 0 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction 93 94 95 96 97 98 99 100
## 80 0 0 0 0 0 0 0 0
## 81 0 0 0 0 0 0 0 0
## 82 0 0 0 0 0 0 0 0
## 83 0 0 0 0 0 0 0 0
## 84 0 0 0 0 0 0 0 0
## 85 2 3 0 0 0 0 0 0
## 86 8 1 3 1 0 0 0 0
## 87 51 16 2 0 0 0 0 0
## 88 119 55 6 2 0 0 0 0
## 89 250 124 52 9 1 3 1 0
## 90 381 172 56 19 5 0 0 0
## 91 376 252 111 46 19 2 1 0
## 92 201 162 83 32 21 9 1 2
## 93 86 78 47 29 16 4 5 1
## 94 25 38 26 21 16 4 2 2
## 95 15 19 22 10 7 5 2 2
## 96 2 2 2 3 7 0 1 1
## 97 2 0 1 0 0 3 0 0
## 98 0 0 0 0 0 0 0 0
## 99 0 0 0 0 0 0 0 0
## 100 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.1649
## 95% CI : (0.1613, 0.1685)
## No Information Rate : 0.1382
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.0692
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 80 Class: 81 Class: 82 Class: 83 Class: 84
## Sensitivity 0.000000 0.00000 0.00000 1.684e-03 0.032406
## Specificity 1.000000 1.00000 1.00000 9.996e-01 0.988027
## Pos Pred Value NaN NaN NaN 1.500e-01 0.178378
## Neg Pred Value 0.994215 0.98926 0.97154 9.568e-01 0.927167
## Prevalence 0.005785 0.01074 0.02846 4.329e-02 0.074257
## Detection Rate 0.000000 0.00000 0.00000 7.292e-05 0.002406
## Detection Prevalence 0.000000 0.00000 0.00000 4.861e-04 0.013490
## Balanced Accuracy 0.500000 0.50000 0.50000 5.006e-01 0.510216
## Class: 85 Class: 86 Class: 87 Class: 88 Class: 89
## Sensitivity 0.14715 0.25178 0.26895 0.24580 0.21960
## Specificity 0.93223 0.85761 0.82169 0.83686 0.85572
## Pos Pred Value 0.16471 0.17332 0.19475 0.16852 0.12296
## Neg Pred Value 0.92330 0.90625 0.87516 0.89188 0.92250
## Prevalence 0.08325 0.10600 0.13818 0.11857 0.08434
## Detection Rate 0.01225 0.02669 0.03716 0.02914 0.01852
## Detection Prevalence 0.07438 0.15398 0.19083 0.17294 0.15063
## Balanced Accuracy 0.53969 0.55469 0.54532 0.54133 0.53766
## Class: 90 Class: 91 Class: 92 Class: 93 Class: 94
## Sensitivity 0.18505 0.16379 0.082795 0.05665 0.0412148
## Specificity 0.88810 0.93147 0.973361 0.99023 0.9962207
## Pos Pred Value 0.15879 0.14754 0.165992 0.18182 0.2000000
## Neg Pred Value 0.90519 0.93896 0.943091 0.96479 0.9784132
## Prevalence 0.10245 0.06752 0.060183 0.03690 0.0224107
## Detection Rate 0.01896 0.01106 0.004983 0.00209 0.0009237
## Detection Prevalence 0.11939 0.07496 0.030019 0.01150 0.0046183
## Balanced Accuracy 0.53658 0.54763 0.528078 0.52344 0.5187177
## Class: 95 Class: 96 Class: 97 Class: 98 Class: 99
## Sensitivity 0.0535280 1.744e-02 0.0000000 0.0000000 0.000000
## Specificity 0.9983550 9.996e-01 0.9998295 1.0000000 1.000000
## Pos Pred Value 0.2471910 1.667e-01 0.0000000 NaN NaN
## Neg Pred Value 0.9905242 9.959e-01 0.9977634 0.9992708 0.999684
## Prevalence 0.0099900 4.181e-03 0.0022362 0.0007292 0.000316
## Detection Rate 0.0005347 7.292e-05 0.0000000 0.0000000 0.000000
## Detection Prevalence 0.0021633 4.375e-04 0.0001701 0.0000000 0.000000
## Balanced Accuracy 0.5259415 5.085e-01 0.4999147 0.5000000 0.500000
## Class: 100
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9998055
## Prevalence 0.0001945
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
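On a 100-point scale, accuracy within one point may be a fairer read than exact agreement (a sketch):
y_hat <- predict(rf, newdata = test)
mean(abs(round(y_hat) - test$points) <= 1)  # share of predictions within 1 point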