Data

choco<- read.csv(
  "C:\\Users\\user\\Desktop\\110_1\\Statistics\\Data Analysis\\FULL_DATA\\chocolate.csv")
n<- nrow(choco)
data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"

kable(data[1:10, ], "html", caption = "Table 1. Data outlook") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), position = "center") %>%
    scroll_box(width = "850px", height = "300px")
Table 1. Data outlook
ref company company_location review_date country_of_bean_origin specific_bean_origin_or_bar_name cocoa_percent rating counts_of_ingredients beans cocoa_butter vanilla lecithin salt sugar sweetener first_taste second_taste third_taste fourth_taste
2454 5150 U.S.A 2019 Madagascar Bejofo Estate, batch 1 76 3.75 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa blackberry full body
2458 5150 U.S.A 2019 Dominican republic Zorzal, batch 1 76 3.50 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa vegetal savory
2454 5150 U.S.A 2019 Tanzania Kokoa Kamili, batch 1 76 3.25 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar rich cocoa fatty bready
797 A. Morin France 2012 Peru Peru 63 3.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar fruity melon roasty
797 A. Morin France 2012 Bolivia Bolivia 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar vegetal nutty
1015 A. Morin France 2013 Venezuela Chuao 70 4.00 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar oily nut caramel raspberry
1019 A. Morin France 2013 Peru Chanchamayo Province 63 4.00 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sweet cocoa tangerine
1011 A. Morin France 2013 Ecuador Equateur 70 3.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sandy nutty cocoa fig
1019 A. Morin France 2013 Peru Chanchamayo Province 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa sour intense tangerine
1011 A. Morin France 2013 Brazil Brazil 70 3.25 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar mild tobacco

Reorganizing response

levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data
# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return)
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

# categorize response into 4 subs
rating<- data$rating

df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 2. Original rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 12)) %>%
  kable_minimal()
Table 2. Original rating outlook
rating
1 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4
1 5 1 29 14 150 304 471 394 489 265 101
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >3.5, 4, data$rating)
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 3. Reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 4)) %>%
  kable_minimal()
Table 3. Reorganized rating outlook
rating
1 2 3 4
200 775 883 366

Train/Test split

# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()
# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
df<- rbind(train_df, test_df) %>% round(digits = 3) %>% as.data.frame()
rownames(df)<- c("train", "test")
kable(df, "html", caption = "Table 4. Proportions of four categories within each of train/test split") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c(" ", "rating" = 4)) %>%
  kable_minimal()
Table 4. Proportions of four categories within each of train/test split
rating
1 2 3 4
train 0.09 0.348 0.397 0.165
test 0.09 0.349 0.396 0.164

Ordinal Logistic Regression report

# multicategory logit model
variables2<- variables[-c(4:5)]
f2 <- paste(target, "~", paste(variables2, collapse=" + "))
cumu.logit2=vglm(f2, family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit2, digits = 3)
## 
## Call:
## vglm(formula = f2, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  -3.569376   0.614855  -5.805 6.43e-09 ***
## (Intercept):2  -1.438861   0.608652  -2.364 0.018078 *  
## (Intercept):3   0.471604   0.608213   0.775 0.438107    
## cocoa_percent   0.026661   0.008278   3.221 0.001279 ** 
## cocoa_butter.L  0.060141   0.069305   0.868 0.385519    
## vanilla.L       0.637851   0.090575   7.042 1.89e-12 ***
## sugar.L        -0.577289   0.164974  -3.499 0.000467 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4372.745 on 5333 degrees of freedom
## 
## Log-likelihood: -2186.373 on 5333 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##  cocoa_percent cocoa_butter.L      vanilla.L        sugar.L 
##      1.0270191      1.0619863      1.8924105      0.5614184

Model Interpretation

## Nomogram
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()

ddist<- datadist(rms_train)
options(datadist = "ddist")
fit.blogit.rms<- rms::orm(as.formula("rating ~ cocoa_percent + cocoa_butter + vanilla + sugar"), data=rms_train)
coef<- fit.blogit.rms$coefficients
ff<- Newlabels(fit.blogit.rms, c(vanilla = "vanilla"))
fun2<- function(x) {
  plogis(x- coef[1] + coef[2])
}
fun3<- function(x) {
  plogis(x- coef[1] + coef[3])
}
nomo<- nomogram(ff,
                fun = list("Prob Y>=2" = plogis,
                           "Prob Y>=3" = fun2,
                           "Prob Y>=4" = fun3),
                           lp = F,
                           fun.at = c(.01, .05, seq(.1, .9, by = .1), .95, .99))
plot(nomo, lmgp = .2, cex.axis = .6) # slight difference

Confusion Table

rms_train<- train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- train$rating %>% as.factor() %>% fct_rev()
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data=rms_train, Hess = TRUE)
# summary(MASS)
pred<- predict(MASS, newdata=test)%>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
##     original
## pred   1   2   3   4
##    1   0   0   0   0
##    2  19  47  20  11
##    3  21 108 156  62
##    4   0   0   0   0

Main Effect plot

plot(predictorEffects(MASS),lines=list(multiline=TRUE),
     axes=list(grid=TRUE))

Interaction plot (with cocoa_butter)

plot(Effect(focal.predictors = c("cocoa_butter", "cocoa_percent"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "vanilla"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "sugar"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

Different organizing scheme

1st

  1. (0, 2,5]
  2. (2.5, 2.75]
  3. (2.75, 3]
  4. (3, 3.25]
  5. (3.25, 4]

小怪怪的

data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data

# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

table(data$rating)
## 
##    1  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 
##    1    5    1   29   14  150  304  471  394  489  265  101
# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=2.75 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 3, data$rating)
data$rating <- ifelse(data$rating >3.25, 5, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating >3, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 5)) %>%
  kable_minimal()
Table 5. Another reorganized rating outlook
rating
1 2 3 4 5
200 304 471 394 855
# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()

# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))

n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
##     original
## pred   1   2   3   4   5
##    1   0   1   1   0   0
##    2   1   1   3   1   2
##    3   9   6  10   3  20
##    4   0   0   0   0   0
##    5  30  52  80  74 149

2nd

  1. (0, 2,75]
  2. (2.75, 3]
  3. (3, 3.25]
  4. (3.25, 3.5]
  5. (3.5, 4]

超硬要,而且結果超不均衡

data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data

# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

table(data$rating)
## 
##    1  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 
##    1    5    1   29   14  150  304  471  394  489  265  101
# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.75, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 2, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >=3.75, 5, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating >3.25, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 5)) %>%
  kable_minimal()
Table 5. Another reorganized rating outlook
rating
1 2 3 4 5
504 471 394 489 366
# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()

# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))

n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
##     original
## pred  1  2  3  4  5
##    1 49 39 28 25 13
##    2  0  0  0  0  0
##    3  0  0  0  0  0
##    4 51 55 50 72 59
##    5  0  0  0  0  1