choco<- read.csv(
  "~/110_2/(學)計量/final presentation/chocolate.csv")
n<- nrow(choco)
data<- choco
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"

kable(data[1:20,], "html", caption = "Table 1. Data outlook") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), position = "center") %>%
  scroll_box(width = "850px", height = "300px")
Table 1. Data outlook
ref company company_location review_date country_of_bean_origin specific_bean_origin_or_bar_name cocoa_percent rating counts_of_ingredients beans cocoa_butter vanilla lecithin salt sugar sweetener first_taste second_taste third_taste fourth_taste
2454 5150 U.S.A 2019 Madagascar Bejofo Estate, batch 1 76 3.75 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa blackberry full body
2458 5150 U.S.A 2019 Dominican republic Zorzal, batch 1 76 3.50 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa vegetal savory
2454 5150 U.S.A 2019 Tanzania Kokoa Kamili, batch 1 76 3.25 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar rich cocoa fatty bready
797 A. Morin France 2012 Peru Peru 63 3.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar fruity melon roasty
797 A. Morin France 2012 Bolivia Bolivia 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar vegetal nutty
1015 A. Morin France 2013 Venezuela Chuao 70 4.00 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar oily nut caramel raspberry
1019 A. Morin France 2013 Peru Chanchamayo Province 63 4.00 3 have_bean have_cocoa_butter have_not_vanila have_not_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sweet cocoa tangerine
1011 A. Morin France 2013 Ecuador Equateur 70 3.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sandy nutty cocoa fig
1019 A. Morin France 2013 Peru Chanchamayo Province 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar cocoa sour intense tangerine
1011 A. Morin France 2013 Brazil Brazil 70 3.25 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar mild tobacco
1015 A. Morin France 2013 Papua new guinea Papua New Guinea 70 3.25 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar mild fruit strong smoke
1019 A. Morin France 2013 Peru Piura 70 3.25 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar green nutty cocoa
1011 A. Morin France 2013 Madagascar Madagascar, Criollo 70 3.00 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sticky red fruit sour
1015 A. Morin France 2013 Burma Birmanie 70 3.00 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sticky smokey grass
1011 A. Morin France 2013 Panama Panama 70 2.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar brief fruit note earthy nutty
1015 A. Morin France 2013 Colombia Colombie 70 2.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar burnt rubber alkalyzed notes
1319 A. Morin France 2014 Peru Pablino 70 4.00 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar delicate hazelnut brownie
1319 A. Morin France 2014 Venezuela Puerto Cabello, Criollo 70 3.75 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar astringent nutty chocolatey
1315 A. Morin France 2014 Cuba Cuba 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar sliglty dry papaya
1315 A. Morin France 2014 Venezuela Sur del Lago, Criollo 70 3.50 4 have_bean have_cocoa_butter have_not_vanila have_lecithin have_not_salt have_sugar have_not_sweetener_without_sugar nutty mild choco roasty

Plots

There are six material variables included in this process, all of which are dummies with 0 or 1, 1 indicates the existence of this material in the production process while 0 indcates the opposite state.

As seen from the figure, none of the factors are equally distributed, some of them even differ in proportions by a large margin, such as salt and sugar. In the following process, we might want to take this information into account.

Correlation plots

To avoid multicollinearity, correlation matrix is presented graphically to help us examine if there is any unusual relationship among variables. As seen below, Sugar appears to be negatively correlated with Sweetener with strong bond, indicating an explainable relationship that both are complements to each other. To avoid multicollinearity, we’ve got to remove one of them, sugar is retained for its common usage in snacks.

After sweetener is eliminated from the candidated variables, multicollinear effect does not seem to take place in the remaining variables, giving us a green light to proceed with further process.

target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

par(mfrow = c(1, 2))
fig_1<- corrplot::corrplot.mixed(cor(data[variable]),
                         lower = "number", upper = "circle")
fig_2<- corrplot::corrplot.mixed(cor(data[variables]),
                         lower = "number", upper = "circle")

To simplify the model fitting process, rating, the original response variable of interest with distribution provided as the table output below is reorganized into four intervals as follows:

# categorize response into 4 subs
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 2. Original rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("Group 1" = 6, "Group 2" = 2,
                     "Group 3" = 2, "Group 4" = 2)) %>%
  add_header_above(c("rating" = 12)) %>%
  kable_minimal()
Table 2. Original rating outlook
rating
Group 1
Group 2
Group 3
Group 4
1 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4
1 5 1 29 14 150 304 471 394 489 265 101
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >3.5, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 3. Reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 4)) %>%
  kable_minimal()
Table 3. Reorganized rating outlook
rating
1 2 3 4
200 775 883 366

Model Fitting - Multicategory Logit Model

Data split

Data is divided into 80/20 train/test split, with equal proportions of four response categories for both subsets to avoid unbalanced effect.

# Factorized columns
# data$rating <- factor(data$rating,
#                       levels = c("1", "2", "3", "4"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"))
}) %>% as.data.frame()
# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
df<- rbind(train_df, test_df) %>% round(digits = 3) %>% as.data.frame()
rownames(df)<- c("train", "test")
kable(df, "html", caption = "Table 4. Proportions of four categories within each of train/test split") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c(" ", "rating" = 4)) %>%
  kable_minimal()
Table 4. Proportions of four categories within each of train/test split
rating
1 2 3 4
train 0.09 0.348 0.397 0.165
test 0.09 0.349 0.396 0.164

Initial fit

\[P(Y\leq j) = \pi_1 + \pi_2 + ... + \pi_j \ ,\ j = 1,2,...,J\]

\[logit[P(Y\leq j)] = \log[{P(Y\leq j) \over 1-P(Y\leq j)}]\ ,\ j = 1,2,...,J-1\]

\[logit[P(Y\leq j)] = \alpha_j + \beta x\]

Cumulative logit model, also known as proportional odds model, is the go-to option when there is a natural ordering in the dependent variable, which is a ordinal response with more-than-one categories.

\[logit[\hat{P}(Rating\leq 2.5)] = \hat{\alpha_1} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = \hat{\alpha_2} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = \hat{\alpha_3} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

# cumulative logit model
f <- paste(target, "~", paste(variables, collapse=" + "))
cumu.logit=vglm(f,family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit, digits = 3)
## 
## Call:
## vglm(formula = f, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -4.177797   0.508384  -8.218  < 2e-16 ***
## (Intercept):2 -2.042951   0.500669  -4.080 4.49e-05 ***
## (Intercept):3 -0.125167   0.500023  -0.250  0.80234    
## cocoa_percent  0.031184   0.006153   5.068 4.02e-07 ***
## cocoa_butter1 -0.203703   0.073550  -2.770  0.00561 ** 
## vanilla1       0.852514   0.096128   8.869  < 2e-16 ***
## lecithin1      0.227091   0.085531   2.655  0.00793 ** 
## salt1          0.548428   0.270291   2.029  0.04246 *  
## sugar1        -0.501570   0.189077  -2.653  0.00798 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4367.527 on 5331 degrees of freedom
## 
## Log-likelihood: -2183.763 on 5331 degrees of freedom
## 
## Number of Fisher scoring iterations: 13 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
## cocoa_percent cocoa_butter1      vanilla1     lecithin1         salt1 
##     1.0316749     0.8157043     2.3455368     1.2549435     1.7305313 
##        sugar1 
##     0.6055791

Second-Round fit

The estimated version of the models are presented as the following :

\[logit[\hat{P}(Rating\leq 2.5)] = \hat{\alpha_1} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = \hat{\alpha_2} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = \hat{\alpha_3} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 2.5)] = -3.993 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = -1.863 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = 0.05 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

# cumulative logit model
variables2<- variables[-c(4,5)]
f2 <- paste(target, "~", paste(variables2, collapse=" + "))
cumu.logit2=vglm(f2, family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit2, digits = 3)
## 
## Call:
## vglm(formula = f2, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -3.851430   0.493486  -7.805 5.97e-15 ***
## (Intercept):2 -1.721491   0.486258  -3.540 0.000400 ***
## (Intercept):3  0.191847   0.485983   0.395 0.693019    
## cocoa_percent  0.028365   0.006096   4.653 3.27e-06 ***
## cocoa_butter1 -0.141684   0.070632  -2.006 0.044863 *  
## vanilla1       0.923072   0.092287  10.002  < 2e-16 ***
## sugar1        -0.621057   0.172767  -3.595 0.000325 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4373.066 on 5333 degrees of freedom
## 
## Log-likelihood: -2186.533 on 5333 degrees of freedom
## 
## Number of Fisher scoring iterations: 13 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
## cocoa_percent cocoa_butter1      vanilla1        sugar1 
##     1.0287711     0.8678959     2.5170114     0.5373759

Nomogram

org[, variables[-1]]<- lapply(org[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"))
}) %>% as.data.frame()
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- n_train$rating %>% as.factor()# %>% fct_rev()

ddist<- datadist(rms_train)
options(datadist = "ddist")
fit.blogit.rms<- rms::orm(as.formula("rating ~ cocoa_percent + cocoa_butter + vanilla + sugar"), data=rms_train)
coef<- fit.blogit.rms$coefficients
ff<- Newlabels(fit.blogit.rms, c(vanilla = "vanilla"))
fun2<- function(x) {
  plogis(x- coef[1] + coef[2])
}
fun3<- function(x) {
  plogis(x- coef[1] + coef[3])
}
nomo<- nomogram(ff,
                fun = list("Prob Y>=2" = plogis,
                           "Prob Y>=3" = fun2,
                           "Prob Y>=4" = fun3),
                           lp = F,
                           fun.at = c(.01, .05, seq(.1, .9, by = .1), .95, .99))
plot(nomo, lmgp = .2, cex.axis = .6)

Model Explanation

Main Effect plot

# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train, Hess = TRUE)

plot(predictorEffects(MASS),lines=list(multiline=TRUE),
     axes=list(grid=TRUE))

Interaction plot (with cocoa_butter)

plot(Effect(focal.predictors = c("cocoa_butter", "cocoa_percent"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "vanilla"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "sugar"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))