choco<- read.csv(
  "~/110_2/（學）計量/final presentation/chocolate.csv")
n<- nrow(choco)
data<- choco
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"

kable(data[1:20,], "html", caption = "Table 1. Data outlook") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), position = "center") %>%
  scroll_box(width = "850px", height = "300px")

Table 1. Data outlook
ref	company	company_location	review_date	country_of_bean_origin	specific_bean_origin_or_bar_name	cocoa_percent	rating	counts_of_ingredients	beans	cocoa_butter	vanilla	lecithin	salt	sugar	sweetener	first_taste	second_taste	third_taste	fourth_taste
2454	5150	U.S.A	2019	Madagascar	Bejofo Estate, batch 1	76	3.75	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	blackberry	full body
2458	5150	U.S.A	2019	Dominican republic	Zorzal, batch 1	76	3.50	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	vegetal	savory
2454	5150	U.S.A	2019	Tanzania	Kokoa Kamili, batch 1	76	3.25	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	rich cocoa	fatty	bready
797	A. Morin	France	2012	Peru	Peru	63	3.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	fruity	melon	roasty
797	A. Morin	France	2012	Bolivia	Bolivia	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	vegetal	nutty
1015	A. Morin	France	2013	Venezuela	Chuao	70	4.00	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	oily	nut	caramel	raspberry
1019	A. Morin	France	2013	Peru	Chanchamayo Province	63	4.00	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sweet	cocoa	tangerine
1011	A. Morin	France	2013	Ecuador	Equateur	70	3.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sandy	nutty	cocoa	fig
1019	A. Morin	France	2013	Peru	Chanchamayo Province	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	sour	intense tangerine
1011	A. Morin	France	2013	Brazil	Brazil	70	3.25	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	mild tobacco
1015	A. Morin	France	2013	Papua new guinea	Papua New Guinea	70	3.25	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	mild fruit	strong smoke
1019	A. Morin	France	2013	Peru	Piura	70	3.25	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	green	nutty	cocoa
1011	A. Morin	France	2013	Madagascar	Madagascar, Criollo	70	3.00	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sticky	red fruit	sour
1015	A. Morin	France	2013	Burma	Birmanie	70	3.00	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sticky	smokey	grass
1011	A. Morin	France	2013	Panama	Panama	70	2.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	brief fruit note	earthy	nutty
1015	A. Morin	France	2013	Colombia	Colombie	70	2.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	burnt rubber	alkalyzed notes
1319	A. Morin	France	2014	Peru	Pablino	70	4.00	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	delicate	hazelnut	brownie
1319	A. Morin	France	2014	Venezuela	Puerto Cabello, Criollo	70	3.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	astringent	nutty	chocolatey
1315	A. Morin	France	2014	Cuba	Cuba	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sliglty dry	papaya
1315	A. Morin	France	2014	Venezuela	Sur del Lago, Criollo	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	nutty	mild choco	roasty

Plots

There are six material variables included in this process, all of which are dummies with 0 or 1, 1 indicates the existence of this material in the production process while 0 indcates the opposite state.

As seen from the figure, none of the factors are equally distributed, some of them even differ in proportions by a large margin, such as salt and sugar. In the following process, we might want to take this information into account.

Correlation plots

To avoid multicollinearity, correlation matrix is presented graphically to help us examine if there is any unusual relationship among variables. As seen below, Sugar appears to be negatively correlated with Sweetener with strong bond, indicating an explainable relationship that both are complements to each other. To avoid multicollinearity, we’ve got to remove one of them, sugar is retained for its common usage in snacks.

After sweetener is eliminated from the candidated variables, multicollinear effect does not seem to take place in the remaining variables, giving us a green light to proceed with further process.

target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

par(mfrow = c(1, 2))
fig_1<- corrplot::corrplot.mixed(cor(data[variable]),
                         lower = "number", upper = "circle")
fig_2<- corrplot::corrplot.mixed(cor(data[variables]),
                         lower = "number", upper = "circle")

To simplify the model fitting process, rating, the original response variable of interest with distribution provided as the table output below is reorganized into four intervals as follows:

# categorize response into 4 subs
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 2. Original rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("Group 1" = 6, "Group 2" = 2,
                     "Group 3" = 2, "Group 4" = 2)) %>%
  add_header_above(c("rating" = 12)) %>%
  kable_minimal()

Table 2. Original rating outlook
rating
Group 1						Group 2		Group 3		Group 4
1	1.5	1.75	2	2.25	2.5	2.75	3	3.25	3.5	3.75	4
1	5	1	29	14	150	304	471	394	489	265	101

data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >3.5, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 3. Reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 4)) %>%
  kable_minimal()

Table 3. Reorganized rating outlook
rating
1	2	3	4
200	775	883	366

Model Fitting - Multicategory Logit Model

Data split

Data is divided into 80/20 train/test split, with equal proportions of four response categories for both subsets to avoid unbalanced effect.

# Factorized columns
# data$rating <- factor(data$rating,
#                       levels = c("1", "2", "3", "4"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"))
}) %>% as.data.frame()
# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
df<- rbind(train_df, test_df) %>% round(digits = 3) %>% as.data.frame()
rownames(df)<- c("train", "test")
kable(df, "html", caption = "Table 4. Proportions of four categories within each of train/test split") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c(" ", "rating" = 4)) %>%
  kable_minimal()

Table 4. Proportions of four categories within each of train/test split
	rating
	1	2	3	4
train	0.09	0.348	0.397	0.165
test	0.09	0.349	0.396	0.164

Initial fit

\[P(Y\leq j) = \pi_1 + \pi_2 + ... + \pi_j \ ,\ j = 1,2,...,J\]

\[logit[P(Y\leq j)] = \log[{P(Y\leq j) \over 1-P(Y\leq j)}]\ ,\ j = 1,2,...,J-1\]

\[logit[P(Y\leq j)] = \alpha_j + \beta x\]

Cumulative logit model, also known as proportional odds model, is the go-to option when there is a natural ordering in the dependent variable, which is a ordinal response with more-than-one categories.

\[logit[\hat{P}(Rating\leq 2.5)] = \hat{\alpha_1} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = \hat{\alpha_2} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = \hat{\alpha_3} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times lecithin + \hat{\beta_5} \times salt + \hat{\beta_6} \times sugar\]

# cumulative logit model
f <- paste(target, "~", paste(variables, collapse=" + "))
cumu.logit=vglm(f,family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit, digits = 3)

## 
## Call:
## vglm(formula = f, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -4.177797   0.508384  -8.218  < 2e-16 ***
## (Intercept):2 -2.042951   0.500669  -4.080 4.49e-05 ***
## (Intercept):3 -0.125167   0.500023  -0.250  0.80234    
## cocoa_percent  0.031184   0.006153   5.068 4.02e-07 ***
## cocoa_butter1 -0.203703   0.073550  -2.770  0.00561 ** 
## vanilla1       0.852514   0.096128   8.869  < 2e-16 ***
## lecithin1      0.227091   0.085531   2.655  0.00793 ** 
## salt1          0.548428   0.270291   2.029  0.04246 *  
## sugar1        -0.501570   0.189077  -2.653  0.00798 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4367.527 on 5331 degrees of freedom
## 
## Log-likelihood: -2183.763 on 5331 degrees of freedom
## 
## Number of Fisher scoring iterations: 13 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
## cocoa_percent cocoa_butter1      vanilla1     lecithin1         salt1 
##     1.0316749     0.8157043     2.3455368     1.2549435     1.7305313 
##        sugar1 
##     0.6055791

Second-Round fit

The estimated version of the models are presented as the following :

\[logit[\hat{P}(Rating\leq 2.5)] = \hat{\alpha_1} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = \hat{\alpha_2} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = \hat{\alpha_3} + \hat{\beta_1} \times cocoa\ percent + \hat{\beta_2} \times cocoa\ butter + \hat{\beta_3} \times vanilla + \hat{\beta_4} \times sugar\]

\[logit[\hat{P}(Rating\leq 2.5)] = -3.993 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

\[logit[\hat{P}(Rating\leq 3)] = -1.863 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

\[logit[\hat{P}(Rating\leq 3.5)] = 0.05 + 0.0284 \times cocoa\ percent + 0.142 \times cocoa\ butter + 0.923 \times vanilla - 0.621 \times sugar\]

# cumulative logit model
variables2<- variables[-c(4,5)]
f2 <- paste(target, "~", paste(variables2, collapse=" + "))
cumu.logit2=vglm(f2, family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit2, digits = 3)

## 
## Call:
## vglm(formula = f2, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1 -3.851430   0.493486  -7.805 5.97e-15 ***
## (Intercept):2 -1.721491   0.486258  -3.540 0.000400 ***
## (Intercept):3  0.191847   0.485983   0.395 0.693019    
## cocoa_percent  0.028365   0.006096   4.653 3.27e-06 ***
## cocoa_butter1 -0.141684   0.070632  -2.006 0.044863 *  
## vanilla1       0.923072   0.092287  10.002  < 2e-16 ***
## sugar1        -0.621057   0.172767  -3.595 0.000325 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4373.066 on 5333 degrees of freedom
## 
## Log-likelihood: -2186.533 on 5333 degrees of freedom
## 
## Number of Fisher scoring iterations: 13 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
## cocoa_percent cocoa_butter1      vanilla1        sugar1 
##     1.0287711     0.8678959     2.5170114     0.5373759

Nomogram

org[, variables[-1]]<- lapply(org[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"))
}) %>% as.data.frame()
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- n_train$rating %>% as.factor()# %>% fct_rev()

ddist<- datadist(rms_train)
options(datadist = "ddist")
fit.blogit.rms<- rms::orm(as.formula("rating ~ cocoa_percent + cocoa_butter + vanilla + sugar"), data=rms_train)
coef<- fit.blogit.rms$coefficients
ff<- Newlabels(fit.blogit.rms, c(vanilla = "vanilla"))
fun2<- function(x) {
  plogis(x- coef[1] + coef[2])
}
fun3<- function(x) {
  plogis(x- coef[1] + coef[3])
}
nomo<- nomogram(ff,
                fun = list("Prob Y>=2" = plogis,
                           "Prob Y>=3" = fun2,
                           "Prob Y>=4" = fun3),
                           lp = F,
                           fun.at = c(.01, .05, seq(.1, .9, by = .1), .95, .99))
plot(nomo, lmgp = .2, cex.axis = .6)

Model Explanation

Main Effect plot

# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train, Hess = TRUE)

plot(predictorEffects(MASS),lines=list(multiline=TRUE),
     axes=list(grid=TRUE))

Interaction plot (with cocoa_butter)

plot(Effect(focal.predictors = c("cocoa_butter", "cocoa_percent"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "vanilla"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "sugar"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

Chocolate Bar Data Visualization

Oscar

2022/06/01