Data

choco<- read.csv(
  "C:\\Users\\user\\Desktop\\110_1\\Statistics\\Data Analysis\\FULL_DATA\\chocolate.csv")
n<- nrow(choco)
data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"

kable(data[1:10, ], "html", caption = "Table 1. Data outlook") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), position = "center") %>%
    scroll_box(width = "850px", height = "300px")

Table 1. Data outlook
ref	company	company_location	review_date	country_of_bean_origin	specific_bean_origin_or_bar_name	cocoa_percent	rating	counts_of_ingredients	beans	cocoa_butter	vanilla	lecithin	salt	sugar	sweetener	first_taste	second_taste	third_taste	fourth_taste
2454	5150	U.S.A	2019	Madagascar	Bejofo Estate, batch 1	76	3.75	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	blackberry	full body
2458	5150	U.S.A	2019	Dominican republic	Zorzal, batch 1	76	3.50	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	vegetal	savory
2454	5150	U.S.A	2019	Tanzania	Kokoa Kamili, batch 1	76	3.25	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	rich cocoa	fatty	bready
797	A. Morin	France	2012	Peru	Peru	63	3.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	fruity	melon	roasty
797	A. Morin	France	2012	Bolivia	Bolivia	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	vegetal	nutty
1015	A. Morin	France	2013	Venezuela	Chuao	70	4.00	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	oily	nut	caramel	raspberry
1019	A. Morin	France	2013	Peru	Chanchamayo Province	63	4.00	3	have_bean	have_cocoa_butter	have_not_vanila	have_not_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sweet	cocoa	tangerine
1011	A. Morin	France	2013	Ecuador	Equateur	70	3.75	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	sandy	nutty	cocoa	fig
1019	A. Morin	France	2013	Peru	Chanchamayo Province	70	3.50	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	cocoa	sour	intense tangerine
1011	A. Morin	France	2013	Brazil	Brazil	70	3.25	4	have_bean	have_cocoa_butter	have_not_vanila	have_lecithin	have_not_salt	have_sugar	have_not_sweetener_without_sugar	mild tobacco

Reorganizing response

levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data
# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return)
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

# categorize response into 4 subs
rating<- data$rating

df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 2. Original rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 12)) %>%
  kable_minimal()

Table 2. Original rating outlook
rating
1	1.5	1.75	2	2.25	2.5	2.75	3	3.25	3.5	3.75	4
1	5	1	29	14	150	304	471	394	489	265	101

data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >3.5, 4, data$rating)
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 3. Reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 4)) %>%
  kable_minimal()

Table 3. Reorganized rating outlook
rating
1	2	3	4
200	775	883	366

Train/Test split

# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()
# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
df<- rbind(train_df, test_df) %>% round(digits = 3) %>% as.data.frame()
rownames(df)<- c("train", "test")
kable(df, "html", caption = "Table 4. Proportions of four categories within each of train/test split") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c(" ", "rating" = 4)) %>%
  kable_minimal()

Table 4. Proportions of four categories within each of train/test split
	rating
	1	2	3	4
train	0.09	0.348	0.397	0.165
test	0.09	0.349	0.396	0.164

Ordinal Logistic Regression report

# multicategory logit model
variables2<- variables[-c(4:5)]
f2 <- paste(target, "~", paste(variables2, collapse=" + "))
cumu.logit2=vglm(f2, family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit2, digits = 3)

## 
## Call:
## vglm(formula = f2, family = cumulative(parallel = TRUE), data = train)
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1  -3.569376   0.614855  -5.805 6.43e-09 ***
## (Intercept):2  -1.438861   0.608652  -2.364 0.018078 *  
## (Intercept):3   0.471604   0.608213   0.775 0.438107    
## cocoa_percent   0.026661   0.008278   3.221 0.001279 ** 
## cocoa_butter.L  0.060141   0.069305   0.868 0.385519    
## vanilla.L       0.637851   0.090575   7.042 1.89e-12 ***
## sugar.L        -0.577289   0.164974  -3.499 0.000467 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]), 
## logitlink(P[Y<=3])
## 
## Residual deviance: 4372.745 on 5333 degrees of freedom
## 
## Log-likelihood: -2186.373 on 5333 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##  cocoa_percent cocoa_butter.L      vanilla.L        sugar.L 
##      1.0270191      1.0619863      1.8924105      0.5614184

Model Interpretation

## Nomogram
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()

ddist<- datadist(rms_train)
options(datadist = "ddist")
fit.blogit.rms<- rms::orm(as.formula("rating ~ cocoa_percent + cocoa_butter + vanilla + sugar"), data=rms_train)
coef<- fit.blogit.rms$coefficients
ff<- Newlabels(fit.blogit.rms, c(vanilla = "vanilla"))
fun2<- function(x) {
  plogis(x- coef[1] + coef[2])
}
fun3<- function(x) {
  plogis(x- coef[1] + coef[3])
}
nomo<- nomogram(ff,
                fun = list("Prob Y>=2" = plogis,
                           "Prob Y>=3" = fun2,
                           "Prob Y>=4" = fun3),
                           lp = F,
                           fun.at = c(.01, .05, seq(.1, .9, by = .1), .95, .99))
plot(nomo, lmgp = .2, cex.axis = .6) # slight difference

Confusion Table

rms_train<- train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- train$rating %>% as.factor() %>% fct_rev()
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data=rms_train, Hess = TRUE)
# summary(MASS)
pred<- predict(MASS, newdata=test)%>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)

##     original
## pred   1   2   3   4
##    1   0   0   0   0
##    2  19  47  20  11
##    3  21 108 156  62
##    4   0   0   0   0

Main Effect plot

plot(predictorEffects(MASS),lines=list(multiline=TRUE),
     axes=list(grid=TRUE))

Interaction plot (with cocoa_butter)

plot(Effect(focal.predictors = c("cocoa_butter", "cocoa_percent"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "vanilla"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "sugar"),MASS),
     lines=list(multiline=TRUE), axes=list(grid=TRUE))

Different organizing scheme

1st

(0, 2,5]
(2.5, 2.75]
(2.75, 3]
(3, 3.25]
(3.25, 4]

小怪怪的

data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data

# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

table(data$rating)

## 
##    1  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 
##    1    5    1   29   14  150  304  471  394  489  265  101

# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=2.75 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 3, data$rating)
data$rating <- ifelse(data$rating >3.25, 5, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating >3, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 5)) %>%
  kable_minimal()

Table 5. Another reorganized rating outlook
rating
1	2	3	4	5
200	304	471	394	855

# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()

# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))

n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)

##     original
## pred   1   2   3   4   5
##    1   0   1   1   0   0
##    2   1   1   3   1   2
##    3   9   6  10   3  20
##    4   0   0   0   0   0
##    5  30  52  80  74 149

2nd

(0, 2,75]
(2.75, 3]
(3, 3.25]
(3.25, 3.5]
(3.5, 4]

超硬要，而且結果超不均衡

data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data

# Convert logical to integer
fun<- function(x){
  return<- x %>% str_detect("not")
  as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]

table(data$rating)

## 
##    1  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 
##    1    5    1   29   14  150  304  471  394  489  265  101

# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.75, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 2, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >=3.75, 5, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating >3.25, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
                position = "center") %>%
  add_header_above(c("rating" = 5)) %>%
  kable_minimal()

Table 5. Another reorganized rating outlook
rating
1	2	3	4	5
504	471	394	489	366

# Factorized columns
data$rating <- factor(data$rating,
                      levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
  factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()

# train/test split
train_int<- 
  createDataPartition(data$rating, p = 0.8) %>% 
  unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))

n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)

##     original
## pred  1  2  3  4  5
##    1 49 39 28 25 13
##    2  0  0  0  0  0
##    3  0  0  0  0  0
##    4 51 55 50 72 59
##    5  0  0  0  0  1

Chocolate Bar Data Visualization

Oscar

2021/11/6

Data

Reorganizing response

Train/Test split

Ordinal Logistic Regression report

Model Interpretation

Confusion Table

Main Effect plot

Interaction plot (with cocoa_butter)

Different organizing scheme

1st

2nd