Data
choco<- read.csv(
"C:\\Users\\user\\Desktop\\110_1\\Statistics\\Data Analysis\\FULL_DATA\\chocolate.csv")
n<- nrow(choco)
data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
kable(data[1:10, ], "html", caption = "Table 1. Data outlook") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), position = "center") %>%
scroll_box(width = "850px", height = "300px")
Table 1. Data outlook
ref
|
company
|
company_location
|
review_date
|
country_of_bean_origin
|
specific_bean_origin_or_bar_name
|
cocoa_percent
|
rating
|
counts_of_ingredients
|
beans
|
cocoa_butter
|
vanilla
|
lecithin
|
salt
|
sugar
|
sweetener
|
first_taste
|
second_taste
|
third_taste
|
fourth_taste
|
2454
|
5150
|
U.S.A
|
2019
|
Madagascar
|
Bejofo Estate, batch 1
|
76
|
3.75
|
3
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_not_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
cocoa
|
blackberry
|
full body
|
|
2458
|
5150
|
U.S.A
|
2019
|
Dominican republic
|
Zorzal, batch 1
|
76
|
3.50
|
3
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_not_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
cocoa
|
vegetal
|
savory
|
|
2454
|
5150
|
U.S.A
|
2019
|
Tanzania
|
Kokoa Kamili, batch 1
|
76
|
3.25
|
3
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_not_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
rich cocoa
|
fatty
|
bready
|
|
797
|
A. Morin
|
France
|
2012
|
Peru
|
Peru
|
63
|
3.75
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
fruity
|
melon
|
roasty
|
|
797
|
A. Morin
|
France
|
2012
|
Bolivia
|
Bolivia
|
70
|
3.50
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
vegetal
|
nutty
|
|
|
1015
|
A. Morin
|
France
|
2013
|
Venezuela
|
Chuao
|
70
|
4.00
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
oily
|
nut
|
caramel
|
raspberry
|
1019
|
A. Morin
|
France
|
2013
|
Peru
|
Chanchamayo Province
|
63
|
4.00
|
3
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_not_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
sweet
|
cocoa
|
tangerine
|
|
1011
|
A. Morin
|
France
|
2013
|
Ecuador
|
Equateur
|
70
|
3.75
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
sandy
|
nutty
|
cocoa
|
fig
|
1019
|
A. Morin
|
France
|
2013
|
Peru
|
Chanchamayo Province
|
70
|
3.50
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
cocoa
|
sour
|
intense tangerine
|
|
1011
|
A. Morin
|
France
|
2013
|
Brazil
|
Brazil
|
70
|
3.25
|
4
|
have_bean
|
have_cocoa_butter
|
have_not_vanila
|
have_lecithin
|
have_not_salt
|
have_sugar
|
have_not_sweetener_without_sugar
|
mild tobacco
|
|
|
|
Reorganizing response
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data
# Convert logical to integer
fun<- function(x){
return<- x %>% str_detect("not")
as.numeric(!return)
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]
# categorize response into 4 subs
rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 2. Original rating outlook") %>%
kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
position = "center") %>%
add_header_above(c("rating" = 12)) %>%
kable_minimal()
Table 2. Original rating outlook
rating
|
1
|
1.5
|
1.75
|
2
|
2.25
|
2.5
|
2.75
|
3
|
3.25
|
3.5
|
3.75
|
4
|
1
|
5
|
1
|
29
|
14
|
150
|
304
|
471
|
394
|
489
|
265
|
101
|
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >3.5, 4, data$rating)
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 3. Reorganized rating outlook") %>%
kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
position = "center") %>%
add_header_above(c("rating" = 4)) %>%
kable_minimal()
Table 3. Reorganized rating outlook
rating
|
1
|
2
|
3
|
4
|
200
|
775
|
883
|
366
|
Train/Test split
# Factorized columns
data$rating <- factor(data$rating,
levels = c("1", "2", "3", "4"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()
# train/test split
train_int<-
createDataPartition(data$rating, p = 0.8) %>%
unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
df<- rbind(train_df, test_df) %>% round(digits = 3) %>% as.data.frame()
rownames(df)<- c("train", "test")
kable(df, "html", caption = "Table 4. Proportions of four categories within each of train/test split") %>%
kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
position = "center") %>%
add_header_above(c(" ", "rating" = 4)) %>%
kable_minimal()
Table 4. Proportions of four categories within each of train/test split
|
rating
|
|
1
|
2
|
3
|
4
|
train
|
0.09
|
0.348
|
0.397
|
0.165
|
test
|
0.09
|
0.349
|
0.396
|
0.164
|
Ordinal Logistic Regression report
# multicategory logit model
variables2<- variables[-c(4:5)]
f2 <- paste(target, "~", paste(variables2, collapse=" + "))
cumu.logit2=vglm(f2, family=cumulative(parallel=TRUE), data=train)
summary(cumu.logit2, digits = 3)
##
## Call:
## vglm(formula = f2, family = cumulative(parallel = TRUE), data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 -3.569376 0.614855 -5.805 6.43e-09 ***
## (Intercept):2 -1.438861 0.608652 -2.364 0.018078 *
## (Intercept):3 0.471604 0.608213 0.775 0.438107
## cocoa_percent 0.026661 0.008278 3.221 0.001279 **
## cocoa_butter.L 0.060141 0.069305 0.868 0.385519
## vanilla.L 0.637851 0.090575 7.042 1.89e-12 ***
## sugar.L -0.577289 0.164974 -3.499 0.000467 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: logitlink(P[Y<=1]), logitlink(P[Y<=2]),
## logitlink(P[Y<=3])
##
## Residual deviance: 4372.745 on 5333 degrees of freedom
##
## Log-likelihood: -2186.373 on 5333 degrees of freedom
##
## Number of Fisher scoring iterations: 5
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Exponentiated coefficients:
## cocoa_percent cocoa_butter.L vanilla.L sugar.L
## 1.0270191 1.0619863 1.8924105 0.5614184
Model Interpretation
## Nomogram
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
ddist<- datadist(rms_train)
options(datadist = "ddist")
fit.blogit.rms<- rms::orm(as.formula("rating ~ cocoa_percent + cocoa_butter + vanilla + sugar"), data=rms_train)
coef<- fit.blogit.rms$coefficients
ff<- Newlabels(fit.blogit.rms, c(vanilla = "vanilla"))
fun2<- function(x) {
plogis(x- coef[1] + coef[2])
}
fun3<- function(x) {
plogis(x- coef[1] + coef[3])
}
nomo<- nomogram(ff,
fun = list("Prob Y>=2" = plogis,
"Prob Y>=3" = fun2,
"Prob Y>=4" = fun3),
lp = F,
fun.at = c(.01, .05, seq(.1, .9, by = .1), .95, .99))
plot(nomo, lmgp = .2, cex.axis = .6) # slight difference

Confusion Table
rms_train<- train
# Had to reverse factors of response since I did not find an option to reverse in the model fitting function orm()
# However, the two models from two different packages turn out to be identical in terms of their AICs and estimated coefficients.
rms_train$rating<- train$rating %>% as.factor() %>% fct_rev()
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data=rms_train, Hess = TRUE)
# summary(MASS)
pred<- predict(MASS, newdata=test)%>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
## original
## pred 1 2 3 4
## 1 0 0 0 0
## 2 19 47 20 11
## 3 21 108 156 62
## 4 0 0 0 0
Main Effect plot
plot(predictorEffects(MASS),lines=list(multiline=TRUE),
axes=list(grid=TRUE))

Interaction plot (with cocoa_butter)
plot(Effect(focal.predictors = c("cocoa_butter", "cocoa_percent"),MASS),
lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "vanilla"),MASS),
lines=list(multiline=TRUE), axes=list(grid=TRUE))

plot(Effect(focal.predictors = c("cocoa_butter", "sugar"),MASS),
lines=list(multiline=TRUE), axes=list(grid=TRUE))

Different organizing scheme
1st
-
(0, 2,5]
-
(2.5, 2.75]
-
(2.75, 3]
-
(3, 3.25]
-
(3.25, 4]
小怪怪的
data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data
# Convert logical to integer
fun<- function(x){
return<- x %>% str_detect("not")
as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]
table(data$rating)
##
## 1 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4
## 1 5 1 29 14 150 304 471 394 489 265 101
# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.5, 1, data$rating)
data$rating <- ifelse(data$rating<=2.75 & data$rating>2.5, 2, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 3, data$rating)
data$rating <- ifelse(data$rating >3.25, 5, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating >3, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
position = "center") %>%
add_header_above(c("rating" = 5)) %>%
kable_minimal()
Table 5. Another reorganized rating outlook
rating
|
1
|
2
|
3
|
4
|
5
|
200
|
304
|
471
|
394
|
855
|
# Factorized columns
data$rating <- factor(data$rating,
levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()
# train/test split
train_int<-
createDataPartition(data$rating, p = 0.8) %>%
unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
## original
## pred 1 2 3 4 5
## 1 0 1 1 0 0
## 2 1 1 3 1 2
## 3 9 6 10 3 20
## 4 0 0 0 0 0
## 5 30 52 80 74 149
2nd
-
(0, 2,75]
-
(2.75, 3]
-
(3, 3.25]
-
(3.25, 3.5]
-
(3.5, 4]
超硬要,而且結果超不均衡
data<- choco[-1]
data$rating[which(data$rating==2.6)]<- 2.5
names(data)[16]<- "sweetener"
levels(data$cocoa_butter)<- data$cocoa_butter %>% fct_rev() %>% levels()
levels(data$lecithin)<- data$lecithin %>% fct_rev() %>% levels()
names<- names(data)
org<- data
# Convert logical to integer
fun<- function(x){
return<- x %>% str_detect("not")
as.numeric(!return) %>% as.factor()
}
dummy <- data[10:16] %>% lapply(fun) %>% as.data.frame()
data[10:16]<- dummy
org[10:16]<- dummy
target<- names[8]
variable<- names[c(7, 11:16)]
variables<- names[c(7, 11:15)]
table(data$rating)
##
## 1 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4
## 1 5 1 29 14 150 304 471 394 489 265 101
# categorize response into 4 subs
data$rating <- ifelse(data$rating<=2.75, 1, data$rating)
data$rating <- ifelse(data$rating<=3 & data$rating>2.75, 2, data$rating)
data$rating <- ifelse(data$rating<=3.25 & data$rating>3, 3, data$rating)
data$rating <- ifelse(data$rating >=3.75, 5, data$rating)
data$rating <- ifelse(data$rating<=3.5 & data$rating >3.25, 4, data$rating)
org$rating<- data$rating
df <- table(data$rating) %>% t()
kable(df, "html", caption = "Table 5. Another reorganized rating outlook") %>%
kable_styling(bootstrap_options = c("hover", "condensed", "responsive"),
position = "center") %>%
add_header_above(c("rating" = 5)) %>%
kable_minimal()
Table 5. Another reorganized rating outlook
rating
|
1
|
2
|
3
|
4
|
5
|
504
|
471
|
394
|
489
|
366
|
# Factorized columns
data$rating <- factor(data$rating,
levels = c("1", "2", "3", "4", "5"), ordered = TRUE)
data[, variables[-1]]<- lapply(data[, variables[-1]], function(x){
factor(x, levels = c("0", "1"), ordered = TRUE)
}) %>% as.data.frame()
# train/test split
train_int<-
createDataPartition(data$rating, p = 0.8) %>%
unlist() %>% as.numeric()
train<- data[train_int,]
test<- data[-train_int,]
train_df<- prop.table(table(train$rating))
test_df<- prop.table(table(test$rating))
n_train<- org[train_int,]
n_test<- org[-train_int,]
rms_train<- n_train
rms_train$rating<- n_train$rating %>% as.factor() %>% fct_rev()
# fit a model
# Same situation again in polr()
# However, the three models from different packages turn out to be identical in terms of their AICs and estimated coefficients.
MASS<- polr(f2, data = rms_train)
# summary(MASS)
pred<- predict(MASS, newdata=n_test) %>% fct_rev()
original<- test$rating
# Confusion Matrix
table(pred, original)
## original
## pred 1 2 3 4 5
## 1 49 39 28 25 13
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 51 55 50 72 59
## 5 0 0 0 0 1