rm(list = ls()) # clean environment
cat("\014") # clean console
library(data.table)
library(class)
library(pander)
library(ROCR)
library(tidyverse)
library(caret)
library(rpart)
library(rpart.plot)
library(dummies)
library(modeest)
library(lift)
library(randomForest)
library(adabag)
setwd("~/UNIGE/MASTERS/MaBAn (2020-2022)/Part 1/CVTDM/HW4")
# Importing the data
bank <- read.csv("UniversalBank.csv", header = TRUE, sep = ",")
t(t(names(bank)))
## [,1]
## [1,] "ID"
## [2,] "Age"
## [3,] "Experience"
## [4,] "Income"
## [5,] "ZIP.Code"
## [6,] "Family"
## [7,] "CCAvg"
## [8,] "Education"
## [9,] "Mortgage"
## [10,] "Personal.Loan"
## [11,] "Securities.Account"
## [12,] "CD.Account"
## [13,] "Online"
## [14,] "CreditCard"
str(bank)
## 'data.frame': 5000 obs. of 14 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 25 45 39 35 35 37 53 50 35 34 ...
## $ Experience : int 1 19 15 9 8 13 27 24 10 9 ...
## $ Income : int 49 34 11 100 45 29 72 22 81 180 ...
## $ ZIP.Code : int 91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
## $ Family : int 4 3 1 1 4 4 2 1 3 1 ...
## $ CCAvg : num 1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
## $ Education : int 1 1 1 2 2 2 2 3 2 3 ...
## $ Mortgage : int 0 0 0 0 0 155 0 0 104 0 ...
## $ Personal.Loan : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Securities.Account: int 1 1 0 0 0 0 0 0 0 0 ...
## $ CD.Account : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Online : int 0 0 0 0 0 1 1 0 1 0 ...
## $ CreditCard : int 0 0 0 0 1 0 0 1 0 0 ...
# We see that many variables are considered integers, so we must convert them into factors
bank$Education = as.factor(bank$Education)
bank$Personal.Loan = as.factor(bank$Personal.Loan)
bank$Securities.Account = as.factor(bank$Securities.Account)
bank$CD.Account = as.factor(bank$CD.Account)
bank$Online = as.factor(bank$Online)
bank$CreditCard = as.factor(bank$CreditCard)
# Now, we create dummies for our education
bank_2 <-
cbind(bank[1:7], dummy(bank$Education, sep = "_"), bank[9:14])
names(bank_2)[8:10] <- c("Education_1", "Education_2", "Education_3")
attach(bank_2)
set.seed(1)
# Partitioning the data
train.obs <- sample(rownames(bank_2), dim(bank_2)[1]*0.6)
train.set <- bank_2[train.obs, -c(1,5)] # We take away ID and ZIP.Code
valid.obs <- setdiff(rownames(bank_2), train.obs)
valid.set <- bank_2[valid.obs, -c(1,5)]
# Normalizing the data
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
train.set.norm <- as.data.frame(lapply(train.set[, c(1:5, 9)], normalize))
valid.set.norm <- as.data.frame(lapply(valid.set[, c(1:5, 9)], normalize))
# Regrouping into final dataset, with replacement of non-normalized variables
train.norm <- cbind(train.set.norm, train.set[, c(6:8, 10:14)])
valid.norm <- cbind(valid.set.norm, valid.set[, c(6:8, 10:14)])
# Running the logistic model :
set.seed(1)
logistic <-
glm(Personal.Loan ~ ., data = train.set, family = binomial)
summary(logistic)
##
## Call:
## glm(formula = Personal.Loan ~ ., family = binomial, data = train.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0741 -0.1898 -0.0701 -0.0232 4.1048
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.1846978 2.3185999 -3.530 0.000416 ***
## Age -0.0324946 0.0858273 -0.379 0.704982
## Experience 0.0390812 0.0851023 0.459 0.646072
## Income 0.0591487 0.0038527 15.353 < 2e-16 ***
## Family 0.5787243 0.0993320 5.826 5.67e-09 ***
## CCAvg 0.1548502 0.0571994 2.707 0.006785 **
## Education_1 -3.9324934 0.3449324 -11.401 < 2e-16 ***
## Education_2 -0.1110033 0.2453163 -0.452 0.650916
## Education_3 NA NA NA NA
## Mortgage 0.0011437 0.0007617 1.501 0.133241
## Securities.Account1 -0.7739587 0.3957411 -1.956 0.050498 .
## CD.Account1 3.7923306 0.4801242 7.899 2.82e-15 ***
## Online1 -0.6626600 0.2103154 -3.151 0.001628 **
## CreditCard1 -1.1142969 0.2877270 -3.873 0.000108 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1838.26 on 2999 degrees of freedom
## Residual deviance: 704.97 on 2987 degrees of freedom
## AIC: 730.97
##
## Number of Fisher Scoring iterations: 8
# Making predictions for the validation set :
pred_prob <- predict(logistic, valid.set, type = "response")
performance() inside the
ROCR package.# Creating a "prediction" object (compulsory) :
pred = prediction(pred_prob, labels = valid.set$Personal.Loan)
# Creating a "performance" object and ploting :
perfacc <- performance(prediction.obj = pred, measure="acc", x.measure="cutoff")
plot(perfacc)
# Finding the highest accuracy and the corresponding cutoff value :
which.max(perfacc@y.values[[1]]) # It gives "129", so :
## [1] 129
perfacc@y.values[[1]][129] # It gives "0.9565", so :
## [1] 0.9565
perfacc@x.values[[1]][which(perfacc@y.values[[1]] == 0.9565)]
## 79 3146 3989 4017
## 0.6592312 0.5228421 0.5192039 0.5115680
# Applying the cutoff value for classification :
pred_class <- ifelse(pred_prob > 0.5228421, 1, 0)
head(pred_class)
## 2 5 6 8 10 12
## 0 0 0 0 1 0
cm_logistic = confusionMatrix(as.factor(as.integer(pred_class)), as.factor(valid.norm[, 10]), positive = "1")
cm_logistic
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1777 69
## 1 18 136
##
## Accuracy : 0.9565
## 95% CI : (0.9466, 0.965)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7343
##
## Mcnemar's Test P-Value : 8.296e-08
##
## Sensitivity : 0.6634
## Specificity : 0.9900
## Pos Pred Value : 0.8831
## Neg Pred Value : 0.9626
## Prevalence : 0.1025
## Detection Rate : 0.0680
## Detection Prevalence : 0.0770
## Balanced Accuracy : 0.8267
##
## 'Positive' Class : 1
##
set.seed(1)
k_nn <-
knn3(Personal.Loan ~ ., data = train.norm,
k = 3)
pred_knn_class = predict(k_nn, valid.norm, type = "class")
# The following is the resulting confusion matrix :
cm_knn = confusionMatrix(data = pred_knn_class, as.factor(valid.norm[, 10]), positive = "1")
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1780 68
## 1 15 137
##
## Accuracy : 0.9585
## 95% CI : (0.9488, 0.9668)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7453
##
## Mcnemar's Test P-Value : 1.145e-08
##
## Sensitivity : 0.6683
## Specificity : 0.9916
## Pos Pred Value : 0.9013
## Neg Pred Value : 0.9632
## Prevalence : 0.1025
## Detection Rate : 0.0685
## Detection Prevalence : 0.0760
## Balanced Accuracy : 0.8300
##
## 'Positive' Class : 1
##
set.seed(1)
class_tree <-
rpart(Personal.Loan ~ ., data = train.set, method = "class")
plot_class_tree = prp(
class_tree,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10
)
# Getting the predictions from the tree :
pred_valid <- predict(class_tree, valid.set, type = "class")
# The following is the resulting confusion matrix :
cm_tree = confusionMatrix(data = pred_valid, as.factor(valid.set[, 10]), positive = "1")
cm_tree
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1787 32
## 1 8 173
##
## Accuracy : 0.98
## 95% CI : (0.9729, 0.9857)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8854
##
## Mcnemar's Test P-Value : 0.0002762
##
## Sensitivity : 0.8439
## Specificity : 0.9955
## Pos Pred Value : 0.9558
## Neg Pred Value : 0.9824
## Prevalence : 0.1025
## Detection Rate : 0.0865
## Detection Prevalence : 0.0905
## Balanced Accuracy : 0.9197
##
## 'Positive' Class : 1
##
my_df <-
data.frame(
actual = valid.set[, 10],
predicted_logistic = as.factor(pred_class),
predicted_knn = pred_knn_class,
predicted_tree = pred_valid
)
# Here are the 10 first rows :
pander(head(my_df, 10))
| actual | predicted_logistic | predicted_knn | predicted_tree | |
|---|---|---|---|---|
| 2 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 0 |
| 6 | 0 | 0 | 0 | 0 |
| 8 | 0 | 0 | 0 | 0 |
| 10 | 1 | 1 | 1 | 1 |
| 12 | 0 | 0 | 0 | 0 |
| 13 | 0 | 0 | 0 | 0 |
| 19 | 1 | 1 | 1 | 1 |
| 20 | 0 | 0 | 0 | 0 |
| 21 | 0 | 0 | 0 | 0 |
# Creating the "majority vote" column :
my_df$majority_vote <- apply(my_df[, c(2:4)], 1, mfv)
# Accessing the probabilities for each of the methods :
# k-NN
pred_knn_prob_vect = predict(k_nn, valid.norm, type = "prob")
pred_knn_prob_matrix = matrix(
pred_knn_prob_vect,
nrow = dim(valid.set)[1],
ncol = 2,
dimnames = list(rownames(valid.set), c("0", "1"))
)
head(pred_knn_prob_matrix)
## 0 1
## 2 1.0000000 0.0000000
## 5 1.0000000 0.0000000
## 6 1.0000000 0.0000000
## 8 1.0000000 0.0000000
## 10 0.3333333 0.6666667
## 12 1.0000000 0.0000000
# Classification tree
pred_tree_prob_vect = predict(class_tree, valid.set)
pred_tree_prob_matrix = matrix(
pred_tree_prob_vect,
nrow = dim(valid.set)[1],
ncol = 2,
dimnames = list(rownames(valid.set), c("0", "1"))
)
head(pred_tree_prob_matrix)
## 0 1
## 2 0.99734865 0.002651348
## 5 0.99734865 0.002651348
## 6 0.99734865 0.002651348
## 8 0.99734865 0.002651348
## 10 0.01595745 0.984042553
## 12 0.99734865 0.002651348
# Logistic regression
pred_logistic_prob = matrix(
data = NA,
nrow = dim(valid.set)[1],
ncol = 2,
dimnames = list(rownames(valid.set), c("0", "1"))
)
pred_logistic_prob[, 2] <-
predict(logistic, valid.set, type = "response")
pred_logistic_prob[, 1] <-
1 - predict(logistic, valid.set, type = "response")
head(pred_logistic_prob)
## 0 1
## 2 0.99993437 6.563347e-05
## 5 0.99395955 6.040445e-03
## 6 0.99542784 4.572165e-03
## 8 0.99968396 3.160403e-04
## 10 0.02494355 9.750565e-01
## 12 0.99499324 5.006758e-03
# Creating the "probability average" column :
# For the "0" category :
for (i in 1:dim(valid.set)[1]) {
my_df$prob_average_0[i] <-
(pred_knn_prob_matrix[i, 1] + pred_logistic_prob[i, 1] + pred_tree_prob_matrix[i, 1]) /
3
}
# For the "1" category :
for (i in 1:dim(valid.set)[1]) {
my_df$prob_average_1[i] <-
(pred_knn_prob_matrix[i, 2] + pred_logistic_prob[i, 2] + pred_tree_prob_matrix[i, 2]) /
3
}
head(my_df$prob_average_0)
## [1] 0.9990943 0.9971027 0.9975922 0.9990109 0.1247448 0.9974473
head(my_df$prob_average_1)
## [1] 0.0009056604 0.0028972644 0.0024078375 0.0009891294 0.8752552236
## [6] 0.0025527021
ave_prob.toclass = ifelse(my_df$prob_average_1 > 0.5228421, 1, 0)
# For ensemble method nº1 (majority vote) :
cm_ens_1 = confusionMatrix(data = table(my_df$majority_vote, valid.set$Personal.Loan), positive = "1")
cm_ens_1
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 1793 48
## 1 2 157
##
## Accuracy : 0.975
## 95% CI : (0.9672, 0.9814)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8491
##
## Mcnemar's Test P-Value : 1.966e-10
##
## Sensitivity : 0.7659
## Specificity : 0.9989
## Pos Pred Value : 0.9874
## Neg Pred Value : 0.9739
## Prevalence : 0.1025
## Detection Rate : 0.0785
## Detection Prevalence : 0.0795
## Balanced Accuracy : 0.8824
##
## 'Positive' Class : 1
##
# For ensemble method nº2 (average probability) :
cm_ens_2 = confusionMatrix(data = table(ave_prob.toclass, valid.set$Personal.Loan), positive = "1")
cm_ens_2
## Confusion Matrix and Statistics
##
##
## ave_prob.toclass 0 1
## 0 1794 46
## 1 1 159
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.8975
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8585
##
## Mcnemar's Test P-Value : 1.38e-10
##
## Sensitivity : 0.7756
## Specificity : 0.9994
## Pos Pred Value : 0.9937
## Neg Pred Value : 0.9750
## Prevalence : 0.1025
## Detection Rate : 0.0795
## Detection Prevalence : 0.0800
## Balanced Accuracy : 0.8875
##
## 'Positive' Class : 1
##
err_compare <-
matrix(NA,
nrow = 5,
ncol = 1,
dimnames = list(
c("logistic", "k-NN", "tree", "ensemble nº1", "ensemble nº2"),
c("Error rate")
))
err_compare[1, ] = as.numeric(1 - cm_logistic$overall[1])
err_compare[2, ] = as.numeric(1 - cm_knn$overall[1])
err_compare[3, ] = as.numeric(1 - cm_tree$overall[1])
err_compare[4, ] = as.numeric(1 - cm_ens_1$overall[1])
err_compare[5, ] = as.numeric(1 - cm_ens_2$overall[1])
pander(err_compare)
| Error rate | |
|---|---|
| logistic | 0.0435 |
| k-NN | 0.0415 |
| tree | 0.02 |
| ensemble nº1 | 0.025 |
| ensemble nº2 | 0.0235 |
ebay <- read.csv("eBayAuctions.csv", header = TRUE, sep = ",")
t(t(names(ebay)))
## [,1]
## [1,] "Category"
## [2,] "currency"
## [3,] "sellerRating"
## [4,] "Duration"
## [5,] "endDay"
## [6,] "ClosePrice"
## [7,] "OpenPrice"
## [8,] "Competitive."
str(ebay)
## 'data.frame': 1972 obs. of 8 variables:
## $ Category : chr "Music/Movie/Game" "Music/Movie/Game" "Music/Movie/Game" "Music/Movie/Game" ...
## $ currency : chr "US" "US" "US" "US" ...
## $ sellerRating: int 3249 3249 3249 3249 3249 3249 3249 3249 3249 3249 ...
## $ Duration : int 5 5 5 5 5 5 5 5 5 5 ...
## $ endDay : chr "Mon" "Mon" "Mon" "Mon" ...
## $ ClosePrice : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
## $ OpenPrice : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 ...
## $ Competitive.: int 0 0 0 0 0 0 0 0 0 0 ...
attach(ebay)
set.seed(1)
# Partitioning the data
train.obs <- sample(rownames(ebay), dim(ebay)[1]*0.6)
train.set <- ebay[train.obs,] # We take away ID and ZIP.Code
valid.obs <- setdiff(rownames(ebay), train.obs)
valid.set <- ebay[valid.obs,]
class_tree <-
rpart(Competitive. ~ ., data = train.set, method = "class")
plot_class_tree = prp(
class_tree,
type = 1,
extra = 1,
under = TRUE,
split.font = 1,
varlen = -10
)
pred_class = predict(class_tree, valid.set, type = "class")
# Creating the confusion matrix :
confusionMatrix(as.factor(pred_class), as.factor(valid.set$Competitive.))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 335 99
## 1 47 308
##
## Accuracy : 0.815
## 95% CI : (0.7861, 0.8415)
## No Information Rate : 0.5158
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6311
##
## Mcnemar's Test P-Value : 2.434e-05
##
## Sensitivity : 0.8770
## Specificity : 0.7568
## Pos Pred Value : 0.7719
## Neg Pred Value : 0.8676
## Prevalence : 0.4842
## Detection Rate : 0.4246
## Detection Prevalence : 0.5501
## Balanced Accuracy : 0.8169
##
## 'Positive' Class : 0
##
# Getting predicted probabilities :
pred_prob = predict(class_tree, valid.set, type = "prob")
# Ploting the lift chart :
lift.class_tree <- lift(relevel(as.factor(valid.set$Competitive.), ref = "1") ~ pred_prob[,2], data = valid.set)
xyplot(lift.class_tree, plot = "gain")
# Calculating the baseline :
baseline = sum(valid.set$Competitive.)/789
# Creating data frame with both probabilities and classes :
full_df = as.data.frame(pred_class)
full_df$pred_prob = pred_prob[,2]
# Ordering the data frame in decreasing order according to probabilities :
full_df <- full_df[order(full_df$pred_prob, decreasing = T),]
lift = 79/(baseline*79)
lift
## [1] 1.938575
lift which contains
a formula specifically designed to DIRECTLY compute the lift for the Top
10% Decile.TopDecileLift(pred_prob[,2], valid.set$Competitive.)
## [1] 1.914
# IMPORTANT : need to convert the outcome variable into a FACTOR! Otherwise, the boosting algorithm won't work!
train.set$Competitive. = factor(train.set$Competitive.)
str(train.set$Competitive.)
## Factor w/ 2 levels "0","1": 2 2 1 2 1 1 2 1 2 1 ...
# The category "Photography" seems to have problems when boosting, so we delete it:
train.set = train.set[train.set$Category!="Photography",]
valid.set = valid.set[valid.set$Category!="Photography",]
# Doing boosting (takes quite a little while...)
boost <- boosting(Competitive. ~ ., data = train.set[, c(-1)])
# Getting the predictions :
pred_boost <- predict(boost, valid.set)
# Doing the confusion matrix :
confusionMatrix(as.factor(pred_boost$class), as.factor(valid.set$Competitive.))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 352 47
## 1 30 354
##
## Accuracy : 0.9017
## 95% CI : (0.8786, 0.9216)
## No Information Rate : 0.5121
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8034
##
## Mcnemar's Test P-Value : 0.06825
##
## Sensitivity : 0.9215
## Specificity : 0.8828
## Pos Pred Value : 0.8822
## Neg Pred Value : 0.9219
## Prevalence : 0.4879
## Detection Rate : 0.4496
## Detection Prevalence : 0.5096
## Balanced Accuracy : 0.9021
##
## 'Positive' Class : 0
##
# Computing the lift for the top 10% decile :
TopDecileLift(pred_boost$prob[,2], valid.set$Competitive.)
## [1] 1.953
# Doing bagging :
bagg = bagging(Competitive. ~ ., data = train.set)
# Getting the predictions :
pred_bagg <- predict(bagg, valid.set, type = "class")
# Doing the confusion matrix :
confusionMatrix(as.factor(pred_bagg$class), as.factor(valid.set$Competitive.))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 347 66
## 1 35 335
##
## Accuracy : 0.871
## 95% CI : (0.8455, 0.8937)
## No Information Rate : 0.5121
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7424
##
## Mcnemar's Test P-Value : 0.002835
##
## Sensitivity : 0.9084
## Specificity : 0.8354
## Pos Pred Value : 0.8402
## Neg Pred Value : 0.9054
## Prevalence : 0.4879
## Detection Rate : 0.4432
## Detection Prevalence : 0.5275
## Balanced Accuracy : 0.8719
##
## 'Positive' Class : 0
##
# Computing the lift for the top 10% decile :
TopDecileLift(pred_bagg$prob[,2], valid.set$Competitive.)
## [1] 1.953
# Doing bagging :
rf = randomForest(Competitive. ~ ., data = train.set, mtry = 4)
# Getting the predictions :
pred_rf_class <- predict(rf, valid.set, type = "class")
pred_rf_prob <- predict(rf, valid.set, type = "prob")
# Doing the confusion matrix :
confusionMatrix(as.factor(pred_rf_class), as.factor(valid.set$Competitive.))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 342 45
## 1 40 356
##
## Accuracy : 0.8914
## 95% CI : (0.8675, 0.9124)
## No Information Rate : 0.5121
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7828
##
## Mcnemar's Test P-Value : 0.6644
##
## Sensitivity : 0.8953
## Specificity : 0.8878
## Pos Pred Value : 0.8837
## Neg Pred Value : 0.8990
## Prevalence : 0.4879
## Detection Rate : 0.4368
## Detection Prevalence : 0.4943
## Balanced Accuracy : 0.8915
##
## 'Positive' Class : 0
##
# Computing the lift for the top 10% decile :
TopDecileLift(pred_rf_prob[,2], valid.set$Competitive.)
## [1] 1.953
cereals <- read.csv("Cereals.csv", header = TRUE, sep = ",")
t(t(names(cereals)))
## [,1]
## [1,] "name"
## [2,] "mfr"
## [3,] "type"
## [4,] "calories"
## [5,] "protein"
## [6,] "fat"
## [7,] "sodium"
## [8,] "fiber"
## [9,] "carbo"
## [10,] "sugars"
## [11,] "potass"
## [12,] "vitamins"
## [13,] "shelf"
## [14,] "weight"
## [15,] "cups"
## [16,] "rating"
str(cereals)
## 'data.frame': 77 obs. of 16 variables:
## $ name : chr "100%_Bran" "100%_Natural_Bran" "All-Bran" "All-Bran_with_Extra_Fiber" ...
## $ mfr : chr "N" "Q" "K" "K" ...
## $ type : chr "C" "C" "C" "C" ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
attach(cereals)
# Eliminate all cereals with NAs :
table(is.na(cereals))
##
## FALSE TRUE
## 1228 4
cereals_clean = na.omit(cereals)
# Normalizing :
cereals_clean.norm_temp <- as.data.frame(lapply(cereals_clean[, 4:16], normalize))
cereals_clean.norm <- cbind(cereals_clean[, 1:3], cereals_clean.norm_temp)
# Setting cereal names to each row :
row.names(cereals_clean.norm) <- cereals_clean.norm[, 1]
# Computing EUCLIDEAN distance for ONLY numerical (standardized) variables :
distance = dist(cereals_clean.norm[, 4:16], method = "euclidean")
single.link = hclust(distance, method = "single")
plot(single.link, hang = -1, ann = FALSE)
compl.link = hclust(distance, method = "complete")
plot(compl.link, hang = -1, ann = FALSE)
# Computing the WSS :
set.seed(1)
wss <- (nrow(cereals_clean.norm)-1)*sum(apply(cereals_clean.norm[, 4:16],2,var))
wss
## [1] 52.14865
# Drawing the graph :
set.seed(1)
for (i in 2:24) wss[i] <- sum(kmeans(cereals_clean.norm[, 4:16], centers=i)$withinss)
plot(1:24, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares", lty=1, col = 4, lwd = 1) +
abline(v=8, lty=2, lwd = 3, col = "green")
## integer(0)
legend(10, 40, legend = "Optimal number of clusters (8)", col = "green", lty=2, cex=0.8)
cutree which actually
performs a division into clusters. Let’s use it!# Creating the clusters
opt_clusters_single = cutree(single.link, k = 8)
head(opt_clusters_single)
## 100%_Bran 100%_Natural_Bran All-Bran
## 1 2 1
## All-Bran_with_Extra_Fiber Apple_Cinnamon_Cheerios Apple_Jacks
## 3 4 4
aggregate() function and compute
the mean of the members in each cluster. So, let’s go!centroids_single <- aggregate(cereals_clean.norm[, 4:16], by = list(opt_clusters_single), FUN = mean)
centroids_MATRIX_single = as.matrix(centroids_single[, -1])
plot(c(0), xaxt = 'n', ylab = "", xlab = "", type = "l", ylim = c(min(centroids_MATRIX_single), max(centroids_MATRIX_single)), xlim = c(0, 15))
axis(1, at = c(1:13), labels = names(cereals_clean.norm[, 4:16]), las = 2)
for(i in c(1:8))
lines(centroids_MATRIX_single[i,], lty = i, lwd = 2, col = ifelse(i %in% c(1,3,5,7), "black", "dark grey"))
text(x = 0.2, y = centroids_MATRIX_single[1:4,1], labels = paste("Cluster", c(1:4)), cex = 0.5)
text(x = 14, y = centroids_MATRIX_single[5:8,13], labels = paste("Cluster", c(5:8)), cex = 0.5)
# Creating the clusters :
opt_clusters_compl = cutree(compl.link, k = 8)
head(opt_clusters_compl)
## 100%_Bran 100%_Natural_Bran All-Bran
## 1 2 1
## All-Bran_with_Extra_Fiber Apple_Cinnamon_Cheerios Apple_Jacks
## 1 3 4
# Getting the centroids :
centroids_compl <- aggregate(cereals_clean.norm[4:16], by = list(opt_clusters_compl), FUN = mean)
# Plotting!
centroids_MATRIX_compl = as.matrix(centroids_compl[, -1])
plot(c(0), xaxt = 'n', ylab = "", xlab = "", type = "l", ylim = c(min(centroids_MATRIX_compl), max(centroids_MATRIX_compl)), xlim = c(0, 15))
axis(1, at = c(1:13), labels = names(cereals_clean.norm[, 4:16]), las = 2)
for(i in c(1:8))
lines(centroids_MATRIX_compl[i,], lty = i, lwd = 2, col = ifelse(i %in% c(1,3,5,7), "black", "dark grey"))
text(x = 0.2, y = centroids_MATRIX_compl[1:4,1], labels = paste("Cluster", c(1:4)), cex = 0.5)
text(x = 14, y = centroids_MATRIX_compl[5:8,13], labels = paste("Cluster", c(5:8)), cex = 0.5)
plot(compl.link, hang = -1, ann = FALSE)
abline(h = 1.42)
melt.centroids <- melt(data = centroids_compl, id.vars = "Group.1")
ggplot(melt.centroids, aes(x = variable, y = value, group = Group.1, color = factor(Group.1), size = 1)) +
labs(color = "Cluster numbers") +
geom_line(size=0.5) +
theme(axis.text.x = element_text(angle = 90))
which(opt_clusters_compl == 8)
## Puffed_Rice Puffed_Wheat
## 53 54