# Packages Required
require(nnet)
require(caret)
require(Boruta)
require(e1071)
require(C50)
require(ggplot2)
# Read/View Data
x <- read.csv('Mat_Clean.csv')
View(x)
table(x$Walc==x$Dalc)
##
## TRUE
## 395
x <- x[,-28]
colnames(x)[27] <- 'AlcCp'
View(x)
# Modify target variable
str(x$AlcCp)
## int [1:395] 1 1 2 1 1 1 1 1 1 1 ...
x$AlcCp <- ordered(as.factor(x$AlcCp))
levels(x$AlcCp) <- c('Very Low','Low', 'Moderate', 'High', 'Very High')
str(x$AlcCp)
## Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...
# Reassigning Data Types
str(x)
## 'data.frame': 395 obs. of 30 variables:
## $ School : Factor w/ 2 levels "Gabriel Pereira",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "< 3 members",..: 2 2 1 2 2 1 1 2 1 2 ...
## $ Pstatus : Factor w/ 2 levels "Living Apart",..: 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : Factor w/ 5 levels "5th - 9th Grade",..: 2 4 4 2 5 2 1 2 5 5 ...
## $ Fedu : Factor w/ 5 levels "5th - 9th Grade",..: 2 4 4 1 5 5 1 2 1 2 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime : Factor w/ 4 levels "< 15min","> 60 min",..: 3 1 1 1 1 1 1 3 1 1 ...
## $ studytime : Factor w/ 4 levels "< 2 hr","> 10 hr",..: 3 3 3 4 3 3 3 3 3 3 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ AlcCp : Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ Average.Score: num 5.67 5.33 8.33 14.67 8.67 ...
x$famrel <- ordered(as.factor(x$famrel))
x$freetime <- ordered(as.factor(x$freetime))
x$goout <- ordered(as.factor(x$goout))
x$AlcCp <- ordered(as.factor(x$AlcCp))
x$health <- ordered(as.factor(x$health))
x$Medu <- ordered(x$Medu, levels = c('None', 'Primary Education', '5th - 9th Grade', 'Secondary Education', 'Higher Education'))
x$Fedu <- ordered(x$Fedu, levels = c('None', 'Primary Education', '5th - 9th Grade', 'Secondary Education', 'Higher Education'))
x$traveltime <- ordered(x$traveltime, levels = c('< 15min', '15-30 min', '30-60 min', '> 60 min'))
x$studytime <- ordered(x$studytime, levels = c('< 2 hr', '2-5 hr', '5-10 hr', '> 10 hr'))
str(x)
## 'data.frame': 395 obs. of 30 variables:
## $ School : Factor w/ 2 levels "Gabriel Pereira",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "< 3 members",..: 2 2 1 2 2 1 1 2 1 2 ...
## $ Pstatus : Factor w/ 2 levels "Living Apart",..: 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : Ord.factor w/ 5 levels "None"<"Primary Education"<..: 5 2 2 5 4 5 3 5 4 4 ...
## $ Fedu : Ord.factor w/ 5 levels "None"<"Primary Education"<..: 5 2 2 3 4 4 3 5 3 5 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime : Ord.factor w/ 4 levels "< 15min"<"15-30 min"<..: 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : Ord.factor w/ 4 levels "< 2 hr"<"2-5 hr"<..: 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 4 3 2 2 2 2 4 4 2 1 ...
## $ AlcCp : Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...
## $ health : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ Average.Score: num 5.67 5.33 8.33 14.67 8.67 ...
# Split into Train and Test datasets
set.seed(123)
partition <- sample(seq_len(nrow(x)), size = 0.8*nrow(x))
xtr <- x[partition,]
xtt <- x[-partition,]
xte <- xtt
xte$AlcCp <- NA
# Full Model (Multinom)
mn <- multinom(xtr$AlcCp ~ ., xtr)
## # weights: 310 (244 variable)
## initial value 508.582380
## iter 10 value 319.135579
## iter 20 value 150.326953
## iter 30 value 132.280499
## iter 40 value 124.472270
## iter 50 value 120.050359
## iter 60 value 116.348674
## iter 70 value 113.754334
## iter 80 value 107.230150
## iter 90 value 100.143806
## iter 100 value 99.922549
## final value 99.922549
## stopped after 100 iterations
AIC(mn)
## [1] 687.8451
# Prediction using Full Model (Multinom)
mn_predict <- predict(mn, xte)
xte$AlcCp <- mn_predict
mn_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mn_conf # 58.23 % Accuracy
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 45 3 7 2 4
## Low 7 0 3 1 0
## Moderate 0 0 1 1 0
## High 0 0 0 0 0
## Very High 1 1 2 1 0
##
## Overall Statistics
##
## Accuracy : 0.5823
## 95% CI : (0.4659, 0.6923)
## No Information Rate : 0.6709
## P-Value [Acc > NIR] : 0.96174
##
## Kappa : 0.1066
## Mcnemar's Test P-Value : 0.01847
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8491 0.00000 0.07692
## Specificity 0.3846 0.85333 0.98485
## Pos Pred Value 0.7377 0.00000 0.50000
## Neg Pred Value 0.5556 0.94118 0.84416
## Prevalence 0.6709 0.05063 0.16456
## Detection Rate 0.5696 0.00000 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6168 0.42667 0.53089
## Class: High Class: Very High
## Sensitivity 0.00000 0.00000
## Specificity 1.00000 0.93333
## Pos Pred Value NaN 0.00000
## Neg Pred Value 0.93671 0.94595
## Prevalence 0.06329 0.05063
## Detection Rate 0.00000 0.00000
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 0.46667
# Feature Selection
# Using Boruta
set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 100)
boruta
## Boruta performed 99 iterations in 12.0235 secs.
## 3 attributes confirmed important: failures, goout, sex;
## 24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
## 2 tentative attributes left: famsize, Mjob;
set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 300)
boruta # tentative attributes changed
## Boruta performed 299 iterations in 31.80597 secs.
## 4 attributes confirmed important: failures, famsize, goout, sex;
## 24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
## 1 tentative attributes left: Mjob;
set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 500)
boruta # tentative attributes not changed
## Boruta performed 499 iterations in 57.50512 secs.
## 4 attributes confirmed important: failures, famsize, goout, sex;
## 24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
## 1 tentative attributes left: Mjob;
set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 700)
boruta # tentative attributes not changed
## Boruta performed 699 iterations in 1.204393 mins.
## 4 attributes confirmed important: failures, famsize, goout, sex;
## 24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
## 1 tentative attributes left: Mjob;
# Model using features from Boruta Selection (Multinom)
mnb1 <- multinom(xtr$AlcCp ~ failures + goout + sex + famsize, xtr)
## # weights: 45 (32 variable)
## initial value 508.582380
## iter 10 value 260.802297
## iter 20 value 255.862678
## iter 30 value 255.254050
## iter 40 value 254.993244
## final value 254.992120
## converged
AIC(mnb1)
## [1] 573.9842
mnb2 <- multinom(xtr$AlcCp ~ failures + goout + sex + famsize + Mjob, xtr)
## # weights: 65 (48 variable)
## initial value 508.582380
## iter 10 value 257.093753
## iter 20 value 248.695451
## iter 30 value 247.595431
## iter 40 value 247.033425
## iter 50 value 246.985871
## final value 246.985759
## converged
AIC(mnb2) # better to leave tentative attributes out !!
## [1] 589.9715
# Prediction using features from Boruta Selection (Multinom)
mn_predict <- predict(mnb1, xte)
xte$AlcCp <- mn_predict
mn_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mn_conf # 75.95 % Accuracy
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 59 2 0 0 0
## Low 9 1 1 0 0
## Moderate 0 2 0 0 0
## High 0 0 0 0 0
## Very High 4 1 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7595
## 95% CI : (0.6502, 0.8486)
## No Information Rate : 0.9114
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1572
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8194 0.16667 0.00000
## Specificity 0.7143 0.86301 0.97436
## Pos Pred Value 0.9672 0.09091 0.00000
## Neg Pred Value 0.2778 0.92647 0.98701
## Prevalence 0.9114 0.07595 0.01266
## Detection Rate 0.7468 0.01266 0.00000
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.7669 0.51484 0.48718
## Class: High Class: Very High
## Sensitivity NA NA
## Specificity 1 0.93671
## Pos Pred Value NA NA
## Neg Pred Value NA NA
## Prevalence 0 0.00000
## Detection Rate 0 0.00000
## Detection Prevalence 0 0.06329
## Balanced Accuracy NA NA
# Full Model (C5.0)
c <- C5.0(xtr$AlcCp ~ ., xtr)
# Predition using Full Model (C5.0)
c_predict <- predict(c, xte)
xte$AlcCp <- c_predict
c_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 52 5 3 1 0
## Low 7 1 3 0 0
## Moderate 1 0 1 0 0
## High 0 0 0 0 0
## Very High 3 1 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.6835
## 95% CI : (0.5692, 0.7837)
## No Information Rate : 0.7975
## P-Value [Acc > NIR] : 0.9943
##
## Kappa : 0.1432
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8254 0.14286 0.12500
## Specificity 0.4375 0.86111 0.98592
## Pos Pred Value 0.8525 0.09091 0.50000
## Neg Pred Value 0.3889 0.91176 0.90909
## Prevalence 0.7975 0.08861 0.10127
## Detection Rate 0.6582 0.01266 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6314 0.50198 0.55546
## Class: High Class: Very High
## Sensitivity 0.00000 NA
## Specificity 1.00000 0.93671
## Pos Pred Value NaN NA
## Neg Pred Value 0.98734 NA
## Prevalence 0.01266 0.00000
## Detection Rate 0.00000 0.00000
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 NA
# 68.35 % Accuracy, Better than Full Multinom Model !
# But not Multinom Model using Boruta Features !
# Feature Selection
# Using C5imp
a <- C5imp(c)
summary(a$Overall) # consider features from 75th %ile of importance score
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 7.59 15.58 20.25 100.00
a
## Overall
## sex 100.00
## failures 52.53
## schoolsup 47.47
## higher 43.67
## paid 40.51
## School 22.78
## guardian 20.25
## studytime 20.25
## traveltime 17.72
## internet 16.46
## goout 15.19
## Fedu 8.86
## activities 8.86
## romantic 8.54
## reason 7.59
## famrel 6.01
## nursery 4.43
## Average.Score 4.43
## absences 3.48
## freetime 2.85
## age 0.00
## address 0.00
## famsize 0.00
## Pstatus 0.00
## Medu 0.00
## Mjob 0.00
## Fjob 0.00
## famsup 0.00
## health 0.00
# Model using features from C5imp Selection (C5.0)
c1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian + studytime, xtr)
# Prediction using features from C5imp Selection (C5.0)
c1_predict <- predict(c1,xte)
xte$AlcCp <- c1_predict
c1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c1_conf # 75.95 % Accuracy, same as Multinom Model using Boruta ! But, Kappa Statistic is better !
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 57 2 1 1 0
## Low 9 2 0 0 0
## Moderate 1 0 1 0 0
## High 0 0 0 0 0
## Very High 3 1 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.7595
## 95% CI : (0.6502, 0.8486)
## No Information Rate : 0.8861
## P-Value [Acc > NIR] : 0.9996
##
## Kappa : 0.2141
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8143 0.40000 0.33333
## Specificity 0.5556 0.87838 0.98684
## Pos Pred Value 0.9344 0.18182 0.50000
## Neg Pred Value 0.2778 0.95588 0.97403
## Prevalence 0.8861 0.06329 0.03797
## Detection Rate 0.7215 0.02532 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6849 0.63919 0.66009
## Class: High Class: Very High
## Sensitivity 0.00000 NA
## Specificity 1.00000 0.93671
## Pos Pred Value NaN NA
## Neg Pred Value 0.98734 NA
## Prevalence 0.01266 0.00000
## Detection Rate 0.00000 0.00000
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 NA
# Model using features from C5imp Selection (C5.0) - w/o studytime
c1_1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian, xtr)
# Prediction using features from C5imp Selection (C5.0)
c1_1_predict <- predict(c1_1,xte)
xte$AlcCp <- c1_1_predict
c1_1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c1_1_conf # 77.22 % Accuracy, Better than Multinom Model using Boruta !
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 61 0 0 0 0
## Low 11 0 0 0 0
## Moderate 2 0 0 0 0
## High 0 0 0 0 0
## Very High 5 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7722
## 95% CI : (0.664, 0.859)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.7722 NA NA
## Specificity NA 0.8608 0.97468
## Pos Pred Value NA NA NA
## Neg Pred Value NA NA NA
## Prevalence 1.0000 0.0000 0.00000
## Detection Rate 0.7722 0.0000 0.00000
## Detection Prevalence 0.7722 0.1392 0.02532
## Balanced Accuracy NA NA NA
## Class: High Class: Very High
## Sensitivity NA NA
## Specificity 1 0.93671
## Pos Pred Value NA NA
## Neg Pred Value NA NA
## Prevalence 0 0.00000
## Detection Rate 0 0.00000
## Detection Prevalence 0 0.06329
## Balanced Accuracy NA NA
# Dropping 1 of the attributes with repeated important attributes improves accuracy !!
# Model using features from Boruta Selection (C5.0)
c2 <- C5.0(xtr$AlcCp ~ failures + goout + sex + famsize, xtr)
# Prediction using features from Boruta Selection (C5.0)
c2_predict <- predict(c2, xte)
xte$AlcCp <- c1_predict
c2_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c2_conf # 75.95% Accuracy, same as Multinom Model using Boruta ! But, Kappa Statistic is better !
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 57 2 1 1 0
## Low 9 2 0 0 0
## Moderate 1 0 1 0 0
## High 0 0 0 0 0
## Very High 3 1 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.7595
## 95% CI : (0.6502, 0.8486)
## No Information Rate : 0.8861
## P-Value [Acc > NIR] : 0.9996
##
## Kappa : 0.2141
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8143 0.40000 0.33333
## Specificity 0.5556 0.87838 0.98684
## Pos Pred Value 0.9344 0.18182 0.50000
## Neg Pred Value 0.2778 0.95588 0.97403
## Prevalence 0.8861 0.06329 0.03797
## Detection Rate 0.7215 0.02532 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6849 0.63919 0.66009
## Class: High Class: Very High
## Sensitivity 0.00000 NA
## Specificity 1.00000 0.93671
## Pos Pred Value NaN NA
## Neg Pred Value 0.98734 NA
## Prevalence 0.01266 0.00000
## Detection Rate 0.00000 0.00000
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 NA
# Kappa Statistic is the same for C5.0 models using both set of features !
# Feature selection using Step Function
step <- step(mn, direction = 'backward', trace = 1)
# Model using features from Step Function
# Multinom
mns <- multinom(xtr$AlcCp ~ sex + age + address + Fedu + Mjob + Fjob + reason +
guardian + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + nursery + higher + internet +
romantic + famrel + freetime + goout + health + absences +
Average.Score, xtr)
## # weights: 275 (216 variable)
## initial value 508.582380
## iter 10 value 337.510226
## iter 20 value 164.774059
## iter 30 value 144.779235
## iter 40 value 137.337546
## iter 50 value 132.808184
## iter 60 value 130.088847
## iter 70 value 128.759449
## iter 80 value 123.513872
## iter 90 value 116.771726
## iter 100 value 106.838702
## final value 106.838702
## stopped after 100 iterations
# C5.0
cs <- C5.0(xtr$AlcCp ~ sex + age + address + Fedu + Mjob + Fjob + reason +
guardian + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + nursery + higher + internet +
romantic + famrel + freetime + goout + health + absences +
Average.Score, xtr)
# Prediction using features from Step Function
# Multinom
mns_predict <- predict(mns, xte)
xte$AlcCp <- mns_predict
mns_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mns_conf # 63.29 % Accuracy
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 47 5 4 3 2
## Low 7 1 1 1 1
## Moderate 1 0 1 0 0
## High 0 0 0 0 0
## Very High 0 1 1 2 1
##
## Overall Statistics
##
## Accuracy : 0.6329
## 95% CI : (0.5169, 0.7386)
## No Information Rate : 0.6962
## P-Value [Acc > NIR] : 0.909
##
## Kappa : 0.1744
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8545 0.14286 0.14286
## Specificity 0.4167 0.86111 0.98611
## Pos Pred Value 0.7705 0.09091 0.50000
## Neg Pred Value 0.5556 0.91176 0.92208
## Prevalence 0.6962 0.08861 0.08861
## Detection Rate 0.5949 0.01266 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6356 0.50198 0.56448
## Class: High Class: Very High
## Sensitivity 0.00000 0.25000
## Specificity 1.00000 0.94667
## Pos Pred Value NaN 0.20000
## Neg Pred Value 0.92405 0.95946
## Prevalence 0.07595 0.05063
## Detection Rate 0.00000 0.01266
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 0.59833
# C5.0
cs_predict <- predict(cs, xte)
xte$AlcCp <- mns_predict
cs_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
cs_conf # 63.29 % Accuracy, Same as Multinom with same feature set ! Kappa Statistic is also the same !
## Confusion Matrix and Statistics
##
## Reference
## Prediction Very Low Low Moderate High Very High
## Very Low 47 5 4 3 2
## Low 7 1 1 1 1
## Moderate 1 0 1 0 0
## High 0 0 0 0 0
## Very High 0 1 1 2 1
##
## Overall Statistics
##
## Accuracy : 0.6329
## 95% CI : (0.5169, 0.7386)
## No Information Rate : 0.6962
## P-Value [Acc > NIR] : 0.909
##
## Kappa : 0.1744
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Very Low Class: Low Class: Moderate
## Sensitivity 0.8545 0.14286 0.14286
## Specificity 0.4167 0.86111 0.98611
## Pos Pred Value 0.7705 0.09091 0.50000
## Neg Pred Value 0.5556 0.91176 0.92208
## Prevalence 0.6962 0.08861 0.08861
## Detection Rate 0.5949 0.01266 0.01266
## Detection Prevalence 0.7722 0.13924 0.02532
## Balanced Accuracy 0.6356 0.50198 0.56448
## Class: High Class: Very High
## Sensitivity 0.00000 0.25000
## Specificity 1.00000 0.94667
## Pos Pred Value NaN 0.20000
## Neg Pred Value 0.92405 0.95946
## Prevalence 0.07595 0.05063
## Detection Rate 0.00000 0.01266
## Detection Prevalence 0.00000 0.06329
## Balanced Accuracy 0.50000 0.59833
# Visualise Target Variable
ggplot(xtr, aes(x = AlcCp)) + geom_bar()

# Transform Target Variable
# Training set
levels(xtr$AlcCp) <- c(1,2,3,4,5)
xtr$AlcCp <- as.numeric(xtr$AlcCp)
xtr$AlcCp[xtr$AlcCp > 2] <- 'Values >= 3'
xtr$AlcCp[xtr$AlcCp < 3] <- 'Values < 3'
xtr$AlcCp <- as.factor(xtr$AlcCp)
# Test Actuals set
levels(xtt$AlcCp) <- c(1,2,3,4,5)
xtt$AlcCp <- as.numeric(xtt$AlcCp)
xtt$AlcCp[xtt$AlcCp > 2] <- 'Values >= 3'
xtt$AlcCp[xtt$AlcCp < 3] <- 'Values < 3'
xtt$AlcCp <- as.factor(xtt$AlcCp)
# Model using features from C5imp Selection (C5.0) - w/o studytime after target variable transformation
ct_1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian, xtr)
# Prediction using features from C5imp Selection (C5.0)
ct_1_predict <- predict(ct_1,xte)
xte$AlcCp <- ct_1_predict
ct_1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
ct_1_conf # 91.14 % Accuracy !! :D
## Confusion Matrix and Statistics
##
## Reference
## Prediction Values < 3 Values >= 3
## Values < 3 72 0
## Values >= 3 7 0
##
## Accuracy : 0.9114
## 95% CI : (0.8259, 0.9636)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 0.9114
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.9114
## Detection Prevalence : 0.9114
## Balanced Accuracy : NA
##
## 'Positive' Class : Values < 3
##