Student Alcoholism

# Packages Required
require(nnet)
require(caret)
require(Boruta)
require(e1071)
require(C50)
require(ggplot2)

# Read/View Data
x  <- read.csv('Mat_Clean.csv')
View(x)
table(x$Walc==x$Dalc)

## 
## TRUE 
##  395

x <- x[,-28]
colnames(x)[27] <- 'AlcCp'
View(x)

# Modify target variable
str(x$AlcCp)

##  int [1:395] 1 1 2 1 1 1 1 1 1 1 ...

x$AlcCp <- ordered(as.factor(x$AlcCp))
levels(x$AlcCp) <- c('Very Low','Low', 'Moderate', 'High', 'Very High')
str(x$AlcCp)

##  Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...

# Reassigning Data Types
str(x)

## 'data.frame':    395 obs. of  30 variables:
##  $ School       : Factor w/ 2 levels "Gabriel Pereira",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age          : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address      : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize      : Factor w/ 2 levels "< 3 members",..: 2 2 1 2 2 1 1 2 1 2 ...
##  $ Pstatus      : Factor w/ 2 levels "Living Apart",..: 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu         : Factor w/ 5 levels "5th - 9th Grade",..: 2 4 4 2 5 2 1 2 5 5 ...
##  $ Fedu         : Factor w/ 5 levels "5th - 9th Grade",..: 2 4 4 1 5 5 1 2 1 2 ...
##  $ Mjob         : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob         : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason       : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian     : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime   : Factor w/ 4 levels "< 15min","> 60 min",..: 3 1 1 1 1 1 1 3 1 1 ...
##  $ studytime    : Factor w/ 4 levels "< 2 hr","> 10 hr",..: 3 3 3 4 3 3 3 3 3 3 ...
##  $ failures     : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup    : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup       : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid         : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities   : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery      : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher       : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet     : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic     : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel       : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime     : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout        : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ AlcCp        : Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...
##  $ health       : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences     : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ Average.Score: num  5.67 5.33 8.33 14.67 8.67 ...

x$famrel <- ordered(as.factor(x$famrel))
x$freetime <- ordered(as.factor(x$freetime))
x$goout <- ordered(as.factor(x$goout))
x$AlcCp <- ordered(as.factor(x$AlcCp))
x$health <- ordered(as.factor(x$health))
x$Medu <- ordered(x$Medu, levels = c('None', 'Primary Education', '5th - 9th Grade', 'Secondary Education', 'Higher Education'))
x$Fedu <- ordered(x$Fedu, levels = c('None', 'Primary Education', '5th - 9th Grade', 'Secondary Education', 'Higher Education'))
x$traveltime <- ordered(x$traveltime, levels = c('< 15min', '15-30 min', '30-60 min', '> 60 min'))
x$studytime <- ordered(x$studytime, levels = c('< 2 hr', '2-5 hr', '5-10 hr', '> 10 hr'))
str(x)

## 'data.frame':    395 obs. of  30 variables:
##  $ School       : Factor w/ 2 levels "Gabriel Pereira",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age          : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address      : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize      : Factor w/ 2 levels "< 3 members",..: 2 2 1 2 2 1 1 2 1 2 ...
##  $ Pstatus      : Factor w/ 2 levels "Living Apart",..: 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu         : Ord.factor w/ 5 levels "None"<"Primary Education"<..: 5 2 2 5 4 5 3 5 4 4 ...
##  $ Fedu         : Ord.factor w/ 5 levels "None"<"Primary Education"<..: 5 2 2 3 4 4 3 5 3 5 ...
##  $ Mjob         : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob         : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason       : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian     : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime   : Ord.factor w/ 4 levels "< 15min"<"15-30 min"<..: 2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime    : Ord.factor w/ 4 levels "< 2 hr"<"2-5 hr"<..: 2 2 2 3 2 2 2 2 2 2 ...
##  $ failures     : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup    : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup       : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid         : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities   : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery      : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher       : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet     : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic     : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel       : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime     : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 2 3 4 4 1 2 5 ...
##  $ goout        : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 4 3 2 2 2 2 4 4 2 1 ...
##  $ AlcCp        : Ord.factor w/ 5 levels "Very Low"<"Low"<..: 1 1 2 1 1 1 1 1 1 1 ...
##  $ health       : Ord.factor w/ 5 levels "1"<"2"<"3"<"4"<..: 3 3 3 5 5 5 3 1 1 5 ...
##  $ absences     : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ Average.Score: num  5.67 5.33 8.33 14.67 8.67 ...

# Split into Train and Test datasets

set.seed(123)
partition <- sample(seq_len(nrow(x)), size = 0.8*nrow(x))

xtr <- x[partition,]
xtt <- x[-partition,]
xte <- xtt
xte$AlcCp <- NA

# Full Model (Multinom)

mn <- multinom(xtr$AlcCp ~ ., xtr)

## # weights:  310 (244 variable)
## initial  value 508.582380 
## iter  10 value 319.135579
## iter  20 value 150.326953
## iter  30 value 132.280499
## iter  40 value 124.472270
## iter  50 value 120.050359
## iter  60 value 116.348674
## iter  70 value 113.754334
## iter  80 value 107.230150
## iter  90 value 100.143806
## iter 100 value 99.922549
## final  value 99.922549 
## stopped after 100 iterations

AIC(mn)

## [1] 687.8451

# Prediction using Full Model (Multinom) 

mn_predict <- predict(mn, xte)
xte$AlcCp <- mn_predict
mn_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mn_conf # 58.23 % Accuracy

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        45   3        7    2         4
##   Low              7   0        3    1         0
##   Moderate         0   0        1    1         0
##   High             0   0        0    0         0
##   Very High        1   1        2    1         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5823          
##                  95% CI : (0.4659, 0.6923)
##     No Information Rate : 0.6709          
##     P-Value [Acc > NIR] : 0.96174         
##                                           
##                   Kappa : 0.1066          
##  Mcnemar's Test P-Value : 0.01847         
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8491    0.00000         0.07692
## Specificity                   0.3846    0.85333         0.98485
## Pos Pred Value                0.7377    0.00000         0.50000
## Neg Pred Value                0.5556    0.94118         0.84416
## Prevalence                    0.6709    0.05063         0.16456
## Detection Rate                0.5696    0.00000         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6168    0.42667         0.53089
##                      Class: High Class: Very High
## Sensitivity              0.00000          0.00000
## Specificity              1.00000          0.93333
## Pos Pred Value               NaN          0.00000
## Neg Pred Value           0.93671          0.94595
## Prevalence               0.06329          0.05063
## Detection Rate           0.00000          0.00000
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000          0.46667

# Feature Selection
# Using Boruta

set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 100)
boruta

## Boruta performed 99 iterations in 12.0235 secs.
##  3 attributes confirmed important: failures, goout, sex;
##  24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
##  2 tentative attributes left: famsize, Mjob;

set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 300)
boruta # tentative attributes changed

## Boruta performed 299 iterations in 31.80597 secs.
##  4 attributes confirmed important: failures, famsize, goout, sex;
##  24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
##  1 tentative attributes left: Mjob;

set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 500)
boruta # tentative attributes not changed

## Boruta performed 499 iterations in 57.50512 secs.
##  4 attributes confirmed important: failures, famsize, goout, sex;
##  24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
##  1 tentative attributes left: Mjob;

set.seed(456)
boruta <- Boruta(xtr$AlcCp ~ ., xtr, maxRuns = 700)
boruta # tentative attributes not changed

## Boruta performed 699 iterations in 1.204393 mins.
##  4 attributes confirmed important: failures, famsize, goout, sex;
##  24 attributes confirmed unimportant: absences, activities,
## address, age, Average.Score and 19 more;
##  1 tentative attributes left: Mjob;

# Model using features from Boruta Selection (Multinom)

mnb1 <- multinom(xtr$AlcCp ~ failures + goout + sex + famsize, xtr)

## # weights:  45 (32 variable)
## initial  value 508.582380 
## iter  10 value 260.802297
## iter  20 value 255.862678
## iter  30 value 255.254050
## iter  40 value 254.993244
## final  value 254.992120 
## converged

AIC(mnb1)

## [1] 573.9842

mnb2 <- multinom(xtr$AlcCp ~ failures + goout + sex + famsize + Mjob, xtr)

## # weights:  65 (48 variable)
## initial  value 508.582380 
## iter  10 value 257.093753
## iter  20 value 248.695451
## iter  30 value 247.595431
## iter  40 value 247.033425
## iter  50 value 246.985871
## final  value 246.985759 
## converged

AIC(mnb2) # better to leave tentative attributes out !!

## [1] 589.9715

# Prediction using features from Boruta Selection (Multinom)

mn_predict <- predict(mnb1, xte)
xte$AlcCp <- mn_predict
mn_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mn_conf # 75.95 % Accuracy

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        59   2        0    0         0
##   Low              9   1        1    0         0
##   Moderate         0   2        0    0         0
##   High             0   0        0    0         0
##   Very High        4   1        0    0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7595          
##                  95% CI : (0.6502, 0.8486)
##     No Information Rate : 0.9114          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1572          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8194    0.16667         0.00000
## Specificity                   0.7143    0.86301         0.97436
## Pos Pred Value                0.9672    0.09091         0.00000
## Neg Pred Value                0.2778    0.92647         0.98701
## Prevalence                    0.9114    0.07595         0.01266
## Detection Rate                0.7468    0.01266         0.00000
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.7669    0.51484         0.48718
##                      Class: High Class: Very High
## Sensitivity                   NA               NA
## Specificity                    1          0.93671
## Pos Pred Value                NA               NA
## Neg Pred Value                NA               NA
## Prevalence                     0          0.00000
## Detection Rate                 0          0.00000
## Detection Prevalence           0          0.06329
## Balanced Accuracy             NA               NA

# Full Model (C5.0)

c <- C5.0(xtr$AlcCp ~ ., xtr)

# Predition using Full Model (C5.0)

c_predict <- predict(c, xte)
xte$AlcCp <- c_predict
c_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c_conf

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        52   5        3    1         0
##   Low              7   1        3    0         0
##   Moderate         1   0        1    0         0
##   High             0   0        0    0         0
##   Very High        3   1        1    0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6835          
##                  95% CI : (0.5692, 0.7837)
##     No Information Rate : 0.7975          
##     P-Value [Acc > NIR] : 0.9943          
##                                           
##                   Kappa : 0.1432          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8254    0.14286         0.12500
## Specificity                   0.4375    0.86111         0.98592
## Pos Pred Value                0.8525    0.09091         0.50000
## Neg Pred Value                0.3889    0.91176         0.90909
## Prevalence                    0.7975    0.08861         0.10127
## Detection Rate                0.6582    0.01266         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6314    0.50198         0.55546
##                      Class: High Class: Very High
## Sensitivity              0.00000               NA
## Specificity              1.00000          0.93671
## Pos Pred Value               NaN               NA
## Neg Pred Value           0.98734               NA
## Prevalence               0.01266          0.00000
## Detection Rate           0.00000          0.00000
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000               NA

# 68.35 % Accuracy, Better than Full Multinom Model !
# But not Multinom Model using Boruta Features !

# Feature Selection
# Using C5imp

a <- C5imp(c)
summary(a$Overall) # consider features from 75th %ile of importance score

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    7.59   15.58   20.25  100.00

##               Overall
## sex            100.00
## failures        52.53
## schoolsup       47.47
## higher          43.67
## paid            40.51
## School          22.78
## guardian        20.25
## studytime       20.25
## traveltime      17.72
## internet        16.46
## goout           15.19
## Fedu             8.86
## activities       8.86
## romantic         8.54
## reason           7.59
## famrel           6.01
## nursery          4.43
## Average.Score    4.43
## absences         3.48
## freetime         2.85
## age              0.00
## address          0.00
## famsize          0.00
## Pstatus          0.00
## Medu             0.00
## Mjob             0.00
## Fjob             0.00
## famsup           0.00
## health           0.00

# Model using features from C5imp Selection (C5.0)

c1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian + studytime, xtr)

# Prediction using features from C5imp Selection (C5.0)

c1_predict <- predict(c1,xte)
xte$AlcCp <- c1_predict
c1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c1_conf # 75.95 % Accuracy, same as Multinom Model using Boruta ! But, Kappa Statistic is better !

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        57   2        1    1         0
##   Low              9   2        0    0         0
##   Moderate         1   0        1    0         0
##   High             0   0        0    0         0
##   Very High        3   1        1    0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7595          
##                  95% CI : (0.6502, 0.8486)
##     No Information Rate : 0.8861          
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.2141          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8143    0.40000         0.33333
## Specificity                   0.5556    0.87838         0.98684
## Pos Pred Value                0.9344    0.18182         0.50000
## Neg Pred Value                0.2778    0.95588         0.97403
## Prevalence                    0.8861    0.06329         0.03797
## Detection Rate                0.7215    0.02532         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6849    0.63919         0.66009
##                      Class: High Class: Very High
## Sensitivity              0.00000               NA
## Specificity              1.00000          0.93671
## Pos Pred Value               NaN               NA
## Neg Pred Value           0.98734               NA
## Prevalence               0.01266          0.00000
## Detection Rate           0.00000          0.00000
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000               NA

# Model using features from C5imp Selection (C5.0) - w/o studytime

c1_1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian, xtr)

# Prediction using features from C5imp Selection (C5.0)

c1_1_predict <- predict(c1_1,xte)
xte$AlcCp <- c1_1_predict
c1_1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c1_1_conf # 77.22 % Accuracy, Better than Multinom Model using Boruta !

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        61   0        0    0         0
##   Low             11   0        0    0         0
##   Moderate         2   0        0    0         0
##   High             0   0        0    0         0
##   Very High        5   0        0    0         0
## 
## Overall Statistics
##                                         
##                Accuracy : 0.7722        
##                  95% CI : (0.664, 0.859)
##     No Information Rate : 1             
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0             
##  Mcnemar's Test P-Value : NA            
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.7722         NA              NA
## Specificity                       NA     0.8608         0.97468
## Pos Pred Value                    NA         NA              NA
## Neg Pred Value                    NA         NA              NA
## Prevalence                    1.0000     0.0000         0.00000
## Detection Rate                0.7722     0.0000         0.00000
## Detection Prevalence          0.7722     0.1392         0.02532
## Balanced Accuracy                 NA         NA              NA
##                      Class: High Class: Very High
## Sensitivity                   NA               NA
## Specificity                    1          0.93671
## Pos Pred Value                NA               NA
## Neg Pred Value                NA               NA
## Prevalence                     0          0.00000
## Detection Rate                 0          0.00000
## Detection Prevalence           0          0.06329
## Balanced Accuracy             NA               NA

# Dropping 1 of the attributes with repeated important attributes improves accuracy !!

# Model using features from Boruta Selection (C5.0)

c2 <- C5.0(xtr$AlcCp ~ failures + goout + sex + famsize, xtr)

# Prediction using features from Boruta Selection (C5.0)

c2_predict <- predict(c2, xte)
xte$AlcCp <- c1_predict
c2_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
c2_conf # 75.95% Accuracy, same as Multinom Model using Boruta ! But, Kappa Statistic is better !

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        57   2        1    1         0
##   Low              9   2        0    0         0
##   Moderate         1   0        1    0         0
##   High             0   0        0    0         0
##   Very High        3   1        1    0         0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7595          
##                  95% CI : (0.6502, 0.8486)
##     No Information Rate : 0.8861          
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.2141          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8143    0.40000         0.33333
## Specificity                   0.5556    0.87838         0.98684
## Pos Pred Value                0.9344    0.18182         0.50000
## Neg Pred Value                0.2778    0.95588         0.97403
## Prevalence                    0.8861    0.06329         0.03797
## Detection Rate                0.7215    0.02532         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6849    0.63919         0.66009
##                      Class: High Class: Very High
## Sensitivity              0.00000               NA
## Specificity              1.00000          0.93671
## Pos Pred Value               NaN               NA
## Neg Pred Value           0.98734               NA
## Prevalence               0.01266          0.00000
## Detection Rate           0.00000          0.00000
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000               NA

# Kappa Statistic is the same for C5.0 models using both set of features !

# Feature selection using Step Function

step <- step(mn, direction = 'backward', trace = 1)

# Model using features from Step Function

# Multinom
mns <- multinom(xtr$AlcCp ~ sex + age + address + Fedu + Mjob + Fjob + reason + 
                  guardian + traveltime + studytime + failures + schoolsup + 
                  famsup + paid + activities + nursery + higher + internet + 
                  romantic + famrel + freetime + goout + health + absences + 
                  Average.Score, xtr)

## # weights:  275 (216 variable)
## initial  value 508.582380 
## iter  10 value 337.510226
## iter  20 value 164.774059
## iter  30 value 144.779235
## iter  40 value 137.337546
## iter  50 value 132.808184
## iter  60 value 130.088847
## iter  70 value 128.759449
## iter  80 value 123.513872
## iter  90 value 116.771726
## iter 100 value 106.838702
## final  value 106.838702 
## stopped after 100 iterations

# C5.0
cs <- C5.0(xtr$AlcCp ~ sex + age + address + Fedu + Mjob + Fjob + reason + 
                  guardian + traveltime + studytime + failures + schoolsup + 
                  famsup + paid + activities + nursery + higher + internet + 
                  romantic + famrel + freetime + goout + health + absences + 
                  Average.Score, xtr)

# Prediction using features from Step Function

# Multinom
mns_predict <- predict(mns, xte)
xte$AlcCp <- mns_predict
mns_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
mns_conf # 63.29 % Accuracy

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        47   5        4    3         2
##   Low              7   1        1    1         1
##   Moderate         1   0        1    0         0
##   High             0   0        0    0         0
##   Very High        0   1        1    2         1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6329          
##                  95% CI : (0.5169, 0.7386)
##     No Information Rate : 0.6962          
##     P-Value [Acc > NIR] : 0.909           
##                                           
##                   Kappa : 0.1744          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8545    0.14286         0.14286
## Specificity                   0.4167    0.86111         0.98611
## Pos Pred Value                0.7705    0.09091         0.50000
## Neg Pred Value                0.5556    0.91176         0.92208
## Prevalence                    0.6962    0.08861         0.08861
## Detection Rate                0.5949    0.01266         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6356    0.50198         0.56448
##                      Class: High Class: Very High
## Sensitivity              0.00000          0.25000
## Specificity              1.00000          0.94667
## Pos Pred Value               NaN          0.20000
## Neg Pred Value           0.92405          0.95946
## Prevalence               0.07595          0.05063
## Detection Rate           0.00000          0.01266
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000          0.59833

# C5.0
cs_predict <- predict(cs, xte)
xte$AlcCp <- mns_predict
cs_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
cs_conf # 63.29 % Accuracy, Same as Multinom with same feature set ! Kappa Statistic is also the same !

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Very Low Low Moderate High Very High
##   Very Low        47   5        4    3         2
##   Low              7   1        1    1         1
##   Moderate         1   0        1    0         0
##   High             0   0        0    0         0
##   Very High        0   1        1    2         1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6329          
##                  95% CI : (0.5169, 0.7386)
##     No Information Rate : 0.6962          
##     P-Value [Acc > NIR] : 0.909           
##                                           
##                   Kappa : 0.1744          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Very Low Class: Low Class: Moderate
## Sensitivity                   0.8545    0.14286         0.14286
## Specificity                   0.4167    0.86111         0.98611
## Pos Pred Value                0.7705    0.09091         0.50000
## Neg Pred Value                0.5556    0.91176         0.92208
## Prevalence                    0.6962    0.08861         0.08861
## Detection Rate                0.5949    0.01266         0.01266
## Detection Prevalence          0.7722    0.13924         0.02532
## Balanced Accuracy             0.6356    0.50198         0.56448
##                      Class: High Class: Very High
## Sensitivity              0.00000          0.25000
## Specificity              1.00000          0.94667
## Pos Pred Value               NaN          0.20000
## Neg Pred Value           0.92405          0.95946
## Prevalence               0.07595          0.05063
## Detection Rate           0.00000          0.01266
## Detection Prevalence     0.00000          0.06329
## Balanced Accuracy        0.50000          0.59833

# Visualise Target Variable

ggplot(xtr, aes(x = AlcCp)) + geom_bar()

# Transform Target Variable

# Training set
levels(xtr$AlcCp) <- c(1,2,3,4,5)
xtr$AlcCp <- as.numeric(xtr$AlcCp)
xtr$AlcCp[xtr$AlcCp > 2] <- 'Values >= 3'
xtr$AlcCp[xtr$AlcCp < 3] <- 'Values < 3'
xtr$AlcCp <- as.factor(xtr$AlcCp)

# Test Actuals set
levels(xtt$AlcCp) <- c(1,2,3,4,5)
xtt$AlcCp <- as.numeric(xtt$AlcCp)
xtt$AlcCp[xtt$AlcCp > 2] <- 'Values >= 3'
xtt$AlcCp[xtt$AlcCp < 3] <- 'Values < 3'
xtt$AlcCp <- as.factor(xtt$AlcCp)

# Model using features from C5imp Selection (C5.0) - w/o studytime after target variable transformation

ct_1 <- C5.0(xtr$AlcCp ~ sex + failures + schoolsup + higher + paid + School + guardian, xtr)

# Prediction using features from C5imp Selection (C5.0)

ct_1_predict <- predict(ct_1,xte)
xte$AlcCp <- ct_1_predict
ct_1_conf <- confusionMatrix(xtt$AlcCp,xte$AlcCp)
ct_1_conf # 91.14 % Accuracy !! :D

## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Values < 3 Values >= 3
##   Values < 3          72           0
##   Values >= 3          7           0
##                                           
##                Accuracy : 0.9114          
##                  95% CI : (0.8259, 0.9636)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 0.02334         
##                                           
##             Sensitivity : 0.9114          
##             Specificity :     NA          
##          Pos Pred Value :     NA          
##          Neg Pred Value :     NA          
##              Prevalence : 1.0000          
##          Detection Rate : 0.9114          
##    Detection Prevalence : 0.9114          
##       Balanced Accuracy :     NA          
##                                           
##        'Positive' Class : Values < 3      
##

Student Alcoholism

Eashwar

5/12/2017