Load data from disk

Loading the glass data file from local drive Load data for project Glass

glass <- read.csv("C:/Users/lnun/Desktop/Chemometrics/For Sariya/glass.csv")
View(glass)

To generate a float indicator in order to make the multiclass problem, a two class problem. Type 1 and 3 were set to be equal to 1 while the others were set to be 0.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glass = glass %>% mutate(Float = ifelse(Type == 1 | Type == 3, 1, 0))
glass$Float = as.factor(glass$Float)

Allows for developing C5.0 model in order to generate and plot a decision tree. The C.50 model is an all-purpose classifer that is very efficient.

library(C50)
## Warning: package 'C50' was built under R version 3.6.3
nt <- 1
library(partykit)
## Warning: package 'partykit' was built under R version 3.6.3
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 3.6.3
## Loading required package: mvtnorm
m <- C5.0(Float ~ RI+Na+Mg+Al+Si+K+Ca+Ba+Fe,
          data=glass, trials=nt, control=C5.0Control(earlyStopping = F))

summary(m)
## 
## Call:
## C5.0.formula(formula = Float ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
##  data = glass, trials = nt, control = C5.0Control(earlyStopping = F))
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Apr 20 15:43:55 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 214 cases (10 attributes) from undefined.data
## 
## Decision tree:
## 
## Mg <= 2.68: 0 (61)
## Mg > 2.68:
## :...Al > 1.41:
##     :...Ca <= 8.32:
##     :   :...Fe <= 0.22: 0 (34/1)
##     :   :   Fe > 0.22:
##     :   :   :...RI <= 1.51629: 1 (2)
##     :   :       RI > 1.51629: 0 (2)
##     :   Ca > 8.32:
##     :   :...Mg <= 3.15: 0 (4)
##     :       Mg > 3.15:
##     :       :...K <= 0.52: 0 (2)
##     :           K > 0.52: 1 (8/1)
##     Al <= 1.41:
##     :...Fe > 0.11:
##         :...K <= 0.23: 1 (6)
##         :   K > 0.23:
##         :   :...Mg > 3.59: 0 (8)
##         :       Mg <= 3.59:
##         :       :...K <= 0.45: 0 (2)
##         :           K > 0.45: 1 (10/2)
##         Fe <= 0.11:
##         :...Si > 72.81: 1 (27)
##             Si <= 72.81:
##             :...Si <= 72.22:
##                 :...Ca <= 10.17: 1 (21)
##                 :   Ca > 10.17: 0 (2)
##                 Si > 72.22:
##                 :...Mg > 3.72: 0 (6)
##                     Mg <= 3.72:
##                     :...RI > 1.52043: 0 (2)
##                         RI <= 1.52043:
##                         :...Si <= 72.74: 1 (11)
##                             Si > 72.74:
##                             :...RI <= 1.51694: 0 (2)
##                                 RI > 1.51694: 1 (4)
## 
## 
## Evaluation on training data (214 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      19    4( 1.9%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     124     3    (a): class 0
##       1    86    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% Mg
##   71.50% Al
##   64.95% Fe
##   35.05% Si
##   35.05% Ca
##   16.82% K
##   10.75% RI
## 
## 
## Time: 0.0 secs
plot(m, gp = gpar(fontsize = 8))

A Likelihood ratio and cross validation was done to predict the the probability of each type for each sample that is being tested. A 10 fold cross validation was done to the prediction of the LR for the prediction and ground truth.

require(dismo)
## Loading required package: dismo
## Warning: package 'dismo' was built under R version 3.6.3
## Loading required package: raster
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.6.3
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
set.seed(88)
nt <- 1  
folds <- kfold(glass, k=10, by=glass$Float)
tstGT <- 99
tstprediction <- 99
for(i in 1:10){
  wtrain <- glass[folds!=i,]
  wtest <- glass[folds==i,]

  m <- C5.0(Float ~ RI+Na+Mg+Al+Si+K+Ca+Ba+Fe,
            data=wtrain, trials=nt, control=C5.0Control(earlyStopping = F))  
  tst <- predict(m, newdata=wtest, trials=nt, type="prob") 
 
  tst <- data.frame(Float=wtest$Float, tst)
  
  for(j in 1:dim(tst)[1]){
    tstprediction <- c(tstprediction , tst[j,3]/tst[j,2] )
    
    tstGT <- c(tstGT, as.numeric(as.character(tst$Float[j])))
  }
  if(i == 10){
    plot(m, gp = gpar(fontsize = 8), main="Final Training Model")
  }
}

The first value of the prediction and ground truth values were removed and changed the likelihood ratio to the log based 10 values.

tstGT <- tstGT[-1]

tstprediction <- tstprediction[-1]

summary(tstprediction)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00693  0.00787  0.12227  7.59447 11.41077 46.15789
summary(tstGT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4065  1.0000  1.0000
tstprediction <-  log10(tstprediction)
summary(tstprediction)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.1591 -2.1038 -0.9138 -0.4503  1.0573  1.6642

The model’s performance was examined by ROC analysis, ECE plot, DET plot and the Tippett plot.

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
ROCRpred <- prediction(tstprediction, tstGT)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf,downsampling=1, colorize = TRUE, print.cutoffs.at = c(-2,-1,0,1,2),text.adj=c(-.5,.5), lwd = 4, cex.axis=2, cex.lab=1.25, cex=2, lty=2)
abline(0,1, lty=1,lwd=1,col="red")
abline(1,-1,lty=3,lwd=1,col="blue")

auc.ROCperf = performance(ROCRpred, measure = "auc")
auc <- c(unlist(auc.ROCperf@y.values))

auc
## [1] 0.8833831
library(isotone)
#change wd to file location
source("ECE_function.R")
source("DET_function.R")
source("Tippett_function.R")
h1 <- tstprediction[which(tstGT==1)]
h2 <- tstprediction[which(tstGT==0)]
ECE_plot(as.matrix(10^h1),as.matrix(10^h2),"")

DET_plot(as.matrix(10^h1),as.matrix(10^h2),"")
abline(a=0, b=1, col="red")

Tippett_plot(as.matrix(10^h1),as.matrix(10^h2),"")