Loading the glass data file from local drive Load data for project Glass
glass <- read.csv("C:/Users/lnun/Desktop/Chemometrics/For Sariya/glass.csv")
View(glass)
To generate a float indicator in order to make the multiclass problem, a two class problem. Type 1 and 3 were set to be equal to 1 while the others were set to be 0.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glass = glass %>% mutate(Float = ifelse(Type == 1 | Type == 3, 1, 0))
glass$Float = as.factor(glass$Float)
Allows for developing C5.0 model in order to generate and plot a decision tree. The C.50 model is an all-purpose classifer that is very efficient.
library(C50)
## Warning: package 'C50' was built under R version 3.6.3
nt <- 1
library(partykit)
## Warning: package 'partykit' was built under R version 3.6.3
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 3.6.3
## Loading required package: mvtnorm
m <- C5.0(Float ~ RI+Na+Mg+Al+Si+K+Ca+Ba+Fe,
data=glass, trials=nt, control=C5.0Control(earlyStopping = F))
summary(m)
##
## Call:
## C5.0.formula(formula = Float ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
## data = glass, trials = nt, control = C5.0Control(earlyStopping = F))
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Apr 20 15:43:55 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 214 cases (10 attributes) from undefined.data
##
## Decision tree:
##
## Mg <= 2.68: 0 (61)
## Mg > 2.68:
## :...Al > 1.41:
## :...Ca <= 8.32:
## : :...Fe <= 0.22: 0 (34/1)
## : : Fe > 0.22:
## : : :...RI <= 1.51629: 1 (2)
## : : RI > 1.51629: 0 (2)
## : Ca > 8.32:
## : :...Mg <= 3.15: 0 (4)
## : Mg > 3.15:
## : :...K <= 0.52: 0 (2)
## : K > 0.52: 1 (8/1)
## Al <= 1.41:
## :...Fe > 0.11:
## :...K <= 0.23: 1 (6)
## : K > 0.23:
## : :...Mg > 3.59: 0 (8)
## : Mg <= 3.59:
## : :...K <= 0.45: 0 (2)
## : K > 0.45: 1 (10/2)
## Fe <= 0.11:
## :...Si > 72.81: 1 (27)
## Si <= 72.81:
## :...Si <= 72.22:
## :...Ca <= 10.17: 1 (21)
## : Ca > 10.17: 0 (2)
## Si > 72.22:
## :...Mg > 3.72: 0 (6)
## Mg <= 3.72:
## :...RI > 1.52043: 0 (2)
## RI <= 1.52043:
## :...Si <= 72.74: 1 (11)
## Si > 72.74:
## :...RI <= 1.51694: 0 (2)
## RI > 1.51694: 1 (4)
##
##
## Evaluation on training data (214 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 19 4( 1.9%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 124 3 (a): class 0
## 1 86 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Mg
## 71.50% Al
## 64.95% Fe
## 35.05% Si
## 35.05% Ca
## 16.82% K
## 10.75% RI
##
##
## Time: 0.0 secs
plot(m, gp = gpar(fontsize = 8))
A Likelihood ratio and cross validation was done to predict the the probability of each type for each sample that is being tested. A 10 fold cross validation was done to the prediction of the LR for the prediction and ground truth.
require(dismo)
## Loading required package: dismo
## Warning: package 'dismo' was built under R version 3.6.3
## Loading required package: raster
## Loading required package: sp
## Warning: package 'sp' was built under R version 3.6.3
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
set.seed(88)
nt <- 1
folds <- kfold(glass, k=10, by=glass$Float)
tstGT <- 99
tstprediction <- 99
for(i in 1:10){
wtrain <- glass[folds!=i,]
wtest <- glass[folds==i,]
m <- C5.0(Float ~ RI+Na+Mg+Al+Si+K+Ca+Ba+Fe,
data=wtrain, trials=nt, control=C5.0Control(earlyStopping = F))
tst <- predict(m, newdata=wtest, trials=nt, type="prob")
tst <- data.frame(Float=wtest$Float, tst)
for(j in 1:dim(tst)[1]){
tstprediction <- c(tstprediction , tst[j,3]/tst[j,2] )
tstGT <- c(tstGT, as.numeric(as.character(tst$Float[j])))
}
if(i == 10){
plot(m, gp = gpar(fontsize = 8), main="Final Training Model")
}
}
The first value of the prediction and ground truth values were removed and changed the likelihood ratio to the log based 10 values.
tstGT <- tstGT[-1]
tstprediction <- tstprediction[-1]
summary(tstprediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00693 0.00787 0.12227 7.59447 11.41077 46.15789
summary(tstGT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4065 1.0000 1.0000
tstprediction <- log10(tstprediction)
summary(tstprediction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.1591 -2.1038 -0.9138 -0.4503 1.0573 1.6642
The model’s performance was examined by ROC analysis, ECE plot, DET plot and the Tippett plot.
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
ROCRpred <- prediction(tstprediction, tstGT)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf,downsampling=1, colorize = TRUE, print.cutoffs.at = c(-2,-1,0,1,2),text.adj=c(-.5,.5), lwd = 4, cex.axis=2, cex.lab=1.25, cex=2, lty=2)
abline(0,1, lty=1,lwd=1,col="red")
abline(1,-1,lty=3,lwd=1,col="blue")
auc.ROCperf = performance(ROCRpred, measure = "auc")
auc <- c(unlist(auc.ROCperf@y.values))
auc
## [1] 0.8833831
library(isotone)
#change wd to file location
source("ECE_function.R")
source("DET_function.R")
source("Tippett_function.R")
h1 <- tstprediction[which(tstGT==1)]
h2 <- tstprediction[which(tstGT==0)]
ECE_plot(as.matrix(10^h1),as.matrix(10^h2),"")
DET_plot(as.matrix(10^h1),as.matrix(10^h2),"")
abline(a=0, b=1, col="red")
Tippett_plot(as.matrix(10^h1),as.matrix(10^h2),"")