Data Set Information:

The target attribute for classification is Category: blood donors vs. Hepatitis C patients, including the progression of the disease ('just' Hepatitis C, Fibrosis, Cirrhosis). https://archive.ics.uci.edu/ml/datasets/HCV+data#

Attribute Information:

All attributes except Category and Sex are numerical. The laboratory data are the attributes 5-14.
1) X (Patient ID/No.)
2) Category (diagnosis) (values: ‘0=Blood Donor’, ‘0s=suspect Blood Donor’, ‘1=Hepatitis’, ‘2=Fibrosis’, ‘3=Cirrhosis’)
3) Age (in years)
4) Sex (f,m) f=1, m=0
5) ALB
6) ALP
7) ALT
8) AST
9) BIL
10) CHE
11) CHOL
12) CREA
13) GGT
14) PROT

Model 1

Classification task:
To which category does a patient belong, given 10 blood-protein measurements reflecting liver function?
(0) Healthy: suitable blood donor;
(1) Suspect: further tests required;
Has liver disease: (2) Hepatitis, (3) Fibrosis, (4) Cirrhosis

knitr::opts_chunk$set(echo = TRUE)
library(MASS)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
library(tibble)
library(ggplot2)
library(devtools)
## Loading required package: usethis
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
library(mlr)
## Loading required package: ParamHelpers
## Warning message: 'mlr' is in 'maintenance-only' mode since July 2019.
## Future development will only happen in 'mlr3'
## (<https://mlr3.mlr-org.com>). Due to the focus on 'mlr3' there might be
## uncaught bugs meanwhile in {mlr} - please consider switching.
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# for revalue()
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
# for missing values visualisation
library(naniar)
## Warning: package 'naniar' was built under R version 4.1.3
library(corrplot)
## corrplot 0.92 loaded
# for KNN
library(class)
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:mlr':
## 
##     impute
library(tree)
## Warning: package 'tree' was built under R version 4.1.3
# PCA
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(umap)
## Warning: package 'umap' was built under R version 4.1.3

Data preparation

hcvdata <- read.csv("C:\\duomenys\\hcv.csv")

Data preparation:
Remove the X column as uninformative.
Recode the Sex variable from f, m to 1, 0.
Assign numbers to the classes:
0 - suitable blood donor
Unsuitable:
1 - suspect
2 - Hepatitis
3 - Fibrosis
4 - Cirrhosis

hcvdata <- subset(hcvdata, select = -X)
hcvdata$Sex<- revalue(hcvdata$Sex, c("f"=1))
hcvdata$Sex<- revalue(hcvdata$Sex, c("m"=0))
hcvdata$Category<- revalue(hcvdata$Category, c("0=Blood Donor"=0))
hcvdata$Category<- revalue(hcvdata$Category, c("0s=suspect Blood Donor"=1))
hcvdata$Category<- revalue(hcvdata$Category, c("1=Hepatitis"=2))
hcvdata$Category<- revalue(hcvdata$Category, c("2=Fibrosis"=3))
hcvdata$Category<- revalue(hcvdata$Category, c("3=Cirrhosis"=4))
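The same recoding can be written in one pass with a named lookup vector; a minimal equivalent sketch, shown for reference only (category_map and sex_map are names introduced here; the revalue() chain above is what is actually run):

category_map <- c("0=Blood Donor" = "0", "0s=suspect Blood Donor" = "1",
                  "1=Hepatitis" = "2", "2=Fibrosis" = "3", "3=Cirrhosis" = "4")
sex_map <- c("f" = "1", "m" = "0")
# applied to a fresh copy of the raw CSV, this reproduces the recoding above:
# hcvdata$Category <- unname(category_map[hcvdata$Category])
# hcvdata$Sex      <- unname(sex_map[hcvdata$Sex])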

# Column names
names(hcvdata)
##  [1] "Category" "Age"      "Sex"      "ALB"      "ALP"      "ALT"     
##  [7] "AST"      "BIL"      "CHE"      "CHOL"     "CREA"     "GGT"     
## [13] "PROT"
# Sample of the data:
head(hcvdata, 6)
##   Category Age Sex  ALB  ALP  ALT  AST  BIL   CHE CHOL CREA  GGT PROT
## 1        0  32   0 38.5 52.5  7.7 22.1  7.5  6.93 3.23  106 12.1 69.0
## 2        0  32   0 38.5 70.3 18.0 24.7  3.9 11.17 4.80   74 15.6 76.5
## 3        0  32   0 46.9 74.7 36.2 52.6  6.1  8.84 5.20   86 33.2 79.3
## 4        0  32   0 43.2 52.0 30.6 22.6 18.9  7.33 4.74   80 33.8 75.7
## 5        0  32   0 39.2 74.1 32.6 24.8  9.6  9.15 4.32   76 29.9 68.7
## 6        0  32   0 41.6 43.3 18.5 19.7 12.3  9.92 6.05  111 91.0 74.0
# Number of observations
nrow(hcvdata)
## [1] 615
# Check for missing data
hcvdata[!complete.cases(hcvdata),]
##     Category Age Sex  ALB   ALP   ALT   AST   BIL   CHE CHOL  CREA   GGT PROT
## 122        0  43   0 48.6  45.0  10.5  40.5   5.3  7.09   NA  63.0  25.1 70.0
## 320        0  32   1 47.4  52.5  19.1  17.1   4.6 10.19   NA  63.0  23.0 72.2
## 330        0  33   1 42.4 137.2  14.2  13.1   3.4  8.23   NA  48.0  25.7 74.4
## 414        0  46   1 42.9  55.1  15.2  29.8   3.6  8.37   NA  61.0  29.0 71.9
## 425        0  48   1 45.6 107.2  24.4  39.0  13.8  9.77   NA  88.0  38.0 75.1
## 434        0  48   1 46.8  93.3  10.0  23.2   4.3 12.41   NA  52.0  23.9 72.4
## 499        0  57   1 48.4  94.4   2.5  39.6   2.3  8.84   NA  82.0   6.4 76.8
## 541        2  38   0 45.0  56.3    NA  33.1   7.0  9.58  6.0  77.9  18.9 63.0
## 542        2  19   0 41.0    NA  87.0  67.0  12.0  7.55  3.9  62.0  65.0 75.0
## 546        2  29   0 49.0    NA  53.0  39.0  15.0  8.79  3.6  79.0  37.0 90.0
## 547        2  30   0 45.0    NA  66.0  45.0  14.0 12.16  6.1  86.0  43.0 77.0
## 569        3  49   0 39.0    NA 118.0  62.0  10.0  7.28  3.5  72.0  74.0 81.0
## 570        3  49   0 46.0    NA 114.0  75.0  16.0 10.43  5.2  72.0  59.0 82.0
## 571        3  50   0 42.0    NA 258.0 106.0  15.0  8.74  4.7  77.0  80.0 84.0
## 572        3  53   0 46.0    NA  34.0  43.0  14.0  8.77  4.0 112.0 203.0 76.0
## 577        3  71   0 37.0    NA 130.0  90.0  15.0  9.92  4.7  79.0  77.0 76.0
## 582        3  49   1 39.0    NA  46.0  39.0   9.0 10.21  3.1  89.0  53.0 79.0
## 583        3  51   1 37.0    NA 164.0  70.0   9.0  3.99  4.2  67.0  43.0 72.0
## 584        3  56   1 39.0    NA  42.0  34.0  10.0  7.75  5.0  80.0  84.0 78.0
## 585        3  75   1 36.0    NA 114.0 125.0  14.0  6.65   NA  57.0 177.0 72.0
## 586        4  38   0 44.0    NA  94.0  60.0  12.0  4.37  3.2  61.0  99.0 77.0
## 591        4  46   0 20.0    NA  62.0 113.0 254.0  1.48   NA 114.0 138.0   NA
## 593        4  47   0 42.0    NA 159.0 102.0  11.0  6.29  5.5  58.0 201.0 79.0
## 604        4  65   0   NA    NA  40.0  54.0  13.0  7.50   NA  70.0 107.0 79.0
## 614        4  46   1 33.0    NA  39.0  62.0  20.0  3.56  4.2  52.0  50.0 71.0
## 615        4  59   1 36.0    NA 100.0  80.0  12.0  9.07  5.3  67.0  34.0 68.0
# Visualize the missing data
vis_miss(hcvdata)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Remove the rows with missing data
hcvdata<- na.omit(hcvdata)
# Check once more for missing data
hcvdata[!complete.cases(hcvdata),]
##  [1] Category Age      Sex      ALB      ALP      ALT      AST      BIL     
##  [9] CHE      CHOL     CREA     GGT      PROT    
## <0 rows> (or 0-length row.names)
# Number of observations we work with
nrow(hcvdata)
## [1] 589

Descriptive statistics

# min/max values
unique(hcvdata$Category)
## [1] "0" "1" "2" "3" "4"
unique(hcvdata$Sex )
## [1] "0" "1"
sapply(hcvdata, min)
## Category      Age      Sex      ALB      ALP      ALT      AST      BIL 
##      "0"     "23"      "0"   "14.9"   "11.3"    "0.9"   "10.6"    "0.8" 
##      CHE     CHOL     CREA      GGT     PROT 
##   "1.42"   "1.43"      "8"    "4.5"   "44.8"
sapply(hcvdata, max)
## Category      Age      Sex      ALB      ALP      ALT      AST      BIL 
##      "4"     "77"      "1"   "82.2"  "416.6"  "325.3"    "324"    "209" 
##      CHE     CHOL     CREA      GGT     PROT 
##  "16.41"   "9.67" "1079.1"  "650.9"   "86.5"
str(hcvdata)
## 'data.frame':    589 obs. of  13 variables:
##  $ Category: chr  "0" "0" "0" "0" ...
##  $ Age     : int  32 32 32 32 32 32 32 32 32 32 ...
##  $ Sex     : chr  "0" "0" "0" "0" ...
##  $ ALB     : num  38.5 38.5 46.9 43.2 39.2 41.6 46.3 42.2 50.9 42.4 ...
##  $ ALP     : num  52.5 70.3 74.7 52 74.1 43.3 41.3 41.9 65.5 86.3 ...
##  $ ALT     : num  7.7 18 36.2 30.6 32.6 18.5 17.5 35.8 23.2 20.3 ...
##  $ AST     : num  22.1 24.7 52.6 22.6 24.8 19.7 17.8 31.1 21.2 20 ...
##  $ BIL     : num  7.5 3.9 6.1 18.9 9.6 12.3 8.5 16.1 6.9 35.2 ...
##  $ CHE     : num  6.93 11.17 8.84 7.33 9.15 ...
##  $ CHOL    : num  3.23 4.8 5.2 4.74 4.32 6.05 4.79 4.6 4.1 4.45 ...
##  $ CREA    : num  106 74 86 80 76 111 70 109 83 81 ...
##  $ GGT     : num  12.1 15.6 33.2 33.8 29.9 91 16.9 21.5 13.7 15.9 ...
##  $ PROT    : num  69 76.5 79.3 75.7 68.7 74 74.5 67.1 71.3 69.9 ...
##  - attr(*, "na.action")= 'omit' Named int [1:26] 122 320 330 414 425 434 499 541 542 546 ...
##   ..- attr(*, "names")= chr [1:26] "122" "320" "330" "414" ...
summary(hcvdata)
##    Category              Age            Sex                 ALB       
##  Length:589         Min.   :23.00   Length:589         Min.   :14.90  
##  Class :character   1st Qu.:39.00   Class :character   1st Qu.:38.80  
##  Mode  :character   Median :47.00   Mode  :character   Median :41.90  
##                     Mean   :47.42                      Mean   :41.62  
##                     3rd Qu.:54.00                      3rd Qu.:45.10  
##                     Max.   :77.00                      Max.   :82.20  
##       ALP              ALT              AST              BIL        
##  Min.   : 11.30   Min.   :  0.90   Min.   : 10.60   Min.   :  0.80  
##  1st Qu.: 52.50   1st Qu.: 16.40   1st Qu.: 21.50   1st Qu.:  5.20  
##  Median : 66.20   Median : 22.70   Median : 25.70   Median :  7.10  
##  Mean   : 68.12   Mean   : 26.58   Mean   : 33.77   Mean   : 11.02  
##  3rd Qu.: 79.90   3rd Qu.: 31.90   3rd Qu.: 31.70   3rd Qu.: 11.00  
##  Max.   :416.60   Max.   :325.30   Max.   :324.00   Max.   :209.00  
##       CHE              CHOL            CREA              GGT       
##  Min.   : 1.420   Min.   :1.430   Min.   :   8.00   Min.   :  4.5  
##  1st Qu.: 6.930   1st Qu.:4.620   1st Qu.:  68.00   1st Qu.: 15.6  
##  Median : 8.260   Median :5.310   Median :  77.00   Median : 22.8  
##  Mean   : 8.204   Mean   :5.391   Mean   :  81.67   Mean   : 38.2  
##  3rd Qu.: 9.570   3rd Qu.:6.080   3rd Qu.:  89.00   3rd Qu.: 37.6  
##  Max.   :16.410   Max.   :9.670   Max.   :1079.10   Max.   :650.9  
##       PROT      
##  Min.   :44.80  
##  1st Qu.:69.30  
##  Median :72.10  
##  Mean   :71.89  
##  3rd Qu.:75.20  
##  Max.   :86.50

Basic visualizations

# visualizations
par(mfrow = c(1,1))
hist(hcvdata$Age, main = "AGE")

par(mfrow = c(2,2))
hist(hcvdata$PROT, main = "PROT")
hist(hcvdata$ALB, main = "ALB")
hist(hcvdata$ALP, main = "ALP")
hist(hcvdata$ALT, main = "ALT")

par(mfrow = c(2,2))
hist(hcvdata$AST, main = "AST")
hist(hcvdata$BIL, main = "BIL")
hist(hcvdata$CHE, main = "CHE")
hist(hcvdata$CHOL, main = "CHOL")

par(mfrow = c(1,2))
hist(hcvdata$CREA, main = "CREA")
hist(hcvdata$GGT, main = "GGT")

par(mfrow = c(1,1))
ggplot(data = hcvdata, aes(x = Sex, fill = Category)) + geom_bar()

ggpairs(hcvdata, mapping = ggplot2::aes(colour = Category))

hcvdata_num <-hcvdata
hcvdata_num$Category<-as.integer(hcvdata$Category)
hcvdata_num$Sex<-as.integer(hcvdata$Sex)
# Correlation matrix
corrplot(cor(hcvdata_num), method = "number", type = "upper")

# prepare the data for training
hcvdata_num_sex <-hcvdata
hcvdata_num_sex$Sex<-as.integer(hcvdata$Sex)

hcvdataTib <- as_tibble(hcvdata_num_sex)
hcvdataTib 
## # A tibble: 589 x 13
##    Category   Age   Sex   ALB   ALP   ALT   AST   BIL   CHE  CHOL  CREA   GGT
##    <chr>    <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 0           32     0  38.5  52.5   7.7  22.1   7.5  6.93  3.23   106  12.1
##  2 0           32     0  38.5  70.3  18    24.7   3.9 11.2   4.8     74  15.6
##  3 0           32     0  46.9  74.7  36.2  52.6   6.1  8.84  5.2     86  33.2
##  4 0           32     0  43.2  52    30.6  22.6  18.9  7.33  4.74    80  33.8
##  5 0           32     0  39.2  74.1  32.6  24.8   9.6  9.15  4.32    76  29.9
##  6 0           32     0  41.6  43.3  18.5  19.7  12.3  9.92  6.05   111  91  
##  7 0           32     0  46.3  41.3  17.5  17.8   8.5  7.01  4.79    70  16.9
##  8 0           32     0  42.2  41.9  35.8  31.1  16.1  5.82  4.6    109  21.5
##  9 0           32     0  50.9  65.5  23.2  21.2   6.9  8.69  4.1     83  13.7
## 10 0           32     0  42.4  86.3  20.3  20    35.2  5.46  4.45    81  15.9
## # ... with 579 more rows, and 1 more variable: PROT <dbl>
names(hcvdataTib)
##  [1] "Category" "Age"      "Sex"      "ALB"      "ALP"      "ALT"     
##  [7] "AST"      "BIL"      "CHE"      "CHOL"     "CREA"     "GGT"     
## [13] "PROT"

LDA

Linear Discriminant Analysis (LDA) is used to find a linear boundary between the classes, while Quadratic Discriminant Analysis (QDA) is used to find a non-linear boundary. LDA and QDA work best when the response classes are separable and the distribution of X = x within each class is normal. The more separable the classes and the closer the distributions are to normal, the better the LDA and QDA classification results will be.
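
For reference, LDA can also be fitted directly with MASS::lda(); a minimal sketch equivalent to the mlr wrapper used below (lda_direct is a name introduced here for illustration):

# direct LDA fit with MASS; the mlr learner "classif.lda" wraps the same function
lda_direct <- MASS::lda(Category ~ ., data = hcvdataTib)
lda_direct$prior                  # estimated class prior probabilities
head(predict(lda_direct)$class)   # predicted classes on the training data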

# LDA
hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
lda <- makeLearner("classif.lda")
ldaModel <- train(lda, hcvTask)
ldaModelData <- getLearnerModel(ldaModel)
ldaPreds <- predict(ldaModelData)$x
head(ldaPreds)
##          LD1        LD2         LD3       LD4
## 1  0.3206273  0.2069216 -0.07971733 0.7260860
## 2 -0.6333132  0.7078427 -0.41548770 0.9022183
## 3 -0.1358253  0.9575933 -0.02355799 0.4409699
## 4 -0.1042261  0.4581127 -0.03210145 1.0641987
## 5 -0.6981598 -0.6986649 -0.45418518 1.1595721
## 6  0.2503788  1.0710960 -1.52739124 2.2699185
hcvdataTib %>%
  mutate(LD1 = ldaPreds[, 1],
         LD2 = ldaPreds[, 2]) %>%
  ggplot(aes(LD1, LD2, col = Category)) +
  geom_point() +
  stat_ellipse() +
  theme_bw()
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

QDA (too few observations per class)

# QDA
# qda <- makeLearner("classif.qda")
# qdaModel <- train(qda, hcvTask)
# Error: some group is too small for 'qda'
# Unfortunately the groups are too small to run QDA
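
QDA estimates a separate covariance matrix for every class, so each class needs more observations than there are predictors (12 here); a quick look at the class sizes shows why it fails for Model 1:

# class sizes: the smallest classes have fewer cases than predictors
table(hcvdataTib$Category)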


Cross-validation: LDA

# K-fold cross-validation
kFold <- makeResampleDesc(method = "RepCV", folds = 7, reps = 50, stratify = TRUE)
ldaCVKF <- resample(learner = lda, task = hcvTask, resampling = kFold, measures = list(mmce, acc))
# K-fold cross-validation results

calculateConfusionMatrix(ldaCVKF$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           2           3           4          
##   0      1e+00/0.967 8e-04/0.106 8e-05/0.003 2e-03/0.092 0e+00/0.000
##   1      5e-01/0.006 5e-01/0.867 3e-03/0.001 7e-02/0.049 0e+00/0.000
##   2      4e-01/0.016 2e-03/0.011 3e-01/0.493 2e-01/0.327 5e-02/0.056
##   3      4e-01/0.009 0e+00/0.000 4e-01/0.345 2e-01/0.205 8e-03/0.005
##   4      4e-02/0.002 2e-03/0.016 9e-02/0.158 1e-01/0.327 7e-01/0.938
##   -err.-        0.03        0.13        0.51        0.80        0.06
##         predicted
## true     -err.-     
##   0      0.003      
##   1      0.534      
##   2      0.663      
##   3      0.818      
##   4      0.279      
##   -err.- 0.06       
## 
## 
## Absolute confusion matrix:
##         predicted
## true         0   1   2   3   4 -err.-
##   0      26229  20   2  49   0     71
##   1        160 163   1  26   0    187
##   2        435   2 337 174  52    663
##   3        250   0 236 109   5    491
##   4         50   3 108 174 865    335
##   -err.-   895  25 347 423  57   1747
ldaCVKF$aggr
## mmce.test.mean  acc.test.mean 
##     0.05928464     0.94071536

LOO LDA

# LOO cross-validation results
LOO <- makeResampleDesc(method = "LOO")
hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
ldaCVLOO <- resample(learner = lda, task = hcvTask, resampling = LOO, measures = list(mmce, acc))

ldaCVLOO$aggr
## mmce.test.mean  acc.test.mean 
##     0.06112054     0.93887946

Holdout LDA

# Holdout cross-validation results
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
# hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
holdoutCV_lda <- resample(learner = lda, task = hcvTask, resampling = holdout, measures = list(mmce, acc))

calculateConfusionMatrix(holdoutCV_lda$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           2           3           4          
##   0      0.991/0.963 0.000/0.000 0.000/0.000 0.009/0.333 0.000/0.000
##   1      0.500/0.009 0.500/0.333 0.000/0.000 0.000/0.000 0.000/0.000
##   2      0.500/0.018 0.250/0.333 0.250/0.500 0.000/0.000 0.000/0.000
##   3      0.333/0.009 0.000/0.000 0.333/0.500 0.333/0.333 0.000/0.000
##   4      0.000/0.000 0.200/0.333 0.000/0.000 0.200/0.333 0.600/1.000
##   -err.-        0.04        0.67        0.50        0.67        0.00
##         predicted
## true     -err.-     
##   0      0.009      
##   1      0.500      
##   2      0.750      
##   3      0.667      
##   4      0.400      
##   -err.- 0.07       
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1 2 3 4 -err.-
##   0      105 0 0 1 0      1
##   1        1 1 0 0 0      1
##   2        2 1 1 0 0      3
##   3        1 0 1 1 0      2
##   4        0 1 0 1 3      2
##   -err.-   4 2 1 2 0      9
holdoutCV_lda$aggr
## mmce.test.mean  acc.test.mean 
##          0.075          0.925

KNN

In statistics, the k-nearest neighbours (k-NN) algorithm is a non-parametric supervised learning method used for both classification and regression. In both cases the input consists of the k closest training examples in the data set; the output depends on whether k-NN is used for classification or regression. In k-NN classification the output is a class membership: an object is classified by a plurality vote of its neighbours, being assigned to the class most common among its k nearest neighbours, and if k = 1 the object is simply assigned to the class of its single nearest neighbour. In k-NN regression the output is a property value of the object, namely the average of the values of its k nearest neighbours. The algorithm identifies the k labelled cases (neighbours) closest to each unlabelled case, where k is an integer we specify; below we search for the k with which cases are assigned to their classes most accurately.
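
For illustration, the same idea as a direct call to class::knn() on a simple 80/20 split; a minimal sketch only (idx, feats and the standardization step are choices made here, separate from the mlr tuning pipeline below):

set.seed(123)
idx   <- sample(nrow(hcvdataTib), round(0.8 * nrow(hcvdataTib)))
feats <- scale(as.matrix(hcvdataTib[, -1]))   # kNN is distance-based, so standardize
knn_pred <- class::knn(train = feats[idx, ], test = feats[-idx, ],
                       cl = hcvdataTib$Category[idx], k = 25)
mean(knn_pred == hcvdataTib$Category[-idx])   # holdout accuracy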

hcvdataTask <- makeClassifTask(data = hcvdataTib, target = "Category")
knnParamSpace <- makeParamSet(makeDiscreteParam("k", values = 1:30))
gridSearch <- makeTuneControlGrid()
cvForTuning <- makeResampleDesc("RepCV", folds = 10, reps = 20)

tunedK <- tuneParams("classif.knn", task = hcvdataTask,
                     resampling = cvForTuning,
                     par.set = knnParamSpace, control = gridSearch)

knnTuningData <- generateHyperParsEffectData(tunedK)
plotHyperParsEffect(knnTuningData, x = "k", y = "mmce.test.mean",
                    plot.type = "line") +
    theme_bw()

Holdout

knn <- makeLearner("classif.knn", par.vals = list("k" = 25))
# Holdout cross-validation
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
holdoutCV <- resample(learner = knn, task = hcvdataTask,
                      resampling = holdout, measures = list(mmce, acc))
calculateConfusionMatrix(holdoutCV$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         2         3         4         -err.-   
##   0      1.00/0.89 0.00/0.00 0.00/0.00 0.00/0.00 0.00/0.00 0.00     
##   1      1.00/0.02 0.00/0.00 0.00/0.00 0.00/0.00 0.00/0.00 1.00     
##   2      1.00/0.03 0.00/0.00 0.00/0.00 0.00/0.00 0.00/0.00 1.00     
##   3      1.00/0.03 0.00/0.00 0.00/0.00 0.00/0.00 0.00/0.00 1.00     
##   4      0.80/0.03 0.00/0.00 0.00/0.00 0.20/1.00 0.00/0.00 1.00     
##   -err.-      0.11      0.00      0.00      1.00      0.00 0.12     
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1 2 3 4 -err.-
##   0      106 0 0 0 0      0
##   1        2 0 0 0 0      2
##   2        4 0 0 0 0      4
##   3        3 0 0 0 0      3
##   4        4 0 0 1 0      5
##   -err.-  13 0 0 1 0     14
holdoutCV$aggr
## mmce.test.mean  acc.test.mean 
##      0.1166667      0.8833333

k-Fold

kFold <- makeResampleDesc(method = "CV", iters = 7,
                          stratify = TRUE)
set.seed(123)
kFoldCV <- resample(learner = knn, task = hcvdataTask,
                    resampling = kFold, measures = list(mmce, acc))
kFoldCV$aggr
## mmce.test.mean  acc.test.mean 
##      0.1001672      0.8998328

LOO

LOO <- makeResampleDesc(method = "LOO")
set.seed(123)
kLOO <- resample(learner = knn, task = hcvdataTask,
                    resampling = LOO, measures = list(mmce, acc))
kLOO$aggr
## mmce.test.mean  acc.test.mean 
##     0.09847199     0.90152801

Linear SVM

A Support Vector Machine (SVM) is a supervised machine-learning algorithm for classifying already-labelled data; it can be used for both classification and regression tasks, although it is most often used for classification. The goal of an SVM is to find the separating line (in two dimensions), separating plane (in three dimensions) or separating hyperplane (in n dimensions, n > 3) with the largest margin to the nearest training observations.
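
Note that e1071::svm() performs classification only when the response is a factor; with the integer-coded Category used below it silently fits eps-regression (visible as "SVM-Type: eps-regression" in the output). A minimal classification sketch for comparison (svm_clf is a name introduced here):

set.seed(123)
svm_clf <- svm(as.factor(Category) ~ ., data = hcvdata_num,
               kernel = "linear", cost = 10, scale = TRUE)
mean(predict(svm_clf) == hcvdata_num$Category)   # training accuracy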

set.seed(123)
svmfit <- svm(Category ~., data = hcvdata_num, kernel = "linear", cost = 10, scale = FALSE)
print(svmfit)
## 
## Call:
## svm(formula = Category ~ ., data = hcvdata_num, kernel = "linear", 
##     cost = 10, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.08333333 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  305
hcvdata_numTib <- as_tibble(hcvdata_num)
hcvdata_numTask <- makeClassifTask(data = hcvdata_numTib, target = "Category")
cvForTuning <- makeResampleDesc("Holdout", split = 0.9)
kernels <- c("polynomial", "radial", "sigmoid")
svmParamSpace <- makeParamSet(makeDiscreteParam("kernel", values = kernels),
                              makeIntegerParam("degree", lower = 1, upper = 3),
                              makeNumericParam("cost", lower = 0.1, upper = 10),
                              makeNumericParam("gamma", lower = 0.1, 10))


randSearch <- makeTuneControlRandom(maxit = 10)
outer <- makeResampleDesc("CV", iters = 3)
svmWrapper <- makeTuneWrapper("classif.svm", resampling = cvForTuning,
                              par.set = svmParamSpace, control = randSearch)
cvWithTuning <- resample(learner = svmWrapper, task = hcvdata_numTask, resampling = outer, measures = list(mmce, acc))

calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           2           3           4          
##   0      0.989/0.944 0.000/0.000 0.010/0.357 0.002/0.111 0.000/0.000
##   1      0.429/0.005 0.286/1.000 0.000/0.000 0.286/0.222 0.000/0.000
##   2      0.650/0.024 0.000/0.000 0.200/0.286 0.100/0.222 0.050/0.077
##   3      0.500/0.011 0.000/0.000 0.417/0.357 0.083/0.111 0.000/0.000
##   4      0.375/0.016 0.000/0.000 0.000/0.000 0.125/0.333 0.500/0.923
##   -err.-        0.06        0.00        0.71        0.89        0.08
##         predicted
## true     -err.-     
##   0      0.01       
##   1      0.71       
##   2      0.80       
##   3      0.92       
##   4      0.50       
##   -err.- 0.08       
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1  2 3  4 -err.-
##   0      520 0  5 1  0      6
##   1        3 2  0 2  0      5
##   2       13 0  4 2  1     16
##   3        6 0  5 1  0     11
##   4        9 0  0 3 12     12
##   -err.-  31 0 10 8  1     50
cvWithTuning
## Resample Result
## Task: hcvdata_numTib
## Learner: classif.svm.tuned
## Aggr perf: mmce.test.mean=0.0849304,acc.test.mean=0.9150696
## Runtime: 1.13384

Non-Linear SVM

set.seed(123)
svmfit_nl <- svm(Category ~., data = hcvdata_num, kernel = "radial", cost = 5, scale = FALSE)
print(svmfit_nl)
## 
## Call:
## svm(formula = Category ~ ., data = hcvdata_num, kernel = "radial", 
##     cost = 5, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  5 
##       gamma:  0.08333333 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  589

Decision tree

Decision trees are a supervised machine-learning method in which the data are repeatedly split according to some parameter. A tree consists of decision nodes and leaves: the leaves are the decisions or final outcomes, and the decision nodes are the points where the data are split.
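
The tree() call below likewise treats the integer-coded Category as numeric and therefore grows a regression tree; a minimal classification-tree sketch with a factor response, for comparison (tree.hcv.clf is a name introduced here):

tree.hcv.clf <- tree(as.factor(Category) ~ ., data = hcvdata_num)
summary(tree.hcv.clf)   # reported as a classification tree, with a misclassification rate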

tree.hcv <- tree(Category~., data = hcvdata_num)
summary(tree.hcv)
## 
## Regression tree:
## tree(formula = Category ~ ., data = hcvdata_num)
## Variables actually used in tree construction:
## [1] "AST" "ALT" "CHE" "BIL"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.05806 = 33.73 / 581 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.58800 -0.01751 -0.01751  0.00000 -0.01751  1.98200
# Error in text.tree(tree.hcv, pretty = 0) : cannot plot singlenode tree
plot(tree.hcv)
text(tree.hcv, pretty = 0)

Category (diagnosis) (values: ‘0=Blood Donor’, ‘1=suspect Blood Donor’, ‘2=Hepatitis’, ‘3=Fibrosis’, ‘4=Cirrhosis’)

tree <- makeLearner("classif.rpart")
treeParamSpace <- makeParamSet(
  makeIntegerParam("minsplit", lower = 5, upper = 20),
  makeIntegerParam("minbucket", lower = 3, upper = 10),
  makeNumericParam("cp", lower = 0.01, upper = 0.1),
  makeIntegerParam("maxdepth", lower = 3, upper = 10))

randSearch <- makeTuneControlRandom(maxit = 200)
cvForTuning <- makeResampleDesc("CV", iters = 5)
library(parallelMap)
# parallelStartSocket(cpus = detectCores())
# parallelStartSocket(cpus=3L)
# parallelMap(function(x) {x+1}, x = 1:10)

tunedTreePars <- tuneParams(tree, task = hcvTask, resampling = cvForTuning, par.set = treeParamSpace, control = randSearch)
# parallelStop()

tunedTree <- setHyperPars(tree, par.vals = tunedTreePars$x)
tunedTreeModel <- train(tunedTree, hcvTask)
library(rpart.plot)
treeModelData <- getLearnerModel(tunedTreeModel)
par(mfrow = c(1,1))
rpart.plot(treeModelData, roundint = FALSE, box.palette = "BuBn", type = 5)

Holdout confusion matrix

holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
treeWrapper <- makeTuneWrapper("classif.rpart", resampling = holdout,
                               par.set = treeParamSpace,
                               control = randSearch)

set.seed(123)
cvWithTuning <- resample(treeWrapper, hcvTask, resampling = holdout)
calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           2           3           4          
##   0      0.972/0.972 0.000/0.000 0.009/0.250 0.000/0.000 0.019/0.250
##   1      0.500/0.009 0.000/0.000 0.000/0.000 0.000/0.000 0.500/0.125
##   2      0.500/0.019 0.000/0.000 0.250/0.250 0.000/0.000 0.250/0.125
##   3      0.000/0.000 0.000/0.000 0.667/0.500 0.333/0.500 0.000/0.000
##   4      0.000/0.000 0.000/0.000 0.000/0.000 0.200/0.500 0.800/0.500
##   -err.-        0.03        0.00        0.75        0.50        0.50
##         predicted
## true     -err.-     
##   0      0.03       
##   1      1.00       
##   2      0.75       
##   3      0.67       
##   4      0.20       
##   -err.- 0.09       
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1 2 3 4 -err.-
##   0      103 0 1 0 2      3
##   1        1 0 0 0 1      2
##   2        2 0 1 0 1      3
##   3        0 0 2 1 0      2
##   4        0 0 0 1 4      1
##   -err.-   3 0 3 1 4     11
cvWithTuning
## Resample Result
## Task: hcvdataTib
## Learner: classif.rpart.tuned
## Aggr perf: mmce.test.mean=0.0916667
## Runtime: 3.11266

k-fold

kFold <- makeResampleDesc(method = "CV", iters = 10)
set.seed(123)
treeWrapper <- makeTuneWrapper("classif.rpart", resampling = kFold,
                               par.set = treeParamSpace,
                               control = randSearch)

set.seed(123)
cvWithTuning <- resample(treeWrapper, hcvTask, resampling = kFold)
calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           2           3           4          
##   0      0.985/0.970 0.004/0.500 0.004/0.133 0.004/0.182 0.004/0.080
##   1      0.429/0.006 0.143/0.250 0.000/0.000 0.286/0.182 0.143/0.040
##   2      0.400/0.015 0.000/0.000 0.250/0.333 0.200/0.364 0.150/0.120
##   3      0.250/0.006 0.000/0.000 0.500/0.400 0.167/0.182 0.083/0.040
##   4      0.083/0.004 0.042/0.250 0.083/0.133 0.042/0.091 0.750/0.720
##   -err.-        0.03        0.75        0.67        0.82        0.28
##         predicted
## true     -err.-     
##   0      0.02       
##   1      0.86       
##   2      0.75       
##   3      0.83       
##   4      0.25       
##   -err.- 0.08       
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1  2 3  4 -err.-
##   0      518 2  2 2  2      8
##   1        3 1  0 2  1      6
##   2        8 0  5 4  3     15
##   3        3 0  6 2  1     10
##   4        2 1  2 1 18      6
##   -err.-  16 3 10 9  7     45
cvWithTuning
## Resample Result
## Task: hcvdataTib
## Learner: classif.rpart.tuned
## Aggr perf: mmce.test.mean=0.0763881
## Runtime: 191.881

Logistic regression

In statistics, a logistic model is a statistical model of the probability of an event in which the log-odds of the event are a linear combination of one or more independent variables. In regression analysis, logistic regression means estimating the parameters of such a logistic model.
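
mlr's classif.logreg wraps stats::glm(family = binomial) and therefore handles only two-class targets, which is why it errors on the five classes of Model 1 (see the note below). For a binary target it reduces to an ordinary glm; a minimal sketch on an ad hoc donor/non-donor indicator, anticipating Model 2 (donor_bin and logit_fit are names introduced here):

donor_bin <- as.integer(hcvdata_num$Category != 0)   # 0 = donor, 1 = any other class
logit_fit <- glm(donor_bin ~ . - Category, data = hcvdata_num, family = binomial)
head(predict(logit_fit, type = "response"))          # fitted P(not a donor)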

# "error"

PCA

pca <- select(hcvdataTib, -Category) %>%
  prcomp(center = TRUE, scale = TRUE)

pca
## Standard deviations (1, .., p=12):
##  [1] 1.5668907 1.3900040 1.1915014 1.0584385 1.0228089 0.9660590 0.8477556
##  [8] 0.7919503 0.7537225 0.6836635 0.6281485 0.5634074
## 
## Rotation (n x k) = (12 x 12):
##              PC1         PC2         PC3         PC4        PC5         PC6
## Age   0.17213362  0.10252754 -0.41023548  0.14193681 -0.3177054  0.61572451
## Sex   0.02232163 -0.28777343 -0.34385728  0.37416588 -0.2155273 -0.59874016
## ALB  -0.44898160  0.13241577  0.31360083  0.03629492 -0.2122775 -0.02396007
## ALP   0.17516274  0.41567639 -0.33749197 -0.11945767 -0.2004078 -0.32615576
## ALT  -0.02936048  0.43517007 -0.06337288  0.07341917  0.5932552 -0.10707058
## AST   0.33389741  0.28878072  0.33745539  0.31278700  0.0994621  0.01233662
## BIL   0.32868302  0.03715808  0.35302479  0.14286987 -0.3164594  0.17398305
## CHE  -0.43157941  0.28102652 -0.11430251 -0.06300265  0.1341679  0.13963448
## CHOL -0.31338712  0.26953078 -0.35463691  0.16732009 -0.2112596  0.13290959
## CREA  0.06271502  0.13318870  0.06777793 -0.77311560 -0.2958094 -0.14977330
## GGT   0.31000924  0.48272745  0.02057739  0.07207775 -0.1241001 -0.17607797
## PROT -0.36429669  0.19954441  0.33890157  0.25643756 -0.3739779 -0.15151647
##              PC7         PC8          PC9         PC10        PC11         PC12
## Age  -0.40222782  0.11834442 -0.287541439  0.147092025 -0.06499641 -0.083858454
## Sex  -0.14006184 -0.25479176 -0.179616045  0.229657649 -0.28938425 -0.014982901
## ALB  -0.14964561  0.31026594 -0.226894424 -0.215430799 -0.53416012  0.365657823
## ALP   0.23577159  0.49112753 -0.002312016  0.174225484  0.24496447  0.366956898
## ALT   0.02125241 -0.08223998 -0.632615549 -0.008330094 -0.02816322 -0.169300054
## AST  -0.36156593 -0.33559973  0.160109271  0.153726417  0.10817998  0.528237315
## BIL   0.64789996 -0.12833006 -0.272958552  0.236255770 -0.21101855 -0.063536028
## CHE   0.09525418 -0.12942477  0.360270983  0.669660226 -0.26816456 -0.065220739
## CHOL  0.30845151 -0.50782640  0.059759603 -0.452934921  0.11919029  0.192679024
## CREA -0.21495130 -0.41029262 -0.214264865  0.071561235 -0.02261506  0.002183808
## GGT  -0.11425456  0.05409684  0.376048555 -0.302109240 -0.37118365 -0.482084182
## PROT -0.16181351  0.04290011 -0.111979685  0.139113253  0.53410242 -0.372635732
fviz_pca_var(pca)

fviz_screeplot(pca, addlabels = TRUE, choice = "eigenvalue")

fviz_screeplot(pca, addlabels = TRUE, choice = "variance")

TDPca <- hcvdataTib %>%
  mutate(PCA1 = pca$x[, 1], PCA2 = pca$x[, 2])

ggplot(TDPca, aes(PCA1, PCA2, col = Category)) +
  geom_point() +
  theme_bw()

UMAP

UMAP (Uniform Manifold Approximation and Projection) is a non-linear dimensionality-reduction algorithm that, much like t-SNE, can be used for visualizing high-dimensional data as well as for general non-linear dimension reduction.
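
The umap() call below is stochastic, so the embedding changes from run to run; the 'umap' package's configuration exposes a random_state seed for reproducibility. A hedged sketch of a reproducible variant (assuming the package's default naive implementation):

# reproducible variant of the call below (random_state is a umap.defaults field)
# umap(as.matrix(dplyr::select(hcvdataTib, -Category)),
#      n_neighbors = 10, min_dist = 0.1, metric = "manhattan",
#      n_epochs = 200, random_state = 123)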

hcvdataUMAP <- dplyr::select(hcvdataTib, -Category) %>%
  as.matrix() %>%
  umap(n_neighbors = 10, min_dist = 0.1, metric = "manhattan",
       n_epochs = 200, verbose = TRUE)

hcvdataTibmap <- hcvdataTib %>%
  mutate_if(.funs = scale, .predicate = is.numeric, scale = FALSE) %>%
  mutate(UMAP1 = hcvdataUMAP$layout[, 1], UMAP2 = hcvdataUMAP$layout[, 2]) %>%
  gather(key = "Variable", value = "Value", c(-UMAP1, -UMAP2, -Category))

ggplot(hcvdataTibmap, aes(UMAP1, UMAP2, col = Category)) + geom_point() + theme_bw()

Model change: Model 2

Classification task:

To which category does a patient belong, given 10 blood-protein measurements reflecting liver function?
(0) Healthy: suitable blood donor;
(1) Suspect: further tests required.

From the 5 possible classes I switch to two classes: 0 = Blood Donor, 1 = NOT Blood Donor.
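
An equivalent one-step recoding on a copy, shown as a sketch (hcv_binary is a name introduced here; the revalue() chain below is what is actually run):

hcv_binary <- hcvdata
hcv_binary$Category <- ifelse(hcv_binary$Category == "0", "0", "1")
table(hcv_binary$Category)   # class balance after merging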

# Data recoding: healthy vs. unhealthy
# hcvdata$Sex<- revalue(hcvdata$Sex, c("f"=1))
# hcvdata$Sex<- revalue(hcvdata$Sex, c("m"=0))
hcvdata$Category<- revalue(hcvdata$Category, c("2"=1))
hcvdata$Category<- revalue(hcvdata$Category, c("3"=1))
hcvdata$Category<- revalue(hcvdata$Category, c("4"=1))

hcvdata$Sex<- revalue(hcvdata$Sex, c("f"=1))
## The following `from` values were not present in `x`: f
hcvdata$Sex<- revalue(hcvdata$Sex, c("m"=0))
## The following `from` values were not present in `x`: m
hcvdata_num <-hcvdata
hcvdata_num$Category<-as.integer(hcvdata$Category)
hcvdata_num$Sex<-as.integer(hcvdata$Sex)

LDA (2)

tail(hcvdata)
##     Category Age Sex ALB   ALP  ALT   AST BIL  CHE CHOL  CREA   GGT PROT
## 608        1  52   1  39  37.0  1.3  30.4  21 6.33 3.78 158.2 142.5 82.7
## 609        1  58   1  34  46.4 15.0 150.0   8 6.26 3.98  56.0  49.7 80.6
## 610        1  59   1  39  51.3 19.6 285.8  40 5.77 4.51 136.1 101.1 70.5
## 611        1  62   1  32 416.6  5.9 110.3  50 5.57 6.30  55.7 650.9 68.5
## 612        1  64   1  24 102.8  2.9  44.4  20 1.54 3.02  63.0  35.9 71.3
## 613        1  64   1  29  87.3  3.5  99.0  48 1.66 3.63  66.7  64.2 82.0
hcvdataTib <- as_tibble(hcvdata_num)
# LDA
hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
## Warning in makeTask(type = type, data = data, weights = weights, blocking =
## blocking, : Provided data is not a pure data.frame but from class tbl_df, hence
## it will be converted.
lda <- makeLearner("classif.lda")
ldaModel <- train(lda, hcvTask)
ldaModelData <- getLearnerModel(ldaModel)
ldaPreds <- predict(ldaModelData)$x

Cross-validation: k-fold LDA

# K-fold cross-validation
kFold <- makeResampleDesc(method = "RepCV", folds = 7, reps = 50, stratify = TRUE)
ldaCVKF <- resample(learner = lda, task = hcvTask, resampling = kFold, measures = list(mmce, acc))
# K-fold cross-validation results

calculateConfusionMatrix(ldaCVKF$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0          1          -err.-    
##   0      0.997/0.96 0.003/0.04 0.003     
##   1      0.386/0.04 0.614/0.96 0.386     
##   -err.-       0.04       0.04 0.04      
## 
## 
## Absolute confusion matrix:
##         predicted
## true         0    1 -err.-
##   0      26221   79     79
##   1       1216 1934   1216
##   -err.-  1216   79   1295
 ldaCVKF$aggr
## mmce.test.mean  acc.test.mean 
##     0.04396919     0.95603081

LOO LDA

# LOO cross-validation results
LOO <- makeResampleDesc(method = "LOO")
hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
ldaCVLOO <- resample(learner = lda, task = hcvTask, resampling = LOO, measures = list(mmce, acc))

ldaCVLOO$aggr
## mmce.test.mean  acc.test.mean 
##     0.04414261     0.95585739

Holdout LDA

# Holdout cross-validation results
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
# hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
holdoutCV_lda <- resample(learner = lda, task = hcvTask, resampling = holdout, measures = list(mmce, acc))

calculateConfusionMatrix(holdoutCV_lda$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0          1          -err.-    
##   0      0.991/0.97 0.009/0.09 0.009     
##   1      0.231/0.03 0.769/0.91 0.231     
##   -err.-       0.03       0.09 0.03      
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      105  1      1
##   1        3 10      3
##   -err.-   3  1      4
holdoutCV_lda$aggr
## mmce.test.mean  acc.test.mean 
##     0.03361345     0.96638655

QDA (2)

qda <- makeLearner("classif.qda")
qdaModel <- train(qda, hcvTask)

Cross-validation: k-fold QDA

# K-fold cross-validation
kFold <- makeResampleDesc(method = "RepCV", folds = 10, reps = 50, stratify = TRUE)
qdaCVKF <- resample(learner = qda, task = hcvTask, resampling = kFold, measures = list(mmce, acc))
# K-fold cross-validation results

calculateConfusionMatrix(qdaCVKF$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         -err.-   
##   0      0.97/0.98 0.03/0.25 0.03     
##   1      0.19/0.02 0.81/0.75 0.19     
##   -err.-      0.02      0.25 0.05     
## 
## 
## Absolute confusion matrix:
##         predicted
## true         0    1 -err.-
##   0      25469  831    831
##   1        594 2556    594
##   -err.-   594  831   1425
qdaCVKF$aggr
## mmce.test.mean  acc.test.mean 
##     0.04840183     0.95159817

LOO QDA

# LOO cross-validation results
LOO <- makeResampleDesc(method = "LOO")
hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
qdaCVLOO <- resample(learner = qda, task = hcvTask, resampling = LOO, measures = list(mmce, acc))

qdaCVLOO$aggr
## mmce.test.mean  acc.test.mean 
##      0.0475382      0.9524618

Holdout QDA

# Holdout cross-validation results
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
# hcvTask <- makeClassifTask(data = hcvdataTib, target = "Category")
holdoutCV_qda <- resample(learner = qda, task = hcvTask, resampling = holdout, measures = list(mmce, acc))

calculateConfusionMatrix(holdoutCV_qda$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         -err.-   
##   0      0.95/0.99 0.05/0.29 0.05     
##   1      0.08/0.01 0.92/0.71 0.08     
##   -err.-      0.01      0.29 0.05     
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      101  5      5
##   1        1 12      1
##   -err.-   1  5      6
holdoutCV_qda$aggr
## mmce.test.mean  acc.test.mean 
##     0.05042017     0.94957983
# With two classes LDA yields a single discriminant, so we plot its distribution per class
hcvdataTib %>% mutate(LD1 = ldaPreds[, 1]) %>% ggplot(aes(LD1, fill = as.factor(Category))) + geom_density(alpha = 0.5) + theme_bw()

KNN (2)

hcvdataTask <- makeClassifTask(data = hcvdataTib, target = "Category")
set.seed(123)
knnParamSpace <- makeParamSet(makeDiscreteParam("k", values = 1:30))
gridSearch <- makeTuneControlGrid()
cvForTuning <- makeResampleDesc("RepCV", folds = 10, reps = 20)

tunedK <- tuneParams("classif.knn", task = hcvdataTask,
                     resampling = cvForTuning,
                     par.set = knnParamSpace, control = gridSearch)

knnTuningData <- generateHyperParsEffectData(tunedK)
plotHyperParsEffect(knnTuningData, x = "k", y = "mmce.test.mean",
                    plot.type = "line") +
    theme_bw()

# Best result at k = 25
knn <- makeLearner("classif.knn", par.vals = list("k" = 25))

Holdout

# Holdout cross-validation
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
holdoutCV <- resample(learner = knn, task = hcvdataTask,
                      resampling = holdout, measures = list(mmce, acc))
calculateConfusionMatrix(holdoutCV$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0          1          -err.-    
##   0      0.991/0.96 0.009/0.10 0.009     
##   1      0.308/0.04 0.692/0.90 0.308     
##   -err.-       0.04       0.10 0.04      
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0 1 -err.-
##   0      105 1      1
##   1        4 9      4
##   -err.-   4 1      5
holdoutCV$aggr
## mmce.test.mean  acc.test.mean 
##     0.04201681     0.95798319

k-Fold

kFold <- makeResampleDesc(method = "CV", iters = 10,
                          stratify = TRUE)
set.seed(123)
kFoldCV <- resample(learner = knn, task = hcvdataTask,
                    resampling = kFold, measures = list(mmce, acc))
kFoldCV$aggr
## mmce.test.mean  acc.test.mean 
##     0.05946815     0.94053185

LOO

LOO <- makeResampleDesc(method = "LOO")
set.seed(123)
kLOO <- resample(learner = knn, task = hcvdataTask,
                    resampling = LOO, measures = list(mmce, acc))
kLOO$aggr
## mmce.test.mean  acc.test.mean 
##     0.05263158     0.94736842

Logistic regression (2)

Holdout

hcvdata_Task <- makeClassifTask(data = hcvdata_num, target = "Category")
logReg <- makeLearner("classif.logreg", predict.type = "prob")
logRegModel <- train(logReg, hcvdata_Task)

logRegWrapper <- makeImputeWrapper("classif.logreg")
holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
logRegwithImpute <- resample(logRegWrapper, hcvdata_Task,
                             resampling = holdout,
                             measures = list(acc, fpr, fnr))

calculateConfusionMatrix(logRegwithImpute$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         -err.-   
##   0      0.98/0.98 0.02/0.15 0.02     
##   1      0.15/0.02 0.85/0.85 0.15     
##   -err.-      0.02      0.15 0.03     
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      104  2      2
##   1        2 11      2
##   -err.-   2  2      4
logRegwithImpute
## Resample Result
## Task: hcvdata_num
## Learner: classif.logreg.imputed
## Aggr perf: acc.test.mean=0.9663866,fpr.test.mean=0.1538462,fnr.test.mean=0.0188679
## Runtime: 0.0159571

10-fold Cross-validation

kFold <- makeResampleDesc(method = "CV", iters = 10)
set.seed(123)

logRegwithImpute <- resample(logRegWrapper, hcvdata_Task,
                             resampling = kFold,
                             measures = list(acc, fpr, fnr))
logRegwithImpute
## Resample Result
## Task: hcvdata_num
## Learner: classif.logreg.imputed
## Aggr perf: acc.test.mean=0.9625950,fpr.test.mean=0.2536905,fnr.test.mean=0.0074824
## Runtime: 0.139625
calculateConfusionMatrix(logRegwithImpute$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0          1          -err.-    
##   0      0.992/0.97 0.008/0.08 0.008     
##   1      0.286/0.03 0.714/0.92 0.286     
##   -err.-       0.03       0.08 0.04      
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      522  4      4
##   1       18 45     18
##   -err.-  18  4     22
logRegwithImpute$aggr
## acc.test.mean fpr.test.mean fnr.test.mean 
##   0.962594974   0.253690476   0.007482376

After 10-fold CV, the overall classification accuracy of the logistic regression model reaches ~96%.

LOO Cross-validation

LOO <- makeResampleDesc(method = "LOO")
set.seed(123)
logRegwithImpute <- resample(logRegWrapper, hcvdata_Task,
                             resampling = LOO,
                             measures = list(acc))
logRegwithImpute$aggr
## acc.test.mean 
##     0.9643463

After LOO CV, the overall classification accuracy of the logistic regression model reaches ~96%.

Linear SVM (2)

set.seed(123)
svmfit <- svm(Category ~., data = hcvdata_num, kernel = "linear", cost = 10, scale = FALSE)
print(svmfit)
## 
## Call:
## svm(formula = Category ~ ., data = hcvdata_num, kernel = "linear", 
##     cost = 10, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.08333333 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  151
hcvdata_numTib <- as_tibble(hcvdata_num)
hcvdata_numTask <- makeClassifTask(data = hcvdata_numTib, target = "Category")
cvForTuning <- makeResampleDesc("Holdout", split = 0.9)
kernels <- c("polynomial", "radial", "sigmoid")
svmParamSpace <- makeParamSet(makeDiscreteParam("kernel", values = kernels),
                              makeIntegerParam("degree", lower = 1, upper = 3),
                              makeNumericParam("cost", lower = 0.1, upper = 10),
                              makeNumericParam("gamma", lower = 0.1, 10))


randSearch <- makeTuneControlRandom(maxit = 10)
outer <- makeResampleDesc("CV", iters = 3)
svmWrapper <- makeTuneWrapper("classif.svm", resampling = cvForTuning,
                              par.set = svmParamSpace, control = randSearch)
cvWithTuning <- resample(learner = svmWrapper, task = hcvdata_numTask, resampling = outer, measures = list(mmce, acc))

calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         -err.-   
##   0      0.98/0.97 0.02/0.18 0.02     
##   1      0.25/0.03 0.75/0.82 0.25     
##   -err.-      0.03      0.18 0.04     
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      516 10     10
##   1       16 47     16
##   -err.-  16 10     26
cvWithTuning
## Resample Result
## Task: hcvdata_numTib
## Learner: classif.svm.tuned
## Aggr perf: mmce.test.mean=0.0440968,acc.test.mean=0.9559032
## Runtime: 0.854714

Non-Linear SVM (2)

set.seed(123)
svmfit_nl <- svm(Category ~., data = hcvdata_num, kernel = "radial", cost = 5, scale = FALSE)
print(svmfit_nl)
## 
## Call:
## svm(formula = Category ~ ., data = hcvdata_num, kernel = "radial", 
##     cost = 5, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  5 
##       gamma:  0.08333333 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  589

Decision tree (2)

tree.hcv <- tree(Category~., data = hcvdata_num)
summary(tree.hcv)
## 
## Regression tree:
## tree(formula = Category ~ ., data = hcvdata_num)
## Variables actually used in tree construction:
## [1] "AST" "ALT" "ALB" "ALP" "BIL"
## Number of terminal nodes:  8 
## Residual mean deviance:  0.01534 = 8.911 / 581 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.600000 -0.005859 -0.005859  0.000000 -0.005859  0.994100
# Error in text.tree(tree.hcv, pretty = 0) : cannot plot singlenode tree
plot(tree.hcv)
text(tree.hcv, pretty = 0)

Category (diagnosis) (values: ‘0=Blood Donor’, ‘1=not Blood Donor’)

tree <- makeLearner("classif.rpart")
treeParamSpace <- makeParamSet(
  makeIntegerParam("minsplit", lower = 5, upper = 20),
  makeIntegerParam("minbucket", lower = 3, upper = 10),
  makeNumericParam("cp", lower = 0.01, upper = 0.1),
  makeIntegerParam("maxdepth", lower = 3, upper = 10))

randSearch <- makeTuneControlRandom(maxit = 200)
cvForTuning <- makeResampleDesc("CV", iters = 5)

tunedTreePars <- tuneParams(tree, task = hcvTask, resampling = cvForTuning, par.set = treeParamSpace, control = randSearch)

tunedTree <- setHyperPars(tree, par.vals = tunedTreePars$x)
tunedTreeModel <- train(tunedTree, hcvTask)
library(rpart.plot)
treeModelData <- getLearnerModel(tunedTreeModel)
par(mfrow = c(1,1))
rpart.plot(treeModelData, roundint = FALSE, box.palette = "BuBn", type = 5)

Category (diagnosis) (values: ‘0=Blood Donor’, ‘1=not Blood Donor’)

Holdout confusion matrix

holdout <- makeResampleDesc(method = "Holdout", split = 4/5, stratify = TRUE)
set.seed(123)
treeWrapper <- makeTuneWrapper("classif.rpart", resampling = holdout,
                               par.set = treeParamSpace,
                               control = randSearch)

set.seed(123)
cvWithTuning <- resample(treeWrapper, hcvTask, resampling = holdout)
calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0           1           -err.-     
##   0      0.991/0.991 0.009/0.077 0.009      
##   1      0.077/0.009 0.923/0.923 0.077      
##   -err.-       0.009       0.077 0.02       
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      105  1      1
##   1        1 12      1
##   -err.-   1  1      2
cvWithTuning
## Resample Result
## Task: hcvdataTib
## Learner: classif.rpart.tuned
## Aggr perf: mmce.test.mean=0.0168067
## Runtime: 3.88362

k-fold

kFold <- makeResampleDesc(method = "CV", iters = 10)
set.seed(123)
treeWrapper <- makeTuneWrapper("classif.rpart", resampling = kFold,
                               par.set = treeParamSpace,
                               control = randSearch)

set.seed(123)
cvWithTuning <- resample(treeWrapper, hcvTask, resampling = kFold)
calculateConfusionMatrix(cvWithTuning$pred, relative = TRUE)
## Relative confusion matrix (normalized by row/column):
##         predicted
## true     0         1         -err.-   
##   0      0.98/0.98 0.02/0.17 0.02     
##   1      0.21/0.02 0.79/0.83 0.21     
##   -err.-      0.02      0.17 0.04     
## 
## 
## Absolute confusion matrix:
##         predicted
## true       0  1 -err.-
##   0      516 10     10
##   1       13 50     13
##   -err.-  13 10     23
cvWithTuning
## Resample Result
## Task: hcvdataTib
## Learner: classif.rpart.tuned
## Aggr perf: mmce.test.mean=0.0390707
## Runtime: 207.956

PCA (2)

pca <- select(hcvdataTib, -Category) %>%
  prcomp(center = TRUE, scale = TRUE)

pca
## Standard deviations (1, .., p=12):
##  [1] 1.5668907 1.3900040 1.1915014 1.0584385 1.0228089 0.9660590 0.8477556
##  [8] 0.7919503 0.7537225 0.6836635 0.6281485 0.5634074
## 
## Rotation (n x k) = (12 x 12):
##              PC1         PC2         PC3         PC4        PC5         PC6
## Age   0.17213362  0.10252754 -0.41023548  0.14193681 -0.3177054  0.61572451
## Sex   0.02232163 -0.28777343 -0.34385728  0.37416588 -0.2155273 -0.59874016
## ALB  -0.44898160  0.13241577  0.31360083  0.03629492 -0.2122775 -0.02396007
## ALP   0.17516274  0.41567639 -0.33749197 -0.11945767 -0.2004078 -0.32615576
## ALT  -0.02936048  0.43517007 -0.06337288  0.07341917  0.5932552 -0.10707058
## AST   0.33389741  0.28878072  0.33745539  0.31278700  0.0994621  0.01233662
## BIL   0.32868302  0.03715808  0.35302479  0.14286987 -0.3164594  0.17398305
## CHE  -0.43157941  0.28102652 -0.11430251 -0.06300265  0.1341679  0.13963448
## CHOL -0.31338712  0.26953078 -0.35463691  0.16732009 -0.2112596  0.13290959
## CREA  0.06271502  0.13318870  0.06777793 -0.77311560 -0.2958094 -0.14977330
## GGT   0.31000924  0.48272745  0.02057739  0.07207775 -0.1241001 -0.17607797
## PROT -0.36429669  0.19954441  0.33890157  0.25643756 -0.3739779 -0.15151647
##              PC7         PC8          PC9         PC10        PC11         PC12
## Age  -0.40222782  0.11834442 -0.287541439  0.147092025 -0.06499641 -0.083858454
## Sex  -0.14006184 -0.25479176 -0.179616045  0.229657649 -0.28938425 -0.014982901
## ALB  -0.14964561  0.31026594 -0.226894424 -0.215430799 -0.53416012  0.365657823
## ALP   0.23577159  0.49112753 -0.002312016  0.174225484  0.24496447  0.366956898
## ALT   0.02125241 -0.08223998 -0.632615549 -0.008330094 -0.02816322 -0.169300054
## AST  -0.36156593 -0.33559973  0.160109271  0.153726417  0.10817998  0.528237315
## BIL   0.64789996 -0.12833006 -0.272958552  0.236255770 -0.21101855 -0.063536028
## CHE   0.09525418 -0.12942477  0.360270983  0.669660226 -0.26816456 -0.065220739
## CHOL  0.30845151 -0.50782640  0.059759603 -0.452934921  0.11919029  0.192679024
## CREA -0.21495130 -0.41029262 -0.214264865  0.071561235 -0.02261506  0.002183808
## GGT  -0.11425456  0.05409684  0.376048555 -0.302109240 -0.37118365 -0.482084182
## PROT -0.16181351  0.04290011 -0.111979685  0.139113253  0.53410242 -0.372635732
fviz_pca_var(pca)

fviz_screeplot(pca, addlabels = TRUE, choice = "eigenvalue")

fviz_screeplot(pca, addlabels = TRUE, choice = "variance")

TDPca <- hcvdataTib %>%
  mutate(PCA1 = pca$x[, 1], PCA2 = pca$x[, 2])

ggplot(TDPca, aes(PCA1, PCA2, col = Category)) +
  geom_point() +
  theme_bw()

UMAP (2)

hcvdataUMAP <- dplyr::select(hcvdataTib, -Category) %>%
  as.matrix() %>%
  umap(n_neighbors = 10, min_dist = 0.1, metric = "manhattan",
       n_epochs = 200, verbose = TRUE)

hcvdataTibmap <- hcvdataTib %>%
  mutate_if(.funs = scale, .predicate = is.numeric, scale = FALSE) %>%
  mutate(UMAP1 = hcvdataUMAP$layout[, 1], UMAP2 = hcvdataUMAP$layout[, 2]) %>%
  gather(key = "Variable", value = "Value", c(-UMAP1, -UMAP2, -Category))

ggplot(hcvdataTibmap, aes(UMAP1, UMAP2, col = Category)) + geom_point() + theme_bw()

Results

Summary table (classification accuracy):

                          Model 1    Model 2
LDA k-fold                94.0 %     95.6 %
LDA LOO                   93.9 %     95.6 %
LDA holdout               92.5 %     96.6 %
QDA k-fold                -          95.1 %
QDA LOO                   -          95.2 %
QDA holdout               -          95.0 %
KNN k-fold                90.0 %     94.1 %
KNN LOO                   90.2 %     94.7 %
KNN holdout               88.3 %     95.8 %
Logistic reg. k-fold      -          96.2 %
Logistic reg. LOO         -          96.4 %
Logistic reg. holdout     -          96.6 %
Linear SVM k-fold         91.5 %     95.6 %
Linear SVM LOO            -          -
Linear SVM holdout        -          -
Non-linear SVM k-fold     -          -
Non-linear SVM LOO        -          -
Non-linear SVM holdout    -          -
Tree k-fold               92.4 %     96.1 %
Tree LOO                  -          -
Tree holdout              90.8 %     98.3 %

Conclusions

The accuracy estimates of the different classifiers show that the most suitable classification algorithm for Model 1 is LDA, which achieved the highest accuracies under k-fold validation. For Model 2 the best results also come from LDA, with the highest accuracy under k-fold validation, and from logistic regression under holdout validation.

As could be expected, the validation results are better for Model 2: with the same amount of data, the number of classes was reduced from the original 5 to only 2.

Applying dimensionality-reduction methods showed that principal component analysis is the most suitable for reducing the dimensions. Plotting the data on the first two principal components shows the separation of the classes; using the non-linear t-SNE and UMAP methods, structure can also be seen in the distribution of the two classes.

The methodology of this analysis could be applied to data of a similar nature, i.e. with a few target classes and a similar amount of numeric data and number of dimensions.

Gediminas Kazėnas 2022-05-19