Modulos

library(readxl)
library(readr)
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("Methy1IT")
library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.0.5
#install.packages("viridis")
library(viridis)
## Warning: package 'viridis' was built under R version 4.0.5
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.0.5
#install.packages("paletteer")
library(paletteer)
## Warning: package 'paletteer' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.5
library(foreign)
library(nnet)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("report")
library(report)
## Warning: package 'report' was built under R version 4.0.5
library(mlogit)
## Warning: package 'mlogit' was built under R version 4.0.5
## Loading required package: dfidx
## Warning: package 'dfidx' was built under R version 4.0.5
## 
## Attaching package: 'dfidx'
## The following object is masked from 'package:stats':
## 
##     filter
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.5
library(caret)
# if (!require(remotes)) {
#     install.packages("remotes")
# }
# remotes::install_github('jorvlan/raincloudplots')

library(raincloudplots)

Leitura dos dados

# snv_data = as.data.frame(read.csv("snv_data.csv"))
# #snv_data[is.na(snv_data)] = 0
# snv_data = snv_data[,-c(1)]
# snv_data[is.na(snv_data)] = 0
snv_data_complete = as.data.frame(read.csv("snv_data.csv"))
snv_data_complete = snv_data_complete[,-c(1)]
snv_data_complete$clade = as.factor(snv_data_complete$clade)
snv_data_complete[is.na(snv_data_complete)] = 0
snv_data = as.data.frame(read.csv("snv_data_r.csv")) # > 200
snv_data[is.na(snv_data)] = 0
#snv_data = snv_data[,-c(1,2)] #,7,10,11,15,16,17,18,19,20,21,22
snv_data = snv_data[,-c(1,2)] # 3,4,5,6,7,8,9,10,14,15,16,17,18,19,21
snv_data
# snv_data = as.data.frame(read.csv("snv_data_r2.csv")) #> 5000
# #snv_data[is.na(snv_data)] = 0
# snv_data = snv_data[,-c(1,2)]
# snv_data
snv_data$clade = as.factor(snv_data$clade)
table(snv_data$clade) # we have some clades/variants that are very lowly represented. Remove all clades that are under 200 samples/patients(excluding 19A since it will be our reference for the multinomial logistic regression)
## 
##             19A             19B             20A             20B             20C 
##             137              60            1073            1042              71 
##             20D       20E (EU1)             20G  20H (Beta, V2) 20I (Alpha, V1) 
##             120            1273              16             117            4998 
## 20J (Gamma, V3)     21A (Delta)     21B (Kappa)       21D (Eta)      21F (Iota) 
##             198             116               9              25               2 
##    21G (Lambda)        21H (Mu)     21I (Delta)     21J (Delta) 
##               2              24             290           10537
table(snv_data_complete$clade) #remover todas abaixo de 200 samples - 19B, 20C, 20D, 20G,20H,20J,21A,21B,21D,21F,21G,21H,21I
## 
##             19A             19B             20A             20B             20C 
##             137              60            1073            1042              71 
##             20D       20E (EU1)             20G  20H (Beta, V2) 20I (Alpha, V1) 
##             120            1273              16             117            4998 
## 20J (Gamma, V3)     21A (Delta)     21B (Kappa)       21D (Eta)      21F (Iota) 
##             198             116               9              25               2 
##    21G (Lambda)        21H (Mu)     21I (Delta)     21J (Delta) 
##               2              24             290           10537
clades_sub_200 = c("19B", "20C", "20D", "20G","20H (Beta, V2)","21A (Delta)","21B (Kappa)","21D (Eta)","21F (Iota)","21G (Lambda)","21H (Mu)","21I (Delta)")


for (sub_clade in clades_sub_200) {
      snv_data_complete = snv_data_complete[!(snv_data_complete$clade == sub_clade),]
}

table(snv_data_complete$clade)
## 
##             19A             19B             20A             20B             20C 
##             137               0            1073            1042               0 
##             20D       20E (EU1)             20G  20H (Beta, V2) 20I (Alpha, V1) 
##               0            1273               0               0            4998 
## 20J (Gamma, V3)     21A (Delta)     21B (Kappa)       21D (Eta)      21F (Iota) 
##             198               0               0               0               0 
##    21G (Lambda)        21H (Mu)     21I (Delta)     21J (Delta) 
##               0               0               0           10537
snv_data_complete # We went from 20110 samples to 19258 when removing the clades < 200 samples
table(snv_data$clade) #remover todas abaixo de 200 samples - 19B, 20C, 20D, 20G,20H,20J,21A,21B,21D,21F,21G,21H,21I
## 
##             19A             19B             20A             20B             20C 
##             137              60            1073            1042              71 
##             20D       20E (EU1)             20G  20H (Beta, V2) 20I (Alpha, V1) 
##             120            1273              16             117            4998 
## 20J (Gamma, V3)     21A (Delta)     21B (Kappa)       21D (Eta)      21F (Iota) 
##             198             116               9              25               2 
##    21G (Lambda)        21H (Mu)     21I (Delta)     21J (Delta) 
##               2              24             290           10537
clades_sub_200 = c("19B", "20C", "20D", "20G","20H (Beta, V2)","21A (Delta)","21B (Kappa)","21D (Eta)","21F (Iota)","21G (Lambda)","21H (Mu)","21I (Delta)")

library(broom)
for (sub_clade in clades_sub_200) {
      snv_data = snv_data[!(snv_data$clade == sub_clade),]
}
snv_data$clade = droplevels(snv_data$clade)

levels(snv_data$clade)
## [1] "19A"             "20A"             "20B"             "20E (EU1)"      
## [5] "20I (Alpha, V1)" "20J (Gamma, V3)" "21J (Delta)"
table(snv_data$clade)
## 
##             19A             20A             20B       20E (EU1) 20I (Alpha, V1) 
##             137            1073            1042            1273            4998 
## 20J (Gamma, V3)     21J (Delta) 
##             198           10537
snv_data # We went from 20110 samples to 19258 when removing the clades < 200 samples

PCA

Aplicar PCA

x = PCA(snv_data_complete[,c(2:ncol(snv_data_complete))], graph = F, scale.unit=T, ncp=5) #scalling ja está como true como default
x
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 19258 individuals, described by 251 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
#PCA
res_pca = PCA(snv_data[,c(2:ncol(snv_data))], graph = F, scale.unit=T, ncp=5) #scalling ja está como true como default
res_pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 19258 individuals, described by 18 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
head(get_pca_ind(res_pca)$coord) #coordinates for the individual points
##       Dim.1     Dim.2      Dim.3      Dim.4     Dim.5
## 1 0.8180204 -6.317568 -1.0135386  0.5003879 0.5530899
## 2 0.7855081 -5.202732 -0.1480969 -1.1426849 0.6404721
## 3 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495
## 4 0.7855081 -5.202732 -0.1480969 -1.1426849 0.6404721
## 5 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495
## 6 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495

Obeter eigenvalues

# Eigenvalues
eig = as.data.frame(get_eig(res_pca))
eig

Plot cumsum da variancia

plot(eig$cumulative.variance.percent, xlab="Principal component", ylab="Accumulative prop. of varaition explained", ylim = c(0,100), xlim = c(1,10), type="b")

Scree plot

fviz_screeplot(x, addlabels = TRUE, ylim = c(0, 50))

# Scree plot 
fviz_screeplot(res_pca, addlabels = TRUE, ylim = c(0, 50))

Qualidade da representaçao

library(corrplot)
## corrplot 0.90 loaded
corrplot(res_pca$var$cos2, is.corr=F)

res_pca$var
## $coord
##                     Dim.1       Dim.2       Dim.3        Dim.4        Dim.5
## C.T           -0.23640425  0.82579106  0.09368659  0.065535723 -0.163585822
## A.G           -0.62369621  0.60201734  0.02889033  0.061573177  0.119039562
## G.A           -0.08248277  0.56927539  0.38200693 -0.713683759  0.077815339
## G.C            0.94001183  0.17030787  0.06074134 -0.031392159  0.001460332
## T.C            0.15167457  0.69492400  0.16263437  0.089222557 -0.629899195
## C.A            0.55635544  0.72059700 -0.11729627  0.094867763  0.113205348
## G.T           -0.80895142  0.45048052  0.04857301 -0.047664886 -0.076740343
## A.T            0.85304991  0.33898207  0.03013370  0.042133431 -0.047323872
## T.A            0.86045362  0.32225144  0.06620806  0.020662504  0.070658096
## T.G           -0.23423510  0.77432971 -0.17903095  0.115336985  0.201793390
## A.C            0.07548904 -0.02831914  0.91624129  0.322461159  0.198480534
## C.G           -0.90067470  0.30806004  0.02622667  0.101692576 -0.016660402
## A.del          0.03704702  0.87415699 -0.18746650  0.060780519  0.206595292
## TAT.del        0.85140024  0.35934206 -0.11437011  0.013855637  0.044233022
## TCTGGTTTT.del  0.92084375  0.33059061  0.05984170 -0.002812507  0.034220503
## TACATG.del     0.88416951  0.34404647 -0.09643531 -0.004987176  0.018810739
## AGTTCA.del    -0.83047889  0.42839642 -0.03063962  0.026210191  0.044371888
## GATTTC.del    -0.87726952  0.43428109 -0.03800711  0.028578169  0.055469322
## 
## $cor
##                     Dim.1       Dim.2       Dim.3        Dim.4        Dim.5
## C.T           -0.23640425  0.82579106  0.09368659  0.065535723 -0.163585822
## A.G           -0.62369621  0.60201734  0.02889033  0.061573177  0.119039562
## G.A           -0.08248277  0.56927539  0.38200693 -0.713683759  0.077815339
## G.C            0.94001183  0.17030787  0.06074134 -0.031392159  0.001460332
## T.C            0.15167457  0.69492400  0.16263437  0.089222557 -0.629899195
## C.A            0.55635544  0.72059700 -0.11729627  0.094867763  0.113205348
## G.T           -0.80895142  0.45048052  0.04857301 -0.047664886 -0.076740343
## A.T            0.85304991  0.33898207  0.03013370  0.042133431 -0.047323872
## T.A            0.86045362  0.32225144  0.06620806  0.020662504  0.070658096
## T.G           -0.23423510  0.77432971 -0.17903095  0.115336985  0.201793390
## A.C            0.07548904 -0.02831914  0.91624129  0.322461159  0.198480534
## C.G           -0.90067470  0.30806004  0.02622667  0.101692576 -0.016660402
## A.del          0.03704702  0.87415699 -0.18746650  0.060780519  0.206595292
## TAT.del        0.85140024  0.35934206 -0.11437011  0.013855637  0.044233022
## TCTGGTTTT.del  0.92084375  0.33059061  0.05984170 -0.002812507  0.034220503
## TACATG.del     0.88416951  0.34404647 -0.09643531 -0.004987176  0.018810739
## AGTTCA.del    -0.83047889  0.42839642 -0.03063962  0.026210191  0.044371888
## GATTTC.del    -0.87726952  0.43428109 -0.03800711  0.028578169  0.055469322
## 
## $cos2
##                     Dim.1        Dim.2        Dim.3        Dim.4        Dim.5
## C.T           0.055886969 0.6819308771 0.0087771765 4.294931e-03 2.676032e-02
## A.G           0.388996968 0.3624248762 0.0008346510 3.791256e-03 1.417042e-02
## G.A           0.006803407 0.3240744740 0.1459292952 5.093445e-01 6.055227e-03
## G.C           0.883622242 0.0290047692 0.0036895100 9.854676e-04 2.132568e-06
## T.C           0.023005174 0.4829193653 0.0264499378 7.960665e-03 3.967730e-01
## C.A           0.309531373 0.5192600435 0.0137584156 8.999892e-03 1.281545e-02
## G.T           0.654402404 0.2029326962 0.0023593376 2.271941e-03 5.889080e-03
## A.T           0.727694153 0.1149088411 0.0009080397 1.775226e-03 2.239549e-03
## T.A           0.740380424 0.1038459899 0.0043835074 4.269391e-04 4.992567e-03
## T.G           0.054866080 0.5995864967 0.0320520823 1.330262e-02 4.072057e-02
## A.C           0.005698595 0.0008019737 0.8394980976 1.039812e-01 3.939452e-02
## C.G           0.811214924 0.0949009887 0.0006878384 1.034138e-02 2.775690e-04
## A.del         0.001372481 0.7641504479 0.0351436899 3.694272e-03 4.268161e-02
## TAT.del       0.724882365 0.1291267190 0.0130805209 1.919787e-04 1.956560e-03
## TCTGGTTTT.del 0.847953204 0.1092901502 0.0035810289 7.910193e-06 1.171043e-03
## TACATG.del    0.781755730 0.1183679756 0.0092997687 2.487192e-05 3.538439e-04
## AGTTCA.del    0.689695193 0.1835234957 0.0009387862 6.869741e-04 1.968864e-03
## GATTTC.del    0.769601806 0.1886000623 0.0014445405 8.167118e-04 3.076846e-03
## 
## $contrib
##                     Dim.1       Dim.2       Dim.3        Dim.4        Dim.5
## C.T            0.65924941 13.61234506  0.76803045  0.638273002 4.450417e+00
## A.G            4.58865505  7.23453452  0.07303457  0.563421499 2.356633e+00
## G.A            0.08025381  6.46900399 12.76927052 75.694079200 1.007024e+00
## G.C           10.42331430  0.57897793  0.32284368  0.146451102 3.546601e-04
## T.C            0.27137180  9.63978206  2.31445243  1.183040493 6.598595e+01
## C.A            3.65126933 10.36519554  1.20390447  1.337480939 2.131294e+00
## G.T            7.71940951  4.05083561  0.20644943  0.337634950 9.793927e-01
## A.T            8.58396781  2.29374977  0.07945632  0.263817707 3.724517e-01
## T.A            8.73361659  2.07291896  0.38357063  0.063447745 8.302966e-01
## T.G            0.64720689 11.96862990  2.80465762  1.976912603 6.772098e+00
## A.C            0.06722131  0.01600858 73.45871363 15.452726017 6.551568e+00
## C.G            9.56918887  1.89436356  0.06018802  1.536840425 4.616155e-02
## A.del          0.01618996 15.25356883  3.07518297  0.549008532 7.098233e+00
## TAT.del        8.55079961  2.57755957  1.14458656  0.028530100 3.253888e-01
## TCTGGTTTT.del 10.00255804  2.18159242  0.31335125  0.001175540 1.947521e-01
## TACATG.del     9.22168468  2.36279920  0.81375890  0.003696236 5.884656e-02
## AGTTCA.del     8.13572750  3.66339938  0.08214673  0.102091748 3.274351e-01
## GATTTC.del     9.07831553  3.76473513  0.12640182  0.121372162 5.116996e-01

Contribuiçoes das variaveis para as componentes principais

corrplot(res_pca$var$contrib, is.corr=F)

Variaveis

# Extract the results for varaibles

var =  get_pca_var(res_pca)
var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Coordenadas das variaveis

#Coordinates of variables
head(var$coord)
##           Dim.1     Dim.2       Dim.3       Dim.4        Dim.5
## C.T -0.23640425 0.8257911  0.09368659  0.06553572 -0.163585822
## A.G -0.62369621 0.6020173  0.02889033  0.06157318  0.119039562
## G.A -0.08248277 0.5692754  0.38200693 -0.71368376  0.077815339
## G.C  0.94001183 0.1703079  0.06074134 -0.03139216  0.001460332
## T.C  0.15167457 0.6949240  0.16263437  0.08922256 -0.629899195
## C.A  0.55635544 0.7205970 -0.11729627  0.09486776  0.113205348

Contribuiçao das variaveis

#Contribution of variabls
head(var$contrib)
##           Dim.1      Dim.2       Dim.3      Dim.4        Dim.5
## C.T  0.65924941 13.6123451  0.76803045  0.6382730 4.450417e+00
## A.G  4.58865505  7.2345345  0.07303457  0.5634215 2.356633e+00
## G.A  0.08025381  6.4690040 12.76927052 75.6940792 1.007024e+00
## G.C 10.42331430  0.5789779  0.32284368  0.1464511 3.546601e-04
## T.C  0.27137180  9.6397821  2.31445243  1.1830405 6.598595e+01
## C.A  3.65126933 10.3651955  1.20390447  1.3374809 2.131294e+00

Grafico de contribuiçao de variaveis

# Control var colors using their contribution
fviz_pca_var(res_pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = T) #avoid text overlapping)

## Contribuiçao de variaveis para PC1 e PC2

# Contribution of variables to PC1
fviz_contrib(res_pca, choice = "var", axes=1, top=6)

# PC2
fviz_contrib(res_pca, choice = "var", axes=2, top=6)

Extrair os resultados para pontos individuais

# Extract the results for individuals
ind = get_pca_ind(res_pca)
head(ind$coord)
##       Dim.1     Dim.2      Dim.3      Dim.4     Dim.5
## 1 0.8180204 -6.317568 -1.0135386  0.5003879 0.5530899
## 2 0.7855081 -5.202732 -0.1480969 -1.1426849 0.6404721
## 3 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495
## 4 0.7855081 -5.202732 -0.1480969 -1.1426849 0.6404721
## 5 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495
## 6 0.5976934 -5.770061 -0.9072300  0.6297764 0.4379495

Dataframe com os pontos individuais do PCA

pca_val = data.frame(snv_data$clade, ind$coord)
pca_val$snv_data.clade = as.factor(pca_val$snv_data.clade)
pca_val
# viridis_pal(option = "D")(19)
# paletteer_c("scico::berlin", n = 19)
# 
# paletteer_d("ggsci::default_igv")-
# 

Grafico PCA

# Compute PCA using habillage to specify groups for colouring

fviz_pca_ind(res_pca,
             label = "none", # hide individual labels
             habillage = snv_data$clade, # color by groups
             palette = paletteer_d("ggsci::default_igv",n=7),
             addEllipses = TRUE # Concentration ellipses
             )

LOG REG MULTINOMIAL

Analise geral

#install.packages("jmv")
library(jmv)
## Warning: package 'jmv' was built under R version 4.0.5
descriptives(snv_data, freq =T)
## 
##  DESCRIPTIVES
## 
##  Descriptives                                                                                                                                                                                                                                                                
##  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
##                          clade    C.T         A.G         G.A          G.C         T.C         C.A         G.T         A.T          T.A          T.G          A.C           C.G          A.del        TAT.del      TCTGGTTTT.del    TACATG.del    AGTTCA.del    GATTTC.del   
##  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
##    N                     19258       19258       19258        19258       19258       19258       19258       19258        19258        19258        19258         19258        19258        19258        19258            19258         19258         19258         19258   
##    Missing                   0           0           0            0           0           0           0           0            0            0            0             0            0            0            0                0             0             0             0   
##    Mean                           13.84765    3.541489     2.271212    1.051875    2.312494    1.640098    6.021446    0.7860629    0.3087029    0.8061585    0.09746599     1.189220    0.8439090    0.2475854        0.2707966     0.2413023     0.5093468     0.5453837   
##    Median                         15.00000    4.000000     2.000000    0.000000    2.000000    2.000000    7.000000     0.000000     0.000000     1.000000      0.000000     2.000000     1.000000     0.000000         0.000000      0.000000      1.000000      1.000000   
##    Standard deviation             3.943129    1.552752    0.9980627    1.320035    1.141343    1.103948    3.805838     1.113177    0.4970579    0.4888214     0.3627553    0.9516123    0.4507638    0.4316208        0.4443828     0.4278843     0.4999256     0.4979490   
##    Minimum                        0.000000    0.000000     0.000000    0.000000    0.000000    0.000000    0.000000     0.000000     0.000000     0.000000      0.000000     0.000000     0.000000     0.000000         0.000000      0.000000      0.000000      0.000000   
##    Maximum                        25.00000    29.00000     7.000000    5.000000    17.00000    5.000000    17.00000     5.000000     2.000000     3.000000      4.000000     4.000000     2.000000     1.000000         1.000000      1.000000      1.000000      1.000000   
##  --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 
## 
## 
##  FREQUENCIES
## 
##  Frequencies of clade                                        
##  ----------------------------------------------------------- 
##    Levels             Counts    % of Total    Cumulative %   
##  ----------------------------------------------------------- 
##    19A                   137       0.71139         0.71139   
##    20A                  1073       5.57171         6.28310   
##    20B                  1042       5.41074        11.69384   
##    20E (EU1)            1273       6.61024        18.30408   
##    20I (Alpha, V1)      4998      25.95285        44.25693   
##    20J (Gamma, V3)       198       1.02814        45.28508   
##    21J (Delta)         10537      54.71492       100.00000   
##  ----------------------------------------------------------- 
## 
## 
##  Frequencies of G.A                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           1483       7.70070         7.70070   
##    1            615       3.19348        10.89417   
##    2          10491      54.47606        65.37024   
##    3           4947      25.68803        91.05826   
##    4           1380       7.16585        98.22411   
##    5            276       1.43317        99.65729   
##    6             61       0.31675        99.97404   
##    7              5    2.596324e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of G.C                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          10062      52.24842        52.24842   
##    1           3746      19.45166        71.70007   
##    2            464       2.40939        74.10946   
##    3           4367      22.67629        96.78575   
##    4            613       3.18309        99.96884   
##    5              6    3.115588e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of C.A                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           3271      16.98515        16.98515   
##    1           6153      31.95036        48.93551   
##    2           4507      23.40326        72.33877   
##    3           4934      25.62052        97.95929   
##    4            349       1.81223        99.77152   
##    5             44       0.22848       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of A.T                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          11769      61.11227        61.11227   
##    1           2272      11.79769        72.90996   
##    2           3007      15.61429        88.52425   
##    3           1997      10.36972        98.89397   
##    4            204       1.05930        99.95327   
##    5              9    4.673382e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of T.A                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          13637      70.81213        70.81213   
##    1           5297      27.50545        98.31758   
##    2            324       1.68242       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of T.G                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           4515      23.44480        23.44480   
##    1          13975      72.56724        96.01205   
##    2            754       3.91526        99.92730   
##    3             14    7.269706e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of A.C                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          17756      92.20064        92.20064   
##    1           1165       6.04943        98.25008   
##    2            306       1.58895        99.83903   
##    3             24       0.12462        99.96365   
##    4              7    3.634853e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of C.G                                 
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           7138      37.06512        37.06512   
##    1           1440       7.47741        44.54253   
##    2          10580      54.93821        99.48074   
##    3             98       0.50888        99.98961   
##    4              2    1.038529e-4       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of A.del                               
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           3694      19.18164        19.18164   
##    1          14876      77.24582        96.42746   
##    2            688       3.57254       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of TAT.del                             
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          14490      75.24146        75.24146   
##    1           4768      24.75854       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of TCTGGTTTT.del                       
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          14043      72.92034        72.92034   
##    1           5215      27.07966       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of TACATG.del                          
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0          14611      75.86977        75.86977   
##    1           4647      24.13023       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of AGTTCA.del                          
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           9449      49.06532        49.06532   
##    1           9809      50.93468       100.00000   
##  -------------------------------------------------- 
## 
## 
##  Frequencies of GATTTC.del                          
##  -------------------------------------------------- 
##    Levels    Counts    % of Total    Cumulative %   
##  -------------------------------------------------- 
##    0           8755      45.46163        45.46163   
##    1          10503      54.53837       100.00000   
##  --------------------------------------------------

Modelo Multinomial - Completo

Matriz de correlaçao com dados originais

library(ggcorrplot)

MatCorr=cor(snv_data[,sapply(snv_data,is.numeric)])
# MatCorr[upper.tri(MatCorr)] <- 0
# diag(MatCorr) <- 0
# MatCorrNew = snv_data[, !apply(MatCorr, 2, function(x) any(abs(x) > 0.70, na.rm = TRUE))]
# head(MatCorrNew)
#ggcorrplot(MatCorr)
ggcorrplot(MatCorr,hc.order = TRUE,type = "lower",lab = TRUE)

Dados de treino e teste - Completo

index <- createDataPartition(snv_data$clade, p = .75, list = FALSE)
train <- snv_data[index,]
test <- snv_data[-index,]

# Set the reference
snv_data$clade = relevel(snv_data$clade, ref = "19A")

Modelo completo

# Training the multinomial classification model
multinom_model_0 = multinom(clade ~ 1, data=train)
## # weights:  14 (6 variable)
## initial  value 28110.618013 
## iter  10 value 18220.901184
## final  value 18213.843066 
## converged
summary(multinom_model_0)
## Call:
## multinom(formula = clade ~ 1, data = train)
## 
## Coefficients:
##                 (Intercept)
## 20A               2.0537277
## 20B               2.0248639
## 20E (EU1)         2.2249170
## 20I (Alpha, V1)   3.5925160
## 20J (Gamma, V3)   0.3680177
## 21J (Delta)       4.3381867
## 
## Std. Errors:
##                 (Intercept)
## 20A              0.10455322
## 20B              0.10472710
## 20E (EU1)        0.10361400
## 20I (Alpha, V1)  0.09977704
## 20J (Gamma, V3)  0.12804046
## 21J (Delta)      0.09907207
## 
## Residual Deviance: 36427.69 
## AIC: 36439.69
multinom_model_completo = multinom(clade ~ ., data = train) #C.T+A.G+G.A+G.C+T.C+C.A+G.T+A.C+C.G+A.del+TAT.del+TCTGGTTTT.del+TACATG.del+GATTTC.del
## # weights:  140 (114 variable)
## initial  value 28110.618013 
## iter  10 value 6381.341107
## iter  20 value 5003.825041
## iter  30 value 3088.613327
## iter  40 value 1467.644761
## iter  50 value 830.139232
## iter  60 value 588.077634
## iter  70 value 466.698407
## iter  80 value 402.163323
## iter  90 value 375.773024
## iter 100 value 362.921520
## final  value 362.921520 
## stopped after 100 iterations
summary(multinom_model_completo)
## Call:
## multinom(formula = clade ~ ., data = train)
## 
## Coefficients:
##                 (Intercept)       C.T      A.G        G.A      G.C        T.C
## 20A               -2.232188 0.5631243 2.078674  0.8866774 1.676296 -0.2765780
## 20B               -7.150483 0.4108020 2.030366  3.0356399 8.139922 -1.1977245
## 20E (EU1)         -8.690658 0.9793905 0.536932 -0.8610208 8.485051  0.1428766
## 20I (Alpha, V1)  -26.253006 0.6920424 3.064677  1.9015196 8.900513 -0.6761139
## 20J (Gamma, V3)  -30.289500 0.2754616 2.950645  3.4052558 7.098903 -0.7900014
## 21J (Delta)      -21.530845 0.2633287 4.152028  1.6234129 3.693423 -0.8058056
##                        C.A        G.T        A.T        T.A        T.G
## 20A             -0.3613536 -0.5453304 -0.6014496 -0.9325553 -0.2267948
## 20B              1.5467829 -1.7835614 -0.3222343 -0.3903785 -0.3979370
## 20E (EU1)       -1.7482932 -1.7864131 -0.7562684 -0.3691549 -2.8857180
## 20I (Alpha, V1)  5.8535623 -1.9068270 -1.0377637 -2.7269900  0.4649198
## 20J (Gamma, V3)  4.2324776 -1.7080066  1.6549862 -1.2232224  0.1065285
## 21J (Delta)      3.1827297 -0.1270031 -2.3476473 -3.9741785  3.5425692
##                       A.C       C.G      A.del    TAT.del TCTGGTTTT.del
## 20A             4.6005091 -3.700154  -1.082457  -1.185638      1.076975
## 20B             5.2972711 -3.017966  -5.223096  -2.543260      5.686605
## 20E (EU1)       1.7434858  7.334826   3.076305   3.372635     -7.132551
## 20I (Alpha, V1) 0.4001902 -5.382867   3.544194  -1.419859      8.230024
## 20J (Gamma, V3) 8.0442153  6.474189 -10.882142 -10.961137     14.858393
## 21J (Delta)     5.1291340  1.557868   4.160116   7.446166     -4.651873
##                 TACATG.del AGTTCA.del GATTTC.del
## 20A              -2.806915  -2.955139  -1.537141
## 20B             -11.541670   1.588817   8.723889
## 20E (EU1)        -7.006095  -3.260034  -7.756161
## 20I (Alpha, V1)  -6.521287  -2.676058  16.243408
## 20J (Gamma, V3)  -5.706002  -4.863390  12.465974
## 21J (Delta)      -5.235041  -5.115652   3.537802
## 
## Std. Errors:
##                 (Intercept)        C.T       A.G       G.A      G.C       T.C
## 20A               0.5126303 0.09462691 0.3668723 0.3024753 1.549421 0.2651636
## 20B               0.8295735 0.11575380 0.4038445 0.3608313 1.635832 0.3520679
## 20E (EU1)         1.4511831 0.14384580 0.5972336 0.5398976 1.806074 0.4906998
## 20I (Alpha, V1)   6.6824179 0.30921626 0.9268575 0.7847808 1.990003 0.9096547
## 20J (Gamma, V3)  20.3458880 1.02008554 3.2455845 2.1515772 3.327715 1.2803225
## 21J (Delta)       5.5657601 0.27054904 0.9755279 0.7186657 2.073235 0.4762232
##                       C.A       G.T       A.T       T.A       T.G      A.C
## 20A             0.7576483 0.1418155 0.7241551 0.9931413 0.9866133 12.98679
## 20B             0.8827966 0.1888619 0.7694232 1.2235104 1.0629017 12.99318
## 20E (EU1)       1.0987364 0.3194653 0.9143865 1.7431027 1.1978133 13.01619
## 20I (Alpha, V1) 1.3737033 0.7177870 1.5436548 3.1776625 2.0847932 13.13035
## 20J (Gamma, V3) 4.1005105 1.1013066 2.8223454 5.3562623 6.9329937 13.41250
## 21J (Delta)     1.3623068 0.5394812 1.6266465 2.7143761 1.8204771 13.08769
##                      C.G    A.del  TAT.del TCTGGTTTT.del TACATG.del AGTTCA.del
## 20A             1.686605 1.508414 2.473818      3.550204   2.497784   16.66562
## 20B             1.861300 1.952115 3.311980      3.722145   3.268168   33.87846
## 20E (EU1)       1.931398 1.966098 3.861787      4.549092   3.385109   14.27703
## 20I (Alpha, V1) 3.115632 3.063053 3.776505      4.528164   3.757268   14.23281
## 20J (Gamma, V3) 5.334915 6.394490 7.465190      7.398119   6.163452   48.22829
## 21J (Delta)     2.116033 2.658455 4.992465      7.553366   3.723817   12.77709
##                 GATTTC.del
## 20A               15.88376
## 20B               33.73521
## 20E (EU1)         15.15451
## 20I (Alpha, V1)   14.50023
## 20J (Gamma, V3)   48.88168
## 21J (Delta)       13.11717
## 
## Residual Deviance: 725.843 
## AIC: 953.843
#Convert the coeff to odds
#install.packages("broom")
library(broom)

resumo_multinom_model_completo = tidy(multinom_model_completo, conf.int=T, conf.level = 0.95, exponentiate = T) # Este código já tem 1) Coeficientes em exponencial, 2) z-scores, 3) standard errors, 4) respetivos p-values para Wald Z test bilateral
resumo_multinom_model_completo
# Predicted values are saved as fitted.values in the model object

head(round(fitted(multinom_model_completo),2))
##    19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3) 21J (Delta)
## 2 0.00 0.01 0.99         0               0               0           0
## 4 0.00 0.01 0.99         0               0               0           0
## 5 0.11 0.89 0.00         0               0               0           0
## 7 0.01 0.96 0.03         0               0               0           0
## 8 0.11 0.89 0.00         0               0               0           0
## 9 0.08 0.91 0.00         0               0               0           0
#The multinomial regression predicts the probability of a particular observation to be part of the said level. This is what we are seeing in the above table. Columns represent the classification levels and rows represent the observations. This means that the first six observation are classified as certain clades

Modelo stepAIC1 (passagem de modelo completo para modelo com 1º step)

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dfidx':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
step1=stepAIC(multinom_model_completo, direction="both")
## Start:  AIC=953.84
## clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## 
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 2044.470892
## iter  20 value 1197.486180
## iter  30 value 620.517384
## iter  40 value 457.933356
## iter  50 value 409.225179
## iter  60 value 397.903307
## iter  70 value 387.294104
## iter  80 value 381.656989
## iter  90 value 377.334823
## iter 100 value 372.852830
## final  value 372.852830 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6004.888113
## iter  20 value 4274.741209
## iter  30 value 2729.838520
## iter  40 value 1536.791636
## iter  50 value 813.009285
## iter  60 value 588.960420
## iter  70 value 488.009152
## iter  80 value 428.916595
## iter  90 value 399.296734
## iter 100 value 385.913149
## final  value 385.913149 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 5122.446547
## iter  20 value 2925.316847
## iter  30 value 2235.856244
## iter  40 value 1384.315548
## iter  50 value 902.058166
## iter  60 value 645.535924
## iter  70 value 570.994887
## iter  80 value 522.994939
## iter  90 value 500.420978
## iter 100 value 489.555683
## final  value 489.555683 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 3053.122412
## iter  20 value 1982.713930
## iter  30 value 1581.573217
## iter  40 value 1145.226467
## iter  50 value 814.138559
## iter  60 value 668.465423
## iter  70 value 611.980836
## iter  80 value 585.563730
## iter  90 value 567.410519
## iter 100 value 557.665137
## final  value 557.665137 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6020.763192
## iter  20 value 2764.925357
## iter  30 value 1889.578620
## iter  40 value 1275.998795
## iter  50 value 719.308720
## iter  60 value 506.405243
## iter  70 value 429.713038
## iter  80 value 387.707209
## iter  90 value 362.746165
## iter 100 value 355.781312
## final  value 355.781312 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 5049.066538
## iter  20 value 3444.220002
## iter  30 value 2454.843183
## iter  40 value 1365.780662
## iter  50 value 772.630445
## iter  60 value 559.582298
## iter  70 value 467.100355
## iter  80 value 428.680253
## iter  90 value 397.389225
## iter 100 value 382.807724
## final  value 382.807724 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 2131.225543
## iter  20 value 1289.632625
## iter  30 value 1012.110753
## iter  40 value 728.484616
## iter  50 value 621.097331
## iter  60 value 541.757593
## iter  70 value 500.237016
## iter  80 value 464.547806
## iter  90 value 453.059843
## iter 100 value 445.542810
## final  value 445.542810 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 3132.810537
## iter  20 value 2170.994743
## iter  30 value 1616.312391
## iter  40 value 1003.524784
## iter  50 value 675.697652
## iter  60 value 491.487329
## iter  70 value 429.061871
## iter  80 value 398.413262
## iter  90 value 371.277070
## iter 100 value 348.789370
## final  value 348.789370 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6347.663368
## iter  20 value 4917.748400
## iter  30 value 2862.291657
## iter  40 value 1613.835322
## iter  50 value 929.245947
## iter  60 value 588.033627
## iter  70 value 465.106714
## iter  80 value 408.391779
## iter  90 value 369.841040
## iter 100 value 359.230174
## final  value 359.230174 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6414.110117
## iter  20 value 5015.747767
## iter  30 value 2749.661700
## iter  40 value 1522.522547
## iter  50 value 924.067368
## iter  60 value 574.919591
## iter  70 value 466.822246
## iter  80 value 408.392888
## iter  90 value 381.883092
## iter 100 value 370.464028
## final  value 370.464028 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6546.001858
## iter  20 value 5028.216636
## iter  30 value 3325.910399
## iter  40 value 1743.457780
## iter  50 value 862.313717
## iter  60 value 598.816494
## iter  70 value 464.840973
## iter  80 value 402.426178
## iter  90 value 378.550368
## iter 100 value 366.693187
## final  value 366.693187 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6947.808917
## iter  20 value 5594.411276
## iter  30 value 3472.044039
## iter  40 value 1896.219247
## iter  50 value 1304.700181
## iter  60 value 924.594774
## iter  70 value 772.285061
## iter  80 value 685.290052
## iter  90 value 660.015971
## iter 100 value 640.024851
## final  value 640.024851 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6462.765663
## iter  20 value 5003.841955
## iter  30 value 2521.021392
## iter  40 value 1383.422730
## iter  50 value 784.258090
## iter  60 value 558.847233
## iter  70 value 466.827683
## iter  80 value 414.262481
## iter  90 value 389.255126
## iter 100 value 380.104329
## final  value 380.104329 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6354.143309
## iter  20 value 4896.431204
## iter  30 value 2926.951316
## iter  40 value 1616.895299
## iter  50 value 813.194271
## iter  60 value 572.814143
## iter  70 value 452.232947
## iter  80 value 399.863198
## iter  90 value 371.951519
## iter 100 value 360.310803
## final  value 360.310803 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6355.384340
## iter  20 value 4903.436042
## iter  30 value 3001.134836
## iter  40 value 1707.300576
## iter  50 value 874.580984
## iter  60 value 616.502220
## iter  70 value 479.217359
## iter  80 value 427.136907
## iter  90 value 395.623407
## iter 100 value 379.665116
## final  value 379.665116 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6354.335058
## iter  20 value 4906.838126
## iter  30 value 2983.371623
## iter  40 value 1447.111455
## iter  50 value 841.413564
## iter  60 value 619.316258
## iter  70 value 482.859359
## iter  80 value 420.392281
## iter  90 value 390.587787
## iter 100 value 378.632938
## final  value 378.632938 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6502.946598
## iter  20 value 5114.140585
## iter  30 value 3362.397096
## iter  40 value 1540.901992
## iter  50 value 822.220796
## iter  60 value 554.014623
## iter  70 value 455.590133
## iter  80 value 400.316863
## iter  90 value 374.246105
## iter 100 value 358.822625
## final  value 358.822625 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6529.141917
## iter  20 value 5224.690067
## iter  30 value 3434.839000
## iter  40 value 1645.504780
## iter  50 value 948.667685
## iter  60 value 631.377097
## iter  70 value 483.825872
## iter  80 value 411.052203
## iter  90 value 377.603994
## iter 100 value 364.839037
## final  value 364.839037 
## stopped after 100 iterations
##                 Df     AIC
## - A.T            6  913.58
## - T.C            6  927.56
## - AGTTCA.del     6  933.65
## - T.A            6  934.46
## - TAT.del        6  936.62
## - GATTTC.del     6  945.68
## - A.C            6  949.39
## <none>              953.84
## - T.G            6  956.93
## - C.T            6  961.71
## - TACATG.del     6  973.27
## - TCTGGTTTT.del  6  975.33
## - A.del          6  976.21
## - C.A            6  981.62
## - A.G            6  987.83
## - G.T            6 1107.09
## - G.A            6 1195.11
## - G.C            6 1331.33
## - C.G            6 1496.05
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 3132.810537
## iter  20 value 2170.994743
## iter  30 value 1616.312391
## iter  40 value 1003.524784
## iter  50 value 675.697652
## iter  60 value 491.487329
## iter  70 value 429.061871
## iter  80 value 398.413262
## iter  90 value 371.277070
## iter 100 value 348.789370
## final  value 348.789370 
## stopped after 100 iterations
## 
## Step:  AIC=913.58
## clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## 
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 1455.040299
## iter  20 value 626.117266
## iter  30 value 440.573492
## iter  40 value 405.769390
## iter  50 value 398.286240
## iter  60 value 392.784033
## iter  70 value 388.085230
## iter  80 value 380.889993
## iter  90 value 372.515339
## iter 100 value 368.066606
## final  value 368.066606 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 6328.126133
## iter  20 value 4609.069950
## iter  30 value 3618.837504
## iter  40 value 1673.579630
## iter  50 value 939.499238
## iter  60 value 651.823775
## iter  70 value 487.886104
## iter  80 value 439.880598
## iter  90 value 410.671297
## iter 100 value 395.330706
## final  value 395.330706 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 5044.648171
## iter  20 value 3292.496160
## iter  30 value 2032.752422
## iter  40 value 1389.676164
## iter  50 value 783.430600
## iter  60 value 615.949109
## iter  70 value 549.269351
## iter  80 value 511.403018
## iter  90 value 500.370706
## iter 100 value 494.416184
## final  value 494.416184 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3564.405023
## iter  20 value 2685.934262
## iter  30 value 1877.017484
## iter  40 value 1309.199848
## iter  50 value 902.626019
## iter  60 value 691.036410
## iter  70 value 615.527478
## iter  80 value 585.680765
## iter  90 value 568.419672
## iter 100 value 560.517823
## final  value 560.517823 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3141.777871
## iter  20 value 1602.269569
## iter  30 value 1136.247650
## iter  40 value 810.705359
## iter  50 value 600.440006
## iter  60 value 476.826039
## iter  70 value 420.517134
## iter  80 value 391.668023
## iter  90 value 367.215741
## iter 100 value 353.863158
## final  value 353.863158 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 5395.276799
## iter  20 value 3254.416169
## iter  30 value 2319.011452
## iter  40 value 1305.025994
## iter  50 value 746.101539
## iter  60 value 534.971421
## iter  70 value 460.495625
## iter  80 value 415.513406
## iter  90 value 399.102143
## iter 100 value 388.777074
## final  value 388.777074 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 2026.762875
## iter  20 value 1361.365181
## iter  30 value 1081.587681
## iter  40 value 772.270323
## iter  50 value 642.897708
## iter  60 value 539.732685
## iter  70 value 499.991044
## iter  80 value 474.082477
## iter  90 value 458.750485
## iter 100 value 453.057083
## final  value 453.057083 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3304.937128
## iter  20 value 1954.871411
## iter  30 value 1470.956913
## iter  40 value 991.677273
## iter  50 value 687.786513
## iter  60 value 487.934554
## iter  70 value 424.792847
## iter  80 value 395.255140
## iter  90 value 368.475413
## iter 100 value 353.967628
## final  value 353.967628 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 5508.639303
## iter  20 value 2676.978015
## iter  30 value 1972.724144
## iter  40 value 1176.832576
## iter  50 value 700.388187
## iter  60 value 494.915459
## iter  70 value 416.479754
## iter  80 value 384.158295
## iter  90 value 364.370417
## iter 100 value 354.628334
## final  value 354.628334 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3235.593003
## iter  20 value 2162.161227
## iter  30 value 1557.654605
## iter  40 value 1047.674969
## iter  50 value 718.438838
## iter  60 value 508.621235
## iter  70 value 438.898846
## iter  80 value 401.938453
## iter  90 value 380.310047
## iter 100 value 362.034139
## final  value 362.034139 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3408.963875
## iter  20 value 1917.317353
## iter  30 value 1532.837127
## iter  40 value 1242.696720
## iter  50 value 954.202747
## iter  60 value 784.743761
## iter  70 value 714.425205
## iter  80 value 673.944248
## iter  90 value 650.235100
## iter 100 value 628.167722
## final  value 628.167722 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 4967.232185
## iter  20 value 2814.982103
## iter  30 value 2137.554867
## iter  40 value 1343.305911
## iter  50 value 783.152008
## iter  60 value 533.990886
## iter  70 value 436.413129
## iter  80 value 398.416266
## iter  90 value 378.228597
## iter 100 value 366.937760
## final  value 366.937760 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3194.123538
## iter  20 value 2123.790150
## iter  30 value 1471.824675
## iter  40 value 971.489355
## iter  50 value 676.722924
## iter  60 value 511.595905
## iter  70 value 442.746703
## iter  80 value 401.587679
## iter  90 value 376.191116
## iter 100 value 353.450846
## final  value 353.450846 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3306.432590
## iter  20 value 1964.108469
## iter  30 value 1456.902590
## iter  40 value 1013.884088
## iter  50 value 700.133220
## iter  60 value 520.284546
## iter  70 value 451.674693
## iter  80 value 415.398545
## iter  90 value 390.480142
## iter 100 value 372.257719
## final  value 372.257719 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3208.649330
## iter  20 value 2100.052806
## iter  30 value 1451.782742
## iter  40 value 1007.043363
## iter  50 value 748.820901
## iter  60 value 570.398581
## iter  70 value 490.008226
## iter  80 value 423.883723
## iter  90 value 394.423379
## iter 100 value 374.261497
## final  value 374.261497 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 4124.746509
## iter  20 value 2225.796179
## iter  30 value 1516.486460
## iter  40 value 1103.204654
## iter  50 value 730.317342
## iter  60 value 526.176723
## iter  70 value 414.346703
## iter  80 value 383.120753
## iter  90 value 364.165323
## iter 100 value 348.555190
## final  value 348.555190 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3928.876361
## iter  20 value 1871.600351
## iter  30 value 1256.394059
## iter  40 value 843.402953
## iter  50 value 620.990797
## iter  60 value 507.053408
## iter  70 value 415.138135
## iter  80 value 376.927548
## iter  90 value 355.416481
## iter 100 value 345.218747
## final  value 345.218747 
## stopped after 100 iterations
## # weights:  140 (114 variable)
## initial  value 28110.618013 
## iter  10 value 6381.341107
## iter  20 value 5003.825041
## iter  30 value 3088.613327
## iter  40 value 1467.644761
## iter  50 value 830.139232
## iter  60 value 588.077634
## iter  70 value 466.698407
## iter  80 value 402.163323
## iter  90 value 375.773024
## iter 100 value 362.921520
## final  value 362.921520 
## stopped after 100 iterations
##                 Df     AIC
## - GATTTC.del     6  894.44
## - AGTTCA.del     6  901.11
## - TAT.del        6  910.90
## - T.C            6  911.73
## - T.A            6  911.94
## - T.G            6  913.26
## <none>              913.58
## - A.C            6  928.07
## - A.del          6  937.88
## - C.T            6  940.13
## - TCTGGTTTT.del  6  948.52
## - TACATG.del     6  952.52
## + A.T            6  953.84
## - C.A            6  981.55
## - A.G            6  994.66
## - G.T            6 1110.11
## - G.A            6 1192.83
## - G.C            6 1325.04
## - C.G            6 1460.34
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3928.876361
## iter  20 value 1871.600351
## iter  30 value 1256.394059
## iter  40 value 843.402953
## iter  50 value 620.990797
## iter  60 value 507.053408
## iter  70 value 415.138135
## iter  80 value 376.927548
## iter  90 value 355.416481
## iter 100 value 345.218747
## final  value 345.218747 
## stopped after 100 iterations
## 
## Step:  AIC=894.44
## clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del
## 
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 1490.605156
## iter  20 value 680.448306
## iter  30 value 540.337175
## iter  40 value 443.953874
## iter  50 value 416.532439
## iter  60 value 405.690806
## iter  70 value 396.417446
## iter  80 value 388.802634
## iter  90 value 382.406897
## iter 100 value 374.777825
## final  value 374.777825 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 2466.467319
## iter  20 value 1472.502777
## iter  30 value 1125.508028
## iter  40 value 839.034056
## iter  50 value 653.864429
## iter  60 value 517.244681
## iter  70 value 459.490355
## iter  80 value 427.491413
## iter  90 value 393.531653
## iter 100 value 376.544761
## final  value 376.544761 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 6729.475950
## iter  20 value 3972.229067
## iter  30 value 2473.812691
## iter  40 value 1373.755202
## iter  50 value 901.118231
## iter  60 value 635.807251
## iter  70 value 564.955583
## iter  80 value 520.828231
## iter  90 value 505.181844
## iter 100 value 496.069968
## final  value 496.069968 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 4449.072768
## iter  20 value 2857.545545
## iter  30 value 2246.753441
## iter  40 value 1443.377814
## iter  50 value 931.972261
## iter  60 value 705.802105
## iter  70 value 610.904355
## iter  80 value 584.463222
## iter  90 value 569.607994
## iter 100 value 561.672230
## final  value 561.672230 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3164.665399
## iter  20 value 2152.289449
## iter  30 value 1492.599206
## iter  40 value 948.111180
## iter  50 value 684.991328
## iter  60 value 494.221445
## iter  70 value 423.609144
## iter  80 value 398.148239
## iter  90 value 383.231527
## iter 100 value 364.963404
## final  value 364.963404 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 5713.351108
## iter  20 value 3425.692521
## iter  30 value 2191.147611
## iter  40 value 1404.252241
## iter  50 value 761.940206
## iter  60 value 549.825812
## iter  70 value 463.502600
## iter  80 value 426.824134
## iter  90 value 405.213120
## iter 100 value 392.263504
## final  value 392.263504 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 2095.932991
## iter  20 value 1441.885675
## iter  30 value 1113.706094
## iter  40 value 858.882536
## iter  50 value 667.854860
## iter  60 value 545.806864
## iter  70 value 505.738092
## iter  80 value 480.794757
## iter  90 value 469.560361
## iter 100 value 462.333587
## final  value 462.333587 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3893.626972
## iter  20 value 2429.702813
## iter  30 value 1555.509906
## iter  40 value 1039.609036
## iter  50 value 739.251371
## iter  60 value 535.948894
## iter  70 value 433.230540
## iter  80 value 391.184345
## iter  90 value 370.172021
## iter 100 value 356.866591
## final  value 356.866591 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 4096.750573
## iter  20 value 1897.413580
## iter  30 value 1268.449363
## iter  40 value 910.844541
## iter  50 value 672.648728
## iter  60 value 530.008795
## iter  70 value 436.196902
## iter  80 value 398.226780
## iter  90 value 378.157263
## iter 100 value 362.370751
## final  value 362.370751 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 4061.566297
## iter  20 value 2114.845321
## iter  30 value 1366.978967
## iter  40 value 941.053981
## iter  50 value 666.044371
## iter  60 value 540.029831
## iter  70 value 437.089022
## iter  80 value 400.752538
## iter  90 value 376.590935
## iter 100 value 360.827832
## final  value 360.827832 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3362.114242
## iter  20 value 2200.968653
## iter  30 value 1641.525819
## iter  40 value 1354.224248
## iter  50 value 1034.010986
## iter  60 value 816.570736
## iter  70 value 716.568066
## iter  80 value 677.798000
## iter  90 value 654.791543
## iter 100 value 636.811162
## final  value 636.811162 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 4754.229782
## iter  20 value 2138.815841
## iter  30 value 1255.768443
## iter  40 value 884.322939
## iter  50 value 666.993027
## iter  60 value 538.976910
## iter  70 value 447.887894
## iter  80 value 401.505336
## iter  90 value 374.584084
## iter 100 value 356.144685
## final  value 356.144685 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3647.986484
## iter  20 value 1617.432872
## iter  30 value 1079.213885
## iter  40 value 822.000950
## iter  50 value 623.222461
## iter  60 value 494.818133
## iter  70 value 423.443569
## iter  80 value 376.801682
## iter  90 value 354.448658
## iter 100 value 344.324306
## final  value 344.324306 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3886.468656
## iter  20 value 2336.133554
## iter  30 value 1514.985425
## iter  40 value 1066.536043
## iter  50 value 733.181352
## iter  60 value 553.755500
## iter  70 value 452.508881
## iter  80 value 408.814139
## iter  90 value 385.554203
## iter 100 value 370.958221
## final  value 370.958221 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3650.403997
## iter  20 value 1920.768670
## iter  30 value 1309.531456
## iter  40 value 973.545354
## iter  50 value 732.037266
## iter  60 value 581.166934
## iter  70 value 483.889094
## iter  80 value 420.137267
## iter  90 value 389.303647
## iter 100 value 373.872206
## final  value 373.872206 
## stopped after 100 iterations
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 4828.462040
## iter  20 value 2507.071026
## iter  30 value 1632.342229
## iter  40 value 1166.358292
## iter  50 value 746.505714
## iter  60 value 552.429780
## iter  70 value 462.072256
## iter  80 value 392.242442
## iter  90 value 371.384523
## iter 100 value 356.349202
## final  value 356.349202 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6529.141917
## iter  20 value 5224.690067
## iter  30 value 3434.839000
## iter  40 value 1645.504780
## iter  50 value 948.667685
## iter  60 value 631.377087
## iter  70 value 483.825869
## iter  80 value 411.052199
## iter  90 value 377.603991
## iter 100 value 364.839000
## final  value 364.839000 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 3132.810537
## iter  20 value 2170.994743
## iter  30 value 1616.312391
## iter  40 value 1003.524784
## iter  50 value 675.697652
## iter  60 value 491.487329
## iter  70 value 429.061871
## iter  80 value 398.413262
## iter  90 value 371.277070
## iter 100 value 348.789370
## final  value 348.789370 
## stopped after 100 iterations
##                 Df     AIC
## - TAT.del        6  880.65
## <none>              894.44
## - A.del          6  904.29
## - AGTTCA.del     6  904.70
## - T.A            6  905.73
## + GATTTC.del     6  913.58
## - A.C            6  913.66
## - T.G            6  916.74
## - T.C            6  921.93
## - TCTGGTTTT.del  6  933.92
## - TACATG.del     6  939.74
## - C.T            6  941.56
## - A.G            6  945.09
## + A.T            6  945.68
## - C.A            6  976.53
## - G.T            6 1116.67
## - G.A            6 1184.14
## - G.C            6 1315.34
## - C.G            6 1465.62
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3647.986484
## iter  20 value 1617.432872
## iter  30 value 1079.213885
## iter  40 value 822.000950
## iter  50 value 623.222461
## iter  60 value 494.818133
## iter  70 value 423.443569
## iter  80 value 376.801682
## iter  90 value 354.448658
## iter 100 value 344.324306
## final  value 344.324306 
## stopped after 100 iterations
## 
## Step:  AIC=880.65
## clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + T.A + T.G + 
##     A.C + C.G + A.del + TCTGGTTTT.del + TACATG.del + AGTTCA.del
## 
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 1466.026419
## iter  20 value 763.169585
## iter  30 value 539.593915
## iter  40 value 449.455624
## iter  50 value 420.299314
## iter  60 value 409.449215
## iter  70 value 396.933385
## iter  80 value 390.303604
## iter  90 value 381.487978
## iter 100 value 373.359870
## final  value 373.359870 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 2460.845318
## iter  20 value 1455.167647
## iter  30 value 1151.135535
## iter  40 value 844.014503
## iter  50 value 627.679326
## iter  60 value 510.899010
## iter  70 value 463.599126
## iter  80 value 430.126261
## iter  90 value 391.528488
## iter 100 value 377.760342
## final  value 377.760342 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 5104.618604
## iter  20 value 3489.844307
## iter  30 value 2270.449110
## iter  40 value 1222.574433
## iter  50 value 819.698555
## iter  60 value 634.835576
## iter  70 value 552.740566
## iter  80 value 516.277975
## iter  90 value 505.852191
## iter 100 value 498.073332
## final  value 498.073332 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 4319.216102
## iter  20 value 2805.459375
## iter  30 value 2231.957049
## iter  40 value 1465.762005
## iter  50 value 946.014235
## iter  60 value 709.385387
## iter  70 value 615.617357
## iter  80 value 585.781166
## iter  90 value 569.535015
## iter 100 value 563.848740
## final  value 563.848740 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 4170.994521
## iter  20 value 2569.396682
## iter  30 value 1594.552907
## iter  40 value 1184.186775
## iter  50 value 703.173805
## iter  60 value 509.960122
## iter  70 value 433.337875
## iter  80 value 391.476906
## iter  90 value 371.644335
## iter 100 value 361.114159
## final  value 361.114159 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 5750.931838
## iter  20 value 3566.674623
## iter  30 value 2343.726344
## iter  40 value 1344.505590
## iter  50 value 757.842513
## iter  60 value 537.772479
## iter  70 value 457.279278
## iter  80 value 425.229879
## iter  90 value 407.560987
## iter 100 value 392.764726
## final  value 392.764726 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 2148.162818
## iter  20 value 1458.133967
## iter  30 value 1132.230718
## iter  40 value 872.676944
## iter  50 value 686.666850
## iter  60 value 555.471817
## iter  70 value 513.958659
## iter  80 value 483.138352
## iter  90 value 470.376454
## iter 100 value 462.278337
## final  value 462.278337 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 4307.820955
## iter  20 value 2376.300491
## iter  30 value 1470.787443
## iter  40 value 982.231247
## iter  50 value 738.224224
## iter  60 value 518.139783
## iter  70 value 432.410968
## iter  80 value 395.545366
## iter  90 value 374.886709
## iter 100 value 355.688215
## final  value 355.688215 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 3696.920675
## iter  20 value 1717.394517
## iter  30 value 1118.307153
## iter  40 value 773.767130
## iter  50 value 641.508027
## iter  60 value 497.379659
## iter  70 value 424.371703
## iter  80 value 384.583872
## iter  90 value 365.808978
## iter 100 value 353.995847
## final  value 353.995847 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 3828.120957
## iter  20 value 1984.462996
## iter  30 value 1295.978571
## iter  40 value 934.676555
## iter  50 value 640.759689
## iter  60 value 514.742781
## iter  70 value 442.596338
## iter  80 value 404.926679
## iter  90 value 377.900998
## iter 100 value 361.644344
## final  value 361.644344 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 3363.849848
## iter  20 value 2165.380905
## iter  30 value 1734.531051
## iter  40 value 1340.554179
## iter  50 value 1012.787044
## iter  60 value 850.589069
## iter  70 value 729.888081
## iter  80 value 686.383842
## iter  90 value 658.188822
## iter 100 value 639.137711
## final  value 639.137711 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 3931.692530
## iter  20 value 2004.968335
## iter  30 value 1318.077005
## iter  40 value 967.604427
## iter  50 value 679.446967
## iter  60 value 548.157796
## iter  70 value 456.510972
## iter  80 value 412.422473
## iter  90 value 386.492497
## iter 100 value 369.469025
## final  value 369.469025 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 4268.834746
## iter  20 value 2286.209692
## iter  30 value 1504.143005
## iter  40 value 1040.687461
## iter  50 value 736.855953
## iter  60 value 550.089669
## iter  70 value 459.541120
## iter  80 value 415.484900
## iter  90 value 393.782997
## iter 100 value 375.343080
## final  value 375.343080 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 3767.407797
## iter  20 value 2282.375090
## iter  30 value 1553.386501
## iter  40 value 1023.564092
## iter  50 value 748.230205
## iter  60 value 605.604212
## iter  70 value 525.760590
## iter  80 value 431.488999
## iter  90 value 394.871997
## iter 100 value 383.048315
## final  value 383.048315 
## stopped after 100 iterations
## # weights:  112 (90 variable)
## initial  value 28110.618013 
## iter  10 value 2834.434777
## iter  20 value 1589.466905
## iter  30 value 1264.488347
## iter  40 value 918.981422
## iter  50 value 646.927148
## iter  60 value 499.619503
## iter  70 value 446.531479
## iter  80 value 397.970094
## iter  90 value 368.229617
## iter 100 value 351.971730
## final  value 351.971730 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 6478.437895
## iter  20 value 5176.851466
## iter  30 value 3500.948080
## iter  40 value 1465.079082
## iter  50 value 871.031712
## iter  60 value 589.623926
## iter  70 value 477.788198
## iter  80 value 404.203047
## iter  90 value 383.209248
## iter 100 value 367.165036
## final  value 367.165036 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3928.876361
## iter  20 value 1871.600351
## iter  30 value 1256.394059
## iter  40 value 843.402953
## iter  50 value 620.990797
## iter  60 value 507.053408
## iter  70 value 415.138135
## iter  80 value 376.927548
## iter  90 value 355.416481
## iter 100 value 345.218747
## final  value 345.218747 
## stopped after 100 iterations
## # weights:  126 (102 variable)
## initial  value 28110.618013 
## iter  10 value 3194.123538
## iter  20 value 2123.790150
## iter  30 value 1471.824675
## iter  40 value 971.489355
## iter  50 value 676.722924
## iter  60 value 511.595905
## iter  70 value 442.746703
## iter  80 value 401.587679
## iter  90 value 376.191116
## iter 100 value 353.450846
## final  value 353.450846 
## stopped after 100 iterations
##                 Df     AIC
## <none>              880.65
## - AGTTCA.del     6  883.94
## - T.G            6  887.99
## - T.A            6  891.38
## + TAT.del        6  894.44
## - T.C            6  902.23
## - A.C            6  903.29
## + GATTTC.del     6  910.90
## - A.del          6  918.94
## - C.T            6  926.72
## - TCTGGTTTT.del  6  930.69
## - A.G            6  935.52
## + A.T            6  938.33
## - TACATG.del     6  946.10
## - C.A            6  965.53
## - G.T            6 1104.56
## - G.A            6 1176.15
## - G.C            6 1307.70
## - C.G            6 1458.28
step1$anova # display results
multinom_modelo_step1 = multinom(clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + T.A + T.G + A.C + C.G + A.del + TCTGGTTTT.del + TACATG.del + AGTTCA.del, data = train)
## # weights:  119 (96 variable)
## initial  value 28110.618013 
## iter  10 value 3647.986484
## iter  20 value 1617.432872
## iter  30 value 1079.213885
## iter  40 value 822.000950
## iter  50 value 623.222461
## iter  60 value 494.818133
## iter  70 value 423.443569
## iter  80 value 376.801682
## iter  90 value 354.448658
## iter 100 value 344.324306
## final  value 344.324306 
## stopped after 100 iterations
summary(multinom_modelo_step1)      
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = clade ~ C.T + A.G + G.A + G.C + T.C + C.A + 
##     G.T + T.A + T.G + A.C + C.G + A.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del, data = train)
## 
## Coefficients:
##                 (Intercept)       C.T      A.G        G.A       G.C        T.C
## 20A               -1.933765 0.5454912 2.070370  0.7311481  4.340031 -0.4325411
## 20B               -7.438877 0.4129722 1.967340  3.0209764 11.436590 -1.4369695
## 20E (EU1)         -7.009073 0.9150645 0.170016 -1.0071763 10.982609 -0.2600824
## 20I (Alpha, V1)  -53.851062 0.6021353 3.736936  0.2637475 24.689851  0.6129052
## 20J (Gamma, V3)  -47.252616 0.1066137 3.901453  3.9452739 11.022918  0.5200364
## 21J (Delta)      -31.810245 0.6435029 4.028856 -0.5321102  4.303957 -1.4239099
##                        C.A        G.T         T.A        T.G       A.C
## 20A             -0.3477663 -0.6639123  -0.7485647 -0.4588402 15.505566
## 20B              1.5967697 -1.9742851  -0.3225838 -0.7884974 16.376980
## 20E (EU1)       -1.8547275 -1.8033574   0.8882563 -3.4637188 12.315218
## 20I (Alpha, V1)  8.3590355 -1.5599301  -2.4251714 -2.6650876  4.767288
## 20J (Gamma, V3)  7.8154330 -1.2905212  -3.5093478 -1.9426092 22.619497
## 21J (Delta)      4.9679821  0.7353096 -11.1697964  8.5647941 14.051928
##                        C.G       A.del TCTGGTTTT.del TACATG.del AGTTCA.del
## 20A             -5.5048721  0.06537132      8.050335   3.408709  -5.708748
## 20B             -5.0101299 -4.99650605     13.781774 -45.456238   6.665550
## 20E (EU1)        5.9616811  3.84230102      1.504489   2.257863 -10.142530
## 20I (Alpha, V1)  0.6331508 -2.83134716     16.431037   4.660909  -5.633442
## 20J (Gamma, V3)  5.0277063 -7.99701139     25.876733  -9.687166   3.641542
## 21J (Delta)      0.8712754 13.02279605     -3.178535   5.478253  -2.550051
## 
## Std. Errors:
##                 (Intercept)        C.T       A.G       G.A      G.C       T.C
## 20A               0.4929648 0.09339845 0.3618170 0.2935666 2.142818 0.2699932
## 20B               0.9047003 0.11458062 0.4046655 0.3668550 2.230727 0.3708346
## 20E (EU1)         1.2434518 0.13689303 0.6142333 0.5250421 2.343758 0.4880234
## 20I (Alpha, V1)  24.2515000 0.40821199 1.9217906 1.5604646 7.626226 1.6161008
## 20J (Gamma, V3)  37.8225646 1.36356093 4.3039109 2.4857915 5.730050 1.8057949
## 21J (Delta)      13.1107781 0.44085870 1.5942987 1.1058224 2.964152 0.8600406
##                       C.A       G.T      T.A       T.G       A.C      C.G
## 20A             0.7881442 0.1492254 1.024743 0.9965131  2.853650 1.874561
## 20B             0.9369048 0.2035105 1.259557 1.0817112  2.845473 2.037204
## 20E (EU1)       1.1147501 0.3232000 1.839938 1.2674231  2.891031 1.994876
## 20I (Alpha, V1) 2.9776548 1.2614217 5.966538 3.7495377 10.930570 4.902439
## 20J (Gamma, V3) 5.4978054 2.7872065 8.495472 7.8521040  5.767479 6.680214
## 21J (Delta)     2.3818892 0.7672143 5.922400 5.3743921  5.732374 2.416777
##                     A.del TCTGGTTTT.del TACATG.del AGTTCA.del
## 20A              1.704727      3.985526   41.00278  31.934917
## 20B              2.055813      3.935424        NaN  19.521840
## 20E (EU1)        2.004606      4.521652   41.09880  10.900171
## 20I (Alpha, V1)  4.180609      5.488683   41.39479  25.712439
## 20J (Gamma, V3) 14.187666     10.931946   43.01321  25.163521
## 21J (Delta)      7.529656     16.779229   41.67788   5.985131
## 
## Residual Deviance: 688.6486 
## AIC: 880.6486
resumo_multinom_model_step1 = tidy(multinom_modelo_step1, conf.int=T, conf.level = 0.95, exponentiate = T) # Este código já tem 1) Coeficientes em exponencial, 2) z-scores, 3) standard errors, 4) respetivos p-values para Wald Z test bilateral
## Warning in sqrt(diag(vc)): NaNs produced
## Warning in sqrt(diag(vcov(object))): NaNs produced
resumo_multinom_model_step1
multinom_model_rrr_step =  exp(coef(multinom_modelo_step1))
multinom_model_rrr_step
##                  (Intercept)      C.T       A.G        G.A          G.C
## 20A             1.446027e-01 1.725456  7.927758  2.0774643 7.670990e+01
## 20B             5.879449e-04 1.511303  7.151626 20.5113083 9.265049e+04
## 20E (EU1)       9.036456e-04 2.496936  1.185324  0.3652489 5.884186e+04
## 20I (Alpha, V1) 4.099971e-24 1.826014 41.969211  1.3017995 5.280393e+10
## 20J (Gamma, V3) 3.009190e-21 1.112504 49.474275 51.6904929 6.126219e+04
## 21J (Delta)     1.531039e-14 1.903136 56.196602  0.5873642 7.399201e+01
##                       T.C          C.A       G.T          T.A          T.G
## 20A             0.6488582    0.7062639 0.5148332 4.730450e-01 6.320162e-01
## 20B             0.2376469    4.9370585 0.1388605 7.242752e-01 4.545273e-01
## 20E (EU1)       0.7709881    0.1564956 0.1647448 2.430887e+00 3.131310e-02
## 20I (Alpha, V1) 1.8457860 4268.5759334 0.2101508 8.846296e-02 6.959325e-02
## 20J (Gamma, V3) 1.6820888 2478.5599493 0.2751274 2.991642e-02 1.433295e-01
## 21J (Delta)     0.2407708  143.7365499 2.0861277 1.409351e-05 5.243760e+03
##                          A.C          C.G        A.del TCTGGTTTT.del
## 20A             5.419783e+06 4.066909e-03 1.067555e+00  3.134844e+03
## 20B             1.295484e+07 6.670037e-03 6.761530e-03  9.668263e+05
## 20E (EU1)       2.230648e+05 3.882623e+02 4.663265e+01  4.501854e+00
## 20I (Alpha, V1) 1.175999e+02 1.883536e+00 5.893341e-02  1.367442e+07
## 20J (Gamma, V3) 6.660743e+09 1.525826e+02 3.364667e-04  1.730303e+11
## 21J (Delta)     1.266702e+06 2.389957e+00 4.526145e+05  4.164664e-02
##                   TACATG.del   AGTTCA.del
## 20A             3.022621e+01 3.316823e-03
## 20B             1.813872e-20 7.848949e+02
## 20E (EU1)       9.562632e+00 3.936906e-05
## 20I (Alpha, V1) 1.057321e+02 3.576244e-03
## 20J (Gamma, V3) 6.207508e-05 3.815063e+01
## 21J (Delta)     2.394281e+02 7.807766e-02
# Predicted values are saved as fitted.values in the model object
  • NOTA: O modelo com o nome multinom_model_completo é o modelo antes da seleção automatica de preditos e o nome multinom_model_step é o modelo apos seleçao automatica

Verificar Z-score para o modelo (Wald Z) - NOTA: Estes valores da estatistica (z scores) e os respetivos p-values (p) já se encontram no tibble com o nome resumo_multinom_model_completo. Os proximos 4 blocos (que contem as variaveis z, p e testes do chisq) servem apenas para confirmar os resultados.

z = summary(multinom_model_completo)$coefficients/summary(multinom_model_completo)$standard.errors
z
##                 (Intercept)       C.T       A.G       G.A      G.C        T.C
## 20A               -4.354382 5.9509951 5.6659327  2.931404 1.081886 -1.0430466
## 20B               -8.619469 3.5489289 5.0275933  8.412907 4.976015 -3.4019707
## 20E (EU1)         -5.988671 6.8086137 0.8990318 -1.594785 4.698064  0.2911690
## 20I (Alpha, V1)   -3.928669 2.2380530 3.3065246  2.422995 4.472613 -0.7432643
## 20J (Gamma, V3)   -1.488728 0.2700378 0.9091258  1.582679 2.133266 -0.6170332
## 21J (Delta)       -3.868446 0.9733123 4.2561858  2.258926 1.781478 -1.6920755
##                        C.A        G.T        A.T        T.A         T.G
## 20A             -0.4769411 -3.8453509 -0.8305537 -0.9389956 -0.22987207
## 20B              1.7521396 -9.4437349 -0.4187998 -0.3190643 -0.37438743
## 20E (EU1)       -1.5911853 -5.5918843 -0.8270774 -0.2117804 -2.40915495
## 20I (Alpha, V1)  4.2611546 -2.6565361 -0.6722770 -0.8581748  0.22300521
## 20J (Gamma, V3)  1.0321831 -1.5508911  0.5863869 -0.2283724  0.01536544
## 21J (Delta)      2.3362798 -0.2354172 -1.4432437 -1.4641223  1.94595647
##                        A.C        C.G      A.del    TAT.del TCTGGTTTT.del
## 20A             0.35424532 -2.1938478 -0.7176128 -0.4792744     0.3033558
## 20B             0.40769617 -1.6214288 -2.6756087 -0.7678970     1.5277762
## 20E (EU1)       0.13394748  3.7976762  1.5646755  0.8733353    -1.5679067
## 20I (Alpha, V1) 0.03047826 -1.7276967  1.1570790 -0.3759717     1.8175191
## 20J (Gamma, V3) 0.59975504  1.2135504 -1.7017997 -1.4682998     2.0084015
## 21J (Delta)     0.39190516  0.7362209  1.5648622  1.4914809    -0.6158676
##                 TACATG.del  AGTTCA.del  GATTTC.del
## 20A             -1.1237620 -0.17731944 -0.09677436
## 20B             -3.5315414  0.04689756  0.25859893
## 20E (EU1)       -2.0696810 -0.22834120 -0.51180546
## 20I (Alpha, V1) -1.7356458 -0.18802036  1.12021736
## 20J (Gamma, V3) -0.9257802 -0.10084104  0.25502345
## 21J (Delta)     -1.4058266 -0.40037696  0.26970783

Calcular p value (2-tailed z test)

p = round((1 - pnorm(abs(z), 0, 1)) * 2,digits=5)
p
##                 (Intercept)     C.T     A.G     G.A     G.C     T.C     C.A
## 20A                 0.00001 0.00000 0.00000 0.00337 0.27930 0.29693 0.63340
## 20B                 0.00000 0.00039 0.00000 0.00000 0.00000 0.00067 0.07975
## 20E (EU1)           0.00000 0.00000 0.36864 0.11076 0.00000 0.77092 0.11157
## 20I (Alpha, V1)     0.00009 0.02522 0.00094 0.01539 0.00001 0.45732 0.00002
## 20J (Gamma, V3)     0.13656 0.78713 0.36328 0.11349 0.03290 0.53721 0.30199
## 21J (Delta)         0.00011 0.33040 0.00002 0.02389 0.07483 0.09063 0.01948
##                     G.T     A.T     T.A     T.G     A.C     C.G   A.del TAT.del
## 20A             0.00012 0.40623 0.34773 0.81819 0.72316 0.02825 0.47300 0.63174
## 20B             0.00000 0.67536 0.74968 0.70812 0.68350 0.10493 0.00746 0.44255
## 20E (EU1)       0.00000 0.40819 0.83228 0.01599 0.89344 0.00015 0.11766 0.38248
## 20I (Alpha, V1) 0.00789 0.50141 0.39080 0.82353 0.97569 0.08404 0.24724 0.70694
## 20J (Gamma, V3) 0.12093 0.55762 0.81936 0.98774 0.54867 0.22492 0.08879 0.14202
## 21J (Delta)     0.81388 0.14895 0.14316 0.05166 0.69513 0.46160 0.11762 0.13584
##                 TCTGGTTTT.del TACATG.del AGTTCA.del GATTTC.del
## 20A                   0.76162    0.26111    0.85926    0.92291
## 20B                   0.12657    0.00041    0.96259    0.79594
## 20E (EU1)             0.11690    0.03848    0.81938    0.60879
## 20I (Alpha, V1)       0.06914    0.08263    0.85086    0.26262
## 20J (Gamma, V3)       0.04460    0.35456    0.91968    0.79870
## 21J (Delta)           0.53798    0.15978    0.68888    0.78739

Verificar o fit do modelo

detach("package:jmv", unload=T)
#Comparar o nosso modelo completo com o modelo vazio (intercept)
anova(multinom_model_0, multinom_model_completo)
  • O LL é uma medida que nos indica a variabilidade não explicada nos nossos dados. A diferença ou mudança no LL indica quanta nova variancia foi explicada pleo modelo
  • Teste do qui-quadrado testa a diminuiçao da variancia nao explicada a partir do modelo baseline para o modelo final. Esta alteraçao é significativa o que significa que o nosso modelo final explica uma quantidade significante da variabilidade original
  • O LR chi-suqared com p-value < 0.05 diz-nos que o nosso modelo consegue ter um melhor “fit” do que o modelo vazio (modelo sem preditores)

Calcular Goodness of fit

# Verificar a probabilidade predita para cada clade/variante/grupo
head(multinom_model_completo$fitted.values,30)
##             19A          20A          20B    20E (EU1) 20I (Alpha, V1)
## 2  2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 4  2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 5  1.086575e-01 0.8864131781 3.357820e-03 1.571509e-03    1.471285e-10
## 7  6.090700e-03 0.9640051284 2.984031e-02 6.370515e-05    1.183304e-09
## 8  1.086575e-01 0.8864131781 3.357820e-03 1.571509e-03    1.471285e-10
## 9  8.386937e-02 0.9112246164 1.179900e-03 3.726105e-03    1.153874e-10
## 11 3.268900e-05 0.0047816841 9.950320e-01 1.536218e-04    7.287378e-09
## 12 2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 13 6.990207e-06 0.0007124189 9.992743e-01 5.718304e-06    5.430371e-07
## 15 2.704889e-01 0.7283334831 9.314237e-04 2.461691e-04    2.723404e-11
## 16 2.704889e-01 0.7283334831 9.314237e-04 2.461691e-04    2.723404e-11
## 17 2.704889e-01 0.7283334831 9.314237e-04 2.461691e-04    2.723404e-11
## 21 2.704889e-01 0.7283334831 9.314237e-04 2.461691e-04    2.723404e-11
## 22 7.226289e-04 0.0623718146 9.366516e-01 2.538936e-04    7.102092e-09
## 23 9.541651e-01 0.0458114494 9.622115e-06 1.383740e-05    8.487045e-14
## 25 1.922498e-04 0.0163008872 9.833555e-01 1.513868e-04    6.366653e-09
## 26 1.922498e-04 0.0163008872 9.833555e-01 1.513868e-04    6.366653e-09
## 27 1.879223e-03 0.9972307948 8.537833e-04 3.549142e-05    3.467256e-10
## 29 2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 32 2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 34 2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 35 2.704889e-01 0.7283334831 9.314237e-04 2.461691e-04    2.723404e-11
## 36 3.268900e-05 0.0047816841 9.950320e-01 1.536218e-04    7.287378e-09
## 37 1.922498e-04 0.0163008872 9.833555e-01 1.513868e-04    6.366653e-09
## 38 1.922498e-04 0.0163008872 9.833555e-01 1.513868e-04    6.366653e-09
## 39 6.599594e-01 0.3399916903 4.255242e-05 6.332843e-06    7.339741e-13
## 41 8.313976e-01 0.1685606520 3.628078e-05 5.515289e-06    6.048609e-14
## 43 2.165732e-05 0.0055634704 9.941438e-01 2.710191e-04    9.645498e-09
## 44 4.662244e-02 0.9231040195 2.998847e-02 2.850465e-04    4.227190e-10
## 45 9.541651e-01 0.0458114494 9.622115e-06 1.383740e-05    8.487045e-14
##    20J (Gamma, V3)  21J (Delta)
## 2     9.589552e-11 1.818197e-09
## 4     9.589552e-11 1.818197e-09
## 5     4.379971e-13 8.830477e-09
## 7     1.413876e-11 1.595268e-07
## 8     4.379971e-13 8.830477e-09
## 9     2.020939e-13 3.962171e-09
## 11    1.098914e-10 2.108996e-09
## 12    9.589552e-11 1.818197e-09
## 13    1.618802e-09 1.087442e-08
## 15    1.500209e-13 1.487836e-08
## 16    1.500209e-13 1.487836e-08
## 17    1.500209e-13 1.487836e-08
## 21    1.500209e-13 1.487836e-08
## 22    1.050880e-10 4.705849e-08
## 23    1.312330e-15 1.918617e-10
## 25    1.171252e-10 1.092404e-08
## 26    1.171252e-10 1.092404e-08
## 27    6.609078e-13 7.071538e-07
## 29    9.589552e-11 1.818197e-09
## 32    9.589552e-11 1.818197e-09
## 34    9.589552e-11 1.818197e-09
## 35    1.500209e-13 1.487836e-08
## 36    1.098914e-10 2.108996e-09
## 37    1.171252e-10 1.092404e-08
## 38    1.171252e-10 1.092404e-08
## 39    9.127091e-15 2.163949e-08
## 41    3.383648e-15 5.123595e-10
## 43    9.589552e-11 1.818197e-09
## 44    5.660964e-12 1.921139e-08
## 45    1.312330e-15 1.918617e-10
head(round(fitted(multinom_model_completo),2))
##    19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3) 21J (Delta)
## 2 0.00 0.01 0.99         0               0               0           0
## 4 0.00 0.01 0.99         0               0               0           0
## 5 0.11 0.89 0.00         0               0               0           0
## 7 0.01 0.96 0.03         0               0               0           0
## 8 0.11 0.89 0.00         0               0               0           0
## 9 0.08 0.91 0.00         0               0               0           0
# Prever resultados:
head(predict(multinom_model_completo),30)
##  [1] 20B 20B 20A 20A 20A 20A 20B 20B 20B 20A 20A 20A 20A 20B 19A 20B 20B 20A 20B
## [20] 20B 20B 20A 20B 20B 20B 19A 19A 20B 20A 19A
## 7 Levels: 19A 20A 20B 20E (EU1) 20I (Alpha, V1) ... 21J (Delta)
# As probabilidades estao concordantes com os resultados previstos (verificar a tabela que testa o modelo com os dados de teste)


#Test goodness of fit
 # Usado para determinar se existe diferença significativa entre as frequencias esperadas vs frequencias observadas em 1 ou mais categorias de uma tabela de contençao. p-value < 0.05 logo, rejeitamos H0
      # Se Ho for verdadeira, o observado e esperado vai ser muito semelhante e estatistica tende a ser 0
      # Se H1 for verdadeira, estatistica assume valores muito grandes, afastando-se de Ho → mais heterogeneos sao os grupos ou maior é o desvio

chisq.test(train$clade, predict(multinom_model_completo)) # dados de treino (clade) vs prevsao feita pelo modelo completo 
## Warning in chisq.test(train$clade, predict(multinom_model_completo)): Chi-
## squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  train$clade and predict(multinom_model_completo)
## X-squared = 77505, df = 36, p-value < 2.2e-16
## DUVIDA - devemos usar o package rstatix com o pairwise_chisq_test_against_p? onde colocamos dados vs proporçoes (tabela de frequencias?) Ou o chisq.test normal serve? Ou ainda aplicar teste de fischer?

# Classes/variantes sao independentes, ou seja, uma pessoa nao pode ter variante 20A e 20B, por exemplo - ver grupos/classes vs Sim/Nao ou grupos/classes de treino vs valores preditos?
chisq.test(train$clade, predict(multinom_modelo_step1))
## Warning in chisq.test(train$clade, predict(multinom_modelo_step1)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  train$clade and predict(multinom_modelo_step1)
## X-squared = 77636, df = 36, p-value < 2.2e-16

Qualidade e Significado do Modelo

### Qualidade e Significado do Modelo completo e modelo step AIC
            

########################################################

      #Deviance
deviance(multinom_model_0)
## [1] 36427.69
deviance(multinom_model_completo)
## [1] 725.843
deviance(multinom_modelo_step1)
## [1] 688.6486
########################################################

      #AIC
AIC(multinom_model_completo)
## [1] 953.843
AIC(multinom_model_0)
## [1] 36439.69
AIC(multinom_modelo_step1)
## [1] 880.6486
########################################################

      #Hoslem 
#install.packages("generalhoslem")
library(generalhoslem)
## Warning: package 'generalhoslem' was built under R version 4.0.5
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
#H0: Modelo corrente é apropriado vs Modelo nao é apropriado

logitgof(train$clade,fitted(multinom_model_completo) ,ord=F, g=10) #como p value = 1, concluimos que ha indicaçao que nao ha evidencia de que as frequencias observadas sejam muito diferentes das esperadas (evidencia de um bom fit). Este teste vai comparar frequencias observadas com as esperadas do outcome e fazer um teste estatistico que é distribuido de acordo com a distribuiçao qui-quadrado. DUVIDA se podemos usar isto uma vez que algumas celulas podem nao satisfazer pressupostos de frequencias do qui quadrado
## Warning in logitgof(train$clade, fitted(multinom_model_completo), ord =
## F, : At least one cell in the expected frequencies table is < 1. Chi-square
## approximation may be incorrect.
## 
##  Hosmer and Lemeshow test (multinomial model)
## 
## data:  train$clade, fitted(multinom_model_completo)
## X-squared = 15.898, df = 48, p-value = 1
# Teste HL para modelo step aic
logitgof(train$clade,fitted(multinom_modelo_step1) ,ord=F, g=10)
## Warning in logitgof(train$clade, fitted(multinom_modelo_step1), ord = F, :
## At least one cell in the expected frequencies table is < 1. Chi-square
## approximation may be incorrect.
## Warning in logitgof(train$clade, fitted(multinom_modelo_step1), ord = F, : Not
## possible to compute 10 rows. There might be too few observations.
## 
##  Hosmer and Lemeshow test (multinomial model)
## 
## data:  train$clade, fitted(multinom_modelo_step1)
## X-squared = 4.6717, df = 30, p-value = 1
########################################################

library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
#PseudoR2 para o modelo completo
PseudoR2(multinom_model_completo, which=c("McFadden")) # McFadden = LL(null) - LL(completo) / LL(null) -> 0.98. Indica uma relaçao de 98% entre os preditores e o previsto.
## Warning in PseudoR2(multinom_model_completo, which = c("McFadden")): Could
## not find model or data element of multinom object for evaluating PseudoR2 null
## model. Will fit null model with new evaluation of 'train'. Ensure object has not
## changed since initial call, or try running multinom with 'model = TRUE'
## Warning: Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.
##  McFadden 
## 0.9800744
PseudoR2(multinom_model_completo, which=c("Nagelkerke")) #Indica relaçao de 99% entre preditores e previsto
## Warning in PseudoR2(multinom_model_completo, which = c("Nagelkerke")): Could not find model or data element of multinom object for evaluating PseudoR2 null model. Will fit null model with new evaluation of 'train'. Ensure object has not changed since initial call, or try running multinom with 'model = TRUE'

## Warning in PseudoR2(multinom_model_completo, which = c("Nagelkerke")): Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.
## Nagelkerke 
##  0.9954993
PseudoR2(multinom_model_completo, which=c("CoxSnell"))
## Warning in PseudoR2(multinom_model_completo, which = c("CoxSnell")): Could not find model or data element of multinom object for evaluating PseudoR2 null model. Will fit null model with new evaluation of 'train'. Ensure object has not changed since initial call, or try running multinom with 'model = TRUE'

## Warning in PseudoR2(multinom_model_completo, which = c("CoxSnell")): Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.
##  CoxSnell 
## 0.9155335
#PseudoR2 para o modelo step aic
PseudoR2(multinom_modelo_step1, which=c("McFadden"))
## Warning in PseudoR2(multinom_modelo_step1, which = c("McFadden")): Could not find model or data element of multinom object for evaluating PseudoR2 null model. Will fit null model with new evaluation of 'train'. Ensure object has not changed since initial call, or try running multinom with 'model = TRUE'

## Warning in PseudoR2(multinom_modelo_step1, which = c("McFadden")): Using formula(x) is deprecated when x is a character vector of length > 1.
##   Consider formula(paste(x, collapse = " ")) instead.
##  McFadden 
## 0.9810955
#NOTA: estas estatisticas na regresso logistica nao significam o mesmo que os R quadrados na regressao com metodo OLS (proporçao de variancia da resposta explicada pelos preditores).

########################################################

      # Accuracy / tabela de classificaçao - MODELO COMPLETO ########################################################

            # Predicting and validating the model
                  # Predicting the values for train dataset
train$cladePredicted <- predict(multinom_model_completo, newdata = train, "class")
#Classification table
confusionMatrix(train$cladePredicted, train$clade) #confusionMatrix(predicted,actual) - https://rpubs.com/beane/n4_2
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               61    2    0         0               0               0
##   20A               42  763   21         3               1               0
##   20B                0   38  755         4               0               0
##   20E (EU1)          0    2    4       948               0               0
##   20I (Alpha, V1)    0    0    1         0            3748               0
##   20J (Gamma, V3)    0    0    0         0               0             149
##   21J (Delta)        0    0    1         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       2
##   20B                       1
##   20E (EU1)                 0
##   20I (Alpha, V1)           0
##   20J (Gamma, V3)           0
##   21J (Delta)            7900
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9916         
##                  95% CI : (0.9899, 0.993)
##     No Information Rate : 0.5471         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9864         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.592233    0.94783    0.96547          0.99267
## Specificity            0.999861    0.99494    0.99685          0.99956
## Pos Pred Value         0.968254    0.91707    0.94612          0.99371
## Neg Pred Value         0.997080    0.99691    0.99802          0.99948
## Prevalence             0.007130    0.05572    0.05413          0.06611
## Detection Rate         0.004223    0.05282    0.05226          0.06562
## Detection Prevalence   0.004361    0.05759    0.05524          0.06604
## Balanced Accuracy      0.796047    0.97138    0.98116          0.99611
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          0.9997                1.00000
## Specificity                          0.9999                1.00000
## Pos Pred Value                       0.9997                1.00000
## Neg Pred Value                       0.9999                1.00000
## Prevalence                           0.2595                0.01031
## Detection Rate                       0.2594                0.01031
## Detection Prevalence                 0.2595                0.01031
## Balanced Accuracy                    0.9998                1.00000
##                      Class: 21J (Delta)
## Sensitivity                      0.9996
## Specificity                      0.9998
## Pos Pred Value                   0.9999
## Neg Pred Value                   0.9995
## Prevalence                       0.5471
## Detection Rate                   0.5469
## Detection Prevalence             0.5469
## Balanced Accuracy                0.9997
#Calculating accruracy - sum of diagonal elements divided by total obs
#tab_train = table(train$clade, train$cladePredicted)
#round((sum(diag(tab_train))/sum(tab_train))*100,2)


# Predicting the class for test dataset
test$cladePredicted <- predict(multinom_model_completo, newdata = test, "class")
# Tabela de confusao para os dados de teste com o modelo completo
confusionMatrix(test$cladePredicted, test$clade)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               26    0    0         0               0               0
##   20A                8  252   11         1               0               0
##   20B                0   12  248         2               1               0
##   20E (EU1)          0    4    1       315               0               0
##   20I (Alpha, V1)    0    0    0         0            1248               0
##   20J (Gamma, V3)    0    0    0         0               0              49
##   21J (Delta)        0    0    0         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       1
##   20B                       0
##   20E (EU1)                 0
##   20I (Alpha, V1)           1
##   20J (Gamma, V3)           0
##   21J (Delta)            2632
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9913          
##                  95% CI : (0.9882, 0.9937)
##     No Information Rate : 0.5474          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.986           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.764706    0.94030    0.95385          0.99057
## Specificity            1.000000    0.99538    0.99670          0.99889
## Pos Pred Value         1.000000    0.92308    0.94297          0.98438
## Neg Pred Value         0.998328    0.99647    0.99736          0.99933
## Prevalence             0.007066    0.05569    0.05403          0.06608
## Detection Rate         0.005403    0.05237    0.05154          0.06546
## Detection Prevalence   0.005403    0.05673    0.05466          0.06650
## Balanced Accuracy      0.882353    0.96784    0.97528          0.99473
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          0.9992                1.00000
## Specificity                          0.9997                1.00000
## Pos Pred Value                       0.9992                1.00000
## Neg Pred Value                       0.9997                1.00000
## Prevalence                           0.2596                0.01018
## Detection Rate                       0.2594                0.01018
## Detection Prevalence                 0.2596                0.01018
## Balanced Accuracy                    0.9995                1.00000
##                      Class: 21J (Delta)
## Sensitivity                      0.9992
## Specificity                      1.0000
## Pos Pred Value                   1.0000
## Neg Pred Value                   0.9991
## Prevalence                       0.5474
## Detection Rate                   0.5470
## Detection Prevalence             0.5470
## Balanced Accuracy                0.9996
#Calculating accruracy - Já calculada na matriz de confusao do package caret
#tab_test <- table(test$clade, test$cladePredicted )
#round((sum(diag(tab_test))/sum(tab_test))*100,2) - NOTA: poderiamos usar este metodo para ver a precisao do modelo se a tabela não tivesse as variantes que foram removidas no inicio (todas as variantes que estao pouco representadas -> menos de 200 observaçoes, por exemplo).O facto desta table ainda ter os valores "0" de variantes que não estáo presentes nos dados originais, vai alterar bastante os resultados. Neste caso, vamos usar a matriz de confusão que já trata desse pormenor e apresenta estatisticas mais robustas.

      # Accuracy table - MODELO STEP1 #############################################################################################

# Predicting the values for train dataset
train$cladePredictedStep <- predict(multinom_modelo_step1, newdata = train, "class")
#Classification table
confusionMatrix(train$cladePredictedStep, train$clade)
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               61    2    0         0               0               0
##   20A               42  768   24         3               0               0
##   20B                0   31  754         5               0               0
##   20E (EU1)          0    4    4       947               0               0
##   20I (Alpha, V1)    0    0    0         0            3749               0
##   20J (Gamma, V3)    0    0    0         0               0             149
##   21J (Delta)        0    0    0         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       1
##   20B                       0
##   20E (EU1)                 0
##   20I (Alpha, V1)           0
##   20J (Gamma, V3)           0
##   21J (Delta)            7902
## 
## Overall Statistics
##                                           
##                Accuracy : 0.992           
##                  95% CI : (0.9904, 0.9934)
##     No Information Rate : 0.5471          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9871          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.592233    0.95404    0.96419          0.99162
## Specificity            0.999861    0.99487    0.99737          0.99941
## Pos Pred Value         0.968254    0.91647    0.95443          0.99162
## Neg Pred Value         0.997080    0.99728    0.99795          0.99941
## Prevalence             0.007130    0.05572    0.05413          0.06611
## Detection Rate         0.004223    0.05316    0.05219          0.06555
## Detection Prevalence   0.004361    0.05801    0.05469          0.06611
## Balanced Accuracy      0.796047    0.97445    0.98078          0.99552
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          1.0000                1.00000
## Specificity                          1.0000                1.00000
## Pos Pred Value                       1.0000                1.00000
## Neg Pred Value                       1.0000                1.00000
## Prevalence                           0.2595                0.01031
## Detection Rate                       0.2595                0.01031
## Detection Prevalence                 0.2595                0.01031
## Balanced Accuracy                    1.0000                1.00000
##                      Class: 21J (Delta)
## Sensitivity                      0.9999
## Specificity                      1.0000
## Pos Pred Value                   1.0000
## Neg Pred Value                   0.9998
## Prevalence                       0.5471
## Detection Rate                   0.5470
## Detection Prevalence             0.5470
## Balanced Accuracy                0.9999
#Calculating accruracy - Já calculada na matriz de confusao do package caret
#tab_train_step = table(train$clade, train$cladePredictedStep)
#round((sum(diag(tab_train_step))/sum(tab_train_step))*100,2)

# Predicting the class for test dataset
test$cladePredictedStep <- predict(multinom_modelo_step1, newdata = test, "class")
# Tabela de confusao para os dados de teste com o modelo de seleçao de variaveis automatica
cm_teste = confusionMatrix(test$cladePredictedStep, test$clade)
cm_teste
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               25    0    0         1               0               0
##   20A                9  254    9         0               0               0
##   20B                0    9  250         3               1               1
##   20E (EU1)          0    4    1       314               0               0
##   20I (Alpha, V1)    0    0    0         0            1248               0
##   20J (Gamma, V3)    0    1    0         0               0              48
##   21J (Delta)        0    0    0         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       0
##   20B                       0
##   20E (EU1)                 0
##   20I (Alpha, V1)           0
##   20J (Gamma, V3)           0
##   21J (Delta)            2634
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9919          
##                  95% CI : (0.9889, 0.9942)
##     No Information Rate : 0.5474          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.987           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.735294    0.94776    0.96154          0.98742
## Specificity            0.999791    0.99604    0.99692          0.99889
## Pos Pred Value         0.961538    0.93382    0.94697          0.98433
## Neg Pred Value         0.998120    0.99692    0.99780          0.99911
## Prevalence             0.007066    0.05569    0.05403          0.06608
## Detection Rate         0.005195    0.05278    0.05195          0.06525
## Detection Prevalence   0.005403    0.05653    0.05486          0.06629
## Balanced Accuracy      0.867542    0.97190    0.97923          0.99315
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          0.9992               0.979592
## Specificity                          1.0000               0.999790
## Pos Pred Value                       1.0000               0.979592
## Neg Pred Value                       0.9997               0.999790
## Prevalence                           0.2596               0.010183
## Detection Rate                       0.2594               0.009975
## Detection Prevalence                 0.2594               0.010183
## Balanced Accuracy                    0.9996               0.989691
##                      Class: 21J (Delta)
## Sensitivity                      1.0000
## Specificity                      1.0000
## Pos Pred Value                   1.0000
## Neg Pred Value                   1.0000
## Prevalence                       0.5474
## Detection Rate                   0.5474
## Detection Prevalence             0.5474
## Balanced Accuracy                1.0000
#tab_test_step <- table(test$clade, test$cladePredictedStep)
#round((sum(diag(tab_test_step))/sum(tab_test_step))*100,2)

########################################################
# LR Test para o modelo completo

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lrtest(multinom_model_0,multinom_model_completo) # modelo vazio vs modelo completo - semelhante a chisq.test do goodness of fit
indels = c("C.T","A.G","G.A","G.C","T.C","C.A","G.T","A.C","C.G","A.del","TAT.del","TCTGGTTTT.del","TACATG.del","GATTTC.del")

my_list = list()
# for (i in 1:length(indels)) {
#       assign(paste0("lrtest_results_",indels[i]), my_list[[i]]) = lrtest(multinom_model_completo, indels[i])
# }
my_list <- lapply(indels, function(i) lrtest(multinom_model_completo, i))
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 2044.470892
## iter  20 value 1197.486180
## iter  30 value 620.517384
## iter  40 value 457.933356
## iter  50 value 409.225179
## iter  60 value 397.903307
## iter  70 value 387.294104
## iter  80 value 381.656989
## iter  90 value 377.334823
## iter 100 value 372.852830
## final  value 372.852830 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6004.888113
## iter  20 value 4274.741209
## iter  30 value 2729.838520
## iter  40 value 1536.791636
## iter  50 value 813.009285
## iter  60 value 588.960420
## iter  70 value 488.009152
## iter  80 value 428.916595
## iter  90 value 399.296734
## iter 100 value 385.913149
## final  value 385.913149 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 5122.446547
## iter  20 value 2925.316847
## iter  30 value 2235.856244
## iter  40 value 1384.315548
## iter  50 value 902.058166
## iter  60 value 645.535924
## iter  70 value 570.994887
## iter  80 value 522.994939
## iter  90 value 500.420978
## iter 100 value 489.555683
## final  value 489.555683 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 3053.122412
## iter  20 value 1982.713930
## iter  30 value 1581.573217
## iter  40 value 1145.226467
## iter  50 value 814.138559
## iter  60 value 668.465423
## iter  70 value 611.980836
## iter  80 value 585.563730
## iter  90 value 567.410519
## iter 100 value 557.665137
## final  value 557.665137 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6020.763192
## iter  20 value 2764.925357
## iter  30 value 1889.578620
## iter  40 value 1275.998795
## iter  50 value 719.308720
## iter  60 value 506.405243
## iter  70 value 429.713038
## iter  80 value 387.707209
## iter  90 value 362.746165
## iter 100 value 355.781312
## final  value 355.781312 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 5049.066538
## iter  20 value 3444.220002
## iter  30 value 2454.843183
## iter  40 value 1365.780662
## iter  50 value 772.630445
## iter  60 value 559.582298
## iter  70 value 467.100355
## iter  80 value 428.680253
## iter  90 value 397.389225
## iter 100 value 382.807724
## final  value 382.807724 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 2131.225543
## iter  20 value 1289.632625
## iter  30 value 1012.110753
## iter  40 value 728.484616
## iter  50 value 621.097331
## iter  60 value 541.757593
## iter  70 value 500.237016
## iter  80 value 464.547806
## iter  90 value 453.059843
## iter 100 value 445.542810
## final  value 445.542810 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6546.001858
## iter  20 value 5028.216636
## iter  30 value 3325.910399
## iter  40 value 1743.457780
## iter  50 value 862.313717
## iter  60 value 598.816494
## iter  70 value 464.840973
## iter  80 value 402.426178
## iter  90 value 378.550368
## iter 100 value 366.693187
## final  value 366.693187 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6947.808917
## iter  20 value 5594.411276
## iter  30 value 3472.044039
## iter  40 value 1896.219247
## iter  50 value 1304.700181
## iter  60 value 924.594774
## iter  70 value 772.285061
## iter  80 value 685.290052
## iter  90 value 660.015971
## iter 100 value 640.024851
## final  value 640.024851 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6462.765663
## iter  20 value 5003.841955
## iter  30 value 2521.021392
## iter  40 value 1383.422730
## iter  50 value 784.258090
## iter  60 value 558.847233
## iter  70 value 466.827683
## iter  80 value 414.262481
## iter  90 value 389.255126
## iter 100 value 380.104329
## final  value 380.104329 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6354.143309
## iter  20 value 4896.431204
## iter  30 value 2926.951316
## iter  40 value 1616.895299
## iter  50 value 813.194271
## iter  60 value 572.814143
## iter  70 value 452.232947
## iter  80 value 399.863198
## iter  90 value 371.951519
## iter 100 value 360.310803
## final  value 360.310803 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6355.384340
## iter  20 value 4903.436042
## iter  30 value 3001.134836
## iter  40 value 1707.300576
## iter  50 value 874.580984
## iter  60 value 616.502220
## iter  70 value 479.217359
## iter  80 value 427.136907
## iter  90 value 395.623407
## iter 100 value 379.665116
## final  value 379.665116 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6354.335058
## iter  20 value 4906.838126
## iter  30 value 2983.371623
## iter  40 value 1447.111455
## iter  50 value 841.413564
## iter  60 value 619.316258
## iter  70 value 482.859359
## iter  80 value 420.392281
## iter  90 value 390.587787
## iter 100 value 378.632938
## final  value 378.632938 
## stopped after 100 iterations
## # weights:  133 (108 variable)
## initial  value 28110.618013 
## iter  10 value 6529.141917
## iter  20 value 5224.690067
## iter  30 value 3434.839000
## iter  40 value 1645.504780
## iter  50 value 948.667685
## iter  60 value 631.377097
## iter  70 value 483.825872
## iter  80 value 411.052203
## iter  90 value 377.603994
## iter 100 value 364.839037
## final  value 364.839037 
## stopped after 100 iterations
names(my_list) = paste0("lrtest_results_", indels)
my_list # estes resultados podem ser usados para obter a significancia dos preditores para o modelo. Todos os preditores que têm p-value do chisq < 0.05 têm efeitos significativos na prediçao da varainte tendo em conta os SNVs
## $lrtest_results_C.T
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)   
## 1 114 -362.92                        
## 2 108 -372.85 -6 19.863    0.00293 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_A.G
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + G.A + G.C + T.C + C.A + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -385.91 -6 45.983  2.983e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_G.A
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.C + T.C + C.A + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -489.56 -6 253.27  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_G.C
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + T.C + C.A + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -557.67 -6 389.49  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_T.C
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + C.A + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df Chisq Pr(>Chisq)  
## 1 114 -362.92                      
## 2 108 -355.78 -6 14.28    0.02666 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_C.A
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + G.T + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -382.81 -6 39.772  5.049e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_G.T
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + A.T + T.A + T.G + 
##     A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -445.54 -6 165.24  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_A.C
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1 114 -362.92                     
## 2 108 -366.69 -6 7.5433     0.2735
## 
## $lrtest_results_C.G
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -640.02 -6 554.21  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_A.del
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -380.10 -6 34.366  5.717e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_TAT.del
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TCTGGTTTT.del + TACATG.del + AGTTCA.del + 
##     GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)
## 1 114 -362.92                     
## 2 108 -360.31 -6 5.2214     0.5157
## 
## $lrtest_results_TCTGGTTTT.del
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TACATG.del + AGTTCA.del + 
##     GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -379.67 -6 33.487  8.448e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_TACATG.del
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + AGTTCA.del + 
##     GATTTC.del
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1 114 -362.92                         
## 2 108 -378.63 -6 31.423  2.105e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $lrtest_results_GATTTC.del
## Likelihood ratio test
## 
## Model 1: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del + GATTTC.del
## Model 2: clade ~ C.T + A.G + G.A + G.C + T.C + C.A + G.T + A.T + T.A + 
##     T.G + A.C + C.G + A.del + TAT.del + TCTGGTTTT.del + TACATG.del + 
##     AGTTCA.del
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1 114 -362.92                    
## 2 108 -364.84 -6 3.835      0.699
# Estes resultados indicam quais os preditores que nos permitem prever a categoria de resposta (qual a variante), mas nao nos dizem qual o efeito. Para isso teremos que olhar para os parametros estimados (Beta) tendo em conta a clade 19A como referencia

# * O LL é uma medida que nos indica a variabilidade não explicada nos nossos dados. A diferença ou mudança no LL indica quanta nova variancia foi explicada pleo modelo
# * Teste do qui-quadrado testa a diminuiçao da variancia nao explicada a partir do modelo baseline para o modelo final. Esta alteraçao é significativa o que significa que o nosso modelo final explica uma quantidade significante da variabilidade original
# * O LR chi-suqared com p-value < 0.05 diz-nos que o nosso modelo consegue ter um melhor "fit" do que o modelo vazio (modelo sem preditores)


########################################################

      # Resultados gerais (OR, AIC, values) em formato de publicaçao

multinom_model_rrr =  exp(coef(multinom_model_completo))

#stargazer(multinom_model_completo, type="html", coef=list(multinom_model_rrr), out="multinom_model.htm")

multinom_model_rrr_Step =  exp(coef(multinom_modelo_step1))

#stargazer(multinom_modelo_step1, type="html", coef=list(multinom_model_rrr_Step), out="multinom_model_step.htm")
plt_teste = as.data.frame(cm_teste$table)
ggplot(plt_teste, aes(Prediction,Reference, fill= Freq)) +
        geom_tile() + geom_text(aes(label=Freq)) +
        scale_fill_gradient(low="white", high="#009194") +
        labs(x = "Reference",y = "Prediction") +
        scale_x_discrete(labels=c("19A","20A","20B","20E","20I", "20J(Gamma)", "20J (Delta)")) +
        scale_y_discrete(labels=c("19A","20A","20B","20E","20I", "20J(Gamma)", "20J (Delta)"))

Modelo Multinomial - PCA

Matriz de correlaçao com componentes principais

library(ggcorrplot)

MatCorr=cor(pca_val[,sapply(pca_val,is.numeric)])
#ggcorrplot(MatCorr)
ggcorrplot(MatCorr,hc.order = TRUE,type = "lower",lab = TRUE) # Podemos reparar que ao usar as componentes principais, removemos o problema de variaveis demasiado correlacionadas uma vez que cada dimensao/componente principal nao vai estar correlacionada com as restantes

Dados de treino e teste - PCA

library(caret)
set.seed(42)
index_pca <- createDataPartition(pca_val$snv_data.clade, p = .75, list = FALSE)
train_pca <- pca_val[index_pca,]
test_pca <- pca_val[-index_pca,]

# Set the reference
pca_val$snv_data.clade = relevel(pca_val$snv_data.clade, ref = "19A")

Modelo PCA

# Training the multinomial classification model

multinom_model_pca = multinom(snv_data.clade ~., data = train_pca)
## # weights:  49 (36 variable)
## initial  value 28110.618013 
## iter  10 value 3083.404673
## iter  20 value 1552.750191
## iter  30 value 1397.125602
## iter  40 value 1373.588884
## iter  50 value 1367.399994
## iter  60 value 1365.382100
## iter  70 value 1365.130364
## iter  80 value 1365.109430
## iter  90 value 1365.098285
## iter 100 value 1364.975429
## final  value 1364.975429 
## stopped after 100 iterations
summary(multinom_model_pca)
## Call:
## multinom(formula = snv_data.clade ~ ., data = train_pca)
## 
## Coefficients:
##                 (Intercept)       Dim.1     Dim.2     Dim.3     Dim.4     Dim.5
## 20A                27.52153   0.5054802  2.762696 22.886645 10.254100  7.513742
## 20B                23.61655   3.4776278  2.765928 23.831167  7.912341 10.844313
## 20E (EU1)          30.23470  -1.3826206  3.497861 22.561270 12.152909  6.605696
## 20I (Alpha, V1)   -18.11041  33.6863939 28.510388  9.688697 20.985295 13.896725
## 20J (Gamma, V3)    20.15643   4.6350515  5.196110 25.813882  9.541950 10.494384
## 21J (Delta)        27.61212 -36.8304474 10.451064 10.077364 14.267332 11.296676
## 
## Std. Errors:
##                 (Intercept)     Dim.1      Dim.2    Dim.3     Dim.4      Dim.5
## 20A                3.558578  1.103285  0.6933044 3.511976  1.556480  0.9842128
## 20B                3.610695  1.120748  0.7040766 3.511592  1.562298  1.0129502
## 20E (EU1)          3.580547  1.119667  0.6985851 3.512825  1.559495  0.9860236
## 20I (Alpha, V1)   12.078205  4.945364 14.1384666 7.459391 29.869731 33.8233869
## 20J (Gamma, V3)    5.628681  1.881080  1.2586137 3.572310  1.682587  1.3980164
## 21J (Delta)        5.850562 21.727312  4.7573876 9.345961  4.671376  4.4236322
## 
## Residual Deviance: 2729.951 
## AIC: 2801.951
#resumo_multinom_model_PCA = tidy(multinom_model_pca, conf.int=T, conf.level = 0.95, exponentiate = T) # Este código já tem 1) Coeficientes em exponencial, 2) z-scores, 3) standard errors, 4) respetivos p-values para Wald Z test bilateral
#resumo_multinom_model_PCA


#exp(cbind(OR = coef(multinom_model_pca)))

# Predicted values are saved as fitted.values in the model object

head(round(fitted(multinom_model_pca),2))
##    19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3) 21J (Delta)
## 1 0.96 0.03 0.01      0.00               0               0           0
## 2 0.00 0.04 0.95      0.00               0               0           0
## 3 0.25 0.60 0.03      0.12               0               0           0
## 4 0.00 0.04 0.95      0.00               0               0           0
## 5 0.25 0.60 0.03      0.12               0               0           0
## 6 0.25 0.60 0.03      0.12               0               0           0
#Convert the coeff to odds
exp(cbind(OR = coef(multinom_model_pca), confint(multinom_model_pca)))
## Warning in cbind(OR = coef(multinom_model_pca), confint(multinom_model_pca)):
## number of rows of result is not a multiple of vector length (arg 2)
##                  (Intercept)        Dim.1        Dim.2        Dim.3
## 20A             8.962881e+11 1.657781e+00 1.584249e+01 8.700490e+09
## 20B             1.805244e+10 3.238281e+01 1.589379e+01 2.237405e+10
## 20E (EU1)       1.351333e+13 2.509201e-01 3.304470e+01 6.283982e+09
## 20I (Alpha, V1) 1.363797e-08 4.263978e+14 2.409375e+12 1.613421e+04
## 20J (Gamma, V3) 5.673181e+08 1.030332e+02 1.805685e+02 1.624899e+11
## 21J (Delta)     9.812781e+11 1.010974e-16 3.458116e+04 2.379818e+04
##                        Dim.4        Dim.5             
## 20A             2.839872e+04    1833.0597 8.382633e+08
## 20B             2.730775e+03   51241.8943 1.907256e-01
## 20E (EU1)       1.896450e+05     739.2943 4.070818e+00
## 20I (Alpha, V1) 1.299564e+09 1084603.6731 8.915479e+06
## 20J (Gamma, V3) 1.393208e+04   36112.1201 1.344065e+03
## 21J (Delta)     1.571171e+06   80553.4020 2.663258e+02
# Predicted values are saved as fitted.values in the model object

head(round(fitted(multinom_model_pca),2))
##    19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3) 21J (Delta)
## 1 0.96 0.03 0.01      0.00               0               0           0
## 2 0.00 0.04 0.95      0.00               0               0           0
## 3 0.25 0.60 0.03      0.12               0               0           0
## 4 0.00 0.04 0.95      0.00               0               0           0
## 5 0.25 0.60 0.03      0.12               0               0           0
## 6 0.25 0.60 0.03      0.12               0               0           0
#The multinomial regression predicts the probability of a particular observation to be part of the said level. This is what we are seeing in the above table. Columns represent the classification levels and rows represent the observations. This means that the first six observation are classified as certain clades


### Qualidade e Significado do Modelo completo e modelo step AIC
            

########################################################

      #Deviance
deviance(multinom_model_0)
## [1] 36427.69
deviance(multinom_model_completo)
## [1] 725.843
deviance(multinom_modelo_step1)
## [1] 688.6486
deviance(multinom_model_pca)
## [1] 2729.951
########################################################

      #AIC
AIC(multinom_model_completo)
## [1] 953.843
AIC(multinom_model_0)
## [1] 36439.69
AIC(multinom_modelo_step1)
## [1] 880.6486
AIC(multinom_model_pca)
## [1] 2801.951
########################################################

      #Hoslem 
#install.packages("generalhoslem")
library(generalhoslem)

#H0: Modelo corrente é apropriado vs Modelo nao é apropriado

logitgof(train_pca$snv_data.clade,fitted(multinom_model_pca) ,ord=F, g=10) 
## Warning in logitgof(train_pca$snv_data.clade, fitted(multinom_model_pca), :
## At least one cell in the expected frequencies table is < 1. Chi-square
## approximation may be incorrect.
## Warning in logitgof(train_pca$snv_data.clade, fitted(multinom_model_pca), : Not
## possible to compute 10 rows. There might be too few observations.
## 
##  Hosmer and Lemeshow test (multinomial model)
## 
## data:  train_pca$snv_data.clade, fitted(multinom_model_pca)
## X-squared = 47.845, df = 0, p-value < 2.2e-16
########################################################

library(DescTools)

#PseudoR2 para o modelo completo
PseudoR2(multinom_model_pca, which=c("McFadden")) # McFadden = LL(null) - LL(completo) / LL(null) -> 0.98. Indica uma relaçao de 98% entre os preditores e o previsto.
## Warning in PseudoR2(multinom_model_pca, which = c("McFadden")): Could not find
## model or data element of multinom object for evaluating PseudoR2 null model.
## Will fit null model with new evaluation of 'train_pca'. Ensure object has not
## changed since initial call, or try running multinom with 'model = TRUE'
##  McFadden 
## 0.9250584
PseudoR2(multinom_model_pca, which=c("Nagelkerke")) #Indica relaçao de 99% entre preditores e previsto
## Warning in PseudoR2(multinom_model_pca, which = c("Nagelkerke")): Could not find
## model or data element of multinom object for evaluating PseudoR2 null model.
## Will fit null model with new evaluation of 'train_pca'. Ensure object has not
## changed since initial call, or try running multinom with 'model = TRUE'
## Nagelkerke 
##  0.9818315
PseudoR2(multinom_model_pca, which=c("CoxSnell"))
## Warning in PseudoR2(multinom_model_pca, which = c("CoxSnell")): Could not find
## model or data element of multinom object for evaluating PseudoR2 null model.
## Will fit null model with new evaluation of 'train_pca'. Ensure object has not
## changed since initial call, or try running multinom with 'model = TRUE'
##  CoxSnell 
## 0.9029636
#NOTA: estas estatisticas na regresso logistica nao significam o mesmo que os R quadrados na regressao com metodo OLS (proporçao de variancia da resposta explicada pelos preditores).

########################################################

      # Accuracy / tabela de classificaçao - MODELO COMPLETO ########################################################

            # Predicting and validating the model
                  # Predicting the values for train dataset
train_pca$cladePredictedPCA <- predict(multinom_model_pca, newdata = train_pca, "class")
#Classification table


confusionMatrix(train_pca$cladePredictedPCA, train_pca$snv_data.clade) #confusionMatrix(predicted,actual) - https://rpubs.com/beane/n4_2
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               58    0    0         0               0               0
##   20A               39  610   32       166               0               0
##   20B                4   45  738         3               0               2
##   20E (EU1)          2  150   11       786               0               0
##   20I (Alpha, V1)    0    0    0         0            3749               0
##   20J (Gamma, V3)    0    0    1         0               0             147
##   21J (Delta)        0    0    0         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       0
##   20B                       0
##   20E (EU1)                 0
##   20I (Alpha, V1)           0
##   20J (Gamma, V3)           0
##   21J (Delta)            7903
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9685          
##                  95% CI : (0.9655, 0.9713)
##     No Information Rate : 0.5471          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9494          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.563107    0.75776    0.94373          0.82304
## Specificity            1.000000    0.98263    0.99605          0.98792
## Pos Pred Value         1.000000    0.72019    0.93182          0.82824
## Neg Pred Value         0.996872    0.98566    0.99678          0.98748
## Prevalence             0.007130    0.05572    0.05413          0.06611
## Detection Rate         0.004015    0.04223    0.05109          0.05441
## Detection Prevalence   0.004015    0.05863    0.05482          0.06569
## Balanced Accuracy      0.781553    0.87019    0.96989          0.90548
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          1.0000                0.98658
## Specificity                          1.0000                0.99993
## Pos Pred Value                       1.0000                0.99324
## Neg Pred Value                       1.0000                0.99986
## Prevalence                           0.2595                0.01031
## Detection Rate                       0.2595                0.01018
## Detection Prevalence                 0.2595                0.01025
## Balanced Accuracy                    1.0000                0.99325
##                      Class: 21J (Delta)
## Sensitivity                      1.0000
## Specificity                      1.0000
## Pos Pred Value                   1.0000
## Neg Pred Value                   1.0000
## Prevalence                       0.5471
## Detection Rate                   0.5471
## Detection Prevalence             0.5471
## Balanced Accuracy                1.0000
train_pca
#Calculating accruracy - sum of diagonal elements divided by total obs
#tab_train = table(train$clade, train$cladePredicted)
#round((sum(diag(tab_train))/sum(tab_train))*100,2)

# Predicting the class for test dataset
test_pca$cladePredictedPCA <- predict(multinom_model_pca, newdata = test_pca, "class")
# Tabela de confusao para os dados de teste com o modelo completo
cm_teste_pca = confusionMatrix(test_pca$cladePredictedPCA, test_pca$snv_data.clade) #confusionMatrix(predicted,actual) - https://rpubs.com/beane/n4_2
cm_teste_pca
## Confusion Matrix and Statistics
## 
##                  Reference
## Prediction         19A  20A  20B 20E (EU1) 20I (Alpha, V1) 20J (Gamma, V3)
##   19A               15    0    0         0               0               0
##   20A               13  210   21        42               1               0
##   20B                2   15  234         2               1               0
##   20E (EU1)          4   42    4       274               1               0
##   20I (Alpha, V1)    0    0    0         0            1246               0
##   20J (Gamma, V3)    0    1    1         0               0              49
##   21J (Delta)        0    0    0         0               0               0
##                  Reference
## Prediction        21J (Delta)
##   19A                       0
##   20A                       0
##   20B                       0
##   20E (EU1)                 3
##   20I (Alpha, V1)           0
##   20J (Gamma, V3)           0
##   21J (Delta)            2631
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9682         
##                  95% CI : (0.9629, 0.973)
##     No Information Rate : 0.5474         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9489         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 19A Class: 20A Class: 20B Class: 20E (EU1)
## Sensitivity            0.441176    0.78358    0.90000          0.86164
## Specificity            1.000000    0.98305    0.99561          0.98798
## Pos Pred Value         1.000000    0.73171    0.92126          0.83537
## Neg Pred Value         0.996039    0.98718    0.99430          0.99019
## Prevalence             0.007066    0.05569    0.05403          0.06608
## Detection Rate         0.003117    0.04364    0.04863          0.05694
## Detection Prevalence   0.003117    0.05964    0.05278          0.06816
## Balanced Accuracy      0.720588    0.88332    0.94780          0.92481
##                      Class: 20I (Alpha, V1) Class: 20J (Gamma, V3)
## Sensitivity                          0.9976                1.00000
## Specificity                          1.0000                0.99958
## Pos Pred Value                       1.0000                0.96078
## Neg Pred Value                       0.9992                1.00000
## Prevalence                           0.2596                0.01018
## Detection Rate                       0.2589                0.01018
## Detection Prevalence                 0.2589                0.01060
## Balanced Accuracy                    0.9988                0.99979
##                      Class: 21J (Delta)
## Sensitivity                      0.9989
## Specificity                      1.0000
## Pos Pred Value                   1.0000
## Neg Pred Value                   0.9986
## Prevalence                       0.5474
## Detection Rate                   0.5468
## Detection Prevalence             0.5468
## Balanced Accuracy                0.9994
########################################################
plt_teste_pca = as.data.frame(cm_teste_pca$table)
ggplot(plt_teste_pca, aes(Prediction,Reference, fill= Freq)) +
        geom_tile() + geom_text(aes(label=Freq)) +
        scale_fill_gradient(low="white", high="#009194") +
        labs(x = "Reference",y = "Prediction") +
        scale_x_discrete(labels=c("19A","20A","20B","20E","20I", "20J(Gamma)", "20J (Delta)")) +
        scale_y_discrete(labels=c("19A","20A","20B","20E","20I", "20J(Gamma)", "20J (Delta)"))

Breve Analise

  • A precisao do modelo aumentou de 0.67 para 0.87 quando usamos 5 dimensoes provenientes do PCA (em vez de usar as 19 variaveis iniciais). Com isto, tambem resolvemos o problema de correlaçao entre variaveis uma vez que passamos de uma matriz muito correlacionada (com as 19 variaveis) para 5 dimensoes/features/componentes principais nao correlacionadas

Duvida - LR test vs Wald Test

  • LR test - Estima 2 modelos e compara o fit de um modelo com o outro. Na funçao lrtest(modelo_completo, “indel”), vamos removendo variaveis preditoras (indels) de um modelo e isto vai quase sempre piorar o fit do modelo (o modelo terá um LL mais baixo), mas é necessario testar se a diferença observada no modelo é estatisticamente significativa. O LR test faz isto comparando os LL dos 2 modelos. Se esta diferença for significativa, entao o modelo menos restritivo (o que tem mais variaveis) tem melhor fit do que o modelo mais restritivo (que perdeu um dos preditores). A estatistica teste resultante é um qui-quadrado em que os degrees of freedom = numero de parametros removidos.

  • O teste de wald é semelhante ao LR com a vantagem de que so requer a estimaçao de um modelo. Este wald test vai testar a H0 em que um conjunto de parametros é igual a um determinado valor (=0). Se teste nao rejeitar H0, isto sugere que remover a varaivel do modelo nao vai interferir com o fit do modelo, uma vez que o preditor com um coeficiente que é baixo relativamente ao seu erro padrao, nao vai fazer muito para ajudar na previsao da variavel dependente. Este teste ajuda a ver o quao longe os parametros estimados estao do valor 0 (in standar errors). Usado para explicar se as variaveis preditores/explicatorias no modelo sao significantes (mesmo que o LR test?)

  • Referir que no modelo completo com as 19 variaveis (indels) nao sera viavel fazer uma analise dos coeficientes estimados uma vez que os dados podem ter separaçao inerente (dar este exemplo breve - https://stats.oarc.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logistic-regression-and-what-are-some-strategies-to-deal-with-the-issue/). Esta separação dos dados vai levar a uma sobre-avaliação dos coeficientes, tornando-os dificeis de interpretar uma vez que alguns podem chegar aos milhares ou mesmo milhoes. Por outro lado, recorrer à analise de componetnes