Objetivo

Entender fatores influenciando no aumento do tamanho do genoma.

Carregamento de pacotes

library(ggplot2)
library(corrplot)
library(dplyr)

Carregamento dos dados

glm_pangenome <- read.csv("~/dev/anvio-summary/glm_pangenome.csv")

colnames(glm_pangenome)
##  [1] "genome_name"                                                
##  [2] "total_length"                                               
##  [3] "gc_content"                                                 
##  [4] "percent_redundancy"                                         
##  [5] "num_genes"                                                  
##  [6] "avg_gene_length"                                            
##  [7] "num_genes_per_kb"                                           
##  [8] "singleton_gene_clusters"                                    
##  [9] "num_gene_clusters"                                          
## [10] "As"                                                         
## [11] "Cd"                                                         
## [12] "Co"                                                         
## [13] "Cr"                                                         
## [14] "Cu"                                                         
## [15] "Fe"                                                         
## [16] "Metal_genes"                                                
## [17] "Mg"                                                         
## [18] "Mn"                                                         
## [19] "Mo"                                                         
## [20] "Ni"                                                         
## [21] "Skip"                                                       
## [22] "Te"                                                         
## [23] "Zn"                                                         
## [24] "Amino_acid_transport_and_metabolism"                        
## [25] "Carbohydrate_transport_and_metabolism"                      
## [26] "Cell_cycle_control_cell_division_chromosome_partitioning"   
## [27] "Cell_motility"                                              
## [28] "Cell_wall_membrane_envelope_biogenesis"                     
## [29] "Chromatin_structure_and_dynamics"                           
## [30] "Coenzyme_transport_and_metabolism"                          
## [31] "Cytoskeleton"                                               
## [32] "Defense_mechanisms"                                         
## [33] "Energy_production_and_conversion"                           
## [34] "Extracellular_structures"                                   
## [35] "Function_unknown"                                           
## [36] "General_function_prediction_only"                           
## [37] "Inorganic_ion_transport_and_metabolism"                     
## [38] "Intracellular_trafficking_secretion_and_vesicular_transport"
## [39] "Lipid_transport_and_metabolism"                             
## [40] "Mobilome_prophages_transposons"                             
## [41] "Nucleotide_transport_and_metabolism"                        
## [42] "Posttranslational_modification_protein_turnover_chaperones" 
## [43] "RNA_processing_and_modification"                            
## [44] "Replication_recombination_and_repair"                       
## [45] "Secondary_metabolites_biosynthesis_transport_and_catabolism"
## [46] "Signal_transduction_mechanisms"                             
## [47] "Transcription"                                              
## [48] "Translation_ribosomal_structure_and_biogenesis"             
## [49] "HMA_genes"

Gráfico de Correlação

You can also embed plots, for example:

glm_pangenome_filt <- glm_pangenome %>%
  select(total_length, gc_content,singleton_gene_clusters,
         Metal_genes, HMA_genes,
         Defense_mechanisms,
         Mobilome_prophages_transposons,
         Amino_acid_transport_and_metabolism,
         Replication_recombination_and_repair)

# Compute correlation matrix
cor_mat <- cor(glm_pangenome_filt, use = "pairwise.complete.obs")

# Plot
corrplot(cor_mat, diag = FALSE)

Modelagem

\(Y\): Tamanho do genoma

mod_1 <- glm(total_length ~ 1, family = poisson, data = glm_pangenome)
mod_metres <- glm(total_length ~ Metal_genes, family = poisson, data = glm_pangenome)
summary(mod_metres)
## 
## Call:
## glm(formula = total_length ~ Metal_genes, family = poisson, data = glm_pangenome)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.476e+01  2.356e-03  6263.7   <2e-16 ***
## Metal_genes 3.536e-03  1.107e-05   319.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207370  on 32  degrees of freedom
## Residual deviance: 105675  on 31  degrees of freedom
## AIC: 106251
## 
## Number of Fisher Scoring iterations: 3
mod_hma <- glm(total_length ~ HMA_genes, family = poisson, data = glm_pangenome)
summary(mod_hma)
## 
## Call:
## glm(formula = total_length ~ HMA_genes, family = poisson, data = glm_pangenome)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 1.488e+01  1.991e-03  7473.4   <2e-16 ***
## HMA_genes   3.226e-03  1.022e-05   315.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207370  on 32  degrees of freedom
## Residual deviance: 107591  on 31  degrees of freedom
## AIC: 108167
## 
## Number of Fisher Scoring iterations: 3
mod_gc <- glm(total_length ~ gc_content, family = poisson, data = glm_pangenome)
summary(mod_gc)
## 
## Call:
## glm(formula = total_length ~ gc_content, family = poisson, data = glm_pangenome)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  26.84964    0.03202   838.4   <2e-16 ***
## gc_content  -32.22405    0.09099  -354.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207370  on 32  degrees of freedom
## Residual deviance:  82949  on 31  degrees of freedom
## AIC: 83526
## 
## Number of Fisher Scoring iterations: 3
mod_mobilome <- glm(total_length ~ Mobilome_prophages_transposons, family = poisson, data = glm_pangenome)
summary(mod_mobilome)
## 
## Call:
## glm(formula = total_length ~ Mobilome_prophages_transposons, 
##     family = poisson, data = glm_pangenome)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    1.541e+01  2.721e-04 56640.2   <2e-16 ***
## Mobilome_prophages_transposons 1.478e-03  4.045e-06   365.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207370  on 32  degrees of freedom
## Residual deviance:  74233  on 31  degrees of freedom
## AIC: 74810
## 
## Number of Fisher Scoring iterations: 3
mod_rep <- glm(total_length ~ Replication_recombination_and_repair, family = poisson, data = glm_pangenome)
summary(mod_rep)
## 
## Call:
## glm(formula = total_length ~ Replication_recombination_and_repair, 
##     family = poisson, data = glm_pangenome)
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.517e+01  1.069e-03 14194.8   <2e-16 ***
## Replication_recombination_and_repair 1.978e-03  6.324e-06   312.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 207370  on 32  degrees of freedom
## Residual deviance: 110548  on 31  degrees of freedom
## AIC: 111124
## 
## Number of Fisher Scoring iterations: 3
AIC(mod_1,
    mod_metres,
    mod_hma,
    mod_gc,
    mod_mobilome,
    mod_rep)
##              df       AIC
## mod_1         1 207944.61
## mod_metres    2 106251.28
## mod_hma       2 108167.05
## mod_gc        2  83525.51
## mod_mobilome  2  74809.86
## mod_rep       2 111124.15
glm_pangenome$y_hat <- predict(mod_hma, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = HMA_genes, y = total_length)) +
  geom_point() +
  geom_line(aes(y = y_hat), color = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

glm_pangenome$y_hat <- predict(mod_mobilome, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = Mobilome_prophages_transposons, y = total_length)) +
  geom_point() +
  geom_line(aes(y = y_hat), color = "red", size = 1)

glm_pangenome$y_hat <- predict(mod_metres, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = Metal_genes, y = total_length)) +
  geom_point() +
  geom_line(aes(y = y_hat), color = "red", size = 1)

Usando mobiloma como modelo base

Construindo novos modelos com mais de uma var explicativa, utilizando mobiloma como ponto de partida

# Mobiloma + gc
mod_mob_gc <- glm(total_length ~ Mobilome_prophages_transposons + gc_content, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_gc, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + gc_content
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        32     207370                          
## 2        31      74233  1   133137 < 2.2e-16 ***
## 3        30      53663  1    20570 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + met res
mod_mob_metres <- glm(total_length ~ Mobilome_prophages_transposons + Metal_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_metres, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + Metal_genes
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        32     207370                          
## 2        31      74233  1   133137 < 2.2e-16 ***
## 3        30      57164  1    17070 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + hma
mod_mob_hma <- glm(total_length ~ Mobilome_prophages_transposons + HMA_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_hma, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + HMA_genes
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        32     207370                          
## 2        31      74233  1   133137 < 2.2e-16 ***
## 3        30      35400  1    38833 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + rep
mod_mob_rep <- glm(total_length ~ Mobilome_prophages_transposons + Replication_recombination_and_repair, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_rep, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + Replication_recombination_and_repair
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        32     207370                          
## 2        31      74233  1   133137 < 2.2e-16 ***
## 3        30      69762  1     4471 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

No modelo aditivo sobre mob, hma explicou melhor que as outras variáveis:

# Investigar interação
# Melhorou, mas não muito
mod_mobXhma <- glm(total_length ~ Mobilome_prophages_transposons * HMA_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_hma, mod_mobXhma, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + HMA_genes
## Model 4: total_length ~ Mobilome_prophages_transposons * HMA_genes
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        32     207370                          
## 2        31      74233  1   133137 < 2.2e-16 ***
## 3        30      35400  1    38833 < 2.2e-16 ***
## 4        29      28552  1     6848 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Validação

anova(mod_mob_hma, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: total_length
## 
## Terms added sequentially (first to last)
## 
## 
##                                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                              32     207370              
## Mobilome_prophages_transposons  1   133137        31      74233 < 2.2e-16 ***
## HMA_genes                       1    38833        30      35400 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

VALORES AJUSTADOS E RESÍDUOS

ajustados <- fitted(mod_mob_hma)
eta_hat <- predict(mod_mob_hma, type = "link")

res_bruto      <- residuals(mod_mob_hma, type = "response")
res_pearson    <- residuals(mod_mob_hma, type = "pearson")
res_desvio     <- residuals(mod_mob_hma, type = "deviance")
res_working    <- residuals(mod_mob_hma, type = "working")

# Resíduos padronizados / studentizados
res_std_pearson <- rstandard(mod_mob_hma, type = "pearson")
res_std_dev     <- rstandard(mod_mob_hma, type = "deviance")
res_stud        <- rstudent(mod_mob_hma)

# Medidas de influência
lev <- hatvalues(mod_mob_hma)
cook <- cooks.distance(mod_mob_hma)
dff <- dffits(mod_mob_hma)

# Tabela-resumo
diagnostico <- data.frame(
  id = glm_pangenome$genome_name,
  y = glm_pangenome$total_length,
  ajustados = ajustados,
  res_bruto = res_bruto,
  res_pearson = res_pearson,
  res_desvio = res_desvio,
  res_std_pearson = res_std_pearson,
  res_std_dev = res_std_dev,
  res_stud = res_stud,
  leverage = lev,
  cook = cook,
  dffits = dff
)

head(diagnostico, 10)
##                       id       y ajustados  res_bruto res_pearson res_desvio
## 1   B_paramycoides_BD_11 5483749   5526665 -42916.105 -18.2552897 -18.278993
## 2   B_paramycoides_BD_12 5774287   5638061 136225.984  57.3713498  57.142612
## 3   B_paramycoides_BD_13 5627347   5625210   2136.909   0.9009830   0.900926
## 4   B_paramycoides_BD_15 5363035   5357812   5222.773   2.2563548   2.255988
## 5   B_paramycoides_BD_31 5364188   5414643 -50454.898 -21.6829527 -21.716759
## 6  B_paramycoides_BD_331 5199139   5221407 -22267.711  -9.7449995  -9.751938
## 7  B_paramycoides_BD_333 5182738   5181644   1094.273   0.4807199   0.480703
## 8  B_paramycoides_BD_335 5363943   5323217  40725.840  17.6515544  17.629118
## 9  B_paramycoides_BD_336 5294777   5335652 -40875.300 -17.6956773 -17.718344
## 10   B_paramycoides_BD_9 5299588   5360228 -60639.869 -26.1918772 -26.241496
##    res_std_pearson res_std_dev    res_stud   leverage         cook       dffits
## 1      -18.9283908 -18.9529678 -18.9512519 0.06985626 8.969358e+00 -0.149421857
## 2       59.8815005  59.6427549  59.6623871 0.08208010 1.068802e+02  0.538226311
## 3        0.9947361   0.9946731   0.9946844 0.17961541 7.221386e-02  0.013321223
## 4        2.3374668   2.3370872   2.3371131 0.06819754 1.332952e-01  0.018097787
## 5      -22.2752389 -22.3099682 -22.3081472 0.05247189 9.159211e+00 -0.151333694
## 6      -10.2412082 -10.2485004 -10.2478111 0.09455671 3.651003e+00 -0.094932669
## 7        0.5075233   0.5075055   0.5075073 0.10283534 9.841494e-03  0.004917813
## 8       18.1642399  18.1411522  18.1424378 0.05565333 6.481461e+00  0.126639003
## 9      -18.1530359 -18.1762880 -18.1751318 0.04975444 5.751396e+00 -0.119600739
## 10     -29.3468302 -29.4024258 -29.3911231 0.20345401 7.332576e+01 -0.430600141

4. IDENTIFICAÇÃO DE PONTOS SUSPEITOS

# Regras práticas
n = 33
limite_res <- 2
limite_lev <- 2 * length(coef(mod_mob_hma)) / n
limite_cook <- 4 / n
limite_dff <- 2 * sqrt(length(coef(mod_mob_hma)) / n)

cat("\nLimites práticos:\n")
## 
## Limites práticos:
cat(" |resíduo studentizado| >", limite_res, "\n")
##  |resíduo studentizado| > 2
cat(" leverage >", round(limite_lev, 4), "\n")
##  leverage > 0.1818
cat(" Cook >", round(limite_cook, 4), "\n")
##  Cook > 0.1212
cat(" |DFFITS| >", round(limite_dff, 4), "\n\n")
##  |DFFITS| > 0.603
suspeitos <- subset(
  diagnostico,
  abs(res_stud) > limite_res |
    leverage > limite_lev |
    cook > limite_cook |
    abs(dffits) > limite_dff
)

suspeitos[order(-abs(suspeitos$res_stud)), ]
##                           id       y ajustados   res_bruto res_pearson
## 17       B_paramycoides_CP40 5619544   5446040  173503.710   74.347837
## 22 B_paramycoides_NPDC093745 5897029   5757414  139614.540   58.185786
## 29 B_paramycoides_NPDC094358 5285347   5155990  129357.198   56.968472
## 12    B_paramycoides_B_14578 5227308   5360885 -133577.214  -57.691805
## 2       B_paramycoides_BD_12 5774287   5638061  136225.984   57.371350
## 20     B_paramycoides_NH24A2 5453107   5570728 -117620.920  -49.834327
## 13    B_paramycoides_B_14580 5227789   5330632 -102843.391  -44.543775
## 25 B_paramycoides_NPDC094098 5286979   5382597  -95617.823  -41.213825
## 11    B_paramycoides_B_14577 5231975   5330632  -98657.391  -42.730725
## 14     B_paramycoides_B_3435 5660034   5745996  -85961.671  -35.860984
## 28 B_paramycoides_NPDC094345 5211806   5134665   77141.387   34.043280
## 32    B_paramycoides_RZ3MS14 5493110   5411817   81292.866   34.944665
## 27 B_paramycoides_NPDC094130 5304543   5230231   74311.668   32.493505
## 10       B_paramycoides_BD_9 5299588   5360228  -60639.869  -26.191877
## 26 B_paramycoides_NPDC094118 5482343   5531585  -49242.415  -20.937004
## 5       B_paramycoides_BD_31 5364188   5414643  -50454.898  -21.682953
## 21 B_paramycoides_NPDC077662 5450429   5401084   49345.225   21.232673
## 15      B_paramycoides_B_615 5560887   5605956  -45068.994  -19.035006
## 1       B_paramycoides_BD_11 5483749   5526665  -42916.105  -18.255290
## 9      B_paramycoides_BD_336 5294777   5335652  -40875.300  -17.695677
## 8      B_paramycoides_BD_335 5363943   5323217   40725.840   17.651554
## 18     B_paramycoides_DE0103 5782815   5747996   34819.333   14.523197
## 30 B_paramycoides_NPDC094368 5585329   5559679   25649.609   10.878171
## 6      B_paramycoides_BD_331 5199139   5221407  -22267.711   -9.745000
## 31   B_paramycoides_NRS_1309 5314253   5335652  -21399.300   -9.264155
## 16 B_paramycoides_CIP111461T 5436504   5452398  -15893.608   -6.806577
## 33       B_paramycoides_h2_1 5718601   5704922   13678.962    5.727014
## 23 B_paramycoides_NPDC094078 5387444   5393571   -6126.543   -2.638015
## 4       B_paramycoides_BD_15 5363035   5357812    5222.773    2.256355
##    res_desvio res_std_pearson res_std_dev   res_stud   leverage        cook
## 17  73.958216       75.706245   75.309505  75.323651 0.03556433  70.4504069
## 22  57.952968       62.858171   62.606657  62.642720 0.14313909 220.0139169
## 29  56.732718       60.696312   60.445130  60.475092 0.11906364 165.9732041
## 12 -57.933912      -59.901787  -60.153168 -60.134997 0.07242573  93.3904564
## 2   57.142612       59.881500   59.642755  59.662387 0.08208010 106.8801768
## 20 -50.011256      -51.266677  -51.448691 -51.438680 0.05509781  51.0853681
## 13 -44.688169      -46.381491  -46.531842 -46.520181 0.07767364  60.3889152
## 25 -41.336759      -45.977948  -46.115093 -46.088177 0.19649854 172.3259058
## 11 -42.863559      -44.493641  -44.631956 -44.621228 0.07767364  55.5729824
## 14 -35.950961      -38.554120  -38.650854 -38.637826 0.13482728  77.2138631
## 28  33.958567       36.956380   36.864418  36.878359 0.15143727  81.2468254
## 32  34.857722       35.517382   35.429014  35.431844 0.03198995  13.8961427
## 27  32.417012       33.778951   33.699432  33.705375 0.07466113  30.6877333
## 10 -26.241496      -29.346830  -29.402426 -29.391123 0.20345401  73.3257555
## 26 -20.968183      -23.147338  -23.181809 -23.175544 0.18186126  39.7003293
## 5  -21.716759      -22.275239  -22.309968 -22.308147 0.05247189   9.1592113
## 21  21.200465       21.674264   21.641386  21.642713 0.04033285   6.5812097
## 15 -19.060597      -19.988544  -20.015417 -20.012916 0.09313273  13.6772771
## 1  -18.278993      -18.928391  -18.952968 -18.951252 0.06985626   8.9693581
## 9  -17.718344      -18.153036  -18.176288 -18.175132 0.04975444   5.7513963
## 8   17.629118       18.164240   18.141152  18.142438 0.05565333   6.4814611
## 18  14.508571       15.719080   15.703249  15.705567 0.14636887  14.1224958
## 30  10.869823       11.144536   11.135983  11.136387 0.04723063   2.0522897
## 6   -9.751938      -10.241208  -10.248500 -10.247811 0.09455671   3.6510033
## 31  -9.270358       -9.503594   -9.509957  -9.509641 0.04975444   1.5763423
## 16  -6.809888       -6.927952   -6.931322  -6.931205 0.03473247   0.5756739
## 33   5.724728        6.078090    6.075663   6.075936 0.11218530   1.5560609
## 23  -2.638515       -2.743391   -2.743911  -2.743872 0.07534630   0.2044264
## 4    2.255988        2.337467    2.337087   2.337113 0.06819754   0.1332952
##         dffits
## 17  0.45167893
## 22  0.77664080
## 29  0.67162584
## 12 -0.50773683
## 2   0.53822631
## 20 -0.36967349
## 13 -0.39888194
## 25 -0.67325082
## 11 -0.38160262
## 14 -0.44622478
## 28  0.45454476
## 32  0.18769807
## 27  0.27848022
## 10 -0.43060014
## 26 -0.31522469
## 5  -0.15133369
## 21  0.12783237
## 15 -0.18463326
## 1  -0.14942186
## 9  -0.11960074
## 8   0.12663900
## 18  0.18676318
## 30  0.07108912
## 6  -0.09493267
## 31 -0.06236295
## 16 -0.03765734
## 33  0.06184743
## 23 -0.02242086
## 4   0.01809779

MEDIDAS GLOBAIS DE AJUSTE

cat("\n--- Medidas globais ---\n")
## 
## --- Medidas globais ---
cat("Desvio residual:", deviance(mod_mob_hma), "\n")
## Desvio residual: 35400.45
cat("GL residual:", df.residual(mod_mob_hma), "\n")
## GL residual: 30
cat("p-valor (desvio):", 1 - pchisq(deviance(mod_mob_hma), df.residual(mod_mob_hma)), "\n")
## p-valor (desvio): 0
X2 <- sum(res_pearson^2)
cat("Qui-quadrado de Pearson:", X2, "\n")
## Qui-quadrado de Pearson: 35463.02
cat("p-valor (Pearson):", 1 - pchisq(X2, df.residual(mod_mob_hma)), "\n")
## p-valor (Pearson): 0

GRÁFICOS DE DIAGNÓSTICO

par(mfrow = c(3, 3), mar = c(4, 4, 2, 1))

# (1) Observado vs Ajustado
plot(ajustados, glm_pangenome$total_length,
     xlab = "Valores ajustados", ylab = "Valores observados",
     main = "Observado vs Ajustado", pch = 19)
abline(0, 1, col = 2, lty = 2)

# (2) Resíduo de desvio vs ajustado
plot(ajustados, res_desvio,
     xlab = "Valores ajustados", ylab = "Resíduo de desvio",
     main = "Resíduo de desvio vs Ajustado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))

# (3) Resíduo de Pearson vs ajustado
plot(ajustados, res_pearson,
     xlab = "Valores ajustados", ylab = "Resíduo de Pearson",
     main = "Pearson vs Ajustado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))

# (4) Resíduo studentizado vs índice
plot(res_stud,
     xlab = "Índice", ylab = "Resíduo studentizado",
     main = "Resíduo studentizado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
text(which(abs(res_stud) > 2), res_stud[abs(res_stud) > 2],
     labels = which(abs(res_stud) > 2), pos = 3, cex = 0.8)

# (5) Resíduos vs x1
plot(glm_pangenome$Mobilome_prophages_transposons, res_desvio,
     xlab = "x1", ylab = "Resíduo de desvio",
     main = "Resíduo de desvio vs x1", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))

# (6) Resíduos vs x2
plot(glm_pangenome$HMA_genes, res_desvio,
     xlab = "x2", ylab = "Resíduo de desvio",
     main = "Resíduo de desvio vs x2", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))

# (7) Alavancagem
plot(lev,
     xlab = "Índice", ylab = "Leverage",
     main = "Alavancagem", pch = 19)
abline(h = limite_lev, col = 2, lty = 2)
text(which(lev > limite_lev), lev[lev > limite_lev],
     labels = which(lev > limite_lev), pos = 3, cex = 0.8)

# (8) Distância de Cook
plot(cook,
     xlab = "Índice", ylab = "Cook's distance",
     main = "Distância de Cook", pch = 19)
abline(h = limite_cook, col = 2, lty = 2)
text(which(cook > limite_cook), cook[cook > limite_cook],
     labels = which(cook > limite_cook), pos = 3, cex = 0.8)

# (9) DFFITS
plot(dff,
     xlab = "Índice", ylab = "DFFITS",
     main = "DFFITS", pch = 19)
abline(h = c(-limite_dff, limite_dff), col = 2, lty = 2)
text(which(abs(dff) > limite_dff), dff[abs(dff) > limite_dff],
     labels = which(abs(dff) > limite_dff), pos = 3, cex = 0.8)

7. QQ-PLOT DOS RESÍDUOS DE DESVIO

par(mfrow = c(1, 1))
qqnorm(res_desvio, main = "QQ-plot dos resíduos de desvio", pch = 19)
qqline(res_desvio, col = 2, lty = 2)

8. RESÍDUOS PADRONIZADOS VS ALAVANCAGEM

par(mfrow = c(1, 1))
plot(lev, res_std_dev,
     xlab = "Alavancagem",
     ylab = "Resíduo de desvio padronizado",
     main = "Resíduo padronizado vs Alavancagem",
     pch = 19)

abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
abline(v = limite_lev, lty = 2, col = 4)

# Curvas aproximadas de Cook
p <- length(coef(mod_mob_hma))
h_seq <- seq(0.001, max(lev) * 1.1, length.out = 200)

for (D in c(0.5, 1)) {
  curva <- sqrt(D * p * (1 - h_seq) / h_seq)
  lines(h_seq, curva, lty = 3, col = "darkgreen")
  lines(h_seq, -curva, lty = 3, col = "darkgreen")
}

text(lev, res_std_dev, labels = ifelse(cook > limite_cook, glm_pangenome$genome_name, ""),
     pos = 3, cex = 0.8)

Teste com Redes Bayesianas

library(bnlearn) #pacote para criação das redes
library(Rgraphviz) #pacote para visualização das redes
glm_pangenome_filt <- glm_pangenome %>%
  select(total_length, gc_content,singleton_gene_clusters,
         Metal_genes, HMA_genes,
         Defense_mechanisms,
         Mobilome_prophages_transposons,
         Amino_acid_transport_and_metabolism,
         Carbohydrate_transport_and_metabolism,
         Cell_wall_membrane_envelope_biogenesis,
         Inorganic_ion_transport_and_metabolism,
         Posttranslational_modification_protein_turnover_chaperones,
         Replication_recombination_and_repair,
         Secondary_metabolites_biosynthesis_transport_and_catabolism,
         Transcription)
# Teste com Redes Bayesianas
glm_pangenome_filt[] <- lapply(glm_pangenome_filt, function(x) {
  if (is.integer(x))
    as.numeric(x)
  else
    x
})
dpan <- discretize(
  glm_pangenome_filt,
  method  = "hartemink",
  breaks  = 3,
  ibreaks = 60,
  idisc   = "quantile"
)

str(dpan)
## 'data.frame':    33 obs. of  15 variables:
##  $ total_length                                               : Factor w/ 3 levels "[5.18274e+06,5.27823e+06]",..: 3 3 3 2 2 1 1 2 2 2 ...
##  $ gc_content                                                 : Factor w/ 3 levels "[0.349648,0.351194]",..: 1 1 1 2 2 2 3 2 3 2 ...
##  $ singleton_gene_clusters                                    : Factor w/ 3 levels "[1,51.8]","(51.8,98.4]",..: 2 2 2 3 3 2 2 2 1 2 ...
##  $ Metal_genes                                                : Factor w/ 3 levels "[200,209.067]",..: 2 3 3 1 2 1 2 1 1 1 ...
##  $ HMA_genes                                                  : Factor w/ 3 levels "[180,193.545]",..: 1 3 1 2 1 1 1 2 2 1 ...
##  $ Defense_mechanisms                                         : Factor w/ 3 levels "[149,159.696]",..: 3 3 3 1 1 1 1 1 1 3 ...
##  $ Mobilome_prophages_transposons                             : Factor w/ 3 levels "[36,63.7692]",..: 3 3 3 1 2 1 1 1 1 2 ...
##  $ Amino_acid_transport_and_metabolism                        : Factor w/ 3 levels "[411,421.692]",..: 3 3 3 3 3 1 1 2 2 1 ...
##  $ Carbohydrate_transport_and_metabolism                      : Factor w/ 3 levels "[257,264]","(264,278.4]",..: 1 3 1 2 1 1 2 2 2 2 ...
##  $ Cell_wall_membrane_envelope_biogenesis                     : Factor w/ 3 levels "[287,313.074]",..: 3 3 2 3 3 2 1 1 1 2 ...
##  $ Inorganic_ion_transport_and_metabolism                     : Factor w/ 3 levels "[229,239.818]",..: 2 2 3 3 3 1 2 2 2 1 ...
##  $ Posttranslational_modification_protein_turnover_chaperones : Factor w/ 3 levels "[185,192.4]",..: 3 3 3 3 1 3 2 3 3 3 ...
##  $ Replication_recombination_and_repair                       : Factor w/ 3 levels "[150,163.852]",..: 3 3 3 1 3 1 1 1 1 3 ...
##  $ Secondary_metabolites_biosynthesis_transport_and_catabolism: Factor w/ 3 levels "[75,80]","(80,85.6667]",..: 3 3 3 2 2 1 2 2 2 3 ...
##  $ Transcription                                              : Factor w/ 3 levels "[395,400.926]",..: 3 3 3 2 2 2 2 2 2 2 ...
##  - attr(*, "cutpoints")=List of 15
##   ..$ total_length                                               : num [1:4] 5182738 5278231 5483562 5897029
##   ..$ gc_content                                                 : num [1:4] 0.35 0.351 0.352 0.353
##   ..$ singleton_gene_clusters                                    : num [1:4] 1 51.8 98.4 315
##   ..$ Metal_genes                                                : num [1:4] 200 209 212 229
##   ..$ HMA_genes                                                  : num [1:4] 180 194 199 208
##   ..$ Defense_mechanisms                                         : num [1:4] 149 160 164 196
##   ..$ Mobilome_prophages_transposons                             : num [1:4] 36 63.8 81.4 99
##   ..$ Amino_acid_transport_and_metabolism                        : num [1:4] 411 422 426 439
##   ..$ Carbohydrate_transport_and_metabolism                      : num [1:4] 257 264 278 295
##   ..$ Cell_wall_membrane_envelope_biogenesis                     : num [1:4] 287 313 336 370
##   ..$ Inorganic_ion_transport_and_metabolism                     : num [1:4] 229 240 247 261
##   ..$ Posttranslational_modification_protein_turnover_chaperones : num [1:4] 185 192 196 209
##   ..$ Replication_recombination_and_repair                       : num [1:4] 150 164 164 203
##   ..$ Secondary_metabolites_biosynthesis_transport_and_catabolism: num [1:4] 75 80 85.7 94
##   ..$ Transcription                                              : num [1:4] 395 401 418 447
set.seed(123)

boot <- boot.strength(
  data      = dpan,
  R         = 10,
  algorithm = "hc",
  algorithm.args = list(score = "bde", iss = 10)
)

avg.boot <- averaged.network(boot, threshold = 0.7)
graphviz.plot(avg.boot, main = "Averaged network (bootstrap, threshold = 0.85)")