Entender fatores influenciando no aumento do tamanho do genoma.
library(ggplot2)
library(corrplot)
library(dplyr)
glm_pangenome <- read.csv("~/dev/anvio-summary/glm_pangenome.csv")
colnames(glm_pangenome)
## [1] "genome_name"
## [2] "total_length"
## [3] "gc_content"
## [4] "percent_redundancy"
## [5] "num_genes"
## [6] "avg_gene_length"
## [7] "num_genes_per_kb"
## [8] "singleton_gene_clusters"
## [9] "num_gene_clusters"
## [10] "As"
## [11] "Cd"
## [12] "Co"
## [13] "Cr"
## [14] "Cu"
## [15] "Fe"
## [16] "Metal_genes"
## [17] "Mg"
## [18] "Mn"
## [19] "Mo"
## [20] "Ni"
## [21] "Skip"
## [22] "Te"
## [23] "Zn"
## [24] "Amino_acid_transport_and_metabolism"
## [25] "Carbohydrate_transport_and_metabolism"
## [26] "Cell_cycle_control_cell_division_chromosome_partitioning"
## [27] "Cell_motility"
## [28] "Cell_wall_membrane_envelope_biogenesis"
## [29] "Chromatin_structure_and_dynamics"
## [30] "Coenzyme_transport_and_metabolism"
## [31] "Cytoskeleton"
## [32] "Defense_mechanisms"
## [33] "Energy_production_and_conversion"
## [34] "Extracellular_structures"
## [35] "Function_unknown"
## [36] "General_function_prediction_only"
## [37] "Inorganic_ion_transport_and_metabolism"
## [38] "Intracellular_trafficking_secretion_and_vesicular_transport"
## [39] "Lipid_transport_and_metabolism"
## [40] "Mobilome_prophages_transposons"
## [41] "Nucleotide_transport_and_metabolism"
## [42] "Posttranslational_modification_protein_turnover_chaperones"
## [43] "RNA_processing_and_modification"
## [44] "Replication_recombination_and_repair"
## [45] "Secondary_metabolites_biosynthesis_transport_and_catabolism"
## [46] "Signal_transduction_mechanisms"
## [47] "Transcription"
## [48] "Translation_ribosomal_structure_and_biogenesis"
## [49] "HMA_genes"
You can also embed plots, for example:
glm_pangenome_filt <- glm_pangenome %>%
select(total_length, gc_content,singleton_gene_clusters,
Metal_genes, HMA_genes,
Defense_mechanisms,
Mobilome_prophages_transposons,
Amino_acid_transport_and_metabolism,
Replication_recombination_and_repair)
# Compute correlation matrix
cor_mat <- cor(glm_pangenome_filt, use = "pairwise.complete.obs")
# Plot
corrplot(cor_mat, diag = FALSE)
\(Y\): Tamanho do genoma
mod_1 <- glm(total_length ~ 1, family = poisson, data = glm_pangenome)
mod_metres <- glm(total_length ~ Metal_genes, family = poisson, data = glm_pangenome)
summary(mod_metres)
##
## Call:
## glm(formula = total_length ~ Metal_genes, family = poisson, data = glm_pangenome)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.476e+01 2.356e-03 6263.7 <2e-16 ***
## Metal_genes 3.536e-03 1.107e-05 319.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207370 on 32 degrees of freedom
## Residual deviance: 105675 on 31 degrees of freedom
## AIC: 106251
##
## Number of Fisher Scoring iterations: 3
mod_hma <- glm(total_length ~ HMA_genes, family = poisson, data = glm_pangenome)
summary(mod_hma)
##
## Call:
## glm(formula = total_length ~ HMA_genes, family = poisson, data = glm_pangenome)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.488e+01 1.991e-03 7473.4 <2e-16 ***
## HMA_genes 3.226e-03 1.022e-05 315.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207370 on 32 degrees of freedom
## Residual deviance: 107591 on 31 degrees of freedom
## AIC: 108167
##
## Number of Fisher Scoring iterations: 3
mod_gc <- glm(total_length ~ gc_content, family = poisson, data = glm_pangenome)
summary(mod_gc)
##
## Call:
## glm(formula = total_length ~ gc_content, family = poisson, data = glm_pangenome)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 26.84964 0.03202 838.4 <2e-16 ***
## gc_content -32.22405 0.09099 -354.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207370 on 32 degrees of freedom
## Residual deviance: 82949 on 31 degrees of freedom
## AIC: 83526
##
## Number of Fisher Scoring iterations: 3
mod_mobilome <- glm(total_length ~ Mobilome_prophages_transposons, family = poisson, data = glm_pangenome)
summary(mod_mobilome)
##
## Call:
## glm(formula = total_length ~ Mobilome_prophages_transposons,
## family = poisson, data = glm_pangenome)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.541e+01 2.721e-04 56640.2 <2e-16 ***
## Mobilome_prophages_transposons 1.478e-03 4.045e-06 365.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207370 on 32 degrees of freedom
## Residual deviance: 74233 on 31 degrees of freedom
## AIC: 74810
##
## Number of Fisher Scoring iterations: 3
mod_rep <- glm(total_length ~ Replication_recombination_and_repair, family = poisson, data = glm_pangenome)
summary(mod_rep)
##
## Call:
## glm(formula = total_length ~ Replication_recombination_and_repair,
## family = poisson, data = glm_pangenome)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.517e+01 1.069e-03 14194.8 <2e-16 ***
## Replication_recombination_and_repair 1.978e-03 6.324e-06 312.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 207370 on 32 degrees of freedom
## Residual deviance: 110548 on 31 degrees of freedom
## AIC: 111124
##
## Number of Fisher Scoring iterations: 3
AIC(mod_1,
mod_metres,
mod_hma,
mod_gc,
mod_mobilome,
mod_rep)
## df AIC
## mod_1 1 207944.61
## mod_metres 2 106251.28
## mod_hma 2 108167.05
## mod_gc 2 83525.51
## mod_mobilome 2 74809.86
## mod_rep 2 111124.15
glm_pangenome$y_hat <- predict(mod_hma, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = HMA_genes, y = total_length)) +
geom_point() +
geom_line(aes(y = y_hat), color = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
glm_pangenome$y_hat <- predict(mod_mobilome, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = Mobilome_prophages_transposons, y = total_length)) +
geom_point() +
geom_line(aes(y = y_hat), color = "red", size = 1)
glm_pangenome$y_hat <- predict(mod_metres, newdata = glm_pangenome, type = "response")
ggplot(glm_pangenome, aes(x = Metal_genes, y = total_length)) +
geom_point() +
geom_line(aes(y = y_hat), color = "red", size = 1)
Construindo novos modelos com mais de uma var explicativa, utilizando mobiloma como ponto de partida
# Mobiloma + gc
mod_mob_gc <- glm(total_length ~ Mobilome_prophages_transposons + gc_content, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_gc, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + gc_content
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 32 207370
## 2 31 74233 1 133137 < 2.2e-16 ***
## 3 30 53663 1 20570 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + met res
mod_mob_metres <- glm(total_length ~ Mobilome_prophages_transposons + Metal_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_metres, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + Metal_genes
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 32 207370
## 2 31 74233 1 133137 < 2.2e-16 ***
## 3 30 57164 1 17070 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + hma
mod_mob_hma <- glm(total_length ~ Mobilome_prophages_transposons + HMA_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_hma, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + HMA_genes
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 32 207370
## 2 31 74233 1 133137 < 2.2e-16 ***
## 3 30 35400 1 38833 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mobiloma + rep
mod_mob_rep <- glm(total_length ~ Mobilome_prophages_transposons + Replication_recombination_and_repair, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_rep, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + Replication_recombination_and_repair
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 32 207370
## 2 31 74233 1 133137 < 2.2e-16 ***
## 3 30 69762 1 4471 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
No modelo aditivo sobre mob, hma explicou melhor que as outras variáveis:
# Investigar interação
# Melhorou, mas não muito
mod_mobXhma <- glm(total_length ~ Mobilome_prophages_transposons * HMA_genes, family = poisson, data = glm_pangenome)
anova(mod_1, mod_mobilome, mod_mob_hma, mod_mobXhma, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: total_length ~ 1
## Model 2: total_length ~ Mobilome_prophages_transposons
## Model 3: total_length ~ Mobilome_prophages_transposons + HMA_genes
## Model 4: total_length ~ Mobilome_prophages_transposons * HMA_genes
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 32 207370
## 2 31 74233 1 133137 < 2.2e-16 ***
## 3 30 35400 1 38833 < 2.2e-16 ***
## 4 29 28552 1 6848 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod_mob_hma, test = "Chisq")
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: total_length
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 32 207370
## Mobilome_prophages_transposons 1 133137 31 74233 < 2.2e-16 ***
## HMA_genes 1 38833 30 35400 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ajustados <- fitted(mod_mob_hma)
eta_hat <- predict(mod_mob_hma, type = "link")
res_bruto <- residuals(mod_mob_hma, type = "response")
res_pearson <- residuals(mod_mob_hma, type = "pearson")
res_desvio <- residuals(mod_mob_hma, type = "deviance")
res_working <- residuals(mod_mob_hma, type = "working")
# Resíduos padronizados / studentizados
res_std_pearson <- rstandard(mod_mob_hma, type = "pearson")
res_std_dev <- rstandard(mod_mob_hma, type = "deviance")
res_stud <- rstudent(mod_mob_hma)
# Medidas de influência
lev <- hatvalues(mod_mob_hma)
cook <- cooks.distance(mod_mob_hma)
dff <- dffits(mod_mob_hma)
# Tabela-resumo
diagnostico <- data.frame(
id = glm_pangenome$genome_name,
y = glm_pangenome$total_length,
ajustados = ajustados,
res_bruto = res_bruto,
res_pearson = res_pearson,
res_desvio = res_desvio,
res_std_pearson = res_std_pearson,
res_std_dev = res_std_dev,
res_stud = res_stud,
leverage = lev,
cook = cook,
dffits = dff
)
head(diagnostico, 10)
## id y ajustados res_bruto res_pearson res_desvio
## 1 B_paramycoides_BD_11 5483749 5526665 -42916.105 -18.2552897 -18.278993
## 2 B_paramycoides_BD_12 5774287 5638061 136225.984 57.3713498 57.142612
## 3 B_paramycoides_BD_13 5627347 5625210 2136.909 0.9009830 0.900926
## 4 B_paramycoides_BD_15 5363035 5357812 5222.773 2.2563548 2.255988
## 5 B_paramycoides_BD_31 5364188 5414643 -50454.898 -21.6829527 -21.716759
## 6 B_paramycoides_BD_331 5199139 5221407 -22267.711 -9.7449995 -9.751938
## 7 B_paramycoides_BD_333 5182738 5181644 1094.273 0.4807199 0.480703
## 8 B_paramycoides_BD_335 5363943 5323217 40725.840 17.6515544 17.629118
## 9 B_paramycoides_BD_336 5294777 5335652 -40875.300 -17.6956773 -17.718344
## 10 B_paramycoides_BD_9 5299588 5360228 -60639.869 -26.1918772 -26.241496
## res_std_pearson res_std_dev res_stud leverage cook dffits
## 1 -18.9283908 -18.9529678 -18.9512519 0.06985626 8.969358e+00 -0.149421857
## 2 59.8815005 59.6427549 59.6623871 0.08208010 1.068802e+02 0.538226311
## 3 0.9947361 0.9946731 0.9946844 0.17961541 7.221386e-02 0.013321223
## 4 2.3374668 2.3370872 2.3371131 0.06819754 1.332952e-01 0.018097787
## 5 -22.2752389 -22.3099682 -22.3081472 0.05247189 9.159211e+00 -0.151333694
## 6 -10.2412082 -10.2485004 -10.2478111 0.09455671 3.651003e+00 -0.094932669
## 7 0.5075233 0.5075055 0.5075073 0.10283534 9.841494e-03 0.004917813
## 8 18.1642399 18.1411522 18.1424378 0.05565333 6.481461e+00 0.126639003
## 9 -18.1530359 -18.1762880 -18.1751318 0.04975444 5.751396e+00 -0.119600739
## 10 -29.3468302 -29.4024258 -29.3911231 0.20345401 7.332576e+01 -0.430600141
# Regras práticas
n = 33
limite_res <- 2
limite_lev <- 2 * length(coef(mod_mob_hma)) / n
limite_cook <- 4 / n
limite_dff <- 2 * sqrt(length(coef(mod_mob_hma)) / n)
cat("\nLimites práticos:\n")
##
## Limites práticos:
cat(" |resíduo studentizado| >", limite_res, "\n")
## |resíduo studentizado| > 2
cat(" leverage >", round(limite_lev, 4), "\n")
## leverage > 0.1818
cat(" Cook >", round(limite_cook, 4), "\n")
## Cook > 0.1212
cat(" |DFFITS| >", round(limite_dff, 4), "\n\n")
## |DFFITS| > 0.603
suspeitos <- subset(
diagnostico,
abs(res_stud) > limite_res |
leverage > limite_lev |
cook > limite_cook |
abs(dffits) > limite_dff
)
suspeitos[order(-abs(suspeitos$res_stud)), ]
## id y ajustados res_bruto res_pearson
## 17 B_paramycoides_CP40 5619544 5446040 173503.710 74.347837
## 22 B_paramycoides_NPDC093745 5897029 5757414 139614.540 58.185786
## 29 B_paramycoides_NPDC094358 5285347 5155990 129357.198 56.968472
## 12 B_paramycoides_B_14578 5227308 5360885 -133577.214 -57.691805
## 2 B_paramycoides_BD_12 5774287 5638061 136225.984 57.371350
## 20 B_paramycoides_NH24A2 5453107 5570728 -117620.920 -49.834327
## 13 B_paramycoides_B_14580 5227789 5330632 -102843.391 -44.543775
## 25 B_paramycoides_NPDC094098 5286979 5382597 -95617.823 -41.213825
## 11 B_paramycoides_B_14577 5231975 5330632 -98657.391 -42.730725
## 14 B_paramycoides_B_3435 5660034 5745996 -85961.671 -35.860984
## 28 B_paramycoides_NPDC094345 5211806 5134665 77141.387 34.043280
## 32 B_paramycoides_RZ3MS14 5493110 5411817 81292.866 34.944665
## 27 B_paramycoides_NPDC094130 5304543 5230231 74311.668 32.493505
## 10 B_paramycoides_BD_9 5299588 5360228 -60639.869 -26.191877
## 26 B_paramycoides_NPDC094118 5482343 5531585 -49242.415 -20.937004
## 5 B_paramycoides_BD_31 5364188 5414643 -50454.898 -21.682953
## 21 B_paramycoides_NPDC077662 5450429 5401084 49345.225 21.232673
## 15 B_paramycoides_B_615 5560887 5605956 -45068.994 -19.035006
## 1 B_paramycoides_BD_11 5483749 5526665 -42916.105 -18.255290
## 9 B_paramycoides_BD_336 5294777 5335652 -40875.300 -17.695677
## 8 B_paramycoides_BD_335 5363943 5323217 40725.840 17.651554
## 18 B_paramycoides_DE0103 5782815 5747996 34819.333 14.523197
## 30 B_paramycoides_NPDC094368 5585329 5559679 25649.609 10.878171
## 6 B_paramycoides_BD_331 5199139 5221407 -22267.711 -9.745000
## 31 B_paramycoides_NRS_1309 5314253 5335652 -21399.300 -9.264155
## 16 B_paramycoides_CIP111461T 5436504 5452398 -15893.608 -6.806577
## 33 B_paramycoides_h2_1 5718601 5704922 13678.962 5.727014
## 23 B_paramycoides_NPDC094078 5387444 5393571 -6126.543 -2.638015
## 4 B_paramycoides_BD_15 5363035 5357812 5222.773 2.256355
## res_desvio res_std_pearson res_std_dev res_stud leverage cook
## 17 73.958216 75.706245 75.309505 75.323651 0.03556433 70.4504069
## 22 57.952968 62.858171 62.606657 62.642720 0.14313909 220.0139169
## 29 56.732718 60.696312 60.445130 60.475092 0.11906364 165.9732041
## 12 -57.933912 -59.901787 -60.153168 -60.134997 0.07242573 93.3904564
## 2 57.142612 59.881500 59.642755 59.662387 0.08208010 106.8801768
## 20 -50.011256 -51.266677 -51.448691 -51.438680 0.05509781 51.0853681
## 13 -44.688169 -46.381491 -46.531842 -46.520181 0.07767364 60.3889152
## 25 -41.336759 -45.977948 -46.115093 -46.088177 0.19649854 172.3259058
## 11 -42.863559 -44.493641 -44.631956 -44.621228 0.07767364 55.5729824
## 14 -35.950961 -38.554120 -38.650854 -38.637826 0.13482728 77.2138631
## 28 33.958567 36.956380 36.864418 36.878359 0.15143727 81.2468254
## 32 34.857722 35.517382 35.429014 35.431844 0.03198995 13.8961427
## 27 32.417012 33.778951 33.699432 33.705375 0.07466113 30.6877333
## 10 -26.241496 -29.346830 -29.402426 -29.391123 0.20345401 73.3257555
## 26 -20.968183 -23.147338 -23.181809 -23.175544 0.18186126 39.7003293
## 5 -21.716759 -22.275239 -22.309968 -22.308147 0.05247189 9.1592113
## 21 21.200465 21.674264 21.641386 21.642713 0.04033285 6.5812097
## 15 -19.060597 -19.988544 -20.015417 -20.012916 0.09313273 13.6772771
## 1 -18.278993 -18.928391 -18.952968 -18.951252 0.06985626 8.9693581
## 9 -17.718344 -18.153036 -18.176288 -18.175132 0.04975444 5.7513963
## 8 17.629118 18.164240 18.141152 18.142438 0.05565333 6.4814611
## 18 14.508571 15.719080 15.703249 15.705567 0.14636887 14.1224958
## 30 10.869823 11.144536 11.135983 11.136387 0.04723063 2.0522897
## 6 -9.751938 -10.241208 -10.248500 -10.247811 0.09455671 3.6510033
## 31 -9.270358 -9.503594 -9.509957 -9.509641 0.04975444 1.5763423
## 16 -6.809888 -6.927952 -6.931322 -6.931205 0.03473247 0.5756739
## 33 5.724728 6.078090 6.075663 6.075936 0.11218530 1.5560609
## 23 -2.638515 -2.743391 -2.743911 -2.743872 0.07534630 0.2044264
## 4 2.255988 2.337467 2.337087 2.337113 0.06819754 0.1332952
## dffits
## 17 0.45167893
## 22 0.77664080
## 29 0.67162584
## 12 -0.50773683
## 2 0.53822631
## 20 -0.36967349
## 13 -0.39888194
## 25 -0.67325082
## 11 -0.38160262
## 14 -0.44622478
## 28 0.45454476
## 32 0.18769807
## 27 0.27848022
## 10 -0.43060014
## 26 -0.31522469
## 5 -0.15133369
## 21 0.12783237
## 15 -0.18463326
## 1 -0.14942186
## 9 -0.11960074
## 8 0.12663900
## 18 0.18676318
## 30 0.07108912
## 6 -0.09493267
## 31 -0.06236295
## 16 -0.03765734
## 33 0.06184743
## 23 -0.02242086
## 4 0.01809779
cat("\n--- Medidas globais ---\n")
##
## --- Medidas globais ---
cat("Desvio residual:", deviance(mod_mob_hma), "\n")
## Desvio residual: 35400.45
cat("GL residual:", df.residual(mod_mob_hma), "\n")
## GL residual: 30
cat("p-valor (desvio):", 1 - pchisq(deviance(mod_mob_hma), df.residual(mod_mob_hma)), "\n")
## p-valor (desvio): 0
X2 <- sum(res_pearson^2)
cat("Qui-quadrado de Pearson:", X2, "\n")
## Qui-quadrado de Pearson: 35463.02
cat("p-valor (Pearson):", 1 - pchisq(X2, df.residual(mod_mob_hma)), "\n")
## p-valor (Pearson): 0
GRÁFICOS DE DIAGNÓSTICO
par(mfrow = c(3, 3), mar = c(4, 4, 2, 1))
# (1) Observado vs Ajustado
plot(ajustados, glm_pangenome$total_length,
xlab = "Valores ajustados", ylab = "Valores observados",
main = "Observado vs Ajustado", pch = 19)
abline(0, 1, col = 2, lty = 2)
# (2) Resíduo de desvio vs ajustado
plot(ajustados, res_desvio,
xlab = "Valores ajustados", ylab = "Resíduo de desvio",
main = "Resíduo de desvio vs Ajustado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
# (3) Resíduo de Pearson vs ajustado
plot(ajustados, res_pearson,
xlab = "Valores ajustados", ylab = "Resíduo de Pearson",
main = "Pearson vs Ajustado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
# (4) Resíduo studentizado vs índice
plot(res_stud,
xlab = "Índice", ylab = "Resíduo studentizado",
main = "Resíduo studentizado", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
text(which(abs(res_stud) > 2), res_stud[abs(res_stud) > 2],
labels = which(abs(res_stud) > 2), pos = 3, cex = 0.8)
# (5) Resíduos vs x1
plot(glm_pangenome$Mobilome_prophages_transposons, res_desvio,
xlab = "x1", ylab = "Resíduo de desvio",
main = "Resíduo de desvio vs x1", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
# (6) Resíduos vs x2
plot(glm_pangenome$HMA_genes, res_desvio,
xlab = "x2", ylab = "Resíduo de desvio",
main = "Resíduo de desvio vs x2", pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
# (7) Alavancagem
plot(lev,
xlab = "Índice", ylab = "Leverage",
main = "Alavancagem", pch = 19)
abline(h = limite_lev, col = 2, lty = 2)
text(which(lev > limite_lev), lev[lev > limite_lev],
labels = which(lev > limite_lev), pos = 3, cex = 0.8)
# (8) Distância de Cook
plot(cook,
xlab = "Índice", ylab = "Cook's distance",
main = "Distância de Cook", pch = 19)
abline(h = limite_cook, col = 2, lty = 2)
text(which(cook > limite_cook), cook[cook > limite_cook],
labels = which(cook > limite_cook), pos = 3, cex = 0.8)
# (9) DFFITS
plot(dff,
xlab = "Índice", ylab = "DFFITS",
main = "DFFITS", pch = 19)
abline(h = c(-limite_dff, limite_dff), col = 2, lty = 2)
text(which(abs(dff) > limite_dff), dff[abs(dff) > limite_dff],
labels = which(abs(dff) > limite_dff), pos = 3, cex = 0.8)
par(mfrow = c(1, 1))
qqnorm(res_desvio, main = "QQ-plot dos resíduos de desvio", pch = 19)
qqline(res_desvio, col = 2, lty = 2)
par(mfrow = c(1, 1))
plot(lev, res_std_dev,
xlab = "Alavancagem",
ylab = "Resíduo de desvio padronizado",
main = "Resíduo padronizado vs Alavancagem",
pch = 19)
abline(h = c(-2, 0, 2), lty = c(2, 1, 2), col = c(2, 1, 2))
abline(v = limite_lev, lty = 2, col = 4)
# Curvas aproximadas de Cook
p <- length(coef(mod_mob_hma))
h_seq <- seq(0.001, max(lev) * 1.1, length.out = 200)
for (D in c(0.5, 1)) {
curva <- sqrt(D * p * (1 - h_seq) / h_seq)
lines(h_seq, curva, lty = 3, col = "darkgreen")
lines(h_seq, -curva, lty = 3, col = "darkgreen")
}
text(lev, res_std_dev, labels = ifelse(cook > limite_cook, glm_pangenome$genome_name, ""),
pos = 3, cex = 0.8)
library(bnlearn) #pacote para criação das redes
library(Rgraphviz) #pacote para visualização das redes
glm_pangenome_filt <- glm_pangenome %>%
select(total_length, gc_content,singleton_gene_clusters,
Metal_genes, HMA_genes,
Defense_mechanisms,
Mobilome_prophages_transposons,
Amino_acid_transport_and_metabolism,
Carbohydrate_transport_and_metabolism,
Cell_wall_membrane_envelope_biogenesis,
Inorganic_ion_transport_and_metabolism,
Posttranslational_modification_protein_turnover_chaperones,
Replication_recombination_and_repair,
Secondary_metabolites_biosynthesis_transport_and_catabolism,
Transcription)
# Teste com Redes Bayesianas
glm_pangenome_filt[] <- lapply(glm_pangenome_filt, function(x) {
if (is.integer(x))
as.numeric(x)
else
x
})
dpan <- discretize(
glm_pangenome_filt,
method = "hartemink",
breaks = 3,
ibreaks = 60,
idisc = "quantile"
)
str(dpan)
## 'data.frame': 33 obs. of 15 variables:
## $ total_length : Factor w/ 3 levels "[5.18274e+06,5.27823e+06]",..: 3 3 3 2 2 1 1 2 2 2 ...
## $ gc_content : Factor w/ 3 levels "[0.349648,0.351194]",..: 1 1 1 2 2 2 3 2 3 2 ...
## $ singleton_gene_clusters : Factor w/ 3 levels "[1,51.8]","(51.8,98.4]",..: 2 2 2 3 3 2 2 2 1 2 ...
## $ Metal_genes : Factor w/ 3 levels "[200,209.067]",..: 2 3 3 1 2 1 2 1 1 1 ...
## $ HMA_genes : Factor w/ 3 levels "[180,193.545]",..: 1 3 1 2 1 1 1 2 2 1 ...
## $ Defense_mechanisms : Factor w/ 3 levels "[149,159.696]",..: 3 3 3 1 1 1 1 1 1 3 ...
## $ Mobilome_prophages_transposons : Factor w/ 3 levels "[36,63.7692]",..: 3 3 3 1 2 1 1 1 1 2 ...
## $ Amino_acid_transport_and_metabolism : Factor w/ 3 levels "[411,421.692]",..: 3 3 3 3 3 1 1 2 2 1 ...
## $ Carbohydrate_transport_and_metabolism : Factor w/ 3 levels "[257,264]","(264,278.4]",..: 1 3 1 2 1 1 2 2 2 2 ...
## $ Cell_wall_membrane_envelope_biogenesis : Factor w/ 3 levels "[287,313.074]",..: 3 3 2 3 3 2 1 1 1 2 ...
## $ Inorganic_ion_transport_and_metabolism : Factor w/ 3 levels "[229,239.818]",..: 2 2 3 3 3 1 2 2 2 1 ...
## $ Posttranslational_modification_protein_turnover_chaperones : Factor w/ 3 levels "[185,192.4]",..: 3 3 3 3 1 3 2 3 3 3 ...
## $ Replication_recombination_and_repair : Factor w/ 3 levels "[150,163.852]",..: 3 3 3 1 3 1 1 1 1 3 ...
## $ Secondary_metabolites_biosynthesis_transport_and_catabolism: Factor w/ 3 levels "[75,80]","(80,85.6667]",..: 3 3 3 2 2 1 2 2 2 3 ...
## $ Transcription : Factor w/ 3 levels "[395,400.926]",..: 3 3 3 2 2 2 2 2 2 2 ...
## - attr(*, "cutpoints")=List of 15
## ..$ total_length : num [1:4] 5182738 5278231 5483562 5897029
## ..$ gc_content : num [1:4] 0.35 0.351 0.352 0.353
## ..$ singleton_gene_clusters : num [1:4] 1 51.8 98.4 315
## ..$ Metal_genes : num [1:4] 200 209 212 229
## ..$ HMA_genes : num [1:4] 180 194 199 208
## ..$ Defense_mechanisms : num [1:4] 149 160 164 196
## ..$ Mobilome_prophages_transposons : num [1:4] 36 63.8 81.4 99
## ..$ Amino_acid_transport_and_metabolism : num [1:4] 411 422 426 439
## ..$ Carbohydrate_transport_and_metabolism : num [1:4] 257 264 278 295
## ..$ Cell_wall_membrane_envelope_biogenesis : num [1:4] 287 313 336 370
## ..$ Inorganic_ion_transport_and_metabolism : num [1:4] 229 240 247 261
## ..$ Posttranslational_modification_protein_turnover_chaperones : num [1:4] 185 192 196 209
## ..$ Replication_recombination_and_repair : num [1:4] 150 164 164 203
## ..$ Secondary_metabolites_biosynthesis_transport_and_catabolism: num [1:4] 75 80 85.7 94
## ..$ Transcription : num [1:4] 395 401 418 447
set.seed(123)
boot <- boot.strength(
data = dpan,
R = 10,
algorithm = "hc",
algorithm.args = list(score = "bde", iss = 10)
)
avg.boot <- averaged.network(boot, threshold = 0.7)
graphviz.plot(avg.boot, main = "Averaged network (bootstrap, threshold = 0.85)")