Modelos de regressión para el conteo de daños de biomarcadores en personas expuestas a minas de carbon

library(readxl)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(moments)
library(cowplot)
library(sf)

## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE

library(mapdata)

## Loading required package: maps

library(maps)
library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.2.3

library(viridis)

## Warning: package 'viridis' was built under R version 4.2.3

## Loading required package: viridisLite

## 
## Attaching package: 'viridis'

## The following object is masked from 'package:maps':
## 
##     unemp

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 4.2.3

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(corrplot)

## corrplot 0.92 loaded

library(stats)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(vcd)

## Warning: package 'vcd' was built under R version 4.2.3

## Loading required package: grid

library(AER)

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: survival

# llamado de base de datos

d <- read_excel("C:/Users/estudio/Desktop/TESIS/BASE DE DATOS MN SANGRE ENTERA CARBON FINAL (C).xlsx")
View(d)

summary(d)

##        #          LOCALIDAD           EXPOSICION      LONGITUD        
##  Min.   : 1.00   Length:139         Min.   :0.000   Length:139        
##  1st Qu.:18.00   Class :character   1st Qu.:0.000   Class :character  
##  Median :35.00   Mode  :character   Median :1.000   Mode  :character  
##  Mean   :41.09                      Mean   :0.705                     
##  3rd Qu.:63.50                      3rd Qu.:1.000                     
##  Max.   :98.00                      Max.   :1.000                     
##    LATITUD           SAMPLE_ID              MONO              BI        
##  Length:139         Length:139         Min.   :  72.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.: 370.5   1st Qu.: 458.5  
##  Mode  :character   Mode  :character   Median : 584.0   Median : 921.0  
##                                        Mean   : 660.9   Mean   : 869.3  
##                                        3rd Qu.: 865.5   3rd Qu.:1260.0  
##                                        Max.   :2183.0   Max.   :2103.0  
##      MULTI         BNCMN_MNL          MN_BNC          MNCMN        
##  Min.   :  0.0   Min.   : 0.000   Min.   :0.000   Min.   : 0.0000  
##  1st Qu.: 38.5   1st Qu.: 2.000   1st Qu.:0.000   1st Qu.: 0.0000  
##  Median : 97.0   Median : 5.000   Median :1.000   Median : 0.0000  
##  Mean   :138.3   Mean   : 6.446   Mean   :1.094   Mean   : 0.6043  
##  3rd Qu.:205.0   3rd Qu.: 9.000   3rd Qu.:2.000   3rd Qu.: 1.0000  
##  Max.   :795.0   Max.   :41.000   Max.   :4.000   Max.   :10.0000  
##       NPB             NBUDS             NEC               APOP        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.2086   Mean   :0.5683   Mean   :0.09353   Mean   :0.05036  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :3.0000   Max.   :6.0000   Max.   :3.00000   Max.   :3.00000  
##       NDI         TOTAL_CELLS       SEXO                EDAD      
##  Min.   :1.000   Min.   :  93   Length:139         Min.   :18.00  
##  1st Qu.:1.457   1st Qu.:1005   Class :character   1st Qu.:23.00  
##  Median :1.677   Median :1962   Mode  :character   Median :32.00  
##  Mean   :1.641   Mean   :1677                      Mean   :33.89  
##  3rd Qu.:1.841   3rd Qu.:2168                      3rd Qu.:43.00  
##  Max.   :2.152   Max.   :3950                      Max.   :65.00

# se vuelve la variable EXPOSICION CARACTER
datos <- mutate(d, EXPOSICION= as.character(d$EXPOSICION))
#datos <- na.omit(datos)
summary(datos)

##        #          LOCALIDAD          EXPOSICION          LONGITUD        
##  Min.   : 1.00   Length:139         Length:139         Length:139        
##  1st Qu.:18.00   Class :character   Class :character   Class :character  
##  Median :35.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :41.09                                                           
##  3rd Qu.:63.50                                                           
##  Max.   :98.00                                                           
##    LATITUD           SAMPLE_ID              MONO              BI        
##  Length:139         Length:139         Min.   :  72.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.: 370.5   1st Qu.: 458.5  
##  Mode  :character   Mode  :character   Median : 584.0   Median : 921.0  
##                                        Mean   : 660.9   Mean   : 869.3  
##                                        3rd Qu.: 865.5   3rd Qu.:1260.0  
##                                        Max.   :2183.0   Max.   :2103.0  
##      MULTI         BNCMN_MNL          MN_BNC          MNCMN        
##  Min.   :  0.0   Min.   : 0.000   Min.   :0.000   Min.   : 0.0000  
##  1st Qu.: 38.5   1st Qu.: 2.000   1st Qu.:0.000   1st Qu.: 0.0000  
##  Median : 97.0   Median : 5.000   Median :1.000   Median : 0.0000  
##  Mean   :138.3   Mean   : 6.446   Mean   :1.094   Mean   : 0.6043  
##  3rd Qu.:205.0   3rd Qu.: 9.000   3rd Qu.:2.000   3rd Qu.: 1.0000  
##  Max.   :795.0   Max.   :41.000   Max.   :4.000   Max.   :10.0000  
##       NPB             NBUDS             NEC               APOP        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.2086   Mean   :0.5683   Mean   :0.09353   Mean   :0.05036  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :3.0000   Max.   :6.0000   Max.   :3.00000   Max.   :3.00000  
##       NDI         TOTAL_CELLS       SEXO                EDAD      
##  Min.   :1.000   Min.   :  93   Length:139         Min.   :18.00  
##  1st Qu.:1.457   1st Qu.:1005   Class :character   1st Qu.:23.00  
##  Median :1.677   Median :1962   Mode  :character   Median :32.00  
##  Mean   :1.641   Mean   :1677                      Mean   :33.89  
##  3rd Qu.:1.841   3rd Qu.:2168                      3rd Qu.:43.00  
##  Max.   :2.152   Max.   :3950                      Max.   :65.00

#### CORRELACIÓN DE VARIABLES
tabla2 <- datos[1:139, c(10,11,12,13,14,15,16,18,20)]

tab <- data.frame(cor(tabla2)) 

Matrixcorr <-cor(tab)

corrplot(Matrixcorr)

# HISTOGRAMA Y FUNCION DE DENSIDAD
A_1 <- datos %>% ggplot(aes(MN_BNC)) + 
  geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
  geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
  labs(x="MN_BNC",y="") 
  #ylim(c(0,20))
 #ggtitle("Distribución de la variable de respuesta MN_BNC")

A_2<-datos %>% ggplot(aes(NPB)) + 
  geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
  geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
  labs(x="NPB",y="") 
  #ylim(c(0,20))
  #ggtitle("Distribución de la variable de respuesta NPB")


A_3 <-datos %>% ggplot(aes(x=BNCMN_MNL)) + 
  geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
  geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
  labs(x="BNCMN_MNL",y="") 
  #ylim(c(0,20))
  #ggtitle("Distribución de la variable de respuesta BNCMN_MNL")


plot_grid(A_1, A_2, A_3, nrow  = 1, ncol=3, labels="AUTO")

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

## sesgo por cada biomarcador
skewness(datos$MN_BNC)

## [1] 0.7262248

skewness(datos$NPB)

## [1] 2.781986

skewness(datos$BNCMN_MNL)

## [1] 2.121777

B1 <-ggplot(datos, aes(x = MN_BNC , fill = EXPOSICION)) +
  geom_density(alpha = 0.5)
  #ggtitle("Grafica de densidad por grupos de exposición")

B2 <-ggplot(datos, aes(x = NPB , fill = EXPOSICION)) +
  geom_density(alpha = 0.5)
  #ggtitle("Grafica de densidad por grupos de exposición")


B3 <-ggplot(datos, aes(x = BNCMN_MNL , fill = EXPOSICION)) +
  geom_density(alpha = 0.5)
  #ggtitle("Grafica de densidad por grupos de exposición")

plot_grid(B1, B2, B3, nrow  = 1, ncol=3, labels="AUTO")

## DIAGRAMAS DE CAJA POR CADA BIOMARCADOR EN RELACIÓN AL SEXO


C1 <-ggplot(datos, aes(x = EXPOSICION, y = MN_BNC, fill = SEXO)) +
  geom_boxplot()+stat_summary(fun = mean, color="blue")+
  scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
  labs(x = 'EXPOSICION',
       y = 'MN_BNC',
       fill = 'EXPOSICION')

C2 <- ggplot(datos, aes(x = EXPOSICION, y = NPB, fill = SEXO)) +
  geom_boxplot()+stat_summary(fun = mean, color="blue")+
  scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
  labs(x = 'EXPOSICION',
       y = 'NPB',
       fill = 'EXPOSICION')

C3 <- ggplot(datos, aes(x = EXPOSICION, y = BNCMN_MNL, fill = SEXO)) +
  geom_boxplot()+stat_summary(fun = mean, color="blue")+
  scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
  labs(x = 'EXPOSICION',
       y = 'BNCMN_MNL',
       fill = 'EXPOSICION')

plot_grid(C1, C2, C3, nrow  = 1, ncol=3, labels="AUTO")

## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

## DIAGRAMA DE PUNTOS POR BIOMARCADOR Y EDAD

D1<- ggplot(datos, aes(x=EDAD , y= MN_BNC))+geom_point(aes(color = EXPOSICION), alpha= 0.5)

D2 <-ggplot(datos, aes(x=EDAD , y= NPB))+geom_point(aes(color = EXPOSICION), alpha= 0.5)

D3 <-ggplot(datos, aes(x=EDAD , y= BNCMN_MNL))+geom_point(aes(color = EXPOSICION), alpha= 0.5)

plot_grid(D1, D2, D3, nrow  = 1, ncol=3, labels="AUTO")

## GENERACION DE MODELOS PRIMER BIOMARCADOR 
rangedad <- cut(datos$EDAD,4)
table(rangedad)

## rangedad
##   (18,29.8] (29.8,41.5] (41.5,53.2]   (53.2,65] 
##          65          34          23          17

## MODELO POISSON 
modeloP <- glm(MN_BNC~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)

## 
## Call:
## glm(formula = MN_BNC ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"), 
##     data = datos)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.71611  -1.36771  -0.00991   0.56467   2.35766  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)         -0.06687    0.18153  -0.368   0.7126  
## rangedad(29.8,41.5]  0.36688    0.19538   1.878   0.0604 .
## rangedad(41.5,53.2]  0.02438    0.24910   0.098   0.9220  
## rangedad(53.2,65]    0.36617    0.24672   1.484   0.1378  
## EXPOSICION1         -0.03458    0.17908  -0.193   0.8469  
## SEXOM                0.08697    0.17479   0.498   0.6188  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 186.05  on 138  degrees of freedom
## Residual deviance: 180.94  on 133  degrees of freedom
## AIC: 397.31
## 
## Number of Fisher Scoring iterations: 5

confint(modeloP)

## Waiting for profiling to be done...

##                          2.5 %    97.5 %
## (Intercept)         -0.4381216 0.2744306
## rangedad(29.8,41.5] -0.0206860 0.7475681
## rangedad(41.5,53.2] -0.4851105 0.4965264
## rangedad(53.2,65]   -0.1369548 0.8350173
## EXPOSICION1         -0.3783931 0.3254301
## SEXOM               -0.2636854 0.4232299

## AJUSTE DEL MODELO
dispersiontest(modeloP, trafo = 1)

## 
##  Overdispersion test
## 
## data:  modeloP
## z = 0.89937, p-value = 0.1842
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##      alpha 
## 0.09288681

gf<-goodfit(datos$MN_BNC, type = "poisson", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda

## $lambda
## [1] 1.099183

summary(gf) ### VALOR MAYOR A 0.05 POR TANTO NO SE AJUSTA UN MODELO POISSON

## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect

## 
##   Goodness-of-fit test for poisson distribution
## 
##              X^2 df   P(> X^2)
## Pearson 6.364363  3 0.09516794

#mean(datos$MN_BNC)
#var(datos$MN_BNC).

## PREDICCIÓN DE CEROS

zobs <- datos$MN_BNC == 0
head(zobs)

## [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE

zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))

##         1         2         3         4         5         6 
## 0.3835083 0.3835083 0.3924623 0.3835083 0.2592758 0.2592758

c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON

##       obs       poi 
## 0.3956835 0.3417227

#### BINOMIAL NEGATIVA
modelobn <- glm.nb(MN_BNC~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)

## 
## Call:
## glm.nb(formula = MN_BNC ~ rangedad + EXPOSICION + SEXO, data = datos, 
##     link = "log", init.theta = 9.210107463)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6501  -1.3340  -0.0096   0.5228   2.1677  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)         -0.06804    0.19145  -0.355   0.7223  
## rangedad(29.8,41.5]  0.36594    0.20754   1.763   0.0779 .
## rangedad(41.5,53.2]  0.02473    0.26164   0.095   0.9247  
## rangedad(53.2,65]    0.36670    0.26270   1.396   0.1628  
## EXPOSICION1         -0.03210    0.18982  -0.169   0.8657  
## SEXOM                0.08550    0.18561   0.461   0.6450  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.2101) family taken to be 1)
## 
##     Null deviance: 169.74  on 138  degrees of freedom
## Residual deviance: 165.20  on 133  degrees of freedom
## AIC: 398.62
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.2 
##           Std. Err.:  12.2 
## 
##  2 x log-likelihood:  -384.623

confint(modelobn)

## Waiting for profiling to be done...

##                           2.5 %    97.5 %
## (Intercept)         -0.45662490 0.2934779
## rangedad(29.8,41.5] -0.04476488 0.7707722
## rangedad(41.5,53.2] -0.50774322 0.5237465
## rangedad(53.2,65]   -0.16484609 0.8686032
## EXPOSICION1         -0.39765039 0.3475926
## SEXOM               -0.28511234 0.4434756

## AJUSTE DEL MODELO
gf<-goodfit(datos$MN_BNC, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON

## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced

## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced

## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced

gf$par # valor de lamda

## $size
## [1] 4.510877
## 
## $prob
## [1] 0.801192

summary(gf)

## 
##   Goodness-of-fit test for nbinomial distribution
## 
##            X^2 df P(> X^2)
## Pearson 4.0624  2 0.131178

## CEROS EN EL MODELO BN

munb <- exp(predict(modelobn))
theta <- modelobn$theta

znb <- (theta/(munb+theta))^theta

# also dnbinom(0, mu=munb, size=theta)
mean(znb)

## [1] 0.3620836

## GENERACION DE MODELOS SEGUNDO BIOMARCADOR 
modeloP <- glm(NPB~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)

## 
## Call:
## glm(formula = NPB ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"), 
##     data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2574  -0.6147  -0.5208  -0.3378   2.6524  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.9979     0.4623  -4.321 1.55e-05 ***
## rangedad(29.8,41.5]   0.1591     0.5723   0.278 0.781002    
## rangedad(41.5,53.2]   0.7062     0.5425   1.302 0.193048    
## rangedad(53.2,65]     1.5906     0.4780   3.328 0.000876 ***
## EXPOSICION1           0.1722     0.4385   0.393 0.694517    
## SEXOM                -1.0383     0.5410  -1.919 0.054944 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 111.351  on 138  degrees of freedom
## Residual deviance:  95.183  on 133  degrees of freedom
## AIC: 155.24
## 
## Number of Fisher Scoring iterations: 6

## AJUSTE DEL MODELO
gf<-goodfit(datos$NPB, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda

## $size
## [1] 0.4368505
## 
## $prob
## [1] 0.6699222

summary(gf)

## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect

## 
##   Goodness-of-fit test for nbinomial distribution
## 
##              X^2 df  P(> X^2)
## Pearson 0.463865  1 0.4958232

dispersiontest(modeloP, trafo = 1)

## 
##  Overdispersion test
## 
## data:  modeloP
## z = 0.79941, p-value = 0.212
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##      alpha 
## 0.08003796

## PREDICCIÓN DE CEROS

zobs <- datos$NPB == 0
head(zobs)

## [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE

zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))

##         1         2         3         4         5         6 
## 0.7597221 0.7597221 0.8731735 0.7597221 0.8529869 0.8529869

c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON

##       obs       poi 
## 0.8417266 0.8232441

#### BINOMIAL NEGATIVA
modelobn <- glm.nb(NPB~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)

## 
## Call:
## glm.nb(formula = NPB ~ rangedad + EXPOSICION + SEXO, data = datos, 
##     link = "log", init.theta = 1.500784669)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1124  -0.5952  -0.5146  -0.3407   2.3173  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.9774     0.4891  -4.043 5.27e-05 ***
## rangedad(29.8,41.5]   0.1613     0.5992   0.269  0.78781    
## rangedad(41.5,53.2]   0.6834     0.5833   1.172  0.24134    
## rangedad(53.2,65]     1.5655     0.5352   2.925  0.00344 ** 
## EXPOSICION1           0.1449     0.4775   0.304  0.76145    
## SEXOM                -0.9950     0.5678  -1.752  0.07972 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.5008) family taken to be 1)
## 
##     Null deviance: 92.808  on 138  degrees of freedom
## Residual deviance: 79.425  on 133  degrees of freedom
## AIC: 156.06
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.50 
##           Std. Err.:  1.77 
## 
##  2 x log-likelihood:  -142.064

confint(modelobn)

## Waiting for profiling to be done...

##                          2.5 %     97.5 %
## (Intercept)         -3.0379027 -1.0969845
## rangedad(29.8,41.5] -1.0878358  1.3208364
## rangedad(41.5,53.2] -0.5043003  1.8254207
## rangedad(53.2,65]    0.5240610  2.6427443
## EXPOSICION1         -0.7599213  1.1489837
## SEXOM               -2.2692486  0.0291524

## AJUSTE DEL MODELO
gf<-goodfit(datos$NPB, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda

## $size
## [1] 0.4368505
## 
## $prob
## [1] 0.6699222

summary(gf)

## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect

## 
##   Goodness-of-fit test for nbinomial distribution
## 
##              X^2 df  P(> X^2)
## Pearson 0.463865  1 0.4958232

## CEROS EN EL MODELO BN

munb <- exp(predict(modelobn))
theta <- modelobn$theta

znb <- (theta/(munb+theta))^theta

# also dnbinom(0, mu=munb, size=theta)
mean(znb)

## [1] 0.8365493

###### GENERACION DE MODELOS TERCER BIOMARCADOR 
modeloP <- glm(BNCMN_MNL~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)

## 
## Call:
## glm(formula = BNCMN_MNL ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"), 
##     data = datos)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7093  -1.9769  -0.6875   1.2641   7.5533  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.29675    0.08595  15.088  < 2e-16 ***
## rangedad(29.8,41.5]  0.47538    0.08701   5.463 4.67e-08 ***
## rangedad(41.5,53.2]  0.61529    0.09321   6.601 4.08e-11 ***
## rangedad(53.2,65]    0.77225    0.09767   7.907 2.64e-15 ***
## EXPOSICION1          0.33693    0.08149   4.135 3.56e-05 ***
## SEXOM               -0.16276    0.07592  -2.144    0.032 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 869.70  on 138  degrees of freedom
## Residual deviance: 756.71  on 133  degrees of freedom
## AIC: 1188.8
## 
## Number of Fisher Scoring iterations: 5

## AJUSTE DEL MODELO
gf<-goodfit(datos$BNCMN_MNL, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda

## $size
## [1] 0.9503965
## 
## $prob
## [1] 0.1138231

summary(gf)

## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect

## 
##   Goodness-of-fit test for nbinomial distribution
## 
##              X^2 df  P(> X^2)
## Pearson 30.36615 39 0.8373707

dispersiontest(modeloP, trafo = 1)

## 
##  Overdispersion test
## 
## data:  modeloP
## z = 4.4772, p-value = 3.782e-06
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##    alpha 
## 4.732268

## PREDICCIÓN DE CEROS

zobs <- datos$BNCMN_MNL == 0
head(zobs)

## [1]  TRUE FALSE FALSE FALSE FALSE FALSE

zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))

##           1           2           3           4           5           6 
## 0.001151261 0.001151261 0.025799474 0.001151261 0.002785263 0.002785263

c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON

##         obs         poi 
## 0.151079137 0.007734651

#### BINOMIAL NEGATIVA
modelobn <- glm.nb(BNCMN_MNL~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)

## 
## Call:
## glm.nb(formula = BNCMN_MNL ~ rangedad + EXPOSICION + SEXO, data = datos, 
##     link = "log", init.theta = 1.210635153)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3632  -0.9003  -0.2807   0.4792   2.0861  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           1.3610     0.1869   7.284 3.25e-13 ***
## rangedad(29.8,41.5]   0.4526     0.2116   2.139  0.03244 *  
## rangedad(41.5,53.2]   0.6006     0.2401   2.502  0.01236 *  
## rangedad(53.2,65]     0.7614     0.2679   2.842  0.00448 ** 
## EXPOSICION1           0.2702     0.1893   1.427  0.15360    
## SEXOM                -0.1773     0.1872  -0.947  0.34350    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.2106) family taken to be 1)
## 
##     Null deviance: 178.87  on 138  degrees of freedom
## Residual deviance: 161.71  on 133  degrees of freedom
## AIC: 814.47
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.211 
##           Std. Err.:  0.187 
## 
##  2 x log-likelihood:  -800.470

confint(modelobn)

## Waiting for profiling to be done...

##                           2.5 %    97.5 %
## (Intercept)          1.01280770 1.7269871
## rangedad(29.8,41.5]  0.03976645 0.8771952
## rangedad(41.5,53.2]  0.13780614 1.0876825
## rangedad(53.2,65]    0.24617791 1.3146622
## EXPOSICION1         -0.11421157 0.6430809
## SEXOM               -0.53956274 0.1963908

## AJUSTE DEL MODELO
gf<-goodfit(datos$BNCMN_MNL, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda

## $size
## [1] 0.9503965
## 
## $prob
## [1] 0.1138231

summary(gf)

## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect

## 
##   Goodness-of-fit test for nbinomial distribution
## 
##              X^2 df  P(> X^2)
## Pearson 30.36615 39 0.8373707

## CEROS EN EL MODELO BN

munb <- exp(predict(modelobn))
theta <- modelobn$theta

znb <- (theta/(munb+theta))^theta

# also dnbinom(0, mu=munb, size=theta)
mean(znb)

## [1] 0.1196679

############## PRUEBAS DE HIPOTESIS ##################################
rangedad <- cut(datos$EDAD,4)
table(rangedad)

## rangedad
##   (18,29.8] (29.8,41.5] (41.5,53.2]   (53.2,65] 
##          65          34          23          17

boxplot(datos$NPB~rangedad)

leveneTest(MN_BNC ~ rangedad, data = datos, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##        Df F value Pr(>F)
## group   3  1.3459 0.2622
##       135

leveneTest(NPB ~ rangedad, data = datos, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##        Df F value Pr(>F)  
## group   3  3.9146 0.0102 *
##       135                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

leveneTest(BNCMN_MNL ~ rangedad, data = datos, center = "median")

## Levene's Test for Homogeneity of Variance (center = "median")
##        Df F value Pr(>F)
## group   3  1.9925 0.1181
##       135

kruskal.test(MN_BNC ~ rangedad, data = datos)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  MN_BNC by rangedad
## Kruskal-Wallis chi-squared = 3.7952, df = 3, p-value = 0.2844

kruskal.test(NPB ~ rangedad, data = datos)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  NPB by rangedad
## Kruskal-Wallis chi-squared = 6.9841, df = 3, p-value = 0.07241

kruskal.test(BNCMN_MNL ~ rangedad, data = datos)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  BNCMN_MNL by rangedad
## Kruskal-Wallis chi-squared = 13.729, df = 3, p-value = 0.003298

pairwise.wilcox.test(datos$NPB, rangedad, p.adjust.method = "holm", data = datos )

## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties

## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties

## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties

## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  datos$NPB and rangedad 
## 
##             (18,29.8] (29.8,41.5] (41.5,53.2]
## (29.8,41.5] 1.00      -           -          
## (41.5,53.2] 1.00      1.00        -          
## (53.2,65]   0.06      0.32        0.80       
## 
## P value adjustment method: holm

wilcox.test(data=datos, MN_BNC~EXPOSICION, alternative = "two.sided",paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  MN_BNC by EXPOSICION
## W = 1960, p-value = 0.8123
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -5.917667e-05  1.718003e-05
## sample estimates:
## difference in location 
##          -2.205376e-05

wilcox.test(data=datos, NPB~EXPOSICION, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  NPB by EXPOSICION
## W = 1911.5, p-value = 0.4776
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -7.156865e-06  5.641974e-05
## sample estimates:
## difference in location 
##          -5.753896e-05

wilcox.test(data=datos, BNCMN_MNL~EXPOSICION, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  BNCMN_MNL by EXPOSICION
## W = 1769, p-value = 0.2656
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -2.9999510  0.9999909
## sample estimates:
## difference in location 
##             -0.9999728

fligner.test(data=datos, NPB~EXPOSICION)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  NPB by EXPOSICION
## Fligner-Killeen:med chi-squared = 0.40343, df = 1, p-value = 0.5253

wilcox.test(data=datos, MN_BNC~SEXO, alternative = "two.sided",paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  MN_BNC by SEXO
## W = 1932.5, p-value = 0.7108
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -4.696426e-05  1.842141e-05
## sample estimates:
## difference in location 
##          -4.051747e-05

wilcox.test(data=datos, NPB~SEXO, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  NPB by SEXO
## W = 2194, p-value = 0.1778
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -1.313769e-05  1.427878e-05
## sample estimates:
## difference in location 
##           6.805999e-05

wilcox.test(data=datos, BNCMN_MNL~SEXO, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  BNCMN_MNL by SEXO
## W = 2089, p-value = 0.7106
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -1.000022  2.000015
## sample estimates:
## difference in location 
##           4.924799e-05

fligner.test(data=datos, NPB~SEXO)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  NPB by SEXO
## Fligner-Killeen:med chi-squared = 2.2453, df = 1, p-value = 0.134

## prueba chi cuadrado de independencia entre sexo y exposición
#create table 
data1 <- matrix (c (11, 30, 30, 68), ncol = 2 , byrow = TRUE )
colnames(data1) <- c (" M ", " F ")
row.names(data1) <- c (" 0 ", " 1 ")
table(datos$SEXO, datos$EXPOSICION)

##    
##      0  1
##   F 30 68
##   M 11 30

chisq.test(data1)$expected

##           M        F 
##  0  12.09353 28.90647
##  1  28.90647 69.09353

chisq.test (data1)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  data1
## X-squared = 0.058601, df = 1, p-value = 0.8087

## prueba chi cuadrado de independencia entre rango de edad y exposición
data2 <- matrix (c (22, 43, 10, 24, 6, 17, 3, 14), ncol = 2 , byrow = TRUE )
colnames(data2) <- c (" 0 ", " 1 ")
row.names(data2) <- c (" I1 ", " I2 ", "I3", "I4")
table(rangedad,datos$EXPOSICION )

##              
## rangedad       0  1
##   (18,29.8]   22 43
##   (29.8,41.5] 10 24
##   (41.5,53.2]  6 17
##   (53.2,65]    3 14

chisq.test (data2)$expected

##             0        1 
##  I1  19.172662 45.82734
##  I2  10.028777 23.97122
## I3    6.784173 16.21583
## I4    5.014388 11.98561

chisq.test (data2)

## 
##  Pearson's Chi-squared test
## 
## data:  data2
## X-squared = 1.8678, df = 3, p-value = 0.6003

## RIESGO RELATIVO

library(jtools)

## Warning: package 'jtools' was built under R version 4.2.3

summ(modeloP, exp = T)

## MODEL INFO:
## Observations: 139
## Dependent Variable: BNCMN_MNL
## Type: Generalized linear model
##   Family: poisson 
##   Link function: log 
## 
## MODEL FIT:
## χ²(5) = 112.99, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.56
## Pseudo-R² (McFadden) = 0.09
## AIC = 1188.77, BIC = 1206.38 
## 
## Standard errors: MLE
## --------------------------------------------------------------------
##                             exp(Est.)   2.5%   97.5%   z val.      p
## ------------------------- ----------- ------ ------- -------- ------
## (Intercept)                      3.66   3.09    4.33    15.09   0.00
## rangedad(29.8,41.5]              1.61   1.36    1.91     5.46   0.00
## rangedad(41.5,53.2]              1.85   1.54    2.22     6.60   0.00
## rangedad(53.2,65]                2.16   1.79    2.62     7.91   0.00
## EXPOSICION1                      1.40   1.19    1.64     4.13   0.00
## SEXOM                            0.85   0.73    0.99    -2.14   0.03
## --------------------------------------------------------------------

summ(modelobn, exp = T)

## Error in glm.control(...) : 
##   unused argument (family = list("Negative Binomial(1.2106)", "log", function (mu) 
## log(mu), function (eta) 
## pmax(exp(eta), .Machine$double.eps), function (mu) 
## mu + mu^2/.Theta, function (y, mu, wt) 
## 2 * wt * (y * log(pmax(1, y)/mu) - (y + .Theta) * log((y + .Theta)/(mu + .Theta))), function (y, n, mu, wt, dev) 
## {
##     term <- (y + .Theta) * log(mu + .Theta) - y * log(mu) + lgamma(y + 1) - .Theta * log(.Theta) + lgamma(.Theta) - lgamma(.Theta + y)
##     2 * sum(term * wt)
## }, function (eta) 
## pmax(exp(eta), .Machine$double.eps), expression({
##     if (any(y < 0)) stop("negative values not allowed for the negative binomial family")
##     n <- rep(1, nobs)
##     mustart <- y + (y == 0)/6
## }), function (mu) 
## all(mu > 0), function (eta) 
## TRUE, function (object, nsim) 
## {
##     ftd <- fitted(object)
##     rnegbin(nsim * length(ftd), ftd, .Theta)
## }))

## Warning: Something went wrong when calculating the pseudo R-squared. Returning NA
## instead.

## MODEL INFO:
## Observations: 139
## Dependent Variable: BNCMN_MNL
## Type: Generalized linear model
##   Family: Negative Binomial(1.2106) 
##   Link function: log 
## 
## MODEL FIT:
## χ²(NA) = NA, p = NA
## Pseudo-R² (Cragg-Uhler) = NA
## Pseudo-R² (McFadden) = NA
## AIC = 814.47, BIC = 835.01 
## 
## Standard errors: MLE
## --------------------------------------------------------------------
##                             exp(Est.)   2.5%   97.5%   z val.      p
## ------------------------- ----------- ------ ------- -------- ------
## (Intercept)                      3.90   2.70    5.63     7.28   0.00
## rangedad(29.8,41.5]              1.57   1.04    2.38     2.14   0.03
## rangedad(41.5,53.2]              1.82   1.14    2.92     2.50   0.01
## rangedad(53.2,65]                2.14   1.27    3.62     2.84   0.00
## EXPOSICION1                      1.31   0.90    1.90     1.43   0.15
## SEXOM                            0.84   0.58    1.21    -0.95   0.34
## --------------------------------------------------------------------

Modelos de regressión para el conteo de daños de biomarcadores en personas expuestas a minas de carbon

Zuriel F Peña Ubarne

11/3/2023