library(readxl)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(moments)
library(cowplot)
library(sf)
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
## Loading required package: maps
library(maps)
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.2.3
## Warning: package 'viridis' was built under R version 4.2.3
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
## unemp
## Warning: package 'hrbrthemes' was built under R version 4.2.3
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
## corrplot 0.92 loaded
library(stats)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Warning: package 'vcd' was built under R version 4.2.3
## Loading required package: grid
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
# llamado de base de datos
d <- read_excel("C:/Users/estudio/Desktop/TESIS/BASE DE DATOS MN SANGRE ENTERA CARBON FINAL (C).xlsx")
View(d)
summary(d)
## # LOCALIDAD EXPOSICION LONGITUD
## Min. : 1.00 Length:139 Min. :0.000 Length:139
## 1st Qu.:18.00 Class :character 1st Qu.:0.000 Class :character
## Median :35.00 Mode :character Median :1.000 Mode :character
## Mean :41.09 Mean :0.705
## 3rd Qu.:63.50 3rd Qu.:1.000
## Max. :98.00 Max. :1.000
## LATITUD SAMPLE_ID MONO BI
## Length:139 Length:139 Min. : 72.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 370.5 1st Qu.: 458.5
## Mode :character Mode :character Median : 584.0 Median : 921.0
## Mean : 660.9 Mean : 869.3
## 3rd Qu.: 865.5 3rd Qu.:1260.0
## Max. :2183.0 Max. :2103.0
## MULTI BNCMN_MNL MN_BNC MNCMN
## Min. : 0.0 Min. : 0.000 Min. :0.000 Min. : 0.0000
## 1st Qu.: 38.5 1st Qu.: 2.000 1st Qu.:0.000 1st Qu.: 0.0000
## Median : 97.0 Median : 5.000 Median :1.000 Median : 0.0000
## Mean :138.3 Mean : 6.446 Mean :1.094 Mean : 0.6043
## 3rd Qu.:205.0 3rd Qu.: 9.000 3rd Qu.:2.000 3rd Qu.: 1.0000
## Max. :795.0 Max. :41.000 Max. :4.000 Max. :10.0000
## NPB NBUDS NEC APOP
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.2086 Mean :0.5683 Mean :0.09353 Mean :0.05036
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :3.0000 Max. :6.0000 Max. :3.00000 Max. :3.00000
## NDI TOTAL_CELLS SEXO EDAD
## Min. :1.000 Min. : 93 Length:139 Min. :18.00
## 1st Qu.:1.457 1st Qu.:1005 Class :character 1st Qu.:23.00
## Median :1.677 Median :1962 Mode :character Median :32.00
## Mean :1.641 Mean :1677 Mean :33.89
## 3rd Qu.:1.841 3rd Qu.:2168 3rd Qu.:43.00
## Max. :2.152 Max. :3950 Max. :65.00
# se vuelve la variable EXPOSICION CARACTER
datos <- mutate(d, EXPOSICION= as.character(d$EXPOSICION))
#datos <- na.omit(datos)
summary(datos)
## # LOCALIDAD EXPOSICION LONGITUD
## Min. : 1.00 Length:139 Length:139 Length:139
## 1st Qu.:18.00 Class :character Class :character Class :character
## Median :35.00 Mode :character Mode :character Mode :character
## Mean :41.09
## 3rd Qu.:63.50
## Max. :98.00
## LATITUD SAMPLE_ID MONO BI
## Length:139 Length:139 Min. : 72.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 370.5 1st Qu.: 458.5
## Mode :character Mode :character Median : 584.0 Median : 921.0
## Mean : 660.9 Mean : 869.3
## 3rd Qu.: 865.5 3rd Qu.:1260.0
## Max. :2183.0 Max. :2103.0
## MULTI BNCMN_MNL MN_BNC MNCMN
## Min. : 0.0 Min. : 0.000 Min. :0.000 Min. : 0.0000
## 1st Qu.: 38.5 1st Qu.: 2.000 1st Qu.:0.000 1st Qu.: 0.0000
## Median : 97.0 Median : 5.000 Median :1.000 Median : 0.0000
## Mean :138.3 Mean : 6.446 Mean :1.094 Mean : 0.6043
## 3rd Qu.:205.0 3rd Qu.: 9.000 3rd Qu.:2.000 3rd Qu.: 1.0000
## Max. :795.0 Max. :41.000 Max. :4.000 Max. :10.0000
## NPB NBUDS NEC APOP
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.2086 Mean :0.5683 Mean :0.09353 Mean :0.05036
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :3.0000 Max. :6.0000 Max. :3.00000 Max. :3.00000
## NDI TOTAL_CELLS SEXO EDAD
## Min. :1.000 Min. : 93 Length:139 Min. :18.00
## 1st Qu.:1.457 1st Qu.:1005 Class :character 1st Qu.:23.00
## Median :1.677 Median :1962 Mode :character Median :32.00
## Mean :1.641 Mean :1677 Mean :33.89
## 3rd Qu.:1.841 3rd Qu.:2168 3rd Qu.:43.00
## Max. :2.152 Max. :3950 Max. :65.00
#### CORRELACIÓN DE VARIABLES
tabla2 <- datos[1:139, c(10,11,12,13,14,15,16,18,20)]
tab <- data.frame(cor(tabla2))
Matrixcorr <-cor(tab)
corrplot(Matrixcorr)

# HISTOGRAMA Y FUNCION DE DENSIDAD
A_1 <- datos %>% ggplot(aes(MN_BNC)) +
geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
labs(x="MN_BNC",y="")
#ylim(c(0,20))
#ggtitle("Distribución de la variable de respuesta MN_BNC")
A_2<-datos %>% ggplot(aes(NPB)) +
geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
labs(x="NPB",y="")
#ylim(c(0,20))
#ggtitle("Distribución de la variable de respuesta NPB")
A_3 <-datos %>% ggplot(aes(x=BNCMN_MNL)) +
geom_histogram(aes(y=..density..),bins=50,alpha=10,position="identity",binwidth=0.9, colour="black", fill="#9BCD9B") +
geom_density(aes(),alpha=99, color="red", lwd = 1.5)+
labs(x="BNCMN_MNL",y="")
#ylim(c(0,20))
#ggtitle("Distribución de la variable de respuesta BNCMN_MNL")
plot_grid(A_1, A_2, A_3, nrow = 1, ncol=3, labels="AUTO")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

## sesgo por cada biomarcador
skewness(datos$MN_BNC)
## [1] 0.7262248
## [1] 2.781986
skewness(datos$BNCMN_MNL)
## [1] 2.121777
B1 <-ggplot(datos, aes(x = MN_BNC , fill = EXPOSICION)) +
geom_density(alpha = 0.5)
#ggtitle("Grafica de densidad por grupos de exposición")
B2 <-ggplot(datos, aes(x = NPB , fill = EXPOSICION)) +
geom_density(alpha = 0.5)
#ggtitle("Grafica de densidad por grupos de exposición")
B3 <-ggplot(datos, aes(x = BNCMN_MNL , fill = EXPOSICION)) +
geom_density(alpha = 0.5)
#ggtitle("Grafica de densidad por grupos de exposición")
plot_grid(B1, B2, B3, nrow = 1, ncol=3, labels="AUTO")

## DIAGRAMAS DE CAJA POR CADA BIOMARCADOR EN RELACIÓN AL SEXO
C1 <-ggplot(datos, aes(x = EXPOSICION, y = MN_BNC, fill = SEXO)) +
geom_boxplot()+stat_summary(fun = mean, color="blue")+
scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
labs(x = 'EXPOSICION',
y = 'MN_BNC',
fill = 'EXPOSICION')
C2 <- ggplot(datos, aes(x = EXPOSICION, y = NPB, fill = SEXO)) +
geom_boxplot()+stat_summary(fun = mean, color="blue")+
scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
labs(x = 'EXPOSICION',
y = 'NPB',
fill = 'EXPOSICION')
C3 <- ggplot(datos, aes(x = EXPOSICION, y = BNCMN_MNL, fill = SEXO)) +
geom_boxplot()+stat_summary(fun = mean, color="blue")+
scale_fill_manual(values = c('#FFB6C1', '#7AC5CD', '#638239'))+
labs(x = 'EXPOSICION',
y = 'BNCMN_MNL',
fill = 'EXPOSICION')
plot_grid(C1, C2, C3, nrow = 1, ncol=3, labels="AUTO")
## Warning: Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).
## Removed 4 rows containing missing values (`geom_segment()`).

## DIAGRAMA DE PUNTOS POR BIOMARCADOR Y EDAD
D1<- ggplot(datos, aes(x=EDAD , y= MN_BNC))+geom_point(aes(color = EXPOSICION), alpha= 0.5)
D2 <-ggplot(datos, aes(x=EDAD , y= NPB))+geom_point(aes(color = EXPOSICION), alpha= 0.5)
D3 <-ggplot(datos, aes(x=EDAD , y= BNCMN_MNL))+geom_point(aes(color = EXPOSICION), alpha= 0.5)
plot_grid(D1, D2, D3, nrow = 1, ncol=3, labels="AUTO")

## GENERACION DE MODELOS PRIMER BIOMARCADOR
rangedad <- cut(datos$EDAD,4)
table(rangedad)
## rangedad
## (18,29.8] (29.8,41.5] (41.5,53.2] (53.2,65]
## 65 34 23 17
## MODELO POISSON
modeloP <- glm(MN_BNC~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)
##
## Call:
## glm(formula = MN_BNC ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"),
## data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.71611 -1.36771 -0.00991 0.56467 2.35766
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06687 0.18153 -0.368 0.7126
## rangedad(29.8,41.5] 0.36688 0.19538 1.878 0.0604 .
## rangedad(41.5,53.2] 0.02438 0.24910 0.098 0.9220
## rangedad(53.2,65] 0.36617 0.24672 1.484 0.1378
## EXPOSICION1 -0.03458 0.17908 -0.193 0.8469
## SEXOM 0.08697 0.17479 0.498 0.6188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 186.05 on 138 degrees of freedom
## Residual deviance: 180.94 on 133 degrees of freedom
## AIC: 397.31
##
## Number of Fisher Scoring iterations: 5
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.4381216 0.2744306
## rangedad(29.8,41.5] -0.0206860 0.7475681
## rangedad(41.5,53.2] -0.4851105 0.4965264
## rangedad(53.2,65] -0.1369548 0.8350173
## EXPOSICION1 -0.3783931 0.3254301
## SEXOM -0.2636854 0.4232299
## AJUSTE DEL MODELO
dispersiontest(modeloP, trafo = 1)
##
## Overdispersion test
##
## data: modeloP
## z = 0.89937, p-value = 0.1842
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
## alpha
## 0.09288681
gf<-goodfit(datos$MN_BNC, type = "poisson", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda
## $lambda
## [1] 1.099183
summary(gf) ### VALOR MAYOR A 0.05 POR TANTO NO SE AJUSTA UN MODELO POISSON
## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect
##
## Goodness-of-fit test for poisson distribution
##
## X^2 df P(> X^2)
## Pearson 6.364363 3 0.09516794
#mean(datos$MN_BNC)
#var(datos$MN_BNC).
## PREDICCIÓN DE CEROS
zobs <- datos$MN_BNC == 0
head(zobs)
## [1] TRUE TRUE FALSE TRUE TRUE TRUE
zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))
## 1 2 3 4 5 6
## 0.3835083 0.3835083 0.3924623 0.3835083 0.2592758 0.2592758
c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON
## obs poi
## 0.3956835 0.3417227
#### BINOMIAL NEGATIVA
modelobn <- glm.nb(MN_BNC~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)
##
## Call:
## glm.nb(formula = MN_BNC ~ rangedad + EXPOSICION + SEXO, data = datos,
## link = "log", init.theta = 9.210107463)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6501 -1.3340 -0.0096 0.5228 2.1677
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06804 0.19145 -0.355 0.7223
## rangedad(29.8,41.5] 0.36594 0.20754 1.763 0.0779 .
## rangedad(41.5,53.2] 0.02473 0.26164 0.095 0.9247
## rangedad(53.2,65] 0.36670 0.26270 1.396 0.1628
## EXPOSICION1 -0.03210 0.18982 -0.169 0.8657
## SEXOM 0.08550 0.18561 0.461 0.6450
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(9.2101) family taken to be 1)
##
## Null deviance: 169.74 on 138 degrees of freedom
## Residual deviance: 165.20 on 133 degrees of freedom
## AIC: 398.62
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 9.2
## Std. Err.: 12.2
##
## 2 x log-likelihood: -384.623
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.45662490 0.2934779
## rangedad(29.8,41.5] -0.04476488 0.7707722
## rangedad(41.5,53.2] -0.50774322 0.5237465
## rangedad(53.2,65] -0.16484609 0.8686032
## EXPOSICION1 -0.39765039 0.3475926
## SEXOM -0.28511234 0.4434756
## AJUSTE DEL MODELO
gf<-goodfit(datos$MN_BNC, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced
## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced
## Warning in pnbinom(count[-n], size = x[1], prob = x[2]): NaNs produced
## $size
## [1] 4.510877
##
## $prob
## [1] 0.801192
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Pearson 4.0624 2 0.131178
## CEROS EN EL MODELO BN
munb <- exp(predict(modelobn))
theta <- modelobn$theta
znb <- (theta/(munb+theta))^theta
# also dnbinom(0, mu=munb, size=theta)
mean(znb)
## [1] 0.3620836
## GENERACION DE MODELOS SEGUNDO BIOMARCADOR
modeloP <- glm(NPB~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)
##
## Call:
## glm(formula = NPB ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"),
## data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2574 -0.6147 -0.5208 -0.3378 2.6524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9979 0.4623 -4.321 1.55e-05 ***
## rangedad(29.8,41.5] 0.1591 0.5723 0.278 0.781002
## rangedad(41.5,53.2] 0.7062 0.5425 1.302 0.193048
## rangedad(53.2,65] 1.5906 0.4780 3.328 0.000876 ***
## EXPOSICION1 0.1722 0.4385 0.393 0.694517
## SEXOM -1.0383 0.5410 -1.919 0.054944 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 111.351 on 138 degrees of freedom
## Residual deviance: 95.183 on 133 degrees of freedom
## AIC: 155.24
##
## Number of Fisher Scoring iterations: 6
## AJUSTE DEL MODELO
gf<-goodfit(datos$NPB, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda
## $size
## [1] 0.4368505
##
## $prob
## [1] 0.6699222
## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Pearson 0.463865 1 0.4958232
dispersiontest(modeloP, trafo = 1)
##
## Overdispersion test
##
## data: modeloP
## z = 0.79941, p-value = 0.212
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
## alpha
## 0.08003796
## PREDICCIÓN DE CEROS
zobs <- datos$NPB == 0
head(zobs)
## [1] TRUE TRUE TRUE TRUE TRUE FALSE
zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))
## 1 2 3 4 5 6
## 0.7597221 0.7597221 0.8731735 0.7597221 0.8529869 0.8529869
c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON
## obs poi
## 0.8417266 0.8232441
#### BINOMIAL NEGATIVA
modelobn <- glm.nb(NPB~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)
##
## Call:
## glm.nb(formula = NPB ~ rangedad + EXPOSICION + SEXO, data = datos,
## link = "log", init.theta = 1.500784669)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1124 -0.5952 -0.5146 -0.3407 2.3173
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9774 0.4891 -4.043 5.27e-05 ***
## rangedad(29.8,41.5] 0.1613 0.5992 0.269 0.78781
## rangedad(41.5,53.2] 0.6834 0.5833 1.172 0.24134
## rangedad(53.2,65] 1.5655 0.5352 2.925 0.00344 **
## EXPOSICION1 0.1449 0.4775 0.304 0.76145
## SEXOM -0.9950 0.5678 -1.752 0.07972 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.5008) family taken to be 1)
##
## Null deviance: 92.808 on 138 degrees of freedom
## Residual deviance: 79.425 on 133 degrees of freedom
## AIC: 156.06
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.50
## Std. Err.: 1.77
##
## 2 x log-likelihood: -142.064
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -3.0379027 -1.0969845
## rangedad(29.8,41.5] -1.0878358 1.3208364
## rangedad(41.5,53.2] -0.5043003 1.8254207
## rangedad(53.2,65] 0.5240610 2.6427443
## EXPOSICION1 -0.7599213 1.1489837
## SEXOM -2.2692486 0.0291524
## AJUSTE DEL MODELO
gf<-goodfit(datos$NPB, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda
## $size
## [1] 0.4368505
##
## $prob
## [1] 0.6699222
## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Pearson 0.463865 1 0.4958232
## CEROS EN EL MODELO BN
munb <- exp(predict(modelobn))
theta <- modelobn$theta
znb <- (theta/(munb+theta))^theta
# also dnbinom(0, mu=munb, size=theta)
mean(znb)
## [1] 0.8365493
###### GENERACION DE MODELOS TERCER BIOMARCADOR
modeloP <- glm(BNCMN_MNL~ rangedad+EXPOSICION+SEXO, poisson(link = "log") , data = datos)
summary(modeloP)
##
## Call:
## glm(formula = BNCMN_MNL ~ rangedad + EXPOSICION + SEXO, family = poisson(link = "log"),
## data = datos)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7093 -1.9769 -0.6875 1.2641 7.5533
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.29675 0.08595 15.088 < 2e-16 ***
## rangedad(29.8,41.5] 0.47538 0.08701 5.463 4.67e-08 ***
## rangedad(41.5,53.2] 0.61529 0.09321 6.601 4.08e-11 ***
## rangedad(53.2,65] 0.77225 0.09767 7.907 2.64e-15 ***
## EXPOSICION1 0.33693 0.08149 4.135 3.56e-05 ***
## SEXOM -0.16276 0.07592 -2.144 0.032 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 869.70 on 138 degrees of freedom
## Residual deviance: 756.71 on 133 degrees of freedom
## AIC: 1188.8
##
## Number of Fisher Scoring iterations: 5
## AJUSTE DEL MODELO
gf<-goodfit(datos$BNCMN_MNL, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda
## $size
## [1] 0.9503965
##
## $prob
## [1] 0.1138231
## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Pearson 30.36615 39 0.8373707
dispersiontest(modeloP, trafo = 1)
##
## Overdispersion test
##
## data: modeloP
## z = 4.4772, p-value = 3.782e-06
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
## alpha
## 4.732268
## PREDICCIÓN DE CEROS
zobs <- datos$BNCMN_MNL == 0
head(zobs)
## [1] TRUE FALSE FALSE FALSE FALSE FALSE
zpoi <- exp(-exp(predict(modeloP))) # or dpois(0,exp(predict(mp)))
head(dpois(0,exp(predict(modeloP))))
## 1 2 3 4 5 6
## 0.001151261 0.001151261 0.025799474 0.001151261 0.002785263 0.002785263
c(obs=mean(zobs), poi=mean(zpoi)) # CEROS OBSERVADOS VS CEROS ´REDICHOS POISSON
## obs poi
## 0.151079137 0.007734651
#### BINOMIAL NEGATIVA
modelobn <- glm.nb(BNCMN_MNL~ rangedad+EXPOSICION+SEXO, link = "log" , data = datos)
summary(modelobn)
##
## Call:
## glm.nb(formula = BNCMN_MNL ~ rangedad + EXPOSICION + SEXO, data = datos,
## link = "log", init.theta = 1.210635153)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3632 -0.9003 -0.2807 0.4792 2.0861
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3610 0.1869 7.284 3.25e-13 ***
## rangedad(29.8,41.5] 0.4526 0.2116 2.139 0.03244 *
## rangedad(41.5,53.2] 0.6006 0.2401 2.502 0.01236 *
## rangedad(53.2,65] 0.7614 0.2679 2.842 0.00448 **
## EXPOSICION1 0.2702 0.1893 1.427 0.15360
## SEXOM -0.1773 0.1872 -0.947 0.34350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.2106) family taken to be 1)
##
## Null deviance: 178.87 on 138 degrees of freedom
## Residual deviance: 161.71 on 133 degrees of freedom
## AIC: 814.47
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.211
## Std. Err.: 0.187
##
## 2 x log-likelihood: -800.470
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.01280770 1.7269871
## rangedad(29.8,41.5] 0.03976645 0.8771952
## rangedad(41.5,53.2] 0.13780614 1.0876825
## rangedad(53.2,65] 0.24617791 1.3146622
## EXPOSICION1 -0.11421157 0.6430809
## SEXOM -0.53956274 0.1963908
## AJUSTE DEL MODELO
gf<-goodfit(datos$BNCMN_MNL, type = "nbinomial", method = "MinChisq") ## TEST QUE PERMITE SABER SI LOS DATOS SE AJUSTAN A UN MODELO POISSON
gf$par # valor de lamda
## $size
## [1] 0.9503965
##
## $prob
## [1] 0.1138231
## Warning in summary.goodfit(gf): Chi-squared approximation may be incorrect
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Pearson 30.36615 39 0.8373707
## CEROS EN EL MODELO BN
munb <- exp(predict(modelobn))
theta <- modelobn$theta
znb <- (theta/(munb+theta))^theta
# also dnbinom(0, mu=munb, size=theta)
mean(znb)
## [1] 0.1196679
############## PRUEBAS DE HIPOTESIS ##################################
rangedad <- cut(datos$EDAD,4)
table(rangedad)
## rangedad
## (18,29.8] (29.8,41.5] (41.5,53.2] (53.2,65]
## 65 34 23 17
boxplot(datos$NPB~rangedad)

leveneTest(MN_BNC ~ rangedad, data = datos, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 3 1.3459 0.2622
## 135
leveneTest(NPB ~ rangedad, data = datos, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 3 3.9146 0.0102 *
## 135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(BNCMN_MNL ~ rangedad, data = datos, center = "median")
## Levene's Test for Homogeneity of Variance (center = "median")
## Df F value Pr(>F)
## group 3 1.9925 0.1181
## 135
kruskal.test(MN_BNC ~ rangedad, data = datos)
##
## Kruskal-Wallis rank sum test
##
## data: MN_BNC by rangedad
## Kruskal-Wallis chi-squared = 3.7952, df = 3, p-value = 0.2844
kruskal.test(NPB ~ rangedad, data = datos)
##
## Kruskal-Wallis rank sum test
##
## data: NPB by rangedad
## Kruskal-Wallis chi-squared = 6.9841, df = 3, p-value = 0.07241
kruskal.test(BNCMN_MNL ~ rangedad, data = datos)
##
## Kruskal-Wallis rank sum test
##
## data: BNCMN_MNL by rangedad
## Kruskal-Wallis chi-squared = 13.729, df = 3, p-value = 0.003298
pairwise.wilcox.test(datos$NPB, rangedad, p.adjust.method = "holm", data = datos )
## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties
## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties
## Warning in wilcox.test.default(xi, xj, paired = paired, ...): cannot compute
## exact p-value with ties
##
## Pairwise comparisons using Wilcoxon rank sum test with continuity correction
##
## data: datos$NPB and rangedad
##
## (18,29.8] (29.8,41.5] (41.5,53.2]
## (29.8,41.5] 1.00 - -
## (41.5,53.2] 1.00 1.00 -
## (53.2,65] 0.06 0.32 0.80
##
## P value adjustment method: holm
wilcox.test(data=datos, MN_BNC~EXPOSICION, alternative = "two.sided",paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: MN_BNC by EXPOSICION
## W = 1960, p-value = 0.8123
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -5.917667e-05 1.718003e-05
## sample estimates:
## difference in location
## -2.205376e-05
wilcox.test(data=datos, NPB~EXPOSICION, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: NPB by EXPOSICION
## W = 1911.5, p-value = 0.4776
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -7.156865e-06 5.641974e-05
## sample estimates:
## difference in location
## -5.753896e-05
wilcox.test(data=datos, BNCMN_MNL~EXPOSICION, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: BNCMN_MNL by EXPOSICION
## W = 1769, p-value = 0.2656
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -2.9999510 0.9999909
## sample estimates:
## difference in location
## -0.9999728
fligner.test(data=datos, NPB~EXPOSICION)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: NPB by EXPOSICION
## Fligner-Killeen:med chi-squared = 0.40343, df = 1, p-value = 0.5253
wilcox.test(data=datos, MN_BNC~SEXO, alternative = "two.sided",paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: MN_BNC by SEXO
## W = 1932.5, p-value = 0.7108
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -4.696426e-05 1.842141e-05
## sample estimates:
## difference in location
## -4.051747e-05
wilcox.test(data=datos, NPB~SEXO, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: NPB by SEXO
## W = 2194, p-value = 0.1778
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -1.313769e-05 1.427878e-05
## sample estimates:
## difference in location
## 6.805999e-05
wilcox.test(data=datos, BNCMN_MNL~SEXO, alternative = "two.sided" ,paired = FALSE, correct = FALSE, conf.int = TRUE, conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: BNCMN_MNL by SEXO
## W = 2089, p-value = 0.7106
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -1.000022 2.000015
## sample estimates:
## difference in location
## 4.924799e-05
fligner.test(data=datos, NPB~SEXO)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: NPB by SEXO
## Fligner-Killeen:med chi-squared = 2.2453, df = 1, p-value = 0.134
## prueba chi cuadrado de independencia entre sexo y exposición
#create table
data1 <- matrix (c (11, 30, 30, 68), ncol = 2 , byrow = TRUE )
colnames(data1) <- c (" M ", " F ")
row.names(data1) <- c (" 0 ", " 1 ")
table(datos$SEXO, datos$EXPOSICION)
##
## 0 1
## F 30 68
## M 11 30
chisq.test(data1)$expected
## M F
## 0 12.09353 28.90647
## 1 28.90647 69.09353
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: data1
## X-squared = 0.058601, df = 1, p-value = 0.8087
## prueba chi cuadrado de independencia entre rango de edad y exposición
data2 <- matrix (c (22, 43, 10, 24, 6, 17, 3, 14), ncol = 2 , byrow = TRUE )
colnames(data2) <- c (" 0 ", " 1 ")
row.names(data2) <- c (" I1 ", " I2 ", "I3", "I4")
table(rangedad,datos$EXPOSICION )
##
## rangedad 0 1
## (18,29.8] 22 43
## (29.8,41.5] 10 24
## (41.5,53.2] 6 17
## (53.2,65] 3 14
chisq.test (data2)$expected
## 0 1
## I1 19.172662 45.82734
## I2 10.028777 23.97122
## I3 6.784173 16.21583
## I4 5.014388 11.98561
##
## Pearson's Chi-squared test
##
## data: data2
## X-squared = 1.8678, df = 3, p-value = 0.6003
## RIESGO RELATIVO
library(jtools)
## Warning: package 'jtools' was built under R version 4.2.3
## MODEL INFO:
## Observations: 139
## Dependent Variable: BNCMN_MNL
## Type: Generalized linear model
## Family: poisson
## Link function: log
##
## MODEL FIT:
## χ²(5) = 112.99, p = 0.00
## Pseudo-R² (Cragg-Uhler) = 0.56
## Pseudo-R² (McFadden) = 0.09
## AIC = 1188.77, BIC = 1206.38
##
## Standard errors: MLE
## --------------------------------------------------------------------
## exp(Est.) 2.5% 97.5% z val. p
## ------------------------- ----------- ------ ------- -------- ------
## (Intercept) 3.66 3.09 4.33 15.09 0.00
## rangedad(29.8,41.5] 1.61 1.36 1.91 5.46 0.00
## rangedad(41.5,53.2] 1.85 1.54 2.22 6.60 0.00
## rangedad(53.2,65] 2.16 1.79 2.62 7.91 0.00
## EXPOSICION1 1.40 1.19 1.64 4.13 0.00
## SEXOM 0.85 0.73 0.99 -2.14 0.03
## --------------------------------------------------------------------
## Error in glm.control(...) :
## unused argument (family = list("Negative Binomial(1.2106)", "log", function (mu)
## log(mu), function (eta)
## pmax(exp(eta), .Machine$double.eps), function (mu)
## mu + mu^2/.Theta, function (y, mu, wt)
## 2 * wt * (y * log(pmax(1, y)/mu) - (y + .Theta) * log((y + .Theta)/(mu + .Theta))), function (y, n, mu, wt, dev)
## {
## term <- (y + .Theta) * log(mu + .Theta) - y * log(mu) + lgamma(y + 1) - .Theta * log(.Theta) + lgamma(.Theta) - lgamma(.Theta + y)
## 2 * sum(term * wt)
## }, function (eta)
## pmax(exp(eta), .Machine$double.eps), expression({
## if (any(y < 0)) stop("negative values not allowed for the negative binomial family")
## n <- rep(1, nobs)
## mustart <- y + (y == 0)/6
## }), function (mu)
## all(mu > 0), function (eta)
## TRUE, function (object, nsim)
## {
## ftd <- fitted(object)
## rnegbin(nsim * length(ftd), ftd, .Theta)
## }))
## Warning: Something went wrong when calculating the pseudo R-squared. Returning NA
## instead.
## MODEL INFO:
## Observations: 139
## Dependent Variable: BNCMN_MNL
## Type: Generalized linear model
## Family: Negative Binomial(1.2106)
## Link function: log
##
## MODEL FIT:
## χ²(NA) = NA, p = NA
## Pseudo-R² (Cragg-Uhler) = NA
## Pseudo-R² (McFadden) = NA
## AIC = 814.47, BIC = 835.01
##
## Standard errors: MLE
## --------------------------------------------------------------------
## exp(Est.) 2.5% 97.5% z val. p
## ------------------------- ----------- ------ ------- -------- ------
## (Intercept) 3.90 2.70 5.63 7.28 0.00
## rangedad(29.8,41.5] 1.57 1.04 2.38 2.14 0.03
## rangedad(41.5,53.2] 1.82 1.14 2.92 2.50 0.01
## rangedad(53.2,65] 2.14 1.27 3.62 2.84 0.00
## EXPOSICION1 1.31 0.90 1.90 1.43 0.15
## SEXOM 0.84 0.58 1.21 -0.95 0.34
## --------------------------------------------------------------------