library(MASS) ## Datos
## Warning: package 'MASS' was built under R version 4.2.3
library(mgcv) ## GAM
## Warning: package 'mgcv' was built under R version 4.2.3
## Loading required package: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(sqldf) ## Manejo de Datos
## Warning: package 'sqldf' was built under R version 4.2.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.2.3
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.2.3
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
library(xtable) ## Latex
## Warning: package 'xtable' was built under R version 4.2.3
library(ggplot2) ## Graficos
## Warning: package 'ggplot2' was built under R version 4.2.3
library(GGally) ## Pairs
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(glmtoolbox) ## glm
## Warning: package 'glmtoolbox' was built under R version 4.2.3
library(gratia) ## grafico
## Warning: package 'gratia' was built under R version 4.2.3
# Arreglar la base de datos
Data <- sqldf("select time, sex, age, thickness, ulcer, case when status=1 then 0
else 1 end as status
from Melanoma
where status<3") # datos de personas muertas por melanoma (0) ó personas vivas (1)
Data <- Data[,c(6, 1,2,3,4,5)]
Data[,1] <- as.numeric(Data$status)
# Analisis de los datos
a <- as.data.frame(summary(Data)[,2]) # time
a$`time` <- a$`summary(Data)[, 2]`
a<- as.data.frame(a$time)
xtable(a)
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
## \hline
## & a\$time \\
## \hline
## 1 & Min. : 35 \\
## 2 & 1st Qu.:1574 \\
## 3 & Median :2024 \\
## 4 & Mean :2213 \\
## 5 & 3rd Qu.:3054 \\
## 6 & Max. :5565 \\
## \hline
## \end{tabular}
## \end{table}
a <- as.data.frame(summary(Data)[,4]) # age
a$`age` <- a$`summary(Data)[, 4]`
a<- as.data.frame(a$age)
xtable(a)
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
## \hline
## & a\$age \\
## \hline
## 1 & Min. : 4.00 \\
## 2 & 1st Qu.:41.00 \\
## 3 & Median :54.00 \\
## 4 & Mean :51.52 \\
## 5 & 3rd Qu.:63.50 \\
## 6 & Max. :95.00 \\
## \hline
## \end{tabular}
## \end{table}
a <- as.data.frame(summary(Data)[,5]) # thickness
a$`thickness` <- a$`summary(Data)[, 5]`
a<- as.data.frame(a$thickness)
xtable(a)
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
## \hline
## & a\$thickness \\
## \hline
## 1 & Min. : 0.100 \\
## 2 & 1st Qu.: 0.970 \\
## 3 & Median : 1.940 \\
## 4 & Mean : 2.861 \\
## 5 & 3rd Qu.: 3.540 \\
## 6 & Max. :17.420 \\
## \hline
## \end{tabular}
## \end{table}
# Correlacion
Data2 <- Data
Data2$status <- as.factor(Data2$status)
ggpairs(Data2, columns=2:6, ggplot2::aes(colour=status))

cor(Data[,2:6], method = "pearson")
## time sex age thickness ulcer
## time 1.0000000 -0.15615868 -0.25966221 -0.2291520 -0.24566329
## sex -0.1561587 1.00000000 0.07240028 0.1692452 0.18987276
## age -0.2596622 0.07240028 1.00000000 0.1913597 0.09764587
## thickness -0.2291520 0.16924518 0.19135973 1.0000000 0.42040819
## ulcer -0.2456633 0.18987276 0.09764587 0.4204082 1.00000000
pairs(cbind(Data[,2:6]))

attach(Data)
## glm
fit1 <- glm(status~time+sex+age+thickness+ulcer, family = binomial(logit), data = Data)
summary(fit1)
##
## Call:
## glm(formula = status ~ time + sex + age + thickness + ulcer,
## family = binomial(logit), data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5971 -0.3210 0.1875 0.6392 2.2143
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1641628 1.0557919 -2.050 0.04038 *
## time 0.0020894 0.0003825 5.462 4.7e-08 ***
## sex -0.1623156 0.4593104 -0.353 0.72380
## age 0.0004228 0.0141003 0.030 0.97608
## thickness -0.0936304 0.0884660 -1.058 0.28988
## ulcer -1.2219318 0.4670858 -2.616 0.00889 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 232.84 on 190 degrees of freedom
## Residual deviance: 133.35 on 185 degrees of freedom
## AIC: 145.35
##
## Number of Fisher Scoring iterations: 6
stepCriterion(fit1, criterion ="aic")
##
## Family: binomial
## Link function: logit
##
## Initial model:
## ~ 1
##
##
## Step 0 :
## df AIC BIC adj.R-squared P(Chisq>)(*)
## + time 1 149.95 156.46 0.3699 7.935e-10
## + ulcer 1 209.66 216.16 0.1121 7.510e-07
## + thickness 1 217.45 223.95 0.0784 9.178e-05
## + sex 1 230.93 237.43 0.0202 0.01517
## + age 1 233.02 239.53 0.0112 0.05540
## <none> 234.84 238.09 0.0000
##
## Step 1 : + time
##
## df AIC BIC adj.R-squared P(Chisq>)(*)
## + ulcer 1 140.76 150.52 0.4151 0.001053
## + thickness 1 147.04 156.79 0.3878 0.039060
## <none> 149.95 156.46 0.3699
## + sex 1 150.79 160.54 0.3716 0.278991
## + age 1 151.67 161.42 0.3677 0.594843
##
## Step 2 : + ulcer
##
## df AIC BIC adj.R-squared P(Chisq>)(*)
## <none> 140.76 150.52 0.4151
## + thickness 1 141.48 154.49 0.4175 0.2700
## + sex 1 142.56 155.57 0.4128 0.6527
## + age 1 142.73 155.74 0.4121 0.8684
## - time 1 209.66 216.16 0.1121 1.489e-08
##
## Final model:
## ~ time + ulcer
##
## ****************************************************************************
## (*) p-values of the Wald test
fit1 <- glm(status~time+ulcer, family = binomial(logit), data = Data)
summary(fit1)
##
## Call:
## glm(formula = status ~ time + ulcer, family = binomial(logit),
## data = Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6653 -0.3362 0.1713 0.6412 2.2433
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.5079543 0.7299371 -3.436 0.000591 ***
## time 0.0021678 0.0003828 5.663 1.49e-08 ***
## ulcer -1.4197600 0.4334017 -3.276 0.001053 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 232.84 on 190 degrees of freedom
## Residual deviance: 134.76 on 188 degrees of freedom
## AIC: 140.76
##
## Number of Fisher Scoring iterations: 6
AIC(fit1)
## [1] 140.761
## gam
# se usa ML smoothness selection
fit2 <- gam(status~s(time)+sex+s(age)+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
summary(fit2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## status ~ s(time) + sex + s(age) + s(thickness) + ulcer
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8757 0.6607 2.839 0.00453 **
## sex -0.2609 0.5054 -0.516 0.60577
## ulcer -0.7258 0.5877 -1.235 0.21681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 4.562 5.502 29.983 1.86e-05 ***
## s(age) 1.000 1.000 0.004 0.947
## s(thickness) 2.625 3.319 3.870 0.287
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.579 Deviance explained = 54.9%
## -ML = 66.465 Scale est. = 1 n = 191
# -age
fit3 <- gam(status~s(time)+sex+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit2, fit3, test = "Chisq") # nos da que la devianza cambio muy poco y no es significativa en el valor P
## Analysis of Deviance Table
##
## Model 1: status ~ s(time) + sex + s(age) + s(thickness) + ulcer
## Model 2: status ~ s(time) + sex + s(thickness) + ulcer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 176.54 105.01
## 2 177.50 104.89 -0.95233 0.11762
# Usamos AIC como medida tambien.
AIC(fit2, fit3)
## df AIC
## fit2 12.82123 130.6524
## fit3 11.86213 128.6166
summary(fit3)
##
## Family: binomial
## Link function: logit
##
## Formula:
## status ~ s(time) + sex + s(thickness) + ulcer
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8744 0.6615 2.834 0.0046 **
## sex -0.2634 0.5051 -0.521 0.6020
## ulcer -0.7201 0.5885 -1.224 0.2211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 4.573 5.514 29.958 1.91e-05 ***
## s(thickness) 2.648 3.348 3.985 0.277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.582 Deviance explained = 55%
## -ML = 66.468 Scale est. = 1 n = 191
# -sex
fit4 <- gam(status~s(time)+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit3, fit4, test = "Chisq") # nos da que la devianza cambio muy poco y no es significativa en el valor P
## Analysis of Deviance Table
##
## Model 1: status ~ s(time) + sex + s(thickness) + ulcer
## Model 2: status ~ s(time) + s(thickness) + ulcer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 177.50 104.89
## 2 181.57 112.00 -4.0768 -7.1068 0.136
# Usamos AIC como medida tambien.
AIC(fit3, fit4)
## df AIC
## fit3 11.862135 128.6166
## fit4 8.478317 128.9558
summary(fit4)
##
## Family: binomial
## Link function: logit
##
## Formula:
## status ~ s(time) + s(thickness) + ulcer
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9100 0.5868 3.255 0.00113 **
## ulcer -1.1072 0.5252 -2.108 0.03500 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 4.531 5.478 32.198 6.58e-06 ***
## s(thickness) 1.000 1.000 0.328 0.567
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.566 Deviance explained = 51.9%
## -ML = 65.947 Scale est. = 1 n = 191
# -thickness
fit5 <- gam(status~s(time)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit4, fit5, test = "Chisq") # nos da que la devianza cambio muy poco y no es significativa en el valor P
## Analysis of Deviance Table
##
## Model 1: status ~ s(time) + s(thickness) + ulcer
## Model 2: status ~ s(time) + ulcer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 181.57 112.00
## 2 182.51 111.91 -0.94027 0.092906
# Usamos AIC como medida tambien.
AIC(fit4, fit5)
## df AIC
## fit4 8.478317 128.9558
## fit5 7.536625 126.9795
summary(fit5)
##
## Family: binomial
## Link function: logit
##
## Formula:
## status ~ s(time) + ulcer
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9637 0.5985 3.281 0.00103 **
## ulcer -1.2240 0.4859 -2.519 0.01176 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(time) 4.588 5.537 33.45 5.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.568 Deviance explained = 51.9%
## -ML = 66.124 Scale est. = 1 n = 191
# fit4
# R-sq.(adj) = 0.566 Deviance explained = 51.9%
# -ML = 65.947 Scale est. = 1 n = 191
#fit5
# R-sq.(adj) = 0.568 Deviance explained = 51.9%
# -ML = 66.124 Scale est. = 1 n = 191
par(mfrow=c(1,1))
gratia::draw(fit4, main="fit4")# Se podría notar el efecto penalizado a lineal del grosor del tumor

gratia::draw(fit5, main="fit5")

# Como hay traslape en unos momentos, no se puede hablar que haya alguna
# diferencia en esos momentos y si no hay traslape como al final del
# grafico podemos decir que hubo un aumento significativo en las muertes
# Analisis de los residuales
par(mfrow=c(2,2))
gam.check(fit4, type = "deviance", old.style = FALSE)

##
## Method: ML Optimizer: outer newton
## full convergence after 14 iterations.
## Gradient range [-1.56257e-05,1.966884e-06]
## (score 65.94696 & scale 1).
## Hessian positive definite, eigenvalue range [1.562497e-05,0.765107].
## Model rank = 20 / 20
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(time) 9.00 4.53 1.08 0.93
## s(thickness) 9.00 1.00 0.89 0.04 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam.check(fit5, type = "deviance", old.style = FALSE)

##
## Method: ML Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-4.008419e-07,-4.008419e-07]
## (score 66.12423 & scale 1).
## Hessian positive definite, eigenvalue range [0.790318,0.790318].
## Model rank = 11 / 11
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(time) 9.00 4.59 1.09 0.89
# k' edf k-index p-value
#s(time) 9.00 4.53 1.08 0.895
#s(thickness) 9.00 1.00 0.89 0.095
# entre mas parecido este k y edf (grados de libertad efectivos ) es decir que necesitábamos
# una mayor base para hacer los splines que hicimos.
# En este caso el modelo lo hizo automático y busco unos valores adecuados.
# Esto se sabe porque las probabilidades tienen que ser mayor a 0.05 (en general)
# para los diferentes suavizadores.
# Tener un suavizador menor a 0.05 indiciaria que hay un problema en la
# estimación de la base del suavizador.
# Se puede controlar aumentando la base y se mejore la ondulación estimada de la curva.
coef(fit4)
## (Intercept) ulcer s(time).1 s(time).2 s(time).3
## 1.909998e+00 -1.107211e+00 2.231724e+00 -5.201885e+00 2.125474e+00
## s(time).4 s(time).5 s(time).6 s(time).7 s(time).8
## -3.335906e+00 7.349185e-01 -2.146047e+00 -6.298678e-02 -1.275061e+01
## s(time).9 s(thickness).1 s(thickness).2 s(thickness).3 s(thickness).4
## 3.770324e-02 -4.588766e-06 6.595445e-06 -2.268043e-06 -2.443334e-06
## s(thickness).5 s(thickness).6 s(thickness).7 s(thickness).8 s(thickness).9
## 1.634517e-06 2.268098e-06 2.126589e-06 -6.810093e-06 -1.755300e-01
# Muestra los valores de los coeficientes que se uso para cada distribución base que se uso en el spline en el suavizador de tiempo y de thickness
# En este caso 9 bases regresivas no lineales para producir esas curvas.
concurvity(fit4)
## para s(time) s(thickness)
## worst 0.5598995 0.2849871 0.4085209
## observed 0.5598995 0.1540974 0.2589938
## estimate 0.5598995 0.1333303 0.1763569
# Encabezados de Columna:
# para: Esta columna representa la proporción de la varianza total en la variable de respuesta que es capturada por cada término suave individualmente. Es una medida de cuánta no linealidad explica cada término suave.
# s(time), s(thickness): Estas columnas representan los términos suaves individuales en tu modelo (fit4). Muestran la proporción de no linealidad para cada término suave
# Valores más bajos en las filas observed y estimate indican que el modelo está más cercano a la linealidad. Un valor de 0 indicaría una linealidad perfecta.
# La fila worst ayuda a identificar qué término contribuye más a la no linealidad en el modelo.
# Estos valores sugieren que ambos términos suaves contribuyen en cierta medida a la no linealidad en el modelo, pero el modelo aún está relativamente cerca de ser lineal, especialmente según los valores de concavidad estimada. La fila worst ayuda a identificar qué término tiene la máxima no linealidad, y en este caso, es s(time)
par(mfrow=c(1,1))
# Hace un producto tensorial suavizado de las dos covariables time y thickness
fit7 <- gam(status ~ te(time, thickness), data = Data)
AIC(fit7)
## [1] 124.6744
vis.gam(fit7, type = 'response',plot.type = 'persp',phi = 30,
theta = 25,
n.grid = 500,
border = NA)

### Comparar LM y GLM
anova(fit1, fit4, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: status ~ time + ulcer
## Model 2: status ~ s(time) + s(thickness) + ulcer
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 188.00 134.76
## 2 183.47 112.00 4.5307 22.762 0.0002412 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GAM espacial
# Para ver la distribución de especies, evidencia de fenómenos locales es bastante útil