GAM

library(MASS)       ## Datos

## Warning: package 'MASS' was built under R version 4.2.3

library(mgcv)       ## GAM

## Warning: package 'mgcv' was built under R version 4.2.3

## Loading required package: nlme

## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.

library(sqldf)      ## Manejo de Datos

## Warning: package 'sqldf' was built under R version 4.2.3

## Loading required package: gsubfn

## Warning: package 'gsubfn' was built under R version 4.2.3

## Loading required package: proto

## Warning: package 'proto' was built under R version 4.2.3

## Loading required package: RSQLite

## Warning: package 'RSQLite' was built under R version 4.2.3

library(xtable)     ## Latex

## Warning: package 'xtable' was built under R version 4.2.3

library(ggplot2)    ## Graficos

## Warning: package 'ggplot2' was built under R version 4.2.3

library(GGally)     ## Pairs

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(glmtoolbox) ## glm

## Warning: package 'glmtoolbox' was built under R version 4.2.3

library(gratia)     ## grafico

## Warning: package 'gratia' was built under R version 4.2.3

# Arreglar la base de datos
Data <- sqldf("select time, sex, age, thickness, ulcer, case when status=1 then 0 
                                                             else 1 end as status
               from Melanoma
               where status<3") # datos de personas muertas por melanoma (0) ó personas vivas (1)

Data <- Data[,c(6, 1,2,3,4,5)]
Data[,1] <- as.numeric(Data$status)


# Analisis de los datos
a <- as.data.frame(summary(Data)[,2]) # time
a$`time` <- a$`summary(Data)[, 2]`
a<- as.data.frame(a$time)
xtable(a)

## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
##   \hline
##  & a\$time \\ 
##   \hline
## 1 & Min.   :  35   \\ 
##   2 & 1st Qu.:1574   \\ 
##   3 & Median :2024   \\ 
##   4 & Mean   :2213   \\ 
##   5 & 3rd Qu.:3054   \\ 
##   6 & Max.   :5565   \\ 
##    \hline
## \end{tabular}
## \end{table}

a <- as.data.frame(summary(Data)[,4]) # age
a$`age` <- a$`summary(Data)[, 4]`
a<- as.data.frame(a$age)
xtable(a)

## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
##   \hline
##  & a\$age \\ 
##   \hline
## 1 & Min.   : 4.00   \\ 
##   2 & 1st Qu.:41.00   \\ 
##   3 & Median :54.00   \\ 
##   4 & Mean   :51.52   \\ 
##   5 & 3rd Qu.:63.50   \\ 
##   6 & Max.   :95.00   \\ 
##    \hline
## \end{tabular}
## \end{table}

a <- as.data.frame(summary(Data)[,5]) # thickness
a$`thickness` <- a$`summary(Data)[, 5]`
a<- as.data.frame(a$thickness)
xtable(a)

## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 29 09:06:13 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rl}
##   \hline
##  & a\$thickness \\ 
##   \hline
## 1 & Min.   : 0.100   \\ 
##   2 & 1st Qu.: 0.970   \\ 
##   3 & Median : 1.940   \\ 
##   4 & Mean   : 2.861   \\ 
##   5 & 3rd Qu.: 3.540   \\ 
##   6 & Max.   :17.420   \\ 
##    \hline
## \end{tabular}
## \end{table}

# Correlacion
Data2 <- Data
Data2$status <- as.factor(Data2$status)
ggpairs(Data2, columns=2:6,  ggplot2::aes(colour=status))

cor(Data[,2:6], method = "pearson")

##                 time         sex         age  thickness       ulcer
## time       1.0000000 -0.15615868 -0.25966221 -0.2291520 -0.24566329
## sex       -0.1561587  1.00000000  0.07240028  0.1692452  0.18987276
## age       -0.2596622  0.07240028  1.00000000  0.1913597  0.09764587
## thickness -0.2291520  0.16924518  0.19135973  1.0000000  0.42040819
## ulcer     -0.2456633  0.18987276  0.09764587  0.4204082  1.00000000

pairs(cbind(Data[,2:6]))

attach(Data)

## glm
fit1 <- glm(status~time+sex+age+thickness+ulcer, family = binomial(logit), data = Data)
summary(fit1)

## 
## Call:
## glm(formula = status ~ time + sex + age + thickness + ulcer, 
##     family = binomial(logit), data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5971  -0.3210   0.1875   0.6392   2.2143  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.1641628  1.0557919  -2.050  0.04038 *  
## time         0.0020894  0.0003825   5.462  4.7e-08 ***
## sex         -0.1623156  0.4593104  -0.353  0.72380    
## age          0.0004228  0.0141003   0.030  0.97608    
## thickness   -0.0936304  0.0884660  -1.058  0.28988    
## ulcer       -1.2219318  0.4670858  -2.616  0.00889 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 232.84  on 190  degrees of freedom
## Residual deviance: 133.35  on 185  degrees of freedom
## AIC: 145.35
## 
## Number of Fisher Scoring iterations: 6

stepCriterion(fit1, criterion ="aic")

## 
##        Family:  binomial 
## Link function:  logit 
## 
## Initial model:
## ~ 1 
## 
## 
## Step 0 :
##              df    AIC    BIC adj.R-squared P(Chisq>)(*)
## + time        1 149.95 156.46        0.3699    7.935e-10
## + ulcer       1 209.66 216.16        0.1121    7.510e-07
## + thickness   1 217.45 223.95        0.0784    9.178e-05
## + sex         1 230.93 237.43        0.0202      0.01517
## + age         1 233.02 239.53        0.0112      0.05540
## <none>          234.84 238.09        0.0000             
## 
## Step 1 : + time 
## 
##              df    AIC    BIC adj.R-squared P(Chisq>)(*)
## + ulcer       1 140.76 150.52        0.4151     0.001053
## + thickness   1 147.04 156.79        0.3878     0.039060
## <none>          149.95 156.46        0.3699             
## + sex         1 150.79 160.54        0.3716     0.278991
## + age         1 151.67 161.42        0.3677     0.594843
## 
## Step 2 : + ulcer 
## 
##              df    AIC    BIC adj.R-squared P(Chisq>)(*)
## <none>          140.76 150.52        0.4151             
## + thickness   1 141.48 154.49        0.4175       0.2700
## + sex         1 142.56 155.57        0.4128       0.6527
## + age         1 142.73 155.74        0.4121       0.8684
## - time        1 209.66 216.16        0.1121    1.489e-08
## 
## Final model:
## ~ time + ulcer 
## 
## ****************************************************************************
## (*) p-values of the Wald test

fit1 <- glm(status~time+ulcer, family = binomial(logit), data = Data)
summary(fit1)

## 
## Call:
## glm(formula = status ~ time + ulcer, family = binomial(logit), 
##     data = Data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6653  -0.3362   0.1713   0.6412   2.2433  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.5079543  0.7299371  -3.436 0.000591 ***
## time         0.0021678  0.0003828   5.663 1.49e-08 ***
## ulcer       -1.4197600  0.4334017  -3.276 0.001053 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 232.84  on 190  degrees of freedom
## Residual deviance: 134.76  on 188  degrees of freedom
## AIC: 140.76
## 
## Number of Fisher Scoring iterations: 6

AIC(fit1)

## [1] 140.761

## gam
# se usa ML smoothness selection
fit2 <- gam(status~s(time)+sex+s(age)+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
summary(fit2)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## status ~ s(time) + sex + s(age) + s(thickness) + ulcer
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   1.8757     0.6607   2.839  0.00453 **
## sex          -0.2609     0.5054  -0.516  0.60577   
## ulcer        -0.7258     0.5877  -1.235  0.21681   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df Chi.sq  p-value    
## s(time)      4.562  5.502 29.983 1.86e-05 ***
## s(age)       1.000  1.000  0.004    0.947    
## s(thickness) 2.625  3.319  3.870    0.287    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.579   Deviance explained = 54.9%
## -ML = 66.465  Scale est. = 1         n = 191

# -age
fit3 <- gam(status~s(time)+sex+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit2, fit3, test = "Chisq") # nos da que la devianza cambio muy poco y no es significativa en el valor P

## Analysis of Deviance Table
## 
## Model 1: status ~ s(time) + sex + s(age) + s(thickness) + ulcer
## Model 2: status ~ s(time) + sex + s(thickness) + ulcer
##   Resid. Df Resid. Dev       Df Deviance Pr(>Chi)
## 1    176.54     105.01                           
## 2    177.50     104.89 -0.95233  0.11762

                                  # Usamos AIC como medida tambien.
AIC(fit2, fit3)

##            df      AIC
## fit2 12.82123 130.6524
## fit3 11.86213 128.6166

summary(fit3)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## status ~ s(time) + sex + s(thickness) + ulcer
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   1.8744     0.6615   2.834   0.0046 **
## sex          -0.2634     0.5051  -0.521   0.6020   
## ulcer        -0.7201     0.5885  -1.224   0.2211   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df Chi.sq  p-value    
## s(time)      4.573  5.514 29.958 1.91e-05 ***
## s(thickness) 2.648  3.348  3.985    0.277    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.582   Deviance explained =   55%
## -ML = 66.468  Scale est. = 1         n = 191

# -sex
fit4 <- gam(status~s(time)+s(thickness)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit3, fit4, test = "Chisq")  # nos da que la devianza cambio muy poco y no es significativa en el valor P

## Analysis of Deviance Table
## 
## Model 1: status ~ s(time) + sex + s(thickness) + ulcer
## Model 2: status ~ s(time) + s(thickness) + ulcer
##   Resid. Df Resid. Dev      Df Deviance Pr(>Chi)
## 1    177.50     104.89                          
## 2    181.57     112.00 -4.0768  -7.1068    0.136

                                   # Usamos AIC como medida tambien.
AIC(fit3, fit4)

##             df      AIC
## fit3 11.862135 128.6166
## fit4  8.478317 128.9558

summary(fit4)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## status ~ s(time) + s(thickness) + ulcer
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   1.9100     0.5868   3.255  0.00113 **
## ulcer        -1.1072     0.5252  -2.108  0.03500 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df Chi.sq  p-value    
## s(time)      4.531  5.478 32.198 6.58e-06 ***
## s(thickness) 1.000  1.000  0.328    0.567    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.566   Deviance explained = 51.9%
## -ML = 65.947  Scale est. = 1         n = 191

# -thickness
fit5 <- gam(status~s(time)+ulcer, family = binomial(), data = Data, method = "ML")
anova(fit4, fit5, test = "Chisq")  # nos da que la devianza cambio muy poco y no es significativa en el valor P

## Analysis of Deviance Table
## 
## Model 1: status ~ s(time) + s(thickness) + ulcer
## Model 2: status ~ s(time) + ulcer
##   Resid. Df Resid. Dev       Df Deviance Pr(>Chi)
## 1    181.57     112.00                           
## 2    182.51     111.91 -0.94027 0.092906

                                   # Usamos AIC como medida tambien.
AIC(fit4, fit5)

##            df      AIC
## fit4 8.478317 128.9558
## fit5 7.536625 126.9795

summary(fit5)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## status ~ s(time) + ulcer
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   1.9637     0.5985   3.281  0.00103 **
## ulcer        -1.2240     0.4859  -2.519  0.01176 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df Chi.sq  p-value    
## s(time) 4.588  5.537  33.45 5.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.568   Deviance explained = 51.9%
## -ML = 66.124  Scale est. = 1         n = 191

# fit4
# R-sq.(adj) =  0.566   Deviance explained = 51.9%
# -ML = 65.947  Scale est. = 1         n = 191

#fit5
# R-sq.(adj) =  0.568   Deviance explained = 51.9%
# -ML = 66.124  Scale est. = 1         n = 191


par(mfrow=c(1,1))
gratia::draw(fit4, main="fit4")# Se podría notar el efecto penalizado a lineal del grosor del tumor

gratia::draw(fit5, main="fit5")

# Como hay traslape en unos momentos, no se puede hablar que haya alguna 
# diferencia en esos momentos y si no hay traslape como al final del 
# grafico podemos decir que hubo un aumento significativo en las muertes

# Analisis de los residuales

par(mfrow=c(2,2))
gam.check(fit4, type = "deviance", old.style = FALSE)

## 
## Method: ML   Optimizer: outer newton
## full convergence after 14 iterations.
## Gradient range [-1.56257e-05,1.966884e-06]
## (score 65.94696 & scale 1).
## Hessian positive definite, eigenvalue range [1.562497e-05,0.765107].
## Model rank =  20 / 20 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                k'  edf k-index p-value  
## s(time)      9.00 4.53    1.08    0.93  
## s(thickness) 9.00 1.00    0.89    0.04 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

gam.check(fit5, type = "deviance", old.style = FALSE)

## 
## Method: ML   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-4.008419e-07,-4.008419e-07]
## (score 66.12423 & scale 1).
## Hessian positive definite, eigenvalue range [0.790318,0.790318].
## Model rank =  11 / 11 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##           k'  edf k-index p-value
## s(time) 9.00 4.59    1.09    0.89

#               k'  edf k-index p-value  
#s(time)      9.00 4.53    1.08   0.895  
#s(thickness) 9.00 1.00    0.89   0.095

# entre mas parecido este k y edf (grados de libertad efectivos ) es decir que necesitábamos
# una mayor base para hacer los splines que hicimos.
# En este caso el modelo lo hizo automático y busco unos valores adecuados.
# Esto se sabe porque las probabilidades tienen que ser mayor a 0.05 (en general) 
# para los diferentes suavizadores.
# Tener un suavizador menor a 0.05 indiciaria que hay un problema en la 
# estimación de la base del suavizador.
# Se puede controlar aumentando la base y se mejore la ondulación estimada de la curva.

coef(fit4)

##    (Intercept)          ulcer      s(time).1      s(time).2      s(time).3 
##   1.909998e+00  -1.107211e+00   2.231724e+00  -5.201885e+00   2.125474e+00 
##      s(time).4      s(time).5      s(time).6      s(time).7      s(time).8 
##  -3.335906e+00   7.349185e-01  -2.146047e+00  -6.298678e-02  -1.275061e+01 
##      s(time).9 s(thickness).1 s(thickness).2 s(thickness).3 s(thickness).4 
##   3.770324e-02  -4.588766e-06   6.595445e-06  -2.268043e-06  -2.443334e-06 
## s(thickness).5 s(thickness).6 s(thickness).7 s(thickness).8 s(thickness).9 
##   1.634517e-06   2.268098e-06   2.126589e-06  -6.810093e-06  -1.755300e-01

# Muestra los valores de los coeficientes que se uso para cada distribución base que se uso en el spline en el suavizador de tiempo y de thickness

# En este caso 9 bases regresivas no lineales para producir esas curvas.

concurvity(fit4)

##               para   s(time) s(thickness)
## worst    0.5598995 0.2849871    0.4085209
## observed 0.5598995 0.1540974    0.2589938
## estimate 0.5598995 0.1333303    0.1763569

# Encabezados de Columna:

# para: Esta columna representa la proporción de la varianza total en la variable de respuesta que es capturada por cada término suave individualmente. Es una medida de cuánta no linealidad explica cada término suave.

# s(time), s(thickness): Estas columnas representan los términos suaves individuales en tu modelo (fit4). Muestran la proporción de no linealidad para cada término suave


# Valores más bajos en las filas observed y estimate indican que el modelo está más cercano a la linealidad. Un valor de 0 indicaría una linealidad perfecta.

# La fila worst ayuda a identificar qué término contribuye más a la no linealidad en el modelo.

# Estos valores sugieren que ambos términos suaves contribuyen en cierta medida a la no linealidad en el modelo, pero el modelo aún está relativamente cerca de ser lineal, especialmente según los valores de concavidad estimada. La fila worst ayuda a identificar qué término tiene la máxima no linealidad, y en este caso, es s(time)

par(mfrow=c(1,1))


# Hace un producto tensorial suavizado de las dos covariables time y thickness 
fit7 <- gam(status ~ te(time, thickness), data = Data)
AIC(fit7)

## [1] 124.6744

vis.gam(fit7, type = 'response',plot.type = 'persp',phi       = 30,
        theta     = 25,
        n.grid    = 500,
        border    = NA)

### Comparar LM y GLM
anova(fit1, fit4,  test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: status ~ time + ulcer
## Model 2: status ~ s(time) + s(thickness) + ulcer
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    188.00     134.76                              
## 2    183.47     112.00 4.5307   22.762 0.0002412 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# GAM espacial
# Para ver la distribución de especies, evidencia de fenómenos locales es bastante útil

GAM

N.Romero

2023-11-29