1 1) Verificando a necessidade de um GAM (dados simulados)

1.1 1.1 Simulação de dados não-lineares

x <- seq(0, pi * 2, by = 0.1)
sin_x <- sin(x)
y <- sin_x + rnorm(n = length(x), mean = 0, sd = sd(sin_x/2))
Sample_data <- data.frame(y = y, x = x)
head(Sample_data)

1.2 1.2 Dispersão e OLS

library(ggplot2)
ggplot(Sample_data, aes(x, y)) + 
  geom_point() + 
  labs(title = "Relação não-linear entre x e y")

lm_y <- lm(y ~ x, data = Sample_data)
summary(lm_y)
## 
## Call:
## lm(formula = y ~ x, data = Sample_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14786 -0.37398 -0.00539  0.39970  1.13455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.94837    0.12800   7.409 4.58e-10 ***
## x           -0.29962    0.03561  -8.413 8.59e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.514 on 61 degrees of freedom
## Multiple R-squared:  0.5371, Adjusted R-squared:  0.5295 
## F-statistic: 70.78 on 1 and 61 DF,  p-value: 8.594e-12
ggplot(Sample_data, aes(x, y)) + 
  geom_point() + 
  geom_smooth(method = lm, se = TRUE) +
  labs(title = "OLS sobre dados não-lineares")

plot(lm_y, which = 1)

Observação: Os resíduos não ficam aleatórios em função de x, indicando má especificação. Vamos tentar GAM.

2 2) GAM nos dados simulados

library(mgcv)
gam_y <- gam(y ~ s(x), data = Sample_data, method = "REML")
summary(gam_y)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.01956    0.03992    0.49    0.626
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value    
## s(x) 5.953  7.113 40.15  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.821   Deviance explained = 83.8%
## -REML =  27.61  Scale est. = 0.1004    n = 63
AIC(lm_y, gam_y)
ggplot(Sample_data, aes(x, y)) + 
  geom_point(alpha = 0.6) + 
  geom_smooth(method = "gam", formula = y ~ s(x), se = TRUE) +
  labs(title = "GAM ajustado: y ~ s(x)")

par(mfrow = c(2,2))
gam.check(gam_y)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-1.793913e-05,1.107339e-05]
## (score 27.60989 & scale 0.1004016).
## Hessian positive definite, eigenvalue range [1.910748,30.71453].
## Model rank =  10 / 10 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##        k'  edf k-index p-value
## s(x) 9.00 5.95    1.19    0.90
par(mfrow = c(1,1))

3 3) Exemplo com ISLR::Wage: tendência temporal geral

3.1 3.1 Carregamento e estrutura

library(ISLR)
library(dplyr)
data <- ISLR::Wage
data <- data %>% mutate(
  maritl = droplevels(as.factor(maritl)),
  race = droplevels(as.factor(race)),
  education = droplevels(as.factor(education)),
  jobclass = droplevels(as.factor(jobclass))
)
str(data); head(data)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...
summary(data$year); summary(data$wage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2003    2004    2006    2006    2008    2009
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.09   85.38  104.92  111.70  128.68  318.34
# Cálculo automático de k_year (nº seguro de bases do suavizador)
unique_years <- sort(unique(data$year))
n_unique_years <- length(unique_years)
k_year <- max(3, min(5, n_unique_years - 1))  # com ~7 anos, dá 5
k_year
## [1] 5

3.2 3.2 OLS vs GAM (wage ~ year)

ols_year <- lm(wage ~ year, data = data)
gam_year <- gam(wage ~ s(year, k = k_year), data = data, method = "REML")

summary(ols_year)
## 
## Call:
## lm(formula = wage ~ year, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.550 -26.606  -6.415  17.830 206.393 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2595.8616   752.8243  -3.448 0.000572 ***
## year            1.3499     0.3753   3.597 0.000328 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 41.65 on 2998 degrees of freedom
## Multiple R-squared:  0.004296,   Adjusted R-squared:  0.003964 
## F-statistic: 12.94 on 1 and 2998 DF,  p-value: 0.0003277
summary(gam_year)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ s(year, k = k_year)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 111.7036     0.7603   146.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value    
## s(year) 1.006  1.013 12.74 0.000343 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00396   Deviance explained = 0.43%
## -REML =  15442  Scale est. = 1734.4    n = 3000
AIC(ols_year, gam_year)
ggplot(data, aes(year, wage)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = TRUE, colour = "black", linetype = "dashed") +
  geom_smooth(method = "gam", formula = y ~ s(x, k = k_year), se = TRUE, colour = "blue") +
  labs(title = "Tendência temporal de wage: OLS (tracejada) vs GAM (azul)")

# OLS: aumento médio anual é o coeficiente de 'year'
coef_ols <- coef(ols_year)["year"]
ic_ols <- confint(ols_year)["year", ]
list(variacao_media_anual_OLS = coef_ols, IC95 = ic_ols)
## $variacao_media_anual_OLS
##     year 
## 1.349874 
## 
## $IC95
##    2.5 %   97.5 % 
## 0.613953 2.085795
# GAM: aproximação pela diferença entre extremos / amplitude temporal
yr0 <- min(data$year); yr1 <- max(data$year)
p0 <- predict(gam_year, newdata = data.frame(year = yr0))
p1 <- predict(gam_year, newdata = data.frame(year = yr1))
delta_tot <- as.numeric(p1 - p0)
annual_gam <- delta_tot / (yr1 - yr0)
list(ano_inicial = yr0, ano_final = yr1, aumento_total = delta_tot, aumento_medio_anual = annual_gam)
## $ano_inicial
## [1] 2003
## 
## $ano_final
## [1] 2009
## 
## $aumento_total
## [1] 8.099361
## 
## $aumento_medio_anual
## [1] 1.349893

4 4) Tendências estratificadas com especificações estáveis

#k = k_year

4.1 (A) Varying-coefficient com by= (muito estável)

# maritl
gam_maritl <- gam(wage ~ maritl + s(year, k = k_year) + s(year, by = maritl, k = k_year),
                  data = data, method = "REML")
summary(gam_maritl)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ maritl + s(year, k = k_year) + s(year, by = maritl, k = k_year)
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          92.645      1.580  58.643  < 2e-16 ***
## maritl2. Married     26.230      1.810  14.495  < 2e-16 ***
## maritl3. Widowed      6.787     10.019   0.677 0.498203    
## maritl4. Divorced    10.748      3.229   3.329 0.000883 ***
## maritl5. Separated    8.590      5.665   1.516 0.129537    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df     F p-value
## s(year)                        1.0022 1.0044 0.003   0.973
## s(year):maritl1. Never Married 1.0024 1.0048 0.127   0.724
## s(year):maritl2. Married       1.0011 1.0022 0.121   0.728
## s(year):maritl3. Widowed       1.0028 1.0056 0.003   0.978
## s(year):maritl4. Divorced      1.0039 1.0078 0.686   0.407
## s(year):maritl5. Separated     0.2243 0.4098 0.036   0.903
## 
## Rank: 28/29
## R-sq.(adj) =  0.0722   Deviance explained =  7.5%
## -REML =  15309  Scale est. = 1615.6    n = 3000
# race
gam_race <- gam(wage ~ race + s(year, k = k_year) + s(year, by = race, k = k_year),
                data = data, method = "REML")
summary(gam_race)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ race + s(year, k = k_year) + s(year, by = race, k = k_year)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   112.614      0.831 135.516  < 2e-16 ***
## race2. Black  -11.356      2.562  -4.433 9.62e-06 ***
## race3. Asian    7.633      3.151   2.423 0.015467 *  
## race4. Other  -22.654      6.853  -3.306 0.000958 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                          edf   Ref.df     F p-value
## s(year)              1.00181 1.003012 0.343   0.559
## s(year):race1. White 1.61832 1.990181 0.990   0.345
## s(year):race2. Black 1.00273 1.005443 1.606   0.205
## s(year):race3. Asian 1.75141 2.155971 0.405   0.597
## s(year):race4. Other 0.00268 0.005355 0.000   0.999
## 
## Rank: 23/24
## R-sq.(adj) =  0.0168   Deviance explained = 1.95%
## -REML =  15403  Scale est. = 1712.1    n = 3000
# education
gam_edu <- gam(wage ~ education + s(year, k = k_year) + s(year, by = education, k = k_year),
               data = data, method = "REML")
summary(gam_edu)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ education + s(year, k = k_year) + s(year, by = education, 
##     k = k_year)
## 
## Parametric coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   84.138      2.226  37.793  < 2e-16 ***
## education2. HS Grad           11.622      2.515   4.622 3.97e-06 ***
## education3. Some College      23.667      2.647   8.942  < 2e-16 ***
## education4. College Grad      40.286      2.626  15.342  < 2e-16 ***
## education5. Advanced Degree   66.626      2.844  23.431  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                          edf  Ref.df     F p-value
## s(year)                             1.000716 1.00141 1.385   0.239
## s(year):education1. < HS Grad       0.007014 0.01399 0.010   0.991
## s(year):education2. HS Grad         1.001671 1.00334 0.034   0.859
## s(year):education3. Some College    1.003616 1.00722 0.326   0.571
## s(year):education4. College Grad    1.001888 1.00377 0.093   0.762
## s(year):education5. Advanced Degree 2.118300 2.58044 1.488   0.179
## 
## Rank: 28/29
## R-sq.(adj) =  0.237   Deviance explained =   24%
## -REML =  15022  Scale est. = 1328.1    n = 3000
# jobclass
gam_job <- gam(wage ~ jobclass + s(year, k = k_year) + s(year, by = jobclass, k = k_year),
               data = data, method = "REML")
summary(gam_job)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ jobclass + s(year, k = k_year) + s(year, by = jobclass, 
##     k = k_year)
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             103.312      1.037   99.66   <2e-16 ***
## jobclass2. Information   17.307      1.488   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                     edf  Ref.df      F  p-value    
## s(year)                        1.006135 1.01222 14.112 0.000166 ***
## s(year):jobclass1. Industrial  1.001850 1.00369  2.841 0.091870 .  
## s(year):jobclass2. Information 0.006332 0.01262  0.036 0.982993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 13/14
## R-sq.(adj) =  0.0472   Deviance explained = 4.82%
## -REML =  15367  Scale est. = 1659      n = 3000

4.1.1 Predição e gráficos por grupo — exemplo maritl

grid_maritl <- expand.grid(
  year   = seq(min(data$year), max(data$year), length.out = 200),
  maritl = levels(data$maritl)
)
pred_maritl <- predict(gam_maritl, newdata = grid_maritl, se.fit = TRUE)
grid_maritl$fit <- pred_maritl$fit
grid_maritl$se  <- pred_maritl$se.fit
grid_maritl$lwr <- grid_maritl$fit - 1.96 * grid_maritl$se
grid_maritl$upr <- grid_maritl$fit + 1.96 * grid_maritl$se

ggplot(grid_maritl, aes(year, fit, colour = maritl, fill = maritl)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.15, colour = NA) +
  geom_line(linewidth = 0.9) +
  labs(title = "Tendência temporal do salário por estado civil (maritl)",
       x = "Ano", y = "Salário previsto (wage)") +
  theme(legend.position = "bottom")

4.2 (B) Factor-smooth com bs = "fs" (também estável; lembre-se de k)

gam_maritl_fs <- gam(wage ~ maritl + s(year, maritl, bs = "fs", k = k_year),
                     data = data, method = "REML")
summary(gam_maritl_fs)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ maritl + s(year, maritl, bs = "fs", k = k_year)
## 
## Parametric coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          92.686      3.390  27.343  < 2e-16 ***
## maritl2. Married     26.212      4.612   5.684 1.44e-08 ***
## maritl3. Widowed      6.846     10.311   0.664   0.5068    
## maritl4. Divorced    10.625      5.331   1.993   0.0463 *  
## maritl5. Separated    8.530      7.063   1.208   0.2273    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df     F  p-value    
## s(year,maritl) 2.506     20 0.665 0.000812 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0724   Deviance explained = 7.44%
## -REML =  15328  Scale est. = 1615.2    n = 3000
gam_race_fs <- gam(wage ~ race + s(year, race, bs = "fs", k = k_year),
                   data = data, method = "REML")
summary(gam_race_fs)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ race + s(year, race, bs = "fs", k = k_year)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   112.656      3.209  35.108  < 2e-16 ***
## race2. Black  -11.208      5.076  -2.208  0.02730 *  
## race3. Asian    7.721      5.382   1.435  0.15151    
## race4. Other  -22.696      8.136  -2.790  0.00531 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F  p-value    
## s(year,race) 2.934     16 1.025 0.000138 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0165   Deviance explained = 1.84%
## -REML =  15419  Scale est. = 1712.5    n = 3000
gam_edu_fs <- gam(wage ~ education + s(year, education, bs = "fs", k = k_year),
                  data = data, method = "REML")
summary(gam_edu_fs)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ education + s(year, education, bs = "fs", k = k_year)
## 
## Parametric coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   84.150      3.115  27.016  < 2e-16 ***
## education2. HS Grad           11.668      3.976   2.935  0.00336 ** 
## education3. Some College      23.652      4.061   5.824 6.35e-09 ***
## education4. College Grad      40.303      4.047   9.958  < 2e-16 ***
## education5. Advanced Degree   66.695      4.191  15.915  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df     F p-value  
## s(year,education) 3.277     20 0.482  0.0105 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.236   Deviance explained = 23.8%
## -REML =  15039  Scale est. = 1329.9    n = 3000
gam_job_fs <- gam(wage ~ jobclass + s(year, jobclass, bs = "fs", k = k_year),
                  data = data, method = "REML")
summary(gam_job_fs)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ jobclass + s(year, jobclass, bs = "fs", k = k_year)
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             103.124      1.190  86.671   <2e-16 ***
## jobclass2. Information   17.009      1.701   9.997   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df     F  p-value    
## s(year,jobclass) 1.788      8 1.866 0.000225 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0472   Deviance explained = 4.81%
## -REML =  15377  Scale est. = 1659      n = 3000

5 5) Conclusões

6 6) Reprodutibilidade

sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Portuguese_Brazil.utf8  LC_CTYPE=Portuguese_Brazil.utf8   
## [3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C                      
## [5] LC_TIME=Portuguese_Brazil.utf8    
## 
## time zone: America/Sao_Paulo
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.1.4   ISLR_1.4      mgcv_1.9-1    nlme_3.1-166  ggplot2_3.5.2
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.3          knitr_1.50         rlang_1.1.4       
##  [5] xfun_0.52          generics_0.1.3     jsonlite_1.8.9     labeling_0.4.3    
##  [9] glue_1.8.0         htmltools_0.5.8.1  sass_0.4.9         scales_1.4.0      
## [13] rmarkdown_2.29     grid_4.4.2         evaluate_1.0.1     jquerylib_0.1.4   
## [17] tibble_3.2.1       fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4   
## [21] compiler_4.4.2     RColorBrewer_1.1-3 pkgconfig_2.0.3    rstudioapi_0.17.1 
## [25] lattice_0.22-6     farver_2.1.2       digest_0.6.37      R6_2.5.1          
## [29] tidyselect_1.2.1   splines_4.4.2      pillar_1.10.0      magrittr_2.0.3    
## [33] Matrix_1.7-1       bslib_0.8.0        withr_3.0.2        tools_4.4.2       
## [37] gtable_0.3.6       cachem_1.1.0