x <- seq(0, pi * 2, by = 0.1)
sin_x <- sin(x)
y <- sin_x + rnorm(n = length(x), mean = 0, sd = sd(sin_x/2))
Sample_data <- data.frame(y = y, x = x)
head(Sample_data)library(ggplot2)
ggplot(Sample_data, aes(x, y)) +
geom_point() +
labs(title = "Relação não-linear entre x e y")##
## Call:
## lm(formula = y ~ x, data = Sample_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14786 -0.37398 -0.00539 0.39970 1.13455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.94837 0.12800 7.409 4.58e-10 ***
## x -0.29962 0.03561 -8.413 8.59e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.514 on 61 degrees of freedom
## Multiple R-squared: 0.5371, Adjusted R-squared: 0.5295
## F-statistic: 70.78 on 1 and 61 DF, p-value: 8.594e-12
ggplot(Sample_data, aes(x, y)) +
geom_point() +
geom_smooth(method = lm, se = TRUE) +
labs(title = "OLS sobre dados não-lineares")Observação: Os resíduos não ficam aleatórios em
função de x, indicando má especificação. Vamos tentar
GAM.
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.01956 0.03992 0.49 0.626
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x) 5.953 7.113 40.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.821 Deviance explained = 83.8%
## -REML = 27.61 Scale est. = 0.1004 n = 63
ggplot(Sample_data, aes(x, y)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "gam", formula = y ~ s(x), se = TRUE) +
labs(title = "GAM ajustado: y ~ s(x)")##
## Method: REML Optimizer: outer newton
## full convergence after 5 iterations.
## Gradient range [-1.793913e-05,1.107339e-05]
## (score 27.60989 & scale 0.1004016).
## Hessian positive definite, eigenvalue range [1.910748,30.71453].
## Model rank = 10 / 10
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(x) 9.00 5.95 1.19 0.90
ISLR::Wage: tendência temporal gerallibrary(ISLR)
library(dplyr)
data <- ISLR::Wage
data <- data %>% mutate(
maritl = droplevels(as.factor(maritl)),
race = droplevels(as.factor(race)),
education = droplevels(as.factor(education)),
jobclass = droplevels(as.factor(jobclass))
)
str(data); head(data)## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2003 2004 2006 2006 2008 2009
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.09 85.38 104.92 111.70 128.68 318.34
# Cálculo automático de k_year (nº seguro de bases do suavizador)
unique_years <- sort(unique(data$year))
n_unique_years <- length(unique_years)
k_year <- max(3, min(5, n_unique_years - 1)) # com ~7 anos, dá 5
k_year## [1] 5
ols_year <- lm(wage ~ year, data = data)
gam_year <- gam(wage ~ s(year, k = k_year), data = data, method = "REML")
summary(ols_year)##
## Call:
## lm(formula = wage ~ year, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.550 -26.606 -6.415 17.830 206.393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2595.8616 752.8243 -3.448 0.000572 ***
## year 1.3499 0.3753 3.597 0.000328 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41.65 on 2998 degrees of freedom
## Multiple R-squared: 0.004296, Adjusted R-squared: 0.003964
## F-statistic: 12.94 on 1 and 2998 DF, p-value: 0.0003277
##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ s(year, k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.7036 0.7603 146.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.006 1.013 12.74 0.000343 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00396 Deviance explained = 0.43%
## -REML = 15442 Scale est. = 1734.4 n = 3000
ggplot(data, aes(year, wage)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se = TRUE, colour = "black", linetype = "dashed") +
geom_smooth(method = "gam", formula = y ~ s(x, k = k_year), se = TRUE, colour = "blue") +
labs(title = "Tendência temporal de wage: OLS (tracejada) vs GAM (azul)")# OLS: aumento médio anual é o coeficiente de 'year'
coef_ols <- coef(ols_year)["year"]
ic_ols <- confint(ols_year)["year", ]
list(variacao_media_anual_OLS = coef_ols, IC95 = ic_ols)## $variacao_media_anual_OLS
## year
## 1.349874
##
## $IC95
## 2.5 % 97.5 %
## 0.613953 2.085795
# GAM: aproximação pela diferença entre extremos / amplitude temporal
yr0 <- min(data$year); yr1 <- max(data$year)
p0 <- predict(gam_year, newdata = data.frame(year = yr0))
p1 <- predict(gam_year, newdata = data.frame(year = yr1))
delta_tot <- as.numeric(p1 - p0)
annual_gam <- delta_tot / (yr1 - yr0)
list(ano_inicial = yr0, ano_final = yr1, aumento_total = delta_tot, aumento_medio_anual = annual_gam)## $ano_inicial
## [1] 2003
##
## $ano_final
## [1] 2009
##
## $aumento_total
## [1] 8.099361
##
## $aumento_medio_anual
## [1] 1.349893
#k = k_year
by= (muito estável)# maritl
gam_maritl <- gam(wage ~ maritl + s(year, k = k_year) + s(year, by = maritl, k = k_year),
data = data, method = "REML")
summary(gam_maritl)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ maritl + s(year, k = k_year) + s(year, by = maritl, k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.645 1.580 58.643 < 2e-16 ***
## maritl2. Married 26.230 1.810 14.495 < 2e-16 ***
## maritl3. Widowed 6.787 10.019 0.677 0.498203
## maritl4. Divorced 10.748 3.229 3.329 0.000883 ***
## maritl5. Separated 8.590 5.665 1.516 0.129537
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.0022 1.0044 0.003 0.973
## s(year):maritl1. Never Married 1.0024 1.0048 0.127 0.724
## s(year):maritl2. Married 1.0011 1.0022 0.121 0.728
## s(year):maritl3. Widowed 1.0028 1.0056 0.003 0.978
## s(year):maritl4. Divorced 1.0039 1.0078 0.686 0.407
## s(year):maritl5. Separated 0.2243 0.4098 0.036 0.903
##
## Rank: 28/29
## R-sq.(adj) = 0.0722 Deviance explained = 7.5%
## -REML = 15309 Scale est. = 1615.6 n = 3000
# race
gam_race <- gam(wage ~ race + s(year, k = k_year) + s(year, by = race, k = k_year),
data = data, method = "REML")
summary(gam_race)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ race + s(year, k = k_year) + s(year, by = race, k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.614 0.831 135.516 < 2e-16 ***
## race2. Black -11.356 2.562 -4.433 9.62e-06 ***
## race3. Asian 7.633 3.151 2.423 0.015467 *
## race4. Other -22.654 6.853 -3.306 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.00181 1.003012 0.343 0.559
## s(year):race1. White 1.61832 1.990181 0.990 0.345
## s(year):race2. Black 1.00273 1.005443 1.606 0.205
## s(year):race3. Asian 1.75141 2.155971 0.405 0.597
## s(year):race4. Other 0.00268 0.005355 0.000 0.999
##
## Rank: 23/24
## R-sq.(adj) = 0.0168 Deviance explained = 1.95%
## -REML = 15403 Scale est. = 1712.1 n = 3000
# education
gam_edu <- gam(wage ~ education + s(year, k = k_year) + s(year, by = education, k = k_year),
data = data, method = "REML")
summary(gam_edu)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ education + s(year, k = k_year) + s(year, by = education,
## k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.138 2.226 37.793 < 2e-16 ***
## education2. HS Grad 11.622 2.515 4.622 3.97e-06 ***
## education3. Some College 23.667 2.647 8.942 < 2e-16 ***
## education4. College Grad 40.286 2.626 15.342 < 2e-16 ***
## education5. Advanced Degree 66.626 2.844 23.431 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.000716 1.00141 1.385 0.239
## s(year):education1. < HS Grad 0.007014 0.01399 0.010 0.991
## s(year):education2. HS Grad 1.001671 1.00334 0.034 0.859
## s(year):education3. Some College 1.003616 1.00722 0.326 0.571
## s(year):education4. College Grad 1.001888 1.00377 0.093 0.762
## s(year):education5. Advanced Degree 2.118300 2.58044 1.488 0.179
##
## Rank: 28/29
## R-sq.(adj) = 0.237 Deviance explained = 24%
## -REML = 15022 Scale est. = 1328.1 n = 3000
# jobclass
gam_job <- gam(wage ~ jobclass + s(year, k = k_year) + s(year, by = jobclass, k = k_year),
data = data, method = "REML")
summary(gam_job)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ jobclass + s(year, k = k_year) + s(year, by = jobclass,
## k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.312 1.037 99.66 <2e-16 ***
## jobclass2. Information 17.307 1.488 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year) 1.006135 1.01222 14.112 0.000166 ***
## s(year):jobclass1. Industrial 1.001850 1.00369 2.841 0.091870 .
## s(year):jobclass2. Information 0.006332 0.01262 0.036 0.982993
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 13/14
## R-sq.(adj) = 0.0472 Deviance explained = 4.82%
## -REML = 15367 Scale est. = 1659 n = 3000
maritlgrid_maritl <- expand.grid(
year = seq(min(data$year), max(data$year), length.out = 200),
maritl = levels(data$maritl)
)
pred_maritl <- predict(gam_maritl, newdata = grid_maritl, se.fit = TRUE)
grid_maritl$fit <- pred_maritl$fit
grid_maritl$se <- pred_maritl$se.fit
grid_maritl$lwr <- grid_maritl$fit - 1.96 * grid_maritl$se
grid_maritl$upr <- grid_maritl$fit + 1.96 * grid_maritl$se
ggplot(grid_maritl, aes(year, fit, colour = maritl, fill = maritl)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.15, colour = NA) +
geom_line(linewidth = 0.9) +
labs(title = "Tendência temporal do salário por estado civil (maritl)",
x = "Ano", y = "Salário previsto (wage)") +
theme(legend.position = "bottom")bs = "fs" (também estável; lembre-se de
k)gam_maritl_fs <- gam(wage ~ maritl + s(year, maritl, bs = "fs", k = k_year),
data = data, method = "REML")
summary(gam_maritl_fs)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ maritl + s(year, maritl, bs = "fs", k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.686 3.390 27.343 < 2e-16 ***
## maritl2. Married 26.212 4.612 5.684 1.44e-08 ***
## maritl3. Widowed 6.846 10.311 0.664 0.5068
## maritl4. Divorced 10.625 5.331 1.993 0.0463 *
## maritl5. Separated 8.530 7.063 1.208 0.2273
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year,maritl) 2.506 20 0.665 0.000812 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0724 Deviance explained = 7.44%
## -REML = 15328 Scale est. = 1615.2 n = 3000
gam_race_fs <- gam(wage ~ race + s(year, race, bs = "fs", k = k_year),
data = data, method = "REML")
summary(gam_race_fs)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ race + s(year, race, bs = "fs", k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.656 3.209 35.108 < 2e-16 ***
## race2. Black -11.208 5.076 -2.208 0.02730 *
## race3. Asian 7.721 5.382 1.435 0.15151
## race4. Other -22.696 8.136 -2.790 0.00531 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year,race) 2.934 16 1.025 0.000138 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0165 Deviance explained = 1.84%
## -REML = 15419 Scale est. = 1712.5 n = 3000
gam_edu_fs <- gam(wage ~ education + s(year, education, bs = "fs", k = k_year),
data = data, method = "REML")
summary(gam_edu_fs)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ education + s(year, education, bs = "fs", k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84.150 3.115 27.016 < 2e-16 ***
## education2. HS Grad 11.668 3.976 2.935 0.00336 **
## education3. Some College 23.652 4.061 5.824 6.35e-09 ***
## education4. College Grad 40.303 4.047 9.958 < 2e-16 ***
## education5. Advanced Degree 66.695 4.191 15.915 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year,education) 3.277 20 0.482 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.236 Deviance explained = 23.8%
## -REML = 15039 Scale est. = 1329.9 n = 3000
gam_job_fs <- gam(wage ~ jobclass + s(year, jobclass, bs = "fs", k = k_year),
data = data, method = "REML")
summary(gam_job_fs)##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ jobclass + s(year, jobclass, bs = "fs", k = k_year)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.124 1.190 86.671 <2e-16 ***
## jobclass2. Information 17.009 1.701 9.997 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(year,jobclass) 1.788 8 1.866 0.000225 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0472 Deviance explained = 4.81%
## -REML = 15377 Scale est. = 1659 n = 3000
Wage ~ year, a OLS fornece um aumento médio anual
global; o GAM revela possíveis curvaturas no tempo.k_year automático evitam
erros de complexidade excessiva quando year tem poucos
valores únicos (~7).by= e (B) bs = "fs" são
alternativas estáveis que permitem trajetórias distintas entre
grupos.## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Portuguese_Brazil.utf8 LC_CTYPE=Portuguese_Brazil.utf8
## [3] LC_MONETARY=Portuguese_Brazil.utf8 LC_NUMERIC=C
## [5] LC_TIME=Portuguese_Brazil.utf8
##
## time zone: America/Sao_Paulo
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.1.4 ISLR_1.4 mgcv_1.9-1 nlme_3.1-166 ggplot2_3.5.2
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.3 knitr_1.50 rlang_1.1.4
## [5] xfun_0.52 generics_0.1.3 jsonlite_1.8.9 labeling_0.4.3
## [9] glue_1.8.0 htmltools_0.5.8.1 sass_0.4.9 scales_1.4.0
## [13] rmarkdown_2.29 grid_4.4.2 evaluate_1.0.1 jquerylib_0.1.4
## [17] tibble_3.2.1 fastmap_1.2.0 yaml_2.3.10 lifecycle_1.0.4
## [21] compiler_4.4.2 RColorBrewer_1.1-3 pkgconfig_2.0.3 rstudioapi_0.17.1
## [25] lattice_0.22-6 farver_2.1.2 digest_0.6.37 R6_2.5.1
## [29] tidyselect_1.2.1 splines_4.4.2 pillar_1.10.0 magrittr_2.0.3
## [33] Matrix_1.7-1 bslib_0.8.0 withr_3.0.2 tools_4.4.2
## [37] gtable_0.3.6 cachem_1.1.0