Ajuste de modelos de regressão censurados em diferentes pacotes estatísticas

Iniciação Científica 2018-2019

Programa de Pesquisa para Alunos da Graduação

Aluno: Rafael Cabral Fernandez
Orientadores: Gustavo H. M. A. Rocha e Renata S. Bueno

Introdução

O documento a seguir apresenta a utilização do NIMBLE

Para utilização interna, basta utilizar o pacote “nimble”"

O pacote foi rodado em 4 exemplos diferentes:

  1. Exemplo didático para treino
  2. Simulação de um modelo de regressão linear simples
  3. Modelo de regressão linear simples utilizando dados do ipea, modelando a taxa de homicídos através da taxa de desemprego
  4. Modelo de regressão linear simples utilizando dados do enem, modelando a nota de matemática através da nota de redação

Treino

Verossimilhança poisson com priori gamma

pumpConsts <- list(N = 10,
                   t = c(94.3, 15.7, 62.9, 126, 5.24,31.4, 1.05, 1.05, 2.1, 10.5))

pumpData <- list(x = c(5, 1, 5, 14, 3, 19, 1, 1, 4, 22))


pumpInits <- list(alpha = 1, beta = 1,
                  theta = rep(0.1, pumpConsts$N))



pumpCode <- nimbleCode(
  {
    for (i in 1:N){
      theta[i] ~ dgamma(alpha,beta)
      lambda[i] <- theta[i]*t[i]
      x[i] ~ dpois(lambda[i])
    }
    alpha ~ dexp(1.0)
    beta ~ dgamma(0.1,1.0)
  })

pump <- nimbleModel(code = pumpCode, name = "pump", constants = pumpConsts,
                    data = pumpData, inits = pumpInits)
## defining model...
## building model...
## setting data and initial values...
## running calculate on model (any error reports that follow may simply reflect missing values in model variables) ... 
## checking model sizes and dimensions...
## model building finished.
mcmc.out <- nimbleMCMC(code = pumpCode, constants = pumpConsts,
                       data = pumpData, inits = pumpInits,
                       monitors=c("alpha","beta","theta","lambda"),
                       nchains = 2, niter = 1000,
                       summary = TRUE, WAIC = TRUE, progressBar = TRUE)
## defining model...
## building model...
## setting data and initial values...
## running calculate on model (any error reports that follow may simply reflect missing values in model variables) ... 
## checking model sizes and dimensions...
## checking model calculations...
## model building finished.
## Monitored nodes are valid for WAIC
## compiling... this may take a minute. Use 'showCompilerOutput = TRUE' to see C++ compiler details.
## compilation finished.
## running chain 1...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
## running chain 2...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
mcmc.out$summary
## $chain1
##                   Mean      Median    St.Dev.   95%CI_low  95%CI_upp
## alpha       0.65741649  0.62152079 0.26443140  0.27418313  1.2835185
## beta        0.87543237  0.77469969 0.53906917  0.17574860  2.2265983
## lambda[1]   5.62377429  5.34668169 2.33032863  1.80349011 11.5072456
## lambda[2]   1.57903665  1.24186034 1.19342919  0.13010135  4.4284454
## lambda[3]   5.62567362  5.42558383 2.27881757  1.96421959 11.3186709
## lambda[4]  14.65744996 14.43084724 3.66834299  8.41656492 22.6196507
## lambda[5]   3.12683344  2.90001991 1.65831116  0.79080089  6.9847839
## lambda[6]  19.00140355 18.71578257 4.30197510 11.77990196 28.6315649
## lambda[7]   0.91440129  0.69090284 0.78667449  0.05437572  3.0063197
## lambda[8]   0.96748021  0.74732917 0.80197327  0.07871734  3.0184780
## lambda[9]   3.39115985  3.06726158 1.72556385  0.94337868  7.3652797
## lambda[10] 21.12028642 20.72889552 4.49859388 13.17995887 30.5893869
## theta[1]    0.05963706  0.05669864 0.02471186  0.01912503  0.1220281
## theta[2]    0.10057558  0.07909938 0.07601460  0.00828671  0.2820666
## theta[3]    0.08943837  0.08625729 0.03622921  0.03122766  0.1799471
## theta[4]    0.11632897  0.11453053 0.02911383  0.06679813  0.1795210
## theta[5]    0.59672394  0.55343891 0.31647159  0.15091620  1.3329740
## theta[6]    0.60514024  0.59604403 0.13700558  0.37515611  0.9118333
## theta[7]    0.87085837  0.65800270 0.74921380  0.05178640  2.8631616
## theta[8]    0.92140972  0.71174207 0.76378407  0.07496890  2.8747409
## theta[9]    1.61483803  1.46060075 0.82169707  0.44922794  3.5072761
## theta[10]   2.01145585  1.97418053 0.42843751  1.25523418  2.9132749
## 
## $chain2
##                   Mean      Median    St.Dev.   95%CI_low  95%CI_upp
## alpha       0.66930084  0.64391425 0.24202305  0.30258259  1.2329089
## beta        0.91264281  0.78423155 0.56194424  0.20297909  2.2396883
## lambda[1]   5.57827626  5.19693134 2.44933174  1.87484621 11.5958251
## lambda[2]   1.64796489  1.34047857 1.21978659  0.17134627  4.8030200
## lambda[3]   5.57702401  5.27641621 2.26619728  2.07335791 10.5250780
## lambda[4]  14.39103548 14.07931354 3.77773279  7.79783974 22.5866158
## lambda[5]   3.21210224  2.90900017 1.75513563  0.74848724  7.6533475
## lambda[6]  19.09186026 18.62560510 4.64410242 11.31715815 29.2331619
## lambda[7]   0.95670718  0.75587456 0.78787000  0.10667032  3.0285358
## lambda[8]   0.93113860  0.74339392 0.73890629  0.07873683  2.9686124
## lambda[9]   3.31328591  3.07781491 1.62679809  0.87557808  7.1114551
## lambda[10] 20.72812723 20.35909638 4.41882323 13.10756047 30.1294458
## theta[1]    0.05915457  0.05511062 0.02597383  0.01988172  0.1229674
## theta[2]    0.10496592  0.08538080 0.07769341  0.01091378  0.3059248
## theta[3]    0.08866493  0.08388579 0.03602857  0.03296276  0.1673303
## theta[4]    0.11421457  0.11174058 0.02998201  0.06188762  0.1792589
## theta[5]    0.61299661  0.55515271 0.33494955  0.14284108  1.4605625
## theta[6]    0.60802103  0.59317214 0.14790135  0.36041905  0.9309924
## theta[7]    0.91114969  0.71988054 0.75035238  0.10159078  2.8843198
## theta[8]    0.88679867  0.70799421 0.70372027  0.07498746  2.8272499
## theta[9]    1.57775519  1.46562615 0.77466575  0.41694194  3.3864072
## theta[10]   1.97410736  1.93896156 0.42084031  1.24833909  2.8694710
## 
## $all.chains
##                   Mean      Median    St.Dev.    95%CI_low  95%CI_upp
## alpha       0.66335867  0.63464454 0.25348126  0.284451898  1.2613623
## beta        0.89403759  0.77775753 0.55080224  0.192661896  2.2306254
## lambda[1]   5.60102528  5.25900995 2.39008110  1.845512154 11.5370212
## lambda[2]   1.61350077  1.30072011 1.20687044  0.139933931  4.5906564
## lambda[3]   5.60134881  5.33746942 2.27207798  1.973061447 10.9043485
## lambda[4]  14.52424272 14.21290267 3.72489198  8.069338282 22.6196507
## lambda[5]   3.16946784  2.90522605 1.70751537  0.783439856  7.3170418
## lambda[6]  19.04663191 18.65095316 4.47541747 11.422871887 28.9083976
## lambda[7]   0.93555423  0.72734960 0.78735987  0.073809416  3.0205588
## lambda[8]   0.94930941  0.74430801 0.77110617  0.078736832  3.0056420
## lambda[9]   3.35222288  3.07218425 1.67694110  0.925527533  7.2322590
## lambda[10] 20.92420682 20.53129054 4.46208398 13.123980197 30.5007013
## theta[1]    0.05939581  0.05576893 0.02534550  0.019570648  0.1223438
## theta[2]    0.10277075  0.08284841 0.07687073  0.008912989  0.2923985
## theta[3]    0.08905165  0.08485643 0.03612207  0.031368226  0.1733601
## theta[4]    0.11527177  0.11280081 0.02956263  0.064042367  0.1795210
## theta[5]    0.60486027  0.55443245 0.32586171  0.149511423  1.3963820
## theta[6]    0.60658063  0.59397940 0.14252922  0.363785729  0.9206496
## theta[7]    0.89100403  0.69271391 0.74986654  0.070294682  2.8767227
## theta[8]    0.90410420  0.70886478 0.73438683  0.074987459  2.8625162
## theta[9]    1.59629661  1.46294488 0.79854338  0.440727397  3.4439329
## theta[10]   1.99278160  1.95536100 0.42496038  1.249902876  2.9048287
df <- data.frame(mcmc.out$samples$chain1)
df_l <- df %>% select(alpha, beta, theta.1., lambda.1.) %>% gather(key="parameter", value="value")

ps <- df_l %>% ggplot(aes(x=seq_along(value), y = value)) + geom_line()
ps + facet_wrap(~parameter, scales = "free")

p <- ggplot(df_l,aes(value)) + geom_histogram(aes( y= ..density..),bins = 60) 
p + facet_wrap(~parameter, scales = "free")

kable(head(mcmc.out$samples$chain1))
alpha beta lambda[1] lambda[2] lambda[3] lambda[4] lambda[5] lambda[6] lambda[7] lambda[8] lambda[9] lambda[10] theta[1] theta[2] theta[3] theta[4] theta[5] theta[6] theta[7] theta[8] theta[9] theta[10]
0.1493424 1.1352886 5.504208 0.1301609 5.611739 16.71001 2.733687 23.24635 0.5802257 0.1499762 4.3208639 21.73755 0.0583691 0.0082905 0.0892168 0.1326191 0.5216961 0.7403295 0.5525959 0.1428344 2.0575543 2.070243
0.1493424 0.1007850 7.538498 2.1261710 4.008394 15.82392 2.171576 13.31819 5.3363444 0.6611688 2.5557470 16.63652 0.0799417 0.1354249 0.0637264 0.1255867 0.4144229 0.4241463 5.0822328 0.6296846 1.2170224 1.584431
0.1493424 0.1158284 4.220640 0.4860400 6.030489 12.09332 1.437455 12.13598 0.3503990 0.3500875 4.8631248 23.78280 0.0447576 0.0309580 0.0958742 0.0959788 0.2743234 0.3864961 0.3337134 0.3334167 2.3157737 2.265029
0.2595270 0.1395955 4.155189 1.7428054 4.809349 17.43865 4.405145 22.41900 0.2719798 4.7682660 0.8486125 31.24374 0.0440635 0.1110067 0.0764602 0.1384020 0.8406765 0.7139809 0.2590284 4.5412057 0.4041012 2.975594
0.2595270 0.2177693 6.821694 0.2340806 2.415772 15.52653 3.151666 23.57556 1.0752055 1.5561956 6.0169325 27.64122 0.0723403 0.0149096 0.0384065 0.1232264 0.6014630 0.7508140 1.0240053 1.4820911 2.8652060 2.632497
0.5796356 0.7181844 7.748169 1.5008856 4.241679 14.30770 4.097852 12.28238 1.7084315 0.9299560 0.5847590 18.92853 0.0821651 0.0955978 0.0674353 0.1135532 0.7820329 0.3911587 1.6270776 0.8856724 0.2784567 1.802717


Simulação

N <- 20
x <- runif(N, -1, 1)
Y <- rnorm(N, 1.5 + 0.5 * x, sd = 1)

simpleCode1 <- nimbleCode({
  beta0 ~ dnorm(0, sd = 100)
  beta1 ~ dnorm(0, sd = 100)
  sigma ~ dgamma(0.01, 0.001)
  for(i in 1:N) {
    Ypred[i] <- beta0 + beta1 * x[i]
    Y[i] ~ dnorm(Ypred[i], sigma^-2)
  }
})

simpleModel1 <- nimbleModel(simpleCode1,
                            data = list(Y = Y, x = x),
                            constants = list(N = N),
                            inits = list(beta0 = 0, beta1 = 0, sigma = 2 ))
## defining model...
## building model...
## setting data and initial values...
## running calculate on model (any error reports that follow may simply reflect missing values in model variables) ... 
## checking model sizes and dimensions...
## model building finished.
mcmc.out <- nimbleMCMC(code = simpleModel1, 
                       data = list(Y = Y, x = x), 
                       inits = list(beta0 = 0, beta1 = 0, sigma = 2),
                       monitors=c("beta0","beta1","sigma"),
                       nchains = 2, 
                       niter = 1000,
                       summary = TRUE, 
                       progressBar = TRUE)
## compiling... this may take a minute. Use 'showCompilerOutput = TRUE' to see C++ compiler details.
## compilation finished.
## running chain 1...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
## running chain 2...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
mcmc.out$summary
## $chain1
##            Mean    Median   St.Dev.  95%CI_low 95%CI_upp
## beta0 1.6986971 1.6993347 0.2592497  1.1704504  2.222320
## beta1 0.5979776 0.6113877 0.4699829 -0.3446416  1.496906
## sigma 1.1117341 1.0962731 0.1964065  0.7803599  1.597977
## 
## $chain2
##            Mean    Median   St.Dev.  95%CI_low 95%CI_upp
## beta0 1.6909949 1.6872656 0.2550248  1.2312055  2.200671
## beta1 0.5574374 0.5570003 0.4938227 -0.4694186  1.526048
## sigma 1.0994234 1.0606102 0.2198601  0.7953188  1.670299
## 
## $all.chains
##            Mean    Median   St.Dev.  95%CI_low 95%CI_upp
## beta0 1.6948460 1.6935100 0.2571105  1.2024405  2.212281
## beta1 0.5777075 0.5847188 0.4823559 -0.4039640  1.510321
## sigma 1.1055788 1.0772762 0.2085022  0.7879925  1.615285
df <- data.frame(mcmc.out$samples$chain1)
df_l <- df %>% select(beta0, beta1, sigma) %>% gather(key="parameter", value="value")

ps <- df_l %>% ggplot(aes(x=seq_along(value), y = value)) + geom_line()
ps + facet_wrap(~parameter, scales = "free")

p <- ggplot(df_l,aes(value)) + geom_histogram(aes( y= ..density..),bins = 60) 
p + facet_wrap(~parameter, scales = "free")

kable(head(mcmc.out$samples$chain1))
beta0 beta1 sigma
2.2442008 1.2376794 2.000000
1.5955212 1.0144206 2.000000
0.9827446 0.8980991 1.115267
1.4056617 0.6550272 1.115267
1.9650333 0.5338374 1.029094
1.8905297 0.7822568 1.029094


Homícidio

load("C:/Users/Rafael/Desktop/Pesquisa/Bases/homicidio.RData")

y = homicidio$X__2
x = homicidio$X__1
lm1 <- lm(y ~ x)
summary(lm1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2456 -0.7098  0.2115  0.5496  1.8874 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.7631     2.1774   7.239 4.29e-06 ***
## x             1.1262     0.2375   4.742 0.000315 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.173 on 14 degrees of freedom
## Multiple R-squared:  0.6163, Adjusted R-squared:  0.5889 
## F-statistic: 22.49 on 1 and 14 DF,  p-value: 0.0003151
plot(x, y)
lines(x, 15.7 + 1.13 * x)

N = length(y)


simpleCode1 <- nimbleCode({
  beta0 ~ dnorm(0, sd = 100)
  beta1 ~ dnorm(0, sd = 100)
  sigma ~ dgamma(0.01, 0.001)
  for(i in 1:N) {
    Ypred[i] <- beta0 + beta1 * x[i]
    y[i] ~ dnorm(Ypred[i], sigma^-2)
  }
})

simpleModel1 <- nimbleModel(simpleCode1,
                            data = list(y = y, x = x),
                            constants = list(N = N),
                            inits = list(beta0 = 0, beta1 = 0, sigma = 2 ))
## defining model...
## building model...
## setting data and initial values...
## running calculate on model (any error reports that follow may simply reflect missing values in model variables) ... 
## checking model sizes and dimensions...
## model building finished.
mcmc.out <- nimbleMCMC(code = simpleModel1, 
                       data = list(y = y, x = x), 
                       inits = list(beta0 = 0, beta1 = 0, sigma = 2),
                       monitors=c("beta0","beta1","sigma"),
                       nchains = 2, 
                       niter = 1000,
                       summary = TRUE, 
                       progressBar = TRUE)
## compiling... this may take a minute. Use 'showCompilerOutput = TRUE' to see C++ compiler details.
## compilation finished.
## running chain 1...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
## running chain 2...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
mcmc.out$summary
## $chain1
##            Mean    Median   St.Dev. 95%CI_low 95%CI_upp
## beta0 14.556536 14.666978 2.9302501 8.8711018 20.769396
## beta1  1.258027  1.242847 0.3180792 0.5854992  1.890464
## sigma  1.259966  1.210250 0.2672877 0.8659336  1.924464
## 
## $chain2
##            Mean    Median   St.Dev.  95%CI_low 95%CI_upp
## beta0 16.771943 16.410556 2.3947334 12.1649042 23.006038
## beta1  1.017101  1.058377 0.2595693  0.3606368  1.514215
## sigma  1.240734  1.185512 0.2634339  0.8700408  1.858556
## 
## $all.chains
##            Mean    Median   St.Dev. 95%CI_low 95%CI_upp
## beta0 15.664240 15.614590 2.8956165 9.5546399 22.272381
## beta1  1.137564  1.141575 0.3142477 0.4276907  1.795697
## sigma  1.250350  1.196863 0.2654757 0.8659336  1.906972
df <- data.frame(mcmc.out$samples$chain1)
df_l <- df %>% select(beta0, beta1, sigma) %>% gather(key="parameter", value="value")

ps <- df_l %>% ggplot(aes(x=seq_along(value), y = value)) + geom_line()
ps + facet_wrap(~parameter, scales = "free")

p <- ggplot(df_l,aes(value)) + geom_histogram(aes( y= ..density..),bins = 60) 
p + facet_wrap(~parameter, scales = "free")

kable(head(mcmc.out$samples$chain1))
beta0 beta1 sigma
25.99882 0.1013035 1.707305
25.35642 0.0785289 1.707305
25.35556 0.0639584 1.707305
25.94270 0.0622601 1.707305
25.50073 0.0919985 2.065822
25.07561 0.0954523 2.065822


Enem

load("C:/Users/Rafael/Desktop/Pesquisa/Bases/enemAMOSTRA.Rda")
y = enem$NU_NOTA_REDACAO
x = enem$NU_NOTA_MT
lm1 <- lm(y ~ x)
summary(lm1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -598.34  -84.79   -3.69   88.03  321.00 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 353.68468   32.02075  11.045   <2e-16 ***
## x             0.52649    0.05585   9.426   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 127.3 on 348 degrees of freedom
## Multiple R-squared:  0.2034, Adjusted R-squared:  0.2011 
## F-statistic: 88.86 on 1 and 348 DF,  p-value: < 2.2e-16
plot(x, y)
lines(x, 354 + 0.52* x)

N = length(y)


simpleCode1 <- nimbleCode({
  beta0 ~ dnorm(0, sd = 100)
  beta1 ~ dnorm(0, sd = 100)
  sigma ~ dgamma(0.01, 0.001)
  for(i in 1:N) {
    Ypred[i] <- beta0 + beta1 * x[i]
    y[i] ~ dnorm(Ypred[i], sigma^-2)
  }
})

simpleModel1 <- nimbleModel(simpleCode1,
                            data = list(y = y, x = x),
                            constants = list(N = N),
                            inits = list(beta0 = 0, beta1 = 0, sigma = 2 ))
## defining model...
## building model...
## setting data and initial values...
## running calculate on model (any error reports that follow may simply reflect missing values in model variables) ... 
## checking model sizes and dimensions...
## model building finished.
mcmc.out <- nimbleMCMC(code = simpleModel1, 
                       data = list(y = y, x = x), 
                       inits = list(beta0 = 0, beta1 = 0, sigma = 2),
                       monitors=c("beta0","beta1","sigma"),
                       nchains = 2, 
                       niter = 1000,
                       summary = TRUE, 
                       progressBar = TRUE)
## compiling... this may take a minute. Use 'showCompilerOutput = TRUE' to see C++ compiler details.
## compilation finished.
## running chain 1...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
## running chain 2...
## |-------------|-------------|-------------|-------------|
## |-------------------------------------------------------|
mcmc.out$summary
## $chain1
##              Mean      Median     St.Dev.   95%CI_low   95%CI_upp
## beta0 325.4817623 321.4873164 50.19510608 255.3267688 451.2321750
## beta1   0.5745112   0.5797278  0.08614174   0.3589807   0.7004295
## sigma 108.2486794 124.7914593 36.25898336  13.0347552 137.9934926
## 
## $chain2
##              Mean      Median     St.Dev.   95%CI_low   95%CI_upp
## beta0 343.1318587 339.6233626 44.86911384 282.1060474 446.9062134
## beta1   0.5443172   0.5502239  0.07704731   0.3689826   0.6476677
## sigma 105.4596203 124.4003246 37.64897281  12.3390872 136.3332093
## 
## $all.chains
##              Mean      Median     St.Dev.   95%CI_low 95%CI_upp
## beta0 334.3068105 330.6445167 48.40640142 260.8313326 450.31964
## beta1   0.5594142   0.5664405  0.08308451   0.3608178   0.68728
## sigma 106.8541498 124.5448584 36.97758534  13.0173906 137.76132
df <- data.frame(mcmc.out$samples$chain1)
df_l <- df %>% select(beta0, beta1, sigma) %>% gather(key="parameter", value="value")

ps <- df_l %>% ggplot(aes(x=seq_along(value), y = value)) + geom_line()
ps + facet_wrap(~parameter, scales = "free")

p <- ggplot(df_l,aes(value)) + geom_histogram(aes( y= ..density..),bins = 60) 
p + facet_wrap(~parameter, scales = "free")

kable(head(mcmc.out$samples$chain1))
beta0 beta1 sigma
648.9074 0.0235647 3.638467
635.7174 0.0458264 4.435107
622.6484 0.0687537 4.435107
610.2358 0.0894274 4.871537
598.1731 0.1090028 4.979528
587.9980 0.1273383 4.979528