Preparação dos Dados

Abrir base do ENEM e criar amostras

# Bibliotecas
library(knitr)
library(esquisse)
library(tidyverse)
library(skimr)
library(extrafont)
gc()
            used  (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   4125435 220.4    7984848  426.5   7984848  426.5
Vcells 117269465 894.7  171438110 1308.0 163239406 1245.5
memory.limit (9999999999)
Warning: 'memory.limit()' is no longer supported
[1] Inf

1.1.1 Organização da Base e análise preliminar

sample_00_10 <- readRDS("~/RStudio/ENEM/sample_00_10.rds")
Warning in readRDS("~/RStudio/ENEM/sample_00_10.rds") :
  error reading the file
Error in readRDS("~/RStudio/ENEM/sample_00_10.rds") : 
  error reading from connection

Observar Valores Unicos e Numéricos

#Observar Valores Unicos e Numéricos
sample_00_10[,1:3] <- lapply(sample_00_10[,1:3], as.numeric)
sample_00_10[,7:9] <- lapply(sample_00_10[,7:9], as.factor)

sample_00_10[,10] <- lapply(sample_00_10[,10], as.numeric)
sample_00_10[,11:12] <- lapply(sample_00_10[,11:12], as.factor)

sample_00_10[,13:19] <- lapply(sample_00_10[,13:19], as.numeric)
sample_00_10[,20:83] <- lapply(sample_00_10[,20:83], as.factor)

skim_tee(sample_00_10)
-- Data Summary ------------------------
                           Values
Name                       data  
Number of rows             30403 
Number of columns          83    
Key                        NULL  
_______________________          
Column type frequency:           
  character                3     
  factor                   69    
  numeric                  11    
________________________         
Group variables            None  
sample_00_10 %>% skim()
-- Data Summary ------------------------
                           Values    
Name                       Piped data
Number of rows             30403     
Number of columns          83        
Key                        NULL      
_______________________              
Column type frequency:               
  character                3         
  factor                   69        
  numeric                  11        
________________________             
Group variables            None      
fix_windows_histograms()
This function will change your system locale. It may have other unintended effects.
y

1.1.2. é possivel unificar as bases?

standarizar indicadores?

1.2.Preparação da Base Ambiental

Abrir Bases:

# unificar base de dados ambiental
# Centro <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/Centro.rds")
# NE <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/NE.rds")
# NO <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/NO.rds")
# SE <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/SE.rds")
# SUL <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/Sul.rds")
# df_Am<-bind_rows(NE, NO, SE,SUL)
rm(NE,NO,SE,SUL,Centro)
head(df_Am)
Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'x' in selecting a method for function 'head': object 'df_Am' not found

Fundir Bases

#preparando para fundir

df_Am1<-df_Am %>% mutate(CO_MUNICIPIO_RESIDENCIA=code_muni) %>% # criterio unificador
  group_by(CO_MUNICIPIO_RESIDENCIA,Year,name_state,name_region) %>% # fatores a se manter
  summarise(co_ppb=mean(co_ppb),# media das variaveis ambientaris por dia/mes
            no2_ppb=mean(no2_ppb),
            o3_ppb=mean(o3_ppb),
            pm25_ugm3=mean(pm25_ugm3),
            so2_ugm3=mean(so2_ugm3),
            preciptation=mean(preciptation),
            temperature=mean(temperature),
            humidity=mean(humidity),
            wind_direction=mean(wind_direction),
            wind_speed=mean(wind_speed),
            wildfire==mean(wildfire))%>% distinct()
head(df_Am1)

1.3.União das Bases

# dftest<-left_join(sample_00_10,df_Am1,by="CO_MUNICIPIO_RESIDENCIA") 
# table(dftest$Year,dftest$NU_ANO)
# skim(dftest$Year)
# dftest %>% filter(Year==NU_ANO+2000) -> dftest0
# saveRDS(dftest,"dftest0.rds")
 dftest0 <- readRDS("~/RStudio/ENEM/dftest0.rds")

Análise Preliminar

gpa_mixed = lmer(Nota ~ Variavel Ambiental + (1 | ANO)+ (1 | Municipio), data = dfteste)

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_REDACAO))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 3155 rows containing non-finite values (stat_smooth).
Warning: Removed 3155 rows containing missing values (geom_point).

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_COMP1))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 16854 rows containing non-finite values (stat_smooth).
Warning: Removed 16854 rows containing missing values (geom_point).

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_COMP2))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 16854 rows containing non-finite values (stat_smooth).
Warning: Removed 16854 rows containing missing values (geom_point).

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_COMP3))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 16854 rows containing non-finite values (stat_smooth).
Warning: Removed 16854 rows containing missing values (geom_point).

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_COMP4))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 16854 rows containing non-finite values (stat_smooth).
Warning: Removed 16854 rows containing missing values (geom_point).

dftest0 %>% 
  ggplot(aes(co_ppb,NU_NOTA_COMP5))+geom_point()+facet_wrap(~NU_ANO+2000,scales = "free")+geom_smooth()
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 16854 rows containing non-finite values (stat_smooth).
Warning: Removed 16854 rows containing missing values (geom_point).

library(lme4)
gpa_mixed = lmer(NU_NOTA_REDACAO ~ co_ppb+no2_ppb  + NU_IDADE + (1 +co_ppb| CO_UF_RESIDENCIA)+ (1 +co_ppb| NU_ANO), data = dftest0)
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
  unable to evaluate scaled gradient
Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv,  :
  Model failed to converge: degenerate  Hessian with 4 negative eigenvalues
summary(gpa_mixed)
Linear mixed model fit by REML ['lmerMod']
Formula: 
NU_NOTA_REDACAO ~ co_ppb + no2_ppb + NU_IDADE + (1 + co_ppb |  
    CO_UF_RESIDENCIA) + (1 + co_ppb | NU_ANO)
   Data: dftest0

REML criterion at convergence: 5582022

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.6670 -0.2039  0.0322  0.2062  4.2815 

Random effects:
 Groups           Name        Variance  Std.Dev.  Corr
 CO_UF_RESIDENCIA (Intercept) 8.381e-03   0.09155     
                  co_ppb      4.230e-03   0.06504 0.63
 NU_ANO           (Intercept) 3.320e+04 182.19703     
                  co_ppb      8.200e+04 286.36461 0.00
 Residual                     2.724e+04 165.05674     
Number of obs: 427696, groups:  CO_UF_RESIDENCIA, 27; NU_ANO, 11

Fixed effects:
             Estimate Std. Error t value
(Intercept) 144.64178   54.95350   2.632
co_ppb       -0.02048   86.35020   0.000
no2_ppb      -0.10479    0.28394  -0.369
NU_IDADE     -3.02018    0.03650 -82.755

Correlation of Fixed Effects:
         (Intr) co_ppb n2_ppb
co_ppb    0.000              
no2_ppb   0.015  0.000       
NU_IDADE -0.009  0.000 -0.049
optimizer (nloptwrap) convergence code: 0 (OK)
unable to evaluate scaled gradient
Model failed to converge: degenerate  Hessian with 4 negative eigenvalues

Extração de Coeficientes

1.4.Conclusão

  • Viés de Coleta

  • Atrito / Perda de dados

    • Dados Ausentes

    • Variaveis Pouco Claras

      • Avaliar Variaveis Q1,Q2,…Qn

      • Avaliar Variaveis Q1,Q2,…Qn

      • Avaliar autocorrelação das Variaveis Ambientais

Sugestões:

  • Normalizar Notas para facilitar comparação entre anos

2. Analise Exploratória

2.1 Estatistica Descritiva das Variaveis

2.2 Associações, correlações

2.3 Modelo Logico

  • melhores variaveis ambientais

  • melhores variáveis de Performance

  • Covariaveis, Autocorrelacionalidade, efeitos fixos e aleatórios

3. Regressão de Efeitos Mistos

3.1 Hipótese Básica do Modelo

3.2. Extração de Coeficientes

3.3 Hipóteses

---
title: "Análises Preparatórias"
author:
- name: Thiago Noronha Gardin
  affiliation: Fundação Getulio Vargas
- name: Weeberb J. Réquia Jr.
  affiliation: Fundação Getulio Vargas
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook:
    df_print: paged
  pdf_document: default
---

# Preparação dos Dados

## Abrir base do ENEM e criar amostras

```{r}
# Bibliotecas
library(knitr)
library(esquisse)
library(tidyverse)
library(skimr)
library(extrafont)

```

```{r}
# Run a garbage collection and print the memory-usage summary.
gc()
# memory.limit() only ever had an effect on Windows, and since R 4.2.0 it is a
# stub that just warns and returns Inf. Call it only where it still works so
# the notebook knits without the "no longer supported" warning.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(9999999999)
}
```

```{r eval=FALSE, include=FALSE}
# Open each full ENEM base, draw a 1% random sample and save it as .rds.
# All statements are kept commented out (and the chunk is eval=FALSE), so
# nothing here runs when the notebook is knitted.
#dados_finais_2000_2010 <- readRDS("~/RStudio/ENEM/2_Dado tratado/dados_finais_2000_2010.rds")
#df<-slice_sample(dados_finais_2000_2010,prop = 0.01) 
#saveRDS(df, file = "sample_00_10.rds")
#table(dados_finais_2000_2010$NU_ANO)
#table(df$NU_ANO)
# NOTE(review): the 2012-2014 file is loaded into the object named
# dados_finais_2000_2010 (name reused from the first period).
#dados_finais_2000_2010 <- readRDS("~/RStudio/ENEM/2_Dado tratado/dados_finais_2012_2014.rds")

#df2<-slice_sample(dados_finais_2000_2010,prop = 0.01) 
#saveRDS(df2, file = "sample_12_14.rds")


#dados_finais_2015_2020 <- readRDS("~/RStudio/ENEM/2_Dado tratado/dados_finais_2015_2020.rds")
#df3<-slice_sample(dados_finais_2015_2020,prop = 0.01) 
#saveRDS(df3, file = "sample_15_20.rds")
```

#### 1.1.1 Organização da Base e análise preliminar

```{r}
# Load the stored sample file sample_00_10.rds.
sample_00_10 <- readRDS("~/RStudio/ENEM/sample_00_10.rds")
# Resample: keep a random 10% of the rows. The seed is fixed so that the
# summaries printed below are the same every time the notebook is knitted.
set.seed(20220615)
sample_00_10 <- slice_sample(sample_00_10, prop = 0.1)

```

Observar Valores Únicos e Numéricos

```{r echo=TRUE}
# Coerce column types by position, then print a skimr summary of the result.
#
# Columns are selected as sample_00_10[cols] (list-style indexing), which
# always yields a data frame. The previous `sample_00_10[, 10]` form drops a
# single column to a plain vector on a base data.frame, in which case lapply()
# iterates over individual cells instead of over the column.
numeric_cols <- c(1:3, 10, 13:19)
factor_cols <- c(7:9, 11:12, 20:83)

# NOTE(review): as.numeric() on a factor returns the level codes, not the
# labels. Confirm these columns are stored as character or numeric.
sample_00_10[numeric_cols] <- lapply(sample_00_10[numeric_cols], as.numeric)
sample_00_10[factor_cols] <- lapply(sample_00_10[factor_cols], as.factor)

skim_tee(sample_00_10)

```

```{r}
# Print the skimr summary of the sample. Because the data is piped in, skim()
# labels the summary "Piped data" rather than with the object name.
sample_00_10 %>% skim()
# fix_windows_histograms() stays disabled: it asks for interactive
# confirmation and changes the system locale.
#fix_windows_histograms()
```

#### 1.1.2. É possível unificar as bases?

Padronizar indicadores?

### 1.2.Preparação da Base Ambiental

Abrir Bases:

```{r warning=FALSE}
# Merge the regional environmental data files into a single table (df_Am).
# All statements are kept commented out, so df_Am is not created when this
# chunk runs.
# NOTE(review): Centro is read and later removed, but it is not passed to
# bind_rows() below. Confirm whether that region is meant to be excluded.
# Centro <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/Centro.rds")
# NE <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/NE.rds")
# NO <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/NO.rds")
# SE <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/SE.rds")
# SUL <- readRDS("~/RStudio/ENEM/Data-20220615T014950Z-001/Data/Sul.rds")
# df_Am<-bind_rows(NE, NO, SE,SUL)
#rm(NE,NO,SE,SUL,Centro)
#head(df_Am)
```

Fundir Bases

```{}
# Prepare the environmental table for the merge with the ENEM sample: one row
# per municipality and year, holding the mean of each environmental variable.
#
# NOTE(review): df_Am is only built in commented-out code earlier in this
# file, so it has to be recreated before this code can run.
# NOTE(review): mean() is called without na.rm, so any NA in a group makes
# that group's mean NA. Confirm this is intended.
# NOTE(review): if wind_direction is an angle, an arithmetic mean ignores the
# wrap-around at 0/360. Verify before using it.
df_Am1 <- df_Am %>%
  mutate(CO_MUNICIPIO_RESIDENCIA = code_muni) %>% # join key used for the ENEM merge
  group_by(CO_MUNICIPIO_RESIDENCIA, Year, name_state, name_region) %>% # factors to keep
  summarise(
    # across() names every output after its input column. The previous
    # `wildfire==mean(wildfire)` was a comparison, not a named summary.
    across(
      c(
        co_ppb, no2_ppb, o3_ppb, pm25_ugm3, so2_ugm3,
        preciptation, temperature, humidity,
        wind_direction, wind_speed, wildfire
      ),
      mean
    ),
    # One row per group is already unique, so no distinct() is needed; the
    # result is returned ungrouped.
    .groups = "drop"
  )
head(df_Am1)
```

### 1.3.União das Bases

```{r}
# Join the ENEM sample with the environmental means by municipality code and
# keep the rows where the environmental year equals NU_ANO + 2000.
# These steps are kept commented out; only the final readRDS() runs.
# dftest<-left_join(sample_00_10,df_Am1,by="CO_MUNICIPIO_RESIDENCIA") 
# table(dftest$Year,dftest$NU_ANO)
# skim(dftest$Year)
# dftest %>% filter(Year==NU_ANO+2000) -> dftest0
# NOTE(review): the next line saves `dftest` (the unfiltered join), not
# `dftest0`, under the name "dftest0.rds". Confirm which table the file holds.
# saveRDS(dftest,"dftest0.rds")
# Load the previously saved merged table.
 dftest0 <- readRDS("~/RStudio/ENEM/dftest0.rds")
```

Análise Preliminar

Esquema do modelo: `gpa_mixed = lmer(Nota ~ Variavel_Ambiental + (1 | ANO) + (1 | Municipio), data = dfteste)`

```{r}
# Scatter plots of co_ppb against each ENEM score column, one panel per exam
# year, with the default geom_smooth() trend line.
#
# The facets use vars(NU_ANO + 2000) so that the sum is evaluated as a single
# expression. The formula form `~NU_ANO+2000` is split at `+` into two facet
# variables (NU_ANO and the constant 2000), so the strips never showed the
# calendar year.
score_columns <- c("NU_NOTA_REDACAO", paste0("NU_NOTA_COMP", 1:5))

for (score_column in score_columns) {
  score_plot <- ggplot(dftest0, aes(co_ppb, .data[[score_column]])) +
    geom_point() +
    facet_wrap(vars(NU_ANO + 2000), scales = "free") +
    geom_smooth() +
    labs(y = score_column) # keep the column name as the axis title
  # Plots created inside a loop are not auto-printed, so print explicitly.
  print(score_plot)
}


```

```{r}
library(lme4)
# Linear mixed model for NU_NOTA_REDACAO: fixed effects for co_ppb, no2_ppb
# and NU_IDADE, plus a random intercept and a random co_ppb slope for each
# CO_UF_RESIDENCIA and for each NU_ANO.
# NOTE(review): the rendered output of this notebook reports convergence
# warnings for this fit (degenerate Hessian), so read the estimates with care.
gpa_mixed <- lmer(
  NU_NOTA_REDACAO ~ co_ppb + no2_ppb + NU_IDADE +
    (1 + co_ppb | CO_UF_RESIDENCIA) + (1 + co_ppb | NU_ANO),
  data = dftest0
)
summary(gpa_mixed)
```

Extração de Coeficientes

```{r}

# Per-group coefficients of the fitted model; keep the table for the NU_ANO
# grouping factor and store its row names in a `factor` column.
slopes_random_groups <- coef(gpa_mixed)
beta <- slopes_random_groups$NU_ANO
beta$factor <- row.names(beta)
beta

# Standard errors of the random effects for the same grouping factor.
library(parameters)
slopes_SE_random_groups <- standard_error(gpa_mixed, effects = "random")
SE <- slopes_SE_random_groups$NU_ANO
SE
# assumes columns 1 and 2 of SE are the intercept and the co_ppb slope, with
# rows in the same order as `beta` — TODO confirm
beta$SE_Intercept <- SE[, 1]
beta$SE_co_ppb <- SE[, 2]
beta

# Long format: one row per (year, coefficient), each paired with its own
# standard error. Previously the long table had no `SE` column, so the plot
# picked up the global `SE` data frame instead of a per-row value.
R <- beta %>%
  pivot_longer(
    c(`(Intercept)`, co_ppb),
    names_to = "Tipo_coef",
    values_to = "Coef"
  ) %>%
  mutate(SE = if_else(Tipo_coef == "(Intercept)", SE_Intercept, SE_co_ppb))

# Coefficient plot with 90% intervals: Coef -/+ 1.645 * SE, with the lower
# bound mapped to xmin and the upper bound to xmax.
R %>%
  ggplot(aes(Coef, factor)) +
  geom_point() +
  facet_grid(~Tipo_coef, scales = "free") +
  geom_errorbar(
    aes(xmin = Coef - 1.645 * SE, xmax = Coef + 1.645 * SE),
    size = 0.1
  ) +
  geom_pointrange(
    aes(xmin = Coef - 1.645 * SE, xmax = Coef + 1.645 * SE),
    size = 0.10
  ) +
  geom_vline(xintercept = 0, color = "red", size = 1.0) +
  labs(
    title = "Modelo mistos",
    subtitle = "Coeficientes(IC90%)",
    caption = "EPPG FGV"
  )
```


### 1.4.Conclusão

-   Viés de Coleta

-   Atrito / Perda de dados

    -   Dados Ausentes

        -   

    -   Variáveis Pouco Claras

        -   Avaliar Variáveis Q1,Q2,...Qn

        -   Avaliar Variáveis Q1,Q2,...Qn

        -   Avaliar autocorrelação das Variáveis Ambientais

Sugestões:

-   Normalizar Notas para facilitar comparação entre anos

### 2. Análise Exploratória

2.1 Estatística Descritiva das Variáveis

2.2 Associações, correlações

2.3 Modelo Lógico

-   melhores variáveis ambientais

-   melhores variáveis de Performance

-   Covariáveis, autocorrelação, efeitos fixos e aleatórios

### 3. Regressão de Efeitos Mistos

3.1 Hipótese Básica do Modelo

3.2. Extração de Coeficientes

3.3 Hipóteses
