Relatório de Estatística Computacional

R Markdown

Pacotes Utilizados Neste Relatório:

library(rmarkdown)
#descrever o pacote:
library(Rcpp)
library(tidyr)
library(bindr)
library(dplyr, warn.conflicts =  FALSE)
library(ggplot2)
library(microbenchmark)
library(kableExtra)
library(readr)
library(dplyr)
library(knitr)
library(stringr)
library(DT)
#<https://rstudio.github.io/DT/>

1- Teste de Performance

Objetivo: Somar as linhas de uma matriz.

Seja big.matrix uma matriz “grande” e seja N o numero de repeticoes da medicao. A funcao cppSum abaixo usa o microbenchmark para medir o tempo gasto pela funcao rowSumsC ao somar as linhas de big.matrix, repetindo a medicao N vezes e retornando os tempos obtidos:

#' Benchmark the row-sum routine on a matrix
#'
#' Runs `rowSumsC(big.matrix)` repeatedly under `microbenchmark()` and returns
#' the recorded timings. `rowSumsC` is not defined in this block; it must be
#' available in the calling environment.
#'
#' @param big.matrix Object passed unchanged to `rowSumsC()`
#'   (presumably a numeric matrix -- confirm against `rowSumsC`).
#' @param N Number of benchmark repetitions, forwarded as `times`.
#' @return The `time` element of the `microbenchmark()` result, one entry per
#'   repetition.
cppSum <- function(big.matrix, N) {
  # `exists("arg")` is always TRUE for a formal argument, even when the caller
  # omitted it, so the original guards could never fire. `missing()` is the
  # check that actually detects an absent argument.
  if (missing(big.matrix)) {
    stop("argument 'big.matrix' is missing", call. = FALSE)
  }
  if (missing(N)) {
    stop("argument 'N' is missing", call. = FALSE)
  }
  # `N` is used as a repetition count, so it must be one positive number.
  stopifnot(is.numeric(N), length(N) == 1L, !is.na(N), N >= 1)

  microbenchmark({
    row_sums <- rowSumsC(big.matrix)
  }, times = N)$time
}

2- Análise do banco de dados cars

Os dados do conjunto cars do R dao a velocidade dos carros e as distancias tomadas para parar. Esses dados foram registrados na decada de 1920.

O conjunto de dados tem 50 observacoes em 2 variaveis numericas:

[, 1] Velocidade (mph)
[, 2] Distancia de parada (ft)

##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85

Análise Descritiva

# Per-column descriptive statistics (min, quartiles, median, mean, max) of `cars`.
summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

GGplot

##    speed dist teste
## 1      4    2  TRUE
## 2      4   10  TRUE
## 3      7    4  TRUE
## 4      7   22  TRUE
## 5      8   16  TRUE
## 6      9   10  TRUE
## 7     10   18  TRUE
## 8     10   26  TRUE
## 9     10   34  TRUE
## 10    11   17  TRUE
## 11    11   28  TRUE
## 12    12   14  TRUE
## 13    12   20  TRUE
## 14    12   24  TRUE
## 15    12   28  TRUE
## 16    13   26  TRUE
## 17    13   34  TRUE
## 18    13   34  TRUE
## 19    13   46  TRUE
## 20    14   26  TRUE
## 21    14   36  TRUE
## 22    14   60  TRUE
## 23    14   80  TRUE
## 24    15   20  TRUE
## 25    15   26  TRUE
## 26    15   54  TRUE
## 27    16   32  TRUE
## 28    16   40  TRUE
## 29    17   32  TRUE
## 30    17   40  TRUE
## 31    17   50  TRUE
## 32    18   42  TRUE
## 33    18   56  TRUE
## 34    18   76  TRUE
## 35    18   84  TRUE
## 36    19   36  TRUE
## 37    19   46  TRUE
## 38    19   68  TRUE
## 39    20   32  TRUE
## 40    20   48  TRUE
## 41    20   52  TRUE
## 42    20   56  TRUE
## 43    20   64  TRUE
## 44    22   66  TRUE
## 45    23   54  TRUE
## 46    24   70  TRUE
## 47    24   92  TRUE
## 48    24   93  TRUE
## 49    24  120  TRUE
## 50    25   85  TRUE

# Add a logical column `teste` flagging rows whose speed is at most 50,
# and print the augmented data frame.
mutate(cars, teste = speed <= 50)

##    speed dist teste
## 1      4    2  TRUE
## 2      4   10  TRUE
## 3      7    4  TRUE
## 4      7   22  TRUE
## 5      8   16  TRUE
## 6      9   10  TRUE
## 7     10   18  TRUE
## 8     10   26  TRUE
## 9     10   34  TRUE
## 10    11   17  TRUE
## 11    11   28  TRUE
## 12    12   14  TRUE
## 13    12   20  TRUE
## 14    12   24  TRUE
## 15    12   28  TRUE
## 16    13   26  TRUE
## 17    13   34  TRUE
## 18    13   34  TRUE
## 19    13   46  TRUE
## 20    14   26  TRUE
## 21    14   36  TRUE
## 22    14   60  TRUE
## 23    14   80  TRUE
## 24    15   20  TRUE
## 25    15   26  TRUE
## 26    15   54  TRUE
## 27    16   32  TRUE
## 28    16   40  TRUE
## 29    17   32  TRUE
## 30    17   40  TRUE
## 31    17   50  TRUE
## 32    18   42  TRUE
## 33    18   56  TRUE
## 34    18   76  TRUE
## 35    18   84  TRUE
## 36    19   36  TRUE
## 37    19   46  TRUE
## 38    19   68  TRUE
## 39    20   32  TRUE
## 40    20   48  TRUE
## 41    20   52  TRUE
## 42    20   56  TRUE
## 43    20   64  TRUE
## 44    22   66  TRUE
## 45    23   54  TRUE
## 46    24   70  TRUE
## 47    24   92  TRUE
## 48    24   93  TRUE
## 49    24  120  TRUE
## 50    25   85  TRUE

# Histogram of stopping distance in 10 bins; bars are filled according to
# whether the observation's speed is at most 15.
p <- ggplot(mutate(cars, teste = speed <= 15)) +
  geom_histogram(
    aes(x = dist, fill = teste),
    col = "pink",
    bins = 10
  )
p

3- Importacao de arquivos de base de dados

Será importada a base de dados do INEP http://inep.gov.br/microdados usando o pacote readr, um dos pacotes usados quando se usa o tidyverse.

Foi usado o Censo de Educação Superior - docentes do ano de 2016.

# Load the previously saved data set from disk.
base5perc <- readRDS("Aula4.base5perc.rds")

# Count rows per (UF, Categoria), keep only UF code "26", largest count first.
# The UF code is taken as the first two characters of the birth-municipality
# code.
censobr <- base5perc %>%
  transmute(
    Nome = NO_IES,
    Categoria = DS_CATEGORIA_ADMINISTRATIVA,
    Codigo = as.character(CO_MUNICIPIO_NASCIMENTO),
    UF = str_sub(Codigo, 1, 2)
  ) %>%
  group_by(UF, Categoria) %>%
  count() %>%
  filter(UF == "26") %>%
  arrange(desc(n))

# Render the counts as an interactive striped table.
datatable(
  censobr,
  class = "stripe",
  rownames = FALSE,
  caption = "Tabela 1: This is a simple caption for the table."
)

Dados do censo de ensino superior de docentes no estado de Pernambuco em 2016

# Row counts and within-UF relative frequencies per administrative category,
# restricted to UF code "26" and sorted by decreasing count.
pernambuco <- base5perc %>%
  transmute(
    Nome = NO_IES,
    Categoria = DS_CATEGORIA_ADMINISTRATIVA,
    Codigo = as.character(CO_MUNICIPIO_NASCIMENTO),
    UF = str_sub(Codigo, 1, 2)
  ) %>%
  group_by(UF, Categoria) %>%
  summarise(n = n()) %>%
  # The data are still grouped by UF here, so each frequency is the share of
  # the category inside its own UF.
  mutate(Frequencia = n / sum(n)) %>%
  filter(UF == "26") %>%
  arrange(desc(n)) %>%
  # Freeze the current (descending-n) order as the factor level order.
  mutate(Categoria = factor(Categoria, levels = Categoria))


# Render the result as an interactive striped table.
datatable(
  pernambuco,
  class = "stripe",
  rownames = FALSE,
  caption = "Tabela 2: This is a simple caption for the table."
)

Estatisticas Descritivas das variaveis do censo superior de docentes segundo Pernambuco em 2016

# Per-column summary of `pernambuco` (UF, Categoria, n, Frequencia).
summary(pernambuco)

##       UF                                  Categoria       n         
##  Length:6           Pública Federal            :1   Min.   :  6.00  
##  Class :character   Privada com fins lucrativos:1   1st Qu.: 43.25  
##  Mode  :character   Privada sem fins lucrativos:1   Median :108.00  
##                     Pública Estadual           :1   Mean   :110.17  
##                     Pública Municipal          :1   3rd Qu.:162.25  
##                     Especial                   :1   Max.   :237.00  
##    Frequencia      
##  Min.   :0.009077  
##  1st Qu.:0.065431  
##  Median :0.163389  
##  Mean   :0.166667  
##  3rd Qu.:0.245461  
##  Max.   :0.358548

Comecando a utilizar o Color Brewer <http://colorbrewer2.org/>

Grafico de barras por categoria das universidades segundo docentes do estado de Pernambuco

# Horizontal bar chart of the row count per administrative category for
# Pernambuco, labelled with each category's percentage share and coloured
# with a ColorBrewer palette.
pernambuco %>%
  ggplot(aes(x = reorder(Categoria, n), y = n, fill = Categoria)) +
  geom_bar(stat = "identity") +
  guides(fill = "none") +
  coord_flip() + # flip the axes so the bars run horizontally
  labs(
    x = "Categoria",
    y = "Total",
    # `n` counts rows of the teaching-staff (docentes) census, so the bars show
    # docentes per category, not institutions.
    title = "Numero de docentes por categoria",
    subtitle = "Estado de Pernambuco"
  ) +
  geom_label(aes(label = paste0(round(100 * Frequencia), "%"))) +
  # Reversed palette so the colour gradient follows the size of n.
  scale_fill_brewer(palette = "Reds", direction = -1)

Dados do censo de ensino superior dos docentes nos estados de Pernambuco e Paraiba, 2016

# Row counts and within-UF relative frequencies per administrative category
# for UF codes "26" and "25", sorted by decreasing count.
PE_PB <- base5perc %>%
  transmute(
    Nome = NO_IES,
    Categoria = DS_CATEGORIA_ADMINISTRATIVA,
    Codigo = as.character(CO_MUNICIPIO_NASCIMENTO),
    UF = str_sub(Codigo, 1, 2)
  ) %>%
  group_by(UF, Categoria) %>%
  summarise(n = n()) %>%
  # Still grouped by UF: each frequency is relative to the total of its own UF.
  mutate(freq = n / sum(n)) %>%
  filter(UF %in% c("26", "25")) %>%
  arrange(desc(n))

# Show the structure (column types and grouping attributes) of the result.
str(PE_PB)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  10 obs. of  4 variables:
##  $ UF       : chr  "26" "26" "26" "25" ...
##  $ Categoria: chr  "Pública Federal" "Privada com fins lucrativos" "Privada sem fins lucrativos" "Privada com fins lucrativos" ...
##  $ n        : int  237 164 157 122 99 76 59 55 38 6
##  $ freq     : num  0.359 0.248 0.238 0.347 0.281 ...
##  - attr(*, "vars")= chr "UF"
##  - attr(*, "drop")= logi TRUE
##  - attr(*, "indices")=List of 2
##   ..$ : int  3 4 5 7
##   ..$ : int  0 1 2 6 8 9
##  - attr(*, "group_sizes")= int  4 6
##  - attr(*, "biggest_group_size")= int 6
##  - attr(*, "labels")='data.frame':   2 obs. of  1 variable:
##   ..$ UF: chr  "25" "26"
##   ..- attr(*, "vars")= chr "UF"
##   ..- attr(*, "drop")= logi TRUE
##   ..- attr(*, "indices")=List of 2
##   .. ..$ : int  0 1 2 3
##   .. ..$ : int  4 5 6 7 8 9
##   ..- attr(*, "group_sizes")= int  4 6
##   ..- attr(*, "biggest_group_size")= int 6

Estatisticas descritivas das variaveis segundo Pernambuco e Paraiba

# Per-column summary of `PE_PB` (UF, Categoria, n, freq).
summary(PE_PB)

##       UF             Categoria               n              freq         
##  Length:10          Length:10          Min.   :  6.0   Min.   :0.009077  
##  Class :character   Class :character   1st Qu.: 56.0   1st Qu.:0.106007  
##  Mode  :character   Mode  :character   Median : 87.5   Median :0.226714  
##                                        Mean   :101.3   Mean   :0.200000  
##                                        3rd Qu.:148.2   3rd Qu.:0.272965  
##                                        Max.   :237.0   Max.   :0.358548

# Stacked horizontal bars: count per category, filled by state name.
PE_PB %>%
  mutate(Estado = if_else(UF == "26", "Pernambuco", "Paraiba")) %>%
  ggplot(aes(x = reorder(Categoria, n), y = n, fill = Estado)) +
  geom_col() +
  coord_flip()

# Same chart with the two states side by side instead of stacked.
PE_PB %>%
  mutate(Estado = if_else(UF == "26", "Pernambuco", "Paraiba")) %>%
  ggplot(aes(x = reorder(Categoria, n), y = n, fill = Estado)) +
  geom_col(position = "dodge") +
  coord_flip()

# One panel per state, coloured by UF code, with the fill legend hidden.
PE_PB %>%
  mutate(Estado = if_else(UF == "26", "Pernambuco", "Paraiba")) %>%
  ggplot(aes(x = reorder(Categoria, n), y = n, fill = UF)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~Estado) +
  guides(fill = "none")

No facet_wrap, a opcao scales = "free" coloca uma escala propria em cada painel. Sem ela, a escala eh a mesma para todos os paineis, o que eh melhor para comparar.