Prática em Software de Pesquisa Médica - Turma 2020/2

Prof. Marcus Jones

Aula 30 outubro 2020

# Set the console print width to 60 characters (affects how output is wrapped).
options(width = 60)
library(tidyverse)
## ── Attaching packages ─────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gapminder) # loads the dataset
gap <- gapminder # store it under a shorter name

filter: selecionando apenas Brazil e salvando em gapbrazil

# Show only the rows whose country is Brazil.
gap %>%
  filter(country == "Brazil")
## # A tibble: 12 x 6
##    country continent  year lifeExp       pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Brazil  Americas   1952    50.9  56602560     2109.
##  2 Brazil  Americas   1957    53.3  65551171     2487.
##  3 Brazil  Americas   1962    55.7  76039390     3337.
##  4 Brazil  Americas   1967    57.6  88049823     3430.
##  5 Brazil  Americas   1972    59.5 100840058     4986.
##  6 Brazil  Americas   1977    61.5 114313951     6660.
##  7 Brazil  Americas   1982    63.3 128962939     7031.
##  8 Brazil  Americas   1987    65.2 142938076     7807.
##  9 Brazil  Americas   1992    67.1 155975974     6950.
## 10 Brazil  Americas   1997    69.4 168546719     7958.
## 11 Brazil  Americas   2002    71.0 179914212     8131.
## 12 Brazil  Americas   2007    72.4 190010647     9066.
# Keep the Brazil subset in its own object.
gapbrazil <- filter(gap, country == "Brazil")

salvando arquivos como .Rdata no diretório de trabalho

# Write each object to its own .Rdata file in the working directory.
save(gap, file = "gap.Rdata")
save(gapbrazil, file = "gapbrazil.Rdata")

carregando os arquivos gap.Rdata e gapbrazil.Rdata do diretório de trabalho

# Restore both objects from the .Rdata files in the working directory.
# The file name must match the name used in save() exactly: on a
# case-sensitive file system "gapbrazil.rdata" is not "gapbrazil.Rdata"
# and load() would fail with "cannot open file".
load("gap.Rdata")
load("gapbrazil.Rdata")

filter & arrange

# Sort by year, then by country within each year.
gap %>%
  arrange(year, country)
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Albania     Europe     1952    55.2  1282697     1601.
##  3 Algeria     Africa     1952    43.1  9279525     2449.
##  4 Angola      Africa     1952    30.0  4232095     3521.
##  5 Argentina   Americas   1952    62.5 17876956     5911.
##  6 Australia   Oceania    1952    69.1  8691212    10040.
##  7 Austria     Europe     1952    66.8  6927772     6137.
##  8 Bahrain     Asia       1952    50.9   120447     9867.
##  9 Bangladesh  Asia       1952    37.5 46886859      684.
## 10 Belgium     Europe     1952    68    8730405     8343.
## # … with 1,694 more rows
# 2007 only, from lowest to highest life expectancy.
gap %>%
  filter(year == 2007) %>%
  arrange(lifeExp)
## # A tibble: 142 x 6
##    country          continent  year lifeExp    pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>  <int>     <dbl>
##  1 Swaziland        Africa     2007    39.6 1.13e6     4513.
##  2 Mozambique       Africa     2007    42.1 2.00e7      824.
##  3 Zambia           Africa     2007    42.4 1.17e7     1271.
##  4 Sierra Leone     Africa     2007    42.6 6.14e6      863.
##  5 Lesotho          Africa     2007    42.6 2.01e6     1569.
##  6 Angola           Africa     2007    42.7 1.24e7     4797.
##  7 Zimbabwe         Africa     2007    43.5 1.23e7      470.
##  8 Afghanistan      Asia       2007    43.8 3.19e7      975.
##  9 Central African… Africa     2007    44.7 4.37e6      706.
## 10 Liberia          Africa     2007    45.7 3.19e6      415.
## # … with 132 more rows
# 2007 only, from highest to lowest life expectancy.
gap %>%
  filter(year == 2007) %>%
  arrange(desc(lifeExp))
## # A tibble: 142 x 6
##    country        continent  year lifeExp      pop gdpPercap
##    <fct>          <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Japan          Asia       2007    82.6   1.27e8    31656.
##  2 Hong Kong, Ch… Asia       2007    82.2   6.98e6    39725.
##  3 Iceland        Europe     2007    81.8   3.02e5    36181.
##  4 Switzerland    Europe     2007    81.7   7.55e6    37506.
##  5 Australia      Oceania    2007    81.2   2.04e7    34435.
##  6 Spain          Europe     2007    80.9   4.04e7    28821.
##  7 Sweden         Europe     2007    80.9   9.03e6    33860.
##  8 Israel         Asia       2007    80.7   6.43e6    25523.
##  9 France         Europe     2007    80.7   6.11e7    30470.
## 10 Canada         Americas   2007    80.7   3.34e7    36319.
## # … with 132 more rows
# Americas in 2007, richest (per capita) first.
# Comma-separated conditions in filter() are combined with a logical AND.
gap %>%
  filter(year == 2007, continent == "Americas") %>%
  arrange(desc(gdpPercap))
## # A tibble: 25 x 6
##    country         continent  year lifeExp     pop gdpPercap
##    <fct>           <fct>     <int>   <dbl>   <int>     <dbl>
##  1 United States   Americas   2007    78.2  3.01e8    42952.
##  2 Canada          Americas   2007    80.7  3.34e7    36319.
##  3 Puerto Rico     Americas   2007    78.7  3.94e6    19329.
##  4 Trinidad and T… Americas   2007    69.8  1.06e6    18009.
##  5 Chile           Americas   2007    78.6  1.63e7    13172.
##  6 Argentina       Americas   2007    75.3  4.03e7    12779.
##  7 Mexico          Americas   2007    76.2  1.09e8    11978.
##  8 Venezuela       Americas   2007    73.7  2.61e7    11416.
##  9 Uruguay         Americas   2007    76.4  3.45e6    10611.
## 10 Panama          Americas   2007    75.5  3.24e6     9809.
## # … with 15 more rows

mutate

# Add total GDP (population times GDP per capita) as a new column.
gap <- mutate(gap, gdp = pop * gdpPercap)

rename

# Print a copy with snake_case column names (gap itself is not modified).
gap %>%
  rename(
    life_exp = lifeExp,
    gdp_percap = gdpPercap
  )
## # A tibble: 1,704 x 7
##    country continent  year life_exp    pop gdp_percap
##    <fct>   <fct>     <int>    <dbl>  <int>      <dbl>
##  1 Afghan… Asia       1952     28.8 8.43e6       779.
##  2 Afghan… Asia       1957     30.3 9.24e6       821.
##  3 Afghan… Asia       1962     32.0 1.03e7       853.
##  4 Afghan… Asia       1967     34.0 1.15e7       836.
##  5 Afghan… Asia       1972     36.1 1.31e7       740.
##  6 Afghan… Asia       1977     38.4 1.49e7       786.
##  7 Afghan… Asia       1982     39.9 1.29e7       978.
##  8 Afghan… Asia       1987     40.8 1.39e7       852.
##  9 Afghan… Asia       1992     41.7 1.63e7       649.
## 10 Afghan… Asia       1997     41.8 2.22e7       635.
## # … with 1,694 more rows, and 1 more variable: gdp <dbl>

group_by & filter -> filtra apenas o ano de 2007 e mostra o número de países em cada continente

# Number of countries per continent in 2007.
gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(numero = n())
## # A tibble: 5 x 2
##   continent numero
##   <fct>      <int>
## 1 Africa        52
## 2 Americas      25
## 3 Asia          33
## 4 Europe        30
## 5 Oceania        2

comandos combinados

# Country count and mean population per continent in 2007.
gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    n = n(),
    media = mean(pop)
  )
## # A tibble: 5 x 3
##   continent     n      media
##   <fct>     <int>      <dbl>
## 1 Africa       52  17875763.
## 2 Americas     25  35954847.
## 3 Asia         33 115513752.
## 4 Europe       30  19536618.
## 5 Oceania       2  12274974.
# Country count and median population per continent in 2007.
gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    n = n(),
    median = median(pop)
  )
## # A tibble: 5 x 3
##   continent     n    median
##   <fct>     <int>     <dbl>
## 1 Africa       52 10093310.
## 2 Americas     25  9319622 
## 3 Asia         33 24821286 
## 4 Europe       30  9493598 
## 5 Oceania       2 12274974.
# Count, mean, minimum and maximum population per continent in 2007.
gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    n = n(),
    media = mean(pop),
    minimo = min(pop),
    maximo = max(pop)
  )
## # A tibble: 5 x 5
##   continent     n      media  minimo     maximo
##   <fct>     <int>      <dbl>   <int>      <int>
## 1 Africa       52  17875763.  199579  135031164
## 2 Americas     25  35954847. 1056608  301139947
## 3 Asia         33 115513752.  708573 1318683096
## 4 Europe       30  19536618.  301931   82400996
## 5 Oceania       2  12274974. 4115771   20434176
# Same population summary per continent, without the country count.
gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    media = mean(pop),
    minimo = min(pop),
    maximo = max(pop)
  )
## # A tibble: 5 x 4
##   continent      media  minimo     maximo
##   <fct>          <dbl>   <int>      <int>
## 1 Africa     17875763.  199579  135031164
## 2 Americas   35954847. 1056608  301139947
## 3 Asia      115513752.  708573 1318683096
## 4 Europe     19536618.  301931   82400996
## 5 Oceania    12274974. 4115771   20434176
# Countries with more than 100 million inhabitants in 2007, largest first.
gap %>%
  filter(year == 2007, pop > 100000000) %>%
  group_by(country) %>%
  arrange(desc(pop))
## # A tibble: 10 x 7
## # Groups:   country [10]
##    country  continent  year lifeExp    pop gdpPercap     gdp
##    <fct>    <fct>     <int>   <dbl>  <int>     <dbl>   <dbl>
##  1 China    Asia       2007    73.0 1.32e9     4959. 6.54e12
##  2 India    Asia       2007    64.7 1.11e9     2452. 2.72e12
##  3 United … Americas   2007    78.2 3.01e8    42952. 1.29e13
##  4 Indones… Asia       2007    70.6 2.24e8     3541. 7.92e11
##  5 Brazil   Americas   2007    72.4 1.90e8     9066. 1.72e12
##  6 Pakistan Asia       2007    65.5 1.69e8     2606. 4.41e11
##  7 Banglad… Asia       2007    64.1 1.50e8     1391. 2.09e11
##  8 Nigeria  Africa     2007    46.9 1.35e8     2014. 2.72e11
##  9 Japan    Asia       2007    82.6 1.27e8    31656. 4.04e12
## 10 Mexico   Americas   2007    76.2 1.09e8    11978. 1.30e12
# 2007 rows with GDP per capita under 1000 OR life expectancy under 50,
# ordered by population (largest first).
gap %>%
  filter(
    year == 2007,
    gdpPercap < 1000 | lifeExp < 50
  ) %>%
  group_by(country) %>%
  arrange(desc(pop))
## # A tibble: 27 x 7
## # Groups:   country [27]
##    country  continent  year lifeExp    pop gdpPercap     gdp
##    <fct>    <fct>     <int>   <dbl>  <int>     <dbl>   <dbl>
##  1 Nigeria  Africa     2007    46.9 1.35e8     2014. 2.72e11
##  2 Ethiopia Africa     2007    52.9 7.65e7      691. 5.29e10
##  3 Congo, … Africa     2007    46.5 6.46e7      278. 1.79e10
##  4 Myanmar  Asia       2007    62.1 4.78e7      944  4.51e10
##  5 South A… Africa     2007    49.3 4.40e7     9270. 4.08e11
##  6 Afghani… Asia       2007    43.8 3.19e7      975. 3.11e10
##  7 Mozambi… Africa     2007    42.1 2.00e7      824. 1.64e10
##  8 Cote d'… Africa     2007    48.3 1.80e7     1545. 2.78e10
##  9 Malawi   Africa     2007    48.3 1.33e7      759. 1.01e10
## 10 Niger    Africa     2007    56.9 1.29e7      620. 7.99e 9
## # … with 17 more rows
# Same selection as above, restricted to countries outside Africa.
gap %>%
  filter(
    year == 2007,
    gdpPercap < 1000 | lifeExp < 50,
    continent != "Africa"
  ) %>%
  group_by(country)
## # A tibble: 2 x 7
## # Groups:   country [2]
##   country  continent  year lifeExp     pop gdpPercap     gdp
##   <fct>    <fct>     <int>   <dbl>   <int>     <dbl>   <dbl>
## 1 Afghani… Asia       2007    43.8  3.19e7      975. 3.11e10
## 2 Myanmar  Asia       2007    62.1  4.78e7      944  4.51e10

salvando em um tibble

# Store the 2007 per-continent population summary in t1, then print it.
t1 <- gap %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    n = n(),
    media_pop = mean(pop),
    minimo_pop = min(pop),
    maximo_pop = max(pop)
  )
t1
## # A tibble: 5 x 5
##   continent     n  media_pop minimo_pop maximo_pop
##   <fct>     <int>      <dbl>      <int>      <int>
## 1 Africa       52  17875763.     199579  135031164
## 2 Americas     25  35954847.    1056608  301139947
## 3 Asia         33 115513752.     708573 1318683096
## 4 Europe       30  19536618.     301931   82400996
## 5 Oceania       2  12274974.    4115771   20434176

Comandos filter + select (seleciona variáveis)

# Keep only the continent column for 2007.
gap %>%
  filter(year == 2007) %>%
  select(continent)
## # A tibble: 142 x 1
##    continent
##    <fct>    
##  1 Asia     
##  2 Europe   
##  3 Africa   
##  4 Africa   
##  5 Americas 
##  6 Oceania  
##  7 Europe   
##  8 Asia     
##  9 Asia     
## 10 Europe   
## # … with 132 more rows
# Keep year and continent for 2007.
gap %>%
  filter(year == 2007) %>%
  select(year, continent)
## # A tibble: 142 x 2
##     year continent
##    <int> <fct>    
##  1  2007 Asia     
##  2  2007 Europe   
##  3  2007 Africa   
##  4  2007 Africa   
##  5  2007 Americas 
##  6  2007 Oceania  
##  7  2007 Europe   
##  8  2007 Asia     
##  9  2007 Asia     
## 10  2007 Europe   
## # … with 132 more rows
# Keep year, continent and country for 2007.
gap %>%
  filter(year == 2007) %>%
  select(year, continent, country)
## # A tibble: 142 x 3
##     year continent country    
##    <int> <fct>     <fct>      
##  1  2007 Asia      Afghanistan
##  2  2007 Europe    Albania    
##  3  2007 Africa    Algeria    
##  4  2007 Africa    Angola     
##  5  2007 Americas  Argentina  
##  6  2007 Oceania   Australia  
##  7  2007 Europe    Austria    
##  8  2007 Asia      Bahrain    
##  9  2007 Asia      Bangladesh 
## 10  2007 Europe    Belgium    
## # … with 132 more rows
# Continent and country plus every column whose name ends in "p".
gap %>%
  filter(year == 2007) %>%
  select(continent, country, ends_with("p"))
## # A tibble: 142 x 6
##    continent country    lifeExp     pop gdpPercap        gdp
##    <fct>     <fct>        <dbl>   <int>     <dbl>      <dbl>
##  1 Asia      Afghanist…    43.8  3.19e7      975.    3.11e10
##  2 Europe    Albania       76.4  3.60e6     5937.    2.14e10
##  3 Africa    Algeria       72.3  3.33e7     6223.    2.07e11
##  4 Africa    Angola        42.7  1.24e7     4797.    5.96e10
##  5 Americas  Argentina     75.3  4.03e7    12779.    5.15e11
##  6 Oceania   Australia     81.2  2.04e7    34435.    7.04e11
##  7 Europe    Austria       79.8  8.20e6    36126.    2.96e11
##  8 Asia      Bahrain       75.6  7.09e5    29796.    2.11e10
##  9 Asia      Bangladesh    64.1  1.50e8     1391.    2.09e11
## 10 Europe    Belgium       79.4  1.04e7    33693.    3.50e11
## # … with 132 more rows
# Only the columns whose name ends in "p".
gap %>%
  filter(year == 2007) %>%
  select(ends_with("p"))
## # A tibble: 142 x 4
##    lifeExp       pop gdpPercap           gdp
##      <dbl>     <int>     <dbl>         <dbl>
##  1    43.8  31889923      975.  31079291949.
##  2    76.4   3600523     5937.  21376411360.
##  3    72.3  33333216     6223. 207444851958.
##  4    42.7  12420476     4797.  59583895818.
##  5    75.3  40301927    12779. 515033625357.
##  6    81.2  20434176    34435. 703658358894.
##  7    79.8   8199783    36126. 296229400691.
##  8    75.6    708573    29796.  21112675360.
##  9    64.1 150448339     1391. 209311822134.
## 10    79.4  10392226    33693. 350141166520.
## # … with 132 more rows
# Only the columns whose name starts with "c".
gap %>%
  filter(year == 2007) %>%
  select(starts_with("c"))
## # A tibble: 142 x 2
##    country     continent
##    <fct>       <fct>    
##  1 Afghanistan Asia     
##  2 Albania     Europe   
##  3 Algeria     Africa   
##  4 Angola      Africa   
##  5 Argentina   Americas 
##  6 Australia   Oceania  
##  7 Austria     Europe   
##  8 Bahrain     Asia     
##  9 Bangladesh  Asia     
## 10 Belgium     Europe   
## # … with 132 more rows
# For the Americas in 2007: row count, mean and median of every column
# whose name ends in "p".
gap %>%
  filter(year == 2007, continent == "Americas") %>%
  select(ends_with("p")) %>%
  summarise_all(list(~n(), ~mean(.), ~median(.)))
## # A tibble: 1 x 12
##   lifeExp_n pop_n gdpPercap_n gdp_n lifeExp_mean pop_mean
##       <int> <int>       <int> <int>        <dbl>    <dbl>
## 1        25    25          25    25         73.6   3.60e7
## # … with 6 more variables: gdpPercap_mean <dbl>,
## #   gdp_mean <dbl>, lifeExp_median <dbl>, pop_median <int>,
## #   gdpPercap_median <dbl>, gdp_median <dbl>

Combinando dplyr e ggplot

# Brazil's total GDP by year, drawn as points, a line, and columns
# with several fill colours (the last two hex digits are the alpha channel).
gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_point()

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_line()

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col()

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col(fill = "Blue")

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col(fill = "#111111bb")

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col(fill = "#11119988")

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col(fill = "#11111122") +
  theme_bw()

gap %>%
  filter(country == "Brazil") %>%
  ggplot(aes(year, gdp)) +
  geom_col(fill = "#88111144") +
  theme_bw()

# One line per country in the Americas: first grouped without colour,
# then coloured by country, then with thicker lines, then life expectancy.
gap %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, gdp, group = country)) +
  geom_line()

gap %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line()

gap %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line(size = 1)

gap %>%
  filter(continent == "Americas") %>%
  ggplot(aes(year, lifeExp, col = country)) +
  geom_line()

#===== excluding the United States =====
gap %>%
  filter(continent == "Americas", country != "United States") %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line() +
  geom_point()

#=== Oceania GDP, 1952 to 2007

# Filtering on the continent is sufficient: the United States is listed
# under "Americas" in this dataset, so a country != "United States"
# condition can never remove a row from the Oceania subset.
gap %>%
  filter(continent == "Oceania") %>%
  ggplot(aes(year, gdp, col = country)) +
  geom_line() +
  geom_point()