packages

Be sure to run

pacman::p_load(tidyverse, gapminder, palmerpenguins, janitor, gtsummary, visdat)

Recapitulation

EDA

# Dimensions of the penguins data (rows, columns); the captured output follows.
dim(penguins)
[1] 344   8
# Per-column summary of the data frame.
summary(penguins)
# Plot an at-a-glance overview of the whole data frame with visdat.
visdat::vis_dat(penguins)

Data visualizations

Points

bill_length_mm vs bill_depth_mm

# Scatter plot of bill length (x) against bill depth (y), one point per
# penguin, coloured by sex. Rows with missing values are not removed here.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)
) +
  geom_point()
# Same scatter plot, restricted to complete cases.
penguins %>%
  # drop_na() without arguments removes every row with an NA in any column
  drop_na() %>%
  ggplot(mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)) +
  geom_point()
Facetting

One variable continuous

Make a histogram of body_mass_g

Two variables continuous

Explore correlations

Two variables continuous vs nominal

body_mass_g by species

Data wrangling

Filter

Asia and 1952

Select

Mutate

1 United States Dollar equals 0.84 Euro

Group

Summarise

# Mean GDP per capita in euros for each continent/year combination.
# gdpPercap is multiplied by 0.84, the conversion rate stated above
# (1 United States Dollar equals 0.84 Euro), before averaging.
gapminder %>%
  group_by(continent, year) %>%
  summarise(meanGdpperCapEur = mean(gdpPercap * 0.84))
`summarise()` regrouping output by 'continent' (override with `.groups` argument)
# Line plot of the mean GDP per capita in euros over time, one line per
# continent, drawn on a log10 y axis.
gapminder %>%
  group_by(continent, year) %>%
  # 0.84 is the USD -> EUR rate stated above
  summarise(meanGdpperCapEur = mean(gdpPercap * 0.84)) %>%
  ggplot(mapping = aes(x = year, y = meanGdpperCapEur, color = continent)) +
  geom_line() +
  scale_y_log10()
`summarise()` regrouping output by 'continent' (override with `.groups` argument)

Characteristic Adelie, N = 146¹ Chinstrap, N = 68¹ Gentoo, N = 119¹
body_mass_g 3,700 (3,362, 4,000) 3,700 (3,488, 3,950) 5,050 (4,700, 5,500)
bill_length_mm 38.8 (36.7, 40.8) 49.5 (46.3, 51.1) 47.4 (45.3, 49.6)
island
Biscoe 44 (30%) 0 (0%) 119 (100%)
Dream 55 (38%) 68 (100%) 0 (0%)
Torgersen 47 (32%) 0 (0%) 0 (0%)

¹ Statistics presented: Median (IQR); n (%)

Pivot

# Read the UNICEF table from the web into `unicef`; it is the input for the
# pivoting and joining exercises below.
unicef <- read_csv("https://bit.ly/unicef-wide")

Longer

Wider

# Mean life expectancy for each continent/year pair, in long format
# (one row per continent and year).
gapminder %>%
  group_by(continent, year) %>%
  summarise(LifeExpMean = mean(lifeExp))
`summarise()` regrouping output by 'continent' (override with `.groups` argument)
# Mean life expectancy per continent and year, spread to wide format:
# one row per continent and one column per year.
gapminder %>%
  group_by(continent, year) %>%
  summarise(LifeExpMean = mean(lifeExp)) %>%
  pivot_wider(names_from = year, values_from = LifeExpMean)
`summarise()` regrouping output by 'continent' (override with `.groups` argument)

Join dataset

# Download the country/continent code list; the column specification that
# read_csv() reports for it is shown right after this call.
countries_list <- read_csv("https://datahub.io/JohnSnowLabs/country-and-continent-codes-list/r/country-and-continent-codes-list-csv.csv")

── Column specification ─────────────────────────────────────────────────────────────────
cols(
  Continent_Name = col_character(),
  Continent_Code = col_character(),
  Country_Name = col_character(),
  Two_Letter_Country_Code = col_character(),
  Three_Letter_Country_Code = col_character(),
  Country_Number = col_double()
)
# List the column names of `unicef`.
# NOTE(review): the output already contains country-code columns (Code,
# Continent_Name, Continent_Code, ...), so `unicef` appears to have been
# joined with `countries_list` in a step that is not shown here -- confirm.
names(unicef)
 [1] "country_name"              "u5mr_1950"                 "u5mr_1951"                
 [4] "u5mr_1952"                 "u5mr_1953"                 "u5mr_1954"                
 [7] "u5mr_1955"                 "u5mr_1956"                 "u5mr_1957"                
[10] "u5mr_1958"                 "u5mr_1959"                 "u5mr_1960"                
[13] "u5mr_1961"                 "u5mr_1962"                 "u5mr_1963"                
[16] "u5mr_1964"                 "u5mr_1965"                 "u5mr_1966"                
[19] "u5mr_1967"                 "u5mr_1968"                 "u5mr_1969"                
[22] "u5mr_1970"                 "u5mr_1971"                 "u5mr_1972"                
[25] "u5mr_1973"                 "u5mr_1974"                 "u5mr_1975"                
[28] "u5mr_1976"                 "u5mr_1977"                 "u5mr_1978"                
[31] "u5mr_1979"                 "u5mr_1980"                 "u5mr_1981"                
[34] "u5mr_1982"                 "u5mr_1983"                 "u5mr_1984"                
[37] "u5mr_1985"                 "u5mr_1986"                 "u5mr_1987"                
[40] "u5mr_1988"                 "u5mr_1989"                 "u5mr_1990"                
[43] "u5mr_1991"                 "u5mr_1992"                 "u5mr_1993"                
[46] "u5mr_1994"                 "u5mr_1995"                 "u5mr_1996"                
[49] "u5mr_1997"                 "u5mr_1998"                 "u5mr_1999"                
[52] "u5mr_2000"                 "u5mr_2001"                 "u5mr_2002"                
[55] "u5mr_2003"                 "u5mr_2004"                 "u5mr_2005"                
[58] "u5mr_2006"                 "u5mr_2007"                 "u5mr_2008"                
[61] "u5mr_2009"                 "u5mr_2010"                 "u5mr_2011"                
[64] "u5mr_2012"                 "u5mr_2013"                 "u5mr_2014"                
[67] "u5mr_2015"                 "Code"                      "Continent_Name"           
[70] "Continent_Code"            "Country_Name"              "Three_Letter_Country_Code"
[73] "Country_Number"           

Calculate the mean mortality rate by continent and year

What is the trend in the mortality rate by continent?

# Trend of the mortality rate: mean rate per continent and year, plotted as
# one point per continent/year.
# NOTE(review): names(unicef) printed earlier lists `Continent_Name`, while
# this pipeline uses `continent_name` -- presumably the column names were
# cleaned (e.g. janitor::clean_names()) in a step not shown here; confirm.
unicef %>%
  # reshape from wide to long; stripping the "u5mr_" prefix leaves only the
  # year, which replaces the separate() / select(-delete) round trip
  pivot_longer(u5mr_1950:u5mr_2015,
               names_to = "year",
               names_prefix = "u5mr_",
               values_to = "value") %>%
  # keep only the columns needed for the summary
  select(continent_name, year, value) %>%
  # the year comes out of the column names as text; make it numeric so the
  # x axis is a continuous time scale instead of 66 discrete labels
  mutate(year = as.integer(year)) %>%
  # group and summarise
  group_by(continent_name, year) %>%
  # na.rm = TRUE: a single missing country value would otherwise turn the
  # whole continent/year mean into NA
  summarise(meanMrtRate5y = mean(value, na.rm = TRUE)) %>%
  ggplot(aes(x = year,
             y = meanMrtRate5y,
             color = continent_name)) +
  geom_point()
`summarise()` regrouping output by 'continent_name' (override with `.groups` argument)

