packages

Be sure to run

pacman::p_load(tidyverse, gapminder, palmerpenguins, janitor, gtsummary, visdat)

Recapitulation

EDA

dim(penguins)
[1] 344   8
summary(penguins)
visdat::vis_dat(penguins)

Data visualizations

Points

bill_length_mm vs bill_depth_mm

penguins %>% 
  ggplot(aes(x = bill_length_mm, 
             y = bill_depth_mm)) +
  geom_point() +
  aes(color = sex)
penguins %>% # my date
  drop_na() %>%  # i will dropn the NA values
  ggplot(aes(x = bill_length_mm, 
             y = bill_depth_mm)) +
  geom_point() + 
  aes(color = sex)
Facetting

One variable continuous

Make a histogram of body_mass_g

Two variables continuous

Explore correlations

Two variables continuous vs nominal

body_mass_g by species

Data wrangling

Filter

Asia and 1952

Select

Mutate

1 United States Dollar equals 0.84 Euro

Group

Summarise

gapminder %>% 
  mutate(gpd_in_euros = gdpPercap * 0.84) %>% 
  group_by(continent, year) %>% 
  summarise(meanGdpperCapEur = mean(gpd_in_euros))
`summarise()` regrouping output by 'continent' (override with `.groups` argument)
gapminder %>% 
  mutate(gpd_in_euros = gdpPercap * 0.84) %>% 
  group_by(continent, year) %>% 
  summarise(meanGdpperCapEur = mean(gpd_in_euros)) %>% 
  ggplot(aes(x = year, 
             y = meanGdpperCapEur, 
             color = continent)) + 
  geom_line() +  
  scale_y_log10()
`summarise()` regrouping output by 'continent' (override with `.groups` argument)

Characteristic Adelie, N = 1461 Chinstrap, N = 681 Gentoo, N = 1191
body_mass_g 3,700 (3,362, 4,000) 3,700 (3,488, 3,950) 5,050 (4,700, 5,500)
bill_length_mm 38.8 (36.7, 40.8) 49.5 (46.3, 51.1) 47.4 (45.3, 49.6)
island
Biscoe 44 (30%) 0 (0%) 119 (100%)
Dream 55 (38%) 68 (100%) 0 (0%)
Torgersen 47 (32%) 0 (0%) 0 (0%)

1 Statistics presented: Median (IQR); n (%)

Pivot

unicef <- read_csv("https://bit.ly/unicef-wide")

Longer

Wider

gapminder %>% 
  group_by(continent, year) %>% 
  summarise(LifeExpMean = mean(lifeExp))
`summarise()` regrouping output by 'continent' (override with `.groups` argument)
gapminder %>% 
  group_by(continent, year) %>% 
  summarise(LifeExpMean = mean(lifeExp)) %>% 
  pivot_wider(names_from = year, 
              values_from = LifeExpMean )
`summarise()` regrouping output by 'continent' (override with `.groups` argument)

Join dataset

countries_list <- read_csv("https://datahub.io/JohnSnowLabs/country-and-continent-codes-list/r/country-and-continent-codes-list-csv.csv")

── Column specification ─────────────────────────────────────────────────────────────────
cols(
  Continent_Name = col_character(),
  Continent_Code = col_character(),
  Country_Name = col_character(),
  Two_Letter_Country_Code = col_character(),
  Three_Letter_Country_Code = col_character(),
  Country_Number = col_double()
)
names(unicef)
 [1] "country_name"              "u5mr_1950"                 "u5mr_1951"                
 [4] "u5mr_1952"                 "u5mr_1953"                 "u5mr_1954"                
 [7] "u5mr_1955"                 "u5mr_1956"                 "u5mr_1957"                
[10] "u5mr_1958"                 "u5mr_1959"                 "u5mr_1960"                
[13] "u5mr_1961"                 "u5mr_1962"                 "u5mr_1963"                
[16] "u5mr_1964"                 "u5mr_1965"                 "u5mr_1966"                
[19] "u5mr_1967"                 "u5mr_1968"                 "u5mr_1969"                
[22] "u5mr_1970"                 "u5mr_1971"                 "u5mr_1972"                
[25] "u5mr_1973"                 "u5mr_1974"                 "u5mr_1975"                
[28] "u5mr_1976"                 "u5mr_1977"                 "u5mr_1978"                
[31] "u5mr_1979"                 "u5mr_1980"                 "u5mr_1981"                
[34] "u5mr_1982"                 "u5mr_1983"                 "u5mr_1984"                
[37] "u5mr_1985"                 "u5mr_1986"                 "u5mr_1987"                
[40] "u5mr_1988"                 "u5mr_1989"                 "u5mr_1990"                
[43] "u5mr_1991"                 "u5mr_1992"                 "u5mr_1993"                
[46] "u5mr_1994"                 "u5mr_1995"                 "u5mr_1996"                
[49] "u5mr_1997"                 "u5mr_1998"                 "u5mr_1999"                
[52] "u5mr_2000"                 "u5mr_2001"                 "u5mr_2002"                
[55] "u5mr_2003"                 "u5mr_2004"                 "u5mr_2005"                
[58] "u5mr_2006"                 "u5mr_2007"                 "u5mr_2008"                
[61] "u5mr_2009"                 "u5mr_2010"                 "u5mr_2011"                
[64] "u5mr_2012"                 "u5mr_2013"                 "u5mr_2014"                
[67] "u5mr_2015"                 "Code"                      "Continent_Name"           
[70] "Continent_Code"            "Country_Name"              "Three_Letter_Country_Code"
[73] "Country_Number"           

calculate the mortality rate by continent and year

What is the trend in the mortality rate by continent?

unicef %>% 
  # first reformat from wide to long
  pivot_longer(u5mr_1950:u5mr_2015, 
               names_to = "year", 
               values_to = "value") %>% 
  # select only relevant columns 
  select(continent_name, year, value) %>% 
  # separate the year column 
  separate(year, into = c("delete", "year"), 
           sep = "_") %>% 
  # now delete the delete column 
  select(-delete) %>% 
  # group and summarize
  group_by(continent_name, year) %>% 
  summarise(meanMrtRate5y = mean(value)) %>% 
  ggplot(aes(x = year, 
             y = meanMrtRate5y, 
             color = continent_name)) + 
  geom_point()
`summarise()` regrouping output by 'continent_name' (override with `.groups` argument)

---
title: "R Notebook"
output:
  html_notebook: default
  pdf_document: default
editor_options:
  chunk_output_type: inline
---

# packages


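Be sure to run the following before anything else; it installs (if needed) and loads every package used in this notebook:

```r
pacman::p_load(tidyverse,
               gapminder,
               palmerpenguins,
               janitor,
               gtsummary,
               visdat)
```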
               
               
```{r}
# Attach every package the notebook relies on (pacman installs missing ones).
pacman::p_load(
  tidyverse, gapminder, palmerpenguins,
  janitor, gtsummary, visdat
)
```


# Recapitulation

# EDA

```{r}
# First rows of the data
penguins %>% head()
```

```{r}
# Number of rows and columns
penguins %>% dim()
```
```{r}
# Summary of every column
penguins %>% summary()
```
```{r}
# Graphical overview of the data frame produced by visdat
penguins %>% visdat::vis_dat()
```

### Data visualizations

#### Points
bill_length_mm
vs 
bill_depth_mm

```{r}
# Scatter plot: bill length on x, bill depth on y
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point()
```

```{r}
# Same scatter plot, with the points coloured by sex
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)
) +
  geom_point()
```

```{r}
# Drop every row that contains an NA, then redraw the sex-coloured scatter plot
ggplot(
  data = drop_na(penguins),
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)
) +
  geom_point()
```

##### Facetting

```{r}
# Complete rows only; one panel per species
ggplot(
  data = drop_na(penguins),
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)
) +
  geom_point() +
  facet_wrap(~species)
```
```{r}
# Complete rows only; panel grid with islands as rows and species as columns
ggplot(
  data = drop_na(penguins),
  mapping = aes(x = bill_length_mm, y = bill_depth_mm, color = sex)
) +
  geom_point() +
  facet_grid(island ~ species)
```

### One variable continuous
Make a histogram of body_mass_g


```{r}
# Histogram of body mass (12 bins), one panel per species/sex combination
ggplot(drop_na(penguins), aes(x = body_mass_g)) +
  geom_histogram(bins = 12) +
  facet_grid(species ~ sex) +
  theme_minimal()
```
```{r}
# Extra theme packages. ggpubr is required by the ggpubr::theme_pubclean()
# call in the next chunk; p_load() installs any package that is missing
# before attaching it, so the notebook also runs on a fresh machine.
pacman::p_load(ggthemes, ggpubr)
```

```{r}
# Same faceted histogram as before, drawn with the ggpubr "pubclean" theme
ggplot(drop_na(penguins), aes(x = body_mass_g)) +
  geom_histogram(bins = 12) +
  facet_grid(species ~ sex) +
  ggpubr::theme_pubclean()
```


### Two variables continuous

```{r}
# Bill depth against body mass for complete rows, coloured by sex and
# split into a sex-by-species panel grid, with readable axis labels.
penguins %>%
  drop_na() %>%
  ggplot(aes(x = body_mass_g,
             y = bill_depth_mm)) +
  geom_point() +
  aes(color = sex) +
  facet_grid(sex ~ species) +
  theme_minimal() +
  labs(
    title = "Bill vs Weight",
    y = "Bill Depth mm", # label previously misspelled as "Bil"
    x = "Body mass g",
    color = "Sex"
  )
```
#### Explore correlations

```{r}
# Bill depth against body mass for complete rows, coloured by sex, with a
# smoothed trend line added per colour group by geom_smooth().
penguins %>%
  drop_na() %>%
  ggplot(aes(x = body_mass_g,
             y = bill_depth_mm)) +
  geom_point() +
  aes(color = sex) +
  theme_minimal() +
  labs(
    title = "Bill vs Weight",
    y = "Bill Depth mm", # label previously misspelled as "Bil"
    x = "Body mass g",
    color = "Sex"
  ) +
  geom_smooth()
```
### Two variables continuous vs nominal

body_mass_g by species

```{r}
# Box plots of body mass per species, coloured (and therefore split) by sex
ggplot(
  data = drop_na(penguins),
  mapping = aes(x = species, y = body_mass_g, color = sex)
) +
  geom_boxplot()
```
### Trends

```{r}
# Print the gapminder dataset
gapminder
```
```{r}
# Store the package's dataset under the name `gapminder` in the workspace
gapminder <- gapminder::gapminder
```

# Data wrangling

## Filter

```{r}
# Keep only the rows for Germany
filter(gapminder, country == "Germany")
```
```{r}
# Keep European countries in 2007 (comma-separated conditions are AND-ed)
filter(gapminder, continent == "Europe", year == 2007)
```

Asia and 1952
```{r}
# Keep Asian countries in 1952. The year is compared as a number, like in the
# other filter chunks, instead of relying on implicit coercion to "1952".
gapminder %>%
  filter(continent == "Asia" & year == 1952)
```
```{r}
# Keep rows whose continent and year both belong to the given sets
filter(
  gapminder,
  continent %in% c("Asia", "Europe"),
  year %in% c(1952, 2002)
)
```


```{r}
# Countries of interest, used to filter gapminder further down
selected_countries <- c(
  "Bahrain",
  "Albania"
)
```



## Select
```{r}
# Keep only the continent and life-expectancy columns
select(gapminder, continent, lifeExp)
```
```{r}
# Keep the rows of the countries listed in `selected_countries`
filter(gapminder, country %in% selected_countries)
```



## Mutate

1 United States Dollar equals 0.84 Euro

```{r}
# Add GDP per capita converted with the 0.84 rate quoted above
mutate(gapminder, gpd_in_euros = gdpPercap * 0.84)
```



## Group

```{r}
# Group the rows by continent and year (the data themselves are unchanged)
group_by(gapminder, continent, year)
```


## Summarise
```{r}
# Mean converted GDP per capita for every continent/year pair
mutate(gapminder, gpd_in_euros = gdpPercap * 0.84) %>%
  group_by(continent, year) %>%
  summarise(meanGdpperCapEur = mean(gpd_in_euros))
```
```{r}
# Same summary, drawn as one line per continent on a log10 y axis
mutate(gapminder, gpd_in_euros = gdpPercap * 0.84) %>%
  group_by(continent, year) %>%
  summarise(meanGdpperCapEur = mean(gpd_in_euros)) %>%
  ggplot(
    mapping = aes(x = year, y = meanGdpperCapEur, color = continent)
  ) +
  geom_line() +
  scale_y_log10()
 
```

```{r}
# Summary table of body mass, bill length and island, one column per species
select(drop_na(penguins), body_mass_g, bill_length_mm, species, island) %>%
  gtsummary::tbl_summary(by = species) %>%
  gtsummary::bold_labels()
```






## Pivot



```{r}
# Download the UNICEF table used for the pivoting examples
unicef <- "https://bit.ly/unicef-wide" %>% read_csv()
```


```{r}
# Peek at the first rows
unicef %>% head()
```

```{r}
# Normalise the column names with janitor
unicef <- janitor::clean_names(unicef)
```



### Longer

```{r}
# Preview: stack the u5mr_1950 ... u5mr_2015 columns into year/value pairs
pivot_longer(
  unicef,
  u5mr_1950:u5mr_2015,
  names_to = "year",
  values_to = "value"
)
```
```{r}
# Same reshape, stored for the following steps
unicef_long <- pivot_longer(
  unicef,
  u5mr_1950:u5mr_2015,
  names_to = "year",
  values_to = "value"
)
```

```{r}
unicef_long
```

```{r}
# Split the year column at "_" into a throw-away prefix and the year itself,
# then discard the prefix
unicef_long <- select(
  separate(unicef_long, year, into = c("etc", "year"), sep = "_"),
  -etc
)
```


```{r}
unicef_long %>% head()
```

### Wider
```{r}
# Mean life expectancy per continent and year (long layout)
group_by(gapminder, continent, year) %>%
  summarise(LifeExpMean = mean(lifeExp))
```
```{r}
# Same summary spread out so that every year becomes its own column
group_by(gapminder, continent, year) %>%
  summarise(LifeExpMean = mean(lifeExp)) %>%
  pivot_wider(
    names_from = year,
    values_from = LifeExpMean
  )
```

## Join dataset

```{r}
unicef
```


```{r}
# Lookup table that is joined to unicef by country name further down
countries_short <- "https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv" %>%
  read_csv()

# Lookup table with continent and country codes
countries_list <- "https://datahub.io/JohnSnowLabs/country-and-continent-codes-list/r/country-and-continent-codes-list-csv.csv" %>%
  read_csv()
```
```{r}
countries_short %>% head()
```


```{r}
countries_list
```



```{r}
unicef
```

```{r}
# Preview: match unicef country names against the `Name` column
unicef %>%
  left_join(countries_short, by = c("country_name" = "Name"))
```
```{r}
# Keep the result of that join
unicef <- unicef %>%
  left_join(countries_short, by = c("country_name" = "Name"))
```

```{r}
# Preview: add the continent information via the two-letter country code
unicef %>%
  left_join(countries_list, by = c("Code" = "Two_Letter_Country_Code"))
```

```{r}
# Keep the result of that join
unicef <- unicef %>%
  left_join(countries_list, by = c("Code" = "Two_Letter_Country_Code"))
```


```{r}
unicef %>% names()
```
Calculate the mean mortality rate by continent and year.

```{r}
# Print the current state of the unicef table
unicef
```


What is the trend in the mortality rate by continent?

```{r}
# Mean under-five mortality value per continent and year, drawn as points.
unicef %>%
  # The joined lookup columns keep their original capitalisation
  # (e.g. `Continent_Name`), so normalise the names again to be able to refer
  # to `continent_name`. This is a no-op for names that are already clean.
  janitor::clean_names() %>%
  # first reformat from wide to long
  pivot_longer(u5mr_1950:u5mr_2015,
               names_to = "year",
               values_to = "value") %>%
  # select only relevant columns
  select(continent_name, year, value) %>%
  # split "u5mr_<year>" at the underscore; convert = TRUE turns the year part
  # into a number so the x axis is continuous instead of one label per year
  separate(year, into = c("delete", "year"),
           sep = "_", convert = TRUE) %>%
  # now delete the delete column
  select(-delete) %>%
  # group and summarize; na.rm = TRUE keeps a single missing value from
  # turning a whole continent/year mean into NA, and .groups = "drop" returns
  # an ungrouped result without the regrouping message
  group_by(continent_name, year) %>%
  summarise(meanMrtRate5y = mean(value, na.rm = TRUE),
            .groups = "drop") %>%
  ggplot(aes(x = year,
             y = meanMrtRate5y,
             color = continent_name)) +
  geom_point()
```































