compare gapminder wide vs long
# Read the wide-format gapminder CSV into gap_wide.
# The call is namespace-qualified because this line runs before the
# packages are attached further down.
gap_wide <- readr::read_csv("https://bit.ly/gapminder-rsu")
Load the packages
# Load the packages for this session through pacman.
pacman::p_load(
  tidyverse,       # several data-science packages
  palmerpenguins,  # data
  gapminder,       # data
  gtsummary        # tables
)
# Import the wide-format gapminder table from its CSV URL.
gap_wide <- read_csv(file = "https://bit.ly/gapminder-rsu")
## Rows: 142 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): continent, country
## dbl (36): gdpPercap_1952, gdpPercap_1957, gdpPercap_1962, gdpPercap_1967, gd...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Filter by species == Adelie
# Keep only the rows whose species equals "Adelie".
penguins %>%
  filter(species == "Adelie")
Filter by two conditions Adelie & island == Dream
Filter body_mass_g < 3000
# Two conditions joined with `&`: Adelie penguins whose island is
# anything other than Torgersen.
penguins %>%
  filter(
    species == "Adelie" &
      island != "Torgersen"
  )
You can use other operators beyond just the ==
operator that tests for equality:
>
corresponds to “greater than”
<
corresponds to “less than”
>=
corresponds to “greater than or equal to”
<=
corresponds to “less than or equal to”
!=
corresponds to “not equal to.” The !
is used in many programming languages to indicate “not.”
&
is “and”, |
is “or”, and !
is “not”.
TASK: Adelie but not from Torgersen
TASK: Adelie or Gentoo
|
# `|` keeps a row when either comparison is TRUE.
penguins %>%
  filter(
    species == "Adelie" |
      species == "Gentoo"
  )
Another way, using %in%
# Same selection expressed as set membership with %in%.
penguins %>%
  filter(species %in% c("Adelie", "Gentoo"))
TASK
Filter all penguins from Torgersen weighing more than 3500g
Create new variables
bill_ratio = bill_length_mm / bill_depth_mm
# Add bill_ratio: bill length divided by bill depth (both in mm).
penguins %>%
  mutate(
    bill_ratio = bill_length_mm / bill_depth_mm
  )
Relocate after island column
# Compute bill_ratio, then move the new column so that it sits
# immediately after `island`.
penguins %>%
  mutate(
    bill_ratio = bill_length_mm / bill_depth_mm
  ) %>%
  relocate(
    bill_ratio,
    .after = island
  )
TASK
# Add body_mass_k, the base-10 logarithm of body_mass_g.
penguins %>%
  mutate(
    body_mass_k = log10(body_mass_g)
  )
Calculate the mean body_mass_g by species
# Summary statistics of body_mass_g for each species.
penguins %>%
  group_by(species) %>%
  # Drop only the rows where body_mass_g itself is missing. A bare drop_na()
  # would also discard rows that have an NA in any other column, which
  # changes n and the statistics for a variable that is actually observed.
  drop_na(body_mass_g) %>%
  # One row per species: count, mean, standard deviation, maximum, minimum.
  summarise(
    n = n(),
    mean = mean(body_mass_g),
    sd = sd(body_mass_g),
    max = max(body_mass_g),
    min = min(body_mass_g)
  ) %>%
  # Round every numeric summary column to one decimal place.
  mutate(across(where(is.numeric), ~ round(., 1)))
Dealing with NAs
na.rm = TRUE
drop_na()
# Median and standard deviation of flipper_length_mm for each species.
penguins %>%
  group_by(species) %>%
  # Drop only the rows where flipper_length_mm is missing. A bare drop_na()
  # would also remove rows with an NA in any unrelated column.
  drop_na(flipper_length_mm) %>%
  summarise(
    median = median(flipper_length_mm),
    sd = sd(flipper_length_mm)
  ) %>%
  # Round every numeric summary column to one decimal place.
  mutate(across(where(is.numeric), ~ round(., 1)))
# Read the wide gapminder CSV into gap_wide for the reshaping example.
gap_wide <- read_csv(file = "https://bit.ly/gapminder-rsu")
## Rows: 142 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): continent, country
## dbl (36): gdpPercap_1952, gdpPercap_1957, gdpPercap_1962, gdpPercap_1967, gd...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
How many variables?
Filter China
How to plot ?
Wide data is for humans
Long data is for computers
gdp
# GDP per capita of China over time: reshape the wide table to long form,
# then draw a line through the yearly values.
gap_wide %>%
  filter(country == "China") %>%
  # Select by name rather than by position so the result does not depend
  # on the column order of the CSV.
  select(continent, country, starts_with("gdpPercap_")) %>%
  pivot_longer(
    cols = starts_with("gdpPercap_"),
    names_to = "gdp_year",
    # Strip the "gdpPercap_" prefix and store the year as an integer, so the
    # x axis is a numeric year scale instead of discrete "gdpPercap_YYYY"
    # labels.
    names_prefix = "gdpPercap_",
    names_transform = list(gdp_year = as.integer),
    values_to = "gdp_value"
  ) %>%
  ggplot(aes(
    x = gdp_year,
    y = gdp_value,
    group = country
  )) +
  geom_line()
try to plot
Filter only China
Connect the points
Filter Europe and connect the points
Now, we will use the long gapminder dataset
# For the year 2007: per-continent means of GDP per capita, population and
# life expectancy, plus the number of rows in each continent.
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    mean_pop = mean(pop),
    mean_lifeexp = mean(lifeExp),
    n = n()
  )
# Mean GDP per capita for every continent and year, drawn as one line per
# continent on a log10 y axis.
gapminder %>%
  group_by(continent, year) %>%
  # .groups = "drop" returns an ungrouped result and avoids the
  # "grouped output by 'continent'" message that summarise() otherwise prints.
  summarise(gdp_mean = mean(gdpPercap), .groups = "drop") %>%
  ggplot(aes(
    x = year,
    y = gdp_mean,
    color = continent
  )) +
  geom_line() +
  scale_y_log10()
Hint: Try log10
Homework: See <https://github.com/owid/covid-19-data/tree/master/public/data/vaccinations>
# Read the vaccinations CSV from the owid/covid-19-data repository.
covid_vac <- read_csv(
  file = "https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/vaccinations.csv"
)
## Rows: 63861 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): location, iso_code
## dbl (13): total_vaccinations, people_vaccinated, people_fully_vaccinated, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Scatter plot of total_vaccinations against date for the rows whose
# location is "Latvia".
covid_vac %>%
  filter(location == "Latvia") %>%
  ggplot(aes(
    x = date,
    y = total_vaccinations
  )) +
  geom_point()
## Warning: Removed 10 rows containing missing values (geom_point).