library(tidyverse)
library(gapminder)
Tidyverse Functions
Preliminaries
Load packages
Check: Are the packages already installed? If not, first run
install.packages()
Make a copy of gapminder
## Copy the gapminder data into df1; later steps work on df1
df1 <- gapminder
Inspect the data
## First and last rows of the data
df1 %>% head()
df1 %>% tail()

## Number of rows and columns
df1 %>% dim()

## Column types with a preview of values, then per-column summaries
df1 %>% glimpse()
df1 %>% summary()
Filter function
Filter one country
Select one country: China
## Create a new data frame for China:
## keep only the rows of df1 whose country is "China"
df_China <- df1 %>%
  filter(country == "China")
## China's population by year, drawn as points joined by a line
df_China %>%
  ggplot(mapping = aes(x = year, y = pop)) +
  geom_point() +
  geom_line() +
  # x = "" leaves the x-axis label empty
  labs(
    x = "",
    y = "Population",
    title = "China's population over time",
    caption = "Source: World Bank via gapminder"
  )
Filter more than one country
Two countries: China, India
## Create a new data frame with the rows for China and India;
## %in% matches country against each name in the vector
df2 <- df1 %>%
  filter(country %in% c("China", "India"))
## Population over time, one coloured line per country in df2
df2 %>%
  ggplot(mapping = aes(x = year, y = pop, color = country)) +
  geom_line() +
  labs(
    x = "",
    y = "Population",
    title = "Population in China and India, 1952-2007",
    subtitle = "Wow, look at the increase over the period!",
    caption = "Source: World Bank via gapminder"
  )
Mutate function
Create a new variable
## Create a new variable called log_gdp_per_capita
## (the natural log of gdpPercap); the result is stored in df3
df3 <- df1 %>%
  mutate(log_gdp_per_capita = log(gdpPercap))

## Print df3 to check the new column
df3
# A tibble: 1,704 × 7
country continent year lifeExp pop gdpPercap log_gdp_per_capita
<fct> <fct> <int> <dbl> <int> <dbl> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779. 6.66
2 Afghanistan Asia 1957 30.3 9240934 821. 6.71
3 Afghanistan Asia 1962 32.0 10267083 853. 6.75
4 Afghanistan Asia 1967 34.0 11537966 836. 6.73
5 Afghanistan Asia 1972 36.1 13079460 740. 6.61
6 Afghanistan Asia 1977 38.4 14880372 786. 6.67
7 Afghanistan Asia 1982 39.9 12881816 978. 6.89
8 Afghanistan Asia 1987 40.8 13867957 852. 6.75
9 Afghanistan Asia 1992 41.7 16317921 649. 6.48
10 Afghanistan Asia 1997 41.8 22227415 635. 6.45
# ℹ 1,694 more rows
Add regression lines to a scatter plot
## Scatter plot: life expectancy against log GDP per capita, red points
df3 %>%
  ggplot(mapping = aes(x = log_gdp_per_capita, y = lifeExp)) +
  geom_point(color = "red")
## Same scatter plot with one straight regression line through all points
df3 %>%
  ggplot(mapping = aes(x = log_gdp_per_capita, y = lifeExp)) +
  geom_point(color = "red") +
  # method = "lm" fits a linear model; se = FALSE hides the confidence band
  geom_smooth(method = "lm", se = FALSE)
## Colour the points by continent and fit a separate regression line
## for each continent (color is mapped inside aes)
df3 %>%
  ggplot(
    mapping = aes(x = log_gdp_per_capita, y = lifeExp, color = continent)
  ) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
Create plots using data for one year
## Keep only the 2007 observations from df3
df4 <- df3 %>%
  filter(year == 2007)
## Life expectancy vs log GDP per capita using df4, with a fitted straight line
ggplot(
  data = df4,
  mapping = aes(x = log_gdp_per_capita, y = lifeExp)
) +
  geom_point() +
  # Pass the method as the string "lm", as the other plots in this file do
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Life expectancy vs Income in 2007")
Exercise 1
Create a new variable called gdp, where gdp is the product of GDP per capita and population. Plot gdp over time for three countries, with a different color for each country. Provide suitable labels.
Exercise 2
Create a scatterplot of life expectancy vs gdp for all countries in 2002. Add a regression line through the data points. Provide suitable labels.
Same plot as before, but indicate each continent by a different color. Add a regression line for each continent. Provide suitable labels.
ba260-tidyverse-commands-R.qmd