Loading the gapminder and dplyr packages

#install.packages("tidyverse")
#install.packages("gapminder")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gapminder)

Load the gapminder data and Taking a glance at gapminder data

gapminder
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows

Data Wrangling

Filtering with filter() function

filter() function in R works as WHERE keyword in SQL.

Filter the gapminder dataset for the year 1957

gapminder %>%
  filter(year == 1957)
## # A tibble: 142 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1957    30.3  9240934      821.
##  2 Albania     Europe     1957    59.3  1476505     1942.
##  3 Algeria     Africa     1957    45.7 10270856     3014.
##  4 Angola      Africa     1957    32.0  4561361     3828.
##  5 Argentina   Americas   1957    64.4 19610538     6857.
##  6 Australia   Oceania    1957    70.3  9712569    10950.
##  7 Austria     Europe     1957    67.5  6965860     8843.
##  8 Bahrain     Asia       1957    53.8   138655    11636.
##  9 Bangladesh  Asia       1957    39.3 51365468      662.
## 10 Belgium     Europe     1957    69.2  8989111     9715.
## # ... with 132 more rows

Filter for China in 2002

gapminder %>%
  filter(country == "China", year == 2002)
## # A tibble: 1 x 6
##   country continent  year lifeExp        pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>      <int>     <dbl>
## 1 China   Asia       2002    72.0 1280400000     3119.

Sorting with arrange() function

arrange() function is like ORDER BY keyword in SQL. However, SQL needs ASC OR DESC to specify the order ascending and descending, where R just needs to specify desc() if it wants to output in descending order otherwise the the records on the variable/column is in ascending order(smallest to biggest) by default.

Sort in ascending order of lifeExp

gapminder %>%
  arrange(lifeExp) # since default order is ascending, so no need to specify
## # A tibble: 1,704 x 6
##    country      continent  year lifeExp     pop gdpPercap
##    <fct>        <fct>     <int>   <dbl>   <int>     <dbl>
##  1 Rwanda       Africa     1992    23.6 7290203      737.
##  2 Afghanistan  Asia       1952    28.8 8425333      779.
##  3 Gambia       Africa     1952    30    284320      485.
##  4 Angola       Africa     1952    30.0 4232095     3521.
##  5 Sierra Leone Africa     1952    30.3 2143249      880.
##  6 Afghanistan  Asia       1957    30.3 9240934      821.
##  7 Cambodia     Asia       1977    31.2 6978607      525.
##  8 Mozambique   Africa     1952    31.3 6446316      469.
##  9 Sierra Leone Africa     1957    31.6 2295678     1004.
## 10 Burkina Faso Africa     1952    32.0 4469979      543.
## # ... with 1,694 more rows

Sort in descending order of lifeExp

gapminder %>%
  arrange(desc(lifeExp))
## # A tibble: 1,704 x 6
##    country          continent  year lifeExp       pop gdpPercap
##    <fct>            <fct>     <int>   <dbl>     <int>     <dbl>
##  1 Japan            Asia       2007    82.6 127467972    31656.
##  2 Hong Kong, China Asia       2007    82.2   6980412    39725.
##  3 Japan            Asia       2002    82   127065841    28605.
##  4 Iceland          Europe     2007    81.8    301931    36181.
##  5 Switzerland      Europe     2007    81.7   7554661    37506.
##  6 Hong Kong, China Asia       2002    81.5   6762476    30209.
##  7 Australia        Oceania    2007    81.2  20434176    34435.
##  8 Spain            Europe     2007    80.9  40448191    28821.
##  9 Sweden           Europe     2007    80.9   9031088    33860.
## 10 Israel           Asia       2007    80.7   6426679    25523.
## # ... with 1,694 more rows

Filtering and arranging

We’ll often need to use the pipe operator (%>%) to combine multiple dplyr verbs in a row

Filter for the year 1957, then arrange in descending order of population

gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))
## # A tibble: 142 x 6
##    country        continent  year lifeExp       pop gdpPercap
##    <fct>          <fct>     <int>   <dbl>     <int>     <dbl>
##  1 China          Asia       1957    50.5 637408000      576.
##  2 India          Asia       1957    40.2 409000000      590.
##  3 United States  Americas   1957    69.5 171984000    14847.
##  4 Japan          Asia       1957    65.5  91563009     4318.
##  5 Indonesia      Asia       1957    39.9  90124000      859.
##  6 Germany        Europe     1957    69.1  71019069    10188.
##  7 Brazil         Americas   1957    53.3  65551171     2487.
##  8 United Kingdom Europe     1957    70.4  51430000    11283.
##  9 Bangladesh     Asia       1957    39.3  51365468      662.
## 10 Italy          Europe     1957    67.8  49182000     6249.
## # ... with 132 more rows

The mutate() function

Use mutate to change a variable-its like calculated field in Tableau and calculated column/transform column in Power BI. it can either replace the existing column or add new one

Use mutate to change lifeExp to be in months

gapminder %>%
  mutate(lifeExp = lifeExp * 12) # to change the existing lifeExp column, by multiplying it by 12
## # A tibble: 1,704 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    346.  8425333      779.
##  2 Afghanistan Asia       1957    364.  9240934      821.
##  3 Afghanistan Asia       1962    384. 10267083      853.
##  4 Afghanistan Asia       1967    408. 11537966      836.
##  5 Afghanistan Asia       1972    433. 13079460      740.
##  6 Afghanistan Asia       1977    461. 14880372      786.
##  7 Afghanistan Asia       1982    478. 12881816      978.
##  8 Afghanistan Asia       1987    490. 13867957      852.
##  9 Afghanistan Asia       1992    500. 16317921      649.
## 10 Afghanistan Asia       1997    501. 22227415      635.
## # ... with 1,694 more rows

Use mutate to create a new column called lifeExpMonths

gapminder %>%
  mutate(lifeExpMonths = lifeExp * 12) # to add a new column, called lifeExpMonths
## # A tibble: 1,704 x 7
##    country     continent  year lifeExp      pop gdpPercap lifeExpMonths
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>         <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.          346.
##  2 Afghanistan Asia       1957    30.3  9240934      821.          364.
##  3 Afghanistan Asia       1962    32.0 10267083      853.          384.
##  4 Afghanistan Asia       1967    34.0 11537966      836.          408.
##  5 Afghanistan Asia       1972    36.1 13079460      740.          433.
##  6 Afghanistan Asia       1977    38.4 14880372      786.          461.
##  7 Afghanistan Asia       1982    39.9 12881816      978.          478.
##  8 Afghanistan Asia       1987    40.8 13867957      852.          490.
##  9 Afghanistan Asia       1992    41.7 16317921      649.          500.
## 10 Afghanistan Asia       1997    41.8 22227415      635.          501.
## # ... with 1,694 more rows

Combining filter, mutate, and arrange

gapminder %>%
  filter(year == 2007) %>%
  mutate(lifeExpMonths = lifeExp * 12) %>%
    arrange(desc(lifeExpMonths))
## # A tibble: 142 x 7
##    country          continent  year lifeExp       pop gdpPercap lifeExpMonths
##    <fct>            <fct>     <int>   <dbl>     <int>     <dbl>         <dbl>
##  1 Japan            Asia       2007    82.6 127467972    31656.          991.
##  2 Hong Kong, China Asia       2007    82.2   6980412    39725.          986.
##  3 Iceland          Europe     2007    81.8    301931    36181.          981.
##  4 Switzerland      Europe     2007    81.7   7554661    37506.          980.
##  5 Australia        Oceania    2007    81.2  20434176    34435.          975.
##  6 Spain            Europe     2007    80.9  40448191    28821.          971.
##  7 Sweden           Europe     2007    80.9   9031088    33860.          971.
##  8 Israel           Asia       2007    80.7   6426679    25523.          969.
##  9 France           Europe     2007    80.7  61083916    30470.          968.
## 10 Canada           Americas   2007    80.7  33390141    36319.          968.
## # ... with 132 more rows

Visualising with ggplot2

Loading ggplot2 package

library(ggplot2)

Filter gapminder for observations from the year 1952, and assign it to a new dataset gapminder_1952

gapminder_1952 <- gapminder %>%
  filter(year == 1952)

Change to put pop on the x-axis and gdpPercap on the y-axis

ggplot(gapminder_1952, aes(x = gdpPercap, y = lifeExp)) + # specifying x axis and y axises in aesthetic dimension
  geom_point() # gem_point means it should be a scatterplot with points 

Change the scatter plot of gapminder_1952 so that (pop) is on the x-axis and GDP per capita (gdpPercap) is on the y-axis.

ggplot(gapminder_1952, aes(x = pop, y = gdpPercap)) +
  geom_point()

Create a scatter plot of gapminder_1952 with population (pop) is on the x-axis and life expectancy (lifeExp) on the y-axis.

ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point()

Log scale in ggplot

Putting x-axis on a log scale

ggplot(gapminder_1952, aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

Putting the x- and y- axes on a log scale

ggplot(gapminder_1952, aes(x=gdpPercap, y=pop)) +
geom_point() +
scale_x_log10() +
scale_y_log10()

Aditional Aesthetics in ggplot

Adding color to a scatter plot, Scatter plot comparing pop and lifeExp, with color representing continent

ggplot(gapminder_1952, aes(x=pop, y=lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10()

Adding size and color to a plot-Modify the scatter plot so that the size of the points represents each country’s GDP per capita (gdpPercap).

ggplot(gapminder_1952, aes(x = pop, y = lifeExp, color = continent, size = gdpPercap)) +
  geom_point() +
  scale_x_log10()

Faceting

~ symbol in R represents “by”

Create a scatter plot of gapminder_1952 with the x-axis representing population (pop), the y-axis representing life expectancy (lifeExp), and faceted to have one subplot per continent (continent). Put the x-axis on a log scale.

ggplot(gapminder_1952, aes(x=pop, y=lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~continent)

Create a scatter plot of the gapminder, then Put GDP per capita (gdpPercap) on the x-axis and life expectancy (lifeExp) on the y-axis, with continent (continent) represented by color and population (pop) represented by size.. After that Put the x-axis on a log scale, and Facet by the year variable

ggplot(gapminder, aes(x=gdpPercap, y=lifeExp, color=continent, size=pop)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~year)

Grouping and Summarising

Summarising the data

Use the median() function within a summarize() to find the median life expectancy. Save it into a column called medianLifeExp.

gapminder %>%
  summarize(medianLifeExp = median(lifeExp)) # Here, medianLifeExp works like AS Alias in SQL
## # A tibble: 1 x 1
##   medianLifeExp
##           <dbl>
## 1          60.7

Filter for the year 1957, then use the median() function within a summarize() to calculate the median life expectancy into a column called medianLifeExp.

gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp))
## # A tibble: 1 x 1
##   medianLifeExp
##           <dbl>
## 1          48.4

Find both the median life expectancy (lifeExp) and the maximum GDP per capita (gdpPercap) in the year 1957, calling them medianLifeExp and maxGdpPercap respectively. You can use the max() function to find the maximum.

gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## # A tibble: 1 x 2
##   medianLifeExp maxGdpPercap
##           <dbl>        <dbl>
## 1          48.4      113523.

Group by

Find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each year, saving them into medianLifeExp and maxGdpPercap, respectively.

gapminder %>%
  group_by(year) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 3
##     year medianLifeExp maxGdpPercap
##    <int>         <dbl>        <dbl>
##  1  1952          45.1      108382.
##  2  1957          48.4      113523.
##  3  1962          50.9       95458.
##  4  1967          53.8       80895.
##  5  1972          56.5      109348.
##  6  1977          59.7       59265.
##  7  1982          62.4       33693.
##  8  1987          65.8       31541.
##  9  1992          67.7       34933.
## 10  1997          69.4       41283.
## 11  2002          70.8       44684.
## 12  2007          71.9       49357.

Filter the gapminder data for the year 1957. Then find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each continent, saving them into medianLifeExp and maxGdpPercap, respectively.

gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 3
##   continent medianLifeExp maxGdpPercap
##   <fct>             <dbl>        <dbl>
## 1 Africa             40.6        5487.
## 2 Americas           56.1       14847.
## 3 Asia               48.3      113523.
## 4 Europe             67.6       17909.
## 5 Oceania            70.3       12247.

Find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each combination of continent and year, saving them into medianLifeExp and maxGdpPercap, respectively.

gapminder %>%
  group_by(continent, year) %>%
  summarize(medianLifeExp = median(lifeExp), maxGdpPercap = max(gdpPercap))
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
## # A tibble: 60 x 4
## # Groups:   continent [5]
##    continent  year medianLifeExp maxGdpPercap
##    <fct>     <int>         <dbl>        <dbl>
##  1 Africa     1952          38.8        4725.
##  2 Africa     1957          40.6        5487.
##  3 Africa     1962          42.6        6757.
##  4 Africa     1967          44.7       18773.
##  5 Africa     1972          47.0       21011.
##  6 Africa     1977          49.3       21951.
##  7 Africa     1982          50.8       17364.
##  8 Africa     1987          51.6       11864.
##  9 Africa     1992          52.4       13522.
## 10 Africa     1997          52.8       14723.
## # ... with 50 more rows

Visualising Summarised Data

Use the by_year dataset to create a scatter plot showing the change of median life expectancy over time, with year on the x-axis and medianLifeExp on the y-axis. Be sure to add expand_limits(y = 0) to make sure the plot’s y-axis includes zero.

by_year <- gapminder %>%
  group_by(year) %>%
  summarize(medianLifeExp = median(lifeExp),
            maxGdpPercap = max(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
# Create a scatter plot showing the change in medianLifeExp over time
ggplot(by_year, aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y=0)

Summarize the gapminder dataset by continent and year, finding the median GDP per capita (gdpPercap) within each and putting it into a column called medianGdpPercap. Use the assignment operator <- to save this summarized data as by_year_continent. Create a scatter plot showing the change in medianGdpPercap by continent over time. Use color to distinguish between continents, and be sure to add expand_limits(y = 0) so that the y-axis starts at zero.

# Summarize medianGdpPercap within each continent within each year: by_year_continent
by_year_continent <- gapminder %>%
  group_by(continent, year) %>%
  summarize(medianGdpPercap = median(gdpPercap)) 
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
# Plot the change in medianGdpPercap in each continent over time
ggplot(by_year_continent, aes(x = year, y = medianGdpPercap, color = continent)) +
  geom_point() +
  expand_limits(y = 0) 

Filter the gapminder dataset for the year 2007, then summarize the median GDP per capita and the median life expectancy within each continent, into columns called medianLifeExp and medianGdpPercap. Save this as by_continent_2007. Use the by_continent_2007 data to create a scatterplot comparing these summary statistics for continents in 2007, putting the median GDP per capita on the x-axis to the median life expectancy on the y-axis. Color the scatter plot by continent. You don’t need to add expand_limits(y = 0) for this plot.

# Summarize the median GDP and median life expectancy per continent in 2007
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(medianLifeExp = median(lifeExp), medianGdpPercap = median(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
# Use a scatter plot to compare the median GDP and median life expectancy
ggplot(by_continent_2007, aes(x =medianGdpPercap, y = medianLifeExp, color = continent)) +
  geom_point()

Line Plots

Visualising median GDP per capita over time

# Summarize the median gdpPercap by year, then save it as by_year
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
# Create a line plot showing the change in medianGdpPercap over time
ggplot(by_year, aes(x=year, y=medianGdpPercap)) +
  geom_line() +
  expand_limits(y = 0)

Visualising median GDP per capita by continent over time

# Summarize the median gdpPercap by year & continent, save as by_year_continent
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(mediangdpPercap = median(gdpPercap))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Create a line plot showing the change in medianGdpPercap by continent over time with color
ggplot(by_year_continent, aes(x=year, y=mediangdpPercap, color = continent)) +
  geom_line() +
  expand_limits(y = 0) +
  facet_wrap(~continent)

Bar Plots

Visualizing median GDP per capita by continent Summarize the median gdpPercap by continent in 1952, then, create a bar plot showing medianGdp by continent

by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(by_continent, aes(x=continent, y=medianGdpPercap)) +
  geom_col()

Visualizing GDP per capita by country in Oceania Filter for observatjions in the Oceania continent in 1952, and create a bar plot of gdpPercap by country

oceania_1952 <- gapminder %>%
  filter(year == 1952, continent == "Oceania")
ggplot(oceania_1952, aes(x=country, y=gdpPercap)) +
  geom_col()

Histogram

Visualising population Create a histogram of population

gapminder_1952 <- gapminder %>%
  filter(year == 1952) %>%
  mutate(pop_by_mil = pop / 1000000) # Population by million
ggplot(gapminder_1952, aes(x=pop_by_mil)) +
  geom_histogram(binwidth = 50)

Visualizing population with x-axis on a log scale Create a histogram of population with x on a log scale

gapminder_1952 <- gapminder %>%
  filter(year == 1952)
ggplot(gapminder_1952, aes(x=pop)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Box Plots

Comparing GDP per capita across continents Create a boxplot comparing gdpPercap among continents

gapminder_1952 <- gapminder %>%
  filter(year == 1952)
ggplot(gapminder_1952, aes(x=continent, y=gdpPercap)) +
  geom_boxplot() +
  scale_y_log10()

Adding a title to the graph Add a title to this graph

gapminder_1952 <- gapminder %>%
  filter(year == 1952)
ggplot(gapminder_1952, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  ggtitle("Comparing GDP per capita across continents") +
  scale_y_log10()