Loading the tidyverse (which attaches dplyr and ggplot2) and gapminder packages
#install.packages("tidyverse")
#install.packages("gapminder")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gapminder)
Load the gapminder data and take a glance at it
# Printing the tibble shows its dimensions, column types, and first 10 rows
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ... with 1,694 more rows
The filter() function in dplyr works like the WHERE keyword in SQL.
Filter the gapminder dataset for the year 1957
# Keep only the rows whose year is 1957
gapminder %>%
  filter(year == 1957)
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1957 30.3 9240934 821.
## 2 Albania Europe 1957 59.3 1476505 1942.
## 3 Algeria Africa 1957 45.7 10270856 3014.
## 4 Angola Africa 1957 32.0 4561361 3828.
## 5 Argentina Americas 1957 64.4 19610538 6857.
## 6 Australia Oceania 1957 70.3 9712569 10950.
## 7 Austria Europe 1957 67.5 6965860 8843.
## 8 Bahrain Asia 1957 53.8 138655 11636.
## 9 Bangladesh Asia 1957 39.3 51365468 662.
## 10 Belgium Europe 1957 69.2 8989111 9715.
## # ... with 132 more rows
Filter for China in 2002
# Comma-separated conditions in filter() must all hold for a row to be kept
gapminder %>%
  filter(
    country == "China",
    year == 2002
  )
## # A tibble: 1 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 China Asia 2002 72.0 1280400000 3119.
The arrange() function is like the ORDER BY keyword in SQL. However, SQL uses ASC or DESC to specify ascending or descending order, whereas in R you only need to wrap the column in desc() to sort in descending order; otherwise the records are sorted on that variable/column in ascending order (smallest to biggest) by default.
Sort in ascending order of lifeExp
# Ascending order is arrange()'s default, so no direction needs to be given
gapminder %>%
  arrange(lifeExp)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Rwanda Africa 1992 23.6 7290203 737.
## 2 Afghanistan Asia 1952 28.8 8425333 779.
## 3 Gambia Africa 1952 30 284320 485.
## 4 Angola Africa 1952 30.0 4232095 3521.
## 5 Sierra Leone Africa 1952 30.3 2143249 880.
## 6 Afghanistan Asia 1957 30.3 9240934 821.
## 7 Cambodia Asia 1977 31.2 6978607 525.
## 8 Mozambique Africa 1952 31.3 6446316 469.
## 9 Sierra Leone Africa 1957 31.6 2295678 1004.
## 10 Burkina Faso Africa 1952 32.0 4469979 543.
## # ... with 1,694 more rows
Sort in descending order of lifeExp
# Wrap the column in desc() to sort from largest to smallest
gapminder %>%
  arrange(desc(lifeExp))
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Japan Asia 2007 82.6 127467972 31656.
## 2 Hong Kong, China Asia 2007 82.2 6980412 39725.
## 3 Japan Asia 2002 82 127065841 28605.
## 4 Iceland Europe 2007 81.8 301931 36181.
## 5 Switzerland Europe 2007 81.7 7554661 37506.
## 6 Hong Kong, China Asia 2002 81.5 6762476 30209.
## 7 Australia Oceania 2007 81.2 20434176 34435.
## 8 Spain Europe 2007 80.9 40448191 28821.
## 9 Sweden Europe 2007 80.9 9031088 33860.
## 10 Israel Asia 2007 80.7 6426679 25523.
## # ... with 1,694 more rows
We’ll often need to use the pipe operator (%>%) to combine multiple dplyr verbs in a row
Filter for the year 1957, then arrange in descending order of population
# Keep the 1957 rows, then sort them from most to least populous
gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))
## # A tibble: 142 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 China Asia 1957 50.5 637408000 576.
## 2 India Asia 1957 40.2 409000000 590.
## 3 United States Americas 1957 69.5 171984000 14847.
## 4 Japan Asia 1957 65.5 91563009 4318.
## 5 Indonesia Asia 1957 39.9 90124000 859.
## 6 Germany Europe 1957 69.1 71019069 10188.
## 7 Brazil Americas 1957 53.3 65551171 2487.
## 8 United Kingdom Europe 1957 70.4 51430000 11283.
## 9 Bangladesh Asia 1957 39.3 51365468 662.
## 10 Italy Europe 1957 67.8 49182000 6249.
## # ... with 132 more rows
Use mutate() to change a variable. It is like a calculated field in Tableau or a calculated column/transform column in Power BI. It can either replace an existing column or add a new one.
Use mutate to change lifeExp to be in months
# Overwrite the existing lifeExp column with its value multiplied by 12
gapminder %>%
  mutate(lifeExp = lifeExp * 12)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 346. 8425333 779.
## 2 Afghanistan Asia 1957 364. 9240934 821.
## 3 Afghanistan Asia 1962 384. 10267083 853.
## 4 Afghanistan Asia 1967 408. 11537966 836.
## 5 Afghanistan Asia 1972 433. 13079460 740.
## 6 Afghanistan Asia 1977 461. 14880372 786.
## 7 Afghanistan Asia 1982 478. 12881816 978.
## 8 Afghanistan Asia 1987 490. 13867957 852.
## 9 Afghanistan Asia 1992 500. 16317921 649.
## 10 Afghanistan Asia 1997 501. 22227415 635.
## # ... with 1,694 more rows
Use mutate to create a new column called lifeExpMonths
# Add a new column, lifeExpMonths, holding lifeExp multiplied by 12
gapminder %>%
  mutate(lifeExpMonths = lifeExp * 12)
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap lifeExpMonths
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 346.
## 2 Afghanistan Asia 1957 30.3 9240934 821. 364.
## 3 Afghanistan Asia 1962 32.0 10267083 853. 384.
## 4 Afghanistan Asia 1967 34.0 11537966 836. 408.
## 5 Afghanistan Asia 1972 36.1 13079460 740. 433.
## 6 Afghanistan Asia 1977 38.4 14880372 786. 461.
## 7 Afghanistan Asia 1982 39.9 12881816 978. 478.
## 8 Afghanistan Asia 1987 40.8 13867957 852. 490.
## 9 Afghanistan Asia 1992 41.7 16317921 649. 500.
## 10 Afghanistan Asia 1997 41.8 22227415 635. 501.
## # ... with 1,694 more rows
# Keep the 2007 rows, add lifeExpMonths, then sort by it in descending order
gapminder %>%
  filter(year == 2007) %>%
  mutate(lifeExpMonths = lifeExp * 12) %>%
  arrange(desc(lifeExpMonths))
## # A tibble: 142 x 7
## country continent year lifeExp pop gdpPercap lifeExpMonths
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Japan Asia 2007 82.6 127467972 31656. 991.
## 2 Hong Kong, China Asia 2007 82.2 6980412 39725. 986.
## 3 Iceland Europe 2007 81.8 301931 36181. 981.
## 4 Switzerland Europe 2007 81.7 7554661 37506. 980.
## 5 Australia Oceania 2007 81.2 20434176 34435. 975.
## 6 Spain Europe 2007 80.9 40448191 28821. 971.
## 7 Sweden Europe 2007 80.9 9031088 33860. 971.
## 8 Israel Asia 2007 80.7 6426679 25523. 969.
## 9 France Europe 2007 80.7 61083916 30470. 968.
## 10 Canada Americas 2007 80.7 33390141 36319. 968.
## # ... with 132 more rows
Loading ggplot2 package
library(ggplot2)
Filter gapminder for observations from the year 1952, and assign it to a new dataset gapminder_1952
# Store the 1952 observations in gapminder_1952 for the plots that follow
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
Create a scatter plot of gapminder_1952 with GDP per capita (gdpPercap) on the x-axis and life expectancy (lifeExp) on the y-axis
# aes() maps columns to the x and y axes; geom_point() adds the point layer
# that makes this a scatter plot
ggplot(data = gapminder_1952, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
Change the scatter plot of gapminder_1952 so that population (pop) is on the x-axis and GDP per capita (gdpPercap) is on the y-axis.
# Scatter plot with pop on the x-axis and gdpPercap on the y-axis
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = gdpPercap)) +
  geom_point()
Create a scatter plot of gapminder_1952 with population (pop) on the x-axis and life expectancy (lifeExp) on the y-axis.
# Scatter plot with pop on the x-axis and lifeExp on the y-axis
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = lifeExp)) +
  geom_point()
Putting x-axis on a log scale
# Same scatter plot, with the x-axis (pop) drawn on a log10 scale
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()
Putting the x- and y- axes on a log scale
# gdpPercap against pop, with both axes drawn on a log10 scale
ggplot(data = gapminder_1952, mapping = aes(x = gdpPercap, y = pop)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
Adding color to a scatter plot: a scatter plot comparing pop and lifeExp, with color representing continent
# Point color encodes continent; the x-axis (pop) is on a log10 scale
ggplot(
  data = gapminder_1952,
  mapping = aes(x = pop, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()
Adding size and color to a plot: modify the scatter plot so that the size of the points represents each country’s GDP per capita (gdpPercap).
# Point color encodes continent and point size encodes gdpPercap
ggplot(
  data = gapminder_1952,
  mapping = aes(x = pop, y = lifeExp, color = continent, size = gdpPercap)
) +
  geom_point() +
  scale_x_log10()
~ symbol in R represents “by”
Create a scatter plot of gapminder_1952 with the x-axis representing population (pop), the y-axis representing life expectancy (lifeExp), and faceted to have one subplot per continent (continent). Put the x-axis on a log scale.
# facet_wrap(~continent) draws one subplot per continent
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~continent)
Create a scatter plot of gapminder, putting GDP per capita (gdpPercap) on the x-axis and life expectancy (lifeExp) on the y-axis, with continent (continent) represented by color and population (pop) represented by size. After that, put the x-axis on a log scale and facet by the year variable.
# Full dataset: color encodes continent, size encodes pop, one subplot per year
ggplot(
  data = gapminder,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~year)
Use the median() function within a summarize() to find the median life expectancy. Save it into a column called medianLifeExp.
# The name on the left of `=` becomes the output column name, much like an
# AS alias in SQL
gapminder %>%
  summarize(medianLifeExp = median(lifeExp))
## # A tibble: 1 x 1
## medianLifeExp
## <dbl>
## 1 60.7
Filter for the year 1957, then use the median() function within a summarize() to calculate the median life expectancy into a column called medianLifeExp.
# Median life expectancy across the 1957 rows only
gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp))
## # A tibble: 1 x 1
## medianLifeExp
## <dbl>
## 1 48.4
Find both the median life expectancy (lifeExp) and the maximum GDP per capita (gdpPercap) in the year 1957, calling them medianLifeExp and maxGdpPercap respectively. You can use the max() function to find the maximum.
# Several summaries can be computed in a single summarize() call
gapminder %>%
  filter(year == 1957) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
## # A tibble: 1 x 2
## medianLifeExp maxGdpPercap
## <dbl> <dbl>
## 1 48.4 113523.
Find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each year, saving them into medianLifeExp and maxGdpPercap, respectively.
# group_by(year) makes summarize() return one row per year
gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 3
## year medianLifeExp maxGdpPercap
## <int> <dbl> <dbl>
## 1 1952 45.1 108382.
## 2 1957 48.4 113523.
## 3 1962 50.9 95458.
## 4 1967 53.8 80895.
## 5 1972 56.5 109348.
## 6 1977 59.7 59265.
## 7 1982 62.4 33693.
## 8 1987 65.8 31541.
## 9 1992 67.7 34933.
## 10 1997 69.4 41283.
## 11 2002 70.8 44684.
## 12 2007 71.9 49357.
Filter the gapminder data for the year 1957. Then find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each continent, saving them into medianLifeExp and maxGdpPercap, respectively.
# Within 1957, summarize once per continent
gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 3
## continent medianLifeExp maxGdpPercap
## <fct> <dbl> <dbl>
## 1 Africa 40.6 5487.
## 2 Americas 56.1 14847.
## 3 Asia 48.3 113523.
## 4 Europe 67.6 17909.
## 5 Oceania 70.3 12247.
Find the median life expectancy (lifeExp) and maximum GDP per capita (gdpPercap) within each combination of continent and year, saving them into medianLifeExp and maxGdpPercap, respectively.
# One summary row per continent/year combination
gapminder %>%
  group_by(continent, year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
## # A tibble: 60 x 4
## # Groups: continent [5]
## continent year medianLifeExp maxGdpPercap
## <fct> <int> <dbl> <dbl>
## 1 Africa 1952 38.8 4725.
## 2 Africa 1957 40.6 5487.
## 3 Africa 1962 42.6 6757.
## 4 Africa 1967 44.7 18773.
## 5 Africa 1972 47.0 21011.
## 6 Africa 1977 49.3 21951.
## 7 Africa 1982 50.8 17364.
## 8 Africa 1987 51.6 11864.
## 9 Africa 1992 52.4 13522.
## 10 Africa 1997 52.8 14723.
## # ... with 50 more rows
Use the by_year dataset to create a scatter plot showing the change of median life expectancy over time, with year on the x-axis and medianLifeExp on the y-axis. Be sure to add expand_limits(y = 0) to make sure the plot’s y-axis includes zero.
# Per-year median life expectancy and maximum GDP per capita, saved as by_year
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Scatter plot of medianLifeExp over time; expand_limits(y = 0) makes sure the
# y-axis includes zero
ggplot(data = by_year, mapping = aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y = 0)
Summarize the gapminder dataset by continent and year, finding the median GDP per capita (gdpPercap) within each and putting it into a column called medianGdpPercap. Use the assignment operator <- to save this summarized data as by_year_continent. Create a scatter plot showing the change in medianGdpPercap by continent over time. Use color to distinguish between continents, and be sure to add expand_limits(y = 0) so that the y-axis starts at zero.
# Median gdpPercap for every continent/year pair, saved as by_year_continent
by_year_continent <- gapminder %>%
  group_by(continent, year) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
# Scatter plot of medianGdpPercap over time, colored by continent, with the
# y-axis extended to include zero
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)
Filter the gapminder dataset for the year 2007, then summarize the median GDP per capita and the median life expectancy within each continent, into columns called medianLifeExp and medianGdpPercap. Save this as by_continent_2007. Use the by_continent_2007 data to create a scatter plot comparing these summary statistics for continents in 2007, putting the median GDP per capita on the x-axis and the median life expectancy on the y-axis. Color the scatter plot by continent. You don’t need to add expand_limits(y = 0) for this plot.
# Per-continent median life expectancy and median GDP per capita for 2007
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    medianGdpPercap = median(gdpPercap)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
# Compare median GDP per capita (x) with median life expectancy (y), one
# colored point per continent
ggplot(
  data = by_continent_2007,
  mapping = aes(x = medianGdpPercap, y = medianLifeExp, color = continent)
) +
  geom_point()
Visualising median GDP per capita over time
# Per-year median gdpPercap; this reassigns by_year
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
# Line plot of medianGdpPercap over time, with the y-axis including zero
ggplot(data = by_year, mapping = aes(x = year, y = medianGdpPercap)) +
  geom_line() +
  expand_limits(y = 0)
Visualising median GDP per capita by continent over time
# Summarize the median gdpPercap by year & continent, save as by_year_continent.
# The summary column is named medianGdpPercap (capital "G") so that it matches
# the name used for the same statistic in the other summaries of this script.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
# Create a line plot showing the change in medianGdpPercap by continent over
# time, with one colored line and one facet per continent
ggplot(
  by_year_continent,
  aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_line() +
  expand_limits(y = 0) +
  facet_wrap(~continent)
Visualizing median GDP per capita by continent: summarize the median gdpPercap by continent in 1952, then create a bar plot showing medianGdpPercap by continent
# Median gdpPercap per continent, using the 1952 rows only
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
## `summarise()` ungrouping output (override with `.groups` argument)
# Bar plot: one bar per continent, with height given by medianGdpPercap
ggplot(data = by_continent, mapping = aes(x = continent, y = medianGdpPercap)) +
  geom_col()
Visualizing GDP per capita by country in Oceania: filter for observations in the Oceania continent in 1952, and create a bar plot of gdpPercap by country
# Keep the Oceania rows from 1952
oceania_1952 <- gapminder %>%
  filter(
    year == 1952,
    continent == "Oceania"
  )

# Bar plot: one bar per country, with height given by gdpPercap
ggplot(data = oceania_1952, mapping = aes(x = country, y = gdpPercap)) +
  geom_col()
Visualising population: create a histogram of population
# Rebuild gapminder_1952 with an extra column: population in millions
gapminder_1952 <- gapminder %>%
  filter(year == 1952) %>%
  mutate(pop_by_mil = pop / 1e6)

# Histogram of pop_by_mil; binwidth = 50 makes each bin 50 million wide
ggplot(data = gapminder_1952, mapping = aes(x = pop_by_mil)) +
  geom_histogram(binwidth = 50)
Visualizing population with the x-axis on a log scale: create a histogram of population with x on a log scale
# Reset gapminder_1952 to the plain 1952 rows
gapminder_1952 <- gapminder %>%
  filter(year == 1952)

# Histogram of pop on a log10 x-axis; the bin count is left at its default
ggplot(data = gapminder_1952, mapping = aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Comparing GDP per capita across continents: create a boxplot comparing gdpPercap among continents
# 1952 rows only
gapminder_1952 <- gapminder %>%
  filter(year == 1952)

# One box per continent showing gdpPercap, with the y-axis on a log10 scale
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10()
Adding a title to the graph: add a title to this graph
# 1952 rows only
gapminder_1952 <- gapminder %>%
  filter(year == 1952)

# Same boxplot as before, with a title added through ggtitle()
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  ggtitle("Comparing GDP per capita across continents") +
  scale_y_log10()