Load the necessary packages

Question:

Is it necessary to use install.packages() first?

library(tidyverse)
library(gapminder)

Make a copy of the gapminder dataframe; call the new dataframe df1.

# Work on df1 from here on rather than on gapminder directly.
df1 <- gapminder

Inspect the data

# Show the first rows of df1.
df1 %>% head()

## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.

# Show the last rows of df1.
df1 %>% tail()

## # A tibble: 6 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.

# Compact overview: one line per column with its type and first values.
df1 %>% glimpse()

## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Descriptive statistics

# Summarise df1, the working copy, rather than the original gapminder object,
# so this step is consistent with the rest of the analysis.
summary(df1)

##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
##

Basic scatterplot

# Scatterplot of life expectancy against GDP per capita, one point per row.
ggplot(data = df1, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

Map continents to colors

# Same scatterplot, with each point colored by its continent.
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point()

Use mutate() to create a new variable: log of GDP per capita

# mutate() returns a copy of df1 with one extra column, log_y, holding
# log(gdpPercap); df1 itself is left unchanged.
# The result is stored in a new data frame, df2.

df2 <- df1 %>% mutate(log_y = log(gdpPercap))
df2 %>% head()

## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap log_y
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6.66
## 2 Afghanistan Asia       1957    30.3  9240934      821.  6.71
## 3 Afghanistan Asia       1962    32.0 10267083      853.  6.75
## 4 Afghanistan Asia       1967    34.0 11537966      836.  6.73
## 5 Afghanistan Asia       1972    36.1 13079460      740.  6.61
## 6 Afghanistan Asia       1977    38.4 14880372      786.  6.67

New plot, but this time with log_y on the X-axis

# Scatterplot of lifeExp against log_y, using every row of df2
# (all countries, all years). df2 is the data frame that contains log_y.

ggplot(data = df2, mapping = aes(x = log_y, y = lifeExp)) +
  geom_point()

Plot all countries over all years, but different colors for different continents

# lifeExp against log_y for every row of df2, colored by continent.
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp, color = continent)
) +
  geom_point()

Add a straight line (same as trendline in Excel)

# geom_smooth(method = "lm") fits a linear model (linear regression) to the
# plotted data and draws the fitted line.
# formula = y ~ x is what ggplot2 uses for "lm" by default; stating it makes
# the fitted model explicit and avoids the message about the chosen formula.

ggplot(df2, aes(x = log_y, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

Hide the standard error (the grey shaded areas around the straight line)

# Same fitted line as before; se = FALSE drops the grey shaded band.
# formula = y ~ x is what ggplot2 uses for "lm" by default; stating it makes
# the fitted model explicit and avoids the message about the chosen formula.
ggplot(df2, aes(x = log_y, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

Find the slope and intercept of the line

# Regress lifeExp on log_y and keep the fitted model in an object called fm.
# summary(fm) prints the regression results. In the "Coefficients" table,
# the "Estimate" column gives the intercept (row "(Intercept)") and the
# slope (row "log_y") of the fitted line.
#
# Equation of the fitted line: lifeExp = -9.1 + 8.4 * log_y
# R-squared is 0.65

fm <- lm(formula = lifeExp ~ log_y, data = df2)
summary(fm)

## 
## Call:
## lm(formula = lifeExp ~ log_y, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.778  -4.204   1.212   4.658  19.285 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.1009     1.2277  -7.413 1.93e-13 ***
## log_y         8.4051     0.1488  56.500  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.62 on 1702 degrees of freedom
## Multiple R-squared:  0.6522, Adjusted R-squared:  0.652 
## F-statistic:  3192 on 1 and 1702 DF,  p-value: < 2.2e-16

And finally, one regression line for each continent!

# Mapping color to continent groups the data by continent, so geom_smooth()
# fits and draws a separate regression line for each continent.
# formula = y ~ x is what ggplot2 uses for "lm" by default; stating it makes
# the fitted model explicit and avoids the message about the chosen formula.
ggplot(df2, aes(x = log_y, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE)

– The end –

Linear model, using gapminder

Load the necessary packages

Make a copy of the gapminder dataframe; call the new dataframe df1.

Inspect the data

Descriptive statistics

Basic scatterplot

Map continents to colors

Use mutate() to create a new variable: log of GDP per capita

New plot, but this time with log_y on the X-axis

Plot all countries over all years, but different colors for different continents

Add a straight line (same as trendline in Excel)

Hide the standard error (the grey shaded areas around the straight line)

Find the slope and intercept of the line

And finally, one regression line for each continent!