Load the necessary packages

Question:

Is it necessary to use install.packages() first?

library(tidyverse)
library(gapminder)

Make a copy of the gapminder dataframe; call the new dataframe df1.

# Bind the gapminder data to a second name, df1; gapminder itself is untouched
df1 <- gapminder

Inspect the data

# First six rows of df1
head(df1)
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# Last six rows of df1
tail(df1)
## # A tibble: 6 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
# Transposed overview: one line per column with its name, type, and first values
glimpse(df1)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Descriptive statistics

# Per-column summary of df1: level counts for the factor columns, and
# min / quartiles / mean / max for the numeric columns.
# df1 is used (rather than gapminder) so that every step of the analysis
# works on the same object; the output is identical because df1 is an
# unmodified copy of gapminder.
summary(df1)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

Basic scatterplot

# Life expectancy against GDP per capita: one point per row of df1
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp)
) +
  geom_point()

Map continents to colors

# Same scatterplot, with each point colored by the continent of its row
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point()

Use mutate() to create a new variable: log of GDP per capita

# mutate() returns a copy of df1 with one extra column, log_y, which holds
# the natural log of gdpPercap. df1 itself is left unchanged; the result is
# stored in a new data frame, df2.

df2 <- mutate(df1, log_y = log(gdpPercap))
head(df2)
## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap log_y
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl> <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.  6.66
## 2 Afghanistan Asia       1957    30.3  9240934      821.  6.71
## 3 Afghanistan Asia       1962    32.0 10267083      853.  6.75
## 4 Afghanistan Asia       1967    34.0 11537966      836.  6.73
## 5 Afghanistan Asia       1972    36.1 13079460      740.  6.61
## 6 Afghanistan Asia       1977    38.4 14880372      786.  6.67

New plot, but this time with log_y on the X-axis

# Life expectancy against log GDP per capita, all countries and all years.
# The data come from df2 because that is the data frame that contains log_y.

ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp)
) +
  geom_point()

Plot all countries over all years, but different colors for different continents

# Same plot, with each point colored by the continent of its row
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp, color = continent)
) +
  geom_point()

Add a straight line (the same as a linear trendline in Excel)

# With method = "lm", geom_smooth() overlays the straight line obtained from
# a linear regression of the y variable on the x variable.

ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp)
) +
  geom_point() +
  geom_smooth(method = "lm")

Hide the confidence band (the grey shaded area around the straight line, which is computed from the standard error)

# se = FALSE draws the fitted line without the shaded band around it
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Find the slope and intercept of the line

# Regress lifeExp on log_y and keep the fitted model in an object named fm.
# summary(fm) prints the regression results. In its "Coefficients" table,
# the "Estimate" column gives the intercept, (Intercept), and the slope,
# log_y, of the fitted line.
#
# Fitted line: lifeExp = -9.1 + 8.4 * log_y
# R-squared:   0.65

fm <- lm(formula = lifeExp ~ log_y, data = df2)
summary(fm)
## 
## Call:
## lm(formula = lifeExp ~ log_y, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.778  -4.204   1.212   4.658  19.285 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.1009     1.2277  -7.413 1.93e-13 ***
## log_y         8.4051     0.1488  56.500  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.62 on 1702 degrees of freedom
## Multiple R-squared:  0.6522, Adjusted R-squared:  0.652 
## F-statistic:  3192 on 1 and 1702 DF,  p-value: < 2.2e-16

And finally, one regression line for each continent!

# Mapping color to continent in the plot-level aes() splits the data by
# continent, so geom_smooth() fits and draws a separate line for each one.
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp, color = continent)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

– The end –