Question:
Is it necessary to use install.packages() first?
library(tidyverse)
library(gapminder)
# Bind the gapminder data set to the shorter name df1,
# then print its first six rows.
df1 <- gapminder::gapminder
df1 %>% head()
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
# Print the last six rows of df1.
df1 %>% tail()
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.4 7636524 789.
## 2 Zimbabwe Africa 1987 62.4 9216418 706.
## 3 Zimbabwe Africa 1992 60.4 10704340 693.
## 4 Zimbabwe Africa 1997 46.8 11404948 792.
## 5 Zimbabwe Africa 2002 40.0 11926563 672.
## 6 Zimbabwe Africa 2007 43.5 12311143 470.
# List every column of df1 with its type and its first few values.
df1 %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Per-column summary statistics. df1 holds the gapminder data, so this prints
# the same table while staying consistent with the other inspection calls.
summary(df1)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# Scatter plot of lifeExp against gdpPercap using every row of df1.
ggplot(data = df1, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
# The same scatter plot with the points coloured by continent.
ggplot(
  data = df1,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point()
# Add the natural log of gdpPercap to df1 as a new column named log_y.
# The result is stored as a new data frame, df2; df1 itself is left unchanged.
df2 <- mutate(df1, log_y = log(gdpPercap))
# Print the first six rows to check that the new column is present.
head(df2)
## # A tibble: 6 × 7
## country continent year lifeExp pop gdpPercap log_y
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 6.66
## 2 Afghanistan Asia 1957 30.3 9240934 821. 6.71
## 3 Afghanistan Asia 1962 32.0 10267083 853. 6.75
## 4 Afghanistan Asia 1967 34.0 11537966 836. 6.73
## 5 Afghanistan Asia 1972 36.1 13079460 740. 6.61
## 6 Afghanistan Asia 1977 38.4 14880372 786. 6.67
# Plot lifeExp vs log_y for all countries over all years.
# df2 is used here because it is the data frame that contains log_y.
ggplot(data = df2, mapping = aes(x = log_y, y = lifeExp)) +
  geom_point()
# The same plot with the points coloured by continent.
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp, color = continent)
) +
  geom_point()
# geom_smooth() adds a smoothed trend line to the plot. Passing method = "lm"
# makes that line the least-squares straight line (linear regression); without
# a method argument ggplot2 picks a loess or GAM smoother instead.
# By default a shaded confidence band is drawn around the line.
ggplot(df2, aes(x = log_y, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm")
# Same plot with se = FALSE, which drops the confidence band.
ggplot(df2, aes(x = log_y, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Regress lifeExp on log_y and keep the fitted model in an object named fm.
# summary(fm) prints the regression results. In its "Coefficients" table, the
# "Estimate" column holds the intercept, (Intercept), and the slope, log_y,
# of the fitted line:
#   lifeExp = -9.1 + 8.4 * log_y
# "Multiple R-squared" is 0.65.
fm <- lm(formula = lifeExp ~ log_y, data = df2)
summary(fm)
##
## Call:
## lm(formula = lifeExp ~ log_y, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.778 -4.204 1.212 4.658 19.285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.1009 1.2277 -7.413 1.93e-13 ***
## log_y 8.4051 0.1488 56.500 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.62 on 1702 degrees of freedom
## Multiple R-squared: 0.6522, Adjusted R-squared: 0.652
## F-statistic: 3192 on 1 and 1702 DF, p-value: < 2.2e-16
# color = continent is set in the plot-level aes(), so it applies to both
# layers: points are coloured by continent and a separate regression line
# (without confidence band) is drawn for each continent.
ggplot(
  data = df2,
  mapping = aes(x = log_y, y = lifeExp, color = continent)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
– The end –