# Get the data for looking at the life expectancy of
# populations over time in different geographical regions.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(finalfit)
## Warning: package 'finalfit' was built under R version 4.2.3
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.2.3
library(broom)
## Warning: package 'broom' was built under R version 4.2.3
# Step 01 - Take a working copy of the gapminder dataset.
gapdata <- gapminder
# Step 02 - Inspect the data: one line per variable, showing its type and
# its first few values.
gapdata %>% glimpse()
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Step 03 - Tabulate, per variable, how many values are missing.
gapdata %>% missing_glimpse()
## label var_type n missing_n missing_percent
## country country <fct> 1704 0 0.0
## continent continent <fct> 1704 0 0.0
## year year <int> 1704 0 0.0
## lifeExp lifeExp <dbl> 1704 0 0.0
## pop pop <int> 1704 0 0.0
## gdpPercap gdpPercap <dbl> 1704 0 0.0
# Step 04 - Summary statistics for every variable, reported separately for
# continuous and categorical columns.
gapdata %>% ff_glimpse()
## Warning: `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ The deprecated feature was likely used in the finalfit package.
## Please report the issue at <https://github.com/ewenharrison/finalfit/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## $Continuous
## label var_type n missing_n missing_percent mean
## year year <int> 1704 0 0.0 1979.5
## lifeExp lifeExp <dbl> 1704 0 0.0 59.5
## pop pop <int> 1704 0 0.0 29601212.3
## gdpPercap gdpPercap <dbl> 1704 0 0.0 7215.3
## sd min quartile_25 median quartile_75 max
## year 17.3 1952.0 1965.8 1979.5 1993.2 2007.0
## lifeExp 12.9 23.6 48.2 60.7 70.8 82.6
## pop 106157896.7 60011.0 2793664.0 7023595.5 19585221.8 1318683096.0
## gdpPercap 9857.5 241.2 1202.1 3531.8 9325.5 113523.1
##
## $Categorical
## label var_type n missing_n missing_percent levels_n
## country country <fct> 1704 0 0.0 142
## continent continent <fct> 1704 0 0.0 5
## levels
## country -
## continent "Africa", "Americas", "Asia", "Europe", "Oceania"
## levels_count levels_percent
## country - -
## continent 624, 300, 396, 360, 24 36.6, 17.6, 23.2, 21.1, 1.4
# Base-R summary of every column (counts for factors, quantiles for numerics).
gapdata %>% summary()
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
#We will start by comparing life expectancy between the 5 continents of the world in two different years. Always plot your data first. Never skip this step! We are particularly interested in the distribution. There’s that word again. The shape of the data. Is it normal? Is it skewed? Does it differ between regions and years? There are three useful plots which can help here:
##Histograms: examine shape of data and compare groups;
##Q-Q plots: are data normally distributed?
##Box-plots: identify outliers, compare shape and groups.
# Histograms of life expectancy: one panel per year (rows) and continent
# (columns), restricted to 2002 and 2007.
gapdata %>%
  filter(year %in% c(2002, 2007)) %>%
  ggplot(mapping = aes(x = lifeExp)) +
  geom_histogram(bins = 20) +
  facet_grid(year ~ continent)

# Q-Q plots of life expectancy with a blue reference line, one panel per
# year (rows) and continent (columns), restricted to 2002 and 2007.
gapdata %>%
  filter(year %in% c(2002, 2007)) %>%
  ggplot(mapping = aes(sample = lifeExp)) +
  geom_qq() +
  geom_qq_line(colour = "blue") +
  facet_grid(year ~ continent)

# Box plots of life expectancy by continent, one panel per year
# (2002 and 2007).
gapdata %>%
  filter(year %in% c(2002, 2007)) %>%
  ggplot(mapping = aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  facet_wrap(~year)

#In our example of countries and continents, you have to assume that the mean life expectancy of each country does not depend on the life expectancies of other countries in the group. In other words, that each measurement is independent.
#The Welch two-sample t-test is the most flexible and copes with differences in variance (variability) between groups, as in this example.
# Keep only the 2007 rows for Asia and Europe.
ttest_data <- gapdata %>%
  filter(year == 2007, continent %in% c("Asia", "Europe"))

# Two-sample t-test of life expectancy between the two continents
# (t.test() defaults to the Welch variant, i.e. unequal variances).
ttest_result <- ttest_data %>%
  t.test(lifeExp ~ continent, data = .)
ttest_result
##
## Welch Two Sample t-test
##
## data: lifeExp by continent
## t = -4.6468, df = 41.529, p-value = 3.389e-05
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -9.926525 -3.913705
## sample estimates:
## mean in group Asia mean in group Europe
## 70.72848 77.64860
# Reshape to one row per country with one column per year, so that a
# country's life expectancy in different years sits side by side (the
# starting point for computing per-country differences between years).
paired_table <- gapdata %>%
  select(country, year, lifeExp) %>%
  pivot_wider(
    names_from = year,
    values_from = lifeExp
  )
paired_table
## # A tibble: 142 × 13
## country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` `1997`
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 28.8 30.3 32.0 34.0 36.1 38.4 39.9 40.8 41.7 41.8
## 2 Albania 55.2 59.3 64.8 66.2 67.7 68.9 70.4 72 71.6 73.0
## 3 Algeria 43.1 45.7 48.3 51.4 54.5 58.0 61.4 65.8 67.7 69.2
## 4 Angola 30.0 32.0 34 36.0 37.9 39.5 39.9 39.9 40.6 41.0
## 5 Argent… 62.5 64.4 65.1 65.6 67.1 68.5 69.9 70.8 71.9 73.3
## 6 Austra… 69.1 70.3 70.9 71.1 71.9 73.5 74.7 76.3 77.6 78.8
## 7 Austria 66.8 67.5 69.5 70.1 70.6 72.2 73.2 74.9 76.0 77.5
## 8 Bahrain 50.9 53.8 56.9 59.9 63.3 65.6 69.1 70.8 72.6 73.9
## 9 Bangla… 37.5 39.3 41.2 43.5 45.3 46.9 50.0 52.8 56.0 59.4
## 10 Belgium 68 69.2 70.2 70.9 71.4 72.8 73.9 75.4 76.5 77.5
## # ℹ 132 more rows
## # ℹ 2 more variables: `2002` <dbl>, `2007` <dbl>
# Analysis of variance (ANOVA) is a collection of statistical tests used to
# test the difference in means between two or more groups.
library(broom)
# Compare 2007 life expectancy across three continents with a one-way ANOVA.
# The level must be spelled "Americas": `%in%` silently matches nothing for a
# misspelled level, which previously reduced this to a two-group
# (Asia vs Europe) comparison.
# NOTE: any output recorded beneath this call showing `continent` with df = 1
# came from that two-group run and should be regenerated.
gapdata %>%
  filter(year == 2007) %>%
  filter(continent %in% c("Americas", "Europe", "Asia")) %>%
  aov(lifeExp ~ continent, data = .) %>%
  tidy()
## # A tibble: 2 × 6
## term df sumsq meansq statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 continent 1 753. 753. 20.1 0.0000334
## 2 Residuals 61 2287. 37.5 NA NA
#Non-parametric test for comparing two groups
# African countries in 1982 and 2007 only.
africa_data <- gapdata %>%
  filter(year %in% c(1982, 2007), continent == "Africa")

# Histogram of life expectancy, one panel per year.
p1 <- ggplot(africa_data, aes(x = lifeExp)) +
  geom_histogram(bins = 15) +
  facet_wrap(~year)

# Q-Q plot of life expectancy with a blue reference line, one panel per year.
p2 <- ggplot(africa_data, aes(sample = lifeExp)) +
  geom_qq() +
  geom_qq_line(colour = "blue") +
  facet_wrap(~year)

p1

p2

# Summarise the explanatory variables by year (treated as a factor), reporting
# medians for continuous variables and a p-value for each comparison.
dependent <- "year"
explanatory <- c("lifeExp", "pop", "gdpPercap")
africa_data %>%
  mutate(year = factor(year)) %>%
  summary_factorlist(
    dependent = dependent,
    explanatory = explanatory,
    cont = "median",
    p = TRUE
  )
## label levels 1982
## lifeExp Median (IQR) 50.8 (45.6 to 56.6)
## pop Median (IQR) 5668228.5 (1569553.8 to 9788207.8)
## gdpPercap Median (IQR) 1323.7 (828.7 to 2787.6)
## 2007 p
## 52.9 (47.8 to 59.4) 0.149
## 10093310.5 (2909226.5 to 19363654.5) 0.033
## 1452.3 (863.0 to 3993.5) 0.503
# Simple linear regression of life expectancy on year for the United Kingdom.
# The model is fitted through the pipe with `data = .`, so the stored call
# reads `data = .`.
fit_uk <- gapdata %>%
  filter(country == "United Kingdom") %>%
  lm(lifeExp ~ year, data = .)
summary(fit_uk)
##
## Call:
## lm(formula = lifeExp ~ year, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69767 -0.31962 0.06642 0.36601 0.68165
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.942e+02 1.464e+01 -20.10 2.05e-09 ***
## year 1.860e-01 7.394e-03 25.15 2.26e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4421 on 10 degrees of freedom
## Multiple R-squared: 0.9844, Adjusted R-squared: 0.9829
## F-statistic: 632.5 on 1 and 10 DF, p-value: 2.262e-10
# Coefficient table of the UK model as a tibble.
tidy(fit_uk)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -294. 14.6 -20.1 2.05e- 9
## 2 year 0.186 0.00739 25.1 2.26e-10