# Load the data used to examine how the life expectancy of populations has changed over time in different geographical regions.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(finalfit)
## Warning: package 'finalfit' was built under R version 4.2.3
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.2.3
library(broom)
## Warning: package 'broom' was built under R version 4.2.3
#Step 01-Create an object from object gapminder
gapdata<-gapminder
# Step:02-We are checking, each variable as line, variable type,first values
glimpse(gapdata)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# Step:03-Check any missing data in the dataset
missing_glimpse(gapdata)
##               label var_type    n missing_n missing_percent
## country     country    <fct> 1704         0             0.0
## continent continent    <fct> 1704         0             0.0
## year           year    <int> 1704         0             0.0
## lifeExp     lifeExp    <dbl> 1704         0             0.0
## pop             pop    <int> 1704         0             0.0
## gdpPercap gdpPercap    <dbl> 1704         0             0.0
#Step:04- Create summary statistics for each variable
ff_glimpse(gapdata)
## Warning: `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
## ℹ The deprecated feature was likely used in the finalfit package.
##   Please report the issue at <https://github.com/ewenharrison/finalfit/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## $Continuous
##               label var_type    n missing_n missing_percent       mean
## year           year    <int> 1704         0             0.0     1979.5
## lifeExp     lifeExp    <dbl> 1704         0             0.0       59.5
## pop             pop    <int> 1704         0             0.0 29601212.3
## gdpPercap gdpPercap    <dbl> 1704         0             0.0     7215.3
##                    sd     min quartile_25    median quartile_75          max
## year             17.3  1952.0      1965.8    1979.5      1993.2       2007.0
## lifeExp          12.9    23.6        48.2      60.7        70.8         82.6
## pop       106157896.7 60011.0   2793664.0 7023595.5  19585221.8 1318683096.0
## gdpPercap      9857.5   241.2      1202.1    3531.8      9325.5     113523.1
## 
## $Categorical
##               label var_type    n missing_n missing_percent levels_n
## country     country    <fct> 1704         0             0.0      142
## continent continent    <fct> 1704         0             0.0        5
##                                                      levels
## country                                                   -
## continent "Africa", "Americas", "Asia", "Europe", "Oceania"
##                     levels_count               levels_percent
## country                        -                            -
## continent 624, 300, 396, 360, 24 36.6, 17.6, 23.2, 21.1,  1.4
summary(gapdata)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
#We will start by comparing life expectancy between the 5 continents of the world in two different years. Always plot your data first. Never skip this step! We are particularly interested in the distribution. There’s that word again. The shape of the data. Is it normal? Is it skewed? Does it differ between regions and years? There are three useful plots which can help here:

# - Histograms: examine the shape of the data and compare groups;
# - Q-Q plots: check whether the data are normally distributed;
# - Box plots: identify outliers and compare shape across groups.

gapdata %>% filter(year %in% c(2002,2007)) %>% ggplot(aes(x=lifeExp))+geom_histogram(bins=20)+facet_grid(year~continent)

gapdata %>% filter(year%in%c(2002,2007)) %>% ggplot(aes(sample=lifeExp))+geom_qq()+geom_qq_line(colour="blue")+facet_grid(year~continent)

gapdata %>% filter(year%in% c(2002,2007)) %>% ggplot(aes(x=continent,y=lifeExp))+geom_boxplot()+facet_wrap(~year)

#In our example of countries and continents, you have to assume that the mean life expectancy of each country does not depend on the life expectancies of other countries in the group. In other words, that each measurement is independent.
#The Welch two-sample t-test is the most flexible and copes with differences in variance (variability) between groups, as in this example.
ttest_data<-gapdata %>% filter(year==2007) %>% filter(continent %in% c("Asia","Europe") )
ttest_result<-ttest_data %>% t.test(lifeExp~continent,data=.)
ttest_result
## 
##  Welch Two Sample t-test
## 
## data:  lifeExp by continent
## t = -4.6468, df = 41.529, p-value = 3.389e-05
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
##  -9.926525 -3.913705
## sample estimates:
##   mean in group Asia mean in group Europe 
##             70.72848             77.64860
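#What is the life expectancy of each individual country in each year? Reshaping to one row per country and one column per year puts the values side by side, so the difference between any two years can be computed per country. We don't usually have to produce this directly, but here is one method.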
paired_table<-gapdata %>% select(country,year,lifeExp) %>% pivot_wider(names_from=year,values_from=lifeExp)
paired_table
## # A tibble: 142 × 13
##    country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` `1997`
##    <fct>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 Afghan…   28.8   30.3   32.0   34.0   36.1   38.4   39.9   40.8   41.7   41.8
##  2 Albania   55.2   59.3   64.8   66.2   67.7   68.9   70.4   72     71.6   73.0
##  3 Algeria   43.1   45.7   48.3   51.4   54.5   58.0   61.4   65.8   67.7   69.2
##  4 Angola    30.0   32.0   34     36.0   37.9   39.5   39.9   39.9   40.6   41.0
##  5 Argent…   62.5   64.4   65.1   65.6   67.1   68.5   69.9   70.8   71.9   73.3
##  6 Austra…   69.1   70.3   70.9   71.1   71.9   73.5   74.7   76.3   77.6   78.8
##  7 Austria   66.8   67.5   69.5   70.1   70.6   72.2   73.2   74.9   76.0   77.5
##  8 Bahrain   50.9   53.8   56.9   59.9   63.3   65.6   69.1   70.8   72.6   73.9
##  9 Bangla…   37.5   39.3   41.2   43.5   45.3   46.9   50.0   52.8   56.0   59.4
## 10 Belgium   68     69.2   70.2   70.9   71.4   72.8   73.9   75.4   76.5   77.5
## # ℹ 132 more rows
## # ℹ 2 more variables: `2002` <dbl>, `2007` <dbl>
#Analysis of variance is a collection of statistical tests which can be used to test the difference in means between two or more groups.
library(broom)
gapdata %>% filter(year==2007) %>% filter(continent %in% c("Ameticas","Europe","Asia")) %>% aov(lifeExp~continent,data=.) %>% tidy()
## # A tibble: 2 × 6
##   term         df sumsq meansq statistic    p.value
##   <chr>     <dbl> <dbl>  <dbl>     <dbl>      <dbl>
## 1 continent     1  753.  753.       20.1  0.0000334
## 2 Residuals    61 2287.   37.5      NA   NA
#Non-parametric test for comparing two groups
africa_data<-gapdata %>% filter(year %in% c(1982,2007)) %>% filter(continent %in% c("Africa"))
p1<-africa_data %>% ggplot(aes(x=lifeExp))+geom_histogram(bins=15)+facet_wrap(~year)
p2<-africa_data %>% ggplot(aes(sample=lifeExp))+geom_qq()+geom_qq_line(colour="blue")+facet_wrap(~year)
p1

p2

dependent<-"year"
explanatory<-c("lifeExp","pop","gdpPercap")
africa_data %>% mutate(year=factor(year)) %>% summary_factorlist(dependent,explanatory,cont="median",p=TRUE)
##      label       levels                               1982
##    lifeExp Median (IQR)                50.8 (45.6 to 56.6)
##        pop Median (IQR) 5668228.5 (1569553.8 to 9788207.8)
##  gdpPercap Median (IQR)           1323.7 (828.7 to 2787.6)
##                                  2007     p
##                   52.9 (47.8 to 59.4) 0.149
##  10093310.5 (2909226.5 to 19363654.5) 0.033
##              1452.3 (863.0 to 3993.5) 0.503
fit_uk<-gapdata %>% filter(country=="United Kingdom") %>% lm(lifeExp~year,data=.)
fit_uk %>% summary()
## 
## Call:
## lm(formula = lifeExp ~ year, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69767 -0.31962  0.06642  0.36601  0.68165 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.942e+02  1.464e+01  -20.10 2.05e-09 ***
## year         1.860e-01  7.394e-03   25.15 2.26e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4421 on 10 degrees of freedom
## Multiple R-squared:  0.9844, Adjusted R-squared:  0.9829 
## F-statistic: 632.5 on 1 and 10 DF,  p-value: 2.262e-10
fit_uk %>% tidy()
## # A tibble: 2 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -294.     14.6         -20.1 2.05e- 9
## 2 year           0.186   0.00739      25.1 2.26e-10