Prepare resources which may help carry out exploration and analysis

#Get access to dataset
getwd()
## [1] "C:/Users/seanl/Documents/StatisticalInferenceAndProbability"
#Create data.frame variable using life expectancy data set
lifeExpectancy <- read.csv("Life Expectancy Plus Geo Region.csv")

Use required libraries

library(pastecs) #For creating descriptive statistic summaries
## Warning: package 'pastecs' was built under R version 4.2.2
library(ggplot2) #For creating different plots
library(semTools) #For skewness and kurtosis
## Warning: package 'semTools' was built under R version 4.2.2
## Loading required package: lavaan
## Warning: package 'lavaan' was built under R version 4.2.2
## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.
## 
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(pastecs) #For creating descriptive statistic summaries
library(FSA) #For percentage
## Warning: package 'FSA' was built under R version 4.2.2
## ## FSA v0.9.3. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
library(car) # For Levene's test for homogeneity of variance 
## Warning: package 'car' was built under R version 4.2.2
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
##   method       from
##   hist.boot    FSA 
##   confint.boot FSA
## 
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
## 
##     bootCase
library(effectsize) #To calculate effect size for t-test
## Warning: package 'effectsize' was built under R version 4.2.2
library(psych) # Some useful descriptive functions
## Warning: package 'psych' was built under R version 4.2.2
## 
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
## 
##     phi
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:FSA':
## 
##     headtail
## The following objects are masked from 'package:semTools':
## 
##     reliability, skew
## The following object is masked from 'package:lavaan':
## 
##     cor2cov
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gmodels) #For creating histograms with more detail than plot
## Warning: package 'gmodels' was built under R version 4.2.2
library(sjstats)#chi-square effect size
## Warning: package 'sjstats' was built under R version 4.2.2
## 
## Attaching package: 'sjstats'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following object is masked from 'package:psych':
## 
##     phi
## The following objects are masked from 'package:effectsize':
## 
##     cohens_f, phi
## The following object is masked from 'package:FSA':
## 
##     se

Create reusable function for histograms

#Create 5th and 95th percentile variables for convenience
fp <- 0.05
nfp <- 0.95

TrimmedAverage <- function(dataSet, featureNumber,low, high)
{
    #Calculate lower and upper percentiles
    percentileLow <- quantile(dataSet[,featureNumber], low)
    percentileHigh <- quantile(dataSet[,featureNumber], high)

    #Select only rows between these two values
    dataSet <- dataSet[dataSet[,featureNumber] > percentileLow & dataSet[,featureNumber] < percentileHigh,] 
    return(dataSet)
}

CreateHistogram <- function(dataSet,featurname,xlab,ylab,title,binwidth = 1, meanOffset = 10,medianOffset = 10,trimmed = F,low = fp,high = nfp)
{
  #Find feature index
  featureNo <- dataSet[,which(colnames(dataSet) == featurname)]
  
  
  #For calculating trimmed mean
  if(trimmed == T)
    dataSet <- TrimmedAverage(dataSet,featureNo,low,high)
 
  #Visualize data
  ggplot(dataSet,aes(featureNo)) + 
  geom_histogram(binwidth = binwidth) + 
  geom_vline(xintercept = mean(featureNo),col = 'black', lwd = 2) +
  geom_vline(xintercept = median(featureNo),col = 'blue', lwd = 2) +
  annotate("text", x = mean(featureNo) + meanOffset,200, label = paste("Mean =", round(mean(featureNo),2)), col = "black", size = 4) +
  annotate("text", x = median(featureNo) + medianOffset, 250, label = paste("Median =", round(median(featureNo),2)), col = "blue", size = 4) +
  xlab(xlab) +
  ylab(ylab) +
  ggtitle(title) + 
  theme(legend.key.size = unit(.025, 'cm'), legend.key.height = unit(.025, 'cm'), legend.key.width = unit(.01, 'cm'), legend.title = element_text(size=14), legend.text =  element_text(size=10)) 
}

Q1. Does population size impact adult mortality rate?

Step 1: Explore data

Create subset containing required variables

#Create population - adult mortality rate data frame
popIndex <- which(colnames(lifeExpectancy) == "Population")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")

#Create subset with needed variables
populationxmortality <- lifeExpectancy[c(1,popIndex,mortalityIndex)]

Remove NA values

populationxmortality <- na.omit(populationxmortality)

Create second subset

#Create subset with needed variables
populationxmortalityLarge <- lifeExpectancy[c(1,popIndex,mortalityIndex)]

Remove NA values

populationxmortalityLarge <- na.omit(populationxmortalityLarge)

Clean first subset for testing purposes

#Contain highest population values for each country only
populationxmortality <- populationxmortality %>% 
    group_by(Country) %>% 
    slice(which.max(Population))

#Only use values equal to or over 40 for adult mortality rate
populationxmortality <- populationxmortality %>%
    group_by(Adult.Mortality) %>%
    add_tally() %>%
    filter(Adult.Mortality >= 40)

#Remove outlier (India)
populationxmortality <- populationxmortality[-53,]

Scatter Plot 1

#Scatter plot of population size and adult mortality rates using new data set
scatter <- ggplot(populationxmortality, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate") 
## `geom_smooth()` using formula 'y ~ x'

Scatter Plot 2

#Scatter plot of population size and adult mortality rates using initial data set
scatter <- ggplot(populationxmortalityLarge, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate")
## `geom_smooth()` using formula 'y ~ x'

Step 2: Analyse Data

Use Pearson test

pearson_results_Q1 <- cor.test(lifeExpectancy$Population, lifeExpectancy$Adult.Mortality, method = "pearson")
pearson_results_Q1
## 
##  Pearson's product-moment correlation
## 
## data:  lifeExpectancy$Population and lifeExpectancy$Adult.Mortality
## t = -0.65198, df = 2282, p-value = 0.5145
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05463130  0.02738326
## sample estimates:
##         cor 
## -0.01364697

Find the shared relationship between the two variables

r_squared <- pearson_results_Q1[4]

  r_squared[[1]]^2
##          cor 
## 0.0001862398

Q2. Does alcohol consumption impact mortality rate?

Step 1: Explore Data

Create subset containing required variables

#Create alcohol - adult mortality rate data frame
alcoholIndex <- which(colnames(lifeExpectancy) == "Alcohol")

#Create subset with needed variables
alcoholxnormality <- lifeExpectancy[c(1,alcoholIndex,mortalityIndex)]

Remove NA values

alcoholxnormality <- na.omit(alcoholxnormality)

Visualize distribution for each variable

#Create histogram for alcohol consumption

CreateHistogram(alcoholxnormality,"Alcohol",xlab = "Alcohol Consumption", ylab = "Count", title = "Alcohol consumption per person worldwide",binwidth =.5,meanOffset = 2.25,medianOffset = 3.25)

#Create histogram for adult mortality rate

CreateHistogram(alcoholxnormality,"Adult.Mortality",xlab = "Adult.Mortality", ylab = "Count", title = "Adult mortality rate (2000-2015)",binwidth = 15,meanOffset = 100,medianOffset = 115)

scatter <- ggplot(alcoholxnormality, aes(Alcohol,Adult.Mortality)) + geom_point() + labs(x = "Alcohol Consumption", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Alcohol Consumption", y = "Adult mortality rate") 
## `geom_smooth()` using formula 'y ~ x'

Step 2: Analyse Data

Use Spearman test

spearman_results_Q2 <- cor.test(alcoholxnormality$Alcohol, alcoholxnormality$Adult.Mortality, method = "kendall")
spearman_results_Q2
## 
##  Kendall's rank correlation tau
## 
## data:  alcoholxnormality$Alcohol and alcoholxnormality$Adult.Mortality
## z = -11.153, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## -0.1432638

Find covariance

r_squared2 <- spearman_results_Q2[4]

  r_squared2[[1]]^2
##       tau 
## 0.0205245

Q3. Does adult mortality rate vary for developed v developing countries?

Step 1: Explore data

Create subset

#Create status - adult mortality rate data frame
statusIndex <- which(colnames(lifeExpectancy) == "Status")

#Create subset with needed variables
statusxmortality <- lifeExpectancy[c(1,statusIndex,mortalityIndex)]

#Remove NA values
statusxmortality <- na.omit(statusxmortality)

View variance in adult mortality rates

histogramQ3 <- ggplot(statusxmortality, aes(x=Adult.Mortality))

#Change the label of the x axis
histogramQ3 <- histogramQ3 + labs(x="Aldult Mortality")

#manage bin width and colors
histogramQ3 <- histogramQ3 + geom_histogram(binwidth= 20, colour="black", aes(y=..density.., fill=..count..))
histogramQ3 <- histogramQ3 + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")

#Create histogram with normality curve
histogramQ3 <- histogramQ3 + stat_function(fun=dnorm, color="red",args=list(mean=mean(statusxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(statusxmortality$Adult.Mortality, na.rm=TRUE)))

#display graph 
histogramQ3

Inspect normality using q-q plot

#Create a qqplot
qqnorm(statusxmortality$Adult.Mortality)
qqline(statusxmortality$Adult.Mortality, col=2) #show a line on the plot

Generate summary statistics for adult mortality rates

pastecs::stat.desc(statusxmortality$Adult.Mortality, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02 
##     coef.var 
## 7.542158e-01
#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem

adultMortalitySkew<-semTools::skew(statusxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(statusxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]
## skew (g1) 
##  25.94267
adultMortalityKurt[1]/adultMortalityKurt[2]
## Excess Kur (g2) 
##        19.31679
# Calculate the percentage of standardised scores that are greater than 1.96

z_adult_mortality<- abs(scale(statusxmortality$Adult.Mortality))

FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")
## [1] 4.986339

Step 2: Analysis

Describe Adult Mortality Rates by group (developed v developing countries - variable status)

#Get descriptive stastitics by group - output as a matrix
psych::describeBy(statusxmortality$Adult.Mortality, statusxmortality$Status, mat=TRUE)
#Conduct Levene's test for homogeneity of variance 
car::leveneTest(Adult.Mortality ~ Status, data=statusxmortality)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
#Conduct the t-test 
stats::t.test(Adult.Mortality~Status,data=statusxmortality)
## 
##  Welch Two Sample t-test
## 
## data:  Adult.Mortality by Status
## t = -30.745, df = 2174.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Developed and group Developing is not equal to 0
## 95 percent confidence interval:
##  -109.72693  -96.56836
## sample estimates:
##  mean in group Developed mean in group Developing 
##                 79.68555                182.83320
res <- stats::t.test(Adult.Mortality~Status,data=statusxmortality)

#Calculate Cohen's d
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
effcd
##     t 
## -1.32
#Using function from effectsize package
effectsize::t_to_d(t = res$statistic, res$parameter)
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
##     t 
## 0.303

Q4. Does adult mortality rate vary by geographic region?

Step 1: Explore

Create subset using original data set

#Create status - adult mortality rate data frame
RegionIndex <- which(colnames(lifeExpectancy) == "Region")

#Create subset with needed variables
regionxmortality <- lifeExpectancy[c(1,RegionIndex,mortalityIndex)]

#Remove NA values
regionxmortality <- na.omit(regionxmortality)

Inspect normality using histogram

#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(regionxmortality, aes(x=Adult.Mortality))

#Change the label of the x axis
gg <- gg + labs(x="Adult mortality rate")

#manage bin width and colours
gg <- gg + geom_histogram(binwidth=20, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")

#Display histogram
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(regionxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(regionxmortality$Adult.Mortality, na.rm=TRUE)))

#Display plot
gg

Inspect normality using q-q plot

#Create a qqplot
qqnorm(regionxmortality$Adult.Mortality)
qqline(regionxmortality$Adult.Mortality, col=2) #show a line on the plot

pastecs::stat.desc(regionxmortality$Adult.Mortality, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02 
##     coef.var 
## 7.542158e-01
#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem

adultMortalitySkew<-semTools::skew(regionxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(regionxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]
## skew (g1) 
##  25.94267
adultMortalityKurt[1]/adultMortalityKurt[2]
## Excess Kur (g2) 
##        19.31679
# Calculate the percentage of standardised scores that are greater than 1.96

z_adult_mortality<- abs(scale(regionxmortality$Adult.Mortality))

FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")
## [1] 4.986339
FSA::perc(as.numeric(z_adult_mortality), 3.29, "gt")
## [1] 1.058743

Step 2: Analyse

#Get descriptive statistics by group - output as a matrix
psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)
#Store the output to use in our final reporting of the outcomes of ANOVA
sasdescrip<-psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)

#Conduct Bartlett's test for homogeneity of variance in library car 
stats::bartlett.test(Adult.Mortality~ Region, data=regionxmortality)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Adult.Mortality by Region
## Bartlett's K-squared = 822.03, df = 4, p-value < 2.2e-16
#p value is < 0.05 so the result is statistically significant, we can assume homogeneity


# Compute the analysis of variance using the oneway.test from the stats package- we set VAR.equal to be true because we have homogeneity
res <- stats::oneway.test(regionxmortality$Adult.Mortality ~ as.factor(regionxmortality$Region), data = regionxmortality, var.equal = TRUE)
# Summary of the analysis
res
## 
##  One-way analysis of means
## 
## data:  regionxmortality$Adult.Mortality and as.factor(regionxmortality$Region)
## F = 293.24, num df = 4, denom df = 2923, p-value < 2.2e-16
#Compute Eta squared
effes=effectsize::effectsize(res)
effes
#Store the relevant pieces of the output from ANOVA in variables to use for reporting
#Degrees of freedom
df1=res$parameter[1]
df2=res$parameter[2]
#F statistic
Fstat=round(res$statistic, 3)
#Pvalue
pval=round(res$p.value,2)