#Get access to dataset
getwd()
## [1] "C:/Users/seanl/Documents/StatisticalInferenceAndProbability"
#Create data.frame variable using life expectancy data set
lifeExpectancy <- read.csv("Life Expectancy Plus Geo Region.csv")
Use required libraries
library(pastecs) #For creating descriptive statistic summaries
## Warning: package 'pastecs' was built under R version 4.2.2
library(ggplot2) #For creating different plots
library(semTools) #For skewness and kurtosis
## Warning: package 'semTools' was built under R version 4.2.2
## Loading required package: lavaan
## Warning: package 'lavaan' was built under R version 4.2.2
## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.
##
## ###############################################################################
## This is semTools 0.5-6
## All users of R (or SEM) are invited to submit functions or ideas for functions.
## ###############################################################################
library(pastecs) #For creating descriptive statistic summaries
library(FSA) #For percentage
## Warning: package 'FSA' was built under R version 4.2.2
## ## FSA v0.9.3. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
library(car) # For Levene's test for homogeneity of variance
## Warning: package 'car' was built under R version 4.2.2
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
## method from
## hist.boot FSA
## confint.boot FSA
##
## Attaching package: 'car'
## The following object is masked from 'package:FSA':
##
## bootCase
library(effectsize) #To calculate effect size for t-test
## Warning: package 'effectsize' was built under R version 4.2.2
library(psych) # Some useful descriptive functions
## Warning: package 'psych' was built under R version 4.2.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:effectsize':
##
## phi
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:FSA':
##
## headtail
## The following objects are masked from 'package:semTools':
##
## reliability, skew
## The following object is masked from 'package:lavaan':
##
## cor2cov
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gmodels) #For creating histograms with more detail than plot
## Warning: package 'gmodels' was built under R version 4.2.2
library(sjstats)#chi-square effect size
## Warning: package 'sjstats' was built under R version 4.2.2
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:gmodels':
##
## ci
## The following object is masked from 'package:psych':
##
## phi
## The following objects are masked from 'package:effectsize':
##
## cohens_f, phi
## The following object is masked from 'package:FSA':
##
## se
Create reusable function for histograms
#Create 5th and 95th percentile variables for convenience
fp <- 0.05
nfp <- 0.95
TrimmedAverage <- function(dataSet, featureNumber,low, high)
{
#Calculate lower and upper percentiles
percentileLow <- quantile(dataSet[,featureNumber], low)
percentileHigh <- quantile(dataSet[,featureNumber], high)
#Select only rows between these two values
dataSet <- dataSet[dataSet[,featureNumber] > percentileLow & dataSet[,featureNumber] < percentileHigh,]
return(dataSet)
}
CreateHistogram <- function(dataSet,featurname,xlab,ylab,title,binwidth = 1, meanOffset = 10,medianOffset = 10,trimmed = F,low = fp,high = nfp)
{
#Find feature index
featureNo <- dataSet[,which(colnames(dataSet) == featurname)]
#For calculating trimmed mean
if(trimmed == T)
dataSet <- TrimmedAverage(dataSet,featureNo,low,high)
#Visualize data
ggplot(dataSet,aes(featureNo)) +
geom_histogram(binwidth = binwidth) +
geom_vline(xintercept = mean(featureNo),col = 'black', lwd = 2) +
geom_vline(xintercept = median(featureNo),col = 'blue', lwd = 2) +
annotate("text", x = mean(featureNo) + meanOffset,200, label = paste("Mean =", round(mean(featureNo),2)), col = "black", size = 4) +
annotate("text", x = median(featureNo) + medianOffset, 250, label = paste("Median =", round(median(featureNo),2)), col = "blue", size = 4) +
xlab(xlab) +
ylab(ylab) +
ggtitle(title) +
theme(legend.key.size = unit(.025, 'cm'), legend.key.height = unit(.025, 'cm'), legend.key.width = unit(.01, 'cm'), legend.title = element_text(size=14), legend.text = element_text(size=10))
}
Create subset containing required variables
#Create population - adult mortality rate data frame
popIndex <- which(colnames(lifeExpectancy) == "Population")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")
#Create subset with needed variables
populationxmortality <- lifeExpectancy[c(1,popIndex,mortalityIndex)]
Remove NA values
populationxmortality <- na.omit(populationxmortality)
Create second subset
#Create subset with needed variables
populationxmortalityLarge <- lifeExpectancy[c(1,popIndex,mortalityIndex)]
Remove NA values
populationxmortalityLarge <- na.omit(populationxmortalityLarge)
Clean first subset for testing purposes
#Contain highest population values for each country only
populationxmortality <- populationxmortality %>%
group_by(Country) %>%
slice(which.max(Population))
#Only use values equal to or over 40 for adult mortality rate
populationxmortality <- populationxmortality %>%
group_by(Adult.Mortality) %>%
add_tally() %>%
filter(Adult.Mortality >= 40)
#Remove outlier (India)
populationxmortality <- populationxmortality[-53,]
Scatter Plot 1
#Scatter plot of population size and adult mortality rates using new data set
scatter <- ggplot(populationxmortality, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate")
#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate")
## `geom_smooth()` using formula 'y ~ x'
Scatter Plot 2
#Scatter plot of population size and adult mortality rates using initial data set
scatter <- ggplot(populationxmortalityLarge, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate")
#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate")
## `geom_smooth()` using formula 'y ~ x'
Use Pearson test
pearson_results_Q1 <- cor.test(lifeExpectancy$Population, lifeExpectancy$Adult.Mortality, method = "pearson")
pearson_results_Q1
##
## Pearson's product-moment correlation
##
## data: lifeExpectancy$Population and lifeExpectancy$Adult.Mortality
## t = -0.65198, df = 2282, p-value = 0.5145
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.05463130 0.02738326
## sample estimates:
## cor
## -0.01364697
Find the shared relationship between the two variables
r_squared <- pearson_results_Q1[4]
r_squared[[1]]^2
## cor
## 0.0001862398
Create subset containing required variables
#Create alcohol - adult mortality rate data frame
alcoholIndex <- which(colnames(lifeExpectancy) == "Alcohol")
#Create subset with needed variables
alcoholxnormality <- lifeExpectancy[c(1,alcoholIndex,mortalityIndex)]
Remove NA values
alcoholxnormality <- na.omit(alcoholxnormality)
Visualize distribution for each variable
#Create histogram for alcohol consumption
CreateHistogram(alcoholxnormality,"Alcohol",xlab = "Alcohol Consumption", ylab = "Count", title = "Alcohol consumption per person worldwide",binwidth =.5,meanOffset = 2.25,medianOffset = 3.25)
#Create histogram for adult mortality rate
CreateHistogram(alcoholxnormality,"Adult.Mortality",xlab = "Adult.Mortality", ylab = "Count", title = "Adult mortality rate (2000-2015)",binwidth = 15,meanOffset = 100,medianOffset = 115)
scatter <- ggplot(alcoholxnormality, aes(Alcohol,Adult.Mortality)) + geom_point() + labs(x = "Alcohol Consumption", y = "Adult mortality rate")
#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Alcohol Consumption", y = "Adult mortality rate")
## `geom_smooth()` using formula 'y ~ x'
Use Spearman test
spearman_results_Q2 <- cor.test(alcoholxnormality$Alcohol, alcoholxnormality$Adult.Mortality, method = "kendall")
spearman_results_Q2
##
## Kendall's rank correlation tau
##
## data: alcoholxnormality$Alcohol and alcoholxnormality$Adult.Mortality
## z = -11.153, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.1432638
Find covariance
r_squared2 <- spearman_results_Q2[4]
r_squared2[[1]]^2
## tau
## 0.0205245
Create subset
#Create status - adult mortality rate data frame
statusIndex <- which(colnames(lifeExpectancy) == "Status")
#Create subset with needed variables
statusxmortality <- lifeExpectancy[c(1,statusIndex,mortalityIndex)]
#Remove NA values
statusxmortality <- na.omit(statusxmortality)
View variance in adult mortality rates
histogramQ3 <- ggplot(statusxmortality, aes(x=Adult.Mortality))
#Change the label of the x axis
histogramQ3 <- histogramQ3 + labs(x="Aldult Mortality")
#manage bin width and colors
histogramQ3 <- histogramQ3 + geom_histogram(binwidth= 20, colour="black", aes(y=..density.., fill=..count..))
histogramQ3 <- histogramQ3 + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#Create histogram with normality curve
histogramQ3 <- histogramQ3 + stat_function(fun=dnorm, color="red",args=list(mean=mean(statusxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(statusxmortality$Adult.Mortality, na.rm=TRUE)))
#display graph
histogramQ3
Inspect normality using q-q plot
#Create a qqplot
qqnorm(statusxmortality$Adult.Mortality)
qqline(statusxmortality$Adult.Mortality, col=2) #show a line on the plot
Generate summary statistics for adult mortality rates
pastecs::stat.desc(statusxmortality$Adult.Mortality, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02
## coef.var
## 7.542158e-01
#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem
adultMortalitySkew<-semTools::skew(statusxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(statusxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]
## skew (g1)
## 25.94267
adultMortalityKurt[1]/adultMortalityKurt[2]
## Excess Kur (g2)
## 19.31679
# Calculate the percentage of standardised scores that are greater than 1.96
z_adult_mortality<- abs(scale(statusxmortality$Adult.Mortality))
FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")
## [1] 4.986339
Step 2: Analysis
Describe Adult Mortality Rates by group (developed v developing countries - variable status)
#Get descriptive stastitics by group - output as a matrix
psych::describeBy(statusxmortality$Adult.Mortality, statusxmortality$Status, mat=TRUE)
#Conduct Levene's test for homogeneity of variance
car::leveneTest(Adult.Mortality ~ Status, data=statusxmortality)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
#Conduct the t-test
stats::t.test(Adult.Mortality~Status,data=statusxmortality)
##
## Welch Two Sample t-test
##
## data: Adult.Mortality by Status
## t = -30.745, df = 2174.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Developed and group Developing is not equal to 0
## 95 percent confidence interval:
## -109.72693 -96.56836
## sample estimates:
## mean in group Developed mean in group Developing
## 79.68555 182.83320
res <- stats::t.test(Adult.Mortality~Status,data=statusxmortality)
#Calculate Cohen's d
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
effcd
## t
## -1.32
#Using function from effectsize package
effectsize::t_to_d(t = res$statistic, res$parameter)
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
## t
## 0.303
Step 1: Explore
Create subset using original data set
#Create status - adult mortality rate data frame
RegionIndex <- which(colnames(lifeExpectancy) == "Region")
#Create subset with needed variables
regionxmortality <- lifeExpectancy[c(1,RegionIndex,mortalityIndex)]
#Remove NA values
regionxmortality <- na.omit(regionxmortality)
Inspect normality using histogram
#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(regionxmortality, aes(x=Adult.Mortality))
#Change the label of the x axis
gg <- gg + labs(x="Adult mortality rate")
#manage bin width and colours
gg <- gg + geom_histogram(binwidth=20, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
#Display histogram
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(regionxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(regionxmortality$Adult.Mortality, na.rm=TRUE)))
#Display plot
gg
Inspect normality using q-q plot
#Create a qqplot
qqnorm(regionxmortality$Adult.Mortality)
qqline(regionxmortality$Adult.Mortality, col=2) #show a line on the plot
pastecs::stat.desc(regionxmortality$Adult.Mortality, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02
## coef.var
## 7.542158e-01
#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem
adultMortalitySkew<-semTools::skew(regionxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(regionxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]
## skew (g1)
## 25.94267
adultMortalityKurt[1]/adultMortalityKurt[2]
## Excess Kur (g2)
## 19.31679
# Calculate the percentage of standardised scores that are greater than 1.96
z_adult_mortality<- abs(scale(regionxmortality$Adult.Mortality))
FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")
## [1] 4.986339
FSA::perc(as.numeric(z_adult_mortality), 3.29, "gt")
## [1] 1.058743
Step 2: Analyse
#Get descriptive statistics by group - output as a matrix
psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)
#Store the output to use in our final reporting of the outcomes of ANOVA
sasdescrip<-psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)
#Conduct Bartlett's test for homogeneity of variance in library car
stats::bartlett.test(Adult.Mortality~ Region, data=regionxmortality)
##
## Bartlett test of homogeneity of variances
##
## data: Adult.Mortality by Region
## Bartlett's K-squared = 822.03, df = 4, p-value < 2.2e-16
#p value is < 0.05 so the result is statistically significant, we can assume homogeneity
# Compute the analysis of variance using the oneway.test from the stats package- we set VAR.equal to be true because we have homogeneity
res <- stats::oneway.test(regionxmortality$Adult.Mortality ~ as.factor(regionxmortality$Region), data = regionxmortality, var.equal = TRUE)
# Summary of the analysis
res
##
## One-way analysis of means
##
## data: regionxmortality$Adult.Mortality and as.factor(regionxmortality$Region)
## F = 293.24, num df = 4, denom df = 2923, p-value < 2.2e-16
#Compute Eta squared
effes=effectsize::effectsize(res)
effes
#Store the relevant pieces of the output from ANOVA in variables to use for reporting
#Degrees of freedom
df1=res$parameter[1]
df2=res$parameter[2]
#F statistic
Fstat=round(res$statistic, 3)
#Pvalue
pval=round(res$p.value,2)