Probability and Statistical Inference Assignment 1

Prepare resources which may help carry out exploration and analysis

#Get access to dataset
getwd()

## [1] "C:/Users/seanl/Documents/StatisticalInferenceAndProbability"

#Create data.frame variable using life expectancy data set
lifeExpectancy <- read.csv("Life Expectancy Plus Geo Region.csv")

Use required libraries

library(pastecs) #For creating descriptive statistic summaries

## Warning: package 'pastecs' was built under R version 4.2.2

library(ggplot2) #For creating different plots
library(semTools) #For skewness and kurtosis

## Warning: package 'semTools' was built under R version 4.2.2

## Loading required package: lavaan

## Warning: package 'lavaan' was built under R version 4.2.2

## This is lavaan 0.6-12
## lavaan is FREE software! Please report any bugs.

##

## ###############################################################################

## This is semTools 0.5-6

## All users of R (or SEM) are invited to submit functions or ideas for functions.

## ###############################################################################

library(pastecs) #For creating descriptive statistic summaries
library(FSA) #For percentage

## Warning: package 'FSA' was built under R version 4.2.2

## ## FSA v0.9.3. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.

library(car) # For Levene's test for homogeneity of variance

## Warning: package 'car' was built under R version 4.2.2

## Loading required package: carData

## Registered S3 methods overwritten by 'car':
##   method       from
##   hist.boot    FSA 
##   confint.boot FSA

## 
## Attaching package: 'car'

## The following object is masked from 'package:FSA':
## 
##     bootCase

library(effectsize) #To calculate effect size for t-test

## Warning: package 'effectsize' was built under R version 4.2.2

library(psych) # Some useful descriptive functions

## Warning: package 'psych' was built under R version 4.2.2

## 
## Attaching package: 'psych'

## The following object is masked from 'package:effectsize':
## 
##     phi

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:FSA':
## 
##     headtail

## The following objects are masked from 'package:semTools':
## 
##     reliability, skew

## The following object is masked from 'package:lavaan':
## 
##     cor2cov

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.2

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:pastecs':
## 
##     first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gmodels) #For creating histograms with more detail than plot

## Warning: package 'gmodels' was built under R version 4.2.2

library(sjstats)#chi-square effect size

## Warning: package 'sjstats' was built under R version 4.2.2

## 
## Attaching package: 'sjstats'

## The following object is masked from 'package:gmodels':
## 
##     ci

## The following object is masked from 'package:psych':
## 
##     phi

## The following objects are masked from 'package:effectsize':
## 
##     cohens_f, phi

## The following object is masked from 'package:FSA':
## 
##     se

Create reusable function for histograms

#Create 5th and 95th percentile variables for convenience
fp <- 0.05
nfp <- 0.95

TrimmedAverage <- function(dataSet, featureNumber,low, high)
{
    #Calculate lower and upper percentiles
    percentileLow <- quantile(dataSet[,featureNumber], low)
    percentileHigh <- quantile(dataSet[,featureNumber], high)

    #Select only rows between these two values
    dataSet <- dataSet[dataSet[,featureNumber] > percentileLow & dataSet[,featureNumber] < percentileHigh,] 
    return(dataSet)
}

CreateHistogram <- function(dataSet,featurname,xlab,ylab,title,binwidth = 1, meanOffset = 10,medianOffset = 10,trimmed = F,low = fp,high = nfp)
{
  #Find feature index
  featureNo <- dataSet[,which(colnames(dataSet) == featurname)]
  
  
  #For calculating trimmed mean
  if(trimmed == T)
    dataSet <- TrimmedAverage(dataSet,featureNo,low,high)
 
  #Visualize data
  ggplot(dataSet,aes(featureNo)) + 
  geom_histogram(binwidth = binwidth) + 
  geom_vline(xintercept = mean(featureNo),col = 'black', lwd = 2) +
  geom_vline(xintercept = median(featureNo),col = 'blue', lwd = 2) +
  annotate("text", x = mean(featureNo) + meanOffset,200, label = paste("Mean =", round(mean(featureNo),2)), col = "black", size = 4) +
  annotate("text", x = median(featureNo) + medianOffset, 250, label = paste("Median =", round(median(featureNo),2)), col = "blue", size = 4) +
  xlab(xlab) +
  ylab(ylab) +
  ggtitle(title) + 
  theme(legend.key.size = unit(.025, 'cm'), legend.key.height = unit(.025, 'cm'), legend.key.width = unit(.01, 'cm'), legend.title = element_text(size=14), legend.text =  element_text(size=10)) 
}

Q1. Does population size impact adult mortality rate?

Step 1: Explore data

Create subset containing required variables

#Create population - adult mortality rate data frame
popIndex <- which(colnames(lifeExpectancy) == "Population")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")

#Create subset with needed variables
populationxmortality <- lifeExpectancy[c(1,popIndex,mortalityIndex)]

Remove NA values

populationxmortality <- na.omit(populationxmortality)

Create second subset

#Create subset with needed variables
populationxmortalityLarge <- lifeExpectancy[c(1,popIndex,mortalityIndex)]

Remove NA values

populationxmortalityLarge <- na.omit(populationxmortalityLarge)

Clean first subset for testing purposes

#Contain highest population values for each country only
populationxmortality <- populationxmortality %>% 
    group_by(Country) %>% 
    slice(which.max(Population))

#Only use values equal to or over 40 for adult mortality rate
populationxmortality <- populationxmortality %>%
    group_by(Adult.Mortality) %>%
    add_tally() %>%
    filter(Adult.Mortality >= 40)

#Remove outlier (India)
populationxmortality <- populationxmortality[-53,]

Scatter Plot 1

#Scatter plot of population size and adult mortality rates using new data set
scatter <- ggplot(populationxmortality, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate")

## `geom_smooth()` using formula 'y ~ x'

Scatter Plot 2

#Scatter plot of population size and adult mortality rates using initial data set
scatter <- ggplot(populationxmortalityLarge, aes(Population,Adult.Mortality)) + geom_point() + labs(x = "Population size", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Population size", y = "Adult mortality rate")

## `geom_smooth()` using formula 'y ~ x'

Step 2: Analyse Data

Use Pearson test

pearson_results_Q1 <- cor.test(lifeExpectancy$Population, lifeExpectancy$Adult.Mortality, method = "pearson")
pearson_results_Q1

## 
##  Pearson's product-moment correlation
## 
## data:  lifeExpectancy$Population and lifeExpectancy$Adult.Mortality
## t = -0.65198, df = 2282, p-value = 0.5145
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.05463130  0.02738326
## sample estimates:
##         cor 
## -0.01364697

Find the shared relationship between the two variables

r_squared <- pearson_results_Q1[4]

  r_squared[[1]]^2

##          cor 
## 0.0001862398

Q2. Does alcohol consumption impact mortality rate?

Step 1: Explore Data

Create subset containing required variables

#Create alcohol - adult mortality rate data frame
alcoholIndex <- which(colnames(lifeExpectancy) == "Alcohol")

#Create subset with needed variables
alcoholxnormality <- lifeExpectancy[c(1,alcoholIndex,mortalityIndex)]

Remove NA values

alcoholxnormality <- na.omit(alcoholxnormality)

Visualize distribution for each variable

#Create histogram for alcohol consumption

CreateHistogram(alcoholxnormality,"Alcohol",xlab = "Alcohol Consumption", ylab = "Count", title = "Alcohol consumption per person worldwide",binwidth =.5,meanOffset = 2.25,medianOffset = 3.25)

#Create histogram for adult mortality rate

CreateHistogram(alcoholxnormality,"Adult.Mortality",xlab = "Adult.Mortality", ylab = "Count", title = "Adult mortality rate (2000-2015)",binwidth = 15,meanOffset = 100,medianOffset = 115)

scatter <- ggplot(alcoholxnormality, aes(Alcohol,Adult.Mortality)) + geom_point() + labs(x = "Alcohol Consumption", y = "Adult mortality rate") 

#Add a regression line
scatter + geom_point() + geom_smooth(method = "lm", colour = "Red", se = F) + labs(x = "Alcohol Consumption", y = "Adult mortality rate")

## `geom_smooth()` using formula 'y ~ x'

Step 2: Analyse Data

Use Spearman test

spearman_results_Q2 <- cor.test(alcoholxnormality$Alcohol, alcoholxnormality$Adult.Mortality, method = "kendall")
spearman_results_Q2

## 
##  Kendall's rank correlation tau
## 
## data:  alcoholxnormality$Alcohol and alcoholxnormality$Adult.Mortality
## z = -11.153, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##        tau 
## -0.1432638

Find covariance

r_squared2 <- spearman_results_Q2[4]

  r_squared2[[1]]^2

##       tau 
## 0.0205245

Q3. Does adult mortality rate vary for developed v developing countries?

Step 1: Explore data

Create subset

#Create status - adult mortality rate data frame
statusIndex <- which(colnames(lifeExpectancy) == "Status")

#Create subset with needed variables
statusxmortality <- lifeExpectancy[c(1,statusIndex,mortalityIndex)]

#Remove NA values
statusxmortality <- na.omit(statusxmortality)

View variance in adult mortality rates

histogramQ3 <- ggplot(statusxmortality, aes(x=Adult.Mortality))

#Change the label of the x axis
histogramQ3 <- histogramQ3 + labs(x="Aldult Mortality")

#manage bin width and colors
histogramQ3 <- histogramQ3 + geom_histogram(binwidth= 20, colour="black", aes(y=..density.., fill=..count..))
histogramQ3 <- histogramQ3 + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")

#Create histogram with normality curve
histogramQ3 <- histogramQ3 + stat_function(fun=dnorm, color="red",args=list(mean=mean(statusxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(statusxmortality$Adult.Mortality, na.rm=TRUE)))

#display graph 
histogramQ3

Inspect normality using q-q plot

#Create a qqplot
qqnorm(statusxmortality$Adult.Mortality)
qqline(statusxmortality$Adult.Mortality, col=2) #show a line on the plot

Generate summary statistics for adult mortality rates

pastecs::stat.desc(statusxmortality$Adult.Mortality, basic=F)

##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02 
##     coef.var 
## 7.542158e-01

#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem

adultMortalitySkew<-semTools::skew(statusxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(statusxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]

## skew (g1) 
##  25.94267

adultMortalityKurt[1]/adultMortalityKurt[2]

## Excess Kur (g2) 
##        19.31679

# Calculate the percentage of standardised scores that are greater than 1.96

z_adult_mortality<- abs(scale(statusxmortality$Adult.Mortality))

FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")

## [1] 4.986339

Step 2: Analysis

Describe Adult Mortality Rates by group (developed v developing countries - variable status)

#Get descriptive stastitics by group - output as a matrix
psych::describeBy(statusxmortality$Adult.Mortality, statusxmortality$Status, mat=TRUE)

#Conduct Levene's test for homogeneity of variance 
car::leveneTest(Adult.Mortality ~ Status, data=statusxmortality)

## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

#Conduct the t-test 
stats::t.test(Adult.Mortality~Status,data=statusxmortality)

## 
##  Welch Two Sample t-test
## 
## data:  Adult.Mortality by Status
## t = -30.745, df = 2174.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Developed and group Developing is not equal to 0
## 95 percent confidence interval:
##  -109.72693  -96.56836
## sample estimates:
##  mean in group Developed mean in group Developing 
##                 79.68555                182.83320

res <- stats::t.test(Adult.Mortality~Status,data=statusxmortality)

#Calculate Cohen's d
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
effcd

##     t 
## -1.32

#Using function from effectsize package
effectsize::t_to_d(t = res$statistic, res$parameter)

effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes

##     t 
## 0.303

Q4. Does adult mortality rate vary by geographic region?

Step 1: Explore

Create subset using original data set

#Create status - adult mortality rate data frame
RegionIndex <- which(colnames(lifeExpectancy) == "Region")

#Create subset with needed variables
regionxmortality <- lifeExpectancy[c(1,RegionIndex,mortalityIndex)]

#Remove NA values
regionxmortality <- na.omit(regionxmortality)

Inspect normality using histogram

#We will allocate the histogram to a variable to allow us to manipulate it
gg <- ggplot(regionxmortality, aes(x=Adult.Mortality))

#Change the label of the x axis
gg <- gg + labs(x="Adult mortality rate")

#manage bin width and colours
gg <- gg + geom_histogram(binwidth=20, colour="black", aes(y=..density.., fill=..count..))
gg <- gg + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")

#Display histogram
gg <- gg + stat_function(fun=dnorm, color="red",args=list(mean=mean(regionxmortality$Adult.Mortality, na.rm=TRUE), sd=sd(regionxmortality$Adult.Mortality, na.rm=TRUE)))

#Display plot
gg

Inspect normality using q-q plot

#Create a qqplot
qqnorm(regionxmortality$Adult.Mortality)
qqline(regionxmortality$Adult.Mortality, col=2) #show a line on the plot

pastecs::stat.desc(regionxmortality$Adult.Mortality, basic=F)

##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 1.440000e+02 1.647964e+02 2.296984e+00 4.503868e+00 1.544852e+04 1.242921e+02 
##     coef.var 
## 7.542158e-01

#We can make a decision based on the value of the standardised score for skew and kurtosis
#Divide the skew statistic by the standard error to get the standardised score, this will indicate if there is a problem

adultMortalitySkew<-semTools::skew(regionxmortality$Adult.Mortality)
adultMortalityKurt<-semTools::kurtosis(regionxmortality$Adult.Mortality)
adultMortalitySkew[1]/adultMortalitySkew[2]

## skew (g1) 
##  25.94267

adultMortalityKurt[1]/adultMortalityKurt[2]

## Excess Kur (g2) 
##        19.31679

# Calculate the percentage of standardised scores that are greater than 1.96

z_adult_mortality<- abs(scale(regionxmortality$Adult.Mortality))

FSA::perc(as.numeric(z_adult_mortality), 1.96, "gt")

## [1] 4.986339

FSA::perc(as.numeric(z_adult_mortality), 3.29, "gt")

## [1] 1.058743

Step 2: Analyse

#Get descriptive statistics by group - output as a matrix
psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)

#Store the output to use in our final reporting of the outcomes of ANOVA
sasdescrip<-psych::describeBy(regionxmortality$Adult.Mortality, regionxmortality$Region, mat=TRUE)

#Conduct Bartlett's test for homogeneity of variance in library car 
stats::bartlett.test(Adult.Mortality~ Region, data=regionxmortality)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  Adult.Mortality by Region
## Bartlett's K-squared = 822.03, df = 4, p-value < 2.2e-16

#p value is < 0.05 so the result is statistically significant, we can assume homogeneity


# Compute the analysis of variance using the oneway.test from the stats package- we set VAR.equal to be true because we have homogeneity
res <- stats::oneway.test(regionxmortality$Adult.Mortality ~ as.factor(regionxmortality$Region), data = regionxmortality, var.equal = TRUE)
# Summary of the analysis
res

## 
##  One-way analysis of means
## 
## data:  regionxmortality$Adult.Mortality and as.factor(regionxmortality$Region)
## F = 293.24, num df = 4, denom df = 2923, p-value < 2.2e-16

#Compute Eta squared
effes=effectsize::effectsize(res)
effes

#Store the relevant pieces of the output from ANOVA in variables to use for reporting
#Degrees of freedom
df1=res$parameter[1]
df2=res$parameter[2]
#F statistic
Fstat=round(res$statistic, 3)
#Pvalue
pval=round(res$p.value,2)

Q5. Is developing/developed status related to geographic region?

Step 1: Explore

Create subset using original data set

#Create subset with needed variables
regionxstatus <- lifeExpectancy[c(1,RegionIndex,statusIndex)]

#Remove NA values
regionxstatus <- na.omit(regionxstatus)

Remove duplicate rows

regionxstatus <- regionxstatus[!duplicated(regionxstatus),]

Step 2: Analyse

#Use the Crosstable function
#CrossTable(predictor, outcome, fisher = TRUE, chisq = TRUE, expected = TRUE)
gmodels::CrossTable(regionxstatus$Region, regionxstatus$Status, fisher = TRUE, chisq = TRUE, expected = TRUE, sresid = TRUE, format = "SPSS")

## Warning in chisq.test(t, correct = FALSE, ...): Chi-squared approximation may be
## incorrect

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |            Std Residual |
## |-------------------------|
## 
## Total Observations in Table:  193 
## 
##                      | regionxstatus$Status 
## regionxstatus$Region |  Developed  | Developing  |  Row Total | 
## ---------------------|------------|------------|------------|
##               Africa |         0  |        53  |        53  | 
##                      |     8.788  |    44.212  |            | 
##                      |     8.788  |     1.747  |            | 
##                      |     0.000% |   100.000% |    27.461% | 
##                      |     0.000% |    32.919% |            | 
##                      |     0.000% |    27.461% |            | 
##                      |    -2.964  |     1.322  |            | 
## ---------------------|------------|------------|------------|
##             Americas |         1  |        34  |        35  | 
##                      |     5.803  |    29.197  |            | 
##                      |     3.975  |     0.790  |            | 
##                      |     2.857% |    97.143% |    18.135% | 
##                      |     3.125% |    21.118% |            | 
##                      |     0.518% |    17.617% |            | 
##                      |    -1.994  |     0.889  |            | 
## ---------------------|------------|------------|------------|
##                 Asia |         3  |        45  |        48  | 
##                      |     7.959  |    40.041  |            | 
##                      |     3.089  |     0.614  |            | 
##                      |     6.250% |    93.750% |    24.870% | 
##                      |     9.375% |    27.950% |            | 
##                      |     1.554% |    23.316% |            | 
##                      |    -1.758  |     0.784  |            | 
## ---------------------|------------|------------|------------|
##               Europe |        26  |        15  |        41  | 
##                      |     6.798  |    34.202  |            | 
##                      |    54.240  |    10.781  |            | 
##                      |    63.415% |    36.585% |    21.244% | 
##                      |    81.250% |     9.317% |            | 
##                      |    13.472% |     7.772% |            | 
##                      |     7.365  |    -3.283  |            | 
## ---------------------|------------|------------|------------|
##              Oceania |         2  |        14  |        16  | 
##                      |     2.653  |    13.347  |            | 
##                      |     0.161  |     0.032  |            | 
##                      |    12.500% |    87.500% |     8.290% | 
##                      |     6.250% |     8.696% |            | 
##                      |     1.036% |     7.254% |            | 
##                      |    -0.401  |     0.179  |            | 
## ---------------------|------------|------------|------------|
##         Column Total |        32  |       161  |       193  | 
##                      |    16.580% |    83.420% |            | 
## ---------------------|------------|------------|------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  84.21641     d.f. =  4     p =  2.224329e-17 
## 
## 
##  
## Fisher's Exact Test for Count Data
## ------------------------------------------------------------
## Alternative hypothesis: two.sided
## p =  4.356093e-16 
## 
##  
##        Minimum expected frequency: 2.65285 
## Cells with Expected Frequency < 5: 1 of 10 (10%)

#Create contingency table
mytable<-xtabs(~Region+Status, data=regionxstatus)

ctest<-stats::chisq.test(mytable, correct=TRUE)#chi square test

## Warning in stats::chisq.test(mytable, correct = TRUE): Chi-squared approximation
## may be incorrect

ctest#will give the details of the test statistic and p-value

## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 84.216, df = 4, p-value < 2.2e-16

ctest$expected#expected frequencies

##           Status
## Region     Developed Developing
##   Africa    8.787565   44.21244
##   Americas  5.803109   29.19689
##   Asia      7.958549   40.04145
##   Europe    6.797927   34.20207
##   Oceania   2.652850   13.34715

ctest$observed#observed frequencies

##           Status
## Region     Developed Developing
##   Africa           0         53
##   Americas         1         34
##   Asia             3         45
##   Europe          26         15
##   Oceania          2         14

ctest$p.value

## [1] 2.224329e-17

#Calculate effect size
sjstats::phi(mytable)

## [1] 0.6605713

sjstats::cramer(mytable)

## [1] 0.6605713