# Environment setting (to remove scientific notation)
options(scipen = 999)
# Packages loaded
library(tidyverse)
library(kableExtra)
library(skimr)
library(caret)
library(tableone)
library(factoextra)
library(FactoMineR)
library(car)
library(mvnormtest)
library(biotools)
library(corrplot)
library(cowplot)
library(hrbrthemes)
library(broom)
library(ggpubr)
library(rstatix)
library(nlme)
Dataset used in this project is a publicly available on Kaggle website. Kaggle is a well-known platform for data analysts and data scientists for sharing data, codes, resources, and ideas. The dataset is accessible via this Link.
Following are 10 randomly selected observations (rows) from the dataset:
life <- read.csv("Life Expectancy Data.csv")
sample_n(life, 10)
No. <- c(1:22)
Variable <- c("Country",
"Year",
"Status",
"Life expectancy",
"Adult Mortality",
"infant deaths",
"Alcohol",
"percentage expenditure",
"Hepatitis B",
"Measles",
"BMI",
"under-five deaths",
"Polio",
"Total expenditure",
"Diphtheria",
"HIV/AIDS",
"GDP",
"Population",
"thinness 1-19 years",
"thinness 5-9 years",
"Income composition of resources",
"Schooling")
Description <- c("Country",
"Year",
"Developed or Developing status",
"Life Expectancy in age",
"Adult Mortality Rates of both sexes (probability of dying between 15 and 60 years per 1000 population)",
"Number of Infant Deaths per 1000 population",
"Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol)",
"Expenditure on health as a percentage of Gross Domestic Product per capita(%)",
"Hepatitis B (HepB) immunization coverage among 1-year-olds (%)",
"Measles - number of reported cases per 1000 population",
"Average Body Mass Index of entire population",
"Number of under-five deaths per 1000 population",
"Polio (Pol3) immunization coverage among 1-year-olds (%)",
"General government expenditure on health as a percentage of total government expenditure (%)",
"Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%)",
"Deaths per 1 000 live births HIV/AIDS (0-4 years)",
"Gross Domestic Product per capita (in USD)",
"Population of the country",
"Prevalence of thinness among children and adolescents for Age 10 to 19 (%)",
"Prevalence of thinness among children for Age 5 to 9(%)",
"Human Development Index in terms of income composition of resources (index ranging from 0 to 1)",
"Number of years of Schooling(years)"
)
data.frame(No., Variable, Description) %>%
kbl() %>%
kable_styling(bootstrap_options = c("bordered", "stripped", "hover"),
full_width = T)
| No. | Variable | Description |
|---|---|---|
| 1 | Country | Country |
| 2 | Year | Year |
| 3 | Status | Developed or Developing status |
| 4 | Life expectancy | Life Expectancy in age |
| 5 | Adult Mortality | Adult Mortality Rates of both sexes (probability of dying between 15 and 60 years per 1000 population) |
| 6 | infant deaths | Number of Infant Deaths per 1000 population |
| 7 | Alcohol | Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol) |
| 8 | percentage expenditure | Expenditure on health as a percentage of Gross Domestic Product per capita(%) |
| 9 | Hepatitis B | Hepatitis B (HepB) immunization coverage among 1-year-olds (%) |
| 10 | Measles | Measles - number of reported cases per 1000 population |
| 11 | BMI | Average Body Mass Index of entire population |
| 12 | under-five deaths | Number of under-five deaths per 1000 population |
| 13 | Polio | Polio (Pol3) immunization coverage among 1-year-olds (%) |
| 14 | Total expenditure | General government expenditure on health as a percentage of total government expenditure (%) |
| 15 | Diphtheria | Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%) |
| 16 | HIV/AIDS | Deaths per 1 000 live births HIV/AIDS (0-4 years) |
| 17 | GDP | Gross Domestic Product per capita (in USD) |
| 18 | Population | Population of the country |
| 19 | thinness 1-19 years | Prevalence of thinness among children and adolescents for Age 10 to 19 (%) |
| 20 | thinness 5-9 years | Prevalence of thinness among children for Age 5 to 9(%) |
| 21 | Income composition of resources | Human Development Index in terms of income composition of resources (index ranging from 0 to 1) |
| 22 | Schooling | Number of years of Schooling(years) |
There are 2938 rows of data and 22 columns of variables.
glimpse(life)
## Rows: 2,938
## Columns: 22
## $ Country <chr> "Afghanistan", "Afghanistan", "Afghani…
## $ Year <int> 2015, 2014, 2013, 2012, 2011, 2010, 20…
## $ Status <chr> "Developing", "Developing", "Developin…
## $ Life.expectancy <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ Adult.Mortality <int> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant.deaths <int> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ Alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percentage.expenditure <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ Hepatitis.B <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ Measles <int> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ BMI <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under.five.deaths <int> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ Polio <int> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ Total.expenditure <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ Diphtheria <int> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ HIV.AIDS <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ GDP <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ Population <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ thinness..1.19.years <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ thinness.5.9.years <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ Income.composition.of.resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ Schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
There are 2 character variables “Country” and “Status”, and the rest of the variables are numerical.
skim_without_charts(life)
| Name | life |
| Number of rows | 2938 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 20 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Country | 0 | 1 | 4 | 52 | 0 | 193 | 0 |
| Status | 0 | 1 | 9 | 10 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Year | 0 | 1.00 | 2007.52 | 4.61 | 2000.00 | 2004.00 | 2008.00 | 2012.00 | 2015.00 |
| Life.expectancy | 10 | 1.00 | 69.22 | 9.52 | 36.30 | 63.10 | 72.10 | 75.70 | 89.00 |
| Adult.Mortality | 10 | 1.00 | 164.80 | 124.29 | 1.00 | 74.00 | 144.00 | 228.00 | 723.00 |
| infant.deaths | 0 | 1.00 | 30.30 | 117.93 | 0.00 | 0.00 | 3.00 | 22.00 | 1800.00 |
| Alcohol | 194 | 0.93 | 4.60 | 4.05 | 0.01 | 0.88 | 3.76 | 7.70 | 17.87 |
| percentage.expenditure | 0 | 1.00 | 738.25 | 1987.91 | 0.00 | 4.69 | 64.91 | 441.53 | 19479.91 |
| Hepatitis.B | 553 | 0.81 | 80.94 | 25.07 | 1.00 | 77.00 | 92.00 | 97.00 | 99.00 |
| Measles | 0 | 1.00 | 2419.59 | 11467.27 | 0.00 | 0.00 | 17.00 | 360.25 | 212183.00 |
| BMI | 34 | 0.99 | 38.32 | 20.04 | 1.00 | 19.30 | 43.50 | 56.20 | 87.30 |
| under.five.deaths | 0 | 1.00 | 42.04 | 160.45 | 0.00 | 0.00 | 4.00 | 28.00 | 2500.00 |
| Polio | 19 | 0.99 | 82.55 | 23.43 | 3.00 | 78.00 | 93.00 | 97.00 | 99.00 |
| Total.expenditure | 226 | 0.92 | 5.94 | 2.50 | 0.37 | 4.26 | 5.76 | 7.49 | 17.60 |
| Diphtheria | 19 | 0.99 | 82.32 | 23.72 | 2.00 | 78.00 | 93.00 | 97.00 | 99.00 |
| HIV.AIDS | 0 | 1.00 | 1.74 | 5.08 | 0.10 | 0.10 | 0.10 | 0.80 | 50.60 |
| GDP | 448 | 0.85 | 7483.16 | 14270.17 | 1.68 | 463.94 | 1766.95 | 5910.81 | 119172.74 |
| Population | 652 | 0.78 | 12753375.12 | 61012096.51 | 34.00 | 195793.25 | 1386542.00 | 7420359.00 | 1293859294.00 |
| thinness..1.19.years | 34 | 0.99 | 4.84 | 4.42 | 0.10 | 1.60 | 3.30 | 7.20 | 27.70 |
| thinness.5.9.years | 34 | 0.99 | 4.87 | 4.51 | 0.10 | 1.50 | 3.30 | 7.20 | 28.60 |
| Income.composition.of.resources | 167 | 0.94 | 0.63 | 0.21 | 0.00 | 0.49 | 0.68 | 0.78 | 0.95 |
| Schooling | 163 | 0.94 | 11.99 | 3.36 | 0.00 | 10.10 | 12.30 | 14.30 | 20.70 |
From the variables “n_missing” and “complete_rate”, they detected a lot of missing data in many variables. Mean and standard deviation are also provided.
A good thing is that there is no variable having more than 40% missing data because I will follow a 60% rule that if a column has less than 0.6 completion rate (having 0.4 or 40% missing data), I will consider dropping the variable.
The number of missing data in each column can also be examined using following code:
colSums(is.na(life))
## Country Year
## 0 0
## Status Life.expectancy
## 0 10
## Adult.Mortality infant.deaths
## 10 0
## Alcohol percentage.expenditure
## 194 0
## Hepatitis.B Measles
## 553 0
## BMI under.five.deaths
## 34 0
## Polio Total.expenditure
## 19 226
## Diphtheria HIV.AIDS
## 19 0
## GDP Population
## 448 652
## thinness..1.19.years thinness.5.9.years
## 34 34
## Income.composition.of.resources Schooling
## 167 163
Here performs horizontal missing value check.
life %>%
mutate(row.id = c(1:nrow(life))) %>%
relocate(row.id, .before = Country) %>%
gather(key = "variables", value = "values", -row.id) %>%
filter(is.na(values)) %>%
group_by(row.id) %>%
summarise(count = n()) %>%
arrange(-count) %>%
mutate(total.column = 22,
propor.NA.per.row = paste0(round(count/total.column*100,1), "%"))
It is a common procedure that if a row (observation) contains too many missing values, it should be removed otherwise filling up its missing values by estimation would be too unrealistic and synthetic. Generally, one should remove a row if it contains 60% of missing values. One can also choose not to remove a row with many missing values if some of its data is useful to analyse a particular important business question.
From the table above, there are 1289 rows out of the 2938 rows contain missing value. It will be less practical to remove all of these rows just to be able to ensure there is no missing data in the dataset. Furthermore, the maximum proportion of missing value is 40.9%.
In this project, I will adopt an imputation method using machine learning algorithm to fill up these missing values. The algorithm will make use of the entire dataset and predict what might the best possible estimates for these missing cells.
Remove leading and/or trailing white spaces in character variables “Country” and “Status”.
life <- life %>%
mutate(Country = str_replace_all(Country, "[[:punct:]]", " "),
Country = trimws(Country),
Status = trimws(Status))
Replaces spaces in some of the country names by underscore. For example “Antigua and Barbuda” to “Antigua_and_Barbuda”. This is to avoid complication during imputation using machine learning technique.
life <- life %>%
mutate(Country = str_replace_all(Country, " ", "_"))
This section converts “Country”, “Year”, and “Status” into factor because of their categorical nature.
life <- life %>%
mutate(Country = as.factor(Country),
Year = as.factor(Year),
Status = as.factor(Status))
From this point onward, the dataset object name will be changed from “life” to “life2”, so to have “life” object as a backup before further transformations.
life2 <- life
This section applies imputation model to fill up missing values in the dataset. There are many types of imputation methods including using mean, median, and mode (most occurring values, generally applied to categorical data). One of the methods is applying machine learning algorithms for imputation. They will make use of the entire dataset to predict possible estimates for these missing values. It is generally considered a better estimation technique if missing values are not too much because it takes into account all information in the dataset when filling up missing values.
However, one should note that it is still an estimation technique and is not 100% accurate representation of the truth, checking and transformation might be required, depends on the nature of the variable. It is disputable that whether this technique is the best technique, it depends on a case-by-case basis and the goals of each analysis. Since the goal of this project is to demonstrate my skills instead of doing real, detail analysis for WHO, I will use the machine learning technique to fill up these missing value.
Following codes complete the imputation of missing values using bagged-tree algorithm.
One hot encoding (Making dummies):
#dummy_function <- dummyVars(~., data = life2)
#life_dummy <- dummy_function %>% predict(life2)
Impute using bagged tree models
# bagimpute <- life_dummy %>% preProcess(method = "bagImpute")
# life_dummy_imputed <- bagimpute %>% predict(life_dummy)
# life_dummy_df <- as.tibble(life_dummy_imputed)
Replace the columns that has missing values in the original dataset “life2”.
# Extract imputed columns into life2
# life2$Life.expectancy <- life_dummy_df$Life.expectancy
# life2$Adult.Mortality <- life_dummy_df$Adult.Mortality
# life2$Alcohol <- life_dummy_df$Alcohol
# life2$Hepatitis.B <- life_dummy_df$Hepatitis.B
# life2$BMI <- life_dummy_df$BMI
# life2$Polio <- life_dummy_df$Polio
# life2$Total.expenditure <- life_dummy_df$Total.expenditure
# life2$Diphtheria <- life_dummy_df$Diphtheria
# life2$GDP <- life_dummy_df$GDP
# life2$Population <- life_dummy_df$Population
# life2$thinness..1.19.years <- life_dummy_df$thinness..1.19.years
# life2$thinness.5.9.years <- life_dummy_df$thinness.5.9.years
# life2$Income.composition.of.resources <- life_dummy_df$Income.composition.of.resources
# life2$Schooling <- life_dummy_df$Schooling
The above imputation has been completed and the new imputed dataset has been saved. Following section imports the new dataset back to R with some manipulation to make sure it is perfect prior to analysis.
# write.csv(life2, "life2.csv")
life.data <- read.csv("life2.csv", header = T, stringsAsFactors = T)
life.data <- life.data %>%
dplyr::select(-X) %>%
mutate(Year = as.factor(Year)) %>%
rename("thinness.btw.10.19years" = "thinness..1.19.years",
"thinness.btw.5.9years" = "thinness.5.9.years"
)
Checking the data structure.
str(life.data)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : Factor w/ 193 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : Factor w/ 16 levels "2000","2001",..: 16 15 14 13 12 11 10 9 8 7 ...
## $ Status : Factor w/ 2 levels "Developed","Developing": 2 2 2 2 2 2 2 2 2 2 ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : num 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : num 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : num 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : num 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness.btw.10.19years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.btw.5.9years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
Most of the variables have skewed distribution, except that life expectancy, schooling, total expenditure, and income composition of resources look like having a normal distribution. BMI has a clear bimodal distribution.
# set up df
life.data.num <- life.data %>% select_if(is.numeric)
# plot
life.data.num %>%
gather(key = key, value = value) %>%
mutate(key = as.factor(key)) %>%
ggplot(aes(x = value, fill = key)) +
geom_histogram(colour = "black") +
facet_wrap(~key, scales = "free", ncol = 5) +
theme_classic() +
theme(legend.position = "none",
strip.background = element_rect(fill = "grey"),
strip.text = element_text(colour = "black", face = "bold", size = 8)) +
labs(x = "Values", y = "Count")
From following boxplot, almost all numerical independent variables have outliers except BMI.
life.data.num %>%
gather(key = key, value = value) %>%
mutate(key = as.factor(key)) %>%
ggplot(aes(x = value, fill = key)) +
geom_boxplot(colour = "black") +
facet_wrap(~key, scales = "free") +
theme_classic() +
theme(legend.position = "none",
strip.background = element_rect(fill = "grey"),
strip.text = element_text(colour = "black", face = "bold", size = 9),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
Following shows the amount of outliers in each variable, in
percentage.
variable.name <- colnames(life.data.num)
outlier.percentage <- c(
(length(boxplot.stats(life.data.num$Life.expectancy)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Adult.Mortality)$out)/2938)*100,
(length(boxplot.stats(life.data.num$infant.deaths)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Alcohol)$out)/2938)*100,
(length(boxplot.stats(life.data.num$percentage.expenditure)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Hepatitis.B)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Measles)$out)/2938)*100,
(length(boxplot.stats(life.data.num$BMI)$out)/2938)*100,
(length(boxplot.stats(life.data.num$under.five.deaths)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Polio)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Total.expenditure)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Diphtheria)$out)/2938)*100,
(length(boxplot.stats(life.data.num$HIV.AIDS)$out)/2938)*100,
(length(boxplot.stats(life.data.num$GDP)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Population)$out)/2938)*100,
(length(boxplot.stats(life.data.num$thinness..1.19.years)$out)/2938)*100,
(length(boxplot.stats(life.data.num$thinness.5.9.years)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Income.composition.of.resources)$out)/2938)*100,
(length(boxplot.stats(life.data.num$Schooling)$out)/2938)*100
)
data.frame(variable.name, outlier.percentage) %>%
mutate(variable.name = as.factor(variable.name),
outlier.percentage = round(outlier.percentage, 2)) %>%
ggplot(aes(y = fct_reorder(variable.name, outlier.percentage), x = outlier.percentage)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(plot.title = element_text(face = "bold")) +
labs(x = "Outlier Count (%)",
y = "Variables",
title = "Proportion of Outliers in Each Variable (By IQR Method)") +
geom_label(aes(label = paste0(outlier.percentage, "%"))) +
scale_x_continuous(limits = c(0, 20))
There are different types of outliers and are broadly categorised into (1) false outliers that caused by data input error or typo (which I assume won’t be happening in this dataset) and (2) true outliers that the outlier data points are naturally just too different from other data points.
There are many ways to deal with outliers, one can simply rectify it if it is caused by human error, however if it is not, there are two main methods to use:
Trimming, remove the entire row in a table if the row contains an outlier and this may cause a substantial reduction of sample size,
Winsorisation, capping the outliers by changing them to pre-specified maximum and minimum quantile values, such as the 95% quantile and the 5% quantile.
However, trimming and winsorisation are only feasible if we have less than 5% outliers otherwise the hypothesis testing outcome would be affected statistically and may not revealing desirable trends.
I will not manage the outliers now, instead I will use other outlier definitions when building a multiple linear regression model in later section. For examples leverage, influence, and cooks distance values. They define outliers via assessing their influence on the regression model.
Multiple Factor Analysis (MFA) will be used in this section to find the relationships among these variables all at once. It is a type of unsupervised machine learning techniques to reduce the dimensions of the datasets into a few principal components for trend mining purposes.
One of the “specific effect” of MFA is that all variables (both categorical and numerical) can be grouped together and have their trends and inter-relationships monitored all at once.
Defining the themes of grouping for MFA and I found that the variables in the dataset can be grouped into following 6 subcategories.
status Country Year Status
longevity Life.expectancy Adult.Mortality infant.deaths under.five.deaths
health Alcohol BMI
healthcare percentage.expenditure Hepatitis.B Polio Total.expenditure Diphtheria
disease Measles HIV.AIDS thinness..1.19.years thinness.5.9.years
economic GDP Population Schooling Income.composition.of.resources
The “status” group will be identified as supplementary group in the analysis which means that the group will not be participating in the MFA but will be used to understand the data as supplementary information. All other groups will be treated as active group and they will take part in MFA computation.
# Data set rearrangement
life.data.mfa <- life.data %>%
dplyr::select(Country, Year, Status, Life.expectancy, Adult.Mortality, infant.deaths, under.five.deaths, Alcohol, BMI, percentage.expenditure, Hepatitis.B, Polio, Total.expenditure, Diphtheria, Measles, HIV.AIDS, thinness.btw.10.19years, thinness.btw.5.9years, GDP, Population, Schooling, Income.composition.of.resources)
# mfa
res.mfa <- MFA(life.data.mfa,
group = c(3, 4, 2, 5, 4, 4),
type = c("n", "s", "s", "s", "s", "s"),
name.group = c("status", "longevity", "health", "healthcare", "disease", "economic"),
num.group.sup = 1,
graph = F)
Total variation in the dataset has been extracted and are expressed in 10 principal components (Dimensions). The first two dimensions explain the most variation (34.6% and 13.5%), and the amount of variation explains in subsequent dimensions hit a drastic reduction after the second dimension, based on the Knee Method of Scree plot, I will use the first two dimensions for the factor map.
fviz_screeplot(res.mfa, addlabels = T, ylim = c(0, 40))
In following plot, supplementary group “status” is colored with green and red for the active groups.
f1 <- fviz_mfa_var(res.mfa, choice = "group", repel = T)
f2 <- fviz_contrib(res.mfa, choice = "group", axes = 1)
f3 <- fviz_contrib(res.mfa, choice = "group", axes = 2)
f4 <- fviz_contrib(res.mfa, choice = "group", axes = 1:2)
f234 <- plot_grid(f2, f3, f4, ncol = 3)
plot_grid(f1, f234, ncol = 1)
Insights:
In dimension 1 (34.6%) on the top factor map, variables in all active groups except “healthcare” have almost identical coordinates, which means they contribute similarly to the first dimension. Healthcare group contribution is slightly lesser than the rest.
In dimension 2 (13.5%) on the top factor map, the variables in the group “longevity” contribute the most to the second dimension, followed by the variables in the “economic” group.
In three of the following plots, the red line is “expected average value”, group that higher than this line are important groups in explaining the variability in the dataset. Variables important to Dim-1 belong to economic, longevity and health group. Variables important to Dim-2 belong to “longevity” and “economic”.
Variables from both “longevity” and “economic” are most important to both axes.
Now, let’s look at the most exciting Factor map of MFA:
fviz_mfa_var(res.mfa,
choice = "quanti.var",
palette = "jco",
repel = T,
habillage = "Status")
Guides:
Positively related variables are grouped together.
Negatively related variables are positioned on opposite sides of the plot origin.
Arrows of each variables indicate the quality of the variable on the factor map, variables with long arrows are well represented on the factor map (more important to look at). The metric that related to the representation of variables is technically known as “Cosine-squared”.
Insights:
The variables associated with the “economic” group and “healthcare” group are positively associated with each other, except the variable “population”. It indicates the better the economic of a country, the better healthcare system a country can establish, and therefore, people of that country can have a longer lifespan, as indicated by the positively related “Life expectancy” variable.
High “Body mass index - BMI” and “Alcohol consumption” would be the main issues of health when economic increases.
The “economic” group and “healthcare” group are less associated and negatively associated with variables in the “disease” group. When economics and healthcare variables have reduction in values, the variables in the disease group increase their magnitudes.
HIV_AIDS is most likely the reason causing adult mortality.
Measles is strongly associated with “death under 5 years old” and “infant deaths”. Measles is a typical infectious viral disease that causing fever and skin red rash in early childhood. Both the variables “death under 5 years old” and “infant deaths” have a near-perfect positive correlation.
“Thinness in 5 to 9 years old” and “Thinness in 10 to 19 years old” are also having a near-perfect positive correlation.
Following shows the “MFA-Individual” map. This plot shows all the data points from the dataset onto the same factor map as above, with overlapping of supplementary categorical variables, which are “Country”, “Year”, and “Development Status”. Three bar plots located at the bottom are guides to see what variables contribute the most to each dimension (Dimension 1, Dimension 2, or both Dimension 1-2). Variables that contribute the most are the most important variables in explaining the variability in the data.
f1 <- fviz_mfa_ind(res.mfa,
repel = T,
palette = "jco",
habillage = "Status",
addEllipses = T,
ellipse.type = "convex")
f2 <- fviz_contrib(res.mfa, choice = "quanti.var", axes = 1) + theme(legend.position = "top")
f3 <- fviz_contrib(res.mfa, choice = "quanti.var", axes = 2) + theme(legend.position = "top")
bottom <- plot_grid(f2, f3, ncol = 2)
plot_grid(f1, bottom,
ncol = 1)
Insights:
There is a cluster for developing countries with their data points, and a cluster for developed countries with their data points. The separation is moderately good with some overlapping.
Data points (or called “Individual” technically in PC methods) with similar profile are grouped together.
Dimension 1 is contributed the most by BMI, Life expectancy, schooling, Income composition of resources, Alcohol consumption, thinness between 5 to 9 years old, and thinness between 10 to 19 years old. The red horizontal line in the bottom plots is a line of average contribution by all variables, variables that having a contribution higher than the red line is considered important for interpretation.
Dimension 2 is contributed to most by Infant deaths, under-5 death, population, and measles. India is the most associated country to these attributes.
Developing countries have higher positive association with “thinness between 10 to 19 years old” and “thinness between 5 to 9 years old”. They have negative relationship or lower association with BMI, Life expectancy, schooling, Income composition of resources, and Alcohol consumption. Whereas, developed countries are the opposite of these characterization.
Now we roughly know where are the data points (individuals) of developing and developed countries located.
Following shows the partial points of each data points.
fviz_mfa_ind(res.mfa,
partial = "all",
palette = "jco"
) +
labs(title = "Partial Individuals Analysis",
subtitle = "Individual - MFA")
Since there are so many data points crowded together, We can only draw some general trends.
On disease group point of view, developing countries (many are located at the left side of the vertical line in the graph) has high value for disease group (“Measles”, “HIV.AIDS”, “thinness..1.19.years”, and “thinness.5.9.years”).
“Thinness..1.19.years” and “thinness.5.9.years” contributed the most to dimension 1 and many developing countries have high value in these disease variables. This partial analysis is not very effective at capturing trends clearly in this dataset, let’s move on to the next section.
Key points:
length(unique(life.data$Country))
## [1] 193
unique(life.data$Year)
## [1] 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001
## [16] 2000
## 16 Levels: 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 ... 2015
unique(life.data$Status)
## [1] Developing Developed
## Levels: Developed Developing
Sample size needs to be big enough or statistically enough to represent the profile of a particular population. There are many theories about what is the best sample size:
In an environmental-controlled experiment, the absolute minimum is suggested to be 6
When a sample size reach 23 to 25, the t-distribution will become quite close to z-distribution (population distribution)
Therefore, in a rule of thumb, a minimum of 30 is suggested the best sample size
The best sample size depends on situation and application really. The more is better but depends on the costs allowed. There is math to help achieving a desired sample size (means plus/minus the margin of error), but this usually suggests a large sample size and the feasibility depends on approved resources. In short, the bigger the sample size, we will hope it would start represent the population statistics, with smaller margin of error, and the confidence interval at a confidence level of interest (95% generally).
table(life.data$Status)
##
## Developed Developing
## 512 2426
life.data %>% dplyr::select( Year) %>% group_by(Year) %>% summarise(count = n()) %>%
ggplot(aes(x = Year, y = count)) +
geom_bar(stat= "identity") +
geom_text(aes(label = paste0("(", count, ")"), vjust = -1)) +
theme_classic() +
scale_y_continuous(limits = c(0, 220)) +
labs(title = "There are 183 data recorded for each Year (193 for 2013)")
There are 10 countries with only 1 sample collected and all remaining 183 countries have only 16 samples collected. There would be a lot of sampling bias if the factor variable “Country” is involved in statistical comparison, the data might be too small to represent each country to be convincing.
life.data %>% dplyr::select(Country) %>%
group_by(Country) %>%
summarise(sample.size = n()) %>%
arrange(sample.size) %>%
mutate(sample.size = as.factor(sample.size)) %>%
group_by(sample.size) %>%
summarise(count = n()) %>%
ggplot(aes(x = "", y = count, fill = sample.size)) +
geom_histogram(stat = "identity") +
coord_polar(theta = "y", start = 0) +
geom_text(aes(label = paste0("Countries with Sample Size ", sample.size, ": \n", count)),
position = position_stack(vjust = 0.5)) +
theme_minimal() +
theme(axis.title = element_blank(),
legend.position = "none",
strip.text = element_text(size = 15),
plot.title = element_text(hjust = 0.5, face = "bold", vjust = 2),
plot.subtitle = element_text(hjust = 0.5, vjust = 2))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Following show the summary statistics of developed and developing countries. The “skew” stands for skewness of data distribution, a 0 value of skew means normal distribution. The “kurt” stands for “Kurtosis”, it is a metric measures the spread of the data, a positive value indicates a less spread of the data (better), a negative value indicates more spread of the data (flat-shape distribution).
life.df <- life.data %>%
dplyr::select(- Country)
mytable <- CreateTableOne(data = life.df, strata = "Status", test = F)
summary(mytable)
##
## ### Summary of continuous variables ###
##
## Status: Developed
## n miss p.miss mean sd median
## Life.expectancy 512 0 0 79.2 3.93 79.2
## Adult.Mortality 512 0 0 79.7 47.88 73.0
## infant.deaths 512 0 0 1.5 4.59 0.0
## Alcohol 512 0 0 9.8 2.69 10.3
## percentage.expenditure 512 0 0 2703.6 3824.20 846.6
## Hepatitis.B 512 0 0 86.7 18.26 92.9
## Measles 512 0 0 499.0 2529.08 12.0
## BMI 512 0 0 51.8 17.20 57.5
## under.five.deaths 512 0 0 1.8 5.38 0.0
## Polio 512 0 0 93.7 10.78 96.0
## Total.expenditure 512 0 0 7.5 2.90 7.9
## Diphtheria 512 0 0 93.5 12.53 96.0
## HIV.AIDS 512 0 0 0.1 0.00 0.1
## GDP 512 0 0 19393.8 22520.46 8477.3
## Population 512 0 0 6988754.9 13340196.29 3180573.5
## thinness.btw.10.19years 512 0 0 1.3 0.76 1.1
## thinness.btw.5.9years 512 0 0 1.3 0.83 1.0
## Income.composition.of.resources 512 0 0 0.8 0.05 0.9
## Schooling 512 0 0 15.8 1.72 15.5
## p25 p75 min max skew
## Life.expectancy 76.8 81.7 69.90 89.0 0.09
## Adult.Mortality 58.0 96.0 1.00 229.0 0.68
## infant.deaths 0.0 1.0 0.00 28.0 4.99
## Alcohol 8.7 11.6 0.01 15.2 -1.40
## percentage.expenditure 92.9 4102.9 0.00 19479.9 1.88
## Hepatitis.B 86.5 96.0 2.00 99.0 -3.11
## Measles 0.0 96.5 0.00 33812.0 8.89
## BMI 53.8 61.3 3.20 69.6 -1.91
## under.five.deaths 0.0 2.0 0.00 33.0 4.97
## Polio 93.0 98.0 9.00 99.0 -6.75
## Total.expenditure 6.4 9.1 1.10 17.6 -0.10
## Diphtheria 93.8 98.0 9.00 99.0 -6.04
## HIV.AIDS 0.1 0.1 0.10 0.1 NaN
## GDP 2192.4 33607.7 12.28 119172.7 1.47
## Population 308063.2 7334141.7 123.00 82534176.0 3.80
## thinness.btw.10.19years 0.7 1.9 0.30 4.0 0.82
## thinness.btw.5.9years 0.6 1.9 0.20 4.3 0.93
## Income.composition.of.resources 0.8 0.9 0.70 0.9 -0.43
## Schooling 14.8 16.6 11.50 20.7 0.44
## kurt
## Life.expectancy -0.12
## Adult.Mortality 0.50
## infant.deaths 24.62
## Alcohol 2.74
## percentage.expenditure 3.57
## Hepatitis.B 10.06
## Measles 91.84
## BMI 2.30
## under.five.deaths 24.57
## Polio 50.00
## Total.expenditure 1.56
## Diphtheria 37.95
## HIV.AIDS NaN
## GDP 2.16
## Population 15.75
## thinness.btw.10.19years 0.07
## thinness.btw.5.9years 0.44
## Income.composition.of.resources -0.46
## Schooling 0.28
## ------------------------------------------------------------
## Status: Developing
## n miss p.miss mean sd
## Life.expectancy 2426 0 0 67.1 9.0
## Adult.Mortality 2426 0 0 182.6 127.8
## infant.deaths 2426 0 0 36.4 128.9
## Alcohol 2426 0 0 3.5 3.3
## percentage.expenditure 2426 0 0 323.5 846.7
## Hepatitis.B 2426 0 0 76.4 26.2
## Measles 2426 0 0 2824.9 12528.8
## BMI 2426 0 0 35.2 19.4
## under.five.deaths 2426 0 0 50.5 175.4
## Polio 2426 0 0 80.1 24.6
## Total.expenditure 2426 0 0 5.6 2.2
## Diphtheria 2426 0 0 79.9 24.7
## HIV.AIDS 2426 0 0 2.1 5.5
## GDP 2426 0 0 3731.1 8149.3
## Population 2426 0 0 12954433.8 58911004.3
## thinness.btw.10.19years 2426 0 0 5.6 4.5
## thinness.btw.5.9years 2426 0 0 5.7 4.6
## Income.composition.of.resources 2426 0 0 0.6 0.2
## Schooling 2426 0 0 11.2 3.0
## median p25 p75 min max
## Life.expectancy 69.0 61.1 74.0 36.30 89.0
## Adult.Mortality 163.0 92.0 252.8 1.00 723.0
## infant.deaths 6.0 1.0 28.0 0.00 1800.0
## Alcohol 2.6 0.6 5.8 0.01 17.9
## percentage.expenditure 48.4 3.6 257.7 0.00 9748.6
## Hepatitis.B 88.0 66.0 96.0 1.00 99.0
## Measles 18.0 0.0 514.5 0.00 212183.0
## BMI 34.4 18.4 53.1 1.00 87.3
## under.five.deaths 7.0 1.0 39.0 0.00 2500.0
## Polio 91.0 74.0 97.0 3.00 99.0
## Total.expenditure 5.5 4.2 6.8 0.37 17.2
## Diphtheria 89.0 74.0 96.0 2.00 99.0
## HIV.AIDS 0.1 0.1 1.4 0.10 50.6
## GDP 777.1 464.5 3527.8 1.68 88564.8
## Population 3742413.0 463420.2 8474064.0 34.00 1293859294.0
## thinness.btw.10.19years 4.8 2.1 7.9 0.10 27.7
## thinness.btw.5.9years 4.7 2.1 7.9 0.10 28.6
## Income.composition.of.resources 0.6 0.5 0.7 0.00 0.9
## Schooling 11.7 9.3 13.2 0.00 18.3
## skew kurt
## Life.expectancy -0.63 -0.4
## Adult.Mortality 1.00 1.3
## infant.deaths 8.93 96.2
## Alcohol 0.97 0.4
## percentage.expenditure 5.52 38.7
## Hepatitis.B -1.38 0.9
## Measles 8.66 96.1
## BMI 0.04 -1.2
## under.five.deaths 8.66 91.0
## Polio -1.84 2.7
## Total.expenditure 0.64 1.3
## Diphtheria -1.83 2.6
## HIV.AIDS 4.89 28.5
## GDP 4.81 29.8
## Population 16.64 323.0
## thinness.btw.10.19years 1.54 3.4
## thinness.btw.5.9years 1.61 3.8
## Income.composition.of.resources -1.17 1.5
## Schooling -0.82 1.3
##
## Standardize mean differences
## 1 vs 2
## Life.expectancy 1.7378374
## Adult.Mortality 1.0665702
## infant.deaths 0.3824245
## Alcohol 2.1043509
## percentage.expenditure 0.8593778
## Hepatitis.B 0.4545989
## Measles 0.2573518
## BMI 0.9044805
## under.five.deaths 0.3926356
## Polio 0.7193888
## Total.expenditure 0.7551148
## Diphtheria 0.6915955
## HIV.AIDS 0.5089256
## GDP 0.9248812
## Population 0.1396753
## thinness.btw.10.19years 1.3359964
## thinness.btw.5.9years 1.3273474
## Income.composition.of.resources 1.8289669
## Schooling 1.8549254
##
## =======================================================================================
##
## ### Summary of categorical variables ###
##
## Status: Developed
## var n miss p.miss level freq percent cum.percent
## Year 512 0 0.0 2000 32 6.2 6.2
## 2001 32 6.2 12.5
## 2002 32 6.2 18.8
## 2003 32 6.2 25.0
## 2004 32 6.2 31.2
## 2005 32 6.2 37.5
## 2006 32 6.2 43.8
## 2007 32 6.2 50.0
## 2008 32 6.2 56.2
## 2009 32 6.2 62.5
## 2010 32 6.2 68.8
## 2011 32 6.2 75.0
## 2012 32 6.2 81.2
## 2013 32 6.2 87.5
## 2014 32 6.2 93.8
## 2015 32 6.2 100.0
##
## Status 512 0 0.0 Developed 512 100.0 100.0
## Developing 0 0.0 100.0
##
## ------------------------------------------------------------
## Status: Developing
## var n miss p.miss level freq percent cum.percent
## Year 2426 0 0.0 2000 151 6.2 6.2
## 2001 151 6.2 12.4
## 2002 151 6.2 18.7
## 2003 151 6.2 24.9
## 2004 151 6.2 31.1
## 2005 151 6.2 37.3
## 2006 151 6.2 43.6
## 2007 151 6.2 49.8
## 2008 151 6.2 56.0
## 2009 151 6.2 62.2
## 2010 151 6.2 68.5
## 2011 151 6.2 74.7
## 2012 151 6.2 80.9
## 2013 161 6.6 87.6
## 2014 151 6.2 93.8
## 2015 151 6.2 100.0
##
## Status 2426 0 0.0 Developed 0 0.0 0.0
## Developing 2426 100.0 100.0
##
##
## Standardize mean differences
## 1 vs 2
## Year 0.01574
## Status NaN
A box plot will help visually compares the central tendency statistics of developed and developing countries. Following shows the box plots of each numerical variables in the dataset, categorized by country status and overlapped with data points.
Note:
life.data %>%
dplyr::select(-Country, -Year) %>%
pivot_longer(c(2:20), names_to = "my.var", values_to = "my.val") %>%
mutate(my.var = as.factor(my.var)) %>%
ggplot(aes(x = Status, y = my.val, color = Status)) +
geom_jitter(alpha = 0.1) +
geom_boxplot(outlier.shape = NA, color = "black", alpha = 0) +
stat_summary(fun = mean, geom = "point", color = "black", shape = 4, size = 5) +
facet_wrap(~my.var, scales = "free") +
stat_boxplot(geom = "errorbar", width = 0.2, color = "black") +
theme_bw() +
theme(legend.position = "top",
plot.title = element_text(face = "bold", hjust = 0.5)) +
labs(x = "Country Status",
y = "Values",
title = "Box Plot")
Insights
I will use the 5 grouping themes in MFA to help interpretation. Following are general trends detected from the boxplot:
Regarding longevity, developed countries has higher life expectancy, lower adult mortality, infant.deaths, and under.five.deaths.
Regarding health, alcohol consumption and BMI are the main concern issues of most developed countries. However, both of the issues also happen in many developing countries.
Regarding healthcare, using median, developed countries has higher “percentage.expenditure”, which is the expenditure on health as a percentage of Gross Domestic Product per capita(%), and higher “Total expenditure”, which is the general government expenditure on health as a percentage of total government expenditure (%). Both developing countries and developed countries have similar immunisation coverage of “Hepatitis.B”, “Polio”, and “Diphtheria”. However, there are many data points showing some developing countries need improvement in immunisation of these diseases.
Regarding disease, Measles, HIV.AIDS, thinness..1.19.years, and thinness.5.9.years are more severe in developing countries.
Regarding economic, GDP is generally a lot higher in developed countries, whereas population is generally higher in developing countries. People in developed countries have higher years of schooling. Developed countries also have higher “Income.composition.of.resources” which is the Human Development Index in terms of income composition of resources.
In following sections, a series of business questions have been designed and will be answered using statistical techniques.
What are the predicting variables actually affecting the life expectancy?
In this question, life expectancy will be the dependent variables and the other variables are independent variables.
First, I will test multicollinearity among numerical independent variables. Multicollinearity is an extreme problem when there is high correlation among two or more independent numerical variables. The consequences are that (1) standard errors will be inflated, and which affects the accuracy of the Beta coefficient estimates, (2) Lower down the t statistics, and (3) therefore increase the P-value. It is usually understood as redundancy of variables in the dataset when two variables are highly correlated.
Two thresholds will be used to tackle multicollinearity
When correlation is higher than 7 or VIF (Variance inflation factor) > 5, then there would be a serious multicollinearity problem. Though there are these maximum thresholds for us to refer to but the general aim is to try to keep these thresholds as low as possible.
From the correlation graph below, highly correlated independent variables (both negatively and positively) are identified:
Under.five.deaths and infant.deaths, at a perfect correlation of 1. Under.five.deaths will be selected during multiple linear regression because it has slightly better correlation with life expectancy.
GDP and percentage expenditure, at a correlation of 0.9. GDP will be selected with the same reason as Under.five.death.
Both thinness variables are highly correlated (0.94) and both are correlated to life expectancy at the same degree. I will keep thinness prevalence between 5 to 9 years old as it might be more important to look at because children are more vulnerable compared to 10 to 19 years old teenagers.
Schooling and income.composition.of.resource are highly correlated, I will choose schooling for the same reason as the variable Under.five.death.
# correlation computation
num.var <- life.data %>% select_if(is.numeric)
num.var_cor <- cor(num.var)
# plot
corrplot(num.var_cor, method = "number", type = "upper")
Therefore, I will build my multiple linear regression model without following numerical variables:
Removed_Variables <- c("under.five.deaths", "thinness.btw.10.19years", "percentage.expenditure", "Income.composition.of.resources")
Removed_Variables <- data.frame(Removed_Variables)
Removed_Variables %>% kbl() %>% kable_styling(bootstrap_options = c("bordered", "striped", "hover"), full_width = F)
| Removed_Variables |
|---|
| under.five.deaths |
| thinness.btw.10.19years |
| percentage.expenditure |
| Income.composition.of.resources |
Building the model:
res.mlr <- lm(Life.expectancy ~ .- Country - Year - under.five.deaths - thinness.btw.10.19years - percentage.expenditure - Income.composition.of.resources, data = life.data)
Before interpreting the model, I will run a series of diagnostic plots to make sure the reliability of the statistical outcomes in the mode.
Applying variance inflation factor (VIF) to check multicollinearity.
vifout <- car::vif(res.mlr)
vifout %>% kbl(col.names = "VIF") %>%
kable_styling(bootstrap_options = c("stripped", "bordered", "hover"), full_width = F)
| VIF | |
|---|---|
| Status | 1.899528 |
| Adult.Mortality | 1.759069 |
| infant.deaths | 2.204212 |
| Alcohol | 1.953773 |
| Hepatitis.B | 2.027274 |
| Measles | 1.363928 |
| BMI | 1.756366 |
| Polio | 1.991569 |
| Total.expenditure | 1.185514 |
| Diphtheria | 2.525734 |
| HIV.AIDS | 1.417868 |
| GDP | 1.356121 |
| Population | 1.467981 |
| thinness.btw.5.9years | 2.025135 |
| Schooling | 2.433919 |
VIF of all variables are closed to 1 and 2, which is excellent, and the multicollinearity problem can be said has been removed from the model with the aid of correlation analysis.
par(mfrow = c(2, 3))
plot(res.mlr, which = 1:5)
* In “Residuals vs Fitter” graph, the data points of
residuals follow roughly a straight line at approximate 0. Therefore, I
assume that the relationship between the independent variables and life
expectancy is linear.
The Q-Q plot, most of the data in the middle follows a straight line. It indicates that the residuals are roughly normally distributed.
In Scale-Location, there is a gradually decreasing variance indicating that it may be an indication of heteroscedasticity.
In the Residuals vs Leverage plot, this plot help to find influential data point that an inclusion or exclusion of it can alter the results of the multiple linear regression because these data points are associated with large residuals. Visually from the plot, there are no data points located outside of the Cook’s distance dash line located at the upper right and bottom right areas and which indicate no influential points. However, I would try to use mathematics method to find high influential values.
model.diag.metrics <- augment(res.mlr) %>% dplyr::select(.fitted, .resid, .hat, .cooksd, .std.resid)
model.diag.metrics %>% filter(.std.resid <= -3 |
.std.resid >= 3)
$$
4/(n - p - 1) = 4/(2938 - 15 -1) = 0.001368925
$$ * From following computation, there are 212 data points from the dataset (7.215%) are associated with large residuals and removal of them may help improve a model.
influential.obs <- model.diag.metrics %>%
mutate(row.id = 1:nrow(model.diag.metrics)) %>%
relocate(row.id, .before = .fitted) %>%
dplyr::select(row.id, .cooksd) %>%
filter(.cooksd >= 0.001368925)
nrow(influential.obs)
## [1] 212
212/2938 * 100
## [1] 7.215793
However, I will only remove 147 data points who has the largest Cooks distance (.cooksd) so that I will only remove a maximum of 5% of data from the dataset instead of 7.215% to avoid the chance of influencing the model output statistically.
5/100*2938
## [1] 146.9
Creating an influential.obs table with 147 of the targeted data points.
influential.obs2 <- model.diag.metrics %>%
mutate(row.id = 1:nrow(model.diag.metrics)) %>%
relocate(row.id, .before = .fitted) %>%
dplyr::select(row.id, .cooksd) %>%
filter(.cooksd >= 0.001368925) %>%
arrange(-.cooksd) %>%
slice_head(n = 147)
Anti-join the life.data dataset from the influential.obs table and create a new table that has 2791 rows of data (147 large residual influential points have been removed).
life.data2 <- life.data %>%
mutate(row.id = 1:nrow(life.data)) %>%
relocate(row.id, .before = Country) %>%
anti_join(influential.obs2, by = "row.id") %>%
dplyr::select(-row.id)
Making the second multiple linear regression model:
res.mlr2 <- lm(Life.expectancy ~ .- Country - Year - under.five.deaths - thinness.btw.10.19years - percentage.expenditure - Income.composition.of.resources, data = life.data2)
par(mfrow = c(2, 3))
plot(res.mlr2, which = 1:5)
Now the linear relationship, normality, and homogeneity of variance have
improved and met the assumptions of linear regression. The residual data
points on “Scale-Location” plot has become more randomly and equally
spread and the line has become horizontal. There are no high influential
points outside of the cook’s distance on the “Residuals vs Leverage”
plot.
The multiple linear regression show that:
According to Adjusted R-squared, The model can
explain 85.5% of variability of life expectancy, which is a very good
level.
The F-statistic is 1032 and an overall P-value of less than 0.001, which indicates that at least 1 variable is statistically related to the life expectancy.
Variables that has a P-value (“Pr(>|t|)”) less than 0.05 (as indicated with “*“) are statistically affecting life.expectancy.
summary(res.mlr2)
##
## Call:
## lm(formula = Life.expectancy ~ . - Country - Year - under.five.deaths -
## thinness.btw.10.19years - percentage.expenditure - Income.composition.of.resources,
## data = life.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6806 -2.0555 -0.0036 2.3415 10.7489
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 54.919369270341 0.583007125306 94.200
## StatusDeveloping -1.121308238815 0.233360456687 -4.805
## Adult.Mortality -0.022175148698 0.000755442023 -29.354
## infant.deaths -0.002750544488 0.001175362634 -2.340
## Alcohol -0.006452559773 0.022907580577 -0.282
## Hepatitis.B 0.001995202829 0.003758080783 0.531
## Measles -0.000011317405 0.000010211361 -1.108
## BMI 0.026811680928 0.004365056982 6.142
## Polio 0.022277499307 0.004128104339 5.397
## Total.expenditure 0.080790531836 0.029960675032 2.697
## Diphtheria 0.029684344213 0.004545072390 6.531
## HIV.AIDS -0.513859724501 0.018484110975 -27.800
## GDP 0.000044436767 0.000005653359 7.860
## Population 0.000000003338 0.000000002411 1.385
## thinness.btw.5.9years -0.037148287881 0.020796369536 -1.786
## Schooling 1.153671879687 0.033897730511 34.034
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.00000162945604687 ***
## Adult.Mortality < 0.0000000000000002 ***
## infant.deaths 0.01935 *
## Alcohol 0.77821
## Hepatitis.B 0.59552
## Measles 0.26782
## BMI 0.00000000092950331 ***
## Polio 0.00000007367039532 ***
## Total.expenditure 0.00705 **
## Diphtheria 0.00000000007736926 ***
## HIV.AIDS < 0.0000000000000002 ***
## GDP 0.00000000000000544 ***
## Population 0.16626
## thinness.btw.5.9years 0.07416 .
## Schooling < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.429 on 2775 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8538
## F-statistic: 1087 on 15 and 2775 DF, p-value: < 0.00000000000000022
output.table <- summary(res.mlr2)$coefficients %>%
data.frame() %>%
dplyr::select(Estimate, Pr...t..) %>%
rename("P.value" = Pr...t..) %>%
filter(P.value < 0.05) %>%
arrange(P.value) %>%
mutate(P.value = round(P.value, 6))
output.table[-1, ] %>%
filter(Estimate > 0) %>%
arrange(-Estimate) %>%
kbl() %>%
kable_styling(bootstrap_options = c("hover", "bordered"), full_width = F)
| Estimate | P.value | |
|---|---|---|
| Schooling | 1.1536719 | 0.000000 |
| Total.expenditure | 0.0807905 | 0.007048 |
| Diphtheria | 0.0296843 | 0.000000 |
| BMI | 0.0268117 | 0.000000 |
| Polio | 0.0222775 | 0.000000 |
| GDP | 0.0000444 | 0.000000 |
output.table[-1, ] %>%
filter(Estimate < 0) %>%
arrange(Estimate) %>%
kbl() %>%
kable_styling(bootstrap_options = c("hover", "bordered"), full_width = F)
| Estimate | P.value | |
|---|---|---|
| StatusDeveloping | -1.1213082 | 0.000002 |
| HIV.AIDS | -0.5138597 | 0.000000 |
| Adult.Mortality | -0.0221751 | 0.000000 |
| infant.deaths | -0.0027505 | 0.019346 |
Question: Should a country having a lower life expectancy value(<65) increase its healthcare expenditure in order to improve its average lifespan?
Yes for two reasons.
Reason 1: Based on the multiple linear regression in previous section, it indicated that total expenditure by government on healthcare is statistically related to life expectancy in a positive way, at a significant P-value of lower than 0.01. It indicates that 1% increase in the healthcare expenditure by government, the life expectancy will increase by 0.0807905 (29.5 days) on average, while keep all other predicting variables constant.
0.0807905*365
## [1] 29.48853
Reason 2: Governments in countries with higher life expectancy value (>65) spent more on healthcare statistically than countries with lower life expectancy (<65). (Welch‘s Two Sample t-test: t = -8.6583, df = 1821.2, p-value < 0.001). In short, government of longer lifespan countries spend more on healthcare.
Visualisation:
# data frame
df2 <- life.data %>%
dplyr::select(Life.expectancy, Total.expenditure) %>%
mutate(my.group = ifelse(Life.expectancy >= 65, "LifeExp>65", "LifeExp<65")) %>%
group_by(my.group) %>%
mutate(count = n(),
label = paste0(my.group, "\n(n=", count, ")"))
# plot
ggplot(df2, aes(x = label, y = Total.expenditure, colour = my.group)) +
geom_jitter(alpha = 0.5) +
geom_boxplot(outlier.shape = NA, alpha = 0.1, colour = "black") +
stat_summary(fun = "mean", geom = "point", shape = 4, size = 5, color = "black") +
theme_classic()
Both of the distributions look normal (normal distribution) based on the distance between mean and median in the box plots as well as the overall shape of their boxplot. Technically, the boxes of both of the plot are symmetrical with mean and median in the center.
Performing Welch’s T test and the P-value of less than 0.05 concludes the difference between the both groups is statistically significant. Welch’s T test is selected because the sample size different is quite large, and this test do not have the assumption of variance homogeneity between groups and therefore different sample size is allowed.
t.test(df2$Total.expenditure~df2$my.group)
##
## Welch Two Sample t-test
##
## data: df2$Total.expenditure by df2$my.group
## t = -8.6583, df = 1821.2, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group LifeExp<65 and group LifeExp>65 is not equal to 0
## 95 percent confidence interval:
## -0.9709779 -0.6123292
## sample estimates:
## mean in group LifeExp<65 mean in group LifeExp>65
## 5.391601 6.183254
Question: How does Infant and Adult mortality rates affect life expectancy?
Both infant and adult mortality have statistical negative impact on life expectancy. Based on the output of multiple linear regression in previous section that takes all other variables into consideration:
Any increment in one unit of adult mortality, life expectancy will be reduced by 0.0222 (P-value < 0.001)
Any increment in one unit of infant deaths, life expectancy will be reduced by 0.0028 (P-value < 0.01)
df3 <- life.data %>%
dplyr::select(Status, Life.expectancy, Adult.Mortality, infant.deaths) %>%
pivot_longer(c(3:4), names_to = "myvar", values_to = "myval") %>%
mutate(myvar = as.factor(myvar))
ggplot(df3, aes(y = Life.expectancy, x = myval, colour = Status)) +
geom_point(alpha = 0.5) +
facet_wrap(~myvar, scales = "free_x") +
theme_classic() +
theme(legend.position = "none") +
labs(x = "", y = "Life Expectancy")
Question: Does Life Expectancy has positive or negative correlation with schooling, BMI, drinking alcohol etc.
df4 <- life.data %>% dplyr::select(Status, Life.expectancy, Alcohol, BMI, Schooling)
Correlation is a measure of association between variables without making any assumption whether life expectancy is dependent on alcohol, BMI, and schooling.
Result shows that:
cor.data <- cor(df4[2:5])
corrplot(cor.data, method = "number", type = "lower")
A PCA and various scatter plots are graphed to act as visual aid.
Main points:
PCA supports the correlation that “life expectancy” is more
related to “BMI” and “Schooling”, and less related to “Alcohol”
consumption”, however all of them are positively correlated with life
expectancy. If a variable is negatively correlated to life expectancy,
it will be located at the opposite side of the origin. The PCA also
explains excellently well the variation in the dataset at a combination
of both dimension at 82.6%.
PCA also help revealing that developed countries have higher degree in alcohol consumption, schooling, BMI, and life expectancy than many developing countries.
Three scatter plots at the bottom with an overall correlation line plotted (Not Based On Country Status) also supports the findings of correlations in previous correlation plot.
res.pca <- PCA(df4, quali.sup = 1, graph = F)
f1 <- fviz_pca_biplot(res.pca, palette = "Set2", repel = T, habillage = "Status",
addEllipses = T, geom.ind = "point",
pointsize = 1.5, pointshape = 21, col.var = "blue")
f2 <- fviz_screeplot(res.pca, addlabels = T, ylim = c(0, 70))
f3 <- fviz_contrib(res.pca, choice = "var")
f4 <- ggplot(df4, aes(y = Life.expectancy, x = Alcohol)) + geom_point(aes(color = Status), alpha = 0.5) +
geom_smooth(se = F, method = "lm") + theme_classic() + labs(title = "Alcohol")
f5 <- ggplot(df4, aes(y = Life.expectancy, x = BMI)) + geom_point(aes(color = Status), alpha = 0.5) +
geom_smooth(se = F, method = "lm") + theme_classic() + labs(title = "BMI")
f6 <- ggplot(df4, aes(y = Life.expectancy, x = Schooling)) + geom_point(aes(color = Status), alpha = 0.5) +
geom_smooth(se = F, method = "lm") + theme_classic() + labs(title = "Schooling")
right <- plot_grid(f2, f3, ncol = 1)
top <- plot_grid(f1, right, rel_widths = c(2.5, 1))
bottom <- plot_grid(f6, f5, f4, nrow = 1)
plot_grid(top, bottom, ncol = 1)
If attempt to describe the dependence of life expectancy on alcohol, BMI and schooling, a multiple linear regression model can be created. Output shows that the BMI and Schooling are statistically related to life expectancy, however the model has higher residual standard error rate and explaining less variation in the dataset as indicated by lower Adjusted R-squared of 60%.
model1 <- lm(Life.expectancy ~., data = df4[, 2:5])
summary(model1)
##
## Call:
## lm(formula = Life.expectancy ~ ., data = df4[, 2:5])
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.2529 -2.9112 0.3987 3.6102 30.8555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.348082 0.430338 100.730 <0.0000000000000002 ***
## Alcohol -0.041983 0.033396 -1.257 0.209
## BMI 0.104586 0.006627 15.781 <0.0000000000000002 ***
## Schooling 1.839998 0.045305 40.613 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.991 on 2934 degrees of freedom
## Multiple R-squared: 0.6038, Adjusted R-squared: 0.6033
## F-statistic: 1490 on 3 and 2934 DF, p-value: < 0.00000000000000022
Compared to the output of following multiple linear regression model that built before, which has a lower level of residual standard error and Adjusted R-squared. These indicates that it is a better model.
summary(res.mlr2)
##
## Call:
## lm(formula = Life.expectancy ~ . - Country - Year - under.five.deaths -
## thinness.btw.10.19years - percentage.expenditure - Income.composition.of.resources,
## data = life.data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.6806 -2.0555 -0.0036 2.3415 10.7489
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 54.919369270341 0.583007125306 94.200
## StatusDeveloping -1.121308238815 0.233360456687 -4.805
## Adult.Mortality -0.022175148698 0.000755442023 -29.354
## infant.deaths -0.002750544488 0.001175362634 -2.340
## Alcohol -0.006452559773 0.022907580577 -0.282
## Hepatitis.B 0.001995202829 0.003758080783 0.531
## Measles -0.000011317405 0.000010211361 -1.108
## BMI 0.026811680928 0.004365056982 6.142
## Polio 0.022277499307 0.004128104339 5.397
## Total.expenditure 0.080790531836 0.029960675032 2.697
## Diphtheria 0.029684344213 0.004545072390 6.531
## HIV.AIDS -0.513859724501 0.018484110975 -27.800
## GDP 0.000044436767 0.000005653359 7.860
## Population 0.000000003338 0.000000002411 1.385
## thinness.btw.5.9years -0.037148287881 0.020796369536 -1.786
## Schooling 1.153671879687 0.033897730511 34.034
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## StatusDeveloping 0.00000162945604687 ***
## Adult.Mortality < 0.0000000000000002 ***
## infant.deaths 0.01935 *
## Alcohol 0.77821
## Hepatitis.B 0.59552
## Measles 0.26782
## BMI 0.00000000092950331 ***
## Polio 0.00000007367039532 ***
## Total.expenditure 0.00705 **
## Diphtheria 0.00000000007736926 ***
## HIV.AIDS < 0.0000000000000002 ***
## GDP 0.00000000000000544 ***
## Population 0.16626
## thinness.btw.5.9years 0.07416 .
## Schooling < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.429 on 2775 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8538
## F-statistic: 1087 on 15 and 2775 DF, p-value: < 0.00000000000000022
Insights:
Alcohol is insignificantly related to life expectancy at a P-value of 0.778, whereas BMI and schooling are significantly related to life expectancy at a P-value of lower than 0.01. Both BMI and schooling relates to life expectancy positively. One unit increase in BMI, the life expectancy is expected to increase 0.027 year on average, whereas one unit increase in schooling, life expectancy is expected to increase 1.15 year on average (Adjusted R^2: 0.8538, F(15, 2775) = 1087, P < 0.01).
Question: Do densely populated countries tend to have lower life expectancy?
Both parametric and non-parametric correlation methods that using “pearson” and “spearman” method in computing the correlation. Both methods suggest the correlation between population and life expectancy is negative but the correlation is extremely small.
Setting the dataset:
df5 <- life.data %>% dplyr::select(Status, Life.expectancy, Population) %>%
mutate(Population.log = log(Population),
Life.expectancy.log = log(Life.expectancy))
Pearson correlation:
cor(y = df5$Life.expectancy, x = df5$Population, method = "pearson")
## [1] -0.0299053
More conservative non-parametric method: Spearman correlation:
cor(y = df5$Life.expectancy, x = df5$Population, method = "spearman")
## [1] -0.05657282
A scatter plot has the same result, the linear relationship between population and life expectancy is quite difficult to see, and the situation is the same for both developing and developed countries. The R-squared (explained variation) of two regression equations is so more that even achieve less than 1% of the variation in life expectancy.
ggscatter(df5, x = "Population.log", y = "Life.expectancy", color = "Status", add = "reg.line") +
stat_regline_equation(aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~"), color = Status)) +
labs(x = "log(Population)",
title = "Life Expectancy versus Log-Population, Grouped by Country Status") +
theme(plot.title = element_text(face = "bold", size = 15, hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'
Life expectancy is generally dictated by many variables, as specified in the multiple linear regression model I have built previously. Many factors are related to Life expectancy. From the plot, we can see that lower life expectancy is generally associated with developing countries, or life expectancy is generally associated with other attributes of developing countries.
Getting the data frame:
df6 <- df5 %>% dplyr::select(Status, Life.expectancy) %>%
group_by(Status) %>%
mutate(count = n(),
Country.Status = paste0(Status, " (n=", count, ")"))
The formal normality test is called Shapiro-Wilk test with a null hypothesis that the distribution is normal. The shapiro-wilk test rejects the null hypothesis of group and saying their data are not normally distribution.
by(data = df6$Life.expectancy, INDICES = df6$Country.Status, shapiro.test)
## df6$Country.Status: Developed (n=512)
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9844, p-value = 0.00002671
##
## ------------------------------------------------------------
## df6$Country.Status: Developing (n=2426)
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.94965, p-value < 0.00000000000000022
However, I will reject the result of Shapiro-Wilk test because the sample size are too large for this test to handle. In general,
if the sample size is too small, Shapiro-Wilk test will tend to conclude that the data is normally distributed (become harder to reject null hypothesis), and
if the sample size is too large, Shapiro-Wilk test will tend to conclude that a set of data is not normally distributed (become easier to reject null hypothesis).
Therefore, visual aids for normality assessments are required:
g1 <- ggplot(df6, aes(x = Life.expectancy, color = Status)) +
geom_histogram(alpha = 0) +
theme_bw() +
facet_wrap(~Country.Status, scales = "free_x") +
theme(legend.position = "none")
g2 <- ggplot(df6, aes(x = Life.expectancy, y = "", color = Status)) +
geom_boxplot() +
theme_bw() +
facet_wrap(~Country.Status, scales = "free_x") +
theme(legend.position = "none") +
stat_summary(fun = "mean", geom = "point", shape = 4, size = 5, color = "black")
plot_grid(g1, g2, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
To meet the requirement of normality assumption of t-test, the data doesn’t have to be perfectly normal but it has to be at least roughly fits a bell curve shape.
A non-parametric, median-based comparison test is run and concludes that the difference between life expectancy in developed and developing countries is statistically significance (Wilcoxon rank sum test: W = 1136054, p-value < 0.001) .
wilcox.test(df6$Life.expectancy~df6$Country.Status, paired = F)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df6$Life.expectancy by df6$Country.Status
## W = 1136054, p-value < 0.00000000000000022
## alternative hypothesis: true location shift is not equal to 0
Immunization coverage has positive correlation with life expectancy, as indicated by the immunisation of Hepatitis B, Polio, and Diphtheria at following correlation values. The correlations are moderate.
# set up data frame
df7 <- life.data %>%
dplyr::select(Life.expectancy, Hepatitis.B, Polio, Diphtheria)
corrplot(cor(df7), method = "number", type = "upper")
Visually:
df7.2 <- df7 %>%
pivot_longer(c(2:4), names_to = "myvar", values_to = "myvalues") %>%
mutate(myvar = as.factor(myvar))
ggplot(df7.2, aes(y = Life.expectancy, x = myvalues)) +
geom_point(alpha = 0.5) +
facet_wrap(~myvar) + geom_smooth(se = F, method = "lm") +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
Statistics from the multiple linear regression built previously suggests that:
Diphtheria has a statistical positive relationship with life expectancy, at a P-value of less than 0.05. An increase in one % of Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds, life expectancy will increase by 0.03 year on average.
Polio has also a statistical positive relationship with life expectancy, at a P-value of less than 0.05. An increase in one % of Polio immunisation among 1-year-olds, life expectancy will increase by 0.02 year on average.
Hepatitis B immunisation do not have a statistical relationship with life expectancy, which means that having an increment in Hepatitis B immunisation can lead to zero increment in life expectancy.
Question: Please test the interaction of Year on the data among both country status.
Multivariate analysis of variance (MANOVA) is used to test is there statistical difference in data between the categories within year and status.
There are 16 levels of the factor variable “Year”, indicates that the data in the dataset are recorded in a yearly basis.
levels(life.data$Year)
## [1] "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009"
## [11] "2010" "2011" "2012" "2013" "2014" "2015"
There are 2 levels in the factor variable “Status”, which are:
levels(life.data$Status)
## [1] "Developed" "Developing"
MANOVA
Performing MANOVA.
manova.output <- manova(cbind(Life.expectancy, Adult.Mortality, infant.deaths, Alcohol, percentage.expenditure, Hepatitis.B, Measles, BMI, under.five.deaths, Polio, Total.expenditure, Diphtheria, HIV.AIDS, GDP, Population, thinness.btw.10.19years, thinness.btw.5.9years, Income.composition.of.resources, Schooling) ~ Year + Status, data = life.data)
summary(manova.output)
## Df Pillai approx F num Df den Df Pr(>F)
## Year 15 0.26954 2.809 285 43755 < 0.00000000000000022 ***
## Status 1 0.49042 147.047 19 2903 < 0.00000000000000022 ***
## Residuals 2921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is sufficient evidence to reject the null hypothesis that there is no difference in mean between years and development status on the numerical findings in the dataset and conclude that:
Following are 4 different tests of MANOVA being performed, which are “Pillai’s trace”, “Wilks”, “Hotelling-Lawley trace”, and “Roy’s largest root”. All 4 different statistics that devised because of multivariate nature of MANOVA suggests the same outcome, and especially the R default test “Pillai’s trace” which is commonly said the most robust method (and especially if assumptions of linearity are not match in the real life data).
summary(manova.output, test = "Pillai", intercept = T)
## Df Pillai approx F num Df den Df Pr(>F)
## (Intercept) 1 0.99599 37912 19 2903 < 0.00000000000000022 ***
## Year 15 0.26954 3 285 43755 < 0.00000000000000022 ***
## Status 1 0.49042 147 19 2903 < 0.00000000000000022 ***
## Residuals 2921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova.output, test = "Hotelling-Lawley", intercept = T)
## Df Hotelling-Lawley approx F num Df den Df Pr(>F)
## (Intercept) 1 248.135 37912 19 2903 < 0.00000000000000022
## Year 15 0.302 3 285 43517 < 0.00000000000000022
## Status 1 0.962 147 19 2903 < 0.00000000000000022
## Residuals 2921
##
## (Intercept) ***
## Year ***
## Status ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova.output, test = "Wilks", intercept = T)
## Df Wilks approx F num Df den Df Pr(>F)
## (Intercept) 1 0.00401 37912 19 2903 < 0.00000000000000022 ***
## Year 15 0.75209 3 285 34365 < 0.00000000000000022 ***
## Status 1 0.50958 147 19 2903 < 0.00000000000000022 ***
## Residuals 2921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova.output, test = "Roy", intercept = T)
## Df Roy approx F num Df den Df Pr(>F)
## (Intercept) 1 248.135 37912 19 2903 < 0.00000000000000022 ***
## Year 15 0.184 28 19 2917 < 0.00000000000000022 ***
## Status 1 0.962 147 19 2903 < 0.00000000000000022 ***
## Residuals 2921
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(manova.output)
## Response Life.expectancy :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 7825 522 7.7624 < 0.00000000000000022 ***
## Status 1 61638 61638 917.1556 < 0.00000000000000022 ***
## Residuals 2921 196308 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Adult.Mortality :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 387725 25848 1.8702 0.02168 *
## Status 1 4483723 4483723 324.4158 < 0.0000000000000002 ***
## Residuals 2921 40370887 13821
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response infant.deaths :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 57616 3841 0.2786 0.9971
## Status 1 515441 515441 37.3870 0.000000001098 ***
## Residuals 2921 40270803 13787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Alcohol :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 597.1 39.8 3.9377 0.0000004371 ***
## Status 1 16963.8 16963.8 1678.1738 < 0.00000000000000022 ***
## Residuals 2921 29527.0 10.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response percentage.expenditure :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 223230310 14882021 4.8369 0.000000002108 ***
## Status 1 2395929496 2395929496 778.7117 < 0.00000000000000022 ***
## Residuals 2921 8987292897 3076786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Hepatitis.B :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 78942 5263 8.7765 < 0.00000000000000022 ***
## Status 1 44629 44629 74.4254 < 0.00000000000000022 ***
## Residuals 2921 1751573 600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Measles :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 3574437030 238295802 1.8301 0.02576 *
## Status 1 2295103259 2295103259 17.6263 0.00002769 ***
## Residuals 2921 380341079422 130209202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response BMI :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 15722 1048 2.9273 0.0001234 ***
## Status 1 116309 116309 324.8463 < 0.00000000000000022 ***
## Residuals 2921 1045848 358
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response under.five.deaths :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 140068 9338 0.3663 0.987
## Status 1 1004966 1004966 39.4231 0.0000000003919 ***
## Residuals 2921 74461493 25492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Polio :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 23466 1564 3.0398 0.00006754 ***
## Status 1 79028 79028 153.5582 < 0.00000000000000022 ***
## Residuals 2921 1503284 515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Total.expenditure :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 248.3 16.55 3.1422 0.00003875 ***
## Status 1 1582.0 1581.99 300.2744 < 0.00000000000000022 ***
## Residuals 2921 15389.3 5.27
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Diphtheria :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 39730 2649 5.0733 0.0000000005013 ***
## Status 1 77968 77968 149.3407 < 0.00000000000000022 ***
## Residuals 2921 1524993 522
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response HIV.AIDS :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 1555 103.67 4.1772 0.0000001080110241631 ***
## Status 1 1679 1678.99 67.6521 0.0000000000000002901 ***
## Residuals 2921 72493 24.82
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response GDP :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 8145660021 543044001 3.8497 0.0000007272 ***
## Status 1 103743590059 103743590059 735.4503 < 0.00000000000000022 ***
## Residuals 2921 412040103345 141061316
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Population :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 27489282251233536 1832618816748902 0.6313 0.85139
## Status 1 15058079446803406 15058079446803406 5.1872 0.02283 *
## Residuals 2921 8479414988835587072 2902915093747206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response thinness.btw.10.19years :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 165 11.0 0.6557 0.8297
## Status 1 7845 7845.2 466.5575 <0.0000000000000002 ***
## Residuals 2921 49117 16.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response thinness.btw.5.9years :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 195 13.0 0.7404 0.7448
## Status 1 8101 8101.5 462.3282 <0.0000000000000002 ***
## Residuals 2921 51185 17.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Income.composition.of.resources :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 7.698 0.5132 16.619 < 0.00000000000000022 ***
## Status 1 30.195 30.1949 977.792 < 0.00000000000000022 ***
## Residuals 2921 90.202 0.0309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Schooling :
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 15 1445.4 96.4 12.655 < 0.00000000000000022 ***
## Status 1 8812.5 8812.5 1157.395 < 0.00000000000000022 ***
## Residuals 2921 22240.6 7.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.life <- lm(cbind(Life.expectancy, Adult.Mortality, infant.deaths, Alcohol, percentage.expenditure, Hepatitis.B, Measles, BMI, under.five.deaths, Polio, Total.expenditure, Diphtheria, HIV.AIDS, GDP, Population, thinness.btw.10.19years, thinness.btw.5.9years, Income.composition.of.resources, Schooling) ~ Year + Status + Year*Status -1, data = life.data)
summary(lm.life)
## Response Life.expectancy :
##
## Call:
## lm(formula = Life.expectancy ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.609 -5.507 1.288 6.389 22.139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 76.8031250 1.4524546 52.878 < 0.0000000000000002
## Year2001 77.1281250 1.4524546 53.102 < 0.0000000000000002
## Year2002 77.5468750 1.4524546 53.390 < 0.0000000000000002
## Year2003 77.9406250 1.4524546 53.661 < 0.0000000000000002
## Year2004 78.3843750 1.4524546 53.967 < 0.0000000000000002
## Year2005 78.5906250 1.4524546 54.109 < 0.0000000000000002
## Year2006 79.1312500 1.4524546 54.481 < 0.0000000000000002
## Year2007 79.3000000 1.4524546 54.597 < 0.0000000000000002
## Year2008 78.9312500 1.4524546 54.343 < 0.0000000000000002
## Year2009 79.5843750 1.4524546 54.793 < 0.0000000000000002
## Year2010 80.1468750 1.4524546 55.180 < 0.0000000000000002
## Year2011 80.7062500 1.4524546 55.565 < 0.0000000000000002
## Year2012 80.4437500 1.4524546 55.385 < 0.0000000000000002
## Year2013 80.6812500 1.4524546 55.548 < 0.0000000000000002
## Year2014 81.1375000 1.4524546 55.862 < 0.0000000000000002
## Year2015 80.7093750 1.4524546 55.568 < 0.0000000000000002
## StatusDeveloping -12.1832575 1.5989675 -7.619 0.0000000000000342
## Year2001:StatusDeveloping 0.0650662 2.2612816 0.029 0.977
## Year2002:StatusDeveloping -0.1728891 2.2612816 -0.076 0.939
## Year2003:StatusDeveloping -0.5507450 2.2612816 -0.244 0.808
## Year2004:StatusDeveloping -0.8302566 2.2612816 -0.367 0.714
## Year2005:StatusDeveloping -0.3980960 2.2612816 -0.176 0.860
## Year2006:StatusDeveloping -0.4976614 2.2612816 -0.220 0.826
## Year2007:StatusDeveloping -0.2558154 2.2612816 -0.113 0.910
## Year2008:StatusDeveloping 0.6659147 2.2612816 0.294 0.768
## Year2009:StatusDeveloping 0.4929222 2.2612816 0.218 0.827
## Year2010:StatusDeveloping -0.0550083 2.2612816 -0.024 0.981
## Year2011:StatusDeveloping 0.0008485 2.2612816 0.000 1.000
## Year2012:StatusDeveloping 0.6375207 2.2612816 0.282 0.778
## Year2013:StatusDeveloping 0.9140569 2.2551332 0.405 0.685
## Year2014:StatusDeveloping 0.5477442 2.2612816 0.242 0.809
## Year2015:StatusDeveloping 1.1639487 2.2612816 0.515 0.607
##
## Year2000 ***
## Year2001 ***
## Year2002 ***
## Year2003 ***
## Year2004 ***
## Year2005 ***
## Year2006 ***
## Year2007 ***
## Year2008 ***
## Year2009 ***
## Year2010 ***
## Year2011 ***
## Year2012 ***
## Year2013 ***
## Year2014 ***
## Year2015 ***
## StatusDeveloping ***
## Year2001:StatusDeveloping
## Year2002:StatusDeveloping
## Year2003:StatusDeveloping
## Year2004:StatusDeveloping
## Year2005:StatusDeveloping
## Year2006:StatusDeveloping
## Year2007:StatusDeveloping
## Year2008:StatusDeveloping
## Year2009:StatusDeveloping
## Year2010:StatusDeveloping
## Year2011:StatusDeveloping
## Year2012:StatusDeveloping
## Year2013:StatusDeveloping
## Year2014:StatusDeveloping
## Year2015:StatusDeveloping
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.216 on 2906 degrees of freedom
## Multiple R-squared: 0.9863, Adjusted R-squared: 0.9862
## F-statistic: 6551 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Adult.Mortality :
##
## Call:
## lm(formula = Adult.Mortality ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -197.50 -70.33 -13.44 59.56 532.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 91.719 20.827 4.404 0.00001102 ***
## Year2001 90.281 20.827 4.335 0.00001509 ***
## Year2002 86.469 20.827 4.152 0.00003395 ***
## Year2003 81.937 20.827 3.934 0.00008544 ***
## Year2004 84.469 20.827 4.056 0.00005130 ***
## Year2005 89.594 20.827 4.302 0.00001750 ***
## Year2006 93.531 20.827 4.491 0.00000737 ***
## Year2007 75.906 20.827 3.645 0.000273 ***
## Year2008 77.062 20.827 3.700 0.000220 ***
## Year2009 76.750 20.827 3.685 0.000233 ***
## Year2010 74.531 20.827 3.579 0.000351 ***
## Year2011 65.687 20.827 3.154 0.001627 **
## Year2012 71.094 20.827 3.414 0.000650 ***
## Year2013 66.656 20.827 3.200 0.001387 **
## Year2014 74.406 20.827 3.573 0.000359 ***
## Year2015 74.875 20.827 3.595 0.000330 ***
## StatusDeveloping 108.778 22.928 4.744 0.00000219 ***
## Year2001:StatusDeveloping -5.900 32.425 -0.182 0.855623
## Year2002:StatusDeveloping -5.803 32.425 -0.179 0.857978
## Year2003:StatusDeveloping -3.106 32.425 -0.096 0.923690
## Year2004:StatusDeveloping 14.588 32.425 0.450 0.652824
## Year2005:StatusDeveloping -14.200 32.425 -0.438 0.661480
## Year2006:StatusDeveloping -15.614 32.425 -0.482 0.630174
## Year2007:StatusDeveloping -7.095 32.425 -0.219 0.826818
## Year2008:StatusDeveloping 9.332 32.425 0.288 0.773526
## Year2009:StatusDeveloping -3.773 32.425 -0.116 0.907376
## Year2010:StatusDeveloping -2.938 32.425 -0.091 0.927802
## Year2011:StatusDeveloping 1.323 32.425 0.041 0.967466
## Year2012:StatusDeveloping -14.613 32.425 -0.451 0.652254
## Year2013:StatusDeveloping -11.811 32.337 -0.365 0.714964
## Year2014:StatusDeveloping -18.754 32.425 -0.578 0.563060
## Year2015:StatusDeveloping -14.262 32.425 -0.440 0.660079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117.8 on 2906 degrees of freedom
## Multiple R-squared: 0.677, Adjusted R-squared: 0.6735
## F-statistic: 190.4 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response infant.deaths :
##
## Call:
## lm(formula = infant.deaths ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.12 -33.56 -24.22 -1.25 1755.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 1.7812 20.8071 0.086 0.9318
## Year2001 1.7500 20.8071 0.084 0.9330
## Year2002 1.7187 20.8071 0.083 0.9342
## Year2003 1.6562 20.8071 0.080 0.9366
## Year2004 1.6562 20.8071 0.080 0.9366
## Year2005 1.5937 20.8071 0.077 0.9389
## Year2006 1.5937 20.8071 0.077 0.9389
## Year2007 1.5625 20.8071 0.075 0.9401
## Year2008 1.5000 20.8071 0.072 0.9425
## Year2009 1.4375 20.8071 0.069 0.9449
## Year2010 1.3437 20.8071 0.065 0.9485
## Year2011 1.3437 20.8071 0.065 0.9485
## Year2012 1.2812 20.8071 0.062 0.9509
## Year2013 1.2500 20.8071 0.060 0.9521
## Year2014 1.2500 20.8071 0.060 0.9521
## Year2015 1.1875 20.8071 0.057 0.9545
## StatusDeveloping 43.3380 22.9060 1.892 0.0586 .
## Year2001:StatusDeveloping -0.8297 32.3939 -0.026 0.9796
## Year2002:StatusDeveloping -2.2951 32.3939 -0.071 0.9435
## Year2003:StatusDeveloping -3.0803 32.3939 -0.095 0.9243
## Year2004:StatusDeveloping -4.5704 32.3939 -0.141 0.8878
## Year2005:StatusDeveloping -6.0443 32.3939 -0.187 0.8520
## Year2006:StatusDeveloping -6.7198 32.3939 -0.207 0.8357
## Year2007:StatusDeveloping -8.1322 32.3939 -0.251 0.8018
## Year2008:StatusDeveloping -9.3214 32.3939 -0.288 0.7736
## Year2009:StatusDeveloping -9.9874 32.3939 -0.308 0.7579
## Year2010:StatusDeveloping -11.1188 32.3939 -0.343 0.7314
## Year2011:StatusDeveloping -12.4499 32.3939 -0.384 0.7008
## Year2012:StatusDeveloping -12.9702 32.3939 -0.400 0.6889
## Year2013:StatusDeveloping -16.1035 32.3059 -0.498 0.6182
## Year2014:StatusDeveloping -15.0913 32.3939 -0.466 0.6413
## Year2015:StatusDeveloping -15.9294 32.3939 -0.492 0.6229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 117.7 on 2906 degrees of freedom
## Multiple R-squared: 0.07538, Adjusted R-squared: 0.0652
## F-statistic: 7.404 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Alcohol :
##
## Call:
## lm(formula = Alcohol ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7388 -2.5479 -0.5401 1.9826 13.9988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 9.930625 0.562884 17.642 <0.0000000000000002 ***
## Year2001 9.861250 0.562884 17.519 <0.0000000000000002 ***
## Year2002 10.018125 0.562884 17.798 <0.0000000000000002 ***
## Year2003 10.126563 0.562884 17.990 <0.0000000000000002 ***
## Year2004 10.188750 0.562884 18.101 <0.0000000000000002 ***
## Year2005 10.108125 0.562884 17.958 <0.0000000000000002 ***
## Year2006 10.230938 0.562884 18.176 <0.0000000000000002 ***
## Year2007 10.307813 0.562884 18.312 <0.0000000000000002 ***
## Year2008 10.263125 0.562884 18.233 <0.0000000000000002 ***
## Year2009 9.906875 0.562884 17.600 <0.0000000000000002 ***
## Year2010 9.814063 0.562884 17.435 <0.0000000000000002 ***
## Year2011 9.780000 0.562884 17.375 <0.0000000000000002 ***
## Year2012 9.811875 0.562884 17.431 <0.0000000000000002 ***
## Year2013 9.699688 0.562884 17.232 <0.0000000000000002 ***
## Year2014 7.734063 0.562884 13.740 <0.0000000000000002 ***
## Year2015 9.597581 0.562884 17.051 <0.0000000000000002 ***
## StatusDeveloping -6.513822 0.619664 -10.512 <0.0000000000000002 ***
## Year2001:StatusDeveloping 0.065931 0.876337 0.075 0.940
## Year2002:StatusDeveloping 0.002699 0.876337 0.003 0.998
## Year2003:StatusDeveloping -0.094749 0.876337 -0.108 0.914
## Year2004:StatusDeveloping -0.092297 0.876337 -0.105 0.916
## Year2005:StatusDeveloping 0.123809 0.876337 0.141 0.888
## Year2006:StatusDeveloping 0.054257 0.876337 0.062 0.951
## Year2007:StatusDeveloping 0.077250 0.876337 0.088 0.930
## Year2008:StatusDeveloping 0.122997 0.876337 0.140 0.888
## Year2009:StatusDeveloping 0.412690 0.876337 0.471 0.638
## Year2010:StatusDeveloping 0.590735 0.876337 0.674 0.500
## Year2011:StatusDeveloping 0.547546 0.876337 0.625 0.532
## Year2012:StatusDeveloping -0.377409 0.876337 -0.431 0.667
## Year2013:StatusDeveloping -0.391092 0.873955 -0.447 0.655
## Year2014:StatusDeveloping 1.094241 0.876337 1.249 0.212
## Year2015:StatusDeveloping 0.738122 0.876337 0.842 0.400
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.184 on 2906 degrees of freedom
## Multiple R-squared: 0.7306, Adjusted R-squared: 0.7276
## F-statistic: 246.2 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response percentage.expenditure :
##
## Call:
## lm(formula = percentage.expenditure ~ Year + Status + Year *
## Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4283.5 -358.7 -208.7 0.0 16488.2
##
## Coefficients:
## Estimate Std. Error t value
## Year2000 1897.60799714375412 304.03369772823271 6.241
## Year2001 2042.07181938676035 304.03369772823140 6.717
## Year2002 1741.77228602157220 304.03369772822828 5.729
## Year2003 1532.33553219688815 304.03369772822828 5.040
## Year2004 2962.75973825530946 304.03369772822748 9.745
## Year2005 2955.57337835687576 304.03369772822793 9.721
## Year2006 3507.07645071812749 304.03369772822964 11.535
## Year2007 3818.03309434689709 304.03369772823055 12.558
## Year2008 4283.47963250687280 304.03369772822805 14.089
## Year2009 2607.10828200470451 304.03369772823009 8.575
## Year2010 2719.69766076439009 304.03369772822958 8.945
## Year2011 4133.73831318157499 304.03369772822919 13.596
## Year2012 3122.59981279063686 304.03369772822970 10.271
## Year2013 2610.84349423062940 304.03369772823009 8.587
## Year2014 3322.90858170912770 304.03369772822867 10.929
## Year2015 0.00000000002878 304.03369772822765 0.000
## StatusDeveloping -1725.48059965513221 334.70237398506896 -5.155
## Year2001:StatusDeveloping -123.23861545274202 473.34063664815784 -0.260
## Year2002:StatusDeveloping 192.42803550012133 473.34063664814795 0.407
## Year2003:StatusDeveloping 467.95672992973846 473.34063664814806 0.989
## Year2004:StatusDeveloping -929.13870884200333 473.34063664814602 -1.963
## Year2005:StatusDeveloping -858.64333526126609 473.34063664814749 -1.814
## Year2006:StatusDeveloping -1520.40631460243776 473.34063664815130 -3.212
## Year2007:StatusDeveloping -1744.94427805708528 473.34063664815494 -3.686
## Year2008:StatusDeveloping -2137.73055533841261 473.34063664814960 -4.516
## Year2009:StatusDeveloping -561.75381728862237 473.34063664815505 -1.187
## Year2010:StatusDeveloping -639.55355892689181 473.34063664815312 -1.351
## Year2011:StatusDeveloping -2036.56743241351774 473.34063664815051 -4.303
## Year2012:StatusDeveloping -833.03886968168092 473.34063664815443 -1.760
## Year2013:StatusDeveloping -401.37881322950602 472.05363936666987 -0.850
## Year2014:StatusDeveloping -1087.38214101282028 473.34063664815187 -2.297
## Year2015:StatusDeveloping 1728.36970463735997 473.34063664814869 3.651
## Pr(>|t|)
## Year2000 0.0000000004967 ***
## Year2001 0.0000000000223 ***
## Year2002 0.0000000111452 ***
## Year2003 0.0000004940110 ***
## Year2004 < 0.0000000000000002 ***
## Year2005 < 0.0000000000000002 ***
## Year2006 < 0.0000000000000002 ***
## Year2007 < 0.0000000000000002 ***
## Year2008 < 0.0000000000000002 ***
## Year2009 < 0.0000000000000002 ***
## Year2010 < 0.0000000000000002 ***
## Year2011 < 0.0000000000000002 ***
## Year2012 < 0.0000000000000002 ***
## Year2013 < 0.0000000000000002 ***
## Year2014 < 0.0000000000000002 ***
## Year2015 1.000000
## StatusDeveloping 0.0000002702453 ***
## Year2001:StatusDeveloping 0.794605
## Year2002:StatusDeveloping 0.684382
## Year2003:StatusDeveloping 0.322929
## Year2004:StatusDeveloping 0.049749 *
## Year2005:StatusDeveloping 0.069780 .
## Year2006:StatusDeveloping 0.001332 **
## Year2007:StatusDeveloping 0.000232 ***
## Year2008:StatusDeveloping 0.0000065447691 ***
## Year2009:StatusDeveloping 0.235409
## Year2010:StatusDeveloping 0.176753
## Year2011:StatusDeveloping 0.0000174423418 ***
## Year2012:StatusDeveloping 0.078528 .
## Year2013:StatusDeveloping 0.395238
## Year2014:StatusDeveloping 0.021675 *
## Year2015:StatusDeveloping 0.000265 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1720 on 2906 degrees of freedom
## Multiple R-squared: 0.3492, Adjusted R-squared: 0.342
## F-statistic: 48.72 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Hepatitis.B :
##
## Call:
## lm(formula = Hepatitis.B ~ Year + Status + Year * Status - 1,
## data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.332 -6.956 7.668 15.894 32.619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 75.8337 4.3253 17.533 <0.0000000000000002 ***
## Year2001 81.1093 4.3253 18.752 <0.0000000000000002 ***
## Year2002 86.8907 4.3253 20.089 <0.0000000000000002 ***
## Year2003 87.5650 4.3253 20.245 <0.0000000000000002 ***
## Year2004 89.4472 4.3253 20.680 <0.0000000000000002 ***
## Year2005 86.1679 4.3253 19.922 <0.0000000000000002 ***
## Year2006 91.0347 4.3253 21.047 <0.0000000000000002 ***
## Year2007 90.9583 4.3253 21.029 <0.0000000000000002 ***
## Year2008 88.6718 4.3253 20.501 <0.0000000000000002 ***
## Year2009 90.7071 4.3253 20.971 <0.0000000000000002 ***
## Year2010 85.2922 4.3253 19.719 <0.0000000000000002 ***
## Year2011 87.3318 4.3253 20.191 <0.0000000000000002 ***
## Year2012 82.9239 4.3253 19.172 <0.0000000000000002 ***
## Year2013 89.4474 4.3253 20.680 <0.0000000000000002 ***
## Year2014 86.5074 4.3253 20.000 <0.0000000000000002 ***
## Year2015 87.2842 4.3253 20.180 <0.0000000000000002 ***
## StatusDeveloping -8.9294 4.7616 -1.875 0.0609 .
## Year2001:StatusDeveloping -5.7987 6.7339 -0.861 0.3892
## Year2002:StatusDeveloping -9.2786 6.7339 -1.378 0.1683
## Year2003:StatusDeveloping -10.3958 6.7339 -1.544 0.1227
## Year2004:StatusDeveloping -9.1169 6.7339 -1.354 0.1759
## Year2005:StatusDeveloping -1.3308 6.7339 -0.198 0.8434
## Year2006:StatusDeveloping -5.7252 6.7339 -0.850 0.3953
## Year2007:StatusDeveloping -4.2434 6.7339 -0.630 0.5286
## Year2008:StatusDeveloping 1.5640 6.7339 0.232 0.8164
## Year2009:StatusDeveloping 0.3287 6.7339 0.049 0.9611
## Year2010:StatusDeveloping 2.7054 6.7339 0.402 0.6879
## Year2011:StatusDeveloping 4.0985 6.7339 0.609 0.5428
## Year2012:StatusDeveloping 7.8017 6.7339 1.159 0.2467
## Year2013:StatusDeveloping 0.7187 6.7156 0.107 0.9148
## Year2014:StatusDeveloping 4.1518 6.7339 0.617 0.5376
## Year2015:StatusDeveloping 2.9757 6.7339 0.442 0.6586
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.47 on 2906 degrees of freedom
## Multiple R-squared: 0.9124, Adjusted R-squared: 0.9114
## F-statistic: 945.6 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Measles :
##
## Call:
## lm(formula = Measles ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5480 -2447 -1748 -608 206703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 810.7 2020.9 0.401 0.6883
## Year2001 956.8 2020.9 0.473 0.6359
## Year2002 1531.5 2020.9 0.758 0.4486
## Year2003 701.2 2020.9 0.347 0.7287
## Year2004 331.4 2020.9 0.164 0.8697
## Year2005 201.0 2020.9 0.099 0.9208
## Year2006 250.2 2020.9 0.124 0.9015
## Year2007 120.2 2020.9 0.059 0.9526
## Year2008 573.9 2020.9 0.284 0.7764
## Year2009 216.3 2020.9 0.107 0.9148
## Year2010 795.6 2020.9 0.394 0.6939
## Year2011 608.4 2020.9 0.301 0.7634
## Year2012 386.5 2020.9 0.191 0.8483
## Year2013 275.8 2020.9 0.136 0.8915
## Year2014 103.4 2020.9 0.051 0.9592
## Year2015 121.3 2020.9 0.060 0.9521
## StatusDeveloping 4669.6 2224.8 2.099 0.0359 *
## Year2001:StatusDeveloping -205.5 3146.3 -0.065 0.9479
## Year2002:StatusDeveloping -2641.7 3146.3 -0.840 0.4012
## Year2003:StatusDeveloping -1024.6 3146.3 -0.326 0.7447
## Year2004:StatusDeveloping -1671.2 3146.3 -0.531 0.5953
## Year2005:StatusDeveloping -1034.4 3146.3 -0.329 0.7424
## Year2006:StatusDeveloping -2472.3 3146.3 -0.786 0.4321
## Year2007:StatusDeveloping -2927.6 3146.3 -0.930 0.3522
## Year2008:StatusDeveloping -3519.1 3146.3 -1.118 0.2635
## Year2009:StatusDeveloping -3086.5 3146.3 -0.981 0.3267
## Year2010:StatusDeveloping -3356.9 3146.3 -1.067 0.2861
## Year2011:StatusDeveloping -3027.3 3146.3 -0.962 0.3360
## Year2012:StatusDeveloping -3629.9 3146.3 -1.154 0.2487
## Year2013:StatusDeveloping -3255.8 3137.8 -1.038 0.2995
## Year2014:StatusDeveloping -2575.6 3146.3 -0.819 0.4131
## Year2015:StatusDeveloping -2995.2 3146.3 -0.952 0.3412
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11430 on 2906 degrees of freedom
## Multiple R-squared: 0.05855, Adjusted R-squared: 0.04818
## F-statistic: 5.648 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response BMI :
##
## Call:
## lm(formula = BMI ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.694 -15.742 3.838 14.808 46.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 45.5812 3.3478 13.615 <0.0000000000000002 ***
## Year2001 48.9500 3.3478 14.622 <0.0000000000000002 ***
## Year2002 52.3156 3.3478 15.627 <0.0000000000000002 ***
## Year2003 49.4875 3.3478 14.782 <0.0000000000000002 ***
## Year2004 50.0313 3.3478 14.945 <0.0000000000000002 ***
## Year2005 49.7313 3.3478 14.855 <0.0000000000000002 ***
## Year2006 51.9406 3.3478 15.515 <0.0000000000000002 ***
## Year2007 52.4656 3.3478 15.672 <0.0000000000000002 ***
## Year2008 55.5188 3.3478 16.584 <0.0000000000000002 ***
## Year2009 54.3531 3.3478 16.236 <0.0000000000000002 ***
## Year2010 54.8594 3.3478 16.387 <0.0000000000000002 ***
## Year2011 55.3938 3.3478 16.547 <0.0000000000000002 ***
## Year2012 50.8469 3.3478 15.188 <0.0000000000000002 ***
## Year2013 49.6906 3.3478 14.843 <0.0000000000000002 ***
## Year2014 51.9031 3.3478 15.504 <0.0000000000000002 ***
## Year2015 55.7938 3.3478 16.666 <0.0000000000000002 ***
## StatusDeveloping -13.7236 3.6855 -3.724 0.0002 ***
## Year2001:StatusDeveloping -2.0317 5.2120 -0.390 0.6967
## Year2002:StatusDeveloping -4.9423 5.2120 -0.948 0.3431
## Year2003:StatusDeveloping -2.3182 5.2120 -0.445 0.6565
## Year2004:StatusDeveloping -2.8189 5.2120 -0.541 0.5887
## Year2005:StatusDeveloping -3.6114 5.2120 -0.693 0.4884
## Year2006:StatusDeveloping -3.9799 5.2120 -0.764 0.4452
## Year2007:StatusDeveloping -5.2831 5.2120 -1.014 0.3108
## Year2008:StatusDeveloping -7.4769 5.2120 -1.435 0.1515
## Year2009:StatusDeveloping -4.4419 5.2120 -0.852 0.3942
## Year2010:StatusDeveloping -6.3321 5.2120 -1.215 0.2245
## Year2011:StatusDeveloping -5.3838 5.2120 -1.033 0.3017
## Year2012:StatusDeveloping 0.4260 5.2120 0.082 0.9349
## Year2013:StatusDeveloping 4.4275 5.1979 0.852 0.3944
## Year2014:StatusDeveloping 0.2629 5.2120 0.050 0.9598
## Year2015:StatusDeveloping -2.3866 5.2120 -0.458 0.6471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.94 on 2906 degrees of freedom
## Multiple R-squared: 0.8087, Adjusted R-squared: 0.8066
## F-statistic: 383.9 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response under.five.deaths :
##
## Call:
## lm(formula = under.five.deaths ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.66 -46.82 -34.00 -1.36 2435.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 2.156 28.292 0.076 0.9393
## Year2001 2.094 28.292 0.074 0.9410
## Year2002 2.062 28.292 0.073 0.9419
## Year2003 2.000 28.292 0.071 0.9436
## Year2004 1.937 28.292 0.068 0.9454
## Year2005 1.937 28.292 0.068 0.9454
## Year2006 1.906 28.292 0.067 0.9463
## Year2007 1.844 28.292 0.065 0.9480
## Year2008 1.781 28.292 0.063 0.9498
## Year2009 1.781 28.292 0.063 0.9498
## Year2010 1.687 28.292 0.060 0.9524
## Year2011 1.656 28.292 0.059 0.9533
## Year2012 1.594 28.292 0.056 0.9551
## Year2013 1.562 28.292 0.055 0.9560
## Year2014 1.500 28.292 0.053 0.9577
## Year2015 1.469 28.292 0.052 0.9586
## StatusDeveloping 62.499 31.146 2.007 0.0449 *
## Year2001:StatusDeveloping -1.971 44.047 -0.045 0.9643
## Year2002:StatusDeveloping -4.039 44.047 -0.092 0.9269
## Year2003:StatusDeveloping -6.042 44.047 -0.137 0.8909
## Year2004:StatusDeveloping -7.775 44.047 -0.177 0.8599
## Year2005:StatusDeveloping -9.940 44.047 -0.226 0.8215
## Year2006:StatusDeveloping -11.154 44.047 -0.253 0.8001
## Year2007:StatusDeveloping -12.946 44.047 -0.294 0.7688
## Year2008:StatusDeveloping -14.579 44.047 -0.331 0.7407
## Year2009:StatusDeveloping -16.459 44.047 -0.374 0.7087
## Year2010:StatusDeveloping -17.829 44.047 -0.405 0.6857
## Year2011:StatusDeveloping -19.765 44.047 -0.449 0.6537
## Year2012:StatusDeveloping -21.332 44.047 -0.484 0.6282
## Year2013:StatusDeveloping -25.509 43.927 -0.581 0.5615
## Year2014:StatusDeveloping -24.456 44.047 -0.555 0.5788
## Year2015:StatusDeveloping -25.968 44.047 -0.590 0.5555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 160 on 2906 degrees of freedom
## Multiple R-squared: 0.07877, Adjusted R-squared: 0.06863
## F-statistic: 7.765 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Polio :
##
## Call:
## lm(formula = Polio ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.937 -4.400 5.656 14.854 25.610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 89.0625 4.0170 22.171 <0.0000000000000002 ***
## Year2001 93.8750 4.0170 23.369 <0.0000000000000002 ***
## Year2002 91.9375 4.0170 22.887 <0.0000000000000002 ***
## Year2003 94.9688 4.0170 23.642 <0.0000000000000002 ***
## Year2004 92.3125 4.0170 22.980 <0.0000000000000002 ***
## Year2005 92.3750 4.0170 22.996 <0.0000000000000002 ***
## Year2006 94.9063 4.0170 23.626 <0.0000000000000002 ***
## Year2007 94.4688 4.0170 23.517 <0.0000000000000002 ***
## Year2008 94.5313 4.0170 23.533 <0.0000000000000002 ***
## Year2009 94.6250 4.0170 23.556 <0.0000000000000002 ***
## Year2010 92.1563 4.0170 22.942 <0.0000000000000002 ***
## Year2011 95.2813 4.0170 23.719 <0.0000000000000002 ***
## Year2012 95.6563 4.0170 23.813 <0.0000000000000002 ***
## Year2013 95.6250 4.0170 23.805 <0.0000000000000002 ***
## Year2014 92.9375 4.0170 23.136 <0.0000000000000002 ***
## Year2015 95.0625 4.0170 23.665 <0.0000000000000002 ***
## StatusDeveloping -15.6720 4.4222 -3.544 0.0004 ***
## Year2001:StatusDeveloping -3.8137 6.2540 -0.610 0.5420
## Year2002:StatusDeveloping 0.6580 6.2540 0.105 0.9162
## Year2003:StatusDeveloping -2.2077 6.2540 -0.353 0.7241
## Year2004:StatusDeveloping 0.7651 6.2540 0.122 0.9026
## Year2005:StatusDeveloping 1.7000 6.2540 0.272 0.7858
## Year2006:StatusDeveloping 2.9811 6.2540 0.477 0.6336
## Year2007:StatusDeveloping 3.8424 6.2540 0.614 0.5390
## Year2008:StatusDeveloping 4.6872 6.2540 0.749 0.4536
## Year2009:StatusDeveloping 4.2888 6.2540 0.686 0.4929
## Year2010:StatusDeveloping 5.5457 6.2540 0.887 0.3753
## Year2011:StatusDeveloping 2.5365 6.2540 0.406 0.6851
## Year2012:StatusDeveloping 1.4727 6.2540 0.235 0.8138
## Year2013:StatusDeveloping 2.5067 6.2370 0.402 0.6878
## Year2014:StatusDeveloping 5.7213 6.2540 0.915 0.3604
## Year2015:StatusDeveloping 1.3049 6.2540 0.209 0.8347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.72 on 2906 degrees of freedom
## Multiple R-squared: 0.9305, Adjusted R-squared: 0.9297
## F-statistic: 1215 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Total.expenditure :
##
## Call:
## lm(formula = Total.expenditure ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3012 -1.3706 -0.0675 1.2261 11.2888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 7.02156 0.40611 17.290 < 0.0000000000000002 ***
## Year2001 6.82063 0.40611 16.795 < 0.0000000000000002 ***
## Year2002 7.11375 0.40611 17.517 < 0.0000000000000002 ***
## Year2003 7.14313 0.40611 17.589 < 0.0000000000000002 ***
## Year2004 7.47719 0.40611 18.412 < 0.0000000000000002 ***
## Year2005 7.50406 0.40611 18.478 < 0.0000000000000002 ***
## Year2006 7.41688 0.40611 18.263 < 0.0000000000000002 ***
## Year2007 7.35438 0.40611 18.110 < 0.0000000000000002 ***
## Year2008 7.13406 0.40611 17.567 < 0.0000000000000002 ***
## Year2009 8.35156 0.40611 20.565 < 0.0000000000000002 ***
## Year2010 8.29188 0.40611 20.418 < 0.0000000000000002 ***
## Year2011 7.13906 0.40611 17.579 < 0.0000000000000002 ***
## Year2012 7.98750 0.40611 19.669 < 0.0000000000000002 ***
## Year2013 8.55125 0.40611 21.057 < 0.0000000000000002 ***
## Year2014 8.00375 0.40611 19.709 < 0.0000000000000002 ***
## Year2015 7.48553 0.40611 18.432 < 0.0000000000000002 ***
## StatusDeveloping -1.74364 0.44707 -3.900 0.0000983 ***
## Year2001:StatusDeveloping 0.25992 0.63225 0.411 0.681
## Year2002:StatusDeveloping 0.01119 0.63225 0.018 0.986
## Year2003:StatusDeveloping 0.15081 0.63225 0.239 0.811
## Year2004:StatusDeveloping -0.21708 0.63225 -0.343 0.731
## Year2005:StatusDeveloping -0.17621 0.63225 -0.279 0.780
## Year2006:StatusDeveloping -0.33485 0.63225 -0.530 0.596
## Year2007:StatusDeveloping -0.34010 0.63225 -0.538 0.591
## Year2008:StatusDeveloping 0.03839 0.63225 0.061 0.952
## Year2009:StatusDeveloping -0.47713 0.63225 -0.755 0.451
## Year2010:StatusDeveloping -0.85439 0.63225 -1.351 0.177
## Year2011:StatusDeveloping 0.33153 0.63225 0.524 0.600
## Year2012:StatusDeveloping -0.36222 0.63225 -0.573 0.567
## Year2013:StatusDeveloping -0.85639 0.63053 -1.358 0.175
## Year2014:StatusDeveloping -0.44302 0.63225 -0.701 0.484
## Year2015:StatusDeveloping 0.22407 0.63225 0.354 0.723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.297 on 2906 degrees of freedom
## Multiple R-squared: 0.8736, Adjusted R-squared: 0.8722
## F-statistic: 627.9 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Diphtheria :
##
## Call:
## lm(formula = Diphtheria ~ Year + Status + Year * Status - 1,
## data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84.000 -3.243 5.793 14.185 27.530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 83.6875 4.0457 20.686 < 0.0000000000000002 ***
## Year2001 92.0000 4.0457 22.740 < 0.0000000000000002 ***
## Year2002 92.2500 4.0457 22.802 < 0.0000000000000002 ***
## Year2003 92.8125 4.0457 22.941 < 0.0000000000000002 ***
## Year2004 95.1250 4.0457 23.513 < 0.0000000000000002 ***
## Year2005 92.5313 4.0457 22.871 < 0.0000000000000002 ***
## Year2006 95.1250 4.0457 23.513 < 0.0000000000000002 ***
## Year2007 94.6563 4.0457 23.397 < 0.0000000000000002 ***
## Year2008 94.7813 4.0457 23.428 < 0.0000000000000002 ***
## Year2009 94.8125 4.0457 23.435 < 0.0000000000000002 ***
## Year2010 92.4063 4.0457 22.841 < 0.0000000000000002 ***
## Year2011 95.5938 4.0457 23.628 < 0.0000000000000002 ***
## Year2012 95.7813 4.0457 23.675 < 0.0000000000000002 ***
## Year2013 95.8125 4.0457 23.683 < 0.0000000000000002 ***
## Year2014 93.0000 4.0457 22.987 < 0.0000000000000002 ***
## Year2015 95.2500 4.0457 23.543 < 0.0000000000000002 ***
## StatusDeveloping -12.2174 4.4538 -2.743 0.00612 **
## Year2001:StatusDeveloping -6.3790 6.2986 -1.013 0.31126
## Year2002:StatusDeveloping -4.0461 6.2986 -0.642 0.52068
## Year2003:StatusDeveloping -4.7216 6.2986 -0.750 0.45354
## Year2004:StatusDeveloping -6.4160 6.2986 -1.019 0.30846
## Year2005:StatusDeveloping -1.0928 6.2986 -0.174 0.86227
## Year2006:StatusDeveloping -3.4276 6.2986 -0.544 0.58636
## Year2007:StatusDeveloping -0.3496 6.2986 -0.056 0.95574
## Year2008:StatusDeveloping 0.1479 6.2986 0.023 0.98127
## Year2009:StatusDeveloping 0.3551 6.2986 0.056 0.95505
## Year2010:StatusDeveloping 0.4766 6.2986 0.076 0.93969
## Year2011:StatusDeveloping 1.9084 6.2986 0.303 0.76192
## Year2012:StatusDeveloping 0.2507 6.2986 0.040 0.96826
## Year2013:StatusDeveloping 0.7030 6.2815 0.112 0.91090
## Year2014:StatusDeveloping 1.4094 6.2986 0.224 0.82296
## Year2015:StatusDeveloping -0.6485 6.2986 -0.103 0.91800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.89 on 2906 degrees of freedom
## Multiple R-squared: 0.9293, Adjusted R-squared: 0.9285
## F-statistic: 1194 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response HIV.AIDS :
##
## Call:
## lm(formula = HIV.AIDS ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.998 -2.294 -0.773 0.000 47.534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 0.10000 0.88093 0.114 0.90963
## Year2001 0.10000 0.88093 0.114 0.90963
## Year2002 0.10000 0.88093 0.114 0.90963
## Year2003 0.10000 0.88093 0.114 0.90963
## Year2004 0.10000 0.88093 0.114 0.90963
## Year2005 0.10000 0.88093 0.114 0.90963
## Year2006 0.10000 0.88093 0.114 0.90963
## Year2007 0.10000 0.88093 0.114 0.90963
## Year2008 0.10000 0.88093 0.114 0.90963
## Year2009 0.10000 0.88093 0.114 0.90963
## Year2010 0.10000 0.88093 0.114 0.90963
## Year2011 0.10000 0.88093 0.114 0.90963
## Year2012 0.10000 0.88093 0.114 0.90963
## Year2013 0.10000 0.88093 0.114 0.90963
## Year2014 0.10000 0.88093 0.114 0.90963
## Year2015 0.10000 0.88093 0.114 0.90963
## StatusDeveloping 2.94503 0.96979 3.037 0.00241 **
## Year2001:StatusDeveloping 0.05298 1.37149 0.039 0.96919
## Year2002:StatusDeveloping 0.05298 1.37149 0.039 0.96919
## Year2003:StatusDeveloping 0.02053 1.37149 0.015 0.98806
## Year2004:StatusDeveloping -0.06225 1.37149 -0.045 0.96380
## Year2005:StatusDeveloping -0.22318 1.37149 -0.163 0.87074
## Year2006:StatusDeveloping -0.43709 1.37149 -0.319 0.74998
## Year2007:StatusDeveloping -0.65099 1.37149 -0.475 0.63506
## Year2008:StatusDeveloping -0.88808 1.37149 -0.648 0.51734
## Year2009:StatusDeveloping -1.20132 1.37149 -0.876 0.38114
## Year2010:StatusDeveloping -1.52252 1.37149 -1.110 0.26704
## Year2011:StatusDeveloping -1.74172 1.37149 -1.270 0.20420
## Year2012:StatusDeveloping -1.94305 1.37149 -1.417 0.15667
## Year2013:StatusDeveloping -2.17236 1.36776 -1.588 0.11234
## Year2014:StatusDeveloping -2.23974 1.37149 -1.633 0.10256
## Year2015:StatusDeveloping -2.26556 1.37149 -1.652 0.09866 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.983 on 2906 degrees of freedom
## Multiple R-squared: 0.1474, Adjusted R-squared: 0.138
## F-statistic: 15.7 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response GDP :
##
## Call:
## lm(formula = GDP ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29829 -3643 -2181 99 96216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 13411.0 2080.2 6.447 0.00000000013313 ***
## Year2001 14129.4 2080.2 6.792 0.00000000001333 ***
## Year2002 12105.7 2080.2 5.819 0.00000000654706 ***
## Year2003 10990.3 2080.2 5.283 0.00000013626825 ***
## Year2004 19705.6 2080.2 9.473 < 0.0000000000000002 ***
## Year2005 19468.1 2080.2 9.359 < 0.0000000000000002 ***
## Year2006 22585.6 2080.2 10.857 < 0.0000000000000002 ***
## Year2007 24866.5 2080.2 11.954 < 0.0000000000000002 ***
## Year2008 29970.9 2080.2 14.408 < 0.0000000000000002 ***
## Year2009 17486.7 2080.2 8.406 < 0.0000000000000002 ***
## Year2010 20670.1 2080.2 9.937 < 0.0000000000000002 ***
## Year2011 27414.7 2080.2 13.179 < 0.0000000000000002 ***
## Year2012 22242.6 2080.2 10.693 < 0.0000000000000002 ***
## Year2013 17536.2 2080.2 8.430 < 0.0000000000000002 ***
## Year2014 23240.4 2080.2 11.172 < 0.0000000000000002 ***
## Year2015 14477.7 2080.2 6.960 0.00000000000419 ***
## StatusDeveloping -11301.7 2290.0 -4.935 0.00000084585963 ***
## Year2001:StatusDeveloping -694.7 3238.6 -0.215 0.830155
## Year2002:StatusDeveloping 1495.8 3238.6 0.462 0.644200
## Year2003:StatusDeveloping 3027.8 3238.6 0.935 0.349905
## Year2004:StatusDeveloping -5151.5 3238.6 -1.591 0.111798
## Year2005:StatusDeveloping -4663.3 3238.6 -1.440 0.149999
## Year2006:StatusDeveloping -8340.9 3238.6 -2.575 0.010059 *
## Year2007:StatusDeveloping -9875.7 3238.6 -3.049 0.002314 **
## Year2008:StatusDeveloping -13926.5 3238.6 -4.300 0.00001762860100 ***
## Year2009:StatusDeveloping -2457.4 3238.6 -0.759 0.448049
## Year2010:StatusDeveloping -5898.2 3238.6 -1.821 0.068676 .
## Year2011:StatusDeveloping -11777.1 3238.6 -3.636 0.000281 ***
## Year2012:StatusDeveloping -4920.2 3238.6 -1.519 0.128811
## Year2013:StatusDeveloping -1458.6 3229.8 -0.452 0.651588
## Year2014:StatusDeveloping -6438.9 3238.6 -1.988 0.046887 *
## Year2015:StatusDeveloping 1233.2 3238.6 0.381 0.703388
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11770 on 2906 degrees of freedom
## Multiple R-squared: 0.3776, Adjusted R-squared: 0.3708
## F-statistic: 55.1 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Population :
##
## Call:
## lm(formula = Population ~ Year + Status + Year * Status - 1,
## data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21348551 -10808944 -6971493 -1554173 1272510702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 4939271 9544823 0.517 0.605
## Year2001 5195572 9544823 0.544 0.586
## Year2002 7403182 9544823 0.776 0.438
## Year2003 7740886 9544823 0.811 0.417
## Year2004 8199717 9544823 0.859 0.390
## Year2005 10989501 9544823 1.151 0.250
## Year2006 9534334 9544823 0.999 0.318
## Year2007 6216779 9544823 0.651 0.515
## Year2008 6982930 9544823 0.732 0.464
## Year2009 5016626 9544823 0.526 0.599
## Year2010 6865330 9544823 0.719 0.472
## Year2011 6946115 9544823 0.728 0.467
## Year2012 6920408 9544823 0.725 0.468
## Year2013 4674218 9544823 0.490 0.624
## Year2014 3797404 9544823 0.398 0.691
## Year2015 10397807 9544823 1.089 0.276
## StatusDeveloping 5358868 10507634 0.510 0.610
## Year2001:StatusDeveloping 315440 14860039 0.021 0.983
## Year2002:StatusDeveloping -5671282 14860039 -0.382 0.703
## Year2003:StatusDeveloping -2701831 14860039 -0.182 0.856
## Year2004:StatusDeveloping 3046106 14860039 0.205 0.838
## Year2005:StatusDeveloping 1028016 14860039 0.069 0.945
## Year2006:StatusDeveloping -13192 14860039 -0.001 0.999
## Year2007:StatusDeveloping 7114870 14860039 0.479 0.632
## Year2008:StatusDeveloping -2421565 14860039 -0.163 0.871
## Year2009:StatusDeveloping -358820 14860039 -0.024 0.981
## Year2010:StatusDeveloping -49718 14860039 -0.003 0.997
## Year2011:StatusDeveloping -627692 14860039 -0.042 0.966
## Year2012:StatusDeveloping 418487 14860039 0.028 0.978
## Year2013:StatusDeveloping 2672445 14819635 0.180 0.857
## Year2014:StatusDeveloping 12192319 14860039 0.820 0.412
## Year2015:StatusDeveloping -5218133 14860039 -0.351 0.725
##
## Residual standard error: 53990000 on 2906 degrees of freedom
## Multiple R-squared: 0.05226, Adjusted R-squared: 0.04182
## F-statistic: 5.007 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response thinness.btw.10.19years :
##
## Call:
## lm(formula = thinness.btw.10.19years ~ Year + Status + Year *
## Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9025 -3.1515 -0.4028 1.7227 21.7511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 1.46250 0.72659 2.013 0.0442 *
## Year2001 1.43437 0.72659 1.974 0.0485 *
## Year2002 1.41875 0.72659 1.953 0.0510 .
## Year2003 1.38125 0.72659 1.901 0.0574 .
## Year2004 1.36250 0.72659 1.875 0.0609 .
## Year2005 1.33750 0.72659 1.841 0.0658 .
## Year2006 1.32187 0.72659 1.819 0.0690 .
## Year2007 1.30312 0.72659 1.793 0.0730 .
## Year2008 1.28750 0.72659 1.772 0.0765 .
## Year2009 1.27812 0.72659 1.759 0.0787 .
## Year2010 1.27187 0.72659 1.750 0.0801 .
## Year2011 1.25625 0.72659 1.729 0.0839 .
## Year2012 1.25000 0.72659 1.720 0.0855 .
## Year2013 1.25000 0.72659 1.720 0.0855 .
## Year2014 1.25625 0.72659 1.729 0.0839 .
## Year2015 1.25937 0.72659 1.733 0.0832 .
## StatusDeveloping 4.50822 0.79989 5.636 0.0000000191 ***
## Year2001:StatusDeveloping -0.19373 1.13121 -0.171 0.8640
## Year2002:StatusDeveloping 0.07554 1.13121 0.067 0.9468
## Year2003:StatusDeveloping -0.16908 1.13121 -0.149 0.8812
## Year2004:StatusDeveloping -0.13179 1.13121 -0.117 0.9073
## Year2005:StatusDeveloping 0.09149 1.13121 0.081 0.9355
## Year2006:StatusDeveloping 0.13202 1.13121 0.117 0.9071
## Year2007:StatusDeveloping 0.05805 1.13121 0.051 0.9591
## Year2008:StatusDeveloping -0.07599 1.13121 -0.067 0.9464
## Year2009:StatusDeveloping -0.20900 1.13121 -0.185 0.8534
## Year2010:StatusDeveloping -0.35441 1.13121 -0.313 0.7541
## Year2011:StatusDeveloping -0.41296 1.13121 -0.365 0.7151
## Year2012:StatusDeveloping -0.41969 1.13121 -0.371 0.7107
## Year2013:StatusDeveloping -0.62048 1.12814 -0.550 0.5824
## Year2014:StatusDeveloping -0.48142 1.13121 -0.426 0.6704
## Year2015:StatusDeveloping -0.49065 1.13121 -0.434 0.6645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.11 on 2906 degrees of freedom
## Multiple R-squared: 0.6134, Adjusted R-squared: 0.6091
## F-statistic: 144.1 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response thinness.btw.5.9years :
##
## Call:
## lm(formula = thinness.btw.5.9years ~ Year + Status + Year * Status -
## 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9966 -3.1132 -0.4162 1.7451 22.6246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 1.44063 0.74173 1.942 0.0522 .
## Year2001 1.41250 0.74173 1.904 0.0570 .
## Year2002 1.39375 0.74173 1.879 0.0603 .
## Year2003 1.36875 0.74173 1.845 0.0651 .
## Year2004 1.34375 0.74173 1.812 0.0701 .
## Year2005 1.33125 0.74173 1.795 0.0728 .
## Year2006 1.30937 0.74173 1.765 0.0776 .
## Year2007 1.29062 0.74173 1.740 0.0820 .
## Year2008 1.27187 0.74173 1.715 0.0865 .
## Year2009 1.26250 0.74173 1.702 0.0888 .
## Year2010 1.24375 0.74173 1.677 0.0937 .
## Year2011 1.23125 0.74173 1.660 0.0970 .
## Year2012 1.21875 0.74173 1.643 0.1005
## Year2013 1.21562 0.74173 1.639 0.1013
## Year2014 1.21250 0.74173 1.635 0.1022
## Year2015 1.20000 0.74173 1.618 0.1058
## StatusDeveloping 4.65601 0.81655 5.702 0.000000013 ***
## Year2001:StatusDeveloping -0.19307 1.15478 -0.167 0.8672
## Year2002:StatusDeveloping -0.09485 1.15478 -0.082 0.9345
## Year2003:StatusDeveloping -0.17779 1.15478 -0.154 0.8776
## Year2004:StatusDeveloping -0.14087 1.15478 -0.122 0.9029
## Year2005:StatusDeveloping 0.06935 1.15478 0.060 0.9521
## Year2006:StatusDeveloping 0.05443 1.15478 0.047 0.9624
## Year2007:StatusDeveloping -0.14007 1.15478 -0.121 0.9035
## Year2008:StatusDeveloping -0.15906 1.15478 -0.138 0.8905
## Year2009:StatusDeveloping -0.41525 1.15478 -0.360 0.7192
## Year2010:StatusDeveloping -0.48657 1.15478 -0.421 0.6735
## Year2011:StatusDeveloping -0.55685 1.15478 -0.482 0.6297
## Year2012:StatusDeveloping -0.50955 1.15478 -0.441 0.6591
## Year2013:StatusDeveloping -0.77930 1.15164 -0.677 0.4987
## Year2014:StatusDeveloping -0.40499 1.15478 -0.351 0.7258
## Year2015:StatusDeveloping -0.51464 1.15478 -0.446 0.6559
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.196 on 2906 degrees of freedom
## Multiple R-squared: 0.6074, Adjusted R-squared: 0.603
## F-statistic: 140.5 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Income.composition.of.resources :
##
## Call:
## lm(formula = Income.composition.of.resources ~ Year + Status +
## Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61116 -0.09017 0.02960 0.11969 0.41061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 0.81110 0.03104 26.128 < 0.0000000000000002 ***
## Year2001 0.81635 0.03104 26.297 < 0.0000000000000002 ***
## Year2002 0.82336 0.03104 26.522 < 0.0000000000000002 ***
## Year2003 0.82636 0.03104 26.619 < 0.0000000000000002 ***
## Year2004 0.83162 0.03104 26.789 < 0.0000000000000002 ***
## Year2005 0.83734 0.03104 26.973 < 0.0000000000000002 ***
## Year2006 0.84405 0.03104 27.189 < 0.0000000000000002 ***
## Year2007 0.85098 0.03104 27.412 < 0.0000000000000002 ***
## Year2008 0.85627 0.03104 27.583 < 0.0000000000000002 ***
## Year2009 0.85984 0.03104 27.698 < 0.0000000000000002 ***
## Year2010 0.86056 0.03104 27.721 < 0.0000000000000002 ***
## Year2011 0.86519 0.03104 27.870 < 0.0000000000000002 ***
## Year2012 0.86788 0.03104 27.957 < 0.0000000000000002 ***
## Year2013 0.87110 0.03104 28.060 < 0.0000000000000002 ***
## Year2014 0.87600 0.03104 28.218 < 0.0000000000000002 ***
## Year2015 0.87872 0.03104 28.306 < 0.0000000000000002 ***
## StatusDeveloping -0.35771 0.03418 -10.467 < 0.0000000000000002 ***
## Year2001:StatusDeveloping 0.05334 0.04833 1.104 0.26985
## Year2002:StatusDeveloping 0.05152 0.04833 1.066 0.28650
## Year2003:StatusDeveloping 0.05288 0.04833 1.094 0.27402
## Year2004:StatusDeveloping 0.06037 0.04833 1.249 0.21174
## Year2005:StatusDeveloping 0.06735 0.04833 1.393 0.16359
## Year2006:StatusDeveloping 0.10149 0.04833 2.100 0.03582 *
## Year2007:StatusDeveloping 0.10110 0.04833 2.092 0.03653 *
## Year2008:StatusDeveloping 0.10274 0.04833 2.126 0.03362 *
## Year2009:StatusDeveloping 0.10504 0.04833 2.173 0.02983 *
## Year2010:StatusDeveloping 0.10832 0.04833 2.241 0.02509 *
## Year2011:StatusDeveloping 0.12532 0.04833 2.593 0.00956 **
## Year2012:StatusDeveloping 0.12838 0.04833 2.656 0.00794 **
## Year2013:StatusDeveloping 0.13126 0.04820 2.723 0.00650 **
## Year2014:StatusDeveloping 0.12904 0.04833 2.670 0.00763 **
## Year2015:StatusDeveloping 0.12876 0.04833 2.664 0.00776 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1756 on 2906 degrees of freedom
## Multiple R-squared: 0.9304, Adjusted R-squared: 0.9296
## F-statistic: 1213 on 32 and 2906 DF, p-value: < 0.00000000000000022
##
##
## Response Schooling :
##
## Call:
## lm(formula = Schooling ~ Year + Status + Year * Status - 1, data = life.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.9811 -1.6084 0.3136 1.7381 7.6938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Year2000 14.8898 0.4886 30.473 <0.0000000000000002 ***
## Year2001 15.0179 0.4886 30.735 <0.0000000000000002 ***
## Year2002 15.2616 0.4886 31.233 <0.0000000000000002 ***
## Year2003 15.3581 0.4886 31.431 <0.0000000000000002 ***
## Year2004 15.5675 0.4886 31.860 <0.0000000000000002 ***
## Year2005 15.5178 0.4886 31.758 <0.0000000000000002 ***
## Year2006 15.6761 0.4886 32.082 <0.0000000000000002 ***
## Year2007 15.7595 0.4886 32.253 <0.0000000000000002 ***
## Year2008 15.8689 0.4886 32.476 <0.0000000000000002 ***
## Year2009 15.9345 0.4886 32.611 <0.0000000000000002 ***
## Year2010 16.0533 0.4886 32.854 <0.0000000000000002 ***
## Year2011 16.1595 0.4886 33.071 <0.0000000000000002 ***
## Year2012 16.2220 0.4886 33.199 <0.0000000000000002 ***
## Year2013 16.2470 0.4886 33.250 <0.0000000000000002 ***
## Year2014 16.4158 0.4886 33.596 <0.0000000000000002 ***
## Year2015 16.4439 0.4886 33.653 <0.0000000000000002 ***
## StatusDeveloping -5.2633 0.5379 -9.785 <0.0000000000000002 ***
## Year2001:StatusDeveloping 0.3605 0.7607 0.474 0.636
## Year2002:StatusDeveloping 0.3080 0.7607 0.405 0.686
## Year2003:StatusDeveloping 0.3854 0.7607 0.507 0.612
## Year2004:StatusDeveloping 0.4302 0.7607 0.566 0.572
## Year2005:StatusDeveloping 0.6336 0.7607 0.833 0.405
## Year2006:StatusDeveloping 0.7333 0.7607 0.964 0.335
## Year2007:StatusDeveloping 0.7848 0.7607 1.032 0.302
## Year2008:StatusDeveloping 0.8127 0.7607 1.068 0.285
## Year2009:StatusDeveloping 0.8875 0.7607 1.167 0.243
## Year2010:StatusDeveloping 0.8985 0.7607 1.181 0.238
## Year2011:StatusDeveloping 0.9506 0.7607 1.250 0.212
## Year2012:StatusDeveloping 1.0277 0.7607 1.351 0.177
## Year2013:StatusDeveloping 0.9974 0.7587 1.315 0.189
## Year2014:StatusDeveloping 0.9754 0.7607 1.282 0.200
## Year2015:StatusDeveloping 0.9744 0.7607 1.281 0.200
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.764 on 2906 degrees of freedom
## Multiple R-squared: 0.9513, Adjusted R-squared: 0.9508
## F-statistic: 1775 on 32 and 2906 DF, p-value: < 0.00000000000000022
The data-set aims to answer the following key questions:
levels(life.data$Year)
## [1] "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009"
## [11] "2010" "2011" "2012" "2013" "2014" "2015"
life.data2 <- life.data %>%
mutate(Year2 = fct_recode(Year,
"0" = "2000",
"1" = "2001",
"2" = "2002",
"3" = "2003",
"4" = "2004",
"5" = "2005",
"6" = "2006",
"7" = "2007",
"8" = "2008",
"9" = "2009",
"10" = "2010",
"11" = "2011",
"12" = "2012",
"13" = "2013",
"14" = "2014",
"15" = "2015")) %>%
relocate(Year2, .after = Year)
life.data %>% dplyr::select(Country, Year) %>%
mutate(Year = as.numeric(as.character(Year)),
Country = as.numeric(as.character(Country)))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
model1 <- lme(Life.expectancy ~ 1, random=~1|Country, data = life.data2, method = "ML")
summary(model1)
## Linear mixed-effects model fit by maximum likelihood
## Data: life.data2
## AIC BIC logLik
## 15116.05 15134.01 -7555.026
##
## Random effects:
## Formula: ~1 | Country
## (Intercept) Residual
## StdDev: 8.982004 2.68177
##
## Fixed effects: Life.expectancy ~ 1
## Value Std.Error DF t-value p-value
## (Intercept) 69.36273 0.6497407 2745 106.7545 0
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -8.80812089 -0.51499336 -0.03812818 0.41851811 6.11831705
##
## Number of Observations: 2938
## Number of Groups: 193
Q2. Factors that Affacting Life Expectancy
Life expectancy is statistically and positively related to schooling, total expenditure of government on health , BMI, GDP, Diphtheria and Polio immunisation (P-value < 0.01 for all). However, life expectancy is statistically and negatively related to if an individual is living in developing countries, cases of HIV/AIDS deaths, adult mortality and infant deaths (P-value < 0.01 for all).
The data used in this project was collected from WHO and United Nations website with the help of Deeksha Russell and Duan Wang (KUMARRAJARSHI 2018).
Anderson D.R., Sweeney D.J, Williams T.A., 2006, Essentials of Statistics for Business & Economics, South-Western, Division of Thomson Learning, United States of America
Bruce, Peter, and Andrew Bruce. 2017. Practical Statistics for Data Scientists. O’Reilly Media.
James, Gareth, Daniela Witten, Trevor Hastie, and Robert Tibshirani. 2014. An Introduction to Statistical Learning: With Applications in R . Springer Publishing Company, Incorporated.
Kassambara.A 2018, Machine Learning Essentials: Practical Guide in R, eBook.
Kassambara.A n.d., COMPARING MULTIPLE MEANS IN R, viewed 1 July 2022, https://www.datanovia.com/en/lessons/ancova-in-r/#two-way-ancova
Kumarrajarshi 2018, “Life Expectancy (WHO)”, viewed 23 June 2022, https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who