Hypothesis

India is a developing country. It has gone through a lot of ups and downs but has handled itself quite well. I wanted to know how India is performing with respect to other countries, so I’ll take the mean of the top ten developed countries, ten developing (middle-ranked) countries and the bottom ten countries and see where we stand. Have we progressed? Has independence made any difference? Have we made a significant difference in GDP and life expectancy compared to underdeveloped countries?

 Note: All the list of countries has been taken from **HDR website** http://www.hdr.undp.org/ 

Workflow

first we will import all the datasets

Check the working directory with getwd(). If it is not the directory where you downloaded the datasets, change it with setwd("X:/1.Study/4th year semester 2/Biostat/Assignment 1")

Now we’ll import all the .csv datasets and store each one in its own data frame

# Load the five indicator tables from the working directory.
# check.names = FALSE keeps the CSV headers exactly as written instead of
# rewriting them into syntactic R names; later code reads the headers back
# with colnames() to get the year labels.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
ChildMortality <- read.csv(
  "child_mortality_0_5_year_olds_dying_per_1000_born.csv",
  header = TRUE, check.names = FALSE
)

ChildrenPerWomen <- read.csv(
  "children_per_woman_total_fertility.csv",
  header = TRUE, check.names = FALSE
)

IncomePerPerson <- read.csv(
  "income_per_person_gdppercapita_ppp_inflation_adjusted.csv",
  header = TRUE, check.names = FALSE
)

LifeExpectancy <- read.csv(
  "life_expectancy_years.csv",
  header = TRUE, check.names = FALSE
)

Population <- read.csv(
  "population_total.csv",
  header = TRUE, check.names = FALSE
)

Loading the libraries

Once all the datasets are imported we need to load all the libraries in R

library(tidyverse)
library(dplyr)
library(tidyr)
library(ggthemes)
library(plotly)
library(Hmisc)

Creating generalised functions

As we have to extract our selected countries from each dataset, instead of writing tedious code every time we make a generalized function to make our life a bit easier.

suppose a dummy dataset is following

# Placeholder inputs that only illustrate the arguments Listcollecter takes.
# a..j are reassigned with real country names before each extraction below.
# NOTE: `c` becomes a character variable here; calls such as c(...) still
# reach the function because R skips non-function bindings when it resolves
# the name in a call.
Dataset <- LifeExpectancy
a <- "country 1"
b <- "country 2"
c <- "country 3"
d <- "country 4"
e <- "country 5"
f <- "country 6"
g <- "country 7"
h <- "country 8"
i <- "country 9"
j <- "country 10"

Listcollecter extracts our country of interest from the specified dataset and stores in a table.

# Listcollecter: rows of `Dataset` whose `country` value is one of ten names.
#
# Dataset  data frame with a `country` column.
# a ... j  the ten country names to keep.
#
# Returns the matching rows (all columns) in the order they appear in
# `Dataset`. A name that does not occur in `Dataset$country` contributes no
# row; this used to happen silently, so a ten-country group could shrink
# without notice. It now raises a warning listing the unmatched names.
Listcollecter <- function(Dataset, a, b, c, d, e, f, g, h, i, j) {
  # base::c is spelled out because the parameter `c` shadows the name here.
  wanted <- base::c(a, b, c, d, e, f, g, h, i, j)
  unmatched <- setdiff(wanted, Dataset$country)
  if (length(unmatched) > 0) {
    warning(
      "Countries not found in dataset: ",
      paste(unmatched, collapse = ", "),
      call. = FALSE
    )
  }
  Dataset[Dataset$country %in% wanted, ]
}

Extracting Specific countries from the list

# countrydata: numeric series for one country from the wide table `Dataset`.
#
# countryname  value compared against Dataset$country.
# Dataset      data frame with a `country` column.
# cols         column positions to extract. Defaults to 2:227, the window
#              that used to be hard-coded (the plots label it 1800 to 2025).
#
# Returns a plain numeric vector with NA entries removed. Because NAs are
# dropped, the result can be shorter than `cols` and then no longer lines up
# one-to-one with the year labels.
countrydata <- function(countryname, Dataset, cols = 2:227) {
  selected <- Dataset[Dataset$country == countryname, cols]
  values <- as.numeric(unlist(selected))
  as.vector(na.omit(values))
}

# aicountrydata: numeric series for one country over the "after independence"
# column window of the wide table `Dataset`.
#
# countryname  value compared against Dataset$country.
# Dataset      data frame with a `country` column.
# cols         column positions to extract. Defaults to 149:227, the window
#              that used to be hard-coded (the plots label it 1947 to 2025).
#              NOTE(review): column 149 is also the last column of the 2:149
#              "before independence" window, so both windows include it;
#              confirm that overlap is intended.
#
# Returns a plain numeric vector with NA entries removed, so it can be
# shorter than `cols`.
aicountrydata <- function(countryname, Dataset, cols = 149:227) {
  selected <- Dataset[Dataset$country == countryname, cols]
  values <- as.numeric(unlist(selected))
  as.vector(na.omit(values))
}

# bicountrydata: numeric series for one country over the "before
# independence" column window of the wide table `Dataset`.
#
# countryname  value compared against Dataset$country.
# Dataset      data frame with a `country` column.
# cols         column positions to extract. Defaults to 2:149, the window
#              that used to be hard-coded (the plots label it 1800 to 1947).
#
# Returns a plain numeric vector with NA entries removed, so it can be
# shorter than `cols`.
bicountrydata <- function(countryname, Dataset, cols = 2:149) {
  selected <- Dataset[Dataset$country == countryname, cols]
  values <- as.numeric(unlist(selected))
  as.vector(na.omit(values))
}

Extracting top ten countries

 # Country names for the "top ten" group; a..j are passed to Listcollecter,
 # which matches them against the `country` column with %in%.
 a <- "Norway"
 b <- "Ireland"
 c <- "Switzerland"
 d <- "Hong Kong" # NOTE(review): confirm this exact spelling occurs in the `country` column (some sources use "Hong Kong, China"); an unmatched name contributes no row
 e <- "Iceland" 
 f <- "Germany"
 g <- "Sweden"
 h <- "Australia"
 i <- "Netherlands"
 j <- "Denmark"

Top ten countries life expectancies

# Life-expectancy rows for the ten countries currently held in a..j.
Dataset <- LifeExpectancy
Toptenlife <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Top 10 countries child mortality rate

# Child-mortality rows for the ten countries currently held in a..j.
Dataset <- ChildMortality
Toptenchildmortality <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Top 10 countries Child per women

# Children-per-woman rows for the ten countries currently held in a..j.
Dataset <- ChildrenPerWomen
Toptenchildrenperwomen <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Top 10 countries income per person

# Income-per-person rows for the ten countries currently held in a..j.
Dataset <- IncomePerPerson
Toptenincomeperperson <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Top 10 countries Population

# Population rows for the ten countries currently held in a..j.
Dataset <- Population
Toptenpopulation <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Extracting Middle Ten Countries

 # Country names for the "middle ten" group; a..j are passed to Listcollecter,
 # which matches them against the `country` column with %in%.
 a <- "Algeria"
 b <- "Lebanon"
 c <- "Fiji"
 d <- "Moldova"
 e <- "Maldives"
 f <- "Tunisia"
 g <- "Saint Vincent and the Grenadines" # NOTE(review): confirm this exact spelling occurs in the `country` column (some sources abbreviate it "St. Vincent and the Grenadines"); an unmatched name contributes no row
 h <- "Suriname"
 i <- "Mongolia"
 j <- "Botswana"

Middle 10 countries Life expectancy

# Life-expectancy rows for the ten countries currently held in a..j.
Dataset <- LifeExpectancy
Middletenlife <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Middle 10 countries child mortality rate

# Child-mortality rows for the ten countries currently held in a..j.
Dataset <- ChildMortality
Middletenchildmortality <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Middle 10 countries Child per women

# Children-per-woman rows for the ten countries currently held in a..j.
Dataset <- ChildrenPerWomen
Middletenchildrenperwomen <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Middle 10 countries income per person

# Income-per-person rows for the ten countries currently held in a..j.
Dataset <- IncomePerPerson
Middletenincomeperperson <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Middle 10 countries Population

# Population rows for the ten countries currently held in a..j.
Dataset <- Population
Middletenpopulation <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Bottom 10 countries

 # Country names for the "bottom ten" group; a..j are passed to Listcollecter,
 # which matches them against the `country` column with %in%.
 a <- "Eritrea"
 b <- "Mozambique"
 c <- "Burkina Faso"
 d <- "Sierra Leone"
 e <- "Mali"
 f <- "Burundi"
 g <- "South Sudan" 
 h <- "Chad"
 i <- "Central African Republic"
 j <- "Niger"

Bottom10 countries Life expectancy

# Life-expectancy rows for the ten countries currently held in a..j.
Dataset <- LifeExpectancy
Bottomtenlife <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Bottom 10 countries child mortality rate

# Child-mortality rows for the ten countries currently held in a..j.
Dataset <- ChildMortality
Bottomtenchildmortality <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Bottom 10 countries Child per women

# Children-per-woman rows for the ten countries currently held in a..j.
Dataset <- ChildrenPerWomen
Bottomtenchildrenperwomen <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Bottom 10 countries income per person

# Income-per-person rows for the ten countries currently held in a..j.
Dataset <- IncomePerPerson
Bottomtenincomeperperson <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Bottom 10 countries Population

# Population rows for the ten countries currently held in a..j.
Dataset <- Population
Bottomtenpopulation <- Listcollecter(
  Dataset, a, b, c, d, e, f, g, h, i, j
)

Merging files

Calculating means

Means of Life expectancies

 # Column-wise (per-year) mean life expectancy for each country group.
 # Three column windows are used: 2:227 (whole series), 149:227 ("ai", after
 # independence) and 2:149 ("bi", before independence).
 topmeanlife <- apply(Toptenlife[, 2:227], MARGIN = 2, FUN = mean)
 middlemeanlife <- apply(Middletenlife[, 2:227], MARGIN = 2, FUN = mean)
 bottommeanlife <- apply(Bottomtenlife[, 2:227], MARGIN = 2, FUN = mean)
 aitopmeanlife <- apply(Toptenlife[, 149:227], MARGIN = 2, FUN = mean)
 aimiddlemeanlife <- apply(Middletenlife[, 149:227], MARGIN = 2, FUN = mean)
 aibottommeanlife <- apply(Bottomtenlife[, 149:227], MARGIN = 2, FUN = mean)
 bitopmeanlife <- apply(Toptenlife[, 2:149], MARGIN = 2, FUN = mean)
 bimiddlemeanlife <- apply(Middletenlife[, 2:149], MARGIN = 2, FUN = mean)
 bibottommeanlife <- apply(Bottomtenlife[, 2:149], MARGIN = 2, FUN = mean)

Means of Child Mortality

 # Column-wise (per-year) mean child mortality for each country group, over
 # the whole series (2:227), the "ai" window (149:227) and the "bi" window
 # (2:149).
 topmeanchildmortality <- apply(Toptenchildmortality[, 2:227], MARGIN = 2, FUN = mean)
 middlemeanchildmortality <- apply(Middletenchildmortality[, 2:227], MARGIN = 2, FUN = mean)
 bottommeanchildmortality <- apply(Bottomtenchildmortality[, 2:227], MARGIN = 2, FUN = mean)
 aitopmeanchildmortality <- apply(Toptenchildmortality[, 149:227], MARGIN = 2, FUN = mean)
 aimiddlemeanchildmortality <- apply(Middletenchildmortality[, 149:227], MARGIN = 2, FUN = mean)
 aibottommeanchildmortality <- apply(Bottomtenchildmortality[, 149:227], MARGIN = 2, FUN = mean)
 bitopmeanchildmortality <- apply(Toptenchildmortality[, 2:149], MARGIN = 2, FUN = mean)
 bimiddlemeanchildmortality <- apply(Middletenchildmortality[, 2:149], MARGIN = 2, FUN = mean)
 bibottommeanchildmortality <- apply(Bottomtenchildmortality[, 2:149], MARGIN = 2, FUN = mean)

Means of Children per women

 # Column-wise (per-year) mean children per woman for each country group,
 # over the whole series (2:227), the "ai" window (149:227) and the "bi"
 # window (2:149).
 topmeanchildrenperwomen <- apply(Toptenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)
 middlemeanchildrenperwomen <- apply(Middletenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)
 bottommeanchildrenperwomen <- apply(Bottomtenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)
 aitopmeanchildrenperwomen <- apply(Toptenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)
 aimiddlemeanchildrenperwomen <- apply(Middletenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)
 aibottommeanchildrenperwomen <- apply(Bottomtenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)
 bitopmeanchildrenperwomen <- apply(Toptenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)
 bimiddlemeanchildrenperwomen <- apply(Middletenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)
 bibottommeanchildrenperwomen <- apply(Bottomtenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)

Means of Income Per person

 # Column-wise (per-year) mean income per person for each country group,
 # over the whole series (2:227), the "ai" window (149:227) and the "bi"
 # window (2:149).
 topmeangdp <- apply(Toptenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)
 middlemeangdp <- apply(Middletenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)
 bottommeangdp <- apply(Bottomtenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)
 aitopmeangdp <- apply(Toptenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)
 aimiddlemeangdp <- apply(Middletenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)
 aibottommeangdp <- apply(Bottomtenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)
 bitopmeangdp <- apply(Toptenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)
 bimiddlemeangdp <- apply(Middletenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)
 bibottommeangdp <- apply(Bottomtenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)

Means of Population

 # Column-wise (per-year) mean population for each country group, over the
 # whole series (2:227), the "ai" window (149:227) and the "bi" window (2:149).
 topmeanpopulation <- apply(Toptenpopulation[, 2:227], MARGIN = 2, FUN = mean)
 middlemeanpopulation <- apply(Middletenpopulation[, 2:227], MARGIN = 2, FUN = mean)
 bottommeanpopulation <- apply(Bottomtenpopulation[, 2:227], MARGIN = 2, FUN = mean)
 aitopmeanpopulation <- apply(Toptenpopulation[, 149:227], MARGIN = 2, FUN = mean)
 aimiddlemeanpopulation <- apply(Middletenpopulation[, 149:227], MARGIN = 2, FUN = mean)
 aibottommeanpopulation <- apply(Bottomtenpopulation[, 149:227], MARGIN = 2, FUN = mean)
 bitopmeanpopulation <- apply(Toptenpopulation[, 2:149], MARGIN = 2, FUN = mean)
 bimiddlemeanpopulation <- apply(Middletenpopulation[, 2:149], MARGIN = 2, FUN = mean)
 bibottommeanpopulation <- apply(Bottomtenpopulation[, 2:149], MARGIN = 2, FUN = mean)

Country datasets

Overall dataset for India

# India's full series for each indicator. countrydata already returns a bare
# numeric vector, so no further conversion is applied.
indialife <- countrydata("India", LifeExpectancy)
indiagdp <- countrydata("India", IncomePerPerson)
indiachildmortality <- countrydata("India", ChildMortality)
indiachildperwomen <- countrydata("India", ChildrenPerWomen)
indiapopulation <- countrydata("India", Population)

India dataset after independence

# India's after-independence series for each indicator. aicountrydata already
# returns a bare numeric vector, so no further conversion is applied.
aiindialife <- aicountrydata("India", LifeExpectancy)
aiindiagdp <- aicountrydata("India", IncomePerPerson)
aiindiachildmortality <- aicountrydata("India", ChildMortality)
aiindiachildperwomen <- aicountrydata("India", ChildrenPerWomen)
aiindiapopulation <- aicountrydata("India", Population)

India dataset before independence

# India's before-independence series for each indicator. bicountrydata already
# returns a bare numeric vector, so no further conversion is applied.
biindialife <- bicountrydata("India", LifeExpectancy)
biindiagdp <- bicountrydata("India", IncomePerPerson)
biindiachildmortality <- bicountrydata("India", ChildMortality)
biindiachildperwomen <- bicountrydata("India", ChildrenPerWomen)
biindiapopulation <- bicountrydata("India", Population)
# Year labels taken from the column headers of the three column windows.
# colnames() returns character strings, so these are character vectors; the
# plotting calls that use Years rely on them being coerced to numbers.
Years <- colnames(LifeExpectancy[1, 2:227])
a47Years <- colnames(LifeExpectancy[1, 149:227])
b47Years <- colnames(LifeExpectancy[1, 2:149])
# Life expectancy over time for the three country groups and India, with
# vertical markers for the events discussed in the observations.
# Fixes: the legend label read "Botom"; the colour literal " blue" carried a
# stray leading space.
plot(
  Years, topmeanlife,
  col = "red", ylim = c(20, 100), pch = 20,
  main = "Life expectancy for years 1800 to 2025",
  ylab = "Life expectancies in Years"
)
points(Years, middlemeanlife, col = "blue", pch = 20)
points(Years, bottommeanlife, col = "green", pch = 20)
points(Years, indialife, col = "black", pch = 20)
legend(
  1800, 100,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Bottom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)
# Black lines mark the three events; the red line marks independence.
abline(v = 1918)
abline(v = 1945)
abline(v = 2000)
abline(v = 1947, col = "red")
text(1955, 30, "Independence", col = "red", font = 2)
text(1918, 75, "World War 1")
text(1945, 75, "World War 2")
text(2000, 75, "AIDS Pandemic")

Observations

Here I have drawn three black vertical lines marking three important events I noticed, plus a red line marking India’s independence

World war 1 (~1918)

There is a very sharp dip in the life expectancies of developing and developed countries, but the underdeveloped countries remain unaffected, as only developing and developed ones were participating in World War 1. India did not show much of a decrease during that time.

World war 2 (~1945)

Here we can also see a dip in developing and developed countries, but strangely the underdeveloped ones are also showing dips. Why? Wasn’t World War 2 fought among the superpowers only? After a bit of research I found that underdeveloped countries were drawn in as colonies: Eritrea was used by the Italians against Sudan, Mozambique served various parts of the Portuguese empire, and Chad, a French colony at that time, was on the Allied side. Similarly, India was a British colony, so it also faced the consequences.

Independence (1947)

15th Aug 1947 was a historic day for India. We can see a sharp change in India’s life expectancy: it overtook the underdeveloped countries and kept moving upwards after that.

AIDS Pandemic (~2000)

It mostly affected sub-Saharan African countries, so we can see a significant dip in the underdeveloped countries. It also affected India a bit, but the developed countries remained almost unaffected.

Testing Above observations

As India has shown a sharp change after independence, it would be unfair to compare only the overall means, so we’ll also check the before- and after-independence data.

# 2 x 2 grid of boxplots: whole series, after independence, before
# independence, and India against the bottom-ten group after independence.
par(mfrow = c(2, 2))
boxplot(
  topmeanlife, middlemeanlife, bottommeanlife, indialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1800 to 2025"
)
boxplot(
  aitopmeanlife, aimiddlemeanlife, aibottommeanlife, aiindialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1947 to 2025 (after Independence)"
)
boxplot(
  bitopmeanlife, bimiddlemeanlife, bibottommeanlife, biindialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1800 to 1947 (before Independence)"
)
boxplot(
  aibottommeanlife, aiindialife,
  names = c("Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "India and Underdeveloped nations after independence"
)

From the above plots we can say India’s life expectancy has increased a lot after independence. To check this hypothesis we will do some tests.

Shapiro-Wilk test to check normality

NULL Hypothesis: The data are normally distributed. Alternate Hypothesis: The data are not normally distributed.

If P > 0.05 we fail to reject the NULL Hypothesis, i.e. the data are consistent with a normal distribution
# Shapiro-Wilk normality test on each full life-expectancy series.
# The "##" lines are the recorded console output of the call above them.
# Top-ten group means:
shapiro.test(topmeanlife)
## 
##  Shapiro-Wilk normality test
## 
## data:  topmeanlife
## W = 0.87578, p-value = 1.228e-12
# Middle-ten group means:
shapiro.test(middlemeanlife)
## 
##  Shapiro-Wilk normality test
## 
## data:  middlemeanlife
## W = 0.7453, p-value < 2.2e-16
# Bottom-ten group means:
shapiro.test(bottommeanlife)
## 
##  Shapiro-Wilk normality test
## 
## data:  bottommeanlife
## W = 0.77159, p-value < 2.2e-16
# India:
shapiro.test(indialife)
## 
##  Shapiro-Wilk normality test
## 
## data:  indialife
## W = 0.78295, p-value < 2.2e-16

As the P values are all less than 0.05, we reject the NULL Hypothesis: our data are not normally distributed

Checking Normality by plotting Histogram

# 2 x 2 grid of histograms, one per life-expectancy series, as a visual
# check of the distribution shape.
par(mfrow = c(2, 2))
hist(
  topmeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of TOP 10 countries", xlab = "Life Exp"
)
hist(
  middlemeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of Middle 10 countries", xlab = "Life Exp"
)
hist(
  bottommeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of Bottom 10 countries", xlab = "Life Exp"
)
hist(
  indialife,
  col = "#8BC3B6", border = "#09622A",
  main = "Life expectancy India", xlab = "Life Exp"
)

From the histograms, the data do not look normally distributed

To further confirm we can do Q-Q plot

# 2 x 2 grid of normal Q-Q plots, one per life-expectancy series; each
# qqline adds the reference line for that series.
# Fix: the first title read "Develped nations".
par(mfrow = c(2, 2))
qqnorm(
  topmeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "Developed nations mean lifeexp Normal Q-Q plot"
)
qqline(topmeanlife, col = "steelblue", lwd = 2)
qqnorm(
  middlemeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "Developing nations mean lifeexp Normal Q-Q plot"
)
qqline(middlemeanlife, col = "steelblue", lwd = 2)
qqnorm(
  bottommeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "underdeveloped nations mean lifeexp Normal Q-Q plot"
)
qqline(bottommeanlife, col = "steelblue", lwd = 2)
qqnorm(
  indialife,
  col = "red", pch = 1, frame = FALSE,
  main = "India lifeexp Normal Q-Q plot"
)
qqline(indialife, col = "steelblue", lwd = 2)

From the above tests it is clear that the data sets are not normally distributed, so we can’t use parametric tests like the t-test or ANOVA to check whether the means are the same. Instead we will use their non-parametric alternatives, the Wilcoxon and Kruskal-Wallis tests.

Kruskal test

First we need to check whether the four data sets differ significantly from one another. For this we’ll do a Kruskal-Wallis test, which requires the data in a single data frame.

# Stack the four life-expectancy series into long format: `values` holds the
# numbers and `variable` names the series each value came from.
lifedataframe <- data.frame(
  values = c(topmeanlife, middlemeanlife, bottommeanlife, indialife),
  variable = rep(
    c("topmeanlife", "middlemeanlife", "bottommeanlife", "indialife"),
    times = c(
      length(topmeanlife), length(middlemeanlife),
      length(bottommeanlife), length(indialife)
    )
  )
)

now doing kruskal test on lifedataframe

# Kruskal-Wallis rank sum test of `values` grouped by `variable`, i.e. across
# the four series stacked in lifedataframe.
# The "##" lines are the recorded console output of this call.
kruskal.test(lifedataframe$values~lifedataframe$variable)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  lifedataframe$values by lifedataframe$variable
## Kruskal-Wallis chi-squared = 265.86, df = 3, p-value < 2.2e-16

As the P value is significant, we can say that at least one of the data sets differs from the others, so now we will do Wilcoxon tests on pairs for a more detailed analysis

Wilcoxon test

NULL Hypothesis: Life expectancy of the bottom ten countries is greater than or equal to India’s before independence. Alternate Hypothesis: Life expectancy of the bottom ten countries is less than India’s before independence.

# Paired Wilcoxon signed rank test on the pre-independence window, pairing
# the bottom-ten mean with India's value position by position.
# alternative = "less" asks whether (bottom ten - India) is shifted below 0.
# TRUE replaces the reassignable shorthand T; the recorded output below is
# unaffected by that change.
wilcox.test(bibottommeanlife, biindialife, paired = TRUE, alternative = "less")
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  bibottommeanlife and biindialife
## V = 10995, p-value = 1
## alternative hypothesis: true location shift is less than 0

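The P value is 1, which is not significant, so we fail to reject the NULL Hypothesis: there is no evidence that the bottom ten countries had a lower life expectancy than India before independence. This is consistent with India’s life expectancy being below that of the bottom ten countries in that period; to show that directly, the test would have to be run with alternative = "greater".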

NULL Hypothesis: Life expectancy of the bottom ten countries is less than or equal to India’s after independence. Alternate Hypothesis: Life expectancy of the bottom ten countries is greater than India’s after independence.

# Paired Wilcoxon signed rank test on the post-independence window, pairing
# the bottom-ten mean with India's value position by position.
# alternative = "greater" asks whether (bottom ten - India) is shifted above 0.
# TRUE replaces the reassignable shorthand T; the recorded output below is
# unaffected by that change.
wilcox.test(aibottommeanlife, aiindialife, paired = TRUE, alternative = "greater")
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  aibottommeanlife and aiindialife
## V = 60, p-value = 1
## alternative hypothesis: true location shift is greater than 0

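The P value is 1, which is not significant, so we fail to reject the NULL Hypothesis: there is no evidence that the bottom ten countries had a higher life expectancy than India after independence. This is consistent with India’s life expectancy having risen above that of the bottom ten countries after independence; to show that directly, the test would have to be run with alternative = "less".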

How GDP per capita changed along with life expectancies

# Life expectancy against GDP per capita for the three country groups and
# India; each point is one column (year) of the series.
# Fixes: the legend label read "Botom"; the colour literal " blue" carried a
# stray leading space.
plot(
  topmeangdp, topmeanlife,
  col = "red", pch = 20,
  xlab = "GDP per capita", ylab = "Life Expectancy in years",
  main = "GDP per capita vs Life expectancy"
)
points(middlemeangdp, middlemeanlife, col = "blue", pch = 20)
points(bottommeangdp, bottommeanlife, col = "green", pch = 20)
points(indiagdp, indialife, col = "black", pch = 20)
legend(
  35000, 45,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Bottom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)

Hypothesis: With an increase in GDP, life expectancy also increases. To check this correlation we can do a correlation test.

Correlation test

# Pearson correlation test between the top-ten group's mean GDP per capita
# and its mean life expectancy. The "##" lines are the recorded output.
# NOTE(review): only the top-ten series is tested; the other groups and India
# are plotted above but not tested.
# NOTE(review): the life-expectancy series were judged non-normal earlier in
# this document; consider method = "spearman" and confirm with the author.
cor.test(topmeangdp, topmeanlife)
## 
##  Pearson's product-moment correlation
## 
## data:  topmeangdp and topmeanlife
## t = 27.197, df = 224, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8418542 0.9033253
## sample estimates:
##       cor 
## 0.8761038

As the correlation coefficient is positive (0.88) and the P value is significant, life expectancy increases with GDP for the developed countries

How Child mortality changed with GDP per capita

# Child mortality against log GDP per capita for the three country groups
# and India; each point is one column (year) of the series.
# Fixes: the legend label read "Botom"; the colour literal " blue" carried a
# stray leading space.
plot(
  log(topmeangdp), topmeanchildmortality,
  col = "red", pch = 20,
  xlab = "Log of Means of GDP per capita", ylab = "Child Mortality",
  main = "GDP per capita vs Child Mortality"
)
points(log(middlemeangdp), middlemeanchildmortality, col = "blue", pch = 20)
points(log(bottommeangdp), bottommeanchildmortality, col = "green", pch = 20)
points(log(indiagdp), indiachildmortality, col = "black", pch = 20)
legend(
  10, 390,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Bottom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)

Hypothesis

With an increase in GDP, child mortality decreases. To check this correlation we can do a correlation test.

Correlation test

# Pearson correlation test between the log of the top-ten group's mean GDP
# per capita and its mean child mortality. The "##" lines are the recorded
# output.
# NOTE(review): only the top-ten series is tested; the other groups and India
# are plotted above but not tested.
cor.test(log(topmeangdp), topmeanchildmortality)
## 
##  Pearson's product-moment correlation
## 
## data:  log(topmeangdp) and topmeanchildmortality
## t = -44.202, df = 224, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.959123 -0.931861
## sample estimates:
##        cor 
## -0.9471769

As the correlation coefficient is negative (-0.95) and the P value is significant, child mortality decreases as GDP increases for the developed countries

—————————————THE END———————————————-

Govind Prakash

17096

—————————————THE END———————————————-