India is a developing country. It has gone through a lot of ups and downs, but it has handled itself quite well. I wanted to know how India is performing with respect to other countries, so I’ll be taking the mean of the top ten developed countries, the top ten developing countries and the bottom ten countries, and we’ll see where we stand. Have we progressed? Has independence made any difference? Have we made a significant difference in GDP and life expectancy compared to underdeveloped countries?
Note: All the lists of countries have been taken from the **HDR website**: http://www.hdr.undp.org/
Check the working directory with getwd(). If the working directory isn’t the directory where you have downloaded all the datasets, change to that directory with setwd("X:/1.Study/4th year semester 2/Biostat/Assignment 1")
Now we’ll import all the .csv datasets and store each of them in its own data frame
# Load the five indicator tables from the working directory.
# check.names = FALSE stops read.csv() from rewriting the column headers.
ChildMortality <- read.csv(
  "child_mortality_0_5_year_olds_dying_per_1000_born.csv",
  header = TRUE, check.names = FALSE
)
ChildrenPerWomen <- read.csv(
  "children_per_woman_total_fertility.csv",
  header = TRUE, check.names = FALSE
)
IncomePerPerson <- read.csv(
  "income_per_person_gdppercapita_ppp_inflation_adjusted.csv",
  header = TRUE, check.names = FALSE
)
LifeExpectancy <- read.csv(
  "life_expectancy_years.csv",
  header = TRUE, check.names = FALSE
)
Population <- read.csv(
  "population_total.csv",
  header = TRUE, check.names = FALSE
)
Once all the datasets are imported we need to load all the libraries in R
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggthemes)
library(plotly)
library(Hmisc)
As we have to extract our selected countries from each dataset, instead of writing tedious code every time we are making a generalized function to make our life a bit easier.
Suppose the dummy inputs are the following:
# Placeholder inputs illustrating what Listcollecter() takes: one dataset
# followed by ten country names. All of these globals are reassigned with
# real values further down before any extraction is run.
Dataset <- LifeExpectancy
a <- "country 1"
b <- "country 2"
c <- "country 3"
d <- "country 4"
e <- "country 5"
f <- "country 6"
g <- "country 7"
h <- "country 8"
i <- "country 9"
j <- "country 10"
Listcollecter extracts our countries of interest from the specified dataset and returns them as a table.
# Return the rows of `Dataset` whose `country` value is one of the ten names
# supplied in `a` .. `j`. Names that do not occur in `Dataset$country` simply
# contribute no row; all columns are kept.
Listcollecter <- function(Dataset, a, b, c, d, e, f, g, h, i, j) {
  # The parameter `c` holds a string here, but the call c(...) still reaches
  # the base function because R skips non-function bindings in call position.
  wanted <- c(a, b, c, d, e, f, g, h, i, j)
  keep <- is.element(Dataset$country, wanted)
  Dataset[keep, ]
}
# Values of one country over columns 2:227 of `Dataset`, returned as a plain
# numeric vector with missing entries removed (so the result can be shorter
# than 226 elements).
countrydata <- function(countryname, Dataset) {
  is_country <- Dataset$country == countryname
  raw_values <- as.numeric(unlist(Dataset[is_country, 2:227]))
  raw_values[!is.na(raw_values)]
}
# Same extraction as countrydata(), restricted to columns 149:227 of
# `Dataset`; missing entries are removed from the returned numeric vector.
aicountrydata <- function(countryname, Dataset) {
  is_country <- Dataset$country == countryname
  raw_values <- as.numeric(unlist(Dataset[is_country, 149:227]))
  raw_values[!is.na(raw_values)]
}
# Same extraction as countrydata(), restricted to columns 2:149 of `Dataset`;
# missing entries are removed from the returned numeric vector.
bicountrydata <- function(countryname, Dataset) {
  is_country <- Dataset$country == countryname
  raw_values <- as.numeric(unlist(Dataset[is_country, 2:149]))
  raw_values[!is.na(raw_values)]
}
# Country names for the group plotted later as "Top 10 Developed Nations".
a <- "Norway"
b <- "Ireland"
c <- "Switzerland"
d <- "Hong Kong"
e <- "Iceland"
f <- "Germany"
g <- "Sweden"
h <- "Australia"
i <- "Netherlands"
j <- "Denmark"

# Pull this group out of each indicator table.
# NOTE(review): Listcollecter() filters with %in%, so a name spelled
# differently in a table is dropped without warning -- confirm that every
# result below has 10 rows.
Dataset <- LifeExpectancy
Toptenlife <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildMortality
Toptenchildmortality <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildrenPerWomen
Toptenchildrenperwomen <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- IncomePerPerson
Toptenincomeperperson <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- Population
Toptenpopulation <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
# Country names for the group plotted later as "Middle 10 Developing Nations".
a <- "Algeria"
b <- "Lebanon"
c <- "Fiji"
d <- "Moldova"
e <- "Maldives"
f <- "Tunisia"
g <- "Saint Vincent and the Grenadines"
h <- "Suriname"
i <- "Mongolia"
j <- "Botswana"

# Pull this group out of each indicator table.
# NOTE(review): Listcollecter() filters with %in%, so a name spelled
# differently in a table is dropped without warning -- confirm that every
# result below has 10 rows.
Dataset <- LifeExpectancy
Middletenlife <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildMortality
Middletenchildmortality <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildrenPerWomen
Middletenchildrenperwomen <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- IncomePerPerson
Middletenincomeperperson <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- Population
Middletenpopulation <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
# Country names for the group plotted later as the bottom 10 underdeveloped
# nations.
a <- "Eritrea"
b <- "Mozambique"
c <- "Burkina Faso"
d <- "Sierra Leone"
e <- "Mali"
f <- "Burundi"
g <- "South Sudan"
h <- "Chad"
i <- "Central African Republic"
j <- "Niger"

# Pull this group out of each indicator table.
# NOTE(review): Listcollecter() filters with %in%, so a name spelled
# differently in a table is dropped without warning -- confirm that every
# result below has 10 rows.
Dataset <- LifeExpectancy
Bottomtenlife <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildMortality
Bottomtenchildmortality <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- ChildrenPerWomen
Bottomtenchildrenperwomen <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- IncomePerPerson
Bottomtenincomeperperson <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
Dataset <- Population
Bottomtenpopulation <- Listcollecter(Dataset, a, b, c, d, e, f, g, h, i, j)
# Per-column mean of life expectancy across the rows of each country group.
# Column ranges: 2:227 = whole series, 149:227 = "ai" (plotted as after
# independence), 2:149 = "bi" (plotted as before independence).
# mean() runs without na.rm, so one missing value makes that column's mean NA.
topmeanlife <- apply(Toptenlife[, 2:227], MARGIN = 2, FUN = mean)
middlemeanlife <- apply(Middletenlife[, 2:227], MARGIN = 2, FUN = mean)
bottommeanlife <- apply(Bottomtenlife[, 2:227], MARGIN = 2, FUN = mean)

aitopmeanlife <- apply(Toptenlife[, 149:227], MARGIN = 2, FUN = mean)
aimiddlemeanlife <- apply(Middletenlife[, 149:227], MARGIN = 2, FUN = mean)
aibottommeanlife <- apply(Bottomtenlife[, 149:227], MARGIN = 2, FUN = mean)

bitopmeanlife <- apply(Toptenlife[, 2:149], MARGIN = 2, FUN = mean)
bimiddlemeanlife <- apply(Middletenlife[, 2:149], MARGIN = 2, FUN = mean)
bibottommeanlife <- apply(Bottomtenlife[, 2:149], MARGIN = 2, FUN = mean)
# Per-column mean of child mortality across the rows of each country group.
# Column ranges: 2:227 = whole series, 149:227 = "ai", 2:149 = "bi".
# mean() runs without na.rm, so one missing value makes that column's mean NA.
topmeanchildmortality <-
  apply(Toptenchildmortality[, 2:227], MARGIN = 2, FUN = mean)
middlemeanchildmortality <-
  apply(Middletenchildmortality[, 2:227], MARGIN = 2, FUN = mean)
bottommeanchildmortality <-
  apply(Bottomtenchildmortality[, 2:227], MARGIN = 2, FUN = mean)

aitopmeanchildmortality <-
  apply(Toptenchildmortality[, 149:227], MARGIN = 2, FUN = mean)
aimiddlemeanchildmortality <-
  apply(Middletenchildmortality[, 149:227], MARGIN = 2, FUN = mean)
aibottommeanchildmortality <-
  apply(Bottomtenchildmortality[, 149:227], MARGIN = 2, FUN = mean)

bitopmeanchildmortality <-
  apply(Toptenchildmortality[, 2:149], MARGIN = 2, FUN = mean)
bimiddlemeanchildmortality <-
  apply(Middletenchildmortality[, 2:149], MARGIN = 2, FUN = mean)
bibottommeanchildmortality <-
  apply(Bottomtenchildmortality[, 2:149], MARGIN = 2, FUN = mean)
# Per-column mean of children per woman across the rows of each country group.
# Column ranges: 2:227 = whole series, 149:227 = "ai", 2:149 = "bi".
# mean() runs without na.rm, so one missing value makes that column's mean NA.
topmeanchildrenperwomen <-
  apply(Toptenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)
middlemeanchildrenperwomen <-
  apply(Middletenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)
bottommeanchildrenperwomen <-
  apply(Bottomtenchildrenperwomen[, 2:227], MARGIN = 2, FUN = mean)

aitopmeanchildrenperwomen <-
  apply(Toptenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)
aimiddlemeanchildrenperwomen <-
  apply(Middletenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)
aibottommeanchildrenperwomen <-
  apply(Bottomtenchildrenperwomen[, 149:227], MARGIN = 2, FUN = mean)

bitopmeanchildrenperwomen <-
  apply(Toptenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)
bimiddlemeanchildrenperwomen <-
  apply(Middletenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)
bibottommeanchildrenperwomen <-
  apply(Bottomtenchildrenperwomen[, 2:149], MARGIN = 2, FUN = mean)
# Per-column mean of income per person across the rows of each country group.
# Column ranges: 2:227 = whole series, 149:227 = "ai", 2:149 = "bi".
# mean() runs without na.rm, so one missing value makes that column's mean NA.
topmeangdp <- apply(Toptenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)
middlemeangdp <-
  apply(Middletenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)
bottommeangdp <-
  apply(Bottomtenincomeperperson[, 2:227], MARGIN = 2, FUN = mean)

aitopmeangdp <- apply(Toptenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)
aimiddlemeangdp <-
  apply(Middletenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)
aibottommeangdp <-
  apply(Bottomtenincomeperperson[, 149:227], MARGIN = 2, FUN = mean)

bitopmeangdp <- apply(Toptenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)
bimiddlemeangdp <-
  apply(Middletenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)
bibottommeangdp <-
  apply(Bottomtenincomeperperson[, 2:149], MARGIN = 2, FUN = mean)
# Per-column mean of population across the rows of each country group.
# Column ranges: 2:227 = whole series, 149:227 = "ai", 2:149 = "bi".
# mean() runs without na.rm, so one missing value makes that column's mean NA.
topmeanpopulation <- apply(Toptenpopulation[, 2:227], MARGIN = 2, FUN = mean)
middlemeanpopulation <-
  apply(Middletenpopulation[, 2:227], MARGIN = 2, FUN = mean)
bottommeanpopulation <-
  apply(Bottomtenpopulation[, 2:227], MARGIN = 2, FUN = mean)

aitopmeanpopulation <-
  apply(Toptenpopulation[, 149:227], MARGIN = 2, FUN = mean)
aimiddlemeanpopulation <-
  apply(Middletenpopulation[, 149:227], MARGIN = 2, FUN = mean)
aibottommeanpopulation <-
  apply(Bottomtenpopulation[, 149:227], MARGIN = 2, FUN = mean)

bitopmeanpopulation <- apply(Toptenpopulation[, 2:149], MARGIN = 2, FUN = mean)
bimiddlemeanpopulation <-
  apply(Middletenpopulation[, 2:149], MARGIN = 2, FUN = mean)
bibottommeanpopulation <-
  apply(Bottomtenpopulation[, 2:149], MARGIN = 2, FUN = mean)
# India's own series for each indicator. The extractor functions drop missing
# values, so these vectors can be shorter than the matching group means.

# Whole series (columns 2:227).
indialife <- as.vector(countrydata("India", LifeExpectancy))
indiagdp <- as.vector(countrydata("India", IncomePerPerson))
indiachildmortality <- as.vector(countrydata("India", ChildMortality))
indiachildperwomen <- as.vector(countrydata("India", ChildrenPerWomen))
indiapopulation <- as.vector(countrydata("India", Population))

# "ai" series (columns 149:227).
aiindialife <- as.vector(aicountrydata("India", LifeExpectancy))
aiindiagdp <- as.vector(aicountrydata("India", IncomePerPerson))
aiindiachildmortality <- as.vector(aicountrydata("India", ChildMortality))
aiindiachildperwomen <- as.vector(aicountrydata("India", ChildrenPerWomen))
aiindiapopulation <- as.vector(aicountrydata("India", Population))

# "bi" series (columns 2:149).
biindialife <- as.vector(bicountrydata("India", LifeExpectancy))
biindiagdp <- as.vector(bicountrydata("India", IncomePerPerson))
biindiachildmortality <- as.vector(bicountrydata("India", ChildMortality))
biindiachildperwomen <- as.vector(bicountrydata("India", ChildrenPerWomen))
biindiapopulation <- as.vector(bicountrydata("India", Population))
# Column headers of LifeExpectancy for the three column ranges used above.
# These are character vectors (they come from the column names).
Years <- names(LifeExpectancy[1, 2:227])
a47Years <- names(LifeExpectancy[1, 149:227])
b47Years <- names(LifeExpectancy[1, 2:149])
# Life expectancy over time: the three group means plus India on one chart.
plot(
  Years, topmeanlife,
  col = "red", ylim = c(20, 100), pch = 20,
  main = "Life expectancy for years 1800 to 2025",
  ylab = "Life expectancies in Years"
)
points(Years, middlemeanlife, col = " blue", pch = 20)
points(Years, bottommeanlife, col = "green", pch = 20)
points(Years, indialife, col = "black", pch = 20)
legend(
  1800, 100,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Botom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)

# Vertical reference lines for the events discussed in the text; the red one
# marks 1947.
abline(v = c(1918, 1945, 2000))
abline(v = 1947, col = "red")
text(1955, 30, "Independence", col = "red", font = 2)
text(
  c(1918, 1945, 2000), 75,
  c("World War 1", "World War 2", "AIDS Pandemic")
)
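Here I have drawn vertical lines marking the important events I noticed: World War 1 (1918), World War 2 (1945), India’s independence (1947, in red) and the AIDS pandemic (2000). Each is discussed in turn below.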
Around World War 1 there is a very sharp dip in the life expectancies of developing and developed countries, but the underdeveloped countries remain unaffected, as only developing and developed ones were participating in World War 1, though India didn’t show much of a decrease during that time.
Around World War 2 we can also see a dip in developing and developed countries, but strangely the underdeveloped ones are also showing dips. Why? Wasn’t World War 2 fought amongst the superpowers only? After a bit of research I found that underdeveloped countries like Eritrea were used by the Italians against Sudan; similarly, Mozambique served in various places in the Portuguese empire, and as Chad was a French colony at that time, it was an ally in World War 2. Similarly, India was a British colony, so it also faced the consequences.
15th Aug 1947 was a historic day for India. We can see a sharp change in the life expectancy of India: it crossed that of the underdeveloped countries and kept moving upwards after that.
The AIDS pandemic mostly affected sub-Saharan African countries, so we can see a significant dip in the underdeveloped countries; it also affected India a bit, but developed countries remained almost unaffected.
As India has shown a sharp change after independence, it would be unfair to compare only the overall means, so we’ll also check the before- and after-independence data.
# 2 x 2 grid of box plots: whole series, the "ai" and "bi" sub-ranges, and a
# direct comparison of India with the underdeveloped group for the "ai" range.
par(mfrow = c(2, 2))
boxplot(
  topmeanlife, middlemeanlife, bottommeanlife, indialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1800 to 2025"
)
boxplot(
  aitopmeanlife, aimiddlemeanlife, aibottommeanlife, aiindialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1947 to 2025 (after Independence)"
)
boxplot(
  bitopmeanlife, bimiddlemeanlife, bibottommeanlife, biindialife,
  names = c("Developed", "Developing", "Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "Life Expectancy Means for years 1800 to 1947 (before Independence)"
)
boxplot(
  aibottommeanlife, aiindialife,
  names = c("Underdeveloped", "India"),
  ylab = "Life Expectancy in Years",
  main = "India and Underdeveloped nations after independence"
)
From the above plots we can say India’s life expectancy has increased a lot after independence. To check this hypothesis we will do some tests.
Null hypothesis: the data are normally distributed. Alternative hypothesis: the data are not normally distributed.
# Shapiro-Wilk normality test on each whole-series life-expectancy vector
# (three group means and India). The lines starting with ## are the recorded
# output of each call; every p-value shown is below 0.05.
shapiro.test(topmeanlife)
##
## Shapiro-Wilk normality test
##
## data: topmeanlife
## W = 0.87578, p-value = 1.228e-12
shapiro.test(middlemeanlife)
##
## Shapiro-Wilk normality test
##
## data: middlemeanlife
## W = 0.7453, p-value < 2.2e-16
shapiro.test(bottommeanlife)
##
## Shapiro-Wilk normality test
##
## data: bottommeanlife
## W = 0.77159, p-value < 2.2e-16
shapiro.test(indialife)
##
## Shapiro-Wilk normality test
##
## data: indialife
## W = 0.78295, p-value < 2.2e-16
As the p-values are all less than 0.05, we reject the null hypothesis: our data are not normally distributed.
# 2 x 2 grid of histograms of the whole-series life-expectancy vectors.
par(mfrow = c(2, 2))
hist(
  topmeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of TOP 10 countries", xlab = "Life Exp"
)
hist(
  middlemeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of Middle 10 countries", xlab = "Life Exp"
)
hist(
  bottommeanlife,
  col = "#8BC3B6", border = "#09622A",
  main = "Mean life expectancy of Bottom 10 countries", xlab = "Life Exp"
)
hist(
  indialife,
  col = "#8BC3B6", border = "#09622A",
  main = "Life expectancy India", xlab = "Life Exp"
)
From the histograms, the data do not look normally distributed.
# 2 x 2 grid of normal Q-Q plots, each with its reference line drawn on top.
par(mfrow = c(2, 2))

qqnorm(
  topmeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "Develped nations mean lifeexp Normal Q-Q plot"
)
qqline(topmeanlife, col = "steelblue", lwd = 2)

qqnorm(
  middlemeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "Developing nations mean lifeexp Normal Q-Q plot"
)
qqline(middlemeanlife, col = "steelblue", lwd = 2)

qqnorm(
  bottommeanlife,
  col = "red", pch = 1, frame = FALSE,
  main = "underdeveloped nations mean lifeexp Normal Q-Q plot"
)
qqline(bottommeanlife, col = "steelblue", lwd = 2)

qqnorm(
  indialife,
  col = "red", pch = 1, frame = FALSE,
  main = "India lifeexp Normal Q-Q plot"
)
qqline(indialife, col = "steelblue", lwd = 2)
From the above tests and plots it is clear that the data sets are not normally distributed, so we can’t use parametric tests like the t-test or ANOVA to check whether the means are the same or not. Instead we will use their non-parametric alternatives, the Wilcoxon and Kruskal-Wallis tests.
First we need to check whether the groups are significantly different or not. For this we’ll do a Kruskal-Wallis test, and for that we need to make a data frame.
# Long-format table for the Kruskal-Wallis test: every life-expectancy value
# in `values`, labelled in `variable` with the vector it came from.
lifedataframe <- data.frame(
  values = c(topmeanlife, middlemeanlife, bottommeanlife, indialife),
  variable = rep(
    c("topmeanlife", "middlemeanlife", "bottommeanlife", "indialife"),
    times = c(
      length(topmeanlife), length(middlemeanlife),
      length(bottommeanlife), length(indialife)
    )
  )
)
now doing kruskal test on lifedataframe
# Kruskal-Wallis rank sum test of `values` grouped by `variable` (four
# groups, hence df = 3 in the recorded output below).
kruskal.test(lifedataframe$values~lifedataframe$variable)
##
## Kruskal-Wallis rank sum test
##
## data: lifedataframe$values by lifedataframe$variable
## Kruskal-Wallis chi-squared = 265.86, df = 3, p-value < 2.2e-16
As the p-value is significant, we can say the groups are different, so now we will do Wilcoxon tests on pairs for a more detailed analysis.
Null hypothesis: the mean life expectancy of the bottom ten countries is greater than or equal to India’s before independence. Alternative hypothesis: the mean life expectancy of the bottom ten countries is less than India’s before independence.
# Paired, one-sided Wilcoxon signed-rank test on the "bi" (2:149) columns:
# x = bottom-group mean, y = India; alternative = "less" asks whether x tends
# to lie below y.
# NOTE(review): the recorded p-value is 1, so the null is NOT rejected. To
# test directly that India was lower than the bottom group, the direction
# would be alternative = "greater" -- confirm the intended hypothesis.
wilcox.test(bibottommeanlife, biindialife, paired = T, alternative = "less")
##
## Wilcoxon signed rank test with continuity correction
##
## data: bibottommeanlife and biindialife
## V = 10995, p-value = 1
## alternative hypothesis: true location shift is less than 0
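As the p-value is 1 (far above 0.05), the result is not significant, so we cannot reject the null hypothesis: there is no evidence that the bottom ten countries had a lower life expectancy than India before independence. This is consistent with the life expectancy of India being less than that of the bottom ten countries before independence, although strictly a non-significant result does not prove it; a test with alternative = "greater" would check that directly.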
Null hypothesis: the mean life expectancy of the bottom ten countries is less than or equal to India’s after independence. Alternative hypothesis: the mean life expectancy of the bottom ten countries is greater than India’s after independence.
# Paired, one-sided Wilcoxon signed-rank test on the "ai" (149:227) columns:
# x = bottom-group mean, y = India; alternative = "greater" asks whether x
# tends to lie above y.
# NOTE(review): the recorded p-value is 1, so the null is NOT rejected. To
# test directly that India was higher than the bottom group, the direction
# would be alternative = "less" -- confirm the intended hypothesis.
wilcox.test(aibottommeanlife, aiindialife, paired = T, alternative = "greater")
##
## Wilcoxon signed rank test with continuity correction
##
## data: aibottommeanlife and aiindialife
## V = 60, p-value = 1
## alternative hypothesis: true location shift is greater than 0
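As the p-value is 1 (far above 0.05), the result is not significant, so we cannot reject the null hypothesis: there is no evidence that the bottom ten countries had a higher life expectancy than India after independence. This is consistent with the life expectancy of India becoming greater than that of the bottom ten countries after independence, although strictly a non-significant result does not prove it; a test with alternative = "less" would check that directly.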
# Scatter of GDP per capita against life expectancy for the three group means
# and India.
plot(
  topmeangdp, topmeanlife,
  col = "red", pch = 20,
  xlab = "GDP per capita", ylab = "Life Expectancy in years",
  main = "GDP per capita vs Life expectancy"
)
points(middlemeangdp, middlemeanlife, col = " blue", pch = 20)
points(bottommeangdp, bottommeanlife, col = "green", pch = 20)
points(indiagdp, indialife, col = "black", pch = 20)
legend(
  35000, 45,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Botom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)
### Hypothesis: with an increase in GDP, life expectancy also increases. To check this correlation we can do a correlation test.
# Correlation test between the developed-group mean GDP per capita and mean
# life expectancy; the recorded output shows Pearson's method was used.
# NOTE(review): the Shapiro-Wilk output earlier in this file rejects
# normality for topmeanlife, so method = "spearman" may fit better with the
# non-parametric approach used elsewhere -- confirm.
cor.test(topmeangdp, topmeanlife)
##
## Pearson's product-moment correlation
##
## data: topmeangdp and topmeanlife
## t = 27.197, df = 224, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8418542 0.9033253
## sample estimates:
## cor
## 0.8761038
As the correlation coefficient is positive (about 0.88) and the p-value is significant, life expectancy increases as GDP increases.
# Scatter of log GDP per capita against child mortality for the three group
# means and India.
plot(
  log(topmeangdp), topmeanchildmortality,
  col = "red", pch = 20,
  xlab = "Log of Means of GDP per capita", ylab = "Child Mortality",
  main = "GDP per capita vs Child Mortality"
)
points(log(middlemeangdp), middlemeanchildmortality, col = " blue", pch = 20)
points(log(bottommeangdp), bottommeanchildmortality, col = "green", pch = 20)
points(log(indiagdp), indiachildmortality, col = "black", pch = 20)
legend(
  10, 390,
  legend = c(
    "Top 10 Developed Nations", "Middle 10 Developing Nations",
    "Botom 10 Underdeveloped Nations", "India"
  ),
  fill = c("red", "blue", "green", "black"),
  bg = "lightblue"
)
With an increase in GDP, child mortality is decreasing. To check this correlation we can do a correlation test.
# Correlation test between the log of the developed-group mean GDP per capita
# and the group's mean child mortality; the recorded output shows Pearson's
# method was used.
cor.test(log(topmeangdp), topmeanchildmortality)
##
## Pearson's product-moment correlation
##
## data: log(topmeangdp) and topmeanchildmortality
## t = -44.202, df = 224, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.959123 -0.931861
## sample estimates:
## cor
## -0.9471769
As the correlation coefficient is negative (about -0.95) and the p-value is significant, child mortality decreases as GDP increases.