library(readr) # Useful for importing data
library(magrittr) # Useful for pipe %>% Operator etc
library(ggplot2) # Useful for Data Visualisations & creating plots
library(Hmisc) # Useful for value imputation
library(dplyr) # Useful for data manipulation
library(car) # Useful for creating Anova tables
library(knitr) # Useful for creating nice tables
library(kableExtra) # Useful to build complex tables
library(rmarkdown) # Useful to create neat records of analysis
library(tidyr) # Useful for tidy up the dataset
library(forecast) # Useful for displaying and analysing univariate time series
library(editrules) #useful for detecting obvious errors
Dataset for this study is acquired from source: http://data.un.org/DocumentData.aspx?id=392
- Human Development Index dataset- holds value of HDI rank, HDI index, life expectancy at birth, expected and mean years of schooling, GNI per capita- details are available for 189 countries.
- Gender Inequality dataset- represents the values such as gender inequality index value and rank, maternal mortality ratio, adolescent birth rate, Percent share of seats in parliament by women, population with secondary education in both gender, labour force participation rate of both gender as well as HDI rank for 189 countries.
- Both datasets are merged using inner join by variable country as both hold observations of 189 countries.
- Country and HDI rank are the two variables common to both datasets; the remaining variables are different. To avoid duplicate columns, we drop ‘HDI rank’ from the ‘Human Development’ dataset.
- The Joined dataset holds 16 variables.
- All the variables are assigned with more readable names.
- Variables that do not belong to the year 2017, namely maternal mortality ratio and adolescent birth rate, are dropped.
- All the other attributes that hold data of the year 2017 as per data source and HDI rank for 2016 are considered.
# Load the first 189 data rows of each export (the paths are machine-specific).
Human_Development <- read.csv(
  "C:\\Users\\aditi\\Downloads\\Human Development.csv",
  nrows = 189
)
Gender_Inequality <- read.csv(
  "C:\\Users\\aditi\\Downloads\\Gender_Inequality.csv",
  nrows = 189
)

# Drop the two gender-inequality indicators that do not belong to the year 2017.
Gender_Inequality <- Gender_Inequality %>%
  select(-Maternal.mortality.ratio, -Adolescent.birth.rate)

# HDI.rank exists in both tables; keep only the copy in Gender_Inequality so
# the join does not produce a duplicated column.
Human_Development <- Human_Development %>%
  select(-HDI.rank)

# Join the gender inequality and human development data by country.
Human_Dev_Gender_Joined <- Gender_Inequality %>%
  inner_join(Human_Development, by = "Country")
head(Human_Dev_Gender_Joined)
## HDI.rank Country Gender.Inequality.Index.Value
## 1 1 Norway 0.048
## 2 2 Switzerland 0.039
## 3 3 Australia 0.109
## 4 4 Ireland 0.109
## 5 5 Germany 0.072
## 6 6 Iceland 0.062
## Gender.Inequality.Index.Rank
## 1 5
## 2 1
## 3 23
## 4 23
## 5 14
## 6 9
## Percent.share.of.seats.in.parliament.by.women
## 1 41.4
## 2 29.3
## 3 32.7
## 4 24.3
## 5 31.5
## 6 38.1
## Female.Population.with.secondary.education
## 1 96.3
## 2 96.4
## 3 90.0
## 4 90.2
## 5 96.2
## 6 100.0
## Male.Population.with.secondary.education
## 1 95.1
## 2 97.2
## 3 89.9
## 4 86.3
## 5 96.8
## 6 100.0
## Female.Labour.force.participation.rate
## 1 60.8
## 2 62.9
## 3 59.2
## 4 53.0
## 5 55.0
## 6 72.8
## Male.Labour.force.participation.rate HDI.Value
## 1 67.6 0.953
## 2 74.1 0.944
## 3 70.5 0.939
## 4 67.3 0.938
## 5 66.2 0.936
## 6 81.8 0.935
## Life.expectancy.at.birth.in.years Expected.years.of.schooling.in.years
## 1 82.3 17.9
## 2 83.5 16.2
## 3 83.1 22.9
## 4 81.6 19.6
## 5 81.2 17.0
## 6 82.9 19.3
## Mean.years.of.schooling.in.years GNI.per.capita
## 1 12.6 68012
## 2 13.4 57625
## 3 12.9 43560
## 4 12.5 53754
## 5 14.1 46136
## 6 12.4 45810
## GNI.per.capita.rank.minus.HDI.rank HDI.rank.2016.
## 1 5 1
## 2 8 2
## 3 18 3
## 4 8 4
## 5 13 4
## 6 13 6
# Check the structure of the joined dataset: str() lists every column with its
# type and first values, which shows which columns still need type conversion.
str(Human_Dev_Gender_Joined)
## 'data.frame': 189 obs. of 16 variables:
## $ HDI.rank : int 1 2 3 4 5 6 7 7 9 10 ...
## $ Country : Factor w/ 189 levels "Afghanistan",..: 127 164 9 82 65 77 75 163 153 122 ...
## $ Gender.Inequality.Index.Value : num 0.048 0.039 0.109 0.109 0.072 0.062 NA 0.044 0.067 0.044 ...
## $ Gender.Inequality.Index.Rank : int 5 1 23 23 14 9 NA 3 12 3 ...
## $ Percent.share.of.seats.in.parliament.by.women: num 41.4 29.3 32.7 24.3 31.5 38.1 NA 43.6 23 35.6 ...
## $ Female.Population.with.secondary.education : num 96.3 96.4 90 90.2 96.2 100 75.7 88.4 76.1 86.4 ...
## $ Male.Population.with.secondary.education : num 95.1 97.2 89.9 86.3 96.8 100 81.8 88.7 82.9 90.4 ...
## $ Female.Labour.force.participation.rate : num 60.8 62.9 59.2 53 55 72.8 54 60.8 60.5 58 ...
## $ Male.Labour.force.participation.rate : num 67.6 74.1 70.5 67.3 66.2 81.8 68.1 67.4 76.8 69.2 ...
## $ HDI.Value : num 0.953 0.944 0.939 0.938 0.936 0.935 0.933 0.933 0.932 0.931 ...
## $ Life.expectancy.at.birth.in.years : num 82.3 83.5 83.1 81.6 81.2 82.9 84.1 82.6 83.2 82 ...
## $ Expected.years.of.schooling.in.years : num 17.9 16.2 22.9 19.6 17 19.3 16.3 17.6 16.2 18 ...
## $ Mean.years.of.schooling.in.years : num 12.6 13.4 12.9 12.5 14.1 12.4 12 12.4 11.5 12.2 ...
## $ GNI.per.capita : int 68012 57625 43560 53754 46136 45810 58420 47766 82503 47900 ...
## $ GNI.per.capita.rank.minus.HDI.rank : int 5 8 18 8 13 13 2 9 -6 5 ...
## $ HDI.rank.2016. : Factor w/ 159 levels "..","1","10",..: 2 87 98 107 107 125 143 134 143 3 ...
# Type fixes on the joined data.
# GNI.per.capita is coerced to double; going through as.character() keeps the
# conversion correct even if the column was read as a factor.
Human_Dev_Gender_Joined$GNI.per.capita <- as.numeric(
  as.character(Human_Dev_Gender_Joined$GNI.per.capita)
)
# HDI.rank.2016. is read as text because the file uses ".." where no rank is
# available. Map that placeholder to NA explicitly so the conversion does not
# depend on a coercion warning; any other unparsable value still warns.
Human_Dev_Gender_Joined$HDI.rank.2016. <- local({
  rank_text <- trimws(as.character(Human_Dev_Gender_Joined$HDI.rank.2016.))
  rank_text[rank_text %in% ".."] <- NA_character_
  as.numeric(rank_text)
})
# Creating new variable GNI.per.capita.rank from the stored rank difference,
# then dropping the difference column, which is no longer needed.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(GNI.per.capita.rank = GNI.per.capita.rank.minus.HDI.rank + HDI.rank) %>%
  select(-GNI.per.capita.rank.minus.HDI.rank)
head(Human_Dev_Gender_Joined)
## HDI.rank Country Gender.Inequality.Index.Value
## 1 1 Norway 0.048
## 2 2 Switzerland 0.039
## 3 3 Australia 0.109
## 4 4 Ireland 0.109
## 5 5 Germany 0.072
## 6 6 Iceland 0.062
## Gender.Inequality.Index.Rank
## 1 5
## 2 1
## 3 23
## 4 23
## 5 14
## 6 9
## Percent.share.of.seats.in.parliament.by.women
## 1 41.4
## 2 29.3
## 3 32.7
## 4 24.3
## 5 31.5
## 6 38.1
## Female.Population.with.secondary.education
## 1 96.3
## 2 96.4
## 3 90.0
## 4 90.2
## 5 96.2
## 6 100.0
## Male.Population.with.secondary.education
## 1 95.1
## 2 97.2
## 3 89.9
## 4 86.3
## 5 96.8
## 6 100.0
## Female.Labour.force.participation.rate
## 1 60.8
## 2 62.9
## 3 59.2
## 4 53.0
## 5 55.0
## 6 72.8
## Male.Labour.force.participation.rate HDI.Value
## 1 67.6 0.953
## 2 74.1 0.944
## 3 70.5 0.939
## 4 67.3 0.938
## 5 66.2 0.936
## 6 81.8 0.935
## Life.expectancy.at.birth.in.years Expected.years.of.schooling.in.years
## 1 82.3 17.9
## 2 83.5 16.2
## 3 83.1 22.9
## 4 81.6 19.6
## 5 81.2 17.0
## 6 82.9 19.3
## Mean.years.of.schooling.in.years GNI.per.capita HDI.rank.2016.
## 1 12.6 68012 1
## 2 13.4 57625 2
## 3 12.9 43560 3
## 4 12.5 53754 4
## 5 14.1 46136 4
## 6 12.4 45810 6
## GNI.per.capita.rank
## 1 6
## 2 10
## 3 21
## 4 12
## 5 18
## 6 19
HDI Status
- Classifying the countries into HDI categories to assess where each country stands in terms of its level of human development.
- Creating a new variable ‘HDI Status’ which is derived by categorizing the ‘HDI value’.
- This helps to identify which countries fall into which level of human development.
- ‘HDI Status’ is an ordinal factor signifying the level of development so it is factorized.
- The new variable ‘HDI status’ holds the categories of HDI index. Following categories based on HDI value:
| Levels | Category |
|---|---|
| 0.800-1.000 | Very High |
| 0.700-0.799 | High |
| 0.555-0.699 | Medium |
| 0.350-0.554 | Low |
Change_in_Rank Position
* Comparing the HDI ranks of countries in 2017 and 2016, we examine the progress made by the countries under the ‘Rank.Change’ variable.
* Mutating the ‘Rank.Change’ variable, which shows whether the rank of a country in 2017 went ‘Up’, ‘Down’ or stayed the ‘Same’ relative to its rank in 2016.
* The ‘Rank.Change’ variable should be a factor, and so is changed from char to factor
# Mutating 'HDI Status' which includes categories based on HDI Value.
# Bands (lower bound inclusive), as listed in the table in this report:
#   >= 0.800 Very High | 0.700-0.799 High | 0.555-0.699 Medium | below 0.555 Low
# Each cut-off is the lower bound of its band, so a boundary value such as
# 0.799 is classified "High" and 0.554 is classified "Low".
# NOTE(review): 0.555 follows the table in this report; confirm it against the
# cut-off published by the data source.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(
    HDI.status = ifelse(
      HDI.Value >= 0.800, "Very High",
      ifelse(
        HDI.Value >= 0.700, "High",
        ifelse(HDI.Value >= 0.555, "Medium", "Low")
      )
    )
  )
# Checking the data type of 'HDI Status'
class(Human_Dev_Gender_Joined$HDI.status)
## [1] "character"
# Converting 'HDI Status' to an ordered factor & checking the levels.
# `levels =` fixes the order while keeping every value's own category.
# (`labels =` would rename the alphabetically sorted levels positionally and
# swap the Low / Medium / High categories.)
Human_Dev_Gender_Joined$HDI.status <- factor(
  Human_Dev_Gender_Joined$HDI.status,
  levels = c("Low", "Medium", "High", "Very High"),
  ordered = TRUE
)
levels(Human_Dev_Gender_Joined$HDI.status)
## [1] "Low" "Medium" "High" "Very High"
# Checking if conversion was successful
is.factor(Human_Dev_Gender_Joined$HDI.status)
## [1] TRUE
# Adding 'Rank.Change', the direction of the rank movement since 2016.
# A smaller rank number is a better position, so a 2017 rank below the 2016
# rank is "Up". A missing 2016 rank yields NA.
Human_Dev_Gender_Joined$Rank.Change <- with(
  Human_Dev_Gender_Joined,
  ifelse(
    HDI.rank > HDI.rank.2016., "Down",
    ifelse(HDI.rank < HDI.rank.2016., "Up", "Same")
  )
)
# Checking the data type of 'Rank.Change'
class(Human_Dev_Gender_Joined$Rank.Change)
## [1] "character"
# Turning 'Rank.Change' into a factor
Human_Dev_Gender_Joined$Rank.Change <- as.factor(Human_Dev_Gender_Joined$Rank.Change)
# Checking if conversion was successful
is.factor(Human_Dev_Gender_Joined$Rank.Change)
## [1] TRUE
# Count the NA entries in every column of the joined dataset
cat("Missing values in each column:")
## Missing values in each column:
Human_Dev_Gender_Joined %>%
  is.na() %>%
  colSums()
## HDI.rank
## 0
## Country
## 0
## Gender.Inequality.Index.Value
## 29
## Gender.Inequality.Index.Rank
## 29
## Percent.share.of.seats.in.parliament.by.women
## 2
## Female.Population.with.secondary.education
## 24
## Male.Population.with.secondary.education
## 24
## Female.Labour.force.participation.rate
## 11
## Male.Labour.force.participation.rate
## 11
## HDI.Value
## 0
## Life.expectancy.at.birth.in.years
## 0
## Expected.years.of.schooling.in.years
## 0
## Mean.years.of.schooling.in.years
## 0
## GNI.per.capita
## 0
## HDI.rank.2016.
## 1
## GNI.per.capita.rank
## 0
## HDI.status
## 0
## Rank.Change
## 1
# Omitting the rows without a Gender Inequality Index rank: a country that
# holds no rank holds no index value either, and vice versa.
# A logical mask is used instead of -which(...): when no NA is present,
# -which(...) is an empty index and would drop every row.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined[
  !is.na(Human_Dev_Gender_Joined$Gender.Inequality.Index.Rank),
]
# Flag the "special" values (Inf, -Inf, NaN) of a vector, element by element.
#
# x: any vector (typically one column of a data frame).
# Returns a logical vector of the same length as x. Non-numeric input cannot
# hold Inf/NaN, so it yields all FALSE instead of NULL, which keeps the return
# type stable for callers that sum or index the result.
is.special <- function(x) {
  if (!is.numeric(x)) {
    return(rep(FALSE, length(x)))
  }
  is.infinite(x) | is.nan(x)
}
# Number of Inf/NaN entries found in each column
vapply(
  Human_Dev_Gender_Joined,
  function(column) sum(is.special(column)),
  integer(1)
)
## HDI.rank
## 0
## Country
## 0
## Gender.Inequality.Index.Value
## 0
## Gender.Inequality.Index.Rank
## 0
## Percent.share.of.seats.in.parliament.by.women
## 0
## Female.Population.with.secondary.education
## 0
## Male.Population.with.secondary.education
## 0
## Female.Labour.force.participation.rate
## 0
## Male.Labour.force.participation.rate
## 0
## HDI.Value
## 0
## Life.expectancy.at.birth.in.years
## 0
## Expected.years.of.schooling.in.years
## 0
## Mean.years.of.schooling.in.years
## 0
## GNI.per.capita
## 0
## HDI.rank.2016.
## 0
## GNI.per.capita.rank
## 0
## HDI.status
## 0
## Rank.Change
## 0
# Checking the rank columns for obvious errors: a rank must be positive, and
# the 2017 HDI rank cannot exceed 189.
Rule1 <- editset(c("HDI.rank > 0", "HDI.rank <= 189"))
print(Rule1)
##
## Edit set:
## num1 : 0 < HDI.rank
## num2 : HDI.rank <= 189
Rule2 <- editset(c("HDI.rank.2016. > 0"))
print(Rule2)
##
## Edit set:
## num1 : 0 < HDI.rank.2016.
Rule3 <- editset(c("GNI.per.capita.rank > 0"))
print(Rule3)
##
## Edit set:
## num1 : 0 < GNI.per.capita.rank
# Total number of violations per rule set (0 means every row passes)
sum(violatedEdits(Rule1, Human_Dev_Gender_Joined))
## [1] 0
sum(violatedEdits(Rule2, Human_Dev_Gender_Joined))
## [1] 0
sum(violatedEdits(Rule3, Human_Dev_Gender_Joined))
## [1] 0
# Return the values of x that boxplot() reports as outliers.
# The statistics are computed without drawing anything (plot = FALSE).
outliers <- function(x) {
  box_summary <- boxplot(x, plot = FALSE)
  box_summary$out
}
cat("Outliers for numeric variables:")
## Outliers for numeric variables:
# Column 2 (Country) is skipped; columns 1 and 3 to 16 are scanned.
sapply(X = Human_Dev_Gender_Joined[c(1, 3:16)], FUN = outliers)
## $HDI.rank
## numeric(0)
##
## $Gender.Inequality.Index.Value
## numeric(0)
##
## $Gender.Inequality.Index.Rank
## numeric(0)
##
## $Percent.share.of.seats.in.parliament.by.women
## [1] 55.7
##
## $Female.Population.with.secondary.education
## numeric(0)
##
## $Male.Population.with.secondary.education
## numeric(0)
##
## $Female.Labour.force.participation.rate
## [1] 16.8 15.2 14.0 18.7 11.9 86.0 6.0
##
## $Male.Labour.force.participation.rate
## [1] 38.9 45.8
##
## $HDI.Value
## numeric(0)
##
## $Life.expectancy.at.birth.in.years
## numeric(0)
##
## $Expected.years.of.schooling.in.years
## [1] 22.9 5.4
##
## $Mean.years.of.schooling.in.years
## numeric(0)
##
## $GNI.per.capita
## [1] 68012 82503 65016 67805 116818 76427 70524
##
## $HDI.rank.2016.
## numeric(0)
##
## $GNI.per.capita.rank
## numeric(0)
# Replacing outliers using capping.
#
# Values below Q1 - 1.5 * IQR are replaced by the 5th percentile and values
# above Q3 + 1.5 * IQR by the 95th percentile.
#
# x: numeric vector; NA entries are ignored when the percentiles are computed
#    and are returned unchanged (without na.rm, quantile() and IQR() stop with
#    an error as soon as x contains an NA).
# Returns x with its outliers capped, same length and order as the input.
cap <- function(x) {
  qs <- quantile(x, c(0.05, 0.25, 0.75, 0.95), na.rm = TRUE, names = FALSE)
  fence <- 1.5 * IQR(x, na.rm = TRUE)
  # which() drops NA comparisons so missing values are never used as an index
  x[which(x < qs[2] - fence)] <- qs[1]
  x[which(x > qs[3] + fence)] <- qs[4]
  x
}
# Apply cap() to column 1 and columns 3-16 (column 2, Country, is skipped) and
# write the capped values back into the same columns.
# NOTE(review): this range also covers the rank columns (HDI.rank,
# Gender.Inequality.Index.Rank, HDI.rank.2016., GNI.per.capita.rank) - confirm
# that capping ranks is intended.
Human_Dev_Gender_Joined[,c(1, 3:16)]<-sapply(Human_Dev_Gender_Joined[,c(1, 3:16)],cap)
# Histogram of GNI per capita
hist(Human_Dev_Gender_Joined$GNI.per.capita)
# Normalising the data: Box-Cox transformation of GNI per capita, with the
# lambda parameter left to BoxCox() (lambda = "auto"), followed by a histogram
# of the transformed values
boxcox_gni_pc <- BoxCox(Human_Dev_Gender_Joined$GNI.per.capita, lambda = "auto")
hist(boxcox_gni_pc)
# Number of countries in each HDI status category
summary(Human_Dev_Gender_Joined$HDI.status)
## Low Medium High Very High
## 46 29 29 56
# First 6 rows (the data is listed in HDI rank order, so these are the top 6)
head(Human_Dev_Gender_Joined[, c("HDI.rank", "Country", "HDI.Value")])
## HDI.rank Country HDI.Value
## 1 1 Norway 0.953
## 2 2 Switzerland 0.944
## 3 3 Australia 0.939
## 4 4 Ireland 0.938
## 5 5 Germany 0.936
## 6 6 Iceland 0.935
# Last 6 rows (the bottom 6 countries)
tail(Human_Dev_Gender_Joined[, c("HDI.rank", "Country", "HDI.Value")])
## HDI.rank Country HDI.Value
## 183 183 Burkina Faso 0.423
## 184 184 Sierra Leone 0.419
## 185 185 Burundi 0.417
## 186 186 Chad 0.404
## 188 188 Central African Republic 0.367
## 189 189 Niger 0.354
# Box plot of the gender inequality index within each HDI status category.
boxplot_obs <- ggplot(
  data = Human_Dev_Gender_Joined,
  aes(x = HDI.status, y = Gender.Inequality.Index.Value, color = HDI.status)
) +
  geom_boxplot(size = 1, outlier.color = "black", outlier.size = 3) +
  # The complete theme goes first: added after theme(), it would replace the
  # axis-text settings below.
  theme_grey(base_size = 15) +
  # hjust = 1 right-aligns the rotated labels so they end at their tick marks
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 15)) +
  labs(
    x = "HDI.status",
    y = "Gender.Inequality.Index.Value",
    title = "HDI Status by gender inequality index"
  )
boxplot_obs
# Scatter plot of HDI value against the gender inequality index value
plot(HDI.Value ~Gender.Inequality.Index.Value,data = Human_Dev_Gender_Joined)
We conclude that the calculation of HDI needs to include not just life expectancy, education and GNI index but also needs to take into account the gender inequality in countries. And hence, the HDI needs to be adjusted according to inequality prevalent in societies.