Required packages

library(readr) # Useful for importing data
library(magrittr) # Useful for pipe %>% Operator etc
library(ggplot2) # Useful for Data Visualisations & creating plots
library(Hmisc) # Useful for value imputation
library(dplyr) # Useful for data manipulation
library(car) # Useful for creating Anova tables
library(knitr) # Useful for creating nice tables
library(kableExtra) # Useful to build complex tables
library(rmarkdown) # Useful to create neat records of analysis
library(tidyr) # Useful for tidy up the dataset
library(forecast) # Useful for displaying and analysing univariate time series
library(editrules) #useful for detecting obvious errors

Executive Summary

Data

Dataset for this study is acquired from source: http://data.un.org/DocumentData.aspx?id=392
- Human Development Index dataset: holds the HDI rank, HDI value, life expectancy at birth, expected and mean years of schooling, and GNI per capita for 189 countries.
- Gender Inequality dataset: holds the Gender Inequality Index value and rank, maternal mortality ratio, adolescent birth rate, percent share of seats in parliament held by women, population with secondary education for both genders, labour force participation rate for both genders, and the HDI rank for 189 countries.
- Both datasets are merged using inner join by variable country as both hold observations of 189 countries.
- Country and HDI rank are the two variables common to both datasets, whereas the remaining variables are different. To avoid duplicate columns, we drop ‘HDI rank’ from the ‘Human Development’ dataset.
- The Joined dataset holds 16 variables.
- All the variables are assigned with more readable names.
- A few variables that do not belong to the year 2017, namely maternal mortality ratio and adolescent birth rate, are dropped.
- All the other attributes, which hold data for the year 2017 as per the data source, and the HDI rank for 2016 are retained.

# Load the two UN source tables, reading the first 189 data rows of each file.
Human_Development <- read.csv(
  "C:\\Users\\aditi\\Downloads\\Human Development.csv",
  nrows = 189
)
Gender_Inequality <- read.csv(
  "C:\\Users\\aditi\\Downloads\\Gender_Inequality.csv",
  nrows = 189
)

# Drop the two Gender Inequality indicators that are not 2017 figures, and drop
# HDI.rank from Human Development so the join does not duplicate that column.
Gender_Inequality <- Gender_Inequality %>%
  select(-c(Maternal.mortality.ratio, Adolescent.birth.rate))
Human_Development <- Human_Development %>%
  select(-"HDI.rank")

# Combine both tables on Country, keeping only countries present in both.
Human_Dev_Gender_Joined <- Gender_Inequality %>%
  inner_join(Human_Development, by = "Country")
head(Human_Dev_Gender_Joined)
##   HDI.rank     Country Gender.Inequality.Index.Value
## 1        1      Norway                         0.048
## 2        2 Switzerland                         0.039
## 3        3   Australia                         0.109
## 4        4     Ireland                         0.109
## 5        5     Germany                         0.072
## 6        6     Iceland                         0.062
##   Gender.Inequality.Index.Rank
## 1                            5
## 2                            1
## 3                           23
## 4                           23
## 5                           14
## 6                            9
##   Percent.share.of.seats.in.parliament.by.women
## 1                                          41.4
## 2                                          29.3
## 3                                          32.7
## 4                                          24.3
## 5                                          31.5
## 6                                          38.1
##   Female.Population.with.secondary.education
## 1                                       96.3
## 2                                       96.4
## 3                                       90.0
## 4                                       90.2
## 5                                       96.2
## 6                                      100.0
##   Male.Population.with.secondary.education
## 1                                     95.1
## 2                                     97.2
## 3                                     89.9
## 4                                     86.3
## 5                                     96.8
## 6                                    100.0
##   Female.Labour.force.participation.rate
## 1                                   60.8
## 2                                   62.9
## 3                                   59.2
## 4                                   53.0
## 5                                   55.0
## 6                                   72.8
##   Male.Labour.force.participation.rate HDI.Value
## 1                                 67.6     0.953
## 2                                 74.1     0.944
## 3                                 70.5     0.939
## 4                                 67.3     0.938
## 5                                 66.2     0.936
## 6                                 81.8     0.935
##   Life.expectancy.at.birth.in.years Expected.years.of.schooling.in.years
## 1                              82.3                                 17.9
## 2                              83.5                                 16.2
## 3                              83.1                                 22.9
## 4                              81.6                                 19.6
## 5                              81.2                                 17.0
## 6                              82.9                                 19.3
##   Mean.years.of.schooling.in.years GNI.per.capita
## 1                             12.6          68012
## 2                             13.4          57625
## 3                             12.9          43560
## 4                             12.5          53754
## 5                             14.1          46136
## 6                             12.4          45810
##   GNI.per.capita.rank.minus.HDI.rank HDI.rank.2016.
## 1                                  5              1
## 2                                  8              2
## 3                                 18              3
## 4                                  8              4
## 5                                 13              4
## 6                                 13              6

Understand

# Check the structure of the joined dataset: prints its dimensions plus the
# type and first few values of every column.
str(Human_Dev_Gender_Joined)
## 'data.frame':    189 obs. of  16 variables:
##  $ HDI.rank                                     : int  1 2 3 4 5 6 7 7 9 10 ...
##  $ Country                                      : Factor w/ 189 levels "Afghanistan",..: 127 164 9 82 65 77 75 163 153 122 ...
##  $ Gender.Inequality.Index.Value                : num  0.048 0.039 0.109 0.109 0.072 0.062 NA 0.044 0.067 0.044 ...
##  $ Gender.Inequality.Index.Rank                 : int  5 1 23 23 14 9 NA 3 12 3 ...
##  $ Percent.share.of.seats.in.parliament.by.women: num  41.4 29.3 32.7 24.3 31.5 38.1 NA 43.6 23 35.6 ...
##  $ Female.Population.with.secondary.education   : num  96.3 96.4 90 90.2 96.2 100 75.7 88.4 76.1 86.4 ...
##  $ Male.Population.with.secondary.education     : num  95.1 97.2 89.9 86.3 96.8 100 81.8 88.7 82.9 90.4 ...
##  $ Female.Labour.force.participation.rate       : num  60.8 62.9 59.2 53 55 72.8 54 60.8 60.5 58 ...
##  $ Male.Labour.force.participation.rate         : num  67.6 74.1 70.5 67.3 66.2 81.8 68.1 67.4 76.8 69.2 ...
##  $ HDI.Value                                    : num  0.953 0.944 0.939 0.938 0.936 0.935 0.933 0.933 0.932 0.931 ...
##  $ Life.expectancy.at.birth.in.years            : num  82.3 83.5 83.1 81.6 81.2 82.9 84.1 82.6 83.2 82 ...
##  $ Expected.years.of.schooling.in.years         : num  17.9 16.2 22.9 19.6 17 19.3 16.3 17.6 16.2 18 ...
##  $ Mean.years.of.schooling.in.years             : num  12.6 13.4 12.9 12.5 14.1 12.4 12 12.4 11.5 12.2 ...
##  $ GNI.per.capita                               : int  68012 57625 43560 53754 46136 45810 58420 47766 82503 47900 ...
##  $ GNI.per.capita.rank.minus.HDI.rank           : int  5 8 18 8 13 13 2 9 -6 5 ...
##  $ HDI.rank.2016.                               : Factor w/ 159 levels "..","1","10",..: 2 87 98 107 107 125 143 134 143 3 ...
# Coerce GNI.per.capita and HDI.rank.2016. to numeric. Going through
# as.character() first means a factor column is converted from its labels
# rather than from its internal level codes; labels that are not numbers
# (such as "..") become NA.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(
    GNI.per.capita = as.numeric(as.character(GNI.per.capita)),
    HDI.rank.2016. = as.numeric(as.character(HDI.rank.2016.))
  )

Tidy & Manipulate Data I

  1. Each variable must have its own column.
  2. Each unique observation has its own row.
  3. Each value has its own cell.
# Build GNI.per.capita.rank by adding HDI.rank back onto the
# "GNI per capita rank minus HDI rank" column, then drop that difference
# column from the dataset.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(GNI.per.capita.rank = GNI.per.capita.rank.minus.HDI.rank + HDI.rank) %>%
  select(-GNI.per.capita.rank.minus.HDI.rank)

head(Human_Dev_Gender_Joined)
##   HDI.rank     Country Gender.Inequality.Index.Value
## 1        1      Norway                         0.048
## 2        2 Switzerland                         0.039
## 3        3   Australia                         0.109
## 4        4     Ireland                         0.109
## 5        5     Germany                         0.072
## 6        6     Iceland                         0.062
##   Gender.Inequality.Index.Rank
## 1                            5
## 2                            1
## 3                           23
## 4                           23
## 5                           14
## 6                            9
##   Percent.share.of.seats.in.parliament.by.women
## 1                                          41.4
## 2                                          29.3
## 3                                          32.7
## 4                                          24.3
## 5                                          31.5
## 6                                          38.1
##   Female.Population.with.secondary.education
## 1                                       96.3
## 2                                       96.4
## 3                                       90.0
## 4                                       90.2
## 5                                       96.2
## 6                                      100.0
##   Male.Population.with.secondary.education
## 1                                     95.1
## 2                                     97.2
## 3                                     89.9
## 4                                     86.3
## 5                                     96.8
## 6                                    100.0
##   Female.Labour.force.participation.rate
## 1                                   60.8
## 2                                   62.9
## 3                                   59.2
## 4                                   53.0
## 5                                   55.0
## 6                                   72.8
##   Male.Labour.force.participation.rate HDI.Value
## 1                                 67.6     0.953
## 2                                 74.1     0.944
## 3                                 70.5     0.939
## 4                                 67.3     0.938
## 5                                 66.2     0.936
## 6                                 81.8     0.935
##   Life.expectancy.at.birth.in.years Expected.years.of.schooling.in.years
## 1                              82.3                                 17.9
## 2                              83.5                                 16.2
## 3                              83.1                                 22.9
## 4                              81.6                                 19.6
## 5                              81.2                                 17.0
## 6                              82.9                                 19.3
##   Mean.years.of.schooling.in.years GNI.per.capita HDI.rank.2016.
## 1                             12.6          68012              1
## 2                             13.4          57625              2
## 3                             12.9          43560              3
## 4                             12.5          53754              4
## 5                             14.1          46136              4
## 6                             12.4          45810              6
##   GNI.per.capita.rank
## 1                   6
## 2                  10
## 3                  21
## 4                  12
## 5                  18
## 6                  19

Tidy & Manipulate Data II

HDI Status
- Classifying the countries into HDI categories to further assess where a country stands in terms of its level of human development.
- Creating a new variable ‘HDI Status’, which is derived by categorizing the ‘HDI value’.
- This helps to identify which countries fall into which level of human development.
- ‘HDI Status’ is an ordinal variable signifying the level of development, so it is converted to an ordered factor.
- The new variable ‘HDI status’ holds the categories of the HDI index. The categories are based on the HDI value as follows:

Levels Category
0.800-1.000 Very High
0.700-0.799 High
0.555-0.699 Medium
0.350-0.554 Low

Change_in_Rank Position
* Comparing the HDI ranks of countries in 2017 and 2016, we examine the progress made by the countries in the ‘Rank.Change’ variable.
* Mutating the ‘Rank.Change’ variable, which shows whether the rank of a country in 2017 went ‘Up’, ‘Down’ or stayed the ‘Same’ compared with its rank in 2016.
* The ‘Rank.Change’ variable should be a factor, and so it is converted from character to factor.

# Derive 'HDI.status' (character) from HDI.Value using the cut-offs of the
# category table in this section's text:
#   >= 0.800 Very High | 0.700-0.799 High | 0.555-0.699 Medium | < 0.555 Low
# Each band is tested from the top down with a single lower bound, so the
# boundary values 0.799, 0.699 and 0.554 land in the lower category as the
# table states (the previous overlapping ranges put them in the higher one).
# ifelse() leaves NA wherever HDI.Value is missing.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(
    HDI.status = ifelse(
      HDI.Value >= 0.800, "Very High",
      ifelse(
        HDI.Value >= 0.700, "High",
        ifelse(HDI.Value >= 0.555, "Medium", "Low")
      )
    )
  )

# Checking the data type of 'HDI Status'
class(Human_Dev_Gender_Joined$HDI.status)
## [1] "character"
# Convert 'HDI Status' to an ordered factor and check the levels.
# The order must be supplied through `levels`: with only `labels`, factor()
# first sorts the distinct values alphabetically ("High", "Low", "Medium",
# "Very High") and then renames them by position, which would relabel
# High as "Low", Low as "Medium" and Medium as "High".
Human_Dev_Gender_Joined$HDI.status <- factor(
  Human_Dev_Gender_Joined$HDI.status,
  levels = c("Low", "Medium", "High", "Very High"),
  ordered = TRUE
)
levels(Human_Dev_Gender_Joined$HDI.status)
## [1] "Low"       "Medium"    "High"      "Very High"
# Checking if conversion was successful
is.factor(Human_Dev_Gender_Joined$HDI.status)
## [1] TRUE
# Derive 'Rank.Change': how a country's 2017 HDI rank compares with its 2016
# rank. Rank 1 is the best position, so a smaller 2017 rank number means "Up".
# Rows where either rank is NA stay NA.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined %>%
  mutate(
    Rank.Change = ifelse(
      HDI.rank == HDI.rank.2016., "Same",
      ifelse(HDI.rank < HDI.rank.2016., "Up", "Down")
    )
  )

# Confirm the type of the new column before converting it
class(Human_Dev_Gender_Joined$Rank.Change)
## [1] "character"
# Turn 'Rank.Change' into a factor
Human_Dev_Gender_Joined$Rank.Change <- factor(
  Human_Dev_Gender_Joined$Rank.Change
)

# Confirm the conversion worked
is.factor(Human_Dev_Gender_Joined$Rank.Change)
## [1] TRUE

Scan I

# Count the missing values (NA) in every column
cat("Missing values in each column:")
## Missing values in each column:
Human_Dev_Gender_Joined %>%
  is.na() %>%
  colSums()
##                                      HDI.rank 
##                                             0 
##                                       Country 
##                                             0 
##                 Gender.Inequality.Index.Value 
##                                            29 
##                  Gender.Inequality.Index.Rank 
##                                            29 
## Percent.share.of.seats.in.parliament.by.women 
##                                             2 
##    Female.Population.with.secondary.education 
##                                            24 
##      Male.Population.with.secondary.education 
##                                            24 
##        Female.Labour.force.participation.rate 
##                                            11 
##          Male.Labour.force.participation.rate 
##                                            11 
##                                     HDI.Value 
##                                             0 
##             Life.expectancy.at.birth.in.years 
##                                             0 
##          Expected.years.of.schooling.in.years 
##                                             0 
##              Mean.years.of.schooling.in.years 
##                                             0 
##                                GNI.per.capita 
##                                             0 
##                                HDI.rank.2016. 
##                                             1 
##                           GNI.per.capita.rank 
##                                             0 
##                                    HDI.status 
##                                             0 
##                                   Rank.Change 
##                                             1
# Drop the rows that have no Gender Inequality Index rank. The index value
# column reports the same number of missing entries, presumably on the same
# rows (verify), so these countries carry no usable inequality data.
# A logical mask is used instead of `-which(...)`: when nothing is missing,
# which() returns integer(0) and indexing with -integer(0) selects zero rows,
# which would silently empty the whole data frame.
Human_Dev_Gender_Joined <- Human_Dev_Gender_Joined[
  !is.na(Human_Dev_Gender_Joined$Gender.Inequality.Index.Rank),
]

# Flag the "special" numeric values in a vector.
#
# x: any vector (typically one data-frame column).
# Returns a logical vector the same length as x that is TRUE where x is Inf,
# -Inf or NaN. NA is not flagged. Non-numeric input (character, factor, ...)
# yields all FALSE rather than NULL, so the result can always be summed or
# used as a mask.
is.special <- function(x) {
  if (!is.numeric(x)) {
    return(rep(FALSE, length(x)))
  }
  is.infinite(x) | is.nan(x)
}

# Count the infinite / NaN entries found in every column
Human_Dev_Gender_Joined %>%
  sapply(function(column) sum(is.special(column)))
##                                      HDI.rank 
##                                             0 
##                                       Country 
##                                             0 
##                 Gender.Inequality.Index.Value 
##                                             0 
##                  Gender.Inequality.Index.Rank 
##                                             0 
## Percent.share.of.seats.in.parliament.by.women 
##                                             0 
##    Female.Population.with.secondary.education 
##                                             0 
##      Male.Population.with.secondary.education 
##                                             0 
##        Female.Labour.force.participation.rate 
##                                             0 
##          Male.Labour.force.participation.rate 
##                                             0 
##                                     HDI.Value 
##                                             0 
##             Life.expectancy.at.birth.in.years 
##                                             0 
##          Expected.years.of.schooling.in.years 
##                                             0 
##              Mean.years.of.schooling.in.years 
##                                             0 
##                                GNI.per.capita 
##                                             0 
##                                HDI.rank.2016. 
##                                             0 
##                           GNI.per.capita.rank 
##                                             0 
##                                    HDI.status 
##                                             0 
##                                   Rank.Change 
##                                             0
# Checking the rank columns for obvious errors with editrules rule sets.
# Each editset() call is wrapped in parentheses so that the rule set is
# printed as well as stored.
# Rule1: HDI.rank must be positive and at most 189.
(Rule1 <- editset(c("HDI.rank > 0", "HDI.rank <= 189")))
## 
## Edit set:
## num1 : 0 < HDI.rank
## num2 : HDI.rank <= 189
# Rule2: the 2016 HDI rank must be positive.
(Rule2 <- editset(c("HDI.rank.2016. > 0")))
## 
## Edit set:
## num1 : 0 < HDI.rank.2016.
# Rule3: the derived GNI per capita rank must be positive.
(Rule3 <- editset(c("GNI.per.capita.rank > 0")))
## 
## Edit set:
## num1 : 0 < GNI.per.capita.rank
# Total the violations reported for each rule set; 0 means no row breaks it.
sum(violatedEdits(Rule1, Human_Dev_Gender_Joined))
## [1] 0
sum(violatedEdits(Rule2, Human_Dev_Gender_Joined))
## [1] 0
sum(violatedEdits(Rule3, Human_Dev_Gender_Joined))
## [1] 0

Scan II

# Return the values of x that a box plot would mark as outliers.
# The box-plot statistics are computed only; nothing is drawn.
outliers <- function(x) {
  box_summary <- boxplot(x, plot = FALSE)
  box_summary$out
}

# List the box-plot outliers of columns 1 and 3-16 (the numeric columns)
cat("Outliers for numeric variables:")
## Outliers for numeric variables:
Human_Dev_Gender_Joined[, c(1, 3:16)] %>%
  sapply(outliers)
## $HDI.rank
## numeric(0)
## 
## $Gender.Inequality.Index.Value
## numeric(0)
## 
## $Gender.Inequality.Index.Rank
## numeric(0)
## 
## $Percent.share.of.seats.in.parliament.by.women
## [1] 55.7
## 
## $Female.Population.with.secondary.education
## numeric(0)
## 
## $Male.Population.with.secondary.education
## numeric(0)
## 
## $Female.Labour.force.participation.rate
## [1] 16.8 15.2 14.0 18.7 11.9 86.0  6.0
## 
## $Male.Labour.force.participation.rate
## [1] 38.9 45.8
## 
## $HDI.Value
## numeric(0)
## 
## $Life.expectancy.at.birth.in.years
## numeric(0)
## 
## $Expected.years.of.schooling.in.years
## [1] 22.9  5.4
## 
## $Mean.years.of.schooling.in.years
## numeric(0)
## 
## $GNI.per.capita
## [1]  68012  82503  65016  67805 116818  76427  70524
## 
## $HDI.rank.2016.
## numeric(0)
## 
## $GNI.per.capita.rank
## numeric(0)
# Replace outliers by capping (winsorising) a numeric vector.
#
# x: numeric vector; may contain NA.
# Returns x with every value below Q1 - 1.5 * IQR replaced by the 5th
# percentile and every value above Q3 + 1.5 * IQR replaced by the 95th
# percentile. All other values, including NA, are returned unchanged.
cap <- function(x) {
  # na.rm = TRUE: quantile() and IQR() stop with an error on missing values
  # otherwise, so a column containing NA could not be capped at all.
  quantiles <- quantile(
    x, c(0.05, 0.25, 0.75, 0.95),
    na.rm = TRUE, names = FALSE
  )
  fence_width <- 1.5 * IQR(x, na.rm = TRUE)
  # which() drops NA comparisons, so missing entries are never selected.
  x[which(x < quantiles[2] - fence_width)] <- quantiles[1]
  x[which(x > quantiles[3] + fence_width)] <- quantiles[4]
  x
}
# Apply cap() to columns 1 and 3-16 and write the capped values back in place.
# NOTE(review): sapply() should hand the columns back as one numeric matrix,
# so integer columns would be stored as doubles afterwards - confirm if the
# integer type matters downstream.
Human_Dev_Gender_Joined[,c(1, 3:16)]<-sapply(Human_Dev_Gender_Joined[,c(1, 3:16)],cap)

Transform

# Histogram of GNI per capita before transformation
hist(Human_Dev_Gender_Joined$GNI.per.capita)

# Normalising GNI per capita with a Box-Cox transformation; lambda = "auto"
# leaves the choice of the transformation parameter to BoxCox() (forecast
# package), then the transformed values are plotted for comparison
boxcox_gni_pc <- BoxCox(Human_Dev_Gender_Joined$GNI.per.capita, lambda = "auto")
hist(boxcox_gni_pc) 

Analysis

# Overview of HDI status of countries: number of countries per category
summary(Human_Dev_Gender_Joined$HDI.status)
##       Low    Medium      High Very High 
##        46        29        29        56
# Top 6 countries: the first six rows, showing columns 1, 2 and 10
# (HDI.rank, Country, HDI.Value). This relies on the rows already being
# ordered by HDI rank.
head(Human_Dev_Gender_Joined[,c(1,2,10)])
##   HDI.rank     Country HDI.Value
## 1        1      Norway     0.953
## 2        2 Switzerland     0.944
## 3        3   Australia     0.939
## 4        4     Ireland     0.938
## 5        5     Germany     0.936
## 6        6     Iceland     0.935
# Bottom 6 countries: the last six rows of the same three columns
tail(Human_Dev_Gender_Joined[,c(1,2,10)])
##     HDI.rank                  Country HDI.Value
## 183      183             Burkina Faso     0.423
## 184      184             Sierra Leone     0.419
## 185      185                  Burundi     0.417
## 186      186                     Chad     0.404
## 188      188 Central African Republic     0.367
## 189      189                    Niger     0.354
# Box plot of the Gender Inequality Index value for each HDI status group.
# theme_grey() is a complete theme and replaces every theme setting added
# before it, so it has to come first; the axis-text tweak is layered on top
# afterwards, otherwise the rotated x-axis labels are silently discarded.
# hjust = 1 right-aligns the rotated labels against their tick marks.
boxplot_obs <- ggplot(
  data = Human_Dev_Gender_Joined,
  aes(x = HDI.status, y = Gender.Inequality.Index.Value, color = HDI.status)
) +
  geom_boxplot(size = 1, outlier.color = "black", outlier.size = 3) +
  theme_grey(base_size = 15) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 15)) +
  labs(
    x = "HDI.status",
    y = "Gender.Inequality.Index.Value",
    title = "HDI Status by gender inequality index"
  )
boxplot_obs

# Scatter plot of HDI value (y axis) against Gender Inequality Index value (x axis)
plot(HDI.Value ~Gender.Inequality.Index.Value,data = Human_Dev_Gender_Joined)

We conclude that the calculation of the HDI needs to include not just life expectancy, education and the GNI index, but also the gender inequality in countries. Hence, the HDI needs to be adjusted according to the inequality prevalent in societies.