## Load Data ##
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gtools)
library(SciViews)
# Read the World Development Indicators extract.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists.
HDI.dt <- read_excel("C:/Users/Joseph/Dropbox/Boston College/Development Economics/HDI/Data_Extract_From_World_Development_Indicators.xlsx")

## Get List of Countries ##
# One row per distinct (country, income level) pair.
countries <- unique(HDI.dt[, c("Country Name", "Income Level")])

### Break into Income Brackets ###
# dplyr::filter() drops rows whose income level is NA. Subsetting with
# `countries[cond, ]` would instead turn every NA into an all-NA row, and that
# NA "country" could then be drawn by sample() below.
# Codes 1/2/3 are presumably low/middle/high income -- confirm against the
# workbook.
LowIncome <- dplyr::filter(countries, `Income Level` == 1)
MedIncome <- dplyr::filter(countries, `Income Level` == 2)
HighIncome <- dplyr::filter(countries, `Income Level` == 3)

## Sample ##
# Fix the RNG seed so the same 100 countries are drawn on every run; without
# it, every number reported further down changes between runs.
set.seed(2019)
# sample() stops with an error if a bracket has fewer countries than requested.
LowIncome <- sample(LowIncome$`Country Name`, 32)
MedIncome <- sample(MedIncome$`Country Name`, 36)
HighIncome <- sample(HighIncome$`Country Name`, 32)

# Character vector of the 100 sampled country names (32 + 36 + 32).
HDI.Sample <- c(LowIncome, MedIncome, HighIncome)

# Restrict the original data to the sampled countries.
# merge() coerces the name vector to a one-column data frame and joins it to
# the first column of HDI.dt; all.x = TRUE keeps every sampled name.
HDI.Sample.dt <- merge(HDI.Sample, HDI.dt, by = 1, all.x = TRUE)

# The key column takes its name from the coerced vector, so relabel it.
names(HDI.Sample.dt)[1] <- "Country Name"
## Calculating Life Expectancy
# Min-max normalisation: maps Ob onto the scale where setMin -> 0 and
# setMax -> 1. Vectorised over Ob.
LifeExpectancyIndex <- function(Ob, setMin, setMax) {
  span <- setMax - setMin
  (Ob - setMin) / span
}

# Keep only the life-expectancy series. dplyr::filter() drops rows whose
# `Series Name` is NA, whereas `df[cond, ]` would keep them as all-NA rows.
LifeExpectancy.dt <- dplyr::filter(
  HDI.Sample.dt,
  `Series Name` == "Life expectancy at birth, total (years)"
)

# Goalposts: fixed floor of 20 years, ceiling = largest value in the sample.
setMin <- 20
setMax <- max(LifeExpectancy.dt$Data, na.rm = TRUE)

# The index is computed element-wise from global goalposts, so no grouping is
# needed (and the result is not left grouped).
LifeExpectancy.dt <- LifeExpectancy.dt %>%
  mutate(LifeIndex = LifeExpectancyIndex(Data, setMin, setMax))

### Calculating Education Index
# Generic min-max normalisation used for both schooling series:
# (Ob - setMin) / (setMax - setMin), vectorised over Ob.
Index <- function(Ob, setMin, setMax) {
  span <- setMax - setMin
  (Ob - setMin) / span
}

# --- Mean years of schooling -------------------------------------------------
# dplyr::filter() drops rows whose `Series Name` is NA instead of keeping them
# as all-NA rows the way `df[cond, ]` does.
MeanYears.dt <- dplyr::filter(
  HDI.Sample.dt,
  `Series Name` == "Barro-Lee: Mean years of schooling"
)

# Goalposts: 0 years up to the largest value observed in the sample.
setMin <- 0
setMax <- max(MeanYears.dt$Data, na.rm = TRUE)

# Element-wise normalisation; no grouping required.
MeanYears.dt <- MeanYears.dt %>%
  mutate(MeanIndex = Index(Data, setMin, setMax))

# --- Expected years of schooling ---------------------------------------------
ExpectedYears.dt <- dplyr::filter(
  HDI.Sample.dt,
  `Series Name` == "Human Capital Index (HCI): Expected Years of School, Total"
)

setMin <- 0
setMax <- max(ExpectedYears.dt$Data, na.rm = TRUE)

ExpectedYears.dt <- ExpectedYears.dt %>%
  mutate(ExpectedIndex = Index(Data, setMin, setMax))

# --- Combine both series per country -----------------------------------------
# Columns 3 and 4 are dropped from the expected-years table by position
# (presumably the series name/code columns -- confirm against the workbook).
# The mean-years side is selected by name so the join no longer depends on
# MeanIndex happening to be the 10th column.
CombinedEducation.dt <- merge(
  x = ExpectedYears.dt[, c(-3, -4)],
  y = MeanYears.dt[, c("Country Name", "MeanIndex")],
  by = "Country Name"
)

# Combined education index: geometric mean of the expected-years and
# mean-years indices. Vectorised over both arguments.
# (The former `setMin <- 0; setMax <- 1` assignments were removed: this
# function takes no goalposts and both globals are reassigned before their
# next use.)
EducationIndex <- function(ExpectedIndex, MeanIndex) {
  sqrt(ExpectedIndex * MeanIndex)
}

# Add the combined education index. EducationIndex() is element-wise, so the
# per-country group_by() was redundant and only left the table grouped.
CombinedEducation.dt <- CombinedEducation.dt %>%
  mutate(CombinedEdIndex = EducationIndex(ExpectedIndex, MeanIndex))

### Calculating the Income Index ###
# Log-scaled min-max normalisation:
#   (log(Ob) - log(setMin)) / (log(setMax) - log(setMin))
# Uses base R's natural log instead of the SciViews `ln()` alias, so the
# function no longer depends on a third-party package. Vectorised over Ob.
# Non-positive Ob yields -Inf/NaN, as log() does.
IncomeIndex <- function(Ob, setMin, setMax) {
  log_min <- log(setMin)
  (log(Ob) - log_min) / (log(setMax) - log_min)
}

# Keep only the GNI-per-capita series. dplyr::filter() drops rows whose
# `Series Name` is NA instead of keeping them as all-NA rows.
IncomeIndex.dt <- dplyr::filter(
  HDI.Sample.dt,
  `Series Name` == "GNI per capita (constant 2010 US$)"
)

# Goalposts: fixed floor of 100, ceiling = largest value in the sample.
setMin <- 100
setMax <- max(IncomeIndex.dt$Data, na.rm = TRUE)

# The new column shares its name with the function; R resolves the call to the
# function because the column does not exist yet and is not callable.
# Element-wise computation, so no grouping is needed.
IncomeIndex.dt <- IncomeIndex.dt %>%
  mutate(IncomeIndex = IncomeIndex(Data, setMin, setMax))


## Calculating HDI for each Country ##
# Join the three component indices on the country name. The key is given
# explicitly and the index columns are picked by name, so the joins do not
# depend on which columns the tables share or on an index being column 10.
# Columns 3 and 4 of the life-expectancy table are dropped by position
# (presumably the series name/code columns -- confirm against the workbook).
HDI.Index.Dt <- merge(
  x = LifeExpectancy.dt[, c(-3, -4)],
  y = IncomeIndex.dt[, c("Country Name", "IncomeIndex")],
  by = "Country Name"
)

HDI.Index.Dt <- merge(
  x = HDI.Index.Dt,
  y = CombinedEducation.dt[, c("Country Name", "CombinedEdIndex")],
  by = "Country Name"
)

# Human Development Index: geometric mean of the three component indices.
# Vectorised; an NA in any component gives NA.
HDI.func <- function(Life, Education, Income) {
  component_product <- Life * Education * Income
  component_product^(1/3)
}

# Compute the HDI per row. HDI.func() is element-wise, so the per-country
# group_by() was redundant and left the table permanently grouped; as_tibble()
# keeps the compact tibble printing without any grouping.
HDI.Index.Dt <- HDI.Index.Dt %>%
  as_tibble() %>%
  mutate(HDI = HDI.func(LifeIndex, CombinedEdIndex, IncomeIndex))

# Show the resulting index per country.
HDI.Index.Dt[, c("Country Name", "HDI")]
## # A tibble: 100 x 2
## # Groups:   Country Name [100]
##    `Country Name`    HDI
##    <fct>           <dbl>
##  1 Afghanistan    NA    
##  2 Albania         0.757
##  3 Andorra        NA    
##  4 Angola          0    
##  5 Armenia         0.720
##  6 Bahrain         0.799
##  7 Belarus         0    
##  8 Belize          0    
##  9 Benin           0.435
## 10 Bermuda        NA    
## # ... with 90 more rows
###Difficulties:
##Grabbing consistent data was difficult and the datasets I used even had some holes which could skew the results.

#2) The numbers differ noticeably between the two methods, but I would say that my index ranks countries rather similarly to the UN's.

#3)
# Read the 2017 GDP (PPP) workbook.
GDP.dt <- read_excel("C:/Users/Joseph/Dropbox/Boston College/Development Economics/HDI/GDPPPP2017.xlsx")

# Drop the 5th column's name and append "GDP_PPP_2017" at the end; for a
# five-column sheet this simply relabels the last column.
names(GDP.dt) <- c(names(GDP.dt)[-5], "GDP_PPP_2017")

# Inner join on the first column of each table (country name on the HDI side),
# bringing in only the GDP figure.
HDI.Index.Dt <- merge(HDI.Index.Dt, GDP.dt[, c(1, 5)], by = 1)

# The GDP column is coerced to numeric; entries that are not numbers become NA.
HDI.Index.Dt[["GDP_PPP_2017"]] <- as.numeric(HDI.Index.Dt[["GDP_PPP_2017"]])
## Warning: NAs introduced by coercion
# Natural log of GDP for the log-scale comparison.
HDI.Index.Dt[["logGDP_PPP_2017"]] <- ln(HDI.Index.Dt[["GDP_PPP_2017"]])
# Correlation of HDI with GDP (PPP), using only complete pairs.
with(HDI.Index.Dt, cor(HDI, GDP_PPP_2017, use = "na.or.complete"))
## [1] 0.2257043
# Correlation of HDI with log GDP (PPP), using only complete pairs.
with(HDI.Index.Dt, cor(HDI, logGDP_PPP_2017, use = "na.or.complete"))
## [1] 0.5909102
#The correlation with log GDP is stronger because the income component of my index is itself computed from the log of income.

#4)

# Scatter plot of HDI (x) against log GDP PPP (y). Explicit axis labels replace
# the default labels, which would be the raw `HDI.Index.Dt$...` expressions.
plot(
  HDI.Index.Dt$HDI,
  HDI.Index.Dt$logGDP_PPP_2017,
  xlab = "HDI",
  ylab = "log GDP PPP (2017)"
)

#5) Making this change did not change the HDI much for my sample and calculations; this is likely due to the quality of the data that I was able to obtain.

#6) There are a number of fields that could be added, but I believe that adding an income inequality index would help describe the distribution of wealth and paint a more detailed picture of a country. For my calculations below I will be using the GINI index, a World Bank estimate.

#3)
# Read the GINI workbook and give its two columns fixed names.
GINI.dt <- read_excel("C:/Users/Joseph/Dropbox/Boston College/Development Economics/HDI/GINI Index.xlsx")
names(GINI.dt) <- c("Country Name", "GINI")

# Inner join on the first column of each table; countries with no row in the
# GINI sheet drop out of HDI.Index.Dt here.
HDI.Index.Dt <- merge(HDI.Index.Dt, GINI.dt, by = 1)

# The GINI column is coerced to numeric; entries that are not numbers become NA.
HDI.Index.Dt[["GINI"]] <- as.numeric(HDI.Index.Dt[["GINI"]])
## Warning: NAs introduced by coercion
# Min-max normalisation of a GINI value: (Ob - setMin) / (setMax - setMin).
# Vectorised over Ob.
GiniIndex <- function(Ob, setMin, setMax) {
  span <- setMax - setMin
  (Ob - setMin) / span
}

# Goalposts for the GINI component: 0 up to the largest value in the sample.
# (The original line assigned setMax twice: `setMax <- setMax <- max(...)`.)
setMin <- 0
setMax <- max(HDI.Index.Dt$GINI, na.rm = TRUE)

# NOTE(review): a higher Gini value conventionally means MORE inequality, so
# this component rises with inequality while the other components rise with
# development. Consider using 1 - GiniIndex(...) -- confirm intended direction.
# The normalisation is element-wise, so no per-country grouping is needed.
HDI.Index.Dt <- HDI.Index.Dt %>%
  mutate(GiniIndex = GiniIndex(GINI, setMin, setMax))

# Extended index: geometric mean of the three HDI components plus the
# inequality component. Vectorised; an NA in any component gives NA.
MyIndex <- function(Life, Education, Income, Inequality) {
  component_product <- Life * Education * Income * Inequality
  component_product^(1/4)
}


# Compute the extended index per row. MyIndex() is element-wise, so the
# per-country group_by() was redundant and left the table grouped; as_tibble()
# keeps the compact tibble printing without any grouping.
HDI.Index.Dt <- HDI.Index.Dt %>%
  as_tibble() %>%
  mutate(
    MyIndex = MyIndex(LifeIndex, CombinedEdIndex, IncomeIndex, GiniIndex)
  )
#Correlation between Indices
# `use` is passed by name rather than by position; only complete pairs count.
cor(HDI.Index.Dt$HDI, HDI.Index.Dt$MyIndex, use = "na.or.complete")
## [1] 0.9792878
HDI.Index.Dt[, c("Country Name", "HDI", "MyIndex")]
## # A tibble: 81 x 3
## # Groups:   Country Name [81]
##    `Country Name`    HDI MyIndex
##    <fct>           <dbl>   <dbl>
##  1 Afghanistan    NA      NA    
##  2 Albania         0.757  NA    
##  3 Angola          0      NA    
##  4 Armenia         0.720   0.699
##  5 Belarus         0       0    
##  6 Belize          0      NA    
##  7 Benin           0.435   0.527
##  8 Bulgaria        0.789  NA    
##  9 Burkina Faso    0      NA    
## 10 Burundi         0.287  NA    
## # ... with 71 more rows
#The biggest difference between the two indices is that there is significantly less inequality data available for analysis.