The dataset contains 135 economic, health, and educational indicators relating to children around the world. Data is collected from 1960-2018 for the 264 countries contained in the dataset. The data was extracted from the World Development Indicators database housed by the World Bank. The entire database contains over 1500 variables from 264 countries from 1960 to 2018.
The purpose of this dataset is to investigate country level development indicators specifically relating to children.
Source: http://databank.worldbank.org/data/reports.aspx?source=world-development-indicators
I formatted the data three different ways so user’s can select the format that best suits their needs.
Original: The first dataset has a column for countries, a column for attribute, and 58 columns for each year.
Long: The second dataset has been reshaped so that the years are not seperate columns, but one column that specifies which year. The value for the attribute is now in a value’s column.
Final: This dataset created contains 1 row per country. The educational attainment variable is the average percent of the population who have received bachelors degrees in any given year between 2006-2015. This is meant to be the outcome variable. The input variables are the remaining columns, for which data was extracted between 1986-1995. These childhood indicators might be related to educational attainment later in life. To properly capture this relationship, it is best to look at child indicators of a certain time period, and then educational attainment about 20 years later.
Country.Name: column containing 264 countries Country.Code: The code for a specific country recorded in Country.Name Series.Name: Various indicators such as educational attainment and child employment statistics. Originally stored in a column with each input refering to an indicator. Series.Code: The code for a specific indicator recorded in Series.Name
X1960…X2018 (original format only) : contains attribute value for each specified year Year (long format only) : gives the year the data was collected Value (long format only) : gives the value for a given attribute, year and a country. ed_attain (final format only): percent of population that has received a bachelors
data <- read.csv("original_data.csv", stringsAsFactors = FALSE)
library(dplyr)
library(tidyr)
library(reshape2)
data <- data[!(data$Series.Code == "SE.XPD.MPRM.ZS" |data$Series.Code == "SE.XPD.MSEC.ZS"),]
data[5:length(data)] <- sapply(data[5:length(data)], as.numeric)
# data[data ==".."] <- NA
Series <- unique(data$Series.Name)
library(reshape2)
length(data)
## [1] 62
timevars <- names(data[,c(5:62)])
longdata <- melt(data, id.vars = c("Country.Name", "Country.Code", "Series.Name", "Series.Code"), measure.vars = timevars)
colnames(longdata)[5] <- "year"
colnames(longdata)[6] <- "value"
#write.csv(longdata, file = "longdata.csv", row.names = FALSE)
These childhood indicators might be related to educational attainment later in life. To properly capture this relationship, it is best to look at child indicators of a certain time period, and then educational attainment about 20 years later. I am subsetting the data so that I am working with indicators from 1980-1990 and educational attainment from 2000-2010.
# Extracting rows where there is atleast one nonmissing value between 1986-1995
data1 <- data
data1$good <- rep(NA, len = nrow(data1))
for(i in 1: nrow(data1)){
data1[i, "good"] <- sum(is.na(data1[i,31:40])) < 10 }
data1b <- data1[data1$good,]
# taking the median of any values for an indicator between 1986-1995
data1b$ave1980s <- rep(NA, len = nrow(data1b))
for(i in 1: nrow(data1b)){
x <- as.numeric(data1b[i,31:40])
data1b[i, "ave1980s"] <- median(x, na.rm = TRUE) }
data1c <- subset(data1b, select = c(Country.Name, Series.Code, ave1980s))
# Extracting rows where there is atleast one nonmissing value between 2006-2015
data2 <- data
data2$good <- rep(NA, len = nrow(data2))
for(i in 1: nrow(data2)){
data2[i, "good"] <- sum(is.na(data2[i,51:60])) < 10 }
data2 <- data2[data2$good,]
# taking the median of any values for an indicator between 2006-2015
data2$ave2000s <- rep(NA, len = nrow(data2))
for(i in 1: nrow(data2)){
x <- as.numeric(data2[i,51:60])
data2[i, "ave2000s"] <- median(x, na.rm = TRUE) }
data2 <- subset(data2, select = c(Country.Name, Series.Code, ave2000s))
# Outer Join
avedata <- merge(x = data1c, y = data2, by = c("Country.Name", "Series.Code"), all = TRUE)
avedata <- avedata[!(avedata$Country.Name == ""),]
Now I reshape the data with just 1980s averages and 2000 averages
##### Reshape data
ed_data <- filter(avedata, Series.Code == "SE.TER.CUAT.BA.ZS")
ave2000data <- subset(ed_data, select = c(Country.Name, ave2000s))
wide <- spread(avedata[,-c(4)], key = Series.Code, value = "ave1980s")
finaldata <- merge(ave2000data, wide, by = "Country.Name")
colnames(finaldata)[2] <- "ed_attain"
#write.csv(finaldata, file = "finaldata.csv", row.names = FALSE)
#save(finaldata, file = "finaldata.RData")
There are many variables with good correlations with the response variable.
zzz <- finaldata[sapply(finaldata, class) %in% c("numeric", "integer")]
correlations <- cor(x=zzz$ed_attain, y=zzz, use="pairwise.complete.obs") # Acquired is the target
correlations
## ed_attain SE.ENR.PRSC.FM.ZS SE.PRM.CUAT.FE.ZS SE.PRM.CUAT.MA.ZS
## [1,] 1 0.4880828 0.5660677 0.7046644
## SE.PRM.CUAT.ZS SE.PRM.ENRR.FE SE.PRM.ENRR.MA SE.PRM.GINT.FE.ZS
## [1,] 0.6527446 0.312628 0.1307098 0.269514
## SE.PRM.GINT.MA.ZS SE.PRM.GINT.ZS SE.PRM.NENR.FE SE.PRM.NENR.MA
## [1,] 0.1311973 0.1344168 NA NA
## SE.PRM.NINT.FE.ZS SE.PRM.NINT.MA.ZS SE.PRM.NINT.ZS SE.PRM.OENR.FE.ZS
## [1,] NA NA NA -0.6759363
## SE.PRM.OENR.MA.ZS SE.PRM.OENR.ZS SE.PRM.PRIV.ZS SE.PRM.PRS5.FE.ZS
## [1,] -0.6838471 -0.7086756 0.001003234 0.504377
## SE.PRM.PRS5.MA.ZS SE.PRM.PRS5.ZS SE.PRM.PRSL.FE.ZS SE.PRM.PRSL.MA.ZS
## [1,] 0.448878 0.5049543 0.519688 0.4610071
## SE.PRM.PRSL.ZS SE.PRM.TENR SE.PRM.TENR.FE SE.PRM.TENR.MA SE.PRM.UNER
## [1,] 0.5103096 NA NA NA -0.1706365
## SE.PRM.UNER.FE SE.PRM.UNER.FE.ZS SE.PRM.UNER.MA SE.PRM.UNER.MA.ZS
## [1,] -0.1826297 -0.500434 -0.2161246 -0.4453684
## SE.PRM.UNER.ZS SE.SEC.CUAT.LO.FE.ZS SE.SEC.CUAT.LO.MA.ZS
## [1,] -0.4729938 0.6852397 0.624393
## SE.SEC.CUAT.LO.ZS SE.SEC.CUAT.PO.FE.ZS SE.SEC.CUAT.PO.MA.ZS
## [1,] 0.6440436 0.6236601 0.6290761
## SE.SEC.CUAT.PO.ZS SE.SEC.UNER.LO.FE.ZS SE.SEC.UNER.LO.MA.ZS
## [1,] 0.6379054 -0.7138224 -0.6792996
## SE.SEC.UNER.LO.ZS SE.TER.CUAT.BA.FE.ZS SE.TER.CUAT.BA.MA.ZS
## [1,] -0.6690417 NA NA
## SE.TER.CUAT.BA.ZS SE.TER.CUAT.MS.FE.ZS SE.TER.CUAT.MS.MA.ZS
## [1,] NA NA NA
## SE.TER.CUAT.MS.ZS SE.XPD.CPRM.ZS SE.XPD.CSEC.ZS SE.XPD.CTOT.ZS
## [1,] NA NA NA NA
## SE.XPD.PRIM.PC.ZS SE.XPD.TOTL.GB.ZS SE.XPD.TOTL.GD.ZS SG.LAW.CHMR
## [1,] 0.7992333 -0.06966929 0.1641536 NA
## SH.ANM.CHLD.ZS SH.DYN.0514 SH.DYN.MORT.FE SH.DYN.MORT.MA SH.HIV.0014
## [1,] -0.7166816 -0.5803721 -0.6394866 -0.6429577 -0.2991689
## SH.HIV.INCD.14 SH.IMM.HEPB SH.IMM.IDPT SH.IMM.MEAS SH.MLR.TRET.ZS
## [1,] -0.340688 -0.01190793 0.4462019 0.4432366 NA
## SH.STA.BFED.ZS SH.STA.MALN.FE.ZS SH.STA.MALN.MA.ZS SH.STA.MALN.ZS
## [1,] -0.2949064 -0.4159105 -0.4552954 -0.4616411
## SH.STA.ORCF.ZS SH.STA.ORTH SH.STA.OWGH.FE.ZS SH.STA.OWGH.MA.ZS
## [1,] 0.7951991 0.2410842 0.2059133 0.2249944
## SH.STA.OWGH.ZS SH.STA.STNT.FE.ZS SH.STA.STNT.MA.ZS SH.STA.STNT.ZS
## [1,] 0.24705 -0.6172661 -0.6563787 -0.5425229
## SH.STA.WAST.FE.ZS SH.STA.WAST.MA.ZS SH.STA.WAST.ZS SH.SVR.WAST.FE.ZS
## [1,] -0.2931756 -0.3113957 -0.343303 -0.2659535
## SH.SVR.WAST.MA.ZS SH.SVR.WAST.ZS SH.VAC.TTNS.ZS SL.AGR.0714.FE.ZS
## [1,] -0.336746 -0.3027676 0.3158363 NA
## SL.AGR.0714.MA.ZS SL.AGR.0714.ZS SL.FAM.0714.FE.ZS SL.FAM.0714.MA.ZS
## [1,] NA NA NA NA
## SL.FAM.0714.ZS SL.MNF.0714.FE.ZS SL.MNF.0714.MA.ZS SL.MNF.0714.ZS
## [1,] NA NA NA NA
## SL.SLF.0714.FE.ZS SL.SLF.0714.MA.ZS SL.SLF.0714.ZS SL.SRV.0714.FE.ZS
## [1,] NA NA NA NA
## SL.SRV.0714.MA.ZS SL.SRV.0714.ZS SL.TLF.0714.FE.ZS SL.TLF.0714.MA.ZS
## [1,] NA NA NA NA
## SL.TLF.0714.SW.FE.ZS SL.TLF.0714.SW.MA.ZS SL.TLF.0714.SW.ZS
## [1,] NA NA NA
## SL.TLF.0714.WK.FE.TM SL.TLF.0714.WK.FE.ZS SL.TLF.0714.WK.MA.TM
## [1,] NA NA NA
## SL.TLF.0714.WK.MA.ZS SL.TLF.0714.WK.TM SL.TLF.0714.WK.ZS
## [1,] NA NA NA
## SL.TLF.0714.ZS SL.UEM.NEET.FE.ZS SL.UEM.NEET.MA.ZS SL.UEM.NEET.ZS
## [1,] NA NA NA NA
## SL.WAG.0714.FE.ZS SL.WAG.0714.MA.ZS SL.WAG.0714.ZS SN.ITK.DEFC.ZS
## [1,] NA NA NA NA
## SP.ADO.TFRT SP.DYN.IMRT.FE.IN SP.DYN.IMRT.IN SP.DYN.IMRT.MA.IN
## [1,] -0.648316 -0.6568271 -0.655688 -0.6537068
## SP.POP.0004.FE.5Y SP.POP.0004.MA.5Y SP.POP.0014.FE.IN
## [1,] -0.6889219 -0.6905233 -0.160368
## SP.POP.0014.FE.ZS SP.POP.0014.MA.IN SP.POP.0014.MA.ZS SP.POP.0014.TO
## [1,] -0.695047 -0.158665 -0.6964212 -0.1595052
## SP.POP.0014.TO.ZS SP.POP.BRTH.MF SP.POP.DPND.YG
## [1,] -0.7047506 0.3849838 -0.7001837
names(correlations) <- names(zzz)
correlationsNoNa <- correlations[!is.na(correlations)]
length(correlationsNoNa)
## [1] 80
StrongCorrelations <- correlationsNoNa[abs(correlationsNoNa) > 0.2]
StrongCorrelations <- sort(StrongCorrelations)
par(mar=c(5.1,10.1,2.1,1.1))
barplot(StrongCorrelations, main=paste("Strong Correlations"), horiz=T, cex.names=0.4, col="cyan", las=2, xlab="Correlation")
The percent of the population with a bachelors degree appears relatively normally distributed.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
ggplot(finaldata,aes(x = finaldata$ed_attain)) + geom_histogram(binwidth = 3, color = "black", fill = "blue") + xlab("Percent of Pop. with Bachelors Degree") + ggtitle("Outcome Variable: Attainment of Bachelor's Degree") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Though there are too many variables in the graphs below to read where the missingness exists, it is apparent there is pervasive missingness in the data.
library(VIM)
aggr(finaldata, numbers= TRUE)
We could simply remove the missing values, however, we will surely lose a lot of value data if do so. In fact, we lose all of our rows if we simply remove all rows with missing data. We could instead select certain variables to work with that have more consistent data entries, or we could impute values where the missing exists so that we can continue to play with the full dataset.
finaldata_noNAs <- na.omit(finaldata)
sum(is.na(finaldata_noNAs)) # adding up all the NA's shows that all the missing data has been removed
## [1] 0
dim(finaldata)
## [1] 91 134
dim(finaldata_noNAs) # however, when we remove the missing data, no rows are left over. There is no country where every variable exists.
## [1] 0 134
Depending on the type of missingess mechanism at play, we may be able to impute the missing data. This sort of investigation can be done by looking at a missingess matrix, and sorting on each variable separately. If there appears to be a relationship between the missingness in one variable and the missingess another variable, then we say the data is MAR(missing at random). However, if no distinct relationships exist between missing data in variables, then we can argue the data is missing completely at random. So long as the missing data does not have to do with the value the missing data holds, we can use multiple imputation techniques to fill in values where missing data exists.
I am going to impute using the Amelia package. Amelia immediately tells us that several attributes are completely missing or only have 1 attribute. These columns should be removed, because imputing will not be accurate. However, even after removing these columns, the degree of missingness is too great. The number of attributes (135) is much larger than the 91 observations. In order to apply Amelia, we must reduce the number of attributes. This is best accomplished done by removing columns with greater percentages of missing values.
library(Amelia)
## Warning: package 'Rcpp' was built under R version 3.3.2
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2018c.1.0/
## zoneinfo/America/Los_Angeles'
# mulitple imputation
data.amelia <- amelia(finaldata[,-2], m=3, emburn=c(100,100))
## Amelia Error Code: 4
## The data has a column that is completely missing or only has one,observation. Remove these columns: SE.PRM.NENR.FE, SE.PRM.NENR.MA, SE.PRM.NINT.FE.ZS, SE.PRM.NINT.MA.ZS, SE.PRM.NINT.ZS, SE.PRM.TENR, SE.PRM.TENR.FE, SE.PRM.TENR.MA, SE.TER.CUAT.BA.FE.ZS, SE.TER.CUAT.BA.MA.ZS, SE.TER.CUAT.BA.ZS, SE.TER.CUAT.MS.FE.ZS, SE.TER.CUAT.MS.MA.ZS, SE.TER.CUAT.MS.ZS, SE.XPD.CPRM.ZS, SE.XPD.CSEC.ZS, SE.XPD.CTOT.ZS, SG.LAW.CHMR, SH.MLR.TRET.ZS, SL.AGR.0714.FE.ZS, SL.AGR.0714.MA.ZS, SL.AGR.0714.ZS, SL.FAM.0714.FE.ZS, SL.FAM.0714.MA.ZS, SL.FAM.0714.ZS, SL.MNF.0714.FE.ZS, SL.MNF.0714.MA.ZS, SL.MNF.0714.ZS, SL.SLF.0714.FE.ZS, SL.SLF.0714.MA.ZS, SL.SLF.0714.ZS, SL.SRV.0714.FE.ZS, SL.SRV.0714.MA.ZS, SL.SRV.0714.ZS, SL.TLF.0714.FE.ZS, SL.TLF.0714.MA.ZS, SL.TLF.0714.SW.FE.ZS, SL.TLF.0714.SW.MA.ZS, SL.TLF.0714.SW.ZS, SL.TLF.0714.WK.FE.TM, SL.TLF.0714.WK.FE.ZS, SL.TLF.0714.WK.MA.TM, SL.TLF.0714.WK.MA.ZS, SL.TLF.0714.WK.TM, SL.TLF.0714.WK.ZS, SL.TLF.0714.ZS, SL.UEM.NEET.FE.ZS, SL.UEM.NEET.MA.ZS, SL.UEM.NEET.ZS, SL.WAG.0714.FE.ZS, SL.WAG.0714.MA.ZS, SL.WAG.0714.ZS, SN.ITK.DEFC.ZS
finaldata_reduced <- subset(finaldata, select = -c(SE.PRM.NENR.FE, SE.PRM.NENR.MA, SE.PRM.NINT.FE.ZS, SE.PRM.NINT.MA.ZS, SE.PRM.NINT.ZS, SE.PRM.TENR, SE.PRM.TENR.FE, SE.PRM.TENR.MA, SE.TER.CUAT.BA.FE.ZS, SE.TER.CUAT.BA.MA.ZS, SE.TER.CUAT.BA.ZS, SE.TER.CUAT.MS.FE.ZS, SE.TER.CUAT.MS.MA.ZS, SE.TER.CUAT.MS.ZS, SE.XPD.CPRM.ZS, SE.XPD.CSEC.ZS, SE.XPD.CTOT.ZS, SG.LAW.CHMR, SH.MLR.TRET.ZS, SL.AGR.0714.FE.ZS, SL.AGR.0714.MA.ZS, SL.AGR.0714.ZS, SL.FAM.0714.FE.ZS, SL.FAM.0714.MA.ZS, SL.FAM.0714.ZS, SL.MNF.0714.FE.ZS, SL.MNF.0714.MA.ZS, SL.MNF.0714.ZS, SL.SLF.0714.FE.ZS, SL.SLF.0714.MA.ZS, SL.SLF.0714.ZS, SL.SRV.0714.FE.ZS, SL.SRV.0714.MA.ZS, SL.SRV.0714.ZS, SL.TLF.0714.FE.ZS, SL.TLF.0714.MA.ZS, SL.TLF.0714.SW.FE.ZS, SL.TLF.0714.SW.MA.ZS, SL.TLF.0714.SW.ZS, SL.TLF.0714.WK.FE.TM, SL.TLF.0714.WK.FE.ZS, SL.TLF.0714.WK.MA.TM, SL.TLF.0714.WK.MA.ZS, SL.TLF.0714.WK.TM, SL.TLF.0714.WK.ZS, SL.TLF.0714.ZS, SL.UEM.NEET.FE.ZS, SL.UEM.NEET.MA.ZS, SL.UEM.NEET.ZS, SL.WAG.0714.FE.ZS, SL.WAG.0714.MA.ZS, SL.WAG.0714.ZS, SN.ITK.DEFC.ZS))
data.amelia <- amelia(finaldata_reduced[,-c(1:2)], m=3, emburn=c(100,100))
## Warning: There are observations in the data that are completely missing.
## These observations will remain unimputed in the final datasets.
## Amelia Error Code: 34
## The number of observations is too low to estimate the number of
## parameters. You can either remove some variables, reduce
## the order of the time polynomial, or increase the empirical prior.
pctmiss <- apply(finaldata_reduced, 2, function(col)sum(is.na(col))/length(col))