This is the Project-2 in DATA607 course for CUNY SPS -Masters in Data Science program. As a part of this project, 3 wide or messy data sets are being read, cleaned up, and then used for individual analyses.

Loading the required packages.

Data set # 1 - UNICEF data for number of deaths of children age - 5-14 years in multiple areas or geographies We are now reading the first data set.

getURL <- "https://raw.githubusercontent.com/deepakmongia/Fall2018/master/Number_of_deaths_estimates-childresn-age-5-14.csv"

Children_death_estimates_df <- read.csv(getURL, header = TRUE, sep = ",", skip = 1)

head(Children_death_estimates_df)
##               Region.Name Uncertainty.bounds. X1990.5 X2000.5 X2010.5
## 1      Sub-Saharan Africa               Lower 575,856 582,903 519,030
## 2      Sub-Saharan Africa              Median 604,397 599,931 543,210
## 3      Sub-Saharan Africa               Upper 651,966 623,935 579,074
## 4 West and Central Africa               Lower 262,531 291,554 291,672
## 5 West and Central Africa              Median 285,507 305,245 314,220
## 6 West and Central Africa               Upper 323,545 322,440 344,649
##   X2015.5 X2017.5
## 1 480,283 469,876
## 2 516,208 508,174
## 3 575,013 578,156
## 4 283,610 279,679
## 5 317,710 317,039
## 6 370,029 378,544

Now we are melting the data set, that means all the columns which are data are being converted to separate columns.

Children_death_estimates_df_molten <- Children_death_estimates_df %>% gather("year", "Number.of.deaths", 3:7)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(Children_death_estimates_df_molten)
##               Region.Name Uncertainty.bounds.    year Number.of.deaths
## 1      Sub-Saharan Africa               Lower X1990.5          575,856
## 2      Sub-Saharan Africa              Median X1990.5          604,397
## 3      Sub-Saharan Africa               Upper X1990.5          651,966
## 4 West and Central Africa               Lower X1990.5          262,531
## 5 West and Central Africa              Median X1990.5          285,507
## 6 West and Central Africa               Upper X1990.5          323,545
Children_death_estimates_df_molten$year <- substr(Children_death_estimates_df_molten$year, 2, 5)
Children_death_estimates_df_molten$Number.of.deaths <- gsub(",", "", Children_death_estimates_df_molten$Number.of.deaths)

head(Children_death_estimates_df_molten)
##               Region.Name Uncertainty.bounds. year Number.of.deaths
## 1      Sub-Saharan Africa               Lower 1990           575856
## 2      Sub-Saharan Africa              Median 1990           604397
## 3      Sub-Saharan Africa               Upper 1990           651966
## 4 West and Central Africa               Lower 1990           262531
## 5 West and Central Africa              Median 1990           285507
## 6 West and Central Africa               Upper 1990           323545
Children_death_estimates_df_molten_median <- Children_death_estimates_df_molten %>% filter(Children_death_estimates_df_molten$Uncertainty.bounds. == "Median", Children_death_estimates_df_molten$Region.Name != "World")

head(Children_death_estimates_df_molten_median)
##                    Region.Name Uncertainty.bounds. year Number.of.deaths
## 1           Sub-Saharan Africa              Median 1990           604397
## 2      West and Central Africa              Median 1990           285507
## 3  Eastern and Southern Africa              Median 1990           318890
## 4 Middle East and North Africa              Median 1990            66409
## 5                   South Asia              Median 1990           584906
## 6        East Asia and Pacific              Median 1990           319190
Children_death_estimates_df_molten_median <- Children_death_estimates_df_molten_median %>% arrange(Region.Name, year)

DATASET-2

The secod data set is for the country wise data for Balance of payment as a percent of that country’s GDP

Step-1 - To read the data set from Github location into R

getURL2 <- "https://raw.githubusercontent.com/deepakmongia/Fall2018/master/Selected_Indicators_Percent_of_GDP.csv"
Selected_Indicators_Percent_of_GDP_raw <- read.csv(getURL2, header = TRUE, sep = ",", skip = 3)

Performing the steps to 1) rename the first colum to Country.names 2) Removing the junk columns from the end 3) Replcaing the unknown data given by 3 dots “…” by NA

head(Selected_Indicators_Percent_of_GDP_raw)
##                       X X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1                 Total   ...   ...   ...   ...   ...   ...   ...   ...
## 2    Advanced Economies   ...   ...   ...   ...   ...   ...   ...   ...
## 3 Euro Area (Incl. ECB) 18.32 21.08 22.94 23.58 22.96 23.19 23.69 23.23
## 4               Austria 42.84 47.38 50.96 50.91 50.38 50.14 49.24 48.95
## 5               Belgium 67.80 75.26 82.05 82.61 81.86 81.92 78.80 81.81
## 6                Cyprus 54.25 57.52 55.91 54.97 56.97 60.16 64.06 66.64
##   X.1 X.2 X.3
## 1  NA  NA  NA
## 2  NA  NA  NA
## 3  NA  NA  NA
## 4  NA  NA  NA
## 5  NA  NA  NA
## 6  NA  NA  NA
colnames(Selected_Indicators_Percent_of_GDP_raw)[colnames(Selected_Indicators_Percent_of_GDP_raw) == "X"] = "Country.name"
names(Selected_Indicators_Percent_of_GDP_raw)
##  [1] "Country.name" "X2009"        "X2010"        "X2011"       
##  [5] "X2012"        "X2013"        "X2014"        "X2015"       
##  [9] "X2016"        "X.1"          "X.2"          "X.3"
Selected_Indicators_Percent_of_GDP_raw <- Selected_Indicators_Percent_of_GDP_raw %>% select(1:9)
Selected_Indicators_Percent_of_GDP_raw[Selected_Indicators_Percent_of_GDP_raw == "..."] <- NA

Pull only the continents / categories and countries with non-blank values into a new filtered data set

Selected_Indicators_Percent_of_GDP_filtered <-  Selected_Indicators_Percent_of_GDP_raw %>% filter((substr(Country.name,1,8) == "Advanced" | substr(Country.name,1,4) == "Euro"
                                                   | substr(Country.name,1,8) == "Emerging" | substr(Country.name,1,3) == "CIS"
                                                   | substr(Country.name,1,6) == "Middle" | substr(Country.name,1,11) == "Sub-Saharan"
                                                   | substr(Country.name,1,5) == "CEMAC" | substr(Country.name,1,5) == "WAEMU" 
                                                   | substr(Country.name,1,7) == "Western" | substr(Country.name,1,4) == "ECCU")
                                                  | !(is.na(X2009) & is.na(X2010) & is.na(X2011) & is.na(X2012) & is.na(X2013) & is.na(X2014) & is.na(X2015) & is.na(X2016)))

Creating a new column called category, this column will be filled with the category under which any country lies. For example - Advanced economies

Selected_Indicators_Percent_of_GDP_filtered$Category <- Selected_Indicators_Percent_of_GDP_filtered$Country.name

end_of_df <- "FALSE"
i <- 1

while (end_of_df == "FALSE") {
  if (Selected_Indicators_Percent_of_GDP_filtered$Country.name[i] %>% is.na())
  {
    end_of_df <- "TRUE"
  }
  else
    #  sprintf("not end of file for record # %s", i)
  {
    if (Selected_Indicators_Percent_of_GDP_filtered$X2009[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2010[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2011[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2012[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2013[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2014[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2015[i] %>% is.na() &
        Selected_Indicators_Percent_of_GDP_filtered$X2016[i] %>% is.na())
    {
      save.country.category <- Selected_Indicators_Percent_of_GDP_filtered$Country.name[i]
    }
    else
    {
      Selected_Indicators_Percent_of_GDP_filtered$Category[i] <- save.country.category
    }
    i <- i + 1 
  }
}

Only saving the country rows and removing the category rows, as we now have grabbed the category values in a new column for all the countries

Selected_Indicators_Percent_of_GDP_filtered <- Selected_Indicators_Percent_of_GDP_filtered %>% filter(!(Selected_Indicators_Percent_of_GDP_filtered$X2009 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2010 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2011 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2012 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2013 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2014 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2015 %>% is.na() &
                                                                                                          Selected_Indicators_Percent_of_GDP_filtered$X2016 %>% is.na()))


head(Selected_Indicators_Percent_of_GDP_filtered)
##            Country.name X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1 Euro Area (Incl. ECB) 18.32 21.08 22.94 23.58 22.96 23.19 23.69 23.23
## 2               Austria 42.84 47.38 50.96 50.91 50.38 50.14 49.24 48.95
## 3               Belgium 67.80 75.26 82.05 82.61 81.86 81.92 78.80 81.81
## 4                Cyprus 54.25 57.52 55.91 54.97 56.97 60.16 64.06 66.64
## 5               Estonia 56.02 68.70 80.90 84.39 81.51 79.80 74.59 75.14
## 6               Finland 34.23 37.34 39.93 40.80 39.49 38.05 36.85 36.07
##             Category
## 1 Advanced Economies
## 2 Advanced Economies
## 3 Advanced Economies
## 4 Advanced Economies
## 5 Advanced Economies
## 6 Advanced Economies

Bringing the category on the left before the data set is melted. And then melting the data set.

Selected_Indicators_Percent_of_GDP_filtered <- Selected_Indicators_Percent_of_GDP_filtered[, c(1,10,2:9)]

Selected_Indicators_Percent_of_GDP_long <- Selected_Indicators_Percent_of_GDP_filtered %>% gather("year", "BOP_Percent_of_GDP", 3:10)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(Selected_Indicators_Percent_of_GDP_long)
##            Country.name           Category  year BOP_Percent_of_GDP
## 1 Euro Area (Incl. ECB) Advanced Economies X2009              18.32
## 2               Austria Advanced Economies X2009              42.84
## 3               Belgium Advanced Economies X2009              67.80
## 4                Cyprus Advanced Economies X2009              54.25
## 5               Estonia Advanced Economies X2009              56.02
## 6               Finland Advanced Economies X2009              34.23
Selected_Indicators_Percent_of_GDP_long$year <- substr(Selected_Indicators_Percent_of_GDP_long$year, 2,5)

Selected_Indicators_Percent_of_GDP_long$BOP_Percent_of_GDP <- as.numeric(Selected_Indicators_Percent_of_GDP_long$BOP_Percent_of_GDP)

BOP_by_GDP_category_year_summarized <- Selected_Indicators_Percent_of_GDP_long %>% group_by(Category, year) %>% summarize(avg_percent_BOP=round(mean(BOP_Percent_of_GDP, na.rm = TRUE),2))

ggplot(BOP_by_GDP_category_year_summarized, aes(x=year, y=avg_percent_BOP, fill=Category)) + geom_bar(position = "dodge", stat = "identity")
## Warning: Removed 1 rows containing missing values (geom_bar).