This is the Project-2 in DATA607 course for CUNY SPS -Masters in Data Science program. As a part of this project, 3 wide or messy data sets are being read, cleaned up, and then used for individual analyses.
Loading the required packages.
Data set # 1 - UNICEF data for number of deaths of children age - 5-14 years in multiple areas or geographies We are now reading the first data set.
getURL <- "https://raw.githubusercontent.com/deepakmongia/Fall2018/master/Number_of_deaths_estimates-childresn-age-5-14.csv"
Children_death_estimates_df <- read.csv(getURL, header = TRUE, sep = ",", skip = 1)
head(Children_death_estimates_df)
## Region.Name Uncertainty.bounds. X1990.5 X2000.5 X2010.5
## 1 Sub-Saharan Africa Lower 575,856 582,903 519,030
## 2 Sub-Saharan Africa Median 604,397 599,931 543,210
## 3 Sub-Saharan Africa Upper 651,966 623,935 579,074
## 4 West and Central Africa Lower 262,531 291,554 291,672
## 5 West and Central Africa Median 285,507 305,245 314,220
## 6 West and Central Africa Upper 323,545 322,440 344,649
## X2015.5 X2017.5
## 1 480,283 469,876
## 2 516,208 508,174
## 3 575,013 578,156
## 4 283,610 279,679
## 5 317,710 317,039
## 6 370,029 378,544
Now we are melting the data set, that means all the columns which are data are being converted to separate columns.
Children_death_estimates_df_molten <- Children_death_estimates_df %>% gather("year", "Number.of.deaths", 3:7)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(Children_death_estimates_df_molten)
## Region.Name Uncertainty.bounds. year Number.of.deaths
## 1 Sub-Saharan Africa Lower X1990.5 575,856
## 2 Sub-Saharan Africa Median X1990.5 604,397
## 3 Sub-Saharan Africa Upper X1990.5 651,966
## 4 West and Central Africa Lower X1990.5 262,531
## 5 West and Central Africa Median X1990.5 285,507
## 6 West and Central Africa Upper X1990.5 323,545
Children_death_estimates_df_molten$year <- substr(Children_death_estimates_df_molten$year, 2, 5)
Children_death_estimates_df_molten$Number.of.deaths <- gsub(",", "", Children_death_estimates_df_molten$Number.of.deaths)
head(Children_death_estimates_df_molten)
## Region.Name Uncertainty.bounds. year Number.of.deaths
## 1 Sub-Saharan Africa Lower 1990 575856
## 2 Sub-Saharan Africa Median 1990 604397
## 3 Sub-Saharan Africa Upper 1990 651966
## 4 West and Central Africa Lower 1990 262531
## 5 West and Central Africa Median 1990 285507
## 6 West and Central Africa Upper 1990 323545
Children_death_estimates_df_molten_median <- Children_death_estimates_df_molten %>% filter(Children_death_estimates_df_molten$Uncertainty.bounds. == "Median", Children_death_estimates_df_molten$Region.Name != "World")
head(Children_death_estimates_df_molten_median)
## Region.Name Uncertainty.bounds. year Number.of.deaths
## 1 Sub-Saharan Africa Median 1990 604397
## 2 West and Central Africa Median 1990 285507
## 3 Eastern and Southern Africa Median 1990 318890
## 4 Middle East and North Africa Median 1990 66409
## 5 South Asia Median 1990 584906
## 6 East Asia and Pacific Median 1990 319190
Children_death_estimates_df_molten_median <- Children_death_estimates_df_molten_median %>% arrange(Region.Name, year)
DATASET-2
The secod data set is for the country wise data for Balance of payment as a percent of that country’s GDP
Step-1 - To read the data set from Github location into R
getURL2 <- "https://raw.githubusercontent.com/deepakmongia/Fall2018/master/Selected_Indicators_Percent_of_GDP.csv"
Selected_Indicators_Percent_of_GDP_raw <- read.csv(getURL2, header = TRUE, sep = ",", skip = 3)
Performing the steps to 1) rename the first colum to Country.names 2) Removing the junk columns from the end 3) Replcaing the unknown data given by 3 dots “…” by NA
head(Selected_Indicators_Percent_of_GDP_raw)
## X X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1 Total ... ... ... ... ... ... ... ...
## 2 Advanced Economies ... ... ... ... ... ... ... ...
## 3 Euro Area (Incl. ECB) 18.32 21.08 22.94 23.58 22.96 23.19 23.69 23.23
## 4 Austria 42.84 47.38 50.96 50.91 50.38 50.14 49.24 48.95
## 5 Belgium 67.80 75.26 82.05 82.61 81.86 81.92 78.80 81.81
## 6 Cyprus 54.25 57.52 55.91 54.97 56.97 60.16 64.06 66.64
## X.1 X.2 X.3
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
colnames(Selected_Indicators_Percent_of_GDP_raw)[colnames(Selected_Indicators_Percent_of_GDP_raw) == "X"] = "Country.name"
names(Selected_Indicators_Percent_of_GDP_raw)
## [1] "Country.name" "X2009" "X2010" "X2011"
## [5] "X2012" "X2013" "X2014" "X2015"
## [9] "X2016" "X.1" "X.2" "X.3"
Selected_Indicators_Percent_of_GDP_raw <- Selected_Indicators_Percent_of_GDP_raw %>% select(1:9)
Selected_Indicators_Percent_of_GDP_raw[Selected_Indicators_Percent_of_GDP_raw == "..."] <- NA
Pull only the continents / categories and countries with non-blank values into a new filtered data set
Selected_Indicators_Percent_of_GDP_filtered <- Selected_Indicators_Percent_of_GDP_raw %>% filter((substr(Country.name,1,8) == "Advanced" | substr(Country.name,1,4) == "Euro"
| substr(Country.name,1,8) == "Emerging" | substr(Country.name,1,3) == "CIS"
| substr(Country.name,1,6) == "Middle" | substr(Country.name,1,11) == "Sub-Saharan"
| substr(Country.name,1,5) == "CEMAC" | substr(Country.name,1,5) == "WAEMU"
| substr(Country.name,1,7) == "Western" | substr(Country.name,1,4) == "ECCU")
| !(is.na(X2009) & is.na(X2010) & is.na(X2011) & is.na(X2012) & is.na(X2013) & is.na(X2014) & is.na(X2015) & is.na(X2016)))
Creating a new column called category, this column will be filled with the category under which any country lies. For example - Advanced economies
Selected_Indicators_Percent_of_GDP_filtered$Category <- Selected_Indicators_Percent_of_GDP_filtered$Country.name
end_of_df <- "FALSE"
i <- 1
while (end_of_df == "FALSE") {
if (Selected_Indicators_Percent_of_GDP_filtered$Country.name[i] %>% is.na())
{
end_of_df <- "TRUE"
}
else
# sprintf("not end of file for record # %s", i)
{
if (Selected_Indicators_Percent_of_GDP_filtered$X2009[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2010[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2011[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2012[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2013[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2014[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2015[i] %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2016[i] %>% is.na())
{
save.country.category <- Selected_Indicators_Percent_of_GDP_filtered$Country.name[i]
}
else
{
Selected_Indicators_Percent_of_GDP_filtered$Category[i] <- save.country.category
}
i <- i + 1
}
}
Only saving the country rows and removing the category rows, as we now have grabbed the category values in a new column for all the countries
Selected_Indicators_Percent_of_GDP_filtered <- Selected_Indicators_Percent_of_GDP_filtered %>% filter(!(Selected_Indicators_Percent_of_GDP_filtered$X2009 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2010 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2011 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2012 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2013 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2014 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2015 %>% is.na() &
Selected_Indicators_Percent_of_GDP_filtered$X2016 %>% is.na()))
head(Selected_Indicators_Percent_of_GDP_filtered)
## Country.name X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016
## 1 Euro Area (Incl. ECB) 18.32 21.08 22.94 23.58 22.96 23.19 23.69 23.23
## 2 Austria 42.84 47.38 50.96 50.91 50.38 50.14 49.24 48.95
## 3 Belgium 67.80 75.26 82.05 82.61 81.86 81.92 78.80 81.81
## 4 Cyprus 54.25 57.52 55.91 54.97 56.97 60.16 64.06 66.64
## 5 Estonia 56.02 68.70 80.90 84.39 81.51 79.80 74.59 75.14
## 6 Finland 34.23 37.34 39.93 40.80 39.49 38.05 36.85 36.07
## Category
## 1 Advanced Economies
## 2 Advanced Economies
## 3 Advanced Economies
## 4 Advanced Economies
## 5 Advanced Economies
## 6 Advanced Economies
Bringing the category on the left before the data set is melted. And then melting the data set.
Selected_Indicators_Percent_of_GDP_filtered <- Selected_Indicators_Percent_of_GDP_filtered[, c(1,10,2:9)]
Selected_Indicators_Percent_of_GDP_long <- Selected_Indicators_Percent_of_GDP_filtered %>% gather("year", "BOP_Percent_of_GDP", 3:10)
## Warning: attributes are not identical across measure variables;
## they will be dropped
head(Selected_Indicators_Percent_of_GDP_long)
## Country.name Category year BOP_Percent_of_GDP
## 1 Euro Area (Incl. ECB) Advanced Economies X2009 18.32
## 2 Austria Advanced Economies X2009 42.84
## 3 Belgium Advanced Economies X2009 67.80
## 4 Cyprus Advanced Economies X2009 54.25
## 5 Estonia Advanced Economies X2009 56.02
## 6 Finland Advanced Economies X2009 34.23
Selected_Indicators_Percent_of_GDP_long$year <- substr(Selected_Indicators_Percent_of_GDP_long$year, 2,5)
Selected_Indicators_Percent_of_GDP_long$BOP_Percent_of_GDP <- as.numeric(Selected_Indicators_Percent_of_GDP_long$BOP_Percent_of_GDP)
BOP_by_GDP_category_year_summarized <- Selected_Indicators_Percent_of_GDP_long %>% group_by(Category, year) %>% summarize(avg_percent_BOP=round(mean(BOP_Percent_of_GDP, na.rm = TRUE),2))
ggplot(BOP_by_GDP_category_year_summarized, aes(x=year, y=avg_percent_BOP, fill=Category)) + geom_bar(position = "dodge", stat = "identity")
## Warning: Removed 1 rows containing missing values (geom_bar).