Project2

Child Mortality

The dataset for this project is downloaded from http://www.childmortality.org under âEstimates for under-five, infant and neonatal mortalityâ. The dataset is in wide form and contains six value variables of interest: under-five (0-4 years) mortality, infant (0-1 years) mortality, neonatal (0-1 month) mortality, as well as the number of under-five, infant, and neonatal deaths.

The objective of this project is to analyze number of children deaths in different countries viz, Canada, France, Germany and United Kingdom:

The following dplyr/tidyr functions are used in this project

dplyr	tidyr
select	gather
filter	separate
arrange

(1) Save the data as a CSV file:

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

## Loading required package: DBI

library(ggplot2)

(2) Read the information from the .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data:

**(2.a) Read 01_ChildMortality.csv file**

ChildMortality_df = read.csv(file="01_ChildMortality.csv", header=TRUE, sep=",", skip = 6)
nrow(ChildMortality_df)

## [1] 585

**(2.b) Apply gsub function and replace “.” with “_" in the column names**

colnames(ChildMortality_df) = gsub("\\.","_",colnames(ChildMortality_df))

**(2.c) Apply tidyr filter function and select only cases that have median values for countries Canada, France, Germany and United Kingdom**

ChildMortality_df = filter(ChildMortality_df, Uncertainty_bounds_ == "Median" &
                              (CountryName == "Canada" |
                               CountryName == "France" |
                               CountryName == "Germany" |
                               CountryName == "United Kingdom"))

nrow(ChildMortality_df)

## [1] 4

**(2.d) Apply tidyr select function and select only columns that are relevant for our purpose**

ChildDeaths1_df = select(ChildMortality_df, matches("CountryName|Under_five_Deaths|Infant_Deaths|Neonatal_Deaths"))
nrow(ChildDeaths1_df)

## [1] 4

**(2.e) Apply tidyr gather function and convert the data from wide to long structure so it can be used for analysis**

ChildDeaths2_df = gather(ChildDeaths1_df, Type, Value, Under_five_Deaths_1950:Neonatal_Deaths_2015)
head(ChildDeaths2_df)

##      CountryName                   Type Value
## 1         Canada Under_five_Deaths_1950 17848
## 2        Germany Under_five_Deaths_1950    NA
## 3         France Under_five_Deaths_1950 58258
## 4 United Kingdom Under_five_Deaths_1950 35795
## 5         Canada Under_five_Deaths_1951 16562
## 6        Germany Under_five_Deaths_1951    NA

**(2.f) Apply gsub function to replace strings “Under_five_Deaths, Infant_Deaths and Neonatal_Deaths by codes U5D, IND and NND respectively**

ChildDeaths2_df$Type = gsub("Under_five_Deaths", "U5D", ChildDeaths2_df$Type)
ChildDeaths2_df$Type = gsub("Infant_Deaths", "IND", ChildDeaths2_df$Type)
ChildDeaths2_df$Type = gsub("Neonatal_Deaths", "NND", ChildDeaths2_df$Type)
head(ChildDeaths2_df, 20)

##       CountryName     Type Value
## 1          Canada U5D_1950 17848
## 2         Germany U5D_1950    NA
## 3          France U5D_1950 58258
## 4  United Kingdom U5D_1950 35795
## 5          Canada U5D_1951 16562
## 6         Germany U5D_1951    NA
## 7          France U5D_1951 46488
## 8  United Kingdom U5D_1951 28588
## 9          Canada U5D_1952 15817
## 10        Germany U5D_1952    NA
## 11         France U5D_1952 37923
## 12 United Kingdom U5D_1952 23677
## 13         Canada U5D_1953 15445
## 14        Germany U5D_1953    NA
## 15         France U5D_1953 32248
## 16 United Kingdom U5D_1953 20703
## 17         Canada U5D_1954 15304
## 18        Germany U5D_1954    NA
## 19         France U5D_1954 28699
## 20 United Kingdom U5D_1954 19374

**(2.g) Apply separate function to extract the year of death from the event type column**

ChildDeaths2_df = separate(ChildDeaths2_df, Type, c("Rate", "Year"), sep = "_")
head(ChildDeaths2_df, 20)

##       CountryName Rate Year Value
## 1          Canada  U5D 1950 17848
## 2         Germany  U5D 1950    NA
## 3          France  U5D 1950 58258
## 4  United Kingdom  U5D 1950 35795
## 5          Canada  U5D 1951 16562
## 6         Germany  U5D 1951    NA
## 7          France  U5D 1951 46488
## 8  United Kingdom  U5D 1951 28588
## 9          Canada  U5D 1952 15817
## 10        Germany  U5D 1952    NA
## 11         France  U5D 1952 37923
## 12 United Kingdom  U5D 1952 23677
## 13         Canada  U5D 1953 15445
## 14        Germany  U5D 1953    NA
## 15         France  U5D 1953 32248
## 16 United Kingdom  U5D 1953 20703
## 17         Canada  U5D 1954 15304
## 18        Germany  U5D 1954    NA
## 19         France  U5D 1954 28699
## 20 United Kingdom  U5D 1954 19374

(3) Prepare data for analysis:

**(3.a) Apply arrange function to arrange the data by CountryName. Also, apply filter function to get rows that report under 5 deaths for years greater than 2005.**

ChildDeaths2_df = arrange(ChildDeaths2_df, CountryName)
ChildDeaths2_dfU5D = filter(ChildDeaths2_df, Rate == "U5D" & Value != "NA" & Year>=2005)
print(ChildDeaths2_dfU5D)

##       CountryName Rate Year Value
## 1          Canada  U5D 2005  2122
## 2          Canada  U5D 2006  2149
## 3          Canada  U5D 2007  2181
## 4          Canada  U5D 2008  2206
## 5          Canada  U5D 2009  2217
## 6          Canada  U5D 2010  2201
## 7          Canada  U5D 2011  2121
## 8          Canada  U5D 2012  2056
## 9          Canada  U5D 2013  1978
## 10         Canada  U5D 2014  1872
## 11         Canada  U5D 2015  1824
## 12         France  U5D 2005  3595
## 13         France  U5D 2006  3529
## 14         France  U5D 2007  3455
## 15         France  U5D 2008  3378
## 16         France  U5D 2009  3377
## 17         France  U5D 2010  3376
## 18         France  U5D 2011  3379
## 19         France  U5D 2012  3383
## 20         France  U5D 2013  3463
## 21         France  U5D 2014  3462
## 22         France  U5D 2015  3378
## 23        Germany  U5D 2005  3161
## 24        Germany  U5D 2006  3022
## 25        Germany  U5D 2007  2930
## 26        Germany  U5D 2008  2872
## 27        Germany  U5D 2009  2827
## 28        Germany  U5D 2010  2784
## 29        Germany  U5D 2011  2734
## 30        Germany  U5D 2012  2687
## 31        Germany  U5D 2013  2641
## 32        Germany  U5D 2014  2594
## 33        Germany  U5D 2015  2545
## 34 United Kingdom  U5D 2005  4207
## 35 United Kingdom  U5D 2006  4317
## 36 United Kingdom  U5D 2007  4489
## 37 United Kingdom  U5D 2008  4583
## 38 United Kingdom  U5D 2009  4618
## 39 United Kingdom  U5D 2010  4533
## 40 United Kingdom  U5D 2011  4354
## 41 United Kingdom  U5D 2012  4079
## 42 United Kingdom  U5D 2013  3755
## 43 United Kingdom  U5D 2014  3460
## 44 United Kingdom  U5D 2015  3235

ggplot(data=ChildDeaths2_dfU5D, aes(x=Year, y=Value, group=CountryName, color=CountryName)) +  geom_line() + 
  geom_point() + ylab("# of Deaths") +  ggtitle("Under 5 Child Deaths")

**(3.b) Apply arrange function to arrange the data by CountryName. Also, apply filter function to get rows that report infant deaths for years greater than 2005.**

ChildDeaths2_df = arrange(ChildDeaths2_df, CountryName)
ChildDeaths2_dfIND = filter(ChildDeaths2_df, Rate == "IND" & Value != "NA" & Year>=2005)
print(ChildDeaths2_dfIND)

##       CountryName Rate Year Value
## 1          Canada  IND 2005  1847
## 2          Canada  IND 2006  1871
## 3          Canada  IND 2007  1898
## 4          Canada  IND 2008  1918
## 5          Canada  IND 2009  1960
## 6          Canada  IND 2010  1939
## 7          Canada  IND 2011  1852
## 8          Canada  IND 2012  1822
## 9          Canada  IND 2013  1743
## 10         Canada  IND 2014  1636
## 11         Canada  IND 2015  1587
## 12         France  IND 2005  2981
## 13         France  IND 2006  2908
## 14         France  IND 2007  2830
## 15         France  IND 2008  2750
## 16         France  IND 2009  2747
## 17         France  IND 2010  2745
## 18         France  IND 2011  2747
## 19         France  IND 2012  2749
## 20         France  IND 2013  2830
## 21         France  IND 2014  2829
## 22         France  IND 2015  2747
## 23        Germany  IND 2005  2578
## 24        Germany  IND 2006  2457
## 25        Germany  IND 2007  2377
## 26        Germany  IND 2008  2326
## 27        Germany  IND 2009  2352
## 28        Germany  IND 2010  2313
## 29        Germany  IND 2011  2265
## 30        Germany  IND 2012  2219
## 31        Germany  IND 2013  2240
## 32        Germany  IND 2014  2191
## 33        Germany  IND 2015  2140
## 34 United Kingdom  IND 2005  3584
## 35 United Kingdom  IND 2006  3692
## 36 United Kingdom  IND 2007  3851
## 37 United Kingdom  IND 2008  3925
## 38 United Kingdom  IND 2009  4016
## 39 United Kingdom  IND 2010  3917
## 40 United Kingdom  IND 2011  3704
## 41 United Kingdom  IND 2012  3498
## 42 United Kingdom  IND 2013  3173
## 43 United Kingdom  IND 2014  2880
## 44 United Kingdom  IND 2015  2655

ggplot(data=ChildDeaths2_dfIND, aes(x=Year, y=Value, group=CountryName, color=CountryName)) +  geom_line() + 
  geom_point() + ylab("# of Deaths") +  ggtitle("Infant Deaths")

**(3.c) Apply arrange function to arrange the data by CountryName. Also, apply filter function to get rows that report neonatal deaths for years greater than 2005.**

ChildDeaths2_df = arrange(ChildDeaths2_df, CountryName)
ChildDeaths2_dfNND = filter(ChildDeaths2_df, Rate == "NND" & Value != "NA" & Year>=2005)
print(ChildDeaths2_dfNND)

##       CountryName Rate Year Value
## 1          Canada  NND 2005  1393
## 2          Canada  NND 2006  1439
## 3          Canada  NND 2007  1451
## 4          Canada  NND 2008  1457
## 5          Canada  NND 2009  1489
## 6          Canada  NND 2010  1464
## 7          Canada  NND 2011  1418
## 8          Canada  NND 2012  1356
## 9          Canada  NND 2013  1288
## 10         Canada  NND 2014  1227
## 11         Canada  NND 2015  1181
## 12         France  NND 2005  1882
## 13         France  NND 2006  1807
## 14         France  NND 2007  1807
## 15         France  NND 2008  1806
## 16         France  NND 2009  1805
## 17         France  NND 2010  1803
## 18         France  NND 2011  1726
## 19         France  NND 2012  1728
## 20         France  NND 2013  1807
## 21         France  NND 2014  1807
## 22         France  NND 2015  1726
## 23        Germany  NND 2005  1718
## 24        Germany  NND 2006  1681
## 25        Germany  NND 2007  1606
## 26        Germany  NND 2008  1550
## 27        Germany  NND 2009  1568
## 28        Germany  NND 2010  1519
## 29        Germany  NND 2011  1532
## 30        Germany  NND 2012  1546
## 31        Germany  NND 2013  1493
## 32        Germany  NND 2014  1506
## 33        Germany  NND 2015  1449
## 34 United Kingdom  NND 2005  2459
## 35 United Kingdom  NND 2006  2510
## 36 United Kingdom  NND 2007  2592
## 37 United Kingdom  NND 2008  2672
## 38 United Kingdom  NND 2009  2706
## 39 United Kingdom  NND 2010  2670
## 40 United Kingdom  NND 2011  2557
## 41 United Kingdom  NND 2012  2388
## 42 United Kingdom  NND 2013  2196
## 43 United Kingdom  NND 2014  1946
## 44 United Kingdom  NND 2015  1820

ggplot(data=ChildDeaths2_dfNND, aes(x=Year, y=Value, group=CountryName, color=CountryName)) +  geom_line() + 
  geom_point() + ylab("# of Deaths") +  ggtitle("Neonatal Deaths")

(4) Conclusion:

(4.a) From the graphs we can conclude that the number of child deaths (Under 5, infant and neonatal) in United Kingdom increased from 2005 to 2009 and then decreased for every year afterwards. We can see a similar pattern in Canada. It would require further data/analysis to find the root causes of why it increased from 2005 to 2009 and then decreases afterwards.

Project2_01

A.S

October 6, 2016