# install.packages("tidyverse")
library(readxl)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
Data Posted by Binish Kurian Chandy
WHO data on HIV
http://apps.who.int/gho/data/node.country.country-USA?lang=en
Downloaded complete data set for “USA” into .CSV file and named the file “USA GHO Stats.csv”
Here’s how the data looks initially: image:
Read the CSV file, skipping the first row in order to get proper headers
# Load the WHO export. The first row of the file is skipped so that the
# following row supplies the column headers. Text columns are kept as
# character vectors (not factors) because they are processed with stringr
# functions further down.
df <- read.csv(
  "~/R/Project 2/USA GHO Stats.csv",
  skip = 1,
  stringsAsFactors = FALSE
)
# Inspect the imported column names; read.csv() prefixes the year headers
# with "X" to turn them into syntactic names.
names(df)
## [1] "Indicator" "X2015" "X2014" "X2013" "X2012"
## [6] "X2011" "X2010" "X2009" "X2008" "X2007"
## [11] "X2006" "X2005" "X2004" "X2003" "X2002"
# Remove the "X" that read.csv() put in front of the year headers.
# The pattern is anchored to the start of the name and requires a digit to
# follow, so an "X" occurring anywhere else in a column name is left intact.
names(df) <- str_replace(names(df), "^X(?=\\d)", "")
names(df)
## [1] "Indicator" "2015" "2014" "2013" "2012"
## [6] "2011" "2010" "2009" "2008" "2007"
## [11] "2006" "2005" "2004" "2003" "2002"
# Keep only the infant-related indicators and reshape the year columns into
# long form: one row per (Indicator, Year) pair with the raw value in N.
# Every column other than Indicator is a year column, so the year columns are
# selected by exclusion rather than by a hard-coded position range.
dfInfant <- df %>%
  filter(str_detect(Indicator, "[Ii]nfant")) %>%
  gather("Year", "N", -Indicator) %>%
  as_tibble() # replaces the deprecated tbl_df()
knitr::kable(dfInfant)
| Indicator | Year | N |
|---|---|---|
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2015 | 5.7 [5.4-5.9] |
| Number of infant deaths (thousands) | 2015 | 23 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2014 | 5.8 [5.7-6.0] |
| Number of infant deaths (thousands) | 2014 | 23 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2013 | 5.9 [5.8-6.0] |
| Number of infant deaths (thousands) | 2013 | 23 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2012 | 6.0 [5.9-6.1] |
| Number of infant deaths (thousands) | 2012 | 24 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2011 | 6.1 [6.0-6.3] |
| Number of infant deaths (thousands) | 2011 | 25 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2010 | 6.2 [6.1-6.3] |
| Number of infant deaths (thousands) | 2010 | 25 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2009 | 6.4 [6.3-6.5] |
| Number of infant deaths (thousands) | 2009 | 26 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2008 | 6.5 [6.4-6.6] |
| Number of infant deaths (thousands) | 2008 | 27 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2007 | 6.6 [6.5-6.7] |
| Number of infant deaths (thousands) | 2007 | 27 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2006 | 6.7 [6.6-6.8] |
| Number of infant deaths (thousands) | 2006 | 28 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2005 | 6.8 [6.7-6.9] |
| Number of infant deaths (thousands) | 2005 | 28 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2004 | 6.9 [6.8-7.0] |
| Number of infant deaths (thousands) | 2004 | 28 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2003 | 6.9 [6.7-7.0] |
| Number of infant deaths (thousands) | 2003 | 28 |
| Infant mortality rate (probability of dying between birth and age 1 per 1000 live births) | 2002 | 6.9 [6.8-7.0] |
| Number of infant deaths (thousands) | 2002 | 28 |
A value such as “5.7 [5.4-5.9]” has the form AVG [MIN-MAX]. In order to separate that composite (united) value into its 3 parts, the parts first need to be delimited by a single separator character
# Demonstrate the delimiter pattern on one sample value: first highlight the
# characters it matches ("[", "-" and "]"), then show the result of replacing
# each of them with "|".
"5.7 [5.4-5.9]" %>% str_view_all("[\\[\\-\\]]")
"5.7 [5.4-5.9]" %>% str_replace_all("[\\[\\-\\]]", "|")
## [1] "5.7 |5.4|5.9|"
Apply separator “|” to “N” column
# Rewrite every "[", "-" and "]" in the N column as "|" so that the composite
# value can be split on a single separator afterwards.
dfInfant <- dfInfant %>%
  mutate(N = str_replace_all(N, "[\\[\\-\\]]", "|"))
Use separate() function to transform values in “AVG [MIN-MAX]” into 3 separate columns - AVG, MIN and MAX
NOTE: use “\\|” (an escaped pipe) as the sep. pattern, since a bare “|” is a special character (alternation) in regex.
# Split N on the literal "|" into three columns: N (the point value), MIN
# and MAX, converting each piece to a numeric type.
# Two warnings are expected here and are harmless:
#   * rate rows look like "5.7 |5.4|5.9|", so the trailing "|" yields an
#     empty fourth piece that is discarded;
#   * count rows (e.g. "23") contain no separator, so MIN and MAX are NA.
dfInfant <- separate(
  dfInfant, N,
  into = c("N", "MIN", "MAX"),
  sep = "\\|",
  convert = TRUE
)
## Warning: Expected 3 pieces. Additional pieces discarded in 14 rows [1, 3,
## 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 14 rows [2,
## 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28].
# Convert the Year column from character to a numeric (integer) type.
# as.is = TRUE is passed explicitly because type.convert() emits a warning
# when the argument is omitted on R >= 4.0; the converted values are the same.
dfInfant$Year <- type.convert(dfInfant$Year, as.is = TRUE)
dfInfant
## # A tibble: 28 x 5
## Indicator Year N MIN MAX
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Infant mortality rate (probability of dying be~ 2015 5.70 5.40 5.90
## 2 Number of infant deaths (thousands) 2015 23.0 NA NA
## 3 Infant mortality rate (probability of dying be~ 2014 5.80 5.70 6.00
## 4 Number of infant deaths (thousands) 2014 23.0 NA NA
## 5 Infant mortality rate (probability of dying be~ 2013 5.90 5.80 6.00
## 6 Number of infant deaths (thousands) 2013 23.0 NA NA
## 7 Infant mortality rate (probability of dying be~ 2012 6.00 5.90 6.10
## 8 Number of infant deaths (thousands) 2012 24.0 NA NA
## 9 Infant mortality rate (probability of dying be~ 2011 6.10 6.00 6.30
## 10 Number of infant deaths (thousands) 2011 25.0 NA NA
## # ... with 18 more rows
Sort the data set by Year (ascending), then by Indicator (descending)
# Order rows chronologically; within each year, indicators are sorted in
# reverse alphabetical order.
dfInfant <- dfInfant %>%
  arrange(Year, desc(Indicator))
# Plot each indicator over time in its own panel (stacked in two rows), as a
# line with point markers. Each point is annotated in red with its value,
# nudged upward by one y-axis unit so the label sits above the marker.
ggplot(data = dfInfant, mapping = aes(x = Year, y = N)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ Indicator, nrow = 2) +
  geom_text(aes(label = N), vjust = 0, nudge_y = 1, color = "red") +
  labs(y = "Numbers")
Conclusion:
The line graphs above show that both the infant mortality rate and the number of infant deaths declined over the period, although neither showed much decline during the first 4-5 years.