Load any Required Packages

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(readr)
library(stringr)

Get the Data

# Read the tab-delimited death-rate extract. The file uses the text
# "Not Applicable" where no rate is available; declaring it as an NA string
# lets `rate` parse as a double without raising a parsing failure for each
# such row. The resulting NA rows are dropped in the next step.
Final_Death_Data <- read_delim(
  "~/Dropbox/Documents/SMU/CSC 360/Fall 2017 MSSA/Final Death Data.txt",
  delim = "\t",
  escape_double = FALSE,
  col_types = cols(rate = col_double()),
  na = c("", "NA", "Not Applicable"),
  trim_ws = TRUE
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 107 parsing failures.
## row # A tibble: 5 x 5 col     row   col expected         actual expected   <int> <chr>    <chr>          <chr> actual 1    23  rate a double Not Applicable file 2    24  rate a double Not Applicable row 3    47  rate a double Not Applicable col 4    48  rate a double Not Applicable expected 5    71  rate a double Not Applicable actual # ... with 1 more variables: file <chr>
## ... ................. ... ..................................... ........ ..................................... ...... ..................................... .... ..................................... ... ..................................... ... ..................................... ........ ..................................... ...... .......................................
## See problems(...) for more details.

Eliminate the NA rows and get a short name.

# Keep only the complete rows (no NA in any column) under a shorter name.
fdd <- Final_Death_Data %>%
  na.omit()

Examine the Data with summary() and glimpse()

# Per-column summary statistics of the cleaned data.
fdd %>% summary()
##     region               year        agegroup            gender         
##  Length:1496        Min.   :1999   Length:1496        Length:1496       
##  Class :character   1st Qu.:2003   Class :character   Class :character  
##  Mode  :character   Median :2007   Mode  :character   Mode  :character  
##                     Mean   :2007                                        
##                     3rd Qu.:2011                                        
##                     Max.   :2015                                        
##       rate         
##  Min.   :    8.40  
##  1st Qu.:   60.17  
##  Median :  398.40  
##  Mean   : 2219.66  
##  3rd Qu.: 1619.70  
##  Max.   :18485.00
# Compact column-by-column view: types and the first few values.
fdd %>% glimpse()
## Observations: 1,496
## Variables: 5
## $ region   <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1"...
## $ year     <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999,...
## $ agegroup <chr> "< 1 year", "< 1 year", "1-4 years", "1-4 years", "5-...
## $ gender   <chr> "F", "M", "F", "M", "F", "M", "F", "M", "F", "M", "F"...
## $ rate     <dbl> 622.5, 704.9, 23.6, 30.7, 12.5, 17.5, 35.8, 92.4, 56....

Clean up the Data

# Tidy the labels used for plotting:
#   * agegroup: drop the trailing " year" / " years" unit and store the result
#     as a factor whose levels run from youngest to oldest, so the groups are
#     ordered by age rather than alphabetically.
#   * censreg: short region labels derived from `region`.
fdd <- fdd %>%
  mutate(
    # A single anchored pattern removes only the trailing unit, leaving any
    # other "s" in a label untouched.
    agegroup = str_replace(agegroup, " years?$", ""),
    agegroup = factor(
      agegroup,
      levels = c(
        "< 1", "1-4", "5-14", "15-24", "25-34", "35-44",
        "45-54", "55-64", "65-74", "75-84", "85+"
      )
    ),
    # Labels are matched positionally to the sorted unique values of `region`
    # (presumably CENS-R1 .. CENS-R4 -- confirm against the data file).
    censreg = factor(region, labels = c("NE", "MidW", "South", "West"))
  )

Select an age group + gender and observe the change over time

# Death rate by year for males under one year old, all regions together.
ggplot(
  data = filter(fdd, agegroup == "< 1", gender == "M"),
  mapping = aes(year, rate)
) +
  geom_point()

Identify the Regions by Color

# Death rate by year for males aged 45-54, one colour per census region.
ggplot(
  data = filter(fdd, agegroup == "45-54", gender == "M"),
  mapping = aes(year, rate, colour = censreg)
) +
  geom_point()

Create a Function to Change the Parameters of the graphic.

#' Plot death rate by year, coloured by census region, for one age group and
#' gender.
#'
#' @param ag Age-group label to keep, compared with `agegroup` (e.g. "35-44").
#' @param ge Gender code to keep, compared with `gender` (e.g. "F").
#' @param data Data frame with columns `agegroup`, `gender`, `year`, `rate`
#'   and `censreg`. Defaults to `fdd` from the enclosing environment, looked
#'   up at call time, so two-argument calls keep working unchanged.
#' @return A ggplot object; it is drawn when printed.
g1 <- function(ag, ge, data = fdd) {
  title <- paste0("Death Rate per 100K by Year and Region ", ag, " ", ge)
  data %>%
    filter(agegroup == ag, gender == ge) %>%
    ggplot(aes(x = year, y = rate, color = censreg)) +
    geom_point() +
    ggtitle(title)
}
# Females aged 35-44.
g1(ag = "35-44", ge = "F")

# Males aged 5-14.
g1(ag = "5-14", ge = "M")

Select a region and year. Examine Variation by Age Group and Gender.

# Region and year to inspect.
reg <- "NE"
yr <- 2000

# Death rate by age group for the chosen region and year, one colour per
# gender; the y axis is log-scaled.
ggplot(
  data = filter(fdd, censreg == reg, year == yr),
  mapping = aes(agegroup, rate, colour = gender)
) +
  geom_point() +
  scale_y_log10()

Look at Ratios of Death Rates

We need to get male and female death rates for a given age group, region and year on the same row in our table.

# Put the female and male rates for each region / year / age group on one row,
# then compute the male-to-female rate ratio.
fdds <- fdd %>%
  spread(key = gender, value = rate) %>%
  rename(fdr = F, mdr = M) %>%
  mutate(mfratio = mdr / fdr)
fdds %>% head()
## # A tibble: 6 x 7
##    region  year agegroup censreg   fdr   mdr  mfratio
##     <chr> <int>   <fctr>  <fctr> <dbl> <dbl>    <dbl>
## 1 CENS-R1  1999      < 1      NE 622.5 704.9 1.132369
## 2 CENS-R1  1999      1-4      NE  23.6  30.7 1.300847
## 3 CENS-R1  1999     5-14      NE  12.5  17.5 1.400000
## 4 CENS-R1  1999    15-24      NE  35.8  92.4 2.581006
## 5 CENS-R1  1999    25-34      NE  56.6 124.8 2.204947
## 6 CENS-R1  1999    35-44      NE 132.5 231.8 1.749434

Look at the Ratio

We need to select a region and year, then look at the relationship between age group and the ratio.

# Region and year to inspect.
reg <- "NE"
yr <- 2000

# Male-to-female death-rate ratio by age group for that region and year.
ggplot(
  data = filter(fdds, censreg == reg, year == yr),
  mapping = aes(agegroup, mfratio)
) +
  geom_point()