library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(readr)
library(stringr)
# Load the tab-delimited death-rate table, parsing `rate` as a double.
# Some rows carry the literal text "Not Applicable" in `rate`; declaring that
# text as a missing-value marker turns those cells into NA up front instead of
# triggering a parsing failure for each of them. "" and "NA" are kept so the
# reader's default missing-value markers still apply.
Final_Death_Data <- read_delim(
  "~/Dropbox/Documents/SMU/CSC 360/Fall 2017 MSSA/Final Death Data.txt",
  "\t",
  escape_double = FALSE,
  col_types = cols(rate = col_double()),
  na = c("", "NA", "Not Applicable"),
  trim_ws = TRUE
)
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 107 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 23 rate a double Not Applicable file 2 24 rate a double Not Applicable row 3 47 rate a double Not Applicable col 4 48 rate a double Not Applicable expected 5 71 rate a double Not Applicable actual # ... with 1 more variables: file <chr>
## ... ................. ... ..................................... ........ ..................................... ...... ..................................... .... ..................................... ... ..................................... ... ..................................... ........ ..................................... ...... .......................................
## See problems(...) for more details.
# Keep only the rows that have no missing values, then print a per-column
# summary of what is left.
fdd <- na.omit(Final_Death_Data)
summary(fdd)
## region year agegroup gender
## Length:1496 Min. :1999 Length:1496 Length:1496
## Class :character 1st Qu.:2003 Class :character Class :character
## Mode :character Median :2007 Mode :character Mode :character
## Mean :2007
## 3rd Qu.:2011
## Max. :2015
## rate
## Min. : 8.40
## 1st Qu.: 60.17
## Median : 398.40
## Mean : 2219.66
## 3rd Qu.: 1619.70
## Max. :18485.00
# Print a compact overview of fdd: one line per column with its type and
# leading values.
glimpse(fdd)
## Observations: 1,496
## Variables: 5
## $ region <chr> "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1", "CENS-R1"...
## $ year <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999,...
## $ agegroup <chr> "< 1 year", "< 1 year", "1-4 years", "1-4 years", "5-...
## $ gender <chr> "F", "M", "F", "M", "F", "M", "F", "M", "F", "M", "F"...
## $ rate <dbl> 622.5, 704.9, 23.6, 30.7, 12.5, 17.5, 35.8, 92.4, 56....
# Normalise the age-group labels and add a short census-region factor.
fdd <- fdd %>%
  mutate(
    # Strip the trailing " year" / " years" unit in a single anchored step, so
    # only the suffix is removed (e.g. "1-4 years" -> "1-4", "< 1 year" -> "< 1").
    agegroup = str_replace(agegroup, " years?$", ""),
    # Levels are listed youngest to oldest so the factor sorts by age rather
    # than alphabetically; any label not listed here becomes NA.
    agegroup = factor(
      agegroup,
      levels = c(
        "< 1", "1-4", "5-14", "15-24", "25-34", "35-44",
        "45-54", "55-64", "65-74", "75-84", "85+"
      )
    ),
    # The labels are assigned, in order, to the sorted unique region codes.
    # NOTE(review): assumes exactly four codes that sort as NE, MidW, South,
    # West (only "CENS-R1" is visible in the printed data) -- confirm.
    censreg = factor(region, labels = c("NE", "MidW", "South", "West"))
  )
# Scatter plot of rate against year for age group "< 1", gender "M".
ggplot(
  filter(fdd, agegroup == "< 1", gender == "M"),
  aes(x = year, y = rate)
) +
  geom_point()
# Scatter plot of rate against year for age group "45-54", gender "M",
# with one colour per census region.
ggplot(
  filter(fdd, agegroup == "45-54", gender == "M"),
  aes(x = year, y = rate, color = censreg)
) +
  geom_point()
#' Plot rate by year for one age group and gender, coloured by region.
#'
#' @param ag Age-group label to keep; compared with the `agegroup` column.
#' @param ge Gender code to keep; compared with the `gender` column.
#' @param data Data frame providing the `agegroup`, `gender`, `year`, `rate`
#'   and `censreg` columns. Defaults to the `fdd` object found from the
#'   function's enclosing environment, so existing two-argument calls behave
#'   exactly as before.
#' @return A ggplot scatter plot of `rate` against `year`, titled with the
#'   selected age group and gender.
g1 <- function(ag, ge, data = fdd) {
  plot_title <- paste0("Death Rate per 100K by Year and Region ", ag, " ", ge)
  data %>%
    filter(agegroup == ag, gender == ge) %>%
    ggplot(aes(x = year, y = rate, color = censreg)) +
    geom_point() +
    ggtitle(plot_title)
}
# Draw the by-year plot for two age-group / gender combinations.
g1(ag = "35-44", ge = "F")
g1(ag = "5-14", ge = "M")
# Region and year to inspect.
reg <- "NE"
yr <- 2000
# Rate by age group for that region and year, one colour per gender, drawn on
# a log10 y axis.
ggplot(
  filter(fdd, censreg == reg, year == yr),
  aes(x = agegroup, y = rate, color = gender)
) +
  geom_point() +
  scale_y_log10()
We need to get male and female death rates for a given age group, region and year on the same row in our table.
# Reshape to one row per region / year / age group: the gender values become
# columns holding the rate, renamed to fdr (F) and mdr (M), and mfratio is
# their quotient mdr / fdr.
fdds <- fdd %>%
  spread(key = gender, value = rate) %>%
  rename(fdr = `F`, mdr = `M`) %>%
  mutate(mfratio = mdr / fdr)
# Show the first rows of the reshaped table.
head(fdds)
## # A tibble: 6 x 7
## region year agegroup censreg fdr mdr mfratio
## <chr> <int> <fctr> <fctr> <dbl> <dbl> <dbl>
## 1 CENS-R1 1999 < 1 NE 622.5 704.9 1.132369
## 2 CENS-R1 1999 1-4 NE 23.6 30.7 1.300847
## 3 CENS-R1 1999 5-14 NE 12.5 17.5 1.400000
## 4 CENS-R1 1999 15-24 NE 35.8 92.4 2.581006
## 5 CENS-R1 1999 25-34 NE 56.6 124.8 2.204947
## 6 CENS-R1 1999 35-44 NE 132.5 231.8 1.749434
We need to select a region and a year, then look at how the male-to-female death-rate ratio varies across age groups.
# Region and year to inspect.
reg <- "NE"
yr <- 2000
# mfratio by age group for that region and year.
ggplot(
  filter(fdds, censreg == reg, year == yr),
  aes(x = agegroup, y = mfratio)
) +
  geom_point()