library(dplyr)
library(readr)
library(knitr)
library(ggplot2)
library(gtable)
library(wesanderson)
getwd()
## [1] "C:/Users/Chris Iyer/Documents/R/Mortality"
# Read every CSV file in the working directory and stack them into a single
# data frame. `pattern` is a regular expression, not a shell glob, so the
# extension is matched literally and anchored to the end of the file name;
# an unanchored glob-style pattern would also pick up names such as
# "data.csv.bak".
list_file <- list.files(pattern = "\\.csv$") %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows()
# Preview the first rows of the combined data as a table.
kable(head(list_file))
measure_id | measure_name | location_id | location_name | FIPS | sex_id | sex | age_id | age_name | year_id | metric | val | upper | lower |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1980 | Years | 68.11951 | 68.23987 | 67.99309 |
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1981 | Years | 68.43191 | 68.55604 | 68.31208 |
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1982 | Years | 68.89942 | 69.01831 | 68.77751 |
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1983 | Years | 69.16916 | 69.29072 | 69.05122 |
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1984 | Years | 69.21699 | 69.33221 | 69.10116 |
26 | Life expectancy | 523 | Alabama | 1 | 1 | Male | 161 | 0 | 1985 | Years | 69.18182 | 69.30204 | 69.06820 |
Plots life expectancy by state.
# Alphabetically ordered names of the 50 US states (from the built-in
# `state.name` vector); used further down to keep only rows whose
# location is a state.
new_names <- sort(state.name, decreasing = FALSE)
print(new_names)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
dim(list_file)
## [1] 2012220 14
# Keep only sex-specific life-expectancy rows whose location is one of the
# state names in `new_names`, then round the estimate to two decimals.
x <- list_file %>%
  filter(measure_name == "Life expectancy") %>%
  filter(location_name %in% new_names) %>%
  filter(sex != "Both") %>%
  mutate(val = round(val, digits = 2))
head(x)
## measure_id measure_name location_id location_name FIPS sex_id sex
## 1 26 Life expectancy 523 Alabama 1 1 Male
## 2 26 Life expectancy 523 Alabama 1 1 Male
## 3 26 Life expectancy 523 Alabama 1 1 Male
## 4 26 Life expectancy 523 Alabama 1 1 Male
## 5 26 Life expectancy 523 Alabama 1 1 Male
## 6 26 Life expectancy 523 Alabama 1 1 Male
## age_id age_name year_id metric val upper lower
## 1 161 0 1980 Years 68.12 68.23987 67.99309
## 2 161 0 1981 Years 68.43 68.55604 68.31208
## 3 161 0 1982 Years 68.90 69.01831 68.77751
## 4 161 0 1983 Years 69.17 69.29072 69.05122
## 5 161 0 1984 Years 69.22 69.33221 69.10116
## 6 161 0 1985 Years 69.18 69.30204 69.06820
dim(x)
## [1] 3500 14
# Life expectancy over time, one line per sex, one panel per state
# (five panels per row).
yy <- ggplot(data = x, mapping = aes(x = year_id, y = val, color = sex)) +
  geom_line() +
  facet_wrap(~ location_name, ncol = 5)
yy
However, the plots above do not show how each state compares with the US average.
# National life expectancy by sex and year: the benchmark series each state
# is compared against.
USA <- list_file %>%
  filter(
    grepl("United", location_name),
    measure_name == "Life expectancy",
    sex != "Both"
  ) %>%
  select(sex, year_id, val) %>%
  rename(Sex = sex, Year = year_id, US_Average = val)

# Attach the national value to every state row by matching on sex and year.
# A keyed join does not depend on the row order of `x`. Calling rep() on a
# data frame repeats its columns as a list rather than its rows, so a
# column-bind of that result only lines up through silent recycling.
StateUS <- left_join(x, USA, by = c("sex" = "Sex", "year_id" = "Year"))
# Guard against duplicated national rows fanning out the join.
stopifnot(nrow(StateUS) == nrow(x))
class(StateUS)
## [1] "data.frame"
# Pick the plotting columns by name instead of by position, and round the
# national figure to the same precision as the state values.
New <- StateUS %>%
  select(year_id, sex, location_name, val, US_Average) %>%
  mutate(US_Average = round(US_Average, digits = 2))
head(New %>% mutate(Difference = US_Average - val))
## year_id sex location_name val US_Average Difference
## 1 1980 Male Alabama 68.12 69.99 1.87
## 2 1981 Male Alabama 68.43 70.35 1.92
## 3 1982 Male Alabama 68.90 70.77 1.87
## 4 1983 Male Alabama 69.17 71.02 1.85
## 5 1984 Male Alabama 69.22 71.16 1.94
## 6 1985 Male Alabama 69.18 71.17 1.99
# Per-state gap between the mean US average and the mean state value,
# stored as a plain data frame with the state name in `state`.
xxx <- New %>%
  group_by(location_name) %>%
  summarise(Difference = mean(US_Average) - mean(val)) %>%
  rename(state = location_name) %>%
  as.data.frame()
# The same gap computed over every row of `New` at once.
Difference_Variable <- with(New, mean(US_Average) - mean(val))
# State life expectancy over time: one line per sex, one panel per state.
# A plot can hold only one colour scale, and a later scale replaces an
# earlier one, so only the hue scale (the one that takes effect) is declared.
aa <- ggplot(New, aes(x = year_id, y = val, color = sex)) +
  geom_line(size = 1.05) +
  facet_wrap(~ location_name, ncol = 5) +
  scale_color_hue(c = 75, l = 60) +
  labs(colour = "") +
  guides(colour = guide_legend(override.aes = list(size = 3)))
# Overlay the US average as small points in the same colours. No shape or
# size aesthetic is mapped, so no shape scale or size label is needed.
aa +
  geom_point(aes(y = US_Average), alpha = 1, size = 1 / 2) +
  theme_bw() +
  theme(legend.position = "top") +
  ggtitle(
    label = "Life Expectancy by State",
    subtitle = "State Average: Solid Line\nUS Average: Dotted line "
  )