R Markdown

This is the code used to analyse the birth, death and current population datasets for the United States. Forecasts were prepared out to 2052 to allow the final state populations to be determined. This was used to redistribute electoral college votes to determine the potential impact to the outcome of the presidential election.

options(scipen = 999) # disable scientific notation

library(readr)
library(dplyr)
library(data.table)
library(forecast)
library(ggplot2)
library(fpp2)
library(tidyr)

Pop2016 <- read_csv('data/2016PopLong.csv', col_names = T)
BirthData <- read_csv('data/BirthDataLong.csv', col_names = T)
# Hispanic data missing pre 1989
BirthData <- BirthData[BirthData$Year >= 1989,]

# write_csv(BirthData2014,'data/birthdata2014.csv')
# Rename 'American Indian or Alaska Native' to 'Other Minority' to match population data
BirthData$Race[BirthData$Race == 'American Indian or Alaska Native'] <- 'Other Minority'
BirthData$Race[BirthData$Race == 'Asian or Pacific Islander'] <- 'Asian and Pacific Islander'

# Return only female data columns
Pop2016Fem <- Pop2016[c('State','Year','White Female',
                        'Hispanic Female','Black Female','Other Minority Female',
                        'Asian and Pacific Islander Female','Birth_Age_Bin')]
# Pop2016Fem$Birth_Age_Bin <- as.factor(Pop2016Fem$Birth_Age_Bin)
# Pop2016Fem$State <- as.factor(Pop2016Fem$State)

# Return and manipulate birth data
BirthData2014 <- BirthData[BirthData$Year == 2014,]
Mortality1998 <- MortalityData[MortalityData$Year == 1998,]
BirthAgg <- aggregate(Pop2016Fem[c(3:7)], by = list(Pop2016Fem$State,Pop2016Fem$Birth_Age_Bin), FUN = sum)
colnames(BirthAgg)[1] <- "State"
colnames(BirthAgg)[2] <- "Birth_Age_Bin"
BirthAgg <- BirthAgg[BirthAgg$Birth_Age_Bin != 0,]
BirthAgg <- BirthAgg[order(BirthAgg$State, BirthAgg$Birth_Age_Bin),] 
BirthTotal <- aggregate(BirthAgg[c(3:7)], by = list(BirthAgg$State), FUN = sum)
colnames(BirthTotal)[1] <- "State"

write_csv(BirthData, 'data/birthdata.csv')

BirthTotal <- data.frame(State=BirthTotal$State, 
                         Fem_Pop_Tot=BirthTotal$`White Female` + 
                           BirthTotal$`Hispanic Female` +
                           BirthTotal$`Black Female` + 
                           BirthTotal$`Other Minority Female` + 
                           BirthTotal$`Asian and Pacific Islander Female`)

BirthTotal <- left_join(BirthAgg, BirthTotal, by = c("State" = "State"))

BirthTotal <- cbind(BirthTotal[1:2],
                    BirthTotal$`White Female` / BirthTotal$Fem_Pop_Tot,
                    BirthTotal$`Hispanic Female` / BirthTotal$Fem_Pop_Tot,
                    BirthTotal$`Black Female` / BirthTotal$Fem_Pop_Tot,
                    BirthTotal$`Other Minority Female` / BirthTotal$Fem_Pop_Tot,
                    BirthTotal$`Asian and Pacific Islander Female` / BirthTotal$Fem_Pop_Tot)

colnames(BirthTotal)[3] <- "White"
colnames(BirthTotal)[4] <- "Hispanic"
colnames(BirthTotal)[5] <- "Black"
colnames(BirthTotal)[6] <- "Other"
colnames(BirthTotal)[7] <- "Asian"
BirthTotalLong <- gather(BirthTotal, Race, Perc_Pop, -State, -Birth_Age_Bin)
BirthTotalLong <- BirthTotalLong[order(BirthTotalLong$State, BirthTotalLong$Race),] 
BirthRate <- merge(BirthData, BirthTotalLong, 
                   by.x = c("Birth_Age_Bin", "Race_Short"), by.y=c("Birth_Age_Bin","Race")) %>% 
  select(Year, Birth_Age_Bin, Race_Short, State, Rate_Per_Woman=`Rate per Woman`, Perc_Pop) %>% 
  mutate(adjBirthRate = Rate_Per_Woman * Perc_Pop ) %>% 
  filter(Year>=1989) %>% 
  data.table()

# aggregate by year, state
BirthRate.year_state <- BirthRate[,sum(adjBirthRate), by=c("Year","State")] %>% 
  arrange(Year, State) %>% 
  select(Year, State, CumlBirthRate=V1)


# Prepare death data

DeathData <- read_csv('data/MortalityDataLong.csv', col_names = T)
PopHist <- read_csv('data/USPopHist.csv', col_names = T)
PopVeryLong <- read_csv('data/2016PopVeryLong.csv', col_names = T)
colnames(PopVeryLong)[5] <- 'Sex'
colnames(PopVeryLong)[6] <- 'Population'
DeathMerge <- left_join(DeathData[DeathData$Year >= '1970',], PopVeryLong, by = c('Race','Sex','Mortality_Age_Bin'))

deathmerge.dt <-data.table(DeathMerge)

DeathMerge2 <- deathmerge.dt[,
                             .(Year = Year.x,State,DeathCount = sum(as.numeric(Rate) * Population)),
                             by = .(Year.x,State)]
DeathMerge2 <-DeathMerge2[,c(3:5)]
DeathMerge2 <- DeathMerge2[order(State, Year),]

DeathMerge4 <- left_join(DeathMerge2, PopHist, by = c('Year','State'))
DeathRate <- cbind(DeathMerge4[,1:2],DeathMerge4$DeathCount / DeathMerge4$Population)
colnames(DeathRate)[3] <- 'DeathRate' 

## Arima time

# split data by state
sdf <- split(BirthRate.year_state, BirthRate.year_state$State)
sdf.death <- split(DeathRate,DeathRate$State)

# get list of states and set output variable
states <- unique(BirthRate.year_state$State) %>% sort()
states.arima <- list()

# create data frames for loop function to insert into
states.fcbirth <- data.frame(character(50))
states.fcdeath <- data.frame(character(50))

# autoplot(ts(DeathRate[DeathRate$State == 'Alaska',]$DeathRate))

# run auto-arimas for each state
for (i in 1:length(states)) {
  
  # Birth data
  first_year <- min(sdf[[states[i]]]$Year)
  temp.ts <- ts(sdf[[states[i]]]$CumlBirthRate, start=first_year)
  autoplot(temp.ts) + labs(subtitle=states[i])
  temp.arima <- forecast(auto.arima(temp.ts),h=50)
  states.arima[[states[i]]] <- temp.arima
  autoplot(temp.arima) + ggtitle(states[i]) + 
    ylab("rate") + xlab('year') + 
    scale_y_continuous(labels = scales::percent)
  states.fcbirth[,i+1] <- states.arima[[i]]$mean

  # Death data
  first_year <- min(sdf.death[[states[i]]]$Year)
  temp.ts <- ts(sdf.death[[states[i]]]$DeathRate, start=first_year)
  autoplot(temp.ts) + labs(subtitle=states[i])
  temp.arima <- forecast(auto.arima(temp.ts),h=50)
  states.arima[[states[i]]] <- temp.arima
  autoplot(temp.arima) + ggtitle(states[i]) + 
    ylab("rate") + xlab('year') + 
    scale_y_continuous(labels = scales::percent)
  states.fcdeath[,i+1] <- states.arima[[i]]$mean  
  
}



# Clean up datasets and combine into one forecast table
colnames(states.fcbirth)[2:52] <- states
states.fcbirth[,1] <- (max(sdf[[states[i]]]$Year)+1):(max(sdf[[states[i]]]$Year)+50)
colnames(states.fcbirth)[1] <- 'Year'
colnames(states.fcdeath)[2:52] <- states
states.fcdeath[,1] <- (max(sdf[[states[i]]]$Year)+1):(max(sdf[[states[i]]]$Year)+50)
colnames(states.fcdeath)[1] <- 'Year'
fcBirthLong <- gather(states.fcbirth, State, BirthRate, -Year)
fcDeathLong <- gather(states.fcdeath, State, DeathRate, -Year)                   

fctable <- read_csv('data/fcTable.csv', col_names = T)
fctable <- left_join(fcTable, fcBirthLong, c('Year','State'))
fctable <- left_join(fctable, fcDeathLong, c('Year','State'))

autoplot(ts(fctable[fctable$State == 'Alaska',]$DeathRate))

autoplot(ts(diff(BirthRate.year_state[BirthRate.year_state$State == 'Alaska',]$CumlBirthRate)))

# BoxCox.lambda(ts(diff(BirthRate.year_state[BirthRate.year_state$State == 'Arizona',]$CumlBirthRate)))


# write_csv(fctable,'data/fcOutput.csv')
# write_csv(Pop2016,'data/PopData.csv')
#export data
# write_csv(BirthRate.year_state,'data/BirthRate.csv')