Data was downloaded and then looped through to create a smaller file that could be uploaded to github.
setwd("C:/Users/bkrei/Desktop/Bk's Stuff Desktop/School/Visualization Final/Files")
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(ggplot2)
library(stringr)
## Warning: package 'stringr' was built under R version 3.3.3
#To make my map I need the latitudinal and longitudinal information for each state.
state_latlong <- read.csv("https://raw.githubusercontent.com/bkreis84/Data-604---Model/master/VIS/state_latlong.csv")
colnames(state_latlong)[4] <- 'STATE'
state_latlong <- state_latlong %>%
select(STATE, lat, long, group)
dfAll <- data.frame()
# Although there is data going back to 2005, many of the metrics I am interested in only go back to 2009. I saved the very large files to my working directory and then looped through them to pull the variables of interest and append them into a dataframe
for (i in 10:15) {
year <- read.csv(paste0(2000+i, '.csv'), header = TRUE)
#some column names were uppercase and others lower, to have the loop work correctly convert to upper.
names(year) <- toupper(names(year))
year <- year %>%
select(STATE, ZIPCODE, AGI_STUB, N1, MARS2, NUMDEP, PREP, N00900, A00900, N03300, A03300, N02300, A02300, A00100, A00200, A18500, N18500)
year$yr <- 2000+i
dfAll <- rbind(dfAll, year)
}
names(dfAll) <- c('STATE', 'ZIPCODE', 'AGI_STUB', 'RETURNSCOUNT', 'JOINTCOUNT', 'NUMDEP', 'PREP', 'NUM_BUS_INCOME', 'BUS_INCOME', 'SEP_PLAN_COUNT',
'SEP_PLAN$', 'UNEMP_COMP_Count', 'UNEMP_COMP$', 'AGI', 'SAL&WAGES', 'RE_TAXES', 'NUM_RE_TAXES', 'YEAR')
#Function from https://favorableoutcomes.wordpress.com/2012/10/19/create-an-r-function-to-convert-state-codes-to-full-state-name/
stateFromLower <-function(x) {
#read 52 state codes into local variable [includes DC (Washington D.C. and PR (Puerto Rico)]
st.codes<-data.frame(
state=as.factor(c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
"HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME",
"MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM",
"NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN",
"TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")),
full=as.factor(c("alaska","alabama","arkansas","arizona","california","colorado",
"connecticut","district of columbia","delaware","florida","georgia",
"hawaii","iowa","idaho","illinois","indiana","kansas","kentucky",
"louisiana","massachusetts","maryland","maine","michigan","minnesota",
"missouri","mississippi","montana","north carolina","north dakota",
"nebraska","new hampshire","new jersey","new mexico","nevada",
"new york","ohio","oklahoma","oregon","pennsylvania","puerto rico",
"rhode island","south carolina","south dakota","tennessee","texas",
"utah","virginia","vermont","washington","wisconsin",
"west virginia","wyoming"))
)
#create an nx1 data.frame of state codes from source column
st.x<-data.frame(state=x)
#match source codes with codes from 'st.codes' local variable and use to return the full state name
refac.x<-st.codes$full[match(st.x$state,st.codes$state)]
#return the full state names in the same order in which they appeared in the original source
return(refac.x)
}
#Full data with AGI Stubs broken out
final <- dfAll %>%
group_by(YEAR, STATE, AGI_STUB)%>%
summarise_all(funs(sum)) %>%
select(STATE, YEAR, AGI_STUB, RETURNSCOUNT, NUM_BUS_INCOME, `SAL&WAGES`, UNEMP_COMP_Count, `UNEMP_COMP$`, NUM_RE_TAXES, AGI) %>%
mutate(PERC_BUSINESS_RETURN = NUM_BUS_INCOME/RETURNSCOUNT*100, UNEMP_COMP_PR = `UNEMP_COMP$`/RETURNSCOUNT*1000,
PERC_RE = NUM_RE_TAXES/RETURNSCOUNT*100, AGI_PR = AGI/RETURNSCOUNT*1000)
## Warning: package 'bindrcpp' was built under R version 3.3.3
final$PERC_BUSINESS_RETURN <- round(final$PERC_BUSINESS_RETURN, 2)
final$PERC_RE <- round(final$PERC_RE, 2)
final$UNEMP_COMP_PR <- round(final$UNEMP_COMP_PR, 2)
final$INCOME <- final$AGI_STUB
final$INCOME[final$AGI_STUB==1] <- 'Under 25k'
final$INCOME[final$AGI_STUB==2] <- '25k - 50k'
final$INCOME[final$AGI_STUB==3] <- '50k - 75k'
final$INCOME[final$AGI_STUB==4] <- '75k - 100k'
final$INCOME[final$AGI_STUB==5] <- '100k - 200k'
final$INCOME[final$AGI_STUB==6] <- '200k+'
final <- final %>%
select(STATE, YEAR, INCOME, PERC_BUSINESS_RETURN, UNEMP_COMP_PR, PERC_RE, AGI_PR)
final$STATE <- stateFromLower(final$STATE)
final$STATE <- str_to_title(final$STATE)
#Add US total so that can also be looked at on an AGI level.
usa <- dfAll %>%
#remove State
select(-c(STATE)) %>%
group_by(YEAR, AGI_STUB)%>%
summarise_all(funs(sum)) %>%
select(YEAR, AGI_STUB, RETURNSCOUNT, NUM_BUS_INCOME, `SAL&WAGES`, UNEMP_COMP_Count, `UNEMP_COMP$`, NUM_RE_TAXES, AGI) %>%
mutate(PERC_BUSINESS_RETURN = NUM_BUS_INCOME/RETURNSCOUNT*100, UNEMP_COMP_PR = `UNEMP_COMP$`/RETURNSCOUNT*1000,
PERC_RE = NUM_RE_TAXES/RETURNSCOUNT*100, AGI_PR = AGI/RETURNSCOUNT*1000)
usa$PERC_BUSINESS_RETURN <- round(usa$PERC_BUSINESS_RETURN, 2)
usa$PERC_RE <- round(usa$PERC_RE, 2)
usa$UNEMP_COMP_PR <- round(usa$UNEMP_COMP_PR, 2)
usa$INCOME <- usa$AGI_STUB
usa$INCOME[usa$AGI_STUB==1] <- 'Under 25k'
usa$INCOME[usa$AGI_STUB==2] <- '25k - 50k'
usa$INCOME[usa$AGI_STUB==3] <- '50k - 75k'
usa$INCOME[usa$AGI_STUB==4] <- '75k - 100k'
usa$INCOME[usa$AGI_STUB==5] <- '100k - 200k'
usa$INCOME[usa$AGI_STUB==6] <- '200k+'
usa$STATE <- 'United States'
usa <- usa %>%
select(STATE, YEAR, INCOME, PERC_BUSINESS_RETURN, UNEMP_COMP_PR, PERC_RE, AGI_PR)
#union to the two df so we can select data for any State and compare it to the national data
combined <- union(final, usa)
write.csv(combined, file = "final.csv", row.names = FALSE)
#Do the same as above but for state totals rather than by income groups and join it with lat/long info for the map.
state_tot <- dfAll %>%
group_by(YEAR, STATE) %>%
summarise_all(funs(sum)) %>%
select(STATE, YEAR, AGI_STUB, RETURNSCOUNT, NUM_BUS_INCOME, `SAL&WAGES`, UNEMP_COMP_Count, `UNEMP_COMP$`, NUM_RE_TAXES, AGI) %>%
mutate(PERC_BUSINESS_RETURN = NUM_BUS_INCOME/RETURNSCOUNT*100, UNEMP_COMP_PR = `UNEMP_COMP$`/RETURNSCOUNT*1000,
PERC_RE = NUM_RE_TAXES/RETURNSCOUNT*100, AGI_PR = AGI/RETURNSCOUNT*1000)
state_tot$PERC_BUSINESS_RETURN <- round(state_tot$PERC_BUSINESS_RETURN, 2)
state_tot$PERC_RE <- round(state_tot$PERC_RE, 2)
state_tot$UNEMP_COMP_PR <- round(state_tot$UNEMP_COMP_PR, 2)
state_tot <- state_tot %>%
select(STATE, YEAR, PERC_BUSINESS_RETURN, UNEMP_COMP_PR, PERC_RE, AGI_PR)
state_tot$STATE <- stateFromLower(state_tot$STATE)
state_tot <- inner_join(state_latlong, state_tot, by ="STATE")
## Warning: Column `STATE` joining factors with different levels, coercing to
## character vector
write.csv(state_tot, file = "state_tot.csv", row.names = FALSE)
#State totals without lat/long
summSt <- dfAll %>%
group_by(YEAR, STATE) %>%
summarise_all(funs(sum)) %>%
select(STATE, YEAR, AGI_STUB, RETURNSCOUNT, NUM_BUS_INCOME, `SAL&WAGES`, UNEMP_COMP_Count, `UNEMP_COMP$`, NUM_RE_TAXES, AGI) %>%
mutate(PERC_BUSINESS_RETURN = NUM_BUS_INCOME/RETURNSCOUNT*100, UNEMP_COMP_PR = `UNEMP_COMP$`/RETURNSCOUNT*1000,
PERC_RE = NUM_RE_TAXES/RETURNSCOUNT*100, AGI_PR = AGI/RETURNSCOUNT*1000)
summSt$PERC_BUSINESS_RETURN <- round(summSt$PERC_BUSINESS_RETURN, 2)
summSt$PERC_RE <- round(summSt$PERC_RE, 2)
summSt$UNEMP_COMP_PR <- round(summSt$UNEMP_COMP_PR, 2)
#change numeric values to descriptive charaters
summSt$INCOME <- summSt$AGI_STUB
summSt$INCOME[summSt$AGI_STUB==1] <- 'Under 25k'
summSt$INCOME[summSt$AGI_STUB==2] <- '25k - 50k'
summSt$INCOME[summSt$AGI_STUB==3] <- '50k - 75k'
summSt$INCOME[summSt$AGI_STUB==4] <- '75k - 100k'
summSt$INCOME[summSt$AGI_STUB==5] <- '100k - 200k'
summSt$INCOME[summSt$AGI_STUB==6] <- '200k+'
summSt <- summSt %>%
select(STATE, YEAR, PERC_BUSINESS_RETURN, UNEMP_COMP_PR, PERC_RE, AGI_PR)
summSt$STATE <- stateFromLower(summSt$STATE)
write.csv(summSt, file = "summSt.csv", row.names = FALSE)