#Read in ACS data from IPUMS

usa_ddi <- read_ipums_ddi("usa_00026.xml")
acs_data <- read_ipums_micro("usa_00026.xml", data_file = ("usa_00026.csv.gz"), verbose = FALSE)

#Convert variable names to lower case
names(acs_data) <- tolower(names(acs_data))

# #View(acs_data)
# is.data.frame(acs_data)
# describe(acs_data)
#Determine how many NAs there are 
sum(is.na(acs_data))
## [1] 0
#Percent missing values per variable
apply(acs_data, 2, function(col)sum(is.na(col))/length(col))
##     year   sample   serial cbserial     hhwt  cluster statefip   strata 
##        0        0        0        0        0        0        0        0 
##       gq   pernum    perwt      age     race    raced   hispan  hispand 
##        0        0        0        0        0        0        0        0 
##  empstat empstatd uhrswork   inctot  incwage 
##        0        0        0        0        0
#Remove missing cases 
na.omit(acs_data)
## # A tibble: 392,924 × 21
##     year            sample serial cbserial  hhwt cluster statefip strata      gq
##    <int>         <int+lbl>  <dbl>    <dbl> <dbl>   <dbl> <int+lb>  <dbl> <int+l>
##  1  2016 201601 [2016 ACS] 1.15e6        1    57 2.02e12 48 [Tex…  10048 1 [Hou…
##  2  2016 201601 [2016 ACS] 1.15e6       14    30 2.02e12 48 [Tex… 320048 1 [Hou…
##  3  2016 201601 [2016 ACS] 1.15e6       14    30 2.02e12 48 [Tex… 320048 1 [Hou…
##  4  2016 201601 [2016 ACS] 1.15e6       46   191 2.02e12 48 [Tex… 460648 1 [Hou…
##  5  2016 201601 [2016 ACS] 1.15e6       53    59 2.02e12 48 [Tex… 210248 1 [Hou…
##  6  2016 201601 [2016 ACS] 1.15e6      109   299 2.02e12 48 [Tex… 590748 1 [Hou…
##  7  2016 201601 [2016 ACS] 1.15e6      144    48 2.02e12 48 [Tex… 251248 1 [Hou…
##  8  2016 201601 [2016 ACS] 1.15e6      144    48 2.02e12 48 [Tex… 251248 1 [Hou…
##  9  2016 201601 [2016 ACS] 1.15e6      153   180 2.02e12 48 [Tex… 670348 1 [Hou…
## 10  2016 201601 [2016 ACS] 1.15e6      154    52 2.02e12 48 [Tex… 520148 1 [Hou…
## # … with 392,914 more rows, and 12 more variables: pernum <dbl>, perwt <dbl>,
## #   age <int+lbl>, race <int+lbl>, raced <int+lbl>, hispan <int+lbl>,
## #   hispand <int+lbl>, empstat <int+lbl>, empstatd <int+lbl>,
## #   uhrswork <int+lbl>, inctot <dbl+lbl>, incwage <dbl+lbl>
###Recodes###

#Filter for people who usually work 30+ hours a week
fulltime_workers <- acs_data %>% 
  filter(uhrswork %in% c(30:99)) %>% 
#Condense Race and Ethnicity categories to create a new race/ethnicity variable
  mutate(race_eth=case_when(hispan %in% c(1:4) & race %in% c(1:9) ~ "Hispanic",
                             hispan == 0 & race == 1 ~ "White, non-Hispanic", 
                             hispan == 0 & race == 2 ~ "Black, non-Hispanic",
                             hispan == 0 & race == 3 ~ "AIAN, non-Hispanic",
                             hispan == 0 & race %in% c(4:6) ~ "Asian or Pacific Islander, non-Hispanic",
                             hispan == 0 & race == 7 ~ "Other, non-Hispanic",
                             hispan == 0 & race %in% c(8,9) ~ "Multiracial, non-Hispanic",
                             TRUE ~ NA_character_))

# Get median income by race/ethnicity and year
median_income_data <- fulltime_workers %>%
  aggregate(incwage ~ year + race_eth, data = ., FUN = median) %>%
  rename(median_income = incwage) %>%
  arrange(year, race_eth)

# View the data with the two new variables (race/ethnicity and poverty level)
view(median_income_data)
#Create a line chart showing median income by race and ethnicity over time

ggplot(median_income_data) +
  aes(x = year, y = median_income, color = race_eth) +
  geom_line()

#Create bar charts for each year showing median income by race and ethnicity

ggplot(median_income_data) + 
  aes(x = year, y = median_income, fill = race_eth) +
  geom_col(position = "dodge") +
  labs(x = "Year", y = "Median Income")

#Create excel file with new variables 

library(writexl)
write_xlsx(median_income_data,"C:\\Users\\kaitl\\Documents\\Every Texan\\R\\TX Median Full Time Income ACS\\Median_Full-Time_Income.xlsx")