#Read in ACS data from IPUMS
usa_ddi <- read_ipums_ddi("usa_00026.xml")
acs_data <- read_ipums_micro("usa_00026.xml", data_file = ("usa_00026.csv.gz"), verbose = FALSE)
#Convert variable names to lower case
names(acs_data) <- tolower(names(acs_data))
# #View(acs_data)
# is.data.frame(acs_data)
# describe(acs_data)
#Determine how many NAs there are
sum(is.na(acs_data))
## [1] 0
#Percent missing values per variable
apply(acs_data, 2, function(col)sum(is.na(col))/length(col))
## year sample serial cbserial hhwt cluster statefip strata
## 0 0 0 0 0 0 0 0
## gq pernum perwt age race raced hispan hispand
## 0 0 0 0 0 0 0 0
## empstat empstatd uhrswork inctot incwage
## 0 0 0 0 0
#Remove missing cases
na.omit(acs_data)
## # A tibble: 392,924 × 21
## year sample serial cbserial hhwt cluster statefip strata gq
## <int> <int+lbl> <dbl> <dbl> <dbl> <dbl> <int+lb> <dbl> <int+l>
## 1 2016 201601 [2016 ACS] 1.15e6 1 57 2.02e12 48 [Tex… 10048 1 [Hou…
## 2 2016 201601 [2016 ACS] 1.15e6 14 30 2.02e12 48 [Tex… 320048 1 [Hou…
## 3 2016 201601 [2016 ACS] 1.15e6 14 30 2.02e12 48 [Tex… 320048 1 [Hou…
## 4 2016 201601 [2016 ACS] 1.15e6 46 191 2.02e12 48 [Tex… 460648 1 [Hou…
## 5 2016 201601 [2016 ACS] 1.15e6 53 59 2.02e12 48 [Tex… 210248 1 [Hou…
## 6 2016 201601 [2016 ACS] 1.15e6 109 299 2.02e12 48 [Tex… 590748 1 [Hou…
## 7 2016 201601 [2016 ACS] 1.15e6 144 48 2.02e12 48 [Tex… 251248 1 [Hou…
## 8 2016 201601 [2016 ACS] 1.15e6 144 48 2.02e12 48 [Tex… 251248 1 [Hou…
## 9 2016 201601 [2016 ACS] 1.15e6 153 180 2.02e12 48 [Tex… 670348 1 [Hou…
## 10 2016 201601 [2016 ACS] 1.15e6 154 52 2.02e12 48 [Tex… 520148 1 [Hou…
## # … with 392,914 more rows, and 12 more variables: pernum <dbl>, perwt <dbl>,
## # age <int+lbl>, race <int+lbl>, raced <int+lbl>, hispan <int+lbl>,
## # hispand <int+lbl>, empstat <int+lbl>, empstatd <int+lbl>,
## # uhrswork <int+lbl>, inctot <dbl+lbl>, incwage <dbl+lbl>
###Recodes###
#Filter for people who usually work 30+ hours a week
fulltime_workers <- acs_data %>%
filter(uhrswork %in% c(30:99)) %>%
#Condense Race and Ethnicity categories to create a new race/ethnicity variable
mutate(race_eth=case_when(hispan %in% c(1:4) & race %in% c(1:9) ~ "Hispanic",
hispan == 0 & race == 1 ~ "White, non-Hispanic",
hispan == 0 & race == 2 ~ "Black, non-Hispanic",
hispan == 0 & race == 3 ~ "AIAN, non-Hispanic",
hispan == 0 & race %in% c(4:6) ~ "Asian or Pacific Islander, non-Hispanic",
hispan == 0 & race == 7 ~ "Other, non-Hispanic",
hispan == 0 & race %in% c(8,9) ~ "Multiracial, non-Hispanic",
TRUE ~ NA_character_))
# Get median income by race/ethnicity and year
median_income_data <- fulltime_workers %>%
aggregate(incwage ~ year + race_eth, data = ., FUN = median) %>%
rename(median_income = incwage) %>%
arrange(year, race_eth)
# View the data with the two new variables (race/ethnicity and poverty level)
view(median_income_data)
#Create a line chart showing median income by race and ethnicity over time
ggplot(median_income_data) +
aes(x = year, y = median_income, color = race_eth) +
geom_line()

#Create bar charts for each year showing median income by race and ethnicity
ggplot(median_income_data) +
aes(x = year, y = median_income, fill = race_eth) +
geom_col(position = "dodge") +
labs(x = "Year", y = "Median Income")

#Create excel file with new variables
library(writexl)
write_xlsx(median_income_data,"C:\\Users\\kaitl\\Documents\\Every Texan\\R\\TX Median Full Time Income ACS\\Median_Full-Time_Income.xlsx")