# install.packages("haven")
library(haven)
# install.packages("dplyr")
library(dplyr)
# install.packages("tidyverse")
library(tidyverse)
haven
libraryLet’s import BRFSS survey data to R. The .xpt file is an open format, as is required for submission of the data to the FDA.
Look at the documentation for the variable columns to identify the variables of interests: https://www.cdc.gov/brfss/annual_data/2020/llcp_varlayout_20_onecolumn.html and, https://www.cdc.gov/brfss/annual_data/2020/pdf/2020-calculated-variables-version4-508.pdf
There are at least two common ways of shortlisting named columns, one by specifying the column number, the other by specifying the exact variable names.
# specify column number
df1_Mini <- df1[,c(1,2,5,10,51)] # random numbers for demonstration
# specifyvariable name
df1_Mini1 <- df1[,c("SEXVAR", "MEDCOST", "GENHLTH", "PHYSHLTH", "ASTHMA3")]
# specify variable name using the dplyr library
df_Mini2 <- df1 %>%
select(SEXVAR, MEDCOST, GENHLTH, PHYSHLTH, ASTHMA3)
Check if you can filter the dataset on the (CDC platform)https://data.cdc.gov/Environmental-Health-Toxicology/Daily-PM2-5-Concentrations-All-County-2001-2016/7vdq-ztk9/data. Go to Filter
, then Add a New Filter Condition based on year
and date
. Download the filtered dataset.
Alternatively, you can use the following codes to subset a large dataset based on variable attributes.
air_data <- read.csv("Daily_PM2.5_County_2001-2016.csv")
# a huge dataset, took a while to load up!
air_data_2016 <- subset(air_data, air_data$year == "2016" &
air_data$date =="01JAN2016")
# observations reduced to 3109
summary(air_data_2016)
## year date statefips countyfips
## Min. :2016 Length:3109 Min. : 1.00 Min. : 1.0
## 1st Qu.:2016 Class :character 1st Qu.:19.00 1st Qu.: 35.0
## Median :2016 Mode :character Median :29.00 Median : 79.0
## Mean :2016 Mean :30.58 Mean :103.4
## 3rd Qu.:2016 3rd Qu.:46.00 3rd Qu.:133.0
## Max. :2016 Max. :56.00 Max. :840.0
## PM25_max_pred PM25_med_pred PM25_mean_pred PM25_pop_pred
## Min. : 0.3444 Min. : 0.2332 Min. : 0.2568 Min. : 0.2479
## 1st Qu.: 6.0805 1st Qu.: 5.7461 1st Qu.: 5.7229 1st Qu.: 5.7428
## Median : 8.1110 Median : 7.7338 Median : 7.7378 Median : 7.7421
## Mean : 8.5758 Mean : 7.9959 Mean : 7.9844 Mean : 7.9959
## 3rd Qu.: 10.1538 3rd Qu.: 9.7228 3rd Qu.: 9.7107 3rd Qu.: 9.7188
## Max. :103.2905 Max. :75.6489 Max. :69.0241 Max. :67.7220
A county FIPS code is a 5-digit number that uniquely identifies each county in the United States. It is a concatentation of - reading from left to right - the 2-digit state code and the 3-digit county code.
# the number between %__d represent the number of digits for that variable
# and how many zeros you can pad a string with.
air_data_2016$statefips1 <- sprintf("%02d", air_data_2016$statefips)
str(air_data_2016$statefips1)
## chr [1:3109] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" ...
air_data_2016$countyfips1 <- sprintf("%03d", air_data_2016$countyfips)
str(air_data_2016$countyfips1)
## chr [1:3109] "001" "003" "005" "007" "009" "011" "013" "015" "017" "019" ...
# Create a combined state and county FIPS code into one
air_data_2016$FIPS <- paste(air_data_2016$statefips1, air_data_2016$countyfips1, sep="")
str(air_data_2016$FIPS) # format as character
## chr [1:3109] "01001" "01003" "01005" "01007" "01009" "01011" "01013" ...
gsub
As part of the data cleaning process, you will notice some unwanted characters in your dataset. We can alter the entire column using gsub
. Here, we only want to keep the last five FIPS digits by removing “0500000US” from CRE_data$GEO_ID
. we also create and assign a new variable called CRE_data$FIPS
.
Let’s load up another data called “Community Resilience Estimates” https://www.census.gov/programs-surveys/community-resilience-estimates/data/supplement.html
CRE_data <- read.csv("CRE_Equity_County_19.csv")
CRE_data$FIPS <- CRE_data$GEO_ID %>%
stringr::str_remove(pattern = "0500000US")
str(CRE_data$FIPS)
## chr [1:3142] "01001" "01003" "01005" "01007" "01009" "01011" "01013" ...
Now, we have two datasets (CRE_data and air_data_2016) that share a common variable called FIPS
. We can merge the two datasets based on this common column at the county level.
The two key decisions you need to consider are (1) which dataset A (say, air_data_2016) you want to keep, and (2) which variables from the other dataset B (say, CRE_data) you want to merge with dataset A.
# if you want to merge all variables together
new_data_all <- merge(air_data_2016, CRE_data,
by = "FIPS", all.x = TRUE)
# since not all variables in CRE_data are relevant, let's choose a few to work with.
new_data_small <- merge(air_data_2016, CRE_data[c("FIPS","POPUNI","PRED0_PE",
"NH_White_alone_PE",
"GINI_IND_Inequality_E",
"No_Health_Ins_PE")],
by = "FIPS", all.x = TRUE)