Clean Data

Places CDC

library(readr)

places_23 <- read_csv("PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2023_release_20240321.csv")

## Rows: 2555113 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): StateAbbr, StateDesc, CountyName, CountyFIPS, LocationName, DataSo...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

places_22 <- read_csv("PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2022_release_20240325.csv")

## Rows: 2161543 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): StateAbbr, StateDesc, CountyName, CountyFIPS, LocationName, DataSo...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

places_21 <- read_csv("PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2021_release_20240325.csv")

## Rows: 2125561 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): StateAbbr, StateDesc, CountyName, CountyFIPS, LocationName, DataSo...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

places_20 <- read_csv("PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2020_release_20240325.csv")

## Rows: 2024865 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): StateAbbr, StateDesc, CountyName, CountyFIPS, LocationName, DataSo...
## dbl  (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl  (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(places_23)

## # A tibble: 6 × 23
##    Year StateAbbr StateDesc CountyName CountyFIPS LocationName DataSource
##   <dbl> <chr>     <chr>     <chr>      <chr>      <chr>        <chr>     
## 1  2021 AL        Alabama   Baldwin    01003      01003011300  BRFSS     
## 2  2021 AL        Alabama   Chambers   01017      01017954000  BRFSS     
## 3  2021 AL        Alabama   Cleburne   01029      01029959600  BRFSS     
## 4  2021 AL        Alabama   Covington  01039      01039962000  BRFSS     
## 5  2021 AL        Alabama   Fayette    01057      01057020400  BRFSS     
## 6  2021 AL        Alabama   Houston    01069      01069040301  BRFSS     
## # ℹ 16 more variables: Category <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   Data_Value_Type <chr>, Data_Value <dbl>, Data_Value_Footnote_Symbol <lgl>,
## #   Data_Value_Footnote <lgl>, Low_Confidence_Limit <dbl>,
## #   High_Confidence_Limit <dbl>, TotalPopulation <dbl>, Geolocation <chr>,
## #   LocationID <chr>, CategoryID <chr>, MeasureId <chr>, DataValueTypeID <chr>,
## #   Short_Question_Text <chr>

head(places_22)

## # A tibble: 6 × 23
##    Year StateAbbr StateDesc CountyName CountyFIPS LocationName DataSource
##   <dbl> <chr>     <chr>     <chr>      <chr>      <chr>        <chr>     
## 1  2020 AL        Alabama   Barbour    01005      01005950800  BRFSS     
## 2  2020 AL        Alabama   Bibb       01007      01007010002  BRFSS     
## 3  2020 AL        Alabama   Calhoun    01015      01015000900  BRFSS     
## 4  2020 AL        Alabama   Calhoun    01015      01015002400  BRFSS     
## 5  2020 AL        Alabama   Cherokee   01019      01019955900  BRFSS     
## 6  2020 AL        Alabama   Chilton    01021      01021060401  BRFSS     
## # ℹ 16 more variables: Category <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   Data_Value_Type <chr>, Data_Value <dbl>, Data_Value_Footnote_Symbol <lgl>,
## #   Data_Value_Footnote <lgl>, Low_Confidence_Limit <dbl>,
## #   High_Confidence_Limit <dbl>, TotalPopulation <dbl>, Geolocation <chr>,
## #   LocationID <chr>, CategoryID <chr>, MeasureId <chr>, DataValueTypeID <chr>,
## #   Short_Question_Text <chr>

head(places_21)

## # A tibble: 6 × 23
##    Year StateAbbr StateDesc CountyName CountyFIPS LocationName DataSource
##   <dbl> <chr>     <chr>     <chr>      <chr>      <chr>        <chr>     
## 1  2018 AL        Alabama   Baldwin    01003      01003011601  BRFSS     
## 2  2019 ME        Maine     Waldo      23027      23027045000  BRFSS     
## 3  2019 AL        Alabama   Calhoun    01015      01015002300  BRFSS     
## 4  2019 LA        Louisiana Tangipahoa 22105      22105953500  BRFSS     
## 5  2019 ME        Maine     Cumberland 23005      23005014000  BRFSS     
## 6  2018 LA        Louisiana St. Mary   22101      22101041100  BRFSS     
## # ℹ 16 more variables: Category <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   Data_Value_Type <chr>, Data_Value <dbl>, Data_Value_Footnote_Symbol <lgl>,
## #   Data_Value_Footnote <lgl>, Low_Confidence_Limit <dbl>,
## #   High_Confidence_Limit <dbl>, TotalPopulation <dbl>, Geolocation <chr>,
## #   LocationID <chr>, CategoryID <chr>, MeasureId <chr>, DataValueTypeID <chr>,
## #   Short_Question_Text <chr>

head(places_20)

## # A tibble: 6 × 23
##    Year StateAbbr StateDesc CountyName CountyFIPS LocationName DataSource
##   <dbl> <chr>     <chr>     <chr>      <chr>      <chr>        <chr>     
## 1  2018 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## 2  2018 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## 3  2018 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## 4  2017 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## 5  2017 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## 6  2018 AL        Alabama   Autauga    01001      01001020100  BRFSS     
## # ℹ 16 more variables: Category <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   Data_Value_Type <chr>, Data_Value <dbl>, Data_Value_Footnote_Symbol <lgl>,
## #   Data_Value_Footnote <lgl>, Low_Confidence_Limit <dbl>,
## #   High_Confidence_Limit <dbl>, TotalPopulation <dbl>, Geolocation <chr>,
## #   LocationID <chr>, CategoryID <chr>, MeasureId <chr>, DataValueTypeID <chr>,
## #   Short_Question_Text <chr>

table(places_23$Year)

## 
##    2020    2021 
##  578125 1976988

table(places_22$Year)

## 
##    2019    2020 
##  281352 1880191

table(places_21$Year)

## 
##    2018    2019 
##  578125 1547436

table(places_20$Year)

## 
##    2017    2018 
##  289348 1735517

library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(purrr)

data_21 <- places_23 %>%
  filter(Year == "2021") 

data_20 <- places_22 %>% 
  filter(Year == "2020")

data_19 <- places_21 %>% 
  filter(Year == "2019")

data_18 <- places_20 %>% 
  filter(Year == "2018")

data_17 <- places_20 %>% 
  filter(Year == "2017")








###### 2017 data ##### : Hypertension, High Cholesterol


table(data_17$Short_Question_Text)

## 
## Cholesterol Screening   High Blood Pressure      High Cholesterol 
##                 72337                 72337                 72337 
##  Taking BP Medication 
##                 72337

table(data_17$MeasureId)

## 
##     BPHIGH      BPMED CHOLSCREEN   HIGHCHOL 
##      72337      72337      72337      72337

#### 1. Create the hypertension data frame

hypertension_data_17 <- data_17 %>%
  filter(Short_Question_Text == "High Blood Pressure")  %>%
  rename(hypertension = Data_Value) 


#### 2. Create the High Cholesterol data frame

high_cholesterol_data_17 <- data_17 %>%
  filter(Short_Question_Text == "High Cholesterol") %>%
  rename(high_cholesterol = Data_Value) %>% 
  select(high_cholesterol, LocationID)### rename the variable and Keep the two variable, which are required variable and the merge variable


#### 3. Merge data

data_list_17 <- list(hypertension_data_17, high_cholesterol_data_17) ### generate the new data list for merging

data_2017 <- reduce(data_list_17, left_join, by = "LocationID") ### merge data by the LocationID variable

data_2017 <- data_2017 %>%
  select(Year, StateAbbr, StateDesc, CountyName, CountyFIPS, LocationID, hypertension, high_cholesterol, Data_Value_Unit, TotalPopulation, Short_Question_Text)












###### 2018 data ##### : Cancer, Diabetes, Obesity

#### 1. Check Data

table(data_18$Short_Question_Text)

## 
##                           Annual Checkup 
##                                    72337 
##                                Arthritis 
##                                    72337 
##                           Binge Drinking 
##                                    72337 
##                     Cancer (except skin) 
##                                    72337 
##                Cervical Cancer Screening 
##                                    72320 
##                   Chronic Kidney Disease 
##                                    72337 
##              Colorectal Cancer Screening 
##                                    72305 
##                                     COPD 
##                                    72337 
##   Core preventive services for older men 
##                                    72193 
## Core preventive services for older women 
##                                    72142 
##                   Coronary Heart Disease 
##                                    72337 
##                           Current Asthma 
##                                    72337 
##                          Current Smoking 
##                                    72337 
##                             Dental Visit 
##                                    72337 
##                                 Diabetes 
##                                    72337 
##                         Health Insurance 
##                                    72337 
##                              Mammography 
##                                    72253 
##                            Mental Health 
##                                    72337 
##                                  Obesity 
##                                    72337 
##                          Physical Health 
##                                    72337 
##                      Physical Inactivity 
##                                    72337 
##                           Sleep <7 hours 
##                                    72337 
##                                   Stroke 
##                                    72337 
##                               Teeth Loss 
##                                    72238

table(data_18$MeasureId)

## 
##      ACCESS2    ARTHRITIS        BINGE       CANCER      CASTHMA     CERVICAL 
##        72337        72337        72337        72337        72337        72320 
##          CHD      CHECKUP COLON_SCREEN         COPD        COREM        COREW 
##        72337        72337        72305        72337        72193        72142 
##     CSMOKING       DENTAL     DIABETES       KIDNEY          LPA     MAMMOUSE 
##        72337        72337        72337        72337        72337        72253 
##        MHLTH      OBESITY        PHLTH        SLEEP       STROKE    TEETHLOST 
##        72337        72337        72337        72337        72337        72238

#### 2. Generate new data
cancer_data_18 <- data_18 %>%
  filter(Short_Question_Text == "Cancer (except skin)") %>%
  rename(cancer = Data_Value) 


create_data_frame <- function(data, question_text, new_var_name) {
  data %>%
    filter(Short_Question_Text == question_text) %>%
    rename(!!new_var_name := Data_Value) %>%
    select(!!new_var_name, LocationID)
}

diabetes_data_18 <- create_data_frame(data_18, "Diabetes", "diabetes")
obesity_data_18 <- create_data_frame(data_18, "Obesity", "obesity")


#### 3. Merge data

data_list_18 <- list(cancer_data_18, diabetes_data_18, obesity_data_18) ### generate the new data list for merging

data_2018 <- reduce(data_list_18, left_join, by = "LocationID") ### merge data by the LocationID variable

data_2018 <- data_2018 %>%
  select(Year, StateAbbr, StateDesc, CountyName, CountyFIPS, LocationID, cancer, diabetes, obesity, Data_Value_Unit, TotalPopulation, Short_Question_Text)












###### 2019 data ##### : Cancer, Diabetes, Hypertension, High Cholesterol, Obesity

#### 1. Check Data

table(data_19$Short_Question_Text)

## 
##         Annual Checkup              Arthritis         Binge Drinking 
##                  70338                  70338                  70338 
##   Cancer (except skin)  Cholesterol Screening Chronic Kidney Disease 
##                  70338                  70338                  70338 
##                   COPD Coronary Heart Disease         Current Asthma 
##                  70338                  70338                  70338 
##        Current Smoking             Depression               Diabetes 
##                  70338                  70338                  70338 
##         General Health       Health Insurance    High Blood Pressure 
##                  70338                  70338                  70338 
##       High Cholesterol          Mental Health                Obesity 
##                  70338                  70338                  70338 
##        Physical Health    Physical Inactivity                 Stroke 
##                  70338                  70338                  70338 
##   Taking BP Medication 
##                  70338

table(data_19$MeasureId)

## 
##    ACCESS2  ARTHRITIS      BINGE     BPHIGH      BPMED     CANCER    CASTHMA 
##      70338      70338      70338      70338      70338      70338      70338 
##        CHD    CHECKUP CHOLSCREEN       COPD   CSMOKING DEPRESSION   DIABETES 
##      70338      70338      70338      70338      70338      70338      70338 
##      GHLTH   HIGHCHOL     KIDNEY        LPA      MHLTH    OBESITY      PHLTH 
##      70338      70338      70338      70338      70338      70338      70338 
##     STROKE 
##      70338

#### 2. Generate new data
cancer_data_19 <- data_19 %>%
  filter(Short_Question_Text == "Cancer (except skin)") %>%
  rename(cancer = Data_Value) 


create_data_frame <- function(data, question_text, new_var_name) {
  data %>%
    filter(Short_Question_Text == question_text) %>%
    rename(!!new_var_name := Data_Value) %>%
    select(!!new_var_name, LocationID)
}

diabetes_data_19 <- create_data_frame(data_19, "Diabetes", "diabetes")
hypertension_data_19 <- create_data_frame(data_19, "High Blood Pressure", "hypertension")
high_cholesterol_data_19 <- create_data_frame(data_19, "High Cholesterol", "high_cholesterol")
obesity_data_19 <- create_data_frame(data_19, "Obesity", "obesity")


#### 3. Merge data

data_list_19 <- list(cancer_data_19, diabetes_data_19, hypertension_data_19, high_cholesterol_data_19, obesity_data_19) ### generate the new data list for merging

data_2019 <- reduce(data_list_19, left_join, by = "LocationID") ### merge data by the LocationID variable

data_2019 <- data_2019 %>%
  select(Year, StateAbbr, StateDesc, CountyName, CountyFIPS, LocationID, cancer, diabetes, hypertension, high_cholesterol, obesity, Data_Value_Unit, TotalPopulation, Short_Question_Text)













###### 2020 data ##### : Cancer, Diabetes, Obesity

#### 1. Check Data

table(data_20$Short_Question_Text)

## 
##                           All Teeth Lost 
##                                    72238 
##                           Annual Checkup 
##                                    72337 
##                                Arthritis 
##                                    72337 
##                           Binge Drinking 
##                                    72337 
##                     Cancer (except skin) 
##                                    72337 
##                Cervical Cancer Screening 
##                                    72320 
##                   Chronic Kidney Disease 
##                                    72337 
##              Colorectal Cancer Screening 
##                                    72305 
##                                     COPD 
##                                    72337 
##   Core preventive services for older men 
##                                    72193 
## Core preventive services for older women 
##                                    72142 
##                   Coronary Heart Disease 
##                                    72337 
##                           Current Asthma 
##                                    72337 
##                          Current Smoking 
##                                    72337 
##                             Dental Visit 
##                                    72337 
##                               Depression 
##                                    72337 
##                                 Diabetes 
##                                    72337 
##                           General Health 
##                                    72337 
##                         Health Insurance 
##                                    72337 
##                              Mammography 
##                                    72253 
##                            Mental Health 
##                                    72337 
##                                  Obesity 
##                                    72337 
##                          Physical Health 
##                                    72337 
##                      Physical Inactivity 
##                                    72337 
##                           Sleep <7 hours 
##                                    72337 
##                                   Stroke 
##                                    72337

table(data_20$MeasureId)

## 
##      ACCESS2    ARTHRITIS        BINGE       CANCER      CASTHMA     CERVICAL 
##        72337        72337        72337        72337        72337        72320 
##          CHD      CHECKUP COLON_SCREEN         COPD        COREM        COREW 
##        72337        72337        72305        72337        72193        72142 
##     CSMOKING       DENTAL   DEPRESSION     DIABETES        GHLTH       KIDNEY 
##        72337        72337        72337        72337        72337        72337 
##          LPA     MAMMOUSE        MHLTH      OBESITY        PHLTH        SLEEP 
##        72337        72253        72337        72337        72337        72337 
##       STROKE    TEETHLOST 
##        72337        72238

#### 2. Generate new data
cancer_data_20 <- data_20 %>%
  filter(Short_Question_Text == "Cancer (except skin)") %>%
  rename(cancer = Data_Value) 


create_data_frame <- function(data, question_text, new_var_name) {
  data %>%
    filter(Short_Question_Text == question_text) %>%
    rename(!!new_var_name := Data_Value) %>%
    select(!!new_var_name, LocationID)
}

diabetes_data_20 <- create_data_frame(data_20, "Diabetes", "diabetes")
obesity_data_20 <- create_data_frame(data_20, "Obesity", "obesity")


#### 3. Merge data

data_list_20 <- list(cancer_data_20, diabetes_data_20, obesity_data_20) ### generate the new data list for merging

data_2020 <- reduce(data_list_20, left_join, by = "LocationID") ### merge data by the LocationID variable

data_2020 <- data_2020 %>%
  select(Year, StateAbbr, StateDesc, CountyName, CountyFIPS, LocationID, cancer, diabetes, obesity, Data_Value_Unit, TotalPopulation, Short_Question_Text)












###### 2021 data ##### : Cancer, Diabetes, Hypertension, High Cholesterol, Obesity

#### 1. Check Data

table(data_21$Short_Question_Text)

## 
##                Annual Checkup                Any Disability 
##                         68172                         68172 
##                     Arthritis                Binge Drinking 
##                         68172                         68172 
##          Cancer (except skin)         Cholesterol Screening 
##                         68172                         68172 
##        Chronic Kidney Disease          Cognitive Disability 
##                         68172                         68172 
##                          COPD        Coronary Heart Disease 
##                         68172                         68172 
##                Current Asthma               Current Smoking 
##                         68172                         68172 
##                    Depression                      Diabetes 
##                         68172                         68172 
##                General Health              Health Insurance 
##                         68172                         68172 
##            Hearing Disability           High Blood Pressure 
##                         68172                         68172 
##              High Cholesterol Independent Living Disability 
##                         68172                         68172 
##                 Mental Health           Mobility Disability 
##                         68172                         68172 
##                       Obesity               Physical Health 
##                         68172                         68172 
##           Physical Inactivity          Self-care Disability 
##                         68172                         68172 
##                        Stroke          Taking BP Medication 
##                         68172                         68172 
##             Vision Disability 
##                         68172

table(data_21$MeasureId)

## 
##    ACCESS2  ARTHRITIS      BINGE     BPHIGH      BPMED     CANCER    CASTHMA 
##      68172      68172      68172      68172      68172      68172      68172 
##        CHD    CHECKUP CHOLSCREEN  COGNITION       COPD   CSMOKING DEPRESSION 
##      68172      68172      68172      68172      68172      68172      68172 
##   DIABETES DISABILITY      GHLTH    HEARING   HIGHCHOL  INDEPLIVE     KIDNEY 
##      68172      68172      68172      68172      68172      68172      68172 
##        LPA      MHLTH   MOBILITY    OBESITY      PHLTH   SELFCARE     STROKE 
##      68172      68172      68172      68172      68172      68172      68172 
##     VISION 
##      68172

#### 2. Generate new data
cancer_data_21 <- data_21 %>%
  filter(Short_Question_Text == "Cancer (except skin)") %>%
  rename(cancer = Data_Value) 


create_data_frame <- function(data, question_text, new_var_name) {
  data %>%
    filter(Short_Question_Text == question_text) %>%
    rename(!!new_var_name := Data_Value) %>%
    select(!!new_var_name, LocationID)
}

diabetes_data_21 <- create_data_frame(data_21, "Diabetes", "diabetes")
hypertension_data_21 <- create_data_frame(data_21, "High Blood Pressure", "hypertension")
high_cholesterol_data_21 <- create_data_frame(data_21, "High Cholesterol", "high_cholesterol")
obesity_data_21 <- create_data_frame(data_21, "Obesity", "obesity")


#### 3. Merge data

data_list_21 <- list(cancer_data_21, diabetes_data_21, hypertension_data_21, high_cholesterol_data_21, obesity_data_21) ### generate the new data list for merging

data_2021 <- reduce(data_list_21, left_join, by = "LocationID") ### merge data by the LocationID variable

data_2021 <- data_2021 %>%
  select(Year, StateAbbr, StateDesc, CountyName, CountyFIPS, LocationID, cancer, diabetes, hypertension, high_cholesterol, obesity, Data_Value_Unit, TotalPopulation, Short_Question_Text)


merged_data <- bind_rows(data_2017, data_2018, data_2019, data_2020, data_2021)


summary(merged_data)

##       Year       StateAbbr          StateDesc          CountyName       
##  Min.   :2017   Length:355521      Length:355521      Length:355521     
##  1st Qu.:2018   Class :character   Class :character   Class :character  
##  Median :2019   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2019                                                           
##  3rd Qu.:2020                                                           
##  Max.   :2021                                                           
##                                                                         
##   CountyFIPS         LocationID         hypertension    high_cholesterol
##  Length:355521      Length:355521      Min.   : 4.80    Min.   : 6.20   
##  Class :character   Class :character   1st Qu.:27.50    1st Qu.:30.50   
##  Mode  :character   Mode  :character   Median :31.80    Median :33.50   
##                                        Mean   :32.31    Mean   :33.16   
##                                        3rd Qu.:36.60    3rd Qu.:36.30   
##                                        Max.   :73.30    Max.   :57.30   
##                                        NA's   :144674   NA's   :144674  
##  Data_Value_Unit    TotalPopulation Short_Question_Text     cancer     
##  Length:355521      Min.   :   56   Length:355521       Min.   : 0.50  
##  Class :character   1st Qu.: 2908   Class :character    1st Qu.: 5.20  
##  Mode  :character   Median : 4015   Mode  :character    Median : 6.50  
##                     Mean   : 4265                       Mean   : 6.47  
##                     3rd Qu.: 5330                       3rd Qu.: 7.60  
##                     Max.   :37452                       Max.   :20.90  
##                                                         NA's   :72337  
##     diabetes        obesity     
##  Min.   : 0.60   Min.   :10.70  
##  1st Qu.: 8.50   1st Qu.:28.10  
##  Median :10.40   Median :33.00  
##  Mean   :10.98   Mean   :33.01  
##  3rd Qu.:12.80   3rd Qu.:37.60  
##  Max.   :46.10   Max.   :63.90  
##  NA's   :72337   NA's   :72337

write.csv(merged_data, "data.csv", row.names = FALSE)

Clean Data

Jianfeng He

2024-11-06

Places CDC