# Set the `repos` option, i.e. the repository URL R consults when installing packages.
options(repos = "https://cran.rstudio.com/")
The metadata was created on November 10, 2020 and updated on August 26, 2023. The dataset is published by the Centers for Disease Control and Prevention and was obtained from https://catalog.data.gov/dataset/u-s-chronic-disease-indicators-cdi. [3][4]
library(knitr)
library(kableExtra)
library(visdat)
library(naniar)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the raw Chronic Disease Indicators export from the working directory
# and preview its first five records.
data_orig <- read.csv(file = "ChronicDisease.csv")
head(data_orig, n = 5)
## YearStart YearEnd LocationAbbr LocationDesc DataSource
## 1 2010 2010 OR Oregon NVSS
## 2 2019 2019 AZ Arizona YRBSS
## 3 2019 2019 OH Ohio YRBSS
## 4 2019 2019 US United States YRBSS
## 5 2015 2015 VI Virgin Islands YRBSS
## Topic Question Response DataValueUnit
## 1 Cardiovascular Disease Mortality from heart failure NA
## 2 Alcohol Alcohol use among youth NA %
## 3 Alcohol Alcohol use among youth NA %
## 4 Alcohol Alcohol use among youth NA %
## 5 Alcohol Alcohol use among youth NA %
## DataValueType DataValue DataValueAlt DataValueFootnoteSymbol
## 1 Number 30 30.0
## 2 Crude Prevalence 29.5 29.5
## 3 Crude Prevalence 22.5 22.5
## 4 Crude Prevalence 13.9 13.9
## 5 Crude Prevalence NA -
## DatavalueFootnote LowConfidenceLimit HighConfidenceLimit
## 1 NA NA
## 2 24.9 34.7
## 3 17.0 29.1
## 4 10.4 18.4
## 5 No data available NA NA
## StratificationCategory1 Stratification1
## 1 Race/Ethnicity American Indian or Alaska Native
## 2 Gender Female
## 3 Gender Male
## 4 Race/Ethnicity Asian, non-Hispanic
## 5 Gender Male
## StratificationCategory2 Stratification2 StratificationCategory3
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## Stratification3 GeoLocation ResponseID
## 1 NA POINT (-120.15503132599969 44.56744942400047) NA
## 2 NA POINT (-111.76381127699972 34.865970280000454) NA
## 3 NA POINT (-82.40426005599966 40.06021014100048) NA
## 4 NA NA
## 5 NA POINT (-64.896335 18.335765) NA
## LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1
## 1 41 CVD CVD1_4 NMBR RACE
## 2 4 ALC ALC1_1 CRDPREV GENDER
## 3 39 ALC ALC1_1 CRDPREV GENDER
## 4 59 ALC ALC1_1 CRDPREV RACE
## 5 78 ALC ALC1_1 CRDPREV GENDER
## StratificationID1 StratificationCategoryID2 StratificationID2
## 1 AIAN NA NA
## 2 GENF NA NA
## 3 GENM NA NA
## 4 ASN NA NA
## 5 GENM NA NA
## StratificationCategoryID3 StratificationID3
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
# Inspect each column's type together with its first few values.
str(data_orig)
## 'data.frame': 1185676 obs. of 34 variables:
## $ YearStart : int 2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
## $ YearEnd : int 2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
## $ LocationAbbr : chr "OR" "AZ" "OH" "US" ...
## $ LocationDesc : chr "Oregon" "Arizona" "Ohio" "United States" ...
## $ DataSource : chr "NVSS" "YRBSS" "YRBSS" "YRBSS" ...
## $ Topic : chr "Cardiovascular Disease" "Alcohol" "Alcohol" "Alcohol" ...
## $ Question : chr "Mortality from heart failure" "Alcohol use among youth" "Alcohol use among youth" "Alcohol use among youth" ...
## $ Response : logi NA NA NA NA NA NA ...
## $ DataValueUnit : chr "" "%" "%" "%" ...
## $ DataValueType : chr "Number" "Crude Prevalence" "Crude Prevalence" "Crude Prevalence" ...
## $ DataValue : chr "30" "29.5" "22.5" "13.9" ...
## $ DataValueAlt : num 30 29.5 22.5 13.9 NA 59.1 55.6 58.1 62.6 NA ...
## $ DataValueFootnoteSymbol : chr "" "" "" "" ...
## $ DatavalueFootnote : chr "" "" "" "" ...
## $ LowConfidenceLimit : num NA 24.9 17 10.4 NA 53.9 52.3 54.2 58.5 NA ...
## $ HighConfidenceLimit : num NA 34.7 29.1 18.4 NA 64 58.9 61.8 66.6 NA ...
## $ StratificationCategory1 : chr "Race/Ethnicity" "Gender" "Gender" "Race/Ethnicity" ...
## $ Stratification1 : chr "American Indian or Alaska Native" "Female" "Male" "Asian, non-Hispanic" ...
## $ StratificationCategory2 : logi NA NA NA NA NA NA ...
## $ Stratification2 : logi NA NA NA NA NA NA ...
## $ StratificationCategory3 : logi NA NA NA NA NA NA ...
## $ Stratification3 : logi NA NA NA NA NA NA ...
## $ GeoLocation : chr "POINT (-120.15503132599969 44.56744942400047)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-82.40426005599966 40.06021014100048)" "" ...
## $ ResponseID : logi NA NA NA NA NA NA ...
## $ LocationID : int 41 4 39 59 78 1 10 12 20 28 ...
## $ TopicID : chr "CVD" "ALC" "ALC" "ALC" ...
## $ QuestionID : chr "CVD1_4" "ALC1_1" "ALC1_1" "ALC1_1" ...
## $ DataValueTypeID : chr "NMBR" "CRDPREV" "CRDPREV" "CRDPREV" ...
## $ StratificationCategoryID1: chr "RACE" "GENDER" "GENDER" "RACE" ...
## $ StratificationID1 : chr "AIAN" "GENF" "GENM" "ASN" ...
## $ StratificationCategoryID2: logi NA NA NA NA NA NA ...
## $ StratificationID2 : logi NA NA NA NA NA NA ...
## $ StratificationCategoryID3: logi NA NA NA NA NA NA ...
## $ StratificationID3 : logi NA NA NA NA NA NA ...
# Per-column summary of the raw data (quantiles and NA counts for numeric
# columns, length/class for character columns).
summary(data_orig)
## YearStart YearEnd LocationAbbr LocationDesc
## Min. :2001 Min. :2001 Length:1185676 Length:1185676
## 1st Qu.:2013 1st Qu.:2013 Class :character Class :character
## Median :2015 Median :2016 Mode :character Mode :character
## Mean :2015 Mean :2016
## 3rd Qu.:2018 3rd Qu.:2018
## Max. :2021 Max. :2021
##
## DataSource Topic Question Response
## Length:1185676 Length:1185676 Length:1185676 Mode:logical
## Class :character Class :character Class :character NA's:1185676
## Mode :character Mode :character Mode :character
##
##
##
##
## DataValueUnit DataValueType DataValue DataValueAlt
## Length:1185676 Length:1185676 Length:1185676 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 16.1
## Mode :character Mode :character Mode :character Median : 40.0
## Mean : 1005.3
## 3rd Qu.: 76.0
## Max. :2925456.0
## NA's :381098
## DataValueFootnoteSymbol DatavalueFootnote LowConfidenceLimit
## Length:1185676 Length:1185676 Min. : 0.0
## Class :character Class :character 1st Qu.: 11.0
## Mode :character Mode :character Median : 28.5
## Mean : 50.3
## 3rd Qu.: 56.3
## Max. :2541.6
## NA's :503296
## HighConfidenceLimit StratificationCategory1 Stratification1
## Min. : 0.0 Length:1185676 Length:1185676
## 1st Qu.: 16.3 Class :character Class :character
## Median : 41.0 Mode :character Mode :character
## Mean : 61.9
## 3rd Qu.: 71.1
## Max. :3530.5
## NA's :503296
## StratificationCategory2 Stratification2 StratificationCategory3
## Mode:logical Mode:logical Mode:logical
## NA's:1185676 NA's:1185676 NA's:1185676
##
##
##
##
##
## Stratification3 GeoLocation ResponseID LocationID
## Mode:logical Length:1185676 Mode:logical Min. : 1.00
## NA's:1185676 Class :character NA's:1185676 1st Qu.:17.00
## Mode :character Median :30.00
## Mean :30.79
## 3rd Qu.:45.00
## Max. :78.00
##
## TopicID QuestionID DataValueTypeID
## Length:1185676 Length:1185676 Length:1185676
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## StratificationCategoryID1 StratificationID1 StratificationCategoryID2
## Length:1185676 Length:1185676 Mode:logical
## Class :character Class :character NA's:1185676
## Mode :character Mode :character
##
##
##
##
## StratificationID2 StratificationCategoryID3 StratificationID3
## Mode:logical Mode:logical Mode:logical
## NA's:1185676 NA's:1185676 NA's:1185676
##
##
##
##
##
[1]
# Render the first ten raw records as a small-font table inside a scroll box.
data_orig %>%
  head(10) %>%
  kable() %>%
  kable_styling(font_size = 10) %>%
  scroll_box(height = "500px")
| YearStart | YearEnd | LocationAbbr | LocationDesc | DataSource | Topic | Question | Response | DataValueUnit | DataValueType | DataValue | DataValueAlt | DataValueFootnoteSymbol | DatavalueFootnote | LowConfidenceLimit | HighConfidenceLimit | StratificationCategory1 | Stratification1 | StratificationCategory2 | Stratification2 | StratificationCategory3 | Stratification3 | GeoLocation | ResponseID | LocationID | TopicID | QuestionID | DataValueTypeID | StratificationCategoryID1 | StratificationID1 | StratificationCategoryID2 | StratificationID2 | StratificationCategoryID3 | StratificationID3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2010 | 2010 | OR | Oregon | NVSS | Cardiovascular Disease | Mortality from heart failure | NA | Number | 30 | 30.0 | NA | NA | Race/Ethnicity | American Indian or Alaska Native | NA | NA | NA | NA | POINT (-120.15503132599969 44.56744942400047) | NA | 41 | CVD | CVD1_4 | NMBR | RACE | AIAN | NA | NA | NA | NA | |||
| 2019 | 2019 | AZ | Arizona | YRBSS | Alcohol | Alcohol use among youth | NA | % | Crude Prevalence | 29.5 | 29.5 | 24.9 | 34.7 | Gender | Female | NA | NA | NA | NA | POINT (-111.76381127699972 34.865970280000454) | NA | 4 | ALC | ALC1_1 | CRDPREV | GENDER | GENF | NA | NA | NA | NA | ||
| 2019 | 2019 | OH | Ohio | YRBSS | Alcohol | Alcohol use among youth | NA | % | Crude Prevalence | 22.5 | 22.5 | 17.0 | 29.1 | Gender | Male | NA | NA | NA | NA | POINT (-82.40426005599966 40.06021014100048) | NA | 39 | ALC | ALC1_1 | CRDPREV | GENDER | GENM | NA | NA | NA | NA | ||
| 2019 | 2019 | US | United States | YRBSS | Alcohol | Alcohol use among youth | NA | % | Crude Prevalence | 13.9 | 13.9 | 10.4 | 18.4 | Race/Ethnicity | Asian, non-Hispanic | NA | NA | NA | NA | NA | 59 | ALC | ALC1_1 | CRDPREV | RACE | ASN | NA | NA | NA | NA | |||
| 2015 | 2015 | VI | Virgin Islands | YRBSS | Alcohol | Alcohol use among youth | NA | % | Crude Prevalence | NA |
|
No data available | NA | NA | Gender | Male | NA | NA | NA | NA | POINT (-64.896335 18.335765) | NA | 78 | ALC | ALC1_1 | CRDPREV | GENDER | GENM | NA | NA | NA | NA | |
| 2020 | 2020 | AL | Alabama | PRAMS | Alcohol | Alcohol use before pregnancy | NA | % | Crude Prevalence | 59.1 | 59.1 | 53.9 | 64.0 | Race/Ethnicity | White, non-Hispanic | NA | NA | NA | NA | POINT (-86.63186076199969 32.84057112200048) | NA | 1 | ALC | ALC1_2 | CRDPREV | RACE | WHT | NA | NA | NA | NA | ||
| 2015 | 2015 | DE | Delaware | PRAMS | Alcohol | Alcohol use before pregnancy | NA | % | Crude Prevalence | 55.6 | 55.6 | 52.3 | 58.9 | Overall | Overall | NA | NA | NA | NA | POINT (-75.57774116799965 39.008830667000495) | NA | 10 | ALC | ALC1_2 | CRDPREV | OVERALL | OVR | NA | NA | NA | NA | ||
| 2019 | 2019 | FL | Florida | PRAMS | Alcohol | Alcohol use before pregnancy | NA | % | Crude Prevalence | 58.1 | 58.1 | 54.2 | 61.8 | Overall | Overall | NA | NA | NA | NA | POINT (-81.92896053899966 28.932040377000476) | NA | 12 | ALC | ALC1_2 | CRDPREV | OVERALL | OVR | NA | NA | NA | NA | ||
| 2018 | 2018 | KS | Kansas | PRAMS | Alcohol | Alcohol use before pregnancy | NA | % | Crude Prevalence | 62.6 | 62.6 | 58.5 | 66.6 | Overall | Overall | NA | NA | NA | NA | POINT (-98.20078122699965 38.34774030000045) | NA | 20 | ALC | ALC1_2 | CRDPREV | OVERALL | OVR | NA | NA | NA | NA | ||
| 2013 | 2013 | MS | Mississippi | PRAMS | Alcohol | Alcohol use before pregnancy | NA | % | Crude Prevalence | NA |
|
No data available | NA | NA | Overall | Overall | NA | NA | NA | NA | POINT (-89.53803082499968 32.745510099000455) | NA | 28 | ALC | ALC1_2 | CRDPREV | OVERALL | OVR | NA | NA | NA | NA |
The plots and table below show the proportion of missing data for each variable and for the dataset as a whole.
# Plot which cells are missing; warn_large_data = FALSE turns off visdat's
# large-data check for this ~1.19M-row data frame.
vis_miss(data_orig, warn_large_data = FALSE)
# Plot the class of every column along with its missing cells.
vis_dat(data_orig, warn_large_data = FALSE)
# Tabulate the number (n_miss) and percentage (pct_miss) of NA values per variable.
miss_var_summary(data_orig)
## # A tibble: 34 × 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 Response 1185676 100
## 2 StratificationCategory2 1185676 100
## 3 Stratification2 1185676 100
## 4 StratificationCategory3 1185676 100
## 5 Stratification3 1185676 100
## 6 ResponseID 1185676 100
## 7 StratificationCategoryID2 1185676 100
## 8 StratificationID2 1185676 100
## 9 StratificationCategoryID3 1185676 100
## 10 StratificationID3 1185676 100
## # ℹ 24 more rows
Deleting all variables that contain more than 10% missing data.
# Share of NA entries in each column. Blank strings ("") are not NA, so
# character columns such as DataValue are not affected by this rule.
missing_proportions <- colMeans(is.na(data_orig))
# Names of the columns whose NA share exceeds the 10% cut-off.
columns_to_remove <- names(which(missing_proportions > 0.10))
# Keep every other column, in its original order.
data1 <- data_orig[, setdiff(names(data_orig), columns_to_remove)]
str(data1)
## 'data.frame': 1185676 obs. of 21 variables:
## $ YearStart : int 2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
## $ YearEnd : int 2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
## $ LocationAbbr : chr "OR" "AZ" "OH" "US" ...
## $ LocationDesc : chr "Oregon" "Arizona" "Ohio" "United States" ...
## $ DataSource : chr "NVSS" "YRBSS" "YRBSS" "YRBSS" ...
## $ Topic : chr "Cardiovascular Disease" "Alcohol" "Alcohol" "Alcohol" ...
## $ Question : chr "Mortality from heart failure" "Alcohol use among youth" "Alcohol use among youth" "Alcohol use among youth" ...
## $ DataValueUnit : chr "" "%" "%" "%" ...
## $ DataValueType : chr "Number" "Crude Prevalence" "Crude Prevalence" "Crude Prevalence" ...
## $ DataValue : chr "30" "29.5" "22.5" "13.9" ...
## $ DataValueFootnoteSymbol : chr "" "" "" "" ...
## $ DatavalueFootnote : chr "" "" "" "" ...
## $ StratificationCategory1 : chr "Race/Ethnicity" "Gender" "Gender" "Race/Ethnicity" ...
## $ Stratification1 : chr "American Indian or Alaska Native" "Female" "Male" "Asian, non-Hispanic" ...
## $ GeoLocation : chr "POINT (-120.15503132599969 44.56744942400047)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-82.40426005599966 40.06021014100048)" "" ...
## $ LocationID : int 41 4 39 59 78 1 10 12 20 28 ...
## $ TopicID : chr "CVD" "ALC" "ALC" "ALC" ...
## $ QuestionID : chr "CVD1_4" "ALC1_1" "ALC1_1" "ALC1_1" ...
## $ DataValueTypeID : chr "NMBR" "CRDPREV" "CRDPREV" "CRDPREV" ...
## $ StratificationCategoryID1: chr "RACE" "GENDER" "GENDER" "RACE" ...
## $ StratificationID1 : chr "AIAN" "GENF" "GENM" "ASN" ...
# Compact column-wise preview of the reduced data frame.
glimpse(data1)
## Rows: 1,185,676
## Columns: 21
## $ YearStart <int> 2010, 2019, 2019, 2019, 2015, 2020, 2015, 20…
## $ YearEnd <int> 2010, 2019, 2019, 2019, 2015, 2020, 2015, 20…
## $ LocationAbbr <chr> "OR", "AZ", "OH", "US", "VI", "AL", "DE", "F…
## $ LocationDesc <chr> "Oregon", "Arizona", "Ohio", "United States"…
## $ DataSource <chr> "NVSS", "YRBSS", "YRBSS", "YRBSS", "YRBSS", …
## $ Topic <chr> "Cardiovascular Disease", "Alcohol", "Alcoho…
## $ Question <chr> "Mortality from heart failure", "Alcohol use…
## $ DataValueUnit <chr> "", "%", "%", "%", "%", "%", "%", "%", "%", …
## $ DataValueType <chr> "Number", "Crude Prevalence", "Crude Prevale…
## $ DataValue <chr> "30", "29.5", "22.5", "13.9", "", "59.1", "5…
## $ DataValueFootnoteSymbol <chr> "", "", "", "", "-", "", "", "", "", "-", ""…
## $ DatavalueFootnote <chr> "", "", "", "", "No data available", "", "",…
## $ StratificationCategory1 <chr> "Race/Ethnicity", "Gender", "Gender", "Race/…
## $ Stratification1 <chr> "American Indian or Alaska Native", "Female"…
## $ GeoLocation <chr> "POINT (-120.15503132599969 44.5674494240004…
## $ LocationID <int> 41, 4, 39, 59, 78, 1, 10, 12, 20, 28, 34, 42…
## $ TopicID <chr> "CVD", "ALC", "ALC", "ALC", "ALC", "ALC", "A…
## $ QuestionID <chr> "CVD1_4", "ALC1_1", "ALC1_1", "ALC1_1", "ALC…
## $ DataValueTypeID <chr> "NMBR", "CRDPREV", "CRDPREV", "CRDPREV", "CR…
## $ StratificationCategoryID1 <chr> "RACE", "GENDER", "GENDER", "RACE", "GENDER"…
## $ StratificationID1 <chr> "AIAN", "GENF", "GENM", "ASN", "GENM", "WHT"…
Locations ranked from most to least by the number of chronic disease indicator records they have in the dataset (these are record counts, not counts of diseases).
# Tally the records per location and list the locations from most to fewest.
data1 %>%
  count(LocationDesc, sort = TRUE)
## LocationDesc n
## 1 New York 22556
## 2 Wisconsin 22556
## 3 New Jersey 22550
## 4 Nebraska 22518
## 5 Iowa 22510
## 6 Vermont 22490
## 7 New Mexico 22457
## 8 Washington 22457
## 9 Michigan 22422
## 10 Hawaii 22420
## 11 Colorado 22417
## 12 West Virginia 22407
## 13 Arkansas 22384
## 14 Kentucky 22375
## 15 Oregon 22352
## 16 Utah 22352
## 17 Massachusetts 22349
## 18 Maryland 22346
## 19 North Carolina 22346
## 20 Florida 22311
## 21 Rhode Island 22273
## 22 Arizona 22271
## 23 South Dakota 22264
## 24 Nevada 22241
## 25 South Carolina 22235
## 26 Missouri 22163
## 27 Alaska 22157
## 28 Illinois 22157
## 29 Pennsylvania 22157
## 30 Delaware 22151
## 31 Maine 22142
## 32 Wyoming 22140
## 33 New Hampshire 22104
## 34 Mississippi 22100
## 35 Oklahoma 22093
## 36 Connecticut 22082
## 37 Alabama 22058
## 38 Virginia 22058
## 39 Louisiana 22053
## 40 Tennessee 22052
## 41 California 22037
## 42 Minnesota 22023
## 43 Kansas 21988
## 44 Georgia 21980
## 45 Montana 21973
## 46 North Dakota 21970
## 47 Ohio 21953
## 48 District of Columbia 21936
## 49 Texas 21918
## 50 Indiana 21872
## 51 Idaho 21842
## 52 Puerto Rico 14406
## 53 Virgin Islands 14077
## 54 Guam 14009
## 55 United States 10166
Frequency of chronic disease topics. The graph shows Cancer with the highest frequency and Disability with the lowest. [2]
# Count the records per topic, then draw one bar per topic, ordered from the
# most to the least frequent, with slanted axis labels and no legend.
topic <- as.data.frame(table(data1$Topic))
ggplot(topic, aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)) +
  geom_col() +
  labs(
    title = "Frequency of Chronic Diseases Topics",
    x = "Chronic Diseases",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
# Subset of the cleaned data restricted to the Asthma topic.
data2 <- filter(data1, Topic == "Asthma")
Frequency of chronic disease records by race/ethnicity. As the graph shows, the White, Hispanic, and Black groups have the highest number of chronic disease records, while the Asian, non-Hispanic group has the lowest. [2]
# Count records per race/ethnicity group and plot them from most to least
# frequent. Stratification1 also carries Gender ("Male"/"Female") and
# "Overall" values, so the tabulation is limited to rows whose
# StratificationCategory1 is "Race/Ethnicity"; otherwise those strata would be
# drawn as bars in a chart labelled Race/Ethnicity.
Strat1 <- as.data.frame(
  table(data1$Stratification1[data1$StratificationCategory1 %in% "Race/Ethnicity"])
)
ggplot(Strat1, aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    x = "Race/Ethnicity",
    y = "Frequency",
    title = "Frequency of Chronic Diseases Topics by Race/Ethnicity"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
[1]
# Count the records per location abbreviation and draw one bar per location,
# with slanted axis labels and no legend.
LocationAbbr <- as.data.frame(table(data1$LocationAbbr))
ggplot(LocationAbbr, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  labs(
    title = "Frequency of Chronic Diseases by State",
    x = "Location by State"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Table of the ten location abbreviations with the most records.
data1 %>%
  count(LocationAbbr, sort = TRUE) %>%
  head(10) %>%
  kable() %>%
  kable_styling()
| LocationAbbr | n |
|---|---|
| NY | 22556 |
| WI | 22556 |
| NJ | 22550 |
| NE | 22518 |
| IA | 22510 |
| VT | 22490 |
| NM | 22457 |
| WA | 22457 |
| MI | 22422 |
| HI | 22420 |
# Keep only the Asthma records and report the subset's dimensions
# (rows, columns).
data2 <- data1 %>%
  filter(Topic %in% "Asthma")
dim(data2)
## [1] 80342 21
[1]
# Show the first ten Asthma records in a striped, full-width, scrollable table.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
kable(head(data2, 10)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
| YearStart | YearEnd | LocationAbbr | LocationDesc | DataSource | Topic | Question | DataValueUnit | DataValueType | DataValue | DataValueFootnoteSymbol | DatavalueFootnote | StratificationCategory1 | Stratification1 | GeoLocation | LocationID | TopicID | QuestionID | DataValueTypeID | StratificationCategoryID1 | StratificationID1 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016 | 2016 | AR | Arkansas | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number | 5285 | Gender | Male | POINT (-92.27449074299966 34.74865012400045) | 5 | AST | AST2_1 | NMBR | GENDER | GENM | |||
| 2017 | 2017 | AZ | Arizona | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number | 13497 | Gender | Male | POINT (-111.76381127699972 34.865970280000454) | 4 | AST | AST2_1 | NMBR | GENDER | GENM | |||
| 2016 | 2016 | CA | California | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number |
|
No data available | Race/Ethnicity | Black, non-Hispanic | POINT (-120.99999953799971 37.63864012300047) | 6 | AST | AST2_1 | NMBR | RACE | BLK | ||
| 2015 | 2015 | CO | Colorado | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number |
|
No data available | Race/Ethnicity | Hispanic | POINT (-106.13361092099967 38.843840757000464) | 8 | AST | AST2_1 | NMBR | RACE | HIS | ||
| 2017 | 2017 | CO | Colorado | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number |
|
No data available | Gender | Male | POINT (-106.13361092099967 38.843840757000464) | 8 | AST | AST2_1 | NMBR | GENDER | GENM | ||
| 2014 | 2014 | CT | Connecticut | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number |
|
No data available | Race/Ethnicity | Hispanic | POINT (-72.64984095199964 41.56266102000046) | 9 | AST | AST2_1 | NMBR | RACE | HIS | ||
| 2016 | 2016 | DE | Delaware | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number |
|
No data available | Gender | Female | POINT (-75.57774116799965 39.008830667000495) | 10 | AST | AST2_1 | NMBR | GENDER | GENF | ||
| 2014 | 2014 | FL | Florida | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number | 131559 | Overall | Overall | POINT (-81.92896053899966 28.932040377000476) | 12 | AST | AST2_1 | NMBR | OVERALL | OVR | |||
| 2016 | 2016 | FL | Florida | SEDD; SID | Asthma | Emergency department visit rate for asthma | Number | 5751 | Race/Ethnicity | Black, non-Hispanic | POINT (-81.92896053899966 28.932040377000476) | 12 | AST | AST2_1 | NMBR | RACE | BLK | |||
| 2017 | 2017 | FL | Florida | SEDD; SID | Asthma | Emergency department visit rate for asthma | cases per 10,000 | Age-adjusted Rate | 7.12 | Race/Ethnicity | Hispanic | POINT (-81.92896053899966 28.932040377000476) | 12 | AST | AST2_1 | AGEADJRATE | RACE | HIS |
# Show the last ten Asthma records in a striped, full-width, scrollable table.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
kable(tail(data2, 10)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
| YearStart | YearEnd | LocationAbbr | LocationDesc | DataSource | Topic | Question | DataValueUnit | DataValueType | DataValue | DataValueFootnoteSymbol | DatavalueFootnote | StratificationCategory1 | Stratification1 | GeoLocation | LocationID | TopicID | QuestionID | DataValueTypeID | StratificationCategoryID1 | StratificationID1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 80333 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | 34.2 | Overall | Overall | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST6_1 | AGEADJPREV | OVERALL | OVR | ||
| 80334 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Crude Prevalence | 71.7 | Overall | Overall | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST5_2 | CRDPREV | OVERALL | OVR | ||
| 80335 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Crude Prevalence | 89.3 | Overall | Overall | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST6_2 | CRDPREV | OVERALL | OVR | ||
| 80336 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | 36.3 | Gender | Female | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST6_1 | AGEADJPREV | GENDER | GENF | ||
| 80337 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Current asthma prevalence among adults aged >= 18 years | % | Crude Prevalence | 7.7 | Race/Ethnicity | Hispanic | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST1_1 | CRDPREV | RACE | HIS | ||
| 80338 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | **** | Sample size of denominator and/or age group for age-standardization is less than 50 or relative standard error is more than 30% | Race/Ethnicity | Black, non-Hispanic | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST5_1 | AGEADJPREV | RACE | BLK | |
| 80339 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Age-adjusted Prevalence | **** | Sample size of denominator and/or age group for age-standardization is less than 50 or relative standard error is more than 30% | Race/Ethnicity | Black, non-Hispanic | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST6_2 | AGEADJPREV | RACE | BLK | |
| 80340 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Crude Prevalence | 68.9 | Gender | Female | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST5_2 | CRDPREV | GENDER | GENF | ||
| 80341 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Crude Prevalence | 88.4 | Gender | Female | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST6_2 | CRDPREV | GENDER | GENF | ||
| 80342 | 2020 | 2020 | WY | Wyoming | BRFSS | Asthma | Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | 43.1 | Gender | Female | POINT (-108.10983035299967 43.23554134300048) | 56 | AST | AST5_1 | AGEADJPREV | GENDER | GENF |
# National (LocationAbbr "US") Asthma records, with YearEnd turned into a
# factor and DataValue converted from character to numeric.
asthmadf <- data1 %>%
  filter(Topic == "Asthma", LocationAbbr == "US") %>%
  mutate(
    YearEnd = as.factor(YearEnd),
    DataValue = as.numeric(DataValue)
  )
str(asthmadf)
## 'data.frame': 374 obs. of 21 variables:
## $ YearStart : int 2017 2010 2016 2012 2020 2015 2018 2017 2011 2011 ...
## $ YearEnd : Factor w/ 12 levels "2010","2011",..: 8 1 7 3 11 6 9 8 2 2 ...
## $ LocationAbbr : chr "US" "US" "US" "US" ...
## $ LocationDesc : chr "United States" "United States" "United States" "United States" ...
## $ DataSource : chr "NVSS" "NVSS" "NVSS" "NVSS" ...
## $ Topic : chr "Asthma" "Asthma" "Asthma" "Asthma" ...
## $ Question : chr "Asthma mortality rate" "Asthma mortality rate" "Asthma mortality rate" "Asthma mortality rate" ...
## $ DataValueUnit : chr "cases per 1,000,000" "cases per 1,000,000" "cases per 1,000,000" "cases per 1,000,000" ...
## $ DataValueType : chr "Crude Rate" "Age-adjusted Rate" "Crude Rate" "Crude Rate" ...
## $ DataValue : num 10.9 10.5 10.9 11.2 12.6 ...
## $ DataValueFootnoteSymbol : chr "" "" "" "" ...
## $ DatavalueFootnote : chr "" "" "" "" ...
## $ StratificationCategory1 : chr "Overall" "Overall" "Overall" "Overall" ...
## $ Stratification1 : chr "Overall" "Overall" "Overall" "Overall" ...
## $ GeoLocation : chr "" "" "" "" ...
## $ LocationID : int 59 59 59 59 59 59 59 59 59 59 ...
## $ TopicID : chr "AST" "AST" "AST" "AST" ...
## $ QuestionID : chr "AST4_1" "AST4_1" "AST4_1" "AST4_1" ...
## $ DataValueTypeID : chr "CRDRATE" "AGEADJRATE" "CRDRATE" "CRDRATE" ...
## $ StratificationCategoryID1: chr "OVERALL" "OVERALL" "OVERALL" "OVERALL" ...
## $ StratificationID1 : chr "OVR" "OVR" "OVR" "OVR" ...
# Compact column-wise preview of the US asthma subset.
glimpse(asthmadf)
## Rows: 374
## Columns: 21
## $ YearStart <int> 2017, 2010, 2016, 2012, 2020, 2015, 2018, 20…
## $ YearEnd <fct> 2017, 2010, 2016, 2012, 2020, 2015, 2018, 20…
## $ LocationAbbr <chr> "US", "US", "US", "US", "US", "US", "US", "U…
## $ LocationDesc <chr> "United States", "United States", "United St…
## $ DataSource <chr> "NVSS", "NVSS", "NVSS", "NVSS", "NVSS", "NVS…
## $ Topic <chr> "Asthma", "Asthma", "Asthma", "Asthma", "Ast…
## $ Question <chr> "Asthma mortality rate", "Asthma mortality r…
## $ DataValueUnit <chr> "cases per 1,000,000", "cases per 1,000,000"…
## $ DataValueType <chr> "Crude Rate", "Age-adjusted Rate", "Crude Ra…
## $ DataValue <dbl> 10.9, 10.5, 10.9, 11.2, 12.6, 10.3, 10.5, 35…
## $ DataValueFootnoteSymbol <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ DatavalueFootnote <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ StratificationCategory1 <chr> "Overall", "Overall", "Overall", "Overall", …
## $ Stratification1 <chr> "Overall", "Overall", "Overall", "Overall", …
## $ GeoLocation <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ LocationID <int> 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, …
## $ TopicID <chr> "AST", "AST", "AST", "AST", "AST", "AST", "A…
## $ QuestionID <chr> "AST4_1", "AST4_1", "AST4_1", "AST4_1", "AST…
## $ DataValueTypeID <chr> "CRDRATE", "AGEADJRATE", "CRDRATE", "CRDRATE…
## $ StratificationCategoryID1 <chr> "OVERALL", "OVERALL", "OVERALL", "OVERALL", …
## $ StratificationID1 <chr> "OVR", "OVR", "OVR", "OVR", "OVR", "OVR", "O…
# Asthma records for every location, with YearEnd turned into a factor and
# DataValue converted from character to numeric (blank values become NA).
asthmadf2 <- data1 %>%
  filter(Topic == "Asthma") %>%
  mutate(
    YearEnd = as.factor(YearEnd),
    DataValue = as.numeric(DataValue)
  )
str(asthmadf2)
## 'data.frame': 80342 obs. of 21 variables:
## $ YearStart : int 2016 2017 2016 2015 2017 2014 2016 2014 2016 2017 ...
## $ YearEnd : Factor w/ 12 levels "2010","2011",..: 7 8 7 6 8 5 7 5 7 8 ...
## $ LocationAbbr : chr "AR" "AZ" "CA" "CO" ...
## $ LocationDesc : chr "Arkansas" "Arizona" "California" "Colorado" ...
## $ DataSource : chr "SEDD; SID" "SEDD; SID" "SEDD; SID" "SEDD; SID" ...
## $ Topic : chr "Asthma" "Asthma" "Asthma" "Asthma" ...
## $ Question : chr "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" ...
## $ DataValueUnit : chr "" "" "" "" ...
## $ DataValueType : chr "Number" "Number" "Number" "Number" ...
## $ DataValue : num 5285 13497 NA NA NA ...
## $ DataValueFootnoteSymbol : chr "" "" "-" "-" ...
## $ DatavalueFootnote : chr "" "" "No data available" "No data available" ...
## $ StratificationCategory1 : chr "Gender" "Gender" "Race/Ethnicity" "Race/Ethnicity" ...
## $ Stratification1 : chr "Male" "Male" "Black, non-Hispanic" "Hispanic" ...
## $ GeoLocation : chr "POINT (-92.27449074299966 34.74865012400045)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-120.99999953799971 37.63864012300047)" "POINT (-106.13361092099967 38.843840757000464)" ...
## $ LocationID : int 5 4 6 8 8 9 10 12 12 12 ...
## $ TopicID : chr "AST" "AST" "AST" "AST" ...
## $ QuestionID : chr "AST2_1" "AST2_1" "AST2_1" "AST2_1" ...
## $ DataValueTypeID : chr "NMBR" "NMBR" "NMBR" "NMBR" ...
## $ StratificationCategoryID1: chr "GENDER" "GENDER" "RACE" "RACE" ...
## $ StratificationID1 : chr "GENM" "GENM" "BLK" "HIS" ...
[1]
# Show the first five US asthma records in a striped, full-width, scrollable
# table. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
kable(head(asthmadf, 5)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
| YearStart | YearEnd | LocationAbbr | LocationDesc | DataSource | Topic | Question | DataValueUnit | DataValueType | DataValue | DataValueFootnoteSymbol | DatavalueFootnote | StratificationCategory1 | Stratification1 | GeoLocation | LocationID | TopicID | QuestionID | DataValueTypeID | StratificationCategoryID1 | StratificationID1 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2017 | 2017 | US | United States | NVSS | Asthma | Asthma mortality rate | cases per 1,000,000 | Crude Rate | 10.9 | Overall | Overall | 59 | AST | AST4_1 | CRDRATE | OVERALL | OVR | |||
| 2010 | 2010 | US | United States | NVSS | Asthma | Asthma mortality rate | cases per 1,000,000 | Age-adjusted Rate | 10.5 | Overall | Overall | 59 | AST | AST4_1 | AGEADJRATE | OVERALL | OVR | |||
| 2016 | 2016 | US | United States | NVSS | Asthma | Asthma mortality rate | cases per 1,000,000 | Crude Rate | 10.9 | Overall | Overall | 59 | AST | AST4_1 | CRDRATE | OVERALL | OVR | |||
| 2012 | 2012 | US | United States | NVSS | Asthma | Asthma mortality rate | cases per 1,000,000 | Crude Rate | 11.2 | Overall | Overall | 59 | AST | AST4_1 | CRDRATE | OVERALL | OVR | |||
| 2020 | 2020 | US | United States | NVSS | Asthma | Asthma mortality rate | cases per 1,000,000 | Crude Rate | 12.6 | Overall | Overall | 59 | AST | AST4_1 | CRDRATE | OVERALL | OVR |
# Show the last five US asthma records in a striped, full-width, scrollable
# table. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
kable(tail(asthmadf, 5)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
| YearStart | YearEnd | LocationAbbr | LocationDesc | DataSource | Topic | Question | DataValueUnit | DataValueType | DataValue | DataValueFootnoteSymbol | DatavalueFootnote | StratificationCategory1 | Stratification1 | GeoLocation | LocationID | TopicID | QuestionID | DataValueTypeID | StratificationCategoryID1 | StratificationID1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 370 | 2014 | 2014 | US | United States | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Age-adjusted Prevalence | 86.2 |
|
50 States + DC: US Median | Gender | Female | 59 | AST | AST6_2 | AGEADJPREV | GENDER | GENF | |
| 371 | 2018 | 2018 | US | United States | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma | % | Crude Prevalence | 87.5 | ** | US estimate/number is based on fewer than 50 states and the District of Columbia | Gender | Female | 59 | AST | AST6_2 | CRDPREV | GENDER | GENF | |
| 372 | 2016 | 2016 | US | United States | BRFSS | Asthma | Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | 36.4 | ** | US estimate/number is based on fewer than 50 states and the District of Columbia | Gender | Male | 59 | AST | AST5_1 | AGEADJPREV | GENDER | GENM | |
| 373 | 2017 | 2017 | US | United States | BRFSS | Asthma | Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma | % | Age-adjusted Prevalence | 44.0 |
|
50 States + DC: US Median | Overall | Overall | 59 | AST | AST6_1 | AGEADJPREV | OVERALL | OVR | |
| 374 | 2014 | 2014 | US | United States | BRFSS | Asthma | Current asthma prevalence among adults aged >= 18 years | % | Age-adjusted Prevalence | 6.7 |
|
50 States + DC: US Median | Gender | Male | 59 | AST | AST1_1 | AGEADJPREV | GENDER | GENM |
[1]
# Distinct indicator questions present in the US asthma subset, in order of
# first appearance, shown as a scrollable one-column table.
Questions <- asthmadf %>%
  distinct(Question) %>%
  pull(Question)
kable(Questions) %>%
  kable_styling(font_size = 12) %>%
  scroll_box(height = "300px")
| x |
|---|
| Asthma mortality rate |
| Current asthma prevalence among adults aged >= 18 years |
| Asthma prevalence among women aged 18-44 years |
| Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma |
| Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma |
| Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma |
| Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma |
# The US asthma subset mixes several indicators, units (percentages, counts,
# rates) and strata, so stacking every DataValue per year gives a total that
# does not match the "Cases per 1,000,000" axis. Plot only the series that
# the axis label describes: the overall crude asthma mortality rate.
asthmadf %>%
  filter(
    Question == "Asthma mortality rate",
    DataValueUnit == "cases per 1,000,000",
    DataValueType == "Crude Rate",
    StratificationCategory1 == "Overall"
  ) %>%
  ggplot(aes(x = YearEnd, y = DataValue)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Cases per 1,000,000",
    title = "Asthma Mortality Rate in the US (Crude, Overall)"
  )
# Yearly distribution of the US asthma values. The subset holds percentages,
# rates and raw counts, so each unit gets its own panel with an independent
# y scale instead of sharing one axis labelled as a case count. The title
# typo ("Distribtution") is also corrected.
ggplot(data = asthmadf, aes(x = YearEnd, y = DataValue)) +
  geom_boxplot() +
  facet_wrap(~ DataValueUnit, scales = "free_y") +
  labs(
    title = "Distribution of Asthma Indicator Values",
    x = "Year",
    y = "Reported value"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Mean, standard deviation, minimum, and maximum of the reported asthma data values for the US across all years.
# Mean of the US asthma values; na.rm = TRUE skips any NA produced when
# DataValue was converted from character to numeric.
mean(asthmadf$DataValue, na.rm = TRUE)
## [1] 149.246
# Standard deviation of the US asthma values; na.rm = TRUE skips any NA
# produced when DataValue was converted from character to numeric.
sd(asthmadf$DataValue, na.rm = TRUE)
## [1] 599.4269
# Smallest US asthma value; na.rm = TRUE skips any NA produced when
# DataValue was converted from character to numeric.
min(asthmadf$DataValue, na.rm = TRUE)
## [1] 6.5
# Largest US asthma value; na.rm = TRUE skips any NA produced when
# DataValue was converted from character to numeric.
max(asthmadf$DataValue, na.rm = TRUE)
## [1] 4145
Works Cited:
[1] Siracusa, M., et al. (2019, March 23). DATA 607 Project 3 Most Valued Data Science Skills. RPubs. https://rpubs.com/kleberperez/477939.
[2] OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat.
[3] Centers for Disease Control and Prevention. (2023). U.S. Chronic Disease Indicators (CDI). Catalog. https://catalog.data.gov/dataset/u-s-chronic-disease-indicators-cdi
[4] Centers for Disease Control and Prevention. (2015). Indicators for Chronic Disease Surveillance — United States, 2013. MMWR. https://www.cdc.gov/mmwr/pdf/rr/rr6401.pdf