# Set the CRAN mirror that this session uses when downloading packages.
options(repos = "https://cran.rstudio.com/")

The metadata was created on November 10, 2020 and updated on August 26, 2023. The dataset is published by the Centers for Disease Control and Prevention and was obtained from https://catalog.data.gov/dataset/u-s-chronic-disease-indicators-cdi. [3][4]

library(knitr)
library(kableExtra)
library(visdat)
library(naniar)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the raw Chronic Disease Indicators export from the working directory.
data_orig <- read.csv(file = "ChronicDisease.csv")
# Peek at the first five records to confirm the file parsed as expected.
head(x = data_orig, n = 5)
##   YearStart YearEnd LocationAbbr   LocationDesc DataSource
## 1      2010    2010           OR         Oregon       NVSS
## 2      2019    2019           AZ        Arizona      YRBSS
## 3      2019    2019           OH           Ohio      YRBSS
## 4      2019    2019           US  United States      YRBSS
## 5      2015    2015           VI Virgin Islands      YRBSS
##                    Topic                     Question Response DataValueUnit
## 1 Cardiovascular Disease Mortality from heart failure       NA              
## 2                Alcohol      Alcohol use among youth       NA             %
## 3                Alcohol      Alcohol use among youth       NA             %
## 4                Alcohol      Alcohol use among youth       NA             %
## 5                Alcohol      Alcohol use among youth       NA             %
##      DataValueType DataValue DataValueAlt DataValueFootnoteSymbol
## 1           Number        30         30.0                        
## 2 Crude Prevalence      29.5         29.5                        
## 3 Crude Prevalence      22.5         22.5                        
## 4 Crude Prevalence      13.9         13.9                        
## 5 Crude Prevalence                     NA                       -
##   DatavalueFootnote LowConfidenceLimit HighConfidenceLimit
## 1                                   NA                  NA
## 2                                 24.9                34.7
## 3                                 17.0                29.1
## 4                                 10.4                18.4
## 5 No data available                 NA                  NA
##   StratificationCategory1                  Stratification1
## 1          Race/Ethnicity American Indian or Alaska Native
## 2                  Gender                           Female
## 3                  Gender                             Male
## 4          Race/Ethnicity              Asian, non-Hispanic
## 5                  Gender                             Male
##   StratificationCategory2 Stratification2 StratificationCategory3
## 1                      NA              NA                      NA
## 2                      NA              NA                      NA
## 3                      NA              NA                      NA
## 4                      NA              NA                      NA
## 5                      NA              NA                      NA
##   Stratification3                                    GeoLocation ResponseID
## 1              NA  POINT (-120.15503132599969 44.56744942400047)         NA
## 2              NA POINT (-111.76381127699972 34.865970280000454)         NA
## 3              NA   POINT (-82.40426005599966 40.06021014100048)         NA
## 4              NA                                                        NA
## 5              NA                   POINT (-64.896335 18.335765)         NA
##   LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1
## 1         41     CVD     CVD1_4            NMBR                      RACE
## 2          4     ALC     ALC1_1         CRDPREV                    GENDER
## 3         39     ALC     ALC1_1         CRDPREV                    GENDER
## 4         59     ALC     ALC1_1         CRDPREV                      RACE
## 5         78     ALC     ALC1_1         CRDPREV                    GENDER
##   StratificationID1 StratificationCategoryID2 StratificationID2
## 1              AIAN                        NA                NA
## 2              GENF                        NA                NA
## 3              GENM                        NA                NA
## 4               ASN                        NA                NA
## 5              GENM                        NA                NA
##   StratificationCategoryID3 StratificationID3
## 1                        NA                NA
## 2                        NA                NA
## 3                        NA                NA
## 4                        NA                NA
## 5                        NA                NA
# Show the structure of the raw data: each column's type and first values.
str(data_orig)
## 'data.frame':    1185676 obs. of  34 variables:
##  $ YearStart                : int  2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
##  $ YearEnd                  : int  2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
##  $ LocationAbbr             : chr  "OR" "AZ" "OH" "US" ...
##  $ LocationDesc             : chr  "Oregon" "Arizona" "Ohio" "United States" ...
##  $ DataSource               : chr  "NVSS" "YRBSS" "YRBSS" "YRBSS" ...
##  $ Topic                    : chr  "Cardiovascular Disease" "Alcohol" "Alcohol" "Alcohol" ...
##  $ Question                 : chr  "Mortality from heart failure" "Alcohol use among youth" "Alcohol use among youth" "Alcohol use among youth" ...
##  $ Response                 : logi  NA NA NA NA NA NA ...
##  $ DataValueUnit            : chr  "" "%" "%" "%" ...
##  $ DataValueType            : chr  "Number" "Crude Prevalence" "Crude Prevalence" "Crude Prevalence" ...
##  $ DataValue                : chr  "30" "29.5" "22.5" "13.9" ...
##  $ DataValueAlt             : num  30 29.5 22.5 13.9 NA 59.1 55.6 58.1 62.6 NA ...
##  $ DataValueFootnoteSymbol  : chr  "" "" "" "" ...
##  $ DatavalueFootnote        : chr  "" "" "" "" ...
##  $ LowConfidenceLimit       : num  NA 24.9 17 10.4 NA 53.9 52.3 54.2 58.5 NA ...
##  $ HighConfidenceLimit      : num  NA 34.7 29.1 18.4 NA 64 58.9 61.8 66.6 NA ...
##  $ StratificationCategory1  : chr  "Race/Ethnicity" "Gender" "Gender" "Race/Ethnicity" ...
##  $ Stratification1          : chr  "American Indian or Alaska Native" "Female" "Male" "Asian, non-Hispanic" ...
##  $ StratificationCategory2  : logi  NA NA NA NA NA NA ...
##  $ Stratification2          : logi  NA NA NA NA NA NA ...
##  $ StratificationCategory3  : logi  NA NA NA NA NA NA ...
##  $ Stratification3          : logi  NA NA NA NA NA NA ...
##  $ GeoLocation              : chr  "POINT (-120.15503132599969 44.56744942400047)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-82.40426005599966 40.06021014100048)" "" ...
##  $ ResponseID               : logi  NA NA NA NA NA NA ...
##  $ LocationID               : int  41 4 39 59 78 1 10 12 20 28 ...
##  $ TopicID                  : chr  "CVD" "ALC" "ALC" "ALC" ...
##  $ QuestionID               : chr  "CVD1_4" "ALC1_1" "ALC1_1" "ALC1_1" ...
##  $ DataValueTypeID          : chr  "NMBR" "CRDPREV" "CRDPREV" "CRDPREV" ...
##  $ StratificationCategoryID1: chr  "RACE" "GENDER" "GENDER" "RACE" ...
##  $ StratificationID1        : chr  "AIAN" "GENF" "GENM" "ASN" ...
##  $ StratificationCategoryID2: logi  NA NA NA NA NA NA ...
##  $ StratificationID2        : logi  NA NA NA NA NA NA ...
##  $ StratificationCategoryID3: logi  NA NA NA NA NA NA ...
##  $ StratificationID3        : logi  NA NA NA NA NA NA ...
# Per-column summary of the raw data (quartiles for numeric columns, NA counts).
summary(data_orig)
##    YearStart       YearEnd     LocationAbbr       LocationDesc      
##  Min.   :2001   Min.   :2001   Length:1185676     Length:1185676    
##  1st Qu.:2013   1st Qu.:2013   Class :character   Class :character  
##  Median :2015   Median :2016   Mode  :character   Mode  :character  
##  Mean   :2015   Mean   :2016                                        
##  3rd Qu.:2018   3rd Qu.:2018                                        
##  Max.   :2021   Max.   :2021                                        
##                                                                     
##   DataSource           Topic             Question         Response      
##  Length:1185676     Length:1185676     Length:1185676     Mode:logical  
##  Class :character   Class :character   Class :character   NA's:1185676  
##  Mode  :character   Mode  :character   Mode  :character                 
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  DataValueUnit      DataValueType       DataValue          DataValueAlt      
##  Length:1185676     Length:1185676     Length:1185676     Min.   :      0.0  
##  Class :character   Class :character   Class :character   1st Qu.:     16.1  
##  Mode  :character   Mode  :character   Mode  :character   Median :     40.0  
##                                                           Mean   :   1005.3  
##                                                           3rd Qu.:     76.0  
##                                                           Max.   :2925456.0  
##                                                           NA's   :381098     
##  DataValueFootnoteSymbol DatavalueFootnote  LowConfidenceLimit
##  Length:1185676          Length:1185676     Min.   :   0.0    
##  Class :character        Class :character   1st Qu.:  11.0    
##  Mode  :character        Mode  :character   Median :  28.5    
##                                             Mean   :  50.3    
##                                             3rd Qu.:  56.3    
##                                             Max.   :2541.6    
##                                             NA's   :503296    
##  HighConfidenceLimit StratificationCategory1 Stratification1   
##  Min.   :   0.0      Length:1185676          Length:1185676    
##  1st Qu.:  16.3      Class :character        Class :character  
##  Median :  41.0      Mode  :character        Mode  :character  
##  Mean   :  61.9                                                
##  3rd Qu.:  71.1                                                
##  Max.   :3530.5                                                
##  NA's   :503296                                                
##  StratificationCategory2 Stratification2 StratificationCategory3
##  Mode:logical            Mode:logical    Mode:logical           
##  NA's:1185676            NA's:1185676    NA's:1185676           
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  Stratification3 GeoLocation        ResponseID       LocationID   
##  Mode:logical    Length:1185676     Mode:logical   Min.   : 1.00  
##  NA's:1185676    Class :character   NA's:1185676   1st Qu.:17.00  
##                  Mode  :character                  Median :30.00  
##                                                    Mean   :30.79  
##                                                    3rd Qu.:45.00  
##                                                    Max.   :78.00  
##                                                                   
##    TopicID           QuestionID        DataValueTypeID   
##  Length:1185676     Length:1185676     Length:1185676    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  StratificationCategoryID1 StratificationID1  StratificationCategoryID2
##  Length:1185676            Length:1185676     Mode:logical             
##  Class :character          Class :character   NA's:1185676             
##  Mode  :character          Mode  :character                            
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  StratificationID2 StratificationCategoryID3 StratificationID3
##  Mode:logical      Mode:logical              Mode:logical     
##  NA's:1185676      NA's:1185676              NA's:1185676     
##                                                               
##                                                               
##                                                               
##                                                               
## 

[1]

# Render the first ten raw records as a small-font table inside a scroll box.
data_orig %>%
  head(10) %>%
  kable() %>%
  kable_styling(font_size = 10) %>%
  scroll_box(height = "500px")
YearStart YearEnd LocationAbbr LocationDesc DataSource Topic Question Response DataValueUnit DataValueType DataValue DataValueAlt DataValueFootnoteSymbol DatavalueFootnote LowConfidenceLimit HighConfidenceLimit StratificationCategory1 Stratification1 StratificationCategory2 Stratification2 StratificationCategory3 Stratification3 GeoLocation ResponseID LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1 StratificationID1 StratificationCategoryID2 StratificationID2 StratificationCategoryID3 StratificationID3
2010 2010 OR Oregon NVSS Cardiovascular Disease Mortality from heart failure NA Number 30 30.0 NA NA Race/Ethnicity American Indian or Alaska Native NA NA NA NA POINT (-120.15503132599969 44.56744942400047) NA 41 CVD CVD1_4 NMBR RACE AIAN NA NA NA NA
2019 2019 AZ Arizona YRBSS Alcohol Alcohol use among youth NA % Crude Prevalence 29.5 29.5 24.9 34.7 Gender Female NA NA NA NA POINT (-111.76381127699972 34.865970280000454) NA 4 ALC ALC1_1 CRDPREV GENDER GENF NA NA NA NA
2019 2019 OH Ohio YRBSS Alcohol Alcohol use among youth NA % Crude Prevalence 22.5 22.5 17.0 29.1 Gender Male NA NA NA NA POINT (-82.40426005599966 40.06021014100048) NA 39 ALC ALC1_1 CRDPREV GENDER GENM NA NA NA NA
2019 2019 US United States YRBSS Alcohol Alcohol use among youth NA % Crude Prevalence 13.9 13.9 10.4 18.4 Race/Ethnicity Asian, non-Hispanic NA NA NA NA NA 59 ALC ALC1_1 CRDPREV RACE ASN NA NA NA NA
2015 2015 VI Virgin Islands YRBSS Alcohol Alcohol use among youth NA % Crude Prevalence NA
No data available NA NA Gender Male NA NA NA NA POINT (-64.896335 18.335765) NA 78 ALC ALC1_1 CRDPREV GENDER GENM NA NA NA NA
2020 2020 AL Alabama PRAMS Alcohol Alcohol use before pregnancy NA % Crude Prevalence 59.1 59.1 53.9 64.0 Race/Ethnicity White, non-Hispanic NA NA NA NA POINT (-86.63186076199969 32.84057112200048) NA 1 ALC ALC1_2 CRDPREV RACE WHT NA NA NA NA
2015 2015 DE Delaware PRAMS Alcohol Alcohol use before pregnancy NA % Crude Prevalence 55.6 55.6 52.3 58.9 Overall Overall NA NA NA NA POINT (-75.57774116799965 39.008830667000495) NA 10 ALC ALC1_2 CRDPREV OVERALL OVR NA NA NA NA
2019 2019 FL Florida PRAMS Alcohol Alcohol use before pregnancy NA % Crude Prevalence 58.1 58.1 54.2 61.8 Overall Overall NA NA NA NA POINT (-81.92896053899966 28.932040377000476) NA 12 ALC ALC1_2 CRDPREV OVERALL OVR NA NA NA NA
2018 2018 KS Kansas PRAMS Alcohol Alcohol use before pregnancy NA % Crude Prevalence 62.6 62.6 58.5 66.6 Overall Overall NA NA NA NA POINT (-98.20078122699965 38.34774030000045) NA 20 ALC ALC1_2 CRDPREV OVERALL OVR NA NA NA NA
2013 2013 MS Mississippi PRAMS Alcohol Alcohol use before pregnancy NA % Crude Prevalence NA
No data available NA NA Overall Overall NA NA NA NA POINT (-89.53803082499968 32.745510099000455) NA 28 ALC ALC1_2 CRDPREV OVERALL OVR NA NA NA NA

Visualizing the proportion of missing data per variable and across the dataset as a whole.

# Plot which cells are missing in each column; the large-data warning is
# switched off because the data frame has over a million rows.
data_orig %>% vis_miss(warn_large_data = FALSE)

# Companion overview plot of the same data frame from visdat.
data_orig %>% vis_dat(warn_large_data = FALSE)

# Tabulate the missing count and percentage for every column.
data_orig %>% miss_var_summary()
## # A tibble: 34 × 3
##    variable                   n_miss pct_miss
##    <chr>                       <int>    <dbl>
##  1 Response                  1185676      100
##  2 StratificationCategory2   1185676      100
##  3 Stratification2           1185676      100
##  4 StratificationCategory3   1185676      100
##  5 Stratification3           1185676      100
##  6 ResponseID                1185676      100
##  7 StratificationCategoryID2 1185676      100
##  8 StratificationID2         1185676      100
##  9 StratificationCategoryID3 1185676      100
## 10 StratificationID3         1185676      100
## # ℹ 24 more rows

Deleting all variables that contain more than 10% missing data.

# Drop every column whose share of NA values exceeds 10%.
# Note: is.na() only flags true NA values. Empty strings ("") in character
# columns such as DataValue are not counted as missing here.
missing_proportions <- colMeans(is.na(data_orig))
columns_to_remove <- names(missing_proportions)[missing_proportions > 0.10]
# drop = FALSE keeps the result a data frame even if only one column survives.
data1 <- data_orig[, !names(data_orig) %in% columns_to_remove, drop = FALSE]
str(data1)
## 'data.frame':    1185676 obs. of  21 variables:
##  $ YearStart                : int  2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
##  $ YearEnd                  : int  2010 2019 2019 2019 2015 2020 2015 2019 2018 2013 ...
##  $ LocationAbbr             : chr  "OR" "AZ" "OH" "US" ...
##  $ LocationDesc             : chr  "Oregon" "Arizona" "Ohio" "United States" ...
##  $ DataSource               : chr  "NVSS" "YRBSS" "YRBSS" "YRBSS" ...
##  $ Topic                    : chr  "Cardiovascular Disease" "Alcohol" "Alcohol" "Alcohol" ...
##  $ Question                 : chr  "Mortality from heart failure" "Alcohol use among youth" "Alcohol use among youth" "Alcohol use among youth" ...
##  $ DataValueUnit            : chr  "" "%" "%" "%" ...
##  $ DataValueType            : chr  "Number" "Crude Prevalence" "Crude Prevalence" "Crude Prevalence" ...
##  $ DataValue                : chr  "30" "29.5" "22.5" "13.9" ...
##  $ DataValueFootnoteSymbol  : chr  "" "" "" "" ...
##  $ DatavalueFootnote        : chr  "" "" "" "" ...
##  $ StratificationCategory1  : chr  "Race/Ethnicity" "Gender" "Gender" "Race/Ethnicity" ...
##  $ Stratification1          : chr  "American Indian or Alaska Native" "Female" "Male" "Asian, non-Hispanic" ...
##  $ GeoLocation              : chr  "POINT (-120.15503132599969 44.56744942400047)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-82.40426005599966 40.06021014100048)" "" ...
##  $ LocationID               : int  41 4 39 59 78 1 10 12 20 28 ...
##  $ TopicID                  : chr  "CVD" "ALC" "ALC" "ALC" ...
##  $ QuestionID               : chr  "CVD1_4" "ALC1_1" "ALC1_1" "ALC1_1" ...
##  $ DataValueTypeID          : chr  "NMBR" "CRDPREV" "CRDPREV" "CRDPREV" ...
##  $ StratificationCategoryID1: chr  "RACE" "GENDER" "GENDER" "RACE" ...
##  $ StratificationID1        : chr  "AIAN" "GENF" "GENM" "ASN" ...
# Compact column-wise preview of the reduced data frame.
glimpse(data1)
## Rows: 1,185,676
## Columns: 21
## $ YearStart                 <int> 2010, 2019, 2019, 2019, 2015, 2020, 2015, 20…
## $ YearEnd                   <int> 2010, 2019, 2019, 2019, 2015, 2020, 2015, 20…
## $ LocationAbbr              <chr> "OR", "AZ", "OH", "US", "VI", "AL", "DE", "F…
## $ LocationDesc              <chr> "Oregon", "Arizona", "Ohio", "United States"…
## $ DataSource                <chr> "NVSS", "YRBSS", "YRBSS", "YRBSS", "YRBSS", …
## $ Topic                     <chr> "Cardiovascular Disease", "Alcohol", "Alcoho…
## $ Question                  <chr> "Mortality from heart failure", "Alcohol use…
## $ DataValueUnit             <chr> "", "%", "%", "%", "%", "%", "%", "%", "%", …
## $ DataValueType             <chr> "Number", "Crude Prevalence", "Crude Prevale…
## $ DataValue                 <chr> "30", "29.5", "22.5", "13.9", "", "59.1", "5…
## $ DataValueFootnoteSymbol   <chr> "", "", "", "", "-", "", "", "", "", "-", ""…
## $ DatavalueFootnote         <chr> "", "", "", "", "No data available", "", "",…
## $ StratificationCategory1   <chr> "Race/Ethnicity", "Gender", "Gender", "Race/…
## $ Stratification1           <chr> "American Indian or Alaska Native", "Female"…
## $ GeoLocation               <chr> "POINT (-120.15503132599969 44.5674494240004…
## $ LocationID                <int> 41, 4, 39, 59, 78, 1, 10, 12, 20, 28, 34, 42…
## $ TopicID                   <chr> "CVD", "ALC", "ALC", "ALC", "ALC", "ALC", "A…
## $ QuestionID                <chr> "CVD1_4", "ALC1_1", "ALC1_1", "ALC1_1", "ALC…
## $ DataValueTypeID           <chr> "NMBR", "CRDPREV", "CRDPREV", "CRDPREV", "CR…
## $ StratificationCategoryID1 <chr> "RACE", "GENDER", "GENDER", "RACE", "GENDER"…
## $ StratificationID1         <chr> "AIAN", "GENF", "GENM", "ASN", "GENM", "WHT"…

States and territories ranked by their number of chronic disease records, from most to least

# Number of records per location, largest first.
data1 %>%
  count(LocationDesc, sort = TRUE)
##            LocationDesc     n
## 1              New York 22556
## 2             Wisconsin 22556
## 3            New Jersey 22550
## 4              Nebraska 22518
## 5                  Iowa 22510
## 6               Vermont 22490
## 7            New Mexico 22457
## 8            Washington 22457
## 9              Michigan 22422
## 10               Hawaii 22420
## 11             Colorado 22417
## 12        West Virginia 22407
## 13             Arkansas 22384
## 14             Kentucky 22375
## 15               Oregon 22352
## 16                 Utah 22352
## 17        Massachusetts 22349
## 18             Maryland 22346
## 19       North Carolina 22346
## 20              Florida 22311
## 21         Rhode Island 22273
## 22              Arizona 22271
## 23         South Dakota 22264
## 24               Nevada 22241
## 25       South Carolina 22235
## 26             Missouri 22163
## 27               Alaska 22157
## 28             Illinois 22157
## 29         Pennsylvania 22157
## 30             Delaware 22151
## 31                Maine 22142
## 32              Wyoming 22140
## 33        New Hampshire 22104
## 34          Mississippi 22100
## 35             Oklahoma 22093
## 36          Connecticut 22082
## 37              Alabama 22058
## 38             Virginia 22058
## 39            Louisiana 22053
## 40            Tennessee 22052
## 41           California 22037
## 42            Minnesota 22023
## 43               Kansas 21988
## 44              Georgia 21980
## 45              Montana 21973
## 46         North Dakota 21970
## 47                 Ohio 21953
## 48 District of Columbia 21936
## 49                Texas 21918
## 50              Indiana 21872
## 51                Idaho 21842
## 52          Puerto Rico 14406
## 53       Virgin Islands 14077
## 54                 Guam 14009
## 55        United States 10166

Frequency of chronic disease topics. The graph shows cancer having the highest frequency and disability the lowest. [2]

# Tabulate how many records fall under each topic (columns: Var1, Freq).
topic <- as.data.frame(table(data1$Topic))
# Bar chart of topic counts, tallest bar first; bars are coloured per topic
# and the redundant legend is hidden.
ggplot(data = topic, mapping = aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)) +
  geom_col() +
  ggtitle("Frequency of Chronic Diseases Topics") +
  xlab("Chronic Diseases") +
  ylab("Frequency") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Keep only the records whose topic is asthma.
data2 <- filter(data1, Topic == "Asthma")

Frequency of chronic disease topics by race/ethnicity. As the graph shows, Whites, Hispanics, and Blacks have a higher frequency of chronic disease, while Asian, non-Hispanics have the lowest. [2]

# Count records per stratification level (columns: Var1, Freq).
# Stratification1 mixes race/ethnicity levels with gender and overall levels;
# StratificationCategory1 says which category each row belongs to.
Strat1 <- as.data.frame(table(data1$Stratification1))
# Keep only the levels in the Race/Ethnicity category so the chart matches its
# title and axis label instead of also drawing gender and overall bars.
race_levels <- unique(
  data1$Stratification1[data1$StratificationCategory1 == "Race/Ethnicity"]
)
race_counts <- droplevels(Strat1[Strat1$Var1 %in% race_levels, , drop = FALSE])
# Bar chart of record counts per race/ethnicity group, tallest bar first.
ggplot(race_counts, aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)) +
  geom_col() +
  theme_minimal() +
  labs(
    x = "Race/Ethnicity",
    y = "Frequency",
    title = "Frequency of Chronic Diseases Topics by Race/Ethnicity"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

[1]

# Tabulate how many records each location abbreviation has (columns: Var1, Freq).
LocationAbbr <- as.data.frame(table(data1$LocationAbbr))
# Bar chart of record counts per location, in the table's level order, with
# slanted axis labels and no legend.
ggplot(data = LocationAbbr, mapping = aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  labs(
    title = "Frequency of Chronic Diseases by State",
    x = "Location by State"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Table of the ten location abbreviations with the most records.
data1 %>%
  count(LocationAbbr, sort = TRUE) %>%
  head(n = 10) %>%
  kable() %>%
  kable_styling()
LocationAbbr n
NY 22556
WI 22556
NJ 22550
NE 22518
IA 22510
VT 22490
NM 22457
WA 22457
MI 22422
HI 22420
# Subset to asthma records and report the subset's row and column counts.
data2 <- filter(data1, Topic == "Asthma")
dim(data2)
## [1] 80342    21

[1]

# Show the first ten asthma records as a striped, full-width, scrollable table.
data2 %>%
  head(10) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE, # TRUE rather than T: T is a plain variable that can be reassigned
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
YearStart YearEnd LocationAbbr LocationDesc DataSource Topic Question DataValueUnit DataValueType DataValue DataValueFootnoteSymbol DatavalueFootnote StratificationCategory1 Stratification1 GeoLocation LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1 StratificationID1
2016 2016 AR Arkansas SEDD; SID Asthma Emergency department visit rate for asthma Number 5285 Gender Male POINT (-92.27449074299966 34.74865012400045) 5 AST AST2_1 NMBR GENDER GENM
2017 2017 AZ Arizona SEDD; SID Asthma Emergency department visit rate for asthma Number 13497 Gender Male POINT (-111.76381127699972 34.865970280000454) 4 AST AST2_1 NMBR GENDER GENM
2016 2016 CA California SEDD; SID Asthma Emergency department visit rate for asthma Number
No data available Race/Ethnicity Black, non-Hispanic POINT (-120.99999953799971 37.63864012300047) 6 AST AST2_1 NMBR RACE BLK
2015 2015 CO Colorado SEDD; SID Asthma Emergency department visit rate for asthma Number
No data available Race/Ethnicity Hispanic POINT (-106.13361092099967 38.843840757000464) 8 AST AST2_1 NMBR RACE HIS
2017 2017 CO Colorado SEDD; SID Asthma Emergency department visit rate for asthma Number
No data available Gender Male POINT (-106.13361092099967 38.843840757000464) 8 AST AST2_1 NMBR GENDER GENM
2014 2014 CT Connecticut SEDD; SID Asthma Emergency department visit rate for asthma Number
No data available Race/Ethnicity Hispanic POINT (-72.64984095199964 41.56266102000046) 9 AST AST2_1 NMBR RACE HIS
2016 2016 DE Delaware SEDD; SID Asthma Emergency department visit rate for asthma Number
No data available Gender Female POINT (-75.57774116799965 39.008830667000495) 10 AST AST2_1 NMBR GENDER GENF
2014 2014 FL Florida SEDD; SID Asthma Emergency department visit rate for asthma Number 131559 Overall Overall POINT (-81.92896053899966 28.932040377000476) 12 AST AST2_1 NMBR OVERALL OVR
2016 2016 FL Florida SEDD; SID Asthma Emergency department visit rate for asthma Number 5751 Race/Ethnicity Black, non-Hispanic POINT (-81.92896053899966 28.932040377000476) 12 AST AST2_1 NMBR RACE BLK
2017 2017 FL Florida SEDD; SID Asthma Emergency department visit rate for asthma cases per 10,000 Age-adjusted Rate 7.12 Race/Ethnicity Hispanic POINT (-81.92896053899966 28.932040377000476) 12 AST AST2_1 AGEADJRATE RACE HIS
# Show the last ten asthma records as a striped, full-width, scrollable table.
data2 %>%
  tail(10) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE, # TRUE rather than T: T is a plain variable that can be reassigned
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
YearStart YearEnd LocationAbbr LocationDesc DataSource Topic Question DataValueUnit DataValueType DataValue DataValueFootnoteSymbol DatavalueFootnote StratificationCategory1 Stratification1 GeoLocation LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1 StratificationID1
80333 2020 2020 WY Wyoming BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence 34.2 Overall Overall POINT (-108.10983035299967 43.23554134300048) 56 AST AST6_1 AGEADJPREV OVERALL OVR
80334 2020 2020 WY Wyoming BRFSS Asthma Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma % Crude Prevalence 71.7 Overall Overall POINT (-108.10983035299967 43.23554134300048) 56 AST AST5_2 CRDPREV OVERALL OVR
80335 2020 2020 WY Wyoming BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma % Crude Prevalence 89.3 Overall Overall POINT (-108.10983035299967 43.23554134300048) 56 AST AST6_2 CRDPREV OVERALL OVR
80336 2020 2020 WY Wyoming BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence 36.3 Gender Female POINT (-108.10983035299967 43.23554134300048) 56 AST AST6_1 AGEADJPREV GENDER GENF
80337 2020 2020 WY Wyoming BRFSS Asthma Current asthma prevalence among adults aged >= 18 years % Crude Prevalence 7.7 Race/Ethnicity Hispanic POINT (-108.10983035299967 43.23554134300048) 56 AST AST1_1 CRDPREV RACE HIS
80338 2020 2020 WY Wyoming BRFSS Asthma Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence **** Sample size of denominator and/or age group for age-standardization is less than 50 or relative standard error is more than 30% Race/Ethnicity Black, non-Hispanic POINT (-108.10983035299967 43.23554134300048) 56 AST AST5_1 AGEADJPREV RACE BLK
80339 2020 2020 WY Wyoming BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma % Age-adjusted Prevalence **** Sample size of denominator and/or age group for age-standardization is less than 50 or relative standard error is more than 30% Race/Ethnicity Black, non-Hispanic POINT (-108.10983035299967 43.23554134300048) 56 AST AST6_2 AGEADJPREV RACE BLK
80340 2020 2020 WY Wyoming BRFSS Asthma Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma % Crude Prevalence 68.9 Gender Female POINT (-108.10983035299967 43.23554134300048) 56 AST AST5_2 CRDPREV GENDER GENF
80341 2020 2020 WY Wyoming BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma % Crude Prevalence 88.4 Gender Female POINT (-108.10983035299967 43.23554134300048) 56 AST AST6_2 CRDPREV GENDER GENF
80342 2020 2020 WY Wyoming BRFSS Asthma Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence 43.1 Gender Female POINT (-108.10983035299967 43.23554134300048) 56 AST AST5_1 AGEADJPREV GENDER GENF
# Build the national asthma subset: rows whose topic is asthma and whose
# location is the US aggregate. YearEnd becomes a factor and DataValue is
# converted from text to numeric (blank strings become NA).
asthmadf <- data1 %>%
  filter(Topic == "Asthma", LocationAbbr == "US") %>%
  mutate(
    YearEnd = as.factor(YearEnd),
    DataValue = as.numeric(DataValue)
  )
str(asthmadf)
## 'data.frame':    374 obs. of  21 variables:
##  $ YearStart                : int  2017 2010 2016 2012 2020 2015 2018 2017 2011 2011 ...
##  $ YearEnd                  : Factor w/ 12 levels "2010","2011",..: 8 1 7 3 11 6 9 8 2 2 ...
##  $ LocationAbbr             : chr  "US" "US" "US" "US" ...
##  $ LocationDesc             : chr  "United States" "United States" "United States" "United States" ...
##  $ DataSource               : chr  "NVSS" "NVSS" "NVSS" "NVSS" ...
##  $ Topic                    : chr  "Asthma" "Asthma" "Asthma" "Asthma" ...
##  $ Question                 : chr  "Asthma mortality rate" "Asthma mortality rate" "Asthma mortality rate" "Asthma mortality rate" ...
##  $ DataValueUnit            : chr  "cases per 1,000,000" "cases per 1,000,000" "cases per 1,000,000" "cases per 1,000,000" ...
##  $ DataValueType            : chr  "Crude Rate" "Age-adjusted Rate" "Crude Rate" "Crude Rate" ...
##  $ DataValue                : num  10.9 10.5 10.9 11.2 12.6 ...
##  $ DataValueFootnoteSymbol  : chr  "" "" "" "" ...
##  $ DatavalueFootnote        : chr  "" "" "" "" ...
##  $ StratificationCategory1  : chr  "Overall" "Overall" "Overall" "Overall" ...
##  $ Stratification1          : chr  "Overall" "Overall" "Overall" "Overall" ...
##  $ GeoLocation              : chr  "" "" "" "" ...
##  $ LocationID               : int  59 59 59 59 59 59 59 59 59 59 ...
##  $ TopicID                  : chr  "AST" "AST" "AST" "AST" ...
##  $ QuestionID               : chr  "AST4_1" "AST4_1" "AST4_1" "AST4_1" ...
##  $ DataValueTypeID          : chr  "CRDRATE" "AGEADJRATE" "CRDRATE" "CRDRATE" ...
##  $ StratificationCategoryID1: chr  "OVERALL" "OVERALL" "OVERALL" "OVERALL" ...
##  $ StratificationID1        : chr  "OVR" "OVR" "OVR" "OVR" ...
# Column-wise preview of the US asthma subset (one line per column).
dplyr::glimpse(asthmadf)
## Rows: 374
## Columns: 21
## $ YearStart                 <int> 2017, 2010, 2016, 2012, 2020, 2015, 2018, 20…
## $ YearEnd                   <fct> 2017, 2010, 2016, 2012, 2020, 2015, 2018, 20…
## $ LocationAbbr              <chr> "US", "US", "US", "US", "US", "US", "US", "U…
## $ LocationDesc              <chr> "United States", "United States", "United St…
## $ DataSource                <chr> "NVSS", "NVSS", "NVSS", "NVSS", "NVSS", "NVS…
## $ Topic                     <chr> "Asthma", "Asthma", "Asthma", "Asthma", "Ast…
## $ Question                  <chr> "Asthma mortality rate", "Asthma mortality r…
## $ DataValueUnit             <chr> "cases per 1,000,000", "cases per 1,000,000"…
## $ DataValueType             <chr> "Crude Rate", "Age-adjusted Rate", "Crude Ra…
## $ DataValue                 <dbl> 10.9, 10.5, 10.9, 11.2, 12.6, 10.3, 10.5, 35…
## $ DataValueFootnoteSymbol   <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ DatavalueFootnote         <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ StratificationCategory1   <chr> "Overall", "Overall", "Overall", "Overall", …
## $ Stratification1           <chr> "Overall", "Overall", "Overall", "Overall", …
## $ GeoLocation               <chr> "", "", "", "", "", "", "", "", "", "", "", …
## $ LocationID                <int> 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, …
## $ TopicID                   <chr> "AST", "AST", "AST", "AST", "AST", "AST", "A…
## $ QuestionID                <chr> "AST4_1", "AST4_1", "AST4_1", "AST4_1", "AST…
## $ DataValueTypeID           <chr> "CRDRATE", "AGEADJRATE", "CRDRATE", "CRDRATE…
## $ StratificationCategoryID1 <chr> "OVERALL", "OVERALL", "OVERALL", "OVERALL", …
## $ StratificationID1         <chr> "OVR", "OVR", "OVR", "OVR", "OVR", "OVR", "O…
# Asthma subset across every location: keep rows whose Topic is "Asthma",
# then store YearEnd as a factor and DataValue as a numeric column.
asthmadf2 <- data1 %>%
  filter(Topic == "Asthma") %>%
  mutate(
    YearEnd = as.factor(YearEnd),
    DataValue = as.numeric(DataValue)
  )

# Print the structure of the subset to check the column types.
str(asthmadf2)
## 'data.frame':    80342 obs. of  21 variables:
##  $ YearStart                : int  2016 2017 2016 2015 2017 2014 2016 2014 2016 2017 ...
##  $ YearEnd                  : Factor w/ 12 levels "2010","2011",..: 7 8 7 6 8 5 7 5 7 8 ...
##  $ LocationAbbr             : chr  "AR" "AZ" "CA" "CO" ...
##  $ LocationDesc             : chr  "Arkansas" "Arizona" "California" "Colorado" ...
##  $ DataSource               : chr  "SEDD; SID" "SEDD; SID" "SEDD; SID" "SEDD; SID" ...
##  $ Topic                    : chr  "Asthma" "Asthma" "Asthma" "Asthma" ...
##  $ Question                 : chr  "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" "Emergency department visit rate for asthma" ...
##  $ DataValueUnit            : chr  "" "" "" "" ...
##  $ DataValueType            : chr  "Number" "Number" "Number" "Number" ...
##  $ DataValue                : num  5285 13497 NA NA NA ...
##  $ DataValueFootnoteSymbol  : chr  "" "" "-" "-" ...
##  $ DatavalueFootnote        : chr  "" "" "No data available" "No data available" ...
##  $ StratificationCategory1  : chr  "Gender" "Gender" "Race/Ethnicity" "Race/Ethnicity" ...
##  $ Stratification1          : chr  "Male" "Male" "Black, non-Hispanic" "Hispanic" ...
##  $ GeoLocation              : chr  "POINT (-92.27449074299966 34.74865012400045)" "POINT (-111.76381127699972 34.865970280000454)" "POINT (-120.99999953799971 37.63864012300047)" "POINT (-106.13361092099967 38.843840757000464)" ...
##  $ LocationID               : int  5 4 6 8 8 9 10 12 12 12 ...
##  $ TopicID                  : chr  "AST" "AST" "AST" "AST" ...
##  $ QuestionID               : chr  "AST2_1" "AST2_1" "AST2_1" "AST2_1" ...
##  $ DataValueTypeID          : chr  "NMBR" "NMBR" "NMBR" "NMBR" ...
##  $ StratificationCategoryID1: chr  "GENDER" "GENDER" "RACE" "RACE" ...
##  $ StratificationID1        : chr  "GENM" "GENM" "BLK" "HIS" ...

[1]

# Show the first five rows of the US asthma subset as a striped,
# full-width table inside a 500px-high scroll box.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change `full_width`.
kable(head(asthmadf, 5)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
YearStart YearEnd LocationAbbr LocationDesc DataSource Topic Question DataValueUnit DataValueType DataValue DataValueFootnoteSymbol DatavalueFootnote StratificationCategory1 Stratification1 GeoLocation LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1 StratificationID1
2017 2017 US United States NVSS Asthma Asthma mortality rate cases per 1,000,000 Crude Rate 10.9 Overall Overall 59 AST AST4_1 CRDRATE OVERALL OVR
2010 2010 US United States NVSS Asthma Asthma mortality rate cases per 1,000,000 Age-adjusted Rate 10.5 Overall Overall 59 AST AST4_1 AGEADJRATE OVERALL OVR
2016 2016 US United States NVSS Asthma Asthma mortality rate cases per 1,000,000 Crude Rate 10.9 Overall Overall 59 AST AST4_1 CRDRATE OVERALL OVR
2012 2012 US United States NVSS Asthma Asthma mortality rate cases per 1,000,000 Crude Rate 11.2 Overall Overall 59 AST AST4_1 CRDRATE OVERALL OVR
2020 2020 US United States NVSS Asthma Asthma mortality rate cases per 1,000,000 Crude Rate 12.6 Overall Overall 59 AST AST4_1 CRDRATE OVERALL OVR
# Show the last five rows of the US asthma subset as a striped,
# full-width table inside a 500px-high scroll box.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change `full_width`.
kable(tail(asthmadf, 5)) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = TRUE,
    font_size = 10
  ) %>%
  scroll_box(height = "500px")
YearStart YearEnd LocationAbbr LocationDesc DataSource Topic Question DataValueUnit DataValueType DataValue DataValueFootnoteSymbol DatavalueFootnote StratificationCategory1 Stratification1 GeoLocation LocationID TopicID QuestionID DataValueTypeID StratificationCategoryID1 StratificationID1
370 2014 2014 US United States BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma % Age-adjusted Prevalence 86.2
50 States + DC: US Median Gender Female 59 AST AST6_2 AGEADJPREV GENDER GENF
371 2018 2018 US United States BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma % Crude Prevalence 87.5 ** US estimate/number is based on fewer than 50 states and the District of Columbia Gender Female 59 AST AST6_2 CRDPREV GENDER GENF
372 2016 2016 US United States BRFSS Asthma Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence 36.4 ** US estimate/number is based on fewer than 50 states and the District of Columbia Gender Male 59 AST AST5_1 AGEADJPREV GENDER GENM
373 2017 2017 US United States BRFSS Asthma Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma % Age-adjusted Prevalence 44.0
50 States + DC: US Median Overall Overall 59 AST AST6_1 AGEADJPREV OVERALL OVR
374 2014 2014 US United States BRFSS Asthma Current asthma prevalence among adults aged >= 18 years % Age-adjusted Prevalence 6.7
50 States + DC: US Median Gender Male 59 AST AST1_1 AGEADJPREV GENDER GENM

[1]

# Distinct survey questions present in the US asthma subset, in order of
# first appearance.
Questions <- unique(asthmadf[["Question"]])

# Render the question list as a scrollable table.
Questions %>%
  kable() %>%
  kable_styling(font_size = 12) %>%
  scroll_box(height = "300px")
x
Asthma mortality rate
Current asthma prevalence among adults aged >= 18 years
Asthma prevalence among women aged 18-44 years
Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma
Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma
Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma
Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma
# Yearly US asthma mortality rate (crude, overall population).
#
# The y axis is labelled "Cases per 1,000,000", but asthmadf also holds
# rows measured in other units (e.g. "%" prevalence and vaccination rows)
# and rows for individual strata (gender, race/ethnicity). Stacking all of
# them with stat = "identity" adds incompatible measures into one bar, so
# the data is first restricted to the rows that match the axis label:
# the mortality-rate question, crude rate, overall stratum.
asthmadf %>%
  filter(
    Question == "Asthma mortality rate",
    DataValueUnit == "cases per 1,000,000",
    DataValueType == "Crude Rate",
    Stratification1 == "Overall"
  ) %>%
  ggplot(aes(x = YearEnd, y = DataValue)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  labs(
    x = "Year",
    y = "Cases per 1,000,000",
    title = "US Asthma Mortality Rate (Crude, Overall)"
  )

# Per-year box plots of DataValue for the US asthma subset.
# NOTE(review): asthmadf mixes measures with different units (rates per
# 1,000,000 and "%" prevalences, among others), so the "Number of Cases"
# axis label may not describe every row -- confirm which measure is intended.
ggplot(data = asthmadf, aes(x = YearEnd, y = DataValue)) +
  geom_boxplot() +
  # Title typo fixed ("Distribtution" -> "Distribution").
  labs(title = "Distribution of Asthma Cases", y = "Number of Cases") +
  theme_minimal()

Mean, standard deviation, minimum, and maximum of the reported asthma data values for the US across all years:

# Summary statistics of DataValue over every row of the US asthma subset.
# NOTE(review): the subset mixes measures with different units, so these
# figures pool rates and percentages together -- confirm this is intended.

# Arithmetic mean
mean(asthmadf[["DataValue"]])
## [1] 149.246
# Standard deviation
sd(asthmadf[["DataValue"]])
## [1] 599.4269
# Smallest value
min(asthmadf[["DataValue"]])
## [1] 6.5
# Largest value
max(asthmadf[["DataValue"]])
## [1] 4145

Works Cited:

[1] Siracusa, M., et al. (2019, March 23). DATA 607 Project 3 Most Valued Data Science Skills. RPubs. https://rpubs.com/kleberperez/477939.

[2] OpenAI. (2023). ChatGPT (Mar 14 version) [Large language model]. https://chat.openai.com/chat.

[3] Centers for Disease Control and Prevention. (2023). U.S. Chronic Disease Indicators (CDI). Catalog. https://catalog.data.gov/dataset/u-s-chronic-disease-indicators-cdi

[4] Centers for Disease Control and Prevention. (2015). Indicators for Chronic Disease Surveillance — United States, 2013. MMWR. https://www.cdc.gov/mmwr/pdf/rr/rr6401.pdf