Data Dive

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# 
# The dataset is data from the 2021 National Health Interview Adult Survey. The survey contained questions related to household and family composition, demographics about the survey taker, satisfaction with life, health insurance, medication, immunization, preventive screenings, and multiple health problems such as hypertension, cardiovascular conditions, cancer, vision, hearing, mobility, and more. 

# This survey is important in following the health of American's based on many different factors of their lives. Looking at previous surveys can also help to see trends in Americans' health.
# 
# Questions
# 1. Does education level play a role in the mental or physical health?
# 2. What are some health issues that correlate to other health issues?
# 3: What health issues are more common among certain demographics?
# 4: Has COVID possibly had an effect on certain health issues?
# 5: Is there a link between physical health and mental health?
# 
# Columns
# 1. General Health: Scale of 1 to 9
#     1: Excellent
#     2: Very Good
#     3: Good
#     4: Fair
#     5: Poor
#     7: Refused
#     8: Not Ascertained
#     9: Don't Know
# 
# 2: Life Satisfaction: Scale of 1 to 4 and 7 to 9.
#     1: Very Satisfied
#     2: Satisfied
#     3: Dissatisfied
#     4: Very Dissatisfied
#     7: Refused
#     8: Not Ascertained
#     9: Don't Know
#     
# 3: General Demographics
#     Classification of County Lived In
#       1: Large central metro
#       2: Large fringe metro
#       3: Medium and small metro
#       4: Nonmetropolitan
#     Household Region
#       1: Northeast
#       2: Midwest
#       3: South
#       4: West
#     Age
#       18-84: 18-84 with number corresponding
#       85: 85+
#       97: Refused
#       98: Not Ascertained
#       99: Don't Know
#     Age 65+
#       1: Less than 65
#       2: 65 or older
#       7: Refused
#       8: Not Ascertained
#       9: Don't Know
#     Sex
#       1: Male
#       2: Female
#       7: Refused
#       8: Not Ascertained
#       9: Don't Know
#     Education Level
#       00: Never attended/Kindergarten only
#       01: Grade 1-11
#       02: 12th grade, no diploma
#       03: GED or equivalent
#       04: High School Graduate
#       05: Some college, no degree
#       06: Associate degree: occupational, technical, or vocational program
#       07: Associate degree: academic program
#       08: Bachelor's degree
#       09: Master's degree
#       10: Professional School or Doctoral degree
#       97: Refused
#       98: Not Ascertained
#       99: Don't Know
#     Weight: Person's weight in lbs
#     
# 4: For medical problems, questions were laid out such as 
#     Told you have (condition)?
#     Told you have (condition) on 2 or more visits?
#     Had (condition) in past 12 months?
#   with the answers being,
#     1: Yes. 1 answered if respondant is taking medication to control the issue
#     2: No
#     7: Refused
#     8: Not Ascertained
#     9: Don't Know
#     
# 5: Age when first told had (type) cancer?
#     01-84: 1-84 years, with the corresponding number
#     85: 85+ years
#     97: Refused
#     98: Not Ascertained
#     99: Don't Know
# 
# 6: Others
#   Days Missed Work
#     0-129: 0 to 129 with corresponding value
#     130: 130+ days
#     997: Refused
#     998: Not Ascertained
#     999: Don't Know"

 library(ggplot2)
#Weight
mean(adult22$WEIGHTLBTC_A)

## [1] 246.2174

max(adult22$WEIGHTLBTC_A)

## [1] 999

min(adult22$WEIGHTLBTC_A)

## [1] 100

#Age
mean(adult22$AGEP_A)

## [1] 53.05092

max(adult22$AGEP_A)

## [1] 99

min(adult22$AGEP_A)

## [1] 18

#Age65+
ggplot(adult22, aes(x = AGE65)) +
  geom_bar()

## Warning: Removed 27525 rows containing non-finite values (`stat_count()`).

#Sex
ggplot(adult22, aes(x = SEX_A)) +
  geom_bar()

#Education Level

ggplot(adult22, aes(x = EDUCP_A)) +
  geom_bar()

#General Health
mean(adult22$PHSTAT_A)

## [1] 2.440273

ggplot(adult22, aes(x = PHSTAT_A)) +
  geom_bar()

#Weight and Health

plot(adult22$WEIGHTLBTC_A, adult22$PHSTAT_A, xlab = "Weight", ylab = "General Health")

#Weight and Height

plot(adult22$WEIGHTLBTC_A, adult22$HEIGHTTC_A)

# Group_By

library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

dfEdu <- adult22 %>% group_by(adult22$EDUCP_A)

mean(dfEdu$EDUCP_A)

## [1] 6.443528

# which is an associate degree

# Probability of at least an associate degree (6, 7, 8, 9, 10)

prob_Associate_Up<- nrow(dfEdu[dfEdu$EDUCP_A >= '6' & dfEdu$EDUCP_A <= '10', ])

prob_All <- nrow(dfEdu)

prob_Associate_Up/prob_All

## [1] 0

# Probability of below grade 12

prob_Under_12 <- nrow(dfEdu[dfEdu$EDUCP_A <= '1', ])

prob_Under_12/prob_All

## [1] 0.06802647

# Probability of associate or higher and positive life satisfaction

prob_Associate_Satisfied <- nrow(dfEdu[dfEdu$EDUCP_A >= '6' & dfEdu$EDUCP_A <= '10' & dfEdu$LSATIS4_A <= '2', ])

prob_Associate_Satisfied/prob_Associate_Up

## [1] NaN

#Probability of below grade 12 and satisfied
prob_Under12_Satisfied <- nrow(dfEdu[dfEdu$EDUCP_A <= '1' & dfEdu$LSATIS4_A <= '2', ])

prob_Under12_Satisfied/prob_Under_12

## [1] 0.917597

#Probability of normal BMI(18.5 to 24.9) and general health

dfHealth <- adult22 %>% group_by(adult22$PHSTAT_A)

prob_NormBMI <- nrow(dfHealth[dfHealth$BMICAT_A == '2', ])

prob_NormBMI/prob_All

## [1] 0.307186

prob_NormBMI_GoodHealth <- nrow(dfHealth[dfHealth$BMICAT_A == '2' & dfHealth$PHSTAT_A <= '4', ])

prob_NormBMI_GoodHealth/prob_NormBMI

## [1] 0.970332

#Probability of overweight BMI and positive/negative health

prob_OverweightBMI <- nrow(dfHealth[dfHealth$BMICAT_A == '3', ])

prob_OverweightBMI/prob_All

## [1] 0.3357926

prob_OverweightBMI_GoodHealth <- nrow(dfHealth[dfHealth$BMICAT_A == '3' & dfHealth$PHSTAT_A <= '4', ])

prob_OverweightBMI_GoodHealth/prob_OverweightBMI

## [1] 0.9696284

prob_OverweightBMI_BadHealth <- nrow(dfHealth[dfHealth$BMICAT_A == '3' & dfHealth$PHSTAT_A == '5', ])

prob_OverweightBMI_BadHealth/prob_OverweightBMI

## [1] 0.02994076

prob_GoodHealth <- nrow(dfHealth[dfHealth$PHSTAT_A <= '4', ])

# How many of all BMIs considered themselves to be in good health

prob_GoodHealth/prob_All

## [1] 0.9626053

# About 96% of people considered themselves to be in good, or greater health. Even among different BMIs, the percent that considered themselves to be in good health was above 90%.

# Why do most people see themselves to be in good health, or were most of the survey takers healthy in general? -- Check the more specific medical issues

# Life Satisfaction and General Health

prob_GoodLS_Health <- nrow(dfHealth[dfHealth$LSATIS4_A <= '2' & dfHealth$PHSTAT_A <= '4', ])
prob_GoodLS_Health/prob_All

## [1] 0.9275976

#Prob out of those who have high general health
prob_GoodLS_Health/prob_GoodHealth

## [1] 0.9636323

#Bad life satisfaction and bad health out of all
prob_BadLS_Health <- nrow(dfHealth[dfHealth$LSATIS4_A >= '3' & dfHealth$LSATIS4_A <=4 & dfHealth$PHSTAT_A == '5', ])

prob_BadLS_Health/prob_All

## [1] 0.01182597

#Bad life satisfaction among those with low health
prob_Low_LS <- nrow(dfHealth[dfHealth$PHSTAT_A == '5',])

prob_BadLS_Health/prob_Low_LS

## [1] 0.3180934

# Because the survey was mostly multiple choice, there are not any major anomalies. The only thing that falls out of the typical range of responses are the "don't know, refuse, or not ascertained" but even those have specific values that are consistent across questions. 

# There were a few strange ones among these, such as a few people putting "don't know/not ascertained" for their age, which is something they should know. Probably a wrong click or just not paying attention?

#Education Dataframe Sample
dfEduSample <- dfEdu[ , c("EDUCP_A")]  
dfEdu1 <- sample_n(dfEduSample,100, replace = TRUE)
dfEdu2 <- sample_n(dfEduSample,100, replace = TRUE)
dfEdu3 <- sample_n(dfEduSample,100, replace = TRUE)
dfEdu4 <- sample_n(dfEduSample,100, replace = TRUE)
dfEdu5 <- sample_n(dfEduSample,100, replace = TRUE)
print(dfEdu1)

## # A tibble: 100 × 1
##    EDUCP_A
##      <int>
##  1       9
##  2       8
##  3       4
##  4      10
##  5      10
##  6       7
##  7      10
##  8       5
##  9      10
## 10       4
## # ℹ 90 more rows

print(mean(dfEdu1$EDUCP_A))

## [1] 5.94

print(dfEdu2)

## # A tibble: 100 × 1
##    EDUCP_A
##      <int>
##  1       6
##  2       8
##  3       6
##  4       5
##  5       8
##  6       5
##  7       8
##  8       8
##  9       8
## 10       8
## # ℹ 90 more rows

print(mean(dfEdu2$EDUCP_A))

## [1] 6.31

print(dfEdu3)

## # A tibble: 100 × 1
##    EDUCP_A
##      <int>
##  1       1
##  2       8
##  3       8
##  4       8
##  5       4
##  6       4
##  7       4
##  8       9
##  9      10
## 10       1
## # ℹ 90 more rows

print(mean(dfEdu3$EDUCP_A))

## [1] 7.03

print(dfEdu4)

## # A tibble: 100 × 1
##    EDUCP_A
##      <int>
##  1       9
##  2       4
##  3       7
##  4      10
##  5       6
##  6       9
##  7       9
##  8       4
##  9       8
## 10       4
## # ℹ 90 more rows

print(mean(dfEdu4$EDUCP_A))

## [1] 7.77

print(dfEdu5)

## # A tibble: 100 × 1
##    EDUCP_A
##      <int>
##  1       7
##  2       1
##  3       2
##  4       8
##  5       6
##  6       9
##  7       4
##  8       4
##  9       8
## 10       5
## # ℹ 90 more rows

print(mean(dfEdu5$EDUCP_A))

## [1] 6

# The average tends to be between 5 (some college) and 8 (Bachelor's degree), among all the samples. However if any sample ends up with the 97,98, or 99 that correspond with "don't know", then the sample will be greatly skewed.

dfWeightHeightSample <- dfHealth[ , c("WEIGHTLBTC_A", "HEIGHTTC_A")]  
dfWH1 <- sample_n(dfWeightHeightSample,100, replace = TRUE)
dfWH2 <- sample_n(dfWeightHeightSample,100, replace = TRUE)
dfWH3 <- sample_n(dfWeightHeightSample,100, replace = TRUE)
dfWH4 <- sample_n(dfWeightHeightSample,100, replace = TRUE)
dfWH5 <- sample_n(dfWeightHeightSample,100, replace = TRUE)
print(dfWH1)

## # A tibble: 100 × 2
##    WEIGHTLBTC_A HEIGHTTC_A
##           <int>      <int>
##  1          260         67
##  2          211         71
##  3          165         65
##  4          140         67
##  5          175         63
##  6          169         63
##  7          115         66
##  8          996         96
##  9          181         66
## 10          190         68
## # ℹ 90 more rows

print(dfWH2)

## # A tibble: 100 × 2
##    WEIGHTLBTC_A HEIGHTTC_A
##           <int>      <int>
##  1          187         70
##  2          185         62
##  3          165         72
##  4          210         66
##  5          170         65
##  6          188         63
##  7          105         62
##  8          185         59
##  9          200         68
## 10          185         68
## # ℹ 90 more rows

print(dfWH3)

## # A tibble: 100 × 2
##    WEIGHTLBTC_A HEIGHTTC_A
##           <int>      <int>
##  1          185         70
##  2          115         59
##  3          200         68
##  4          999         69
##  5          186         69
##  6          156         68
##  7          155         66
##  8          240         70
##  9          996         96
## 10          146         65
## # ℹ 90 more rows

print(dfWH4)

## # A tibble: 100 × 2
##    WEIGHTLBTC_A HEIGHTTC_A
##           <int>      <int>
##  1          290         73
##  2          996         96
##  3          150         69
##  4          996         96
##  5          185         69
##  6          135         59
##  7          996         96
##  8          224         69
##  9          220         70
## 10          997         67
## # ℹ 90 more rows

print(dfWH5)

## # A tibble: 100 × 2
##    WEIGHTLBTC_A HEIGHTTC_A
##           <int>      <int>
##  1          160         68
##  2          996         96
##  3          270         69
##  4          150         66
##  5          190         70
##  6          205         66
##  7          185         66
##  8          260         72
##  9          120         63
## 10          178         69
## # ℹ 90 more rows

plot(dfWH1$WEIGHTLBTC_A,dfWH1$HEIGHTTC_A,type="p",main="Normal Distribution",xlab="Weight(lbs)",ylab="Height")
 points(dfWH2$WEIGHTLBTC_A,dfWH2$HEIGHTTC_A, col="green")
 points(dfWH3$WEIGHTLBTC_A,dfWH3$HEIGHTTC_A,col="blue")
 points(dfWH4$WEIGHTLBTC_A,dfWH4$HEIGHTTC_A,col="red")
 points(dfWH5$WEIGHTLBTC_A,dfWH5$HEIGHTTC_A,col="yellow")

 # Among the samples, they tend to stay in the same corner/area for weight and height. They also tend to have around the same number of outliers(97,98,99 for "don't know").

dfGenHealthSample <- dfHealth[ , c("PHSTAT_A")]  
dfGH1 <- sample_n(dfGenHealthSample,100, replace = TRUE)
dfGH2 <- sample_n(dfGenHealthSample,100, replace = TRUE)
dfGH3 <- sample_n(dfGenHealthSample,100, replace = TRUE)
dfGH4 <- sample_n(dfGenHealthSample,100, replace = TRUE)
dfGH5 <- sample_n(dfGenHealthSample,100, replace = TRUE)

# Average
print(mean(dfGH1$PHSTAT_A))

## [1] 2.73

print(mean(dfGH2$PHSTAT_A))

## [1] 2.39

print(mean(dfGH3$PHSTAT_A))

## [1] 2.52

print(mean(dfGH4$PHSTAT_A))

## [1] 2.51

print(mean(dfGH5$PHSTAT_A))

## [1] 2.37

# The average tends to be between 2 and 3, which makes sense because the general health among all survey takers is often a 2 (Very good) or 3 (Good).

# Looking at data among cancer types

Data Dive

Julia French

2023-09-04

R Markdown