# Task 2:Perform an initial exploration}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
NHIS_Data_2021 <- read_csv("data/NHIS _Data_2021.csv")
## Rows: 29482 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (18): DEMENEV_A, COPDEV_A, HYPEV_A, DEPEV_A, CANEV_A, DIBEV_A, AGEP_A, S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(NHIS_Data_2021)
## spc_tbl_ [29,482 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DEMENEV_A : num [1:29482] 2 2 2 2 2 2 2 2 2 2 ...
## $ COPDEV_A : num [1:29482] 2 2 2 2 2 2 2 2 2 2 ...
## $ HYPEV_A : num [1:29482] 1 1 2 1 2 1 2 2 2 2 ...
## $ DEPEV_A : num [1:29482] 2 2 2 2 2 2 2 2 2 1 ...
## $ CANEV_A : num [1:29482] 2 2 2 2 2 2 2 2 2 2 ...
## $ DIBEV_A : num [1:29482] 2 1 2 2 2 2 2 2 2 2 ...
## $ AGEP_A : num [1:29482] 50 53 56 57 25 55 45 41 26 71 ...
## $ SEX_A : num [1:29482] 1 1 1 2 1 1 1 1 2 2 ...
## $ HISPALLP_A : num [1:29482] 2 3 2 2 3 3 2 3 3 2 ...
## $ MARSTAT_A : num [1:29482] 5 6 5 7 9 9 1 1 7 1 ...
## $ EDUCP_A : num [1:29482] 1 7 8 5 4 5 9 5 4 9 ...
## $ PHSTAT_A : num [1:29482] 2 2 2 4 3 3 1 1 2 1 ...
## $ LSATIS4R_A : num [1:29482] 2 1 3 2 8 8 1 1 1 1 ...
## $ SMKCIGST_A : num [1:29482] 3 4 3 3 9 9 4 4 4 4 ...
## $ RATCAT_A : num [1:29482] 7 12 14 11 6 6 14 14 7 14 ...
## $ BMICAT_A : num [1:29482] 3 3 3 4 4 3 9 3 4 2 ...
## $ WEIGHTLBTC_A: num [1:29482] 199 205 160 190 250 200 997 206 996 127 ...
## $ HEIGHTTC_A : num [1:29482] 69 75 67 63 72 69 67 72 96 63 ...
## - attr(*, "spec")=
## .. cols(
## .. DEMENEV_A = col_double(),
## .. COPDEV_A = col_double(),
## .. HYPEV_A = col_double(),
## .. DEPEV_A = col_double(),
## .. CANEV_A = col_double(),
## .. DIBEV_A = col_double(),
## .. AGEP_A = col_double(),
## .. SEX_A = col_double(),
## .. HISPALLP_A = col_double(),
## .. MARSTAT_A = col_double(),
## .. EDUCP_A = col_double(),
## .. PHSTAT_A = col_double(),
## .. LSATIS4R_A = col_double(),
## .. SMKCIGST_A = col_double(),
## .. RATCAT_A = col_double(),
## .. BMICAT_A = col_double(),
## .. WEIGHTLBTC_A = col_double(),
## .. HEIGHTTC_A = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(NHIS_Data_2021)
## DEMENEV_A COPDEV_A HYPEV_A DEPEV_A
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :2.000 Median :2.000
## Mean :1.993 Mean :1.951 Mean :1.648 Mean :1.829
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## CANEV_A DIBEV_A AGEP_A SEX_A HISPALLP_A
## Min. :1.000 Min. :1.0 Min. :18.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.0 1st Qu.:37.00 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :2.0 Median :53.00 Median :2.000 Median :2.000
## Mean :1.882 Mean :1.9 Mean :52.63 Mean :1.547 Mean :2.203
## 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:68.00 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :9.0 Max. :99.00 Max. :9.000 Max. :7.000
## MARSTAT_A EDUCP_A PHSTAT_A LSATIS4R_A
## Min. :1.000 Min. : 1.000 Min. :1.00 Min. :1.000
## 1st Qu.:1.000 1st Qu.: 4.000 1st Qu.:2.00 1st Qu.:1.000
## Median :4.000 Median : 6.000 Median :2.00 Median :2.000
## Mean :3.858 Mean : 6.447 Mean :2.39 Mean :1.748
## 3rd Qu.:7.000 3rd Qu.: 8.000 3rd Qu.:3.00 3rd Qu.:2.000
## Max. :9.000 Max. :99.000 Max. :9.00 Max. :9.000
## SMKCIGST_A RATCAT_A BMICAT_A WEIGHTLBTC_A
## Min. :1.000 Min. : 1.000 Min. :1.000 Min. :100.0
## 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:2.000 1st Qu.:150.0
## Median :4.000 Median :11.000 Median :3.000 Median :180.0
## Mean :3.579 Mean : 9.848 Mean :3.121 Mean :248.8
## 3rd Qu.:4.000 3rd Qu.:14.000 3rd Qu.:4.000 3rd Qu.:215.0
## Max. :9.000 Max. :14.000 Max. :9.000 Max. :999.0
## HEIGHTTC_A
## Min. :59.00
## 1st Qu.:64.00
## Median :67.00
## Mean :68.72
## 3rd Qu.:70.00
## Max. :99.00
head(NHIS_Data_2021)
## # A tibble: 6 × 18
## DEMENEV_A COPDEV_A HYPEV_A DEPEV_A CANEV_A DIBEV_A AGEP_A SEX_A HISPALLP_A
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 2 1 2 2 2 50 1 2
## 2 2 2 1 2 2 1 53 1 3
## 3 2 2 2 2 2 2 56 1 2
## 4 2 2 1 2 2 2 57 2 2
## 5 2 2 2 2 2 2 25 1 3
## 6 2 2 1 2 2 2 55 1 3
## # ℹ 9 more variables: MARSTAT_A <dbl>, EDUCP_A <dbl>, PHSTAT_A <dbl>,
## # LSATIS4R_A <dbl>, SMKCIGST_A <dbl>, RATCAT_A <dbl>, BMICAT_A <dbl>,
## # WEIGHTLBTC_A <dbl>, HEIGHTTC_A <dbl>
Interpretation: The 2021 NHIS data consisted of 29,482 observations and 18 variables.
# Day 2: Task 1 - Variable Selection & Missing Value Check}
library(tidyverse)
NHIS_Data_2021 <- read_csv("data/NHIS _Data_2021.csv")
## Rows: 29482 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (18): DEMENEV_A, COPDEV_A, HYPEV_A, DEPEV_A, CANEV_A, DIBEV_A, AGEP_A, S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Select only variables asked for
data_import_R_02_subset <- NHIS_Data_2021 |>
select(
AGEP_A,
WEIGHTLBTC_A,
HEIGHTTC_A,
SEX_A,
HISPALLP_A,
EDUCP_A,
PHSTAT_A,
LSATIS4R_A
)
View(data_import_R_02_subset)
# Remove missing / nonresponse values using variable-specific codebook values
clean_data_R_02 <- data_import_R_02_subset |>
filter(
!AGEP_A %in% c(97, 98, 99),
!SEX_A %in% c(7, 9),
!HISPALLP_A %in% c(97, 98, 99),
!EDUCP_A %in% c(97, 98, 99),
!PHSTAT_A %in% c(7, 8, 9),
!LSATIS4R_A %in% c(7, 8, 9),
!WEIGHTLBTC_A %in% c(996, 997, 998, 999),
!HEIGHTTC_A %in% c(96, 97, 98, 99)
)
View(clean_data_R_02) # check
# Check cleaned data
summary(clean_data_R_02)
## AGEP_A WEIGHTLBTC_A HEIGHTTC_A SEX_A HISPALLP_A
## Min. :18.00 Min. :100.0 Min. :59.0 Min. :1.00 Min. :1.000
## 1st Qu.:37.00 1st Qu.:147.0 1st Qu.:64.0 1st Qu.:1.00 1st Qu.:2.000
## Median :54.00 Median :173.0 Median :66.0 Median :2.00 Median :2.000
## Mean :52.57 Mean :176.8 Mean :66.7 Mean :1.54 Mean :2.197
## 3rd Qu.:67.00 3rd Qu.:200.0 3rd Qu.:70.0 3rd Qu.:2.00 3rd Qu.:2.000
## Max. :85.00 Max. :299.0 Max. :76.0 Max. :2.00 Max. :7.000
## EDUCP_A PHSTAT_A LSATIS4R_A
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 4.000 1st Qu.:2.000 1st Qu.:1.000
## Median : 6.000 Median :2.000 Median :2.000
## Mean : 6.032 Mean :2.346 Mean :1.583
## 3rd Qu.: 8.000 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :10.000 Max. :5.000 Max. :4.000
sum(is.na(clean_data_R_02))
## [1] 0
# Day 2: Task 2 - Recoding Variables
clean_data_R_02 <- clean_data_R_02 %>%
mutate(
EDUCP_A = case_when(
EDUCP_A >= 0 & EDUCP_A <= 3 ~ 1,
EDUCP_A == 4 ~ 2,
EDUCP_A >= 5 & EDUCP_A <= 7 ~ 3,
EDUCP_A >= 8 & EDUCP_A <= 10 ~ 4
),
EDUCP_A = factor(
EDUCP_A,
levels = c(1, 2, 3, 4),
labels = c(
"Less than High School",
"High School Graduate",
"Some College Education",
"College Graduate or Better"
)
)
)
# Verify recoding
summary(clean_data_R_02)
## AGEP_A WEIGHTLBTC_A HEIGHTTC_A SEX_A HISPALLP_A
## Min. :18.00 Min. :100.0 Min. :59.0 Min. :1.00 Min. :1.000
## 1st Qu.:37.00 1st Qu.:147.0 1st Qu.:64.0 1st Qu.:1.00 1st Qu.:2.000
## Median :54.00 Median :173.0 Median :66.0 Median :2.00 Median :2.000
## Mean :52.57 Mean :176.8 Mean :66.7 Mean :1.54 Mean :2.197
## 3rd Qu.:67.00 3rd Qu.:200.0 3rd Qu.:70.0 3rd Qu.:2.00 3rd Qu.:2.000
## Max. :85.00 Max. :299.0 Max. :76.0 Max. :2.00 Max. :7.000
## EDUCP_A PHSTAT_A LSATIS4R_A
## Less than High School : 2676 Min. :1.000 Min. :1.000
## High School Graduate : 5765 1st Qu.:2.000 1st Qu.:1.000
## Some College Education : 7238 Median :2.000 Median :2.000
## College Graduate or Better:10358 Mean :2.346 Mean :1.583
## 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :5.000 Max. :4.000
table(clean_data_R_02$EDUCP_A)
##
## Less than High School High School Graduate
## 2676 5765
## Some College Education College Graduate or Better
## 7238 10358
write_csv(clean_data_R_02, "data/nhis_clean.csv")
Interpretation: After the selected variables were extracted, removals of missing/nonresponsive answers, and recoding the Education variable, the data consisted of 26,037 observations and 8 variables.
# Day 3: Task 1: Univariate Analysis & Visualization}
library(tidyverse) #loading tidyverse package
library(fBasics)
nhis_clean <- read_csv("data/nhis_clean.csv") #loading nhis_clean data into R
## Rows: 26037 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): EDUCP_A
## dbl (7): AGEP_A, WEIGHTLBTC_A, HEIGHTTC_A, SEX_A, HISPALLP_A, PHSTAT_A, LSAT...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
"Quantitative Variables"
## [1] "Quantitative Variables"
#AgeP_A (Age)
summary(nhis_clean$AGEP_A) #summary stats for age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 37.00 54.00 52.57 67.00 85.00
sd(nhis_clean$AGEP_A) #standard deviation for age
## [1] 18.33484
#Base histogram AGEP_A
hist(nhis_clean$AGEP_A, main = "Age Distribution",xlab="Age")
#ggplot histogram AGEP_A
ggplot(nhis_clean,aes(x=AGEP_A)) + #x axis will be Age
geom_histogram(binwidth = 5) + #data will be distributed among 5 bins
xlab("Ages") + #x axis title
ggtitle("Age Distribution of Sample Adults") #chart title
Interpretation: The age distribution spans from approximately 18 to 80 years which demonstrates a wide range of ages in the sample. The graph shows a higher concentration of individuals in the middle-to-older age groups, mainly between 50-70 years with a peak around early 60s. Overall, the distribution demonstrates considerable variability with a small tendency toward older ages.
#Base boxplot of AGEP_A
boxplot(nhis_clean$AGEP_A, main = "Age Distribution of Sample Adults", ylab="Age")
#ggplot boxplot of AGEP_A
ggplot(nhis_clean, aes(y=AGEP_A)) + #y axis will be age values
geom_boxplot() + #identifying boxplot
ggtitle("Age Distribution of Sample Adults") + #chart title
ylab("Ages") #y axis title
Interpretation: The boxplots demonstrate that the median age of participants is around mid-50s, resulting in a relatively older sample. The overall age range spans from slightly below 20 to around 80 years of age. The distribution appears slightly left-skewed, with a somewhat greater spread among older ages and no significant outliers are observed.
#WEIGHTLBTC_A (Weight w/out shoes)
summary(nhis_clean$WEIGHTLBTC_A) #Summary stat for weight w/o shoes
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 147.0 173.0 176.8 200.0 299.0
sd(nhis_clean$WEIGHTLBTC_A) #standard dev for weight w/o shoes
## [1] 39.59538
basicStats(nhis_clean$WEIGHTLBTC_A) #summary stat for weight w/o shoes
## X..nhis_clean.WEIGHTLBTC_A
## nobs 2.603700e+04
## NAs 0.000000e+00
## Minimum 1.000000e+02
## Maximum 2.990000e+02
## 1. Quartile 1.470000e+02
## 3. Quartile 2.000000e+02
## Mean 1.768261e+02
## Median 1.730000e+02
## Sum 4.604021e+06
## SE Mean 2.453860e-01
## LCL Mean 1.763451e+02
## UCL Mean 1.773071e+02
## Variance 1.567794e+03
## Stdev 3.959538e+01
## Skewness 4.708550e-01
## Kurtosis -2.602220e-01
#Base histogram WEIGHTLBTC_A
hist(nhis_clean$WEIGHTLBTC_A, main = "Weight Distribution of Sample Adults w/out Shoes",xlab="Weight(lbs)")
#ggplot histogram WEIGHTLBTC_A
ggplot(nhis_clean,aes(x=WEIGHTLBTC_A)) + #x axis will be weight
geom_histogram(binwidth = 10) + #data will be distributed among 10 bins
xlab("Weight(lbs)") + #x axis title
ggtitle("Weight Distribution of Sample Adults w/out Shoes") #chart title
Interpretation: The weight distribution ranges from approximately 100 to around 300 pounds, indicating a wide range of weights in the sample. The graph shows a positive skew, with a higher concentration of individuals weighing 130 to 200 pounds, mainly 140 to 190, peaking at 170. Overall, the distribution shows considerable variability with slight outliers.
#Base boxplot of WEIGHTLBTC_A
boxplot(nhis_clean$WEIGHTLBTC_A, main = "Weight Distribution of Sample Adults w/out Shoes",ylab = "Weight(lbs)")
#ggplot boxplot of WEIGHTLBTC_A
ggplot(nhis_clean, aes(y=WEIGHTLBTC_A)) + #y axis will be weight values
geom_boxplot() + #identidying boxplot
ggtitle("Weight Distribution of Sample Adults w/out Shoes") + #chart title
ylab("Weight(lbs)") #y axis title
Interpretation: The boxplots show that the median weight of participants is about 180 pounds, indicating this is the typical weight in this sample. Overall, weights range from 100 to about 300 pounds, but most adults cluster between 150 and 200 pounds. A few heavier individuals pull the distribution slightly to the right.
#HEIGHTTTC_A (Heigh in Inches)
basicStats(nhis_clean$HEIGHTTC_A) #summary stat for total height in inches
## X..nhis_clean.HEIGHTTC_A
## nobs 2.603700e+04
## NAs 0.000000e+00
## Minimum 5.900000e+01
## Maximum 7.600000e+01
## 1. Quartile 6.400000e+01
## 3. Quartile 7.000000e+01
## Mean 6.670108e+01
## Median 6.600000e+01
## Sum 1.736696e+06
## SE Mean 2.416200e-02
## LCL Mean 6.665372e+01
## UCL Mean 6.674844e+01
## Variance 1.520059e+01
## Stdev 3.898793e+00
## Skewness 1.719930e-01
## Kurtosis -7.452840e-01
#Base histogram HEIGHTTTC_A
hist(nhis_clean$HEIGHTTC_A, main = "Height Distribution of Sample Adults",xlab="Total Height in Inches")
#ggplot histogram HEIGHTTTC_A
ggplot(nhis_clean,aes(x=HEIGHTTC_A)) + #x axis will be total height in inches
geom_histogram(binwidth = 2) + #data will be distributed among 2 bins
xlab("Total Height in Inches") + #x axis title
ggtitle("Height Distribution of Sample Adults") #chart title
Interpretation: The height distribution ranges from approximately 60 to about 75 inches, indicating substantial variability in the sample. The graph shows a higher concentration of individuals measuring 62 to 70 inches, with a peak at 63 inches. Overall, the distribution is slightly skewed to the right, but it appears fairly balanced.
#Base boxplot of HEIGHTTTC_A
boxplot(nhis_clean$HEIGHTTC_A, main="Total Height Distribution in Inches of Sample Adults",ylab = "Height(in.)")
#ggplot boxplot of HEIGHTTTC_A
ggplot(nhis_clean, aes(y=HEIGHTTC_A)) + #y axis will be height values
geom_boxplot() + #identidying boxplot
ggtitle("Height Distribution of Sample Adults") + #chart title
ylab("Total Height in Inches") #y axis title
Interpretation: Overall, heights range from about 60 to about 75 inches, but most adults cluster between 65 and 70 inches. The boxplots show that the median height is about 66 inches, indicating this is the typical height in this sample. The median is slightly closer to the lower quartile, and the upper whisker is a bit longer, suggesting a mild right skew; however, the distribution looks fairly balanced overall.
"Qualitative Variables"
## [1] "Qualitative Variables"
#SEX_A (Gender)
nhis_clean$SEX_A <- factor(nhis_clean$SEX_A, #creating categorical data to a factor
levels = c(1,2), #identifying the values w/in data
labels = c("Males","Females")) #relabeling the values w/corresponding label
sex_a_freq_table <- table(nhis_clean$SEX_A) #creating frequency table of sex_a variable
sex_a_freq_table #printing out the sex_a frequency table
##
## Males Females
## 11967 14070
barplot(sex_a_freq_table, #indicating r base barplot of sex_a variable through freq table
main = "Distribution of Sex Among Adult Sample", #chart title
xlab = "Sex", #x axis title
ylab = "Frequency", #y axis title
col = c("lightblue","lightpink")) #bar plot colors
sex_a_freq_df <- as.data.frame(sex_a_freq_table) #converting freq table to a data frame for ggplot
ggplot(sex_a_freq_df, aes(x=Var1,y=Freq)) + #using ggplot & the data frame, indicating x as the variables and y as the frequency
geom_bar(stat = "identity",fill ="purple") + #indicating bar plot
labs(title = "Distribution of Sex Among Adult Sample", #chart title
x = "Sex", #x axis title
y = "Frequency") #y axis title
Interpretation: The sample shows that there are more female participants than male participants, with 14,070 and 11,967, respectively.
#HISPALLP_A (Race/Ethnicity)
nhis_clean$HISPALLP_A <- factor(nhis_clean$HISPALLP_A, #creating categorical data to a factor
levels = c(1,2,3,4,5,6,7), #identifying the values w/in data
labels = c("Hispanic","NHWhite","NHAA","NHAsian Only","NHAIAN","NHAIAN&Other","Other")) #relabeling the values w/corresponding label
hispallp_a_freq_table <- table(nhis_clean$HISPALLP_A) #creating frequency table of HISPALLPa variable
hispallp_a_freq_table #printing out the hispallp_a frequency table
##
## Hispanic NHWhite NHAA NHAsian Only NHAIAN NHAIAN&Other
## 3533 17617 2645 1564 153 196
## Other
## 329
barplot(hispallp_a_freq_table, #indicating r base barplot of hispallp_a variable through freq table
main = "Distribution of Race/Ethnicity Among Adult Sample", #chart title
xlab = "Race/Ethniticy", #x axis title
ylab = "Frequency", #y axis title
col = terrain.colors(7), #bar plot colors
cex.names = 0.6) #label size
hispallp_a_freq_df <- as.data.frame(hispallp_a_freq_table) #converting freq table to a data frame for ggplot
ggplot(hispallp_a_freq_df, aes(x=Var1,y=Freq)) + #using ggplot & the data frame, indicating x as the variables and y as the frequency
geom_bar(stat = "identity",fill ="lightgreen") + #indicating bar plot
labs(title = "Distribution of Race/Ethnicity Among Adult Sample", #chart title
x = "Race/Ethnicity", #x axis title
y = "Frequency") #y axis title
Interpretation: The sample consists mostly of Non-Hispanic White individuals, representing 17,617 of the sample. Hispanic participants were the second most represented in the sample, comprising 3,533 participants. Non-Hispanic African Americans and Non-Hispanic Asians had similar representation, with 2645 and 1564, respectively. Lastly, Non-Hispanic American Indian and Alaska Native, Non-Hispanic American Indian and Alaska Native and Other, and Other were the least represented in the sample.
#EDUCP_A (Education Level)
"EDCUP_A was already recoded and turned into a factor in previous task"
## [1] "EDCUP_A was already recoded and turned into a factor in previous task"
educp_a_freq_table <- table(nhis_clean$EDUCP_A) #creating frequency table of EDUCP_A variable
educp_a_freq_table #printing out the educp_a frequency table
##
## College Graduate or Better High School Graduate
## 10358 5765
## Less than High School Some College Education
## 2676 7238
barplot(educp_a_freq_table, #indicating r base barplot of educp_a variable through freq table
main = "Distribution of Education Level Among Adult Sample", #chart title
xlab = "Education Level", #x axis title
ylab = "Frequency", #y axis title
col = rainbow(4), #bar plot colors
cex.names = 0.6) #label size
educp_a_freq_df <- as.data.frame(educp_a_freq_table) #converting freq table to a data frame for ggplot
ggplot(educp_a_freq_df, aes(x=Var1,y=Freq)) + #using ggplot & the data frame, indicating x as the variables and y as the frequency
geom_bar(stat = "identity",fill ="blue") + #indicating bar plot
labs(title = "Distribution of Education Level Among Adult Sample", #chart title
x = "Race/Ethnicity", #x axis title
y = "Frequency") + #yaxis title
theme(axis.text.x = element_text(size = 9, angle = 45, hjust = 1)) #adjusts labels on x axis
Interpretation: The majority of the participants are educated because most of the sample are college graduates or better, and the second most represented group obtained some college education. High school graduates were the third most represented group, and the least represented group had less than a high school education.
#PHSTAT_A (General Health)
nhis_clean$PHSTAT_A<- factor(nhis_clean$PHSTAT_A, #creating categorical data to a factor
levels = c(1,2,3,4,5), #identifying the values w/in data
labels = c("Excellent","Very Good","Good","Fair","Poor")) #relabeling the values w/corresponding label
phstat_a_freq_table <- table(nhis_clean$PHSTAT_A) #creating frequency table of PHSTAT_A variable
phstat_a_freq_table #printing out the phstat_a frequency table
##
## Excellent Very Good Good Fair Poor
## 6065 9185 7287 2717 783
barplot(phstat_a_freq_table, #indicating r base barplot of phstat_a variable through freq table
main = "Distribution of Health Status Among Adult Sample", #chart title
xlab = "General Health Status", #x axis title
ylab = "Frequency", #y axis title
col = topo.colors(5), #bar plot colors
cex.names = 0.8) #label size
phstat_a_freq_df <- as.data.frame(phstat_a_freq_table) #converting freq table to a data frame for ggplot
ggplot(phstat_a_freq_df, aes(x=Var1,y=Freq)) + #using ggplot & the data frame, indicating x as the variables and y as the frequency
geom_bar(stat = "identity",fill ="gray") + #indicating bar plot
labs(title = "Distribution of Health Status Among Adult Sample", #chart title
x = "General Health Status", #x axis title
y = "Frequency") #y axis title
Interpretation: The majority of the participants report having a very good health status, with most rating their health as “very good” to “good.” The third most represented group reported an “excellent” health status, and “fair” and “poor” were reported to be fourth and fifth, respectively.
#LSATIS4R_A (Quality of Life)
nhis_clean$LSATIS4R_A<- factor(nhis_clean$LSATIS4R_A, #creating categorical data to a factor
levels = c(1,2,3,4), #identifying the values w/in data
labels = c("Very Satisfied","Satisfied","Dissatisfied","Very Dissatisfied")) #relabeling the values w/corresponding label
lsatis4r_a_freq_table <- table(nhis_clean$LSATIS4R_A) #creating frequency table of LSATIS4R_A variable
lsatis4r_a_freq_table #printing out the lsatis4r_a frequency table
##
## Very Satisfied Satisfied Dissatisfied Very Dissatisfied
## 12458 12266 1025 288
barplot(lsatis4r_a_freq_table, #indicating r base barplot of lsatis4r_a variable through freq table
main = "Distribution of Quality of Life Among Adult Sample", #chart title
xlab = "Quality of Life", #x axis title
ylab = "Frequency", #y axis title
col = heat.colors(4), #bar plot colors
cex.names = 0.8) #label size
lstatis4r_a_freq_df <- as.data.frame(lsatis4r_a_freq_table) #converting freq table to a data frame for ggplot
ggplot(lstatis4r_a_freq_df, aes(x=Var1,y=Freq)) + #using ggplot & the data frame, indicating x as the variables and y as the frequency
geom_bar(stat = "identity",fill = "orange") + #indicating bar plot
labs(title = "Distribution of Quality of Life Among Adult Sample", #chart title
x = "Quality of Life", #x axis title
y = "Frequency") #y axis title
Interpretation: The sample reports a high quality of life, with the majority of participants rating their lives as “very satisfied” or “satisfied.” Very few participants reported their lives as “dissatisfied” or “very dissatisfied.
# Day 3: Task 2: Bivariate Analysis (Boxplots Age v Sex & Age v Education)}
#Boxplot Age v Sex
"Age v Sex Boxplot"
## [1] "Age v Sex Boxplot"
boxplot(AGEP_A~SEX_A, data = nhis_clean,#boxplot
main = "Boxplot of Age by Sex ", #chart title
xlab = "Sex", #x axis title
ylab = "Age") #y axis title
"Age v Sex GGPlot"
## [1] "Age v Sex GGPlot"
ggplot(nhis_clean,aes(x=SEX_A,y=AGEP_A)) + #indicating gglot, data set, x & y axis
geom_boxplot(aes(fill = factor(SEX_A))) + #identifying boxplot chart & filling box with Sex values
labs(title = "Boxplot of Age by Sex", #chart title
x = "Sex", #x axis title
y = "Age", #y axis title
fill = "Sex") #title of legend
Interpretation: Overall, both groups cover nearly the same age range, from slightly younger than 20 to the mid-80s. Additionally, both groups have very similar median ages, with females slightly higher; therefore, ~50 years of age is the typical age across the sexes. The distributions appear symmetrical and comparable. There is no strong skew, as there are no outliers or noticeable differences in variability between the two groups.
#Boxplot Age v Education
"Age v Education Boxplot"
## [1] "Age v Education Boxplot"
boxplot(AGEP_A~EDUCP_A, data = nhis_clean,#boxplot
main = "Boxplot of Age by Education ", #chart title
xlab = "Education Level", #x axis title
ylab = "Age",#y axis title
cex.axis = 0.75) #size of x axis text
"Age v Education ggplot"
## [1] "Age v Education ggplot"
ggplot(nhis_clean,aes(x=EDUCP_A,y=AGEP_A)) + #indicating gglot, data set, x & y axis
geom_boxplot(aes(fill = factor(EDUCP_A))) + #identifying boxplot chart & filling box with Educ values
labs(title = "Boxplot of Age by Education Level", #chart title
x = "Education Level", #x axis title
y = "Age", #y axis title
fill = "Education Level") + #title of legend
theme(axis.text.x = element_text(size = 9, angle = 45, hjust = 1)) #adjusts labels on x axis
Interpretation: The broad ranges across all boxes indicate substantial age diversity across all education levels. Less than High School attainment has the highest ages and the greatest variability compared to the other education levels, as indicated by the Q1, mean, Q3, and max. Overall, age appears inversely related to education level, as individuals with lower education levels tend to be older, potentially because younger generations are achieving higher education levels more often. This sample demonstrates a generational shift in educational attainment, with younger people more likely to have higher levels of education.
#Clustered Bar Chart (General Health & Life Satisfaction)
ggplot(nhis_clean,aes(x=PHSTAT_A,fill = LSATIS4R_A)) + #identifying ggplot, x axis and y axis
geom_bar(position = "dodge") + #indicating bar chart
labs(title = "Relationship between General Health & Life Satisfaction", #chart title
x = "General Health Status", #x axis title
y = "Count", #y axis title
fill = "Life Satisfaction") #legend title
Interpretation: As general health improves, life satisfaction tends to increase as well. People who reported “excellent” or “very good health” are much more likely to report being “very satisfied” with their lives. Participants who reported “fair” or “poor” health showed a sharp decline in the share, reporting “very satisfied,” suggesting that poorer health is strongly associated with lower life satisfaction. Interestingly, people with “good” health have the highest count of “satisfied” individuals, possibly reflecting that moderate health still allows a good degree of life satisfaction, though less enthusiasm than with “very good” or “excellent” health. Among those with “excellent” or “very good” health status, only a very small number reported being “dissatisfied” or “very dissatisfied”; this reinforces the pattern that good health contributes to positive life evaluations. Overall, the graph shows a clear, strong relationship between health and happiness because people who perceive their health as better tend to report higher life satisfaction. Conversely, as health status declines, life satisfaction drops correspondingly.
#Scatterplot of Height vs Weight
plot(nhis_clean$HEIGHTTC_A,nhis_clean$WEIGHTLBTC_A, #scatterplot using plot() for height vs weight
main = "Height(in.) vs Weight (lbs.) Scatterplot", #chart title
xlab = "Height (in.)", #x axis title
ylab = "Weight(lbs.)") #y axis title
ggplot(nhis_clean,aes(x=HEIGHTTC_A,y=WEIGHTLBTC_A)) + #using ggplot for scatterplot, indicating x and y values
geom_point() + #indicating points for each observation
labs(title = "Height(in.) vs Weight (lbs.) Scatterplot", #chart title
x = "Height (in.)", #x axis title
y = "Weight(lbs.)") #y axis title
cor(nhis_clean$HEIGHTTC_A,nhis_clean$WEIGHTLBTC_A) #calculating & reporting correlation coefficient; results 0.5023, medium correlation
## [1] 0.5023037
Interpretation: The scatterplot shows heights ranging roughly from 58 to 76 inches and weights from 100 to 300 pounds. There is a positive association between height and weight, meaning that, on average, weight increases with height. This is supported by a correlation coefficient of 0.5023, indicating a moderate positive linear relationship, given the noticeable spread of weights at each height. This wide range suggests that height alone is not a precise predictor of weight and that other factors likely play a significant role in determining weight. The vertical stripes indicate that the height data is discrete rather than continuous. Furthermore, a few observations fall at the extreme high end of weight across multiple heights; although they do not disrupt the overall trend, they may warrant further investigation. Overall, while taller individuals generally weigh more, height alone is insufficient to accurately predict weight due to high variability.
The multivariate visualizations provide insight into how height and weight vary across sex and different social and health-related groupings.
# Day 4 Task 1: Multivariate_Visualization
# Enhanced plot by General Health Status (PHSTAT_A)
ggplot(nhis_clean, aes(x = HEIGHTTC_A, y = WEIGHTLBTC_A, color = factor(SEX_A))) +
geom_point(alpha = 0.6) +
facet_wrap(~ PHSTAT_A) +
labs(
title = "Height vs Weight by Sex and General Health Status",
x = "Height (in.)",
y = "Weight (lbs)",
color = "Sex"
) +
theme_linedraw()
Height vs Weight by General Health Status: The scatterplot shows a pretty clear upward trend—taller people generally weigh more across every health group (excellent through poor). That relationship stays pretty consistent no matter the category of health. “Fair” and “Poor” health groups have a wider spread in weight, meaning body size is more varied when compared to the tighter clustering you see in better health categories. Across all the panels, males consistently sit at higher weight values than females at similar heights, which keeps showing up as a pretty stable sex difference in body size.
# or Enhanced plot by Education Level (PHSTAT_A)
ggplot(nhis_clean, aes(x = HEIGHTTC_A, y = WEIGHTLBTC_A, color = factor(SEX_A))) +
geom_point(alpha = 0.6) +
facet_wrap(~ EDUCP_A) +
labs(
title = "Height vs Weight by Sex and Education Level",
x = "Height (in.)",
y = "Weight (lbs)",
color = "Sex"
) +
theme_linedraw()
Height vs Weight by Education Level: The same pattern is seen here too from above as height and weight move together across all education groups. People with higher education (college graduates and above) look a bit more tightly clustered, while lower education groups show more variation in weight at similar heights. But the overall relationship doesn’t really change by education level. Males tend to cluster higher in weight than females across every group, furthering proving that the pattern stays consistent no matter the education category.
# Create a correlation scatter plot matrix using pairs.panels() from the psych library to visualize AGEP_A, WEIGHTLBTC_A, and HEIGHTTC_A
# Load the package
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:fBasics':
##
## tr
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
#Isolate three variables
nhis_pp_variables <- nhis_clean[, c("AGEP_A", "WEIGHTLBTC_A", "HEIGHTTC_A")]
# Create the pairs.panels correlation plot
pairs.panels(
nhis_pp_variables,
method = "pearson", # correlation method
hist.col = "chartreuse4",
density = TRUE, # add density plots
ellipses = TRUE # add correlation ellipses
)
Interpretation:
Correlation Matrix (Age, Height, Weight): The correlation results back up what the plots are showing. Height and weight have a moderate positive relationship (around 0.50), meaning taller people tend to weigh more. Age, on the other hand, barely relates to either variable—almost zero with height (-0.02) and very weak with weight (-0.09). As a resut it can be said that in this dataset, body size doesn’t really shift with age. The histograms also show age is pretty spread out across the sample, while height and weight are more normally concentrated around their middle ranges.