Load Packages

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning in file(con, "r"): cannot open file '/var/db/timezone/zoneinfo/
## +VERSION': No such file or directory

Import Data

# Read the NHIS extract into a tibble.
# Demo_Race is declared as character explicitly: readr guesses column types
# from the leading rows only, and with guessing alone this column was typed as
# logical, so its text values (e.g. "White", "Asian") failed to parse and were
# replaced by NA. All other columns are still guessed as before.
# NOTE(review): run problems(NHIS_Data) after loading to confirm whether any
# other guessed column still reports parsing failures.
NHIS_Data <- read_csv(
  "/Users/chelsyrodriguez/Downloads/NHIS Data.csv",
  col_types = cols(Demo_Race = col_character())
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Demo_Race = col_logical(),
##   Demo_Hispanic = col_character(),
##   Demo_RaceEthnicity = col_character(),
##   Demo_Region = col_character(),
##   Demo_sex_C = col_character(),
##   Demo_sexorien_C = col_logical(),
##   Demo_agerange_C = col_character(),
##   Demo_marital_C = col_character(),
##   Demo_hourswrk_C = col_character(),
##   MentalHealth_MentalIllnessK6_C = col_character(),
##   MentalHealth_depressionmeds_B = col_logical(),
##   Health_SelfRatedHealth_C = col_character(),
##   Health_diagnosed_STD5yr_B = col_logical(),
##   Health_BirthControlNow_B = col_logical(),
##   Health_EverHavePrediabetes_B = col_logical(),
##   Health_HIVAidsRisk_C = col_character(),
##   Health_BMI_C = col_character(),
##   Health_UsualPlaceHealthcare_C = col_character(),
##   Health_AbnormalPapPast3yr_B = col_logical(),
##   Behav_CigsPerDay_C = col_character()
##   # ... with 1 more columns
## )
## ℹ Use `spec()` for the full column specifications.

## Warning: 683386 parsing failures.
##   row       col           expected                            actual                                             file
## 68557 Demo_Race 1/0/T/F/TRUE/FALSE Black or African American         '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68558 Demo_Race 1/0/T/F/TRUE/FALSE Asian                             '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68559 Demo_Race 1/0/T/F/TRUE/FALSE American Indian or Alaskan Native '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68560 Demo_Race 1/0/T/F/TRUE/FALSE White                             '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68561 Demo_Race 1/0/T/F/TRUE/FALSE White                             '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## ..... ......... .................. ................................. ................................................
## See problems(...) for more details.

# Preview the first six rows to sanity-check the import.
head(x = NHIS_Data, n = 6L)

## # A tibble: 6 x 50
##     psu sampweight  year year_strata Demo_Race Demo_Hispanic Demo_RaceEthnic…
##   <dbl>      <dbl> <dbl>       <dbl> <lgl>     <chr>         <chr>           
## 1     2       4316  1997       1998. NA        Hispanic      Hispanic (Race …
## 2     2       2845  1997       1998. NA        Hispanic      Hispanic (Race …
## 3     2       3783  1997       1998. NA        Hispanic      Hispanic (Race …
## 4     2       2466  1997       1998. NA        Hispanic      Hispanic (Race …
## 5     2       3794  1997       1998. NA        Hispanic      Hispanic (Race …
## 6     1       1793  1997       1998. NA        Hispanic      Hispanic (Race …
## # … with 43 more variables: Demo_Region <chr>, Demo_sex_C <chr>,
## #   Demo_sexorien_C <lgl>, Demo_belowpovertyline_B <dbl>, Demo_age_N <dbl>,
## #   Demo_agerange_C <chr>, Demo_marital_C <chr>, Demo_hourswrk_C <chr>,
## #   MentalHealth_MentalIllnessK6_N <dbl>, MentalHealth_MentalIllnessK6_C <chr>,
## #   MentalHealth_SeriousMentalIllnessK6_B <dbl>,
## #   MentalHealth_depressionmeds_B <lgl>, Health_SelfRatedHealth_C <chr>,
## #   Health_diagnosed_STD5yr_B <lgl>, Health_BirthControlNow_B <lgl>,
## #   Health_EverHaveHeartAttack_B <dbl>, Health_EverHaveHeartCondition_B <dbl>,
## #   Health_EverHaveCancer_B <dbl>, Health_EverHaveDiabetes_B <dbl>,
## #   Health_EverHavePrediabetes_B <lgl>, Health_EverHaveAsthma_B <dbl>,
## #   Health_StillHaveAsthma_B <dbl>, Health_HIVAidsRisk_C <chr>,
## #   Health_HIVAidsHighRisk_B <dbl>, Health_EverTakeHIVTest_B <dbl>,
## #   Health_EverHaveHypertension_B <dbl>, Health_BMI_N <dbl>,
## #   Health_BMI_C <chr>, Health_BMIOverweight_B <dbl>, Health_BMIObese_B <dbl>,
## #   Health_Weight_N <dbl>, Health_Height_N <dbl>,
## #   Health_UsualPlaceHealthcare_C <chr>, Health_UsualPlaceHealthcare_B <dbl>,
## #   Health_AbnormalPapPast3yr_B <lgl>, Behav_EverSmokeCigs_B <dbl>,
## #   Behav_CigsPerDay_N <dbl>, Behav_CigsPerDay_C <chr>,
## #   Behav_AgeStartSmoking <dbl>, Behav_AlcDaysPerYear_N <dbl>,
## #   Behav_AlcDaysPerWeek_N <dbl>, Behav_BingeDrinkDaysYear_N <dbl>,
## #   Behav_BingeDrinkDaysYear_C <chr>

I hypothesize that females are more likely than males to have been diagnosed with an STD in the past 5 years.

Distribution Table

# Cross-tabulate STD diagnosis (rows) against sex (columns).
# table() drops NA responses by default, so only TRUE/FALSE rows appear.
table(
  NHIS_Data[["Health_diagnosed_STD5yr_B"]],
  NHIS_Data[["Demo_sex_C"]]
)

##        
##         female  male
##   FALSE  93738 78005
##   TRUE    3455  1588

# Count respondents in every STD-response x sex cell; unlike table(), this
# keeps the NA (no recorded answer) responses as their own rows.
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n())

## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.

## # A tibble: 6 x 3
## # Groups:   Health_diagnosed_STD5yr_B [3]
##   Health_diagnosed_STD5yr_B Demo_sex_C      n
##   <lgl>                     <chr>       <int>
## 1 FALSE                     female      93738
## 2 FALSE                     male        78005
## 3 TRUE                      female       3455
## 4 TRUE                      male         1588
## 5 NA                        female     249535
## 6 NA                        male       193216

Null Hypothesis

# Expected cell counts if STD diagnosis and sex were independent.
# Selected by name rather than by position in the test result.
chisq.test(
  NHIS_Data$Health_diagnosed_STD5yr_B,
  NHIS_Data$Demo_sex_C
)["expected"]

## $expected
##        NHIS_Data$Demo_sex_C
##            female      male
##   FALSE 94420.471 77322.529
##   TRUE   2772.529  2270.471

Actual Observation

# Observed cell counts actually used by the chi-square test.
# Selected by name rather than by position in the test result.
chisq.test(
  NHIS_Data$Health_diagnosed_STD5yr_B,
  NHIS_Data$Demo_sex_C
)["observed"]

## $observed
##        NHIS_Data$Demo_sex_C
##         female  male
##   FALSE  93738 78005
##   TRUE    3455  1588

The expected (null) and observed counts are similar in the FALSE row; however, there is quite a difference in the TRUE row.

* Under the null hypothesis, about 2,773 female respondents would be expected to report having been diagnosed with an STD within the past 5 years; however, I observed 3,455 female respondents.

* Under the null hypothesis, about 2,270 male respondents would be expected to report having been diagnosed with an STD within the past 5 years; however, I observed 1,588 male respondents.

Row and Column

# Share of each sex within every STD-response group.
# summarise() drops only the last grouping level, so the result is still
# grouped by Health_diagnosed_STD5yr_B and sum(n) is the total of that
# response group (the two sex rows of each group add up to 1).
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))

## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.

## # A tibble: 6 x 4
## # Groups:   Health_diagnosed_STD5yr_B [3]
##   Health_diagnosed_STD5yr_B Demo_sex_C      n percent
##   <lgl>                     <chr>       <int>   <dbl>
## 1 FALSE                     female      93738   0.546
## 2 FALSE                     male        78005   0.454
## 3 TRUE                      female       3455   0.685
## 4 TRUE                      male         1588   0.315
## 5 NA                        female     249535   0.564
## 6 NA                        male       193216   0.436

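Among respondents who reported no STD diagnosis in the past 5 years (FALSE), about 55% were female and 45% were male. Among those who reported a diagnosis (TRUE), about 68.5% were female and 31.5% were male. Among respondents with no recorded answer (NA), about 56% were female and 44% were male. Note that these percentages describe the sex composition of each response group, not the diagnosis rate within each sex; from the counts above, about 3.6% of females who answered (3,455 of 97,193) reported a diagnosis, compared to about 2.0% of males (1,588 of 79,593).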

# Stacked bar chart: one bar per STD-response group (FALSE / TRUE / NA),
# split by the share of each sex within that group.
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(
    mapping = aes(
      x = Health_diagnosed_STD5yr_B,
      y = percent,
      fill = Demo_sex_C
    )
  ) +
  geom_col()

## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.

Chi-Square Test for Independence

# Pearson chi-square test of independence between STD diagnosis and sex.
# correct = TRUE is the default (Yates' continuity correction for 2 x 2
# tables); it is spelled out here only to make that choice visible.
chisq.test(
  x = NHIS_Data$Health_diagnosed_STD5yr_B,
  y = NHIS_Data$Demo_sex_C,
  correct = TRUE
)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  NHIS_Data$Health_diagnosed_STD5yr_B and NHIS_Data$Demo_sex_C
## X-squared = 383.53, df = 1, p-value < 2.2e-16

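The results provide strong evidence against the null hypothesis of independence. The p-value is reported in scientific notation as < 2.2e-16, which is far below the conventional 0.05 threshold, so there is a statistically significant relationship between the sex of the respondents and the diagnosis of STDs. Together with the observed counts (more females and fewer males reporting a diagnosis than expected under independence), this is consistent with the hypothesis that females are more likely than males to report an STD diagnosis.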

Categorical Data Analysis

Chelsy Rodriguez