library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning in file(con, "r"): cannot open file '/var/db/timezone/zoneinfo/
## +VERSION': No such file or directory
# Load the NHIS extract.
# readr guesses each column's type from the leading rows only. Demo_Race is
# NA in those rows, so it was guessed as logical and every later race label
# ("White", "Asian", ...) failed to parse and was silently turned into NA.
# Pin Demo_Race to character so those values are kept.
# NOTE(review): other columns guessed as logical (e.g. Demo_sexorien_C) may
# suffer the same problem -- inspect problems(NHIS_Data) after loading.
NHIS_Data <- read_csv(
  "/Users/chelsyrodriguez/Downloads/NHIS Data.csv",
  col_types = cols(Demo_Race = col_character())
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Demo_Race = col_logical(),
## Demo_Hispanic = col_character(),
## Demo_RaceEthnicity = col_character(),
## Demo_Region = col_character(),
## Demo_sex_C = col_character(),
## Demo_sexorien_C = col_logical(),
## Demo_agerange_C = col_character(),
## Demo_marital_C = col_character(),
## Demo_hourswrk_C = col_character(),
## MentalHealth_MentalIllnessK6_C = col_character(),
## MentalHealth_depressionmeds_B = col_logical(),
## Health_SelfRatedHealth_C = col_character(),
## Health_diagnosed_STD5yr_B = col_logical(),
## Health_BirthControlNow_B = col_logical(),
## Health_EverHavePrediabetes_B = col_logical(),
## Health_HIVAidsRisk_C = col_character(),
## Health_BMI_C = col_character(),
## Health_UsualPlaceHealthcare_C = col_character(),
## Health_AbnormalPapPast3yr_B = col_logical(),
## Behav_CigsPerDay_C = col_character()
## # ... with 1 more columns
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 683386 parsing failures.
## row col expected actual file
## 68557 Demo_Race 1/0/T/F/TRUE/FALSE Black or African American '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68558 Demo_Race 1/0/T/F/TRUE/FALSE Asian '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68559 Demo_Race 1/0/T/F/TRUE/FALSE American Indian or Alaskan Native '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68560 Demo_Race 1/0/T/F/TRUE/FALSE White '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## 68561 Demo_Race 1/0/T/F/TRUE/FALSE White '/Users/chelsyrodriguez/Downloads/NHIS Data.csv'
## ..... ......... .................. ................................. ................................................
## See problems(...) for more details.
# Preview the first six rows to check the import and the column types.
head(NHIS_Data, n = 6L)
## # A tibble: 6 x 50
## psu sampweight year year_strata Demo_Race Demo_Hispanic Demo_RaceEthnic…
## <dbl> <dbl> <dbl> <dbl> <lgl> <chr> <chr>
## 1 2 4316 1997 1998. NA Hispanic Hispanic (Race …
## 2 2 2845 1997 1998. NA Hispanic Hispanic (Race …
## 3 2 3783 1997 1998. NA Hispanic Hispanic (Race …
## 4 2 2466 1997 1998. NA Hispanic Hispanic (Race …
## 5 2 3794 1997 1998. NA Hispanic Hispanic (Race …
## 6 1 1793 1997 1998. NA Hispanic Hispanic (Race …
## # … with 43 more variables: Demo_Region <chr>, Demo_sex_C <chr>,
## # Demo_sexorien_C <lgl>, Demo_belowpovertyline_B <dbl>, Demo_age_N <dbl>,
## # Demo_agerange_C <chr>, Demo_marital_C <chr>, Demo_hourswrk_C <chr>,
## # MentalHealth_MentalIllnessK6_N <dbl>, MentalHealth_MentalIllnessK6_C <chr>,
## # MentalHealth_SeriousMentalIllnessK6_B <dbl>,
## # MentalHealth_depressionmeds_B <lgl>, Health_SelfRatedHealth_C <chr>,
## # Health_diagnosed_STD5yr_B <lgl>, Health_BirthControlNow_B <lgl>,
## # Health_EverHaveHeartAttack_B <dbl>, Health_EverHaveHeartCondition_B <dbl>,
## # Health_EverHaveCancer_B <dbl>, Health_EverHaveDiabetes_B <dbl>,
## # Health_EverHavePrediabetes_B <lgl>, Health_EverHaveAsthma_B <dbl>,
## # Health_StillHaveAsthma_B <dbl>, Health_HIVAidsRisk_C <chr>,
## # Health_HIVAidsHighRisk_B <dbl>, Health_EverTakeHIVTest_B <dbl>,
## # Health_EverHaveHypertension_B <dbl>, Health_BMI_N <dbl>,
## # Health_BMI_C <chr>, Health_BMIOverweight_B <dbl>, Health_BMIObese_B <dbl>,
## # Health_Weight_N <dbl>, Health_Height_N <dbl>,
## # Health_UsualPlaceHealthcare_C <chr>, Health_UsualPlaceHealthcare_B <dbl>,
## # Health_AbnormalPapPast3yr_B <lgl>, Behav_EverSmokeCigs_B <dbl>,
## # Behav_CigsPerDay_N <dbl>, Behav_CigsPerDay_C <chr>,
## # Behav_AgeStartSmoking <dbl>, Behav_AlcDaysPerYear_N <dbl>,
## # Behav_AlcDaysPerWeek_N <dbl>, Behav_BingeDrinkDaysYear_N <dbl>,
## # Behav_BingeDrinkDaysYear_C <chr>
I hypothesize that females are more likely than males to have been diagnosed with an STD in the past 5 years.
# Cross-tabulate STD diagnosis (rows) against sex (columns).
# table() leaves out NA answers by default, so only FALSE/TRUE rows appear.
table(
  NHIS_Data[["Health_diagnosed_STD5yr_B"]],
  NHIS_Data[["Demo_sex_C"]]
)
##
## female male
## FALSE 93738 78005
## TRUE 3455 1588
# Count respondents for every combination of STD-diagnosis status and sex.
# Unlike table(), this keeps respondents with a missing STD answer as their
# own NA group.
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n())
## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.
## # A tibble: 6 x 3
## # Groups: Health_diagnosed_STD5yr_B [3]
## Health_diagnosed_STD5yr_B Demo_sex_C n
## <lgl> <chr> <int>
## 1 FALSE female 93738
## 2 FALSE male 78005
## 3 TRUE female 3455
## 4 TRUE male 1588
## 5 NA female 249535
## 6 NA male 193216
# Cell counts expected if STD diagnosis and sex were independent (the null).
# Select the component by name rather than by its position in the result.
chisq.test(
  NHIS_Data$Health_diagnosed_STD5yr_B,
  NHIS_Data$Demo_sex_C
)["expected"]
## $expected
## NHIS_Data$Demo_sex_C
## female male
## FALSE 94420.471 77322.529
## TRUE 2772.529 2270.471
# Cell counts actually observed in the data, for comparison with the
# expected counts. Select the component by name rather than by position.
chisq.test(
  NHIS_Data$Health_diagnosed_STD5yr_B,
  NHIS_Data$Demo_sex_C
)["observed"]
## $observed
## NHIS_Data$Demo_sex_C
## female male
## FALSE 93738 78005
## TRUE 3455 1588
In the FALSE row the observed counts are close to the counts expected under the null hypothesis, but in the TRUE row the observed and expected counts differ substantially relative to the size of the cells.
* Under the null hypothesis I expected about 2,773 female respondents to report an STD diagnosis within the past 5 years, but I observed 3,455.
* I expected about 2,270 male respondents to report an STD diagnosis within the past 5 years, but I observed 1,588.
# Cell counts by STD-diagnosis status and sex, plus each cell's share.
# summarise() drops only the last grouping level, so the result is still
# grouped by Health_diagnosed_STD5yr_B. `percent` is therefore the proportion
# of each sex WITHIN a diagnosis status (each status sums to 1), not the
# diagnosis rate within each sex.
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n))
## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: Health_diagnosed_STD5yr_B [3]
## Health_diagnosed_STD5yr_B Demo_sex_C n percent
## <lgl> <chr> <int> <dbl>
## 1 FALSE female 93738 0.546
## 2 FALSE male 78005 0.454
## 3 TRUE female 3455 0.685
## 4 TRUE male 1588 0.315
## 5 NA female 249535 0.564
## 6 NA male 193216 0.436
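Among respondents who reported no STD diagnosis in the past 5 years, 55% were female and 45% were male. Among respondents who reported an STD diagnosis in the past 5 years, 69% were female and 31% were male. Among respondents with a missing (NA) answer, 56% were female and 44% were male. Expressed as rates within each sex, 3,455 of the 97,193 females who answered (about 3.6%) reported a diagnosis, compared with 1,588 of the 79,593 males who answered (about 2.0%).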
# Bar chart with one bar per STD-diagnosis status (including NA), filled by
# the share of each sex within that status.
NHIS_Data %>%
  group_by(Health_diagnosed_STD5yr_B, Demo_sex_C) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(x = Health_diagnosed_STD5yr_B, y = percent, fill = Demo_sex_C)) +
  geom_col()
## `summarise()` has grouped output by 'Health_diagnosed_STD5yr_B'. You can override using the `.groups` argument.
# Pearson chi-squared test of independence between STD diagnosis and sex.
# correct = TRUE is the default; it applies Yates' continuity correction to
# this 2x2 table.
chisq.test(
  x = NHIS_Data$Health_diagnosed_STD5yr_B,
  y = NHIS_Data$Demo_sex_C,
  correct = TRUE
)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: NHIS_Data$Health_diagnosed_STD5yr_B and NHIS_Data$Demo_sex_C
## X-squared = 383.53, df = 1, p-value < 2.2e-16
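The results provide strong evidence against the null hypothesis. The p-value is reported in scientific notation as < 2.2e-16, which is far below the usual 0.05 threshold, so there is a statistically significant relationship between the sex of the respondents and the diagnosis of STDs. Because more females (and fewer males) reported a diagnosis than expected under the null hypothesis, the result is consistent with my hypothesis.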