“Importing data:”
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
# Read the CSV file into the data frame `data`, then print its first rows.
data <- read.csv(
  file = "C:/Users/jammi/Downloads/SD3 NHIS Data.csv"
)
head(data)
## year Demo_Race Demo_sex_C Demo_sexorien_C
## 1 2016 Black or African American male Straight
## 2 2016 Black or African American female Straight
## 3 2016 White male Straight
## 4 2016 White female Straight
## 5 2016 White female Straight
## 6 2016 White female Straight
## Demo_belowpovertyline_B Demo_agerange_C Demo_marital_C
## 1 1 18-29 Never Married
## 2 0 18-29 DivorcedOrSeparated
## 3 0 70-79 DivorcedOrSeparated
## 4 0 30-39 Married
## 5 0 30-39 Never Married
## 6 0 18-29 Never Married
## Health_SelfRatedHealth_C MentalHealth_MentalIllnessK6_N Health_BMI_N
## 1 Good 3 33.36
## 2 Excellent 0 20.19
## 3 Good 0 27.27
## 4 Good 0 38.62
## 5 Excellent 2 39.95
## 6 Very Good 0 18.83
## Behav_CigsPerDay_N Behav_AlcDaysPerYear_N Behav_AlcDaysPerWeek_N
## 1 0 36 1
## 2 0 12 0
## 3 0 NA NA
## 4 0 24 0
## 5 5 12 0
## 6 0 NA NA
## Behav_BingeDrinkDaysYear_N
## 1 0
## 2 0
## 3 NA
## 4 0
## 5 0
## 6 NA
# Show the column names, types, and first values of `data`.
str(object = data)
## 'data.frame': 33028 obs. of 14 variables:
## $ year : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
## $ Demo_Race : chr "Black or African American" "Black or African American" "White" "White" ...
## $ Demo_sex_C : chr "male" "female" "male" "female" ...
## $ Demo_sexorien_C : chr "Straight" "Straight" "Straight" "Straight" ...
## $ Demo_belowpovertyline_B : int 1 0 0 0 0 0 1 1 1 1 ...
## $ Demo_agerange_C : chr "18-29" "18-29" "70-79" "30-39" ...
## $ Demo_marital_C : chr "Never Married" "DivorcedOrSeparated" "DivorcedOrSeparated" "Married" ...
## $ Health_SelfRatedHealth_C : chr "Good" "Excellent" "Good" "Good" ...
## $ MentalHealth_MentalIllnessK6_N: int 3 0 0 0 2 0 4 19 2 3 ...
## $ Health_BMI_N : num 33.4 20.2 27.3 38.6 40 ...
## $ Behav_CigsPerDay_N : int 0 0 0 0 5 0 0 3 0 0 ...
## $ Behav_AlcDaysPerYear_N : int 36 12 NA 24 12 NA 5 52 52 NA ...
## $ Behav_AlcDaysPerWeek_N : int 1 0 NA 0 0 NA 0 1 1 NA ...
## $ Behav_BingeDrinkDaysYear_N : int 0 0 NA 0 0 NA 0 3 12 NA ...
Hypothesis: I hypothesize that the mean of Behav_AlcDaysPerYear_N differs between the male and female groups of Demo_sex_C.
“Select and filter the two groups:”
# Keep only Behav_AlcDaysPerYear_N and Demo_sex_C, restricted to rows coded
# "male" or "female" whose Behav_AlcDaysPerYear_N is not missing.
# The earlier condition `!is.na(TRUE)` was always TRUE, so it never removed
# any row with a missing Behav_AlcDaysPerYear_N.
data_start <- data %>%
  select(Behav_AlcDaysPerYear_N, Demo_sex_C) %>%
  filter(
    Demo_sex_C %in% c("male", "female"),
    !is.na(Behav_AlcDaysPerYear_N)
  )
“The mean of Behav_AlcDaysPerYear_N”
# Mean of Behav_AlcDaysPerYear_N within each Demo_sex_C group; missing values
# are ignored when averaging.
summarise(
  group_by(data, Demo_sex_C),
  Average = mean(Behav_AlcDaysPerYear_N, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## Demo_sex_C Average
## <chr> <dbl>
## 1 female 56.5
## 2 male 82.3
Creating a histogram of Behav_AlcDaysPerYear_N for all respondents (males and females combined):
# Histogram of Behav_AlcDaysPerYear_N over all rows of `data`.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = Behav_AlcDaysPerYear_N))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6485 rows containing non-finite values (stat_bin).
Interpretation: The histogram shows that most values of Behav_AlcDaysPerYear_N are concentrated at or near 0.
# Histogram of Behav_AlcDaysPerYear_N, one panel per Demo_sex_C value.
ggplot(data = data) +
  geom_histogram(mapping = aes(x = Behav_AlcDaysPerYear_N)) +
  facet_wrap(~Demo_sex_C)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6485 rows containing non-finite values (stat_bin).
Interpretation: Females tend to report fewer alcohol-drinking days per year (Behav_AlcDaysPerYear_N) than males.
# Keep only Behav_AlcDaysPerYear_N and Demo_sex_C, restricted to rows coded
# "male" or "female" whose Behav_AlcDaysPerYear_N is not missing.
# The earlier condition `!is.na(TRUE)` was always TRUE, so it never removed
# any row with a missing Behav_AlcDaysPerYear_N.
data_middle <- data %>%
  select(Behav_AlcDaysPerYear_N, Demo_sex_C) %>%
  filter(
    Demo_sex_C %in% c("male", "female"),
    !is.na(Behav_AlcDaysPerYear_N)
  )
# Rows of `data` where Demo_sex_C is "male".
male_data <- data %>%
  filter(Demo_sex_C == "male")

# Mean of one random sample of 40 non-missing Behav_AlcDaysPerYear_N values.
# Missing values are removed BEFORE sampling: drawing 40 values that include
# NAs and dropping them afterwards with na.rm = TRUE could leave the mean
# based on fewer than 40 observations.
male_data %>%
  filter(!is.na(Behav_AlcDaysPerYear_N)) %>%
  pull(Behav_AlcDaysPerYear_N) %>%
  sample(40) %>%
  mean()
## [1] 60.82353
# Rows of `data` where Demo_sex_C is "female".
female_data <- data %>%
  filter(Demo_sex_C == "female")

# Mean of one random sample of 40 non-missing Behav_AlcDaysPerYear_N values.
# Missing values are removed BEFORE sampling: drawing 40 values that include
# NAs and dropping them afterwards with na.rm = TRUE could leave the mean
# based on fewer than 40 observations.
female_data %>%
  filter(!is.na(Behav_AlcDaysPerYear_N)) %>%
  pull(Behav_AlcDaysPerYear_N) %>%
  sample(40) %>%
  mean()
## [1] 44.81481
Interpretation: In these two random samples, males have a higher average number of alcohol-drinking days per year than females (about 60.8 vs. 44.8).
# Sampling distribution of the mean for males: 10000 means, each computed from
# a random sample of 40 non-missing Behav_AlcDaysPerYear_N values. The result
# is a data frame with a single column named `mean`.
# Missing values are dropped once, up front, so every replicate averages
# exactly 40 observations (previously NAs were sampled and then discarded,
# which made the effective sample size vary between replicates).
male_Scale <- local({
  observed <- male_data$Behav_AlcDaysPerYear_N
  observed <- observed[!is.na(observed)]
  data.frame(mean = replicate(10000, mean(sample(observed, 40))))
})
# Histogram of the simulated male sample means.
ggplot(data = male_Scale) +
  geom_histogram(mapping = aes(x = mean), fill = "Blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Interpretation: The graph shows that the male sample means of Behav_AlcDaysPerYear_N are centered at about 80.
# Sampling distribution of the mean for females: 10000 means, each computed
# from a random sample of 40 non-missing Behav_AlcDaysPerYear_N values. The
# result is a data frame with a single column named `mean`.
# Missing values are dropped once, up front, so every replicate averages
# exactly 40 observations (previously NAs were sampled and then discarded,
# which made the effective sample size vary between replicates).
female_Scale <- local({
  observed <- female_data$Behav_AlcDaysPerYear_N
  observed <- observed[!is.na(observed)]
  data.frame(mean = replicate(10000, mean(sample(observed, 40))))
})
# Histogram of the simulated female sample means.
ggplot(data = female_Scale) +
  geom_histogram(mapping = aes(x = mean), fill = "Pink")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Interpretation: The graph shows that the female sample means of Behav_AlcDaysPerYear_N are centered at about 52.
# Overlay both sets of simulated sample means on one set of axes:
# male in blue, drawn first, and female in pink, drawn on top.
ggplot() +
  geom_histogram(mapping = aes(x = mean), data = male_Scale, fill = "blue") +
  geom_histogram(mapping = aes(x = mean), data = female_Scale, fill = "pink")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
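Interpretation: The graph shows that the distribution of male sample means lies to the right of the female distribution, i.e. males have a higher mean number of alcohol-drinking days per year than females.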
“T test”
# Two-sample t-test of Behav_AlcDaysPerYear_N between the Demo_sex_C groups,
# using the formula interface on the full `data` frame.
t.test(
  Behav_AlcDaysPerYear_N ~ Demo_sex_C,
  data = data
)
##
## Welch Two Sample t-test
##
## data: Behav_AlcDaysPerYear_N by Demo_sex_C
## t = -20.751, df = 25188, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -28.31106 -23.42436
## sample estimates:
## mean in group female mean in group male
## 56.47358 82.34129
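Interpretation: There is a statistically significant difference between females and males in mean Behav_AlcDaysPerYear_N (Welch t = -20.751, df = 25188, p < 2.2e-16): males average about 82.3 alcohol-drinking days per year versus about 56.5 for females.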