Skills Drill: Continuous Data Analysis

“Importing data:”

library(readr)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)
# Read the survey CSV into a data frame.
# NOTE(review): absolute, machine-specific path; the report will only knit on
# a machine where this file exists at this location.
data <- read.csv(file = "C:/Users/jammi/Downloads/SD3 NHIS Data.csv")

# Preview the first rows to confirm the import worked.
data %>%
  head()

##   year                 Demo_Race Demo_sex_C Demo_sexorien_C
## 1 2016 Black or African American       male        Straight
## 2 2016 Black or African American     female        Straight
## 3 2016                     White       male        Straight
## 4 2016                     White     female        Straight
## 5 2016                     White     female        Straight
## 6 2016                     White     female        Straight
##   Demo_belowpovertyline_B Demo_agerange_C      Demo_marital_C
## 1                       1           18-29       Never Married
## 2                       0           18-29 DivorcedOrSeparated
## 3                       0           70-79 DivorcedOrSeparated
## 4                       0           30-39             Married
## 5                       0           30-39       Never Married
## 6                       0           18-29       Never Married
##   Health_SelfRatedHealth_C MentalHealth_MentalIllnessK6_N Health_BMI_N
## 1                     Good                              3        33.36
## 2                Excellent                              0        20.19
## 3                     Good                              0        27.27
## 4                     Good                              0        38.62
## 5                Excellent                              2        39.95
## 6                Very Good                              0        18.83
##   Behav_CigsPerDay_N Behav_AlcDaysPerYear_N Behav_AlcDaysPerWeek_N
## 1                  0                     36                      1
## 2                  0                     12                      0
## 3                  0                     NA                     NA
## 4                  0                     24                      0
## 5                  5                     12                      0
## 6                  0                     NA                     NA
##   Behav_BingeDrinkDaysYear_N
## 1                          0
## 2                          0
## 3                         NA
## 4                          0
## 5                          0
## 6                         NA

# Display the structure of the data frame: its dimensions and each column's
# name, type, and first few values.
str(data)

## 'data.frame':    33028 obs. of  14 variables:
##  $ year                          : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ Demo_Race                     : chr  "Black or African American" "Black or African American" "White" "White" ...
##  $ Demo_sex_C                    : chr  "male" "female" "male" "female" ...
##  $ Demo_sexorien_C               : chr  "Straight" "Straight" "Straight" "Straight" ...
##  $ Demo_belowpovertyline_B       : int  1 0 0 0 0 0 1 1 1 1 ...
##  $ Demo_agerange_C               : chr  "18-29" "18-29" "70-79" "30-39" ...
##  $ Demo_marital_C                : chr  "Never Married" "DivorcedOrSeparated" "DivorcedOrSeparated" "Married" ...
##  $ Health_SelfRatedHealth_C      : chr  "Good" "Excellent" "Good" "Good" ...
##  $ MentalHealth_MentalIllnessK6_N: int  3 0 0 0 2 0 4 19 2 3 ...
##  $ Health_BMI_N                  : num  33.4 20.2 27.3 38.6 40 ...
##  $ Behav_CigsPerDay_N            : int  0 0 0 0 5 0 0 3 0 0 ...
##  $ Behav_AlcDaysPerYear_N        : int  36 12 NA 24 12 NA 5 52 52 NA ...
##  $ Behav_AlcDaysPerWeek_N        : int  1 0 NA 0 0 NA 0 1 1 NA ...
##  $ Behav_BingeDrinkDaysYear_N    : int  0 0 NA 0 0 NA 0 3 12 NA ...

Hypothesis: I hypothesize that the mean of Behav_AlcDaysPerYear_N (days per year on which alcohol was consumed) differs between the two groups of Demo_sex_C (male and female).

“Select and filter the two groups:”

# Keep only the outcome and the grouping column, restrict to the two sex
# groups being compared, and drop respondents with a missing outcome.
# The previous condition `!is.na(TRUE)` was a constant TRUE, so it never
# removed any rows; the NA check must be applied to the outcome column.
data_start <- data %>%
  select(Behav_AlcDaysPerYear_N, Demo_sex_C) %>%
  filter(
    Demo_sex_C %in% c("male", "female"),
    !is.na(Behav_AlcDaysPerYear_N)
  )

“The mean of Behav_AlcDaysPerYear_N”

 # Average number of drinking days per year within each sex group.
# Missing responses are ignored when computing each group's mean.
by_sex <- group_by(data, Demo_sex_C)
summarize(
  by_sex,
  Average = mean(Behav_AlcDaysPerYear_N, na.rm = TRUE)
)

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 2 x 2
##   Demo_sex_C Average
##   <chr>        <dbl>
## 1 female        56.5
## 2 male          82.3

Creating a histogram of Behav_AlcDaysPerYear_N for all respondents (females and males combined):

# Histogram of drinking days per year across all respondents.
# The x aesthetic is declared once on the plot and inherited by the layer.
ggplot(data, aes(x = Behav_AlcDaysPerYear_N)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 6485 rows containing non-finite values (stat_bin).

Interpretation: The histogram shows that the distribution of Behav_AlcDaysPerYear_N peaks at about 0.

# Same histogram of drinking days per year, drawn in one panel per sex so
# the two distributions can be compared side by side.
ggplot(data, aes(x = Behav_AlcDaysPerYear_N)) +
  geom_histogram() +
  facet_wrap(facets = ~Demo_sex_C)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 6485 rows containing non-finite values (stat_bin).

Interpretation: Females tend to report fewer alcohol-drinking days per year than males.

# Outcome and grouping column only, limited to the male and female groups,
# with missing outcome values removed.
# The previous condition `!is.na(TRUE)` always evaluated to TRUE and so
# filtered nothing; the NA check now targets the outcome column.
data_middle <- data %>%
  select(Behav_AlcDaysPerYear_N, Demo_sex_C) %>%
  filter(
    Demo_sex_C %in% c("male", "female"),
    !is.na(Behav_AlcDaysPerYear_N)
  )

# All rows for male respondents (every column is kept).
male_data <- data %>%
  filter(Demo_sex_C == "male")

# Mean drinking days per year in one random sample of 40 males.
# Missing values are removed BEFORE sampling so the mean is always based on
# exactly 40 observations. Sampling first and then calling
# mean(na.rm = TRUE) silently shrinks the sample by however many NAs
# happened to be drawn.
male_data %>%
  filter(!is.na(Behav_AlcDaysPerYear_N)) %>%
  pull(Behav_AlcDaysPerYear_N) %>%
  sample(40) %>%
  mean()

## [1] 60.82353

# All rows for female respondents (every column is kept).
female_data <- data %>%
  filter(Demo_sex_C == "female")

# Mean drinking days per year in one random sample of 40 females.
# Missing values are removed BEFORE sampling so the mean is always based on
# exactly 40 observations. Sampling first and then calling
# mean(na.rm = TRUE) silently shrinks the sample by however many NAs
# happened to be drawn.
female_data %>%
  filter(!is.na(Behav_AlcDaysPerYear_N)) %>%
  pull(Behav_AlcDaysPerYear_N) %>%
  sample(40) %>%
  mean()

## [1] 44.81481

Interpretation: In these samples, males have a higher average number of alcohol-drinking days per year than females.

# Simulated sampling distribution of the mean for males: draw 10,000 random
# samples of 40 and record each sample's mean drinking days per year.
# NAs are removed once, up front, so every replicate averages exactly 40
# observed values. Previously NAs were sampled and then discarded by
# mean(na.rm = TRUE), giving each replicate a different, smaller sample size.
male_days <- male_data$Behav_AlcDaysPerYear_N
male_days <- male_days[!is.na(male_days)]

# One-column data frame (`mean`) so it can be passed straight to ggplot.
male_Scale <- data.frame(
  mean = replicate(10000, mean(sample(male_days, 40)))
)

# Histogram of the simulated sample means.
male_Scale %>%
  ggplot() +
  geom_histogram(aes(x = mean), fill = "Blue")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Interpretation: The graph shows that the sample means for males are centered at about 80 alcohol-drinking days per year.

# Simulated sampling distribution of the mean for females: draw 10,000 random
# samples of 40 and record each sample's mean drinking days per year.
# NAs are removed once, up front, so every replicate averages exactly 40
# observed values. Previously NAs were sampled and then discarded by
# mean(na.rm = TRUE), giving each replicate a different, smaller sample size.
female_days <- female_data$Behav_AlcDaysPerYear_N
female_days <- female_days[!is.na(female_days)]

# One-column data frame (`mean`) so it can be passed straight to ggplot.
female_Scale <- data.frame(
  mean = replicate(10000, mean(sample(female_days, 40)))
)

# Histogram of the simulated sample means.
female_Scale %>%
  ggplot() +
  geom_histogram(aes(x = mean), fill = "Pink")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Interpretation: The graph shows that the sample means for females are centered at about 52 alcohol-drinking days per year.

# Overlay the two simulated sampling distributions on one set of axes.
# The shared x mapping is declared once on the plot; each layer supplies its
# own data frame and fill colour. The female layer is drawn last, on top.
ggplot(mapping = aes(x = mean)) +
  geom_histogram(data = male_Scale, fill = "blue") +
  geom_histogram(data = female_Scale, fill = "pink")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Interpretation: The graph shows that males have a higher mean number of alcohol-drinking days per year than females.

“T test”

# Two-sample t-test comparing mean drinking days per year between the two
# sex groups. The formula form splits the outcome by Demo_sex_C; with the
# default settings the printed result is a Welch (unequal-variance) test
# with a two-sided alternative.
t.test(Behav_AlcDaysPerYear_N~Demo_sex_C, data = data)

## 
##  Welch Two Sample t-test
## 
## data:  Behav_AlcDaysPerYear_N by Demo_sex_C
## t = -20.751, df = 25188, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -28.31106 -23.42436
## sample estimates:
## mean in group female   mean in group male 
##             56.47358             82.34129

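Interpretation: There is a statistically significant difference between females and males in the mean number of days per year on which they drink alcohol (t = -20.751, p < 2.2e-16). Males average about 82.3 days per year compared with about 56.5 for females, and the 95% confidence interval for the difference (female minus male) runs from -28.3 to -23.4 days.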