#Intro In this assignment, I wanted to test whether there is a significant difference in smoking behavior (cigarettes smoked per day) between males and females.
#Data Prep
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the NHIS CSV export and keep only the rows that have a recorded
# cigarettes-per-day value; every later step works from this `voter` table.
voter <- filter(
  read_csv("/Users/gabriellakokhabi/Desktop/SD3 NHIS Data.csv"),
  !is.na(Behav_CigsPerDay_N)
)
## Parsed with column specification:
## cols(
## year = col_double(),
## Demo_Race = col_character(),
## Demo_sex_C = col_character(),
## Demo_sexorien_C = col_character(),
## Demo_belowpovertyline_B = col_double(),
## Demo_agerange_C = col_character(),
## Demo_marital_C = col_character(),
## Health_SelfRatedHealth_C = col_character(),
## MentalHealth_MentalIllnessK6_N = col_double(),
## Health_BMI_N = col_double(),
## Behav_CigsPerDay_N = col_double(),
## Behav_AlcDaysPerYear_N = col_double(),
## Behav_AlcDaysPerWeek_N = col_double(),
## Behav_BingeDrinkDaysYear_N = col_double()
## )
#Mean Comparison (Table)
# One row per sex with the mean number of cigarettes smoked per day.
summarise(
  group_by(voter, Demo_sex_C),
  Behav_CigsPerDay_N = mean(Behav_CigsPerDay_N, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## Demo_sex_C Behav_CigsPerDay_N
## <chr> <dbl>
## 1 female 1.63
## 2 male 2.34
#Visualization
# Histogram of cigarettes per day, drawn as one panel per sex.
ggplot(voter, aes(x = Behav_CigsPerDay_N)) +
  geom_histogram() +
  facet_wrap(vars(Demo_sex_C))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Mean Comparison (Chart)
# Bar chart of the mean number of cigarettes per day for each sex, with the
# mean printed on each bar.
voter %>%
  group_by(Demo_sex_C) %>%
  summarize(Behav_CigsPerDay_N = mean(Behav_CigsPerDay_N, na.rm = TRUE)) %>%
  ggplot(aes(x = Demo_sex_C, y = Behav_CigsPerDay_N)) +
  geom_col(aes(fill = Behav_CigsPerDay_N)) +
  # Round to 2 decimals: rounding to whole numbers labels both bars "2",
  # which hides the difference between the group means.
  geom_label(aes(label = round(Behav_CigsPerDay_N, 2)))
## `summarise()` ungrouping output (override with `.groups` argument)
#Interpretation The findings shown in the table and the chart are consistent, but they are presented differently. The table shows that, on average, men smoke more cigarettes per day than women (2.34 vs. 1.63). However, if you round the means to the nearest whole number, both come out to 2, even though the bars on the chart sit at different heights.
#Population Distribution for Male and Female Respondents Who Smoke
# Distribution of the raw cigarettes-per-day values in `voter`, split into a
# separate panel for each sex.
ggplot(data = voter) +
  geom_histogram(mapping = aes(x = Behav_CigsPerDay_N)) +
  facet_wrap(facets = ~Demo_sex_C)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Sampling Distribution
# Build and compare the sampling distributions of the mean number of
# cigarettes per day for male and female respondents.

# Fix the RNG seed so the random samples, and therefore the plot, are the
# same on every run.
set.seed(2020)

# Draw `n_reps` random samples of `sample_size` values (without replacement)
# from `values` and return a one-column data frame, `mean`, holding the mean
# of each sample.
sample_mean_dist <- function(values, n_reps = 10000, sample_size = 40) {
  # sample() errors when asked for more values than exist and treats a single
  # number n as 1:n, so stop early on inputs that are too short.
  stopifnot(length(values) > 1, length(values) >= sample_size)
  sample_means <- vapply(
    seq_len(n_reps),
    function(i) mean(sample(values, sample_size), na.rm = TRUE),
    numeric(1)
  )
  data.frame(mean = sample_means)
}

male_data <- voter %>%
  filter(Demo_sex_C == "male")
male_sampdist <- sample_mean_dist(male_data$Behav_CigsPerDay_N)

female_data <- voter %>%
  filter(Demo_sex_C == "female")
female_sampdist <- sample_mean_dist(female_data$Behav_CigsPerDay_N)

# Overlay the two sampling distributions. Fill is mapped to a label so the
# plot gets a legend identifying each colour; `bins = 30` states the default
# explicitly, which keeps the same binning without the stat_bin() message.
ggplot() +
  geom_histogram(
    data = male_sampdist,
    aes(x = mean, fill = "male"),
    alpha = 0.5,
    bins = 30
  ) +
  geom_histogram(
    data = female_sampdist,
    aes(x = mean, fill = "female"),
    alpha = 0.5,
    bins = 30
  ) +
  scale_fill_manual(
    name = "Sex",
    values = c(male = "green", female = "purple")
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#T-test
# Welch two-sample t-test (unequal variances, two-sided, 95% CI) comparing
# mean cigarettes per day between the two sexes.
t.test(
  Behav_CigsPerDay_N ~ Demo_sex_C,
  data = voter,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Behav_CigsPerDay_N by Demo_sex_C
## t = -10.623, df = 27761, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.8329784 -0.5734723
## sample estimates:
## mean in group female mean in group male
## 1.632841 2.336066
#Interpretation of T Test
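The t-test shows a statistically significant difference between men and women in the mean number of cigarettes smoked per day (t = -10.62, p < 2.2e-16). Men average about 2.34 cigarettes per day compared with 1.63 for women, and the 95% confidence interval puts the difference between roughly 0.57 and 0.83 cigarettes per day.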
did i do better?