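I will be investigating the relationship between Health_SelfRatedHealth_C and Behav_CigsPerDay_N, comparing respondents whose Health_SelfRatedHealth_C is "Poor" with those whose Health_SelfRatedHealth_C is "Excellent".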
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the survey extract.
# With readr's default of guessing column types from the first 1000 rows,
# sparse columns were mis-typed (e.g. Demo_Race was guessed as logical), which
# produced hundreds of thousands of parsing failures whose values are silently
# replaced by NA. Guessing from far more rows avoids that. The two columns this
# analysis relies on are pinned to the types they already had, so the
# downstream results do not depend on the guess.
Data <- read_csv(
  "Downloads/Data.csv",
  col_types = cols(
    Health_SelfRatedHealth_C = col_character(),
    Behav_CigsPerDay_N = col_double()
  ),
  guess_max = 1000000
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## Demo_Race = col_logical(),
## Demo_Hispanic = col_character(),
## Demo_RaceEthnicity = col_character(),
## Demo_Region = col_character(),
## Demo_sex_C = col_character(),
## Demo_sexorien_C = col_logical(),
## Demo_agerange_C = col_character(),
## Demo_marital_C = col_character(),
## Demo_hourswrk_C = col_character(),
## MentalHealth_MentalIllnessK6_C = col_character(),
## MentalHealth_depressionmeds_B = col_logical(),
## Health_SelfRatedHealth_C = col_character(),
## Health_diagnosed_STD5yr_B = col_logical(),
## Health_BirthControlNow_B = col_logical(),
## Health_EverHavePrediabetes_B = col_logical(),
## Health_HIVAidsRisk_C = col_character(),
## Health_BMI_C = col_character(),
## Health_UsualPlaceHealthcare_C = col_character(),
## Health_AbnormalPapPast3yr_B = col_logical(),
## Behav_CigsPerDay_C = col_character()
## # ... with 1 more columns
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 683386 parsing failures.
## row col expected actual file
## 68557 Demo_Race 1/0/T/F/TRUE/FALSE Black or African American 'Downloads/Data.csv'
## 68558 Demo_Race 1/0/T/F/TRUE/FALSE Asian 'Downloads/Data.csv'
## 68559 Demo_Race 1/0/T/F/TRUE/FALSE American Indian or Alaskan Native 'Downloads/Data.csv'
## 68560 Demo_Race 1/0/T/F/TRUE/FALSE White 'Downloads/Data.csv'
## 68561 Demo_Race 1/0/T/F/TRUE/FALSE White 'Downloads/Data.csv'
## ..... ......... .................. ................................. ....................
## See problems(...) for more details.
# Analysis subset: rows with a non-missing Behav_CigsPerDay_N whose
# Health_SelfRatedHealth_C is "Poor" or "Excellent", reduced to those two
# columns.
data <- Data %>%
  filter(
    !is.na(Behav_CigsPerDay_N),
    Health_SelfRatedHealth_C %in% c("Poor", "Excellent")
  ) %>%
  select(Health_SelfRatedHealth_C, Behav_CigsPerDay_N)
# Bar chart: mean Behav_CigsPerDay_N for each Health_SelfRatedHealth_C group.
data %>%
  group_by(Health_SelfRatedHealth_C) %>%
  summarize(avg_Behav_CigsPerDay_N = mean(Behav_CigsPerDay_N)) %>%
  ggplot(aes(x = Health_SelfRatedHealth_C, y = avg_Behav_CigsPerDay_N)) +
  geom_col()
# Histogram of Behav_CigsPerDay_N, one panel (and fill colour) per
# Health_SelfRatedHealth_C group. The bin count is left at ggplot2's default.
ggplot(
  data = data,
  mapping = aes(x = Behav_CigsPerDay_N, fill = Health_SelfRatedHealth_C)
) +
  geom_histogram() +
  facet_wrap(vars(Health_SelfRatedHealth_C))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simulated sampling distribution of the mean for the "Excellent" group:
# 10000 means, each computed from a random sample of 40 observations drawn
# without replacement from that group's Behav_CigsPerDay_N values.
edata <- data %>%
  filter(Health_SelfRatedHealth_C == "Excellent")

# Fix the RNG state (arbitrary seed) so the simulated distribution, and the
# histogram drawn from it, are reproducible from run to run.
set.seed(1)
e_Samp_Distro <- data.frame(
  mean = replicate(
    n = 10000,
    expr = mean(sample(edata$Behav_CigsPerDay_N, size = 40), na.rm = TRUE)
  )
)

ggplot() +
  geom_histogram(data = e_Samp_Distro, aes(x = mean), fill = "cadetblue2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simulated sampling distribution of the mean for the "Poor" group:
# 10000 means, each computed from a random sample of 40 observations drawn
# without replacement from that group's Behav_CigsPerDay_N values.
fdata <- data %>%
  filter(Health_SelfRatedHealth_C == "Poor")

# Fix the RNG state (arbitrary seed) so the simulated distribution, and the
# histogram drawn from it, are reproducible from run to run.
set.seed(2)
f_Samp_Distro <- data.frame(
  mean = replicate(
    n = 10000,
    expr = mean(sample(fdata$Behav_CigsPerDay_N, size = 40), na.rm = TRUE)
  )
)

ggplot() +
  geom_histogram(data = f_Samp_Distro, aes(x = mean), fill = "aquamarine2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Welch two-sample t-test (unequal variances, two-sided, 95% confidence
# interval) comparing mean Behav_CigsPerDay_N between the two
# Health_SelfRatedHealth_C groups. The arguments below restate the defaults.
t.test(
  Behav_CigsPerDay_N ~ Health_SelfRatedHealth_C,
  data = data,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Behav_CigsPerDay_N by Health_SelfRatedHealth_C
## t = -43.84, df = 23077, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.207124 -2.932622
## sample estimates:
## mean in group Excellent mean in group Poor
## 1.791539 4.861412