Variable Selection & Research Question

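I will investigate the relationship between self-rated health (Health_SelfRatedHealth_C) and the number of cigarettes smoked per day (Behav_CigsPerDay_N): do people who rate their health as Poor smoke more cigarettes per day, on average, than people who rate their health as Excellent?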

Data Prep

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the survey extract.
# With readr's default type guessing (first 1000 rows only), Demo_Race was
# guessed as logical and every text value ("White", "Asian", ...) became a
# parsing failure, i.e. a silent NA. Declare Demo_Race as character, pin the
# types of the two analysis variables, and widen type guessing for the
# remaining columns.
# NOTE(review): other columns guessed as logical may need explicit types too;
# check problems(Data) after loading.
Data <- read_csv(
  "Downloads/Data.csv",
  col_types = cols(
    Demo_Race = col_character(),
    Health_SelfRatedHealth_C = col_character(),
    Behav_CigsPerDay_N = col_double()
  ),
  guess_max = 1000000
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   Demo_Race = col_logical(),
##   Demo_Hispanic = col_character(),
##   Demo_RaceEthnicity = col_character(),
##   Demo_Region = col_character(),
##   Demo_sex_C = col_character(),
##   Demo_sexorien_C = col_logical(),
##   Demo_agerange_C = col_character(),
##   Demo_marital_C = col_character(),
##   Demo_hourswrk_C = col_character(),
##   MentalHealth_MentalIllnessK6_C = col_character(),
##   MentalHealth_depressionmeds_B = col_logical(),
##   Health_SelfRatedHealth_C = col_character(),
##   Health_diagnosed_STD5yr_B = col_logical(),
##   Health_BirthControlNow_B = col_logical(),
##   Health_EverHavePrediabetes_B = col_logical(),
##   Health_HIVAidsRisk_C = col_character(),
##   Health_BMI_C = col_character(),
##   Health_UsualPlaceHealthcare_C = col_character(),
##   Health_AbnormalPapPast3yr_B = col_logical(),
##   Behav_CigsPerDay_C = col_character()
##   # ... with 1 more columns
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 683386 parsing failures.
##   row       col           expected                            actual                 file
## 68557 Demo_Race 1/0/T/F/TRUE/FALSE Black or African American         'Downloads/Data.csv'
## 68558 Demo_Race 1/0/T/F/TRUE/FALSE Asian                             'Downloads/Data.csv'
## 68559 Demo_Race 1/0/T/F/TRUE/FALSE American Indian or Alaskan Native 'Downloads/Data.csv'
## 68560 Demo_Race 1/0/T/F/TRUE/FALSE White                             'Downloads/Data.csv'
## 68561 Demo_Race 1/0/T/F/TRUE/FALSE White                             'Downloads/Data.csv'
## ..... ......... .................. ................................. ....................
## See problems(...) for more details.
# Analysis subset: respondents at the two extremes of self-rated health
# ("Poor" or "Excellent") with a known cigarettes-per-day value, reduced to
# the two variables under study.
data <- Data %>%
  filter(
    !is.na(Behav_CigsPerDay_N),
    Health_SelfRatedHealth_C %in% c("Poor", "Excellent")
  ) %>%
  select(Health_SelfRatedHealth_C, Behav_CigsPerDay_N)

Comparison of Means

  • People who rate their health as Poor smoke more cigarettes per day, on average, than people who rate their health as Excellent (about 4.9 vs. 1.8 cigarettes per day).
# Bar chart of the mean number of cigarettes smoked per day within each
# self-rated health group.
ggplot(
  summarise(
    group_by(data, Health_SelfRatedHealth_C),
    avg_Behav_CigsPerDay_N = mean(Behav_CigsPerDay_N)
  ),
  aes(x = Health_SelfRatedHealth_C, y = avg_Behav_CigsPerDay_N)
) +
  geom_col()

Comparison of Distribution

  • People who smoke 0 cigarettes per day are more likely to report Excellent health than people who smoke at least one cigarette per day.
# Histogram of cigarettes per day, one panel (and fill colour) per
# self-rated health group.
ggplot(data, aes(x = Behav_CigsPerDay_N, fill = Health_SelfRatedHealth_C)) +
  geom_histogram() +
  facet_wrap(vars(Health_SelfRatedHealth_C))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Sampling Distribution

# Fix the RNG seed so the simulated sampling distributions (and their
# histograms) are identical every time the report is knit.
set.seed(2021)

# Simulate the sampling distribution of the mean.
#
# Draws `reps` random samples of `size` values from `x` (without replacement)
# and returns the mean of each sample as a numeric vector of length `reps`.
# Sampling indices with sample.int() avoids sample()'s special case where a
# length-one numeric `x` is treated as the range 1:x.
sample_means <- function(x, size = 40, reps = 10000) {
  stopifnot(is.numeric(x), length(x) >= size)
  vapply(
    seq_len(reps),
    function(i) mean(x[sample.int(length(x), size)]),
    numeric(1)
  )
}

# Sampling distribution for respondents reporting Excellent health.
# No na.rm is needed: `data` was already filtered to non-missing
# Behav_CigsPerDay_N values.
edata <- data %>%
  filter(Health_SelfRatedHealth_C == "Excellent")

e_Samp_Distro <- data.frame(mean = sample_means(edata$Behav_CigsPerDay_N))

ggplot() +
  geom_histogram(data = e_Samp_Distro, aes(x = mean), fill = "cadetblue2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Sampling distribution for respondents reporting Poor health.
fdata <- data %>%
  filter(Health_SelfRatedHealth_C == "Poor")

f_Samp_Distro <- data.frame(mean = sample_means(fdata$Behav_CigsPerDay_N))

ggplot() +
  geom_histogram(data = f_Samp_Distro, aes(x = mean), fill = "aquamarine2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

T-test

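  • The difference in smoking between the two health groups is statistically significant (Welch t = -43.84, p < 2.2e-16). People who rate their health as Poor smoke about 3.1 more cigarettes per day on average than people who rate their health as Excellent (4.86 vs. 1.79; 95% confidence interval for the difference: 2.93 to 3.21). This shows an association between smoking and self-rated health; it does not by itself show that smoking more causes worse health.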
# Welch two-sample t-test (t.test()'s default, which does not assume equal
# variances) comparing mean cigarettes per day between the two self-rated
# health groups kept in `data` ("Excellent" and "Poor").
t.test(Behav_CigsPerDay_N~Health_SelfRatedHealth_C, data=data)
## 
##  Welch Two Sample t-test
## 
## data:  Behav_CigsPerDay_N by Health_SelfRatedHealth_C
## t = -43.84, df = 23077, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.207124 -2.932622
## sample estimates:
## mean in group Excellent      mean in group Poor 
##                1.791539                4.861412