Data Cleaning

Click to expand
# Cleaning the data

# Read the BRFSS sample and pass it through clean_names() in one step.
brfss <- clean_names(read.csv("brfss_17_21_sample5.csv"))

# Store the categorical survey variables as factors.
factor_cols <- c("income", "genhlth", "x_bmi5cat", "x_educag",
                 "x_rfsmok3", "x_racegr3", "x_age_g")
brfss[factor_cols] <- lapply(brfss[factor_cols], as.factor)



# Remove every "b'" and then every remaining "'" from iyear, and store the
# result as a number.
# cite/reference
brfss$iyear <- as.numeric(gsub("'", "", gsub("b'", "", brfss$iyear)))

# Recode Yes/No answers as 1 (Yes) and 0 (No); every other answer becomes NA
# so that only complete answers are kept.
yes_no_to_binary <- function(answer) {
  ifelse(answer == "Yes", 1, ifelse(answer == "No", 0, NA))
}

brfss$cvdinfr4 <- yes_no_to_binary(brfss$cvdinfr4)

unique(brfss$cvdcrhd4)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"
brfss$cvdcrhd4 <- yes_no_to_binary(brfss$cvdcrhd4)

unique(brfss$cvdstrk3)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"
brfss$cvdstrk3 <- yes_no_to_binary(brfss$cvdstrk3)

# Chronic-condition and difficulty items share the same Yes/No recode.
for (yes_no_col in c("asthma3", "chcscncr", "chcocncr", "chccopd",
                     "addepev", "diabete_2",
                     "decide", "diffwalk", "diffdres", "diffalon")) {
  brfss[[yes_no_col]] <- yes_no_to_binary(brfss[[yes_no_col]])
}

unique(brfss$x_michd)
## [1] "No"                   "Yes"                  "Not asked or Missing"
brfss$x_michd <- yes_no_to_binary(brfss$x_michd)

unique(brfss$x_rfbing5)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
brfss$x_rfbing5 <- yes_no_to_binary(brfss$x_rfbing5)

# Remaining categorical columns: print the distinct raw answers first, then
# store the column as a factor. Every answer shown (including "Refused" and
# "Don’t know/Not sure") stays as its own factor level.
unique(brfss$x_rfhlth)
## [1] "Good"                "Fair/Poor"           "Don’t know/Not sure"
brfss$x_rfhlth <- as.factor(brfss$x_rfhlth)

# NOTE(review): the empty string "" shown below becomes a factor level of its
# own rather than NA - confirm that is intended.
unique(brfss$x_casthm1_2)
## [1] "Former"  "Current" ""
brfss$x_casthm1_2 <- as.factor(brfss$x_casthm1_2)

unique(brfss$employ1)
##  [1] "Employed for wages"               "Refused"                         
##  [3] "A homemaker"                      "A student"                       
##  [5] "Self-employed"                    "Out of work for less than 1 year"
##  [7] "Retired"                          "Out of work for 1 year or more"  
##  [9] "Unable to work"                   "Not asked or Missing"
brfss$employ1 <- as.factor(brfss$employ1)

brfss$marital <- as.factor(brfss$marital)
unique(brfss$educa)
## [1] "College 1 year to 3 years (Some college or technical school)"
## [2] "Grade 12 or GED (High school graduate)"                      
## [3] "College 4 years or more (College graduate)"                  
## [4] "Grades 9 through 11 (Some high school)"                      
## [5] "Grades 1 through 8 (Elementary)"                             
## [6] "Never attended school or only kindergarten"                  
## [7] "Refused"
brfss$educa <- as.factor(brfss$educa)

# bpmeds is a Yes/No item: 1 = Yes, 0 = No, and every other answer listed
# below becomes NA.
unique(brfss$bpmeds)
## [1] "Not asked or Missing" "Yes"                  "No"                  
## [4] "Don’t know/Not sure"  "Refused"
brfss$bpmeds <- ifelse(brfss$bpmeds == "Yes", 1, ifelse(brfss$bpmeds == "No", 0, NA))

unique(brfss$checkup1)
## [1] "5 or more years ago" "Within past 2 years" "Within past year"   
## [4] "Within past 5 years" "Don’t know/Not sure" "Never"              
## [7] "Refused"
brfss$checkup1 <- as.factor(brfss$checkup1)
brfss$state_name <- as.factor(brfss$state_name)
brfss$region <- as.factor(brfss$region)

# Divide the raw x_bmi5 value by 100 (presumably it is stored with two implied
# decimal places - verify against the BRFSS codebook).
brfss$x_bmi5 <- brfss$x_bmi5/100 # converting BMI to two decimals

Exploratory Data Analysis

Click to expand
# Descriptive Statistics

# Summarise a numeric vector.
#   x        vector to summarise
#   na.omit  if TRUE, missing values are removed before anything is computed
# Returns a named numeric vector with elements n, mean, stdev, min and max.
mystats <- function(x, na.omit=FALSE){
  values <- if (na.omit) x[!is.na(x)] else x
  c(n = length(values),
    mean = mean(values),
    stdev = sd(values),
    min = min(values),
    max = max(values))
}

# Apply mystats() to every column of a data frame with missing values removed;
# sapply() combines the per-column summaries into one matrix (a column per
# variable, a row per statistic).
dstats <- function(x) {
  sapply(x, mystats, na.omit = TRUE)
}

# only numeric or integer columns
myvars <- c(
  "iyear", "physhlth", "menthlth", "poorhlth",
  "bpmeds", "cvdinfr4", "cvdcrhd4", "cvdstrk3",
  "asthma3", "chcscncr", "chcocncr", "chccopd",
  "addepev", "diabete_2", "decide", "diffwalk",
  "diffdres", "diffalon", "x_michd", "x_bmi5",
  "x_rfbing5"
)

Descriptive_stats <- dstats(brfss[myvars])
round(Descriptive_stats, 3)
##          iyear physhlth menthlth poorhlth   bpmeds cvdinfr4 cvdcrhd4 cvdstrk3
## n     8887.000 8698.000 8714.000 4579.000 3613.000 8837.000 8799.000 8851.000
## mean  2019.016    3.945    3.693    5.076    0.831    0.055    0.053    0.042
## stdev    2.021    8.469    7.886    9.082    0.375    0.228    0.225    0.200
## min   2017.000    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## max   2022.000   30.000   30.000   30.000    1.000    1.000    1.000    1.000
##        asthma3 chcscncr chcocncr  chccopd  addepev diabete_2   decide diffwalk
## n     8851.000 8861.000 8865.000 8851.000 8835.000  8618.000 8488.000 8477.000
## mean     0.134    0.096    0.095    0.082    0.192     0.139    0.109    0.166
## stdev    0.340    0.295    0.294    0.274    0.394     0.346    0.312    0.372
## min      0.000    0.000    0.000    0.000    0.000     0.000    0.000    0.000
## max      1.000    1.000    1.000    1.000    1.000     1.000    1.000    1.000
##       diffdres diffalon  x_michd   x_bmi5 x_rfbing5
## n     8493.000 8467.000 8799.000 8015.000  8220.000
## mean     0.043    0.072    0.084   28.266     0.861
## stdev    0.203    0.258    0.278    6.347     0.346
## min      0.000    0.000    0.000   12.530     0.000
## max      1.000    1.000    1.000   83.200     1.000
# those who had some kind of cancer diagnosis
cancer <- brfss |>
  filter(chcscncr == 1 | chcocncr == 1)

diabetes <- brfss |>
  filter(diabete_2 == 1)


# Key questions what contributed to bad mental, physical, and poor health

# Visualizations
# How did cancer diagnosis impact bad physical health
# Count respondents per number of physhlth days (zero days excluded) and
# express each count as a percentage of the group total.
cancer_nonzero_phys <- cancer |>
  filter(physhlth != 0) |>
  group_by(physhlth) |>
  summarise(count = n()) |>
  mutate(percentage_yes_cancer = (count / sum(count)) * 100)
cancer_nonzero_phys
## # A tibble: 24 × 3
##    physhlth count percentage_yes_cancer
##       <int> <int>                 <dbl>
##  1        1    43                 7.61 
##  2        2    62                11.0  
##  3        3    46                 8.14 
##  4        4    31                 5.49 
##  5        5    48                 8.50 
##  6        6     7                 1.24 
##  7        7    24                 4.25 
##  8        8     4                 0.708
##  9        9     2                 0.354
## 10       10    37                 6.55 
## # ℹ 14 more rows
# compared to respondents who answered No to both cancer questions
no_cancer <- brfss |>
  filter(chcscncr == 0, chcocncr == 0)

# Same table as for the cancer group: counts per number of physhlth days
# (zero days excluded) with each count as a percentage of the group total.
phys_health <- no_cancer |>
  filter(physhlth != 0) |>
  group_by(physhlth) |>
  summarise(count = n()) |>
  mutate(percentage_no_cancer = (count / sum(count)) * 100)
phys_health
## # A tibble: 30 × 3
##    physhlth count percentage_no_cancer
##       <int> <int>                <dbl>
##  1        1   313               12.8  
##  2        2   417               17.1  
##  3        3   243                9.94 
##  4        4   124                5.07 
##  5        5   215                8.80 
##  6        6    23                0.941
##  7        7   137                5.61 
##  8        8    16                0.655
##  9        9     3                0.123
## 10       10   138                5.65 
## # ℹ 20 more rows
# Can test for poor health instead see if greater difference


# How did poor health differ by high income

# baseline: every respondent with a non-zero poorhlth value
poor_health <- brfss |>
  filter(poorhlth != 0)
ggplot(poor_health, mapping = aes(x = poorhlth)) +
  geom_bar(fill = "blue", color = "black") +
  labs(
    title = "Days of Poor Health (zero days removed)",
    x = "Days of Poor Health",
    y = "Count"
  )

# $75k or more to poor health: the same chart for the top income group only
poorhlth_income <- poor_health |>
  filter(income == "$75,000 or more")
ggplot(poorhlth_income, mapping = aes(x = poorhlth)) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Days of Poor Health (zero days removed) >=$75k",
    x = "Days of Poor Health",
    y = "Count"
  )

# How does diabetes and BMI relate? Comparing Means.

# Respondents who answered yes to having diabetes
diabetes <- brfss |>
  filter(diabete_2 == 1)

ggplot(diabetes, mapping = aes(x = x_bmi5)) +
  geom_histogram() +
  # dashed red line marks the group's mean BMI (missing values ignored)
  geom_vline(
    xintercept = mean(diabetes$x_bmi5, na.rm = TRUE),
    color = "red",
    linetype = "dashed"
  ) +
  labs(title = "Histogram of BMI - Yes to Diabetes", x = "BMI", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 103 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean BMI of the diabetes group, missing values ignored
mean(diabetes$x_bmi5, na.rm = TRUE)
## [1] 31.31085
# Respondents who answered no to having diabetes
no_diabetes <- brfss |>
  filter(diabete_2 == 0)

ggplot(no_diabetes, mapping = aes(x = x_bmi5)) +
  geom_histogram() +
  # dashed red line marks the group's mean BMI (missing values ignored)
  geom_vline(
    xintercept = mean(no_diabetes$x_bmi5, na.rm = TRUE),
    color = "red",
    linetype = "dashed"
  ) +
  labs(title = "Histogram of BMI - No to Diabetes", x = "BMI", y = "Count") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 743 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean BMI of the no-diabetes group, missing values ignored
mean(no_diabetes$x_bmi5, na.rm = TRUE)
## [1] 27.69563
# Analysis with confidence intervals compare original to subsetted
# Compare mean BMI normal to BMI with Diabetes = Yes

# Keep only the rows that have a BMI value. Calling na.omit() on the whole
# data frame drops every respondent with a missing answer in ANY column, which
# changes the sample: the interval rendered from that version (29 to 30) did
# not contain this group's mean BMI of 27.70.
bmi <- no_diabetes[!is.na(no_diabetes$x_bmi5), ]

# 95% CI
n <- nrow(bmi)              # sample size
xbar <- mean(bmi$x_bmi5)    # sample mean
s <- sd(bmi$x_bmi5)         # sample standard deviation
confidence <- 0.95

alpha <- 1 - confidence
alphaovertwo <- alpha / 2
degreesoffreedom <- n - 1   # required for t distribution curves
tofalphaover2 <- abs(qt(alphaovertwo, degreesoffreedom))

# 2) calculate the estimator (E), the margin of error, using
# E = t(alpha/2) * (s/(sqrt(n)))
estimator <- tofalphaover2 * (s / sqrt(n))

# 3) calculate the lower and upper ends of the confidence interval range
LowerCIValue <- xbar - estimator
UpperCIValue <- xbar + estimator

# ANSWER
# Report two decimals: rounding to whole numbers hides the width of an
# interval this narrow.
print(paste0("The ", confidence*100,"%", " confidence interval for BMI levels for those who answered No to diabetes is: ", round(LowerCIValue, 2), " <= ", "mu ", "<= ", round(UpperCIValue, 2)," ."))
## [1] "The 95% confidence interval for BMI levels for those who answered No to diabetes is: 29 <= mu <= 30 ."
# Diabetes Yes

# Keep only the rows that have a BMI value. Calling na.omit() on the whole
# data frame drops every respondent with a missing answer in ANY column, which
# changes the sample: the interval rendered from that version (32 to 33) did
# not contain this group's mean BMI of 31.31.
diabetes <- diabetes[!is.na(diabetes$x_bmi5), ]

# 95% CI
n <- nrow(diabetes)             # sample size
xbar <- mean(diabetes$x_bmi5)   # sample mean
s <- sd(diabetes$x_bmi5)        # sample standard deviation
confidence <- 0.95

alpha <- 1 - confidence
alphaovertwo <- alpha / 2
degreesoffreedom <- n - 1   # required for t distribution curves
tofalphaover2 <- abs(qt(alphaovertwo, degreesoffreedom))

# 2) calculate the estimator (E), the margin of error, using
# E = t(alpha/2) * (s/(sqrt(n)))
estimator <- tofalphaover2 * (s / sqrt(n))

# 3) calculate the lower and upper ends of the confidence interval range
LowerCIValue <- xbar - estimator
UpperCIValue <- xbar + estimator

# ANSWER
# Report two decimals: rounding to whole numbers hides the width of an
# interval this narrow.
print(paste0("The ", confidence*100,"%", " confidence interval for BMI levels for those who answered Yes to diabetes is: ", round(LowerCIValue, 2), " <= ", "mu ", "<= ", round(UpperCIValue, 2)," ."))
## [1] "The 95% confidence interval for BMI levels for those who answered Yes to diabetes is: 32 <= mu <= 33 ."

One Sample Hypothesis Tests

Question 1

For people who have diabetes do they on average have a BMI higher than 30?

Null hypothesis: mu = 30 Alt hypothesis: mu > 30 (claim)

I conducted this test because in the exploratory analysis it appeared BMI was on average higher for those who had diabetes versus those who didn’t. So, I selected 30 as a test value to see if BMI for those with diabetes would be greater than this assumption.

From the results we obtained a p-value of 6.67e-10 near zero and alpha 0.05 we can reject null and conclude there is enough evidence to support the claim that the average BMI is greater than 30 for those with diabetes.

# 5 questions for one sample tests

# 1. For people who have diabetes do they on average have a BMI higher than 30?
# Null hypothesis: mu = 30
# Alt hypothesis: mu > 30 (claim)

diabetes_yes <- brfss |> filter(diabete_2 == 1)
diabetes_yes_bmi <- na.omit(diabetes_yes$x_bmi5)  # drop missing BMI values
alpha <- 0.05

mu <- 30 #hypothesized


# This is a one-tail (right) test because the claim is mu > 30.
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(diabetes_yes_bmi,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  diabetes_yes_bmi
## t = 6.1161, df = 1091, p-value = 6.672e-10
## alternative hypothesis: true mean is greater than 30
## 95 percent confidence interval:
##  30.95801      Inf
## sample estimates:
## mean of x 
##  31.31085
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##        t 
## 6.116051
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 1091
CVusingt.test$p.value     # the p-value
## [1] 6.67227e-10
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] 30.95801      Inf
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  31.31085
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##   30
CVusingt.test$stderr      # standard error of the mean
## [1] 0.2143298
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "greater"
# With a p-value of 6.67e-10 near zero and alpha 0.05 we can reject null and conclude there is enough evidence to support the claim that the average BMI is greater than 30 for those with diabetes.

Question 2

Does having some form of cancer diagnosis lead to on average having more than 5 poor physical health days?

Null hypothesis: mu = 5 Alt hypothesis: mu > 5 (claim)

I selected this one sample test because in exploratory analysis it showed a greater number of 30 days of poor physical health for those with some kind of cancer diagnosis with zeros removed. I tested this against an average of 5 days poor physical health for those with some kind of cancer diagnosis to see if it would be greater than this selected parameter.

The obtained p-value was 0.0874 which is greater than alpha 0.05 in this instance we would not reject the null and conclude there isn’t enough evidence to support the claim that the average number of days for poor physical health is greater than 5.

# 2. Does having some form of cancer diagnosis lead to on average having more than 5 poor physical health days?
# Null hypothesis: mu = 5
# Alt hypothesis: mu > 5 (claim)

cancer <- brfss |> filter(chcscncr == 1 | chcocncr == 1)
cancer_phy <- na.omit(cancer$physhlth)  # drop missing physhlth values
alpha <- 0.05

mu <- 5 #hypothesized


# This is a one-tail (right) test because the claim is mu > 5.
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(cancer_phy,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  cancer_phy
## t = 1.3573, df = 1468, p-value = 0.08745
## alternative hypothesis: true mean is greater than 5
## 95 percent confidence interval:
##  4.925894      Inf
## sample estimates:
## mean of x 
##  5.348536
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##        t 
## 1.357301
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 1468
CVusingt.test$p.value     # the p-value
## [1] 0.08744718
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] 4.925894      Inf
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  5.348536
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##    5
CVusingt.test$stderr      # standard error of the mean
## [1] 0.2567865
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "greater"
# We obtain a p-value of 0.0874 which is greater than alpha 0.05 in this instance we would not reject the null and conclude there isn't enough evidence to support the claim that the average number of days for poor physical health is greater than 5.

Question 3

Is the average BMI different than 28 for those who make 75k or more per year?

Null hypothesis: mu = 28 Alt hypothesis: mu != 28 (claim)

In this one sample hypothesis test I wanted to see if the average BMI of those making 75k or more per year differed from the sample mean of the whole dataset, which is approximately 28.

We obtained a p-value of .973 at an alpha of 0.05 we do not reject the null hypothesis and conclude there isn’t sufficient evidence to conclude the average BMI is different from 28 for people making 75k or more.

# 3. Is the average BMI different than 28 for those who make 75k or more per year?
# Null hypothesis: mu = 28
# Alt hypothesis: mu != 28 (claim)

income75 <- brfss |> filter(income=="$75,000 or more")
income75_bmi <- na.omit(income75$x_bmi5)  # drop missing BMI values
alpha <- 0.05

mu <- 28 #hypothesized


# This is a two-tail test because the claim is that the mean differs from 28.
# The alternative is spelled out as "two.sided" instead of relying on partial
# matching of "two.side".
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(income75_bmi,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  income75_bmi
## t = 0.034133, df = 1611, p-value = 0.9728
## alternative hypothesis: true mean is not equal to 28
## 95 percent confidence interval:
##  27.72153 28.28834
## sample estimates:
## mean of x 
##  28.00493
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##          t 
## 0.03413275
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 1611
CVusingt.test$p.value     # the p-value
## [1] 0.9727755
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] 27.72153 28.28834
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  28.00493
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##   28
CVusingt.test$stderr      # standard error of the mean
## [1] 0.1444877
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "two.sided"
# We obtained a p-value of .973 at an alpha of 0.05 we do not reject the null hypothesis and conclude there isn't sufficient evidence to conclude the average BMI is different from 28 for people making 75k or more.

Question 4

Do people who graduated from college have less poor mental health days than 3.7?

Null hypothesis: mu = 3.7 Alt hypothesis: mu < 3.7 (claim)

In this one sample test I wanted to see if people who graduated college had fewer poor mental health days compared to the sample mean of the dataset, which is approximately 3.7 days.

We obtained a p-value of 7.977e-11 nearly zero at an alpha of 0.05 this means we can reject the null hypothesis and support the claim that college graduates on average have less than 3.7 poor mental health days per month.

#4. Do people who graduated from college have less poor mental health days than 3.7?
# Null hypothesis: mu = 3.7
# Alt hypothesis: mu < 3.7 (claim)

college_grad <- brfss |> filter(x_educag=="Graduated from College or Technical School")
college_grad_menthlth <- na.omit(college_grad$menthlth)  # drop missing values
alpha <- 0.05

mu <- 3.7 #hypothesized


# This is a one-tail (left) test because the claim is mu < 3.7.
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(college_grad_menthlth,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  college_grad_menthlth
## t = -6.4162, df = 3320, p-value = 7.977e-11
## alternative hypothesis: true mean is less than 3.7
## 95 percent confidence interval:
##      -Inf 3.140543
## sample estimates:
## mean of x 
##  2.947606
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -6.416199
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 3320
CVusingt.test$p.value     # the p-value
## [1] 7.977089e-11
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1]     -Inf 3.140543
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  2.947606
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##  3.7
CVusingt.test$stderr      # standard error of the mean
## [1] 0.1172647
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "less"
# We obtained a p-value of 7.977e-11 nearly zero at an alpha of 0.05 this means we can reject the null hypothesis and support the claim that college graduates on average have less than 3.7 poor mental health days.

Question 5

Do those who live in Alaska have more than 4 days of poor mental health on average?

Null hypothesis: mu = 4 Alt hypothesis: mu > 4 (claim)

In this one sample hypothesis test I wanted to see if Alaska had more than 4 poor mental health days on average slightly above the sample average. I performed a t.test to test this on the filtered data pertaining to just those in the state of Alaska.

The results were we obtained a p-value of 0.439 which is greater than our alpha of 0.05. This implies we do not reject the null and conclude there isn’t enough evidence to support the claim people in Alaska have higher than 4 days poor mental health on average.

#5. Do those who live in Alaska have more than 4 days of poor mental health on average?
# Null hypothesis: mu = 4
# Alt hypothesis: mu > 4 (claim)

Alaska <- brfss |> filter(state_name=="Alaska")
Alaska_menthlth <- na.omit(Alaska$menthlth)  # drop missing menthlth values
alpha <- 0.05

mu <- 4 #hypothesized


# This is a one-tail (right) test because the claim is mu > 4.
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(Alaska_menthlth,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  Alaska_menthlth
## t = 0.15187, df = 87, p-value = 0.4398
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
##  2.643564      Inf
## sample estimates:
## mean of x 
##  4.136364
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## 0.1518706
CVusingt.test$parameter   # the degrees of freedom
## df 
## 87
CVusingt.test$p.value     # the p-value
## [1] 0.4398203
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] 2.643564      Inf
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  4.136364
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##    4
CVusingt.test$stderr      # standard error of the mean
## [1] 0.8978934
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "greater"
# We obtain a p-value of 0.439 which is greater than our alpha of 0.05. This implies we do not reject the null and conclude there isn't enough evidence to support the claim people in Alaska have higher than 4 days poor mental health on average.

Two Sample Hypothesis Tests

Question 1

Is there a difference in poor physical days in the Northeast compared to the South?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

In this two sample hypothesis test I wanted to see if there was a significant difference at a confidence level of 95% with an alpha of 0.05 between poor physical health days between the Northeast and Southern regions.

Using a two sided t-test for both variables a p-value was obtained of 0.006395 this is less than our alpha of 0.05 implies we reject the null hypothesis concluding there is enough evidence to support the claim that there is a difference between average number of poor physical health days between Southern and Northeastern regions.

# 5 questions for two sample test

#1. Is there a difference in poor physical days in the Northeast compared to the South?

north <- brfss |> filter(region=="Northeast")
south <- brfss |> filter(region=="South")
# Reduce each region to its physhlth values, dropping missing answers.
north <- na.omit(north$physhlth)
south <- na.omit(south$physhlth)

# Hypothesis test (mu1 = Northeast, mu2 = South)
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0        # hypothesized difference in means
alpha <- .05

# Two tailed test because the claim is about there being a difference.
# The alternative is spelled out as "two.sided" instead of relying on partial
# matching of "two.side".
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(north, south,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  Welch Two Sample t-test
## 
## data:  north and south
## t = -2.7283, df = 3874.9, p-value = 0.006395
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.2429058 -0.2035019
## sample estimates:
## mean of x mean of y 
##  3.561668  4.284871
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -2.728287
CVusingt.test$parameter   # the degrees of freedom
##       df 
## 3874.902
CVusingt.test$p.value     # the p-value
## [1] 0.006395128
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] -1.2429058 -0.2035019
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x mean of y 
##  3.561668  4.284871
CVusingt.test$null.value  # the specified hypothesized mean
## difference in means 
##                   0
CVusingt.test$stderr      # standard error of the mean
## [1] 0.2650761
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "two.sided"
# P-value is 0.006395 this is less than our alpha of 0.05 implies we reject the null hypothesis concluding there is enough evidence to support the claim that there is a difference between average number of poor physical health days between Southern and Northeastern regions. 

Question 2

Is there a difference in the average number of poor mental health days between people who live in New York and Vermont?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

I selected this two sample hypothesis test to see if there was a difference in poor mental health days between Vermont and New York which are found in the same region of the Northeast.

The obtained p-value was 0.374 this is more than our alpha of 0.05 implies we don’t reject the null hypothesis concluding there is not enough evidence to support the claim that there is a difference between average number of poor mental health days between New York and Vermont.

#2. Is there a difference in the average number of poor mental health days between people who live in New York and Vermont?

new_york <- brfss |> filter(state_name=="New York")
vermont <- brfss |> filter(state_name=="Vermont")
# Reduce each state to its menthlth values, dropping missing answers.
new_york <- na.omit(new_york$menthlth)
vermont <- na.omit(vermont$menthlth)


# Hypothesis test (mu1 = New York, mu2 = Vermont)
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0        # hypothesized difference in means
alpha <- .05

# Two tailed test because the claim is about there being a difference.
# The alternative is spelled out as "two.sided" instead of relying on partial
# matching of "two.side".
# t.test() takes the confidence level as `conf.level`. An argument named
# `confidence` is not a t.test() parameter and is silently ignored, which
# leaves the default of 0.95 in place whatever alpha is set to.
CVusingt.test <- t.test(new_york, vermont,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  Welch Two Sample t-test
## 
## data:  new_york and vermont
## t = -0.89151, df = 175.53, p-value = 0.3739
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2956760  0.8670093
## sample estimates:
## mean of x mean of y 
##  3.544000  4.258333
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##          t 
## -0.8915132
CVusingt.test$parameter   # the degrees of freedom
##       df 
## 175.5253
CVusingt.test$p.value     # the p-value
## [1] 0.3738744
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] -2.2956760  0.8670093
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x mean of y 
##  3.544000  4.258333
CVusingt.test$null.value  # the specified hypothesized mean
## difference in means 
##                   0
CVusingt.test$stderr      # standard error of the mean
## [1] 0.8012594
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "two.sided"
# P-value is 0.374 this is more than our alpha of 0.05 implies we don't reject the null hypothesis concluding there is not enough evidence to support the claim that there is a difference between average number of poor mental health days between New York and Vermont. 

Question 3

Do people in the South have a higher BMI on average than those in the West?

H(o): mu1=mu2 H(a): mu1>mu2 (claim)

In this test I wanted to see if BMI was higher on average for the Southern region compared to the Western region. This utilizes a one tail right side test to see if the test value falls in the rejection region. We observed the results through the p-value in comparison to the selected alpha of 0.05.

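The test as run passed the West as the first sample, so its p-value of 0.997 belongs to the alternative that the West has the higher mean BMI, which is the reverse of the claim. For the claim as stated (South > West), the same statistic (t = 2.79 in the South-minus-West direction, with sample means of 28.40 versus 27.85) gives a one-sided p-value of 1 − 0.997 ≈ 0.003. This is less than our alpha of 0.05, so we reject the null hypothesis and conclude there is enough evidence to support the claim that the Southern region has a greater BMI on average than the West.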

#3. Do people in the South have a higher BMI on average than those in the West?

# Extract the x_bmi5 (BMI) values for respondents in each region and drop
# missing values before testing.
south <- brfss |> filter(region == "South")
west <- brfss |> filter(region == "West")
south <- na.omit(south$x_bmi5)
west <- na.omit(west$x_bmi5)


# Hypothesis test (mu1 = mean BMI in the South, mu2 = mean BMI in the West)
# H(o): mu1=mu2
# H(a): mu1>mu2 (claim)

mu <- 0       # hypothesized difference in means under H(o)
alpha <- .05  # significance level

# Right-tailed test because the claim is that the South mean is higher.
# t.test() tests mean(x) - mean(y), so `south` must be the first sample for
# alternative = "greater" to match H(a); passing `west` first tests the
# reverse claim. The confidence level is set with `conf.level`; an argument
# named `confidence` is not recognised and is silently ignored.
# NOTE(review): re-knit after this change -- any printed results captured with
# the samples in the opposite order (west, south) no longer match this call.
CVusingt.test <- t.test(south, west,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  Welch Two Sample t-test
## 
## data:  west and south
## t = -2.7885, df = 3895.8, p-value = 0.9973
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.8777369        Inf
## sample estimates:
## mean of x mean of y 
##  27.84841  28.40044
# List the components stored in the htest object returned by t.test()
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
# Pull out each component of the two-sample test result by name
CVusingt.test[["statistic"]]    # t test statistic
##         t 
## -2.788468
CVusingt.test[["parameter"]]    # degrees of freedom
##       df 
## 3895.793
CVusingt.test[["p.value"]]      # p-value of the test
## [1] 0.9973393
CVusingt.test[["conf.int"]]     # confidence interval for the difference in means
## [1] -0.8777369        Inf
## attr(,"conf.level")
## [1] 0.95
CVusingt.test[["estimate"]]     # sample means of the two groups (x and y)
## mean of x mean of y 
##  27.84841  28.40044
CVusingt.test[["null.value"]]   # hypothesized difference in means under H(o)
## difference in means 
##                   0
CVusingt.test[["stderr"]]       # standard error of the difference in means
## [1] 0.1979688
CVusingt.test[["alternative"]]  # alternative: "two.sided", "less" or "greater"
## [1] "greater"
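# The printed p-value of 0.997 comes from a test with west as x and alternative = "greater", so it tests West > South, the reverse of the claim. For H(a): South > West the one-sided p-value is 1 - 0.9973 = 0.0027, which is less than our alpha of 0.05, so we reject the null hypothesis. There is enough evidence to support the claim that the Southern region has a greater BMI on average than the West.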

Question 4

Is there a difference of average BMI for people Age 25-34 that live in the Midwest compared to West?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

For this two-sample hypothesis test I wanted to compare the average BMI of respondents in the 25-34 age range between the Midwest and West regions. As in the previous problems, an alpha of 0.05 was selected and a t-test was conducted to obtain the p-value.

The p-value was 0.319, which is greater than our alpha of 0.05, so we do not reject the null hypothesis. There is not enough evidence to support the claim that average BMI differs between people aged 25-34 who live in the Midwest and those who live in the West.

#4. Is there a difference of average BMI for people Age 25-34 that live in the Midwest compared to West?

# Extract the x_bmi5 (BMI) values for respondents aged 25-34 in each region
# and drop missing values before testing.
west_age <- brfss |> filter(region == "West" & x_age_g == "25-34")
midwest_age <- brfss |> filter(region == "Midwest" & x_age_g == "25-34")
west_age <- na.omit(west_age$x_bmi5)
midwest_age <- na.omit(midwest_age$x_bmi5)


# Hypothesis test (mu1 = mean BMI in the West, mu2 = mean BMI in the Midwest)
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0       # hypothesized difference in means under H(o)
alpha <- .05  # significance level

# Two tailed test because the claim is about there being a difference.
# The alternative is spelled out in full ("two.sided") rather than relying on
# partial matching, and the confidence level is set with `conf.level`; an
# argument named `confidence` is not recognised and is silently ignored.
CVusingt.test <- t.test(west_age, midwest_age,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
CVusingt.test
## 
##  Welch Two Sample t-test
## 
## data:  west_age and midwest_age
## t = -0.99834, df = 435.92, p-value = 0.3187
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.9028378  0.6209033
## sample estimates:
## mean of x mean of y 
##  27.79670  28.43767
# List the components stored in the htest object returned by t.test()
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
# Pull out each component of the two-sample test result by name
CVusingt.test[["statistic"]]    # t test statistic
##          t 
## -0.9983357
CVusingt.test[["parameter"]]    # degrees of freedom
##       df 
## 435.9173
CVusingt.test[["p.value"]]      # p-value of the test
## [1] 0.3186704
CVusingt.test[["conf.int"]]     # confidence interval for the difference in means
## [1] -1.9028378  0.6209033
## attr(,"conf.level")
## [1] 0.95
CVusingt.test[["estimate"]]     # sample means of the two groups (x and y)
## mean of x mean of y 
##  27.79670  28.43767
CVusingt.test[["null.value"]]   # hypothesized difference in means under H(o)
## difference in means 
##                   0
CVusingt.test[["stderr"]]       # standard error of the difference in means
## [1] 0.6420358
CVusingt.test[["alternative"]]  # alternative: "two.sided", "less" or "greater"
## [1] "two.sided"
# The p-value is 0.319, which is greater than our alpha of 0.05, so we do not reject the null hypothesis. There is not enough evidence to support the claim that average BMI differs between people aged 25-34 who live in the Midwest and those who live in the West.

Question 5

Do people who are retired have less poor mental health days than those employed for wages?

H(o): mu1=mu2 H(a): mu1<mu2 (claim)

In this two-sample test the data were split into two samples: respondents who were retired and respondents currently employed for wages. The alternative hypothesis is that retired people have fewer poor mental health days on average than those employed for wages.

The p-value obtained was 1.59e-10, which is near zero. Since the p-value is less than our alpha of 0.05, we reject the null hypothesis and conclude there is enough evidence to support the claim that those who are retired have fewer poor mental health days than those employed for wages.

#5. Do people who are retired have less poor mental health days than those employed for wages?

# Extract the menthlth (poor mental health days) values for retired and
# wage-employed respondents and drop missing values before testing.
retire <- brfss |> filter(employ1 == "Retired")
employed <- brfss |> filter(employ1 == "Employed for wages")
retire <- na.omit(retire$menthlth)
employed <- na.omit(employed$menthlth)

# Hypothesis test (mu1 = mean for retired, mu2 = mean for employed for wages)
# H(o): mu1=mu2
# H(a): mu1<mu2 (claim)

mu <- 0       # hypothesized difference in means under H(o)
alpha <- .05  # significance level

# Left-tailed test because the claim is that the retired mean is lower.
# t.test() tests mean(x) - mean(y), so with `retire` first,
# alternative = "less" matches H(a). The confidence level is set with
# `conf.level`; an argument named `confidence` is not recognised and is
# silently ignored.
CVusingt.test <- t.test(retire, employed,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
CVusingt.test
## 
##  Welch Two Sample t-test
## 
## data:  retire and employed
## t = -6.2999, df = 6018, p-value = 1.594e-10
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##        -Inf -0.8137664
## sample estimates:
## mean of x mean of y 
##  2.345763  3.447130
# List the components stored in the htest object returned by t.test()
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
# Pull out each component of the two-sample test result by name
CVusingt.test[["statistic"]]    # t test statistic
##         t 
## -6.299936
CVusingt.test[["parameter"]]    # degrees of freedom
##       df 
## 6018.048
CVusingt.test[["p.value"]]      # p-value of the test
## [1] 1.594252e-10
CVusingt.test[["conf.int"]]     # confidence interval for the difference in means
## [1]       -Inf -0.8137664
## attr(,"conf.level")
## [1] 0.95
CVusingt.test[["estimate"]]     # sample means of the two groups (x and y)
## mean of x mean of y 
##  2.345763  3.447130
CVusingt.test[["null.value"]]   # hypothesized difference in means under H(o)
## difference in means 
##                   0
CVusingt.test[["stderr"]]       # standard error of the difference in means
## [1] 0.174822
CVusingt.test[["alternative"]]  # alternative: "two.sided", "less" or "greater"
## [1] "less"
# The p-value obtained was 1.59e-10, which is near zero. Since the p-value is less than our alpha of 0.05, we reject the null hypothesis and conclude there is enough evidence to support the claim that those who are retired have fewer poor mental health days than those employed for wages.

References

Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.

Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.

When prompted with “how to replace yes and no with 1 and 0 in R for unique(data$variable) (then uniquie values)?” the ChatGPT-generated text indicated, “This code uses the ifelse function to convert ‘Yes’ to 1, ‘No’ to 0, and any other categories to NA (missing values). This way, you’re creating a binary representation of the ‘Yes/No’ responses” (OpenAI, 2024).

OpenAI. (2024). ChatGPT (March 5 version) [Large Language model] https://chat.openai.com/