Data Cleaning

Click to expand

# Cleaning the data

# Helper used throughout this section: "Yes" becomes 1, "No" becomes 0 and
# every other answer becomes NA, so that only complete answers are kept.
recode_yes_no <- function(x) {
  ifelse(x == "Yes", 1, ifelse(x == "No", 0, NA))
}

# Read the sample file and tidy the column names with clean_names().
brfss <- clean_names(read.csv("brfss_17_21_sample5.csv"))

# Categorical columns are stored as factors.
factor_cols <- c(
  "income", "genhlth", "x_bmi5cat", "x_educag",
  "x_rfsmok3", "x_racegr3", "x_age_g"
)
brfss[factor_cols] <- lapply(brfss[factor_cols], as.factor)



# cleaning iyear column and converting to numeric
# cite/reference
# The inner gsub() strips every "b'" and the outer one strips the remaining
# "'" characters before the text is converted to a number.
brfss$iyear <- as.numeric(gsub("'", "", gsub("b'", "", brfss$iyear)))

# converts Yes/No to 1 (Yes) and 0 (No) and converts the others to NAs to get just complete answers
brfss$cvdinfr4 <- recode_yes_no(brfss$cvdinfr4)

unique(brfss$cvdcrhd4)

## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"

brfss$cvdcrhd4 <- recode_yes_no(brfss$cvdcrhd4)

unique(brfss$cvdstrk3)

## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"

brfss$cvdstrk3 <- recode_yes_no(brfss$cvdstrk3)

# Remaining Yes/No questions that get the same 1 / 0 / NA recoding.
yes_no_cols <- c(
  "asthma3", "chcscncr", "chcocncr", "chccopd", "addepev", "diabete_2",
  "decide", "diffwalk", "diffdres", "diffalon"
)
brfss[yes_no_cols] <- lapply(brfss[yes_no_cols], recode_yes_no)

unique(brfss$x_michd)

## [1] "No"                   "Yes"                  "Not asked or Missing"

brfss$x_michd <- recode_yes_no(brfss$x_michd)

unique(brfss$x_rfbing5)

## [1] "No"                  "Yes"                 "Don’t know/Not sure"

brfss$x_rfbing5 <- recode_yes_no(brfss$x_rfbing5)

unique(brfss$x_rfhlth)

## [1] "Good"                "Fair/Poor"           "Don’t know/Not sure"

brfss$x_rfhlth <- as.factor(brfss$x_rfhlth)

unique(brfss$x_casthm1_2)

## [1] "Former"  "Current" ""

brfss$x_casthm1_2 <- as.factor(brfss$x_casthm1_2)

unique(brfss$employ1)

##  [1] "Employed for wages"               "Refused"                         
##  [3] "A homemaker"                      "A student"                       
##  [5] "Self-employed"                    "Out of work for less than 1 year"
##  [7] "Retired"                          "Out of work for 1 year or more"  
##  [9] "Unable to work"                   "Not asked or Missing"

brfss$employ1 <- as.factor(brfss$employ1)

brfss$marital <- as.factor(brfss$marital)
unique(brfss$educa)

## [1] "College 1 year to 3 years (Some college or technical school)"
## [2] "Grade 12 or GED (High school graduate)"                      
## [3] "College 4 years or more (College graduate)"                  
## [4] "Grades 9 through 11 (Some high school)"                      
## [5] "Grades 1 through 8 (Elementary)"                             
## [6] "Never attended school or only kindergarten"                  
## [7] "Refused"

brfss$educa <- as.factor(brfss$educa)

unique(brfss$bpmeds)

## [1] "Not asked or Missing" "Yes"                  "No"                  
## [4] "Don’t know/Not sure"  "Refused"

brfss$bpmeds <- recode_yes_no(brfss$bpmeds)

unique(brfss$checkup1)

## [1] "5 or more years ago" "Within past 2 years" "Within past year"   
## [4] "Within past 5 years" "Don’t know/Not sure" "Never"              
## [7] "Refused"

# The last three categorical columns are also stored as factors.
late_factor_cols <- c("checkup1", "state_name", "region")
brfss[late_factor_cols] <- lapply(brfss[late_factor_cols], as.factor)

# x_bmi5 is divided by 100 (converting BMI to two decimals)
brfss$x_bmi5 <- brfss$x_bmi5 / 100

Exploratory Data Analysis

Click to expand

# Descriptive Statistics

# Summary statistics for a single numeric vector.
#
# x        vector to summarise.
# na.omit  when TRUE, missing values are removed before anything is computed;
#          when FALSE (the default) x is used as supplied.
#
# Returns a named vector with elements n (number of values used), mean,
# stdev, min and max.
mystats <- function(x, na.omit = FALSE) {
  if (na.omit) {
    x <- x[!is.na(x)]
  }
  c(
    n = length(x),
    mean = mean(x),
    stdev = sd(x),
    min = min(x),
    max = max(x)
  )
}

# Apply mystats() to every column of a data frame, ignoring missing values.
#
# x  data frame (or list) of numeric columns.
#
# Returns a numeric matrix with one column per variable and one row per
# statistic (n, mean, stdev, min, max).
dstats <- function(x) {
  # vapply() with a named numeric template always yields a 5-row numeric
  # matrix and stops with an error if a column does not produce five numeric
  # values, whereas sapply() would silently change its return type.
  stat_template <- c(n = 0, mean = 0, stdev = 0, min = 0, max = 0)
  vapply(x, mystats, FUN.VALUE = stat_template, na.omit = TRUE)
}

# only numeric or integer columns
myvars <- c(
  "iyear", "physhlth", "menthlth", "poorhlth",
  "bpmeds", "cvdinfr4", "cvdcrhd4", "cvdstrk3",
  "asthma3", "chcscncr", "chcocncr", "chccopd",
  "addepev", "diabete_2", "decide", "diffwalk",
  "diffdres", "diffalon", "x_michd", "x_bmi5",
  "x_rfbing5"
)
Descriptive_stats <- dstats(brfss[myvars])
round(Descriptive_stats, 3)

##          iyear physhlth menthlth poorhlth   bpmeds cvdinfr4 cvdcrhd4 cvdstrk3
## n     8887.000 8698.000 8714.000 4579.000 3613.000 8837.000 8799.000 8851.000
## mean  2019.016    3.945    3.693    5.076    0.831    0.055    0.053    0.042
## stdev    2.021    8.469    7.886    9.082    0.375    0.228    0.225    0.200
## min   2017.000    0.000    0.000    0.000    0.000    0.000    0.000    0.000
## max   2022.000   30.000   30.000   30.000    1.000    1.000    1.000    1.000
##        asthma3 chcscncr chcocncr  chccopd  addepev diabete_2   decide diffwalk
## n     8851.000 8861.000 8865.000 8851.000 8835.000  8618.000 8488.000 8477.000
## mean     0.134    0.096    0.095    0.082    0.192     0.139    0.109    0.166
## stdev    0.340    0.295    0.294    0.274    0.394     0.346    0.312    0.372
## min      0.000    0.000    0.000    0.000    0.000     0.000    0.000    0.000
## max      1.000    1.000    1.000    1.000    1.000     1.000    1.000    1.000
##       diffdres diffalon  x_michd   x_bmi5 x_rfbing5
## n     8493.000 8467.000 8799.000 8015.000  8220.000
## mean     0.043    0.072    0.084   28.266     0.861
## stdev    0.203    0.258    0.278    6.347     0.346
## min      0.000    0.000    0.000   12.530     0.000
## max      1.000    1.000    1.000   83.200     1.000

# those who had some kind of cancer diagnosis
# (chcscncr or chcocncr recoded to 1)
cancer <- filter(brfss, chcscncr == 1 | chcocncr == 1)

# those with diabete_2 recoded to 1
diabetes <- filter(brfss, diabete_2 == 1)


# Key questions what contributed to bad mental, physical, and poor health

# Visualizations
# How did cancer diagnosis impact bad physical health
# Rows with physhlth equal to zero are removed, the remaining rows are counted
# per physhlth value, and each count is expressed as a percentage of the total.
cancer_nonzero_phys <- cancer |>
  filter(physhlth != 0) |>
  group_by(physhlth) |>
  summarise(count = n()) |>
  mutate(percentage_yes_cancer = (count / sum(count)) * 100)
cancer_nonzero_phys

## # A tibble: 24 × 3
##    physhlth count percentage_yes_cancer
##       <int> <int>                 <dbl>
##  1        1    43                 7.61 
##  2        2    62                11.0  
##  3        3    46                 8.14 
##  4        4    31                 5.49 
##  5        5    48                 8.50 
##  6        6     7                 1.24 
##  7        7    24                 4.25 
##  8        8     4                 0.708
##  9        9     2                 0.354
## 10       10    37                 6.55 
## # ℹ 14 more rows

# compared to
# respondents with both cancer indicators recoded to 0
no_cancer <- filter(brfss, chcscncr == 0 & chcocncr == 0)

# Same tabulation as for the cancer group: drop physhlth == 0, count rows per
# physhlth value and convert the counts to percentages of the total.
phys_health <- no_cancer |>
  filter(physhlth != 0) |>
  group_by(physhlth) |>
  summarise(count = n()) |>
  mutate(percentage_no_cancer = (count / sum(count)) * 100)
phys_health

## # A tibble: 30 × 3
##    physhlth count percentage_no_cancer
##       <int> <int>                <dbl>
##  1        1   313               12.8  
##  2        2   417               17.1  
##  3        3   243                9.94 
##  4        4   124                5.07 
##  5        5   215                8.80 
##  6        6    23                0.941
##  7        7   137                5.61 
##  8        8    16                0.655
##  9        9     3                0.123
## 10       10   138                5.65 
## # ℹ 20 more rows

# Can test for poor health instead see if greater difference


# How did poor health differ by high income

# baseline: every respondent with a non-zero poorhlth value
poor_health <- filter(brfss, poorhlth != 0)
ggplot(poor_health, aes(x = poorhlth)) +
  geom_bar(fill = "blue", color = "black") +
  labs(
    title = "Days of Poor Health (zero days removed)",
    x = "Days of Poor Health",
    y = "Count"
  )

# $75k or more to poor health: same chart restricted to the top income level
poorhlth_income <- filter(poor_health, income == "$75,000 or more")
ggplot(poorhlth_income, aes(x = poorhlth)) +
  geom_bar(fill = "green", color = "black") +
  labs(
    title = "Days of Poor Health (zero days removed) >=$75k",
    x = "Days of Poor Health",
    y = "Count"
  )

# How does diabetes and BMI relate? Comparing Means.

# answered yes to having diabetes (diabete_2 recoded to 1)
diabetes <- filter(brfss, diabete_2 == 1)

# Histogram of BMI with a dashed red line at the group mean.
ggplot(diabetes, aes(x = x_bmi5)) +
  geom_histogram() +
  geom_vline(
    xintercept = mean(diabetes$x_bmi5, na.rm = TRUE),
    color = "red",
    linetype = "dashed"
  ) +
  labs(title = "Histogram of BMI - Yes to Diabetes", x = "BMI", y = "Count")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 103 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean BMI of the group, ignoring missing values.
mean(diabetes$x_bmi5, na.rm = TRUE)

## [1] 31.31085

# answered no to having diabetes (diabete_2 recoded to 0)
no_diabetes <- filter(brfss, diabete_2 == 0)

# Histogram of BMI with a dashed red line at the group mean.
ggplot(no_diabetes, aes(x = x_bmi5)) +
  geom_histogram() +
  geom_vline(
    xintercept = mean(no_diabetes$x_bmi5, na.rm = TRUE),
    color = "red",
    linetype = "dashed"
  ) +
  labs(title = "Histogram of BMI - No to Diabetes", x = "BMI", y = "Count") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 743 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Mean BMI of the group, ignoring missing values.
mean(no_diabetes$x_bmi5, na.rm = TRUE)

## [1] 27.69563

# Analysis with confidence intervals compare original to subsetted
# Compare mean BMI normal to BMI with Diabetes = Yes

# Keep the respondents who answered No to diabetes and have a recorded BMI.
# Only x_bmi5 is checked for NAs: calling na.omit() on the whole data frame
# would also drop every row that is missing any other column, which changes
# the sample the interval describes.
# NOTE: the recorded output that follows this chunk was produced with the
# earlier whole-row na.omit() and integer rounding; re-run to refresh it.
bmi <- no_diabetes[!is.na(no_diabetes$x_bmi5), ]
# 95% CI
n <- nrow(bmi)
xbar <- mean(bmi$x_bmi5)
s <- sd(bmi$x_bmi5)
confidence <- 0.95

# 1) critical t value for a two-sided interval
alpha <- 1 - confidence
alphaovertwo <- alpha / 2
degreesoffreedom <- n - 1   # required for t distribution curves
tofalphaover2 <- abs(qt(alphaovertwo, degreesoffreedom))

# 2) calculate the estimator (E) using
# E = t(alpha/2) * (s/(sqrt(n)))
estimator <- tofalphaover2 * (s / sqrt(n))

# 3) calculate the lower and upper ends of the confidence interval range
LowerCIValue <- xbar - estimator
UpperCIValue <- xbar + estimator

# ANSWER
# The bounds are reported to two decimals; rounding to whole numbers would
# hide most of the interval's width.
print(paste0("The ", confidence*100,"%", " confidence interval for BMI levels for those who answered No to diabetes is: ", round(LowerCIValue, 2), " <= ", "mu ", "<= ", round(UpperCIValue, 2)," ."))

## [1] "The 95% confidence interval for BMI levels for those who answered No to diabetes is: 29 <= mu <= 30 ."

# Diabetes Yes

# Keep the respondents who answered Yes to diabetes and have a recorded BMI.
# Only x_bmi5 is checked for NAs: calling na.omit() on the whole data frame
# would also drop every row that is missing any other column, which changes
# the sample the interval describes.
# NOTE: the recorded output that follows this chunk was produced with the
# earlier whole-row na.omit() and integer rounding; re-run to refresh it.
diabetes <- diabetes[!is.na(diabetes$x_bmi5), ]
# 95% CI
n <- nrow(diabetes)
xbar <- mean(diabetes$x_bmi5)
s <- sd(diabetes$x_bmi5)
confidence <- 0.95

# 1) critical t value for a two-sided interval
alpha <- 1 - confidence
alphaovertwo <- alpha / 2
degreesoffreedom <- n - 1   # required for t distribution curves
tofalphaover2 <- abs(qt(alphaovertwo, degreesoffreedom))

# 2) calculate the estimator (E) using
# E = t(alpha/2) * (s/(sqrt(n)))
estimator <- tofalphaover2 * (s / sqrt(n))

# 3) calculate the lower and upper ends of the confidence interval range
LowerCIValue <- xbar - estimator
UpperCIValue <- xbar + estimator

# ANSWER
# The bounds are reported to two decimals; rounding to whole numbers would
# hide most of the interval's width.
print(paste0("The ", confidence*100,"%", " confidence interval for BMI levels for those who answered Yes to diabetes is: ", round(LowerCIValue, 2), " <= ", "mu ", "<= ", round(UpperCIValue, 2)," ."))

## [1] "The 95% confidence interval for BMI levels for those who answered Yes to diabetes is: 32 <= mu <= 33 ."

One Sample Hypothesis Tests

Question 1

For people who have diabetes do they on average have a BMI higher than 30?

Null hypothesis: mu = 30 Alt hypothesis: mu > 30 (claim)

I conducted this test because in the exploratory analysis it appeared that BMI was on average higher for those who had diabetes versus those who didn’t. So, I selected 30 as a test value to see whether the average BMI for those with diabetes would be greater than this assumed value.

From the results we obtained a p-value of 6.67e-10 near zero and alpha 0.05 we can reject null and conclude there is enough evidence to support the claim that the average BMI is greater than 30 for those with diabetes.

# 5 questions for one sample tests

# 1. For people who have diabetes do they on average have a BMI higher than 30?
# Null hypothesis: mu = 30
# Alt hypothesis: mu > 30 (claim)

# BMI values of respondents who answered yes to diabetes, with NAs removed.
diabetes_yes <- brfss |> filter(diabete_2 == 1)
diabetes_yes_bmi <- na.omit(diabetes_yes$x_bmi5)
alpha <- 0.05

mu <- 30 # hypothesized


# This is a one-tail (right-tailed) test because the claim is mu > 30.
# The confidence level is passed as conf.level: `confidence` is not an
# argument of t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(diabetes_yes_bmi,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  One Sample t-test
## 
## data:  diabetes_yes_bmi
## t = 6.1161, df = 1091, p-value = 6.672e-10
## alternative hypothesis: true mean is greater than 30
## 95 percent confidence interval:
##  30.95801      Inf
## sample estimates:
## mean of x 
##  31.31085

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##        t 
## 6.116051

CVusingt.test[["parameter"]]   # degrees of freedom

##   df 
## 1091

CVusingt.test[["p.value"]]     # p-value

## [1] 6.67227e-10

CVusingt.test[["conf.int"]]    # confidence interval (2 numbers)

## [1] 30.95801      Inf
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # estimated (sample) mean

## mean of x 
##  31.31085

CVusingt.test[["null.value"]]  # hypothesized mean under the null

## mean 
##   30

CVusingt.test[["stderr"]]      # standard error of the mean

## [1] 0.2143298

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "greater"

# With a p-value of 6.67e-10 near zero and alpha 0.05 we can reject null and conclude there is enough evidence to support the claim that the average BMI is greater than 30 for those with diabetes.

Question 2

Does having some form of cancer diagnosis lead to on average having more than 5 poor physical health days?

Null hypothesis: mu = 5 Alt hypothesis: mu > 5 (claim)

I selected this one sample test because in exploratory analysis it showed a greater number of 30 days of poor physical health for those with some kind of cancer diagnosis with zeros removed. I tested this against an average of 5 days poor physical health for those with some kind of cancer diagnosis to see if it would be greater than this selected parameter.

The obtained p-value was 0.0874 which is greater than alpha 0.05 in this instance we would not reject the null and conclude there isn’t enough evidence to support the claim that the average number of days for poor physical health is greater than 5.

# 2. Does having some form of cancer diagnosis lead to on average having more than 5 poor physical health days?
# Null hypothesis: mu = 5
# Alt hypothesis: mu > 5 (claim)

# physhlth values of respondents with either cancer indicator equal to 1,
# with NAs removed.
cancer <- brfss |> filter(chcscncr == 1 | chcocncr == 1)
cancer_phy <- na.omit(cancer$physhlth)
alpha <- 0.05

mu <- 5 # hypothesized


# This is a one-tail (right-tailed) test because the claim is mu > 5.
# The confidence level is passed as conf.level: `confidence` is not an
# argument of t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(cancer_phy,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  One Sample t-test
## 
## data:  cancer_phy
## t = 1.3573, df = 1468, p-value = 0.08745
## alternative hypothesis: true mean is greater than 5
## 95 percent confidence interval:
##  4.925894      Inf
## sample estimates:
## mean of x 
##  5.348536

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##        t 
## 1.357301

CVusingt.test[["parameter"]]   # degrees of freedom

##   df 
## 1468

CVusingt.test[["p.value"]]     # p-value

## [1] 0.08744718

CVusingt.test[["conf.int"]]    # confidence interval (2 numbers)

## [1] 4.925894      Inf
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # estimated (sample) mean

## mean of x 
##  5.348536

CVusingt.test[["null.value"]]  # hypothesized mean under the null

## mean 
##    5

CVusingt.test[["stderr"]]      # standard error of the mean

## [1] 0.2567865

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "greater"

# We obtain a p-value of 0.0874 which is greater than alpha 0.05 in this instance we would not reject the null and conclude there isn't enough evidence to support the claim that the average number of days for poor physical health is greater than 5.

Question 3

Is the average BMI different than 28 for those who make 75k or more per year?

Null hypothesis: mu = 28 Alt hypothesis: mu != 28 (claim)

In this one-sample hypothesis test I wanted to see whether the average BMI of those making 75k or more per year differed from the sample mean of the whole dataset, which is approximately 28.

We obtained a p-value of .973 at an alpha of 0.05 we do not reject the null hypothesis and conclude there isn’t sufficient evidence to conclude the average BMI is different from 28 for people making 75k or more.

# 3. Is the average BMI different than 28 for those who make 75k or more per year?
# Null hypothesis: mu = 28
# Alt hypothesis: mu != 28 (claim)

# BMI values of respondents in the "$75,000 or more" income level, with NAs
# removed.
income75 <- brfss |> filter(income=="$75,000 or more")
income75_bmi <- na.omit(income75$x_bmi5)
alpha <- 0.05

mu <- 28 # hypothesized


# This is a two-tailed test because the claim is that the mean differs from 28.
# "two.sided" is spelled out rather than relying on partial matching, and the
# confidence level is passed as conf.level: `confidence` is not an argument of
# t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(income75_bmi,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  One Sample t-test
## 
## data:  income75_bmi
## t = 0.034133, df = 1611, p-value = 0.9728
## alternative hypothesis: true mean is not equal to 28
## 95 percent confidence interval:
##  27.72153 28.28834
## sample estimates:
## mean of x 
##  28.00493

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##          t 
## 0.03413275

CVusingt.test[["parameter"]]   # degrees of freedom

##   df 
## 1611

CVusingt.test[["p.value"]]     # p-value

## [1] 0.9727755

CVusingt.test[["conf.int"]]    # confidence interval (2 numbers)

## [1] 27.72153 28.28834
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # estimated (sample) mean

## mean of x 
##  28.00493

CVusingt.test[["null.value"]]  # hypothesized mean under the null

## mean 
##   28

CVusingt.test[["stderr"]]      # standard error of the mean

## [1] 0.1444877

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "two.sided"

# We obtained a p-value of .973 at an alpha of 0.05 we do not reject the null hypothesis and conclude there isn't sufficient evidence to conclude the average BMI is different from 28 for people making 75k or more.

Question 4

Do people who graduated from college have less poor mental health days than 3.7?

Null hypothesis: mu = 3.7 Alt hypothesis: mu < 3.7 (claim)

In this one-sample test I wanted to see whether people who graduated from college had fewer poor mental health days than the sample mean of the dataset, which is approximately 3.7 days.

We obtained a p-value of 7.977e-11 nearly zero at an alpha of 0.05 this means we can reject the null hypothesis and support the claim that college graduates on average have less than 3.7 poor mental health days per month.

# 4. Do people who graduated from college have less poor mental health days than 3.7?
# Null hypothesis: mu = 3.7
# Alt hypothesis: mu < 3.7 (claim)

# menthlth values of respondents in the college/technical-school graduate
# level of x_educag, with NAs removed.
college_grad <- brfss |> filter(x_educag=="Graduated from College or Technical School")
college_grad_menthlth <- na.omit(college_grad$menthlth)
alpha <- 0.05

mu <- 3.7 # hypothesized


# This is a one-tail (left-tailed) test because the claim is mu < 3.7.
# The confidence level is passed as conf.level: `confidence` is not an
# argument of t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(college_grad_menthlth,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  One Sample t-test
## 
## data:  college_grad_menthlth
## t = -6.4162, df = 3320, p-value = 7.977e-11
## alternative hypothesis: true mean is less than 3.7
## 95 percent confidence interval:
##      -Inf 3.140543
## sample estimates:
## mean of x 
##  2.947606

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##         t 
## -6.416199

CVusingt.test[["parameter"]]   # degrees of freedom

##   df 
## 3320

CVusingt.test[["p.value"]]     # p-value

## [1] 7.977089e-11

CVusingt.test[["conf.int"]]    # confidence interval (2 numbers)

## [1]     -Inf 3.140543
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # estimated (sample) mean

## mean of x 
##  2.947606

CVusingt.test[["null.value"]]  # hypothesized mean under the null

## mean 
##  3.7

CVusingt.test[["stderr"]]      # standard error of the mean

## [1] 0.1172647

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "less"

# We obtained a p-value of 7.977e-11 nearly zero at an alpha of 0.05 this means we can reject the null hypothesis and support the claim that college graduates on average have less than 3.7 poor mental health days.

Question 5

Do those who live in Alaska have more than 4 days of poor mental health on average?

Null hypothesis: mu = 4 Alt hypothesis: mu > 4 (claim)

In this one sample hypothesis test I wanted to see if Alaska had more than 4 poor mental health days on average slightly above the sample average. I performed a t.test to test this on the filtered data pertaining to just those in the state of Alaska.

The results were we obtained a p-value of 0.439 which is greater than our alpha of 0.05. This implies we do not reject the null and conclude there isn’t enough evidence to support the claim people in Alaska have higher than 4 days poor mental health on average.

# 5. Do those who live in Alaska have more than 4 days of poor mental health on average?
# Null hypothesis: mu = 4
# Alt hypothesis: mu > 4 (claim)

# menthlth values of respondents whose state_name is "Alaska", with NAs
# removed.
Alaska <- brfss |> filter(state_name=="Alaska")
Alaska_menthlth <- na.omit(Alaska$menthlth)
alpha <- 0.05

mu <- 4 # hypothesized


# This is a one-tail (right-tailed) test because the claim is mu > 4.
# The confidence level is passed as conf.level: `confidence` is not an
# argument of t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(Alaska_menthlth,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  One Sample t-test
## 
## data:  Alaska_menthlth
## t = 0.15187, df = 87, p-value = 0.4398
## alternative hypothesis: true mean is greater than 4
## 95 percent confidence interval:
##  2.643564      Inf
## sample estimates:
## mean of x 
##  4.136364

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##         t 
## 0.1518706

CVusingt.test[["parameter"]]   # degrees of freedom

## df 
## 87

CVusingt.test[["p.value"]]     # p-value

## [1] 0.4398203

CVusingt.test[["conf.int"]]    # confidence interval (2 numbers)

## [1] 2.643564      Inf
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # estimated (sample) mean

## mean of x 
##  4.136364

CVusingt.test[["null.value"]]  # hypothesized mean under the null

## mean 
##    4

CVusingt.test[["stderr"]]      # standard error of the mean

## [1] 0.8978934

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "greater"

# We obtain a p-value of 0.439 which is greater than our alpha of 0.05. This implies we do not reject the null and conclude there isn't enough evidence to support the claim people in Alaska have higher than 4 days poor mental health on average.

Two Sample Hypothesis Tests

Question 1

Is there a difference in poor physical days in the Northeast compared to the South?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

In this two sample hypothesis test I wanted to see if there was a significant difference at a confidence level of 95% with an alpha of 0.05 between poor physical health days between the Northeast and Southern regions.

Using a two sided t-test for both variables a p-value was obtained of 0.006395 this is less than our alpha of 0.05 implies we reject the null hypothesis concluding there is enough evidence to support the claim that there is a difference between average number of poor physical health days between Southern and Northeastern regions.

# 5 questions for two sample test

# 1. Is there a difference in poor physical days in the Northeast compared to the South?

# physhlth values for each region, with NAs removed.
north <- brfss |> filter(region=="Northeast")
south <- brfss |> filter(region=="South")
north <- na.omit(north$physhlth)
south <- na.omit(south$physhlth)

# Hypothesis test
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0 # hypothesized difference in means
alpha <- .05

# Two tailed test because the claim is about there being a difference.
# "two.sided" is spelled out rather than relying on partial matching, and the
# confidence level is passed as conf.level: `confidence` is not an argument of
# t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(north, south,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  Welch Two Sample t-test
## 
## data:  north and south
## t = -2.7283, df = 3874.9, p-value = 0.006395
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.2429058 -0.2035019
## sample estimates:
## mean of x mean of y 
##  3.561668  4.284871

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##         t 
## -2.728287

CVusingt.test[["parameter"]]   # degrees of freedom

##       df 
## 3874.902

CVusingt.test[["p.value"]]     # p-value

## [1] 0.006395128

CVusingt.test[["conf.int"]]    # confidence interval for the difference in means

## [1] -1.2429058 -0.2035019
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # sample mean of each group

## mean of x mean of y 
##  3.561668  4.284871

CVusingt.test[["null.value"]]  # hypothesized difference in means

## difference in means 
##                   0

CVusingt.test[["stderr"]]      # standard error of the difference in means

## [1] 0.2650761

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "two.sided"

# P-value is 0.006395 this is less than our alpha of 0.05 implies we reject the null hypothesis concluding there is enough evidence to support the claim that there is a difference between average number of poor physical health days between Southern and Northeastern regions.

Question 2

Is there a difference in the average number of poor mental health days between people who live in New York and Vermont?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

I selected this two sample hypothesis test to see if there was a difference in poor mental health days between Vermont and New York which are found in the same region of the Northeast.

The obtained p-value was 0.374 this is more than our alpha of 0.05 implies we don’t reject the null hypothesis concluding there is not enough evidence to support the claim that there is a difference between average number of poor mental health days between New York and Vermont.

# 2. Is there a difference in the average number of poor mental health days between people who live in New York and Vermont?

# menthlth values for each state, with NAs removed.
new_york <- brfss |> filter(state_name=="New York")
vermont <- brfss |> filter(state_name=="Vermont")
new_york <- na.omit(new_york$menthlth)
vermont <- na.omit(vermont$menthlth)


# Hypothesis test
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0 # hypothesized difference in means
alpha <- .05

# Two tailed test because the claim is about there being a difference.
# "two.sided" is spelled out rather than relying on partial matching, and the
# confidence level is passed as conf.level: `confidence` is not an argument of
# t.test(), so it was silently ignored and the default was used.
CVusingt.test <- t.test(new_york, vermont,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  Welch Two Sample t-test
## 
## data:  new_york and vermont
## t = -0.89151, df = 175.53, p-value = 0.3739
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.2956760  0.8670093
## sample estimates:
## mean of x mean of y 
##  3.544000  4.258333

# List the components stored on the htest object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test[["statistic"]]   # t test statistic

##          t 
## -0.8915132

CVusingt.test[["parameter"]]   # degrees of freedom

##       df 
## 175.5253

CVusingt.test[["p.value"]]     # p-value

## [1] 0.3738744

CVusingt.test[["conf.int"]]    # confidence interval for the difference in means

## [1] -2.2956760  0.8670093
## attr(,"conf.level")
## [1] 0.95

CVusingt.test[["estimate"]]    # sample mean of each group

## mean of x mean of y 
##  3.544000  4.258333

CVusingt.test[["null.value"]]  # hypothesized difference in means

## difference in means 
##                   0

CVusingt.test[["stderr"]]      # standard error of the difference in means

## [1] 0.8012594

CVusingt.test[["alternative"]] # direction of the alternative hypothesis

## [1] "two.sided"

# P-value is 0.374 this is more than our alpha of 0.05 implies we don't reject the null hypothesis concluding there is not enough evidence to support the claim that there is a difference between average number of poor mental health days between New York and Vermont.

Question 3

Do people in the South have a higher BMI on average than those in the West?

H(o): mu1=mu2 H(a): mu1>mu2 (claim)

In this test I wanted to see if BMI was higher on average for the Southern region compared to the Western region. This utilizes a one tail right side test to see if the test value falls in the rejection region. We observed the results through the p-value in comparison to the selected alpha of 0.05.

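The reported p-value of 0.997 comes from a test that was run with the samples in the order (West, South), so it actually tests whether the West has a higher average BMI than the South. For the stated claim (South > West) the same test statistic applies with its sign flipped (t = 2.79), which gives a one-sided p-value of about 1 − 0.997 = 0.003. This is less than our alpha of 0.05, which implies we reject the null hypothesis and conclude there is enough evidence to support the claim that the Southern region has a greater BMI on average than the West.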

#3. Do people in the South have a higher BMI on average than those in the West?

# BMI values (x_bmi5) for each region, with missing values dropped.
south <- brfss |> filter(region == "South")
west <- brfss |> filter(region == "West")
south <- na.omit(south$x_bmi5)
west <- na.omit(west$x_bmi5)


# Hypothesis test (mu1 = South mean, mu2 = West mean)
# H(o): mu1=mu2
# H(a): mu1>mu2 (claim)

mu <- 0
alpha <- .05

# Right-tailed test because the claim is that the South mean exceeds the West mean.
# t.test() tests mean(x) - mean(y), so South has to be the first sample. The
# earlier call passed (west, south) and therefore tested West > South, which is
# what produced the p-value of 0.997; re-run to refresh any printed results.
# conf.level (not "confidence") is the argument t.test() actually reads; an
# unknown argument name is silently swallowed by `...`.
CVusingt.test <- t.test(south, west,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  Welch Two Sample t-test
## 
## data:  west and south
## t = -2.7885, df = 3895.8, p-value = 0.9973
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.8777369        Inf
## sample estimates:
## mean of x mean of y 
##  27.84841  28.40044

# List the components stored in the "htest" object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test$statistic   # the t test statistic

##         t 
## -2.788468

CVusingt.test$parameter   # the degrees of freedom

##       df 
## 3895.793

CVusingt.test$p.value     # the p-value

## [1] 0.9973393

CVusingt.test$conf.int    # the confidence interval for the difference in means (one bound is Inf for a one-sided test)

## [1] -0.8777369        Inf
## attr(,"conf.level")
## [1] 0.95

CVusingt.test$estimate    # the two sample means (x first, then y)

## mean of x mean of y 
##  27.84841  28.40044

CVusingt.test$null.value  # the hypothesized difference in means under H(o)

## difference in means 
##                   0

CVusingt.test$stderr      # standard error of the difference in means

## [1] 0.1979688

CVusingt.test$alternative # which kind of test ("two.sided", "less" or "greater")

## [1] "greater"

# The p-value printed above is 0.997, which is greater than our alpha of 0.05.
# NOTE(review): the printed test is "data: west and south" with alternative
# "greater", i.e. it tests West > South, not the claim South > West. With South
# as the first sample the one-sided p-value is 1 - 0.9973 = 0.0027 < 0.05, which
# rejects the null hypothesis and supports the claim that the Southern region
# has a greater BMI on average than the West (sample means 28.40 vs 27.85).

Question 4

Is there a difference of average BMI for people Age 25-34 that live in the Midwest compared to West?

H(o): mu1=mu2 H(a): mu1!=mu2 (claim)

For this two-sample hypothesis test I wanted to compare the average BMI of people in the 25-34 age range between the Midwest and West regions. As in the previous problems, an alpha of 0.05 was selected and a t-test was conducted to obtain the p-value.

The p-value was 0.319, which is greater than our alpha of 0.05, so we do not reject the null hypothesis: there is not enough evidence to support the claim that there is a difference in average BMI for people aged 25-34 who live in the Midwest compared to the West.

#4. Is there a difference of average BMI for people Age 25-34 that live in the Midwest compared to West?

# BMI values (x_bmi5) for respondents in the "25-34" age group of each region,
# with missing values dropped.
west_age <- brfss |> filter(region == "West" & x_age_g == "25-34")
midwest_age <- brfss |> filter(region == "Midwest" & x_age_g == "25-34")
west_age <- na.omit(west_age$x_bmi5)
midwest_age <- na.omit(midwest_age$x_bmi5)


# Hypothesis test
# H(o): mu1=mu2
# H(a): mu1!=mu2 (claim)

mu <- 0
alpha <- .05

# Two tailed test because the claim is about there being a difference.
# "two.sided" is spelled out in full rather than relying on partial matching,
# and conf.level (not "confidence") is the argument t.test() actually reads;
# an unknown argument name is silently swallowed by `...`.
CVusingt.test <- t.test(west_age, midwest_age,
                        alternative = "two.sided",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  Welch Two Sample t-test
## 
## data:  west_age and midwest_age
## t = -0.99834, df = 435.92, p-value = 0.3187
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.9028378  0.6209033
## sample estimates:
## mean of x mean of y 
##  27.79670  28.43767

# List the components stored in the "htest" object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test$statistic   # the t test statistic

##          t 
## -0.9983357

CVusingt.test$parameter   # the degrees of freedom

##       df 
## 435.9173

CVusingt.test$p.value     # the p-value

## [1] 0.3186704

CVusingt.test$conf.int    # the confidence interval for the difference in means (2 numbers)

## [1] -1.9028378  0.6209033
## attr(,"conf.level")
## [1] 0.95

CVusingt.test$estimate    # the two sample means (x first, then y)

## mean of x mean of y 
##  27.79670  28.43767

CVusingt.test$null.value  # the hypothesized difference in means under H(o)

## difference in means 
##                   0

CVusingt.test$stderr      # standard error of the difference in means

## [1] 0.6420358

CVusingt.test$alternative # which kind of test ("two.sided", "less" or "greater")

## [1] "two.sided"

# The p-value is 0.319, which is greater than our alpha of 0.05, so we do not
# reject the null hypothesis: there is not enough evidence to support the claim
# that there is a difference of average BMI for people Age 25-34 that live in
# the Midwest compared to West.

Question 5

Do people who are retired have less poor mental health days than those employed for wages?

H(o): mu1=mu2 H(a): mu1<mu2 (claim)

In this two-sample test the data was split into two samples: those who were retired and those currently employed for wages. The alternative hypothesis was that retired people have fewer poor mental health days than those employed for wages.

The p-value obtained was 1.59e-10, which is near zero. Since the p-value is less than our alpha of 0.05, we reject the null hypothesis and conclude there is enough evidence to support the claim that those who are retired have fewer poor mental health days than those employed for wages.

#5. Do people who are retired have less poor mental health days than those employed for wages?

# Poor-mental-health-day counts (menthlth) for each employment group, with
# missing values dropped.
retire <- brfss |> filter(employ1 == "Retired")
employed <- brfss |> filter(employ1 == "Employed for wages")
retire <- na.omit(retire$menthlth)
employed <- na.omit(employed$menthlth)

# Hypothesis test (mu1 = retired mean, mu2 = employed-for-wages mean)
# H(o): mu1=mu2
# H(a): mu1<mu2 (claim)

mu <- 0
alpha <- .05

# Left-tailed test because the claim is that the retired mean is lower than the
# employed mean; t.test() tests mean(x) - mean(y), so retired is the first sample.
# conf.level (not "confidence") is the argument t.test() actually reads; an
# unknown argument name is silently swallowed by `...`.
CVusingt.test <- t.test(retire, employed,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test

## 
##  Welch Two Sample t-test
## 
## data:  retire and employed
## t = -6.2999, df = 6018, p-value = 1.594e-10
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##        -Inf -0.8137664
## sample estimates:
## mean of x mean of y 
##  2.345763  3.447130

# List the components stored in the "htest" object returned by t.test().
attributes(CVusingt.test)

## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"

CVusingt.test$statistic   # the t test statistic

##         t 
## -6.299936

CVusingt.test$parameter   # the degrees of freedom

##       df 
## 6018.048

CVusingt.test$p.value     # the p-value

## [1] 1.594252e-10

CVusingt.test$conf.int    # the confidence interval for the difference in means (one bound is -Inf for a one-sided test)

## [1]       -Inf -0.8137664
## attr(,"conf.level")
## [1] 0.95

CVusingt.test$estimate    # the two sample means (x first, then y)

## mean of x mean of y 
##  2.345763  3.447130

CVusingt.test$null.value  # the hypothesized difference in means under H(o)

## difference in means 
##                   0

CVusingt.test$stderr      # standard error of the difference in means

## [1] 0.174822

CVusingt.test$alternative # which kind of test ("two.sided", "less" or "greater")

## [1] "less"

# The p-value obtained is 1.59e-10, which is near zero. Since the p-value is
# less than our alpha of 0.05, we reject the null hypothesis and conclude there
# is enough evidence to support the claim that those who are retired have less
# poor mental health days than those employed for wages.

References

Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.

Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.

When prompted with “how to replace yes and no with 1 and 0 in R for unique(data$variable) (then uniquie values)?” the ChatGPT-generated text indicated “This code uses the ifelse function to convert ‘Yes’ to 1, ‘No’ to 0, and any other categories to NA (missing values). This way, you’re creating a binary representation of the ‘Yes/No’ responses” (OpenAI, 2024).

OpenAI. (2024). ChatGPT (March 5 version) [Large Language model] https://chat.openai.com/

Final Project Milestone 2

Michael

2024-03-16

Data Cleaning

Exploratory Data Analysis

One Sample Hypothesis Tests

Question 1

Question 2

Question 3

Question 4

Question 5

Two Sample Hypothesis Tests

Question 1

Question 2

Question 3

Question 4

Question 5

References