Cleaning the data and removing outliers

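I cleaned the data by converting data types and recoding yes/no responses to 1s and 0s (all other responses, such as "Refused", became NA). I divided BMI by 100 because the BRFSS codebook stores it with two implied decimal places. For the variables of interest (BMI, menthlth, and physhlth), I removed outliers using the 1.5 × IQR rule: values below Q1 − 1.5 × IQR or above Q3 + 1.5 × IQR were dropped, where Q1 and Q3 are the 25th and 75th percentiles (the code truncates both bounds to whole numbers).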

Click to expand
# Cleaning the data
#
# Loads the BRFSS sample, normalises column names, and converts each column
# to the type used by the analysis below: Yes/No answers become 1/0 (anything
# else becomes NA) and categorical answers become factors.

brfss <- read.csv("brfss_17_21_sample5.csv")
brfss <- clean_names(brfss)

# Recode a Yes/No response vector to 1 (Yes) / 0 (No). Every other value
# (e.g. "Don’t know/Not sure", "Refused", "Not asked or Missing", NA) becomes
# NA so that only complete answers remain.
recode_yes_no <- function(x) {
  ifelse(x == "Yes", 1, ifelse(x == "No", 0, NA))
}

# cleaning iyear column and converting to numeric:
# strip every "b'" and then every remaining "'" before parsing the number
brfss$iyear <- gsub("b'", "", brfss$iyear)
brfss$iyear <- gsub("'", "", brfss$iyear)
brfss$iyear <- as.numeric(brfss$iyear)

# Yes/No indicators ----
# Raw values are inspected first, while the columns still hold the original
# response text.
unique(brfss$cvdcrhd4)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"
unique(brfss$cvdstrk3)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
## [4] "Refused"
unique(brfss$x_michd)
## [1] "No"                   "Yes"                  "Not asked or Missing"
unique(brfss$x_rfbing5)
## [1] "No"                  "Yes"                 "Don’t know/Not sure"
unique(brfss$bpmeds)
## [1] "Not asked or Missing" "Yes"                  "No"                  
## [4] "Don’t know/Not sure"  "Refused"

yes_no_cols <- c(
  "cvdinfr4", "cvdcrhd4", "cvdstrk3", "asthma3", "chcscncr", "chcocncr",
  "chccopd", "addepev", "diabete_2", "decide", "diffwalk", "diffdres",
  "diffalon", "x_michd", "x_rfbing5", "bpmeds"
)
brfss[yes_no_cols] <- lapply(brfss[yes_no_cols], recode_yes_no)

# Categorical columns ----
# Non-answers such as "Refused", "Don’t know/Not sure" and "" are NOT
# converted to NA here; they remain as factor levels.
unique(brfss$x_rfhlth)
## [1] "Good"                "Fair/Poor"           "Don’t know/Not sure"
unique(brfss$x_casthm1_2)
## [1] "Former"  "Current" ""
unique(brfss$employ1)
##  [1] "Employed for wages"               "Refused"                         
##  [3] "A homemaker"                      "A student"                       
##  [5] "Self-employed"                    "Out of work for less than 1 year"
##  [7] "Retired"                          "Out of work for 1 year or more"  
##  [9] "Unable to work"                   "Not asked or Missing"
unique(brfss$educa)
## [1] "College 1 year to 3 years (Some college or technical school)"
## [2] "Grade 12 or GED (High school graduate)"                      
## [3] "College 4 years or more (College graduate)"                  
## [4] "Grades 9 through 11 (Some high school)"                      
## [5] "Grades 1 through 8 (Elementary)"                             
## [6] "Never attended school or only kindergarten"                  
## [7] "Refused"
unique(brfss$checkup1)
## [1] "5 or more years ago" "Within past 2 years" "Within past year"   
## [4] "Within past 5 years" "Don’t know/Not sure" "Never"              
## [7] "Refused"

factor_cols <- c(
  "income", "genhlth", "x_bmi5cat", "x_educag", "x_rfsmok3", "x_racegr3",
  "x_age_g", "x_rfhlth", "x_casthm1_2", "employ1", "marital", "educa",
  "checkup1", "state_name", "region"
)
brfss[factor_cols] <- lapply(brfss[factor_cols], as.factor)

# Rescale BMI: divide the stored x_bmi5 value by 100 so it carries
# two decimal places
brfss[["x_bmi5"]] <- brfss[["x_bmi5"]] / 100


# Removing outliers
#
# Rows are kept only when the value lies inside the 1.5 * IQR fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The three filters run in sequence, so the
# quartiles for each variable are computed on the rows that survived the
# previous filter. filter() also drops rows where the tested column is NA.

# Compute the outlier fences for a numeric vector `x` (NAs are ignored).
# Returns a list with integer elements `lower` and `upper`.
# NOTE(review): as.integer() truncates both fences toward zero. The results
# reported below were produced with this truncation; for the two-decimal BMI
# values it tightens the upper fence (and loosens a positive lower fence).
# Consider removing the truncation and re-knitting the report.
iqr_fences <- function(x, k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  list(
    lower = as.integer(quartiles[1] - k * spread),
    upper = as.integer(quartiles[2] + k * spread)
  )
}

# Filter out outliers for BMI
bmi_fences <- iqr_fences(brfss$x_bmi5)
brfss <- brfss |>
  filter(x_bmi5 >= bmi_fences$lower & x_bmi5 <= bmi_fences$upper)

# Filter out outliers for mental health (menthlth)
menthlth_fences <- iqr_fences(brfss$menthlth)
brfss <- brfss |>
  filter(menthlth >= menthlth_fences$lower & menthlth <= menthlth_fences$upper)

# Filter out outliers for physical health (physhlth)
physhlth_fences <- iqr_fences(brfss$physhlth)
brfss <- brfss |>
  filter(physhlth >= physhlth_fences$lower & physhlth <= physhlth_fences$upper)

Problem 1

For this problem we were asked whether the average BMI of residents of Southern and Midwestern states combined is less than 28. We conducted a one-tailed, one-sample t-test because the population standard deviation is unknown. The null hypothesis was mu = 28 and the alternative hypothesis (the claim) was mu < 28. Using an alpha of 0.05 (a 95% confidence level), we obtained a t value of -5.114. With 3165 degrees of freedom, the t distribution is very close to the standard normal distribution. The p-value was very close to zero, at 1.67e-07. We reject the null hypothesis when p-value <= alpha; that is the case here, so there is enough evidence to support the claim that the average BMI in Midwestern and Southern states is less than 28.

Click to expand
#1) The average BMI for Southern and Midwestern states
#together is less than 28

# STEP 1: Hypotheses and Claims
# Null hypothesis: mu = 28
# Alt hypothesis: mu < 28 (claim)
# BMI is two decimal places so 28.00
mu <- 28 #hypothesized
alpha <- 0.05 # significance level

# BMI values (NAs dropped) for respondents in the Midwest or South regions
south_midwest <- brfss |>
  filter(region %in% c("Midwest", "South")) |>
  pull(x_bmi5) |>
  na.omit() |>
  as.numeric()

# This is a one-tail test because claim is about > or <.
# The confidence level must be passed as conf.level; an argument named
# `confidence` is not recognised by t.test() and is silently ignored.
CVusingt.test <- t.test(south_midwest,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  south_midwest
## t = -5.1144, df = 3165, p-value = 1.667e-07
## alternative hypothesis: true mean is less than 28
## 95 percent confidence interval:
##      -Inf 27.69094
## sample estimates:
## mean of x 
##  27.54436
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -5.114396
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 3165
CVusingt.test$p.value     # the p-value
## [1] 1.667377e-07
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1]     -Inf 27.69094
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##  27.54436
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##   28
CVusingt.test$stderr      # standard error of the mean
## [1] 0.08908993
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "less"
# If p-value <= alpha then reject the null hypothesis
# p-value = 1.667e-07 and alpha = .05 implies reject the null

Problem 2

In this problem we looked only at the BMI of Maine residents and wanted to determine whether the average BMI was greater than 28. The null hypothesis is mu = 28 and the alternative hypothesis (the claim) is mu > 28. Using an alpha of 0.05, we performed a t-test in the same way as in the first problem. We obtained a t value of -1.253 with 128 degrees of freedom, for which the t distribution is approximately normal. The p-value was 0.894. Because p-value > alpha, we fail to reject the null hypothesis and conclude that there isn’t enough evidence to support the claim that the average BMI in Maine is greater than 28.

Click to expand
#2) The average BMI for Maine is greater than 28

# STEP 1: Hypotheses and Claims
#Null hypothesis: mu = 28
#Alt hypothesis: mu > 28 (claim)
mu <- 28 #hypothesized
alpha <- 0.05 # significance level

# BMI values (NAs dropped) for respondents whose state_name is Maine
maine <- brfss |>
  filter(state_name == "Maine") |>
  pull(x_bmi5) |>
  na.omit() |>
  as.numeric()

# This is a one-tail test because claim is about > or <.
# The confidence level must be passed as conf.level; an argument named
# `confidence` is not recognised by t.test() and is silently ignored.
CVusingt.test <- t.test(maine,
                        alternative = "greater",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  maine
## t = -1.2528, df = 128, p-value = 0.8937
## alternative hypothesis: true mean is greater than 28
## 95 percent confidence interval:
##  26.67456      Inf
## sample estimates:
## mean of x 
##   27.4293
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -1.252827
CVusingt.test$parameter   # the degrees of freedom
##  df 
## 128
CVusingt.test$p.value     # the p-value
## [1] 0.8937238
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1] 26.67456      Inf
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
##   27.4293
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##   28
CVusingt.test$stderr      # standard error of the mean
## [1] 0.4555278
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "greater"
# If p-value <= alpha then reject the null hypothesis
# p-value = 0.8937 and alpha = .05 implies don't reject the null

Problem 3

In this problem we looked at the average number of poor mental health days per month in Southern states and wanted to know whether it was less than 3. The null hypothesis was mu = 3 and the alternative hypothesis (the claim) was mu < 3. Performing a t-test on menthlth for the South region only, at an alpha of 0.05, we obtained a t value of -61.554. Keep in mind that we filtered outliers from a data set that was already very right-skewed and concentrated at zero. There were 1489 degrees of freedom, for which the t distribution is approximately normal. We obtained a p-value of essentially 0 (< 2.2e-16). Because p-value <= alpha, we reject the null hypothesis and conclude that there is enough evidence to support the claim that Southern states averaged fewer than 3 poor mental health days per month. Note: when we ran this test without removing outliers, we obtained a p-value of almost 1, indicating there would be insufficient evidence to support the claim.

Click to expand
#3) The average number of poor mental health days per month
#for Southern states was less than 3

#South states mental health days less than 3
# Null hypothesis: mu = 3
# Alt hypothesis: mu < 3 (claim)
mu <- 3 #hypothesized
alpha <- .05 # significance level

# menthlth values (NAs dropped) for respondents in the South region
south_mh <- brfss |>
  filter(region == "South") |>
  pull(menthlth) |>
  na.omit() |>
  as.numeric()

# This is a one-tail test because claim is about > or <.
# The confidence level must be passed as conf.level; an argument named
# `confidence` is not recognised by t.test() and is silently ignored.
CVusingt.test <- t.test(south_mh,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  south_mh
## t = -61.554, df = 1489, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 3
## 95 percent confidence interval:
##      -Inf 0.691607
## sample estimates:
## mean of x 
## 0.6281879
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -61.55423
CVusingt.test$parameter   # the degrees of freedom
##   df 
## 1489
CVusingt.test$p.value     # the p-value
## [1] 0
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1]     -Inf 0.691607
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
## 0.6281879
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##    3
CVusingt.test$stderr      # standard error of the mean
## [1] 0.03853207
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "less"
# The conclusion of this problem depends on whether outliers are removed.
# With them removed, the p-value prints as 0 (< 2.2e-16) <= alpha, so we
# reject the null.

Problem 4

In this problem, using a method similar to the previous one, we tested whether Maine and New Hampshire together averaged fewer than 4 days of poor physical health per month. The null hypothesis was mu = 4 and the alternative hypothesis (the claim) was mu < 4. With outliers removed, we obtained a t value of -51.12 with df = 218. The p-value was nearly zero, at 1.17e-123. Again using an alpha of 0.05, we reject the null hypothesis because the p-value is less than alpha, so there is enough evidence to support the claim that Maine and NH averaged fewer than 4 days of poor physical health per month.

Click to expand
#4) The average number of poor physical health days per month
#for Maine and NH together was less than 4 days

#Maine and New Hampshire poor physical health days less than 4
# Null hypothesis: mu = 4
# Alt hypothesis: mu < 4 (claim)
mu <- 4 #hypothesized
alpha <- 0.05 # significance level

# physhlth values (NAs dropped) for respondents in Maine or New Hampshire
maine_nh_ph <- brfss |>
  filter(state_name %in% c("Maine", "New Hampshire")) |>
  pull(physhlth) |>
  na.omit()

# This is a one-tail test because claim is about > or <.
# The confidence level must be passed as conf.level; an argument named
# `confidence` is not recognised by t.test() and is silently ignored.
CVusingt.test <- t.test(maine_nh_ph,
                        alternative = "less",
                        mu = mu,
                        conf.level = 1 - alpha)
CVusingt.test
## 
##  One Sample t-test
## 
## data:  maine_nh_ph
## t = -51.123, df = 218, p-value < 2.2e-16
## alternative hypothesis: true mean is less than 4
## 95 percent confidence interval:
##       -Inf 0.5401826
## sample estimates:
## mean of x 
## 0.4246575
attributes(CVusingt.test)
## $names
##  [1] "statistic"   "parameter"   "p.value"     "conf.int"    "estimate"   
##  [6] "null.value"  "stderr"      "alternative" "method"      "data.name"  
## 
## $class
## [1] "htest"
CVusingt.test$statistic   # the t test statistic
##         t 
## -51.12321
CVusingt.test$parameter   # the degrees of freedom
##  df 
## 218
CVusingt.test$p.value     # the p-value
## [1] 1.172147e-123
CVusingt.test$conf.int    # the confidence interval (2 numbers)
## [1]      -Inf 0.5401826
## attr(,"conf.level")
## [1] 0.95
CVusingt.test$estimate    # the estimated mean
## mean of x 
## 0.4246575
CVusingt.test$null.value  # the specified hypothesized mean
## mean 
##    4
CVusingt.test$stderr      # standard error of the mean
## [1] 0.06993579
CVusingt.test$alternative # which kind of test (<, > or =)
## [1] "less"
# p-value = 1.17e-123 is less than alpha = .05, so we reject the null:
# there is enough evidence to support the claim

Works Cited

Kabacoff, R. I. (2015). R in Action (2nd ed.). Manning Publications.

Bluman, A. G. (2018). Elementary statistics: A step by step approach (10th ed.). McGraw Hill.