Exploring the BRFSS data

# Global chunk options: 10 x 6 figures written under Figs/, with warnings
# and messages kept out of the rendered report.
knitr::opts_chunk$set(
  fig.width = 10,
  fig.height = 6,
  fig.path = "Figs/",
  warning = FALSE,
  message = FALSE
)

Setup

Several packages are needed for manipulating and graphing the data.

### Load packages

#load libraries
suppressMessages(library(dplyr))
library(ggplot2)
library(xtable)
library(lazyeval)

Two helper functions were attempted. The first trims strings to aid concatenation and the second was meant to consolidate the code used to summarize variables. While the latter is not used, it’s included here in case a reader or reviewer has insight into how to fix it. Finally, a variable was created to store the working directory, i.e. the path to the code and the data file.

# Strip whitespace from both ends of each string in x.
# Leading whitespace is removed first, then trailing whitespace.
trim <- function(x) {
  without_leading <- sub("^\\s+", "", x)
  sub("\\s+$", "", without_leading)
}

#' Count the rows of a data frame for each value of one column.
#'
#' @param df A data frame (or tibble).
#' @param var Name of the column to group by, given as a single string.
#' @return One row per distinct value of the column named by `var`, with the
#'   number of rows for that value in `count`.
summarize_Var <- function(df, var) {
  stopifnot(is.character(var), length(var) == 1, var %in% names(df))
  print(var)
  # `var` holds a column *name*. A bare group_by(var) does not group by that
  # column; the .data pronoun looks the named column up instead.
  df %>%
    group_by(.data[[var]]) %>%
    summarise(count = n())
}

# Directory holding the code and the data file. The original hard-coded
# location is used when it exists; otherwise fall back to the current
# working directory so the script is not tied to a single machine.
wd <- "C:/Coursera/duke_probability"
if (!dir.exists(wd)) {
  wd <- trim(getwd())
}

Load data

The compressed file was downloaded from the course website and the brfss2013.Rdata file extracted.

# Open the previously unzipped .Rdata file from the directory stored in wd.
# load() restores the data frame named brfss2013 into the workspace.
load(file.path(wd, "brfss2013.Rdata"))
names(brfss2013)

##   [1] "X_state"   "fmonth"    "idate"     "imonth"    "iday"     
##   [6] "iyear"     "dispcode"  "seqno"     "X_psu"     "ctelenum" 
##  [11] "pvtresd1"  "colghous"  "stateres"  "cellfon3"  "ladult"   
##  [16] "numadult"  "nummen"    "numwomen"  "genhlth"   "physhlth" 
##  [21] "menthlth"  "poorhlth"  "hlthpln1"  "persdoc2"  "medcost"  
##  [26] "checkup1"  "sleptim1"  "bphigh4"   "bpmeds"    "bloodcho" 
##  [31] "cholchk"   "toldhi2"   "cvdinfr4"  "cvdcrhd4"  "cvdstrk3" 
##  [36] "asthma3"   "asthnow"   "chcscncr"  "chcocncr"  "chccopd1" 
##  [41] "havarth3"  "addepev2"  "chckidny"  "diabete3"  "veteran3" 
##  [46] "marital"   "children"  "educa"     "employ1"   "income2"  
##  [51] "weight2"   "height3"   "numhhol2"  "numphon2"  "cpdemo1"  
##  [56] "cpdemo4"   "internet"  "renthom1"  "sex"       "pregnant" 
##  [61] "qlactlm2"  "useequip"  "blind"     "decide"    "diffwalk" 
##  [66] "diffdres"  "diffalon"  "smoke100"  "smokday2"  "stopsmk2" 
##  [71] "lastsmk2"  "usenow3"   "alcday5"   "avedrnk2"  "drnk3ge5" 
##  [76] "maxdrnks"  "fruitju1"  "fruit1"    "fvbeans"   "fvgreen"  
##  [81] "fvorang"   "vegetab1"  "exerany2"  "exract11"  "exeroft1" 
##  [86] "exerhmm1"  "exract21"  "exeroft2"  "exerhmm2"  "strength" 
##  [91] "lmtjoin3"  "arthdis2"  "arthsocl"  "joinpain"  "seatbelt" 
##  [96] "flushot6"  "flshtmy2"  "tetanus"   "pneuvac3"  "hivtst6"  
## [101] "hivtstd3"  "whrtst10"  "pdiabtst"  "prediab1"  "diabage2" 
## [106] "insulin"   "bldsugar"  "feetchk2"  "doctdiab"  "chkhemo3" 
## [111] "feetchk"   "eyeexam"   "diabeye"   "diabedu"   "painact2" 
## [116] "qlmentl2"  "qlstres2"  "qlhlth2"   "medicare"  "hlthcvrg" 
## [121] "delaymed"  "dlyother"  "nocov121"  "lstcovrg"  "drvisits" 
## [126] "medscost"  "carercvd"  "medbills"  "ssbsugar"  "ssbfrut2" 
## [131] "wtchsalt"  "longwtch"  "dradvise"  "asthmage"  "asattack" 
## [136] "aservist"  "asdrvist"  "asrchkup"  "asactlim"  "asymptom" 
## [141] "asnoslep"  "asthmed3"  "asinhalr"  "harehab1"  "strehab1" 
## [146] "cvdasprn"  "aspunsaf"  "rlivpain"  "rduchart"  "rducstrk" 
## [151] "arttoday"  "arthwgt"   "arthexer"  "arthedu"   "imfvplac" 
## [156] "hpvadvc2"  "hpvadsht"  "hadmam"    "howlong"   "profexam" 
## [161] "lengexam"  "hadpap2"   "lastpap2"  "hadhyst2"  "bldstool" 
## [166] "lstblds3"  "hadsigm3"  "hadsgco1"  "lastsig3"  "pcpsaad2" 
## [171] "pcpsadi1"  "pcpsare1"  "psatest1"  "psatime"   "pcpsars1" 
## [176] "pcpsade1"  "pcdmdecn"  "rrclass2"  "rrcognt2"  "rratwrk2" 
## [181] "rrhcare3"  "rrphysm2"  "rremtsm2"  "misnervs"  "mishopls" 
## [186] "misrstls"  "misdeprd"  "miseffrt"  "miswtles"  "misnowrk" 
## [191] "mistmnt"   "mistrhlp"  "misphlpf"  "scntmony"  "scntmeal" 
## [196] "scntpaid"  "scntwrk1"  "scntlpad"  "scntlwk1"  "scntvot1" 
## [201] "rcsgendr"  "rcsrltn2"  "casthdx2"  "casthno2"  "emtsuprt" 
## [206] "lsatisfy"  "ctelnum1"  "cellfon2"  "cadult"    "pvtresd2" 
## [211] "cclghous"  "cstate"    "landline"  "pctcell"   "qstver"   
## [216] "qstlang"   "mscode"    "X_ststr"   "X_strwt"   "X_rawrake"
## [221] "X_wt2rake" "X_imprace" "X_impnph"  "X_impeduc" "X_impmrtl"
## [226] "X_imphome" "X_chispnc" "X_crace1"  "X_impcage" "X_impcrac"
## [231] "X_impcsex" "X_cllcpwt" "X_dualuse" "X_dualcor" "X_llcpwt2"
## [236] "X_llcpwt"  "X_rfhlth"  "X_hcvu651" "X_rfhype5" "X_cholchk"
## [241] "X_rfchol"  "X_ltasth1" "X_casthm1" "X_asthms1" "X_drdxar1"
## [246] "X_prace1"  "X_mrace1"  "X_hispanc" "X_race"    "X_raceg21"
## [251] "X_racegr3" "X_race_g1" "X_ageg5yr" "X_age65yr" "X_age_g"  
## [256] "htin4"     "htm4"      "wtkg3"     "X_bmi5"    "X_bmi5cat"
## [261] "X_rfbmi5"  "X_chldcnt" "X_educag"  "X_incomg"  "X_smoker3"
## [266] "X_rfsmok3" "drnkany5"  "drocdy3_"  "X_rfbing5" "X_drnkdy4"
## [271] "X_drnkmo4" "X_rfdrhv4" "X_rfdrmn4" "X_rfdrwm4" "ftjuda1_" 
## [276] "frutda1_"  "beanday_"  "grenday_"  "orngday_"  "vegeda1_" 
## [281] "X_misfrtn" "X_misvegn" "X_frtresp" "X_vegresp" "X_frutsum"
## [286] "X_vegesum" "X_frtlt1"  "X_veglt1"  "X_frt16"   "X_veg23"  
## [291] "X_fruitex" "X_vegetex" "X_totinda" "metvl11_"  "metvl21_" 
## [296] "maxvo2_"   "fc60_"     "actin11_"  "actin21_"  "padur1_"  
## [301] "padur2_"   "pafreq1_"  "pafreq2_"  "X_minac11" "X_minac21"
## [306] "strfreq_"  "pamiss1_"  "pamin11_"  "pamin21_"  "pa1min_"  
## [311] "pavig11_"  "pavig21_"  "pa1vigm_"  "X_pacat1"  "X_paindx1"
## [316] "X_pa150r2" "X_pa300r2" "X_pa30021" "X_pastrng" "X_parec1" 
## [321] "X_pastae1" "X_lmtact1" "X_lmtwrk1" "X_lmtscl1" "X_rfseat2"
## [326] "X_rfseat3" "X_flshot6" "X_pneumo2" "X_aidtst3" "X_age80"

Part 1: Data

More details about the data can be found at http://www.cdc.gov/brfss/. After loading the dataset, I chose to keep a subset of variables that describe respondents. First, only Minnesota responses were selected from the overall dataset.

# Keep only the rows whose state is Minnesota
brfss2013_MN <- brfss2013 %>%
  filter(X_state == "Minnesota")

Second, the following variables were selected

# The 16 BRFSS variables kept for this analysis
col_names <- c(
  "physhlth", "menthlth", "poorhlth", "veteran3",
  "marital", "children", "educa", "employ1",
  "X_incomg", "sex", "X_race", "seatbelt",
  "flushot6", "flshtmy2", "tetanus", "pneuvac3"
)

# Reduce the Minnesota data to those columns
brfss2013_MN2 <- brfss2013_MN[, col_names]

Sadly, there is no easy (and pretty) way to generate tables in R and Markdown. Details about the variables were collected from the CDC and stored in a csv. The following code reads the file and displays the variables of interest.

# Read the variable descriptions and render the first 16 rows as an HTML table
df_vars <- read.csv(
  "https://raw.githubusercontent.com/wer61537/Duke/master/vars.csv"
)
print(
  xtable(df_vars[1:16, ]),
  size = "\\fontsize{9pt}{10pt}\\selectfont",
  type = "html"
)

	Variable	Description	DataType	Levels.or.Range	Conversion
1	physhlth	Number Of Days Physical Health Not Good	integer
2	menthlth	Number Of Days Mental Health Not Good	integer
3	poorhlth	Poor Physical Or Mental Health	integer
4	veteran3	Are You A Veteran	Factor	No, Yes
5	marital	Marital Status	Factor	A member of an unmarried couple, Divorced, Married, Never married, Separated, Widowed
6	children	Computed number Of Children In Household	integer
7	educa	Education Level	Factor	College 1 year to 3 years (Some college or technical school), College 4 years or more (College graduate), Grade 12 or GED (High school graduate), Grades 1 through 8 (Elementary), Grades 9 though 11 (Some high school), Never attended school or only kindergarten
8	employ1	Employment Status	Factor	A homemaker, A student, Employed for wages, Out of work for 1 year or more, Out of work for less than 1 year, Retired, Self-employed, Unable to work
9	X_incomg	Computed Income Categories	Factor	Less than $15,000, $15,000 to less than $25,000, $25,000 to less than $35,000, $35,000 to less than $50,000, $50,000 or more
10	sex	Respondents Sex	Factor	Female, Male
11	X_race	Computed Race-Ethnicity Grouping	Factor	American Indian or Alaskan Native only, Non-Hispanic, Asian only, non-Hispanic; Black only, non-Hispanic; Hispanic; Multiracial, non-Hispanic; Native Hawaiian or other Pacific Islander only, Non-Hispanic; Other race only, non-Hispanic; White only, non-Hispanic
12	seatbelt	How Often Use Seatbelts In Car?	Factor	Always, Nearly always, Never, Never drive or ride in a car, Seldom, Sometimes
13	flushot6	Adult Flu Shot/Spray Past 12 Mos	Factor	No, Yes
14	flshtmy2	When Received Most Recent Seasonal Flu Shot/Spray	Factor	April 2012, April 2013, August 2012, August 2013, December 2012, December 2013, February 2012, February 2013, February 2014, January 2012, January 2013, January 2014, July 2012, July 2013, June 2012, June 2013, March 2012, March 2013, May 2012, May 2013, November 2012, November 2013, October 2012, October 2013, September 2012, September 2013	Change to Date and calculate days to January 1, 2014
15	tetanus	Received Tetanus Shot Since 2005?	Factor	No, did not receive any tetanus since 2005, Yes, received Tdap, Yes, received tetanus shot but not sure what type, Yes, received tetanus shot, but not Tdap	Convert to No and Yes
16	pneuvac3	Pneumonia Shot Ever	Factor	No, Yes

Several of the variables had long character strings and sometimes not memorable names. The following code renames a few variables and changes others’ values for graph friendly values.

# Add graph-friendly versions of several variables.
#
# Each recode matches on the start of, or a distinctive phrase in, the level
# text shown in the variable table, so one level cannot be relabelled by a
# looser pattern meant for another (e.g. "Native Hawaiian ..." by "Native",
# or every "... non-Hispanic" level by "Hispanic").

# Returns a character vector as long as `values`. Each element is the name of
# the (last) entry of `patterns` whose regular expression matches it, or NA
# when no pattern matches. `patterns` is a named character vector:
# names are the new labels, values are the regular expressions.
recode_by_pattern <- function(values, patterns) {
  values <- as.character(values)
  recoded <- rep(NA_character_, length(values))
  for (label in names(patterns)) {
    recoded[grepl(patterns[[label]], values)] <- label
  }
  recoded
}

brfss2013_MN2$gender <- brfss2013_MN2$sex

# Collapse the three "Yes, ..." answers into "Yes"; "No, ..." becomes "No"
brfss2013_MN2$tetanusYN <- recode_by_pattern(
  brfss2013_MN2$tetanus,
  c(Yes = "^Yes", No = "^No")
)

# Short ethnicity labels. "Native" is the American Indian / Alaskan Native
# level; the Hispanic level gets its own label instead of becoming NA.
brfss2013_MN2$ethnic <- recode_by_pattern(
  brfss2013_MN2$X_race,
  c(
    White    = "^White",
    Black    = "^Black",
    Hawaiian = "^Native Hawaiian",
    Asian    = "^Asian",
    Other    = "^Other",
    Native   = "^American Indian",
    Multi    = "^Multi",
    Hispanic = "^Hispanic"
  )
)

# Remove verbosity from the education levels. grepl() is case-sensitive, so
# the patterns use the capitalisation of the level text ("Some high school").
# NOTE(review): the "Some Colleg" label looks truncated; it is kept unchanged
# because it is the value shown in the rendered tables.
brfss2013_MN2$ed_level <- recode_by_pattern(
  brfss2013_MN2$educa,
  c(
    "Never"       = "^Never",
    "Elem"        = "Elementary",
    "Some HS"     = "Some high school",
    "HS"          = "High school graduate",
    "Some Colleg" = "Some college",
    "College"     = "College graduate"
  )
)

# Short income labels (in thousands of dollars), looked up by level text.
# A missing income stays NA.
income_labels <- c(
  "Less than $15,000"            = "<15",
  "$15,000 to less than $25,000" = "<25",
  "$25,000 to less than $35,000" = "<35",
  "$35,000 to less than $50,000" = "<50",
  "$50,000 or more"              = "50+"
)
brfss2013_MN2$income_level <- unname(
  income_labels[as.character(brfss2013_MN2$X_incomg)]
)

Part 2: Research questions

Research question 1: Is there an association between immunization for tetanus and ethnicity, educational level, income and gender? In Minnesota, state law requires every child in grades kindergarten through senior year of high school to be able to prove they have been immunized for hepatitis B, DTaP/DT, polio, MMR, tetanus, diphtheria, Tdap, meningococcal disease and varicella. Exemptions are allowed for conscientious objection or for medical reasons if supported by a health care provider. Within immigrant communities, and to a lesser extent in the larger population, immunization is suspected by some of causing autism, although this belief is not supported by scientific evidence. The Somali community in particular is suspicious of immunization after a large increase in autism rates since immigration to the United States.

Variables in the dataset to consider are tetanus, ethnic, ed_level, income_level and gender.

Research question 2: Is there an association between seatbelt use and ethnicity, educational level, income and gender? In Minnesota, all automobile (this includes cars, vans and trucks) occupants (drivers and passengers) in the front, back and other seats must be buckled up or be in an approved child restraint. Police have the right to stop and ticket drivers or passengers who are unbuckled. Does usage differ between racial and other groups?

Variables in the dataset to consider are seatbelt, ethnic, ed_level, income_level and gender.

Research question 3: Is there an association between the number of “bad” mental health days and ethnicity, educational level, income and gender? The menthlth variable is the number of days the respondent reported that their mental health was not good. The Native American population in Minnesota has high rates of alcoholism, homelessness and violence. The black population in Minnesota is concentrated in specific neighborhoods in Minneapolis and St. Paul. The recent killing of a school cafeteria supervisor perhaps highlights distress in the black communities in the state. Finally, multi-racial individuals may have higher stress due to competing ethnic heritages.

Variables in the dataset to consider are menthlth, ethnic, ed_level, income_level and gender.

Part 3: Exploratory data analysis

For each of the 16 variables in the Minnesota BRFSS2013 dataset, frequencies for each level were determined.

Number of Days Physical Health Not Good (physhlth)

66 percent of the respondents reported that they had no days where they thought their health was poor. Another 16 percent reported that their health was poor up to 5 days.

# Tally physhlth: respondents per value and their share of all rows
brfss2013_MN2 %>%
  group_by(physhlth) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [31 x 3]
## 
##    physhlth     n         freq
##       <int> <int>        <dbl>
## 1         0  9473 0.6605997211
## 2         1   666 0.0464435146
## 3         2   812 0.0566248257
## 4         3   436 0.0304044630
## 5         4   242 0.0168758717
## 6         5   408 0.0284518828
## 7         6    68 0.0047419805
## 8         7   216 0.0150627615
## 9         8    40 0.0027894003
## 10        9    10 0.0006973501
## ..      ...   ...          ...

# Bar chart of respondent counts for each physhlth value
ggplot(brfss2013_MN2, aes(x = physhlth, fill = physhlth)) +
  geom_bar()

Number of Days Mental Health Not Good (menthlth)

70 percent reported that their mental health was never “bad” but 13 percent reported that they considered their mental health poor on up to 5 days.

# Tally menthlth: respondents per value and their share of all rows
brfss2013_MN2 %>%
  group_by(menthlth) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [30 x 3]
## 
##    menthlth     n       freq
##       <int> <int>      <dbl>
## 1         0 10091 0.70369596
## 2         1   533 0.03716876
## 3         2   766 0.05341702
## 4         3   389 0.02712692
## 5         4   183 0.01276151
## 6         5   502 0.03500697
## 7         6    58 0.00404463
## 8         7   147 0.01025105
## 9         8    34 0.00237099
## 10        9     2 0.00013947
## ..      ...   ...        ...

# Bar chart of respondent counts for each menthlth value
ggplot(brfss2013_MN2, aes(x = menthlth, fill = menthlth)) +
  geom_bar()

Poor Physical Or Mental Health (poorhlth)

28 percent reported that their health was not poor. Another 10 percent reported that their health was poor up to 5 days.

# Tally poorhlth: respondents per value and their share of all rows
brfss2013_MN2 %>%
  group_by(poorhlth) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [31 x 3]
## 
##    poorhlth     n        freq
##       <int> <int>       <dbl>
## 1         0  3989 0.278172943
## 2         1   421 0.029358438
## 3         2   430 0.029986053
## 4         3   241 0.016806137
## 5         4   137 0.009553696
## 6         5   257 0.017921897
## 7         6    55 0.003835425
## 8         7   112 0.007810321
## 9         8    24 0.001673640
## 10        9     7 0.000488145
## ..      ...   ...         ...

# Bar chart of respondent counts for each poorhlth value
ggplot(brfss2013_MN2, aes(x = poorhlth, fill = poorhlth)) +
  geom_bar()

Respondent’s Service Status (veteran3)

Only 11 percent of the respondents were veterans.

# Tally veteran3: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(veteran3) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [3 x 3]
## 
##   veteran3     n        freq
##     <fctr> <int>       <dbl>
## 1      Yes  1618 0.112831241
## 2       No 12691 0.885006974
## 3       NA    31 0.002161785

# Bar chart of veteran3 counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = veteran3, fill = veteran3)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Marital Status (marital)

54 percent were married, 14 percent were divorced or separated, 10 percent widowed and 16 percent never married.

# Tally marital: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(marital) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [7 x 3]
## 
##                           marital     n        freq
##                            <fctr> <int>       <dbl>
## 1                         Married  7796 0.543654114
## 2                        Divorced  1901 0.132566248
## 3                         Widowed  1480 0.103207810
## 4                       Separated   173 0.012064156
## 5                   Never married  2432 0.169595537
## 6 A member of an unmarried couple   415 0.028940028
## 7                              NA   143 0.009972106

# Bar chart of marital counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = marital, fill = marital)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Number of Children (children)

72 percent of the respondents did not have children in the household. Households with 1 or 2 children were each at 10% and those with 3 children at 4%. This is one area that seems to differ from the general population in Minnesota. The 2000 census indicated that the average number of children in each family was 1.93 children (https://www.census.gov/population/socdemo/hh-fam/tabST-F1-2000.pdf).

# Tally children: respondents per value (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(children) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [13 x 3]
## 
##    children     n         freq
##       <int> <int>        <dbl>
## 1         0 10266 7.158996e-01
## 2         1  1524 1.062762e-01
## 3         2  1537 1.071827e-01
## 4         3   632 4.407252e-02
## 5         4   196 1.366806e-02
## 6         5    54 3.765690e-03
## 7         6    15 1.046025e-03
## 8         7    11 7.670851e-04
## 9         8     4 2.789400e-04
## 10        9     3 2.092050e-04
## 11       10     2 1.394700e-04
## 12       12     1 6.973501e-05
## 13       NA    95 6.624826e-03

# Bar chart of respondent counts for each number of children
ggplot(brfss2013_MN2, aes(x = children, fill = children)) +
  geom_bar()

Education Level (ed_level)

41 percent of the respondents graduated from college, 30 percent had some college and 24 percent graduated from high school. For 2010-2014, 33 percent of people over 25 had college degrees. 92% were graduates of high school and/or college (http://www.census.gov/quickfacts/table/PST045215/27). The respondents do seem similar to the general population as determined by the recent census.

# Tally ed_level: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(ed_level) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [6 x 3]
## 
##      ed_level     n         freq
##         <chr> <int>        <dbl>
## 1     College  5903 0.4116457462
## 2        Elem   162 0.0112970711
## 3          HS  3515 0.2451185495
## 4       Never    13 0.0009065551
## 5 Some Colleg  4250 0.2963737796
## 6          NA   497 0.0346582985

# Bar chart of ed_level counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = ed_level, fill = ed_level)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Employment Status (employ1)

58 percent were employed or self-employed, 25 percent were retired and 4 percent were unemployed. The census data indicates that 70 percent of people over 16 years old were employed. The respondents have a higher percentage of retired people compared to the census data.

# Tally employ1: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(employ1) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [9 x 3]
## 
##                            employ1     n        freq
##                             <fctr> <int>       <dbl>
## 1               Employed for wages  7213 0.502998605
## 2                    Self-employed  1199 0.083612273
## 3   Out of work for 1 year or more   275 0.019177127
## 4 Out of work for less than 1 year   313 0.021827057
## 5                      A homemaker   648 0.045188285
## 6                        A student   316 0.022036262
## 7                          Retired  3647 0.254323570
## 8                   Unable to work   619 0.043165969
## 9                               NA   110 0.007670851

# Bar chart of employ1 counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = employ1, fill = employ1)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Income Category (income_level)

The census data report that the median household income in 2014 dollars was $60,828. In this Minnesota dataset, 45 percent make more than $50,000. Since this was a telephone and cell phone survey, the respondents may not have reported the household income. 18 percent had income below $25,000.

# Tally income_level: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(income_level) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [6 x 3]
## 
##   income_level     n       freq
##          <chr> <int>      <dbl>
## 1          <15   964 0.06722455
## 2          <25  1823 0.12712692
## 3          <35  1374 0.09581590
## 4          <50  1920 0.13389121
## 5          50+  6483 0.45209205
## 6           NA  1776 0.12384937

# Bar chart of income_level counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = income_level, fill = income_level)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Respondent’s Gender (gender)

57 percent of the respondents were female. According to SuburbanStats (https://suburbanstats.org/population/how-many-people-live-in-minnesota), the percentage of women in Minnesota is 50 percent. The number of women respondents is 14 percent higher than the Minnesota fraction.

# Tally gender: respondents per level and their share of all rows
brfss2013_MN2 %>%
  group_by(gender) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [2 x 3]
## 
##   gender     n      freq
##   <fctr> <int>     <dbl>
## 1   Male  6134 0.4277545
## 2 Female  8206 0.5722455

# Bar chart of gender counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = gender, fill = gender)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Ethnicity (ethnic)

88 percent of the respondents are white, 4 percent are black and 2 percent Asian. This is somewhat different from the census data where 85.4 percent are white, 6 percent are black, 1.3 percent are Native American, 4.9 percent are Asian, 5.2 percent are Hispanic and 2.4 percent are multi-ethnic. Non-white people are under represented in the respondents.

# Tally ethnic: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(ethnic) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [7 x 3]
## 
##   ethnic     n        freq
##    <chr> <int>       <dbl>
## 1  Asian   236 0.016457462
## 2  Black   524 0.036541144
## 3  Multi   109 0.007601116
## 4 Native   141 0.009832636
## 5  Other   157 0.010948396
## 6  White 12636 0.881171548
## 7     NA   537 0.037447699

# Bar chart of ethnic counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = ethnic, fill = ethnic)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

How Often Use Seatbelts In Car? (seatbelt)

Very few respondents reported not using their seatbelts. 84 percent reported always using their seatbelts and another 5 percent reported they usually do.

# Tally seatbelt: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(seatbelt) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [7 x 3]
## 
##                       seatbelt     n        freq
##                         <fctr> <int>       <dbl>
## 1                       Always 12054 0.840585774
## 2                Nearly always   732 0.051046025
## 3                    Sometimes   212 0.014783821
## 4                       Seldom    71 0.004951185
## 5                        Never   122 0.008507671
## 6 Never drive or ride in a car    21 0.001464435
## 7                           NA  1128 0.078661088

# Bar chart of seatbelt counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = seatbelt, fill = seatbelt)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Adult Flu Shot/Spray Past 12 Mos (flushot6)

49 percent reported having a flu shot in the last 12 months.

# Tally flushot6: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(flushot6) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [3 x 3]
## 
##   flushot6     n      freq
##     <fctr> <int>     <dbl>
## 1      Yes  7019 0.4894700
## 2       No  6147 0.4286611
## 3       NA  1174 0.0818689

# Bar chart of flushot6 counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = flushot6, fill = flushot6)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

When Received Most Recent Seasonal Flu Shot/Spray (flshtmy2)

16 percent reported that their flu shot was nearly 1 year back (October 2012). Looking at the bar graph, there were more inoculations in 2012.

# Tally flshtmy2: respondents per month level and their share of all rows
brfss2013_MN2 %>%
  group_by(flshtmy2) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [24 x 3]
## 
##          flshtmy2     n         freq
##            <fctr> <int>        <dbl>
## 1    January 2012     7 0.0004881450
## 2   February 2012     5 0.0003486750
## 3      April 2012     3 0.0002092050
## 4        May 2012     3 0.0002092050
## 5       June 2012     4 0.0002789400
## 6       July 2012     8 0.0005578801
## 7     August 2012   129 0.0089958159
## 8  September 2012   672 0.0468619247
## 9    October 2012  2395 0.1670153417
## 10  November 2012  1223 0.0852859135
## ..            ...   ...          ...

# Bar chart of flshtmy2 counts, one fill colour per month level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = flshtmy2, fill = flshtmy2)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Received Tetanus Shot Since 2005? (tetanus)

Nearly 60 percent of respondents had received a tetanus shot. 22 percent had not.

# Tally tetanus: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(tetanus) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [5 x 3]
## 
##                                             tetanus     n       freq
##                                              <fctr> <int>      <dbl>
## 1                                Yes, received Tdap  3877 0.27036262
## 2          Yes, received tetanus shot, but not Tdap  1018 0.07099024
## 3 Yes, received tetanus shot but not sure what type  3786 0.26401674
## 4        No, did not receive any tetanus since 2005  3223 0.22475593
## 5                                                NA  2436 0.16987448

# Bar chart of tetanus counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = tetanus, fill = tetanus)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Pneumonia Shot Ever (pneuvac3)

Only 31 percent had received a pneumonia vaccine. The CDC recommendation is for children under 5, adults over 65 and people over 6 with specific risk factors. Age was not selected as a variable but 25 percent of the respondents were retired.

# Tally pneuvac3: respondents per level (NA included) and share of all rows
brfss2013_MN2 %>%
  group_by(pneuvac3) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))

## Source: local data frame [3 x 3]
## 
##   pneuvac3     n      freq
##     <fctr> <int>     <dbl>
## 1      Yes  4443 0.3098326
## 2       No  7191 0.5014644
## 3       NA  2706 0.1887029

# Bar chart of pneuvac3 counts, one fill colour per level;
# the x-axis title, labels and ticks are blanked.
ggplot(brfss2013_MN2, aes(x = pneuvac3, fill = pneuvac3)) +
  geom_bar() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Graphs of each variable of interest (tetanus, seatbelt and menthlth) in combination with the four other variables that might be associated (ethnic, ed_level, income_level and gender) were prepared. Tetanus and seatbelt are text responses and menthlth is numerical.

Tetanus Graphs

By ethnicity alone, 66 percent of Native American respondents reported a tetanus inoculation, 61 percent of whites, 60 percent of multi-ethnics, 54 percent of blacks, 50 percent of other (unspecified race) and 44 percent of Asians. Remember that non-whites are under-represented in the Minnesota respondents.

# ------------------------------------------
# Tetanus graphs: respondent counts per ethnic group, filled by tetanus
# response, faceted in turn by education level, income level and gender.
# ------------------------------------------

ggplot(brfss2013_MN2, aes(x = ethnic, fill = tetanus)) +
  geom_bar() +
  facet_grid(. ~ ed_level) +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

ggplot(brfss2013_MN2, aes(x = ethnic, fill = tetanus)) +
  geom_bar() +
  facet_grid(. ~ income_level) +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

ggplot(brfss2013_MN2, aes(x = ethnic, fill = tetanus)) +
  geom_bar() +
  facet_grid(. ~ gender) +
  theme(
    text = element_text(size = 8),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

Seatbelt Graphs

#==========================================
# seatbelt graphs
#==========================================
# Respondent counts per ethnic group, filled by seatbelt response, faceted
# in turn by education level, income level and gender. All three plots are
# built the same way: ggplot() has no na.rm argument, so none is passed.

ggplot(brfss2013_MN2, aes(x = ethnic, fill = seatbelt)) +
  geom_bar() +
  facet_grid(. ~ ed_level) +
  theme(
    text = element_text(size = 12),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

ggplot(brfss2013_MN2, aes(x = ethnic, fill = seatbelt)) +
  geom_bar() +
  facet_grid(. ~ income_level) +
  theme(
    text = element_text(size = 12),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

ggplot(brfss2013_MN2, aes(x = ethnic, fill = seatbelt)) +
  geom_bar() +
  facet_grid(. ~ gender) +
  theme(
    text = element_text(size = 12),
    axis.text.x = element_text(face = "bold", angle = 90, vjust = 1)
  )

Mental Health Graphs

# ------------------------------------------
# menthlth graphs: boxplots of menthlth per ethnic group, one panel per
# gender, filled first by education level and then by income level.
# ------------------------------------------
ggplot(brfss2013_MN2, aes(x = ethnic, y = menthlth, fill = ed_level)) +
  geom_boxplot() +
  facet_wrap(~ gender)

ggplot(brfss2013_MN2, aes(x = ethnic, y = menthlth, fill = income_level)) +
  geom_boxplot() +
  facet_wrap(~ gender)

Research question 1: Is there an association between immunization for tetanus and ethnicity, education, income and gender? Tetanus (and other) vaccination is required by state law, reported tetanus immunization was

Research question 2: Is there an association between seatbelt use and ethnicity, educational level, income and gender?

Research question 3: Is there an association between the number of “bad” mental health days and ethnicity, educational level, income and gender?

As noted in the two other questions, the population of Minnesota is predominantly white. While there have been significant waves of East African (Somali, Kenyan and Ethiopian) and Southeast Asian (Hmong, Cambodian, Vietnamese and Karen) immigrants, the population is still predominantly white.

Following are two faceted boxplots showing “bad” mental health days by ethnic group, gender and educational level or income level. It’s interesting that Asians have reported fewer days with little spread in values compared to all other ethnic groups. Black men report have at 15 days while black women report

Do you understand the research question? •Is it clear how data can be used to answer this research question as its phrased? •Scope of inference - generalizability: Is the data collection explained clearly? Did the writer correctly identify the population of interest? Did the writer correctly decide whether the findings from this analysis can be generalized to that population, or, if not, a subsection of that population? Is their explanation satisfactory to make this decision? Are potential sources of bias discussed, and if so, is the discussion satisfactory? •Scope of inference - causality: Did the writer identify correctly whether these data can be used to establish causal links between the variables of interest. Is the explanation satisfactory? •Are the research questions well defined and is it clear why they are of interest to the author / audience? Are appropriate numbers of variables used in each research question? •Are appropriate summary statistics calculated, and are they explained/interpreted in context of the data and the research question? •Are appropriate visualizations included, and are they explained/interpreted in context of the data and the research question? •Did the writer address what the findings from the exploratory analysis suggest about the research question?