Exploring the BRFSS data

Part 3: Exploratory data analysis

NOTE: Insert code chunks as needed by clicking on the “Insert a new code chunk” button (green button with orange arrow) above. Make sure that your code is visible in the project you submit. Delete this note before you submit your work.

Research question 1:

## Selecting the income level, diabetes status and state variables across the dataset
## Question 1: Is there any relationship between income level and the prevalence of diabetes per state?
##
library(dplyr)
library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(vcd)

## Warning: package 'vcd' was built under R version 3.3.3

## Loading required package: grid

library(raster)

## Warning: package 'raster' was built under R version 3.3.3

## Loading required package: sp

## Warning: package 'sp' was built under R version 3.3.3

## 
## Attaching package: 'raster'

## The following object is masked from 'package:vcd':
## 
##     mosaic

## The following object is masked from 'package:dplyr':
## 
##     select

library(ggmosaic)

## Warning: package 'ggmosaic' was built under R version 3.3.3

## Loading required package: productplots

## Warning: package 'productplots' was built under R version 3.3.3

## 
## Attaching package: 'productplots'

## The following object is masked from 'package:raster':
## 
##     mosaic

## The following objects are masked from 'package:vcd':
## 
##     mosaic, spine, tile

## 
## Attaching package: 'ggmosaic'

## The following objects are masked from 'package:productplots':
## 
##     ddecker, hspine, mosaic, prodcalc, spine, vspine

## The following object is masked from 'package:raster':
## 
##     mosaic

## The following objects are masked from 'package:vcd':
## 
##     mosaic, spine

#question1 <- select_(brfss2013, State= brfss2013$X_state, diabete=brfss2013$diabete3,income=brfss2013$income2)
#mosaic(income2~diabete3)

# Conditional density plot of diabetes status (diabete3) against income
# level (income2).
cdplot(diabete3 ~ income2, data = brfss2013)

# Mosaic plot of income level, with each tile filled by diabetes status.
ggplot(data = brfss2013) +
  geom_mosaic(aes(x = product(income2), fill = diabete3)) +
  labs(x = "Income Level", y = "diabete status") +
  theme(
    panel.background = NULL,
    axis.text.x = element_text(angle = 90, vjust = 1)
  )

Research question 1 (continued): The plots show that the share of respondents with a positive diabetes diagnosis decreases as income increases. There appears to be a negative relationship between diabetes diagnosis and income level.

Research question 2:

What is the relationship between the rate of diagnosed diabetes and the rate of heart attacks recorded in populations across the states?

# Build `all_counts`: one row per state holding the frequency of each
# diabete3 answer, the frequency of each cvdinfr4 answer and the mean of
# X_bmi5 for that state.

# Per-state frequency tables. plyr::count() is named explicitly because the
# bare name `count` only resolves to plyr's version while plyr happens to be
# attached after dplyr; dplyr::count() does not accept a plain vector.
diabetes_by_state <- tapply(brfss2013$diabete3, brfss2013$X_state, plyr::count)
heart_by_state <- tapply(brfss2013$cvdinfr4, brfss2013$X_state, plyr::count)

# Data-frame views of the same tables, kept under their original names.
lstate <- data.frame(diabetes_by_state)
hstate <- data.frame(heart_by_state)

names(lstate) <- c("bystate")
names(hstate) <- c("bystate")
#lstate <- lstate[-1] ### removing the first line

all_counts <- data.frame(states = unique(brfss2013$X_state))

# Placeholder columns. Rows that are never filled in stay NA and are removed
# by na.omit() at the end.
count_cols <- c(
  "diabete.Yes", "diabete.Yes.Preg", "diabete.No", "diabete.No.bord",
  "diabete.NA", "heartattack.Yes", "heartattack.No", "heartattack.NA", "BMI"
)
all_counts[count_cols] <- NA

# Loop invariants: state labels of the frequency tables, the labels present
# in all_counts, and a lower-cased copy of the state column for the BMI lookup.
state_names <- names(diabetes_by_state)
known_states <- as.character(all_counts$states)
state_lower <- tolower(brfss2013$X_state)

for (i in seq_along(diabetes_by_state)) {
  diabetes_counts <- diabetes_by_state[[i]]
  heart_counts <- heart_by_state[[i]]
  index <- match(state_names[i], known_states)

  # The frequencies are read by position, so a state is only filled in when
  # its tables have exactly five diabete3 rows and three cvdinfr4 rows.
  # `&&` with NROW() keeps the condition a single TRUE/FALSE even when the
  # state is not found or has no table at all.
  usable <- !is.na(index) &&
    NROW(diabetes_counts) == 5L &&
    NROW(heart_counts) == 3L
  if (!usable) {
    next
  }

  all_counts$diabete.Yes[index] <- diabetes_counts$freq[1]
  all_counts$diabete.Yes.Preg[index] <- diabetes_counts$freq[2]
  all_counts$diabete.No[index] <- diabetes_counts$freq[3]
  all_counts$diabete.No.bord[index] <- diabetes_counts$freq[4]
  all_counts$diabete.NA[index] <- diabetes_counts$freq[5]

  all_counts$heartattack.Yes[index] <- heart_counts$freq[1]
  all_counts$heartattack.No[index] <- heart_counts$freq[2]
  all_counts$heartattack.NA[index] <- heart_counts$freq[3]

  # Mean X_bmi5 over the respondents of this state, ignoring missing values.
  in_state <- state_lower == tolower(state_names[i])
  all_counts$BMI[index] <- mean(brfss2013$X_bmi5[in_state], na.rm = TRUE)
}

# Keep only the states for which every column was filled in.
all_counts <- na.omit(all_counts)

# Per-state scatter plot of heart-attack "Yes" counts against diabetes "Yes"
# counts, coloured by the heart-attack count, with a fitted linear model line.
ggplot(data = all_counts, aes(x = diabete.Yes, y = heartattack.Yes)) +
  geom_point(aes(colour = heartattack.Yes)) +
  scale_colour_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, se = FALSE)

Research question 2 continues

The plot above suggests a strong positive correlation, across states, between the number of respondents diagnosed with diabetes and the number diagnosed with a heart attack. The blue line is the linear model fitted to the per-state diabetes and heart attack counts. Florida has the highest count of respondents diagnosed with diabetes and the highest count of diagnosed heart attacks, and appears in the top right corner of the graph. Because these are raw counts rather than rates, they also depend on how many respondents were surveyed in each state.

## State with the largest number of respondents diagnosed with diabetes
with(all_counts, states[which(diabete.Yes == max(diabete.Yes))])

## [1] Florida
## 55 Levels: 0 Alabama Alaska Arizona Arkansas California ... 80

## State with the largest number of respondents diagnosed with a heart attack

with(all_counts, states[which(heartattack.Yes == max(heartattack.Yes))])

## [1] Florida
## 55 Levels: 0 Alabama Alaska Arizona Arkansas California ... 80

Research question 3:

Is there any correlation between body mass index, age and diabetes status for the population of the state of Florida?

# Rows of brfss2013 whose X_state is "florida" (case-insensitive comparison).
florida <- brfss2013[tolower(brfss2013$X_state) == "florida", ]

# Frequency tables of X_bmi5, diabete3 and sex within Florida, grouped by
# iyear. plyr::count() is named explicitly so the call does not depend on
# plyr masking dplyr::count(), which does not accept a plain vector.
florida_yearly <- data.frame(
  BMI = tapply(florida$X_bmi5, florida$iyear, plyr::count),
  diabete = tapply(florida$diabete3, florida$iyear, plyr::count),
  Sex = tapply(florida$sex, florida$iyear, plyr::count)
)

# Histogram of the per-state mean BMI values stored in all_counts.
# The outline colour is set as a constant outside aes(). Inside aes() the
# string "blue" is treated as a data value, mapped through the default colour
# scale and shown in a legend, instead of drawing the bars in blue.
ggplot(all_counts, aes(x = BMI)) +
  geom_histogram(colour = "blue", binwidth = 20)

# Per-state scatter plot of diabetes "Yes" counts against mean BMI, coloured
# by the diabetes count, with a fitted linear model line.
ggplot(data = all_counts, aes(x = BMI, y = diabete.Yes)) +
  geom_point(aes(colour = diabete.Yes)) +
  scale_colour_gradient(low = "green", high = "red") +
  geom_smooth(method = lm, se = FALSE)

Research question 3 (continued)

The plots above suggest that BMI and diabetes are correlated across states, but the association between BMI and the diabetes status of respondents is not strong enough

# Correlation between the per-state diabetes "Yes" count and mean BMI.
with(all_counts, cor(diabete.Yes, BMI))

## [1] 0.1776073

to be considered the major or the only cause of the respondents' diabetes status.

Exploring the BRFSS data

Setup

Load packages

Load data

Part 1: Data

Part 2: Research questions

Part 3: Exploratory data analysis