library(dplyr)
library(ggplot2)
library(reshape2)
library(psych)
setwd("~/Desktop/Coursera/DevelopingDataProducts/CourseProject-ShinyAppReproduciblePitch")

The Framingham Heart Study (FHS)

Some Findings of the FHS

The Framingham Dataset

  • We have a subset of the original dataset, pertaining to 4434 patients.
  • Each of these patients was followed for a total of 24 years, but we only have data on their clinical examinations for a period of 12 years, spanning 1956 to 1968.
    • The data is provided in longitudinal form.
    • Each participant has 1 to 3 observations.
    • There are a total of 11627 rows in the data.

Columns in the Framingham Dataset

Here are the first six columns and rows of the dataset:

##   RANDID SEX TOTCHOL AGE SYSBP DIABP
## 1   2448   1     195  39 106.0  70.0
## 2   2448   1     209  52 121.0  66.0
## 3   6238   2     250  46 121.0  81.0
## 4   6238   2     260  52 105.0  69.5
## 5   6238   2     237  58 108.0  66.0
## 6   9428   1     245  48 127.5  80.0
  • RANDID refers to the patient id.
  • The remaining columns are measurements on the patients during their examinations, and outcome measures.

Measurements in Framingham Dataset

Variable Explanation
SEX 1 for males and 2 for females.
PERIOD An integer between 1 and 3 denoting the examination number.
AGE Age at examination (in years).
SYSBP, DIABP Systolic and diastolic blood pressure.

Measurements in Our Dataset | cont’d

Variable Explanation
CIGPDAY Number of cigarettes smoked per day.
BMI Body Mass Index.
LDLC Low density cholesterol.
TOTCHOL Total cholesterol.
HYPERTEN 1 if patient has hypertension, which is defined to be Systolic greater than 140mmHg or Diastolic greater than 90mmHg.

Outcome Measures in Our Dataset

Variable Explanation
STROKE 1 if patient experienced a stroke during the 12 years and 0 otherwise.
ANYCHD 1 if patient experienced a Coronary Heart Disease during the 12 years and 0 otherwise.
CVD 1 if patient experienced a Cerebrovascular Disease during the 12 years and 0 otherwise.

Framingham Graphs

We wish to know whether the mean heart rate of “healthy” people at exam 1 is equal to 70 beats per minute (bpm).

fhs <- read.csv("/Users/Al/Desktop/Coursera/DevelopingDataProducts/CourseProject-ShinyAppReproduciblePitch/data/frmgham2.csv")
colnames(fhs) <- toupper(colnames(fhs))
with(fhs, summary(HEARTRTE))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   37.00   69.00   75.00   76.78   85.00  220.00       6
ggplot(fhs, aes(HEARTRTE)) +
  geom_histogram(color = "black", fill = "white")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

#
fhs.healthy <- filter(fhs, PERIOD == 1 & CURSMOKE == 0 & BMI < 25 & DIABP < 80 
                      & SYSBP < 120 & DIABETES == 0)
with(fhs.healthy, describe(HEARTRTE))
##   vars   n mean    sd median trimmed   mad min max range skew kurtosis  se
## 1    1 247 70.7 11.02     70   69.93 10.38  47 125    78 0.94     2.16 0.7
ggplot(fhs.healthy, aes(x = HEARTRTE, y = ..density..)) + 
  geom_histogram(color = "black", fill = "white") +
  geom_density(fill = "green", alpha = 0.2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

fhs.wide <- dcast(fhs, RANDID + SEX ~ PERIOD, value.var = "HEARTRTE")
colnames(fhs.wide)[3:5] <- c("HEARTRTE1", "HEARTRTE2", "HEARTRTE3")
tbl_df(fhs.wide)
## Source: local data frame [4,434 x 5]
## 
##    RANDID SEX HEARTRTE1 HEARTRTE2 HEARTRTE3
## 1    2448   1        80        NA        69
## 2    6238   2        95        80        80
## 3    9428   1        75        75        NA
## 4   10552   2        65        60        NA
## 5   11252   2        85        90        74
## 6   11263   2        77       120        86
## 7   12629   2        60        80        NA
## 8   12806   2        79        75        75
## 9   14367   1        76        85        92
## 10  16365   1        93        75        75
## ..    ... ...       ...       ...       ...
set.seed(12345)
fhs.asample <- sample_n(fhs.wide, 200)
ggplot(fhs.asample, aes(x = HEARTRTE1 - HEARTRTE2, y = ..density..)) + 
  geom_histogram(color = "black", fill = "white") +
  geom_density(fill = "red", alpha = 0.2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: Removed 25 rows containing non-finite values (stat_density).

ggplot(fhs.wide, aes(HEARTRTE1, fill = as.factor(SEX))) + 
  geom_density(alpha = 0.2)
## Warning: Removed 1 rows containing non-finite values (stat_density).

ggplot(filter(fhs, PERIOD == 1), aes(x = as.factor(EDUC), y = BMI)) + 
  geom_boxplot() +
  stat_summary(fun.y = "mean", geom = "point", shape = 4)
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).
## Warning: Removed 19 rows containing missing values (stat_summary).

Shiny Product

Input Panel 1

inputPanel(
  radioButtons("x_var", label="Explanatory Variable", 
    choices=c("BMI", "LDLC", "TOTCHOL"), selected="BMI"),
  
  radioButtons("y_var", label="Response Variable", 
    choices=c("CVD", "STROKE", "ANYCHD"), selected="CVD"),
  
  checkboxInput("con_gender", label="Condition on Gender", value=FALSE)
)

Shiny Product

Input Panel 2

inputPanel(
  sliderInput("bmi_range", label="BMI range", min=10,
  max=60, value=c(10,60)),
  sliderInput("ldlc_range", label="LDLC range", min=20,
  max=565, value=c(20,565)),
  sliderInput("tot_range", label="Total Chol. range", min=112,
  max=625, value=c(112,625))
)

Output Panel

  x <- fhs.only3[,input$x_var]
  y <- fhs.only3[,input$y_var]
  
  if(input$x_var == "BMI") {
    x.range <- input$bmi_range
  } else if (input$x_var == "LDLC") {
    x.range <- input$ldlc_range
  } else 
    x.range <- input$tot_range

Output Panel

  if(input$con_gender){
    p <- qplot(x,y, xlab=input$x_var, ylab=input$y_var, facets = . ~ SEX, data=fhs.only3)
  } else {
    p <- qplot(x,y, xlab=input$x_var,ylab=input$y_var) 
  } 
  p + xlim(x.range) + stat_smooth(method="glm", family="binomial") + 
  geom_point(position=position_jitter(height=0.02))