library(dplyr)
library(ggplot2)
library(reshape2)
library(psych)
setwd("~/Desktop/Coursera/DevelopingDataProducts/CourseProject-ShinyAppReproduciblePitch")
- We have a subset of the original dataset, pertaining to 4434 patients.
- Each of these patients was followed for a total of 24 years, but we only have data on their clinical examinations for a period of 12 years, spanning 1956 to 1968.
- The data is provided in longitudinal form.
- Each participant has 1 to 3 observations.
- There are a total of 11627 rows in the data.
Here are the first six columns and rows of the dataset:
## RANDID SEX TOTCHOL AGE SYSBP DIABP
## 1 2448 1 195 39 106.0 70.0
## 2 2448 1 209 52 121.0 66.0
## 3 6238 2 250 46 121.0 81.0
## 4 6238 2 260 52 105.0 69.5
## 5 6238 2 237 58 108.0 66.0
## 6 9428 1 245 48 127.5 80.0
RANDIDrefers to the patient id.- The remaining columns are measurements on the patients during their examinations, and outcome measures.
| Variable | Explanation |
|---|---|
| SEX | 1 for males and 2 for females. |
| PERIOD | An integer between 1 and 3 denoting the examination number. |
| AGE | Age at examination (in years). |
| SYSBP, DIABP | Systolic and diastolic blood pressure. |
| Variable | Explanation |
|---|---|
| CIGPDAY | Number of cigarettes smoked per day. |
| BMI | Body Mass Index. |
| LDLC | Low density cholesterol. |
| TOTCHOL | Total cholesterol. |
| HYPERTEN | 1 if patient has hypertension, which is defined to be Systolic greater than 140mmHg or Diastolic greater than 90mmHg. |
| Variable | Explanation |
|---|---|
| STROKE | 1 if patient experienced a stroke during the 12 years and 0 otherwise. |
| ANYCHD | 1 if patient experienced a Coronary Heart Disease during the 12 years and 0 otherwise. |
| CVD | 1 if patient experienced a Cerebrovascular Disease during the 12 years and 0 otherwise. |
fhs <- read.csv("/Users/Al/Desktop/Coursera/DevelopingDataProducts/CourseProject-ShinyAppReproduciblePitch/data/frmgham2.csv")
colnames(fhs) <- toupper(colnames(fhs))
with(fhs, summary(HEARTRTE))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 37.00 69.00 75.00 76.78 85.00 220.00 6
ggplot(fhs, aes(HEARTRTE)) +
geom_histogram(color = "black", fill = "white")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
#
fhs.healthy <- filter(fhs, PERIOD == 1 & CURSMOKE == 0 & BMI < 25 & DIABP < 80
& SYSBP < 120 & DIABETES == 0)
with(fhs.healthy, describe(HEARTRTE))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## 1 1 247 70.7 11.02 70 69.93 10.38 47 125 78 0.94 2.16 0.7
ggplot(fhs.healthy, aes(x = HEARTRTE, y = ..density..)) +
geom_histogram(color = "black", fill = "white") +
geom_density(fill = "green", alpha = 0.2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
fhs.wide <- dcast(fhs, RANDID + SEX ~ PERIOD, value.var = "HEARTRTE")
colnames(fhs.wide)[3:5] <- c("HEARTRTE1", "HEARTRTE2", "HEARTRTE3")
tbl_df(fhs.wide)
## Source: local data frame [4,434 x 5]
##
## RANDID SEX HEARTRTE1 HEARTRTE2 HEARTRTE3
## 1 2448 1 80 NA 69
## 2 6238 2 95 80 80
## 3 9428 1 75 75 NA
## 4 10552 2 65 60 NA
## 5 11252 2 85 90 74
## 6 11263 2 77 120 86
## 7 12629 2 60 80 NA
## 8 12806 2 79 75 75
## 9 14367 1 76 85 92
## 10 16365 1 93 75 75
## .. ... ... ... ... ...
set.seed(12345)
fhs.asample <- sample_n(fhs.wide, 200)
ggplot(fhs.asample, aes(x = HEARTRTE1 - HEARTRTE2, y = ..density..)) +
geom_histogram(color = "black", fill = "white") +
geom_density(fill = "red", alpha = 0.2)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: Removed 25 rows containing non-finite values (stat_density).
ggplot(fhs.wide, aes(HEARTRTE1, fill = as.factor(SEX))) +
geom_density(alpha = 0.2)
## Warning: Removed 1 rows containing non-finite values (stat_density).
ggplot(filter(fhs, PERIOD == 1), aes(x = as.factor(EDUC), y = BMI)) +
geom_boxplot() +
stat_summary(fun.y = "mean", geom = "point", shape = 4)
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).
## Warning: Removed 19 rows containing missing values (stat_summary).
inputPanel(
radioButtons("x_var", label="Explanatory Variable",
choices=c("BMI", "LDLC", "TOTCHOL"), selected="BMI"),
radioButtons("y_var", label="Response Variable",
choices=c("CVD", "STROKE", "ANYCHD"), selected="CVD"),
checkboxInput("con_gender", label="Condition on Gender", value=FALSE)
)
inputPanel(
sliderInput("bmi_range", label="BMI range", min=10,
max=60, value=c(10,60)),
sliderInput("ldlc_range", label="LDLC range", min=20,
max=565, value=c(20,565)),
sliderInput("tot_range", label="Total Chol. range", min=112,
max=625, value=c(112,625))
)
x <- fhs.only3[,input$x_var]
y <- fhs.only3[,input$y_var]
if(input$x_var == "BMI") {
x.range <- input$bmi_range
} else if (input$x_var == "LDLC") {
x.range <- input$ldlc_range
} else
x.range <- input$tot_range
if(input$con_gender){
p <- qplot(x,y, xlab=input$x_var, ylab=input$y_var, facets = . ~ SEX, data=fhs.only3)
} else {
p <- qplot(x,y, xlab=input$x_var,ylab=input$y_var)
}
p + xlim(x.range) + stat_smooth(method="glm", family="binomial") +
geom_point(position=position_jitter(height=0.02))