#install.packages(tidyverse)
#install.packages(ggplot2)
#install.packages("gtsummary")
#install.packages("flextable")
library(tidyverse)
library(e1071)
library(gtsummary)
library(flextable)
library(ggplot2)
chd1500<- read.csv("chd1500.csv")
head(chd1500)
## idno age alcoh bmi choltot diabetes diabp estrogen f7 fhha fib
## 1 3500187 65 0.00000000 24.78318 187.76 1 57 1 185 0 429
## 2 3500349 65 8.01922989 21.69088 157.76 1 54 2 93 0 261
## 3 3501213 65 0.00000000 23.11192 248.76 1 80 0 123 1 361
## 4 3503127 65 0.00000000 35.77699 202.76 1 74 1 154 0 337
## 5 3503380 65 0.00000000 29.25887 211.76 1 68 1 124 0 402
## 6 3503720 65 0.03846154 25.99083 203.76 1 69 0 128 0 307
## sex glucose hdl height hip htnmed hyper incchd insulin kcal lipid pkyrs
## 1 0 86 83 159.5 103 0 0 0 5 3780.0 0 0.0
## 2 0 92 45 162.0 90 0 0 0 7 4012.5 0 0.0
## 3 0 102 51 156.0 91 0 2 0 12 135.0 0 0.0
## 4 0 105 47 162.0 123 1 2 0 22 1387.5 0 0.0
## 5 0 101 45 153.0 99 1 2 0 15 982.5 0 55.5
## 6 0 94 66 165.0 103 0 1 0 14 622.5 0 0.0
## racebw smoke sysbp trig waist weight wgt50
## 1 0 1 115 91 79.49999 139.0 150
## 2 0 1 97 154 70.00000 125.5 124
## 3 0 1 171 125 79.00000 124.0 128
## 4 0 1 107 140 103.00000 207.0 190
## 5 0 2 140 201 96.00000 151.0 140
## 6 0 1 142 127 87.00000 156.0 128
# Ensuring the Diabetes, Family History, CHD and Estrogen variables are factors
# Diabetes status
chd1500$diabetes<- factor(chd1500$diabetes, levels = c(1,2,3),
labels = c("Normal", "Impaired", "Diabetic"))
# MI family history
#chd1500$fhha<- factor(chd1500$fhha, levels = c(0,1),
# labels = c("No", "Yes"))
chd1500$fhha <- factor(chd1500$fhha, levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL)
# Sex
chd1500$sex <- factor(chd1500$sex, levels = c(0,1),
labels = c("Female", "Male"))
# Coronary heart disease (CHD)
chd1500$incchd <- factor(chd1500$incchd, levels = c(0,1),
labels = c("No", "Yes"))
# Estrogen
chd1500$estrogen <- factor(chd1500$estrogen, levels = c(0,1,2),labels = c("Never", "Past", "Current"))
#family history of heart attack
chd1500$fhha <- factor(chd1500$fhha, levels = c(0,1),
labels = c("No", "Yes"))
#Smoking status
chd1500$smoke <- factor(chd1500$smoke, levels = c(0,1,3),
labels = c("Never", "Former", "Current"))
#Summary of Data
summary.data.frame(chd1500, maxsum = 100)
## idno age alcoh bmi
## Min. :3000028 Min. :65.00 Min. : 0.00000 Min. :14.65
## 1st Qu.:3503638 1st Qu.:68.00 1st Qu.: 0.00000 1st Qu.:23.31
## Median :4503820 Median :71.00 Median : 0.01923 Median :25.91
## Mean :4601708 Mean :72.36 Mean : 2.35205 Mean :26.43
## 3rd Qu.:5505188 3rd Qu.:76.00 3rd Qu.: 1.01923 3rd Qu.:28.89
## Max. :6503950 Max. :98.00 Max. :77.00000 Max. :49.41
## choltot diabetes diabp estrogen
## Min. : 77.76 Normal :1105 Min. : 40.00 Never :542
## 1st Qu.:185.76 Impaired: 201 1st Qu.: 63.00 Past :261
## Median :211.76 Diabetic: 194 Median : 70.00 Current:113
## Mean :212.01 Mean : 70.28 NA's :584
## 3rd Qu.:235.01 3rd Qu.: 77.00
## Max. :406.76 Max. :107.00
## f7 fhha fib sex glucose
## Min. : 41.0 No : 0 Min. :177.0 Female:916 Min. : 63.0
## 1st Qu.:108.0 Yes : 0 1st Qu.:277.0 Male :584 1st Qu.: 94.0
## Median :124.0 NA's:1500 Median :311.0 Median :101.0
## Mean :126.7 Mean :319.7 Mean :108.4
## 3rd Qu.:143.0 3rd Qu.:358.0 3rd Qu.:110.0
## Max. :291.0 Max. :872.0 Max. :657.0
## hdl height hip htnmed
## Min. : 15.00 Min. :133.0 Min. : 76.5 Min. :0.0000
## 1st Qu.: 44.00 1st Qu.:157.2 1st Qu.: 95.0 1st Qu.:0.0000
## Median : 52.00 Median :163.5 Median :100.5 Median :0.0000
## Mean : 54.75 Mean :164.4 Mean :101.6 Mean :0.3693
## 3rd Qu.: 63.00 3rd Qu.:171.5 3rd Qu.:106.5 3rd Qu.:1.0000
## Max. :149.00 Max. :191.5 Max. :155.0 Max. :1.0000
## hyper incchd insulin kcal lipid
## Min. :0.00 No :1282 Min. : 5 Min. : 0.0 Min. :0.000
## 1st Qu.:0.00 Yes: 218 1st Qu.: 9 1st Qu.: 457.5 1st Qu.:0.000
## Median :1.00 Median : 12 Median : 1140.0 Median :0.000
## Mean :0.91 Mean : 16 Mean : 1847.9 Mean :0.042
## 3rd Qu.:2.00 3rd Qu.: 17 3rd Qu.: 2528.4 3rd Qu.:0.000
## Max. :2.00 Max. :400 Max. :13640.0 Max. :1.000
## pkyrs racebw smoke sysbp
## Min. : 0.00 Min. :0.00000 Never : 0 Min. : 79.0
## 1st Qu.: 0.00 1st Qu.:0.00000 Former :787 1st Qu.:120.0
## Median : 0.00 Median :0.00000 Current:159 Median :133.0
## Mean : 16.49 Mean :0.03333 NA's :554 Mean :135.2
## 3rd Qu.: 26.62 3rd Qu.:0.00000 3rd Qu.:148.0
## Max. :240.00 Max. :1.00000 Max. :221.0
## trig waist weight wgt50
## Min. : 46.0 Min. : 61.50 Min. : 76.8 Min. : 80.0
## 1st Qu.: 94.0 1st Qu.: 85.00 1st Qu.:134.0 1st Qu.:130.0
## Median : 124.0 Median : 93.00 Median :155.8 Median :150.0
## Mean : 140.8 Mean : 93.55 Mean :158.0 Mean :151.8
## 3rd Qu.: 163.2 3rd Qu.:101.75 3rd Qu.:177.3 3rd Qu.:168.2
## Max. :1216.0 Max. :138.00 Max. :282.0 Max. :320.0
# Summary table of frequency and percent for CHD and diabetes status.
chd1500 %>% select(incchd, diabetes) %>%
tbl_summary(missing = "ifany",
digits = list(all_categorical() ~ c(0,1)),
label = list(incchd ~ "CHD",
diabetes ~ "Diabetes status")) %>%
modify_header(label = "**Variable**") %>%
as_flex_table()
Variable | N = 1,5001 |
|---|---|
CHD | 218 (14.5%) |
Diabetes status | |
Normal | 1,105 (73.7%) |
Impaired | 201 (13.4%) |
Diabetic | 194 (12.9%) |
1n (%) | |
# Summary table of frequency and percent for CHD and diabetes status.
chd1500 %>% select(incchd, diabetes,fhha, sex,smoke) %>%
tbl_summary(missing = "ifany",
type = incchd ~ "categorical",
digits = list(all_categorical() ~ c(0,1)),
label = list(incchd ~ "CHD",
sex ~ "Sex",
fhha ~"Family history of heart attack",
smoke ~ "Smoking status ",
diabetes ~ "Diabetes status")) %>%
modify_header(label = "**Variable**") %>%
as_flex_table()
Variable | N = 1,5001 |
|---|---|
CHD | |
No | 1,282 (85.5%) |
Yes | 218 (14.5%) |
Diabetes status | |
Normal | 1,105 (73.7%) |
Impaired | 201 (13.4%) |
Diabetic | 194 (12.9%) |
Family history of heart attack | 0 (NA%) |
Unknown | 1,500 |
Sex | |
Female | 916 (61.1%) |
Male | 584 (38.9%) |
Smoking status | |
Never | 0 (0.0%) |
Former | 787 (83.2%) |
Current | 159 (16.8%) |
Unknown | 554 |
1n (%) | |
# Create a bar chart of (incident) CHD.
barplot(table(chd1500$incchd),
xlab= "CHD", ylab = "No. of subjects")
# Basic histogram of age
hist(chd1500$age, xlab="Age", main="Age")
# Basic histogram of Alcohol Consumption
hist(chd1500$alcoh, xlab="Alcohol Consumption", main="ALcohol Consumption")
# Basic histogram of BMI
hist(chd1500$bmi, xlab="BMI", main="BMI")
# Basic histogram of HDL Cholesterol
hist(chd1500$hdl, xlab="HDL Cholesterol", main="HDL Cholesterol")
# Basic histogram of Insulin
hist(chd1500$insulin, xlab="Serum Insulin", main="Serum Insulin")
# Basic histogram of Pack-years of smoking
hist(chd1500$pkyrs, xlab="Pack-years of smoking ", main="Pack-years of smoking ")
#Nicely formatted histogram of HDL cholesterol.
ggplot(data=chd1500, aes(x=hdl)) +
geom_histogram(color="black", fill="blue", binwidth=10) +
labs(x="HDL cholesterol (mg/dL)", y="No. of subjects",
title="Histogram of HDL cholesterol") +
scale_y_continuous(limits=c(0, 500), breaks=seq(0,500,100),
expand = c(0,0)) +
theme_classic()
##descriptive statisticsfor each of the variables
chd1500 %>% select(age, alcoh, bmi, hdl, insulin,pkyrs) %>%
tbl_summary(missing = "ifany",
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c("{mean} ({sd})",
"{median} ({p25}, {p75})",
"{min}, {max}",
"{skewness}"),
digits = list(all_continuous() ~ 1),
label = list(age ~ "Age (y)",
alcoh ~ "Alcohol consumption (drinks/wk)",
bmi ~"BMI",
hdl ~"HDL cholesterol(mg/dl) ",
insulin ~"Serum insulin (IU/ml)",
pkyrs ~"Pack-years of smoking ")
) %>%
modify_header(label = "**Variable**") %>%
as_flex_table()
Variable | N = 1,500 |
|---|---|
Age (y) | |
Mean (SD) | 72.4 (5.4) |
Median (Q1, Q3) | 71.0 (68.0, 76.0) |
Min, Max | 65.0, 98.0 |
skewness | 1.0 |
Alcohol consumption (drinks/wk) | |
Mean (SD) | 2.4 (6.2) |
Median (Q1, Q3) | 0.0 (0.0, 1.0) |
Min, Max | 0.0, 77.0 |
skewness | 4.8 |
BMI | |
Mean (SD) | 26.4 (4.6) |
Median (Q1, Q3) | 25.9 (23.3, 28.9) |
Min, Max | 14.7, 49.4 |
skewness | 0.9 |
HDL cholesterol(mg/dl) | |
Mean (SD) | 54.7 (15.5) |
Median (Q1, Q3) | 52.0 (44.0, 63.0) |
Min, Max | 15.0, 149.0 |
skewness | 1.0 |
Serum insulin (IU/ml) | |
Mean (SD) | 16.0 (20.6) |
Median (Q1, Q3) | 12.0 (9.0, 17.0) |
Min, Max | 5.0, 400.0 |
skewness | 12.8 |
Pack-years of smoking | |
Mean (SD) | 16.5 (26.2) |
Median (Q1, Q3) | 0.0 (0.0, 26.8) |
Min, Max | 0.0, 240.0 |
skewness | 2.2 |
#creating a summary table of CHD by diabetes status and sex
chd1500 %>% select(incchd, diabetes, sex) %>%
tbl_summary(by=incchd,
missing = "ifany",
percent = "row",
digits = list(all_categorical() ~ c(0,1)),
label = list(diabetes ~ "Diabetes status", sex ~ "Sex")
) %>%
modify_header(label = "**Variable**") %>%
as_flex_table()
Variable | No | Yes |
|---|---|---|
Diabetes status | ||
Normal | 963 (87.1%) | 142 (12.9%) |
Impaired | 167 (83.1%) | 34 (16.9%) |
Diabetic | 152 (78.4%) | 42 (21.6%) |
Sex | ||
Female | 818 (89.3%) | 98 (10.7%) |
Male | 464 (79.5%) | 120 (20.5%) |
1n (%) | ||
#creating boxplots of HDL cholesterol by CHD
ggplot(data=chd1500, aes(x=incchd, y=hdl, fill=incchd)) +
geom_boxplot() +
labs(x="CHD",
y="HDL cholesterol (mg/dL)",
title="Boxplot of HLD cholesterol by CHD") +
scale_y_continuous(limits=c(0, 160), breaks=seq(0,160,20)) +
theme_classic() +
theme(legend.position="none")
#creating boxplots of age by CHD
ggplot(data=chd1500, aes(x=incchd, y=age, fill=incchd)) +
geom_boxplot() +
labs(x="CHD",
y="Age in years",
title="Boxplot of Age by CHD") +
scale_y_continuous(limits=c(60, 100), breaks=seq(0,160,10)) +
theme_classic() +
theme(legend.position="none")
#relationship of CHD with HDL cholesterol and age by a summary table
chd1500 %>% select(incchd, hdl, age) %>%
tbl_summary(by=incchd,
missing = "ifany",
type = all_continuous() ~ "continuous2",
statistic = all_continuous() ~ c("{mean} ({sd})",
"{median} ({p25}, {p75})",
"{min}, {max}",
"{skewness}"),
digits = list(all_continuous() ~ 1),
label = list(hdl ~ "HDL cholesterol (mg/dL)",age ~ "Age (y)" )
) %>%
modify_header(label = "**Variable**") %>%
as_flex_table()
Variable | No | Yes |
|---|---|---|
HDL cholesterol (mg/dL) | ||
Mean (SD) | 55.2 (15.5) | 51.8 (14.9) |
Median (Q1, Q3) | 53.0 (44.0, 64.0) | 49.0 (42.0, 59.0) |
Min, Max | 15.0, 149.0 | 21.0, 129.0 |
skewness | 1.0 | 1.4 |
Age (y) | ||
Mean (SD) | 72.2 (5.4) | 73.2 (5.6) |
Median (Q1, Q3) | 71.0 (68.0, 75.0) | 72.0 (68.0, 77.0) |
Min, Max | 65.0, 98.0 | 66.0, 90.0 |
skewness | 1.1 | 0.7 |