#install.packages(tidyverse)
#install.packages(ggplot2)
#install.packages("gtsummary")
#install.packages("flextable")

library(tidyverse)
library(e1071)
library(gtsummary) 
library(flextable)
library(ggplot2)

chd1500<- read.csv("chd1500.csv")
head(chd1500)
##      idno age      alcoh      bmi choltot diabetes diabp estrogen  f7 fhha fib
## 1 3500187  65 0.00000000 24.78318  187.76        1    57        1 185    0 429
## 2 3500349  65 8.01922989 21.69088  157.76        1    54        2  93    0 261
## 3 3501213  65 0.00000000 23.11192  248.76        1    80        0 123    1 361
## 4 3503127  65 0.00000000 35.77699  202.76        1    74        1 154    0 337
## 5 3503380  65 0.00000000 29.25887  211.76        1    68        1 124    0 402
## 6 3503720  65 0.03846154 25.99083  203.76        1    69        0 128    0 307
##   sex glucose hdl height hip htnmed hyper incchd insulin   kcal lipid pkyrs
## 1   0      86  83  159.5 103      0     0      0       5 3780.0     0   0.0
## 2   0      92  45  162.0  90      0     0      0       7 4012.5     0   0.0
## 3   0     102  51  156.0  91      0     2      0      12  135.0     0   0.0
## 4   0     105  47  162.0 123      1     2      0      22 1387.5     0   0.0
## 5   0     101  45  153.0  99      1     2      0      15  982.5     0  55.5
## 6   0      94  66  165.0 103      0     1      0      14  622.5     0   0.0
##   racebw smoke sysbp trig     waist weight wgt50
## 1      0     1   115   91  79.49999  139.0   150
## 2      0     1    97  154  70.00000  125.5   124
## 3      0     1   171  125  79.00000  124.0   128
## 4      0     1   107  140 103.00000  207.0   190
## 5      0     2   140  201  96.00000  151.0   140
## 6      0     1   142  127  87.00000  156.0   128
# Ensuring the Diabetes, Family History, CHD and Estrogen variables are factors
# Diabetes status
chd1500$diabetes<- factor(chd1500$diabetes, levels = c(1,2,3), 
                          labels = c("Normal", "Impaired", "Diabetic"))
# MI family history
#chd1500$fhha<- factor(chd1500$fhha, levels = c(0,1), 
#                      labels = c("No", "Yes"))
chd1500$fhha <- factor(chd1500$fhha, levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL)
# Sex
chd1500$sex <- factor(chd1500$sex, levels = c(0,1), 
                      labels = c("Female", "Male"))
# Coronary heart disease (CHD)
chd1500$incchd <- factor(chd1500$incchd, levels = c(0,1), 
                         labels = c("No", "Yes"))
# Estrogen
chd1500$estrogen <- factor(chd1500$estrogen, levels = c(0,1,2),labels = c("Never", "Past", "Current"))

#family history of heart attack
chd1500$fhha <- factor(chd1500$fhha, levels = c(0,1), 
                         labels = c("No", "Yes"))
#Smoking status 
chd1500$smoke <- factor(chd1500$smoke, levels = c(0,1,3), 
                         labels = c("Never", "Former", "Current"))
#Summary of Data
summary.data.frame(chd1500, maxsum = 100)
##       idno              age            alcoh               bmi       
##  Min.   :3000028   Min.   :65.00   Min.   : 0.00000   Min.   :14.65  
##  1st Qu.:3503638   1st Qu.:68.00   1st Qu.: 0.00000   1st Qu.:23.31  
##  Median :4503820   Median :71.00   Median : 0.01923   Median :25.91  
##  Mean   :4601708   Mean   :72.36   Mean   : 2.35205   Mean   :26.43  
##  3rd Qu.:5505188   3rd Qu.:76.00   3rd Qu.: 1.01923   3rd Qu.:28.89  
##  Max.   :6503950   Max.   :98.00   Max.   :77.00000   Max.   :49.41  
##     choltot           diabetes        diabp           estrogen  
##  Min.   : 77.76   Normal  :1105   Min.   : 40.00   Never  :542  
##  1st Qu.:185.76   Impaired: 201   1st Qu.: 63.00   Past   :261  
##  Median :211.76   Diabetic: 194   Median : 70.00   Current:113  
##  Mean   :212.01                   Mean   : 70.28   NA's   :584  
##  3rd Qu.:235.01                   3rd Qu.: 77.00                
##  Max.   :406.76                   Max.   :107.00                
##        f7          fhha           fib            sex         glucose     
##  Min.   : 41.0   No  :   0   Min.   :177.0   Female:916   Min.   : 63.0  
##  1st Qu.:108.0   Yes :   0   1st Qu.:277.0   Male  :584   1st Qu.: 94.0  
##  Median :124.0   NA's:1500   Median :311.0                Median :101.0  
##  Mean   :126.7               Mean   :319.7                Mean   :108.4  
##  3rd Qu.:143.0               3rd Qu.:358.0                3rd Qu.:110.0  
##  Max.   :291.0               Max.   :872.0                Max.   :657.0  
##       hdl             height           hip            htnmed      
##  Min.   : 15.00   Min.   :133.0   Min.   : 76.5   Min.   :0.0000  
##  1st Qu.: 44.00   1st Qu.:157.2   1st Qu.: 95.0   1st Qu.:0.0000  
##  Median : 52.00   Median :163.5   Median :100.5   Median :0.0000  
##  Mean   : 54.75   Mean   :164.4   Mean   :101.6   Mean   :0.3693  
##  3rd Qu.: 63.00   3rd Qu.:171.5   3rd Qu.:106.5   3rd Qu.:1.0000  
##  Max.   :149.00   Max.   :191.5   Max.   :155.0   Max.   :1.0000  
##      hyper      incchd        insulin         kcal             lipid      
##  Min.   :0.00   No :1282   Min.   :  5   Min.   :    0.0   Min.   :0.000  
##  1st Qu.:0.00   Yes: 218   1st Qu.:  9   1st Qu.:  457.5   1st Qu.:0.000  
##  Median :1.00              Median : 12   Median : 1140.0   Median :0.000  
##  Mean   :0.91              Mean   : 16   Mean   : 1847.9   Mean   :0.042  
##  3rd Qu.:2.00              3rd Qu.: 17   3rd Qu.: 2528.4   3rd Qu.:0.000  
##  Max.   :2.00              Max.   :400   Max.   :13640.0   Max.   :1.000  
##      pkyrs            racebw            smoke         sysbp      
##  Min.   :  0.00   Min.   :0.00000   Never  :  0   Min.   : 79.0  
##  1st Qu.:  0.00   1st Qu.:0.00000   Former :787   1st Qu.:120.0  
##  Median :  0.00   Median :0.00000   Current:159   Median :133.0  
##  Mean   : 16.49   Mean   :0.03333   NA's   :554   Mean   :135.2  
##  3rd Qu.: 26.62   3rd Qu.:0.00000                 3rd Qu.:148.0  
##  Max.   :240.00   Max.   :1.00000                 Max.   :221.0  
##       trig            waist            weight          wgt50      
##  Min.   :  46.0   Min.   : 61.50   Min.   : 76.8   Min.   : 80.0  
##  1st Qu.:  94.0   1st Qu.: 85.00   1st Qu.:134.0   1st Qu.:130.0  
##  Median : 124.0   Median : 93.00   Median :155.8   Median :150.0  
##  Mean   : 140.8   Mean   : 93.55   Mean   :158.0   Mean   :151.8  
##  3rd Qu.: 163.2   3rd Qu.:101.75   3rd Qu.:177.3   3rd Qu.:168.2  
##  Max.   :1216.0   Max.   :138.00   Max.   :282.0   Max.   :320.0
# Summary table of frequency and percent for CHD and diabetes status.
chd1500 %>% select(incchd, diabetes) %>%
  tbl_summary(missing = "ifany",
              digits = list(all_categorical() ~ c(0,1)), 
              label = list(incchd ~ "CHD",
                           diabetes ~ "Diabetes status")) %>%
  modify_header(label = "**Variable**") %>%
  as_flex_table()

Variable

N = 1,5001

CHD

218 (14.5%)

Diabetes status

Normal

1,105 (73.7%)

Impaired

201 (13.4%)

Diabetic

194 (12.9%)

1n (%)

# Summary table of frequency and percent for CHD and diabetes status.
chd1500 %>% select(incchd, diabetes,fhha, sex,smoke) %>%
  tbl_summary(missing = "ifany",
              type = incchd ~ "categorical",
              digits = list(all_categorical() ~ c(0,1)), 
              label = list(incchd ~ "CHD",
                           sex ~ "Sex",
                           fhha ~"Family history of heart attack",
                           smoke ~ "Smoking status ",
                           diabetes ~ "Diabetes status")) %>%
  modify_header(label = "**Variable**") %>%
  as_flex_table()

Variable

N = 1,5001

CHD

No

1,282 (85.5%)

Yes

218 (14.5%)

Diabetes status

Normal

1,105 (73.7%)

Impaired

201 (13.4%)

Diabetic

194 (12.9%)

Family history of heart attack

0 (NA%)

Unknown

1,500

Sex

Female

916 (61.1%)

Male

584 (38.9%)

Smoking status

Never

0 (0.0%)

Former

787 (83.2%)

Current

159 (16.8%)

Unknown

554

1n (%)

#   Create a bar chart of (incident) CHD. 
barplot(table(chd1500$incchd),
        xlab= "CHD", ylab = "No. of subjects")

# Basic histogram of age
hist(chd1500$age, xlab="Age", main="Age")

# Basic histogram of Alcohol Consumption
hist(chd1500$alcoh, xlab="Alcohol Consumption", main="ALcohol Consumption")

# Basic histogram of BMI
hist(chd1500$bmi, xlab="BMI", main="BMI")

# Basic histogram of HDL Cholesterol
hist(chd1500$hdl, xlab="HDL Cholesterol", main="HDL Cholesterol")

# Basic histogram of Insulin
hist(chd1500$insulin, xlab="Serum Insulin", main="Serum Insulin")

# Basic histogram of Pack-years of smoking 
hist(chd1500$pkyrs, xlab="Pack-years of smoking ", main="Pack-years of smoking ")

#Nicely formatted histogram of HDL cholesterol. 
ggplot(data=chd1500, aes(x=hdl)) +
  geom_histogram(color="black", fill="blue", binwidth=10) +
  labs(x="HDL cholesterol (mg/dL)", y="No. of subjects", 
       title="Histogram of HDL cholesterol") +
  scale_y_continuous(limits=c(0, 500), breaks=seq(0,500,100),
                     expand = c(0,0)) +
  theme_classic()

##descriptive statisticsfor each of the variables 
chd1500 %>% select(age, alcoh, bmi, hdl, insulin,pkyrs) %>%
  tbl_summary(missing = "ifany",
              type = all_continuous() ~ "continuous2",
              statistic = all_continuous() ~ c("{mean} ({sd})",
                                               "{median} ({p25}, {p75})", 
                                               "{min}, {max}",
                                               "{skewness}"),
              digits = list(all_continuous() ~ 1), 
              label = list(age ~ "Age (y)",
                      
                           alcoh ~ "Alcohol consumption (drinks/wk)",
                           bmi ~"BMI",
                           hdl ~"HDL cholesterol(mg/dl) ",
                           insulin ~"Serum insulin (IU/ml)",
                           pkyrs ~"Pack-years of smoking ")
  ) %>%
  modify_header(label = "**Variable**") %>%
  as_flex_table()

Variable

N = 1,500

Age (y)

Mean (SD)

72.4 (5.4)

Median (Q1, Q3)

71.0 (68.0, 76.0)

Min, Max

65.0, 98.0

skewness

1.0

Alcohol consumption (drinks/wk)

Mean (SD)

2.4 (6.2)

Median (Q1, Q3)

0.0 (0.0, 1.0)

Min, Max

0.0, 77.0

skewness

4.8

BMI

Mean (SD)

26.4 (4.6)

Median (Q1, Q3)

25.9 (23.3, 28.9)

Min, Max

14.7, 49.4

skewness

0.9

HDL cholesterol(mg/dl)

Mean (SD)

54.7 (15.5)

Median (Q1, Q3)

52.0 (44.0, 63.0)

Min, Max

15.0, 149.0

skewness

1.0

Serum insulin (IU/ml)

Mean (SD)

16.0 (20.6)

Median (Q1, Q3)

12.0 (9.0, 17.0)

Min, Max

5.0, 400.0

skewness

12.8

Pack-years of smoking

Mean (SD)

16.5 (26.2)

Median (Q1, Q3)

0.0 (0.0, 26.8)

Min, Max

0.0, 240.0

skewness

2.2

#creating a summary table of CHD by diabetes status and sex
chd1500 %>% select(incchd, diabetes, sex) %>%
  tbl_summary(by=incchd,
              missing = "ifany",
              percent = "row",
              digits = list(all_categorical() ~ c(0,1)), 
              label = list(diabetes ~ "Diabetes status", sex ~ "Sex")
  ) %>%
  modify_header(label = "**Variable**") %>%
  as_flex_table()

Variable

No
N = 1,2821

Yes
N = 2181

Diabetes status

Normal

963 (87.1%)

142 (12.9%)

Impaired

167 (83.1%)

34 (16.9%)

Diabetic

152 (78.4%)

42 (21.6%)

Sex

Female

818 (89.3%)

98 (10.7%)

Male

464 (79.5%)

120 (20.5%)

1n (%)

#creating boxplots of HDL cholesterol by CHD
ggplot(data=chd1500, aes(x=incchd, y=hdl, fill=incchd)) +
  geom_boxplot() +
  labs(x="CHD",
       y="HDL cholesterol (mg/dL)",
       title="Boxplot of HLD cholesterol by CHD") +
  scale_y_continuous(limits=c(0, 160), breaks=seq(0,160,20)) +
  theme_classic() +
  theme(legend.position="none")

#creating boxplots of age by CHD
ggplot(data=chd1500, aes(x=incchd, y=age, fill=incchd)) +
  geom_boxplot() +
  labs(x="CHD",
       y="Age in years",
       title="Boxplot of Age by CHD") +
  scale_y_continuous(limits=c(60, 100), breaks=seq(0,160,10)) +
  theme_classic() +
  theme(legend.position="none")

#relationship of CHD with HDL cholesterol and age by a summary table 
chd1500 %>% select(incchd, hdl, age) %>%
  tbl_summary(by=incchd,
              missing = "ifany",
              type = all_continuous() ~ "continuous2",
              statistic = all_continuous() ~ c("{mean} ({sd})",
                                               "{median} ({p25}, {p75})", 
                                               "{min}, {max}",
                                               "{skewness}"),
              digits = list(all_continuous() ~ 1), 
              label = list(hdl ~ "HDL cholesterol (mg/dL)",age ~ "Age (y)" )
  ) %>%
  modify_header(label = "**Variable**") %>%
  as_flex_table()

Variable

No
N = 1,282

Yes
N = 218

HDL cholesterol (mg/dL)

Mean (SD)

55.2 (15.5)

51.8 (14.9)

Median (Q1, Q3)

53.0 (44.0, 64.0)

49.0 (42.0, 59.0)

Min, Max

15.0, 149.0

21.0, 129.0

skewness

1.0

1.4

Age (y)

Mean (SD)

72.2 (5.4)

73.2 (5.6)

Median (Q1, Q3)

71.0 (68.0, 75.0)

72.0 (68.0, 77.0)

Min, Max

65.0, 98.0

66.0, 90.0

skewness

1.1

0.7