preparation

The setup chunk is just to tell the fill that it is a markdown file. The text above — is called YAML and it is a heading that will created automatically.

install and activate packages(libraries)

# for the fist time you need to install pacman package
# install.packages("pacman")
# library(pacman)

pacman::p_load(
  rio,          # file import/export
  here,         # relative filepaths 
  lubridate,    # working with dates/epiweeks
  aweek,        # alternative package for working with dates/epiweeks
  incidence2,   # epicurves of linelist data
  i2extras,     # supplement to incidence2
  stringr,      # search and manipulate character strings
  forcats,      # working with factors
  RColorBrewer, # Color palettes from colorbrewer2.org
  scales,
  tidyverse,     # data management + ggplot2 graphics
  outbreaks,     # contains outbreaks data
  epikit,        # tools for epidemiology
  cleaner,       # cleaning and checking data columns
  janitor,       # clean column names
  stringr        # manipulate text (strings)
)



here::set_here()    # make your current directory the active one so it finds files within same folder, very important

variables

a=5
b=6
a+b
## [1] 11
c="Hello"
c
## [1] "Hello"
d= c(1,5,8,9)

d*2
## [1]  2 10 16 18
d = c(d, 5)
d
## [1] 1 5 8 9 5
length(d)
## [1] 5
e=c("Iyad", "Ahmad", "Samer")
e
## [1] "Iyad"  "Ahmad" "Samer"
c(e, 7)
## [1] "Iyad"  "Ahmad" "Samer" "7"
g=e=="Ahmad"

e[g]
## [1] "Ahmad"

subsetting

a<- c(12,23,54,65,78)

a[c(FALSE, FALSE,  TRUE,  TRUE,  TRUE)]
## [1] 54 65 78
a[a>30]
## [1] 54 65 78
mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# average mpg for all cars
mean(mtcars$mpg)
## [1] 20.09062
# Average mpg for cars with 6 cylinders
mtcars$cyl==6
##  [1]  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
mean(mtcars$mpg[mtcars$cyl==6])
## [1] 19.74286

home work week session 2

Using the Global Health Observatory data repository , extract the number of cases of malaria and number of deaths for Malaria in Sudan, Somalia, Djibouti and South Sudan per year. Plot the results for the 4 countries using ggplot in a line graph, each country using a different color. I expect 2 have 2 figures, each with 4 panels

Note: you may need to find a way to convert a number like this (193 232 [149 000-245 000]) to 193232. Also you may need to use function gather or pivot_longer to make your table ready for ggplot and facet_wrap to produce multiple facets.

malaria_cases<-read.csv("malaria_cases.csv", skip = 1)
malaria_deaths<- read.csv("malaria_deaths.csv", skip = 1)
malaria_cases<- malaria_cases %>% filter(Countries..territories.and.areas %in% c("Sudan", "Djibouti", "South Sudan", "Somalia"))
colnames(malaria_cases[,-1])<- as.numeric(gsub("X", "", colnames(malaria_cases[,-1])))
malaria_cases
##   Countries..territories.and.areas                           X2021
## 1                         Djibouti                          58 445
## 2                          Somalia   1 131 892 [688 000-1 767 000]
## 3                      South Sudan 2 953 582 [1 643 000-4 972 000]
## 4                            Sudan 3 325 874 [1 643 000-5 906 000]
##                             X2020                           X2019
## 1                          72 332                          49 402
## 2     926 877 [570 000-1 451 000]     957 015 [624 000-1 400 000]
## 3 3 042 784 [1 664 000-5 116 000] 2 938 499 [1 652 000-4 811 000]
## 4 3 261 859 [1 614 000-5 853 000] 2 835 630 [1 360 000-5 217 000]
##                             X2018                           X2017
## 1                          24 845                          14 671
## 2     913 696 [579 000-1 366 000]     820 464 [513 000-1 237 000]
## 3 2 898 120 [1 695 000-4 628 000] 2 960 681 [1 791 000-4 578 000]
## 4 2 485 883 [1 154 000-4 696 000] 2 187 618 [1 075 000-3 956 000]
##                             X2016                           X2015
## 1                          13 822                           9 473
## 2     837 484 [560 000-1 192 000]     811 764 [572 000-1 112 000]
## 3 2 949 646 [1 901 000-4 401 000] 3 033 662 [2 108 000-4 190 000]
## 4 2 097 220 [1 185 000-3 462 000] 1 611 298 [1 053 000-2 378 000]
##                             X2014                           X2013
## 1                           9 439                           1 684
## 2       668 820 [476 000-917 000]       534 460 [381 000-729 000]
## 3 3 098 291 [2 282 000-4 115 000] 3 173 418 [2 399 000-4 121 000]
## 4 1 427 317 [1 008 000-1 956 000]   1 257 489 [924 000-1 683 000]
##                             X2012                           X2011
## 1             2 226 [1 700-2 700]             2 185 [1 700-2 700]
## 2       433 472 [313 000-589 000]       419 303 [300 000-571 000]
## 3 3 079 924 [2 358 000-3 971 000] 2 979 272 [2 308 000-3 782 000]
## 4   1 162 568 [868 000-1 530 000]   1 120 801 [836 000-1 478 000]
##                             X2010                           X2009
## 1                           1 010                           2 686
## 2       500 313 [359 000-679 000]       454 522 [325 000-618 000]
## 3 2 775 374 [2 183 000-3 450 000] 2 640 300 [2 099 000-3 268 000]
## 4   1 104 287 [820 000-1 457 000]   1 140 986 [850 000-1 493 000]
##                             X2008                           X2007
## 1             2 060 [1 600-2 500]             2 018 [1 600-2 500]
## 2       719 911 [525 000-965 000]   1 128 218 [825 000-1 510 000]
## 3 2 586 745 [2 000 000-3 267 000] 2 520 885 [1 819 000-3 395 000]
## 4   1 238 850 [887 000-1 678 000]   1 450 820 [961 000-2 105 000]
##                             X2006                           X2005
## 1             1 976 [1 500-2 400]             1 938 [1 500-2 400]
## 2   1 274 766 [940 000-1 689 000]   1 392 857 [996 000-1 881 000]
## 3 2 450 862 [1 710 000-3 393 000] 2 342 844 [1 618 000-3 292 000]
## 4 1 597 895 [1 009 000-2 435 000]   1 614 440 [994 000-2 428 000]
##                             X2004                           X2003
## 1             1 909 [1 500-2 300]             1 881 [1 400-2 300]
## 2   1 245 035 [874 000-1 743 000]   1 268 524 [857 000-1 814 000]
## 3 2 243 889 [1 514 000-3 253 000] 2 229 084 [1 421 000-3 353 000]
## 4   1 596 519 [988 000-2 429 000] 1 722 030 [1 061 000-2 663 000]
##                             X2002                           X2001
## 1             1 841 [1 400-2 300]             1 786 [1 400-2 200]
## 2   1 236 218 [818 000-1 768 000]   1 283 399 [833 000-1 902 000]
## 3 2 250 591 [1 410 000-3 418 000] 2 283 971 [1 488 000-3 398 000]
## 4 1 837 673 [1 156 000-2 863 000] 2 326 967 [1 516 000-3 408 000]
##                             X2000
## 1             1 731 [1 300-2 100]
## 2   1 177 314 [677 000-1 906 000]
## 3 2 245 004 [1 533 000-3 162 000]
## 4 2 439 775 [1 624 000-3 493 000]
# Function to convert numbers in a specific column
convert_number <- function(x) {
  # Remove any non-numeric characters
  x <- gsub("[^0-9]", "", x)
  # Convert to numeric
  as.numeric(x)
}

dim(malaria_cases)
## [1]  4 23
# Apply the function to all columns
colnames(malaria_cases[,2:23])<- sapply(colnames(malaria_cases[,2:23]), convert_number)
malaria_cases<-malaria_cases %>% gather(year, cases, -Countries..territories.and.areas)
malaria_cases$year<- as.numeric(malaria_cases$year)

Survival Curves

set.seed=123
age<- abs(rnorm(100, mean=40, sd=10))           # random numbers from random distribution
gender<- sample(c("M", "F"), 100, replace=T)    # random gender using sampling
follow_up<- abs(rnorm(100, mean=36, sd=12))     # random FU , mean 36 months, sd 12,  abs to get all results in +
stage<- c(rep("I", 25), rep("II", 25), rep("III", 25), rep("IV", 25))     # 25 patints of each stage
status1<- c(rep(0,22), rep(1,3))        # status more likely to be 0 (alive) for stage I and more likely to be 1 (dead) for advanced stages
status2<- c(rep(0,18), rep(1,7)) 
status3<- c(rep(0,12), rep(1,13)) 
status4<- c(rep(0,5), rep(1,20)) 
status<- c(status1, status2, status3, status4)

ds<- data_frame(age, gender, stage, follow_up, status)
ds
## # A tibble: 100 × 5
##      age gender stage follow_up status
##    <dbl> <chr>  <chr>     <dbl>  <dbl>
##  1  28.3 F      I          33.6      0
##  2  48.2 F      I          29.0      0
##  3  50.7 F      I          29.3      0
##  4  31.5 M      I          37.2      0
##  5  22.4 F      I          46.6      0
##  6  51.6 F      I          23.4      0
##  7  22.1 F      I          29.3      0
##  8  39.0 F      I          11.6      0
##  9  68.7 F      I          50.9      0
## 10  46.5 F      I          32.0      0
## # ℹ 90 more rows
library(survival)
library(survminer)

# Kaplan Meier for all patients
fit<- survfit(Surv(follow_up, status)~1, data=ds)
print(fit)
## Call: survfit(formula = Surv(follow_up, status) ~ 1, data = ds)
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 100     43   48.4      41    55.5
summary(fit, c(12, 36))
## Call: survfit(formula = Surv(follow_up, status) ~ 1, data = ds)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     99       0    1.000  0.0000        1.000        1.000
##    36     46      23    0.718  0.0509        0.625        0.825
ggsurvplot(fit, ds)

There are 100 patients who had 43 events (death). The median survival of this group is 46 months (95% confidence limits are 43 and 52)

The overall survival at 1 year was 99% +/- 1% The overall survival at 3 years was 76% +/- 4.6%

# Kaplan Meier for all patients according to stage
fit<- survfit(Surv(follow_up, status)~stage, data=ds)
print(fit)
## Call: survfit(formula = Surv(follow_up, status) ~ stage, data = ds)
## 
##            n events median 0.95LCL 0.95UCL
## stage=I   25      3     NA      NA      NA
## stage=II  25      7   55.5    53.0      NA
## stage=III 25     13   45.6    35.3      NA
## stage=IV  25     20   36.5    30.2    51.8
summary(fit, c(12, 36))
## Call: survfit(formula = Surv(follow_up, status) ~ stage, data = ds)
## 
##                 stage=I 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     24       0    1.000   0.000        1.000            1
##    36      9       3    0.803   0.107        0.618            1
## 
##                 stage=II 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     25       0    1.000  0.0000        1.000            1
##    36     13       2    0.909  0.0616        0.796            1
## 
##                 stage=III 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     25       0     1.00   0.000        1.000        1.000
##    36     13       7     0.66   0.105        0.484        0.902
## 
##                 stage=IV 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12     25       0    1.000   0.000        1.000        1.000
##    36     11      11    0.539   0.103        0.371        0.784
ggsurvplot(fit, ds, pval=T)

There is a significant difference among patients according to stage with a p value of0.0012 (log-rank test)

Cohort studies

Let us take data from Fremingham study as an example

# Download Framingham study
# URL of the dataset
url <- "https://ocw.mit.edu/courses/15-071-the-analytics-edge-spring-2017/5d689a024551e672313f7fd7eb1bee8d_framingham.csv"

# Downloading and reading the dataset
ds <- read.csv(url)

# Viewing the first few rows of the dataset
head(ds)
##   male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1    1  39         4             0          0      0               0
## 2    0  46         2             0          0      0               0
## 3    1  48         1             1         20      0               0
## 4    0  61         3             1         30      0               0
## 5    0  46         3             1         23      0               0
## 6    0  43         2             0          0      0               0
##   prevalentHyp diabetes totChol sysBP diaBP   BMI heartRate glucose TenYearCHD
## 1            0        0     195 106.0    70 26.97        80      77          0
## 2            0        0     250 121.0    81 28.73        95      76          0
## 3            0        0     245 127.5    80 25.34        75      70          0
## 4            1        0     225 150.0    95 28.58        65     103          1
## 5            0        0     285 130.0    84 23.10        85      85          0
## 6            1        0     228 180.0   110 30.30        77      99          0
class(ds)
## [1] "data.frame"
colnames(ds)
##  [1] "male"            "age"             "education"       "currentSmoker"  
##  [5] "cigsPerDay"      "BPMeds"          "prevalentStroke" "prevalentHyp"   
##  [9] "diabetes"        "totChol"         "sysBP"           "diaBP"          
## [13] "BMI"             "heartRate"       "glucose"         "TenYearCHD"
dim(ds)
## [1] 4240   16
head(ds)
##   male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1    1  39         4             0          0      0               0
## 2    0  46         2             0          0      0               0
## 3    1  48         1             1         20      0               0
## 4    0  61         3             1         30      0               0
## 5    0  46         3             1         23      0               0
## 6    0  43         2             0          0      0               0
##   prevalentHyp diabetes totChol sysBP diaBP   BMI heartRate glucose TenYearCHD
## 1            0        0     195 106.0    70 26.97        80      77          0
## 2            0        0     250 121.0    81 28.73        95      76          0
## 3            0        0     245 127.5    80 25.34        75      70          0
## 4            1        0     225 150.0    95 28.58        65     103          1
## 5            0        0     285 130.0    84 23.10        85      85          0
## 6            1        0     228 180.0   110 30.30        77      99          0
str(ds)
## 'data.frame':    4240 obs. of  16 variables:
##  $ male           : int  1 0 1 0 0 0 0 0 1 1 ...
##  $ age            : int  39 46 48 61 46 43 63 45 52 43 ...
##  $ education      : int  4 2 1 3 3 2 1 2 1 1 ...
##  $ currentSmoker  : int  0 0 1 1 1 0 0 1 0 1 ...
##  $ cigsPerDay     : int  0 0 20 30 23 0 0 20 0 30 ...
##  $ BPMeds         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentStroke: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prevalentHyp   : int  0 0 0 1 0 1 0 0 1 1 ...
##  $ diabetes       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ totChol        : int  195 250 245 225 285 228 205 313 260 225 ...
##  $ sysBP          : num  106 121 128 150 130 ...
##  $ diaBP          : num  70 81 80 95 84 110 71 71 89 107 ...
##  $ BMI            : num  27 28.7 25.3 28.6 23.1 ...
##  $ heartRate      : int  80 95 75 65 85 77 60 79 76 93 ...
##  $ glucose        : int  77 76 70 103 85 99 85 78 79 88 ...
##  $ TenYearCHD     : int  0 0 0 1 0 0 1 0 0 0 ...
library(ggplot2)
library(dplyr)
ds %>%
  mutate(male=as.character(male)) %>% 
  ggplot(aes(x=age, fill=male))+
  geom_histogram()+
  theme_classic()+
  labs(y="Number of individuals", x="Age in years", title="histogram", fill="Sex")

# install package if not install nbefore
# install.packages("gtsummary")
library(gtsummary)
ds %>% 
  mutate(across(c(male, education, currentSmoker, BPMeds, prevalentStroke, prevalentHyp, diabetes,TenYearCHD), as.character)) %>% 
  gtsummary::tbl_summary()
Characteristic N = 4,2401
male
    0 2,420 (57%)
    1 1,820 (43%)
age 49 (42, 56)
education
    1 1,720 (42%)
    2 1,253 (30%)
    3 689 (17%)
    4 473 (11%)
    Unknown 105
currentSmoker
    0 2,145 (51%)
    1 2,095 (49%)
cigsPerDay 0 (0, 20)
    Unknown 29
BPMeds
    0 4,063 (97%)
    1 124 (3.0%)
    Unknown 53
prevalentStroke
    0 4,215 (99%)
    1 25 (0.6%)
prevalentHyp
    0 2,923 (69%)
    1 1,317 (31%)
diabetes
    0 4,131 (97%)
    1 109 (2.6%)
totChol 234 (206, 263)
    Unknown 50
sysBP 128 (117, 144)
diaBP 82 (75, 90)
BMI 25.4 (23.1, 28.0)
    Unknown 19
heartRate 75 (68, 83)
    Unknown 1
glucose 78 (71, 87)
    Unknown 388
TenYearCHD
    0 3,596 (85%)
    1 644 (15%)
1 n (%); Median (IQR)

make a contingency table showing the risk of Ten Year

# install.packages("epitools")
library(epitools)
tbl<- table(ds$prevalentHyp, ds$TenYearCHD)
tbl
##    
##        0    1
##   0 2604  319
##   1  992  325
riskratio(tbl)
## $data
##        
##            0   1 Total
##   0     2604 319  2923
##   1      992 325  1317
##   Total 3596 644  4240
## 
## $measure
##    risk ratio with 95% C.I.
##     estimate   lower    upper
##   0 1.000000      NA       NA
##   1 2.261183 1.96556 2.601268
## 
## $p.value
##    two-sided
##     midp.exact fisher.exact   chi.square
##   0         NA           NA           NA
##   1          0 5.783326e-29 6.948619e-31
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

The chi-square test revealed a highly significant association between exposure and outcome, evidenced by a risk ratio (RR) of 2.26 and a p-value of less than 0.001, underscoring the strong link between having hypertension and the increased risk of developing CHF in 10 years.

odds ratio

Odds ratio are not suitable for cohort studies. They are used in case-control studies. Nevertheless, here is an application

epitools::oddsratio(tbl)
## $data
##        
##            0   1 Total
##   0     2604 319  2923
##   1      992 325  1317
##   Total 3596 644  4240
## 
## $measure
##    odds ratio with 95% C.I.
##     estimate    lower    upper
##   0 1.000000       NA       NA
##   1 2.673608 2.253526 3.172686
## 
## $p.value
##    two-sided
##     midp.exact fisher.exact   chi.square
##   0         NA           NA           NA
##   1          0 5.783326e-29 6.948619e-31
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"

How can we predict the risk of developing CHD at 10 years?

# logistic regression
model<- glm(TenYearCHD~male+age+education+currentSmoker+cigsPerDay+
              BPMeds+prevalentStroke+prevalentHyp+diabetes+totChol+
              sysBP+diaBP+BMI+heartRate+glucose, 
            data=ds)
model
## 
## Call:  glm(formula = TenYearCHD ~ male + age + education + currentSmoker + 
##     cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes + 
##     totChol + sysBP + diaBP + BMI + heartRate + glucose, data = ds)
## 
## Coefficients:
##     (Intercept)             male              age        education  
##      -0.5762585        0.0574246        0.0070575       -0.0056161  
##   currentSmoker       cigsPerDay           BPMeds  prevalentStroke  
##       0.0081070        0.0021636        0.0459077        0.1368394  
##    prevalentHyp         diabetes          totChol            sysBP  
##       0.0283079        0.0258235        0.0001293        0.0024734  
##           diaBP              BMI        heartRate          glucose  
##      -0.0011731        0.0000790       -0.0003590        0.0011442  
## 
## Degrees of Freedom: 3657 Total (i.e. Null);  3642 Residual
##   (582 observations deleted due to missingness)
## Null Deviance:       472.2 
## Residual Deviance: 424.4     AIC: 2536
summary(model)
## 
## Call:
## glm(formula = TenYearCHD ~ male + age + education + currentSmoker + 
##     cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes + 
##     totChol + sysBP + diaBP + BMI + heartRate + glucose, data = ds)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.5762585  0.0789125  -7.303 3.45e-13 ***
## male             0.0574246  0.0124715   4.604 4.28e-06 ***
## age              0.0070575  0.0007691   9.176  < 2e-16 ***
## education       -0.0056161  0.0056705  -0.990 0.322040    
## currentSmoker    0.0081070  0.0181548   0.447 0.655226    
## cigsPerDay       0.0021636  0.0007827   2.764 0.005733 ** 
## BPMeds           0.0459077  0.0346845   1.324 0.185726    
## prevalentStroke  0.1368394  0.0753542   1.816 0.069460 .  
## prevalentHyp     0.0283079  0.0174492   1.622 0.104825    
## diabetes         0.0258235  0.0442138   0.584 0.559217    
## totChol          0.0001293  0.0001353   0.956 0.339118    
## sysBP            0.0024734  0.0004960   4.987 6.42e-07 ***
## diaBP           -0.0011731  0.0008166  -1.437 0.150912    
## BMI              0.0000790  0.0015460   0.051 0.959248    
## heartRate       -0.0003590  0.0004927  -0.729 0.466217    
## glucose          0.0011442  0.0003022   3.786 0.000155 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1165278)
## 
##     Null deviance: 472.19  on 3657  degrees of freedom
## Residual deviance: 424.39  on 3642  degrees of freedom
##   (582 observations deleted due to missingness)
## AIC: 2535.6
## 
## Number of Fisher Scoring iterations: 2

In a multivariable logistic regression model, males were at 5.7%+/-1.2% higher risk of developing the outcome of CHD at 10 years with a p value of <0.001. For every year increase in age therew as a 0.7% +/- 0.07% increase in risk of CHD (p<0.001)

more neat way to do it

# logistic regression

model<- glm(TenYearCHD~., data=ds)

summary(model)
## 
## Call:
## glm(formula = TenYearCHD ~ ., data = ds)
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.5762585  0.0789125  -7.303 3.45e-13 ***
## male             0.0574246  0.0124715   4.604 4.28e-06 ***
## age              0.0070575  0.0007691   9.176  < 2e-16 ***
## education       -0.0056161  0.0056705  -0.990 0.322040    
## currentSmoker    0.0081070  0.0181548   0.447 0.655226    
## cigsPerDay       0.0021636  0.0007827   2.764 0.005733 ** 
## BPMeds           0.0459077  0.0346845   1.324 0.185726    
## prevalentStroke  0.1368394  0.0753542   1.816 0.069460 .  
## prevalentHyp     0.0283079  0.0174492   1.622 0.104825    
## diabetes         0.0258235  0.0442138   0.584 0.559217    
## totChol          0.0001293  0.0001353   0.956 0.339118    
## sysBP            0.0024734  0.0004960   4.987 6.42e-07 ***
## diaBP           -0.0011731  0.0008166  -1.437 0.150912    
## BMI              0.0000790  0.0015460   0.051 0.959248    
## heartRate       -0.0003590  0.0004927  -0.729 0.466217    
## glucose          0.0011442  0.0003022   3.786 0.000155 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1165278)
## 
##     Null deviance: 472.19  on 3657  degrees of freedom
## Residual deviance: 424.39  on 3642  degrees of freedom
##   (582 observations deleted due to missingness)
## AIC: 2535.6
## 
## Number of Fisher Scoring iterations: 2

account for interactions

model
## 
## Call:  glm(formula = TenYearCHD ~ ., data = ds)
## 
## Coefficients:
##     (Intercept)             male              age        education  
##      -0.5762585        0.0574246        0.0070575       -0.0056161  
##   currentSmoker       cigsPerDay           BPMeds  prevalentStroke  
##       0.0081070        0.0021636        0.0459077        0.1368394  
##    prevalentHyp         diabetes          totChol            sysBP  
##       0.0283079        0.0258235        0.0001293        0.0024734  
##           diaBP              BMI        heartRate          glucose  
##      -0.0011731        0.0000790       -0.0003590        0.0011442  
## 
## Degrees of Freedom: 3657 Total (i.e. Null);  3642 Residual
##   (582 observations deleted due to missingness)
## Null Deviance:       472.2 
## Residual Deviance: 424.4     AIC: 2536
model<- glm(TenYearCHD~male+log(age)+education+currentSmoker*cigsPerDay+
              BPMeds+prevalentStroke+prevalentHyp*(sysBP+diaBP)+diabetes*glucose+totChol+
              BMI+heartRate, 
            data=ds)

summary(model)
## 
## Call:
## glm(formula = TenYearCHD ~ male + log(age) + education + currentSmoker * 
##     cigsPerDay + BPMeds + prevalentStroke + prevalentHyp * (sysBP + 
##     diaBP) + diabetes * glucose + totChol + BMI + heartRate, 
##     data = ds)
## 
## Coefficients: (1 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.4083191  0.1613695  -8.727  < 2e-16 ***
## male                      0.0613811  0.0124986   4.911 9.46e-07 ***
## log(age)                  0.3498910  0.0379387   9.223  < 2e-16 ***
## education                -0.0060349  0.0056683  -1.065  0.28709    
## currentSmoker             0.0063685  0.0181564   0.351  0.72579    
## cigsPerDay                0.0021032  0.0007825   2.688  0.00722 ** 
## BPMeds                    0.0323081  0.0349106   0.925  0.35479    
## prevalentStroke           0.1368524  0.0752759   1.818  0.06914 .  
## prevalentHyp             -0.2589343  0.1161291  -2.230  0.02583 *  
## sysBP                     0.0015043  0.0007532   1.997  0.04589 *  
## diaBP                    -0.0013646  0.0011676  -1.169  0.24258    
## diabetes                 -0.1514163  0.0857292  -1.766  0.07744 .  
## glucose                   0.0003264  0.0004496   0.726  0.46794    
## totChol                   0.0001483  0.0001360   1.090  0.27566    
## BMI                       0.0003537  0.0015464   0.229  0.81910    
## heartRate                -0.0002473  0.0004937  -0.501  0.61648    
## currentSmoker:cigsPerDay         NA         NA      NA       NA    
## prevalentHyp:sysBP        0.0018302  0.0009491   1.928  0.05389 .  
## prevalentHyp:diaBP        0.0004230  0.0015724   0.269  0.78793    
## diabetes:glucose          0.0014558  0.0006066   2.400  0.01644 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1162823)
## 
##     Null deviance: 472.19  on 3657  degrees of freedom
## Residual deviance: 423.15  on 3639  degrees of freedom
##   (582 observations deleted due to missingness)
## AIC: 2530.9
## 
## Number of Fisher Scoring iterations: 2

how can we measure the accuracy of this prediction model?

split the dataset to training and testing sets

# In R split my dataframe (ds) into training and testing sets (8:2)

library(dplyr)

# Splitting the dataframe into training and testing sets (80:20 ratio)
set.seed(123) # Setting seed for reproducibility
split_indices <- sample(nrow(ds), size = 0.8 * nrow(ds))

train_set <- ds[split_indices, ]
test_set <- ds[-split_indices, ]

dim(train_set)
## [1] 3392   16
dim(test_set)
## [1] 848  16

Use training to build the model

model<- glm(TenYearCHD~., data=train_set)

Use the model to predict the test_set risk

predict(model, test_set)
##             6            14            22            47            50 
##  0.1864189843  0.0928425663            NA  0.3337052995            NA 
##            53            57            58            61            62 
##  0.1155807970  0.0934999615  0.2407292167  0.2206462145  0.3233701601 
##            64            66            73            84            85 
##  0.1449275108  0.2514058889            NA  0.1313757842  0.3304128478 
##            86            91            93            98           101 
## -0.0273923202  0.2377143747  0.0344578205            NA  0.1062705545 
##           105           106           110           112           119 
##  0.0427564992  0.1785929419  0.1674169336            NA -0.0220756928 
##           126           127           128           130           135 
##  0.0102604525  0.1178577438  0.0300841633 -0.0199405271  0.0909630413 
##           146           149           157           168           169 
##  0.1799988651  0.2226439412  0.2477008004  0.0025076691  0.1335498602 
##           174           176           177           188           189 
## -0.0100171280  0.3896023218  0.3054941289  0.2224388315  0.0957641167 
##           190           196           205           208           210 
##  0.1530904282  0.0307673440  0.0333729940  0.1547926226  0.1623570227 
##           219           225           226           227           228 
##  0.1264028185  0.1221019631  0.1894171588  0.2469175100  0.3574531357 
##           233           236           248           254           257 
##  0.1048957094 -0.0546037802            NA  0.2355558002  0.1464064820 
##           265           273           274           293           298 
##  0.2355775408  0.2211500886            NA  0.1598529461 -0.0749319108 
##           300           301           305           306           308 
##  0.2731083851  0.0354142354  0.0178103385            NA  0.1427704280 
##           310           313           324           332           334 
##            NA  0.1026373319  0.0457480870  0.0483895634  0.4136885872 
##           336           339           342           345           356 
##  0.0614450375            NA  0.0293957768  0.2984557935            NA 
##           360           363           364           367           375 
##  0.3360324661  0.0545165790  0.0620199460  0.1344716546  0.2415831120 
##           381           385           389           397           399 
##  0.0824471726  0.1529543127 -0.0122244446  0.0645309944  0.3359130137 
##           402           403           404           405           410 
##  0.0597311305 -0.0016390065 -0.0016564302  0.1146670178  0.2313635396 
##           411           412           414           417           425 
##  0.0159728803  0.2494009095            NA  0.1678932970  0.0047234030 
##           426           437           438           440           441 
##            NA            NA  0.1928991506  0.1919923263 -0.0314169461 
##           446           447           454           465           476 
##  0.2716006010  0.4113820438  0.0657867654  0.2599799312  0.3986534958 
##           478           486           495           501           507 
##  0.0876140946  0.2960126339  0.2460211815 -0.0357970939  0.3133320992 
##           508           509           511           514           516 
##  0.1513688364  0.0999742484  0.0905702575  0.1607353078  0.1804274964 
##           519           520           521           531           534 
##  0.0921103239            NA  0.1575137656 -0.0128202303  0.0971599931 
##           538           540           550           557           560 
##  0.3015137141  0.0385032044  0.1874033315  0.0851801241  0.0048740633 
##           562           563           565           566           568 
##  0.0940836380 -0.0088446678  0.0721460121  0.1232486933            NA 
##           569           571           573           575           576 
## -0.0624342081  0.0542074955  0.0465671718  0.1175007462  0.1641151014 
##           589           591           592           593           594 
##  0.2706459966  0.4823007058            NA  0.0839625731  0.0413021067 
##           595           596           600           604           608 
##  0.0380591236  0.0606661070  0.1069926939  0.3032162140  0.1988579937 
##           611           631           632           641           645 
##  0.3038247965  0.3317610939  0.0906053229  0.1240345137 -0.0101627297 
##           652           653           679           687           691 
##  0.1456110034  0.1652106909  0.2749289581  0.1229033558            NA 
##           700           709           715           717           722 
##  0.4584636596  0.0871925682            NA  0.2235347228  0.4174775514 
##           724           731           734           736           744 
##  0.2213649992 -0.0068222095  0.2585390685  0.2513577375  0.1531443694 
##           751           759           768           770           775 
##  0.4341445497  0.1328199369  0.5195981975            NA  0.0815284440 
##           776           784           792           795           798 
##  0.3036778169  0.2061447121  0.2623656203  0.0682747754            NA 
##           800           801           806           808           823 
##  0.2264040390  0.1580630758  0.1432032743  0.3282131923            NA 
##           824           826           840           844           853 
## -0.0126300063  0.1388356267  0.1327908013  0.0137911695  0.0026050808 
##           857           858           863           864           868 
##  0.1290328589  0.0036171661  0.3509090348  0.1834377549  0.2206665124 
##           892           895           897           898           902 
##  0.0073656802  0.0941803170            NA  0.1373631067  0.2557264840 
##           908           909           917           918           921 
##  0.0302108471  0.3339970087            NA            NA  0.1641505994 
##           928           935           936           937           939 
##  0.3343114839  0.2840355370  0.1777679741            NA  0.0814929847 
##           940           941           944           947           962 
##            NA  0.2244792645            NA  0.1083311611  0.1451743326 
##           973           995           996          1009          1012 
##            NA  0.0576059567            NA  0.3624896052  0.2744687157 
##          1013          1016          1020          1024          1027 
##  0.1519786484  0.3052461527            NA  0.1438601100  0.1084922594 
##          1038          1039          1046          1052          1054 
##  0.1140449251            NA  0.3647378494  0.3090091450  0.2548644835 
##          1059          1067          1069          1084          1092 
##  0.2223270762  0.0760271536  0.3872651253  0.1257410997  0.1742283869 
##          1097          1102          1112          1115          1117 
##  0.0953153339            NA  0.2911975335  0.2209837861 -0.0307113666 
##          1124          1131          1133          1138          1145 
##  0.2792216530  0.2263423122  0.1905132913  0.1286520944  0.2684649340 
##          1168          1175          1176          1177          1181 
##  0.0599358927  0.3030819286            NA  0.1248581734  0.0904256137 
##          1198          1200          1206          1207          1209 
##  0.4056744373  0.0938080492  0.1219626914            NA  0.0270601599 
##          1216          1218          1219          1224          1228 
##  0.1984819825  0.0168372717  0.0036087036  0.0329588461            NA 
##          1237          1238          1248          1259          1262 
##  0.2735910755  0.3746726680 -0.0094782412  0.2734972487  0.0564045902 
##          1263          1280          1282          1285          1289 
##  0.1895507926  0.2663913843 -0.0259454138            NA            NA 
##          1292          1303          1305          1309          1310 
##  0.0894456790  0.3189392777  0.1895163491  0.3897115343 -0.0407242708 
##          1318          1321          1324          1329          1330 
##            NA  0.1431872751  0.1911903700  0.1643337619  0.0870611465 
##          1340          1353          1357          1369          1375 
##  0.1625479491  0.1048357324  0.0308522560            NA  0.1513048049 
##          1383          1390          1398          1402          1407 
##  0.0453630468  0.2059051292  0.1111613491 -0.0009890861  0.0018151051 
##          1408          1411          1418          1421          1430 
## -0.0256325986  0.0593188092  0.1288810619 -0.0356474650            NA 
##          1441          1443          1454          1460          1462 
##  0.0260012121  0.4129457204  0.1423280515 -0.0549779408  0.0986903976 
##          1463          1468          1470          1476          1479 
##  0.1870264963  0.0876719626  0.2044365551  0.0762657323            NA 
##          1484          1488          1490          1495          1496 
##  0.1976691666  0.1231703110  0.0803057875  0.1581869047  0.0577298128 
##          1502          1510          1511          1524          1525 
##  0.0980996185  0.0393468858  0.1533099046 -0.0461960269            NA 
##          1527          1529          1534          1535          1540 
##  0.0121751205  0.0857297553  0.1268892511  0.2008108111  0.1910583368 
##          1542          1546          1550          1564          1567 
##  0.1224628302  0.1047981407  0.1447432155  0.2234963121            NA 
##          1578          1583          1588          1591          1592 
##  0.0919705268  0.0608889989 -0.0242780509  0.1433698356  0.2322682404 
##          1595          1596          1598          1601          1615 
##            NA  0.3313159876 -0.0390673065  0.3629124572  0.3455674145 
##          1618          1619          1620          1625          1626 
##            NA  0.1424691060  0.3079387964            NA            NA 
##          1628          1637          1645          1649          1659 
##  0.0318775312  0.0934229327 -0.0231120605  0.1402887448            NA 
##          1660          1661          1663          1664          1671 
##  0.0755616078  0.3277313673  0.2297770987  0.3041345965  0.0400403710 
##          1675          1676          1677          1678          1680 
##  0.3672950895            NA  0.1520671349  0.1085358535 -0.0065506119 
##          1683          1684          1685          1687          1688 
## -0.0297410210 -0.0042253726  0.2782284505  0.0698589283            NA 
##          1704          1729          1732          1749          1750 
##  0.1357836139  0.1496187898  0.2891968741  0.2548993847  0.2167269944 
##          1753          1763          1768          1771          1780 
##  0.3105410600  0.1413408289  0.1157920373            NA  0.1073800261 
##          1783          1786          1788          1794          1798 
##  0.3608890622            NA  0.1270771878  0.2218915448 -0.0310430781 
##          1801          1810          1812          1813          1816 
##            NA  0.1156621175  0.1401176701  0.2952365006            NA 
##          1817          1819          1825          1830          1837 
##  0.2804497698  0.3094338676  0.2295803696  0.0118184825  0.2461972049 
##          1843          1844          1849          1871          1875 
##  0.3387752095  0.2146543900  0.0563573381            NA  0.2459008316 
##          1888          1893          1897          1902          1904 
## -0.0050206318  0.1161171467  0.1715046874  0.2342013010  0.0310034836 
##          1910          1913          1921          1924          1944 
##  0.0987967339  0.0484807183  0.2265500901  0.0670837339  0.0963038365 
##          1945          1947          1952          1954          1967 
##  0.0426420071  0.1221475094            NA  0.0883100989  0.2811555708 
##          1968          1969          1970          1971          1972 
##  0.1595215013  0.0790929403  0.3595483108  0.0455360271            NA 
##          1973          1977          1985          1995          1997 
##  0.1743167657  0.1582226295            NA  0.2785055656  0.0482277206 
##          1999          2000          2007          2010          2011 
##            NA  0.0229651391            NA  0.1357269241            NA 
##          2012          2014          2018          2020          2042 
##  0.2299443702  0.1743746580  0.0898937163  0.0662522128  0.4301335476 
##          2043          2045          2047          2048          2050 
##  0.1862645595  0.0170042731  0.0516636979  0.0905898691            NA 
##          2052          2058          2061          2063          2064 
##  0.0422237535  0.1478686200            NA  0.2372770557  0.1299000519 
##          2070          2073          2079          2095          2098 
##  0.0731284479  0.2726641923 -0.0290202281 -0.0474055107  0.2378487255 
##          2106          2111          2123          2130          2133 
##  0.1422449006            NA -0.0294460637  0.2600193355  0.4049588980 
##          2145          2147          2149          2152          2156 
##  0.1791473204  0.0574126192  0.1646157634  0.0098713316  0.0439298341 
##          2157          2161          2177          2187          2188 
##  0.0001158867            NA  0.0908317439  0.2183976009  0.4576800686 
##          2194          2202          2205          2217          2223 
##  0.1162640100  0.1103650515  0.5285196569            NA  0.3264762720 
##          2239          2245          2247          2254          2259 
##  0.3102891541 -0.0537799265  0.2410597442  0.2889529967  0.1384790336 
##          2262          2264          2268          2275          2282 
##  0.1516181436            NA            NA  0.1543829599  0.1179292660 
##          2295          2296          2298          2320          2336 
##  0.0250705927  0.1968738131  0.1909883786  0.2625760824  0.3191871719 
##          2337          2338          2350          2351          2362 
##  0.0621272198  0.0133467212  0.3819402400  0.0595983224  0.0843368041 
##          2372          2375          2377          2381          2382 
##  0.2752938167  0.1813067559  0.2324425485  0.2755070422 -0.0053230625 
##          2384          2388          2391          2392          2397 
##  0.1626091730  0.0069301359  0.1065562451  0.1558471706  0.0033639339 
##          2402          2408          2416          2420          2427 
## -0.0178879605  0.1054178467  0.2774434639  0.1285407064  0.2187516131 
##          2428          2430          2436          2443          2448 
##  0.0646899454  0.1479578890  0.0582331829  0.0615400195  0.3293667101 
##          2459          2462          2465          2469          2471 
##  0.2325952565  0.5148275697  0.0163856186            NA  0.0684553055 
##          2477          2480          2483          2490          2496 
##  0.2551858413  0.1115460876  0.1125633200  0.0642934835  0.0496988766 
##          2499          2501          2515          2520          2539 
##  0.2729843918  0.1898004309 -0.0672159330  0.0739900177  0.0319923840 
##          2542          2544          2545          2549          2560 
##  0.1796889250  0.1302365038 -0.0420501050 -0.0047573162  0.0450846361 
##          2563          2568          2569          2575          2581 
##  0.0263390324            NA -0.0298419652  0.2017009054  0.1245426706 
##          2587          2590          2598          2603          2604 
##  0.1805547129            NA  0.3906822523  0.1716516033  0.1880868888 
##          2610          2611          2617          2621          2628 
##            NA  0.1382282393  0.0713431102  0.4227941017  0.0878106159 
##          2637          2654          2659          2662          2663 
##  0.0058700076  0.1380531932            NA  0.1993694606  0.1984555307 
##          2665          2672          2674          2675          2677 
##  0.1227667936  0.1873174896  0.0663214086  0.3681025169  0.1678348271 
##          2688          2689          2690          2691          2695 
##  0.3034952023  0.1280782489            NA  0.1733797921  0.2697886855 
##          2703          2712          2715          2717          2718 
##  0.1476512887            NA  0.1102610034  0.3024907827  0.2113935315 
##          2724          2726          2731          2732          2733 
##  0.1471281013  0.0187759371  0.0745090105  0.3012318299  0.0925117352 
##          2742          2745          2755          2766          2773 
## -0.0103774266  0.0904262944  0.0406844223  0.0076484701  0.0045136067 
##          2776          2778          2784          2790          2796 
##  0.2029372032  0.0047969090  0.1409931901  0.2735884715  0.2022482450 
##          2799          2807          2808          2809          2817 
##  0.1395758291  0.2795233784  0.0966271444 -0.0137800292 -0.0048763499 
##          2820          2831          2847          2848          2854 
##  0.4014697153 -0.0415262879            NA            NA  0.1225344130 
##          2858          2870          2878          2883          2885 
## -0.0548120605  0.0658086097  0.2908972460            NA  0.2762910016 
##          2897          2903          2904          2911          2917 
##  0.0427089317            NA  0.1797909208  0.0865110054  0.0565461940 
##          2931          2933          2952          2962          2969 
##  0.4226287736  0.0451111582  0.1033533104  0.4875393289  0.1508875600 
##          2973          2983          2987          2999          3020 
##  0.1448827215  0.0880208656            NA  0.1259073045  0.1465782240 
##          3027          3028          3029          3032          3034 
##  0.4237715676  0.1049495965  0.0565459491  0.3262443471            NA 
##          3037          3040          3041          3047          3048 
## -0.0904935506  0.0353253182  0.2841365023            NA  0.2662457973 
##          3054          3056          3057          3063          3065 
##  0.1104789160            NA  0.0226695034            NA  0.0417984111 
##          3075          3076          3084          3089          3096 
##  0.3726300073 -0.0076397956  0.1789386980  0.5339220022  0.0979883131 
##          3097          3098          3099          3103          3112 
##  0.3269854650  0.0859897277            NA  0.3651693706            NA 
##          3118          3119          3121          3133          3136 
##  0.2598935626            NA  0.0569047896  0.2568841536  0.1025981592 
##          3142          3156          3159          3160          3161 
## -0.0222525732  0.1997911812            NA  0.0768673244            NA 
##          3162          3169          3170          3171          3179 
##  0.2625237587  0.1242687710  0.0145656094  0.1312899618  0.0443367399 
##          3184          3186          3188          3195          3199 
## -0.0209089291  0.1562541100  0.2226791099 -0.0247944395  0.1794207951 
##          3203          3208          3209          3215          3217 
##  0.1359130957            NA -0.0093529847  0.2941419755  0.1206587808 
##          3222          3223          3228          3229          3235 
## -0.0409798328  0.0331283163  0.0801392287  0.0194108682            NA 
##          3238          3244          3245          3252          3255 
##  0.0498139979  0.1401391974  0.2863097634  0.1023377909  0.1855451369 
##          3263          3268          3275          3285          3287 
##  0.2284512076  0.1819523902  0.1571832379 -0.0017018500  0.2696124508 
##          3290          3291          3295          3306          3308 
##  0.2488959726            NA  0.0586532685  0.0336545296 -0.0207007258 
##          3315          3317          3328          3329          3334 
##  0.2687755361            NA  0.1871887369  0.1709661506  0.0523739130 
##          3341          3351          3362          3367          3369 
##  0.3790723359  0.1349077329  0.1513373361  0.2650190613 -0.0027755991 
##          3376          3380          3387          3388          3405 
##            NA  0.0663179325            NA  0.0887909600  0.2913259605 
##          3433          3441          3443          3454          3459 
##            NA  0.0724111343  0.2920941711  0.2772475598  0.3804796831 
##          3461          3477          3483          3491          3493 
##  0.2677411373  0.0875666433  0.0700776422  0.1450934786  0.0859704164 
##          3498          3505          3506          3512          3515 
##  0.1235887401  0.0707499931  0.1645490889  0.1199211239            NA 
##          3529          3530          3549          3572          3582 
##  0.2634984080  0.2673900720  0.1247685222            NA  0.0460273793 
##          3590          3593          3604          3611          3617 
##  0.3656182377  0.1969928141            NA  0.1512869858  0.4474299798 
##          3624          3628          3640          3645          3647 
##  0.1320732461  0.3382067469  0.1199382483            NA  0.0397246427 
##          3653          3659          3669          3672          3674 
##  0.1152945862  0.0838711353 -0.0223656214  0.5965566691            NA 
##          3679          3682          3685          3697          3701 
##  0.2026758561            NA  0.0505975974  0.1227634659 -0.0178281231 
##          3704          3705          3708          3711          3717 
##  0.1832078361  0.1047925247  0.1708843325  0.2182314331  0.1539643260 
##          3726          3738          3743          3749          3751 
##  0.1332952589            NA  0.1546035708  0.2061364761  0.1684979719 
##          3758          3760          3761          3763          3765 
##  0.1544540334  0.1304707588  0.2347189535  0.0790758801            NA 
##          3766          3769          3782          3788          3790 
##  0.0054331990  0.0973276689  0.0776152499            NA  0.2376577111 
##          3801          3802          3804          3805          3807 
##  0.0107255026  0.0474408644  0.2166370955  0.2566690411  0.1217543203 
##          3808          3813          3815          3816          3817 
## -0.0492331872  0.1694067891  0.1000628704  0.1927411458            NA 
##          3818          3825          3827          3828          3831 
##  0.4872831462  0.0038818663 -0.0443304138 -0.0008766468  0.3157322678 
##          3836          3845          3846          3848          3859 
##  0.1942660878  0.7939452533  0.0891173489            NA  0.0263665063 
##          3860          3861          3865          3866          3867 
##  0.0773994785 -0.0286566650            NA  0.1635588552  0.2165638924 
##          3868          3871          3872          3874          3883 
##  0.2406263381  0.2257834366  0.2233902043            NA  0.2809989908 
##          3884          3887          3890          3896          3897 
##            NA  0.2064343667  0.2445297398  0.3185666306  0.0518898831 
##          3899          3900          3901          3911          3920 
##  0.2902618931  0.1189355621  0.0375654319  0.0150443980  0.2618215190 
##          3927          3928          3933          3938          3942 
##  0.0229339959  0.6230484766  0.1846414893 -0.0298394849 -0.0134994817 
##          3950          3953          3958          3962          3965 
##  0.1712617267  0.4291100531            NA            NA -0.0322120171 
##          3972          3973          3977          3978          3979 
##  0.7098964186  0.1039563167  0.2559941374  0.1334474372  0.2157262963 
##          3983          3990          3993          4010          4016 
##  0.1162033518            NA  0.0113205836  0.3546519608 -0.0559034239 
##          4021          4025          4030          4037          4038 
##  0.0841975803  0.2038513125  0.4004898830  0.2813065729  0.1051760185 
##          4039          4048          4058          4067          4070 
## -0.0559493614  0.0703467022  0.3616990224  0.1311136570  0.1025051435 
##          4074          4076          4086          4088          4090 
##  0.1786346386  0.4390660747  0.1534787313  0.2077026536  0.3083556737 
##          4091          4094          4110          4115          4119 
##  0.2568447100  0.0478742184  0.2870082267  0.1484845236  0.0362198082 
##          4121          4125          4130          4135          4144 
##  0.1512303742  0.1753021329  0.2629104257  0.1481082159  0.1056682828 
##          4153          4158          4160          4161          4170 
##  0.2248162816  0.1407594695  0.2705910685            NA -0.0161683901 
##          4172          4174          4176          4185          4187 
##  0.1483831859  0.3138226807  0.1340954819  0.1521251108  0.0546466935 
##          4192          4193          4195          4206          4208 
## -0.0434250351  0.1516820812  0.5199622825  0.0749673565  0.1635776100 
##          4210          4213          4217          4220          4221 
##  0.3193342628  0.2608936365  0.1797421174  0.1206897498  0.2187109880 
##          4222          4227          4238 
##  0.1312639797  0.1972274395  0.1426509692
predicted_values <- predict(model, test_set[, colnames(test_set) !="TenYearCHD"])
actual_values <- test_set[,"TenYearCHD"]

results<- data.frame(predicted_values, actual_values)
head(results)
##    predicted_values actual_values
## 6        0.18641898             0
## 14       0.09284257             0
## 22               NA             0
## 47       0.33370530             0
## 50               NA             0
## 53       0.11558080             0

calculate accuracy and AUC

install.packages("pROC")
library(pROC)

# Assuming binary classification, predicting with your model
# Ensure 'predicted_values' contains the probabilities for the positive class
# If your model outputs factors or classes, adjust accordingly

# Converting probabilities to binary classification based on a threshold (e.g., 0.5)
predicted_class <- ifelse(predicted_values > 0.5, 1, 0)

# Calculating Accuracy
accuracy <- sum(predicted_class == actual_values, na.rm=T) / length(actual_values)

# Calculating AUC
roc_curve <- roc(actual_values, predicted_values)
auc_value <- auc(roc_curve)

# Output Accuracy and AUC
list(Accuracy = accuracy, AUC = auc_value)
## $Accuracy
## [1] 0.7429245
## 
## $AUC
## Area under the curve: 0.7053

draw ROC curve

# Drawing ROC curve using pROC package
library(pROC)

# Generate the ROC curve object
roc_obj <- roc(response = actual_values, predictor = predicted_values)

# Plotting the ROC curve
plot(roc_obj, main="ROC Curve", col="#1c61b6", xlim=c(1,0))

import data from an excel or csv

# make sure your data is in the same folder
# data<- rio::import("file.xlsx")

Using outbreaks library, get simulated Ebola data (ebola_sim_clean)

# library(outbreaks)  # activated above

ebola<- outbreaks::ebola_sim_clean

class(ebola)
## [1] "list"
str(ebola)
## List of 2
##  $ linelist:'data.frame':    5829 obs. of  11 variables:
##   ..$ case_id                : chr [1:5829] "d1fafd" "53371b" "f5c3d8" "6c286a" ...
##   ..$ generation             : int [1:5829] 0 1 1 2 2 0 3 3 2 3 ...
##   ..$ date_of_infection      : Date[1:5829], format: NA "2014-04-09" ...
##   ..$ date_of_onset          : Date[1:5829], format: "2014-04-07" "2014-04-15" ...
##   ..$ date_of_hospitalisation: Date[1:5829], format: "2014-04-17" "2014-04-20" ...
##   ..$ date_of_outcome        : Date[1:5829], format: "2014-04-19" NA ...
##   ..$ outcome                : Factor w/ 2 levels "Death","Recover": NA NA 2 1 2 NA 2 1 2 1 ...
##   ..$ gender                 : Factor w/ 2 levels "f","m": 1 2 1 1 1 1 1 1 2 2 ...
##   ..$ hospital               : Factor w/ 5 levels "Connaught Hospital",..: 2 1 3 NA 3 NA 1 4 3 5 ...
##   ..$ lon                    : num [1:5829] -13.2 -13.2 -13.2 -13.2 -13.2 ...
##   ..$ lat                    : num [1:5829] 8.47 8.46 8.48 8.46 8.45 ...
##  $ contacts:'data.frame':    3800 obs. of  3 variables:
##   ..$ infector: chr [1:3800] "d1fafd" "cac51e" "f5c3d8" "0f58c4" ...
##   ..$ case_id : chr [1:3800] "53371b" "f5c3d8" "0f58c4" "881bd4" ...
##   ..$ source  : Factor w/ 2 levels "funeral","other": 2 1 2 2 2 1 2 2 2 2 ...
ds<- ebola$linelist

Check data, clean it, and fix the dates

check the dataframe

class(ds)
## [1] "data.frame"
dim(ds)
## [1] 5829   11
head(ds)
##   case_id generation date_of_infection date_of_onset date_of_hospitalisation
## 1  d1fafd          0              <NA>    2014-04-07              2014-04-17
## 2  53371b          1        2014-04-09    2014-04-15              2014-04-20
## 3  f5c3d8          1        2014-04-18    2014-04-21              2014-04-25
## 4  6c286a          2              <NA>    2014-04-27              2014-04-27
## 5  0f58c4          2        2014-04-22    2014-04-26              2014-04-29
## 6  49731d          0        2014-03-19    2014-04-25              2014-05-02
##   date_of_outcome outcome gender           hospital       lon      lat
## 1      2014-04-19    <NA>      f  Military Hospital -13.21799 8.473514
## 2            <NA>    <NA>      m Connaught Hospital -13.21491 8.464927
## 3      2014-04-30 Recover      f              other -13.22804 8.483356
## 4      2014-05-07   Death      f               <NA> -13.23112 8.464776
## 5      2014-05-17 Recover      f              other -13.21016 8.452143
## 6      2014-05-07    <NA>      f               <NA> -13.23443 8.468572
colnames(ds)
##  [1] "case_id"                 "generation"             
##  [3] "date_of_infection"       "date_of_onset"          
##  [5] "date_of_hospitalisation" "date_of_outcome"        
##  [7] "outcome"                 "gender"                 
##  [9] "hospital"                "lon"                    
## [11] "lat"
str(ds)
## 'data.frame':    5829 obs. of  11 variables:
##  $ case_id                : chr  "d1fafd" "53371b" "f5c3d8" "6c286a" ...
##  $ generation             : int  0 1 1 2 2 0 3 3 2 3 ...
##  $ date_of_infection      : Date, format: NA "2014-04-09" ...
##  $ date_of_onset          : Date, format: "2014-04-07" "2014-04-15" ...
##  $ date_of_hospitalisation: Date, format: "2014-04-17" "2014-04-20" ...
##  $ date_of_outcome        : Date, format: "2014-04-19" NA ...
##  $ outcome                : Factor w/ 2 levels "Death","Recover": NA NA 2 1 2 NA 2 1 2 1 ...
##  $ gender                 : Factor w/ 2 levels "f","m": 1 2 1 1 1 1 1 1 2 2 ...
##  $ hospital               : Factor w/ 5 levels "Connaught Hospital",..: 2 1 3 NA 3 NA 1 4 3 5 ...
##  $ lon                    : num  -13.2 -13.2 -13.2 -13.2 -13.2 ...
##  $ lat                    : num  8.47 8.46 8.48 8.46 8.45 ...
summary(ds)
##    case_id            generation    date_of_infection    date_of_onset       
##  Length:5829        Min.   : 0.00   Min.   :2014-03-19   Min.   :2014-04-07  
##  Class :character   1st Qu.:13.00   1st Qu.:2014-09-06   1st Qu.:2014-09-16  
##  Mode  :character   Median :16.00   Median :2014-10-11   Median :2014-10-21  
##                     Mean   :16.58   Mean   :2014-10-23   Mean   :2014-11-01  
##                     3rd Qu.:20.00   3rd Qu.:2014-12-05   3rd Qu.:2014-12-16  
##                     Max.   :37.00   Max.   :2015-04-27   Max.   :2015-04-30  
##                                     NA's   :2087                             
##  date_of_hospitalisation date_of_outcome         outcome     gender  
##  Min.   :2014-04-17      Min.   :2014-04-19   Death  :2564   f:2934  
##  1st Qu.:2014-09-19      1st Qu.:2014-09-26   Recover:1963   m:2895  
##  Median :2014-10-23      Median :2014-11-01   NA's   :1302           
##  Mean   :2014-11-03      Mean   :2014-11-13                          
##  3rd Qu.:2014-12-17      3rd Qu.:2014-12-28                          
##  Max.   :2015-04-30      Max.   :2015-06-04                          
##                          NA's   :929                                 
##                                          hospital         lon        
##  Connaught Hospital                          :1737   Min.   :-13.27  
##  Military Hospital                           : 889   1st Qu.:-13.25  
##  other                                       : 876   Median :-13.23  
##  Princess Christian Maternity Hospital (PCMH): 420   Mean   :-13.23  
##  Rokupa Hospital                             : 451   3rd Qu.:-13.22  
##  NA's                                        :1456   Max.   :-13.21  
##                                                                      
##       lat       
##  Min.   :8.446  
##  1st Qu.:8.461  
##  Median :8.469  
##  Mean   :8.470  
##  3rd Qu.:8.480  
##  Max.   :8.492  
## 
# View(ds)

fix dates to make them ready for analysis (parsing)

ds<- ds %>% mutate(across(contains("date"), function(x) parse_date_time(x, orders=c("Ymd"))))

clean text, NA, etc

and this text https://citedrive.medium.com/top-data-wrangling-and-data-cleaning-packages-for-r-in-2023-a-comprehensive-guide-194de0cc0aa6#:~:text=table%2C%20plyr%2C%20janitor%2C%20stringr,on%20data%20analysis%20and%20insights.

ds<- ds %>% 
  janitor::clean_names() %>%                                      # make all column names same format
  mutate(across(is.character, stringr::str_squish)) %>%           # remove white spaces from text
  mutate(across(is.character, stringr::str_to_lower)) %>%       # make all text small letter
  filter()                                                        # last line , use if needed to filter data

alternative way to clean data using cleaner package (new to me)

https://cran.r-project.org/web/packages/cleaner/readme/README.html

ds %>% 
  mutate(outcome=cleaner::clean_factor(outcome, levels=c("Death", "Recover"))) %>% 
  mutate(case_id=cleaner::clean_character(case_id)) %>% head
##   case_id generation date_of_infection date_of_onset date_of_hospitalisation
## 1   dfafd          0              <NA>    2014-04-07              2014-04-17
## 2       b          1        2014-04-09    2014-04-15              2014-04-20
## 3     fcd          1        2014-04-18    2014-04-21              2014-04-25
## 4      ca          2              <NA>    2014-04-27              2014-04-27
## 5      fc          2        2014-04-22    2014-04-26              2014-04-29
## 6       d          0        2014-03-19    2014-04-25              2014-05-02
##   date_of_outcome outcome gender           hospital       lon      lat
## 1      2014-04-19    <NA>      f  Military Hospital -13.21799 8.473514
## 2            <NA>    <NA>      m Connaught Hospital -13.21491 8.464927
## 3      2014-04-30 Recover      f              other -13.22804 8.483356
## 4      2014-05-07   Death      f               <NA> -13.23112 8.464776
## 5      2014-05-17 Recover      f              other -13.21016 8.452143
## 6      2014-05-07    <NA>      f               <NA> -13.23443 8.468572

histogram

plot the incubation period

ds$incubation <- ds$date_of_onset - ds$date_of_infection

ggplot(data = ds) +
  geom_histogram(aes(x=incubation))

# calculate the average incubation period 


# calculate the CFR (case fatality rate)
tbl<- table(ds$outcome)
tbl["Recover"]/tbl["Death"]
##   Recover 
## 0.7656006
ggplot(data=ds)+
  geom_histogram(aes(x=incubation, fill=gender))+
  theme_classic()+
  labs(x="Incubation", y="Frequency")

add density line

ggplot(data = ds, aes(x = date_of_onset)) +
  geom_histogram(aes(y = ..density..), binwidth = 7) + # Adjust binwidth as needed
  geom_density(alpha = 0.2, fill = "#FF6666")

ggplot(data = ds, aes(x = incubation)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, alpha=0.5) + # Adjust binwidth as needed
  geom_density(alpha = 0.2, fill = "#FF6666")+
  theme_classic()

epikit package

https://cran.r-project.org/web/packages/epikit/vignettes/intro.html https://r4epis.netlify.app/outbreaks/

# first you need to install packages for the first use
# install.packages("epikit")
# install.packages("outbreaks")
library(epikit)
library(outbreaks)

df<- outbreaks::ebola_sim_clean

case_fatality_rate_df(df$linelist, 
  outcome == "Death", 
  group = gender,
  add_total = TRUE,
  mergeCI = TRUE
)
## # A tibble: 3 × 5
##   gender deaths population   cfr ci           
##   <fct>   <int>      <int> <dbl> <chr>        
## 1 f        1291       2280  56.6 (54.58-58.64)
## 2 m        1273       2247  56.7 (54.59-58.69)
## 3 Total    2564       4527  56.6 (55.19-58.08)

download Epola cases

linelist <- import("linelist_cleaned.xlsx")

parse dates

linelist$date_hospitalisation<- parse_date_time(linelist$date_hospitalisation, orders=c("Ymd"))

lineliest<- linelist %>% mutate(across(contains("date"),function(x) parse_date_time(x,orders=c("Ymd")) ))
colnames(linelist)
##  [1] "case_id"              "generation"           "date_infection"      
##  [4] "date_onset"           "date_hospitalisation" "date_outcome"        
##  [7] "outcome"              "gender"               "age"                 
## [10] "age_unit"             "age_years"            "age_cat"             
## [13] "age_cat5"             "hospital"             "lon"                 
## [16] "lat"                  "infector"             "source"              
## [19] "wt_kg"                "ht_cm"                "ct_blood"            
## [22] "fever"                "chills"               "cough"               
## [25] "aches"                "vomit"                "temp"                
## [28] "time_admission"       "bmi"                  "days_onset_hosp"
# check range of onset dates
ggplot(data = linelist)+
  geom_histogram(aes(x = date_onset))

# Daily histogram for Central Hospital
ggplot(data = linelist) +          # Set the data source
  geom_histogram(                      # Create a histogram
    mapping = aes(x = as_date(date_onset)))+     # Map the 'date_onset' column to the x-axis
  scale_x_date(                                             # Correctly format the x-axis as date
    labels = scales::date_format("%b-%y"),                  # Use abbreviated month and two-digit year for labels
    breaks = scales::date_breaks("1 month")) +              # Set breaks to be monthly       
  labs(title = "Central Hospital - Daily Cases")  # Add a title to the plot

library(ggplot2)

# Ensure 'date_onset' is of Date type before plotting
linelist$date_onset <- as.Date(linelist$date_onset)

# Daily histogram for Central Hospital with weekly cuts, starting on Mondays
ggplot(data = linelist) +                               # Set the data source
  geom_histogram(                                           # Create a histogram
    mapping = aes(x = date_onset, fill=hospital),  color="black",                         # Map the 'date_onset' column to the x-axis
    binwidth = 7) +                                         # Set the width of the bins to 7 days for weekly aggregation
  scale_x_date(                                             # Correctly format the x-axis as date
    labels = scales::date_format("%a\n%d %b\n %Y"),            # Use format "Day DD Mon YYYY" for labels
    breaks = scales::date_breaks("4 week"),                 # Set breaks to be weekly
    limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
    date_minor_breaks = "1 weeks") +                         # Minor breaks every week for clarity
  labs(title = "Central Hospital - Weekly Cases")          # Add a title to the plot

linelist$hospital<- factor(linelist$hospital, 
                           levels=c("Port Hospital", "Military Hospital", "Central Hospital", "St. Mark's Maternity Hospital (SMMH)", "Other", "Missing"))
# Daily histogram for Central Hospital with weekly cuts, starting on Mondays
ggplot(data = linelist) +                               # Set the data source
  geom_histogram(                                           # Create a histogram
    mapping = aes(x = date_onset, fill=hospital),  color="black",                         # Map the 'date_onset' column to the x-axis
    binwidth = 7) +                                         # Set the width of the bins to 7 days for weekly aggregation
  scale_x_date(                                             # Correctly format the x-axis as date
    labels = scales::date_format("%a\n%d %b\n %Y"),            # Use format "Day DD Mon YYYY" for labels
    breaks = scales::date_breaks("8 week"),                 # Set breaks to be weekly
    limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
    date_minor_breaks = "1 weeks") +                         # Minor breaks every week for clarity
  labs(title = "Central Hospital - Weekly Cases")          # Add a title to the plot

ggplot(data = linelist) +                               # Set the data source
  geom_histogram(                                           # Create a histogram
    mapping = aes(x = date_onset, fill=age_cat),  color="black",                         # Map the 'date_onset' column to the x-axis
    binwidth = 7) +                                         # Set the width of the bins to 7 days for weekly aggregation
  scale_x_date(                                             # Correctly format the x-axis as date
    labels = scales::date_format("%a\n%d %b\n %Y"),            # Use format "Day DD Mon YYYY" for labels
    breaks = scales::date_breaks("8 week"),                 # Set breaks to be weekly
    limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
    date_minor_breaks = "1 weeks") +                         # Minor breaks every week for clarity
  labs(title = "Central Hospital - Weekly Cases")  +        # Add a title to the plo
  facet_wrap(~age_cat)+
  gghighlight::gghighlight()+
  theme_classic()