# for the fist time you need to install pacman package
# install.packages("pacman")
# library(pacman)
pacman::p_load(
rio, # file import/export
here, # relative filepaths
lubridate, # working with dates/epiweeks
aweek, # alternative package for working with dates/epiweeks
incidence2, # epicurves of linelist data
i2extras, # supplement to incidence2
stringr, # search and manipulate character strings
forcats, # working with factors
RColorBrewer, # Color palettes from colorbrewer2.org
scales,
tidyverse, # data management + ggplot2 graphics
outbreaks, # contains outbreaks data
epikit, # tools for epidemiology
cleaner, # cleaning and checking data columns
janitor, # clean column names
stringr # manipulate text (strings)
)
here::set_here() # make your current directory the active one so it finds files within same folder, very important
a=5
b=6
a+b
## [1] 11
c="Hello"
c
## [1] "Hello"
d= c(1,5,8,9)
d*2
## [1] 2 10 16 18
d = c(d, 5)
d
## [1] 1 5 8 9 5
length(d)
## [1] 5
e=c("Iyad", "Ahmad", "Samer")
e
## [1] "Iyad" "Ahmad" "Samer"
c(e, 7)
## [1] "Iyad" "Ahmad" "Samer" "7"
g=e=="Ahmad"
e[g]
## [1] "Ahmad"
a<- c(12,23,54,65,78)
a[c(FALSE, FALSE, TRUE, TRUE, TRUE)]
## [1] 54 65 78
a[a>30]
## [1] 54 65 78
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# average mpg for all cars
mean(mtcars$mpg)
## [1] 20.09062
# Average mpg for cars with 6 cylinders
mtcars$cyl==6
## [1] TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
mean(mtcars$mpg[mtcars$cyl==6])
## [1] 19.74286
Using the Global Health Observatory data repository , extract the number of cases of malaria and number of deaths for Malaria in Sudan, Somalia, Djibouti and South Sudan per year. Plot the results for the 4 countries using ggplot in a line graph, each country using a different color. I expect 2 have 2 figures, each with 4 panels
Note: you may need to find a way to convert a number like this (193 232 [149 000-245 000]) to 193232. Also you may need to use function gather or pivot_longer to make your table ready for ggplot and facet_wrap to produce multiple facets.
malaria_cases<-read.csv("malaria_cases.csv", skip = 1)
malaria_deaths<- read.csv("malaria_deaths.csv", skip = 1)
malaria_cases<- malaria_cases %>% filter(Countries..territories.and.areas %in% c("Sudan", "Djibouti", "South Sudan", "Somalia"))
colnames(malaria_cases[,-1])<- as.numeric(gsub("X", "", colnames(malaria_cases[,-1])))
malaria_cases
## Countries..territories.and.areas X2021
## 1 Djibouti 58 445
## 2 Somalia 1 131 892 [688 000-1 767 000]
## 3 South Sudan 2 953 582 [1 643 000-4 972 000]
## 4 Sudan 3 325 874 [1 643 000-5 906 000]
## X2020 X2019
## 1 72 332 49 402
## 2 926 877 [570 000-1 451 000] 957 015 [624 000-1 400 000]
## 3 3 042 784 [1 664 000-5 116 000] 2 938 499 [1 652 000-4 811 000]
## 4 3 261 859 [1 614 000-5 853 000] 2 835 630 [1 360 000-5 217 000]
## X2018 X2017
## 1 24 845 14 671
## 2 913 696 [579 000-1 366 000] 820 464 [513 000-1 237 000]
## 3 2 898 120 [1 695 000-4 628 000] 2 960 681 [1 791 000-4 578 000]
## 4 2 485 883 [1 154 000-4 696 000] 2 187 618 [1 075 000-3 956 000]
## X2016 X2015
## 1 13 822 9 473
## 2 837 484 [560 000-1 192 000] 811 764 [572 000-1 112 000]
## 3 2 949 646 [1 901 000-4 401 000] 3 033 662 [2 108 000-4 190 000]
## 4 2 097 220 [1 185 000-3 462 000] 1 611 298 [1 053 000-2 378 000]
## X2014 X2013
## 1 9 439 1 684
## 2 668 820 [476 000-917 000] 534 460 [381 000-729 000]
## 3 3 098 291 [2 282 000-4 115 000] 3 173 418 [2 399 000-4 121 000]
## 4 1 427 317 [1 008 000-1 956 000] 1 257 489 [924 000-1 683 000]
## X2012 X2011
## 1 2 226 [1 700-2 700] 2 185 [1 700-2 700]
## 2 433 472 [313 000-589 000] 419 303 [300 000-571 000]
## 3 3 079 924 [2 358 000-3 971 000] 2 979 272 [2 308 000-3 782 000]
## 4 1 162 568 [868 000-1 530 000] 1 120 801 [836 000-1 478 000]
## X2010 X2009
## 1 1 010 2 686
## 2 500 313 [359 000-679 000] 454 522 [325 000-618 000]
## 3 2 775 374 [2 183 000-3 450 000] 2 640 300 [2 099 000-3 268 000]
## 4 1 104 287 [820 000-1 457 000] 1 140 986 [850 000-1 493 000]
## X2008 X2007
## 1 2 060 [1 600-2 500] 2 018 [1 600-2 500]
## 2 719 911 [525 000-965 000] 1 128 218 [825 000-1 510 000]
## 3 2 586 745 [2 000 000-3 267 000] 2 520 885 [1 819 000-3 395 000]
## 4 1 238 850 [887 000-1 678 000] 1 450 820 [961 000-2 105 000]
## X2006 X2005
## 1 1 976 [1 500-2 400] 1 938 [1 500-2 400]
## 2 1 274 766 [940 000-1 689 000] 1 392 857 [996 000-1 881 000]
## 3 2 450 862 [1 710 000-3 393 000] 2 342 844 [1 618 000-3 292 000]
## 4 1 597 895 [1 009 000-2 435 000] 1 614 440 [994 000-2 428 000]
## X2004 X2003
## 1 1 909 [1 500-2 300] 1 881 [1 400-2 300]
## 2 1 245 035 [874 000-1 743 000] 1 268 524 [857 000-1 814 000]
## 3 2 243 889 [1 514 000-3 253 000] 2 229 084 [1 421 000-3 353 000]
## 4 1 596 519 [988 000-2 429 000] 1 722 030 [1 061 000-2 663 000]
## X2002 X2001
## 1 1 841 [1 400-2 300] 1 786 [1 400-2 200]
## 2 1 236 218 [818 000-1 768 000] 1 283 399 [833 000-1 902 000]
## 3 2 250 591 [1 410 000-3 418 000] 2 283 971 [1 488 000-3 398 000]
## 4 1 837 673 [1 156 000-2 863 000] 2 326 967 [1 516 000-3 408 000]
## X2000
## 1 1 731 [1 300-2 100]
## 2 1 177 314 [677 000-1 906 000]
## 3 2 245 004 [1 533 000-3 162 000]
## 4 2 439 775 [1 624 000-3 493 000]
# Function to convert numbers in a specific column
convert_number <- function(x) {
# Remove any non-numeric characters
x <- gsub("[^0-9]", "", x)
# Convert to numeric
as.numeric(x)
}
dim(malaria_cases)
## [1] 4 23
# Apply the function to all columns
colnames(malaria_cases[,2:23])<- sapply(colnames(malaria_cases[,2:23]), convert_number)
malaria_cases<-malaria_cases %>% gather(year, cases, -Countries..territories.and.areas)
malaria_cases$year<- as.numeric(malaria_cases$year)
set.seed=123
age<- abs(rnorm(100, mean=40, sd=10)) # random numbers from random distribution
gender<- sample(c("M", "F"), 100, replace=T) # random gender using sampling
follow_up<- abs(rnorm(100, mean=36, sd=12)) # random FU , mean 36 months, sd 12, abs to get all results in +
stage<- c(rep("I", 25), rep("II", 25), rep("III", 25), rep("IV", 25)) # 25 patints of each stage
status1<- c(rep(0,22), rep(1,3)) # status more likely to be 0 (alive) for stage I and more likely to be 1 (dead) for advanced stages
status2<- c(rep(0,18), rep(1,7))
status3<- c(rep(0,12), rep(1,13))
status4<- c(rep(0,5), rep(1,20))
status<- c(status1, status2, status3, status4)
ds<- data_frame(age, gender, stage, follow_up, status)
ds
## # A tibble: 100 × 5
## age gender stage follow_up status
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 28.3 F I 33.6 0
## 2 48.2 F I 29.0 0
## 3 50.7 F I 29.3 0
## 4 31.5 M I 37.2 0
## 5 22.4 F I 46.6 0
## 6 51.6 F I 23.4 0
## 7 22.1 F I 29.3 0
## 8 39.0 F I 11.6 0
## 9 68.7 F I 50.9 0
## 10 46.5 F I 32.0 0
## # ℹ 90 more rows
library(survival)
library(survminer)
# Kaplan Meier for all patients
fit<- survfit(Surv(follow_up, status)~1, data=ds)
print(fit)
## Call: survfit(formula = Surv(follow_up, status) ~ 1, data = ds)
##
## n events median 0.95LCL 0.95UCL
## [1,] 100 43 48.4 41 55.5
summary(fit, c(12, 36))
## Call: survfit(formula = Surv(follow_up, status) ~ 1, data = ds)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 99 0 1.000 0.0000 1.000 1.000
## 36 46 23 0.718 0.0509 0.625 0.825
ggsurvplot(fit, ds)
There are 100 patients who had 43 events (death). The median survival of
this group is 46 months (95% confidence limits are 43 and 52)
The overall survival at 1 year was 99% +/- 1% The overall survival at 3 years was 76% +/- 4.6%
# Kaplan Meier for all patients according to stage
fit<- survfit(Surv(follow_up, status)~stage, data=ds)
print(fit)
## Call: survfit(formula = Surv(follow_up, status) ~ stage, data = ds)
##
## n events median 0.95LCL 0.95UCL
## stage=I 25 3 NA NA NA
## stage=II 25 7 55.5 53.0 NA
## stage=III 25 13 45.6 35.3 NA
## stage=IV 25 20 36.5 30.2 51.8
summary(fit, c(12, 36))
## Call: survfit(formula = Surv(follow_up, status) ~ stage, data = ds)
##
## stage=I
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 24 0 1.000 0.000 1.000 1
## 36 9 3 0.803 0.107 0.618 1
##
## stage=II
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 25 0 1.000 0.0000 1.000 1
## 36 13 2 0.909 0.0616 0.796 1
##
## stage=III
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 25 0 1.00 0.000 1.000 1.000
## 36 13 7 0.66 0.105 0.484 0.902
##
## stage=IV
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 25 0 1.000 0.000 1.000 1.000
## 36 11 11 0.539 0.103 0.371 0.784
ggsurvplot(fit, ds, pval=T)
There is a significant difference among patients according to stage with a p value of0.0012 (log-rank test)
Let us take data from Fremingham study as an example
# Download Framingham study
# URL of the dataset
url <- "https://ocw.mit.edu/courses/15-071-the-analytics-edge-spring-2017/5d689a024551e672313f7fd7eb1bee8d_framingham.csv"
# Downloading and reading the dataset
ds <- read.csv(url)
# Viewing the first few rows of the dataset
head(ds)
## male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1 1 39 4 0 0 0 0
## 2 0 46 2 0 0 0 0
## 3 1 48 1 1 20 0 0
## 4 0 61 3 1 30 0 0
## 5 0 46 3 1 23 0 0
## 6 0 43 2 0 0 0 0
## prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
## 1 0 0 195 106.0 70 26.97 80 77 0
## 2 0 0 250 121.0 81 28.73 95 76 0
## 3 0 0 245 127.5 80 25.34 75 70 0
## 4 1 0 225 150.0 95 28.58 65 103 1
## 5 0 0 285 130.0 84 23.10 85 85 0
## 6 1 0 228 180.0 110 30.30 77 99 0
class(ds)
## [1] "data.frame"
colnames(ds)
## [1] "male" "age" "education" "currentSmoker"
## [5] "cigsPerDay" "BPMeds" "prevalentStroke" "prevalentHyp"
## [9] "diabetes" "totChol" "sysBP" "diaBP"
## [13] "BMI" "heartRate" "glucose" "TenYearCHD"
dim(ds)
## [1] 4240 16
head(ds)
## male age education currentSmoker cigsPerDay BPMeds prevalentStroke
## 1 1 39 4 0 0 0 0
## 2 0 46 2 0 0 0 0
## 3 1 48 1 1 20 0 0
## 4 0 61 3 1 30 0 0
## 5 0 46 3 1 23 0 0
## 6 0 43 2 0 0 0 0
## prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose TenYearCHD
## 1 0 0 195 106.0 70 26.97 80 77 0
## 2 0 0 250 121.0 81 28.73 95 76 0
## 3 0 0 245 127.5 80 25.34 75 70 0
## 4 1 0 225 150.0 95 28.58 65 103 1
## 5 0 0 285 130.0 84 23.10 85 85 0
## 6 1 0 228 180.0 110 30.30 77 99 0
str(ds)
## 'data.frame': 4240 obs. of 16 variables:
## $ male : int 1 0 1 0 0 0 0 0 1 1 ...
## $ age : int 39 46 48 61 46 43 63 45 52 43 ...
## $ education : int 4 2 1 3 3 2 1 2 1 1 ...
## $ currentSmoker : int 0 0 1 1 1 0 0 1 0 1 ...
## $ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
## $ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
## $ prevalentHyp : int 0 0 0 1 0 1 0 0 1 1 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
## $ sysBP : num 106 121 128 150 130 ...
## $ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
## $ BMI : num 27 28.7 25.3 28.6 23.1 ...
## $ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
## $ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
## $ TenYearCHD : int 0 0 0 1 0 0 1 0 0 0 ...
library(ggplot2)
library(dplyr)
ds %>%
mutate(male=as.character(male)) %>%
ggplot(aes(x=age, fill=male))+
geom_histogram()+
theme_classic()+
labs(y="Number of individuals", x="Age in years", title="histogram", fill="Sex")
# install package if not install nbefore
# install.packages("gtsummary")
library(gtsummary)
ds %>%
mutate(across(c(male, education, currentSmoker, BPMeds, prevalentStroke, prevalentHyp, diabetes,TenYearCHD), as.character)) %>%
gtsummary::tbl_summary()
| Characteristic | N = 4,2401 |
|---|---|
| male | |
| Â Â Â Â 0 | 2,420 (57%) |
| Â Â Â Â 1 | 1,820 (43%) |
| age | 49 (42, 56) |
| education | |
| Â Â Â Â 1 | 1,720 (42%) |
| Â Â Â Â 2 | 1,253 (30%) |
| Â Â Â Â 3 | 689 (17%) |
| Â Â Â Â 4 | 473 (11%) |
| Â Â Â Â Unknown | 105 |
| currentSmoker | |
| Â Â Â Â 0 | 2,145 (51%) |
| Â Â Â Â 1 | 2,095 (49%) |
| cigsPerDay | 0 (0, 20) |
| Â Â Â Â Unknown | 29 |
| BPMeds | |
| Â Â Â Â 0 | 4,063 (97%) |
| Â Â Â Â 1 | 124 (3.0%) |
| Â Â Â Â Unknown | 53 |
| prevalentStroke | |
| Â Â Â Â 0 | 4,215 (99%) |
| Â Â Â Â 1 | 25 (0.6%) |
| prevalentHyp | |
| Â Â Â Â 0 | 2,923 (69%) |
| Â Â Â Â 1 | 1,317 (31%) |
| diabetes | |
| Â Â Â Â 0 | 4,131 (97%) |
| Â Â Â Â 1 | 109 (2.6%) |
| totChol | 234 (206, 263) |
| Â Â Â Â Unknown | 50 |
| sysBP | 128 (117, 144) |
| diaBP | 82 (75, 90) |
| BMI | 25.4 (23.1, 28.0) |
| Â Â Â Â Unknown | 19 |
| heartRate | 75 (68, 83) |
| Â Â Â Â Unknown | 1 |
| glucose | 78 (71, 87) |
| Â Â Â Â Unknown | 388 |
| TenYearCHD | |
| Â Â Â Â 0 | 3,596 (85%) |
| Â Â Â Â 1 | 644 (15%) |
| 1 n (%); Median (IQR) | |
# install.packages("epitools")
library(epitools)
tbl<- table(ds$prevalentHyp, ds$TenYearCHD)
tbl
##
## 0 1
## 0 2604 319
## 1 992 325
riskratio(tbl)
## $data
##
## 0 1 Total
## 0 2604 319 2923
## 1 992 325 1317
## Total 3596 644 4240
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## 0 1.000000 NA NA
## 1 2.261183 1.96556 2.601268
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## 0 NA NA NA
## 1 0 5.783326e-29 6.948619e-31
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
The chi-square test revealed a highly significant association between exposure and outcome, evidenced by a risk ratio (RR) of 2.26 and a p-value of less than 0.001, underscoring the strong link between having hypertension and the increased risk of developing CHF in 10 years.
Odds ratio are not suitable for cohort studies. They are used in case-control studies. Nevertheless, here is an application
epitools::oddsratio(tbl)
## $data
##
## 0 1 Total
## 0 2604 319 2923
## 1 992 325 1317
## Total 3596 644 4240
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## 0 1.000000 NA NA
## 1 2.673608 2.253526 3.172686
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## 0 NA NA NA
## 1 0 5.783326e-29 6.948619e-31
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# logistic regression
model<- glm(TenYearCHD~male+age+education+currentSmoker+cigsPerDay+
BPMeds+prevalentStroke+prevalentHyp+diabetes+totChol+
sysBP+diaBP+BMI+heartRate+glucose,
data=ds)
model
##
## Call: glm(formula = TenYearCHD ~ male + age + education + currentSmoker +
## cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes +
## totChol + sysBP + diaBP + BMI + heartRate + glucose, data = ds)
##
## Coefficients:
## (Intercept) male age education
## -0.5762585 0.0574246 0.0070575 -0.0056161
## currentSmoker cigsPerDay BPMeds prevalentStroke
## 0.0081070 0.0021636 0.0459077 0.1368394
## prevalentHyp diabetes totChol sysBP
## 0.0283079 0.0258235 0.0001293 0.0024734
## diaBP BMI heartRate glucose
## -0.0011731 0.0000790 -0.0003590 0.0011442
##
## Degrees of Freedom: 3657 Total (i.e. Null); 3642 Residual
## (582 observations deleted due to missingness)
## Null Deviance: 472.2
## Residual Deviance: 424.4 AIC: 2536
summary(model)
##
## Call:
## glm(formula = TenYearCHD ~ male + age + education + currentSmoker +
## cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes +
## totChol + sysBP + diaBP + BMI + heartRate + glucose, data = ds)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5762585 0.0789125 -7.303 3.45e-13 ***
## male 0.0574246 0.0124715 4.604 4.28e-06 ***
## age 0.0070575 0.0007691 9.176 < 2e-16 ***
## education -0.0056161 0.0056705 -0.990 0.322040
## currentSmoker 0.0081070 0.0181548 0.447 0.655226
## cigsPerDay 0.0021636 0.0007827 2.764 0.005733 **
## BPMeds 0.0459077 0.0346845 1.324 0.185726
## prevalentStroke 0.1368394 0.0753542 1.816 0.069460 .
## prevalentHyp 0.0283079 0.0174492 1.622 0.104825
## diabetes 0.0258235 0.0442138 0.584 0.559217
## totChol 0.0001293 0.0001353 0.956 0.339118
## sysBP 0.0024734 0.0004960 4.987 6.42e-07 ***
## diaBP -0.0011731 0.0008166 -1.437 0.150912
## BMI 0.0000790 0.0015460 0.051 0.959248
## heartRate -0.0003590 0.0004927 -0.729 0.466217
## glucose 0.0011442 0.0003022 3.786 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1165278)
##
## Null deviance: 472.19 on 3657 degrees of freedom
## Residual deviance: 424.39 on 3642 degrees of freedom
## (582 observations deleted due to missingness)
## AIC: 2535.6
##
## Number of Fisher Scoring iterations: 2
In a multivariable logistic regression model, males were at 5.7%+/-1.2% higher risk of developing the outcome of CHD at 10 years with a p value of <0.001. For every year increase in age therew as a 0.7% +/- 0.07% increase in risk of CHD (p<0.001)
# logistic regression
model<- glm(TenYearCHD~., data=ds)
summary(model)
##
## Call:
## glm(formula = TenYearCHD ~ ., data = ds)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5762585 0.0789125 -7.303 3.45e-13 ***
## male 0.0574246 0.0124715 4.604 4.28e-06 ***
## age 0.0070575 0.0007691 9.176 < 2e-16 ***
## education -0.0056161 0.0056705 -0.990 0.322040
## currentSmoker 0.0081070 0.0181548 0.447 0.655226
## cigsPerDay 0.0021636 0.0007827 2.764 0.005733 **
## BPMeds 0.0459077 0.0346845 1.324 0.185726
## prevalentStroke 0.1368394 0.0753542 1.816 0.069460 .
## prevalentHyp 0.0283079 0.0174492 1.622 0.104825
## diabetes 0.0258235 0.0442138 0.584 0.559217
## totChol 0.0001293 0.0001353 0.956 0.339118
## sysBP 0.0024734 0.0004960 4.987 6.42e-07 ***
## diaBP -0.0011731 0.0008166 -1.437 0.150912
## BMI 0.0000790 0.0015460 0.051 0.959248
## heartRate -0.0003590 0.0004927 -0.729 0.466217
## glucose 0.0011442 0.0003022 3.786 0.000155 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1165278)
##
## Null deviance: 472.19 on 3657 degrees of freedom
## Residual deviance: 424.39 on 3642 degrees of freedom
## (582 observations deleted due to missingness)
## AIC: 2535.6
##
## Number of Fisher Scoring iterations: 2
model
##
## Call: glm(formula = TenYearCHD ~ ., data = ds)
##
## Coefficients:
## (Intercept) male age education
## -0.5762585 0.0574246 0.0070575 -0.0056161
## currentSmoker cigsPerDay BPMeds prevalentStroke
## 0.0081070 0.0021636 0.0459077 0.1368394
## prevalentHyp diabetes totChol sysBP
## 0.0283079 0.0258235 0.0001293 0.0024734
## diaBP BMI heartRate glucose
## -0.0011731 0.0000790 -0.0003590 0.0011442
##
## Degrees of Freedom: 3657 Total (i.e. Null); 3642 Residual
## (582 observations deleted due to missingness)
## Null Deviance: 472.2
## Residual Deviance: 424.4 AIC: 2536
model<- glm(TenYearCHD~male+log(age)+education+currentSmoker*cigsPerDay+
BPMeds+prevalentStroke+prevalentHyp*(sysBP+diaBP)+diabetes*glucose+totChol+
BMI+heartRate,
data=ds)
summary(model)
##
## Call:
## glm(formula = TenYearCHD ~ male + log(age) + education + currentSmoker *
## cigsPerDay + BPMeds + prevalentStroke + prevalentHyp * (sysBP +
## diaBP) + diabetes * glucose + totChol + BMI + heartRate,
## data = ds)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4083191 0.1613695 -8.727 < 2e-16 ***
## male 0.0613811 0.0124986 4.911 9.46e-07 ***
## log(age) 0.3498910 0.0379387 9.223 < 2e-16 ***
## education -0.0060349 0.0056683 -1.065 0.28709
## currentSmoker 0.0063685 0.0181564 0.351 0.72579
## cigsPerDay 0.0021032 0.0007825 2.688 0.00722 **
## BPMeds 0.0323081 0.0349106 0.925 0.35479
## prevalentStroke 0.1368524 0.0752759 1.818 0.06914 .
## prevalentHyp -0.2589343 0.1161291 -2.230 0.02583 *
## sysBP 0.0015043 0.0007532 1.997 0.04589 *
## diaBP -0.0013646 0.0011676 -1.169 0.24258
## diabetes -0.1514163 0.0857292 -1.766 0.07744 .
## glucose 0.0003264 0.0004496 0.726 0.46794
## totChol 0.0001483 0.0001360 1.090 0.27566
## BMI 0.0003537 0.0015464 0.229 0.81910
## heartRate -0.0002473 0.0004937 -0.501 0.61648
## currentSmoker:cigsPerDay NA NA NA NA
## prevalentHyp:sysBP 0.0018302 0.0009491 1.928 0.05389 .
## prevalentHyp:diaBP 0.0004230 0.0015724 0.269 0.78793
## diabetes:glucose 0.0014558 0.0006066 2.400 0.01644 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1162823)
##
## Null deviance: 472.19 on 3657 degrees of freedom
## Residual deviance: 423.15 on 3639 degrees of freedom
## (582 observations deleted due to missingness)
## AIC: 2530.9
##
## Number of Fisher Scoring iterations: 2
# In R split my dataframe (ds) into training and testing sets (8:2)
library(dplyr)
# Splitting the dataframe into training and testing sets (80:20 ratio)
set.seed(123) # Setting seed for reproducibility
split_indices <- sample(nrow(ds), size = 0.8 * nrow(ds))
train_set <- ds[split_indices, ]
test_set <- ds[-split_indices, ]
dim(train_set)
## [1] 3392 16
dim(test_set)
## [1] 848 16
model<- glm(TenYearCHD~., data=train_set)
predict(model, test_set)
## 6 14 22 47 50
## 0.1864189843 0.0928425663 NA 0.3337052995 NA
## 53 57 58 61 62
## 0.1155807970 0.0934999615 0.2407292167 0.2206462145 0.3233701601
## 64 66 73 84 85
## 0.1449275108 0.2514058889 NA 0.1313757842 0.3304128478
## 86 91 93 98 101
## -0.0273923202 0.2377143747 0.0344578205 NA 0.1062705545
## 105 106 110 112 119
## 0.0427564992 0.1785929419 0.1674169336 NA -0.0220756928
## 126 127 128 130 135
## 0.0102604525 0.1178577438 0.0300841633 -0.0199405271 0.0909630413
## 146 149 157 168 169
## 0.1799988651 0.2226439412 0.2477008004 0.0025076691 0.1335498602
## 174 176 177 188 189
## -0.0100171280 0.3896023218 0.3054941289 0.2224388315 0.0957641167
## 190 196 205 208 210
## 0.1530904282 0.0307673440 0.0333729940 0.1547926226 0.1623570227
## 219 225 226 227 228
## 0.1264028185 0.1221019631 0.1894171588 0.2469175100 0.3574531357
## 233 236 248 254 257
## 0.1048957094 -0.0546037802 NA 0.2355558002 0.1464064820
## 265 273 274 293 298
## 0.2355775408 0.2211500886 NA 0.1598529461 -0.0749319108
## 300 301 305 306 308
## 0.2731083851 0.0354142354 0.0178103385 NA 0.1427704280
## 310 313 324 332 334
## NA 0.1026373319 0.0457480870 0.0483895634 0.4136885872
## 336 339 342 345 356
## 0.0614450375 NA 0.0293957768 0.2984557935 NA
## 360 363 364 367 375
## 0.3360324661 0.0545165790 0.0620199460 0.1344716546 0.2415831120
## 381 385 389 397 399
## 0.0824471726 0.1529543127 -0.0122244446 0.0645309944 0.3359130137
## 402 403 404 405 410
## 0.0597311305 -0.0016390065 -0.0016564302 0.1146670178 0.2313635396
## 411 412 414 417 425
## 0.0159728803 0.2494009095 NA 0.1678932970 0.0047234030
## 426 437 438 440 441
## NA NA 0.1928991506 0.1919923263 -0.0314169461
## 446 447 454 465 476
## 0.2716006010 0.4113820438 0.0657867654 0.2599799312 0.3986534958
## 478 486 495 501 507
## 0.0876140946 0.2960126339 0.2460211815 -0.0357970939 0.3133320992
## 508 509 511 514 516
## 0.1513688364 0.0999742484 0.0905702575 0.1607353078 0.1804274964
## 519 520 521 531 534
## 0.0921103239 NA 0.1575137656 -0.0128202303 0.0971599931
## 538 540 550 557 560
## 0.3015137141 0.0385032044 0.1874033315 0.0851801241 0.0048740633
## 562 563 565 566 568
## 0.0940836380 -0.0088446678 0.0721460121 0.1232486933 NA
## 569 571 573 575 576
## -0.0624342081 0.0542074955 0.0465671718 0.1175007462 0.1641151014
## 589 591 592 593 594
## 0.2706459966 0.4823007058 NA 0.0839625731 0.0413021067
## 595 596 600 604 608
## 0.0380591236 0.0606661070 0.1069926939 0.3032162140 0.1988579937
## 611 631 632 641 645
## 0.3038247965 0.3317610939 0.0906053229 0.1240345137 -0.0101627297
## 652 653 679 687 691
## 0.1456110034 0.1652106909 0.2749289581 0.1229033558 NA
## 700 709 715 717 722
## 0.4584636596 0.0871925682 NA 0.2235347228 0.4174775514
## 724 731 734 736 744
## 0.2213649992 -0.0068222095 0.2585390685 0.2513577375 0.1531443694
## 751 759 768 770 775
## 0.4341445497 0.1328199369 0.5195981975 NA 0.0815284440
## 776 784 792 795 798
## 0.3036778169 0.2061447121 0.2623656203 0.0682747754 NA
## 800 801 806 808 823
## 0.2264040390 0.1580630758 0.1432032743 0.3282131923 NA
## 824 826 840 844 853
## -0.0126300063 0.1388356267 0.1327908013 0.0137911695 0.0026050808
## 857 858 863 864 868
## 0.1290328589 0.0036171661 0.3509090348 0.1834377549 0.2206665124
## 892 895 897 898 902
## 0.0073656802 0.0941803170 NA 0.1373631067 0.2557264840
## 908 909 917 918 921
## 0.0302108471 0.3339970087 NA NA 0.1641505994
## 928 935 936 937 939
## 0.3343114839 0.2840355370 0.1777679741 NA 0.0814929847
## 940 941 944 947 962
## NA 0.2244792645 NA 0.1083311611 0.1451743326
## 973 995 996 1009 1012
## NA 0.0576059567 NA 0.3624896052 0.2744687157
## 1013 1016 1020 1024 1027
## 0.1519786484 0.3052461527 NA 0.1438601100 0.1084922594
## 1038 1039 1046 1052 1054
## 0.1140449251 NA 0.3647378494 0.3090091450 0.2548644835
## 1059 1067 1069 1084 1092
## 0.2223270762 0.0760271536 0.3872651253 0.1257410997 0.1742283869
## 1097 1102 1112 1115 1117
## 0.0953153339 NA 0.2911975335 0.2209837861 -0.0307113666
## 1124 1131 1133 1138 1145
## 0.2792216530 0.2263423122 0.1905132913 0.1286520944 0.2684649340
## 1168 1175 1176 1177 1181
## 0.0599358927 0.3030819286 NA 0.1248581734 0.0904256137
## 1198 1200 1206 1207 1209
## 0.4056744373 0.0938080492 0.1219626914 NA 0.0270601599
## 1216 1218 1219 1224 1228
## 0.1984819825 0.0168372717 0.0036087036 0.0329588461 NA
## 1237 1238 1248 1259 1262
## 0.2735910755 0.3746726680 -0.0094782412 0.2734972487 0.0564045902
## 1263 1280 1282 1285 1289
## 0.1895507926 0.2663913843 -0.0259454138 NA NA
## 1292 1303 1305 1309 1310
## 0.0894456790 0.3189392777 0.1895163491 0.3897115343 -0.0407242708
## 1318 1321 1324 1329 1330
## NA 0.1431872751 0.1911903700 0.1643337619 0.0870611465
## 1340 1353 1357 1369 1375
## 0.1625479491 0.1048357324 0.0308522560 NA 0.1513048049
## 1383 1390 1398 1402 1407
## 0.0453630468 0.2059051292 0.1111613491 -0.0009890861 0.0018151051
## 1408 1411 1418 1421 1430
## -0.0256325986 0.0593188092 0.1288810619 -0.0356474650 NA
## 1441 1443 1454 1460 1462
## 0.0260012121 0.4129457204 0.1423280515 -0.0549779408 0.0986903976
## 1463 1468 1470 1476 1479
## 0.1870264963 0.0876719626 0.2044365551 0.0762657323 NA
## 1484 1488 1490 1495 1496
## 0.1976691666 0.1231703110 0.0803057875 0.1581869047 0.0577298128
## 1502 1510 1511 1524 1525
## 0.0980996185 0.0393468858 0.1533099046 -0.0461960269 NA
## 1527 1529 1534 1535 1540
## 0.0121751205 0.0857297553 0.1268892511 0.2008108111 0.1910583368
## 1542 1546 1550 1564 1567
## 0.1224628302 0.1047981407 0.1447432155 0.2234963121 NA
## 1578 1583 1588 1591 1592
## 0.0919705268 0.0608889989 -0.0242780509 0.1433698356 0.2322682404
## 1595 1596 1598 1601 1615
## NA 0.3313159876 -0.0390673065 0.3629124572 0.3455674145
## 1618 1619 1620 1625 1626
## NA 0.1424691060 0.3079387964 NA NA
## 1628 1637 1645 1649 1659
## 0.0318775312 0.0934229327 -0.0231120605 0.1402887448 NA
## 1660 1661 1663 1664 1671
## 0.0755616078 0.3277313673 0.2297770987 0.3041345965 0.0400403710
## 1675 1676 1677 1678 1680
## 0.3672950895 NA 0.1520671349 0.1085358535 -0.0065506119
## 1683 1684 1685 1687 1688
## -0.0297410210 -0.0042253726 0.2782284505 0.0698589283 NA
## 1704 1729 1732 1749 1750
## 0.1357836139 0.1496187898 0.2891968741 0.2548993847 0.2167269944
## 1753 1763 1768 1771 1780
## 0.3105410600 0.1413408289 0.1157920373 NA 0.1073800261
## 1783 1786 1788 1794 1798
## 0.3608890622 NA 0.1270771878 0.2218915448 -0.0310430781
## 1801 1810 1812 1813 1816
## NA 0.1156621175 0.1401176701 0.2952365006 NA
## 1817 1819 1825 1830 1837
## 0.2804497698 0.3094338676 0.2295803696 0.0118184825 0.2461972049
## 1843 1844 1849 1871 1875
## 0.3387752095 0.2146543900 0.0563573381 NA 0.2459008316
## 1888 1893 1897 1902 1904
## -0.0050206318 0.1161171467 0.1715046874 0.2342013010 0.0310034836
## 1910 1913 1921 1924 1944
## 0.0987967339 0.0484807183 0.2265500901 0.0670837339 0.0963038365
## 1945 1947 1952 1954 1967
## 0.0426420071 0.1221475094 NA 0.0883100989 0.2811555708
## 1968 1969 1970 1971 1972
## 0.1595215013 0.0790929403 0.3595483108 0.0455360271 NA
## 1973 1977 1985 1995 1997
## 0.1743167657 0.1582226295 NA 0.2785055656 0.0482277206
## 1999 2000 2007 2010 2011
## NA 0.0229651391 NA 0.1357269241 NA
## 2012 2014 2018 2020 2042
## 0.2299443702 0.1743746580 0.0898937163 0.0662522128 0.4301335476
## 2043 2045 2047 2048 2050
## 0.1862645595 0.0170042731 0.0516636979 0.0905898691 NA
## 2052 2058 2061 2063 2064
## 0.0422237535 0.1478686200 NA 0.2372770557 0.1299000519
## 2070 2073 2079 2095 2098
## 0.0731284479 0.2726641923 -0.0290202281 -0.0474055107 0.2378487255
## 2106 2111 2123 2130 2133
## 0.1422449006 NA -0.0294460637 0.2600193355 0.4049588980
## 2145 2147 2149 2152 2156
## 0.1791473204 0.0574126192 0.1646157634 0.0098713316 0.0439298341
## 2157 2161 2177 2187 2188
## 0.0001158867 NA 0.0908317439 0.2183976009 0.4576800686
## 2194 2202 2205 2217 2223
## 0.1162640100 0.1103650515 0.5285196569 NA 0.3264762720
## 2239 2245 2247 2254 2259
## 0.3102891541 -0.0537799265 0.2410597442 0.2889529967 0.1384790336
## 2262 2264 2268 2275 2282
## 0.1516181436 NA NA 0.1543829599 0.1179292660
## 2295 2296 2298 2320 2336
## 0.0250705927 0.1968738131 0.1909883786 0.2625760824 0.3191871719
## 2337 2338 2350 2351 2362
## 0.0621272198 0.0133467212 0.3819402400 0.0595983224 0.0843368041
## 2372 2375 2377 2381 2382
## 0.2752938167 0.1813067559 0.2324425485 0.2755070422 -0.0053230625
## 2384 2388 2391 2392 2397
## 0.1626091730 0.0069301359 0.1065562451 0.1558471706 0.0033639339
## 2402 2408 2416 2420 2427
## -0.0178879605 0.1054178467 0.2774434639 0.1285407064 0.2187516131
## 2428 2430 2436 2443 2448
## 0.0646899454 0.1479578890 0.0582331829 0.0615400195 0.3293667101
## 2459 2462 2465 2469 2471
## 0.2325952565 0.5148275697 0.0163856186 NA 0.0684553055
## 2477 2480 2483 2490 2496
## 0.2551858413 0.1115460876 0.1125633200 0.0642934835 0.0496988766
## 2499 2501 2515 2520 2539
## 0.2729843918 0.1898004309 -0.0672159330 0.0739900177 0.0319923840
## 2542 2544 2545 2549 2560
## 0.1796889250 0.1302365038 -0.0420501050 -0.0047573162 0.0450846361
## 2563 2568 2569 2575 2581
## 0.0263390324 NA -0.0298419652 0.2017009054 0.1245426706
## 2587 2590 2598 2603 2604
## 0.1805547129 NA 0.3906822523 0.1716516033 0.1880868888
## 2610 2611 2617 2621 2628
## NA 0.1382282393 0.0713431102 0.4227941017 0.0878106159
## 2637 2654 2659 2662 2663
## 0.0058700076 0.1380531932 NA 0.1993694606 0.1984555307
## 2665 2672 2674 2675 2677
## 0.1227667936 0.1873174896 0.0663214086 0.3681025169 0.1678348271
## 2688 2689 2690 2691 2695
## 0.3034952023 0.1280782489 NA 0.1733797921 0.2697886855
## 2703 2712 2715 2717 2718
## 0.1476512887 NA 0.1102610034 0.3024907827 0.2113935315
## 2724 2726 2731 2732 2733
## 0.1471281013 0.0187759371 0.0745090105 0.3012318299 0.0925117352
## 2742 2745 2755 2766 2773
## -0.0103774266 0.0904262944 0.0406844223 0.0076484701 0.0045136067
## 2776 2778 2784 2790 2796
## 0.2029372032 0.0047969090 0.1409931901 0.2735884715 0.2022482450
## 2799 2807 2808 2809 2817
## 0.1395758291 0.2795233784 0.0966271444 -0.0137800292 -0.0048763499
## 2820 2831 2847 2848 2854
## 0.4014697153 -0.0415262879 NA NA 0.1225344130
## 2858 2870 2878 2883 2885
## -0.0548120605 0.0658086097 0.2908972460 NA 0.2762910016
## 2897 2903 2904 2911 2917
## 0.0427089317 NA 0.1797909208 0.0865110054 0.0565461940
## 2931 2933 2952 2962 2969
## 0.4226287736 0.0451111582 0.1033533104 0.4875393289 0.1508875600
## 2973 2983 2987 2999 3020
## 0.1448827215 0.0880208656 NA 0.1259073045 0.1465782240
## 3027 3028 3029 3032 3034
## 0.4237715676 0.1049495965 0.0565459491 0.3262443471 NA
## 3037 3040 3041 3047 3048
## -0.0904935506 0.0353253182 0.2841365023 NA 0.2662457973
## 3054 3056 3057 3063 3065
## 0.1104789160 NA 0.0226695034 NA 0.0417984111
## 3075 3076 3084 3089 3096
## 0.3726300073 -0.0076397956 0.1789386980 0.5339220022 0.0979883131
## 3097 3098 3099 3103 3112
## 0.3269854650 0.0859897277 NA 0.3651693706 NA
## 3118 3119 3121 3133 3136
## 0.2598935626 NA 0.0569047896 0.2568841536 0.1025981592
## 3142 3156 3159 3160 3161
## -0.0222525732 0.1997911812 NA 0.0768673244 NA
## 3162 3169 3170 3171 3179
## 0.2625237587 0.1242687710 0.0145656094 0.1312899618 0.0443367399
## 3184 3186 3188 3195 3199
## -0.0209089291 0.1562541100 0.2226791099 -0.0247944395 0.1794207951
## 3203 3208 3209 3215 3217
## 0.1359130957 NA -0.0093529847 0.2941419755 0.1206587808
## 3222 3223 3228 3229 3235
## -0.0409798328 0.0331283163 0.0801392287 0.0194108682 NA
## 3238 3244 3245 3252 3255
## 0.0498139979 0.1401391974 0.2863097634 0.1023377909 0.1855451369
## 3263 3268 3275 3285 3287
## 0.2284512076 0.1819523902 0.1571832379 -0.0017018500 0.2696124508
## 3290 3291 3295 3306 3308
## 0.2488959726 NA 0.0586532685 0.0336545296 -0.0207007258
## 3315 3317 3328 3329 3334
## 0.2687755361 NA 0.1871887369 0.1709661506 0.0523739130
## 3341 3351 3362 3367 3369
## 0.3790723359 0.1349077329 0.1513373361 0.2650190613 -0.0027755991
## 3376 3380 3387 3388 3405
## NA 0.0663179325 NA 0.0887909600 0.2913259605
## 3433 3441 3443 3454 3459
## NA 0.0724111343 0.2920941711 0.2772475598 0.3804796831
## 3461 3477 3483 3491 3493
## 0.2677411373 0.0875666433 0.0700776422 0.1450934786 0.0859704164
## 3498 3505 3506 3512 3515
## 0.1235887401 0.0707499931 0.1645490889 0.1199211239 NA
## 3529 3530 3549 3572 3582
## 0.2634984080 0.2673900720 0.1247685222 NA 0.0460273793
## 3590 3593 3604 3611 3617
## 0.3656182377 0.1969928141 NA 0.1512869858 0.4474299798
## 3624 3628 3640 3645 3647
## 0.1320732461 0.3382067469 0.1199382483 NA 0.0397246427
## 3653 3659 3669 3672 3674
## 0.1152945862 0.0838711353 -0.0223656214 0.5965566691 NA
## 3679 3682 3685 3697 3701
## 0.2026758561 NA 0.0505975974 0.1227634659 -0.0178281231
## 3704 3705 3708 3711 3717
## 0.1832078361 0.1047925247 0.1708843325 0.2182314331 0.1539643260
## 3726 3738 3743 3749 3751
## 0.1332952589 NA 0.1546035708 0.2061364761 0.1684979719
## 3758 3760 3761 3763 3765
## 0.1544540334 0.1304707588 0.2347189535 0.0790758801 NA
## 3766 3769 3782 3788 3790
## 0.0054331990 0.0973276689 0.0776152499 NA 0.2376577111
## 3801 3802 3804 3805 3807
## 0.0107255026 0.0474408644 0.2166370955 0.2566690411 0.1217543203
## 3808 3813 3815 3816 3817
## -0.0492331872 0.1694067891 0.1000628704 0.1927411458 NA
## 3818 3825 3827 3828 3831
## 0.4872831462 0.0038818663 -0.0443304138 -0.0008766468 0.3157322678
## 3836 3845 3846 3848 3859
## 0.1942660878 0.7939452533 0.0891173489 NA 0.0263665063
## 3860 3861 3865 3866 3867
## 0.0773994785 -0.0286566650 NA 0.1635588552 0.2165638924
## 3868 3871 3872 3874 3883
## 0.2406263381 0.2257834366 0.2233902043 NA 0.2809989908
## 3884 3887 3890 3896 3897
## NA 0.2064343667 0.2445297398 0.3185666306 0.0518898831
## 3899 3900 3901 3911 3920
## 0.2902618931 0.1189355621 0.0375654319 0.0150443980 0.2618215190
## 3927 3928 3933 3938 3942
## 0.0229339959 0.6230484766 0.1846414893 -0.0298394849 -0.0134994817
## 3950 3953 3958 3962 3965
## 0.1712617267 0.4291100531 NA NA -0.0322120171
## 3972 3973 3977 3978 3979
## 0.7098964186 0.1039563167 0.2559941374 0.1334474372 0.2157262963
## 3983 3990 3993 4010 4016
## 0.1162033518 NA 0.0113205836 0.3546519608 -0.0559034239
## 4021 4025 4030 4037 4038
## 0.0841975803 0.2038513125 0.4004898830 0.2813065729 0.1051760185
## 4039 4048 4058 4067 4070
## -0.0559493614 0.0703467022 0.3616990224 0.1311136570 0.1025051435
## 4074 4076 4086 4088 4090
## 0.1786346386 0.4390660747 0.1534787313 0.2077026536 0.3083556737
## 4091 4094 4110 4115 4119
## 0.2568447100 0.0478742184 0.2870082267 0.1484845236 0.0362198082
## 4121 4125 4130 4135 4144
## 0.1512303742 0.1753021329 0.2629104257 0.1481082159 0.1056682828
## 4153 4158 4160 4161 4170
## 0.2248162816 0.1407594695 0.2705910685 NA -0.0161683901
## 4172 4174 4176 4185 4187
## 0.1483831859 0.3138226807 0.1340954819 0.1521251108 0.0546466935
## 4192 4193 4195 4206 4208
## -0.0434250351 0.1516820812 0.5199622825 0.0749673565 0.1635776100
## 4210 4213 4217 4220 4221
## 0.3193342628 0.2608936365 0.1797421174 0.1206897498 0.2187109880
## 4222 4227 4238
## 0.1312639797 0.1972274395 0.1426509692
predicted_values <- predict(model, test_set[, colnames(test_set) !="TenYearCHD"])
actual_values <- test_set[,"TenYearCHD"]
results<- data.frame(predicted_values, actual_values)
head(results)
## predicted_values actual_values
## 6 0.18641898 0
## 14 0.09284257 0
## 22 NA 0
## 47 0.33370530 0
## 50 NA 0
## 53 0.11558080 0
install.packages("pROC")
library(pROC)
# Assuming binary classification, predicting with your model
# Ensure 'predicted_values' contains the probabilities for the positive class
# If your model outputs factors or classes, adjust accordingly
# Converting probabilities to binary classification based on a threshold (e.g., 0.5)
predicted_class <- ifelse(predicted_values > 0.5, 1, 0)
# Calculating Accuracy
accuracy <- sum(predicted_class == actual_values, na.rm=T) / length(actual_values)
# Calculating AUC
roc_curve <- roc(actual_values, predicted_values)
auc_value <- auc(roc_curve)
# Output Accuracy and AUC
list(Accuracy = accuracy, AUC = auc_value)
## $Accuracy
## [1] 0.7429245
##
## $AUC
## Area under the curve: 0.7053
# Drawing ROC curve using pROC package
library(pROC)
# Generate the ROC curve object
roc_obj <- roc(response = actual_values, predictor = predicted_values)
# Plotting the ROC curve
plot(roc_obj, main="ROC Curve", col="#1c61b6", xlim=c(1,0))
# make sure your data is in the same folder
# data<- rio::import("file.xlsx")
# library(outbreaks) # activated above
ebola<- outbreaks::ebola_sim_clean
class(ebola)
## [1] "list"
str(ebola)
## List of 2
## $ linelist:'data.frame': 5829 obs. of 11 variables:
## ..$ case_id : chr [1:5829] "d1fafd" "53371b" "f5c3d8" "6c286a" ...
## ..$ generation : int [1:5829] 0 1 1 2 2 0 3 3 2 3 ...
## ..$ date_of_infection : Date[1:5829], format: NA "2014-04-09" ...
## ..$ date_of_onset : Date[1:5829], format: "2014-04-07" "2014-04-15" ...
## ..$ date_of_hospitalisation: Date[1:5829], format: "2014-04-17" "2014-04-20" ...
## ..$ date_of_outcome : Date[1:5829], format: "2014-04-19" NA ...
## ..$ outcome : Factor w/ 2 levels "Death","Recover": NA NA 2 1 2 NA 2 1 2 1 ...
## ..$ gender : Factor w/ 2 levels "f","m": 1 2 1 1 1 1 1 1 2 2 ...
## ..$ hospital : Factor w/ 5 levels "Connaught Hospital",..: 2 1 3 NA 3 NA 1 4 3 5 ...
## ..$ lon : num [1:5829] -13.2 -13.2 -13.2 -13.2 -13.2 ...
## ..$ lat : num [1:5829] 8.47 8.46 8.48 8.46 8.45 ...
## $ contacts:'data.frame': 3800 obs. of 3 variables:
## ..$ infector: chr [1:3800] "d1fafd" "cac51e" "f5c3d8" "0f58c4" ...
## ..$ case_id : chr [1:3800] "53371b" "f5c3d8" "0f58c4" "881bd4" ...
## ..$ source : Factor w/ 2 levels "funeral","other": 2 1 2 2 2 1 2 2 2 2 ...
ds<- ebola$linelist
class(ds)
## [1] "data.frame"
dim(ds)
## [1] 5829 11
head(ds)
## case_id generation date_of_infection date_of_onset date_of_hospitalisation
## 1 d1fafd 0 <NA> 2014-04-07 2014-04-17
## 2 53371b 1 2014-04-09 2014-04-15 2014-04-20
## 3 f5c3d8 1 2014-04-18 2014-04-21 2014-04-25
## 4 6c286a 2 <NA> 2014-04-27 2014-04-27
## 5 0f58c4 2 2014-04-22 2014-04-26 2014-04-29
## 6 49731d 0 2014-03-19 2014-04-25 2014-05-02
## date_of_outcome outcome gender hospital lon lat
## 1 2014-04-19 <NA> f Military Hospital -13.21799 8.473514
## 2 <NA> <NA> m Connaught Hospital -13.21491 8.464927
## 3 2014-04-30 Recover f other -13.22804 8.483356
## 4 2014-05-07 Death f <NA> -13.23112 8.464776
## 5 2014-05-17 Recover f other -13.21016 8.452143
## 6 2014-05-07 <NA> f <NA> -13.23443 8.468572
colnames(ds)
## [1] "case_id" "generation"
## [3] "date_of_infection" "date_of_onset"
## [5] "date_of_hospitalisation" "date_of_outcome"
## [7] "outcome" "gender"
## [9] "hospital" "lon"
## [11] "lat"
str(ds)
## 'data.frame': 5829 obs. of 11 variables:
## $ case_id : chr "d1fafd" "53371b" "f5c3d8" "6c286a" ...
## $ generation : int 0 1 1 2 2 0 3 3 2 3 ...
## $ date_of_infection : Date, format: NA "2014-04-09" ...
## $ date_of_onset : Date, format: "2014-04-07" "2014-04-15" ...
## $ date_of_hospitalisation: Date, format: "2014-04-17" "2014-04-20" ...
## $ date_of_outcome : Date, format: "2014-04-19" NA ...
## $ outcome : Factor w/ 2 levels "Death","Recover": NA NA 2 1 2 NA 2 1 2 1 ...
## $ gender : Factor w/ 2 levels "f","m": 1 2 1 1 1 1 1 1 2 2 ...
## $ hospital : Factor w/ 5 levels "Connaught Hospital",..: 2 1 3 NA 3 NA 1 4 3 5 ...
## $ lon : num -13.2 -13.2 -13.2 -13.2 -13.2 ...
## $ lat : num 8.47 8.46 8.48 8.46 8.45 ...
summary(ds)
## case_id generation date_of_infection date_of_onset
## Length:5829 Min. : 0.00 Min. :2014-03-19 Min. :2014-04-07
## Class :character 1st Qu.:13.00 1st Qu.:2014-09-06 1st Qu.:2014-09-16
## Mode :character Median :16.00 Median :2014-10-11 Median :2014-10-21
## Mean :16.58 Mean :2014-10-23 Mean :2014-11-01
## 3rd Qu.:20.00 3rd Qu.:2014-12-05 3rd Qu.:2014-12-16
## Max. :37.00 Max. :2015-04-27 Max. :2015-04-30
## NA's :2087
## date_of_hospitalisation date_of_outcome outcome gender
## Min. :2014-04-17 Min. :2014-04-19 Death :2564 f:2934
## 1st Qu.:2014-09-19 1st Qu.:2014-09-26 Recover:1963 m:2895
## Median :2014-10-23 Median :2014-11-01 NA's :1302
## Mean :2014-11-03 Mean :2014-11-13
## 3rd Qu.:2014-12-17 3rd Qu.:2014-12-28
## Max. :2015-04-30 Max. :2015-06-04
## NA's :929
## hospital lon
## Connaught Hospital :1737 Min. :-13.27
## Military Hospital : 889 1st Qu.:-13.25
## other : 876 Median :-13.23
## Princess Christian Maternity Hospital (PCMH): 420 Mean :-13.23
## Rokupa Hospital : 451 3rd Qu.:-13.22
## NA's :1456 Max. :-13.21
##
## lat
## Min. :8.446
## 1st Qu.:8.461
## Median :8.469
## Mean :8.470
## 3rd Qu.:8.480
## Max. :8.492
##
# View(ds)
ds<- ds %>% mutate(across(contains("date"), function(x) parse_date_time(x, orders=c("Ymd"))))
and this text https://citedrive.medium.com/top-data-wrangling-and-data-cleaning-packages-for-r-in-2023-a-comprehensive-guide-194de0cc0aa6#:~:text=table%2C%20plyr%2C%20janitor%2C%20stringr,on%20data%20analysis%20and%20insights.
ds<- ds %>%
janitor::clean_names() %>% # make all column names same format
mutate(across(is.character, stringr::str_squish)) %>% # remove white spaces from text
mutate(across(is.character, stringr::str_to_lower)) %>% # make all text small letter
filter() # last line , use if needed to filter data
https://cran.r-project.org/web/packages/cleaner/readme/README.html
ds %>%
mutate(outcome=cleaner::clean_factor(outcome, levels=c("Death", "Recover"))) %>%
mutate(case_id=cleaner::clean_character(case_id)) %>% head
## case_id generation date_of_infection date_of_onset date_of_hospitalisation
## 1 dfafd 0 <NA> 2014-04-07 2014-04-17
## 2 b 1 2014-04-09 2014-04-15 2014-04-20
## 3 fcd 1 2014-04-18 2014-04-21 2014-04-25
## 4 ca 2 <NA> 2014-04-27 2014-04-27
## 5 fc 2 2014-04-22 2014-04-26 2014-04-29
## 6 d 0 2014-03-19 2014-04-25 2014-05-02
## date_of_outcome outcome gender hospital lon lat
## 1 2014-04-19 <NA> f Military Hospital -13.21799 8.473514
## 2 <NA> <NA> m Connaught Hospital -13.21491 8.464927
## 3 2014-04-30 Recover f other -13.22804 8.483356
## 4 2014-05-07 Death f <NA> -13.23112 8.464776
## 5 2014-05-17 Recover f other -13.21016 8.452143
## 6 2014-05-07 <NA> f <NA> -13.23443 8.468572
ds$incubation <- ds$date_of_onset - ds$date_of_infection
ggplot(data = ds) +
geom_histogram(aes(x=incubation))
# calculate the average incubation period
# calculate the CFR (case fatality rate)
tbl<- table(ds$outcome)
tbl["Recover"]/tbl["Death"]
## Recover
## 0.7656006
ggplot(data=ds)+
geom_histogram(aes(x=incubation, fill=gender))+
theme_classic()+
labs(x="Incubation", y="Frequency")
ggplot(data = ds, aes(x = date_of_onset)) +
geom_histogram(aes(y = ..density..), binwidth = 7) + # Adjust binwidth as needed
geom_density(alpha = 0.2, fill = "#FF6666")
ggplot(data = ds, aes(x = incubation)) +
geom_histogram(aes(y = ..density..), binwidth = 1, alpha=0.5) + # Adjust binwidth as needed
geom_density(alpha = 0.2, fill = "#FF6666")+
theme_classic()
https://cran.r-project.org/web/packages/epikit/vignettes/intro.html https://r4epis.netlify.app/outbreaks/
# first you need to install packages for the first use
# install.packages("epikit")
# install.packages("outbreaks")
library(epikit)
library(outbreaks)
df<- outbreaks::ebola_sim_clean
case_fatality_rate_df(df$linelist,
outcome == "Death",
group = gender,
add_total = TRUE,
mergeCI = TRUE
)
## # A tibble: 3 × 5
## gender deaths population cfr ci
## <fct> <int> <int> <dbl> <chr>
## 1 f 1291 2280 56.6 (54.58-58.64)
## 2 m 1273 2247 56.7 (54.59-58.69)
## 3 Total 2564 4527 56.6 (55.19-58.08)
https://epirhandbook.com/en/download-handbook-and-data.html#download-handbook-and-data
linelist <- import("linelist_cleaned.xlsx")
linelist$date_hospitalisation<- parse_date_time(linelist$date_hospitalisation, orders=c("Ymd"))
lineliest<- linelist %>% mutate(across(contains("date"),function(x) parse_date_time(x,orders=c("Ymd")) ))
colnames(linelist)
## [1] "case_id" "generation" "date_infection"
## [4] "date_onset" "date_hospitalisation" "date_outcome"
## [7] "outcome" "gender" "age"
## [10] "age_unit" "age_years" "age_cat"
## [13] "age_cat5" "hospital" "lon"
## [16] "lat" "infector" "source"
## [19] "wt_kg" "ht_cm" "ct_blood"
## [22] "fever" "chills" "cough"
## [25] "aches" "vomit" "temp"
## [28] "time_admission" "bmi" "days_onset_hosp"
# check range of onset dates
ggplot(data = linelist)+
geom_histogram(aes(x = date_onset))
# Daily histogram for Central Hospital
ggplot(data = linelist) + # Set the data source
geom_histogram( # Create a histogram
mapping = aes(x = as_date(date_onset)))+ # Map the 'date_onset' column to the x-axis
scale_x_date( # Correctly format the x-axis as date
labels = scales::date_format("%b-%y"), # Use abbreviated month and two-digit year for labels
breaks = scales::date_breaks("1 month")) + # Set breaks to be monthly
labs(title = "Central Hospital - Daily Cases") # Add a title to the plot
library(ggplot2)
# Ensure 'date_onset' is of Date type before plotting
linelist$date_onset <- as.Date(linelist$date_onset)
# Daily histogram for Central Hospital with weekly cuts, starting on Mondays
ggplot(data = linelist) + # Set the data source
geom_histogram( # Create a histogram
mapping = aes(x = date_onset, fill=hospital), color="black", # Map the 'date_onset' column to the x-axis
binwidth = 7) + # Set the width of the bins to 7 days for weekly aggregation
scale_x_date( # Correctly format the x-axis as date
labels = scales::date_format("%a\n%d %b\n %Y"), # Use format "Day DD Mon YYYY" for labels
breaks = scales::date_breaks("4 week"), # Set breaks to be weekly
limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
date_minor_breaks = "1 weeks") + # Minor breaks every week for clarity
labs(title = "Central Hospital - Weekly Cases") # Add a title to the plot
linelist$hospital<- factor(linelist$hospital,
levels=c("Port Hospital", "Military Hospital", "Central Hospital", "St. Mark's Maternity Hospital (SMMH)", "Other", "Missing"))
# Daily histogram for Central Hospital with weekly cuts, starting on Mondays
ggplot(data = linelist) + # Set the data source
geom_histogram( # Create a histogram
mapping = aes(x = date_onset, fill=hospital), color="black", # Map the 'date_onset' column to the x-axis
binwidth = 7) + # Set the width of the bins to 7 days for weekly aggregation
scale_x_date( # Correctly format the x-axis as date
labels = scales::date_format("%a\n%d %b\n %Y"), # Use format "Day DD Mon YYYY" for labels
breaks = scales::date_breaks("8 week"), # Set breaks to be weekly
limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
date_minor_breaks = "1 weeks") + # Minor breaks every week for clarity
labs(title = "Central Hospital - Weekly Cases") # Add a title to the plot
ggplot(data = linelist) + # Set the data source
geom_histogram( # Create a histogram
mapping = aes(x = date_onset, fill=age_cat), color="black", # Map the 'date_onset' column to the x-axis
binwidth = 7) + # Set the width of the bins to 7 days for weekly aggregation
scale_x_date( # Correctly format the x-axis as date
labels = scales::date_format("%a\n%d %b\n %Y"), # Use format "Day DD Mon YYYY" for labels
breaks = scales::date_breaks("8 week"), # Set breaks to be weekly
limits = c(min(linelist$date_onset), max(linelist$date_onset) + 7), # Extend limit to include the last week
date_minor_breaks = "1 weeks") + # Minor breaks every week for clarity
labs(title = "Central Hospital - Weekly Cases") + # Add a title to the plo
facet_wrap(~age_cat)+
gghighlight::gghighlight()+
theme_classic()