case <- c(exposed = 8, unexposed = 10)
case
## exposed unexposed
## 8 10
noncase <- c(exposed = 15, unexposed = 78)
noncase
## exposed unexposed
## 15 78
# column bind binds cases and noncases as columns, therefore upper left corner becomes the exposed cases of disease
cbind(case, noncase)
## case noncase
## exposed 8 15
## unexposed 10 78
rep("hello", 5)
## [1] "hello" "hello" "hello" "hello" "hello"
outcome <- c(rep("case", 18),rep("noncase",93))
outcome
## [1] "case" "case" "case" "case" "case" "case" "case"
## [8] "case" "case" "case" "case" "case" "case" "case"
## [15] "case" "case" "case" "case" "noncase" "noncase" "noncase"
## [22] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [29] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [36] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [43] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [50] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [57] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [64] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [71] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [78] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [85] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [92] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [99] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [106] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
tmp <- c("exposed", "unexposed")
exposure <- c(rep(tmp, c(8, 10)), rep(tmp, c(15, 78)))
exposure
## [1] "exposed" "exposed" "exposed" "exposed" "exposed" "exposed"
## [7] "exposed" "exposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [13] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [19] "exposed" "exposed" "exposed" "exposed" "exposed" "exposed"
## [25] "exposed" "exposed" "exposed" "exposed" "exposed" "exposed"
## [31] "exposed" "exposed" "exposed" "unexposed" "unexposed" "unexposed"
## [37] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [43] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [49] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [55] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [61] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [67] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [73] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [79] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [85] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [91] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [97] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [103] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [109] "unexposed" "unexposed" "unexposed"
# Column bind exposure and outcome in sars.obs'ervations
sars.obs <- cbind(exposure, outcome)
# First four rows of observations (passengers from the airplane)
sars.obs[1:4,]
## exposure outcome
## [1,] "exposed" "case"
## [2,] "exposed" "case"
## [3,] "exposed" "case"
## [4,] "exposed" "case"
# Second column of the sars.obs'ervation table. I.e. Outcomes
sars.obs[,2]
## [1] "case" "case" "case" "case" "case" "case" "case"
## [8] "case" "case" "case" "case" "case" "case" "case"
## [15] "case" "case" "case" "case" "noncase" "noncase" "noncase"
## [22] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [29] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [36] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [43] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [50] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [57] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [64] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [71] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [78] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [85] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [92] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [99] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [106] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
# These two cells are the totals of cases and noncases
table(outcome)
## outcome
## case noncase
## 18 93
# Recall the 2 x2 table from section 1.1.2
table(exposure, outcome)
## outcome
## exposure case noncase
## exposed 8 15
## unexposed 10 78
# Lets construct a 2x2 table from sars.obs'ervations and assign it to sars.tab
sars.tab <- table(sars.obs[ , 1], sars.obs[ , 2])
sars.tab
##
## case noncase
## exposed 8 15
## unexposed 10 78
# Get help file for matrix
# ?matrix
# Display arguments for matrix
args(matrix)
## function (data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL)
## NULL
# Create first matrix (by column)
tab <- matrix(c("a", "b", "c", "d"), 2, 2)
rownames(tab) <- c("Exposed", "unExposed")
colnames(tab) <- c("Disease", "noDisease")
tab
## Disease noDisease
## Exposed "a" "c"
## unExposed "b" "d"
# Create matrix by row this time
tab <- matrix(c("a", "b", "c", "d"), 2, 2, byrow=TRUE)
rownames(tab) <- c("Exposed", "unExposed")
colnames(tab) <- c("Disease", "noDisease")
tab
## Disease noDisease
## Exposed "a" "b"
## unExposed "c" "d"
# Read UGDP table
udat1 <- read.csv('http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/ugdp.txt', as.is = TRUE)
# stringsAsFactors=FALSE or as.is prevent strings or all other read values from converting to factor
# Note to self: There is an alternative way to read files in. Need to find the resource for that option.
# Create 2 by 2 table
udat1.tab = table(Outcome = udat1$Status, udat1$Treatment)
udat1.tab
##
## Outcome Placebo Tolbutamide
## Death 21 30
## Survivor 184 174
# Create 2 by 2 matrix by selecting Exposure and Disease status elements into the right cells
# Is there an easier way to insert the table output into the matrix by sorting the rows and columns according to desired order?
tab <- matrix(c(udat1.tab['Death','Tolbutamide'], udat1.tab['Survivor','Tolbutamide'], udat1.tab['Death','Placebo'], udat1.tab['Survivor','Placebo']), 2, 2, byrow=TRUE)
rownames(tab) <- c("Exposed", "Not Exposed")
colnames(tab) <- c("Disease", "No Disease")
names(dimnames(tab)) <- c('Exposure Status', 'Disease Status')
tab
## Disease Status
## Exposure Status Disease No Disease
## Exposed 30 174
## Not Exposed 21 184
# addmargins function below adds sum's to both rows and columns
addmargins(tab, FUN=sum, quiet = TRUE)
## Disease Status
## Exposure Status Disease No Disease sum
## Exposed 30 174 204
## Not Exposed 21 184 205
## sum 51 358 409
coltot <- apply(tab, 2, sum)
risks <- tab["Exposed",]/coltot
odds <- risks/(1-risks)
risk.ratio <- risks/risks[2]
odds.ratio <- odds/odds[2]
rbind(risks, risk.ratio, odds, odds.ratio)
## Disease No Disease
## risks 0.5882353 0.4860335
## risk.ratio 1.2102772 1.0000000
## odds 1.4285714 0.9456522
## odds.ratio 1.5106732 1.0000000
tab2 = round(tab/10)
tab2
## Disease Status
## Exposure Status Disease No Disease
## Exposed 3 17
## Not Exposed 2 18
## fisher test output is a list
fish <- fisher.test(tab2)
str(fish)
## List of 7
## $ p.value : num 1
## $ conf.int : num [1:2] 0.159 20.979
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num 1.57
## ..- attr(*, "names")= chr "odds ratio"
## $ null.value : Named num 1
## ..- attr(*, "names")= chr "odds ratio"
## $ alternative: chr "two.sided"
## $ method : chr "Fisher's Exact Test for Count Data"
## $ data.name : chr "tab2"
## - attr(*, "class")= chr "htest"
## access list component estimate
fish$estimate
## odds ratio
## 1.570083
## 2 alternative ways to access lower confidence interval
fish$conf.int[1]
## [1] 0.1589135
fish[[2]][1]
## [1] 0.1589135
orcalc <- function(x){
or <- (x[1,1]*x[2,2])/(x[1,2]*x[2,1])
pval <- fisher.test(x)$p.value
list(data=x, odd.ratio = or, p.value = pval)
}
#calculates or and pval and combines the table, or, and pvalue into a list called test
test <- orcalc(tab)
test
## $data
## Disease Status
## Exposure Status Disease No Disease
## Exposed 30 174
## Not Exposed 21 184
##
## $odd.ratio
## [1] 1.510673
##
## $p.value
## [1] 0.1812576
str(test)
## List of 3
## $ data : int [1:2, 1:2] 30 21 174 184
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Exposure Status: chr [1:2] "Exposed" "Not Exposed"
## .. ..$ Disease Status : chr [1:2] "Disease" "No Disease"
## $ odd.ratio: num 1.51
## $ p.value : num 0.181
x <- factor(1:7)
mode(x)
## [1] "numeric"
class(x)
## [1] "factor"
y <- c("1","2","3")
y
## [1] "1" "2" "3"
str(y)
## chr [1:3] "1" "2" "3"
## sum(y) and sum(x) error out
sum(as.numeric(y))
## [1] 6
sum(as.numeric(x))
## [1] 28
mths = c("March", "April", "January", "November", "January", "September", "October", "September", "November", "August", "January", "November", "November", "February", "May", "August", "July", "December", "August", "August", "September", "November", "February", "April")
# Most frequent months is November and there is no apparent order to the vector
table(mths)
## mths
## April August December February January July March May
## 2 4 1 2 3 1 1 1
## November October September
## 5 1 3
class(mths)
## [1] "character"
mode(mths)
## [1] "character"
f.mths = factor(mths, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), ordered = TRUE)
f.mths
## [1] March April January November January September October
## [8] September November August January November November February
## [15] May August July December August August September
## [22] November February April
## 12 Levels: January < February < March < April < May < June < ... < December
table(f.mths)
## f.mths
## January February March April May June July August
## 3 2 1 2 1 0 1 4
## September October November December
## 3 1 5 1
class(f.mths)
## [1] "ordered" "factor"
mode(f.mths)
## [1] "numeric"
mths[1]
## [1] "March"
mths[2]
## [1] "April"
# errors out
# as.numeric(mths[1]) < as.numeric(mths[2])
f.mths[1]
## [1] March
## 12 Levels: January < February < March < April < May < June < ... < December
f.mths[2]
## [1] April
## 12 Levels: January < February < March < April < May < June < ... < December
as.numeric(f.mths[1]) < as.numeric(f.mths[2])
## [1] TRUE
as.numeric(f.mths)[1] + as.numeric(f.mths)[2]
## [1] 7
mean(as.numeric(f.mths))
## [1] 6.916667
ses <- sample(c("Low", "Medium", "High"), 100, replace = TRUE)
ses
## [1] "Medium" "Medium" "Low" "Low" "High" "High" "Low" "High"
## [9] "High" "Low" "High" "Low" "Low" "Low" "High" "Low"
## [17] "Medium" "Low" "Low" "Medium" "High" "High" "Medium" "Low"
## [25] "Low" "Medium" "High" "Medium" "Medium" "Medium" "Medium" "High"
## [33] "High" "Medium" "Medium" "Low" "High" "High" "Medium" "High"
## [41] "Medium" "High" "High" "Low" "High" "High" "High" "High"
## [49] "High" "High" "High" "Low" "Medium" "High" "High" "High"
## [57] "Medium" "Low" "Low" "Low" "High" "Low" "Low" "Low"
## [65] "Medium" "Low" "Medium" "Low" "Medium" "High" "Low" "Low"
## [73] "High" "High" "Low" "Low" "Medium" "High" "Low" "High"
## [81] "Low" "High" "Medium" "High" "High" "Low" "High" "Medium"
## [89] "Medium" "Low" "Low" "Medium" "Medium" "High" "Medium" "Low"
## [97] "High" "Low" "Medium" "Low"
f.ses = factor(ses, levels = c("Low", "Medium", "High"), ordered = TRUE)
table(f.ses)
## f.ses
## Low Medium High
## 35 27 38
#load data file infert
data(infert)
# structure of infert
str(infert)
## 'data.frame': 248 obs. of 8 variables:
## $ education : Factor w/ 3 levels "0-5yrs","6-11yrs",..: 1 1 1 1 2 2 2 2 2 2 ...
## $ age : num 26 42 39 34 35 36 23 32 21 28 ...
## $ parity : num 6 1 6 4 3 4 1 2 1 2 ...
## $ induced : num 1 1 2 2 1 2 0 0 0 0 ...
## $ case : num 1 1 1 1 1 1 1 1 1 1 ...
## $ spontaneous : num 2 0 0 0 1 1 0 0 1 0 ...
## $ stratum : int 1 2 3 4 5 6 7 8 9 10 ...
## $ pooled.stratum: num 3 1 4 2 32 36 6 22 5 19 ...
# brwose infert data
infert
# display first 5 rows of infert
infert[1:5,]
# 10 records from the top of infert
head(infert,10)
# 15 records from the bottom of infert
tail(infert, 15)
mean(infert$age)
## [1] 31.50403
# mean of parity addressed by index for the third column and mean of parity by addessing it by name
mean(infert[[3]])
## [1] 2.092742
mean(infert$parity)
## [1] 2.092742
# calculate fivenum'bers for age and parity (minimum, first quartile, median, third quartile, maximum).
# compare to mean() and summary() functions
fivenum(infert$age)
## [1] 21.0 28.0 31.0 35.5 44.0
mean(infert$age)
## [1] 31.50403
summary(infert$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 28.00 31.00 31.50 35.25 44.00
fivenum(infert$parity)
## [1] 1 1 2 3 6
# cross tabulation of parity by education
table(infert$parity, infert$education)
##
## 0-5yrs 6-11yrs 12+ yrs
## 1 3 42 54
## 2 0 42 39
## 3 0 21 15
## 4 3 12 3
## 5 0 3 3
## 6 6 0 2
# install.packages("epiDisplay")
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
tabpct(infert$parity, infert$education, percent = "both", graph = FALSE)
##
## Original table
## infert$education
## infert$parity 0-5yrs 6-11yrs 12+ yrs Total
## 1 3 42 54 99
## 2 0 42 39 81
## 3 0 21 15 36
## 4 3 12 3 18
## 5 0 3 3 6
## 6 6 0 2 8
## Total 12 120 116 248
##
## Row percent
## infert$education
## infert$parity 0-5yrs 6-11yrs 12+ yrs Total
## 1 3 42 54 99
## (3) (42.4) (54.5) (100)
## 2 0 42 39 81
## (0) (51.9) (48.1) (100)
## 3 0 21 15 36
## (0) (58.3) (41.7) (100)
## 4 3 12 3 18
## (16.7) (66.7) (16.7) (100)
## 5 0 3 3 6
## (0) (50) (50) (100)
## 6 6 0 2 8
## (75) (0) (25) (100)
##
## Column percent
## infert$education
## infert$parity 0-5yrs % 6-11yrs % 12+ yrs %
## 1 3 (25) 42 (35.0) 54 (46.6)
## 2 0 (0) 42 (35.0) 39 (33.6)
## 3 0 (0) 21 (17.5) 15 (12.9)
## 4 3 (25) 12 (10.0) 3 (2.6)
## 5 0 (0) 3 (2.5) 3 (2.6)
## 6 6 (50) 0 (0.0) 2 (1.7)
## Total 12 (100) 120 (100) 116 (100)
plot(infert$education)
plot(infert$parity)
plot(infert$age)
plot(table(infert$parity))
hist(infert$age)
data(USArrests)
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
summary(USArrests$Murder)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.800 4.075 7.250 7.788 11.250 17.400
# distribution of number of states grouped in the range ([0-50), [50-100), etc ) of assault arrests
hist(USArrests$Assault)
f <- seq(95, 104, by = 0.1)
f
## [1] 95.0 95.1 95.2 95.3 95.4 95.5 95.6 95.7 95.8 95.9 96.0 96.1
## [13] 96.2 96.3 96.4 96.5 96.6 96.7 96.8 96.9 97.0 97.1 97.2 97.3
## [25] 97.4 97.5 97.6 97.7 97.8 97.9 98.0 98.1 98.2 98.3 98.4 98.5
## [37] 98.6 98.7 98.8 98.9 99.0 99.1 99.2 99.3 99.4 99.5 99.6 99.7
## [49] 99.8 99.9 100.0 100.1 100.2 100.3 100.4 100.5 100.6 100.7 100.8 100.9
## [61] 101.0 101.1 101.2 101.3 101.4 101.5 101.6 101.7 101.8 101.9 102.0 102.1
## [73] 102.2 102.3 102.4 102.5 102.6 102.7 102.8 102.9 103.0 103.1 103.2 103.3
## [85] 103.4 103.5 103.6 103.7 103.8 103.9 104.0
c <- 5/9*(f-32)
c
## [1] 35.00000 35.05556 35.11111 35.16667 35.22222 35.27778 35.33333 35.38889
## [9] 35.44444 35.50000 35.55556 35.61111 35.66667 35.72222 35.77778 35.83333
## [17] 35.88889 35.94444 36.00000 36.05556 36.11111 36.16667 36.22222 36.27778
## [25] 36.33333 36.38889 36.44444 36.50000 36.55556 36.61111 36.66667 36.72222
## [33] 36.77778 36.83333 36.88889 36.94444 37.00000 37.05556 37.11111 37.16667
## [41] 37.22222 37.27778 37.33333 37.38889 37.44444 37.50000 37.55556 37.61111
## [49] 37.66667 37.72222 37.77778 37.83333 37.88889 37.94444 38.00000 38.05556
## [57] 38.11111 38.16667 38.22222 38.27778 38.33333 38.38889 38.44444 38.50000
## [65] 38.55556 38.61111 38.66667 38.72222 38.77778 38.83333 38.88889 38.94444
## [73] 39.00000 39.05556 39.11111 39.16667 39.22222 39.27778 39.33333 39.38889
## [81] 39.44444 39.50000 39.55556 39.61111 39.66667 39.72222 39.77778 39.83333
## [89] 39.88889 39.94444 40.00000
t <- cbind(Fahrenheit = f, Celsius = c)
t
## Fahrenheit Celsius
## [1,] 95.0 35.00000
## [2,] 95.1 35.05556
## [3,] 95.2 35.11111
## [4,] 95.3 35.16667
## [5,] 95.4 35.22222
## [6,] 95.5 35.27778
## [7,] 95.6 35.33333
## [8,] 95.7 35.38889
## [9,] 95.8 35.44444
## [10,] 95.9 35.50000
## [11,] 96.0 35.55556
## [12,] 96.1 35.61111
## [13,] 96.2 35.66667
## [14,] 96.3 35.72222
## [15,] 96.4 35.77778
## [16,] 96.5 35.83333
## [17,] 96.6 35.88889
## [18,] 96.7 35.94444
## [19,] 96.8 36.00000
## [20,] 96.9 36.05556
## [21,] 97.0 36.11111
## [22,] 97.1 36.16667
## [23,] 97.2 36.22222
## [24,] 97.3 36.27778
## [25,] 97.4 36.33333
## [26,] 97.5 36.38889
## [27,] 97.6 36.44444
## [28,] 97.7 36.50000
## [29,] 97.8 36.55556
## [30,] 97.9 36.61111
## [31,] 98.0 36.66667
## [32,] 98.1 36.72222
## [33,] 98.2 36.77778
## [34,] 98.3 36.83333
## [35,] 98.4 36.88889
## [36,] 98.5 36.94444
## [37,] 98.6 37.00000
## [38,] 98.7 37.05556
## [39,] 98.8 37.11111
## [40,] 98.9 37.16667
## [41,] 99.0 37.22222
## [42,] 99.1 37.27778
## [43,] 99.2 37.33333
## [44,] 99.3 37.38889
## [45,] 99.4 37.44444
## [46,] 99.5 37.50000
## [47,] 99.6 37.55556
## [48,] 99.7 37.61111
## [49,] 99.8 37.66667
## [50,] 99.9 37.72222
## [51,] 100.0 37.77778
## [52,] 100.1 37.83333
## [53,] 100.2 37.88889
## [54,] 100.3 37.94444
## [55,] 100.4 38.00000
## [56,] 100.5 38.05556
## [57,] 100.6 38.11111
## [58,] 100.7 38.16667
## [59,] 100.8 38.22222
## [60,] 100.9 38.27778
## [61,] 101.0 38.33333
## [62,] 101.1 38.38889
## [63,] 101.2 38.44444
## [64,] 101.3 38.50000
## [65,] 101.4 38.55556
## [66,] 101.5 38.61111
## [67,] 101.6 38.66667
## [68,] 101.7 38.72222
## [69,] 101.8 38.77778
## [70,] 101.9 38.83333
## [71,] 102.0 38.88889
## [72,] 102.1 38.94444
## [73,] 102.2 39.00000
## [74,] 102.3 39.05556
## [75,] 102.4 39.11111
## [76,] 102.5 39.16667
## [77,] 102.6 39.22222
## [78,] 102.7 39.27778
## [79,] 102.8 39.33333
## [80,] 102.9 39.38889
## [81,] 103.0 39.44444
## [82,] 103.1 39.50000
## [83,] 103.2 39.55556
## [84,] 103.3 39.61111
## [85,] 103.4 39.66667
## [86,] 103.5 39.72222
## [87,] 103.6 39.77778
## [88,] 103.7 39.83333
## [89,] 103.8 39.88889
## [90,] 103.9 39.94444
## [91,] 104.0 40.00000
w <- 148
h <- 5.25
bmi <- (w/2.2)/(h/3.3)^2
bmi
## [1] 26.57959
bmi * 0.95
## [1] 25.25061
bmi * 0.90
## [1] 23.92163
bmi * 0.85
## [1] 22.59265
p.trans = 1/500
p.not.trans = 1 - p.trans
p.not.trans
## [1] 0.998
p.not.trans^365
## [1] 0.4815569
p.trans = 1/5000
p.not.trans = 1 - p.trans
p.not.trans
## [1] 0.9998
p.not.trans^365
## [1] 0.929594
lambda <- 10/100000
t <- c(1, 5, 10)
# CI = lambda*exp(-t)
CI = 1 - exp(-lambda * t)
cbind(rate = lambda, years = t, cumulative.incidence = CI)
## rate years cumulative.incidence
## [1,] 1e-04 1 0.0000999950
## [2,] 1e-04 5 0.0004998750
## [3,] 1e-04 10 0.0009995002
#AF = (Re-Ru)/Re = 1-1/RR
risk.unexp <- 1/100000
risk.exp <- c((1/50000),(1/10000),(1/1000))
rr = risk.exp / risk.unexp
af <- 1-1/rr
af.table <- cbind(unexposed = risk.unexp, exposed = risk.exp, attributes.fx = af)
af.table
## unexposed exposed attributes.fx
## [1,] 1e-05 2e-05 0.50
## [2,] 1e-05 1e-04 0.90
## [3,] 1e-05 1e-03 0.99
af.pop <- af*500/1400
af.table2 <- cbind(af.table, af.pop)
af.table2
## unexposed exposed attributes.fx af.pop
## [1,] 1e-05 2e-05 0.50 0.1785714
## [2,] 1e-05 1e-04 0.90 0.3214286
## [3,] 1e-05 1e-03 0.99 0.3535714
curve(x/(1 - x), 0, 1)
curve(log(x/(1 - x)), 0, 1)
# Type of Exposure Rate per 10,000
exp_type <- c("BT", "IDU", "RAI", "PNS", "RPVI", "IAI", "IPVI", "ROI", "IOI")
exp_rate <- c(9000,67,50,30,10,6.5,5,1,0.5)
exp_risk <- exp_rate / 10000
exp_odds <- exp_risk / (1-exp_risk)
exp_log_odds <- log(exp_odds)
hiv_tx <- data.frame(exp_type, exp_rate, exp_risk, exp_odds, exp_log_odds)
hiv_tx
# Here is a basic Plot
plot(exp_log_odds, type = "b")
# Now, we plot a professional looking graphic
# Calculate range from min to max values of exp_log_odds
g_range <- round(range(min(exp_log_odds), max(exp_log_odds)),2)
# Graph exp_log_odds using y axis that ranges from min to max values of exp_log_odds
# Turn off axes and annotations (axis labels) so we can specify them ourself
plot(exp_log_odds, type="o", col="blue", ylim=g_range, axes=FALSE, ann=FALSE)
# Make x axis using exposure type labels
axis(1, at=1:9, lab=exp_type)
# Make y axis with horizontal labels that display ticks at every 2 marks.
axis(2, las=1, at=2*g_range[1]:g_range[2])
# Create box around plot
box()
# Create a title with a red, bold/italic font
title(main="Log Odds of HIV Transmission", col.main="red", font.main=4)
# Label the x and y axes with dark green text
title(xlab="Exposure Type", col.lab=rgb(0,0.5,0))
title(ylab="Log Odds", col.lab=rgb(0,0.5,0))
# Create a legend at (1, g_range[2]) that is slightly smaller (cex) and uses the same line colors and points used by the actual plots
legend(1.5, g_range[2], c("Log Odds of Different Exposures"), cex=0.8, col=c("blue"), pch=21:22, lty=1:2)
In the Scottish Health Study 85 of 1821 people who lived in rented apartments had coronary artery disease, compared to 77 of 2477 people who owned their homes.
Create a named 2-by-2 matrix object from this data Calculate the row totals. (Hint: use apply()) Calculate the risk ratio and odds ratio for these data. Create a matrix of the risks, relative risks, odds, and odds ratios
dat <- matrix(c(85,(1821-85),77,(2477-77)),2,2)
rownames(dat) <- c('Diseased','Survivors')
colnames(dat) <- c('Renters','Home Owners')
rowtot <- apply(dat, 1, sum) #row totals
coltot <- apply(dat, 2, sum) #column totals
# Alternatively addmargins(dat) displays row and column totals added to the margins of the matrix dat
risks <- dat['Diseased', ]/coltot
risk.ratio <- risks/risks[2] #risk ratio
odds <- risks/(1 - risks)
odds.ratio <- odds/odds[2] #odds ratio
dat # display 2x2 table
## Renters Home Owners
## Diseased 85 77
## Survivors 1736 2400
rbind(risks,risk.ratio,odds,odds.ratio) # display results
## Renters Home Owners
## risks 0.04667765 0.03108599
## risk.ratio 1.50156543 1.00000000
## odds 0.04896313 0.03208333
## odds.ratio 1.52612365 1.00000000
# Read UGDP table
udat <- read.csv('http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/ugdp.txt', header=TRUE, stringsAsFactors = FALSE)
str(udat)
## 'data.frame': 409 obs. of 3 variables:
## $ Status : chr "Death" "Death" "Death" "Death" ...
## $ Treatment: chr "Tolbutamide" "Tolbutamide" "Tolbutamide" "Tolbutamide" ...
## $ Agegrp : chr "<55" "<55" "<55" "<55" ...
#Number of observations in the data set: row count of udat
length(udat[,1])
## [1] 409
nrow(udat)
## [1] 409
#column count of udat
length(udat[1,])
## [1] 3
ncol(udat)
## [1] 3
# How many participants received Tolbutamide?
nrow(udat[udat$Treatment=="Tolbutamide",])
## [1] 204
table(udat$Treatment)
##
## Placebo Tolbutamide
## 205 204
# How many participants were under the age of 55?
nrow(udat[udat$Agegrp=="<55",])
## [1] 226
table(udat$Agegrp)
##
## <55 55+
## 226 183
# Calculate the odds ratio for the association of tolbutamide with death.
tmpdat = table(udat$Status, udat$Treatment)
dat <- tmpdat[,2:1]
coltot <- apply(dat, 2, sum) #column totals
risks <- dat['Death', ]/coltot
risk.ratio <- risks/risks[2] #risk ratio
odds <- risks/(1 - risks)
odds.ratio <- odds/odds[2] #odds ratio
dat # display 2x2 table
##
## Tolbutamide Placebo
## Death 30 21
## Survivor 174 184
rbind(risks,risk.ratio,odds,odds.ratio) # display results
## Tolbutamide Placebo
## risks 0.1470588 0.1024390
## risk.ratio 1.4355742 1.0000000
## odds 0.1724138 0.1141304
## odds.ratio 1.5106732 1.0000000
# Alternative method: a*d/b*c
a = dat[1,1]
b = dat[1,2]
c = dat[2,1]
d = dat[2,2]
uor = a*d/(b*c)
OR_CI_h=exp(log(uor)+1.96*sqrt(1/a+1/b+1/c+1/d))
OR_CI_l=exp(log(uor)-1.96*sqrt(1/a+1/b+1/c+1/d))
uor
## [1] 1.510673
OR_CI_l
## [1] 0.8332882
OR_CI_h
## [1] 2.738709
library(epitools)
##
## Attaching package: 'epitools'
## The following object is masked from 'package:survival':
##
## ratetable
epitab(dat)
## $tab
##
## Tolbutamide p0 Placebo p1 oddsratio lower upper
## Death 30 0.1470588 21 0.102439 1.000000 NA NA
## Survivor 174 0.8529412 184 0.897561 1.510673 0.8332973 2.738679
##
## p.value
## Death NA
## Survivor 0.1812576
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
cars<-read.table("http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/cars.txt", header=T, stringsAsFactors=F)
head(cars)
str(cars)
## 'data.frame': 96 obs. of 4 variables:
## $ safety: int 0 0 0 0 0 0 0 0 0 0 ...
## $ type : chr "Medium" "Sport/Utility" "Medium" "Small" ...
## $ region: chr "NAmerica" "NAmerica" "NAmerica" "NAmerica" ...
## $ weight: num 3.4 4.18 3.14 2.6 3.08 ...
prop.table(table(cars$region))
##
## Asia NAmerica
## 0.3645833 0.6354167
tmptbl <- table(cars$region, cars$safety)
cars_tbl <- tmptbl[,2:1]
epitab(cars_tbl,method="oddsratio")
## $tab
##
## 1 p0 0 p1 oddsratio lower upper p.value
## Asia 15 0.5 20 0.3030303 1.0 NA NA NA
## NAmerica 15 0.5 46 0.6969697 2.3 0.9468078 5.587195 0.07175376
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
chisq.test(cars_tbl)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: cars_tbl
## X-squared = 2.6562, df = 1, p-value = 0.1031
big <- cars$type == "Large"|cars$type == "Sport/Utility"
cars$cat[big] <- "big"
cars$cat[!big] <- "small"
head(cars)
tmparr <- xtabs(~region+safety+cat, data=cars)
dat <- tmparr[,2:1,]; dat
## , , cat = big
##
## safety
## region 1 0
## Asia 1 4
## NAmerica 1 26
##
## , , cat = small
##
## safety
## region 1 0
## Asia 14 16
## NAmerica 14 20
dat_big <- tmparr[,2:1,"big"]
dat_sml <- tmparr[,2:1,"small"]
epitab(dat_big)
## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect
## $tab
## safety
## region 1 p0 0 p1 oddsratio lower upper p.value
## Asia 1 0.5 4 0.1333333 1.0 NA NA NA
## NAmerica 1 0.5 26 0.8666667 6.5 0.335154 126.0614 0.2923387
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
epitab(dat_sml)
## $tab
## safety
## region 1 p0 0 p1 oddsratio lower upper p.value
## Asia 14 0.5 16 0.4444444 1.00 NA NA NA
## NAmerica 14 0.5 20 0.5555556 1.25 0.464273 3.365477 0.8013262
##
## $measure
## [1] "wald"
##
## $conf.level
## [1] 0.95
##
## $pvalue
## [1] "fisher.exact"
mantelhaen.test(dat)
##
## Mantel-Haenszel chi-squared test with continuity correction
##
## data: dat
## Mantel-Haenszel X-squared = 0.26628, df = 1, p-value = 0.6058
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.559892 3.657598
## sample estimates:
## common odds ratio
## 1.431034
str(udat)
## 'data.frame': 409 obs. of 3 variables:
## $ Status : chr "Death" "Death" "Death" "Death" ...
## $ Treatment: chr "Tolbutamide" "Tolbutamide" "Tolbutamide" "Tolbutamide" ...
## $ Agegrp : chr "<55" "<55" "<55" "<55" ...
tmparr <- xtabs(~Status+Treatment+Agegrp, data=udat)
udat_arr <- tmparr[,2:1,]
mantelhaen.test(udat_arr)
##
## Mantel-Haenszel chi-squared test with continuity correction
##
## data: udat_arr
## Mantel-Haenszel X-squared = 0.87967, df = 1, p-value = 0.3483
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7625152 2.5820153
## sample estimates:
## common odds ratio
## 1.403149
# OR are different 1.87 and 1.24, MH OR 1.40 lower than unadj OR 1.51 -> Confounding
rnorm(16,20,10) # (n=number of observations, mean=default zero, sd=default one)
## [1] 27.834141 12.215918 18.530816 13.793440 32.275193 45.266087 32.345679
## [8] 3.616576 26.687085 19.005346 20.999690 18.116655 15.439688 25.432540
## [15] 20.207304 20.906668
m <- matrix(round(rnorm(16, 20, 10)), 4, 4)
m
## [,1] [,2] [,3] [,4]
## [1,] 3 30 21 26
## [2,] 24 7 19 7
## [3,] 8 26 25 22
## [4,] 23 2 34 20
apply(m, 1, min) # smallest observation in each row (1st dimension of the array)
## [1] 3 7 8 2
apply(m, 2, sum) # column totals
## [1] 58 65 99 75
a <- array(round(rnorm(16, 20, 10)), dim = c(2, 4, 2))
a
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] 15 26 20 26
## [2,] 22 28 41 22
##
## , , 2
##
## [,1] [,2] [,3] [,4]
## [1,] 36 16 7 21
## [2,] 13 24 25 24
apply(a, 1, mean)
## [1] 20.875 24.875
sex <- c("male", "female")
race <- c("w", "b", "h", "o")
tx <- c("yes", "no")
dimnames(a) <- list(gender = sex, race = race, treatement = tx)
a
## , , treatement = yes
##
## race
## gender w b h o
## male 15 26 20 26
## female 22 28 41 22
##
## , , treatement = no
##
## race
## gender w b h o
## male 36 16 7 21
## female 13 24 25 24
apply(a, c(2, 3), mean) # apply the function mean to each column (race) - average age of w/b/h/o treated/not treated
## treatement
## race yes no
## w 18.5 24.5
## b 27.0 20.0
## h 30.5 16.0
## o 24.0 22.5
apply(a, c(1, 3), mean) # apply the function mean to each column (gender) - average age of m/f - treated/not treated
## treatement
## gender yes no
## male 21.75 20.0
## female 28.25 21.5
# Alternatively colMeans(), rowMeans(), colSums(), rowSums(), addmargins()
dat <- data.frame(age = (round(rnorm(10, 5, 1))), clinic = sample(c("a", "b", "c", "d", "e")))
tapply(dat$age, dat$clinic, mean)
## a b c d e
## 5.5 5.0 4.0 5.5 5.0
dat <- data.frame(age = (round(rnorm(100, 5, 1))), clinic = sample(c("a", "b", "c", "d", "e")), treatment = sample(c("Tx","noTx")))
b <- tapply(dat$age, dat[, c("clinic","treatment")], mean)
b
## treatment
## clinic noTx Tx
## a 4.4 5.1
## b 4.7 4.9
## c 5.2 5.8
## d 4.7 5.2
## e 5.9 5.3
# Error in FUN(dd[x, ], ...) : could not find function "FUN"
c <- by(dat$age, dat[, c(2, 3)], mean)
c
## clinic: a
## treatment: noTx
## [1] 4.4
## ------------------------------------------------------------
## clinic: b
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------
## clinic: c
## treatment: noTx
## [1] 5.2
## ------------------------------------------------------------
## clinic: d
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------
## clinic: e
## treatment: noTx
## [1] 5.9
## ------------------------------------------------------------
## clinic: a
## treatment: Tx
## [1] 5.1
## ------------------------------------------------------------
## clinic: b
## treatment: Tx
## [1] 4.9
## ------------------------------------------------------------
## clinic: c
## treatment: Tx
## [1] 5.8
## ------------------------------------------------------------
## clinic: d
## treatment: Tx
## [1] 5.2
## ------------------------------------------------------------
## clinic: e
## treatment: Tx
## [1] 5.3
by(dat$age,dat[,c(2,3)], mean)
## clinic: a
## treatment: noTx
## [1] 4.4
## ------------------------------------------------------------
## clinic: b
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------
## clinic: c
## treatment: noTx
## [1] 5.2
## ------------------------------------------------------------
## clinic: d
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------
## clinic: e
## treatment: noTx
## [1] 5.9
## ------------------------------------------------------------
## clinic: a
## treatment: Tx
## [1] 5.1
## ------------------------------------------------------------
## clinic: b
## treatment: Tx
## [1] 4.9
## ------------------------------------------------------------
## clinic: c
## treatment: Tx
## [1] 5.8
## ------------------------------------------------------------
## clinic: d
## treatment: Tx
## [1] 5.2
## ------------------------------------------------------------
## clinic: e
## treatment: Tx
## [1] 5.3
mydata <- iris
by(mydata$Sepal.Length,list(mydata$Species),mean)
## : setosa
## [1] 5.006
## ------------------------------------------------------------
## : versicolor
## [1] 5.936
## ------------------------------------------------------------
## : virginica
## [1] 6.588
# Error in FUN(dd[x, ], ...) : could not find function "FUN"
l <- list(a = 5, b = 1:5, c = round(rnorm(20, 2, 1)))
l
## $a
## [1] 5
##
## $b
## [1] 1 2 3 4 5
##
## $c
## [1] 2 1 1 2 3 3 2 1 1 3 3 3 1 3 3 1 0 2 1 4
lapply(l, length)
## $a
## [1] 1
##
## $b
## [1] 5
##
## $c
## [1] 20
lapply(l, sum)
## $a
## [1] 5
##
## $b
## [1] 15
##
## $c
## [1] 40
sapply(l, length)
## a b c
## 1 5 20
sapply(l, sum)
## a b c
## 5 15 40
# sapply() can coerce results into a matrix or into an array
# vapply() a version of sapply() that can be tweaked to run more quickly
v1 <- 1:5
v2 <- 6:10
v3 <- 11:15
v1
## [1] 1 2 3 4 5
v2
## [1] 6 7 8 9 10
v3
## [1] 11 12 13 14 15
mapply(sum, v1, v2, v3)
## [1] 18 21 24 27 30
x <- c(chol = 234, sbp = 148, dbp = 78, age = 54)
x
## chol sbp dbp age
## 234 148 78 54
x[1] # by position
## chol
## 234
x[x > 150] # by logical
## chol
## 234
x["chol"] # by name
## chol
## 234
x[1] <- 250 #by position
x[x < 100] <- NA # by logical
x["sbp"] <- 150 # by name
x
## chol sbp dbp age
## 250 150 NA NA
x <- 1:40
x[11] #only the 11th element
## [1] 11
x[-11] #exclude the 11th element
## [1] 1 2 3 4 5 6 7 8 9 10 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## [26] 27 28 29 30 31 32 33 34 35 36 37 38 39 40
x[11:20] #members 11 to 20
## [1] 11 12 13 14 15 16 17 18 19 20
x[-(11:40)] # all but members 11 to 40
## [1] 1 2 3 4 5 6 7 8 9 10
age <- c(8, NA, 7, 4)
which(age < 5 | age >= 8)
## [1] 1 4
age[(age < 5 | age >= 8)] ## This displays NA observation compared to the which function
## [1] 8 NA 4
names <- c("dopey", "grumpy", "doc", "happy", "bashful", "sneezy", "sleepy")
ages <- c(142, 240, 232, 333, 132, 134, 127)
sex <- c("m", "m", "f", "f", "f", "m", "m")
young <- ages < 150 #create logical vector
names[young] #index name vector using logical vector
## [1] "dopey" "bashful" "sneezy" "sleepy"
names[!young] # old dwarves
## [1] "grumpy" "doc" "happy"
male <- sex == "m" #logical vector male dwarves
names[male] #index names using logical vector males
## [1] "dopey" "grumpy" "sneezy" "sleepy"
names[young & male] # young male dwarves
## [1] "dopey" "sneezy" "sleepy"
# simulate vector with 1000 age values
age <- sample(0:100, 1000, replace = TRUE)
mean(age)
## [1] 50.206
sd(age)
## [1] 28.60692
agecat <- age # make copy
# replace elements agecat with strings for q
# category
agecat[age < 15] <- "<15" # creating character vector
agecat[age >= 15 & age < 25] <- "15-24"
agecat[age >= 25 & age < 45] <- "25-44"
agecat[age >= 45 & age < 65] <- "45-64"
agecat[age >= 65] <- "65+"
table(agecat) # get freqs
## agecat
## <15 15-24 25-44 45-64 65+
## 135 104 196 220 345
str(agecat)
## chr [1:1000] "25-44" "65+" "45-64" "65+" "65+" "45-64" "65+" "65+" "15-24" ...
m <- matrix(round(rnorm(16, 50, 5)), 2, 2)
dimnames(m) <- list(behavior = c("type A", "type B"), MI = c("yes", "no"))
m
## MI
## behavior yes no
## type A 52 51
## type B 53 41
m[1, ] #first row
## yes no
## 52 51
m[1, , drop = FALSE]
## MI
## behavior yes no
## type A 52 51
m[1,2] # cell "c"
## [1] 51
m["type A",]
## yes no
## 52 51
m[, "no"]
## type A type B
## 51 41
m[, 2] < 45 # logical vector
## type A type B
## FALSE TRUE
m[m[, 2] < 45] # data
## [1] 53 41
m[m[,2] < 56,]
## MI
## behavior yes no
## type A 52 51
## type B 53 41
m2 <- matrix(round(rnorm(72, 50, 5)), 6, 6)
m2
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 49 53 51 44 50 56
## [2,] 53 52 55 54 47 50
## [3,] 52 47 48 51 49 46
## [4,] 57 45 49 51 55 50
## [5,] 52 47 56 45 53 50
## [6,] 55 48 50 52 57 48
lower.tri(m2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,] TRUE FALSE FALSE FALSE FALSE FALSE
## [3,] TRUE TRUE FALSE FALSE FALSE FALSE
## [4,] TRUE TRUE TRUE FALSE FALSE FALSE
## [5,] TRUE TRUE TRUE TRUE FALSE FALSE
## [6,] TRUE TRUE TRUE TRUE TRUE FALSE
upper.tri(m2)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] FALSE TRUE TRUE TRUE TRUE TRUE
## [2,] FALSE FALSE TRUE TRUE TRUE TRUE
## [3,] FALSE FALSE FALSE TRUE TRUE TRUE
## [4,] FALSE FALSE FALSE FALSE TRUE TRUE
## [5,] FALSE FALSE FALSE FALSE FALSE TRUE
## [6,] FALSE FALSE FALSE FALSE FALSE FALSE
m2[lower.tri(m2)]
## [1] 53 52 57 52 55 47 45 47 48 49 56 50 45 52 57
# Note lower.tri and upper.tri functions
a <- array(sample(10:70, 8, rep = T), c(2, 2, 2))
dimnames(a) <- list(exposure = c("e", "E"), disease = c("d", "D"), confounder = c("c", "C"))
a
## , , confounder = c
##
## disease
## exposure d D
## e 55 26
## E 55 34
##
## , , confounder = C
##
## disease
## exposure d D
## e 66 25
## E 19 35
a[1, 2, 1]
## [1] 26
a["e","D","c"]
## [1] 26
a[a == 32]
## integer(0)
l<- list(1:5, matrix(1:4, 2, 2), c("John Snow", "William Farr"))
l[1] # note l[1] bin versus l[[1]] vector in the next line
## [[1]]
## [1] 1 2 3 4 5
l[[1]]
## [1] 1 2 3 4 5
l[[2]][2, 1]
## [1] 2
l[[3]][2]
## [1] "William Farr"
char <- sapply(l, is.character)
char
## [1] FALSE FALSE TRUE
epi.folk<-l[char]
epi.folk[[1]][2] # note [[]] to address the contents of the bin
## [1] "William Farr"
data(infert)
library(survival) # package with clogit()
mod1 <- clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
mod1 # default results (7x risk c spont AB, 4x induced)
## Call:
## clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
##
## coef exp(coef) se(coef) z p
## spontaneous 1.9859 7.2854 0.3524 5.635 1.75e-08
## induced 1.4090 4.0919 0.3607 3.906 9.38e-05
##
## Likelihood ratio test=53.15 on 2 df, p=2.869e-12
## n= 248, number of events= 83
str(mod1)
## List of 21
## $ coefficients : Named num [1:2] 1.99 1.41
## ..- attr(*, "names")= chr [1:2] "spontaneous" "induced"
## $ var : num [1:2, 1:2] 0.1242 0.0927 0.0927 0.1301
## $ loglik : num [1:2] -90.8 -64.2
## $ score : num 48.4
## $ iter : num 5
## $ linear.predictors: num [1:248] 3.429 -0.543 0.866 0.866 1.443 ...
## $ residuals : Named num [1:248] 0.134 0.328 0.667 0.783 0.373 ...
## ..- attr(*, "names")= chr [1:248] "1" "2" "3" "4" ...
## $ means : Named num [1:2] 0.577 0.573
## ..- attr(*, "names")= chr [1:2] "spontaneous" "induced"
## $ method : chr "exact"
## $ n : int 248
## $ nevent : num 83
## $ terms :Classes 'terms', 'formula' language Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum)
## .. ..- attr(*, "variables")= language list(Surv(rep(1, 248L), case), spontaneous, induced, strata(stratum))
## .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:4] "Surv(rep(1, 248), case)" "spontaneous" "induced" "strata(stratum)"
## .. .. .. ..$ : chr [1:3] "spontaneous" "induced" "strata(stratum)"
## .. ..- attr(*, "term.labels")= chr [1:3] "spontaneous" "induced" "strata(stratum)"
## .. ..- attr(*, "specials")=Dotted pair list of 2
## .. .. ..$ strata: int 4
## .. .. ..$ tt : NULL
## .. ..- attr(*, "order")= int [1:3] 1 1 1
## .. ..- attr(*, "intercept")= num 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Surv(rep(1, 248L), case), spontaneous, induced, strata(stratum))
## .. ..- attr(*, "dataClasses")= Named chr [1:4] "nmatrix.2" "numeric" "numeric" "factor"
## .. .. ..- attr(*, "names")= chr [1:4] "Surv(rep(1, 248L), case)" "spontaneous" "induced" "strata(stratum)"
## $ assign :List of 2
## ..$ spontaneous: int 1
## ..$ induced : int 2
## $ wald.test : num 31.8
## $ concordance : Named num [1:7] 113 22 30 0 0 ...
## ..- attr(*, "names")= chr [1:7] "concordant" "discordant" "tied.x" "tied.y" ...
## $ y : 'Surv' num [1:248, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:248] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "time" "status"
## ..- attr(*, "type")= chr "right"
## $ timefix : logi TRUE
## $ formula :Class 'formula' language Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum)
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ xlevels :List of 1
## ..$ strata(stratum): chr [1:83] "stratum=1" "stratum=2" "stratum=3" "stratum=4" ...
## $ call : language coxph(formula = Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum), data = infert, method = "exact")
## $ userCall : language clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
## - attr(*, "class")= chr [1:2] "clogit" "coxph"
names(mod1) #structure, names
## [1] "coefficients" "var" "loglik"
## [4] "score" "iter" "linear.predictors"
## [7] "residuals" "means" "method"
## [10] "n" "nevent" "terms"
## [13] "assign" "wald.test" "concordance"
## [16] "y" "timefix" "formula"
## [19] "xlevels" "call" "userCall"
mod1$coeff # name to index result (list element).
## spontaneous induced
## 1.985876 1.409012
summod1 <- summary(mod1) #more detailed results names(summod1) #detailed list components
names(summod1) #detailed list components
## [1] "call" "fail" "na.action" "n" "loglik"
## [6] "nevent" "coefficients" "conf.int" "logtest" "sctest"
## [11] "rsq" "waldtest" "used.robust" "concordance"
data(infert)
infert[1:4, 1:2]
infert[1:4, 2] <- c(NA, 45, NA, 23)
infert[1:4, 1:2]
names(infert)
## [1] "education" "age" "parity" "induced"
## [5] "case" "spontaneous" "stratum" "pooled.stratum"
infert[1:4, c("education", "age")]
infert[1:4, c("age")] <- c(NA, 45, NA, 23)
infert[1:4, c("education", "age")]
table(infert$parity)
##
## 1 2 3 4 5 6
## 99 81 36 18 6 8
# change values of 5 or 6 to missing
infert$parity[infert$parity==5 | infert$parity==6] <- NA
table(infert$parity)
##
## 1 2 3 4
## 99 81 36 18
table(infert$parity, exclude=NULL)
##
## 1 2 3 4 <NA>
## 99 81 36 18 14
sparcs <- read.csv(file = "http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/sparcsShort.csv", stringsAsFactors = FALSE)
brooklyn<-sparcs[sparcs$county=="59",]
nyc<- sparcs$county=="58"| sparcs$county=="59"| sparcs$county=="60"| sparcs$county=="61"| sparcs$county=="62"
nyc.sparcs<-sparcs[nyc,]
dxs<-sparcs[,"pdx"]
vars<-c("date", "pdx", "disp")
my.vars<-sparcs[,vars]
sparcs2<-sparcs[nyc,vars]
brooklyn.sparcs<-subset(sparcs, county=="59", select=c(date, pdx, disp))
nyc.sparcs<-subset(sparcs, county=="59":"62", select=c(county, pdx, disp))
nyc.sparcs<-subset(sparcs, county=="59":"62", select=-c(county, pdx, disp))
titanic <- read.csv("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/Data/titanic.csv", header=TRUE, stringsAsFactors = TRUE)
titanic
names(titanic)
## [1] "X" "pclass" "survived" "name" "sex" "age"
## [7] "sibsp" "parch" "ticket" "fare" "cabin" "embarked"
## [13] "boat" "body" "home.dest"
str(titanic)
## 'data.frame': 1309 obs. of 15 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ pclass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num 29 0.917 2 30 25 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
## $ embarked : Factor w/ 4 levels "","Cherbourg",..: 4 4 4 4 4 4 4 4 4 2 ...
## $ boat : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: Factor w/ 369 levels "","?Havana, Cuba",..: 309 231 231 231 231 237 163 25 23 229 ...
tmpsurvbysex <- xtabs(~sex+survived, data=titanic) # alternatively table(titanic$sex, titanic$survived)
survbysex <- tmpsurvbysex[2:1,]
surv1 <- epitab(survbysex,method="oddsratio")
surv1$tab[,"oddsratio"]
## male female
## 1.00000 11.30718
surv2 <- glm(survived ~ relevel(factor(sex), ref = "male"), data = titanic, family=binomial(link="logit"))
exp(surv2$coefficients)
## (Intercept)
## 0.2360704
## relevel(factor(sex), ref = "male")female
## 11.3071844
summary(surv2)
##
## Call:
## glm(formula = survived ~ relevel(factor(sex), ref = "male"),
## family = binomial(link = "logit"), data = titanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6124 -0.6511 -0.6511 0.7977 1.8196
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.44363 0.08762 -16.48 <2e-16
## relevel(factor(sex), ref = "male")female 2.42544 0.13602 17.83 <2e-16
##
## (Intercept) ***
## relevel(factor(sex), ref = "male")female ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741.0 on 1308 degrees of freedom
## Residual deviance: 1368.1 on 1307 degrees of freedom
## AIC: 1372.1
##
## Number of Fisher Scoring iterations: 4
exp(confint(surv2))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1981912 0.2794823
## relevel(factor(sex), ref = "male")female 8.6871038 14.8093675
surv3 <- glm(survived ~ relevel(factor(sex), ref = "male")+relevel(pclass, ref="3rd")+age, data = titanic, family=binomial(link="logit"))
surv3
##
## Call: glm(formula = survived ~ relevel(factor(sex), ref = "male") +
## relevel(pclass, ref = "3rd") + age, family = binomial(link = "logit"),
## data = titanic)
##
## Coefficients:
## (Intercept)
## -1.26543
## relevel(factor(sex), ref = "male")female
## 2.49784
## relevel(pclass, ref = "3rd")1st
## 2.28966
## relevel(pclass, ref = "3rd")2nd
## 1.00909
## age
## -0.03439
##
## Degrees of Freedom: 1045 Total (i.e. Null); 1041 Residual
## (263 observations deleted due to missingness)
## Null Deviance: 1415
## Residual Deviance: 982.5 AIC: 992.5
# install.packages("survival")
library(survival)
library(ranger)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
##
## alpha
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggfortify)
load("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/Data/popeSurvival.RData")
popes
# For each analysis, provide Kaplan-Meir curves and Log Rank tests.
# How to delete it by name: popes <- popes[,-11] / popes <- popes[,-12]
popes$time1 <- as.Date(popes$DeathDate)-as.Date(popes$BirthDate)
popes$time2 <- as.Date(popes$DeathDate)-as.Date(popes$ElectedDate)
str(popes)
## 'data.frame': 87 obs. of 12 variables:
## $ papalName : chr "Gregory IX" "Celestine IV" "Innocent IV" "Alexander IV" ...
## $ Birth : chr "9 July 1170" NA "9 July 1195" "9 July 1185" ...
## $ Elected : chr "19 Mar 1227" "25 Oct 1241" "25 Jun 1243" "12 Dec 1254" ...
## $ Death : chr "22 Aug 1241" "10 Nov 1241" "7 Dec 1254" "25 May 1261" ...
## $ papalName.1: chr "Gregory IX" "Celestine IV" "Innocent IV" "Alexander IV" ...
## $ birthPlace : chr "Anagni, Papal States, Holy Roman Empire" "Milan, Italy, Holy Roman Empire" "Genoa, Republic of Genoa, Holy Roman Empire" "Jenne, Papal States" ...
## $ roman : chr "Yes" "No" "No" "Yes" ...
## $ BirthDate : Date, format: "1170-07-09" NA ...
## $ DeathDate : Date, format: "1241-08-22" "1241-11-10" ...
## $ ElectedDate: Date, format: "1227-03-19" "1241-10-25" ...
## $ time1 : 'difftime' num 25977 NA 21701 27714 ...
## ..- attr(*, "units")= chr "days"
## $ time2 : 'difftime' num 5270 16 4183 2356 ...
## ..- attr(*, "units")= chr "days"
kmp_fit1 <- survfit(Surv(time1)~roman,data=popes)
summary(kmp_fit1, times = c(15000,1000*(16:50)))
## Call: survfit(formula = Surv(time1) ~ roman, data = popes)
##
## 1 observation deleted due to missingness
## roman=No
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 15000 57 0 1.0000 0.0000 1.00000 1.000
## 16000 57 0 1.0000 0.0000 1.00000 1.000
## 17000 56 1 0.9825 0.0174 0.94896 1.000
## 18000 55 1 0.9649 0.0244 0.91831 1.000
## 19000 53 2 0.9298 0.0338 0.86582 0.999
## 20000 51 2 0.8947 0.0406 0.81851 0.978
## 21000 48 3 0.8421 0.0483 0.75257 0.942
## 22000 44 4 0.7719 0.0556 0.67034 0.889
## 23000 41 3 0.7193 0.0595 0.61162 0.846
## 24000 36 5 0.6316 0.0639 0.51799 0.770
## 25000 29 7 0.5088 0.0662 0.39422 0.657
## 26000 24 5 0.4211 0.0654 0.31055 0.571
## 27000 21 3 0.3684 0.0639 0.26226 0.518
## 28000 18 3 0.3158 0.0616 0.21550 0.463
## 29000 16 2 0.2807 0.0595 0.18525 0.425
## 30000 7 9 0.1228 0.0435 0.06136 0.246
## 31000 5 3 0.0702 0.0338 0.02728 0.181
## 32000 3 1 0.0526 0.0296 0.01749 0.158
## 33000 1 2 0.0175 0.0174 0.00251 0.122
##
## roman=Yes
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 15000 29 0 1.0000 0.0000 1.00000 1.000
## 16000 29 0 1.0000 0.0000 1.00000 1.000
## 17000 29 0 1.0000 0.0000 1.00000 1.000
## 18000 29 0 1.0000 0.0000 1.00000 1.000
## 19000 29 0 1.0000 0.0000 1.00000 1.000
## 20000 28 1 0.9655 0.0339 0.90134 1.000
## 21000 28 0 0.9655 0.0339 0.90134 1.000
## 22000 28 0 0.9655 0.0339 0.90134 1.000
## 23000 27 1 0.9310 0.0471 0.84323 1.000
## 24000 26 1 0.8966 0.0566 0.79229 1.000
## 25000 24 2 0.8276 0.0701 0.70092 0.977
## 26000 14 10 0.4828 0.0928 0.33122 0.704
## 27000 11 3 0.3793 0.0901 0.23812 0.604
## 28000 10 1 0.3448 0.0883 0.20880 0.569
## 29000 9 1 0.3103 0.0859 0.18039 0.534
## 30000 5 4 0.1724 0.0701 0.07767 0.383
## 31000 3 2 0.1034 0.0566 0.03543 0.302
## 32000 1 2 0.0345 0.0339 0.00503 0.237
## 33000 1 0 0.0345 0.0339 0.00503 0.237
## 34000 1 0 0.0345 0.0339 0.00503 0.237
autoplot(kmp_fit1)
survdiff(Surv(time1)~roman,data=popes,rho=0)
## Call:
## survdiff(formula = Surv(time1) ~ roman, data = popes, rho = 0)
##
## n=86, 1 observation deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## roman=No 57 57 51.6 0.557 1.43
## roman=Yes 29 29 34.4 0.837 1.43
##
## Chisq= 1.4 on 1 degrees of freedom, p= 0.2
kmp_fit2 <- survfit(Surv(time2)~roman,data=popes)
summary(kmp_fit2, times = c(1,1000*(1:50)))
## Call: survfit(formula = Surv(time2) ~ roman, data = popes)
##
## roman=No
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 58 0 1.0000 0.0000 1.00000 1.000
## 1000 44 14 0.7586 0.0562 0.65611 0.877
## 2000 37 7 0.6379 0.0631 0.52550 0.774
## 3000 27 10 0.4655 0.0655 0.35332 0.613
## 4000 16 11 0.2759 0.0587 0.18181 0.419
## 5000 9 7 0.1552 0.0475 0.08512 0.283
## 6000 5 4 0.0862 0.0369 0.03730 0.199
## 7000 3 2 0.0517 0.0291 0.01718 0.156
## 8000 1 2 0.0172 0.0171 0.00247 0.120
## 9000 1 0 0.0172 0.0171 0.00247 0.120
##
## roman=Yes
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 29 0 1.0000 0.0000 1.00000 1.000
## 1000 23 6 0.7931 0.0752 0.65856 0.955
## 2000 16 7 0.5517 0.0923 0.39742 0.766
## 3000 14 2 0.4828 0.0928 0.33122 0.704
## 4000 12 2 0.4138 0.0915 0.26832 0.638
## 5000 9 3 0.3103 0.0859 0.18039 0.534
## 6000 6 3 0.2069 0.0752 0.10146 0.422
## 7000 5 1 0.1724 0.0701 0.07767 0.383
## 8000 4 1 0.1379 0.0640 0.05553 0.343
## 9000 2 2 0.0690 0.0471 0.01811 0.263
## 10000 1 1 0.0345 0.0339 0.00503 0.237
## 11000 1 0 0.0345 0.0339 0.00503 0.237
autoplot(kmp_fit2)
survdiff(Surv(time2)~roman,data=popes,rho=0)
## Call:
## survdiff(formula = Surv(time2) ~ roman, data = popes, rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## roman=No 58 58 51.1 0.931 2.42
## roman=Yes 29 29 35.9 1.325 2.42
##
## Chisq= 2.4 on 1 degrees of freedom, p= 0.1
# Look for alternative logrank_test(Surv(time2)~roman,data=popes, distribution = "exact")
# plot(km_fit, xlab="Days", main = 'Kaplan Meyer Plot') #base graphics is always ready