1 Working with objects

1.1 Vectors

1.1.1 Using the assignment and concatenation operatiors to create vectors

case <- c(exposed = 8, unexposed = 10)
case

##   exposed unexposed 
##         8        10

noncase <- c(exposed = 15, unexposed = 78)
noncase

##   exposed unexposed 
##        15        78

1.1.2 Combining vectors to create tables

# column bind binds cases and noncases as columns, therefore upper left corner becomes the exposed cases of disease
cbind(case, noncase)

##           case noncase
## exposed      8      15
## unexposed   10      78

1.1.3 Creating vectors and tables from individual observations

rep("hello", 5)

## [1] "hello" "hello" "hello" "hello" "hello"

outcome <- c(rep("case", 18),rep("noncase",93))
outcome

##   [1] "case"    "case"    "case"    "case"    "case"    "case"    "case"   
##   [8] "case"    "case"    "case"    "case"    "case"    "case"    "case"   
##  [15] "case"    "case"    "case"    "case"    "noncase" "noncase" "noncase"
##  [22] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [29] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [36] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [43] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [50] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [57] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [64] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [71] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [78] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [85] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [92] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [99] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [106] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"

tmp <- c("exposed", "unexposed")
exposure <- c(rep(tmp, c(8, 10)), rep(tmp, c(15, 78)))
exposure

##   [1] "exposed"   "exposed"   "exposed"   "exposed"   "exposed"   "exposed"  
##   [7] "exposed"   "exposed"   "unexposed" "unexposed" "unexposed" "unexposed"
##  [13] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [19] "exposed"   "exposed"   "exposed"   "exposed"   "exposed"   "exposed"  
##  [25] "exposed"   "exposed"   "exposed"   "exposed"   "exposed"   "exposed"  
##  [31] "exposed"   "exposed"   "exposed"   "unexposed" "unexposed" "unexposed"
##  [37] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [43] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [49] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [55] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [61] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [67] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [73] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [79] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [85] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [91] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
##  [97] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [103] "unexposed" "unexposed" "unexposed" "unexposed" "unexposed" "unexposed"
## [109] "unexposed" "unexposed" "unexposed"

# Column bind exposure and outcome in sars.obs'ervations
sars.obs <- cbind(exposure, outcome)

# First four rows of observations (passengers from the airplane)
sars.obs[1:4,]

##      exposure  outcome
## [1,] "exposed" "case" 
## [2,] "exposed" "case" 
## [3,] "exposed" "case" 
## [4,] "exposed" "case"

# Second column of the sars.obs'ervation table. I.e. Outcomes
sars.obs[,2]

##   [1] "case"    "case"    "case"    "case"    "case"    "case"    "case"   
##   [8] "case"    "case"    "case"    "case"    "case"    "case"    "case"   
##  [15] "case"    "case"    "case"    "case"    "noncase" "noncase" "noncase"
##  [22] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [29] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [36] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [43] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [50] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [57] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [64] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [71] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [78] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [85] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [92] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
##  [99] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"
## [106] "noncase" "noncase" "noncase" "noncase" "noncase" "noncase"

# These two cells are the totals of cases and noncases
table(outcome)

## outcome
##    case noncase 
##      18      93

# Recall the 2 x2 table from section 1.1.2
table(exposure, outcome)

##            outcome
## exposure    case noncase
##   exposed      8      15
##   unexposed   10      78

# Lets construct a 2x2 table from sars.obs'ervations and assign it to sars.tab
sars.tab <- table(sars.obs[ , 1], sars.obs[ , 2])
sars.tab

##            
##             case noncase
##   exposed      8      15
##   unexposed   10      78

1.2 matrix

# Get help file for matrix
# ?matrix

# Display arguments for matrix
args(matrix)

## function (data = NA, nrow = 1, ncol = 1, byrow = FALSE, dimnames = NULL) 
## NULL

# Create first matrix (by column)
tab <- matrix(c("a", "b", "c", "d"), 2, 2)
rownames(tab) <- c("Exposed", "unExposed")
colnames(tab) <- c("Disease", "noDisease")
tab

##           Disease noDisease
## Exposed   "a"     "c"      
## unExposed "b"     "d"

# Create matrix by row this time
tab <- matrix(c("a", "b", "c", "d"), 2, 2, byrow=TRUE)
rownames(tab) <- c("Exposed", "unExposed")
colnames(tab) <- c("Disease", "noDisease")
tab

##           Disease noDisease
## Exposed   "a"     "b"      
## unExposed "c"     "d"

# Read UGDP table
udat1 <- read.csv('http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/ugdp.txt', as.is = TRUE)

# stringsAsFactors=FALSE or as.is prevent strings or all other read values from converting to factor

# Note to self: There is an alternative way to read files in. Need to find the resource for that option.

# Create 2 by 2 table
udat1.tab = table(Outcome = udat1$Status, udat1$Treatment)
udat1.tab

##           
## Outcome    Placebo Tolbutamide
##   Death         21          30
##   Survivor     184         174

# Create 2 by 2 matrix by selecting Exposure and Disease status elements into the right cells
# Is there an easier way to insert the table output into the matrix by sorting the rows and columns according to desired order?

tab <- matrix(c(udat1.tab['Death','Tolbutamide'], udat1.tab['Survivor','Tolbutamide'], udat1.tab['Death','Placebo'], udat1.tab['Survivor','Placebo']), 2, 2, byrow=TRUE)
              
rownames(tab) <- c("Exposed", "Not Exposed")
colnames(tab) <- c("Disease", "No Disease")
names(dimnames(tab)) <- c('Exposure Status', 'Disease Status')
tab

##                Disease Status
## Exposure Status Disease No Disease
##     Exposed          30        174
##     Not Exposed      21        184

# addmargins function below adds sum's to both rows and columns
addmargins(tab, FUN=sum, quiet = TRUE)

##                Disease Status
## Exposure Status Disease No Disease sum
##     Exposed          30        174 204
##     Not Exposed      21        184 205
##     sum              51        358 409

1.2.1 Calculating marginal totals

coltot <- apply(tab, 2, sum)
risks <- tab["Exposed",]/coltot
odds <- risks/(1-risks)

risk.ratio <- risks/risks[2]
odds.ratio <- odds/odds[2]
rbind(risks, risk.ratio, odds, odds.ratio)

##              Disease No Disease
## risks      0.5882353  0.4860335
## risk.ratio 1.2102772  1.0000000
## odds       1.4285714  0.9456522
## odds.ratio 1.5106732  1.0000000

1.3 Lists

tab2 = round(tab/10)
tab2

##                Disease Status
## Exposure Status Disease No Disease
##     Exposed           3         17
##     Not Exposed       2         18

## fisher test output is a list
fish <- fisher.test(tab2)
str(fish)

## List of 7
##  $ p.value    : num 1
##  $ conf.int   : num [1:2] 0.159 20.979
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num 1.57
##   ..- attr(*, "names")= chr "odds ratio"
##  $ null.value : Named num 1
##   ..- attr(*, "names")= chr "odds ratio"
##  $ alternative: chr "two.sided"
##  $ method     : chr "Fisher's Exact Test for Count Data"
##  $ data.name  : chr "tab2"
##  - attr(*, "class")= chr "htest"

## access list component estimate
fish$estimate

## odds ratio 
##   1.570083

## 2 alternative ways to access lower confidence interval
fish$conf.int[1]

## [1] 0.1589135

fish[[2]][1]

## [1] 0.1589135

1.3.1 Create your own function and store the results as a list

orcalc <- function(x){
  or <- (x[1,1]*x[2,2])/(x[1,2]*x[2,1])
  pval <- fisher.test(x)$p.value
  list(data=x, odd.ratio = or, p.value = pval)
}

#calculates or and pval and combines the table, or, and pvalue into a list called test
test <- orcalc(tab)
test

## $data
##                Disease Status
## Exposure Status Disease No Disease
##     Exposed          30        174
##     Not Exposed      21        184
## 
## $odd.ratio
## [1] 1.510673
## 
## $p.value
## [1] 0.1812576

str(test)

## List of 3
##  $ data     : int [1:2, 1:2] 30 21 174 184
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Exposure Status: chr [1:2] "Exposed" "Not Exposed"
##   .. ..$ Disease Status : chr [1:2] "Disease" "No Disease"
##  $ odd.ratio: num 1.51
##  $ p.value  : num 0.181

2 Class and mode of an R object

x <- factor(1:7)
mode(x)

## [1] "numeric"

class(x)

## [1] "factor"

y <- c("1","2","3")
y

## [1] "1" "2" "3"

str(y)

##  chr [1:3] "1" "2" "3"

## sum(y) and sum(x) error out
sum(as.numeric(y))

## [1] 6

sum(as.numeric(x))

## [1] 28

3.Ordered factors

3.1 Months example

mths = c("March", "April", "January", "November", "January", "September", "October", "September", "November", "August", "January", "November", "November", "February", "May", "August", "July", "December", "August", "August", "September", "November", "February", "April")

# Most frequent months is November and there is no apparent order to the vector
table(mths)

## mths
##     April    August  December  February   January      July     March       May 
##         2         4         1         2         3         1         1         1 
##  November   October September 
##         5         1         3

class(mths)

## [1] "character"

mode(mths)

## [1] "character"

f.mths = factor(mths, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), ordered = TRUE)
f.mths

##  [1] March     April     January   November  January   September October  
##  [8] September November  August    January   November  November  February 
## [15] May       August    July      December  August    August    September
## [22] November  February  April    
## 12 Levels: January < February < March < April < May < June < ... < December

table(f.mths)

## f.mths
##   January  February     March     April       May      June      July    August 
##         3         2         1         2         1         0         1         4 
## September   October  November  December 
##         3         1         5         1

class(f.mths)

## [1] "ordered" "factor"

mode(f.mths)

## [1] "numeric"

mths[1]

## [1] "March"

mths[2]

## [1] "April"

# errors out
# as.numeric(mths[1]) < as.numeric(mths[2])

f.mths[1]

## [1] March
## 12 Levels: January < February < March < April < May < June < ... < December

f.mths[2]

## [1] April
## 12 Levels: January < February < March < April < May < June < ... < December

as.numeric(f.mths[1]) < as.numeric(f.mths[2])

## [1] TRUE

as.numeric(f.mths)[1] + as.numeric(f.mths)[2]

## [1] 7

mean(as.numeric(f.mths))

## [1] 6.916667

ses <- sample(c("Low", "Medium", "High"), 100, replace = TRUE)
ses

##   [1] "Medium" "Medium" "Low"    "Low"    "High"   "High"   "Low"    "High"  
##   [9] "High"   "Low"    "High"   "Low"    "Low"    "Low"    "High"   "Low"   
##  [17] "Medium" "Low"    "Low"    "Medium" "High"   "High"   "Medium" "Low"   
##  [25] "Low"    "Medium" "High"   "Medium" "Medium" "Medium" "Medium" "High"  
##  [33] "High"   "Medium" "Medium" "Low"    "High"   "High"   "Medium" "High"  
##  [41] "Medium" "High"   "High"   "Low"    "High"   "High"   "High"   "High"  
##  [49] "High"   "High"   "High"   "Low"    "Medium" "High"   "High"   "High"  
##  [57] "Medium" "Low"    "Low"    "Low"    "High"   "Low"    "Low"    "Low"   
##  [65] "Medium" "Low"    "Medium" "Low"    "Medium" "High"   "Low"    "Low"   
##  [73] "High"   "High"   "Low"    "Low"    "Medium" "High"   "Low"    "High"  
##  [81] "Low"    "High"   "Medium" "High"   "High"   "Low"    "High"   "Medium"
##  [89] "Medium" "Low"    "Low"    "Medium" "Medium" "High"   "Medium" "Low"   
##  [97] "High"   "Low"    "Medium" "Low"

f.ses = factor(ses, levels = c("Low", "Medium", "High"), ordered = TRUE)
table(f.ses)

## f.ses
##    Low Medium   High 
##     35     27     38

4 Exploring data sets

4.1 The infertility data set infert

#load data file infert
data(infert)
# structure of infert
str(infert)

## 'data.frame':    248 obs. of  8 variables:
##  $ education     : Factor w/ 3 levels "0-5yrs","6-11yrs",..: 1 1 1 1 2 2 2 2 2 2 ...
##  $ age           : num  26 42 39 34 35 36 23 32 21 28 ...
##  $ parity        : num  6 1 6 4 3 4 1 2 1 2 ...
##  $ induced       : num  1 1 2 2 1 2 0 0 0 0 ...
##  $ case          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ spontaneous   : num  2 0 0 0 1 1 0 0 1 0 ...
##  $ stratum       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pooled.stratum: num  3 1 4 2 32 36 6 22 5 19 ...

# brwose infert data
infert

# display first 5 rows of infert
infert[1:5,]

# 10 records from the top of infert
head(infert,10)

# 15 records from the bottom of infert
tail(infert, 15)

4.2 Preliminary statistics

mean(infert$age)

## [1] 31.50403

# mean of parity addressed by index for the third column and mean of parity by addessing it by name
mean(infert[[3]])

## [1] 2.092742

mean(infert$parity)

## [1] 2.092742

# calculate fivenum'bers for age and parity (minimum, first quartile, median, third quartile, maximum).
# compare to mean() and summary() functions
fivenum(infert$age)

## [1] 21.0 28.0 31.0 35.5 44.0

mean(infert$age)

## [1] 31.50403

summary(infert$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   28.00   31.00   31.50   35.25   44.00

fivenum(infert$parity)

## [1] 1 1 2 3 6

4.3 Frequency tables

# cross tabulation of parity by education
table(infert$parity, infert$education)

##    
##     0-5yrs 6-11yrs 12+ yrs
##   1      3      42      54
##   2      0      42      39
##   3      0      21      15
##   4      3      12       3
##   5      0       3       3
##   6      6       0       2

4.4 Installing and using a package

# install.packages("epiDisplay")
library(epiDisplay)

## Loading required package: foreign

## Loading required package: survival

## Loading required package: MASS

## Loading required package: nnet

tabpct(infert$parity, infert$education, percent = "both", graph = FALSE)

## 
## Original table 
##              infert$education
## infert$parity  0-5yrs  6-11yrs  12+ yrs  Total
##         1           3       42       54     99
##         2           0       42       39     81
##         3           0       21       15     36
##         4           3       12        3     18
##         5           0        3        3      6
##         6           6        0        2      8
##         Total      12      120      116    248
## 
## Row percent 
##              infert$education
## infert$parity  0-5yrs  6-11yrs  12+ yrs  Total
##             1       3       42       54     99
##                   (3)   (42.4)   (54.5)  (100)
##             2       0       42       39     81
##                   (0)   (51.9)   (48.1)  (100)
##             3       0       21       15     36
##                   (0)   (58.3)   (41.7)  (100)
##             4       3       12        3     18
##                (16.7)   (66.7)   (16.7)  (100)
##             5       0        3        3      6
##                   (0)     (50)     (50)  (100)
##             6       6        0        2      8
##                  (75)      (0)     (25)  (100)
## 
## Column percent 
##              infert$education
## infert$parity  0-5yrs      %  6-11yrs       %  12+ yrs       %
##         1           3   (25)       42  (35.0)       54  (46.6)
##         2           0    (0)       42  (35.0)       39  (33.6)
##         3           0    (0)       21  (17.5)       15  (12.9)
##         4           3   (25)       12  (10.0)        3   (2.6)
##         5           0    (0)        3   (2.5)        3   (2.6)
##         6           6   (50)        0   (0.0)        2   (1.7)
##         Total      12  (100)      120   (100)      116   (100)

4.5 Plots

plot(infert$education)

plot(infert$parity)

plot(infert$age)

plot(table(infert$parity))

hist(infert$age)

4.6 US Arrest data set

data(USArrests)
str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

summary(USArrests$Murder)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.800   4.075   7.250   7.788  11.250  17.400

# distribution of number of states grouped in the range ([0-50), [50-100), etc ) of assault arrests
hist(USArrests$Assault)

5 Calculations

5.1 Tempature conversion

f <- seq(95, 104, by = 0.1)
f

##  [1]  95.0  95.1  95.2  95.3  95.4  95.5  95.6  95.7  95.8  95.9  96.0  96.1
## [13]  96.2  96.3  96.4  96.5  96.6  96.7  96.8  96.9  97.0  97.1  97.2  97.3
## [25]  97.4  97.5  97.6  97.7  97.8  97.9  98.0  98.1  98.2  98.3  98.4  98.5
## [37]  98.6  98.7  98.8  98.9  99.0  99.1  99.2  99.3  99.4  99.5  99.6  99.7
## [49]  99.8  99.9 100.0 100.1 100.2 100.3 100.4 100.5 100.6 100.7 100.8 100.9
## [61] 101.0 101.1 101.2 101.3 101.4 101.5 101.6 101.7 101.8 101.9 102.0 102.1
## [73] 102.2 102.3 102.4 102.5 102.6 102.7 102.8 102.9 103.0 103.1 103.2 103.3
## [85] 103.4 103.5 103.6 103.7 103.8 103.9 104.0

c <- 5/9*(f-32)
c

##  [1] 35.00000 35.05556 35.11111 35.16667 35.22222 35.27778 35.33333 35.38889
##  [9] 35.44444 35.50000 35.55556 35.61111 35.66667 35.72222 35.77778 35.83333
## [17] 35.88889 35.94444 36.00000 36.05556 36.11111 36.16667 36.22222 36.27778
## [25] 36.33333 36.38889 36.44444 36.50000 36.55556 36.61111 36.66667 36.72222
## [33] 36.77778 36.83333 36.88889 36.94444 37.00000 37.05556 37.11111 37.16667
## [41] 37.22222 37.27778 37.33333 37.38889 37.44444 37.50000 37.55556 37.61111
## [49] 37.66667 37.72222 37.77778 37.83333 37.88889 37.94444 38.00000 38.05556
## [57] 38.11111 38.16667 38.22222 38.27778 38.33333 38.38889 38.44444 38.50000
## [65] 38.55556 38.61111 38.66667 38.72222 38.77778 38.83333 38.88889 38.94444
## [73] 39.00000 39.05556 39.11111 39.16667 39.22222 39.27778 39.33333 39.38889
## [81] 39.44444 39.50000 39.55556 39.61111 39.66667 39.72222 39.77778 39.83333
## [89] 39.88889 39.94444 40.00000

t <- cbind(Fahrenheit = f, Celsius = c)
t

##       Fahrenheit  Celsius
##  [1,]       95.0 35.00000
##  [2,]       95.1 35.05556
##  [3,]       95.2 35.11111
##  [4,]       95.3 35.16667
##  [5,]       95.4 35.22222
##  [6,]       95.5 35.27778
##  [7,]       95.6 35.33333
##  [8,]       95.7 35.38889
##  [9,]       95.8 35.44444
## [10,]       95.9 35.50000
## [11,]       96.0 35.55556
## [12,]       96.1 35.61111
## [13,]       96.2 35.66667
## [14,]       96.3 35.72222
## [15,]       96.4 35.77778
## [16,]       96.5 35.83333
## [17,]       96.6 35.88889
## [18,]       96.7 35.94444
## [19,]       96.8 36.00000
## [20,]       96.9 36.05556
## [21,]       97.0 36.11111
## [22,]       97.1 36.16667
## [23,]       97.2 36.22222
## [24,]       97.3 36.27778
## [25,]       97.4 36.33333
## [26,]       97.5 36.38889
## [27,]       97.6 36.44444
## [28,]       97.7 36.50000
## [29,]       97.8 36.55556
## [30,]       97.9 36.61111
## [31,]       98.0 36.66667
## [32,]       98.1 36.72222
## [33,]       98.2 36.77778
## [34,]       98.3 36.83333
## [35,]       98.4 36.88889
## [36,]       98.5 36.94444
## [37,]       98.6 37.00000
## [38,]       98.7 37.05556
## [39,]       98.8 37.11111
## [40,]       98.9 37.16667
## [41,]       99.0 37.22222
## [42,]       99.1 37.27778
## [43,]       99.2 37.33333
## [44,]       99.3 37.38889
## [45,]       99.4 37.44444
## [46,]       99.5 37.50000
## [47,]       99.6 37.55556
## [48,]       99.7 37.61111
## [49,]       99.8 37.66667
## [50,]       99.9 37.72222
## [51,]      100.0 37.77778
## [52,]      100.1 37.83333
## [53,]      100.2 37.88889
## [54,]      100.3 37.94444
## [55,]      100.4 38.00000
## [56,]      100.5 38.05556
## [57,]      100.6 38.11111
## [58,]      100.7 38.16667
## [59,]      100.8 38.22222
## [60,]      100.9 38.27778
## [61,]      101.0 38.33333
## [62,]      101.1 38.38889
## [63,]      101.2 38.44444
## [64,]      101.3 38.50000
## [65,]      101.4 38.55556
## [66,]      101.5 38.61111
## [67,]      101.6 38.66667
## [68,]      101.7 38.72222
## [69,]      101.8 38.77778
## [70,]      101.9 38.83333
## [71,]      102.0 38.88889
## [72,]      102.1 38.94444
## [73,]      102.2 39.00000
## [74,]      102.3 39.05556
## [75,]      102.4 39.11111
## [76,]      102.5 39.16667
## [77,]      102.6 39.22222
## [78,]      102.7 39.27778
## [79,]      102.8 39.33333
## [80,]      102.9 39.38889
## [81,]      103.0 39.44444
## [82,]      103.1 39.50000
## [83,]      103.2 39.55556
## [84,]      103.3 39.61111
## [85,]      103.4 39.66667
## [86,]      103.5 39.72222
## [87,]      103.6 39.77778
## [88,]      103.7 39.83333
## [89,]      103.8 39.88889
## [90,]      103.9 39.94444
## [91,]      104.0 40.00000

5.2 Body mass index

w <- 148
h <- 5.25
bmi <- (w/2.2)/(h/3.3)^2
bmi

## [1] 26.57959

bmi * 0.95

## [1] 25.25061

bmi * 0.90

## [1] 23.92163

bmi * 0.85

## [1] 22.59265

5.3 AIDS transmission

p.trans = 1/500
p.not.trans = 1 - p.trans
p.not.trans

## [1] 0.998

p.not.trans^365

## [1] 0.4815569

p.trans = 1/5000
p.not.trans = 1 - p.trans
p.not.trans

## [1] 0.9998

p.not.trans^365

## [1] 0.929594

5.4 Cumulative incidence

lambda <- 10/100000
t <- c(1, 5, 10)
# CI = lambda*exp(-t)
CI = 1 - exp(-lambda * t)
cbind(rate = lambda, years = t, cumulative.incidence = CI)

##       rate years cumulative.incidence
## [1,] 1e-04     1         0.0000999950
## [2,] 1e-04     5         0.0004998750
## [3,] 1e-04    10         0.0009995002

5.5 Attributable fractions

#AF = (Re-Ru)/Re = 1-1/RR

risk.unexp <- 1/100000
risk.exp <- c((1/50000),(1/10000),(1/1000))
rr = risk.exp / risk.unexp
af <- 1-1/rr
af.table <- cbind(unexposed = risk.unexp, exposed = risk.exp, attributes.fx = af)
af.table

##      unexposed exposed attributes.fx
## [1,]     1e-05   2e-05          0.50
## [2,]     1e-05   1e-04          0.90
## [3,]     1e-05   1e-03          0.99

af.pop <- af*500/1400
af.table2 <- cbind(af.table, af.pop)
af.table2

##      unexposed exposed attributes.fx    af.pop
## [1,]     1e-05   2e-05          0.50 0.1785714
## [2,]     1e-05   1e-04          0.90 0.3214286
## [3,]     1e-05   1e-03          0.99 0.3535714

6 Rates, risks, odds, and logits

6.1 HIV transmission

curve(x/(1 - x), 0, 1)

curve(log(x/(1 - x)), 0, 1)

# Type of Exposure  Rate per 10,000
exp_type <- c("BT", "IDU", "RAI", "PNS", "RPVI", "IAI", "IPVI", "ROI", "IOI")
exp_rate <- c(9000,67,50,30,10,6.5,5,1,0.5)
exp_risk <- exp_rate / 10000
exp_odds <- exp_risk / (1-exp_risk)
exp_log_odds <- log(exp_odds)
hiv_tx <- data.frame(exp_type, exp_rate, exp_risk, exp_odds, exp_log_odds)
hiv_tx

# Here is a basic Plot
plot(exp_log_odds, type = "b")

# Now, we plot a professional looking graphic
# Calculate range from min to max values of exp_log_odds
g_range <- round(range(min(exp_log_odds), max(exp_log_odds)),2)

# Graph exp_log_odds using y axis that ranges from min to max values of exp_log_odds
# Turn off axes and annotations (axis labels) so we can specify them ourself

plot(exp_log_odds, type="o", col="blue", ylim=g_range, axes=FALSE, ann=FALSE)

# Make x axis using exposure type labels
axis(1, at=1:9, lab=exp_type)

# Make y axis with horizontal labels that display ticks at every 2 marks.
axis(2, las=1, at=2*g_range[1]:g_range[2])

# Create box around plot
box()

# Create a title with a red, bold/italic font
title(main="Log Odds of HIV Transmission", col.main="red", font.main=4)

# Label the x and y axes with dark green text
title(xlab="Exposure Type", col.lab=rgb(0,0.5,0))
title(ylab="Log Odds", col.lab=rgb(0,0.5,0))

# Create a legend at (1, g_range[2]) that is slightly smaller (cex) and uses the same line colors and points used by the actual plots 
legend(1.5, g_range[2], c("Log Odds of Different Exposures"), cex=0.8, col=c("blue"), pch=21:22, lty=1:2)

6.2 Scottish Health Study

In the Scottish Health Study 85 of 1821 people who lived in rented apartments had coronary artery disease, compared to 77 of 2477 people who owned their homes.

Create a named 2-by-2 matrix object from this data Calculate the row totals. (Hint: use apply()) Calculate the risk ratio and odds ratio for these data. Create a matrix of the risks, relative risks, odds, and odds ratios

dat <- matrix(c(85,(1821-85),77,(2477-77)),2,2)
rownames(dat) <- c('Diseased','Survivors')
colnames(dat) <- c('Renters','Home Owners')

rowtot <- apply(dat, 1, sum)   #row totals
coltot <- apply(dat, 2, sum)   #column totals

# Alternatively addmargins(dat) displays row and column totals added to the margins of the matrix dat

risks <- dat['Diseased', ]/coltot
risk.ratio <- risks/risks[2]   #risk ratio
odds <- risks/(1 - risks)
odds.ratio <- odds/odds[2]     #odds ratio
dat                            # display 2x2 table

##           Renters Home Owners
## Diseased       85          77
## Survivors    1736        2400

rbind(risks,risk.ratio,odds,odds.ratio) # display results

##               Renters Home Owners
## risks      0.04667765  0.03108599
## risk.ratio 1.50156543  1.00000000
## odds       0.04896313  0.03208333
## odds.ratio 1.52612365  1.00000000

7 Cross tabulations and stratified analysis

7.1 UGDP

# Read UGDP table
udat <- read.csv('http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/ugdp.txt', header=TRUE, stringsAsFactors = FALSE)
str(udat)

## 'data.frame':    409 obs. of  3 variables:
##  $ Status   : chr  "Death" "Death" "Death" "Death" ...
##  $ Treatment: chr  "Tolbutamide" "Tolbutamide" "Tolbutamide" "Tolbutamide" ...
##  $ Agegrp   : chr  "<55" "<55" "<55" "<55" ...

#Number of observations in the data set: row count of udat
length(udat[,1])

## [1] 409

nrow(udat)

## [1] 409

#column count of udat
length(udat[1,])

## [1] 3

ncol(udat)

## [1] 3

# How many participants received Tolbutamide?
nrow(udat[udat$Treatment=="Tolbutamide",])

## [1] 204

table(udat$Treatment)

## 
##     Placebo Tolbutamide 
##         205         204

# How many participants were under the age of 55?
nrow(udat[udat$Agegrp=="<55",])

## [1] 226

table(udat$Agegrp)

## 
## <55 55+ 
## 226 183

# Calculate the odds ratio for the association of tolbutamide with death. 
tmpdat = table(udat$Status, udat$Treatment)
dat <- tmpdat[,2:1]

coltot <- apply(dat, 2, sum)   #column totals
risks <- dat['Death', ]/coltot
risk.ratio <- risks/risks[2]   #risk ratio
odds <- risks/(1 - risks)
odds.ratio <- odds/odds[2]     #odds ratio
dat                            # display 2x2 table

##           
##            Tolbutamide Placebo
##   Death             30      21
##   Survivor         174     184

rbind(risks,risk.ratio,odds,odds.ratio) # display results

##            Tolbutamide   Placebo
## risks        0.1470588 0.1024390
## risk.ratio   1.4355742 1.0000000
## odds         0.1724138 0.1141304
## odds.ratio   1.5106732 1.0000000

# Alternative method: a*d/b*c
a = dat[1,1]
b = dat[1,2]
c = dat[2,1]
d = dat[2,2]
uor = a*d/(b*c)

OR_CI_h=exp(log(uor)+1.96*sqrt(1/a+1/b+1/c+1/d))
OR_CI_l=exp(log(uor)-1.96*sqrt(1/a+1/b+1/c+1/d))

uor

## [1] 1.510673

OR_CI_l

## [1] 0.8332882

OR_CI_h

## [1] 2.738709

7.1.1 epitab()

library(epitools)

## 
## Attaching package: 'epitools'

## The following object is masked from 'package:survival':
## 
##     ratetable

epitab(dat)

## $tab
##           
##            Tolbutamide        p0 Placebo       p1 oddsratio     lower    upper
##   Death             30 0.1470588      21 0.102439  1.000000        NA       NA
##   Survivor         174 0.8529412     184 0.897561  1.510673 0.8332973 2.738679
##           
##              p.value
##   Death           NA
##   Survivor 0.1812576
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

7.2 The cars data set

7.2.1 Unstratified analysis

cars<-read.table("http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/cars.txt", header=T, stringsAsFactors=F)
head(cars)

str(cars)

## 'data.frame':    96 obs. of  4 variables:
##  $ safety: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ type  : chr  "Medium" "Sport/Utility" "Medium" "Small" ...
##  $ region: chr  "NAmerica" "NAmerica" "NAmerica" "NAmerica" ...
##  $ weight: num  3.4 4.18 3.14 2.6 3.08 ...

prop.table(table(cars$region))

## 
##      Asia  NAmerica 
## 0.3645833 0.6354167

tmptbl <- table(cars$region, cars$safety)
cars_tbl <- tmptbl[,2:1]

epitab(cars_tbl,method="oddsratio")

## $tab
##           
##             1  p0  0        p1 oddsratio     lower    upper    p.value
##   Asia     15 0.5 20 0.3030303       1.0        NA       NA         NA
##   NAmerica 15 0.5 46 0.6969697       2.3 0.9468078 5.587195 0.07175376
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

chisq.test(cars_tbl)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cars_tbl
## X-squared = 2.6562, df = 1, p-value = 0.1031

7.3 Stratified analysis

big <- cars$type == "Large"|cars$type == "Sport/Utility"
cars$cat[big] <- "big"
cars$cat[!big] <- "small"

head(cars)

tmparr <- xtabs(~region+safety+cat, data=cars)

dat <- tmparr[,2:1,]; dat

## , , cat = big
## 
##           safety
## region      1  0
##   Asia      1  4
##   NAmerica  1 26
## 
## , , cat = small
## 
##           safety
## region      1  0
##   Asia     14 16
##   NAmerica 14 20

dat_big <- tmparr[,2:1,"big"] 
dat_sml <- tmparr[,2:1,"small"]

epitab(dat_big)

## Warning in chisq.test(xx, correct = correction): Chi-squared approximation may
## be incorrect

## $tab
##           safety
## region     1  p0  0        p1 oddsratio    lower    upper   p.value
##   Asia     1 0.5  4 0.1333333       1.0       NA       NA        NA
##   NAmerica 1 0.5 26 0.8666667       6.5 0.335154 126.0614 0.2923387
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

epitab(dat_sml)

## $tab
##           safety
## region      1  p0  0        p1 oddsratio    lower    upper   p.value
##   Asia     14 0.5 16 0.4444444      1.00       NA       NA        NA
##   NAmerica 14 0.5 20 0.5555556      1.25 0.464273 3.365477 0.8013262
## 
## $measure
## [1] "wald"
## 
## $conf.level
## [1] 0.95
## 
## $pvalue
## [1] "fisher.exact"

mantelhaen.test(dat)

## 
##  Mantel-Haenszel chi-squared test with continuity correction
## 
## data:  dat
## Mantel-Haenszel X-squared = 0.26628, df = 1, p-value = 0.6058
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.559892 3.657598
## sample estimates:
## common odds ratio 
##          1.431034

7.3.1 Stratified analysis of UGDP data set

str(udat)

## 'data.frame':    409 obs. of  3 variables:
##  $ Status   : chr  "Death" "Death" "Death" "Death" ...
##  $ Treatment: chr  "Tolbutamide" "Tolbutamide" "Tolbutamide" "Tolbutamide" ...
##  $ Agegrp   : chr  "<55" "<55" "<55" "<55" ...

tmparr <- xtabs(~Status+Treatment+Agegrp, data=udat)
udat_arr <- tmparr[,2:1,]
mantelhaen.test(udat_arr)

## 
##  Mantel-Haenszel chi-squared test with continuity correction
## 
## data:  udat_arr
## Mantel-Haenszel X-squared = 0.87967, df = 1, p-value = 0.3483
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7625152 2.5820153
## sample estimates:
## common odds ratio 
##          1.403149

# OR are different 1.87 and 1.24, MH OR 1.40 lower than unadj OR 1.51 -> Confounding

8 Making sense of the *apply() family of functions ## 8.1 apply()

rnorm(16,20,10) # (n=number of observations, mean=default zero, sd=default one)

##  [1] 27.834141 12.215918 18.530816 13.793440 32.275193 45.266087 32.345679
##  [8]  3.616576 26.687085 19.005346 20.999690 18.116655 15.439688 25.432540
## [15] 20.207304 20.906668

m <- matrix(round(rnorm(16, 20, 10)), 4, 4) 
m

##      [,1] [,2] [,3] [,4]
## [1,]    3   30   21   26
## [2,]   24    7   19    7
## [3,]    8   26   25   22
## [4,]   23    2   34   20

apply(m, 1, min) # smallest observation in each row (1st dimension of the array)

## [1] 3 7 8 2

apply(m, 2, sum) # column totals

## [1] 58 65 99 75

a <- array(round(rnorm(16, 20, 10)), dim = c(2, 4, 2))
a

## , , 1
## 
##      [,1] [,2] [,3] [,4]
## [1,]   15   26   20   26
## [2,]   22   28   41   22
## 
## , , 2
## 
##      [,1] [,2] [,3] [,4]
## [1,]   36   16    7   21
## [2,]   13   24   25   24

apply(a, 1, mean)

## [1] 20.875 24.875

sex <- c("male", "female") 
race <- c("w", "b", "h", "o")
tx <- c("yes", "no") 
dimnames(a) <- list(gender = sex, race = race, treatement = tx) 
a

## , , treatement = yes
## 
##         race
## gender    w  b  h  o
##   male   15 26 20 26
##   female 22 28 41 22
## 
## , , treatement = no
## 
##         race
## gender    w  b  h  o
##   male   36 16  7 21
##   female 13 24 25 24

apply(a, c(2, 3), mean) # apply the function mean to each column (race) - average age of w/b/h/o treated/not treated

##     treatement
## race  yes   no
##    w 18.5 24.5
##    b 27.0 20.0
##    h 30.5 16.0
##    o 24.0 22.5

apply(a, c(1, 3), mean) # apply the function mean to each column (gender) - average age of m/f - treated/not treated

##         treatement
## gender     yes   no
##   male   21.75 20.0
##   female 28.25 21.5

# Alternatively colMeans(), rowMeans(), colSums(), rowSums(), addmargins()

8.2 tapply()

dat <- data.frame(age = (round(rnorm(10, 5, 1))), clinic = sample(c("a", "b", "c", "d", "e"))) 
tapply(dat$age, dat$clinic, mean)

##   a   b   c   d   e 
## 5.5 5.0 4.0 5.5 5.0

dat <- data.frame(age = (round(rnorm(100, 5, 1))), clinic = sample(c("a", "b", "c", "d", "e")), treatment = sample(c("Tx","noTx"))) 
b <- tapply(dat$age, dat[, c("clinic","treatment")], mean) 
b

##       treatment
## clinic noTx  Tx
##      a  4.4 5.1
##      b  4.7 4.9
##      c  5.2 5.8
##      d  4.7 5.2
##      e  5.9 5.3

# Error in FUN(dd[x, ], ...) : could not find function "FUN"
c <- by(dat$age, dat[, c(2, 3)], mean) 
c

## clinic: a
## treatment: noTx
## [1] 4.4
## ------------------------------------------------------------ 
## clinic: b
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------ 
## clinic: c
## treatment: noTx
## [1] 5.2
## ------------------------------------------------------------ 
## clinic: d
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------ 
## clinic: e
## treatment: noTx
## [1] 5.9
## ------------------------------------------------------------ 
## clinic: a
## treatment: Tx
## [1] 5.1
## ------------------------------------------------------------ 
## clinic: b
## treatment: Tx
## [1] 4.9
## ------------------------------------------------------------ 
## clinic: c
## treatment: Tx
## [1] 5.8
## ------------------------------------------------------------ 
## clinic: d
## treatment: Tx
## [1] 5.2
## ------------------------------------------------------------ 
## clinic: e
## treatment: Tx
## [1] 5.3

by(dat$age,dat[,c(2,3)], mean)

## clinic: a
## treatment: noTx
## [1] 4.4
## ------------------------------------------------------------ 
## clinic: b
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------ 
## clinic: c
## treatment: noTx
## [1] 5.2
## ------------------------------------------------------------ 
## clinic: d
## treatment: noTx
## [1] 4.7
## ------------------------------------------------------------ 
## clinic: e
## treatment: noTx
## [1] 5.9
## ------------------------------------------------------------ 
## clinic: a
## treatment: Tx
## [1] 5.1
## ------------------------------------------------------------ 
## clinic: b
## treatment: Tx
## [1] 4.9
## ------------------------------------------------------------ 
## clinic: c
## treatment: Tx
## [1] 5.8
## ------------------------------------------------------------ 
## clinic: d
## treatment: Tx
## [1] 5.2
## ------------------------------------------------------------ 
## clinic: e
## treatment: Tx
## [1] 5.3

mydata <- iris
by(mydata$Sepal.Length,list(mydata$Species),mean)

## : setosa
## [1] 5.006
## ------------------------------------------------------------ 
## : versicolor
## [1] 5.936
## ------------------------------------------------------------ 
## : virginica
## [1] 6.588

# Error in FUN(dd[x, ], ...) : could not find function "FUN"

8.3 lapply()

8.4 sapply()

l <- list(a = 5, b = 1:5, c = round(rnorm(20, 2, 1)))
l

## $a
## [1] 5
## 
## $b
## [1] 1 2 3 4 5
## 
## $c
##  [1] 2 1 1 2 3 3 2 1 1 3 3 3 1 3 3 1 0 2 1 4

lapply(l, length)

## $a
## [1] 1
## 
## $b
## [1] 5
## 
## $c
## [1] 20

lapply(l, sum)

## $a
## [1] 5
## 
## $b
## [1] 15
## 
## $c
## [1] 40

sapply(l, length)

##  a  b  c 
##  1  5 20

sapply(l, sum)

##  a  b  c 
##  5 15 40

# sapply() can coerce results into a matrix or into an array
# vapply() a version of sapply() that can be tweaked to run more quickly

8.5 mapply

v1 <- 1:5 
v2 <- 6:10 
v3 <- 11:15 
v1

## [1] 1 2 3 4 5

v2

## [1]  6  7  8  9 10

v3

## [1] 11 12 13 14 15

mapply(sum, v1, v2, v3)

## [1] 18 21 24 27 30

9 Indexing to manipulate data

x <- c(chol = 234, sbp = 148, dbp = 78, age = 54) 
x

## chol  sbp  dbp  age 
##  234  148   78   54

x[1] # by position

## chol 
##  234

x[x > 150] # by logical

## chol 
##  234

x["chol"] # by name

## chol 
##  234

x[1] <- 250 #by position 
x[x < 100] <- NA # by logical 
x["sbp"] <- 150 # by name 
x

## chol  sbp  dbp  age 
##  250  150   NA   NA

9.1 Indexing vectors

9.1.1 By position

x <- 1:40 
x[11] #only the 11th element

## [1] 11

x[-11] #exclude the 11th element

##  [1]  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## [26] 27 28 29 30 31 32 33 34 35 36 37 38 39 40

x[11:20] #members 11 to 20

##  [1] 11 12 13 14 15 16 17 18 19 20

x[-(11:40)] # all but members 11 to 40

##  [1]  1  2  3  4  5  6  7  8  9 10

9.1.1 By logical

age <- c(8, NA, 7, 4) 
which(age < 5 | age >= 8)

## [1] 1 4

age[(age < 5 | age >= 8)] ## This displays NA observation compared to the which function

## [1]  8 NA  4

names <- c("dopey", "grumpy", "doc", "happy", "bashful", "sneezy", "sleepy") 
ages <- c(142, 240, 232, 333, 132, 134, 127) 
sex <- c("m", "m", "f", "f", "f", "m", "m") 

young <- ages < 150 #create logical vector 
names[young] #index name vector using logical vector

## [1] "dopey"   "bashful" "sneezy"  "sleepy"

names[!young] # old dwarves

## [1] "grumpy" "doc"    "happy"

male <- sex == "m" #logical vector male dwarves 
names[male] #index names using logical vector males

## [1] "dopey"  "grumpy" "sneezy" "sleepy"

names[young & male] # young male dwarves

## [1] "dopey"  "sneezy" "sleepy"

# simulate vector with 1000 age values
age <- sample(0:100, 1000, replace = TRUE) 
mean(age)

## [1] 50.206

sd(age)

## [1] 28.60692

agecat <- age # make copy

# replace elements agecat with strings for q
# category
agecat[age < 15] <- "<15" # creating character vector 
agecat[age >= 15 & age < 25] <- "15-24"
agecat[age >= 25 & age < 45] <- "25-44" 
agecat[age >= 45 & age < 65] <- "45-64" 
agecat[age >= 65] <- "65+" 
table(agecat) # get freqs

## agecat
##   <15 15-24 25-44 45-64   65+ 
##   135   104   196   220   345

str(agecat)

##  chr [1:1000] "25-44" "65+" "45-64" "65+" "65+" "45-64" "65+" "65+" "15-24" ...

9.2 Indexing matrices and arrays

9.2.1 indexing matrices

m <- matrix(round(rnorm(16, 50, 5)), 2, 2)
dimnames(m) <- list(behavior = c("type A", "type B"), MI = c("yes", "no"))
m

##         MI
## behavior yes no
##   type A  52 51
##   type B  53 41

m[1, ] #first row

## yes  no 
##  52  51

m[1, , drop = FALSE]

##         MI
## behavior yes no
##   type A  52 51

m[1,2] # cell "c"

## [1] 51

m["type A",]

## yes  no 
##  52  51

m[, "no"]

## type A type B 
##     51     41

m[, 2] < 45 # logical vector

## type A type B 
##  FALSE   TRUE

m[m[, 2] < 45] # data

## [1] 53 41

m[m[,2] < 56,]

##         MI
## behavior yes no
##   type A  52 51
##   type B  53 41

m2 <- matrix(round(rnorm(72, 50, 5)), 6, 6)
m2

##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]   49   53   51   44   50   56
## [2,]   53   52   55   54   47   50
## [3,]   52   47   48   51   49   46
## [4,]   57   45   49   51   55   50
## [5,]   52   47   56   45   53   50
## [6,]   55   48   50   52   57   48

lower.tri(m2)

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
## [1,] FALSE FALSE FALSE FALSE FALSE FALSE
## [2,]  TRUE FALSE FALSE FALSE FALSE FALSE
## [3,]  TRUE  TRUE FALSE FALSE FALSE FALSE
## [4,]  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [5,]  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [6,]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE

upper.tri(m2)

##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
## [1,] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [2,] FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [3,] FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [4,] FALSE FALSE FALSE FALSE  TRUE  TRUE
## [5,] FALSE FALSE FALSE FALSE FALSE  TRUE
## [6,] FALSE FALSE FALSE FALSE FALSE FALSE

m2[lower.tri(m2)]

##  [1] 53 52 57 52 55 47 45 47 48 49 56 50 45 52 57

# Note lower.tri and upper.tri functions

9.2.2 Indexing arrays

a <- array(sample(10:70, 8, rep = T), c(2, 2, 2))
dimnames(a) <- list(exposure = c("e", "E"), disease = c("d", "D"), confounder = c("c", "C"))
a

## , , confounder = c
## 
##         disease
## exposure  d  D
##        e 55 26
##        E 55 34
## 
## , , confounder = C
## 
##         disease
## exposure  d  D
##        e 66 25
##        E 19 35

a[1, 2, 1]

## [1] 26

a["e","D","c"]

## [1] 26

a[a == 32]

## integer(0)

9.3 Indexing lists

l<- list(1:5, matrix(1:4, 2, 2), c("John Snow", "William Farr")) 

l[1] # note l[1] bin versus l[[1]] vector in the next line

## [[1]]
## [1] 1 2 3 4 5

l[[1]]

## [1] 1 2 3 4 5

l[[2]][2, 1]

## [1] 2

l[[3]][2]

## [1] "William Farr"

char <- sapply(l, is.character) 
char

## [1] FALSE FALSE  TRUE

epi.folk<-l[char] 
epi.folk[[1]][2] # note [[]] to address the contents of the bin

## [1] "William Farr"

9.3.1 Indexing the results of modeling

data(infert)
library(survival) # package with clogit()
mod1 <- clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
mod1 # default results (7x risk c spont AB, 4x induced)

## Call:
## clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
## 
##               coef exp(coef) se(coef)     z        p
## spontaneous 1.9859    7.2854   0.3524 5.635 1.75e-08
## induced     1.4090    4.0919   0.3607 3.906 9.38e-05
## 
## Likelihood ratio test=53.15  on 2 df, p=2.869e-12
## n= 248, number of events= 83

str(mod1)

## List of 21
##  $ coefficients     : Named num [1:2] 1.99 1.41
##   ..- attr(*, "names")= chr [1:2] "spontaneous" "induced"
##  $ var              : num [1:2, 1:2] 0.1242 0.0927 0.0927 0.1301
##  $ loglik           : num [1:2] -90.8 -64.2
##  $ score            : num 48.4
##  $ iter             : num 5
##  $ linear.predictors: num [1:248] 3.429 -0.543 0.866 0.866 1.443 ...
##  $ residuals        : Named num [1:248] 0.134 0.328 0.667 0.783 0.373 ...
##   ..- attr(*, "names")= chr [1:248] "1" "2" "3" "4" ...
##  $ means            : Named num [1:2] 0.577 0.573
##   ..- attr(*, "names")= chr [1:2] "spontaneous" "induced"
##  $ method           : chr "exact"
##  $ n                : int 248
##  $ nevent           : num 83
##  $ terms            :Classes 'terms', 'formula'  language Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum)
##   .. ..- attr(*, "variables")= language list(Surv(rep(1, 248L), case), spontaneous, induced, strata(stratum))
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "Surv(rep(1, 248), case)" "spontaneous" "induced" "strata(stratum)"
##   .. .. .. ..$ : chr [1:3] "spontaneous" "induced" "strata(stratum)"
##   .. ..- attr(*, "term.labels")= chr [1:3] "spontaneous" "induced" "strata(stratum)"
##   .. ..- attr(*, "specials")=Dotted pair list of 2
##   .. .. ..$ strata: int 4
##   .. .. ..$ tt    : NULL
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= num 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Surv(rep(1, 248L), case), spontaneous, induced, strata(stratum))
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "nmatrix.2" "numeric" "numeric" "factor"
##   .. .. ..- attr(*, "names")= chr [1:4] "Surv(rep(1, 248L), case)" "spontaneous" "induced" "strata(stratum)"
##  $ assign           :List of 2
##   ..$ spontaneous: int 1
##   ..$ induced    : int 2
##  $ wald.test        : num 31.8
##  $ concordance      : Named num [1:7] 113 22 30 0 0 ...
##   ..- attr(*, "names")= chr [1:7] "concordant" "discordant" "tied.x" "tied.y" ...
##  $ y                : 'Surv' num [1:248, 1:2] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:248] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:2] "time" "status"
##   ..- attr(*, "type")= chr "right"
##  $ timefix          : logi TRUE
##  $ formula          :Class 'formula'  language Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum)
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ xlevels          :List of 1
##   ..$ strata(stratum): chr [1:83] "stratum=1" "stratum=2" "stratum=3" "stratum=4" ...
##  $ call             : language coxph(formula = Surv(rep(1, 248L), case) ~ spontaneous + induced + strata(stratum),      data = infert, method = "exact")
##  $ userCall         : language clogit(case ~ spontaneous + induced + strata(stratum), data = infert)
##  - attr(*, "class")= chr [1:2] "clogit" "coxph"

names(mod1) #structure, names

##  [1] "coefficients"      "var"               "loglik"           
##  [4] "score"             "iter"              "linear.predictors"
##  [7] "residuals"         "means"             "method"           
## [10] "n"                 "nevent"            "terms"            
## [13] "assign"            "wald.test"         "concordance"      
## [16] "y"                 "timefix"           "formula"          
## [19] "xlevels"           "call"              "userCall"

mod1$coeff # name to index result (list element).

## spontaneous     induced 
##    1.985876    1.409012

summod1 <- summary(mod1) #more detailed results names(summod1) #detailed list components 
names(summod1) #detailed list components

##  [1] "call"         "fail"         "na.action"    "n"            "loglik"      
##  [6] "nevent"       "coefficients" "conf.int"     "logtest"      "sctest"      
## [11] "rsq"          "waldtest"     "used.robust"  "concordance"

9.4 Indexing dataframes

data(infert)

infert[1:4, 1:2]

infert[1:4, 2] <- c(NA, 45, NA, 23)
infert[1:4, 1:2]

names(infert)

## [1] "education"      "age"            "parity"         "induced"       
## [5] "case"           "spontaneous"    "stratum"        "pooled.stratum"

infert[1:4, c("education", "age")]

infert[1:4, c("age")] <- c(NA, 45, NA, 23)
infert[1:4, c("education", "age")]

table(infert$parity)

## 
##  1  2  3  4  5  6 
## 99 81 36 18  6  8

# change values of 5 or 6 to missing
infert$parity[infert$parity==5 | infert$parity==6] <- NA
table(infert$parity)

## 
##  1  2  3  4 
## 99 81 36 18

table(infert$parity, exclude=NULL)

## 
##    1    2    3    4 <NA> 
##   99   81   36   18   14

sparcs <- read.csv(file = "http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/sparcsShort.csv", stringsAsFactors = FALSE)

brooklyn<-sparcs[sparcs$county=="59",]
nyc<- sparcs$county=="58"| sparcs$county=="59"| sparcs$county=="60"| sparcs$county=="61"| sparcs$county=="62"
nyc.sparcs<-sparcs[nyc,]

dxs<-sparcs[,"pdx"]
vars<-c("date", "pdx", "disp")
my.vars<-sparcs[,vars]

sparcs2<-sparcs[nyc,vars]

brooklyn.sparcs<-subset(sparcs, county=="59", select=c(date, pdx, disp)) 

nyc.sparcs<-subset(sparcs, county=="59":"62", select=c(county, pdx, disp)) 

nyc.sparcs<-subset(sparcs, county=="59":"62", select=-c(county, pdx, disp))

10 Logistic regression: the Titanic

titanic <- read.csv("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/Data/titanic.csv", header=TRUE, stringsAsFactors = TRUE)
titanic

names(titanic)

##  [1] "X"         "pclass"    "survived"  "name"      "sex"       "age"      
##  [7] "sibsp"     "parch"     "ticket"    "fare"      "cabin"     "embarked" 
## [13] "boat"      "body"      "home.dest"

str(titanic)

## 'data.frame':    1309 obs. of  15 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pclass   : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 22 24 25 26 27 31 46 47 51 55 ...
##  $ sex      : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ age      : num  29 0.917 2 30 25 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : Factor w/ 929 levels "110152","110413",..: 188 50 50 50 50 125 93 16 77 826 ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : Factor w/ 187 levels "","A10","A11",..: 45 81 81 81 81 151 147 17 63 1 ...
##  $ embarked : Factor w/ 4 levels "","Cherbourg",..: 4 4 4 4 4 4 4 4 4 2 ...
##  $ boat     : Factor w/ 28 levels "","1","10","11",..: 13 4 1 1 1 14 3 1 28 1 ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: Factor w/ 369 levels "","?Havana, Cuba",..: 309 231 231 231 231 237 163 25 23 229 ...

tmpsurvbysex <- xtabs(~sex+survived, data=titanic) # alternatively table(titanic$sex, titanic$survived)
survbysex <- tmpsurvbysex[2:1,]

surv1 <- epitab(survbysex,method="oddsratio")

surv1$tab[,"oddsratio"]

##     male   female 
##  1.00000 11.30718

surv2 <- glm(survived ~ relevel(factor(sex), ref = "male"), data = titanic, family=binomial(link="logit"))

exp(surv2$coefficients)

##                              (Intercept) 
##                                0.2360704 
## relevel(factor(sex), ref = "male")female 
##                               11.3071844

summary(surv2)

## 
## Call:
## glm(formula = survived ~ relevel(factor(sex), ref = "male"), 
##     family = binomial(link = "logit"), data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6124  -0.6511  -0.6511   0.7977   1.8196  
## 
## Coefficients:
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                              -1.44363    0.08762  -16.48   <2e-16
## relevel(factor(sex), ref = "male")female  2.42544    0.13602   17.83   <2e-16
##                                             
## (Intercept)                              ***
## relevel(factor(sex), ref = "male")female ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741.0  on 1308  degrees of freedom
## Residual deviance: 1368.1  on 1307  degrees of freedom
## AIC: 1372.1
## 
## Number of Fisher Scoring iterations: 4

exp(confint(surv2))

## Waiting for profiling to be done...

##                                              2.5 %     97.5 %
## (Intercept)                              0.1981912  0.2794823
## relevel(factor(sex), ref = "male")female 8.6871038 14.8093675

surv3 <- glm(survived ~ relevel(factor(sex), ref = "male")+relevel(pclass, ref="3rd")+age, data = titanic, family=binomial(link="logit"))

surv3

## 
## Call:  glm(formula = survived ~ relevel(factor(sex), ref = "male") + 
##     relevel(pclass, ref = "3rd") + age, family = binomial(link = "logit"), 
##     data = titanic)
## 
## Coefficients:
##                              (Intercept)  
##                                 -1.26543  
## relevel(factor(sex), ref = "male")female  
##                                  2.49784  
##          relevel(pclass, ref = "3rd")1st  
##                                  2.28966  
##          relevel(pclass, ref = "3rd")2nd  
##                                  1.00909  
##                                      age  
##                                 -0.03439  
## 
## Degrees of Freedom: 1045 Total (i.e. Null);  1041 Residual
##   (263 observations deleted due to missingness)
## Null Deviance:       1415 
## Residual Deviance: 982.5     AIC: 992.5

11 Survival analysis: papal longevity

# install.packages("survival")
library(survival)
library(ranger)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:epiDisplay':
## 
##     alpha

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggfortify)

load("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/Data/popeSurvival.RData")
popes

# For each analysis, provide Kaplan-Meir curves and Log Rank tests.

# How to delete it by name: popes <- popes[,-11] / popes <- popes[,-12]
popes$time1 <- as.Date(popes$DeathDate)-as.Date(popes$BirthDate)
popes$time2 <- as.Date(popes$DeathDate)-as.Date(popes$ElectedDate)

str(popes)

## 'data.frame':    87 obs. of  12 variables:
##  $ papalName  : chr  "Gregory IX" "Celestine IV" "Innocent IV" "Alexander IV" ...
##  $ Birth      : chr  "9 July 1170" NA "9 July 1195" "9 July 1185" ...
##  $ Elected    : chr  "19 Mar 1227" "25 Oct 1241" "25 Jun 1243" "12 Dec 1254" ...
##  $ Death      : chr  "22 Aug 1241" "10 Nov 1241" "7 Dec 1254" "25 May 1261" ...
##  $ papalName.1: chr  "Gregory IX" "Celestine IV" "Innocent IV" "Alexander IV" ...
##  $ birthPlace : chr  "Anagni, Papal States, Holy Roman Empire" "Milan, Italy, Holy Roman Empire" "Genoa, Republic of Genoa, Holy Roman Empire" "Jenne, Papal States" ...
##  $ roman      : chr  "Yes" "No" "No" "Yes" ...
##  $ BirthDate  : Date, format: "1170-07-09" NA ...
##  $ DeathDate  : Date, format: "1241-08-22" "1241-11-10" ...
##  $ ElectedDate: Date, format: "1227-03-19" "1241-10-25" ...
##  $ time1      : 'difftime' num  25977 NA 21701 27714 ...
##   ..- attr(*, "units")= chr "days"
##  $ time2      : 'difftime' num  5270 16 4183 2356 ...
##   ..- attr(*, "units")= chr "days"

kmp_fit1 <- survfit(Surv(time1)~roman,data=popes)
summary(kmp_fit1, times = c(15000,1000*(16:50)))

## Call: survfit(formula = Surv(time1) ~ roman, data = popes)
## 
## 1 observation deleted due to missingness 
##                 roman=No 
##   time n.risk n.event survival std.err lower 95% CI upper 95% CI
##  15000     57       0   1.0000  0.0000      1.00000        1.000
##  16000     57       0   1.0000  0.0000      1.00000        1.000
##  17000     56       1   0.9825  0.0174      0.94896        1.000
##  18000     55       1   0.9649  0.0244      0.91831        1.000
##  19000     53       2   0.9298  0.0338      0.86582        0.999
##  20000     51       2   0.8947  0.0406      0.81851        0.978
##  21000     48       3   0.8421  0.0483      0.75257        0.942
##  22000     44       4   0.7719  0.0556      0.67034        0.889
##  23000     41       3   0.7193  0.0595      0.61162        0.846
##  24000     36       5   0.6316  0.0639      0.51799        0.770
##  25000     29       7   0.5088  0.0662      0.39422        0.657
##  26000     24       5   0.4211  0.0654      0.31055        0.571
##  27000     21       3   0.3684  0.0639      0.26226        0.518
##  28000     18       3   0.3158  0.0616      0.21550        0.463
##  29000     16       2   0.2807  0.0595      0.18525        0.425
##  30000      7       9   0.1228  0.0435      0.06136        0.246
##  31000      5       3   0.0702  0.0338      0.02728        0.181
##  32000      3       1   0.0526  0.0296      0.01749        0.158
##  33000      1       2   0.0175  0.0174      0.00251        0.122
## 
##                 roman=Yes 
##   time n.risk n.event survival std.err lower 95% CI upper 95% CI
##  15000     29       0   1.0000  0.0000      1.00000        1.000
##  16000     29       0   1.0000  0.0000      1.00000        1.000
##  17000     29       0   1.0000  0.0000      1.00000        1.000
##  18000     29       0   1.0000  0.0000      1.00000        1.000
##  19000     29       0   1.0000  0.0000      1.00000        1.000
##  20000     28       1   0.9655  0.0339      0.90134        1.000
##  21000     28       0   0.9655  0.0339      0.90134        1.000
##  22000     28       0   0.9655  0.0339      0.90134        1.000
##  23000     27       1   0.9310  0.0471      0.84323        1.000
##  24000     26       1   0.8966  0.0566      0.79229        1.000
##  25000     24       2   0.8276  0.0701      0.70092        0.977
##  26000     14      10   0.4828  0.0928      0.33122        0.704
##  27000     11       3   0.3793  0.0901      0.23812        0.604
##  28000     10       1   0.3448  0.0883      0.20880        0.569
##  29000      9       1   0.3103  0.0859      0.18039        0.534
##  30000      5       4   0.1724  0.0701      0.07767        0.383
##  31000      3       2   0.1034  0.0566      0.03543        0.302
##  32000      1       2   0.0345  0.0339      0.00503        0.237
##  33000      1       0   0.0345  0.0339      0.00503        0.237
##  34000      1       0   0.0345  0.0339      0.00503        0.237

autoplot(kmp_fit1)

survdiff(Surv(time1)~roman,data=popes,rho=0)

## Call:
## survdiff(formula = Surv(time1) ~ roman, data = popes, rho = 0)
## 
## n=86, 1 observation deleted due to missingness.
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## roman=No  57       57     51.6     0.557      1.43
## roman=Yes 29       29     34.4     0.837      1.43
## 
##  Chisq= 1.4  on 1 degrees of freedom, p= 0.2

kmp_fit2 <- survfit(Surv(time2)~roman,data=popes)
summary(kmp_fit2, times = c(1,1000*(1:50)))

## Call: survfit(formula = Surv(time2) ~ roman, data = popes)
## 
##                 roman=No 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     58       0   1.0000  0.0000      1.00000        1.000
##  1000     44      14   0.7586  0.0562      0.65611        0.877
##  2000     37       7   0.6379  0.0631      0.52550        0.774
##  3000     27      10   0.4655  0.0655      0.35332        0.613
##  4000     16      11   0.2759  0.0587      0.18181        0.419
##  5000      9       7   0.1552  0.0475      0.08512        0.283
##  6000      5       4   0.0862  0.0369      0.03730        0.199
##  7000      3       2   0.0517  0.0291      0.01718        0.156
##  8000      1       2   0.0172  0.0171      0.00247        0.120
##  9000      1       0   0.0172  0.0171      0.00247        0.120
## 
##                 roman=Yes 
##   time n.risk n.event survival std.err lower 95% CI upper 95% CI
##      1     29       0   1.0000  0.0000      1.00000        1.000
##   1000     23       6   0.7931  0.0752      0.65856        0.955
##   2000     16       7   0.5517  0.0923      0.39742        0.766
##   3000     14       2   0.4828  0.0928      0.33122        0.704
##   4000     12       2   0.4138  0.0915      0.26832        0.638
##   5000      9       3   0.3103  0.0859      0.18039        0.534
##   6000      6       3   0.2069  0.0752      0.10146        0.422
##   7000      5       1   0.1724  0.0701      0.07767        0.383
##   8000      4       1   0.1379  0.0640      0.05553        0.343
##   9000      2       2   0.0690  0.0471      0.01811        0.263
##  10000      1       1   0.0345  0.0339      0.00503        0.237
##  11000      1       0   0.0345  0.0339      0.00503        0.237

autoplot(kmp_fit2)

survdiff(Surv(time2)~roman,data=popes,rho=0)

## Call:
## survdiff(formula = Surv(time2) ~ roman, data = popes, rho = 0)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## roman=No  58       58     51.1     0.931      2.42
## roman=Yes 29       29     35.9     1.325      2.42
## 
##  Chisq= 2.4  on 1 degrees of freedom, p= 0.1

# Look for alternative logrank_test(Surv(time2)~roman,data=popes, distribution = "exact")
# plot(km_fit, xlab="Days", main = 'Kaplan Meyer Plot') #base graphics is always ready

Self-Study Exercises