2.5.1 From the RStudio main menu, select ‘File’ > ‘New Project’ > ‘New Directory’ > ‘Empty Project’. Name the new directory ph251d-homework. Use R to display the file path to the working directory.

# Print the absolute path of the current working directory.
# NOTE(review): the path echoed below does not end in "ph251d-homework", the
# project directory the exercise asks for -- confirm the new project was the
# one open when this was run.
getwd()
## [1] "/Users/Michelle/Desktop/Berkeley MPH/Fall 2016/R for Epi's/R Program Files"

2.5.2 Recreate Table 2.26 using any combination of the matrix, cbind, rbind, dimnames, or names functions.

# Table 2.26: vital status (rows) by smoking status (columns).
# The counts are entered row by row (byrow = TRUE) so the code reads like the
# printed table. The Alive / Non-Smoker cell is 502, not 50: the Whickham data
# read in for exercise 2.5.6 show 945 women alive in total, 443 of them
# smokers, which leaves 502 non-smokers.
dat <- matrix(
  c(139, 230,
    443, 502),
  nrow = 2, ncol = 2, byrow = TRUE,
  dimnames = list(c("Dead", "Alive"), c("Smoker", "Non-Smoker"))
)
coltot <- apply(dat, 2, sum)      # column totals (women per smoking group)
risks <- dat["Dead", ] / coltot   # risk of death within each smoking group
risk.ratio <- risks / risks[2]    # risk ratio, Non-Smoker as reference
odds <- risks / (1 - risks)
odds.ratio <- odds / odds[2]      # odds ratio, Non-Smoker as reference
dat                               # display results
##       Smoker Non-Smoker
## Dead     139        230
## Alive    443        502

2.5.3 Starting with the 2x2 matrix object we created in Table 2.26, using any combination of apply, cbind, rbind, names, and dimnames functions, recreate the Table 2.27.

# One way: build the margins by hand.
rtots <- apply(dat, 1, sum)                  # row totals (MARGIN = 1)
dat_tots <- cbind(dat, Total = rtots)
# Column totals of the widened table, so this vector has THREE elements
# (Smoker, Non-Smoker, Total). Use `coltot` when only the 2x2 totals are needed.
ctots <- apply(dat_tots, 2, sum)
dat_tots <- rbind(dat_tots, Total = ctots)
dat_tots
##       Smoker Non-Smoker Total
## Dead     139        230   369
## Alive    443        502   945
## Total    582        732  1314
# Another way: let addmargins() append the sums.
addmargins(dat)
##       Smoker Non-Smoker  Sum
## Dead     139        230  369
## Alive    443        502  945
## Sum      582        732 1314

2.5.4 Using the 2×2 data from Table 2.26, use the sweep and apply functions to calculate row marginal, column marginal, and joint distributions (i.e., three tables).

# Row distribution: each row divided by its row total (rows sum to 1).
tab.rowdist <- sweep(dat, 1, rtots, "/")
tab.rowdist
##          Smoker Non-Smoker
## Dead  0.3766938  0.6233062
## Alive 0.4687831  0.5312169
# Column distribution: each column divided by its column total (columns sum
# to 1). The divisor must be the 2-element `coltot`; the 3-element `ctots`
# made sweep() warn and recycle the wrong denominators.
tab.coldist <- sweep(dat, 2, coltot, "/")
tab.coldist
##          Smoker Non-Smoker
## Dead  0.2388316  0.3142077
## Alive 0.7611684  0.6857923
# Joint distribution: every cell divided by the grand total (cells sum to 1).
tab.jointdist <- dat / sum(dat)
tab.jointdist
##          Smoker Non-Smoker
## Dead  0.1057839  0.1750381
## Alive 0.3371385  0.3820396

2.5.5 Using the data from the previous problems, recreate Table 2.28 and interpret the results.

dat
##       Smoker Non-Smoker
## Dead     139        230
## Alive    443        502
c_tots <- apply(dat, 2, sum)      # column totals
c_tots
##     Smoker Non-Smoker
##        582        732
# dat["Dead", ] selects the whole "Dead" row. Without the comma, dat["Dead"]
# indexes the matrix as a plain vector by element name and returns NA.
risks <- dat["Dead", ] / c_tots
risks
##     Smoker Non-Smoker
##  0.2388316  0.3142077
risk.ratio <- risks / risks[2]    # Non-Smoker is the reference group
odds <- risks / (1 - risks)
odds.ratio <- odds / odds[2]
rbind(risks, risk.ratio, odds, odds.ratio)   # display results
##               Smoker Non-Smoker
## risks      0.2388316  0.3142077
## risk.ratio 0.7601076  1.0000000
## odds       0.3137698  0.4581673
## odds.ratio 0.6848366  1.0000000
# Interpretation: in these crude (unstratified) data the risk of death is
# lower among smokers (0.24) than among non-smokers (0.31), giving a risk
# ratio of 0.76 and an odds ratio of 0.68. No other variable (e.g. age) is
# accounted for in this table, so this is an unadjusted comparison.

2.5.6 Read in the Whickham, England data using the R code below. Stratified by age category, calculate the risk of death comparing smokers to nonsmokers. Show your results. What is your interpretation?

# Read the individual-level Whickham data (one row per woman).
# stringsAsFactors = TRUE keeps the three columns as factors, matching the
# str() output below, also on R >= 4.0 where the default became FALSE.
whickdat <- read.table(
  "http://www.medepi.net/data/whickham.txt",
  sep = ",", header = TRUE, stringsAsFactors = TRUE
)
str(whickdat)
## 'data.frame':    1314 obs. of  3 variables:
##  $ Vital.Status: Factor w/ 2 levels "Alive","Dead": 2 2 1 1 1 1 1 1 1 1 ...
##  $ Smoking     : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Age         : Factor w/ 7 levels "18-24","25-34",..: 1 1 1 1 1 1 1 1 1 1 ...

# 3-D table of counts: Vital.Status x Age x Smoking.
xtab <- xtabs(~ Vital.Status + Age + Smoking, data = whickdat)
xtab
## , , Smoking = No
## 
##             Age
## Vital.Status 18-24 25-34 35-44 45-54 55-64 65-74 75+
##        Alive    61   152   114    66    81    28   0
##        Dead      1     5     7    12    40   101  64
## 
## , , Smoking = Yes
## 
##             Age
## Vital.Status 18-24 25-34 35-44 45-54 55-64 65-74 75+
##        Alive    53   121    95   103    64     7   0
##        Dead      2     3    14    27    51    29  13

# Overall vital-status totals, taken from the Whickham table itself
# (MARGIN = 1 is the Vital.Status dimension of xtab).
rowtotwhick <- apply(xtab, 1, sum)
rowtotwhick
## Alive  Dead 
##   945   369

# Vital status by age (smoking ignored), with row and column totals appended.
whickdat.table <- table(whickdat$Vital.Status, whickdat$Age)
whickdat.table
##        
##         18-24 25-34 35-44 45-54 55-64 65-74 75+
##   Alive   114   273   209   169   145    35   0
##   Dead      3     8    21    39    91   130  77
rtotswhick <- apply(whickdat.table, 1, sum)          # row totals
whick_tots <- cbind(whickdat.table, Total = rtotswhick)
ctotswhick <- apply(whick_tots, 2, sum)              # column totals
whick_tots <- rbind(whick_tots, Total = ctotswhick)
whick_tots
##       18-24 25-34 35-44 45-54 55-64 65-74 75+ Total
## Alive   114   273   209   169   145    35   0   945
## Dead      3     8    21    39    91   130  77   369
## Total   117   281   230   208   236   165  77  1314

# Age-stratified risk of death, smokers vs nonsmokers.
whick_deaths <- xtab["Dead", , ]                 # Age x Smoking deaths
whick_at_risk <- apply(xtab, c(2, 3), sum)       # Age x Smoking group sizes
whick_risk <- whick_deaths / whick_at_risk
whick_rr <- whick_risk[, "Yes"] / whick_risk[, "No"]   # nonsmokers = reference
round(cbind(Nonsmoker = whick_risk[, "No"],
            Smoker = whick_risk[, "Yes"],
            RR = whick_rr), 3)
##       Nonsmoker Smoker    RR
## 18-24     0.016  0.036 2.255
## 25-34     0.032  0.024 0.760
## 35-44     0.058  0.128 2.220
## 45-54     0.154  0.208 1.350
## 55-64     0.331  0.443 1.342
## 65-74     0.783  0.806 1.029
## 75+       1.000  1.000 1.000

# Crude (all ages combined) risk of death by smoking status, for contrast.
whick_crude <- apply(xtab, c(1, 3), sum)         # Vital.Status x Smoking
whick_crude_risk <- whick_crude["Dead", ] / apply(whick_crude, 2, sum)
round(whick_crude_risk, 3)
##    No   Yes 
## 0.314 0.239

# Interpretation: ignoring age, smokers appear to have a LOWER risk of death
# (0.239 vs 0.314). Within age strata the picture reverses: in every stratum
# except 25-34 the smokers' risk is equal to or higher than the nonsmokers'
# (RR >= 1). The nonsmokers are concentrated in the oldest, highest-risk
# strata (129 and 64 nonsmokers vs 36 and 13 smokers aged 65-74 and 75+), so
# age confounds the crude comparison.

3.5.1

Use the read.table function to read in the syphilis data. Evaluate the structure of the data frame. Do not attach the data frame (yet). Create a 3-dimensional array using both the table or xtabs function. Now attach the data frame using the attach function. Create the same 3-dimensional array using both the table or xtabs function.

# Read the individual-level syphilis data (one row per case).
# stringsAsFactors = TRUE keeps the columns as factors, matching the str()
# output below, also on R >= 4.0 where the default became FALSE.
std89c <- read.table(
  "http://www.medepi.net/data/syphilis89c.txt",
  sep = ",", header = TRUE, stringsAsFactors = TRUE
)

# Evaluate the structure of the data frame first.
str(std89c)
## 'data.frame':    44081 obs. of  3 variables:
##  $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Race: Factor w/ 3 levels "Black","Other",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Age : Factor w/ 8 levels "<=14",">55","15-19",..: 1 1 3 3 3 3 3 3 3 3 ...

# Before attaching, the columns are only reachable through the data frame:
# via data = in xtabs(), or via std89c$... in table().
xtabstd <- xtabs(~ Race + Age + Sex, data = std89c)
xtabstd
## , , Sex = Female
## 
##        Age
## Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
##   Black  165   92  2257  4503  3590  2628  1505   392
##   Other   11   15   158   307   283   167   149    40
##   White   14   24   253   475   433   316   243    55
## 
## , , Sex = Male
## 
##        Age
## Race    <=14  >55 15-19 20-24 25-29 30-34 35-44 45-54
##   Black   31  823  1412  4059  4121  4453  3858  1619
##   Other    7  108   210   654   633   520   492   202
##   White    2  216    88   407   550   564   654   323
tabstd <- table(Race = std89c$Race, Age = std89c$Age, Sex = std89c$Sex)

# attach() is used only because the exercise asks for it: it puts the data
# frame's columns on the search path so they can be referred to by bare name.
# Call detach(std89c) once the attached names are no longer needed.
attach(std89c)
tabstd_attached <- table(Race, Age, Sex)
xtabstd_attached <- xtabs(~ Race + Age + Sex)

# All four constructions must hold the same 3 x 8 x 2 array of counts.
stopifnot(
  all(tabstd == xtabstd),
  all(tabstd_attached == xtabstd),
  all(xtabstd_attached == xtabstd)
)

3.5.2 Use the apply function to get marginal totals for the syphilis 3-dimensional array.

# Marginal totals of the Race x Age x Sex array `xtabstd` built in 3.5.1.
# apply()'s MARGIN names the dimension to keep; the other two are summed over.
apply(xtabstd, 1, sum)   # totals by Race
## Black Other White 
## 35508  3956  4617 
apply(xtabstd, 2, sum)   # totals by Age
##  <=14   >55 15-19 20-24 25-29 30-34 35-44 45-54 
##   230  1278  4378 10405  9610  8648  6901  2631 
apply(xtabstd, 3, sum)   # totals by Sex
## Female   Male 
##  18075  26006 
sum(xtabstd)             # grand total; equals the number of rows in std89c
## [1] 44081

3.5.3 Use the sweep and apply functions to get marginal and joint distributions for a 3-D array.

3.5.4 Review and read in the group-level, tabular data set of primary and secondary syphilis cases in the United States in 1989. Use the rep function on the data frame fields to recreate the individual-level data frame with over 40,000 observations.

# Read the group-level (tabular) syphilis data from the course site.
std89b <- read.table("http://www.medepi.net/data/syphilis89b.txt", sep = ",", header = TRUE)

3.5.5 Working with population estimates can be challenging because of the amount of data manipulation. Study the 2000 population estimates for California Counties: [http://www.medepi.net/data/calpop/CalCounties2000.txt]. Now, study and implement this R code. For each expression or group of expressions, explain in words what the R code is doing. Be sure to display intermediate objects to understand each step.