Problem Exercises 4

1.Read in the Evans data set. Type in the R expressions below. Do not copy and paste. After each expression briefly describe what the step accomplished.

edat <- read.table("~/R_Workspace/data-master/evans.txt", header = TRUE, sep = "")
str(edat)

## 'data.frame':    609 obs. of  12 variables:
##  $ id : int  21 31 51 71 74 91 111 131 141 191 ...
##  $ chd: int  0 0 1 0 0 0 1 0 0 0 ...
##  $ cat: int  0 0 1 1 0 0 0 0 0 0 ...
##  $ age: int  56 43 56 64 49 46 52 63 42 55 ...
##  $ chl: int  270 159 201 179 243 252 179 217 176 250 ...
##  $ smk: int  0 1 1 1 1 1 1 0 1 0 ...
##  $ ecg: int  0 0 1 0 0 0 1 0 0 1 ...
##  $ dbp: int  80 74 112 100 82 88 80 92 76 114 ...
##  $ sbp: int  138 128 164 200 145 142 128 135 114 182 ...
##  $ hpt: int  0 0 1 1 0 0 0 0 0 1 ...
##  $ ch : int  0 0 1 1 0 0 0 0 0 0 ...
##  $ cc : int  0 0 201 179 0 0 0 0 0 0 ...

xtabs(~chd, data = edat)

## chd
##   0   1 
## 538  71

edat$chd2 <- factor(edat$chd, levels = 0:1, 
                    labels = c("No", "Yes"))
xtabs(~chd2, data = edat) # test

## chd2
##  No Yes 
## 538  71

xtabs(~cat, data = edat)

## cat
##   0   1 
## 487 122

edat$cat2 <- factor(edat$cat, levels = 0:1,
                    labels = c("Normal", "High"))
xtabs(~cat2, data = edat) # test

## cat2
## Normal   High 
##    487    122

xtabs(~smk, data = edat)

## smk
##   0   1 
## 222 387

edat$smk2 <- factor(edat$smk, levels = 0:1, 
                    labels = c("Never", "Ever"))
xtabs(~smk2, data = edat) # test

## smk2
## Never  Ever 
##   222   387

xtabs(~ecg, data = edat)

## ecg
##   0   1 
## 443 166

edat$ecg2 <- factor(edat$ecg, levels = 0:1, 
                    labels = c("Normal", "Abnormal"))
xtabs(~ecg2, data = edat) # test

## ecg2
##   Normal Abnormal 
##      443      166

xtabs(~hpt, data = edat)

## hpt
##   0   1 
## 354 255

edat$hpt2 <- factor(edat$hpt, levels = 0:1, 
                    labels = c("No", "Yes"))
xtabs(~hpt2, data = edat) # test

## hpt2
##  No Yes 
## 354 255

Explanation:

Imported the dataset
Created new variables; chd, cat, smk, ecg, and hpt
Changed binary 1/0 labels to Yes/No, created new variables with Yes/No labels instead of 1/0; chd2, cat2, smk2, ecg2, and hpt2
Then checked by printing each one

#### (2)
quantile(edat$age)

##   0%  25%  50%  75% 100% 
##   40   46   52   60   76

edat$age4 <- cut(edat$age, quantile(edat$age), right = FALSE, 
                 include.lowest = TRUE)
xtabs(~age4, data = edat) # test

## age4
## [40,46) [46,52) [52,60) [60,76] 
##     134     158     158     159

Explanation:

Breaks up age into quartiles, shows us each one.
Also shows us the distribution of the variable between each quartile.
Creates variable, age4, which uses quartiles as levels

#### (3) 
hptnew <- rep(NA, nrow(edat))
normal <- edat$sbp < 120 & edat$dbp < 80
hptnew[normal] <- 1
prehyp <- (edat$sbp >= 120 & edat$sbp < 140) | 
            (edat$dbp >= 80 & edat$dbp < 90)
hptnew[prehyp] <- 2
stage1 <- (edat$sbp >= 140 & edat$sbp < 160) | 
            (edat$dbp >= 90 & edat$dbp < 100)
hptnew[stage1] <- 3
stage2 <- edat$sbp >= 160 | edat$dbp >= 100
hptnew[stage2] <- 4
edat$hpt4 <- factor(hptnew, levels = 1:4, 
                    labels=c("Normal", "PreHTN", "HTN.Stage1", 
                             "HTN.Stage2"))
xtabs(~hpt4, data = edat) # test

## hpt4
##     Normal     PreHTN HTN.Stage1 HTN.Stage2 
##         56        165        177        211

Explanation:

Creates new data set involving systolic and diastolyic blood pressure.
Creates 4 different levels (Normal, PreHTN, HTN.Stage1, HTN.Stage2) using blood pressure data and then places them in a new variable, hpt4.

#### (4)
xtabs(~hpt2 + hpt4, data = edat)

##      hpt4
## hpt2  Normal PreHTN HTN.Stage1 HTN.Stage2
##   No      56    165        133          0
##   Yes      0      0         44        211

Explanation:

Compares the binary hpt2 variable to the four level hpt4 variable
The levels in each variable are compatible, all the people listed as “normal” blood pressure in hpt4 are listed as “no” hypertension in hpt2, all HTN.Stage2 in hpt4 are included in hpt2 under “yes”, etc
Indviduals with normal blood pressure do not have hypertension and Stage 2 indviduals all have hypertension, etc.

2. Review the California 2004 surveillance data on human West Nile virus cases available at ~/data/wnv/wnv2004raw.txt. Read in the data setting as.is = TRUE, taking into account missing values (use na.strings option). Convert the calendar dates into the international standard format. Using the write.table function export the data as an ASCII text file.

wnv2004data <- read.table("~/R_Workspace/data-master/wnv2004raw.txt", header = T, sep = ",", as.is = T, na.strings = "N/A")

head(wnv2004data)

##   ID         County Age Sex Syndrome Date.onset  Date.test Death
## 1  1 San Bernardino  40   F      WNF 05/19/2004 06/02/2004    No
## 2  2 San Bernardino  64   F      WNF 05/22/2004 06/16/2004    No
## 3  3 San Bernardino  19   M      WNF 05/22/2004 06/16/2004    No
## 4  4 San Bernardino  12   M      WNF 05/16/2004 06/16/2004    No
## 5  5 San Bernardino  12   M      WNF 05/14/2004 06/16/2004    No
## 6  6 San Bernardino  17   M      WNF 06/07/2004 06/17/2004    No

summary(wnv2004data)

##        ID         County              Age                Sex           
##  Min.   :  1   Length:781         Length:781         Length:781        
##  1st Qu.:196   Class :character   Class :character   Class :character  
##  Median :391   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :391                                                           
##  3rd Qu.:586                                                           
##  Max.   :781                                                           
##    Syndrome          Date.onset         Date.test        
##  Length:781         Length:781         Length:781        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     Death          
##  Length:781        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

onset_time <- as.Date(wnv2004data$Date.onset, format = "%m/%d/%Y")
test_time <- as.Date(wnv2004data$Date.test, format = "%m/%d/%Y")

class(onset_time)

## [1] "Date"

class(test_time)

## [1] "Date"

onset_julian <- julian(onset_time)
test_julian <- julian(test_time)

onset_international <- as.Date(onset_julian, origin = as.Date('1970-01-01'))
test_international <- as.Date(test_julian, origin = as.Date('1970-01-01'))

head(onset_international)

## [1] "2004-05-19" "2004-05-22" "2004-05-22" "2004-05-16" "2004-05-14"
## [6] "2004-06-07"

head(test_international)

## [1] "2004-06-02" "2004-06-16" "2004-06-16" "2004-06-16" "2004-06-16"
## [6] "2004-06-17"

wnv2004data$Date.onset <- onset_international
wnv2004data$Date.test <- test_international
head(wnv2004data)

##   ID         County Age Sex Syndrome Date.onset  Date.test Death
## 1  1 San Bernardino  40   F      WNF 2004-05-19 2004-06-02    No
## 2  2 San Bernardino  64   F      WNF 2004-05-22 2004-06-16    No
## 3  3 San Bernardino  19   M      WNF 2004-05-22 2004-06-16    No
## 4  4 San Bernardino  12   M      WNF 2004-05-16 2004-06-16    No
## 5  5 San Bernardino  12   M      WNF 2004-05-14 2004-06-16    No
## 6  6 San Bernardino  17   M      WNF 2004-06-07 2004-06-17    No

write.table(wnv2004data, "wnv2004 dates")

3. On April 19th 1940 the local health officer in the village of Lycoming, Oswego County, New York reported the occurence of an outbreak of acute gastrointestinal illness to the District Health Officer in Syracuse. Dr. AM Rubin, epidemilogist-in-training was assigned to conduct an investigation???

Using RStudio plot the cases by time of onset of illness. What does the graph tell you? (Process the text data and then use the hist) function.

It hows us that the onset of infection is centered on April 19 at 12:00 AM.

Are there any cases for which the times of onset are inconsistent with the general experience? How might they be explained?

Yes, there are two cases that are inconsistent with the rest of the curve. One is several hours before the when the picnic took place while the other is several hours afterwards. Both cases could be unrelated to the outbreak, particular the earlier one.

How could the data be sorted by illness status and illness onset times?

You could either sort it by Yes or No illness. Then the ones that are ill could be sorted by onset time.

Where poissible, calculate incubation periods and illustarte their distribution with an appropriate graph. Determine the mean, median, and range of the incubation period.

oswego_data <- read.table("~/R_Workspace/data-master/oswego.txt", header = T, sep = "", as.is = T, na.strings = "N/A")

summary(oswego_data)

##        id            age            sex             meal.time        
##  Min.   : 1.0   Min.   : 3.00   Length:75          Length:75         
##  1st Qu.:19.5   1st Qu.:16.50   Class :character   Class :character  
##  Median :38.0   Median :36.00   Mode  :character   Mode  :character  
##  Mean   :38.0   Mean   :36.81                                        
##  3rd Qu.:56.5   3rd Qu.:57.50                                        
##  Max.   :75.0   Max.   :77.00                                        
##      ill             onset.date         onset.time       
##  Length:75          Length:75          Length:75         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   baked.ham           spinach          mashed.potato     
##  Length:75          Length:75          Length:75         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  cabbage.salad         jello              rolls          
##  Length:75          Length:75          Length:75         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  brown.bread            milk              coffee         
##  Length:75          Length:75          Length:75         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##     water              cakes           van.ice.cream     
##  Length:75          Length:75          Length:75         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  choc.ice.cream     fruit.salad       
##  Length:75          Length:75         
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

class(oswego_data$onset.date)

## [1] "character"

oswego_data$onset.date <- paste(oswego_data$onset.date, "1940", sep = "/")

head(oswego_data)

##   id age sex meal.time ill onset.date onset.time baked.ham spinach
## 1  2  52   F   8:00 PM   Y  4/19/1940   12:30 AM         Y       Y
## 2  3  65   M   6:30 PM   Y  4/19/1940   12:30 AM         Y       Y
## 3  4  59   F   6:30 PM   Y  4/19/1940   12:30 AM         Y       Y
## 4  6  63   F   7:30 PM   Y  4/18/1940   10:30 PM         Y       Y
## 5  7  70   M   7:30 PM   Y  4/18/1940   10:30 PM         Y       Y
## 6  8  40   F   7:30 PM   Y  4/19/1940    2:00 AM         N       N
##   mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 1             Y             N     N     Y           N    N      Y     N
## 2             Y             Y     N     N           N    N      Y     N
## 3             N             N     N     N           N    N      Y     N
## 4             N             Y     Y     N           N    N      N     Y
## 5             Y             N     Y     Y           Y    N      Y     Y
## 6             N             N     N     N           N    N      N     N
##   cakes van.ice.cream choc.ice.cream fruit.salad
## 1     N             Y              N           N
## 2     N             Y              Y           N
## 3     Y             Y              Y           N
## 4     N             Y              N           N
## 5     N             Y              N           N
## 6     N             Y              Y           N

oswego_data.MealTD <- paste("4/18/1940", oswego_data$meal.time, sep = "")

oswego_data.OnsetTD <- paste(oswego_data$onset.date, oswego_data$onset.time, sep = "")

oswego_data$MealTD <- as.POSIXct(strptime(oswego_data.MealTD, format = "%m/%d/%Y %I:%M %p"))

oswego_data$OnsetTD <- as.POSIXct(strptime(oswego_data.OnsetTD, format = "%m/%d/%Y %I:%M %p"))

library(ggplot2)
g <- ggplot(data = oswego_data, aes(OnsetTD))
g + geom_histogram(fill = 'Red', color = 'Black', alpha = .8, bins =  30) + xlab("Onset Time") + ylab('# of People') + ggtitle("Onset Time of Infection")

## Warning: Removed 29 rows containing non-finite values (stat_bin).

oswego_data$IncubPeriod <- (oswego_data$OnsetTD - oswego_data$MealTD)

g2 <- ggplot(data = oswego_data, aes(IncubPeriod))
g2 + geom_histogram(fill = 'Red', color = 'Black', alpha = .6, binwidth = 1) + xlab('Incubation Period (Hours)') + ylab('# of People') + ggtitle("Incubation Period")

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

## Warning: Removed 53 rows containing non-finite values (stat_bin).

mean(oswego_data$IncubPeriod, na.rm = TRUE)

## Time difference of 4.295455 hours

median(oswego_data$IncubPeriod, na.rm = TRUE)

## Time difference of 4 hours

range(oswego_data$IncubPeriod, na.rm = TRUE)

## Time differences in hours
## [1] 3 7

Problem Exercises 4

Maher Yassin

October 15, 2018