1.Read in the Evans data set. Type in the R expressions below. Do not copy and paste. After each expression briefly describe what the step accomplished.
edat <- read.table("~/R_Workspace/data-master/evans.txt", header = TRUE, sep = "")
str(edat)
## 'data.frame': 609 obs. of 12 variables:
## $ id : int 21 31 51 71 74 91 111 131 141 191 ...
## $ chd: int 0 0 1 0 0 0 1 0 0 0 ...
## $ cat: int 0 0 1 1 0 0 0 0 0 0 ...
## $ age: int 56 43 56 64 49 46 52 63 42 55 ...
## $ chl: int 270 159 201 179 243 252 179 217 176 250 ...
## $ smk: int 0 1 1 1 1 1 1 0 1 0 ...
## $ ecg: int 0 0 1 0 0 0 1 0 0 1 ...
## $ dbp: int 80 74 112 100 82 88 80 92 76 114 ...
## $ sbp: int 138 128 164 200 145 142 128 135 114 182 ...
## $ hpt: int 0 0 1 1 0 0 0 0 0 1 ...
## $ ch : int 0 0 1 1 0 0 0 0 0 0 ...
## $ cc : int 0 0 201 179 0 0 0 0 0 0 ...
xtabs(~chd, data = edat)
## chd
## 0 1
## 538 71
edat$chd2 <- factor(edat$chd, levels = 0:1,
labels = c("No", "Yes"))
xtabs(~chd2, data = edat) # test
## chd2
## No Yes
## 538 71
xtabs(~cat, data = edat)
## cat
## 0 1
## 487 122
edat$cat2 <- factor(edat$cat, levels = 0:1,
labels = c("Normal", "High"))
xtabs(~cat2, data = edat) # test
## cat2
## Normal High
## 487 122
xtabs(~smk, data = edat)
## smk
## 0 1
## 222 387
edat$smk2 <- factor(edat$smk, levels = 0:1,
labels = c("Never", "Ever"))
xtabs(~smk2, data = edat) # test
## smk2
## Never Ever
## 222 387
xtabs(~ecg, data = edat)
## ecg
## 0 1
## 443 166
edat$ecg2 <- factor(edat$ecg, levels = 0:1,
labels = c("Normal", "Abnormal"))
xtabs(~ecg2, data = edat) # test
## ecg2
## Normal Abnormal
## 443 166
xtabs(~hpt, data = edat)
## hpt
## 0 1
## 354 255
edat$hpt2 <- factor(edat$hpt, levels = 0:1,
labels = c("No", "Yes"))
xtabs(~hpt2, data = edat) # test
## hpt2
## No Yes
## 354 255
Explanation:
#### (2)
quantile(edat$age)
## 0% 25% 50% 75% 100%
## 40 46 52 60 76
edat$age4 <- cut(edat$age, quantile(edat$age), right = FALSE,
include.lowest = TRUE)
xtabs(~age4, data = edat) # test
## age4
## [40,46) [46,52) [52,60) [60,76]
## 134 158 158 159
Explanation:
#### (3)
hptnew <- rep(NA, nrow(edat))
normal <- edat$sbp < 120 & edat$dbp < 80
hptnew[normal] <- 1
prehyp <- (edat$sbp >= 120 & edat$sbp < 140) |
(edat$dbp >= 80 & edat$dbp < 90)
hptnew[prehyp] <- 2
stage1 <- (edat$sbp >= 140 & edat$sbp < 160) |
(edat$dbp >= 90 & edat$dbp < 100)
hptnew[stage1] <- 3
stage2 <- edat$sbp >= 160 | edat$dbp >= 100
hptnew[stage2] <- 4
edat$hpt4 <- factor(hptnew, levels = 1:4,
labels=c("Normal", "PreHTN", "HTN.Stage1",
"HTN.Stage2"))
xtabs(~hpt4, data = edat) # test
## hpt4
## Normal PreHTN HTN.Stage1 HTN.Stage2
## 56 165 177 211
Explanation:
#### (4)
xtabs(~hpt2 + hpt4, data = edat)
## hpt4
## hpt2 Normal PreHTN HTN.Stage1 HTN.Stage2
## No 56 165 133 0
## Yes 0 0 44 211
Explanation:
2. Review the California 2004 surveillance data on human West Nile virus cases available at ~/data/wnv/wnv2004raw.txt. Read in the data setting as.is = TRUE, taking into account missing values (use na.strings option). Convert the calendar dates into the international standard format. Using the write.table function export the data as an ASCII text file.
wnv2004data <- read.table("~/R_Workspace/data-master/wnv2004raw.txt", header = T, sep = ",", as.is = T, na.strings = "N/A")
head(wnv2004data)
## ID County Age Sex Syndrome Date.onset Date.test Death
## 1 1 San Bernardino 40 F WNF 05/19/2004 06/02/2004 No
## 2 2 San Bernardino 64 F WNF 05/22/2004 06/16/2004 No
## 3 3 San Bernardino 19 M WNF 05/22/2004 06/16/2004 No
## 4 4 San Bernardino 12 M WNF 05/16/2004 06/16/2004 No
## 5 5 San Bernardino 12 M WNF 05/14/2004 06/16/2004 No
## 6 6 San Bernardino 17 M WNF 06/07/2004 06/17/2004 No
summary(wnv2004data)
## ID County Age Sex
## Min. : 1 Length:781 Length:781 Length:781
## 1st Qu.:196 Class :character Class :character Class :character
## Median :391 Mode :character Mode :character Mode :character
## Mean :391
## 3rd Qu.:586
## Max. :781
## Syndrome Date.onset Date.test
## Length:781 Length:781 Length:781
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Death
## Length:781
## Class :character
## Mode :character
##
##
##
onset_time <- as.Date(wnv2004data$Date.onset, format = "%m/%d/%Y")
test_time <- as.Date(wnv2004data$Date.test, format = "%m/%d/%Y")
class(onset_time)
## [1] "Date"
class(test_time)
## [1] "Date"
onset_julian <- julian(onset_time)
test_julian <- julian(test_time)
onset_international <- as.Date(onset_julian, origin = as.Date('1970-01-01'))
test_international <- as.Date(test_julian, origin = as.Date('1970-01-01'))
head(onset_international)
## [1] "2004-05-19" "2004-05-22" "2004-05-22" "2004-05-16" "2004-05-14"
## [6] "2004-06-07"
head(test_international)
## [1] "2004-06-02" "2004-06-16" "2004-06-16" "2004-06-16" "2004-06-16"
## [6] "2004-06-17"
wnv2004data$Date.onset <- onset_international
wnv2004data$Date.test <- test_international
head(wnv2004data)
## ID County Age Sex Syndrome Date.onset Date.test Death
## 1 1 San Bernardino 40 F WNF 2004-05-19 2004-06-02 No
## 2 2 San Bernardino 64 F WNF 2004-05-22 2004-06-16 No
## 3 3 San Bernardino 19 M WNF 2004-05-22 2004-06-16 No
## 4 4 San Bernardino 12 M WNF 2004-05-16 2004-06-16 No
## 5 5 San Bernardino 12 M WNF 2004-05-14 2004-06-16 No
## 6 6 San Bernardino 17 M WNF 2004-06-07 2004-06-17 No
write.table(wnv2004data, "wnv2004 dates")
3. On April 19th 1940 the local health officer in the village of Lycoming, Oswego County, New York reported the occurence of an outbreak of acute gastrointestinal illness to the District Health Officer in Syracuse. Dr. AM Rubin, epidemilogist-in-training was assigned to conduct an investigation???
Using RStudio plot the cases by time of onset of illness. What does the graph tell you? (Process the text data and then use the hist) function.
It hows us that the onset of infection is centered on April 19 at 12:00 AM.
Are there any cases for which the times of onset are inconsistent with the general experience? How might they be explained?
Yes, there are two cases that are inconsistent with the rest of the curve. One is several hours before the when the picnic took place while the other is several hours afterwards. Both cases could be unrelated to the outbreak, particular the earlier one.
How could the data be sorted by illness status and illness onset times?
You could either sort it by Yes or No illness. Then the ones that are ill could be sorted by onset time.
Where poissible, calculate incubation periods and illustarte their distribution with an appropriate graph. Determine the mean, median, and range of the incubation period.
oswego_data <- read.table("~/R_Workspace/data-master/oswego.txt", header = T, sep = "", as.is = T, na.strings = "N/A")
summary(oswego_data)
## id age sex meal.time
## Min. : 1.0 Min. : 3.00 Length:75 Length:75
## 1st Qu.:19.5 1st Qu.:16.50 Class :character Class :character
## Median :38.0 Median :36.00 Mode :character Mode :character
## Mean :38.0 Mean :36.81
## 3rd Qu.:56.5 3rd Qu.:57.50
## Max. :75.0 Max. :77.00
## ill onset.date onset.time
## Length:75 Length:75 Length:75
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## baked.ham spinach mashed.potato
## Length:75 Length:75 Length:75
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## cabbage.salad jello rolls
## Length:75 Length:75 Length:75
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## brown.bread milk coffee
## Length:75 Length:75 Length:75
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## water cakes van.ice.cream
## Length:75 Length:75 Length:75
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## choc.ice.cream fruit.salad
## Length:75 Length:75
## Class :character Class :character
## Mode :character Mode :character
##
##
##
class(oswego_data$onset.date)
## [1] "character"
oswego_data$onset.date <- paste(oswego_data$onset.date, "1940", sep = "/")
head(oswego_data)
## id age sex meal.time ill onset.date onset.time baked.ham spinach
## 1 2 52 F 8:00 PM Y 4/19/1940 12:30 AM Y Y
## 2 3 65 M 6:30 PM Y 4/19/1940 12:30 AM Y Y
## 3 4 59 F 6:30 PM Y 4/19/1940 12:30 AM Y Y
## 4 6 63 F 7:30 PM Y 4/18/1940 10:30 PM Y Y
## 5 7 70 M 7:30 PM Y 4/18/1940 10:30 PM Y Y
## 6 8 40 F 7:30 PM Y 4/19/1940 2:00 AM N N
## mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 1 Y N N Y N N Y N
## 2 Y Y N N N N Y N
## 3 N N N N N N Y N
## 4 N Y Y N N N N Y
## 5 Y N Y Y Y N Y Y
## 6 N N N N N N N N
## cakes van.ice.cream choc.ice.cream fruit.salad
## 1 N Y N N
## 2 N Y Y N
## 3 Y Y Y N
## 4 N Y N N
## 5 N Y N N
## 6 N Y Y N
oswego_data.MealTD <- paste("4/18/1940", oswego_data$meal.time, sep = "")
oswego_data.OnsetTD <- paste(oswego_data$onset.date, oswego_data$onset.time, sep = "")
oswego_data$MealTD <- as.POSIXct(strptime(oswego_data.MealTD, format = "%m/%d/%Y %I:%M %p"))
oswego_data$OnsetTD <- as.POSIXct(strptime(oswego_data.OnsetTD, format = "%m/%d/%Y %I:%M %p"))
library(ggplot2)
g <- ggplot(data = oswego_data, aes(OnsetTD))
g + geom_histogram(fill = 'Red', color = 'Black', alpha = .8, bins = 30) + xlab("Onset Time") + ylab('# of People') + ggtitle("Onset Time of Infection")
## Warning: Removed 29 rows containing non-finite values (stat_bin).
oswego_data$IncubPeriod <- (oswego_data$OnsetTD - oswego_data$MealTD)
g2 <- ggplot(data = oswego_data, aes(IncubPeriod))
g2 + geom_histogram(fill = 'Red', color = 'Black', alpha = .6, binwidth = 1) + xlab('Incubation Period (Hours)') + ylab('# of People') + ggtitle("Incubation Period")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 53 rows containing non-finite values (stat_bin).
mean(oswego_data$IncubPeriod, na.rm = TRUE)
## Time difference of 4.295455 hours
median(oswego_data$IncubPeriod, na.rm = TRUE)
## Time difference of 4 hours
range(oswego_data$IncubPeriod, na.rm = TRUE)
## Time differences in hours
## [1] 3 7