#Exercise 4.1 (Practice on data frames) Read in the Evans data set. Type in the R expressions below. Do not copy and paste. After each expression briefly describe what the step accomplished.
# Location of the Evans data set in the class GitHub repository
evans.url <- "https://raw.githubusercontent.com/taragonmd/data/master/evans.csv"
# Read the table with a header row; sep = "" splits fields on white space
edat <- read.table(file = evans.url, header = TRUE, sep = "")
# Inspect the structure: number of rows, column names and column types
str(edat)
## 'data.frame': 609 obs. of 12 variables:
## $ id : int 21 31 51 71 74 91 111 131 141 191 ...
## $ chd: int 0 0 1 0 0 0 1 0 0 0 ...
## $ cat: int 0 0 1 1 0 0 0 0 0 0 ...
## $ age: int 56 43 56 64 49 46 52 63 42 55 ...
## $ chl: int 270 159 201 179 243 252 179 217 176 250 ...
## $ smk: int 0 1 1 1 1 1 1 0 1 0 ...
## $ ecg: int 0 0 1 0 0 0 1 0 0 1 ...
## $ dbp: int 80 74 112 100 82 88 80 92 76 114 ...
## $ sbp: int 138 128 164 200 145 142 128 135 114 182 ...
## $ hpt: int 0 0 1 1 0 0 0 0 0 1 ...
## $ ch : int 0 0 1 1 0 0 0 0 0 0 ...
## $ cc : int 0 0 201 179 0 0 0 0 0 0 ...
# Each 0/1 indicator below is tabulated, recoded into a labelled factor
# stored in a new column with a "2" suffix, and tabulated again to confirm
# the labels line up with the original counts.

# Coronary heart disease (chd): counts by 0/1 code
xtabs(formula = ~chd, data = edat)
## chd
## 0 1
## 538 71
# 0 -> "No", 1 -> "Yes"
edat[["chd2"]] <- factor(
  edat[["chd"]],
  levels = 0:1,
  labels = c("No", "Yes")
)
xtabs(formula = ~chd2, data = edat)
## chd2
## No Yes
## 538 71

# Catecholamine level (cat): counts by 0/1 code
xtabs(formula = ~cat, data = edat)
## cat
## 0 1
## 487 122
# 0 -> "Normal", 1 -> "High"
edat[["cat2"]] <- factor(
  edat[["cat"]],
  levels = 0:1,
  labels = c("Normal", "High")
)
xtabs(formula = ~cat2, data = edat)
## cat2
## Normal High
## 487 122

# Smoking status (smk): counts by 0/1 code
xtabs(formula = ~smk, data = edat)
## smk
## 0 1
## 222 387
# 0 -> "Never", 1 -> "Ever"
edat[["smk2"]] <- factor(
  edat[["smk"]],
  levels = 0:1,
  labels = c("Never", "Ever")
)
xtabs(formula = ~smk2, data = edat)
## smk2
## Never Ever
## 222 387

# Electrocardiogram result (ecg): counts by 0/1 code
xtabs(formula = ~ecg, data = edat)
## ecg
## 0 1
## 443 166
# 0 -> "Normal", 1 -> "Abnormal"
edat[["ecg2"]] <- factor(
  edat[["ecg"]],
  levels = 0:1,
  labels = c("Normal", "Abnormal")
)
xtabs(formula = ~ecg2, data = edat)
## ecg2
## Normal Abnormal
## 443 166

# High blood pressure (hpt): counts by 0/1 code
xtabs(formula = ~hpt, data = edat)
## hpt
## 0 1
## 354 255
# 0 -> "No", 1 -> "Yes"
edat[["hpt2"]] <- factor(
  edat[["hpt"]],
  levels = 0:1,
  labels = c("No", "Yes")
)
xtabs(formula = ~hpt2, data = edat)
## hpt2
## No Yes
## 354 255
# Minimum, quartiles and maximum of age; these become the cut points below
age.breaks <- quantile(edat$age)
age.breaks
## 0% 25% 50% 75% 100%
## 40 46 52 60 76
# Bin age at the quartile cut points. right = FALSE makes the intervals
# closed on the left, [a, b); include.lowest = TRUE closes the last interval
# on the right so the maximum age falls in a bin instead of becoming NA.
edat$age4 <- cut(
  edat$age,
  breaks = age.breaks,
  right = FALSE,
  include.lowest = TRUE
)
# Number of participants in each age interval
xtabs(~age4, data = edat)
## age4
## [40,46) [46,52) [52,60) [60,76]
## 134 158 158 159
# Logical masks for each blood-pressure category, built from the sbp and dbp
# columns (presumably systolic and diastolic pressure). A row can satisfy
# more than one mask, e.g. a pre-hypertensive sbp with a stage-1 dbp.
normal <- with(edat, sbp < 120 & dbp < 80)
prehyp <- with(edat, (sbp >= 120 & sbp < 140) | (dbp >= 80 & dbp < 90))
stage1 <- with(edat, (sbp >= 140 & sbp < 160) | (dbp >= 90 & dbp < 100))
stage2 <- with(edat, sbp >= 160 | dbp >= 100)

# Start from an all-missing code vector with one entry per row
hptnew <- rep(NA, times = nrow(edat))
# Fill in codes from mildest to most severe. The order matters: a later
# assignment overwrites an earlier one, so each row ends up with the most
# severe category it qualifies for.
hptnew[normal] <- 1
hptnew[prehyp] <- 2
hptnew[stage1] <- 3
hptnew[stage2] <- 4

# Store the codes as a four-level labelled factor in a new column
edat$hpt4 <- factor(
  hptnew,
  levels = 1:4,
  labels = c("Normal", "PreHTN", "HTN.Stage1", "HTN.Stage2")
)
# Counts for the four-level classification
xtabs(~hpt4, data = edat)
## hpt4
## Normal PreHTN HTN.Stage1 HTN.Stage2
## 56 165 177 211
# Cross-tabulate the two-level definition (hpt2) against the four-level one
xtabs(~hpt2 + hpt4, data = edat)
## hpt4
## hpt2 Normal PreHTN HTN.Stage1 HTN.Stage2
## No 56 165 133 0
## Yes 0 0 44 211
#Exercise 4.2 (Working with disease surveillance data) Review the California 2004 surveillance data on human West Nile virus cases available at ~/data/wnv/wnv2004raw.csv. Read in the data setting as.is = TRUE, taking into account missing values (use na.strings option). Convert the calendar dates into the international standard format. Using the write.table function export the data as an ASCII text file.
# Location of the raw 2004 California West Nile virus surveillance file
wnv.url <- "https://raw.githubusercontent.com/taragonmd/data/master/wnv/wnv2004raw.csv"
# Read the comma-separated file. as.is = TRUE keeps text columns as character
# instead of converting them to factors; "." and "Unknown" are read as NA.
wdat <- read.table(
  file = wnv.url,
  header = TRUE,
  sep = ",",
  as.is = TRUE,
  na.strings = c(".", "Unknown")
)
# Inspect the structure of the imported data
str(wdat)
## 'data.frame': 779 obs. of 8 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ county : chr "San Bernardino" "San Bernardino" "San Bernardino" "San Bernardino" ...
## $ age : int 40 64 19 12 12 17 61 74 71 26 ...
## $ sex : chr "F" "F" "M" "M" ...
## $ syndrome : chr "WNF" "WNF" "WNF" "WNF" ...
## $ date.onset : chr "05/19/2004" "05/22/2004" "05/22/2004" "05/16/2004" ...
## $ date.tested: chr "06/02/2004" "06/16/2004" "06/16/2004" "06/16/2004" ...
## $ death : chr "No" "No" "No" "No" ...
# Parse the month/day/year strings into Date objects, which print in the
# international standard form (YYYY-MM-DD). The original columns are kept
# and the converted dates are stored in new columns with a "2" suffix.
us.date.format <- "%m/%d/%Y"
wdat[["date.onset2"]] <- as.Date(wdat[["date.onset"]], format = us.date.format)
wdat[["date.tested2"]] <- as.Date(wdat[["date.tested"]], format = us.date.format)
# Export as a comma-delimited ASCII text file without row names
wnv.out <- "~/wnvdat.txt"
write.table(wdat, file = wnv.out, sep = ",", row.names = FALSE)
# Read the exported file back in to check what was written
wdat.test <- read.csv(file = wnv.out)
head(wdat.test)
## id county age sex syndrome date.onset date.tested death
## 1 1 San Bernardino 40 F WNF 05/19/2004 06/02/2004 No
## 2 2 San Bernardino 64 F WNF 05/22/2004 06/16/2004 No
## 3 3 San Bernardino 19 M WNF 05/22/2004 06/16/2004 No
## 4 4 San Bernardino 12 M WNF 05/16/2004 06/16/2004 No
## 5 5 San Bernardino 12 M WNF 05/14/2004 06/16/2004 No
## 6 6 San Bernardino 17 M WNF 06/07/2004 06/17/2004 No
## date.onset2 date.tested2
## 1 2004-05-19 2004-06-02
## 2 2004-05-22 2004-06-16
## 3 2004-05-22 2004-06-16
## 4 2004-05-16 2004-06-16
## 5 2004-05-14 2004-06-16
## 6 2004-06-07 2004-06-17
# Column types after the round trip through the text file
str(wdat.test)
## 'data.frame': 779 obs. of 10 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ county : Factor w/ 23 levels "Butte","Fresno",..: 14 14 14 14 14 14 14 14 8 12 ...
## $ age : int 40 64 19 12 12 17 61 74 71 26 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 2 2 2 2 2 1 2 2 ...
## $ syndrome : Factor w/ 2 levels "WNF","WNND": 1 1 1 1 1 1 2 2 1 2 ...
## $ date.onset : Factor w/ 130 levels "02/02/2005","05/14/2004",..: 4 5 5 3 2 6 8 11 8 10 ...
## $ date.tested : Factor w/ 104 levels "01/21/2005","02/04/2005",..: 4 5 5 5 5 6 7 8 9 9 ...
## $ death : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ date.onset2 : Factor w/ 130 levels "2004-05-14","2004-05-16",..: 3 4 4 2 1 5 7 10 7 9 ...
## $ date.tested2: Factor w/ 104 levels "2004-06-02","2004-06-16",..: 1 2 2 2 2 3 4 5 6 6 ...
#Exercise 4.3 (Outbreak investigation)
On April 19, 1940, the local health officer in the village of Lycoming, Oswego County, New York, reported the occurrence of an outbreak of acute gastrointestinal illness to the District Health Officer in Syracuse. Dr. A. M. Rubin, epidemiologist-in-training, was assigned to conduct an investigation. See Appendix C for data dictionary.
When Dr. Rubin arrived in the field, he learned from the health officer that all persons known to be ill had attended a church supper held on the previous evening, April 18. Family members who did not attend the church supper did not become ill. Accordingly, Dr. Rubin focused the investigation on the supper. He completed interviews with 75 of the 80 persons known to have attended, collecting information about the occurrence and time of onset of symptoms, and foods consumed. Of the 75 persons interviewed, 46 persons reported gastrointestinal illness.
The onset of illness in all cases was acute, characterized chiefly by nausea, vomiting, diarrhea, and abdominal pain. None of the ill persons reported having an elevated temperature; all recovered within 24 to 30 hours. Approximately 20% of the ill persons visited physicians. No fecal specimens were obtained for bacteriologic examination. The investigators suspected that this was a vehicle-borne outbreak, with food as the vehicle. Dr. Rubin put his data into a line listing. See the raw data set at ~/data/oswego/oswego.txt.
The supper was held in the basement of the village church. Foods were contributed by numerous members of the congregation. The supper began at 6:00 p.m. and continued until 11:00 p.m. Food was spread out on a table and consumed over a period of several hours. Data regarding onset of illness and food eaten or water drunk by each of the 75 persons interviewed are provided in the line listing. The approximate time of eating supper was collected for only about half the persons who had gastrointestinal illness.
Using RStudio plot the cases by time of onset of illness (include appropriate labels and title).
What does this graph tell you? Are there any cases for which the times of onset are inconsistent with the general experience? How might they be explained? How could the data be sorted by illness status and illness onset times? Where possible, calculate incubation periods and illustrate their distribution with an appropriate graph. Use the truehist function in the MASS package. Determine the mean, median, and range of the incubation period.
Hint 1: Plotting an epidemic curve with this data has special challenges because we have dates and times to process. To do this in R, we will create date objects that contain both the date and time for each primary event of interest: meal time, and onset time of illness. From this we can plot the distribution of onset times (epidemic curve). An epidemic curve is the distribution of illness onset times and can be displayed with a histogram. First, carefully study the Oswego data set at ~/data/oswego/oswego.txt. We need to do some data preparation in order to work with dates and times. Our initial goal is to get the date/time data to a form that can be passed to R’s strptime function for conversion into a date-time R object. To construct the following curve, study and implement the R code that follows:
# Location of the Oswego outbreak line listing
oswego.url <- "https://raw.githubusercontent.com/taragonmd/data/master/oswego.txt"
# Read the white-space-delimited file with a header row; "." marks missing values
odat <- read.table(
  file = oswego.url,
  header = TRUE,
  sep = "",
  na.strings = "."
)
# Inspect the structure of the imported line listing
str(odat)
## 'data.frame': 75 obs. of 21 variables:
## $ id : int 2 3 4 6 7 8 9 10 14 16 ...
## $ age : int 52 65 59 63 70 40 15 33 10 32 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 1 1 1 2 1 ...
## $ meal.time : Factor w/ 6 levels "10:00 PM","11:00 AM",..: 6 3 3 5 5 5 1 4 5 NA ...
## $ ill : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ onset.date : Factor w/ 2 levels "4/18","4/19": 2 2 2 1 1 2 2 1 2 2 ...
## $ onset.time : Factor w/ 17 levels "1:00 AM","10:00 PM",..: 9 9 9 5 5 10 1 6 10 4 ...
## $ baked.ham : Factor w/ 2 levels "N","Y": 2 2 2 2 2 1 1 2 1 2 ...
## $ spinach : Factor w/ 2 levels "N","Y": 2 2 2 2 2 1 1 2 1 2 ...
## $ mashed.potato : Factor w/ 2 levels "N","Y": 2 2 1 1 2 1 1 2 1 1 ...
## $ cabbage.salad : Factor w/ 2 levels "N","Y": 1 2 1 2 1 1 1 1 1 1 ...
## $ jello : Factor w/ 2 levels "N","Y": 1 1 1 2 2 1 1 1 1 1 ...
## $ rolls : Factor w/ 2 levels "N","Y": 2 1 1 1 2 1 1 2 1 2 ...
## $ brown.bread : Factor w/ 2 levels "N","Y": 1 1 1 1 2 1 1 2 1 1 ...
## $ milk : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ coffee : Factor w/ 2 levels "N","Y": 2 2 2 1 2 1 1 1 1 2 ...
## $ water : Factor w/ 2 levels "N","Y": 1 1 1 2 2 1 1 2 1 1 ...
## $ cakes : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 2 1 1 2 ...
## $ van.ice.cream : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 1 2 2 2 ...
## $ choc.ice.cream: Factor w/ 2 levels "N","Y": 1 2 2 1 1 2 2 2 2 2 ...
## $ fruit.salad : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
# Preview the first rows of the line listing to see how the meal and onset
# dates and times are stored before building date-time values from them
head(odat)
## id age sex meal.time ill onset.date onset.time baked.ham spinach
## 1 2 52 F 8:00 PM Y 4/19 12:30 AM Y Y
## 2 3 65 M 6:30 PM Y 4/19 12:30 AM Y Y
## 3 4 59 F 6:30 PM Y 4/19 12:30 AM Y Y
## 4 6 63 F 7:30 PM Y 4/18 10:30 PM Y Y
## 5 7 70 M 7:30 PM Y 4/18 10:30 PM Y Y
## 6 8 40 F 7:30 PM Y 4/19 2:00 AM N N
## mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 1 Y N N Y N N Y N
## 2 Y Y N N N N Y N
## 3 N N N N N N Y N
## 4 N Y Y N N N N Y
## 5 Y N Y Y Y N Y Y
## 6 N N N N N N N N
## cakes van.ice.cream choc.ice.cream fruit.salad
## 1 N Y N N
## 2 N Y Y N
## 3 Y Y Y N
## 4 N Y N N
## 5 N Y N N
## 6 N Y Y N
#### Build meal date-time strings. The data set stores only the time of the
#### meal; the exercise text places the supper on April 18, so that date is
#### prepended to every meal time.
mdt <- paste("4/18/1940", odat$meal.time)
#### Convert to a standard date-time object. Rows with a missing meal time
#### do not match the format and become NA.
meal.dt <- strptime(mdt, "%m/%d/%Y %I:%M %p")
#### Build onset date-time strings. onset.date holds only month/day, so the
#### year is appended before the time is pasted on.
odt <- paste(paste0(odat$onset.date, "/1940"), odat$onset.time)
#### Convert to a standard date-time object (NA where onset is missing)
onset.dt <- strptime(odt, "%m/%d/%Y %I:%M %p")
#### Epidemic curve: distribution of illness onset times. The exercise asks
#### for a title and axis labels; without them the x-axis is labelled with
#### the variable name.
hist(onset.dt, breaks = 30, freq = TRUE,
     main = "Oswego outbreak, April 18-19, 1940: cases by time of onset",
     xlab = "Date and time of illness onset",
     ylab = "Number of cases")
Hint 2: Now that we have our data frame in R, we can identify those subjects that correspond to minimum and maximum onset times. We will implement R code that can be interpreted as “which positions in vector Y correspond to the minimum values in Y?” We then use these position numbers to index the corresponding rows in the data frame.
#### Find the earliest onset time (ignoring missing values), then the
#### position of every case whose onset time equals it
earliest.onset <- min(onset.dt, na.rm = TRUE)
min.obs.pos <- which(onset.dt == earliest.onset)
min.obs.pos
## [1] 33
#### Same for the latest onset time
latest.onset <- max(onset.dt, na.rm = TRUE)
max.obs.pos <- which(onset.dt == latest.onset)
max.obs.pos
## [1] 10
#### Use those positions as row indices to display the outlying cases
odat[min.obs.pos, ]
## id age sex meal.time ill onset.date onset.time baked.ham spinach
## 33 52 8 M 11:00 AM Y 4/18 3:00 PM N N
## mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 33 N N N N N N N N
## cakes van.ice.cream choc.ice.cream fruit.salad
## 33 N Y Y N
odat[max.obs.pos, ]
## id age sex meal.time ill onset.date onset.time baked.ham spinach
## 10 16 32 F <NA> Y 4/19 10:30 AM Y Y
## mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 10 N N N Y N N Y N
## cakes van.ice.cream choc.ice.cream fruit.salad
## 10 Y Y Y N
Hint 3: We can sort the data frame based on the values of one or more fields. Suppose we want to sort on illness status and illness onset times. We will use the onset.dt vector we created earlier; however, we will need to convert it to “continuous time” in seconds to sort this vector. Study and implement the R code below.
# Convert the onset date-times to POSIXct ("continuous time" in seconds) so
# they can be used as a sort key
onset.ct <- as.POSIXct(onset.dt)
# Row order: illness status first, then onset time within each status
sort.idx <- order(odat$ill, onset.ct)
# Reorder the line listing and display it
odat2 <- odat[sort.idx, ]
odat2
## id age sex meal.time ill onset.date onset.time baked.ham spinach
## 47 1 11 M <NA> N <NA> <NA> N N
## 48 5 13 F <NA> N <NA> <NA> N N
## 49 11 65 M <NA> N <NA> <NA> Y Y
## 50 12 38 F <NA> N <NA> <NA> Y Y
## 51 13 62 F <NA> N <NA> <NA> Y Y
## 52 15 25 M <NA> N <NA> <NA> Y Y
## 53 19 11 M <NA> N <NA> <NA> Y Y
## 54 23 64 M <NA> N <NA> <NA> N N
## 55 25 65 F <NA> N <NA> <NA> Y Y
## 56 28 62 M <NA> N <NA> <NA> Y Y
## 57 30 17 M 10:00 PM N <NA> <NA> N N
## 58 34 40 M <NA> N <NA> <NA> Y Y
## 59 35 35 F <NA> N <NA> <NA> Y Y
## 60 37 36 M <NA> N <NA> <NA> Y N
## 61 41 54 F <NA> N <NA> <NA> Y Y
## 62 45 20 M 10:00 PM N <NA> <NA> N N
## 63 46 17 M <NA> N <NA> <NA> Y Y
## 64 50 9 F <NA> N <NA> <NA> N N
## 65 51 50 M <NA> N <NA> <NA> Y Y
## 66 53 35 F <NA> N <NA> <NA> N N
## 67 56 11 F <NA> N <NA> <NA> N N
## 68 61 37 M <NA> N <NA> <NA> N N
## 69 62 24 F <NA> N <NA> <NA> Y Y
## 70 63 69 F <NA> N <NA> <NA> N Y
## 71 64 7 M <NA> N <NA> <NA> Y Y
## 72 67 11 F 7:30 PM N <NA> <NA> Y Y
## 73 68 17 M 7:30 PM N <NA> <NA> Y Y
## 74 69 36 F <NA> N <NA> <NA> N N
## 75 73 14 F 10:00 PM N <NA> <NA> N N
## 33 52 8 M 11:00 AM Y 4/18 3:00 PM N N
## 20 31 35 M <NA> Y 4/18 9:00 PM Y Y
## 23 36 35 F <NA> Y 4/18 9:15 PM Y Y
## 26 40 68 M <NA> Y 4/18 9:30 PM Y N
## 29 44 58 M <NA> Y 4/18 9:30 PM Y Y
## 16 24 3 M <NA> Y 4/18 9:45 PM N Y
## 17 26 59 F <NA> Y 4/18 9:45 PM N Y
## 13 20 33 F <NA> Y 4/18 10:00 PM Y Y
## 12 18 36 M <NA> Y 4/18 10:15 PM Y Y
## 4 6 63 F 7:30 PM Y 4/18 10:30 PM Y Y
## 5 7 70 M 7:30 PM Y 4/18 10:30 PM Y Y
## 32 49 52 F <NA> Y 4/18 10:30 PM Y Y
## 36 57 74 M <NA> Y 4/18 10:30 PM Y Y
## 8 10 33 F 7:00 PM Y 4/18 11:00 PM Y Y
## 15 22 7 M <NA> Y 4/18 11:00 PM Y Y
## 19 29 37 F <NA> Y 4/18 11:00 PM Y Y
## 35 55 25 M <NA> Y 4/18 11:00 PM Y N
## 46 75 45 F <NA> Y 4/18 11:00 PM Y Y
## 24 38 57 F <NA> Y 4/18 11:30 PM Y Y
## 39 60 53 F 7:30 PM Y 4/18 11:30 PM Y Y
## 34 54 48 F <NA> Y 4/19 12:00 AM Y Y
## 44 72 18 F 7:30 PM Y 4/19 12:00 AM Y Y
## 1 2 52 F 8:00 PM Y 4/19 12:30 AM Y Y
## 2 3 65 M 6:30 PM Y 4/19 12:30 AM Y Y
## 3 4 59 F 6:30 PM Y 4/19 12:30 AM Y Y
## 11 17 62 F <NA> Y 4/19 12:30 AM N N
## 30 47 62 F <NA> Y 4/19 12:30 AM Y Y
## 41 66 8 F <NA> Y 4/19 12:30 AM Y N
## 42 70 21 F <NA> Y 4/19 12:30 AM Y N
## 7 9 15 F 10:00 PM Y 4/19 1:00 AM N N
## 14 21 13 F 10:00 PM Y 4/19 1:00 AM N N
## 18 27 15 F 10:00 PM Y 4/19 1:00 AM N N
## 21 32 15 M 10:00 PM Y 4/19 1:00 AM N N
## 22 33 50 F 10:00 PM Y 4/19 1:00 AM N N
## 25 39 16 F 10:00 PM Y 4/19 1:00 AM N N
## 31 48 20 F 7:00 PM Y 4/19 1:00 AM N N
## 37 58 12 F 10:00 PM Y 4/19 1:00 AM N N
## 40 65 17 F 10:00 PM Y 4/19 1:00 AM N N
## 43 71 60 M 7:30 PM Y 4/19 1:00 AM N N
## 6 8 40 F 7:30 PM Y 4/19 2:00 AM N N
## 9 14 10 M 7:30 PM Y 4/19 2:00 AM N N
## 28 43 72 F <NA> Y 4/19 2:00 AM Y Y
## 45 74 52 M <NA> Y 4/19 2:15 AM Y N
## 27 42 77 M <NA> Y 4/19 2:30 AM N N
## 38 59 44 F 7:30 PM Y 4/19 2:30 AM Y Y
## 10 16 32 F <NA> Y 4/19 10:30 AM Y Y
## mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
## 47 N N N N N N N N
## 48 N N N N N N N N
## 49 Y N Y Y N N N N
## 50 Y N N Y N N Y N
## 51 N Y Y Y Y N N Y
## 52 Y Y Y Y Y Y Y Y
## 53 <NA> Y N Y N N N Y
## 54 N N N N N N N N
## 55 Y Y Y N Y N Y N
## 56 N Y N Y Y N Y Y
## 57 N N N N N N N N
## 58 N N N Y Y N Y Y
## 59 Y N N Y Y N Y Y
## 60 Y Y N Y Y N Y N
## 61 Y N N Y N N Y N
## 62 N N N N N N N N
## 63 Y N N Y N N N Y
## 64 N N N N N N N N
## 65 Y Y Y Y Y Y Y Y
## 66 N N N N N N N N
## 67 N N N N N N N N
## 68 N N N N N N N N
## 69 Y N N Y N N Y N
## 70 Y N Y N Y N N Y
## 71 Y Y Y Y N N N Y
## 72 Y Y N Y N N Y Y
## 73 Y Y N Y N N Y N
## 74 N N N N N N N N
## 75 N N N N N N N N
## 33 N N N N N N N N
## 20 Y N Y Y Y N Y N
## 23 Y Y N Y Y N Y N
## 26 Y Y N N Y N Y N
## 29 Y N N N Y Y Y N
## 16 Y N N Y N N N Y
## 17 Y Y N Y Y N N Y
## 13 Y Y Y Y N N Y Y
## 12 N Y N Y Y N N N
## 4 N Y Y N N N N Y
## 5 Y N Y Y Y N Y Y
## 32 Y Y N Y N N Y N
## 36 Y Y Y Y Y N Y N
## 8 Y N N Y Y N N Y
## 15 Y Y Y Y Y N N Y
## 19 Y N Y Y Y N Y N
## 35 Y N N Y Y N N Y
## 46 Y Y Y Y Y N Y N
## 24 N Y Y Y Y N Y N
## 39 Y Y Y N Y N Y Y
## 34 Y Y Y Y Y Y Y N
## 44 Y Y Y N N N N Y
## 1 Y N N Y N N Y N
## 2 Y Y N N N N Y N
## 3 N N N N N N Y N
## 11 N N N N N N N N
## 30 N N N Y N N N Y
## 41 Y Y Y N N N N N
## 42 N Y Y N N N N N
## 7 N N N N N N N N
## 14 N N N N N N N N
## 18 N N N N N N N N
## 21 N N N N N N N N
## 22 N N N N N N N N
## 25 N N N N N N N N
## 31 N N N N N N N N
## 37 N N N N N N N N
## 40 N N N N N N N N
## 43 N N N N N N N N
## 6 N N N N N N N N
## 9 N N N N N N N N
## 28 N Y Y N Y N Y N
## 45 Y N Y Y Y N Y Y
## 27 N N N N N N N N
## 38 Y N N Y N N N Y
## 10 N N N Y N N Y N
## cakes van.ice.cream choc.ice.cream fruit.salad
## 47 N N Y N
## 48 N N Y N
## 49 N Y N N
## 50 N Y Y Y
## 51 N N N N
## 52 Y Y N N
## 53 N N Y N
## 54 N Y N N
## 55 Y Y Y N
## 56 Y N Y N
## 57 Y Y Y N
## 58 Y N Y Y
## 59 N N Y N
## 60 N N Y N
## 61 Y N Y N
## 62 Y Y Y N
## 63 N Y Y N
## 64 Y N Y N
## 65 Y N Y N
## 66 N Y Y N
## 67 N N Y N
## 68 N N Y N
## 69 N N N N
## 70 Y N Y N
## 71 Y N Y N
## 72 N N Y N
## 73 Y Y N N
## 74 N N Y N
## 75 Y Y N N
## 33 N Y Y N
## 20 Y Y N Y
## 23 N Y N N
## 26 N Y N N
## 29 N Y <NA> Y
## 16 Y Y N N
## 17 Y Y N N
## 13 Y Y Y N
## 12 N Y N N
## 4 N Y N N
## 5 N Y N N
## 32 N Y Y N
## 36 Y Y N N
## 8 N Y Y N
## 15 Y Y Y N
## 19 Y Y N N
## 35 Y Y Y N
## 46 Y Y N Y
## 24 Y Y Y N
## 39 Y Y Y N
## 34 Y Y Y N
## 44 Y Y Y N
## 1 N Y N N
## 2 N Y Y N
## 3 Y Y Y N
## 11 N Y N N
## 30 N Y N N
## 41 Y Y Y N
## 42 N Y Y N
## 7 Y N Y N
## 14 Y Y N N
## 18 Y Y Y N
## 21 Y Y N N
## 22 N Y N N
## 25 Y N Y N
## 31 N Y N N
## 37 Y Y Y N
## 40 Y Y Y N
## 43 Y Y N N
## 6 N Y Y N
## 9 N Y Y N
## 28 Y Y Y N
## 45 Y Y Y N
## 27 N Y N Y
## 38 Y N Y N
## 10 Y Y Y N
Hint 4: Calculate incubation periods and plot histogram using truehist.
#### Calculate incubation periods (onset time minus meal time). The units are
#### fixed to hours: plain subtraction lets R pick the units from the data,
#### which could silently disagree with the "hours" axis label below.
incub.dt <- difftime(onset.dt, meal.dt, units = "hours")
library(MASS) # provides truehist()
#### Histogram of incubation periods as counts (prob = FALSE)
truehist(as.numeric(incub.dt, units = "hours"), nbins = 7, prob = FALSE,
         ylab = "Frequency", col = "skyblue",
         xlab = "Incubation Period (hours)")
#### Calculate mean, median, range; NAs (cases with no recorded meal time or
#### onset time) must be removed or every summary would be NA
mean(incub.dt, na.rm = TRUE)
## Time difference of 4.295455 hours
median(incub.dt, na.rm = TRUE)
## Time difference of 4 hours
range(incub.dt, na.rm = TRUE)
## Time differences in hours
## [1] 3 7
#Exercise 4.4 (Adventures of Huckleberry Finn)
Mark Twain’s book, “Adventures of Huckleberry Finn,” is available online for reading into R. View the entire text file here: https://www.inferentialthinking.com/data/huck_finn.txt. Get an idea of the structure of this text file. For example, there is a table of contents from CHAPTER I to CHAPTER XLII, followed by CHAPTER THE LAST.
We will scan in the book using the scan function, and the options what and sep. Read this in and explore the data object. What kind of data object do we have? Be specific.
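A character vector: scan returned a one-dimensional vector of type character with 9672 elements, each holding one line of text read from the file (see the str and typeof output below).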
# Address of the plain-text edition of the book
hf.url <- "https://www.inferentialthinking.com/data/huck_finn.txt"
# Read the file as text; sep = "\n" splits only at line breaks, so each
# element of the result is one line of the file
hf <- scan(file = hf.url, what = character(), sep = "\n")
# Structure of the result: type, length and the first element
str(hf)
## chr [1:9672] "The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete" ...
# Storage type of the book text
typeof(hf)
## [1] "character"
# Storage type of the URL, which is itself a (length-one) character vector
typeof(hf.url)
## [1] "character"
#Exercise 4.5 (Adventures of Huckleberry Finn (Part 2)) Run the following R expressions and then explain in plain words what happened and why.
# Pattern for chapter headings: the line starts with "CHAPTER ", followed by
# one or more of the Roman-numeral letters I, V, X, L, then a literal period
# ([.]) and a word-boundary assertion (\\b)
chapter.pattern <- "^CHAPTER [IVXL]+[.]\\b"
# Default (value = FALSE): return the positions in hf of the matching lines
grep(chapter.pattern, hf)
## [1] 300 405 598 729 838 973 1189 1392 1754 1877 1985 2225 2458 2633
## [15] 2773 2968 3252 3533 3916 4187 4472 4780 4937 5120 5315 5551 5812 6030
## [29] 6332 6643 6752 7042 7238 7469 7665 7893 8063 8277 8494 8661 8853 9068
# value = TRUE: return the matching lines themselves instead of positions
grep(chapter.pattern, hf, value = TRUE)
## [1] "CHAPTER I." "CHAPTER II." "CHAPTER III."
## [4] "CHAPTER IV." "CHAPTER V." "CHAPTER VI."
## [7] "CHAPTER VII." "CHAPTER VIII." "CHAPTER IX."
## [10] "CHAPTER X." "CHAPTER XI." "CHAPTER XII."
## [13] "CHAPTER XIII." "CHAPTER XIV." "CHAPTER XV."
## [16] "CHAPTER XVI." "CHAPTER XVII." "CHAPTER XVIII."
## [19] "CHAPTER XIX." "CHAPTER XX." "CHAPTER XXI."
## [22] "CHAPTER XXII." "CHAPTER XXIII." "CHAPTER XXIV."
## [25] "CHAPTER XXV." "CHAPTER XXVI." "CHAPTER XXVII."
## [28] "CHAPTER XXVIII." "CHAPTER XXIX." "CHAPTER XXX."
## [31] "CHAPTER XXXI." "CHAPTER XXXII." "CHAPTER XXXIII."
## [34] "CHAPTER XXXIV." "CHAPTER XXXV." "CHAPTER XXXVI."
## [37] "CHAPTER XXXVII." "CHAPTER XXXVIII." "CHAPTER XXXIX."
## [40] "CHAPTER XL." "CHAPTER XLI." "CHAPTER XLII."
#Exercise 4.6 (Adventures of Huckleberry Finn (Part 3)) The grepl function is similar to the grep function but generates a logical vector instead of an integer vector. Run the following R expressions.
How many times does the word “Huckleberry” appear in this text file? How many times does the word “Huck” appear in this text file?
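Huckleberry: 8 lines. Huck: 100 lines. Note that grep and grepl report matching lines, not individual occurrences: a line such as "Huck--Huck Finn, you look me in de eye; ..." contains the word twice but is counted once, and the pattern "Huck" also matches every line containing "Huckleberry".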
# Return the text of every line that contains "Huck"; because the pattern is
# matched as a substring, lines containing "Huckleberry" are included too
grep(pattern = "Huck", x = hf, value = TRUE)
## [1] "The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete"
## [2] "Title: Adventures of Huckleberry Finn, Complete"
## [3] "CHAPTER I. Civilizing Huck.--Miss Watson.--Tom Sawyer Waits."
## [4] "CHAPTER IV. Huck and the Judge.--Superstition."
## [5] "CHAPTER V. Huck's Father.--The Fond Parent.--Reform."
## [6] "CHAPTER VI. He Went for Judge Thatcher.--Huck Decided to Leave.--Political"
## [7] "CHAPTER XI. Huck and the Woman.--The Search.--Prevarication.--Going to"
## [8] "CHAPTER XV. Huck Loses the Raft.--In the Fog.--Huck Finds the Raft.--Trash."
## [9] "CHAPTER XX. Huck Explains.--Laying Out a Campaign.--Working the"
## [10] "Pardon.--Hiding in the Room.--Huck Takes the Money."
## [11] "Huck,--Quick Sales and Small."
## [12] "Leave.--Huck Parting with Mary Jane.--Mumps.--The Opposition Line."
## [13] "Question of Handwriting.--Digging up the Corpse.--Huck Escapes."
## [14] "CHAPTER THE LAST. Out of Bondage.--Paying the Captive.--Yours Truly, Huck"
## [15] "Huck Stealing Away"
## [16] "Huck Creeps into his Window"
## [17] "Huck and his Father"
## [18] "Huck Stealing Away"
## [19] "Huck Creeps into his Window"
## [20] "Huck and his Father"
## [21] "\"Here I is, Huck\""
## [22] "Huck takes the Money"
## [23] "The Doctor leads Huck"
## [24] "The King shakes Huck"
## [25] "Aunt Sally talks to Huck"
## [26] "\"Don't put your feet up there, Huckleberry;\" and \"Don't scrunch up"
## [27] "like that, Huckleberry--set up straight;\" and pretty soon she would"
## [28] "say, \"Don't gap and stretch like that, Huckleberry--why don't you try to"
## [29] "\"Here's Huck Finn, he hain't got no family; what you going to do 'bout"
## [30] "\"Oh, she'll do. That's all right. Huck can come in.\""
## [31] "\"How you talk, Huck Finn. Why, you'd _have_ to come when he rubbed it,"
## [32] "\"Shucks, it ain't no use to talk to you, Huck Finn. You don't seem to"
## [33] "and crossed me off. She says, \"Take your hands away, Huckleberry; what"
## [34] "for him. Well, _wasn't_ he mad? He said he would show who was Huck"
## [35] "was only Huck; but he laughed _such_ a screechy laugh, and roared and"
## [36] "\"But looky here, Huck, who wuz it dat 'uz killed in dat shanty ef it"
## [37] "would you, Huck?\""
## [38] "\"Well, I b'lieve you, Huck. I--_I run off_.\""
## [39] "Huck.\""
## [40] "know all 'bout de killin'. I 'uz powerful sorry you's killed, Huck, but"
## [41] "I reck'n he's ben dead two er three days. Come in, Huck, but doan' look"
## [42] "Hookerville, but we don't know who 'twas that killed Huck Finn.\""
## [43] "\"The nigger run off the very night Huck Finn was killed. So there's a"
## [44] "Huck's money without having to bother a long time with a lawsuit."
## [45] "he'll walk in Huck's money as easy as nothing.\""
## [46] "\"_Ain'_ dat gay? En what dey got to do, Huck?\""
## [47] "talk to me 'bout Sollermun, Huck, I knows him by de back.\""
## [48] "dey, Huck?\""
## [49] "\"Why, Huck, doan' de French people talk de same way we does?\""
## [50] "\"Is a cat a man, Huck?\""
## [51] "\"Goodness gracious, is dat you, Huck? En you ain' dead--you ain'"
## [52] "dead! you's back agin, 'live en soun', jis de same ole Huck--de same ole"
## [53] "Huck, thanks to goodness!\""
## [54] "\"Huck--Huck Finn, you look me in de eye; look me in de eye. _Hain't_ you"
## [55] "\"But, Huck, it's all jis' as plain to me as--\""
## [56] "\"Well, den, I reck'n I did dream it, Huck; but dog my cats ef it ain't"
## [57] "\"We's safe, Huck, we's safe! Jump up and crack yo' heels! Dat's de"
## [58] "accounts o' Huck; I's a free man, en I couldn't ever ben free ef it"
## [59] "hadn' ben for Huck; Huck done it. Jim won't ever forgit you, Huck;"
## [60] "\"Dah you goes, de ole true Huck; de on'y white genlman dat ever kep' his"
## [61] "\"Here I is, Huck. Is dey out o' sight yit? Don't talk loud.\""
## [62] "raf' agin when dey was gone. But lawsy, how you did fool 'em, Huck!"
## [63] "\"Doan' le's talk about it, Huck. Po' niggers can't have no luck. I"
## [64] "\"It ain't yo' fault, Huck; you didn' know. Don't you blame yo'self"
## [65] "\"Well, 'twarn't no use to 'sturb you, Huck, tell we could do sumfn--but"
## [66] "\"Huck, does you reck'n we gwyne to run acrost any mo' kings on dis"
## [67] "\"Don't it s'prise you de way dem kings carries on, Huck?\""
## [68] "\"Why don't it, Huck?\""
## [69] "\"But, Huck, dese kings o' ourn is reglar rapscallions; dat's jist what"
## [70] "\"But dis one do _smell_ so like de nation, Huck.\""
## [71] "\"Well, anyways, I doan' hanker for no mo' un um, Huck. Dese is all I"
## [72] "jis' as loud as I could yell. _She never budge!_ Oh, Huck, I bust out"
## [73] "hisself as long's he live!' Oh, she was plumb deef en dumb, Huck, plumb"
## [74] "Huckleberry; we'll come down to the village on her.\""
## [75] "disgraced. And then think of _me_! It would get all around that Huck"
## [76] "_Huck Finn._"
## [77] "\"Looky here, Huck, what fools we are to not think of it before! I bet I"
## [78] " Why, Huck, it wouldn't make no more talk than breaking into a soap"
## [79] "complicated than _that_, Huck Finn.\""
## [80] "\"Why, _Huck_! En good _lan_'! ain' dat Misto Tom?\""
## [81] "Huck, it's the stupidest arrangement I ever see. You got to invent _all_"
## [82] "\"Well, if that ain't just like you, Huck Finn. You _can_ get up the"
## [83] "Navarre, or wherever it is. It's gaudy, Huck. I wish there was a moat"
## [84] "all do; and _he's_ got to, too. Huck, you don't ever seem to want to do"
## [85] "\"Oh, shucks, Huck Finn, if I was as ignorant as you I'd keep"
## [86] "\"That ain't got anything to _do_ with it, Huck Finn. All _he's_ got to"
## [87] "\"Huck Finn, did you _ever_ hear of a prisoner having picks and shovels,"
## [88] "\"It ain't no use to try to learn you nothing, Huck. Run along and"
## [89] "\"It ain't no use, Huck, it ain't a-going to work. If we was prisoners"
## [90] "Northumberland! Why, Huck, s'pose it _is_ considerble trouble?--what"
## [91] "but ef you en Huck fetches a rattlesnake in heah for me to tame, I's"
## [92] "\"No!--is that so? _ain't_ it bully! Why, Huck, if it was to do over"
## [93] "\"En a mighty good job it wuz, too, Huck. It 'uz planned beautiful, en"
## [94] "\"Well, den, dis is de way it look to me, Huck. Ef it wuz _him_ dat 'uz"
## [95] "\"You mean where's Huck _Finn_--that's what you mean! I reckon I hain't"
## [96] "bed, Huck Finn.\""
## [97] "\"Dah, now, Huck, what I tell you?--what I tell you up dah on Jackson"
## [98] "\"He ain't a-comin' back no mo', Huck.\""
## [99] "\"Nemmine why, Huck--but he ain't comin' back no mo.\""
## [100] "End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,"
# Return the text of every line that contains "Huckleberry"
grep(pattern = "Huckleberry", x = hf, value = TRUE)
## [1] "The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete"
## [2] "Title: Adventures of Huckleberry Finn, Complete"
## [3] "\"Don't put your feet up there, Huckleberry;\" and \"Don't scrunch up"
## [4] "like that, Huckleberry--set up straight;\" and pretty soon she would"
## [5] "say, \"Don't gap and stretch like that, Huckleberry--why don't you try to"
## [6] "and crossed me off. She says, \"Take your hands away, Huckleberry; what"
## [7] "Huckleberry; we'll come down to the village on her.\""
## [8] "End of the Project Gutenberg EBook of Adventures of Huckleberry Finn,"
# grepl() returns one TRUE/FALSE per line, so these tables count the number
# of LINES that contain each pattern (TRUE) or do not (FALSE)
table(grepl("Huckleberry", hf))
##
## FALSE TRUE
## 9664 8
table(grepl("Huck", hf))
##
## FALSE TRUE
## 9572 100

# The question asks how many times each word appears, and a single line can
# contain a word more than once. Count every occurrence by extracting all
# matches on every line and summing the per-line match counts.
count.matches <- function(pattern, text) {
  # fixed = TRUE treats the pattern as a literal string, not a regex
  hits <- regmatches(text, gregexpr(pattern, text, fixed = TRUE))
  sum(lengths(hits))
}
n.huckleberry <- count.matches("Huckleberry", hf)
n.huck.any <- count.matches("Huck", hf) # includes the "Huck" in "Huckleberry"
# Occurrences of each word; "Huck" here excludes those inside "Huckleberry"
c(Huckleberry = n.huckleberry, Huck = n.huck.any - n.huckleberry)