wvincent — Oct 21, 2013, 7:50 PM
#Wilson Vincent
#PH 251D: Applied Epidemiology Using R
#Homework 3
#3.1 - Creating a data frame
Status <- rep(c("Dead", "Survived"), 4)
Treatment <- rep(c("Tolbutamide", "Placebo"), c(2, 2), 2)
Agegrp <- rep(c("<55", "55+"), c(4, 4))
Freq <- c(8, 98, 5, 115, 22, 76, 16, 69)
dat <- data.frame(Status, Treatment, Agegrp, Freq); dat
Status Treatment Agegrp Freq
1 Dead Tolbutamide <55 8
2 Survived Placebo <55 98
3 Dead Tolbutamide <55 5
4 Survived Placebo <55 115
5 Dead Tolbutamide 55+ 22
6 Survived Placebo 55+ 76
7 Dead Tolbutamide 55+ 16
8 Survived Placebo 55+ 69
#3.2 - Collecting classmate data and using these to create a data frame
#Sanjiv Baxi UCSF sanjiv.baxi@bekeley.edu sanjiv.baxi@ucsf.edu
#Anobel Odisho UCSF anobel.odisho@berkeley.edu OdishoA@urology.ucsf.edu
#Brett Smith UCB brett.james.smith@gmail.com brett.smith@berkeley.edu
First.nm <- c("Sanjiv", "Anobel", "Brett")
Last.nm <- c("Baxi", "Odisho", "Smith")
Affil <- rep(c("UCSF", "Cal"), c(2, 1))
Email1 <- c("private1@bekeley.edu", "private1@berkeley.edu", "private1@gmail.com")
Email2 <- c("private2@ucsf.edu", "private2@ucsf.edu", "private2@berkeley.edu")
Class <- data.frame(First.nm, Last.nm, Affil, Email1, Email2); Class
First.nm Last.nm Affil Email1 Email2
1 Sanjiv Baxi UCSF private1@bekeley.edu private2@ucsf.edu
2 Anobel Odisho UCSF private1@berkeley.edu private2@ucsf.edu
3 Brett Smith Cal private1@gmail.com private2@berkeley.edu
First.nm; Last.nm; Affil; Email1; Email2; Class
First.nm Last.nm Affil Email1 Email2
1 Sanjiv Baxi UCSF private1@bekeley.edu private2@ucsf.edu
2 Anobel Odisho UCSF private1@berkeley.edu private2@ucsf.edu
3 Brett Smith Cal private1@gmail.com private2@berkeley.edu
#3.3 - Reading data into a data frame and graphing
aids <- read.table("http://www.medepi.net/data/aids.txt", header=TRUE,
sep="", na.strings=".")
head(aids)
cases year
1 NA 1980
2 NA 1981
3 NA 1982
4 NA 1983
5 4445 1984
6 8249 1985
plot(aids$year, aids$cases, type = "l", xlab = "Year", lwd = 2,
ylab = "Cases", main = "Reported AIDS Cases in United States, 1980--2003")
#3.4 - Reading data into a data frame and graphing
meas <- read.table("http://www.medepi.net/data/measles.txt",
header=TRUE, sep="")
head(meas)
year cases
1 1950 319000
2 1951 530000
3 1952 683000
4 1953 449000
5 1954 683000
6 1955 555000
plot(meas$year, meas$cases, type = "l", xlab = "Year", lwd = 2,
ylab = "Cases",
main = "Reported Measles Cases in United States, 1980--2003")
plot(meas$year, meas$cases, type = "l", xlab = "Year", lwd = 2,
log = "y", ylab = "Cases",
main = "Reported Measles Cases in United States, 1980--2003")
#3.5 - Reading data into a data frame and plotting
aids <- read.table("http://www.medepi.net/data/aids.txt", sep="",
header = TRUE, na.strings=".")
hepb <- read.table("http://www.medepi.net/data/hepb.txt", sep="",
header = TRUE)
matplot(hepb$year, cbind(hepb$cases,aids$cases),
type = "l", lwd = 2, xlab = "Year", ylab = "Cases",
main = "Reported cases of Hepatitis B and AIDS,
United States, 1980-2003")
legend(1980, 100000, legend = c("Hepatitis B", "AIDS"),
lwd = 2, lty = 1:2, col = 1:2)
#3.6 - Manipulating data
#3.6a
edat <- read.table("http://www.medepi.net/data/evans.txt", header = TRUE, sep="")
str(edat)
'data.frame': 609 obs. of 12 variables:
$ id : int 21 31 51 71 74 91 111 131 141 191 ...
$ chd: int 0 0 1 0 0 0 1 0 0 0 ...
$ cat: int 0 0 1 1 0 0 0 0 0 0 ...
$ age: int 56 43 56 64 49 46 52 63 42 55 ...
$ chl: int 270 159 201 179 243 252 179 217 176 250 ...
$ smk: int 0 1 1 1 1 1 1 0 1 0 ...
$ ecg: int 0 0 1 0 0 0 1 0 0 1 ...
$ dbp: int 80 74 112 100 82 88 80 92 76 114 ...
$ sbp: int 138 128 164 200 145 142 128 135 114 182 ...
$ hpt: int 0 0 1 1 0 0 0 0 0 1 ...
$ ch : int 0 0 1 1 0 0 0 0 0 0 ...
$ cc : int 0 0 201 179 0 0 0 0 0 0 ...
table(edat$chd)
0 1
538 71
edat$chd2 <- factor(edat$chd, levels = 0:1, labels = c("No", "Yes"))
table(edat$chd2)
No Yes
538 71
table(edat$cat)
0 1
487 122
edat$cat2 <- factor(edat$cat, levels = 0:1, labels = c("Normal", "High"))
table(edat$cat2)
Normal High
487 122
table(edat$smk)
0 1
222 387
edat$smk2 <- factor(edat$smk, levels = 0:1, labels = c("Never", "Ever"))
table(edat$smk2)
Never Ever
222 387
table(edat$ecg)
0 1
443 166
edat$ecg2 <- factor(edat$ecg, levels = 0:1, labels = c("Normal", "Abnormal"))
table(edat$ecg2)
Normal Abnormal
443 166
table(edat$hpt)
0 1
354 255
edat$hpt2 <- factor(edat$hpt, levels = 0:1, labels = c("No", "Yes"))
table(edat$hpt2)
No Yes
354 255
head(edat)
id chd cat age chl smk ecg dbp sbp hpt ch cc chd2 cat2 smk2 ecg2
1 21 0 0 56 270 0 0 80 138 0 0 0 No Normal Never Normal
2 31 0 0 43 159 1 0 74 128 0 0 0 No Normal Ever Normal
3 51 1 1 56 201 1 1 112 164 1 1 201 Yes High Ever Abnormal
4 71 0 1 64 179 1 0 100 200 1 1 179 No High Ever Normal
5 74 0 0 49 243 1 0 82 145 0 0 0 No Normal Ever Normal
6 91 0 0 46 252 1 0 88 142 0 0 0 No Normal Ever Normal
hpt2
1 No
2 No
3 Yes
4 Yes
5 No
6 No
#3.6b
quantile(edat$age)
0% 25% 50% 75% 100%
40 46 52 60 76
edat$age4 <- cut(edat$age, quantile(edat$age), right = FALSE, include.lowest = TRUE)
table(edat$age4)
[40,46) [46,52) [52,60) [60,76]
134 158 158 159
#3.6c
hptnew <- rep(NA, nrow(edat))
normal <- edat$sbp<120 & edat$dbp<80
hptnew[normal] <- 1
prehyp <- (edat$sbp>=120 & edat$sbp<140) | (edat$dbp>=80 & edat$dbp<90)
hptnew[prehyp] <- 2
stage1 <- (edat$sbp>=140 & edat$sbp<160) | (edat$dbp>=90 & edat$dbp<100)
hptnew[stage1] <- 3
stage2 <- edat$sbp>=160 | edat$dbp>=100
hptnew[stage2] <- 4
edat$hpt4 <- factor(hptnew, levels=1:4, labels=c("Normal", "PreHTN", "HTN.Stage1", "HTN.Stage2"))
table(edat$hpt4)
Normal PreHTN HTN.Stage1 HTN.Stage2
56 165 177 211
#3.6d
table("Old HTN"=edat$hpt2, "New HTN"=edat$hpt4)
New HTN
Old HTN Normal PreHTN HTN.Stage1 HTN.Stage2
No 56 165 133 0
Yes 0 0 44 211
#3.7 - Exporting data as an ASCII file
wdat <- read.table("http://www.medepi.net/data/wnv/wnv2004raw.txt", header=TRUE,
sep=",", as.is=TRUE, na.strings=c(".","Unknown"))
str(wdat)
'data.frame': 779 obs. of 8 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ county : chr "San Bernardino" "San Bernardino" "San Bernardino" "San Bernardino" ...
$ age : int 40 64 19 12 12 17 61 74 71 26 ...
$ sex : chr "F" "F" "M" "M" ...
$ syndrome : chr "WNF" "WNF" "WNF" "WNF" ...
$ date.onset : chr "05/19/2004" "05/22/2004" "05/22/2004" "05/16/2004" ...
$ date.tested: chr "06/02/2004" "06/16/2004" "06/16/2004" "06/16/2004" ...
$ death : chr "No" "No" "No" "No" ...
wdat$date.onset2 <- as.Date(wdat$date.onset, format="%m/%d/%Y")
wdat$date.tested2 <- as.Date(wdat$date.tested, format="%m/%d/%Y")
write.table(wdat, "C:/Users/wvincent/Desktop", sep=",", row.names=FALSE)
Warning: cannot open file 'C:/Users/wvincent/Desktop': Permission denied
Error: cannot open the connection
#I was on my office computer. I do not have administrative permission to complete this function on the computer than I am using.
#I used an older version of R Studio on my Macbook, which cannot download the new version, and it worked!
#3.8 - Graphing, manipulating, and analyzing data
#3.8a
odat <- read.table("http://www.medepi.net/data/oswego.txt", sep = "", header = TRUE, na.strings = ".")
str(odat)
'data.frame': 75 obs. of 21 variables:
$ id : int 2 3 4 6 7 8 9 10 14 16 ...
$ age : int 52 65 59 63 70 40 15 33 10 32 ...
$ sex : Factor w/ 2 levels "F","M": 1 2 1 1 2 1 1 1 2 1 ...
$ meal.time : Factor w/ 6 levels "10:00 PM","11:00 AM",..: 6 3 3 5 5 5 1 4 5 NA ...
$ ill : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
$ onset.date : Factor w/ 2 levels "4/18","4/19": 2 2 2 1 1 2 2 1 2 2 ...
$ onset.time : Factor w/ 17 levels "1:00 AM","10:00 PM",..: 9 9 9 5 5 10 1 6 10 4 ...
$ baked.ham : Factor w/ 2 levels "N","Y": 2 2 2 2 2 1 1 2 1 2 ...
$ spinach : Factor w/ 2 levels "N","Y": 2 2 2 2 2 1 1 2 1 2 ...
$ mashed.potato : Factor w/ 2 levels "N","Y": 2 2 1 1 2 1 1 2 1 1 ...
$ cabbage.salad : Factor w/ 2 levels "N","Y": 1 2 1 2 1 1 1 1 1 1 ...
$ jello : Factor w/ 2 levels "N","Y": 1 1 1 2 2 1 1 1 1 1 ...
$ rolls : Factor w/ 2 levels "N","Y": 2 1 1 1 2 1 1 2 1 2 ...
$ brown.bread : Factor w/ 2 levels "N","Y": 1 1 1 1 2 1 1 2 1 1 ...
$ milk : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
$ coffee : Factor w/ 2 levels "N","Y": 2 2 2 1 2 1 1 1 1 2 ...
$ water : Factor w/ 2 levels "N","Y": 1 1 1 2 2 1 1 2 1 1 ...
$ cakes : Factor w/ 2 levels "N","Y": 1 1 2 1 1 1 2 1 1 2 ...
$ vanilla.ice.cream : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 1 2 2 2 ...
$ chocolate.ice.cream: Factor w/ 2 levels "N","Y": 1 2 2 1 1 2 2 2 2 2 ...
$ fruit.salad : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
head(odat)
id age sex meal.time ill onset.date onset.time baked.ham spinach
1 2 52 F 8:00 PM Y 4/19 12:30 AM Y Y
2 3 65 M 6:30 PM Y 4/19 12:30 AM Y Y
3 4 59 F 6:30 PM Y 4/19 12:30 AM Y Y
4 6 63 F 7:30 PM Y 4/18 10:30 PM Y Y
5 7 70 M 7:30 PM Y 4/18 10:30 PM Y Y
6 8 40 F 7:30 PM Y 4/19 2:00 AM N N
mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
1 Y N N Y N N Y N
2 Y Y N N N N Y N
3 N N N N N N Y N
4 N Y Y N N N N Y
5 Y N Y Y Y N Y Y
6 N N N N N N N N
cakes vanilla.ice.cream chocolate.ice.cream fruit.salad
1 N Y N N
2 N Y Y N
3 Y Y Y N
4 N Y N N
5 N Y N N
6 N Y Y N
mdt <- paste("4/18/1940", odat$meal.time)
meal.dt <- strptime(mdt, "%m/%d/%Y %I:%M %p")
odt <- paste(paste(odat$onset.date,"/1940",sep = ""), odat$onset.time)
onset.dt <- strptime(odt, "%m/%d/%Y %I:%M %p")
hist(onset.dt, breaks = 30, freq = TRUE)
#3.8b
min.obs.pos <- which(onset.dt==min(onset.dt,na.rm=T))
min.obs.pos
[1] 33
max.obs.pos <- which(onset.dt==max(onset.dt,na.rm=T))
max.obs.pos
[1] 10
odat[min.obs.pos,]
id age sex meal.time ill onset.date onset.time baked.ham spinach
33 52 8 M 11:00 AM Y 4/18 3:00 PM N N
mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
33 N N N N N N N N
cakes vanilla.ice.cream chocolate.ice.cream fruit.salad
33 N Y Y N
odat[max.obs.pos,]
id age sex meal.time ill onset.date onset.time baked.ham spinach
10 16 32 F <NA> Y 4/19 10:30 AM Y Y
mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
10 N N N Y N N Y N
cakes vanilla.ice.cream chocolate.ice.cream fruit.salad
10 Y Y Y N
#3.8c
onset.ct <- as.POSIXct(onset.dt)
odat2 <- odat[order(odat$ill, onset.ct), ]
head(odat2)
id age sex meal.time ill onset.date onset.time baked.ham spinach
47 1 11 M <NA> N <NA> <NA> N N
48 5 13 F <NA> N <NA> <NA> N N
49 11 65 M <NA> N <NA> <NA> Y Y
50 12 38 F <NA> N <NA> <NA> Y Y
51 13 62 F <NA> N <NA> <NA> Y Y
52 15 25 M <NA> N <NA> <NA> Y Y
mashed.potato cabbage.salad jello rolls brown.bread milk coffee water
47 N N N N N N N N
48 N N N N N N N N
49 Y N Y Y N N N N
50 Y N N Y N N Y N
51 N Y Y Y Y N N Y
52 Y Y Y Y Y Y Y Y
cakes vanilla.ice.cream chocolate.ice.cream fruit.salad
47 N N Y N
48 N N Y N
49 N Y N N
50 N Y Y Y
51 N N N N
52 Y Y N N
#3.8d
incub.dt <- onset.dt - meal.dt
library(MASS) #load MASS package
truehist(as.numeric(incub.dt), nbins = 7, prob = FALSE,
col = "skyblue", xlab = "Incubation Period (hours)")
mean(incub.dt, na.rm = TRUE)
Time difference of 4.295 hours
median(incub.dt, na.rm = TRUE)
Time difference of 4 hours
range(incub.dt, na.rm = TRUE)
Time differences in hours
[1] 3 7