Exercise 1
# Run the Exercise 1 job script, echoing each expression as it is evaluated.
source(
  file = "C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/job01.R",
  echo = TRUE
)
##
## > y <- c(2, 4, 6, 9)
##
## > x <- c(5, 8, 3, 5)
##
## > z <- matrix(c(x, y), 2, 4)
##
## > z
## [,1] [,2] [,3] [,4]
## [1,] 5 3 2 6
## [2,] 8 5 4 9
Exercise 2
library(datasets)
data(Orange)
# Write the Orange data to a text file, then read it back and show the first
# rows to confirm the round trip. write.table() is called only for its side
# effect (it returns NULL), so its result is not stored in a variable.
write.table(Orange, file = "orange.dat")
head(read.table("orange.dat"))
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 4 1 1004 115
## 5 1 1231 120
## 6 1 1372 142
Exercise 3
# Work on a plain data-frame copy of Orange and relabel the tree IDs.
# The new labels replace the existing factor levels position by position.
orange.3 <- as.data.frame(Orange)
levels(orange.3$Tree) <- c(
  "camphor", "willow", "pine", "peach", "cherry"
)
# Say we only want the smaller trees.
subset(orange.3, subset = circumference < 100)
## Tree age circumference
## 1 willow 118 30
## 2 willow 484 58
## 3 willow 664 87
## 8 peach 118 33
## 9 peach 484 69
## 15 camphor 118 30
## 16 camphor 484 51
## 17 camphor 664 75
## 22 cherry 118 32
## 23 cherry 484 62
## 29 pine 118 30
## 30 pine 484 49
## 31 pine 664 81
# Now, for some reason, we want an explicit row-number column.
# seq_len(nrow()) is used instead of 1:length() so that a data frame with no
# rows yields an empty index rather than c(1, 0).
orange.3$row.1 <- seq_len(nrow(orange.3))
head(orange.3)
## Tree age circumference row.1
## 1 willow 118 30 1
## 2 willow 484 58 2
## 3 willow 664 87 3
## 4 willow 1004 115 4
## 5 willow 1231 120 5
## 6 willow 1372 142 6
Exercise 4
# Load the wnv2004raw.txt data set. Entries equal to "." or "Unknown" are read
# as NA. The second marker was previously misspelled "Unkown", which could
# never match an entry spelled "Unknown".
# NOTE(review): assumes the file spells the marker "Unknown" -- confirm
# against the raw data.
wnile <- read.table(
  "http://www.medepi.net/data/wnv/wnv2004raw.txt",
  header = TRUE,
  sep = ",",
  na.strings = c(".", "Unknown")
)
# Inspect how the two date columns were read in.
class(wnile$date.onset)
class(wnile$date.tested)
## [1] "factor"
## [1] "factor"
# These two date columns were read in as factors, so convert them to Date
# objects using the month/day/four-digit-year layout.
wnile[["date.onset"]] <- as.Date(wnile[["date.onset"]], format = "%m/%d/%Y")
wnile[["date.tested"]] <- as.Date(wnile[["date.tested"]], format = "%m/%d/%Y")
class(wnile[["date.onset"]])
class(wnile[["date.tested"]])
## [1] "Date"
## [1] "Date"
# Excellent. Now for something more challenging: build full date-times for
# the oswego data from its separate date and clock-time columns.
oswego <- read.table(
  "http://www.medepi.net/data/oswego.txt",
  header = TRUE,
  sep = " ",
  na.strings = "."
)
# Meal times: every meal time is prefixed with the fixed date 4/18/1940.
temp <- paste("4/18/1940", oswego$meal.time)
exposure.time <- strptime(temp, format = "%m/%d/%Y %I:%M %p")
# Onset times: append the year to the month/day value, then add the clock time.
temp1 <- paste(paste0(oswego$onset.date, "/1940"), oswego$onset.time)
outcome.time <- strptime(temp1, format = "%m/%d/%Y %I:%M %p")
head(outcome.time)
## [1] "1940-04-19 00:30:00 PST" "1940-04-19 00:30:00 PST"
## [3] "1940-04-19 00:30:00 PST" "1940-04-18 22:30:00 PST"
## [5] "1940-04-18 22:30:00 PST" "1940-04-19 02:00:00 PST"
Exercise 5
To demonstrate using existing functions, let's say people eat the fruit of each tree and rate it as tasty or not tasty, and that their ratings are completely random (0 = not tasty, 1 = tasty). Let's use a generalized linear model with tree age and circumference as predictors and see whether their associations with our random outcome variable, tastiness, are significant (most likely not, because I just sampled randomly from 0 and 1 to create the variable).
# NOTE(review): attach() puts Orange's columns on the search path; no matching
# detach() is visible later in this document.
attach(Orange)
# One random 0/1 draw (with replacement) per row of Orange.
Tastyness <- sample(c(0, 1), size = nrow(Orange), replace = TRUE)
# Displayed as a factor for inspection only; Tastyness itself stays numeric.
as.factor(Tastyness)
## [1] 0 1 1 0 1 1 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0
## Levels: 0 1
# Tabulate the simulated outcome to see how many 0s and 1s were drawn.
table(Tastyness)
## Tastyness
## 0 1
## 14 21
# Show the simulated outcome next to the tree measurements. The combined
# table is only printed here; it is not stored in a variable.
cbind(Orange,Tastyness)
## Tree age circumference Tastyness
## 1 1 118 30 0
## 2 1 484 58 1
## 3 1 664 87 1
## 4 1 1004 115 0
## 5 1 1231 120 1
## 6 1 1372 142 1
## 7 1 1582 145 0
## 8 2 118 33 0
## 9 2 484 69 1
## 10 2 664 111 1
## 11 2 1004 156 0
## 12 2 1231 172 1
## 13 2 1372 203 0
## 14 2 1582 203 1
## 15 3 118 30 0
## 16 3 484 51 0
## 17 3 664 75 1
## 18 3 1004 108 1
## 19 3 1231 115 1
## 20 3 1372 139 1
## 21 3 1582 140 0
## 22 4 118 32 0
## 23 4 484 62 1
## 24 4 664 112 1
## 25 4 1004 167 1
## 26 4 1231 179 0
## 27 4 1372 209 0
## 28 4 1582 214 1
## 29 5 118 30 1
## 30 5 484 49 1
## 31 5 664 81 1
## 32 5 1004 125 1
## 33 5 1231 142 1
## 34 5 1372 174 0
## 35 5 1582 177 0
# Logistic regression of the random outcome on tree age and circumference.
# The predictors are taken from Orange through `data =` instead of from the
# search path, so the fit no longer depends on an earlier attach(Orange).
# Tastyness is not a column of Orange, so it is looked up in the environment
# where the formula is written.
mod.1 <- glm(
  Tastyness ~ age + circumference,
  family = binomial,
  data = Orange
)
summary(mod.1)
##
## Call:
## glm(formula = Tastyness ~ age + circumference, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.503 -1.337 0.943 0.997 1.153
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.615176 0.790204 0.78 0.44
## age 0.000798 0.001774 0.45 0.65
## circumference -0.008130 0.015047 -0.54 0.59
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 47.111 on 34 degrees of freedom
## Residual deviance: 46.805 on 32 degrees of freedom
## AIC: 52.81
##
## Number of Fisher Scoring iterations: 4
# Analysis-of-deviance table for mod.1, with terms added one at a time.
anova(mod.1)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Tastyness
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 34 47.1
## age 1 0.011 33 47.1
## circumference 1 0.294 32 46.8
#Based on our output, there appears to be no relationship at all between the predictors and the outcome, which is not surprising given that the outcome was sampled at random.
Exercise 6
Say here we want to use a for loop to create a new data frame of all the trees with circumference less than 100 of whatever unit, and find the mean age of the smaller trees.
trees <- as.data.frame(Orange)
# Walk through the rows one at a time and flag the trees whose circumference
# is below 100. The flag vector is allocated up front so it is not grown
# inside the loop, and seq_len() keeps the loop from running when there are
# no rows. isTRUE() treats a missing circumference as "not small", matching
# how subset() drops rows whose condition is NA.
is.small <- logical(nrow(trees))
for (i in seq_len(nrow(trees))) {
  is.small[i] <- isTRUE(trees$circumference[i] < 100)
}
small.trees <- trees[is.small, ]
small.trees # Here is our new dataset of small trees.
## Tree age circumference
## 1 1 118 30
## 2 1 484 58
## 3 1 664 87
## 8 2 118 33
## 9 2 484 69
## 15 3 118 30
## 16 3 484 51
## 17 3 664 75
## 22 4 118 32
## 23 4 484 62
## 29 5 118 30
## 30 5 484 49
## 31 5 664 81
# Mean age of the trees kept in the small.trees dataset.
mean(small.trees$age)
## [1] 384.8
Exercise 7
First, let's make a simple function to add two numbers.
# Add two values and return their sum.
fun.1 <- function(x, y) {
  total <- x + y
  total
}
fun.1(3, 10)
## [1] 13
#That was interesting, but not very useful. Let's make a function that draws a scatterplot of two variables in a data frame (first column on the x axis, second column on the y axis) and adds a fit line (red) and a lowess curve (blue). The function acts on a data frame with two columns. For the example, we'll create a matrix from two randomly sampled normal distributions with slightly different parameters.
# Two independent normal samples of 500 values each, bound side by side into
# a two-column matrix (columns named x and y).
x <- rnorm(n = 500, mean = 50, sd = 10)
y <- rnorm(n = 500, mean = 60, sd = 12)
z <- cbind(x, y)
# Scatterplot of a two-column object with a least-squares line and a lowess
# curve.
#
# x: a matrix or data frame; column 1 is drawn on the x axis and column 2 on
#    the y axis.
#
# Draws on the current graphics device. The value is whatever lines() returns.
fun.2 <- function(x) {
  plot(x[, 1], x[, 2])
  # Regress the plotted y variable (column 2) on the plotted x variable
  # (column 1). The formula was previously reversed, so the red line showed
  # the regression of column 1 on column 2 over the wrong axes.
  abline(lm(x[, 2] ~ x[, 1]), col = "red")
  lines(lowess(x[, 1], x[, 2]), col = "blue")
}
# Draw the scatterplot, fit line and lowess curve for the simulated matrix z.
fun.2(z)
Exercise 8
Here we’ll just use the Orange dataset again because I’m curious if older trees have larger circumference.
# Simple linear regression of circumference on age, then a scatterplot with
# the fitted line drawn over it.
mod.2 <- lm(formula = circumference ~ age, data = Orange)
par(cex = 0.8)
plot(
  x = Orange$age,
  y = Orange$circumference,
  main = "Plot of Tree Size vs. Age",
  xlab = "Tree Age",
  ylab = "Tree Circumference"
)
abline(mod.2, col = "orange")
# Yep, definitely seems to have a strong correlation.
legend("topleft", legend = "Fitted Line", lty = 1, col = "orange")
# Draw the same figure again, this time into a PDF file in the working
# directory. pdf() needs a file name; it was previously handed the fitted
# model object (mod.2) instead of a path.
pdf("tree_size_vs_age.pdf")
mod.2 <- lm(circumference ~ age, data = Orange)
par(cex = 0.8)
plot(
  Orange$age,
  Orange$circumference,
  main = "Plot of Tree Size vs. Age",
  xlab = "Tree Age",
  ylab = "Tree Circumference"
)
abline(mod.2, col = "orange")
legend("topleft", legend = "Fitted Line", lty = 1, col = "orange")
dev.off() # Close the device so the plot is written out as a PDF file.
## pdf
## 2
Exercise 9
Out of all of these fruits, we only want ones that end in the word fruit.
# Candidate fruit names.
fruits <- c(
  "grapefruit", "durian", "jackfruit", "apple", "pear", "grape",
  "banana", "apricot", "pomegranite", "fig", "breadfruit", "orange",
  "kiwi", "starfruit", "lychee", "pineapple", "mango", "papaya",
  "avocado", "watermelon", "dragonfruit", "nectarine", "cherry"
)
# Keep the names made of one or more letters followed by "fruit" at the end.
grep("^[A-Za-z]+fruit$", fruits, value = TRUE)
## [1] "grapefruit" "jackfruit" "breadfruit" "starfruit" "dragonfruit"
Exercise 10
# Divert console output to a log file while the job script runs.
sink("C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/proj1.log")
source("C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/job01.R", echo = TRUE)
# Stop the diversion. Without this call the sink stays open and all later
# output in the session keeps going to the log file instead of the console.
sink()
##
## > y <- c(2, 4, 6, 9)
##
## > x <- c(5, 8, 3, 5)
##
## > z <- matrix(c(x, y), 2, 4)
##
## > z
## [,1] [,2] [,3] [,4]
## [1,] 5 3 2 6
## [2,] 8 5 4 9