PH251D Project 1

Exercise 1

# Run the Exercise 1 job script, echoing each expression as it is evaluated.
source(
  file = "C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/job01.R",
  echo = TRUE
)

## 
## > y <- c(2, 4, 6, 9)
## 
## > x <- c(5, 8, 3, 5)
## 
## > z <- matrix(c(x, y), 2, 4)
## 
## > z
##      [,1] [,2] [,3] [,4]
## [1,]    5    3    2    6
## [2,]    8    5    4    9

Exercise 2

# Round-trip the built-in Orange data set through a text file.
library(datasets)
data(Orange)
# write.table() is called for its side effect (creating the file) and returns
# NULL, so its result is not stored in a variable.
write.table(Orange, file = "orange.dat")
# Read the file back and show the first rows to confirm the round trip.
head(read.table("orange.dat"))

##   Tree  age circumference
## 1    1  118            30
## 2    1  484            58
## 3    1  664            87
## 4    1 1004           115
## 5    1 1231           120
## 6    1 1372           142

Exercise 3

# Work on a plain data-frame copy of Orange and give the trees readable names.
orange.3 <- as.data.frame(Orange)
# The new labels are applied positionally to the factor's existing levels.
levels(orange.3$Tree) <- c("camphor", "willow", "pine", "peach", "cherry")
# Say we only want the smaller trees (circumference under 100).
orange.3[orange.3$circumference < 100, ]

##       Tree age circumference
## 1   willow 118            30
## 2   willow 484            58
## 3   willow 664            87
## 8    peach 118            33
## 9    peach 484            69
## 15 camphor 118            30
## 16 camphor 484            51
## 17 camphor 664            75
## 22  cherry 118            32
## 23  cherry 484            62
## 29    pine 118            30
## 30    pine 484            49
## 31    pine 664            81

# Add an explicit row-number column. seq_len(nrow()) is used instead of
# 1:length() so that an empty data frame would get an empty index rather than
# the two-element sequence c(1, 0).
orange.3$row.1 <- seq_len(nrow(orange.3))
head(orange.3)

##     Tree  age circumference row.1
## 1 willow  118            30     1
## 2 willow  484            58     2
## 3 willow  664            87     3
## 4 willow 1004           115     4
## 5 willow 1231           120     5
## 6 willow 1372           142     6

Exercise 4

# Read the 2004 West Nile virus data. Both "." and "Unknown" are treated as
# missing values; the second marker was previously misspelled "Unkown", which
# would leave literal "Unknown" entries unconverted.
# NOTE(review): confirm the exact missing-value spelling against the raw file.
wnile <- read.table(
  "http://www.medepi.net/data/wnv/wnv2004raw.txt",
  header = TRUE,
  sep = ",",
  na.strings = c(".", "Unknown")
)
# Inspect how the two date columns were read in.
class(wnile$date.onset)
class(wnile$date.tested)

## [1] "factor"

## [1] "factor"

# The two date columns are not Date objects yet, so parse them as
# month/day/four-digit-year and then confirm the resulting class.
wnile$date.onset <- as.Date(wnile$date.onset, format = "%m/%d/%Y")
wnile$date.tested <- as.Date(wnile$date.tested, format = "%m/%d/%Y")
class(wnile$date.onset)
class(wnile$date.tested)

## [1] "Date"

## [1] "Date"

# Excellent. Now for something more challenging: build full date-times for the
# Oswego data from separate date and time columns.
oswego <- read.table(
  "http://www.medepi.net/data/oswego.txt",
  header = TRUE,
  sep = " ",
  na.strings = "."
)
# Every meal time is combined with the fixed date 4/18/1940.
temp <- paste("4/18/1940", oswego$meal.time)
exposure.time <- strptime(temp, format = "%m/%d/%Y %I:%M %p")
# Onset dates carry no year, so "/1940" is appended before adding the time.
temp1 <- paste(paste0(oswego$onset.date, "/1940"), oswego$onset.time)
outcome.time <- strptime(temp1, format = "%m/%d/%Y %I:%M %p")
head(outcome.time)

## [1] "1940-04-19 00:30:00 PST" "1940-04-19 00:30:00 PST"
## [3] "1940-04-19 00:30:00 PST" "1940-04-18 22:30:00 PST"
## [5] "1940-04-18 22:30:00 PST" "1940-04-19 02:00:00 PST"

Exercise 5

To demonstrate using existing functions, let's say people eat the fruit of each tree and judge it to be tasty or not tasty, and that their judgments are completely random (0 = not tasty, 1 = tasty). Let's fit a generalized linear model with tree age and circumference as predictors and see whether their associations with our random outcome variable, tastiness, are significant (most likely not, because I just sampled randomly from 0 and 1 to create the variable).

# NOTE(review): attach() places Orange's columns on the search path and is
# never detached in the visible code.
attach(Orange)
# Draw one random 0/1 label per row of Orange.
Tastyness <- sample(c(0, 1), size = nrow(Orange), replace = TRUE)
# Display the labels as a factor; the factor is only printed, not stored.
factor(Tastyness)

##  [1] 0 1 1 0 1 1 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0
## Levels: 0 1

# Count how many 0 (not tasty) and 1 (tasty) labels were drawn.
table(Tastyness) # optional check of how many of each value we got

## Tastyness
##  0  1 
## 14 21

# Show Orange side by side with the random labels. The combined table is only
# printed here; it is not assigned to a variable.
cbind(Orange,Tastyness)

##    Tree  age circumference Tastyness
## 1     1  118            30         0
## 2     1  484            58         1
## 3     1  664            87         1
## 4     1 1004           115         0
## 5     1 1231           120         1
## 6     1 1372           142         1
## 7     1 1582           145         0
## 8     2  118            33         0
## 9     2  484            69         1
## 10    2  664           111         1
## 11    2 1004           156         0
## 12    2 1231           172         1
## 13    2 1372           203         0
## 14    2 1582           203         1
## 15    3  118            30         0
## 16    3  484            51         0
## 17    3  664            75         1
## 18    3 1004           108         1
## 19    3 1231           115         1
## 20    3 1372           139         1
## 21    3 1582           140         0
## 22    4  118            32         0
## 23    4  484            62         1
## 24    4  664           112         1
## 25    4 1004           167         1
## 26    4 1231           179         0
## 27    4 1372           209         0
## 28    4 1582           214         1
## 29    5  118            30         1
## 30    5  484            49         1
## 31    5  664            81         1
## 32    5 1004           125         1
## 33    5 1231           142         1
## 34    5 1372           174         0
## 35    5 1582           177         0

# Logistic regression of the random label on tree age and circumference.
# The predictors are taken from Orange through `data =`, so the fit does not
# depend on attach(Orange) having been called. Tastyness is not a column of
# Orange and is looked up in the environment where the formula is created.
mod.1 <- glm(
  Tastyness ~ age + circumference,
  family = binomial,
  data = Orange
)
summary(mod.1)

## 
## Call:
## glm(formula = Tastyness ~ age + circumference, family = binomial)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.503  -1.337   0.943   0.997   1.153  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)    0.615176   0.790204    0.78     0.44
## age            0.000798   0.001774    0.45     0.65
## circumference -0.008130   0.015047   -0.54     0.59
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 47.111  on 34  degrees of freedom
## Residual deviance: 46.805  on 32  degrees of freedom
## AIC: 52.81
## 
## Number of Fisher Scoring iterations: 4

# Analysis-of-deviance table for mod.1, with terms added one at a time.
anova(mod.1)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Tastyness
## 
## Terms added sequentially (first to last)
## 
## 
##               Df Deviance Resid. Df Resid. Dev
## NULL                             34       47.1
## age            1    0.011        33       47.1
## circumference  1    0.294        32       46.8

# Based on our output, there appears to be no relationship between the predictors and the outcome, which is not surprising given that the outcome was sampled at random.

Exercise 6

Say here we want to use a for loop to create a new data frame of all the trees with circumference less than 100 of whatever unit, and find the mean age of the smaller trees.

# Collect the trees with circumference under 100 using a for loop.
# Each pass tests exactly one row; previously the loop ignored its index and
# re-ran the same whole-table subset() on every iteration.
trees <- as.data.frame(Orange)
is_small <- logical(nrow(trees)) # preallocated, FALSE until a row qualifies
for (i in seq_len(nrow(trees))) {
  # isTRUE() keeps a row with a missing circumference out, as subset() does.
  is_small[i] <- isTRUE(trees$circumference[i] < 100)
}
small.trees <- trees[is_small, ]
small.trees # Here is our new dataset of small trees.

##    Tree age circumference
## 1     1 118            30
## 2     1 484            58
## 3     1 664            87
## 8     2 118            33
## 9     2 484            69
## 15    3 118            30
## 16    3 484            51
## 17    3 664            75
## 22    4 118            32
## 23    4 484            62
## 29    5 118            30
## 30    5 484            49
## 31    5 664            81

# Average age across the rows kept in small.trees.
mean(small.trees$age) # here is the mean age of the new small trees dataset

## [1] 384.8

Exercise 7

First, let's make a simple function to add two numbers.

# Add two values with `+`.
#   x, y: the values to add.
# Returns x + y.
fun.1 <- function(x, y) x + y
# Quick check with 3 and 10.
fun.1(3, 10)

## [1] 13

#That was interesting, but not very useful. Let's make a function that draws a scatterplot of 2 variables in a data frame (first column on the x axis, second column on the y axis) and adds a fitted line (red) and a lowess curve (blue). The function acts on a data frame or matrix with two columns. For the example, we'll create a matrix from two randomly sampled normal distributions with slightly different parameters.
# Demo input: two independent normal samples with slightly different
# parameters, bound column-wise into a two-column matrix.
x <- rnorm(500, mean = 50, sd = 10)
y <- rnorm(500, mean = 60, sd = 12)
z <- cbind(x, y)

# Scatterplot of a two-column object with a least-squares line (red) and a
# lowess curve (blue).
#   x: two-column matrix or data frame; column 1 is drawn on the horizontal
#      axis and column 2 on the vertical axis.
# Called for its plotting side effect.
fun.2 <- function(x) {
  plot(x[, 1], x[, 2])
  # Regress the vertical variable on the horizontal one so the line matches
  # the plot's axes (the formula previously had the two columns swapped).
  abline(lm(x[, 2] ~ x[, 1]), col = "red")
  lines(lowess(x[, 1], x[, 2]), col = "blue")
}
fun.2(z)

plot of chunk unnamed-chunk-7

Exercise 8

Here we’ll just use the Orange dataset again because I’m curious if older trees have larger circumference.

# Fit circumference as a linear function of age, then draw the data with the
# fitted line and a legend.
mod.2 <- lm(circumference ~ age, data = Orange)
par(cex = 0.8)
plot(
  x = Orange$age,
  y = Orange$circumference,
  main = "Plot of Tree Size vs. Age",
  xlab = "Tree Age",
  ylab = "Tree Circumference"
)
abline(mod.2, col = "orange")
# Yep, definitely seems to have a strong correlation.
legend("topleft", legend = "Fitted Line", lty = 1, col = "orange")

plot of chunk unnamed-chunk-8

# Write the same figure to a PDF file. pdf() takes a file path as its first
# argument; the fitted model object was previously passed there by mistake.
pdf("tree_size_vs_age.pdf")
mod.2 <- lm(circumference ~ age, data = Orange)
par(cex = 0.8)
plot(
  Orange$age,
  Orange$circumference,
  main = "Plot of Tree Size vs. Age",
  xlab = "Tree Age",
  ylab = "Tree Circumference"
)
abline(mod.2, col = "orange")
legend("topleft", c("Fitted Line"), lty = 1, col = "orange")
dev.off() # Close the device so the plot is written out as a PDF file.

## pdf 
##   2

Exercise 9

Out of all of these fruits, we only want ones that end in the word fruit.

# Candidate fruit names.
fruits <- c(
  "grapefruit", "durian", "jackfruit", "apple", "pear", "grape", "banana",
  "apricot", "pomegranite", "fig", "breadfruit", "orange", "kiwi",
  "starfruit", "lychee", "pineapple", "mango", "papaya", "avocado",
  "watermelon", "dragonfruit", "nectarine", "cherry"
)
# Keep the names that consist of one or more letters followed by "fruit" at
# the very end of the string.
grep("^[A-Za-z]+fruit$", fruits, value = TRUE)

## [1] "grapefruit"  "jackfruit"   "breadfruit"  "starfruit"   "dragonfruit"

Exercise 10

# Divert console output to the project log while the job script runs.
sink("C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/proj1.log")
# The diversion is closed again once the script finishes, even if it stops
# with an error; previously the sink was left open, so all later output kept
# going to the log file. invisible() keeps source()'s return value from being
# printed.
invisible(tryCatch(
  source(
    "C:/Users/Toshiba L505-ES5018/Dropbox/PH251D/Project 1/job01.R",
    echo = TRUE
  ),
  finally = sink()
))

## 
## > y <- c(2, 4, 6, 9)
## 
## > x <- c(5, 8, 3, 5)
## 
## > z <- matrix(c(x, y), 2, 4)
## 
## > z
##      [,1] [,2] [,3] [,4]
## [1,]    5    3    2    6
## [2,]    8    5    4    9

PH251D Project 1

Sean Wu

Thursday, October 16, 2014