#### Lesson 2 Cut-Leaf example
#### (I) Basic GOF line by line calculation
#### (II) Doing GOF with chisq.test()
#### (III) Nice R code that corresponds to SAS code and output
#########################################################
##(I) Basic GOF line by line computation to demonstrate formulas
ob=c(926,288,293,104) ## data
ex=1611*c(9,3,3,1)/16 ## expected counts under the assumed model
X2=sum((ob-ex)^2/ex) ## X2 statistic
X2
## [1] 1.468722
#### Output so you check against your output
#[1] 1.468722
1-pchisq(X2,3) ## p-value
## [1] 0.6895079
#### Output
#[1] 0.6895079
G2=2*sum(ob*log(ob/ex)) ## deviance statistic
G2
## [1] 1.477587
#### Output
#[1] 1.477587
1-pchisq(G2,3) ## pvalue
## [1] 0.6874529
#### Output
#[1] 0.6874529
########################################################
#### (II) Using the built-in chisq.test function in R
tomato=chisq.test(c(926, 288,293,104), p=c(9/16, 3/16, 3/16, 1/16))
tomato
##
## Chi-squared test for given probabilities
##
## data: c(926, 288, 293, 104)
## X-squared = 1.4687, df = 3, p-value = 0.6895
tomato$statistic
## X-squared
## 1.468722
tomato$p.value
## [1] 0.6895079
tomato$residuals
## [1] 0.6581581 -0.8091222 -0.5214343 0.3301172
#### To get G2
G2=2*sum(tomato$observed*log(tomato$observed/tomato$expected))
G2
## [1] 1.477587
1-pchisq(G2,3)
## [1] 0.6874529
##deviance residuals
devres=sign(tomato$observed-tomato$expected)*sqrt(abs(2*tomato$observed*log(tomato$observed/tomato$expected)))
devres
## [1] 6.328906 -5.240221 -4.224967 2.594764
##### Creating a nice ouput
### Cell id, Observed count, Expected count, Pearson residuals, Deviance residual
out<-round(cbind(1:5, tomato$observed, tomato$expected, tomato$residuals, devres),3)
## Warning in cbind(1:5, tomato$observed, tomato$expected, tomato$residuals, :
## number of rows of result is not a multiple of vector length (arg 2)
out<-as.data.frame(out)
names(out)<-c("cell_j", "O_j", "E_j", "res_j", "dev_j")
out
## cell_j O_j E_j res_j dev_j
## 1 1 926 906.188 0.658 6.329
## 2 2 288 302.062 -0.809 -5.240
## 3 3 293 302.062 -0.521 -4.225
## 4 4 104 100.688 0.330 2.595
## 5 5 926 906.188 0.658 6.329
#### printing your table of results into a text file with tab separation
write.table(out, "tomato_Results", row.names=FALSE, col.names=TRUE, sep="\t")
#### Ploting expected and observed values
plot(c(1:4), tomato$observed, xlab="cell index", ylab="counts", xlim=c(0,5))
points(tomato$expected, pch=3, col="red")
legend("topright",c("observed", "expected"), col=c(1,"red"), pch=c(1,3),bty="n")

########################################################
#### (III) Nice R code that corresponds to SAS code and output
type=c(rep("tallc",926),rep("tallp",288),rep("dwarf",293),rep("dwarfp",104))
##Please note the table R provides has different order of rows
##from that provided by SAS
table(type)
## type
## dwarf dwarfp tallc tallp
## 293 104 926 288
## Freq Procedure
Percentage=100*as.vector(table(type))/sum(table(type))
CumulativeFrequency=cumsum(c(926,288,293,104))
CumulativePercentage=cumsum(Percentage)
Freq=data.frame(table(type),Percentage,CumulativeFrequency,CumulativePercentage)
Freq
## type Freq Percentage CumulativeFrequency CumulativePercentage
## 1 dwarf 293 18.187461 926 18.18746
## 2 dwarfp 104 6.455618 1214 24.64308
## 3 tallc 926 57.479826 1507 82.12291
## 4 tallp 288 17.877095 1611 100.00000
## Chi-Square Test for Specified Proportions
p=c(18.75,6.25,56.25,18.75)
chisq.test(table(type),p=p/sum(p))
##
## Chi-squared test for given probabilities
##
## data: table(type)
## X-squared = 1.4687, df = 3, p-value = 0.6895
########################################################