#### Lesson 2 Cut-Leaf example
#### (I) Basic GOF line by line calculation 
#### (II) Doing GOF with chisq.test() 
#### (III) Nice R code that corresponds to SAS code and output
#########################################################

##(I) Basic GOF line by line computation to demonstrate formulas
ob=c(926,288,293,104) ## data
ex=1611*c(9,3,3,1)/16  ## expected counts under the assumed model

X2=sum((ob-ex)^2/ex) ## X2 statistic
X2
## [1] 1.468722
####  Output so you check against your output
#[1] 1.468722

1-pchisq(X2,3) ## p-value
## [1] 0.6895079
####  Output
#[1] 0.6895079

G2=2*sum(ob*log(ob/ex)) ## deviance statistic
G2
## [1] 1.477587
####  Output
#[1] 1.477587

1-pchisq(G2,3) ## pvalue
## [1] 0.6874529
####  Output
#[1] 0.6874529

########################################################
#### (II) Using the built-in chisq.test function in R
tomato=chisq.test(c(926, 288,293,104), p=c(9/16, 3/16, 3/16, 1/16))
 tomato
## 
##  Chi-squared test for given probabilities
## 
## data:  c(926, 288, 293, 104)
## X-squared = 1.4687, df = 3, p-value = 0.6895
 tomato$statistic
## X-squared 
##  1.468722
 tomato$p.value 
## [1] 0.6895079
 tomato$residuals
## [1]  0.6581581 -0.8091222 -0.5214343  0.3301172
#### To get G2
 
G2=2*sum(tomato$observed*log(tomato$observed/tomato$expected))
G2
## [1] 1.477587
1-pchisq(G2,3)
## [1] 0.6874529
##deviance residuals
devres=sign(tomato$observed-tomato$expected)*sqrt(abs(2*tomato$observed*log(tomato$observed/tomato$expected)))
devres
## [1]  6.328906 -5.240221 -4.224967  2.594764
##### Creating a nice ouput 
### Cell id, Observed count, Expected count, Pearson residuals, Deviance residual
out<-round(cbind(1:5, tomato$observed, tomato$expected, tomato$residuals, devres),3)
## Warning in cbind(1:5, tomato$observed, tomato$expected, tomato$residuals, :
## number of rows of result is not a multiple of vector length (arg 2)
out<-as.data.frame(out)
names(out)<-c("cell_j", "O_j", "E_j", "res_j", "dev_j")
out
##   cell_j O_j     E_j  res_j  dev_j
## 1      1 926 906.188  0.658  6.329
## 2      2 288 302.062 -0.809 -5.240
## 3      3 293 302.062 -0.521 -4.225
## 4      4 104 100.688  0.330  2.595
## 5      5 926 906.188  0.658  6.329
#### printing your table of results into a text file with tab separation
write.table(out, "tomato_Results", row.names=FALSE, col.names=TRUE, sep="\t")

#### Ploting expected and observed values
plot(c(1:4), tomato$observed, xlab="cell index", ylab="counts", xlim=c(0,5))
points(tomato$expected, pch=3, col="red")
legend("topright",c("observed", "expected"), col=c(1,"red"), pch=c(1,3),bty="n")

########################################################
#### (III) Nice R code that corresponds to SAS code and output
type=c(rep("tallc",926),rep("tallp",288),rep("dwarf",293),rep("dwarfp",104))
##Please note the table R provides has different order of rows 
##from that provided by SAS 
table(type)
## type
##  dwarf dwarfp  tallc  tallp 
##    293    104    926    288
## Freq Procedure
Percentage=100*as.vector(table(type))/sum(table(type))
CumulativeFrequency=cumsum(c(926,288,293,104))
CumulativePercentage=cumsum(Percentage)
Freq=data.frame(table(type),Percentage,CumulativeFrequency,CumulativePercentage)
Freq
##     type Freq Percentage CumulativeFrequency CumulativePercentage
## 1  dwarf  293  18.187461                 926             18.18746
## 2 dwarfp  104   6.455618                1214             24.64308
## 3  tallc  926  57.479826                1507             82.12291
## 4  tallp  288  17.877095                1611            100.00000
## Chi-Square Test for Specified Proportions
p=c(18.75,6.25,56.25,18.75)
chisq.test(table(type),p=p/sum(p))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(type)
## X-squared = 1.4687, df = 3, p-value = 0.6895
########################################################