my_data<-read.csv("https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/GLM.herb.csv",sep=",")

head(my_data)
##   HerbDose Offspring
## 1        0        27
## 2        0        32
## 3        0        34
## 4        0        33
## 5        0        36
## 6        0        34

Question 1: Plot a suitable graph

plot(my_data$Offspring~my_data$HerbDose,main="Scatter Plot",xlab="Herbicide DOse",ylab="Zooplankton Offspring",pch=16,col=4)

Question 2: What data features can Poisson Regression accommodate that a regular regression cannot. Explain why Poisson regression is more appropriate for this data?

Question 3: Fit a Poisson regression, treating dose as factor. Perform test to indicate whether this model provides an adequate fit to the data?

model1<-glm(Offspring~factor(HerbDose),data=my_data,family = poisson)

summary(model1)
## 
## Call:
## glm(formula = Offspring ~ factor(HerbDose), family = poisson, 
##     data = my_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4641  -0.4339   0.0444   0.3164   3.0804  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          3.44681    0.05643  61.078  < 2e-16 ***
## factor(HerbDose)80   0.00318    0.07974   0.040    0.968    
## factor(HerbDose)160 -0.10395    0.08196  -1.268    0.205    
## factor(HerbDose)235 -0.60190    0.09486  -6.345 2.22e-10 ***
## factor(HerbDose)310 -1.65505    0.14089 -11.747  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 312.484  on 49  degrees of freedom
## Residual deviance:  50.719  on 45  degrees of freedom
## AIC: 297.81
## 
## Number of Fisher Scoring iterations: 5
1-pchisq(50.719,45)
## [1] 0.2582753

Question 4: Based on summary of the model, what is the highest dose where the number of offspring produced is not significant different from number produced at HerbDose=0? Explain what portion of output you are basing your conclusion on?

Question 5: Can a lower order polynomial provide an adequate fit? Select a polynomial model and include output that demonstrates the correct degree has been selected. Briefly summarize your conclusion in words.

poly1<- glm(Offspring~HerbDose,data=my_data,family = poisson)

poly2<-glm(Offspring~poly(HerbDose,2),data=my_data,family = poisson)

# comparing two models

anova(poly1,poly2,test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Offspring ~ HerbDose
## Model 2: Offspring ~ poly(HerbDose, 2)
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        48    127.589                          
## 2        47     55.871  1   71.718 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Comparing poly2 with model1

anova(poly2,model1,test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Offspring ~ poly(HerbDose, 2)
## Model 2: Offspring ~ factor(HerbDose)
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1        47     55.871                       
## 2        45     50.719  2   5.1524  0.07606 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model1,poly2,test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Offspring ~ factor(HerbDose)
## Model 2: Offspring ~ poly(HerbDose, 2)
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1        45     50.719                       
## 2        47     55.871 -2  -5.1524  0.07606 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 6: Create a diagnostic plot

par(mfrow=c(2,2))

plot(poly2)

Question 7: Any outliers? if so, comment on them in the context of problem

Question 8: Plot the data and add a line with model prediction from your polynomial model, predicting at doses 5 units apart over the range of the data.

\[ log(\hat{Offspring_i})= \alpha + \beta_1 . HerbDose_i +\beta_2 . \ HerbDose^2_i \]

# Create a range of data with 5 steps

HerbDose<-my_data$HerbDose

# Square the HerbDose 

HerbDose_sqrt<-HerbDose^2

# Create a dataframe 

df<-data.frame(HerbDose,HerbDose_sqrt)

# Predict or estimate expected offspring from new dataset `df`

offpring_pred<-predict(poly2,newdata=df,type="response")

y_pred<-data.frame(my_data,offpring_pred)

head(y_pred)
##   HerbDose Offspring offpring_pred
## 1        0        27      30.25903
## 2        0        32      30.25903
## 3        0        34      30.25903
## 4        0        33      30.25903
## 5        0        36      30.25903
## 6        0        34      30.25903
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
ggplot(data=y_pred,aes(x=HerbDose,y=Offspring)) + geom_point(col=3) + geom_line(aes(y=offpring_pred),color=2,lwd=1) + ggtitle("Scatter Plot")