######### HOMEWORK 4, STOR 665: COMPUTING PART, DUE APR 11, 2016 #########

######### Student Name:                    Jie Huang

# INSTRUCTION: Edit this R file by adding your solution to the end of each 
# question. Your submitted file should run smoothly in R and provide all 
# required results. You are allowed to work with other students but the 
# homework should be in your own words. Identical solutions will receive a 0 
# in grade and will be investigated.

# This homework is an exercise in GLM estimation and inference.  We will
# build up R code that 
#1) implement the IRLS algorithm for Poisson GLM and 
#2) use the algorithm to analyze a real dataset. 

# We will be negligent in the sense that we will ignore precautions
# that a numerical analyst would build into his code, and we
# will not consider measures to improve numerical conditioning such as 
# pivoting (switching the order of the variables) either.



#----------------------------------------------------------------

# PROBLEM 1: Write a function "my.poisson.glm(X,y)" to take X as a matrix
# and y as a vector of counts. The function then computes a 
# list with the following named elements:

# a) "Coeffs": a matrix with one row per regression coefficient and columns
#    named "Coefficient", "Std.Err.Est", "z-Statistic", "P-Value"
# b) "Residual Deviance": a number, D
# c) "Pearson Residuals": the vector of residuals
# d) "Deviance Residuals": the vector of residuals
# e) "Anscombe Residuals": the vector of residuals

# You must use the ITLS algorithm in the function. The link function should be 
# log(). The only high-level function allowed is lm().

# SOLUTION: 
rm(list=ls(all=TRUE))   
    

my.poisson.glm <- function(X,y)
  {
  #so turns out that I don't know how to deal with observations with 0 count , becuase log(0) gives -Inf, so I substitute 0 with 1e-16
  for (i in 1:length(y)){
  if (y[i]==0) y[i] =1e-16}
  mu <- y
  eta <- log(mu)
  z <- eta + (y-mu)/mu                  #initial response
  w <- mu                               #initial weights
  data <- as.data.frame(cbind(z,X))
  lmod <- lm(z ~ ., weights = w, data)  #initial LS estimate of beta
  
  for (i in 1:10) {
    eta <- lmod$fit
    mu <- exp(eta)        #g-inverse of eta to update mu
    z <- eta + (y-mu)/mu  #updating the response
    w <- mu               #updating weights
    data <- as.data.frame(cbind(z,X))
    lmod <- lm(z ~ ., weights = w, data)
    #cat(i, coef(lmod),"\n")
  }
  
  # a) "Coeffs": a matrix with one row per regression coefficient and columns
  #    named "Coefficient", "Std.Err.Est", "z-Statistic", "P-Value"

  #Coefficient
  Coefficient <- coef(lmod)
  
  #standard error 
  xm <- model.matrix(lmod)
  wm <- diag(w)         
  Std.Err.Est <- sqrt(diag(solve(t(xm) %*% wm %*% xm)))
  
  #z-Statistic
  z <- Coefficient/Std.Err.Est
  
  #P-Value
  p <- 2*pnorm(-abs(z))
  
  matrixA <- cbind(Coefficient,Std.Err.Est,"z-Statistic"=z, "P-Value"=p)
  
  # b) "Residual Deviance": a number, D
  
  #D <- 2*sum(w*y*log(y/mu) - w*(y-mu)), not considering this weighted deviance
  D <- 2*sum(y*log(y/mu) - (y-mu))
  
  # c) "Pearson Residuals": the vector of residuals
  Rp <- (y-mu)/sqrt(mu)
  
  # d) "Deviance Residuals": the vector of residuals
  Rd <- sign(y-mu)*sqrt(2*y*log(y/mu) - 2*(y-mu))
  
  # e) "Anscombe Residuals": the vector of residuals
  Ra <- 1.5*(y^(2/3)-mu^(2/3))/mu^(1/6)
  
  returned = list("Coeffs"=matrixA, "Residual Deviance" = D, "Pearson Residuals" = Rp,"Deviance Residuals"=Rd,"Anscombe Residuals"=Ra)
  return(returned)
  }


#testing my code
# library(faraway)          
# attach(gala)
# gala <- gala[,-2]
# modp <- glm(Species ~ .,family=poisson, gala)
# summary(modp)$coef
# resid(modp, type='pearson')
# residuals(modp)  #default deviance residual
# my.poisson.glm(as.matrix(gala[,2:6]),gala[,1])
# 



#----------------------------------------------------------------

# PROBLEM 2: Description of the discoveries data. Start by loading the data:
  library(faraway)                  # Install this package if needed
  data(discoveries)                 # Load the discoveries data.
                                    # Type "help(discoveries)" for details.
  class(discoveries)                # What type is the dataset?
## [1] "ts"
# Plot the data and make a summary of data (average, max, min, etc). 

# Is there any special pattern (trend, seasonality)?

# SOLUTION:
require(graphics)
plot(discoveries, ylab = "Number of important discoveries",
     las = 1)
title(main = "discoveries data set")

hist(discoveries,breaks=10)

summary(discoveries)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     2.0     3.0     3.1     4.0    12.0

Minimue is 0, max is 12, median is 3, on average 3.1/year.

Seems like more discoveries happens between 1880-1900, large fluctations between years.

#----------------------------------------------------------------

# PROBLEM 3: We study if the association between the number of discoveries
# and time. We first make the variable
  years <- 1860:1959
# We now fit the Poisson GLM "my.poisson.glm()" with the canonical link.
# We shall use "discoveries" as the response and "years" as the predictor. 
# Make sure to include the intercept in the regression. Provide all the 
# output of the regression. Compare your output with that from R:
#   modl <- glm(discoveries~years,family=poisson(link = "log"))
#   summary(modl)
# In particular, provide the Pearson residuals and the Anscombe residuals
# from the fitted model "modl"

# SOLUTION:

modl <- glm(discoveries~years,family=poisson(link = "log"))
summary(modl)$coeffi
##                 Estimate  Std. Error   z value    Pr(>|z|)
## (Intercept) 11.354807042 3.775677181  3.007356 0.002635306
## years       -0.005360224 0.001981701 -2.704859 0.006833334
output<-my.poisson.glm(years,discoveries)
output
## $Coeffs
##              Coefficient Std.Err.Est z-Statistic     P-Value
## (Intercept) 11.354807042 3.775677278    3.007356 0.002635306
## X           -0.005360224 0.001981702   -2.704859 0.006833335
## 
## $`Residual Deviance`
## [1] 157.3158
## 
## $`Pearson Residuals`
## Time Series:
## Start = 1860 
## End = 1959 
## Frequency = 1 
##            1            2            3            4            5 
##  0.503382157 -0.487992073 -1.987813713 -0.973662626 -1.977187093 
##            6            7            8            9           10 
## -0.450516006 -0.949642592 -0.431797657  1.111217637 -1.438276183 
##           11           12           13           14           15 
## -0.917711110 -1.425092083 -0.901785150 -1.411948928 -0.366377613 
##           16           17           18           19           20 
## -0.357042956 -0.347710864  0.709021409 -0.854160904  0.206786728 
##           21           22           23           24           25 
##  0.217522246 -1.889124085 -0.822534876 -0.282452398  1.861338099 
##           26           27           28           29           30 
##  4.551634455 -0.254516256  3.520280338  3.000321807 -0.767414617 
##           31           32           33           34           35 
## -0.217293349  1.966907260  1.982044214 -0.736039841 -0.180095416 
##           36           37           38           39           40 
## -0.170799334  1.491675497 -0.704749658  0.411103547 -0.133626605 
##           41           42           43           44           45 
##  0.989662772 -0.673540471 -0.665750409  0.465026661 -1.776190147 
##           46           47           48           49           50 
##  0.486618746 -0.634637425  1.075772576 -0.619108507 -0.040752633 
##           51           52           53           54           55 
## -0.031467946  1.698811094  1.137509632  2.880125463  0.005669208 
##           56           57           58           59           60 
##  1.754497091  1.768449847 -1.715370771  1.211864715 -0.534005489 
##           61           62           63           64           65 
## -0.526292976 -0.518584243  1.852436652  0.089236687  0.692528728 
##           66           67           68           69           70 
##  0.703411161 -0.480095345 -0.472408132  0.736089087  2.553046243 
##           71           72           73           74           75 
##  1.361534331  0.163560968  0.172856090 -1.643367454 -0.418689565 
##           76           77           78           79           80 
## -0.411027954 -0.403369296 -1.010778727  0.228655228  0.856331373 
##           81           82           83           84           85 
## -0.372763088 -0.365118367 -0.980839150 -0.974872543 -0.968912938 
##           86           87           88           89           90 
## -0.334565162 -0.957014564  0.944241829  0.955260327  0.331117640 
##           91           92           93           94           95 
## -0.296427316 -0.927388184  0.999403729 -0.915584560 -0.909692634 
##           96           97           98           99          100 
## -0.903807242 -1.545125121 -1.540989557 -0.235514639 -1.532751607 
## 
## $`Deviance Residuals`
## Time Series:
## Start = 1860 
## End = 1959 
## Frequency = 1 
##   [1]  0.484201179 -0.510263534 -2.811193112 -1.076246099 -2.796164802
##   [6] -0.469546155 -1.047490420 -0.449301044  1.025127162 -1.714256895
##  [11] -1.009413787 -1.696747420 -0.990487807 -1.679313073 -0.379032506
##  [16] -0.369068311 -0.359122508  0.670800121 -0.934154344  0.203194992
##  [21]  0.213546410 -2.671624901 -0.896965088 -0.290012731  1.637198907
##  [26]  3.550362850 -0.260665229  2.856934602  2.489996239 -0.832578388
##  [31] -0.221784975  1.716701534  1.727987863 -0.796177481 -0.183187345
##  [36] -0.173581742  1.336186595 -0.760057450  0.396815799 -0.135333143
##  [41]  0.914879994 -0.724215257 -0.715297768  0.446717970 -2.511912195
##  [46]  0.466559066 -0.679798655  0.987330661 -0.662150955 -0.040912122
##  [51] -0.031563085  1.497135616  1.038572924  2.381682191  0.005666114
##  [56]  1.539411957  1.549939444 -2.425900609  1.099513359 -0.566283921
##  [61] -0.557667780 -0.549067954  1.612758267  0.088467265  0.651713363
##  [66]  0.661294323 -0.506311995 -0.497809131  0.689940786  2.133544758
##  [71]  1.219629126  0.160967902  0.169958824 -2.324072541 -0.438734893
##  [76] -0.430358888 -0.421998548 -1.158718616  0.223574385  0.793756667
##  [81] -0.388712997 -0.380430351 -1.120902533 -1.113385854 -1.105884587
##  [86] -0.347453486 -1.090928125  0.868076915  0.877298168  0.320423722
##  [91] -0.306575172 -1.053803766  0.914032078 -1.039059868 -1.031710435
##  [96] -1.024375958 -2.185136901 -2.179288331 -0.241950035 -2.167638110
## 
## $`Anscombe Residuals`
## Time Series:
## Start = 1860 
## End = 1959 
## Frequency = 1 
##            1            2            3            4            5 
##  0.484317671 -0.510442793 -2.981720569 -1.078311679 -2.965780639 
##            6            7            8            9           10 
## -0.469687307 -1.049410905 -0.449425362  1.026144295 -1.725840121 
##           11           12           13           14           15 
## -1.011151684 -1.708049616 -0.992138952 -1.690338901 -0.379108515 
##           16           17           18           19           20 
## -0.369138664 -0.359187494  0.671122679 -0.935562763  0.203205178 
##           21           22           23           24           25 
##  0.213558262 -2.833686127 -0.898225772 -0.290047579  1.641128568 
##           26           27           28           29           30 
##  3.579319744 -0.260690729  2.873897045  2.501997427 -0.833606215 
##           31           32           33           34           35 
## -0.221800843  1.721302403  1.732690394 -0.797086203 -0.183196378 
##           36           37           38           39           40 
## -0.173589447  1.338587433 -0.760856695  0.396895250 -0.135336832 
##           41           42           43           44           45 
##  0.915743560 -0.724914239 -0.715973075  0.446832696 -2.664285220 
##           46           47           48           49           50 
##  0.466690396 -0.680384634  0.988433733 -0.662695403 -0.040912227 
##           51           52           53           54           55 
## -0.031563133  1.500627742  1.039871620  2.393572612  0.005666114 
##           56           57           58           59           60 
##  1.543242093  1.553857335 -2.573056157  1.101075582 -0.566634676 
##           61           62           63           64           65 
## -0.558003662 -0.549389391  1.617230739  0.088468359  0.652087829 
##           66           67           68           69           70 
##  0.661686470 -0.506567409 -0.498052536  0.690389282  2.143131651 
##           71           72           73           74           75 
##  1.221819977  0.160974626  0.169966758 -2.465051181 -0.438904630 
##           76           77           78           79           80 
## -0.430519510 -0.422150389 -1.163115557  0.223592713  0.794457362 
##           81           82           83           84           85 
## -0.388832919 -0.380543064 -1.124941840 -1.117356071 -1.109786517 
##           86           87           88           89           90 
## -0.347540257 -1.094695865  0.869010539  0.878264083  0.320479139 
##           91           92           93           94           95 
## -0.306635558 -1.057249713  0.915134557 -1.042382468 -1.034972492 
##           96           97           98           99          100 
## -1.027578221 -2.317687681 -2.311484335 -0.241980338 -2.299127410
#compare the difference of resudials outputs from my code, with the R embreded code)
round(output$`Deviance Residuals`- residuals(modl), 1e-10)  #default deviance residual
## Time Series:
## Start = 1860 
## End = 1959 
## Frequency = 1 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  91  92  93  94  95  96  97  98  99 100 
##   0   0   0   0   0   0   0   0   0   0
round(output$`Pearson Residuals` - resid(modl, type='pearson'), 1e-10)
## Time Series:
## Start = 1860 
## End = 1959 
## Frequency = 1 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  91  92  93  94  95  96  97  98  99 100 
##   0   0   0   0   0   0   0   0   0   0
#----------------------------------------------------------------