######### HOMEWORK 4, STOR 665: COMPUTING PART, DUE APR 11, 2016 #########
######### Student Name: Jie Huang
# INSTRUCTION: Edit this R file by adding your solution to the end of each
# question. Your submitted file should run smoothly in R and provide all
# required results. You are allowed to work with other students but the
# homework should be in your own words. Identical solutions will receive a 0
# in grade and will be investigated.
# This homework is an exercise in GLM estimation and inference. We will
# build up R code that
#1) implement the IRLS algorithm for Poisson GLM and
#2) use the algorithm to analyze a real dataset.
# We will be negligent in the sense that we will ignore precautions
# that a numerical analyst would build into his code, and we
# will not consider measures to improve numerical conditioning such as
# pivoting (switching the order of the variables) either.
#----------------------------------------------------------------
# PROBLEM 1: Write a function "my.poisson.glm(X,y)" to take X as a matrix
# and y as a vector of counts. The function then computes a
# list with the following named elements:
# a) "Coeffs": a matrix with one row per regression coefficient and columns
# named "Coefficient", "Std.Err.Est", "z-Statistic", "P-Value"
# b) "Residual Deviance": a number, D
# c) "Pearson Residuals": the vector of residuals
# d) "Deviance Residuals": the vector of residuals
# e) "Anscombe Residuals": the vector of residuals
# You must use the ITLS algorithm in the function. The link function should be
# log(). The only high-level function allowed is lm().
# SOLUTION:
rm(list=ls(all=TRUE))
my.poisson.glm <- function(X,y)
{
#so turns out that I don't know how to deal with observations with 0 count , becuase log(0) gives -Inf, so I substitute 0 with 1e-16
for (i in 1:length(y)){
if (y[i]==0) y[i] =1e-16}
mu <- y
eta <- log(mu)
z <- eta + (y-mu)/mu #initial response
w <- mu #initial weights
data <- as.data.frame(cbind(z,X))
lmod <- lm(z ~ ., weights = w, data) #initial LS estimate of beta
for (i in 1:10) {
eta <- lmod$fit
mu <- exp(eta) #g-inverse of eta to update mu
z <- eta + (y-mu)/mu #updating the response
w <- mu #updating weights
data <- as.data.frame(cbind(z,X))
lmod <- lm(z ~ ., weights = w, data)
#cat(i, coef(lmod),"\n")
}
# a) "Coeffs": a matrix with one row per regression coefficient and columns
# named "Coefficient", "Std.Err.Est", "z-Statistic", "P-Value"
#Coefficient
Coefficient <- coef(lmod)
#standard error
xm <- model.matrix(lmod)
wm <- diag(w)
Std.Err.Est <- sqrt(diag(solve(t(xm) %*% wm %*% xm)))
#z-Statistic
z <- Coefficient/Std.Err.Est
#P-Value
p <- 2*pnorm(-abs(z))
matrixA <- cbind(Coefficient,Std.Err.Est,"z-Statistic"=z, "P-Value"=p)
# b) "Residual Deviance": a number, D
#D <- 2*sum(w*y*log(y/mu) - w*(y-mu)), not considering this weighted deviance
D <- 2*sum(y*log(y/mu) - (y-mu))
# c) "Pearson Residuals": the vector of residuals
Rp <- (y-mu)/sqrt(mu)
# d) "Deviance Residuals": the vector of residuals
Rd <- sign(y-mu)*sqrt(2*y*log(y/mu) - 2*(y-mu))
# e) "Anscombe Residuals": the vector of residuals
Ra <- 1.5*(y^(2/3)-mu^(2/3))/mu^(1/6)
returned = list("Coeffs"=matrixA, "Residual Deviance" = D, "Pearson Residuals" = Rp,"Deviance Residuals"=Rd,"Anscombe Residuals"=Ra)
return(returned)
}
#testing my code
# library(faraway)
# attach(gala)
# gala <- gala[,-2]
# modp <- glm(Species ~ .,family=poisson, gala)
# summary(modp)$coef
# resid(modp, type='pearson')
# residuals(modp) #default deviance residual
# my.poisson.glm(as.matrix(gala[,2:6]),gala[,1])
#
#----------------------------------------------------------------
# PROBLEM 2: Description of the discoveries data. Start by loading the data:
library(faraway) # Install this package if needed
data(discoveries) # Load the discoveries data.
# Type "help(discoveries)" for details.
class(discoveries) # What type is the dataset?
## [1] "ts"
# Plot the data and make a summary of data (average, max, min, etc).
# Is there any special pattern (trend, seasonality)?
# SOLUTION:
require(graphics)
plot(discoveries, ylab = "Number of important discoveries",
las = 1)
title(main = "discoveries data set")
hist(discoveries,breaks=10)
summary(discoveries)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 2.0 3.0 3.1 4.0 12.0
Minimue is 0, max is 12, median is 3, on average 3.1/year.
Seems like more discoveries happens between 1880-1900, large fluctations between years.
#----------------------------------------------------------------
# PROBLEM 3: We study if the association between the number of discoveries
# and time. We first make the variable
years <- 1860:1959
# We now fit the Poisson GLM "my.poisson.glm()" with the canonical link.
# We shall use "discoveries" as the response and "years" as the predictor.
# Make sure to include the intercept in the regression. Provide all the
# output of the regression. Compare your output with that from R:
# modl <- glm(discoveries~years,family=poisson(link = "log"))
# summary(modl)
# In particular, provide the Pearson residuals and the Anscombe residuals
# from the fitted model "modl"
# SOLUTION:
modl <- glm(discoveries~years,family=poisson(link = "log"))
summary(modl)$coeffi
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.354807042 3.775677181 3.007356 0.002635306
## years -0.005360224 0.001981701 -2.704859 0.006833334
output<-my.poisson.glm(years,discoveries)
output
## $Coeffs
## Coefficient Std.Err.Est z-Statistic P-Value
## (Intercept) 11.354807042 3.775677278 3.007356 0.002635306
## X -0.005360224 0.001981702 -2.704859 0.006833335
##
## $`Residual Deviance`
## [1] 157.3158
##
## $`Pearson Residuals`
## Time Series:
## Start = 1860
## End = 1959
## Frequency = 1
## 1 2 3 4 5
## 0.503382157 -0.487992073 -1.987813713 -0.973662626 -1.977187093
## 6 7 8 9 10
## -0.450516006 -0.949642592 -0.431797657 1.111217637 -1.438276183
## 11 12 13 14 15
## -0.917711110 -1.425092083 -0.901785150 -1.411948928 -0.366377613
## 16 17 18 19 20
## -0.357042956 -0.347710864 0.709021409 -0.854160904 0.206786728
## 21 22 23 24 25
## 0.217522246 -1.889124085 -0.822534876 -0.282452398 1.861338099
## 26 27 28 29 30
## 4.551634455 -0.254516256 3.520280338 3.000321807 -0.767414617
## 31 32 33 34 35
## -0.217293349 1.966907260 1.982044214 -0.736039841 -0.180095416
## 36 37 38 39 40
## -0.170799334 1.491675497 -0.704749658 0.411103547 -0.133626605
## 41 42 43 44 45
## 0.989662772 -0.673540471 -0.665750409 0.465026661 -1.776190147
## 46 47 48 49 50
## 0.486618746 -0.634637425 1.075772576 -0.619108507 -0.040752633
## 51 52 53 54 55
## -0.031467946 1.698811094 1.137509632 2.880125463 0.005669208
## 56 57 58 59 60
## 1.754497091 1.768449847 -1.715370771 1.211864715 -0.534005489
## 61 62 63 64 65
## -0.526292976 -0.518584243 1.852436652 0.089236687 0.692528728
## 66 67 68 69 70
## 0.703411161 -0.480095345 -0.472408132 0.736089087 2.553046243
## 71 72 73 74 75
## 1.361534331 0.163560968 0.172856090 -1.643367454 -0.418689565
## 76 77 78 79 80
## -0.411027954 -0.403369296 -1.010778727 0.228655228 0.856331373
## 81 82 83 84 85
## -0.372763088 -0.365118367 -0.980839150 -0.974872543 -0.968912938
## 86 87 88 89 90
## -0.334565162 -0.957014564 0.944241829 0.955260327 0.331117640
## 91 92 93 94 95
## -0.296427316 -0.927388184 0.999403729 -0.915584560 -0.909692634
## 96 97 98 99 100
## -0.903807242 -1.545125121 -1.540989557 -0.235514639 -1.532751607
##
## $`Deviance Residuals`
## Time Series:
## Start = 1860
## End = 1959
## Frequency = 1
## [1] 0.484201179 -0.510263534 -2.811193112 -1.076246099 -2.796164802
## [6] -0.469546155 -1.047490420 -0.449301044 1.025127162 -1.714256895
## [11] -1.009413787 -1.696747420 -0.990487807 -1.679313073 -0.379032506
## [16] -0.369068311 -0.359122508 0.670800121 -0.934154344 0.203194992
## [21] 0.213546410 -2.671624901 -0.896965088 -0.290012731 1.637198907
## [26] 3.550362850 -0.260665229 2.856934602 2.489996239 -0.832578388
## [31] -0.221784975 1.716701534 1.727987863 -0.796177481 -0.183187345
## [36] -0.173581742 1.336186595 -0.760057450 0.396815799 -0.135333143
## [41] 0.914879994 -0.724215257 -0.715297768 0.446717970 -2.511912195
## [46] 0.466559066 -0.679798655 0.987330661 -0.662150955 -0.040912122
## [51] -0.031563085 1.497135616 1.038572924 2.381682191 0.005666114
## [56] 1.539411957 1.549939444 -2.425900609 1.099513359 -0.566283921
## [61] -0.557667780 -0.549067954 1.612758267 0.088467265 0.651713363
## [66] 0.661294323 -0.506311995 -0.497809131 0.689940786 2.133544758
## [71] 1.219629126 0.160967902 0.169958824 -2.324072541 -0.438734893
## [76] -0.430358888 -0.421998548 -1.158718616 0.223574385 0.793756667
## [81] -0.388712997 -0.380430351 -1.120902533 -1.113385854 -1.105884587
## [86] -0.347453486 -1.090928125 0.868076915 0.877298168 0.320423722
## [91] -0.306575172 -1.053803766 0.914032078 -1.039059868 -1.031710435
## [96] -1.024375958 -2.185136901 -2.179288331 -0.241950035 -2.167638110
##
## $`Anscombe Residuals`
## Time Series:
## Start = 1860
## End = 1959
## Frequency = 1
## 1 2 3 4 5
## 0.484317671 -0.510442793 -2.981720569 -1.078311679 -2.965780639
## 6 7 8 9 10
## -0.469687307 -1.049410905 -0.449425362 1.026144295 -1.725840121
## 11 12 13 14 15
## -1.011151684 -1.708049616 -0.992138952 -1.690338901 -0.379108515
## 16 17 18 19 20
## -0.369138664 -0.359187494 0.671122679 -0.935562763 0.203205178
## 21 22 23 24 25
## 0.213558262 -2.833686127 -0.898225772 -0.290047579 1.641128568
## 26 27 28 29 30
## 3.579319744 -0.260690729 2.873897045 2.501997427 -0.833606215
## 31 32 33 34 35
## -0.221800843 1.721302403 1.732690394 -0.797086203 -0.183196378
## 36 37 38 39 40
## -0.173589447 1.338587433 -0.760856695 0.396895250 -0.135336832
## 41 42 43 44 45
## 0.915743560 -0.724914239 -0.715973075 0.446832696 -2.664285220
## 46 47 48 49 50
## 0.466690396 -0.680384634 0.988433733 -0.662695403 -0.040912227
## 51 52 53 54 55
## -0.031563133 1.500627742 1.039871620 2.393572612 0.005666114
## 56 57 58 59 60
## 1.543242093 1.553857335 -2.573056157 1.101075582 -0.566634676
## 61 62 63 64 65
## -0.558003662 -0.549389391 1.617230739 0.088468359 0.652087829
## 66 67 68 69 70
## 0.661686470 -0.506567409 -0.498052536 0.690389282 2.143131651
## 71 72 73 74 75
## 1.221819977 0.160974626 0.169966758 -2.465051181 -0.438904630
## 76 77 78 79 80
## -0.430519510 -0.422150389 -1.163115557 0.223592713 0.794457362
## 81 82 83 84 85
## -0.388832919 -0.380543064 -1.124941840 -1.117356071 -1.109786517
## 86 87 88 89 90
## -0.347540257 -1.094695865 0.869010539 0.878264083 0.320479139
## 91 92 93 94 95
## -0.306635558 -1.057249713 0.915134557 -1.042382468 -1.034972492
## 96 97 98 99 100
## -1.027578221 -2.317687681 -2.311484335 -0.241980338 -2.299127410
#compare the difference of resudials outputs from my code, with the R embreded code)
round(output$`Deviance Residuals`- residuals(modl), 1e-10) #default deviance residual
## Time Series:
## Start = 1860
## End = 1959
## Frequency = 1
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 91 92 93 94 95 96 97 98 99 100
## 0 0 0 0 0 0 0 0 0 0
round(output$`Pearson Residuals` - resid(modl, type='pearson'), 1e-10)
## Time Series:
## Start = 1860
## End = 1959
## Frequency = 1
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 91 92 93 94 95 96 97 98 99 100
## 0 0 0 0 0 0 0 0 0 0
#----------------------------------------------------------------