Generalized Linear Models (Model Fit)

setwd('C:/Users/mchsu/Downloads/Mich/') 
Data<-read.csv("Imigration.csv")
M<-lm(Inc~En+Lit+US5,data=Data)
SumM<-summary(M)
print(SumM)

## 
## Call:
## lm(formula = Inc ~ En + Lit + US5, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5223 -0.2935  0.1564  0.7294  1.3823 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.575812   1.311500   1.964 0.058549 .  
## En           0.041500   0.023948   1.733 0.093054 .  
## Lit          0.079103   0.020128   3.930 0.000444 ***
## US5         -0.003091   0.020731  -0.149 0.882447    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.059 on 31 degrees of freedom
## Multiple R-squared:  0.7772, Adjusted R-squared:  0.7556 
## F-statistic: 36.05 on 3 and 31 DF,  p-value: 3.158e-10

names(SumM)

##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled"

Density Plot of Residuals

plot(density(SumM$residuals))

Normal Density Plot: set to default

setEPS(paper='special')

Plot a Normal Density: Encapsulated PostScript

postscript('NormalDensity.eps',width=7,height=5,pointsize=12)
x<-seq(-3.1,3.1,by=0.001)
plot(x,dnorm(x),type='l',axes=FALSE,ylab='',xlab='',ylim=c(0,dnorm(0)+.3))
axis(1,at=c(-3,-2,-1,0,1,2,3),labels=c(expression(mu-3*sigma),expression(mu-2*sigma),expression(mu-1*sigma),expression(mu), expression(mu+1*sigma),expression(mu+2*sigma),expression(mu+3*sigma)))
mtext(expression(paste('Density - f(y|' , mu,",",sigma,')           ',)),2)


lines(c(-1,-1),c(0,dnorm(0)+.07));lines(c(1,1),c(0,dnorm(0)+.07))
text(0,dnorm(0)+.03,'68% within',pos=3,cex=.8)
text(0,dnorm(0),'1 standard deviation',pos=3,cex=.8)
arrows(-.8,dnorm(0)+.04,-1,dnorm(0)+.04,length=.1)
arrows(.8,dnorm(0)+.04,1,dnorm(0)+.04,length=.1)#box()

lines(c(-2,-2),c(0,dnorm(0)+.15));lines(c(2,2),c(0,dnorm(0)+.15))
text(0,dnorm(0)+.12,'95% within',pos=3,cex=.8)
text(0,dnorm(0)+.09,'2 standard deviation',pos=3,cex=.8)
arrows(-.8,dnorm(0)+.13,-2,dnorm(0)+.13,length=.1)
arrows(.8,dnorm(0)+.13,2,dnorm(0)+.13,length=.1)#box()

lines(c(-3,-3),c(0,dnorm(0)+.25));lines(c(3,3),c(0,dnorm(0)+.25))
text(0,dnorm(0)+.22,'99.7% within',pos=3,cex=.8)
text(0,dnorm(0)+.19,'3 standard deviation',pos=3,cex=.8)
arrows(-.8,dnorm(0)+.23,-3,dnorm(0)+.23,length=.1)
arrows(.8,dnorm(0)+.23,3,dnorm(0)+.23,length=.1)#box()

### Plot a bernoulli and Binomial probability mass function
postscript('BinomialDensity.eps',width=8,height=5,pointsize=12)
par(mfrow=c(1,2))
x<- 0:1;
db<-dbinom(x,1,0.2)
plot(x,db,type='h',lwd=7,col='blue',xlab='',ylab='Density - Bernoulli(0.2)',axes=F,xlim=c(-0.2,1.2))
axis(1,at=c(0,1),label=c(0,1))
axis(2)
x<- 0:10;
db<-dbinom(x,10,0.2)
plot(x,db,type='h',lwd=7,col='blue',xlab='',ylab='Density - Binomial(10, 0.2)', axes=F,xlim=c(-0.2,10.4))
axis(1,at=0:10,label=0:10)
axis(1,at=10,label=10)
axis(2)
dev.off()

## png 
##   2

Concatenate and Print (Note: read the data as vector)

cat("1 396 1 568  1 1212 1 171 1 554  1 1104 1 257 1 435 1 295  1 397
   1 288 1 1004 1 431  1 795 1 1621 1 1378 1 902 1 958 1 1283 1 2415
   2 375 2 375  2 752  2 208 2 151  2 116  2 736 2 192 2 315  2 1252 
   2 675 2 700  2 440  2 771 2 688  2 426  2 410 2 979 2 377  2 503", file="data.txt")

Data1 <- scan("Data.txt")
Data1

##  [1]    1  396    1  568    1 1212    1  171    1  554    1 1104    1  257
## [15]    1  435    1  295    1  397    1  288    1 1004    1  431    1  795
## [29]    1 1621    1 1378    1  902    1  958    1 1283    1 2415    2  375
## [43]    2  375    2  752    2  208    2  151    2  116    2  736    2  192
## [57]    2  315    2 1252    2  675    2  700    2  440    2  771    2  688
## [71]    2  426    2  410    2  979    2  377    2  503

Transform the data “matrix()”

Data2<-matrix(Data1,ncol=2,byrow=TRUE)
Data<-data.frame(Data2)
names(Data)<-c('disease','cd4')
Data$disease<-factor(Data$disease)

1.Normal distribution for the response (CD4 count)

 N1 <-  glm(cd4 ~ disease, data=Data)  # Object N1 contains all components of the model

 names(N1) # get the names of the objcts

##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"

 #[1] "coefficients"      "residuals"         "fitted.values"     "effects"           "R"                 "rank"              "qr"                "family"           
# [9] "linear.predictors" "deviance"          "aic"               "null.deviance"     "iter"              "weights"           "prior.weights"     "df.residual"      
#[17] "df.null"           "y"                 "converged"         "boundary"          "model"             "call"              "formula"           "terms"            
#[25] "data"              "offset"            "control"           "method"            "contrasts"         "xlevels"    
  
  N1$family # What generalized linear model do we fit

## 
## Family: gaussian 
## Link function: identity

  #Family: gaussian 
  #Link function: identity 

  N1$contrasts  # Treatment constrats - as in SAS

## $disease
## [1] "contr.treatment"

  S1<- summary(N1) # Summary of the output
  
  names(S1)   # Get the components of S1

##  [1] "call"           "terms"          "family"         "deviance"      
##  [5] "aic"            "contrasts"      "df.residual"    "null.deviance" 
##  [9] "df.null"        "iter"           "deviance.resid" "coefficients"  
## [13] "aliased"        "dispersion"     "df"             "cov.unscaled"  
## [17] "cov.scaled"

#[1] "call"           "terms"          "family"         "deviance"       "aic"            "contrasts"      "df.residual"    "null.deviance"  "df.null"        "iter"          
#[11] "deviance.resid" "coefficients"   "aliased"        "dispersion"     "df"             "cov.unscaled"   "cov.scaled" 
  
  S1$coefficients  # Get the coefficients

##             Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)   823.20   100.8259  8.164567 6.937410e-10
## disease2     -301.15   142.5894 -2.112009 4.131601e-02

#            Estimate Std. Error   t value     Pr(>|t|)
#(Intercept)   823.20   100.8259  8.164567 6.937410e-10
#disease2     -301.15   142.5894 -2.112009 4.131601e-02

Construct the log(CD4 count): Lognormal Model for the CD4 counts

Data$LogCD4 <- log(Data$cd4)
  LN1 <- glm(LogCD4~disease, data=Data)

  SN1<-summary(LN1)
  SN1$coefficients   # Get coefficients stats

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  6.4869319  0.1500490 43.232102 6.346047e-34
## disease2    -0.3982919  0.2122013 -1.876953 6.821821e-02

  dev.res <- SN1$deviance.resid # get the devoaiance residuals
  qqnorm(dev.res)               # Normal q-q plot of the residuals

Normal Disribution with LOG link

  N2 <- glm(cd4 ~ disease, family=gaussian(link=log), data = Data)
  ### Other options
   #binomial(link = "logit")
   #gaussian(link = "identity")
   #Gamma(link = "inverse")
   #inverse.gaussian(link = "1/mu^2")
   #poisson(link = "log")
   #quasi(link = "identity", variance = "constant")    # Specify your own
   #quasibinomial(link = "logit")          # Overdispered and Underdispersed binomial
   #quasipoisson(link = "log")             # Overdispered and Underdispersed poisson

  
  N2$family         # what generalized linear model is fit

## 
## Family: gaussian 
## Link function: log

  #Family: gaussian 
  #Link function: log 
  SN2 <- summary(N2)
  dev.res2 <- SN2$deviance.resid # get the devoaiance residuals
  qqnorm(dev.res2)               # Normal q-q plot of the residuals

Poisson Distribution

 P1 <- glm(cd4 ~ disease, family=poisson(link = log), data=Data)
 
  P1$family         # what generalized linear model is fit

## 
## Family: poisson 
## Link function: log

  #Family: poisson 
  #Link function: log 
  SP1 <- summary(P1)
  dev.res3 <- SP1$deviance.resid # get the deviance residuals
  qqnorm(dev.res3)               # Normal q-q plot of the residuals

Negative Binomial Distribution

library(MASS)                              ### Need to upload a new library to fit the negative binomial model
 NB1 <- glm.nb(cd4 ~ disease, data=Data)
 
 SNB1<-summary(NB1)
 
  SNB1$family         # what generalized linear model is fit

## 
## Family: Negative Binomial(2.6962) 
## Link function: log

   #Family: Negative Binomial(2.6962) 
   #Link function: log 
  dev.res4 <- SNB1$deviance.resid # get the devoaiance residuals
  qqnorm(dev.res4)               # Normal q-q plot of the residuals

Generalized Linear Models (Model Fit)

Michelle C. Hsu

February 14, 2018