make gaussian glm function

gaussian_glm = 
  function(X,y,threshold = 0.01, max_iter = 1000000){
    cal_p = function(X,beta){
      beta = as.vector(beta)
      return(X%*%beta)
    }  
    beta = rep(0,ncol(X))
    diff = 10000 
    iter_count = 0
    while(diff > threshold ) #tests for convergence
    {
      #calculate probabilities using current estimate of beta
      p = as.vector(cal_p(X,beta))
      
      #the weights are constant
      #W does not exist (constant)
      
      #calculate the change in beta
      beta_change = solve(t(X)%*%X) %*% t(X)%*%(y - p)
      
      #update beta
      beta = beta + beta_change
      
      #calculate how much we changed beta by in this iteration 
      diff = sum(beta_change^2)
      
      #see if we've hit the maximum number of iterations
      iter_count = iter_count + 1
      if(iter_count > max_iter) {
        stop("This isn't converging.")
      }
    }
    #result of coef
    x1=colnames(X)[2]
    x2=colnames(X)[3]
    coef = c("(Intercept)" = round(beta[1],6), x1 = round(beta[2],6), x2 = round(beta[3],6))
    #sigma.hat=(t(y-X%*%beta)%*%(y-X%*%beta))/(nrow(y)-ncol(X)-1)
    sigma.hat=(t(y-X%*%beta)%*%(y-X%*%beta))/(nrow(y)-ncol(X))
    cov=as.numeric(sigma.hat)*solve(t(X)%*%X)
    std.error=sqrt(diag(cov))
    tvalue=coef/std.error
    pvalue=2*pt(-abs(tvalue),df=nrow(y)-ncol(X))
    result.coef=rbind(coef,std.error,tvalue,pvalue)
    
    #deviance residuals
    d<-c()
    for (i in 1:length(y)){
      d[i]=sign(y[i]-(X%*%beta)[i])*sqrt((y[i]-(X%*%beta)[i])^2)
    }
    result.d=c("Min"=round(min(d),3),"1Q"=round(quantile(d,0.25),3),"Median"=round(median(d),3),
               "3Q"=round(quantile(d,0.75),3),"Max"=round(max(d),3))
    form=paste("glm(formula=y~",colnames(X)[2],"+",colnames(X)[3],",family=gaussian)")
    return(list("Call"=form,"Deviance residuals:"=result.d, "Coefficients"=t(result.coef),
                "----"=paste0("(Dispersion parameter for binomial family taken to be ",round(sigma.hat,2),")"))) 
  }

Check if the function is same as the package

data("airquality")
airquality<-na.omit(airquality)
tempfit<-glm(Ozone~Wind+Temp,data=airquality,family="gaussian")
X=as.matrix(cbind(rep(1,111),airquality[,3:4]))
Y=as.matrix(airquality$Ozone)
gaussian_glm(X,Y)

## $Call
## [1] "glm(formula=y~ Wind + Temp ,family=gaussian)"
## 
## $`Deviance residuals:`
##     Min  1Q.25%  Median  3Q.75%     Max 
## -42.156 -13.216  -3.123  10.598  98.492 
## 
## $Coefficients
##                   coef  std.error    tvalue       pvalue
## (Intercept) -67.321953 23.6210451 -2.850084 5.236066e-03
## x1           -3.294839  0.6711482 -4.909257 3.261734e-06
## x2            1.827554  0.2505520  7.294110 5.294680e-11
## 
## $`----`
## [1] "(Dispersion parameter for binomial family taken to be 472.12)"

summary(tempfit)

## 
## Call:
## glm(formula = Ozone ~ Wind + Temp, family = "gaussian", data = airquality)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -42.156  -13.216   -3.123   10.598   98.492  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -67.3220    23.6210  -2.850  0.00524 ** 
## Wind         -3.2948     0.6711  -4.909 3.26e-06 ***
## Temp          1.8276     0.2506   7.294 5.29e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 472.12)
## 
##     Null deviance: 121802  on 110  degrees of freedom
## Residual deviance:  50989  on 108  degrees of freedom
## AIC: 1003.4
## 
## Number of Fisher Scoring iterations: 2

make Logistic glm function

Logistic_glm = 
  function(X,y,threshold = 0.001, max_iter = 100000){
   cal_p = function(X,beta){
    beta = as.vector(beta)
    return(exp(X%*%beta) / (1+ exp(X%*%beta)))
  }  
  beta = rep(0,ncol(X))
  diff = 10000 
  iter_count = 0
  while(diff > threshold ) #tests for convergence
  {
    #calculate probabilities using current estimate of beta
    p = as.vector(cal_p(X,beta))
    
    #calculate matrix of weights W
    W =  diag(p*(1-p)) 
    
    #calculate the change in beta
    beta_change = solve(t(X)%*%W%*%X) %*% t(X)%*%(y - p)
    
    #update beta
    beta = beta + beta_change
    
    #calculate how much we changed beta by in this iteration 
    diff = sum(beta_change^2)
    
    #see if we've hit the maximum number of iterations
    iter_count = iter_count + 1
    if(iter_count > max_iter) {
      stop("This isn't converging.")
    }
  }
  #result of coef
  x1=colnames(X)[2]
  x2=colnames(X)[3]
  coef = c("(Intercept)" = round(beta[1],6), x1 = round(beta[2],6), x2 = round(beta[3],6))
  cov=solve(t(X)%*%W%*%X)
  std.error=sqrt(diag(cov))
  zvalue=coef/std.error
  pvalue=2*pnorm(-abs(zvalue))
  result.coef=rbind(coef,std.error,zvalue,pvalue)

  #result of deviance residuals
  d<-c()
  for (i in 1:length(y)){
    if (y[i]==1){
      d[i]=sign(y[i]-(1*p[i]))*sqrt(2*y[i]*log(y[i]/(1*p[i])))
    } else if(y[i]==0){
      d[i]=sign(y[i]-(1*p[i]))*sqrt(2*(1-y[i])*log((1-y[i])/(1*(1-p[i]))))
    }
  }
  result.d=c("min"=min(d),"1Q"=quantile(d,0.25),"median"=median(d),
                      "3Q"=quantile(d,0.75),"max"=max(d))
  form=paste("glm(formula=y~",colnames(X)[2],"+",colnames(X)[3],",family=binomial)")
  return(list("Call"=form,"Deviance Residuals"=result.d,"Coefficients"=t(result.coef),
              "----"="(Dispersion parameter for binomial family taken to be 1)"))
  }

Check if the function is same as the package

record<-read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")[,c(1:3)]
recordfit <- glm(admit ~ gre+gpa, data = record, family = "binomial")
X=as.matrix(cbind(rep(1,400),record[c(2,3)]))
Y=as.matrix(record$admit)
Logistic_glm(X,Y)

## $Call
## [1] "glm(formula=y~ gre + gpa ,family=binomial)"
## 
## $`Deviance Residuals`
##        min     1Q.25%     median     3Q.75%        max 
## -1.2729816 -0.8988205 -0.7206340  1.3012952  2.0619874 
## 
## $Coefficients
##                  coef   std.error    zvalue       pvalue
## (Intercept) -4.949378 1.075088489 -4.603694 4.150630e-06
## x1           0.002691 0.001057488  2.544709 1.093687e-02
## x2           0.754687 0.319584760  2.361461 1.820308e-02
## 
## $`----`
## [1] "(Dispersion parameter for binomial family taken to be 1)"

summary(recordfit)

## 
## Call:
## glm(formula = admit ~ gre + gpa, family = "binomial", data = record)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2730  -0.8988  -0.7206   1.3013   2.0620  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.949378   1.075093  -4.604 4.15e-06 ***
## gre          0.002691   0.001057   2.544   0.0109 *  
## gpa          0.754687   0.319586   2.361   0.0182 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 480.34  on 397  degrees of freedom
## AIC: 486.34
## 
## Number of Fisher Scoring iterations: 4

GLM Assignment1

김민지

2018년 9월 23일

make gaussian glm function

Check if the function is same as the package

make Logistic glm function

Check if the function is same as the package