logistic.R

# https://ww2.coastal.edu/kingw/statistics/R-tutorials/logistic.html

library("MASS")
data(menarche)
str(menarche)

## 'data.frame':    25 obs. of  3 variables:
##  $ Age     : num  9.21 10.21 10.58 10.83 11.08 ...
##  $ Total   : num  376 200 93 120 90 88 105 111 100 93 ...
##  $ Menarche: num  0 0 0 2 2 5 10 17 16 29 ...

summary(menarche)

##       Age            Total           Menarche      
##  Min.   : 9.21   Min.   :  88.0   Min.   :   0.00  
##  1st Qu.:11.58   1st Qu.:  98.0   1st Qu.:  10.00  
##  Median :13.08   Median : 105.0   Median :  51.00  
##  Mean   :13.10   Mean   : 156.7   Mean   :  92.32  
##  3rd Qu.:14.58   3rd Qu.: 117.0   3rd Qu.:  92.00  
##  Max.   :17.58   Max.   :1049.0   Max.   :1049.00

#?menarche

#data dictionary
# 
# This data frame contains the following columns:
#   
#   Age
# Average age of the group. (The groups are reasonably age homogeneous.)
# 
# Total
# Total number of children in the group.
# 
# Menarche
# Number who have reached menarche
glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age, family=binomial(logit),
              data=menarche)
str(glm.out)

## List of 30
##  $ coefficients     : Named num [1:2] -21.23 1.63
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "Age"
##  $ residuals        : Named num [1:25] -1.002 -1.01 -1.019 -0.413 -0.482 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ fitted.values    : Named num [1:25] 0.00203 0.01031 0.0187 0.02786 0.04132 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ effects          : Named num [1:25] -0.578 27.682 -1.069 -0.413 -0.593 ...
##   ..- attr(*, "names")= chr [1:25] "(Intercept)" "Age" "" "" ...
##  $ R                : num [1:2, 1:2] -15.9 0 -206.5 17
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "(Intercept)" "Age"
##   .. ..$ : chr [1:2] "(Intercept)" "Age"
##  $ rank             : int 2
##  $ qr               :List of 5
##   ..$ qr   : num [1:25, 1:2] -15.851 0.0901 0.0824 0.1137 0.1191 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:25] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "Age"
##   ..$ rank : int 2
##   ..$ qraux: num [1:2] 1.06 1.22
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-11
##   ..- attr(*, "class")= chr "qr"
##  $ family           :List of 12
##   ..$ family    : chr "binomial"
##   ..$ link      : chr "logit"
##   ..$ linkfun   :function (mu)  
##   ..$ linkinv   :function (eta)  
##   ..$ variance  :function (mu)  
##   ..$ dev.resids:function (y, mu, wt)  
##   ..$ aic       :function (y, n, mu, wt, dev)  
##   ..$ mu.eta    :function (eta)  
##   ..$ initialize:  expression({     if (NCOL(y) == 1) {         if (is.factor(y))              y <- y != levels(y)[1L]         n <- rep.int(1, nobs)         y[weights == 0] <- 0         if (any(y < 0 | y > 1))              stop("y values must be 0 <= y <= 1")         mustart <- (weights * y + 0.5)/(weights + 1)         m <- weights * y         if (any(abs(m - round(m)) > 0.001))              warning("non-integer #successes in a binomial glm!")     }     else if (NCOL(y) == 2) {         if (any(abs(y - round(y)) > 0.001))              warning("non-integer counts in a binomial glm!")         n <- y[, 1] + y[, 2]         y <- ifelse(n == 0, 0, y[, 1]/n)         weights <- weights * n         mustart <- (n * y + 0.5)/(n + 1)     }     else stop("for the 'binomial' family, y must be a vector of 0 and 1's\nor a 2 column matrix where col 1 is no. successes and col 2 is no. failures") })
##   ..$ validmu   :function (mu)  
##   ..$ valideta  :function (eta)  
##   ..$ simulate  :function (object, nsim)  
##   ..- attr(*, "class")= chr "family"
##  $ linear.predictors: Named num [1:25] -6.2 -4.56 -3.96 -3.55 -3.14 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ deviance         : num 26.7
##  $ aic              : num 115
##  $ null.deviance    : num 3694
##  $ iter             : int 4
##  $ weights          : Named num [1:25] 0.763 2.041 1.707 3.25 3.565 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ prior.weights    : Named num [1:25] 376 200 93 120 90 88 105 111 100 93 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ df.residual      : int 23
##  $ df.null          : int 24
##  $ y                : Named num [1:25] 0 0 0 0.0167 0.0222 ...
##   ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
##  $ converged        : logi TRUE
##  $ boundary         : logi FALSE
##  $ model            :'data.frame':   25 obs. of  2 variables:
##   ..$ cbind(Menarche, Total - Menarche): num [1:25, 1:2] 0 0 0 2 2 5 10 17 16 29 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "Menarche" ""
##   ..$ Age                              : num [1:25] 9.21 10.21 10.58 10.83 11.08 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language cbind(Menarche, Total - Menarche) ~ Age
##   .. .. ..- attr(*, "variables")= language list(cbind(Menarche, Total - Menarche), Age)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
##   .. .. .. .. ..$ : chr "Age"
##   .. .. ..- attr(*, "term.labels")= chr "Age"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(cbind(Menarche, Total - Menarche), Age)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.2" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
##  $ call             : language glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit),      data = menarche)
##  $ formula          :Class 'formula'  language cbind(Menarche, Total - Menarche) ~ Age
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ terms            :Classes 'terms', 'formula'  language cbind(Menarche, Total - Menarche) ~ Age
##   .. ..- attr(*, "variables")= language list(cbind(Menarche, Total - Menarche), Age)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
##   .. .. .. ..$ : chr "Age"
##   .. ..- attr(*, "term.labels")= chr "Age"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(cbind(Menarche, Total - Menarche), Age)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.2" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
##  $ data             :'data.frame':   25 obs. of  3 variables:
##   ..$ Age     : num [1:25] 9.21 10.21 10.58 10.83 11.08 ...
##   ..$ Total   : num [1:25] 376 200 93 120 90 88 105 111 100 93 ...
##   ..$ Menarche: num [1:25] 0 0 0 2 2 5 10 17 16 29 ...
##  $ offset           : NULL
##  $ control          :List of 3
##   ..$ epsilon: num 1e-08
##   ..$ maxit  : num 25
##   ..$ trace  : logi FALSE
##  $ method           : chr "glm.fit"
##  $ contrasts        : NULL
##  $ xlevels          : Named list()
##  - attr(*, "class")= chr [1:2] "glm" "lm"

names(glm.out)

##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "call"              "formula"           "terms"            
## [25] "data"              "offset"            "control"          
## [28] "method"            "contrasts"         "xlevels"

class(glm.out)

## [1] "glm" "lm"

summary(glm.out)

## 
## Call:
## glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit), 
##     data = menarche)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0363  -0.9953  -0.4900   0.7780   1.3675  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -21.22639    0.77068  -27.54   <2e-16 ***
## Age           1.63197    0.05895   27.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3693.884  on 24  degrees of freedom
## Residual deviance:   26.703  on 23  degrees of freedom
## AIC: 114.76
## 
## Number of Fisher Scoring iterations: 4

plot(Menarche/Total ~ Age, data=menarche)
lines(menarche$Age, glm.out$fitted, type="l", col="red")
title(main="Menarche Data with Fitted Logistic Regression Line")
summary(glm.out)

## 
## Call:
## glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit), 
##     data = menarche)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0363  -0.9953  -0.4900   0.7780   1.3675  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -21.22639    0.77068  -27.54   <2e-16 ***
## Age           1.63197    0.05895   27.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3693.884  on 24  degrees of freedom
## Residual deviance:   26.703  on 23  degrees of freedom
## AIC: 114.76
## 
## Number of Fisher Scoring iterations: 4

#AIC - the smaller the AIC or BIC, the better the fit.

#2

# http://www.ats.ucla.edu/stat/r/dae/logit.htm
#A researcher is interested in how variables, 
#such as GRE (Graduate Record Exam scores), 
#GPA (grade point average) and
#prestige of the undergraduate institution,
#effect admission into graduate school. 
#The response variable, admit/don't admit,
#is a binary variable.


#install.packages("aod")
library(aod)
library(ggplot2)

library(Rcpp)
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
head(mydata)

##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2

str(mydata)

## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...

summary(mydata)

##      admit             gre             gpa             rank      
##  Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
##  Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
##  Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
##  3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000

sapply(mydata, sd)

##       admit         gre         gpa        rank 
##   0.4660867 115.5165364   0.3805668   0.9444602

xtabs(~ admit + rank, data = mydata)

##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12

mydata$rank <- factor(mydata$rank)
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)

## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

# #mtcars
# ?mtcars
# A data frame with 32 observations on 11 variables.
# 
# [, 1]  mpg     Miles/(US) gallon
# [, 2]  cyl     Number of cylinders
# [, 3]  disp    Displacement (cu.in.)
# [, 4]  hp  Gross horsepower
# [, 5]  drat    Rear axle ratio
# [, 6]  wt  Weight (1000 lbs)
# [, 7]  qsec    1/4 mile time
# [, 8]  vs  V/S
# [, 9]  am  Transmission (0 = automatic, 1 = manual)
# [,10]  gear    Number of forward gears
# [,11]  carb    Number of carburetors


am.glm = glm(formula=am ~ hp + wt,data=mtcars,family=binomial)
summary(am.glm)

## 
## Call:
## glm(formula = am ~ hp + wt, family = binomial, data = mtcars)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2537  -0.1568  -0.0168   0.1543   1.3449  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) 18.86630    7.44356   2.535  0.01126 * 
## hp           0.03626    0.01773   2.044  0.04091 * 
## wt          -8.08348    3.06868  -2.634  0.00843 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.230  on 31  degrees of freedom
## Residual deviance: 10.059  on 29  degrees of freedom
## AIC: 16.059
## 
## Number of Fisher Scoring iterations: 8

newdata = data.frame(hp=120, wt=2.8)
newdata

##    hp  wt
## 1 120 2.8

predict(am.glm, newdata, type="response")

##         1 
## 0.6418125

#p-values of the hp and wt variables are both less than 0.05, neither hp or wt is insignificant in the logistic regression model.
#http://www.r-tutor.com/elementary-statistics/logistic-regression/estimated-logistic-regression-equation
#help(predict.glm)

logistic.R

dell

Sun Dec 18 11:08:08 2016