# https://ww2.coastal.edu/kingw/statistics/R-tutorials/logistic.html
library("MASS")
data(menarche)
str(menarche)
## 'data.frame': 25 obs. of 3 variables:
## $ Age : num 9.21 10.21 10.58 10.83 11.08 ...
## $ Total : num 376 200 93 120 90 88 105 111 100 93 ...
## $ Menarche: num 0 0 0 2 2 5 10 17 16 29 ...
summary(menarche)
## Age Total Menarche
## Min. : 9.21 Min. : 88.0 Min. : 0.00
## 1st Qu.:11.58 1st Qu.: 98.0 1st Qu.: 10.00
## Median :13.08 Median : 105.0 Median : 51.00
## Mean :13.10 Mean : 156.7 Mean : 92.32
## 3rd Qu.:14.58 3rd Qu.: 117.0 3rd Qu.: 92.00
## Max. :17.58 Max. :1049.0 Max. :1049.00
#?menarche
#data dictionary
#
# This data frame contains the following columns:
#
# Age
# Average age of the group. (The groups are reasonably age homogeneous.)
#
# Total
# Total number of children in the group.
#
# Menarche
# Number who have reached menarche
glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age, family=binomial(logit),
data=menarche)
str(glm.out)
## List of 30
## $ coefficients : Named num [1:2] -21.23 1.63
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "Age"
## $ residuals : Named num [1:25] -1.002 -1.01 -1.019 -0.413 -0.482 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ fitted.values : Named num [1:25] 0.00203 0.01031 0.0187 0.02786 0.04132 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ effects : Named num [1:25] -0.578 27.682 -1.069 -0.413 -0.593 ...
## ..- attr(*, "names")= chr [1:25] "(Intercept)" "Age" "" "" ...
## $ R : num [1:2, 1:2] -15.9 0 -206.5 17
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "Age"
## .. ..$ : chr [1:2] "(Intercept)" "Age"
## $ rank : int 2
## $ qr :List of 5
## ..$ qr : num [1:25, 1:2] -15.851 0.0901 0.0824 0.1137 0.1191 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:25] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "Age"
## ..$ rank : int 2
## ..$ qraux: num [1:2] 1.06 1.22
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-11
## ..- attr(*, "class")= chr "qr"
## $ family :List of 12
## ..$ family : chr "binomial"
## ..$ link : chr "logit"
## ..$ linkfun :function (mu)
## ..$ linkinv :function (eta)
## ..$ variance :function (mu)
## ..$ dev.resids:function (y, mu, wt)
## ..$ aic :function (y, n, mu, wt, dev)
## ..$ mu.eta :function (eta)
## ..$ initialize: expression({ if (NCOL(y) == 1) { if (is.factor(y)) y <- y != levels(y)[1L] n <- rep.int(1, nobs) y[weights == 0] <- 0 if (any(y < 0 | y > 1)) stop("y values must be 0 <= y <= 1") mustart <- (weights * y + 0.5)/(weights + 1) m <- weights * y if (any(abs(m - round(m)) > 0.001)) warning("non-integer #successes in a binomial glm!") } else if (NCOL(y) == 2) { if (any(abs(y - round(y)) > 0.001)) warning("non-integer counts in a binomial glm!") n <- y[, 1] + y[, 2] y <- ifelse(n == 0, 0, y[, 1]/n) weights <- weights * n mustart <- (n * y + 0.5)/(n + 1) } else stop("for the 'binomial' family, y must be a vector of 0 and 1's\nor a 2 column matrix where col 1 is no. successes and col 2 is no. failures") })
## ..$ validmu :function (mu)
## ..$ valideta :function (eta)
## ..$ simulate :function (object, nsim)
## ..- attr(*, "class")= chr "family"
## $ linear.predictors: Named num [1:25] -6.2 -4.56 -3.96 -3.55 -3.14 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ deviance : num 26.7
## $ aic : num 115
## $ null.deviance : num 3694
## $ iter : int 4
## $ weights : Named num [1:25] 0.763 2.041 1.707 3.25 3.565 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ prior.weights : Named num [1:25] 376 200 93 120 90 88 105 111 100 93 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ df.residual : int 23
## $ df.null : int 24
## $ y : Named num [1:25] 0 0 0 0.0167 0.0222 ...
## ..- attr(*, "names")= chr [1:25] "1" "2" "3" "4" ...
## $ converged : logi TRUE
## $ boundary : logi FALSE
## $ model :'data.frame': 25 obs. of 2 variables:
## ..$ cbind(Menarche, Total - Menarche): num [1:25, 1:2] 0 0 0 2 2 5 10 17 16 29 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:2] "Menarche" ""
## ..$ Age : num [1:25] 9.21 10.21 10.58 10.83 11.08 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language cbind(Menarche, Total - Menarche) ~ Age
## .. .. ..- attr(*, "variables")= language list(cbind(Menarche, Total - Menarche), Age)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
## .. .. .. .. ..$ : chr "Age"
## .. .. ..- attr(*, "term.labels")= chr "Age"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(cbind(Menarche, Total - Menarche), Age)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.2" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
## $ call : language glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit), data = menarche)
## $ formula :Class 'formula' language cbind(Menarche, Total - Menarche) ~ Age
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ terms :Classes 'terms', 'formula' language cbind(Menarche, Total - Menarche) ~ Age
## .. ..- attr(*, "variables")= language list(cbind(Menarche, Total - Menarche), Age)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
## .. .. .. ..$ : chr "Age"
## .. ..- attr(*, "term.labels")= chr "Age"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(cbind(Menarche, Total - Menarche), Age)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "nmatrix.2" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "cbind(Menarche, Total - Menarche)" "Age"
## $ data :'data.frame': 25 obs. of 3 variables:
## ..$ Age : num [1:25] 9.21 10.21 10.58 10.83 11.08 ...
## ..$ Total : num [1:25] 376 200 93 120 90 88 105 111 100 93 ...
## ..$ Menarche: num [1:25] 0 0 0 2 2 5 10 17 16 29 ...
## $ offset : NULL
## $ control :List of 3
## ..$ epsilon: num 1e-08
## ..$ maxit : num 25
## ..$ trace : logi FALSE
## $ method : chr "glm.fit"
## $ contrasts : NULL
## $ xlevels : Named list()
## - attr(*, "class")= chr [1:2] "glm" "lm"
names(glm.out)
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "call" "formula" "terms"
## [25] "data" "offset" "control"
## [28] "method" "contrasts" "xlevels"
class(glm.out)
## [1] "glm" "lm"
summary(glm.out)
##
## Call:
## glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit),
## data = menarche)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0363 -0.9953 -0.4900 0.7780 1.3675
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.22639 0.77068 -27.54 <2e-16 ***
## Age 1.63197 0.05895 27.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3693.884 on 24 degrees of freedom
## Residual deviance: 26.703 on 23 degrees of freedom
## AIC: 114.76
##
## Number of Fisher Scoring iterations: 4
plot(Menarche/Total ~ Age, data=menarche)
lines(menarche$Age, glm.out$fitted, type="l", col="red")
title(main="Menarche Data with Fitted Logistic Regression Line")
summary(glm.out)
##
## Call:
## glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit),
## data = menarche)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0363 -0.9953 -0.4900 0.7780 1.3675
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.22639 0.77068 -27.54 <2e-16 ***
## Age 1.63197 0.05895 27.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3693.884 on 24 degrees of freedom
## Residual deviance: 26.703 on 23 degrees of freedom
## AIC: 114.76
##
## Number of Fisher Scoring iterations: 4
#AIC - the smaller the AIC or BIC, the better the fit.
#2
# http://www.ats.ucla.edu/stat/r/dae/logit.htm
#A researcher is interested in how variables,
#such as GRE (Graduate Record Exam scores),
#GPA (grade point average) and
#prestige of the undergraduate institution,
#effect admission into graduate school.
#The response variable, admit/don't admit,
#is a binary variable.
#install.packages("aod")
library(aod)
library(ggplot2)

library(Rcpp)
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
head(mydata)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
str(mydata)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
summary(mydata)
## admit gre gpa rank
## Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000
## Median :0.0000 Median :580.0 Median :3.395 Median :2.000
## Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485
## 3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000
## Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
sapply(mydata, sd)
## admit gre gpa rank
## 0.4660867 115.5165364 0.3805668 0.9444602
xtabs(~ admit + rank, data = mydata)
## rank
## admit 1 2 3 4
## 0 28 97 93 55
## 1 33 54 28 12
mydata$rank <- factor(mydata$rank)
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial",
## data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
# #mtcars
# ?mtcars
# A data frame with 32 observations on 11 variables.
#
# [, 1] mpg Miles/(US) gallon
# [, 2] cyl Number of cylinders
# [, 3] disp Displacement (cu.in.)
# [, 4] hp Gross horsepower
# [, 5] drat Rear axle ratio
# [, 6] wt Weight (1000 lbs)
# [, 7] qsec 1/4 mile time
# [, 8] vs V/S
# [, 9] am Transmission (0 = automatic, 1 = manual)
# [,10] gear Number of forward gears
# [,11] carb Number of carburetors
am.glm = glm(formula=am ~ hp + wt,data=mtcars,family=binomial)
summary(am.glm)
##
## Call:
## glm(formula = am ~ hp + wt, family = binomial, data = mtcars)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2537 -0.1568 -0.0168 0.1543 1.3449
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 18.86630 7.44356 2.535 0.01126 *
## hp 0.03626 0.01773 2.044 0.04091 *
## wt -8.08348 3.06868 -2.634 0.00843 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.230 on 31 degrees of freedom
## Residual deviance: 10.059 on 29 degrees of freedom
## AIC: 16.059
##
## Number of Fisher Scoring iterations: 8
newdata = data.frame(hp=120, wt=2.8)
newdata
## hp wt
## 1 120 2.8
predict(am.glm, newdata, type="response")
## 1
## 0.6418125
#p-values of the hp and wt variables are both less than 0.05, neither hp or wt is insignificant in the logistic regression model.
#http://www.r-tutor.com/elementary-statistics/logistic-regression/estimated-logistic-regression-equation
#help(predict.glm)