glm_models.R

getwd()

## [1] "/home/ajayohri/R/R Tutorial 2"

rm(list=ls())
gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 359518 19.3     592000 31.7   460000 24.6
## Vcells 552602  4.3    1023718  7.9   847241  6.5

ls()

## character(0)

library("MASS")
data(menarche)
summary(menarche)

##       Age            Total           Menarche      
##  Min.   : 9.21   Min.   :  88.0   Min.   :   0.00  
##  1st Qu.:11.58   1st Qu.:  98.0   1st Qu.:  10.00  
##  Median :13.08   Median : 105.0   Median :  51.00  
##  Mean   :13.10   Mean   : 156.7   Mean   :  92.32  
##  3rd Qu.:14.58   3rd Qu.: 117.0   3rd Qu.:  92.00  
##  Max.   :17.58   Max.   :1049.0   Max.   :1049.00

# ?menarche
# Description
# 
# Proportions of female children at various ages during adolescence who have reached menarche.
# 
# Usage
# 
# menarche
# Format
# 
# This data frame contains the following columns:
#   
# Age
# Average age of the group. (The groups are reasonably age homogeneous.)
# 
# Total
# Total number of children in the group.
# 
# Menarche
# Number who have reached menarche.
glm.out = glm(cbind(Menarche, Total-Menarche) ~ Age, family=binomial(logit), data=menarche)

plot(Menarche/Total ~ Age, data=menarche)
lines(menarche$Age, glm.out$fitted, type="l", col="red")
title(main="Menarche Data with Fitted Logistic Regression Line")
summary(glm.out)

## 
## Call:
## glm(formula = cbind(Menarche, Total - Menarche) ~ Age, family = binomial(logit), 
##     data = menarche)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0363  -0.9953  -0.4900   0.7780   1.3675  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -21.22639    0.77068  -27.54   <2e-16 ***
## Age           1.63197    0.05895   27.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3693.884  on 24  degrees of freedom
## Residual deviance:   26.703  on 23  degrees of freedom
## AIC: 114.76
## 
## Number of Fisher Scoring iterations: 4

##MTCARS

am.glm = glm(formula=am ~ hp + wt, data=mtcars, family=binomial)
summary(am.glm)

## 
## Call:
## glm(formula = am ~ hp + wt, family = binomial, data = mtcars)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2537  -0.1568  -0.0168   0.1543   1.3449  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) 18.86630    7.44356   2.535  0.01126 * 
## hp           0.03626    0.01773   2.044  0.04091 * 
## wt          -8.08348    3.06868  -2.634  0.00843 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.230  on 31  degrees of freedom
## Residual deviance: 10.059  on 29  degrees of freedom
## AIC: 16.059
## 
## Number of Fisher Scoring iterations: 8

newdata = data.frame(hp=140, wt=2.6)
predict(am.glm, newdata, type="response")

##         1 
## 0.9490708

###College Admissions
#install.packages("aod")
library(aod)
library(ggplot2)

library(Rcpp)
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
head(mydata)

##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2

summary(mydata)

##      admit             gre             gpa             rank      
##  Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
##  Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
##  Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
##  3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000

sapply(mydata, sd)

##       admit         gre         gpa        rank 
##   0.4660867 115.5165364   0.3805668   0.9444602

xtabs(~ admit + rank, data = mydata)

##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12

mydata$rank <- factor(mydata$rank)
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)

## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

p = predict(mylogit,
            mydata, 
            type="response")
p

##          1          2          3          4          5          6 
## 0.17262654 0.29217496 0.73840825 0.17838461 0.11835391 0.36996994 
##          7          8          9         10         11         12 
## 0.41924616 0.21700328 0.20073518 0.51786820 0.37431440 0.40020025 
##         13         14         15         16         17         18 
## 0.72053858 0.35345462 0.69237989 0.18582508 0.33993917 0.07895335 
##         19         20         21         22         23         24 
## 0.54022772 0.57351182 0.16122101 0.43727108 0.12837525 0.19204860 
##         25         26         27         28         29         30 
## 0.43759396 0.68229503 0.57848091 0.20475422 0.42307349 0.45829857 
##         31         32         33         34         35         36 
## 0.21765393 0.28583616 0.22481919 0.42494837 0.34296523 0.21293277 
##         37         38         39         40         41         42 
## 0.48413281 0.13931720 0.26569575 0.11942769 0.18975965 0.33567002 
##         43         44         45         46         47         48 
## 0.31560404 0.17702923 0.32817441 0.18025548 0.36121718 0.11699101 
##         49         50         51         52         53         54 
## 0.07235381 0.15047417 0.31488795 0.11624726 0.23936553 0.37838478 
##         55         56         57         58         59         60 
## 0.24045684 0.39213236 0.18283980 0.10853139 0.30472142 0.12837525 
##         61         62         63         64         65         66 
## 0.33078459 0.16742893 0.28289780 0.33295972 0.30988311 0.39645173 
##         67         68         69         70         71         72 
## 0.27784995 0.51681586 0.57206626 0.69436828 0.33966212 0.07486000 
##         73         74         75         76         77         78 
## 0.15073716 0.46607599 0.24284830 0.38139149 0.20415281 0.42494837 
##         79         80         81         82         83         84 
## 0.43570986 0.65251556 0.16456653 0.31150713 0.20517359 0.08776685 
##         85         86         87         88         89         90 
## 0.21358749 0.25126279 0.34584314 0.37549461 0.55783057 0.51131037 
##         91         92         93         94         95         96 
## 0.49978497 0.63809471 0.57000341 0.26968427 0.40010880 0.37907977 
##         97         98         99        100        101        102 
## 0.22063013 0.33002244 0.31762762 0.14640896 0.11633954 0.24114689 
##        103        104        105        106        107        108 
## 0.11883427 0.28100436 0.50126183 0.35394219 0.61241920 0.25695415 
##        109        110        111        112        113        114 
## 0.11218813 0.30904921 0.17869743 0.13603549 0.10881750 0.48942091 
##        115        116        117        118        119        120 
## 0.35153649 0.32780508 0.29004920 0.47768876 0.68922540 0.09863460 
##        121        122        123        124        125        126 
## 0.38205848 0.19283124 0.13456621 0.14161529 0.35890251 0.16784107 
##        127        128        129        130        131        132 
## 0.55353632 0.29761787 0.29364378 0.12270194 0.32900715 0.27429792 
##        133        134        135        136        137        138 
## 0.35016196 0.15167362 0.26397051 0.20956391 0.16855273 0.37076538 
##        139        140        141        142        143        144 
## 0.37104174 0.56147017 0.48592324 0.24487554 0.27496207 0.21702497 
##        145        146        147        148        149        150 
## 0.18326999 0.15292361 0.30053113 0.13202601 0.36278299 0.58590453 
##        151        152        153        154        155        156 
## 0.69607194 0.26076336 0.48793196 0.22533437 0.27701027 0.12691355 
##        157        158        159        160        161        162 
## 0.20243105 0.49385024 0.40979572 0.33767745 0.31214097 0.40081797 
##        163        164        165        166        167        168 
## 0.44572710 0.21536268 0.33209361 0.69237989 0.12564635 0.33881603 
##        169        170        171        172        173        174 
## 0.27253083 0.25713529 0.16766865 0.13610230 0.27045353 0.47601029 
##        175        176        177        178        179        180 
## 0.17207711 0.36543032 0.20079352 0.20929210 0.22290898 0.09702710 
##        181        182        183        184        185        186 
## 0.29173405 0.21592659 0.53390445 0.41213948 0.10284874 0.51016205 
##        187        188        189        190        191        192 
## 0.23875288 0.26184001 0.28313813 0.30160149 0.29894660 0.33797096 
##        193        194        195        196        197        198 
## 0.29780561 0.14252603 0.37361105 0.37499458 0.20306181 0.11520619 
##        199        200        201        202        203        204 
## 0.25867413 0.23203530 0.29790835 0.31450637 0.69237989 0.19176895 
##        205        206        207        208        209        210 
## 0.62160882 0.37552455 0.62994688 0.59336886 0.17269671 0.36867073 
##        211        212        213        214        215        216 
## 0.23500145 0.28417171 0.21145148 0.23806753 0.39069474 0.18303592 
##        217        218        219        220        221        222 
## 0.29144726 0.49458858 0.36532833 0.37499458 0.18691983 0.35841190 
##        223        224        225        226        227        228 
## 0.38346629 0.32549498 0.37234438 0.29200523 0.40539785 0.13119209 
##        229        230        231        232        233        234 
## 0.30562595 0.42917277 0.17040039 0.20845157 0.25212831 0.09688336 
##        235        236        237        238        239        240 
## 0.65921863 0.30806878 0.40979572 0.41039144 0.10815929 0.27465027 
##        241        242        243        244        245        246 
## 0.19001218 0.56239934 0.19616746 0.33794240 0.41996550 0.40736827 
##        247        248        249        250        251        252 
## 0.39171070 0.24596016 0.29657173 0.29278619 0.20011793 0.17414395 
##        253        254        255        256        257        258 
## 0.43247252 0.18780755 0.26200847 0.23371984 0.30267400 0.32075797 
##        259        260        261        262        263        264 
## 0.33944941 0.46187255 0.34863249 0.24298996 0.16969339 0.32075797 
##        265        266        267        268        269        270 
## 0.26562483 0.14378335 0.15865328 0.26021896 0.41492493 0.12579904 
##        271        272        273        274        275        276 
## 0.48994106 0.19310678 0.45641226 0.54337733 0.27302605 0.28684953 
##        277        278        279        280        281        282 
## 0.22143462 0.55028996 0.16945136 0.34384116 0.49925174 0.13172559 
##        283        284        285        286        287        288 
## 0.21874547 0.13337693 0.28021662 0.17925207 0.60122274 0.25502619 
##        289        290        291        292        293        294 
## 0.23197657 0.05878643 0.38047126 0.35008696 0.46240272 0.73372225 
##        295        296        297        298        299        300 
## 0.29885443 0.17659931 0.45483793 0.23950580 0.34785059 0.27566478 
##        301        302        303        304        305        306 
## 0.36288468 0.28067279 0.22671860 0.51860565 0.07198547 0.19060160 
##        307        308        309        310        311        312 
## 0.44561844 0.37054412 0.28373804 0.12588934 0.30028221 0.44520022 
##        313        314        315        316        317        318 
## 0.30907647 0.19322270 0.17701800 0.15412239 0.18491373 0.29806393 
##        319        320        321        322        323        324 
## 0.18670880 0.46755914 0.14630641 0.32183935 0.12035456 0.17486941 
##        325        326        327        328        329        330 
## 0.12112920 0.66498227 0.38597852 0.35450549 0.33926538 0.11370930 
##        331        332        333        334        335        336 
## 0.39213236 0.27905234 0.34097123 0.21344965 0.20393972 0.59795326 
##        337        338        339        340        341        342 
## 0.16520993 0.16070084 0.45158492 0.26006097 0.14037382 0.12659514 
##        343        344        345        346        347        348 
## 0.22560760 0.29075910 0.18859648 0.14657301 0.35132030 0.42636137 
##        349        350        351        352        353        354 
## 0.25767548 0.27488628 0.57858815 0.23714608 0.18120291 0.43779599 
##        355        356        357        358        359        360 
## 0.40050290 0.49758253 0.38909423 0.57487559 0.25063922 0.37007654 
##        361        362        363        364        365        366 
## 0.59956970 0.50972425 0.35412991 0.29777892 0.49491656 0.11836196 
##        367        368        369        370        371        372 
## 0.12645014 0.26745319 0.63170496 0.56803162 0.39857395 0.31708679 
##        373        374        375        376        377        378 
## 0.37650752 0.53085361 0.41142403 0.18735742 0.41512421 0.58958954 
##        379        380        381        382        383        384 
## 0.20223990 0.21896113 0.46366743 0.34602886 0.34967678 0.67275941 
##        385        386        387        388        389        390 
## 0.18665107 0.35189341 0.52842881 0.34287938 0.33908140 0.40275050 
##        391        392        393        394        395        396 
## 0.40093595 0.48719398 0.22202911 0.43872524 0.25342327 0.48866999 
##        397        398        399        400 
## 0.16550430 0.18106222 0.46366743 0.30073055

pred <- prediction(p, mydata$admit)
perf <- performance(pred, "tpr", "fpr")
plot(perf, col=rainbow(10))

a=sample(1:400,20,F)
a

##  [1]  45 159  18 230 365 119 153 262 265 275 378 111 244 286  83 237 314
## [18] 383 139 387

p2 = predict(mylogit,
            mydata[a,], 
            type="response")
p2

##         45        159         18        230        365        119 
## 0.32817441 0.40979572 0.07895335 0.42917277 0.49491656 0.68922540 
##        153        262        265        275        378        111 
## 0.48793196 0.24298996 0.26562483 0.27302605 0.58958954 0.17869743 
##        244        286         83        237        314        383 
## 0.33794240 0.17925207 0.20517359 0.40979572 0.19322270 0.34967678 
##        139        387 
## 0.37104174 0.52842881

pred2 <- prediction(p2, mydata[a,1])
#pred2
perf2 <- performance(pred2, "tpr", "fpr")
plot(perf2, col=rainbow(10))

auc <- performance(pred2,"auc")
auc

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.8535354
## 
## 
## Slot "alpha.values":
## list()

##Confusion Matrix
#https://gist.github.com/ryanwitt/2911560
confusion.glm <- function(data, model) {
  prediction <- ifelse(predict(model, data, type='response') > 0.5, TRUE, FALSE)
  confusion  <- table(prediction, as.logical(model$y))
  confusion  <- cbind(confusion, c(1 - confusion[1,1]/(confusion[1,1]+confusion[2,1]), 1 - confusion[2,2]/(confusion[2,2]+confusion[1,2])))
  confusion  <- as.data.frame(confusion)
  names(confusion) <- c('FALSE', 'TRUE', 'class.error')
  confusion
}

confusion.glm(mydata,mylogit)

##       FALSE TRUE class.error
## FALSE   254   97  0.06959707
## TRUE     19   30  0.76377953

glm_models.R

ajayohri

Fri Sep 23 22:13:08 2016