library(ISLR)
attach(Default)
#?Default
table(Default$default)
## 
##   No  Yes 
## 9667  333
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
summary(Default)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
library(ggplot2)
#install.packages("gridExtra")
library(gridExtra)
x<-qplot(x=balance,y=income,color=default,
         shape=default,
         geom='point')+scale_shape(solid=F)
y<-qplot(x=default,y=balance,fill=default,
         geom = 'boxplot')+guides(fill=F)
z<-qplot(x=default,y=income,fill=default,
         geom='boxplot')+guides(fill=F)
x

y

z

fit<-glm(default~income,data=Default,
         family = "binomial")
summary(fit)
## 
## Call:
## glm(formula = default ~ income, family = "binomial", data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2968  -0.2723  -0.2576  -0.2478   2.7111  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.094e+00  1.463e-01 -21.156   <2e-16 ***
## income      -8.353e-06  4.207e-06  -1.985   0.0471 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2916.7  on 9998  degrees of freedom
## AIC: 2920.7
## 
## Number of Fisher Scoring iterations: 6
fit<-glm(default~income+balance+student,data=Default,
         family = "binomial")
#install.packages("MASS")
library(MASS)
step<-stepAIC(fit)
## Start:  AIC=1579.54
## default ~ income + balance + student
## 
##           Df Deviance    AIC
## - income   1   1571.7 1577.7
## <none>         1571.5 1579.5
## - student  1   1579.0 1585.0
## - balance  1   2907.5 2913.5
## 
## Step:  AIC=1577.68
## default ~ balance + student
## 
##           Df Deviance    AIC
## <none>         1571.7 1577.7
## - student  1   1596.5 1600.5
## - balance  1   2908.7 2912.7
###########Example 2################
data<-read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
View(data)
summary(data)
##      admit             gre             gpa             rank      
##  Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
##  Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
##  Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
##  3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000
str(data)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
data$rank<-as.factor(data$rank)
logit<-glm(admit~.,data=data,family="binomial")
summary(logit)
## 
## Call:
## glm(formula = admit ~ ., family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4
stepAIC(logit)
## Start:  AIC=470.52
## admit ~ gre + gpa + rank
## 
##        Df Deviance    AIC
## <none>      458.52 470.52
## - gre   1   462.88 472.88
## - gpa   1   464.53 474.53
## - rank  3   480.34 486.34
## 
## Call:  glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = data)
## 
## Coefficients:
## (Intercept)          gre          gpa        rank2        rank3  
##   -3.989979     0.002264     0.804038    -0.675443    -1.340204  
##       rank4  
##   -1.551464  
## 
## Degrees of Freedom: 399 Total (i.e. Null);  394 Residual
## Null Deviance:       500 
## Residual Deviance: 458.5     AIC: 470.5
##create new data
ranktest<-with(data,data.frame(gre=mean(gre),
                               gpa=mean(gpa),
                               rank=factor(1:4)))
ranktest
##     gre    gpa rank
## 1 587.7 3.3899    1
## 2 587.7 3.3899    2
## 3 587.7 3.3899    3
## 4 587.7 3.3899    4
#output<-predict(logit,ranktest)
#View(as.data.frame(output))
ranktest$Predicted<-predict(logit,ranktest,type='response')
#View(ranktest)
ranktest$PredictedOutcome<-ifelse(ranktest$Predicted>0.5,1,0)
View(ranktest)
##create new data
tmp<-with(data,data.frame(gre=rep(seq(200,800,length.out=100),4),
                          gpa=mean(gpa),
                          rank=factor(rep(1:4,each=100))))
probs<-cbind(tmp,predict(logit,newdata = tmp,
                         type='link',se=T))
probs<-within(probs,{
  PredProb<-plogis(fit)#fit logistic curve
  LL<-plogis(fit-(1.96*se.fit))#create lower limits
  UL<-plogis(fit+(1.96*se.fit))#create upper limits
})
View(probs)
##plotting ribbon curve using calculated values
library(ggplot2)
ggplot(probs,aes(x=gre,y=PredProb))+
  geom_ribbon(aes(ymin=LL,ymax=UL,fill=rank),alpha=.2)+
  geom_line(aes(color=rank))