library(ISLR)
attach(Default)
#?Default
table(Default$default)
##
## No Yes
## 9667 333
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
library(ggplot2)
#install.packages("gridExtra")
library(gridExtra)
x<-qplot(x=balance,y=income,color=default,
shape=default,
geom='point')+scale_shape(solid=F)
y<-qplot(x=default,y=balance,fill=default,
geom = 'boxplot')+guides(fill=F)
z<-qplot(x=default,y=income,fill=default,
geom='boxplot')+guides(fill=F)
x

y

z

fit<-glm(default~income,data=Default,
family = "binomial")
summary(fit)
##
## Call:
## glm(formula = default ~ income, family = "binomial", data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2968 -0.2723 -0.2576 -0.2478 2.7111
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.094e+00 1.463e-01 -21.156 <2e-16 ***
## income -8.353e-06 4.207e-06 -1.985 0.0471 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 2916.7 on 9998 degrees of freedom
## AIC: 2920.7
##
## Number of Fisher Scoring iterations: 6
fit<-glm(default~income+balance+student,data=Default,
family = "binomial")
#install.packages("MASS")
library(MASS)
step<-stepAIC(fit)
## Start: AIC=1579.54
## default ~ income + balance + student
##
## Df Deviance AIC
## - income 1 1571.7 1577.7
## <none> 1571.5 1579.5
## - student 1 1579.0 1585.0
## - balance 1 2907.5 2913.5
##
## Step: AIC=1577.68
## default ~ balance + student
##
## Df Deviance AIC
## <none> 1571.7 1577.7
## - student 1 1596.5 1600.5
## - balance 1 2908.7 2912.7
###########Example 2################
data<-read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
View(data)
summary(data)
## admit gre gpa rank
## Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000
## Median :0.0000 Median :580.0 Median :3.395 Median :2.000
## Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485
## 3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000
## Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
str(data)
## 'data.frame': 400 obs. of 4 variables:
## $ admit: int 0 1 1 1 0 1 1 0 1 0 ...
## $ gre : int 380 660 800 640 520 760 560 400 540 700 ...
## $ gpa : num 3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
## $ rank : int 3 3 1 4 4 2 1 2 3 2 ...
data$rank<-as.factor(data$rank)
logit<-glm(admit~.,data=data,family="binomial")
summary(logit)
##
## Call:
## glm(formula = admit ~ ., family = "binomial", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
stepAIC(logit)
## Start: AIC=470.52
## admit ~ gre + gpa + rank
##
## Df Deviance AIC
## <none> 458.52 470.52
## - gre 1 462.88 472.88
## - gpa 1 464.53 474.53
## - rank 3 480.34 486.34
##
## Call: glm(formula = admit ~ gre + gpa + rank, family = "binomial",
## data = data)
##
## Coefficients:
## (Intercept) gre gpa rank2 rank3
## -3.989979 0.002264 0.804038 -0.675443 -1.340204
## rank4
## -1.551464
##
## Degrees of Freedom: 399 Total (i.e. Null); 394 Residual
## Null Deviance: 500
## Residual Deviance: 458.5 AIC: 470.5
##create new data
ranktest<-with(data,data.frame(gre=mean(gre),
gpa=mean(gpa),
rank=factor(1:4)))
ranktest
## gre gpa rank
## 1 587.7 3.3899 1
## 2 587.7 3.3899 2
## 3 587.7 3.3899 3
## 4 587.7 3.3899 4
#output<-predict(logit,ranktest)
#View(as.data.frame(output))
ranktest$Predicted<-predict(logit,ranktest,type='response')
#View(ranktest)
ranktest$PredictedOutcome<-ifelse(ranktest$Predicted>0.5,1,0)
View(ranktest)
##create new data
tmp<-with(data,data.frame(gre=rep(seq(200,800,length.out=100),4),
gpa=mean(gpa),
rank=factor(rep(1:4,each=100))))
probs<-cbind(tmp,predict(logit,newdata = tmp,
type='link',se=T))
probs<-within(probs,{
PredProb<-plogis(fit)#fit logistic curve
LL<-plogis(fit-(1.96*se.fit))#create lower limits
UL<-plogis(fit+(1.96*se.fit))#create upper limits
})
View(probs)
##plotting ribbon curve using calculated values
library(ggplot2)
ggplot(probs,aes(x=gre,y=PredProb))+
geom_ribbon(aes(ymin=LL,ymax=UL,fill=rank),alpha=.2)+
geom_line(aes(color=rank))
