0. Setup

work.d<-"C:/Users/iihsk/Desktop/SeongJin Kim/6. ESC/2. Assignments/4. Week4 Assignment"
setwd(work.d)
ucla<-read.csv("binary.csv", header=T)
attach(ucla)
str(ucla)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
cor(ucla[,-1])
##             gre         gpa        rank
## gre   1.0000000  0.38426588 -0.12344707
## gpa   0.3842659  1.00000000 -0.05746077
## rank -0.1234471 -0.05746077  1.00000000
cor(gpa, gre)
## [1] 0.3842659
lm.gpa.gre<-lm(gre~gpa)         # simple linear regression of gre on gpa
plot(gpa, gre)
abline(lm.gpa.gre, col="red")   # add the fitted line to the scatterplot
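
As a quick sanity check, a minimal sketch: for a simple linear regression, R-squared is just the squared correlation, so summary(lm.gpa.gre)$r.squared should agree with cor(gpa, gre)^2.

# Sketch: R-squared of the simple regression equals the squared correlation.
cor(gpa, gre)^2
summary(lm.gpa.gre)$r.squared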

1. Constructing the Logistic Regression Model

llm.fit<-glm(admit~., data=ucla, family="binomial")   # logistic regression of admit on gre, gpa, rank
summary(llm.fit)
## 
## Call:
## glm(formula = admit ~ ., family = "binomial", data = ucla)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5802  -0.8848  -0.6382   1.1575   2.1732  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.449548   1.132846  -3.045  0.00233 ** 
## gre          0.002294   0.001092   2.101  0.03564 *  
## gpa          0.777014   0.327484   2.373  0.01766 *  
## rank        -0.560031   0.127137  -4.405 1.06e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 459.44  on 396  degrees of freedom
## AIC: 467.44
## 
## Number of Fisher Scoring iterations: 4
par(mfrow=c(2,2))
plot(llm.fit)   # standard residual diagnostic plots for the fitted glm
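
Since the coefficients are on the log-odds scale, a minimal sketch for interpretation: exponentiating them gives odds ratios, here with Wald intervals from confint.default() (profile-likelihood intervals from confint() would differ slightly).

# Sketch: read the coefficients as odds ratios, with Wald confidence intervals.
exp(cbind(OR = coef(llm.fit), confint.default(llm.fit)))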

2. Calculating odds and logits for each observation.

A. Probability of Y = 1 given the predictors

llm.probs<-predict(llm.fit, type="response")   # fitted P(admit = 1) for each observation
head(llm.probs, 4)
##         1         2         3         4 
## 0.1895527 0.3177807 0.7178136 0.1489492
odds.llm<-llm.probs/(1-llm.probs)          # odds of admission for each observation
logits.llm<-log(llm.probs/(1-llm.probs))   # log-odds (logits)
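
As a consistency check, a minimal sketch: the logits computed by hand should match the model's linear predictor, which predict() returns with type = "link".

# Sketch: the hand-computed log-odds equal the glm's linear predictor.
all.equal(unname(logits.llm), unname(predict(llm.fit, type="link")))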

3. Constructing the model with separate training data; testing its predictive power.

A. Refit the logistic regression model on a separate training set of 280 observations

set.seed(8783)
test = sample(1:nrow(ucla),3/10*nrow(ucla), replace=FALSE)   # hold out 30% (120 rows) as the test set
str(ucla[-test,])
## 'data.frame':    280 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 800 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 4 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 4 ...
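
A minimal check that the split has the intended sizes (30% of 400 held out):

# Sketch: confirm the 70/30 split.
length(test)          # 120 test observations
nrow(ucla[-test, ])   # 280 training observations
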
llm.fit<-glm(admit~., data=ucla[-test,],family="binomial")   # refit on the 280 training rows
summary(llm.fit)
## 
## Call:
## glm(formula = admit ~ ., family = "binomial", data = ucla[-test, 
##     ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4858  -0.8856  -0.6127   1.1420   2.1806  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.156955   1.404417  -2.960 0.003077 ** 
## gre          0.002249   0.001312   1.714 0.086547 .  
## gpa          0.952075   0.404758   2.352 0.018662 *  
## rank        -0.524506   0.147454  -3.557 0.000375 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 348.59  on 279  degrees of freedom
## Residual deviance: 317.98  on 276  degrees of freedom
## AIC: 325.98
## 
## Number of Fisher Scoring iterations: 4
attach(llm.fit)   # note: the glm object's 'rank' component masks ucla$rank
## The following object is masked from ucla:
## 
##     rank
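
For reference, a minimal sketch comparing the training-set coefficients with a full-data refit (stored under a new name, since llm.fit now holds the training-set model); the estimates are similar, though gre loses some significance on the smaller sample.

# Sketch: refit on all 400 rows under a different name and compare coefficients.
llm.full <- glm(admit ~ ., data = ucla, family = "binomial")
round(cbind(full = coef(llm.full), train = coef(llm.fit)), 4)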

B. Test the model’s predictive power on the test set.

drumroll<-ucla[test,]$admit                                 # actual admit values in the test set
llm.probs<-predict(llm.fit, ucla[test,], type="response")   # predicted probabilities on the test set
llm.pred=rep(0,3/10*nrow(ucla))                             # default prediction: not admitted
llm.pred[llm.probs>=0.5]=1                                  # predict admission when P >= 0.5
table(llm.pred, drumroll)                                   # confusion matrix: predicted vs. actual
##         drumroll
## llm.pred  0  1
##        0 77 33
##        1  4  6
mean(llm.pred!=drumroll)   # test-set misclassification rate
## [1] 0.3083333
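
For context, a minimal sketch comparing against a naive baseline and breaking the confusion matrix into class-wise rates: the test set contains 39 admits out of 120, so always predicting admit = 0 would err at a rate of about 0.325, only slightly worse than the model's 0.308, and the table shows the model identifies few of the actual admits.

# Sketch: baseline error and class-wise accuracy from the confusion matrix.
mean(drumroll != 0)                  # error rate of always predicting admit = 0
conf <- table(llm.pred, drumroll)
conf["1", "1"] / sum(conf[, "1"])    # sensitivity: actual admits correctly predicted (6/39)
conf["0", "0"] / sum(conf[, "0"])    # specificity: actual rejections correctly predicted (77/81)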