LogisticRegression.r

setwd("D:/Class Materials & Work/Summer 2020 practice/Logistic Regression")
getwd()

## [1] "D:/Class Materials & Work/Summer 2020 practice/Logistic Regression"

#Prepare the data-----------------------------------------

#Import a dataframe
df <- read.csv("binary.csv", header = T)
str(df) #Check data structure

## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...

#Our aim is to build a model so that predict the probability of that student getting admit if we are given their profile.

sum(is.na(df)) #Check if there is any missing data

## [1] 0

summary(df)

##      admit             gre             gpa             rank      
##  Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
##  Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
##  Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
##  3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
##  Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000

#We can notice that there are a greater number of rejects than there are acceptance since the mean of variable admit is less than "0.5".

#We do this to check if the admits are distributed well enough in each category of rank. If let's say one rank has only 5 admit or reject information, then it will not be necessary to include that rank in analysis.
#Crosstabs

xtabs(~ admit +rank ,data=df)

##      rank
## admit  1  2  3  4
##     0 28 97 93 55
##     1 33 54 28 12

#Convert rank variable to factor

df$rank <- as.factor(df$rank)

#Run logit function-----------------------------------------

logit <- glm(admit ~ gre+gpa+rank,data=df,family="binomial")
summary(logit)

## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4

#Interpretation of each predictor
# 1.Each one-unit change in "gre" will increase the log odds of getting admit by 0.002, and its p-value indicates that it is somewhat significant in determining the admit.
# 2.Each unit increase in "GPA" increases the log odds of getting admit by 0.80 and p-value indicates that it is somewhat significant in determining the admit.
# 3.The interpretation of rank is different from one to another. Dropping from rank-2 college from rank-1 college will decrease the log odds of getting admit by -0.67. Likewise, going from rank-2 to rank-3 and rank-3 to rank-4 will decrease it by -1.340 and -1.551 respectively.
# 4.The difference between Null deviance and Residual deviance tells us that the model is a good fit. 
#   Greater the difference better the model. Null deviance is the value when you only have intercept in your equation with no variables and Residual deviance is the value when you are taking all the variables into account.
#   It makes sense to consider the model good if that difference is big enough.

#Predict a new sample
new.sample <- data.frame(gre=790,gpa=3.8,rank=as.factor("1"))
p <- predict(logit, new.sample)
p

##       1 
## 0.85426

#there is a chance of 85.42% that the new sample student will get admitted given their current GRE, GPA, and colege rank.

#We can use the confint function to obtain confidence intervals for the coefficient estimates.
confint(logit)

## Waiting for profiling to be done...

##                     2.5 %       97.5 %
## (Intercept) -6.2716202334 -1.792547080
## gre          0.0001375921  0.004435874
## gpa          0.1602959439  1.464142727
## rank2       -1.3008888002 -0.056745722
## rank3       -2.0276713127 -0.670372346
## rank4       -2.4000265384 -0.753542605

#We can also get CIs based on just the standard errors by using the default method.
confint.default(logit)

##                     2.5 %       97.5 %
## (Intercept) -6.2242418514 -1.755716295
## gre          0.0001202298  0.004408622
## gpa          0.1536836760  1.454391423
## rank2       -1.2957512650 -0.055134591
## rank3       -2.0169920597 -0.663415773
## rank4       -2.3703986294 -0.732528724

#Test the overall effect of "rank" using wald test-----------------------------------------
require(aod)

## Loading required package: aod

## Warning: package 'aod' was built under R version 4.0.2

#The order in which the coefficients are given in the table of coefficients is the same as the order of the terms in the model (see summary(logit)). 
#This is important because the wald.test function refers to the coefficients by their order in the model.

#b supplies the coefficients
#Sigma supplies the variance covariance matrix of the error terms
#Terms tells R which terms (column in the coefficient table) in the model are to be tested. In this case, terms 4, 5, and 6, are the three terms for the levels of rank 2, 3, and 4 respectively.

#Wald test for Rank
wald.test(b = coef(logit), Sigma = vcov(logit), Terms = 4:6)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011

#The chi-squared test statistic of 20.9, df = 3, and p = 0.00011 indicate that the overall effect of rank is statistically significant.

#Wald test for GPA
wald.test(b = coef(logit), Sigma = vcov(logit), Terms = 3)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 5.9, df = 1, P(> X2) = 0.015

#Chi-squared = 5.9, df = 1, and p = 0.015 indicate that the overall effect of GPA is statistically significant at Alpha 0.05.

#Wald test for GRE score
wald.test(b = coef(logit), Sigma = vcov(logit), Terms = 2)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 4.3, df = 1, P(> X2) = 0.038

#Chi-squared = 4.3, df = 1, and p = 0.038 indicate that the overall effect of GRE is statistically significant at Alpha 0.05.

#We can also test additional hypotheses about the differences in the coefficients for the different levels of rank.
#We will test if the coefficient of Rank 2 is equal to Rank 3.

l <- cbind(0, 0, 0, 1, -1, 0) #Creates a vector "l" that defines the test we want to perform.

#We want to test the difference of the terms for rank 2 and rank 3 (the 4th and 5th terms in the model).
#We multiply one of the rank by 1, another, -1, other irrelevant terms, 0.
#L = l means we want to test the vector "l" that we created above.

wald.test(b = coef(logit), Sigma = vcov(logit), L = l)

## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019

#Chi-squared = 5.5, df = 1, and p = 0.019 indicate that the coefficient difference between rank 2 and 3 is statistically significant at Alpha 0.05.

#exponentiate the coefficients and interpret them as odds-ratios.
exp(coef(logit))

## (Intercept)         gre         gpa       rank2       rank3       rank4 
##   0.0185001   1.0022670   2.2345448   0.5089310   0.2617923   0.2119375

#get odds ratios (OR) and their confidence intervals in the same table using "cbind"

exp(cbind(OR = coef(logit), confint(logit)))

## Waiting for profiling to be done...

##                    OR       2.5 %    97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre         1.0022670 1.000137602 1.0044457
## gpa         2.2345448 1.173858216 4.3238349
## rank2       0.5089310 0.272289674 0.9448343
## rank3       0.2617923 0.131641717 0.5115181
## rank4       0.2119375 0.090715546 0.4706961

#Interpretation: For a one unit increase in gpa, the odds of being admitted to graduate school (versus not being admitted) increase by a factor of 2.23. 
#More information on "https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faq-how-do-i-interpret-odds-ratios-in-logistic-regression/"

#Predicted Probabilities-----------------------------------------

#Predicted probabilities can be computed for both categorical and continuous predictor variables.

#Create a new data frame with the values we want the independent variables to take on to create our predictions.

#Calculating the predicted probability of admission at each value of rank, holding gre and gpa at their means.
newdata1 <- with(df, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1

##     gre    gpa rank
## 1 587.7 3.3899    1
## 2 587.7 3.3899    2
## 3 587.7 3.3899    3
## 4 587.7 3.3899    4

#Predicting
newdata1$rankProbability <- predict(logit, newdata = newdata1, type = "response")
newdata1

##     gre    gpa rank rankProbability
## 1 587.7 3.3899    1       0.5166016
## 2 587.7 3.3899    2       0.3522846
## 3 587.7 3.3899    3       0.2186120
## 4 587.7 3.3899    4       0.1846684

#Students from college rank 1 to 4 have probability ot getting admitted into graduate school of 51%, 35%, 21%, and 18%, given that they all have the same GRE score and GPA.

#Predicted Probabilities with a large dataset-----------------------------------------

#Try implementing the method above with a large data set and generate a plot.

#Creating  a table of predicted probabilities varying the value of gre and rank.
newdata2 <- with(df, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
                                    4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))

#Add standard error, residual, and model fit coefficient.

newdata3 <- cbind(newdata2, predict(logit, newdata = newdata2, type = "link", se = TRUE))

#Add Confidence Interval and Predicted probability

newdata3 <- within(newdata3, {
  PredictedProb <- plogis(fit)
  LL <- plogis(fit - (1.96 * se.fit))
  UL <- plogis(fit + (1.96 * se.fit))
})

head(newdata3)

##        gre    gpa rank        fit    se.fit residual.scale        UL        LL
## 1 200.0000 3.3899    1 -0.8114870 0.5147714              1 0.5492064 0.1393812
## 2 206.0606 3.3899    1 -0.7977632 0.5090986              1 0.5498513 0.1423880
## 3 212.1212 3.3899    1 -0.7840394 0.5034491              1 0.5505074 0.1454429
## 4 218.1818 3.3899    1 -0.7703156 0.4978239              1 0.5511750 0.1485460
## 5 224.2424 3.3899    1 -0.7565919 0.4922237              1 0.5518545 0.1516973
## 6 230.3030 3.3899    1 -0.7428681 0.4866494              1 0.5525464 0.1548966
##   PredictedProb
## 1     0.3075737
## 2     0.3105042
## 3     0.3134499
## 4     0.3164108
## 5     0.3193867
## 6     0.3223773

#Graph the predicted probability
require(ggplot2)

## Loading required package: ggplot2

ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
      ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank), size = 1)

#Find out the difference in deviance, degree of freedom, and p-value between the empty model (NULL) and our model.
with(logit, null.deviance - deviance)

## [1] 41.45903

with(logit, df.null - df.residual)

## [1] 5

with(logit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))

## [1] 7.578194e-08

#Chi-squared of 41.459, df = 5, and p-value < 0.001 indicate that our model as a whole fits significantly better than an empty model.

#For model's log likelihood.
logLik(logit)

## 'log Lik.' -229.2587 (df=6)

#Reference
#https://stats.idre.ucla.edu/r/dae/logit-regression/
#https://towardsdatascience.com/simply-explained-logistic-regression-with-example-in-r-b919acb1d6b3

LogisticRegression.r

tarid

2020-09-29