Binary outcomes as a generalization of the linear regression model to limited dependent variables

Estimating a binary outcome model using the lm() function.

library(ISLR)
data(Default)
names(Default)
## [1] "default" "student" "balance" "income"
## Convert the outcome variable "default" to numeric: Yes = 1, No = 0
# ifelse() logic: ifelse(condition, value if TRUE, value if FALSE); here, default == "Yes" is coded 1, anything else 0
Default$default <- ifelse(Default$default == "Yes", 1, 0)

# Take a look at the converted outcome variable
table(Default$default) # display freq. distribution by category
## 
##    0    1 
## 9667  333
lm.default <- lm(default ~ balance + income, data =  Default)
summary(lm.default)  # Yes, you still get estimates even though the DV is dichotomous: this is the linear probability model (LPM)
## 
## Call:
## lm(formula = default ~ balance + income, data = Default)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24607 -0.06989 -0.02663  0.02005  0.98554 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -9.224e-02  5.788e-03 -15.936  < 2e-16 ***
## balance      1.318e-04  3.514e-06  37.511  < 2e-16 ***
## income       4.605e-07  1.274e-07   3.613 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.168 on 9997 degrees of freedom
## Multiple R-squared:  0.1237, Adjusted R-squared:  0.1236 
## F-statistic: 705.8 on 2 and 9997 DF,  p-value: < 2.2e-16
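# A well-known caveat of fitting a binary DV with lm(): fitted values are not true probabilities and are not bounded by [0, 1]. A quick check (a suggested extra, not part of the original output):
range(predict(lm.default))   # for this model, some fitted values fall below 0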

GLM as an extension of the linear regression model

Estimating a linear regression model using the glm() function.

library(ISLR)
data(Auto)
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
## [6] "acceleration" "year"         "origin"       "name"
# We now estimate a glm without specifying its family/link argument. With a continuous DV, this reduces to the linear regression model (i.e., lm()) that you should already be familiar with
# mpg is a continuous variable
glm.mpg <- glm(mpg ~ weight + cylinders, data=Auto)
summary(glm.mpg)
## 
## Call:
## glm(formula = mpg ~ weight + cylinders, data = Auto)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.2923105  0.7939685  58.305   <2e-16 ***
## weight      -0.0063471  0.0005811 -10.922   <2e-16 ***
## cylinders   -0.7213779  0.2893780  -2.493   0.0131 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 18.52472)
## 
##     Null deviance: 23819.0  on 391  degrees of freedom
## Residual deviance:  7206.1  on 389  degrees of freedom
## AIC: 2261.7
## 
## Number of Fisher Scoring iterations: 2
# Verify the result with lm(); store it under a new name so the glm fit is kept
lm.mpg <- lm(mpg ~ weight + cylinders, data=Auto)
summary(lm.mpg)
## 
## Call:
## lm(formula = mpg ~ weight + cylinders, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.6469  -2.8282  -0.2905   2.1606  16.5856 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.2923105  0.7939685  58.305   <2e-16 ***
## weight      -0.0063471  0.0005811 -10.922   <2e-16 ***
## cylinders   -0.7213779  0.2893780  -2.493   0.0131 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.304 on 389 degrees of freedom
## Multiple R-squared:  0.6975, Adjusted R-squared:  0.6959 
## F-statistic: 448.4 on 2 and 389 DF,  p-value: < 2.2e-16
# Don't they look the same? They should: glm() with the default family (gaussian, identity link) estimates the same model as OLS when the dependent variable is continuous.
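# A quick numeric check that the two fits agree (possible because we kept both model objects):
all.equal(coef(glm.mpg), coef(lm.mpg))   # should return TRUE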

Estimating binary outcomes using OLS. The OLS model simply treats binary data as continuous data with a limited set of outcomes.

# Let's generate a new data column and append it to existing Auto dataframe
table(Auto$origin)
## 
##   1   2   3 
## 245  68  79
Auto$origin_recode <- ifelse(Auto$origin == 1, 1, 0)
# Take a look at the frequency distribution of the new data column
table(Auto$origin_recode)
## 
##   0   1 
## 147 245
# Estimating this binary outcome using OLS
recode.fit  <- lm(origin_recode ~ weight + cylinders, data=Auto)
summary(recode.fit)
## 
## Call:
## lm(formula = origin_recode ~ weight + cylinders, data = Auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81102 -0.32853 -0.07169  0.29570  0.71127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.071e-01  7.018e-02  -5.801 1.37e-08 ***
## weight       1.557e-04  5.137e-05   3.031   0.0026 ** 
## cylinders    1.039e-01  2.558e-02   4.062 5.88e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3804 on 389 degrees of freedom
## Multiple R-squared:  0.3872, Adjusted R-squared:  0.384 
## F-statistic: 122.9 on 2 and 389 DF,  p-value: < 2.2e-16
# lm() still works, although the estimated coefficients can be difficult to interpret; read as an LPM, each coefficient is the change in the probability that origin_recode equals 1.

Probit exercise

Estimate a probit model, interpret the coefficients, and plot the result.

## install the AER package and load the HMDA data
# install.packages("AER")
library(AER)
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
data(HMDA)

## Fit a probit model using the glm() function; note how the link function is specified
probit.fit <- glm(deny ~ pirat, 
                  family = binomial(link = "probit"), 
                  data = HMDA)
summary(probit.fit)
## 
## Call:
## glm(formula = deny ~ pirat, family = binomial(link = "probit"), 
##     data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1941     0.1378 -15.927  < 2e-16 ***
## pirat         2.9679     0.3858   7.694 1.43e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1663.6  on 2378  degrees of freedom
## AIC: 1667.6
## 
## Number of Fisher Scoring iterations: 6
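## To turn the probit index into a probability, pass it through the standard normal CDF, pnorm(). For example (a suggested check, not in the original output), at pirat = 0.3:
b <- coef(probit.fit)
pnorm(b[1] + b[2] * 0.3)   # predicted denial probability of roughly 0.10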
## Install the mfx package and run the same model with the coefficients estimated as marginal effects
# install.packages("mfx")
library(mfx)
## Warning: package 'mfx' was built under R version 4.4.3
## Loading required package: MASS
## Loading required package: betareg
## Warning: package 'betareg' was built under R version 4.4.3
# Use pirat (payment-to-income ratio) as our X
probit_mfx <- probitmfx(deny ~ pirat, data = HMDA)

## Check out what's inside the probit_mfx object. How would you interpret the marginal effects?
names(probit_mfx)
## [1] "mfxest" "fit"    "dcvar"  "call"
probit_mfx$mfxest
##           dF/dx  Std. Err.        z        P>|z|
## pirat 0.5678076 0.07359498 7.715303 1.206949e-14
## In fact, you can do the same with the glm model using the margins function
# install.packages("margins")
library(margins)
## Warning: package 'margins' was built under R version 4.4.3
margins(probit.fit, type = "response")  # Apply margins() to your probit glm output
## Average marginal effects
## glm(formula = deny ~ pirat, family = binomial(link = "probit"),     data = HMDA)
##   pirat
##  0.5665
## Interpretation ##
# The dF/dx for pirat means that a one-unit increase in pirat is associated with an increase of about 0.57 in the probability of being denied a mortgage, i.e., 56.7 percentage points (not 56.7%). The two numbers differ slightly because probitmfx reports the marginal effect at the mean (0.5678), while margins() averages the marginal effect over all observations (0.5665). Either way, probit results must be explained in probability terms.
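# Where does the 0.5678 come from? A minimal hand computation, assuming probitmfx's default of evaluating the effect at the mean of pirat:
b <- coef(probit.fit)
dnorm(b[1] + b[2] * mean(HMDA$pirat)) * b[2]   # phi(x'beta) * beta, roughly 0.568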

## You can plot the predicted outcome (Y = {0, 1}, the first plot) as well as the predicted effect (the second plot) of pirat, along with confidence intervals, using the cplot() function!
cplot(probit.fit, "pirat", what = "prediction", main = "Predicted denied, given P/I ratio", xlab = "P/I Index", ylab = "Predicted Values on denied status")

cplot(probit.fit, "pirat", what = "effect", main = "Average Marginal Effect of P/I ratio", xlab = "P/I Index")

# Plot the data on a bivariate plot
plot(x = HMDA$pirat, 
     y = HMDA$deny,
     main = "Probit Model of the Probability of Denial, Given Payment-to-Income Ratio",
     xlab = "P/I ratio",
     ylab = "Deny",
     pch = 20,  # pch sets the point shape ("plotting character")
     ylim = c(-0.4, 1.4), # gives the range (min, max) of y
     cex.main = 0.85)

# Add horizontal dashed lines and text
# Tell R where to place the horizontal lines at certain value(s) "h", line type "lty", and color "col"

abline(h = 1, lty = 2, col = "blue")
abline(h = 0, lty = 2, col = "darkred")
text(2.5, 0.9, cex = 0.8, "Mortgage denied")   # text(x, y, cex, label) places the label at the given x-y coordinates
text(2.5, -0.1, cex= 0.8, "Mortgage approved")

# Add estimated regression line
x <- seq(0, 3, 0.01)  # min (0) and max (3) x values by an increment of 0.01 
y <- predict(probit.fit, list(pirat = x), type = "response")  # predict (using the model you just fitted) on the x data, set type = "response" to get probability estimate

lines(x, y, lwd = 1.5, col = "steelblue")  # lwd = linewidth, set color to steelblue

Logit exercise

Working on the same data using logistic regression.

# install.packages("AER")
library(AER)
data(HMDA)
library(mfx)
library(margins)
# install.packages(c("plyr", "dplyr", "ggplot2", "modelr"))  # install multiple packages in one swoop
library(plyr)
library(dplyr)
library(modelr)


# Fit a univariate logit model; note how the link function is specified
logit.fit <- glm(deny ~ pirat, 
                  family = binomial(link = "logit"), 
                  data = HMDA)
# It is the same as the following: the binomial family uses logit as its default link function
logit.fit <- glm(deny ~ pirat, 
                  family = "binomial", 
                  data = HMDA)

summary(logit.fit)   # How would you interpret the coefficient?
## 
## Call:
## glm(formula = deny ~ pirat, family = "binomial", data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0284     0.2686 -14.999  < 2e-16 ***
## pirat         5.8845     0.7336   8.021 1.05e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1660.2  on 2378  degrees of freedom
## AIC: 1664.2
## 
## Number of Fisher Scoring iterations: 5
# There are quicker ways to access the logit.fit coefficient
coef(logit.fit)
## (Intercept)       pirat 
##   -4.028432    5.884498
# You can also use a similar function, logitmfx() from the mfx package, to estimate the marginal effects (measured in probability terms) of this logit model
# library(mfx)
logit_mfx <- logitmfx(deny ~ pirat, data = HMDA)
# Take a look at what's inside the logit_mfx output
names(logit_mfx)
## [1] "mfxest" "fit"    "dcvar"  "call"
# The marginal effect (in probability terms) of pirat is very similar in magnitude to our previous probit estimate: a one-unit increase in pirat increases the probability of being denied a mortgage by about 58 percentage points.
logit_mfx$mfxest
##           dF/dx  Std. Err.        z        P>|z|
## pirat 0.5801374 0.07053629 8.224665 1.957367e-16
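# The same number by hand, assuming logitmfx's default of evaluating the effect at the mean: for the logit, dF/dx = p(1-p)*beta
b <- coef(logit.fit)
p.bar <- plogis(b[1] + b[2] * mean(HMDA$pirat))  # predicted probability at mean pirat
p.bar * (1 - p.bar) * b[2]                       # roughly 0.58, matching dF/dx above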
## Odds-ratio coefficients
# Exponentiate the logit coefficients to transform them to the odds-ratio scale. How would you interpret the new coefficients?
logit.coef <- exp(logit.fit$coefficients)
logit.coef
##  (Intercept)        pirat 
##   0.01780222 359.42220584
# Because the exponentiated logit coefficient is on the odds-ratio scale, a one-unit increase in pirat multiplies the odds of being denied a mortgage (relative to being approved) by about 359.4, i.e., a 359.42220584 - 1 = 358.4-fold increase in the odds.
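# Because a full one-unit jump in pirat (a payment-to-income ratio) is enormous, a smaller increment can be more meaningful. For instance (a suggested extra, not in the original output), a 0.1 increase:
exp(coef(logit.fit)["pirat"] * 0.1)   # the odds are multiplied by roughly 1.8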

# You can also replace the "coefficients" of logit.fit output with the exponentiated logit coef. generated above using the following code
# logit.fit$coefficients <- exp(logit.fit$coefficients)
# However, the original s.e. of logit.fit won't be consistent with your exponentiated logit coef.


## What to do (and what NOT to do) with the calculation of odds-ratio logit s.e.

## The WRONG way (exponentiating the s.e. directly does NOT give the odds-ratio s.e.) ##
logit.se <- sqrt(diag(vcov(logit.fit)))
exp(logit.se)
## (Intercept)       pirat 
##    1.308100    2.082562
## The appropriate way to approximate odds-ratio s.e. ##
## For ease of data manipulation, it's better to first convert the glm output to data.frame format

library(broom)  # require broom package to use tidy() function
library(dplyr)  # require dplyr package to use mutate() function

logit.df <- tidy(logit.fit)  # Convert model to dataframe for the ease of manipulation

# Verify that the newly generated object is of tbl format
class(logit.df)
## [1] "tbl_df"     "tbl"        "data.frame"
logit.df %>% 
  mutate(or = exp(estimate),  # estimated beta converted to odds-ratio scale
         var.diag = diag(vcov(logit.fit)),  # Variance of each coefficient
         or.se = sqrt(or^2 * var.diag)) # odds-ratio s.e. via the delta method: se(OR) = OR * se(beta)
## # A tibble: 2 × 8
##   term        estimate std.error statistic  p.value       or var.diag     or.se
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)    -4.03     0.269    -15.0  7.43e-51   0.0178   0.0721   0.00478
## 2 pirat           5.88     0.734      8.02 1.05e-15 359.       0.538  264.
# %>% is the so-called "pipe" operator: it passes the previous output to the following function as its first argument
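# Equivalent delta-method one-liner for pirat (a quick check; it should match or.se above):
exp(coef(logit.fit)["pirat"]) * sqrt(diag(vcov(logit.fit)))["pirat"]   # roughly 264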

# Plot the predicted outcome and predicted effect of pirat

cplot(logit.fit, "pirat", what = "prediction", main = "Predicted denied, given P/I Index", xlab = "P/I Index", ylab = "Predicted Values on denied status")

cplot(logit.fit, "pirat", what = "effect", main = "Average Marginal Effect of P/I Index", xlab = "P/I Index")

## Plotting using ggplot2
# Require ggplot2 package
library(ggplot2)

HMDA %>%
  mutate(prob = ifelse(deny == "yes", 1, 0)) %>%   # mutate() means creating new data columns that are functions of existing variables
  ggplot(aes(pirat, prob)) +   # aes(x, y) means what should be on the x and y axis, pirat on the x-axis, prob on the y-axis
  geom_point(alpha = .15) +   # alpha is color transparency level
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +  # use GLM with logit link function
  ggtitle("Logistic regression model fit") +
  xlab("P/I Index") +
  ylab("Probability of being denied mortgage")
## `geom_smooth()` using formula = 'y ~ x'

Code your own logistic regression model :)

# Define the function and parameters inside the function

mle.logitreg = function(func, data)
{
    # Define the negative log likelihood function
    logl <- function(theta, x, y){
      x <- as.matrix(x)
      beta <- theta[1:ncol(x)]  # estimated coefficients need to be of the same length as the number of x

      # p is defined as the logistic transformation of a linear combination of variables, according to logit(p) = (x%*%beta)
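      # Derivation sketch: with p = 1/(1 + exp(-x%*%beta)),
      #   log(p)   = -log(1 + exp(-(x%*%beta)))
      #   log(1-p) = -log(1 + exp(x%*%beta)),
      # so sum(y*log(p) + (1-y)*log(1-p)) is exactly the expression below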

      loglik <- sum(-y*log(1 + exp(-(x%*%beta))) - (1-y)*log(1 + exp(x%*%beta)))  # The log-likelihood
      return(-loglik)   # optim() minimizes by default, so return the negative log-likelihood
    }

    # Preparing the data
    outcome = rownames(attr(terms(func),"factors"))[1] # Y variable
    dfrTmp = model.frame(data)
    x = as.matrix(model.matrix(func, data = dfrTmp)) # Creates a design matrix of a list of variables
    y = as.numeric(as.matrix(data[, match(outcome, colnames(data))])) # Set y to numeric class

    # Define initial values for the parameters
    theta.start = rep(0, (dim(x)[2]))  # Set all values to 0
    names(theta.start) = colnames(x)

    # Calculate the maximum likelihood using the built-in optim() function
    mle = optim(theta.start, logl, x = x, y = y, hessian = FALSE)  # optim(initial values, function, x, y, Hessian)

    # Obtain regression coefficients
    beta = mle$par
 
    # Calculate the Information matrix
    # The variance of a Bernoulli distribution is given by p(1-p)
    p = 1/(1 + exp(-x%*%beta))
    V = array(0, dim = c(dim(x)[1], dim(x)[1])) # Define vcov matrix corresponding to the dimension of x
    diag(V) = p*(1-p)  # Insert the estimated values into the diag 
    IB = t(x)%*%V%*%x   # The information matrix; its inverse is the vcov. t(x) is the transpose of x
 
    # Return estimates
    out = list(coef = beta, vcov = solve(IB), dev = 2*mle$value)
    return(out)
}


## Require the AER package and load the HMDA data
library(AER)
data(HMDA)

## Convert "deny" to numerical value
HMDA$deny <- ifelse(HMDA$deny == "yes", 1, 0)

## Subset the data to a new dataframe
HMDA.data <- HMDA[, c("deny", "pirat", "unemp", "mhist")]
myfunc <- as.formula("deny ~ pirat + unemp + mhist")

## Run the model and examine the estimated coef. results
logit.result <- mle.logitreg(myfunc, HMDA.data)
logit.result$coef
## (Intercept)       pirat       unemp      mhist2      mhist3      mhist4 
## -3.51218279  2.93980416  0.04961861  0.34377073 -0.50521530 -0.59834125
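## A suggested cross-check (not in the original output): verify the hand-rolled MLE against glm(); the coefficients should be very close
glm.check <- glm(myfunc, family = binomial(link = "logit"), data = HMDA.data)
coef(glm.check)
sqrt(diag(logit.result$vcov))   # standard errors from our information matrix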

Likelihood-ratio test

Fit two different models and compare their relative performance using a likelihood-ratio test.

# install.packages("AER")
library(AER)
data(HMDA)
## Fit two different specifications of a logit model
logit.fit1 <- glm(deny ~ pirat, 
                  family = binomial(link = "logit"), 
                  data = HMDA)
summary(logit.fit1)
## 
## Call:
## glm(formula = deny ~ pirat, family = binomial(link = "logit"), 
##     data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0284     0.2686 -14.999  < 2e-16 ***
## pirat         5.8845     0.7336   8.021 1.05e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1660.2  on 2378  degrees of freedom
## AIC: 1664.2
## 
## Number of Fisher Scoring iterations: 5
## For the second model, add two more variables to see whether they improve model fit, i.e., the likelihood of observing the outcome variable "deny"
logit.fit2 <- glm(deny ~ pirat + unemp + mhist, 
                  family = binomial(link = "logit"), 
                  data = HMDA)
summary(logit.fit2)
## 
## Call:
## glm(formula = deny ~ pirat + unemp + mhist, family = binomial(link = "logit"), 
##     data = HMDA)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.83807    0.33025 -14.650  < 2e-16 ***
## pirat        5.93442    0.75077   7.904 2.69e-15 ***
## unemp        0.04645    0.02979   1.559  0.11894    
## mhist2       0.80240    0.16578   4.840 1.30e-06 ***
## mhist3       0.95779    0.45114   2.123  0.03375 *  
## mhist4       1.43193    0.54554   2.625  0.00867 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1744.2  on 2379  degrees of freedom
## Residual deviance: 1628.2  on 2374  degrees of freedom
## AIC: 1640.2
## 
## Number of Fisher Scoring iterations: 5
## The verdict? Given the likelihood-ratio test result, should we reject the null hypothesis that model 1 explains the data just as well as model 2?
lrtest(logit.fit1, logit.fit2)
## Likelihood ratio test
## 
## Model 1: deny ~ pirat
## Model 2: deny ~ pirat + unemp + mhist
##   #Df  LogLik Df  Chisq Pr(>Chisq)    
## 1   2 -830.09                         
## 2   6 -814.08  4 32.023  1.892e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
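## The same statistic by hand: LR = 2 * (logLik of model 2 - logLik of model 1), compared against a chi-squared distribution with df = the difference in the number of parameters (6 - 2 = 4)
LR <- 2 * (as.numeric(logLik(logit.fit2)) - as.numeric(logLik(logit.fit1)))
LR                                      # roughly 32.02
pchisq(LR, df = 4, lower.tail = FALSE)  # roughly 1.89e-06, matching lrtest()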

Putting it all together

Fit a predictive model and predict outcomes by student status.

## load required packages
library(tidyverse)  # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)      # Allows pipelining; you may also use other packages
library(modelr)
library(broom)      # Tidy up model outputs
## 
## Attaching package: 'broom'
## 
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(ISLR)

default <- as_tibble(ISLR::Default)   # as_tibble() converts matrices and data.frame objects into tibbles, a modern data-frame format
## The double colon :: serves two purposes: first, it calls an object from a specific package's namespace; second, it prevents the function you need from being masked by a same-named function from another package
## check out the data
dim(default)
## [1] 10000     4
names(default)
## [1] "default" "student" "balance" "income"
head(default)
## # A tibble: 6 × 4
##   default student balance income
##   <fct>   <fct>     <dbl>  <dbl>
## 1 No      No         730. 44362.
## 2 No      Yes        817. 12106.
## 3 No      No        1074. 31767.
## 4 No      No         529. 35704.
## 5 No      No         786. 38463.
## 6 No      Yes        920.  7492.
## Convert text string to binary numerical coding
default$default <- ifelse(default$default == "Yes", 1, 0)
# default$student <- ifelse(default$student == "Yes", 1, 0)   # You may assign the converted value to a new variable to preserve original coding.

# Train-test split: here is one simple way to do it. We will do a 75-25 split

set.seed(123) # set seed for replication purpose
sample <- sample(c(TRUE, FALSE), nrow(default), replace = T, prob = c(0.75, 0.25))
training <- default[sample, ]
testing <- default[!sample, ]   # ! means negation: rows where sample is FALSE (i.e., not assigned to training) become the testing data
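# Sanity check (a suggested extra, not in the original output): the split should be roughly 75-25
prop.table(table(sample))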

## Fit two different models, note how the interaction term is generated and reported!

probit.train1 <- glm(default ~ income*student, family = binomial(link = "probit"), data = training)
summary(probit.train1)
## 
## Call:
## glm(formula = default ~ income * student, family = binomial(link = "probit"), 
##     data = training)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -2.047e+00  1.470e-01 -13.925   <2e-16 ***
## income             3.287e-06  3.523e-06   0.933    0.351    
## studentYes         1.329e-01  2.442e-01   0.544    0.586    
## income:studentYes  8.608e-06  1.098e-05   0.784    0.433    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2183.0  on 7564  degrees of freedom
## Residual deviance: 2167.4  on 7561  degrees of freedom
## AIC: 2175.4
## 
## Number of Fisher Scoring iterations: 6
probit.train2 <- glm(default ~ income*student + balance, family = binomial(link = "probit"), data = training)
summary(probit.train2)
## 
## Call:
## glm(formula = default ~ income * student + balance, family = binomial(link = "probit"), 
##     data = training)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.583e+00  2.908e-01 -19.198   <2e-16 ***
## income            -2.207e-06  5.154e-06  -0.428   0.6684    
## studentYes        -8.158e-01  3.623e-01  -2.252   0.0243 *  
## balance            2.980e-03  1.386e-04  21.503   <2e-16 ***
## income:studentYes  2.505e-05  1.635e-05   1.532   0.1256    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2183.0  on 7564  degrees of freedom
## Residual deviance: 1130.9  on 7560  degrees of freedom
## AIC: 1140.9
## 
## Number of Fisher Scoring iterations: 8
## Get marginal effects
library(mfx)

probit_mfx1 <- probitmfx(default ~ income*student, data = training)
probit_mfx1$mfxest
##                          dF/dx    Std. Err.         z     P>|z|
## income            2.359567e-07 2.526659e-07 0.9338686 0.3503717
## studentYes        1.004741e-02 1.942379e-02 0.5172736 0.6049652
## income:studentYes 6.179392e-07 7.879999e-07 0.7841869 0.4329305
probit_mfx2 <- probitmfx(default ~ income*student + balance, data =  training)
probit_mfx2$mfxest
##                           dF/dx    Std. Err.          z       P>|z|
## income            -4.140261e-09 9.725923e-09 -0.4256933 0.670331337
## studentYes        -1.152226e-03 6.007098e-04 -1.9181074 0.055097390
## balance            5.589860e-06 1.704551e-06  3.2793750 0.001040373
## income:studentYes  4.698041e-08 3.383341e-08  1.3885805 0.164960350
## Adjudicate their relative performance
lrtest(probit.train1, probit.train2)
## Likelihood ratio test
## 
## Model 1: default ~ income * student
## Model 2: default ~ income * student + balance
##   #Df   LogLik Df  Chisq Pr(>Chisq)    
## 1   4 -1083.72                         
## 2   5  -565.47  1 1036.5  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Predict on the testing data using the model trained on the training data. Note the order of arguments in predict(): predict("trained model object", "testing data", se.fit, type)
# "type" is the type of prediction required. The default is on the scale of the linear predictors; the alternative "response" is on the scale of the response variable.
# For a binomial model, the default predictions are therefore on the linear-predictor scale (log-odds for logit, the probit index here),
# and type = "response" gives the predicted probabilities.
# The "terms" option returns a matrix giving the fitted values of each term in the model formula on the linear predictor scale.

probit.pred1 <- predict(probit.train1, testing, se.fit = TRUE, type = "response")

# Plot predicted value with confidence intervals
# Check out what's inside predicted values
names(probit.pred1)
## [1] "fit"            "se.fit"         "residual.scale"
# "with" is a handy way to produce a new dataframe out of an existing R model output. Here we generate a new dataframe including predicted values, lower and upper bounds
plottable <- with(probit.pred1, 
                  data.frame(testing, 
                             fit = probit.pred1$fit,
                             lwr = (probit.pred1$fit + 2*probit.pred1$se.fit),
                             upr = (probit.pred1$fit - 2*probit.pred1$se.fit)))


# Wrangling the plot with THE ggplot2 package

library(ggplot2)
theme_set(theme_minimal())   # Replace the default 'mouldy waffle' background

ggplot(plottable, aes(x = income, y = fit, col = student, fill = student)) +   # Plot predicted values colored by student status: income on the x-axis, predicted default probability on the y-axis
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4, colour = NA) +  # geom_ribbon() displays the y interval defined by ymin and ymax, i.e., your approximate 95% lower and upper bounds
  geom_line(linewidth = 1, alpha = 0.4) +   # alpha is the color transparency level; we need it to highlight the overlap between the two regression lines (linewidth replaces the deprecated size aesthetic for lines)
  scale_color_brewer(palette="Set1") +  # Replace default colors and fill
  scale_fill_brewer(palette="Set1") +
  labs(x = "Income", y = "Probability of Default", title = "The Effect of Income on Default Probability by Student Status")

## Exercise (I'll leave it to you)
# Fit a logit model with the same set of variables and plot it by student status

Supplementary materials

Likelihood-ratio test for linear regression models

## Define optim.table; this has been done for you :)
optim.table = function(x, varnames=NULL,N,d=4){
    k=length(x$par)
    Parameter=matrix(x$par,k,1)
    StdError=matrix(sqrt(diag(solve(-1 * x$hessian))),k,1)
    Tstat=Parameter/StdError
    table1 = cbind(round(Parameter, digits=d), round(StdError,digits=d),
    round(Tstat, digits=d), round(2 * pt(abs(Tstat), 
            N-length(varnames), lower.tail = FALSE), digits=d))
    dimnames(table1) = list(varnames, 
            c("Estimate", "Std. Error", "t value", "p value"))
    cat("\n\n","   RESULTS   ", "\n")
    print(table1)
    cat("Log-likelihood = ",x$value,"\t\t N = ",N,"\n")
    cat("Convergence = ",x$convergence,"\n")
}


## Define log-likelihood function
LNorm<-function(b,X,Y) {
  lns2=b[length(b)]                 # log(s^2)
  s2=exp(lns2)                      # s^2
  b=b[1:(length(b)-1)]              # rest of the coeffs (parentheses matter: 1:length(b)-1 would start indexing at 0)
  r=length(Y)
  if (is.null(X)) X=matrix(1,r,1) else X=cbind(1,X)     # add constant
  xb=X%*%b                          # b0 + b1x1 + b2x2 +...+ bkxk
  llik= -log(s2)/2-((Y-xb)^2)/(2*s2)
  sum(llik)
  }


# main program
# Y:  income_p (income per capita)
# X:  propmil, secondar (secondary_pc), univ_pc, compnom, governme (governmentcrises), workf_ag (workf_agri)


# Download data (in Stata dta format) from our shared dropbox folder
library(foreign)
worlddata <- "https://www.dropbox.com/s/xkhsx1dk4ch3zdw/world_data_stata6.dta?dl=1"
worlddata <- read.dta(worlddata)
attach(worlddata)   # attach data.frame
Y=matrix(income_p,length(income_p),1)/100
X=cbind(propmil,secondar,univ_pc,compnom,governme,workf_ag)
cat("\nNumber of original observations:  ",nrow(X),"\n")
## 
## Number of original observations:   270
stval=runif(ncol(X)+2)*0.1          # starting values (ncol(X) slopes + constant + lns2)
norm.mle.U=optim(stval,LNorm,hessian=TRUE,method="BFGS",
                control=list(fnscale=-1,trace=1,REPORT=1,reltol=1e-15,maxit=200),
                X=X,Y=Y)
## initial  value 477183.336221 
## iter   2 value 4133.938891
## iter   3 value 3522.205695
## iter   4 value 3520.160674
## iter   5 value 3520.083762
## iter   6 value 3520.081037
## iter   7 value 3520.070611
## iter   8 value 3520.047445
## iter   9 value 3519.983522
## iter  10 value 3519.826229
## iter  11 value 3519.452087
## iter  12 value 3518.713855
## iter  13 value 3517.677770
## iter  14 value 3516.825880
## iter  15 value 3516.579433
## iter  16 value 3516.568862
## iter  17 value 3516.568676
## iter  18 value 3516.568637
## iter  19 value 3516.421806
## iter  20 value 3472.833656
## iter  21 value 3472.775717
## iter  22 value 3472.395869
## iter  23 value 3280.924774
## iter  24 value 3014.000378
## iter  25 value 3008.425486
## iter  26 value 2996.558532
## iter  27 value 2989.700874
## iter  28 value 2866.009992
## iter  29 value 2855.952938
## iter  30 value 2801.806858
## iter  31 value 2790.348819
## iter  32 value 2783.155686
## iter  33 value 2771.439014
## iter  34 value 2692.006017
## iter  35 value 2325.462342
## iter  36 value 2206.252218
## iter  37 value 2022.595769
## iter  38 value 1969.840238
## iter  39 value 1859.895937
## iter  40 value 1804.719574
## iter  41 value 1706.680294
## iter  42 value 1647.626284
## iter  43 value 1619.768284
## iter  44 value 1565.632817
## iter  45 value 1521.532027
## iter  46 value 1477.186721
## iter  47 value 1446.198677
## iter  48 value 1400.294623
## iter  49 value 1330.371749
## iter  50 value 1236.812567
## iter  51 value 1189.370509
## iter  52 value 1142.882384
## iter  53 value 1095.484675
## iter  54 value 1077.045141
## iter  55 value 1029.512165
## iter  56 value 989.065673
## iter  57 value 985.662244
## iter  58 value 912.392506
## iter  59 value 873.022699
## iter  60 value 787.824342
## iter  61 value 741.092443
## iter  62 value 685.438674
## iter  63 value 655.644032
## iter  64 value 543.474230
## iter  65 value 536.976734
## iter  66 value 495.063058
## iter  67 value 491.039757
## iter  68 value 481.258040
## iter  69 value 475.208401
## iter  70 value 475.167902
## iter  71 value 474.861950
## iter  72 value 474.808790
## iter  73 value 474.771672
## iter  74 value 473.830939
## iter  75 value 472.653738
## iter  76 value 472.615650
## iter  77 value 472.570192
## iter  78 value 472.565150
## iter  79 value 472.565146
## iter  80 value 472.565146
## iter  81 value 472.565146
## iter  81 value 472.565146
## iter  81 value 472.565146
## final  value 472.565146 
## converged
                                    # trace shows iterations, fnscale=-1 causes maximization

# Print results
vars=c("const",colnames(X),"lns2")
N=length(Y)
optim.table(norm.mle.U,vars,N)
## 
## 
##     RESULTS    
##          Estimate Std. Error  t value p value
## const     -0.2405     0.9099  -0.2643  0.7918
## propmil    0.0165     0.0042   3.8946  0.0001
## secondar   0.0012     0.0013   0.9125  0.3623
## univ_pc    0.0736     0.0037  19.6327  0.0000
## compnom    3.1599     0.4326   7.3043  0.0000
## governme  -0.0331     0.2450  -0.1350  0.8927
## workf_ag  -0.0112     0.0011 -10.2853  0.0000
## lns2       2.5005     0.0861  29.0530  0.0000
## Log-likelihood =  -472.5651       N =  270 
## Convergence =  0
# Verify results with lm()
print(summary(lm(Y~X)))
## 
## Call:
## lm(formula = Y ~ X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.8785  -1.7375  -0.1166   1.5814  13.1971 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.240454   0.921927  -0.261 0.794438    
## Xpropmil     0.016461   0.004282   3.844 0.000152 ***
## Xsecondar    0.001194   0.001326   0.901 0.368633    
## Xuniv_pc     0.073592   0.003798  19.377  < 2e-16 ***
## Xcompnom     3.159904   0.438325   7.209 5.98e-12 ***
## Xgovernme   -0.033088   0.248280  -0.133 0.894082    
## Xworkf_ag   -0.011215   0.001105 -10.151  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.537 on 263 degrees of freedom
## Multiple R-squared:  0.8524, Adjusted R-squared:  0.849 
## F-statistic: 253.1 on 6 and 263 DF,  p-value: < 2.2e-16
# Now conduct likelihood ratio test
X=NULL                              # restricted model: intercept only
stval=runif(2)*0.1                  # starting values for the constant and lns2
norm.mle.R=optim(stval,LNorm,hessian=TRUE,method="BFGS",
                control=list(fnscale=-1,trace=1,REPORT=1,reltol=1e-15,maxit=200),
                X=X,Y=Y)
## initial  value 23071.459698 
## iter   2 value 4954.506560
## iter   3 value 4708.056662
## iter   4 value 2310.762508
## iter   5 value 1579.793203
## iter   6 value 1574.718565
## iter   7 value 1572.569931
## iter   8 value 1570.768263
## iter   9 value 1515.085532
## iter  10 value 1160.728954
## iter  11 value 1159.805897
## iter  12 value 1114.727964
## iter  13 value 1114.196971
## iter  14 value 976.165065
## iter  15 value 974.228464
## iter  16 value 757.418268
## iter  17 value 754.058367
## iter  18 value 733.886242
## iter  19 value 731.251745
## iter  20 value 730.855552
## iter  21 value 730.839367
## iter  22 value 730.839339
## iter  23 value 730.839331
## iter  24 value 730.839330
## iter  25 value 730.839330
## iter  25 value 730.839330
## iter  25 value 730.839330
## final  value 730.839330 
## converged
lnL_U=norm.mle.U$value              # loglik value of unrestricted model
lnL_R=norm.mle.R$value              # loglik value of restricted model
df=6                                # restricting 6 params=0
LR=2*(lnL_U-lnL_R)
cat("likelihood ratio R =",LR,"     p-value = ",pchisq(LR,df,lower.tail=F))
## likelihood ratio R = 516.5484      p-value =  2.287926e-108
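# A cross-check sketch (assuming lmtest, loaded earlier via AER, is still attached): the normal constant omitted from LNorm cancels in the difference, so lrtest() on the corresponding lm() fits should reproduce the same chi-squared statistic
X.full <- cbind(propmil, secondar, univ_pc, compnom, governme, workf_ag)
lrtest(lm(Y ~ X.full), lm(Y ~ 1))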