Introduction

In this homework, we use Zelig 5 syntax for logit regression:

http://docs.zeligproject.org/articles/zelig_logit.html

One observation is that Reference Class type of syntax seems to apply to least square regression only as model = “logit” is not supported.

on the credit card default dataset described in the following section.

Data

The data of this homework is the credit card default data from University of California, Irvine Data source: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

In the original set male, female and marital status had numeric values. They were recoded to have string values. Also were ever there was missing data, those parts were omitted from the final data set.

Results

library(Zelig)
library(tidyr)
library(dplyr)
library(ggplot2)

data <- read.csv("default of credit card clients_UCI_clean.csv")

colnames(data)[ncol(data)] <- "default"
z.out1 <- zelig(default~ AGE + SEX*MARRIAGE + EDUCATION, model = "logit", data = data, cite = FALSE)

# Age effect
age.range = min(data$AGE):max(data$AGE)
x <- setx(z.out1, AGE = age.range)
s <- sim(z.out1, x = x)
ci.plot(s)

# Education effect: 1 --> Graduate School, 2--> University, 3 --> high school, 4--> others

education.range = min(data$EDUCATION):max(data$EDUCATION)
x <- setx(z.out1, EDUCATION = education.range)
s <- sim(z.out1, x = x)
ci.plot(s)

As we can see from the the plots above, default rates decreases as age increases and as the education level increases.

Note: education (1–> graduate school, 2–> university, 3–> high school, 4–> others)

#### First Difference of Male and Female
#### Estimating the risk difference between male and high female 
#### while all the other variables held at their default values
x.male <- setx(z.out1, SEX = "male")
x.female<- setx(z.out1, SEX = "female")
s.out2 <- sim(z.out1, x = x.male, x1 = x.female)
summary(s.out2)
## 
##  sim x :
##  -----
## ev
##           mean          sd       50%      2.5%     97.5%
## [1,] 0.2257816 0.005207143 0.2254383 0.2166691 0.2370322
## pv
##          0     1
## [1,] 0.765 0.235
## 
##  sim x1 :
##  -----
## ev
##           mean          sd       50%      2.5%     97.5%
## [1,] 0.1975275 0.004224585 0.1974771 0.1892727 0.2064206
## pv
##          0     1
## [1,] 0.806 0.194
## fd
##             mean          sd         50%        2.5%       97.5%
## [1,] -0.02825408 0.006501644 -0.02795819 -0.04100257 -0.01602859
fd <- s.out2$get_qi(xvalue="x1", qi="fd")
summary(fd)
##        V1           
##  Min.   :-0.054704  
##  1st Qu.:-0.032524  
##  Median :-0.027958  
##  Mean   :-0.028254  
##  3rd Qu.:-0.023757  
##  Max.   :-0.008797
plot(s.out2)

##### Marriage variations in Sex differences

# married
m1x <- setx(z.out1, SEX = "male", MARRIAGE = "married")
m1x1 <- setx(z.out1, SEX = "female", MARRIAGE = "married")
m1s <- sim(z.out1, x=m1x, x1=m1x1)

# single
m2x <- setx(z.out1, SEX = "male", MARRIAGE = "single")
m2x1 <- setx(z.out1, SEX = "female", MARRIAGE = "single")
m2s <- sim(z.out1, x=m2x, x1=m2x1)

# others
m3x <- setx(z.out1, SEX = "male", MARRIAGE = "others")
m3x1 <- setx(z.out1, SEX = "female", MARRIAGE = "others")
m3s <- sim(z.out1, x=m3x, x1=m3x1)
### plot
plot(m1s)

plot(m2s)

plot(m3s)

Plots above show the Marriage variations in Sex differences

d1 <- m1s$get_qi(xvalue="x1", qi="fd")
d2 <- m2s$get_qi(xvalue="x1", qi="fd")
d3 <- m3s$get_qi(xvalue="x1", qi="fd")

dfd <- data.frame(cbind(d1, d2, d3))
head(dfd)
##            X1          X2          X3
## 1 -0.04945864 -0.03638695 -0.04051664
## 2 -0.05003353 -0.03016418 -0.08550038
## 3 -0.04633163 -0.03216324 -0.07582145
## 4 -0.05317897 -0.02768412 -0.01094818
## 5 -0.04242932 -0.01667884  0.00748812
## 6 -0.04430928 -0.03253825 -0.03625357
### Putting them together

tidd <- dfd %>% 
  gather(marriage, simv, 1:3)
head(tidd)
##   marriage        simv
## 1       X1 -0.04945864
## 2       X1 -0.05003353
## 3       X1 -0.04633163
## 4       X1 -0.05317897
## 5       X1 -0.04242932
## 6       X1 -0.04430928
tidd %>% 
  group_by(marriage) %>% 
  summarise(mean = mean(simv), sd = sd(simv))
## # A tibble: 3 x 3
##   marriage    mean      sd
##   <chr>      <dbl>   <dbl>
## 1 X1       -0.0432 0.00789
## 2 X2       -0.0291 0.00672
## 3 X3       -0.0792 0.0491
#### ggplot

ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~marriage)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Conclusion

In the above graphs and tables, females are less likely to default than males because the first differnce results are negative (female default rate- male default rate). The first difference between female and male is the largest for people in the others catagory, followed by those in the married category.