Homework#3

Loan Data Analysis

Status	Credit.Grade	Amount	Age	Borrower.Rate	Debt.To.Income.Ratio
Current	C	5000	4	0.1500	0.040
Current	HR	1900	6	0.2650	0.020
Current	HR	1000	3	0.1500	0.020
Current	HR	1000	5	0.2900	0.020
Current	AA	2550	8	0.0795	0.033
Current	NC	1500	2	0.2600	0.030

Status shows which loan is bad or good. Current is good, late and bad is bad

loandata$Status = recode(loandata$Status, "'Current'=1; else=0" )
loandata$Status = as.numeric(levels(loandata$Status)[loandata$Status])
##Changing categorical value into numerical value

n = length(loandata$Status)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

Xdel <- model.matrix(Status~., data = loandata)[,-1]
Xdel[1:3,]

##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02

Xtrain <- Xdel[train, ]
xnew <- Xdel[-train,]
ytrain <- loandata$Status[train]
ynew <- loandata$Status[-train]

m1 = glm(Status~., family = binomial, data = data.frame(Status=ytrain, Xtrain))
m1

## 
## Call:  glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Coefficients:
##          (Intercept)        Credit.GradeAA         Credit.GradeB  
##            7.385e+00             7.563e-01            -2.407e-02  
##        Credit.GradeC         Credit.GradeD         Credit.GradeE  
##            3.251e-01            -2.363e-01            -5.557e-01  
##       Credit.GradeHR        Credit.GradeNC                Amount  
##           -1.130e+00            -1.804e+00            -5.762e-05  
##                  Age         Borrower.Rate  Debt.To.Income.Ratio  
##           -3.558e-01            -1.037e+01             1.998e-01  
## 
## Degrees of Freedom: 3365 Total (i.e. Null);  3354 Residual
## Null Deviance:       1826 
## Residual Deviance: 1452  AIC: 1476

summary(m1)

## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1719   0.1419   0.2442   0.3983   2.0635  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           7.385e+00  5.239e-01  14.096  < 2e-16 ***
## Credit.GradeAA        7.563e-01  6.857e-01   1.103  0.27006    
## Credit.GradeB        -2.407e-02  4.607e-01  -0.052  0.95834    
## Credit.GradeC         3.251e-01  4.638e-01   0.701  0.48335    
## Credit.GradeD        -2.363e-01  4.442e-01  -0.532  0.59479    
## Credit.GradeE        -5.557e-01  4.643e-01  -1.197  0.23136    
## Credit.GradeHR       -1.130e+00  4.755e-01  -2.376  0.01748 *  
## Credit.GradeNC       -1.804e+00  6.706e-01  -2.690  0.00713 ** 
## Amount               -5.762e-05  2.062e-05  -2.794  0.00520 ** 
## Age                  -3.558e-01  2.828e-02 -12.582  < 2e-16 ***
## Borrower.Rate        -1.037e+01  1.602e+00  -6.472 9.66e-11 ***
## Debt.To.Income.Ratio  1.998e-01  3.707e-01   0.539  0.58988    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1826.0  on 3365  degrees of freedom
## Residual deviance: 1451.6  on 3354  degrees of freedom
## AIC: 1475.6
## 
## Number of Fisher Scoring iterations: 17

First 10 predictions

ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]

##    ynew     ptest
## 3     1 0.9728342
## 4     1 0.8045158
## 5     1 0.9870006
## 8     1 0.8797900
## 9     1 0.9988177
## 11    1 0.9956890
## 12    1 0.6972082
## 13    1 0.9826625
## 25    1 0.9447024
## 27    1 0.9918913

Making logistic regression graph

bb=cbind(ptest,ynew)

bb1=bb[order(ptest,decreasing=TRUE),]


xbar=mean(ynew)
xbar

## [1] 0.9260579

axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)


plot(axis,ay,xlab="number of cases",ylab="number of successes",
     main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

Titanic Analysis

## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))

titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)

titanic =titanic[-3]

titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)

titanic = titanic[-7]

NewTitanic = cbind(Survived, Class=titanic$Class, Age=titanic$Age,
                   NumbSibOrSpsAbd =titanic$NumbSibOrSpsAbd, NumbParOrChildAbd =titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare, GenderMale=titanic$GenderMale, EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)

n = length(titanic$Survived)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

Logistic Regression Output

m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))
summary(m2)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5726  -0.6846  -0.4058   0.6456   2.5376  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.534836   0.486022   9.331  < 2e-16 ***
## Class             -1.009327   0.133464  -7.563 3.95e-14 ***
## Age               -0.037687   0.006634  -5.681 1.34e-08 ***
## NumbSibOrSpsAbd   -0.348021   0.108435  -3.209  0.00133 ** 
## NumbParOrChildAbd  0.049846   0.104193   0.478  0.63236    
## Fare               0.000463   0.001934   0.239  0.81084    
## GenderMale        -2.608938   0.179304 -14.550  < 2e-16 ***
## EmbarkedAtC        0.679066   0.211566   3.210  0.00133 ** 
## EmbarkedAtQ       -0.767411   0.406290  -1.889  0.05892 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.75  on 1034  degrees of freedom
## AIC: 972.75
## 
## Number of Fisher Scoring iterations: 5

Distribution of probabilities

NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]

ptest = predict(m2, newdata = data.frame(xnew), type = "response")
hist(ptest)

Coding 1 (Survived) if probability is 0.4 or higher than 0.4

g=floor(ptest+0.4) ## prob .4 or higher
t=table(ynew,g)
t

##     g
## ynew   0   1
##    0 235  24
##    1  60  99

error=(t[1,2]+t[2,1])/n2
## % of Error
error

## [1] 0.2009569

Showing first 10 prediction

b=cbind(ptest,ynew)
b[1:10,]

##        ptest ynew
## 1  0.9262486    1
## 2  0.9634547    0
## 3  0.3651202    0
## 4  0.3969200    0
## 5  0.9230614    1
## 6  0.6524154    1
## 7  0.9386056    1
## 8  0.9265782    1
## 9  0.5860399    1
## 10 0.9601771    1

b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]

##         ptest ynew
## 41  0.9693150    1
## 35  0.9685112    1
## 2   0.9634547    0
## 10  0.9601771    1
## 96  0.9578213    1
## 81  0.9548405    1
## 91  0.9518332    1
## 75  0.9513740    1
## 23  0.9497901    1
## 165 0.9492367    1

xbar=mean(ynew)

axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

The grapgh of the regression

plot(axis,ay,xlab="number of cases",ylab="number of successes",main="Lift: Cum successes sorted by pred val/success prob")
points(axis,ax,type="l")

Homework#3

Shunya

2017/10/16

Loan Data Analysis

Status shows which loan is bad or good. Current is good, late and bad is bad

First 10 predictions

Making logistic regression graph

Titanic Analysis

Logistic Regression Output

Distribution of probabilities

Coding 1 (Survived) if probability is 0.4 or higher than 0.4

Showing first 10 prediction

The grapgh of the regression