Homework 3

Loan Data Example

library(car)
download.file("https://www.biz.uiowa.edu/faculty/jledolter/DataMining/LoanData.csv", "LoanData.csv")
loan <- read.csv("LoanData.csv")
summary(loan)
##      Status      Credit.Grade      Amount           Age        
##  Current:5186   HR     :1217   Min.   : 1000   Min.   : 0.000  
##  Default:  75   E      :1129   1st Qu.: 2025   1st Qu.: 2.000  
##  Late   : 350   D      : 927   Median : 3001   Median : 4.000  
##                 C      : 843   Mean   : 4817   Mean   : 4.504  
##                 B      : 553   3rd Qu.: 6000   3rd Qu.: 7.000  
##                 AA     : 451   Max.   :25000   Max.   :14.000  
##                 (Other): 491                                   
##  Borrower.Rate    Debt.To.Income.Ratio
##  Min.   :0.0000   Min.   :    0.00    
##  1st Qu.:0.1425   1st Qu.:    0.09    
##  Median :0.1950   Median :    0.16    
##  Mean   :0.1937   Mean   :   45.38    
##  3rd Qu.:0.2500   3rd Qu.:    0.25    
##  Max.   :0.4975   Max.   :51280.07    
## 
loan$Status = recode(loan$Status, "'Current'=1; else=0" )
loan$Status = as.numeric(levels(loan$Status)[loan$Status])

##Changing categorical value into numerical value

n = length(loan$Status)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

Xdel <- model.matrix(Status~., data = loan)[,-1]
Xdel[1:3,]
##   Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1              0             0             1             0             0
## 2              0             0             0             0             0
## 3              0             0             0             0             0
##   Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1              0              0   5000   4         0.150
## 2              1              0   1900   6         0.265
## 3              1              0   1000   3         0.150
##   Debt.To.Income.Ratio
## 1                 0.04
## 2                 0.02
## 3                 0.02
Xtrain <- Xdel[train, ]
xnew <- Xdel[-train,]
ytrain <- loan$Status[train]
ynew <- loan$Status[-train]

##Logistic Regression
m1 = glm(Status~., family = binomial, data = data.frame(Status=ytrain, Xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m1
## 
## Call:  glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Coefficients:
##          (Intercept)        Credit.GradeAA         Credit.GradeB  
##            7.663e+00             1.039e+00            -1.813e-01  
##        Credit.GradeC         Credit.GradeD         Credit.GradeE  
##           -1.445e-01            -1.681e-01            -5.677e-01  
##       Credit.GradeHR        Credit.GradeNC                Amount  
##           -1.082e+00            -2.114e+00            -4.587e-05  
##                  Age         Borrower.Rate  Debt.To.Income.Ratio  
##           -3.849e-01            -1.094e+01             1.081e-01  
## 
## Degrees of Freedom: 3365 Total (i.e. Null);  3354 Residual
## Null Deviance:       1846 
## Residual Deviance: 1446  AIC: 1470
summary(m1)
## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain, 
##     Xtrain))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2537   0.1354   0.2397   0.3990   2.1625  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           7.663e+00  5.494e-01  13.947  < 2e-16 ***
## Credit.GradeAA        1.039e+00  8.177e-01   1.271 0.203880    
## Credit.GradeB        -1.813e-01  4.894e-01  -0.370 0.711015    
## Credit.GradeC        -1.445e-01  4.770e-01  -0.303 0.762029    
## Credit.GradeD        -1.681e-01  4.826e-01  -0.348 0.727576    
## Credit.GradeE        -5.677e-01  4.975e-01  -1.141 0.253783    
## Credit.GradeHR       -1.082e+00  5.078e-01  -2.131 0.033083 *  
## Credit.GradeNC       -2.114e+00  6.310e-01  -3.350 0.000809 ***
## Amount               -4.587e-05  2.120e-05  -2.164 0.030498 *  
## Age                  -3.849e-01  2.872e-02 -13.401  < 2e-16 ***
## Borrower.Rate        -1.094e+01  1.592e+00  -6.872 6.34e-12 ***
## Debt.To.Income.Ratio  1.081e-01  3.072e-01   0.352 0.725036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1845.8  on 3365  degrees of freedom
## Residual deviance: 1445.8  on 3354  degrees of freedom
## AIC: 1469.8
## 
## Number of Fisher Scoring iterations: 17
##First 10 predictions
ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
##    ynew     ptest
## 12    1 0.6927001
## 13    1 0.9879369
## 15    1 0.8774886
## 23    1 0.8619873
## 29    1 0.7180974
## 34    1 0.9670327
## 35    1 0.9827409
## 36    1 0.8607462
## 39    1 0.6784198
## 40    1 0.9960884
##Logistic Regression Graph
bb=cbind(ptest,ynew)

bb1=bb[order(ptest,decreasing=TRUE),]


xbar=mean(ynew)
xbar
## [1] 0.9278396
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+bb1[i,2]
}

aaa=cbind(bb1[,1],bb1[,2],ay,ax)


plot(axis,ay,xlab="cases",ylab="successes",
     main="Lift: Total successes sorted by pred value/success probability")
points(axis,ax,type="l")

##Not a very good graph. There is barely an arch in the ROC Curve.

Titanic Analysis

titanic <- read.csv("/users/kimberlyhatlestad/Data Mining/Titanic.csv")
summary(titanic)
##      Class          Survived         Gender         Age         
##  Min.   :1.000   Min.   :0.0000   female:386   Min.   : 0.1667  
##  1st Qu.:1.000   1st Qu.:0.0000   male  :657   1st Qu.:21.0000  
##  Median :2.000   Median :0.0000                Median :28.0000  
##  Mean   :2.209   Mean   :0.4075                Mean   :29.8132  
##  3rd Qu.:3.000   3rd Qu.:1.0000                3rd Qu.:39.0000  
##  Max.   :3.000   Max.   :1.0000                Max.   :80.0000  
##                                                                 
##  NumbSibOrSpsAbd  NumbParOrChildAbd      Fare        EmbarkedAt
##  Min.   :0.0000   Min.   :0.0000    Min.   :  0.00   C:212     
##  1st Qu.:0.0000   1st Qu.:0.0000    1st Qu.:  8.05   Q: 50     
##  Median :0.0000   Median :0.0000    Median : 15.75   S:781     
##  Mean   :0.5043   Mean   :0.4219    Mean   : 36.60             
##  3rd Qu.:1.0000   3rd Qu.:1.0000    3rd Qu.: 35.08             
##  Max.   :8.0000   Max.   :6.0000    Max.   :512.33             
##                                                                
##                             HomeAndDestination           Home    
##                                      :359      null        :359  
##  New York, NY                        : 55      New York, NY: 63  
##  London                              : 14      London      : 25  
##  Montreal, PQ                        : 10      Cornwall    : 21  
##  Cornwall / Akron, OH                :  9      England     : 15  
##  Wiltshire, England Niagara Falls, NY:  8      Montreal, PQ: 14  
##  (Other)                             :588      (Other)     :546  
##      KnownDestination
##              :392    
##  null        :359    
##  New York, NY: 37    
##  Akron, OH   : 11    
##  Chicago, IL : 11    
##  Detroit, MI : 10    
##  (Other)     :223
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables for categorical data
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))

titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)

titanic =titanic[-3]

titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)

titanic = titanic[-7]

NewTitanic = cbind(Survived, Class=titanic$Class, Age=titanic$Age,
                   NumbSibOrSpsAbd =titanic$NumbSibOrSpsAbd, NumbParOrChildAbd =titanic$NumbParOrChildAbd,
                   Fare = titanic$Fare, GenderMale=titanic$GenderMale, EmbarkedAtC = titanic$EmbarkedAtC,
                   EmbarkedAtQ = titanic$EmbarkedAtQ)

n = length(titanic$Survived)

n1=floor(n*(0.6))

n2 = n - n1

train = sample(1:n, n1)

##Logistic Regression
m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))
summary(m2)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5726  -0.6846  -0.4058   0.6456   2.5376  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        4.534836   0.486022   9.331  < 2e-16 ***
## Class             -1.009327   0.133464  -7.563 3.95e-14 ***
## Age               -0.037687   0.006634  -5.681 1.34e-08 ***
## NumbSibOrSpsAbd   -0.348021   0.108435  -3.209  0.00133 ** 
## NumbParOrChildAbd  0.049846   0.104193   0.478  0.63236    
## Fare               0.000463   0.001934   0.239  0.81084    
## GenderMale        -2.608938   0.179304 -14.550  < 2e-16 ***
## EmbarkedAtC        0.679066   0.211566   3.210  0.00133 ** 
## EmbarkedAtQ       -0.767411   0.406290  -1.889  0.05892 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  954.75  on 1034  degrees of freedom
## AIC: 972.75
## 
## Number of Fisher Scoring iterations: 5
##Histogram of Probability Distribution
NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]

ptest = predict(m2, newdata = data.frame(xnew), type = "response")
hist(ptest)

##Survivial if probability is above 40%
g=floor(ptest+0.4) 
t=table(ynew,g)
t
##     g
## ynew   0   1
##    0 230  24
##    1  59 105
## % of Error
error=(t[1,2]+t[2,1])/n2
error
## [1] 0.1985646
##First 10 Predictions
b=cbind(ptest,ynew)
b[1:10,]
##        ptest ynew
## 1  0.9262486    1
## 2  0.6690433    1
## 3  0.9634547    0
## 4  0.4031899    0
## 5  0.7018051    1
## 6  0.2578457    0
## 7  0.9655164    1
## 8  0.9297047    1
## 9  0.7017412    0
## 10 0.9230614    1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
##         ptest ynew
## 49  0.9706331    1
## 7   0.9655164    1
## 3   0.9634547    0
## 106 0.9568333    1
## 89  0.9548405    1
## 52  0.9518215    1
## 90  0.9497068    1
## 80  0.9488905    1
## 55  0.9439248    1
## 29  0.9429522    1
xbar=mean(ynew)

axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=b1[1,2]
for (i in 2:n2) {
  axis[i]=i
  ax[i]=xbar*i
  ay[i]=ay[i-1]+b1[i,2]
}

##Graph of Regression

plot(axis,ay,xlab="cases",ylab="successes",main="Lift: Total successes sorted by prediction value/success probability")
points(axis,ax,type="l")

Better graph than Loan Data graph. Still not an ideal graph as there is only a slight arch in the ROC curve.