library(car)
download.file("https://www.biz.uiowa.edu/faculty/jledolter/DataMining/LoanData.csv", "LoanData.csv")
loan <- read.csv("LoanData.csv")
summary(loan)
## Status Credit.Grade Amount Age
## Current:5186 HR :1217 Min. : 1000 Min. : 0.000
## Default: 75 E :1129 1st Qu.: 2025 1st Qu.: 2.000
## Late : 350 D : 927 Median : 3001 Median : 4.000
## C : 843 Mean : 4817 Mean : 4.504
## B : 553 3rd Qu.: 6000 3rd Qu.: 7.000
## AA : 451 Max. :25000 Max. :14.000
## (Other): 491
## Borrower.Rate Debt.To.Income.Ratio
## Min. :0.0000 Min. : 0.00
## 1st Qu.:0.1425 1st Qu.: 0.09
## Median :0.1950 Median : 0.16
## Mean :0.1937 Mean : 45.38
## 3rd Qu.:0.2500 3rd Qu.: 0.25
## Max. :0.4975 Max. :51280.07
##
loan$Status = recode(loan$Status, "'Current'=1; else=0" )
loan$Status = as.numeric(levels(loan$Status)[loan$Status])
##Changing categorical value into numerical value
n = length(loan$Status)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
Xdel <- model.matrix(Status~., data = loan)[,-1]
Xdel[1:3,]
## Credit.GradeAA Credit.GradeB Credit.GradeC Credit.GradeD Credit.GradeE
## 1 0 0 1 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## Credit.GradeHR Credit.GradeNC Amount Age Borrower.Rate
## 1 0 0 5000 4 0.150
## 2 1 0 1900 6 0.265
## 3 1 0 1000 3 0.150
## Debt.To.Income.Ratio
## 1 0.04
## 2 0.02
## 3 0.02
Xtrain <- Xdel[train, ]
xnew <- Xdel[-train,]
ytrain <- loan$Status[train]
ynew <- loan$Status[-train]
##Logistic Regression
m1 = glm(Status~., family = binomial, data = data.frame(Status=ytrain, Xtrain))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
m1
##
## Call: glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## Xtrain))
##
## Coefficients:
## (Intercept) Credit.GradeAA Credit.GradeB
## 7.663e+00 1.039e+00 -1.813e-01
## Credit.GradeC Credit.GradeD Credit.GradeE
## -1.445e-01 -1.681e-01 -5.677e-01
## Credit.GradeHR Credit.GradeNC Amount
## -1.082e+00 -2.114e+00 -4.587e-05
## Age Borrower.Rate Debt.To.Income.Ratio
## -3.849e-01 -1.094e+01 1.081e-01
##
## Degrees of Freedom: 3365 Total (i.e. Null); 3354 Residual
## Null Deviance: 1846
## Residual Deviance: 1446 AIC: 1470
summary(m1)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = data.frame(Status = ytrain,
## Xtrain))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2537 0.1354 0.2397 0.3990 2.1625
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.663e+00 5.494e-01 13.947 < 2e-16 ***
## Credit.GradeAA 1.039e+00 8.177e-01 1.271 0.203880
## Credit.GradeB -1.813e-01 4.894e-01 -0.370 0.711015
## Credit.GradeC -1.445e-01 4.770e-01 -0.303 0.762029
## Credit.GradeD -1.681e-01 4.826e-01 -0.348 0.727576
## Credit.GradeE -5.677e-01 4.975e-01 -1.141 0.253783
## Credit.GradeHR -1.082e+00 5.078e-01 -2.131 0.033083 *
## Credit.GradeNC -2.114e+00 6.310e-01 -3.350 0.000809 ***
## Amount -4.587e-05 2.120e-05 -2.164 0.030498 *
## Age -3.849e-01 2.872e-02 -13.401 < 2e-16 ***
## Borrower.Rate -1.094e+01 1.592e+00 -6.872 6.34e-12 ***
## Debt.To.Income.Ratio 1.081e-01 3.072e-01 0.352 0.725036
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1845.8 on 3365 degrees of freedom
## Residual deviance: 1445.8 on 3354 degrees of freedom
## AIC: 1469.8
##
## Number of Fisher Scoring iterations: 17
##First 10 predictions
ptest <- predict(m1, newdata = data.frame(xnew), type = "response")
data.frame(ynew,ptest)[1:10,]
## ynew ptest
## 12 1 0.6927001
## 13 1 0.9879369
## 15 1 0.8774886
## 23 1 0.8619873
## 29 1 0.7180974
## 34 1 0.9670327
## 35 1 0.9827409
## 36 1 0.8607462
## 39 1 0.6784198
## 40 1 0.9960884
##Logistic Regression Graph
bb=cbind(ptest,ynew)
bb1=bb[order(ptest,decreasing=TRUE),]
xbar=mean(ynew)
xbar
## [1] 0.9278396
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=bb1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+bb1[i,2]
}
aaa=cbind(bb1[,1],bb1[,2],ay,ax)
plot(axis,ay,xlab="cases",ylab="successes",
main="Lift: Total successes sorted by pred value/success probability")
points(axis,ax,type="l")
##Not a very good graph. There is barely an arch in the ROC Curve.
titanic <- read.csv("/users/kimberlyhatlestad/Data Mining/Titanic.csv")
summary(titanic)
## Class Survived Gender Age
## Min. :1.000 Min. :0.0000 female:386 Min. : 0.1667
## 1st Qu.:1.000 1st Qu.:0.0000 male :657 1st Qu.:21.0000
## Median :2.000 Median :0.0000 Median :28.0000
## Mean :2.209 Mean :0.4075 Mean :29.8132
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:39.0000
## Max. :3.000 Max. :1.0000 Max. :80.0000
##
## NumbSibOrSpsAbd NumbParOrChildAbd Fare EmbarkedAt
## Min. :0.0000 Min. :0.0000 Min. : 0.00 C:212
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 8.05 Q: 50
## Median :0.0000 Median :0.0000 Median : 15.75 S:781
## Mean :0.5043 Mean :0.4219 Mean : 36.60
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 35.08
## Max. :8.0000 Max. :6.0000 Max. :512.33
##
## HomeAndDestination Home
## :359 null :359
## New York, NY : 55 New York, NY: 63
## London : 14 London : 25
## Montreal, PQ : 10 Cornwall : 21
## Cornwall / Akron, OH : 9 England : 15
## Wiltshire, England Niagara Falls, NY: 8 Montreal, PQ: 14
## (Other) :588 (Other) :546
## KnownDestination
## :392
## null :359
## New York, NY: 37
## Akron, OH : 11
## Chicago, IL : 11
## Detroit, MI : 10
## (Other) :223
## delete last 3 columns
titanic = titanic[-9:-11]
Survived = titanic$Survived
## make dummy variables for categorical data
a1 = rep(1,length(titanic$Survived))
a2 = rep(0,length(titanic$Survived))
titanic$GenderMale = ifelse(titanic$Gender =="male",a1,a2)
titanic =titanic[-3]
titanic$EmbarkedAtC = ifelse(titanic$EmbarkedAt == "C",a1,a2)
titanic$EmbarkedAtQ = ifelse(titanic$EmbarkedAt == "Q", a1,a2)
titanic = titanic[-7]
NewTitanic = cbind(Survived, Class=titanic$Class, Age=titanic$Age,
NumbSibOrSpsAbd =titanic$NumbSibOrSpsAbd, NumbParOrChildAbd =titanic$NumbParOrChildAbd,
Fare = titanic$Fare, GenderMale=titanic$GenderMale, EmbarkedAtC = titanic$EmbarkedAtC,
EmbarkedAtQ = titanic$EmbarkedAtQ)
n = length(titanic$Survived)
n1=floor(n*(0.6))
n2 = n - n1
train = sample(1:n, n1)
##Logistic Regression
m2 = glm(Survived~., family = binomial, data.frame(NewTitanic))
summary(m2)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = data.frame(NewTitanic))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5726 -0.6846 -0.4058 0.6456 2.5376
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.534836 0.486022 9.331 < 2e-16 ***
## Class -1.009327 0.133464 -7.563 3.95e-14 ***
## Age -0.037687 0.006634 -5.681 1.34e-08 ***
## NumbSibOrSpsAbd -0.348021 0.108435 -3.209 0.00133 **
## NumbParOrChildAbd 0.049846 0.104193 0.478 0.63236
## Fare 0.000463 0.001934 0.239 0.81084
## GenderMale -2.608938 0.179304 -14.550 < 2e-16 ***
## EmbarkedAtC 0.679066 0.211566 3.210 0.00133 **
## EmbarkedAtQ -0.767411 0.406290 -1.889 0.05892 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 954.75 on 1034 degrees of freedom
## AIC: 972.75
##
## Number of Fisher Scoring iterations: 5
##Histogram of Probability Distribution
NewTitanic = NewTitanic[,-1]
Xtrain <- NewTitanic[train, ]
xnew <- NewTitanic[-train,]
ytrain <- Survived[train]
ynew <- Survived[-train]
ptest = predict(m2, newdata = data.frame(xnew), type = "response")
hist(ptest)
##Survivial if probability is above 40%
g=floor(ptest+0.4)
t=table(ynew,g)
t
## g
## ynew 0 1
## 0 230 24
## 1 59 105
## % of Error
error=(t[1,2]+t[2,1])/n2
error
## [1] 0.1985646
##First 10 Predictions
b=cbind(ptest,ynew)
b[1:10,]
## ptest ynew
## 1 0.9262486 1
## 2 0.6690433 1
## 3 0.9634547 0
## 4 0.4031899 0
## 5 0.7018051 1
## 6 0.2578457 0
## 7 0.9655164 1
## 8 0.9297047 1
## 9 0.7017412 0
## 10 0.9230614 1
b1=b[order(ptest,decreasing=TRUE),]
b1[1:10,]
## ptest ynew
## 49 0.9706331 1
## 7 0.9655164 1
## 3 0.9634547 0
## 106 0.9568333 1
## 89 0.9548405 1
## 52 0.9518215 1
## 90 0.9497068 1
## 80 0.9488905 1
## 55 0.9439248 1
## 29 0.9429522 1
xbar=mean(ynew)
axis=dim(n2)
ax=dim(n2)
ay=dim(n2)
axis[1]=1
ax[1]=xbar
ay[1]=b1[1,2]
for (i in 2:n2) {
axis[i]=i
ax[i]=xbar*i
ay[i]=ay[i-1]+b1[i,2]
}
##Graph of Regression
plot(axis,ay,xlab="cases",ylab="successes",main="Lift: Total successes sorted by prediction value/success probability")
points(axis,ax,type="l")