Box Plots of Balance and Income versus Student Status: Default Data set; package ISLR

library(ISLR)
load("~/.RData")
data("Default")

boxplot(Default$balance~Default$student,col=c(5,20),xlab="Student Status",ylab= "Balance", main="Boxplot Balance versus Student Status")

boxplot(Default$income~Default$student,col=c(5,20),xlab="Student Status",ylab="Income",main="Boxplot Income versus Student Status")

Logisitc Regression Models to predict Default with balance as predictor

summary(glm(default~balance, data=Default, family=binomial(link=logit)))

## 
## Call:
## glm(formula = default ~ balance, family = binomial(link = logit), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2697  -0.1465  -0.0589  -0.0221   3.7589  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8

Logisitc Regression Models to predict Default with Student as predictor

summary(glm(default~student, data=Default, family=binomial(link=logit)))

## 
## Call:
## glm(formula = default ~ student, family = binomial(link = logit), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2970  -0.2970  -0.2434  -0.2434   2.6585  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## studentYes   0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6

Logisitc Regression Models to predict Default with balance, student, and income as predictors

summary(glm(default~. ,data=Default, family=binomial(link=logit)))

## 
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4691  -0.1418  -0.0557  -0.0203   3.7383  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8

Creating prediction table (confusion matrix :) ) : Logisitc Regression Model

x=(glm(Default,family=binomial(link=logit)))
xp=x$fitted.values
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default$default,xp)

##      xp
##          0    1
##   No  9627   40
##   Yes  228  105

round(prop.table(table(Default$default,xp),1),2)

##      xp
##          0    1
##   No  1.00 0.00
##   Yes 0.68 0.32

Training-Holdout validation of Logistic Regression Model

s1=sample(1:nrow(Default),nrow(Default)*0.7,replace=FALSE)
x=(glm(Default[s1,],family=binomial(link=logit)))
xp=x$fitted.values
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[s1,1],xp)

##      xp
##          0    1
##   No  6747   25
##   Yes  162   66

round(prop.table(table(Default[s1,1],xp),1),2)

##      xp
##          0    1
##   No  1.00 0.00
##   Yes 0.71 0.29

xp=predict(x, newdata=Default[-s1,-1],type="response")
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[-s1,1],xp)

##      xp
##          0    1
##   No  2879   16
##   Yes   64   41

round(prop.table(table(Default[-s1,1],xp),1),2)

##      xp
##          0    1
##   No  0.99 0.01
##   Yes 0.61 0.39

Why Logistic? Similarities of Logistic and Normal densities (pdf)

x=seq(-5,5,.02)
plot(x,dnorm(x),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Logistic Densities", ylab ="Probability", xlab ="Quantiles")
points(x, 1.5*exp(-1.5*x)/(1+exp(-1.5*x))^2, col=5)

Why Logistic? Similarities of Logistic and Normal cumulative distributions (cdf)

x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Logistic Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x,1/(1+exp(-1.5*x))^2,col=5)

Why Multinomial Logit? Similarities of Gumbel and Normal densities (pdf)

x=seq(-5,5,.02)
plot(x,1/(2*pi)^0.5 * exp(-0.5 * x^2),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Gumbel densities", ylab ="Probability", xlab ="Quantiles")
points(x,1.1*exp(-1.1*x)*exp(-exp(-1.1*x)),col=11)

Why Multinomial Logit? Similarities of Gumbel and Normal cumulative distributions (cdf)

x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Gumbel Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x,exp(-exp(-1.1*x)),col=11)

Read fasion.cv file using the read.csv function into R.

It can be downloaded from data sets folder in Course Material in Chalk

Use package nnet in R to estimate Multinomial Logit model

fashion=read.csv('/Users/codethedral/Google Drive/MScA/MSCA_31008 Data Mining/Lecture 5/fashion_2000_from_chalk.csv')

#levels(fashion[,1])=levels(fashion[,1])[c(2,3,1)]
summary(fashion)

##     RATING         SEX          AGE             FASHION     QUALITY    
##  High  : 424   Female:1157   16-24:915   Modern     :1016   High:1002  
##  Low   : 464   Male  : 843   25-39:426   Traditional: 984   Low : 998  
##  Medium:1112                 40+  :659                                 
##     PRICE     
##  Higher:1012  
##  Lower : 988  
##

require(nnet)

## Loading required package: nnet

x=multinom(relevel(RATING,ref="Low")~.,data=fashion)

## # weights:  24 (14 variable)
## initial  value 2197.224577 
## iter  10 value 1666.158060
## iter  20 value 1615.993787
## final  value 1615.988314 
## converged

summary(x)

## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion)
## 
## Coefficients:
##        (Intercept)    SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High      1.021470 -0.0464043 0.5712490 0.5238451          -3.391201
## Medium    1.819807  0.1488543 0.3402063 0.3911855          -1.867446
##        QUALITYLow PRICELower
## High   -1.9924648   2.570459
## Medium -0.8218136   1.279181
## 
## Std. Errors:
##        (Intercept)   SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High     0.2012743 0.1661045 0.2130375 0.1876391          0.1909015
## Medium   0.1643670 0.1250484 0.1633628 0.1410642          0.1417692
##        QUALITYLow PRICELower
## High    0.1719416  0.1775988
## Medium  0.1267600  0.1316249
## 
## Residual Deviance: 3231.977 
## AIC: 3259.977

Creating Prediction Tables for Multinomial Model Predictions

table(fashion$RATING,predict(x,type="class"))

##         
##          Low High Medium
##   High     6  149    269
##   Low    198    0    266
##   Medium 122  102    888

prop.table(table(fashion$RATING,predict(x,type="class")),2)

##         
##                 Low       High     Medium
##   High   0.01840491 0.59362550 0.18903725
##   Low    0.60736196 0.00000000 0.18692902
##   Medium 0.37423313 0.40637450 0.62403373

prop.table(table(fashion$RATING,predict(x,type="class")),1)

##         
##                 Low       High     Medium
##   High   0.01415094 0.35141509 0.63443396
##   Low    0.42672414 0.00000000 0.57327586
##   Medium 0.10971223 0.09172662 0.79856115

Performing training-holdout validations for Multinomial Logit Model

s1=sample(1:nrow(fashion),.7*nrow(fashion),replace=FALSE)
x=multinom(relevel(RATING,ref="Low")~.,data=fashion[s1,])

## # weights:  24 (14 variable)
## initial  value 1538.057204 
## iter  10 value 1146.202953
## iter  20 value 1136.360627
## final  value 1136.360348 
## converged

summary(x)

## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion[s1, 
##     ])
## 
## Coefficients:
##        (Intercept)     SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High      1.195247 -0.08355537 0.6113377 0.3902596          -3.294032
## Medium    1.851715  0.15477551 0.4039291 0.4249496          -1.876347
##        QUALITYLow PRICELower
## High   -2.0824732   2.580337
## Medium -0.8694767   1.275182
## 
## Std. Errors:
##        (Intercept)   SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High     0.2382692 0.1985659 0.2522676 0.2230830          0.2247770
## Medium   0.1990181 0.1527110 0.1983279 0.1707263          0.1714455
##        QUALITYLow PRICELower
## High    0.2052496  0.2109767
## Medium  0.1544662  0.1602237
## 
## Residual Deviance: 2272.721 
## AIC: 2300.721

table(fashion$RATING[s1],predict(x,type="class"))

##         
##          Low High Medium
##   High     5  112    201
##   Low    147    0    170
##   Medium  87   69    609

prop.table(table(fashion$RATING[s1],predict(x,type="class")),2)

##         
##                Low      High    Medium
##   High   0.0209205 0.6187845 0.2051020
##   Low    0.6150628 0.0000000 0.1734694
##   Medium 0.3640167 0.3812155 0.6214286

prop.table(table(fashion$RATING[s1],predict(x,type="class")),1)

##         
##                 Low       High     Medium
##   High   0.01572327 0.35220126 0.63207547
##   Low    0.46372240 0.00000000 0.53627760
##   Medium 0.11372549 0.09019608 0.79607843

table(fashion$RATING[-s1],predict(x,newdata=fashion[-s1,], type="class"))

##         
##          Low High Medium
##   High     1   37     68
##   Low     51    0     96
##   Medium  35   33    279

prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),1)

##         
##                  Low        High      Medium
##   High   0.009433962 0.349056604 0.641509434
##   Low    0.346938776 0.000000000 0.653061224
##   Medium 0.100864553 0.095100865 0.804034582

prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),2)

##         
##                 Low       High     Medium
##   High   0.01149425 0.52857143 0.15349887
##   Low    0.58620690 0.00000000 0.21670429
##   Medium 0.40229885 0.47142857 0.62979684

Data Mining Session 5: Logisitc Regression and Multinomial Logit

Dr. Anil Chaturvedi

December 1, 2015

Box Plots of Balance and Income versus Student Status: Default Data set; package ISLR

Logisitc Regression Models to predict Default with balance as predictor

Logisitc Regression Models to predict Default with Student as predictor

Logisitc Regression Models to predict Default with balance, student, and income as predictors

Creating prediction table (confusion matrix :) ) : Logisitc Regression Model

Training-Holdout validation of Logistic Regression Model

Why Logistic? Similarities of Logistic and Normal densities (pdf)

Why Logistic? Similarities of Logistic and Normal cumulative distributions (cdf)

Why Multinomial Logit? Similarities of Gumbel and Normal densities (pdf)

Why Multinomial Logit? Similarities of Gumbel and Normal cumulative distributions (cdf)

Read fasion.cv file using the read.csv function into R.

It can be downloaded from data sets folder in Course Material in Chalk

Use package nnet in R to estimate Multinomial Logit model

Creating Prediction Tables for Multinomial Model Predictions

Performing training-holdout validations for Multinomial Logit Model