Box Plots of Balance and Income versus Student Status: Default data set (package ISLR)

library(ISLR)    # provides the Default data set
data("Default")  # load the Default data into the workspace

# Balance by student status
boxplot(Default$balance~Default$student, col=c(5,20), xlab="Student Status", ylab="Balance", main="Boxplot Balance versus Student Status")

# Income by student status
boxplot(Default$income~Default$student, col=c(5,20), xlab="Student Status", ylab="Income", main="Boxplot Income versus Student Status")
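
A numeric companion to the box plots, a small sketch comparing medians by student status:

# median balance and income for students vs. non-students
tapply(Default$balance, Default$student, median)
tapply(Default$income, Default$student, median)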

Logistic Regression Model to predict default with balance as the predictor

summary(glm(default~balance, data=Default, family=binomial(link=logit)))
## 
## Call:
## glm(formula = default ~ balance, family = binomial(link = logit), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2697  -0.1465  -0.0589  -0.0221   3.7589  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.065e+01  3.612e-01  -29.49   <2e-16 ***
## balance      5.499e-03  2.204e-04   24.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8
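
To interpret these coefficients on the probability scale, the fitted default probability at a chosen balance can be computed with the inverse-logit function. A minimal sketch, assuming the model is refit and stored in an object first (the name fit.balance is arbitrary):

fit.balance = glm(default~balance, data=Default, family=binomial(link=logit))

# predicted probability of default at balances of 1000 and 2000
# (roughly 0.006 and 0.59 with the coefficients shown above)
predict(fit.balance, newdata=data.frame(balance=c(1000, 2000)), type="response")

# equivalently, apply the inverse logit by hand: p = 1/(1 + exp(-(b0 + b1*balance)))
plogis(coef(fit.balance)[1] + coef(fit.balance)[2]*c(1000, 2000))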

Logistic Regression Model to predict default with student as the predictor

summary(glm(default~student, data=Default, family=binomial(link=logit)))
## 
## Call:
## glm(formula = default ~ student, family = binomial(link = logit), 
##     data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2970  -0.2970  -0.2434  -0.2434   2.6585  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55  < 2e-16 ***
## studentYes   0.40489    0.11502    3.52 0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6
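
The positive studentYes coefficient says that, marginally, students default at a higher rate than non-students. A short sketch (again refitting and storing the model; fit.student is an arbitrary name) that turns the two cases into probabilities:

fit.student = glm(default~student, data=Default, family=binomial(link=logit))

# marginal default probabilities for non-students and students
predict(fit.student, newdata=data.frame(student=c("No", "Yes")), type="response")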

Logistic Regression Model to predict default with balance, student, and income as predictors

summary(glm(default~., data=Default, family=binomial(link=logit)))
## 
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4691  -0.1418  -0.0557  -0.0203   3.7383  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.087e+01  4.923e-01 -22.080  < 2e-16 ***
## studentYes  -6.468e-01  2.363e-01  -2.738  0.00619 ** 
## balance      5.737e-03  2.319e-04  24.738  < 2e-16 ***
## income       3.033e-06  8.203e-06   0.370  0.71152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8
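
Note that the sign of the student coefficient flips once balance is in the model: holding balance and income fixed, students are less likely to default. A small sketch (refitting and storing the model; fit.full is an arbitrary name) that expresses the coefficients as odds ratios:

fit.full = glm(default~., data=Default, family=binomial(link=logit))

# exponentiated coefficients = multiplicative change in the odds of default
# e.g. exp(-0.647) is roughly 0.52 for studentYes, all else held fixed
exp(coef(fit.full))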

Creating a prediction table (confusion matrix) for the Logistic Regression Model

# glm() accepts a data frame in the formula position: the first column (default)
# is treated as the response and the remaining columns as predictors
x=glm(Default, family=binomial(link=logit))
xp=x$fitted.values   # fitted probabilities of default
xp[xp>=0.5]=1        # classify with a 0.5 cutoff
xp[xp<0.5]=0
table(Default$default,xp)
##      xp
##          0    1
##   No  9627   40
##   Yes  228  105
round(prop.table(table(Default$default,xp),1),2)
##      xp
##          0    1
##   No  1.00 0.00
##   Yes 0.68 0.32
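
Summary rates can be read directly off this confusion matrix; a minimal sketch using the table computed above:

cm=table(Default$default,xp)
sum(diag(cm))/sum(cm)          # overall accuracy (about 97% here)
cm["Yes","1"]/sum(cm["Yes",])  # sensitivity: share of actual defaulters caught
cm["No","0"]/sum(cm["No",])    # specificity: share of non-defaulters correctly kept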

Training-holdout validation of the Logistic Regression Model

# 70% training indices (call set.seed() first for a reproducible split)
s1=sample(1:nrow(Default), nrow(Default)*0.7, replace=FALSE)
x=glm(Default[s1,], family=binomial(link=logit))   # fit on the training rows only
xp=x$fitted.values
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[s1,1],xp)   # training confusion matrix (column 1 of Default is default)
##      xp
##          0    1
##   No  6747   25
##   Yes  162   66
round(prop.table(table(Default[s1,1],xp),1),2)
##      xp
##          0    1
##   No  1.00 0.00
##   Yes 0.71 0.29
# predicted probabilities on the holdout rows, then the same 0.5 cutoff
xp=predict(x, newdata=Default[-s1,-1], type="response")
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[-s1,1],xp)   # holdout confusion matrix
##      xp
##          0    1
##   No  2879   16
##   Yes   64   41
round(prop.table(table(Default[-s1,1],xp),1),2)
##      xp
##          0    1
##   No  0.99 0.01
##   Yes 0.61 0.39
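
The holdout table can again be reduced to a single number; a short sketch using the objects from the split above (the value varies with the random split unless a seed is set):

cm.hold=table(Default[-s1,1],xp)
1-sum(diag(cm.hold))/sum(cm.hold)   # holdout misclassification rate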

Why Logistic? Similarities of Logistic and Normal densities (pdf)

x=seq(-5,5,.02)
plot(x,dnorm(x),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Logistic Densities", ylab ="Probability", xlab ="Quantiles")
points(x, 1.5*exp(-1.5*x)/(1+exp(-1.5*x))^2, col=5)

Why Logistic? Similarities of Logistic and Normal cumulative distributions (cdf)

x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Logistic Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x, 1/(1+exp(-1.5*x)), col=5)   # logistic CDF (scale 1/1.5), the antiderivative of the density above
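
Base R also provides the logistic density and CDF directly, so the hand-written formulas can be checked against dlogis() and plogis(); a minimal sketch assuming the same scale of 1/1.5:

x=seq(-5,5,.02)
# both comparisons should return TRUE
all.equal(dlogis(x, scale=1/1.5), 1.5*exp(-1.5*x)/(1+exp(-1.5*x))^2)
all.equal(plogis(x, scale=1/1.5), 1/(1+exp(-1.5*x)))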

Why Multinomial Logit? Similarities of Gumbel and Normal densities (pdf)

x=seq(-5,5,.02)
plot(x,1/(2*pi)^0.5 * exp(-0.5 * x^2),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Gumbel densities", ylab ="Probability", xlab ="Quantiles")
points(x,1.1*exp(-1.1*x)*exp(-exp(-1.1*x)),col=11)

Why Multinomial Logit? Similarities of Gumbel and Normal cumulative distributions (cdf)

x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Gumbel Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x,exp(-exp(-1.1*x)),col=11)

Read the fashion_2000_from_chalk.csv file into R using the read.csv function.

It can be downloaded from the data sets folder under Course Material on Chalk.

Use the nnet package in R to estimate a Multinomial Logit model.

# adjust the path to wherever fashion_2000_from_chalk.csv was saved;
# on R >= 4.0 add stringsAsFactors=TRUE so the character columns become factors
fashion=read.csv('/Users/codethedral/Google Drive/MScA/MSCA_31008 Data Mining/Lecture 5/fashion_2000_from_chalk.csv')

# optional reordering of the RATING factor levels (left commented out)
#levels(fashion[,1])=levels(fashion[,1])[c(2,3,1)]
summary(fashion)
##     RATING         SEX          AGE             FASHION     QUALITY    
##  High  : 424   Female:1157   16-24:915   Modern     :1016   High:1002  
##  Low   : 464   Male  : 843   25-39:426   Traditional: 984   Low : 998  
##  Medium:1112                 40+  :659                                 
##     PRICE     
##  Higher:1012  
##  Lower : 988  
## 
require(nnet)
## Loading required package: nnet
# "Low" is set as the reference (base) category of RATING
x=multinom(relevel(RATING,ref="Low")~., data=fashion)
## # weights:  24 (14 variable)
## initial  value 2197.224577 
## iter  10 value 1666.158060
## iter  20 value 1615.993787
## final  value 1615.988314 
## converged
summary(x)
## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion)
## 
## Coefficients:
##        (Intercept)    SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High      1.021470 -0.0464043 0.5712490 0.5238451          -3.391201
## Medium    1.819807  0.1488543 0.3402063 0.3911855          -1.867446
##        QUALITYLow PRICELower
## High   -1.9924648   2.570459
## Medium -0.8218136   1.279181
## 
## Std. Errors:
##        (Intercept)   SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High     0.2012743 0.1661045 0.2130375 0.1876391          0.1909015
## Medium   0.1643670 0.1250484 0.1633628 0.1410642          0.1417692
##        QUALITYLow PRICELower
## High    0.1719416  0.1775988
## Medium  0.1267600  0.1316249
## 
## Residual Deviance: 3231.977 
## AIC: 3259.977
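
summary() for multinom reports coefficients and standard errors but no test statistics. A common follow-up, sketched here from the summary object above, is to form Wald z statistics and two-sided p-values by hand:

sx=summary(x)
z=sx$coefficients/sx$standard.errors   # Wald z statistics
p=2*pnorm(abs(z), lower.tail=FALSE)    # two-sided p-values
round(z,3)
round(p,4)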

Creating Prediction Tables for Multinomial Model Predictions

table(fashion$RATING,predict(x,type="class"))
##         
##          Low High Medium
##   High     6  149    269
##   Low    198    0    266
##   Medium 122  102    888
prop.table(table(fashion$RATING,predict(x,type="class")),2)
##         
##                 Low       High     Medium
##   High   0.01840491 0.59362550 0.18903725
##   Low    0.60736196 0.00000000 0.18692902
##   Medium 0.37423313 0.40637450 0.62403373
prop.table(table(fashion$RATING,predict(x,type="class")),1)
##         
##                 Low       High     Medium
##   High   0.01415094 0.35141509 0.63443396
##   Low    0.42672414 0.00000000 0.57327586
##   Medium 0.10971223 0.09172662 0.79856115
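
A single hit-rate figure is a useful companion to these tables; a minimal sketch, comparing labels as characters because the predicted factor uses a different level order than RATING (the first table above implies (149+198+888)/2000, about 62%):

# in-sample hit rate: share of observations whose rating is predicted correctly
mean(as.character(fashion$RATING)==as.character(predict(x, type="class")))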

Performing training-holdout validation for the Multinomial Logit Model

# 70% training indices, then refit the multinomial logit on the training rows only
s1=sample(1:nrow(fashion), .7*nrow(fashion), replace=FALSE)
x=multinom(relevel(RATING,ref="Low")~., data=fashion[s1,])
## # weights:  24 (14 variable)
## initial  value 1538.057204 
## iter  10 value 1146.202953
## iter  20 value 1136.360627
## final  value 1136.360348 
## converged
summary(x)
## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion[s1, 
##     ])
## 
## Coefficients:
##        (Intercept)     SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High      1.195247 -0.08355537 0.6113377 0.3902596          -3.294032
## Medium    1.851715  0.15477551 0.4039291 0.4249496          -1.876347
##        QUALITYLow PRICELower
## High   -2.0824732   2.580337
## Medium -0.8694767   1.275182
## 
## Std. Errors:
##        (Intercept)   SEXMale  AGE25-39    AGE40+ FASHIONTraditional
## High     0.2382692 0.1985659 0.2522676 0.2230830          0.2247770
## Medium   0.1990181 0.1527110 0.1983279 0.1707263          0.1714455
##        QUALITYLow PRICELower
## High    0.2052496  0.2109767
## Medium  0.1544662  0.1602237
## 
## Residual Deviance: 2272.721 
## AIC: 2300.721
table(fashion$RATING[s1],predict(x,type="class"))
##         
##          Low High Medium
##   High     5  112    201
##   Low    147    0    170
##   Medium  87   69    609
prop.table(table(fashion$RATING[s1],predict(x,type="class")),2)
##         
##                Low      High    Medium
##   High   0.0209205 0.6187845 0.2051020
##   Low    0.6150628 0.0000000 0.1734694
##   Medium 0.3640167 0.3812155 0.6214286
prop.table(table(fashion$RATING[s1],predict(x,type="class")),1)
##         
##                 Low       High     Medium
##   High   0.01572327 0.35220126 0.63207547
##   Low    0.46372240 0.00000000 0.53627760
##   Medium 0.11372549 0.09019608 0.79607843
table(fashion$RATING[-s1],predict(x,newdata=fashion[-s1,], type="class"))
##         
##          Low High Medium
##   High     1   37     68
##   Low     51    0     96
##   Medium  35   33    279
prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),1)
##         
##                  Low        High      Medium
##   High   0.009433962 0.349056604 0.641509434
##   Low    0.346938776 0.000000000 0.653061224
##   Medium 0.100864553 0.095100865 0.804034582
prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),2)
##         
##                 Low       High     Medium
##   High   0.01149425 0.52857143 0.15349887
##   Low    0.58620690 0.00000000 0.21670429
##   Medium 0.40229885 0.47142857 0.62979684
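
As with the logistic model, the holdout tables can be summarized by a single hit rate; a short sketch using the objects from the split above (the exact value depends on the random split):

pred.hold=predict(x, newdata=fashion[-s1,], type="class")
mean(as.character(fashion$RATING[-s1])==as.character(pred.hold))   # holdout hit rate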