Box Plots of Balance and Income versus Student Status: Default Data set; package ISLR
library(ISLR)
load("~/.RData")
data("Default")
boxplot(Default$balance~Default$student,col=c(5,20),xlab="Student Status",ylab= "Balance", main="Boxplot Balance versus Student Status")

boxplot(Default$income~Default$student,col=c(5,20),xlab="Student Status",ylab="Income",main="Boxplot Income versus Student Status")

Logisitc Regression Models to predict Default with balance as predictor
summary(glm(default~balance, data=Default, family=binomial(link=logit)))
##
## Call:
## glm(formula = default ~ balance, family = binomial(link = logit),
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2697 -0.1465 -0.0589 -0.0221 3.7589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.065e+01 3.612e-01 -29.49 <2e-16 ***
## balance 5.499e-03 2.204e-04 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
Logisitc Regression Models to predict Default with Student as predictor
summary(glm(default~student, data=Default, family=binomial(link=logit)))
##
## Call:
## glm(formula = default ~ student, family = binomial(link = logit),
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2970 -0.2970 -0.2434 -0.2434 2.6585
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.50413 0.07071 -49.55 < 2e-16 ***
## studentYes 0.40489 0.11502 3.52 0.000431 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 2908.7 on 9998 degrees of freedom
## AIC: 2912.7
##
## Number of Fisher Scoring iterations: 6
Logisitc Regression Models to predict Default with balance, student, and income as predictors
summary(glm(default~. ,data=Default, family=binomial(link=logit)))
##
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4691 -0.1418 -0.0557 -0.0203 3.7383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
Creating prediction table (confusion matrix :) ) : Logisitc Regression Model
x=(glm(Default,family=binomial(link=logit)))
xp=x$fitted.values
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default$default,xp)
## xp
## 0 1
## No 9627 40
## Yes 228 105
round(prop.table(table(Default$default,xp),1),2)
## xp
## 0 1
## No 1.00 0.00
## Yes 0.68 0.32
Training-Holdout validation of Logistic Regression Model
s1=sample(1:nrow(Default),nrow(Default)*0.7,replace=FALSE)
x=(glm(Default[s1,],family=binomial(link=logit)))
xp=x$fitted.values
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[s1,1],xp)
## xp
## 0 1
## No 6747 25
## Yes 162 66
round(prop.table(table(Default[s1,1],xp),1),2)
## xp
## 0 1
## No 1.00 0.00
## Yes 0.71 0.29
xp=predict(x, newdata=Default[-s1,-1],type="response")
xp[xp>=0.5]=1
xp[xp<0.5]=0
table(Default[-s1,1],xp)
## xp
## 0 1
## No 2879 16
## Yes 64 41
round(prop.table(table(Default[-s1,1],xp),1),2)
## xp
## 0 1
## No 0.99 0.01
## Yes 0.61 0.39
Why Logistic? Similarities of Logistic and Normal densities (pdf)
x=seq(-5,5,.02)
plot(x,dnorm(x),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Logistic Densities", ylab ="Probability", xlab ="Quantiles")
points(x, 1.5*exp(-1.5*x)/(1+exp(-1.5*x))^2, col=5)

Why Logistic? Similarities of Logistic and Normal cumulative distributions (cdf)
x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Logistic Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x,1/(1+exp(-1.5*x))^2,col=5)

Why Multinomial Logit? Similarities of Gumbel and Normal densities (pdf)
x=seq(-5,5,.02)
plot(x,1/(2*pi)^0.5 * exp(-0.5 * x^2),type="p", ylim=c(0,0.5), col=4,main="Similarity of Standard Normal and Gumbel densities", ylab ="Probability", xlab ="Quantiles")
points(x,1.1*exp(-1.1*x)*exp(-exp(-1.1*x)),col=11)

Why Multinomial Logit? Similarities of Gumbel and Normal cumulative distributions (cdf)
x=seq(-5,5,.02)
plot(x,pnorm(x),type="p",ylim=c(0,1),col=4,main="Standard Normal and Gumbel Cumulative distributions",ylab="Cumulative Probability",xlab="Quantiles")
points(x,exp(-exp(-1.1*x)),col=11)

Read fasion.cv file using the read.csv function into R.
It can be downloaded from data sets folder in Course Material in Chalk
Use package nnet in R to estimate Multinomial Logit model
fashion=read.csv('/Users/codethedral/Google Drive/MScA/MSCA_31008 Data Mining/Lecture 5/fashion_2000_from_chalk.csv')
#levels(fashion[,1])=levels(fashion[,1])[c(2,3,1)]
summary(fashion)
## RATING SEX AGE FASHION QUALITY
## High : 424 Female:1157 16-24:915 Modern :1016 High:1002
## Low : 464 Male : 843 25-39:426 Traditional: 984 Low : 998
## Medium:1112 40+ :659
## PRICE
## Higher:1012
## Lower : 988
##
require(nnet)
## Loading required package: nnet
x=multinom(relevel(RATING,ref="Low")~.,data=fashion)
## # weights: 24 (14 variable)
## initial value 2197.224577
## iter 10 value 1666.158060
## iter 20 value 1615.993787
## final value 1615.988314
## converged
summary(x)
## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion)
##
## Coefficients:
## (Intercept) SEXMale AGE25-39 AGE40+ FASHIONTraditional
## High 1.021470 -0.0464043 0.5712490 0.5238451 -3.391201
## Medium 1.819807 0.1488543 0.3402063 0.3911855 -1.867446
## QUALITYLow PRICELower
## High -1.9924648 2.570459
## Medium -0.8218136 1.279181
##
## Std. Errors:
## (Intercept) SEXMale AGE25-39 AGE40+ FASHIONTraditional
## High 0.2012743 0.1661045 0.2130375 0.1876391 0.1909015
## Medium 0.1643670 0.1250484 0.1633628 0.1410642 0.1417692
## QUALITYLow PRICELower
## High 0.1719416 0.1775988
## Medium 0.1267600 0.1316249
##
## Residual Deviance: 3231.977
## AIC: 3259.977
Creating Prediction Tables for Multinomial Model Predictions
table(fashion$RATING,predict(x,type="class"))
##
## Low High Medium
## High 6 149 269
## Low 198 0 266
## Medium 122 102 888
prop.table(table(fashion$RATING,predict(x,type="class")),2)
##
## Low High Medium
## High 0.01840491 0.59362550 0.18903725
## Low 0.60736196 0.00000000 0.18692902
## Medium 0.37423313 0.40637450 0.62403373
prop.table(table(fashion$RATING,predict(x,type="class")),1)
##
## Low High Medium
## High 0.01415094 0.35141509 0.63443396
## Low 0.42672414 0.00000000 0.57327586
## Medium 0.10971223 0.09172662 0.79856115
Performing training-holdout validations for Multinomial Logit Model
s1=sample(1:nrow(fashion),.7*nrow(fashion),replace=FALSE)
x=multinom(relevel(RATING,ref="Low")~.,data=fashion[s1,])
## # weights: 24 (14 variable)
## initial value 1538.057204
## iter 10 value 1146.202953
## iter 20 value 1136.360627
## final value 1136.360348
## converged
summary(x)
## Call:
## multinom(formula = relevel(RATING, ref = "Low") ~ ., data = fashion[s1,
## ])
##
## Coefficients:
## (Intercept) SEXMale AGE25-39 AGE40+ FASHIONTraditional
## High 1.195247 -0.08355537 0.6113377 0.3902596 -3.294032
## Medium 1.851715 0.15477551 0.4039291 0.4249496 -1.876347
## QUALITYLow PRICELower
## High -2.0824732 2.580337
## Medium -0.8694767 1.275182
##
## Std. Errors:
## (Intercept) SEXMale AGE25-39 AGE40+ FASHIONTraditional
## High 0.2382692 0.1985659 0.2522676 0.2230830 0.2247770
## Medium 0.1990181 0.1527110 0.1983279 0.1707263 0.1714455
## QUALITYLow PRICELower
## High 0.2052496 0.2109767
## Medium 0.1544662 0.1602237
##
## Residual Deviance: 2272.721
## AIC: 2300.721
table(fashion$RATING[s1],predict(x,type="class"))
##
## Low High Medium
## High 5 112 201
## Low 147 0 170
## Medium 87 69 609
prop.table(table(fashion$RATING[s1],predict(x,type="class")),2)
##
## Low High Medium
## High 0.0209205 0.6187845 0.2051020
## Low 0.6150628 0.0000000 0.1734694
## Medium 0.3640167 0.3812155 0.6214286
prop.table(table(fashion$RATING[s1],predict(x,type="class")),1)
##
## Low High Medium
## High 0.01572327 0.35220126 0.63207547
## Low 0.46372240 0.00000000 0.53627760
## Medium 0.11372549 0.09019608 0.79607843
table(fashion$RATING[-s1],predict(x,newdata=fashion[-s1,], type="class"))
##
## Low High Medium
## High 1 37 68
## Low 51 0 96
## Medium 35 33 279
prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),1)
##
## Low High Medium
## High 0.009433962 0.349056604 0.641509434
## Low 0.346938776 0.000000000 0.653061224
## Medium 0.100864553 0.095100865 0.804034582
prop.table(table(fashion$RATING[-s1], predict(x, newdata=fashion[-s1,], type="class")),2)
##
## Low High Medium
## High 0.01149425 0.52857143 0.15349887
## Low 0.58620690 0.00000000 0.21670429
## Medium 0.40229885 0.47142857 0.62979684