# Multiple Logistic Regression (Based on Class Example)
# Purchased or Not Dataset
# NOTE(review): the path below is an absolute path on one specific machine;
# adjust it before running the script anywhere else.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# strings to factors by default in R 4.0.0. Country and Purchased must be
# factors for the recorded str()/summary() output to hold and for the
# binomial glm() further down to accept Purchased as its response.
dataset <- read.csv(
  "C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\Logistic Reg\\Data.csv",
  stringsAsFactors = TRUE
)
# Structure check: column types and the first few values of each column.
str(dataset)
## 'data.frame': 10 obs. of 4 variables:
## $ Country : Factor w/ 3 levels "France","Germany",..: 1 3 2 3 2 1 3 1 2 1
## $ Age : int 44 27 30 38 40 35 NA 48 50 37
## $ Salary : int 72000 48000 54000 61000 NA 58000 52000 79000 83000 67000
## $ Purchased: Factor w/ 2 levels "No","Yes": 1 2 1 1 2 2 1 2 1 2
# attach() puts the columns of `dataset` on the search path so they can be
# referenced by bare name (e.g. `Country`, `Purchased`).
# NOTE(review): attach() is generally discouraged. Any statement in this
# script that uses a bare column name depends on this line, so remove it only
# after every such use has been switched to `dataset$<col>` or `data = dataset`.
attach(dataset)
# Per-column summary; the recorded output below shows one NA in Age and one
# NA in Salary.
summary(dataset)
## Country Age Salary Purchased
## France :4 Min. :27.00 Min. :48000 No :5
## Germany:3 1st Qu.:35.00 1st Qu.:54000 Yes:5
## Spain :3 Median :38.00 Median :61000
## Mean :38.78 Mean :63778
## 3rd Qu.:44.00 3rd Qu.:72000
## Max. :50.00 Max. :83000
## NA's :1 NA's :1
# Logistic Regression
# Inspect the Country factor. In the coefficient table recorded below it
# enters the model as CountryGermany / CountrySpain, with France as baseline.
str(dataset$Country)
## Factor w/ 3 levels "France","Germany",..: 1 3 2 3 2 1 3 1 2 1
# Fit Purchased on all three predictors. Passing `data = dataset` makes the
# fit self-contained: the variables are looked up in the data frame rather
# than on the search path, so the model does not depend on attach().
logit <- glm(
  Purchased ~ Country + Age + Salary,
  data = dataset,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the warnings recorded above, together with the near-zero
# residual deviance and the very large standard errors in the summary output,
# mean the fit separates the complete cases perfectly. The coefficient
# estimates and p-values below should not be interpreted at face value.
summary(logit)
##
## Call:
## glm(formula = Purchased ~ Country + Age + Salary, family = binomial)
##
## Deviance Residuals:
## 1 2 3 4 6 8
## -2.347e-05 2.110e-08 -4.216e-06 -2.110e-08 1.672e-05 1.968e-05
## 9 10
## -2.110e-08 2.110e-08
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.716e+01 2.665e+05 0.000 1.000
## CountryGermany -3.850e+02 4.462e+05 -0.001 0.999
## CountrySpain -6.723e+01 4.745e+07 0.000 1.000
## Age -1.335e+02 1.492e+05 -0.001 0.999
## Salary 8.260e-02 9.276e+01 0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.1090e+01 on 7 degrees of freedom
## Residual deviance: 1.2358e-09 on 3 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: 10
##
## Number of Fisher Scoring iterations: 25
library(MASS)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
# Stepwise model selection by AIC (MASS::stepAIC), starting from the full
# model `logit`. In the recorded trace below, keeping every term
# (<none>, AIC 10) scores better than dropping any single term.
# NOTE(review): `x` is not used again in the visible part of the script.
x <- stepAIC(logit)
## Start: AIC=10
## Purchased ~ Country + Age + Salary
##
## Df Deviance AIC
## <none> 0.0000 10.000
## - Salary 1 5.0374 13.037
## - Age 1 5.7071 13.707
## - Country 2 10.1572 16.157
# Display the Coefficients
coef(logit) # Estimated coefficients (betas) on the log-odds scale
## (Intercept) CountryGermany CountrySpain Age Salary
## -97.15707249 -385.02051152 -67.23103769 -133.45752274 0.08260105
# Odds Ratio
exp(coef(logit)) # Exponentiated coefficients: odds ratios for the predictors (baseline odds for the intercept)
## (Intercept) CountryGermany CountrySpain Age Salary
## 6.385862e-43 6.133614e-168 6.337695e-30 1.096817e-58 1.086108e+00
# Confusion Matrix Table
# Predicted probabilities for every row of `dataset`; rows with a missing
# predictor get NA and therefore drop out of the table below.
prob <- predict(logit, newdata = dataset, type = "response")
# Classify at a 0.5 cut-off. Fixing both levels keeps the table 2 x 2 even
# when every prediction falls on the same side of the cut-off, so diag()
# below always pairs FALSE with "No" and TRUE with "Yes".
predicted <- factor(prob > 0.5, levels = c(FALSE, TRUE))
# Keep `prob` as a one-column data frame and `final` as probabilities bound
# to the original rows, as before.
prob <- as.data.frame(prob)
final <- cbind(prob, dataset)
#dim(final)
# Observed labels are taken from `dataset` directly instead of relying on an
# attached `Purchased`; `dnn` keeps the printed header unchanged.
confusion <- table(predicted, dataset$Purchased, dnn = c("", "Purchased"))
confusion
## Purchased
## No Yes
## FALSE 4 0
## TRUE 0 4
# Model Accuracy
# Correct predictions (the diagonal) divided by all classified observations.
Accuracy <- sum(diag(confusion)) / sum(confusion)
Accuracy
## [1] 1
# Generalised variance-inflation factors for the model terms (car::vif).
# In the recorded output below, Age and Salary both have GVIF above 900,
# which points to strong collinearity between them in this sample.
vif(logit)
## GVIF Df GVIF^(1/(2*Df))
## Country 4.738607 2 1.47541
## Age 941.342243 1 30.68130
## Salary 908.266996 1 30.13747
# ROC Curve (ROC = Receiver Operating Characteristic)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Build the ROCR prediction object from the predicted probabilities and the
# observed labels, read straight from `dataset` rather than from an attached
# `Purchased`.
rocrpred <- prediction(prob, dataset$Purchased) # Create prediction object
# True-positive rate against false-positive rate, i.e. the ROC curve.
rocrperf <- performance(rocrpred, "tpr", "fpr") # Create performance object
# TRUE instead of T: T is an ordinary variable that can be reassigned.
plot(rocrperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

# The larger the area under the ROC curve (AUC), the better the logistic regression model separates the two classes.