Multiple Logistic Regression (Based on Class Example)

# Purchased or Not Dataset
dataset <- read.csv("C:\\Users\\samy_\\Desktop\\R_Python_Machine Learning DataSets\\Logistic Reg\\Data.csv")
str(dataset)
## 'data.frame':    10 obs. of  4 variables:
##  $ Country  : Factor w/ 3 levels "France","Germany",..: 1 3 2 3 2 1 3 1 2 1
##  $ Age      : int  44 27 30 38 40 35 NA 48 50 37
##  $ Salary   : int  72000 48000 54000 61000 NA 58000 52000 79000 83000 67000
##  $ Purchased: Factor w/ 2 levels "No","Yes": 1 2 1 1 2 2 1 2 1 2
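# Note: France is the first level of Country, so it is the reference
# category and glm() will report CountryGermany and CountrySpain contrasts
# against France. A sketch of switching the baseline with relevel()
# (illustration only; not applied to the data used below):
country_alt <- relevel(dataset$Country, ref = "Germany")
levels(country_alt) # "Germany" "France" "Spain"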
attach(dataset)
summary(dataset)
##     Country       Age            Salary      Purchased
##  France :4   Min.   :27.00   Min.   :48000   No :5    
##  Germany:3   1st Qu.:35.00   1st Qu.:54000   Yes:5    
##  Spain  :3   Median :38.00   Median :61000            
##              Mean   :38.78   Mean   :63778            
##              3rd Qu.:44.00   3rd Qu.:72000            
##              Max.   :50.00   Max.   :83000            
##              NA's   :1       NA's   :1
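# The summary shows one NA in Age and one in Salary; glm() silently drops
# those rows (na.omit is the default na.action). A sketch of simple mean
# imputation, if keeping all 10 rows were preferred (illustration only,
# not applied below; 'dataset_imp' is a hypothetical copy):
dataset_imp <- dataset
dataset_imp$Age[is.na(dataset_imp$Age)] <- mean(dataset$Age, na.rm = TRUE)
dataset_imp$Salary[is.na(dataset_imp$Salary)] <- mean(dataset$Salary, na.rm = TRUE)
summary(dataset_imp) # no NA's remain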
# Logistic Regression
str(Country)
##  Factor w/ 3 levels "France","Germany",..: 1 3 2 3 2 1 3 1 2 1
logit <- glm(Purchased ~ Country + Age + Salary, family = binomial) # uses the attached columns; rows with NA are dropped
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
## 
## Call:
## glm(formula = Purchased ~ Country + Age + Salary, family = binomial)
## 
## Deviance Residuals: 
##          1           2           3           4           6           8  
## -2.347e-05   2.110e-08  -4.216e-06  -2.110e-08   1.672e-05   1.968e-05  
##          9          10  
## -2.110e-08   2.110e-08  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -9.716e+01  2.665e+05   0.000    1.000
## CountryGermany -3.850e+02  4.462e+05  -0.001    0.999
## CountrySpain   -6.723e+01  4.745e+07   0.000    1.000
## Age            -1.335e+02  1.492e+05  -0.001    0.999
## Salary          8.260e-02  9.276e+01   0.001    0.999
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.1090e+01  on 7  degrees of freedom
## Residual deviance: 1.2358e-09  on 3  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25
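# The two warnings above signal complete separation: with 5 coefficients
# fitted to only 8 complete rows, the model reproduces the training labels
# exactly (residual deviance ~ 0), the estimates diverge, and the standard
# errors explode, which is why every p-value is near 1. One standard remedy
# is Firth's penalized likelihood. A sketch, assuming the 'logistf' package
# is installed (it is not used elsewhere in this example):
library(logistf)
dat_cc <- na.omit(dataset)
dat_cc$y <- as.numeric(dat_cc$Purchased == "Yes") # logistf expects a 0/1 response
firth <- logistf(y ~ Country + Age + Salary, data = dat_cc)
summary(firth) # finite, penalized estimates even under separation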
library(MASS)  # provides stepAIC()
library(caret) # classification utilities (confusionMatrix, etc.)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)   # provides vif()
## Loading required package: carData
x <- stepAIC(logit) # backward stepwise selection by AIC
## Start:  AIC=10
## Purchased ~ Country + Age + Salary
## 
##           Df Deviance    AIC
## <none>         0.0000 10.000
## - Salary   1   5.0374 13.037
## - Age      1   5.7071 13.707
## - Country  2  10.1572 16.157
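# stepAIC() keeps the full model: dropping Salary, Age, or Country would
# raise the AIC above 10. The returned object is the selected fit itself;
# its $anova component records the (here empty) elimination path:
x$anova
formula(x) # Purchased ~ Country + Age + Salary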
# Display the Coefficients
coef(logit) # Coefficients on the log-odds scale
##    (Intercept) CountryGermany   CountrySpain            Age         Salary 
##   -97.15707249  -385.02051152   -67.23103769  -133.45752274     0.08260105
# Odds Ratio
exp(coef(logit)) # Exponentiate the coefficients to get odds ratios
##    (Intercept) CountryGermany   CountrySpain            Age         Salary 
##   6.385862e-43  6.133614e-168   6.337695e-30   1.096817e-58   1.086108e+00
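# Reading: an odds ratio above 1 multiplies the odds of Purchased = "Yes"
# per one-unit increase in the predictor (e.g. 1.086 per extra unit of
# Salary); on this separated fit, though, none of the estimates are
# trustworthy. For the usual reporting pattern, odds ratios come with
# confidence intervals, e.g. Wald intervals (a sketch; the intervals here
# are astronomically wide):
exp(cbind(OR = coef(logit), confint.default(logit)))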
# Confusion Matrix Table
prob <- predict(logit, newdata = dataset, type = "response") # rows with NA predictors get NA probabilities
prob <- as.data.frame(prob)
final <- cbind(prob, dataset)
#dim(final)
confusion <- table(prob > 0.5, Purchased) # rows with NA predictions are dropped by table()
confusion
##        Purchased
##         No Yes
##   FALSE  4   0
##   TRUE   0   4
# Model Accuracy (on the training data)
Accuracy <- sum(diag(confusion)) / sum(confusion)
Accuracy
## [1] 1
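# Perfect accuracy is a symptom of the separation warning, not evidence of
# a good model: these are the same 8 training rows the fit memorized. For
# richer metrics, caret (loaded above) offers confusionMatrix(); a sketch,
# with "Yes" as the assumed positive class:
ok <- !is.na(final$prob) # keep the 8 complete cases
pred_class <- factor(ifelse(final$prob[ok] > 0.5, "Yes", "No"),
                     levels = c("No", "Yes"))
confusionMatrix(pred_class, dataset$Purchased[ok], positive = "Yes")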
vif(logit)
##               GVIF Df GVIF^(1/(2*Df))
## Country   4.738607  2         1.47541
## Age     941.342243  1        30.68130
## Salary  908.266996  1        30.13747
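# The GVIFs for Age and Salary are huge, which points to near-collinearity
# between the two predictors; a quick check on the complete rows confirms it:
cor(dataset$Age, dataset$Salary, use = "complete.obs") # about 0.98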
# ROC Curve (Receiver Operating Characteristic)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
ok <- !is.na(final$prob) # complete cases only; NA predictions carry no signal
rocrpred <- prediction(final$prob[ok], dataset$Purchased[ok]) # Create prediction object
rocrperf <- performance(rocrpred, 'tpr', 'fpr') # true-positive rate vs false-positive rate

plot(rocrperf, colorize = TRUE, text.adj = c(-0.2, 1.7)) # ROC curve colored by score cutoff

# The larger the area under the ROC curve (AUC), the better the logistic regression model discriminates between the two classes.
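# ROCR can report that area directly from the same prediction object:
performance(rocrpred, measure = "auc")@y.values[[1]] # 1 here, consistent with the separated training fit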