Problem Definition
It is estimated that 10,000 people die each year worldwide due to hurricanes and tropical storms, with the majority of deaths caused by flooding. Hurricane Irma hit Florida as a Category 4 storm on the morning of Sept. 10, 2017, ripping off roofs, flooding coastal cities, and knocking out power to millions of people. The storm and its aftermath killed at least 38 people in the Caribbean, 34 in Florida, three in Georgia, four in South Carolina, and one in North Carolina. The rising frequency of these natural disasters is a concern for the United Nations; the World Meteorological Organization (WMO, a specialized agency of the UN) has been collecting data about individuals living in and around hurricane- and cyclone-prone areas. In the aftermath of Irma, the WMO wants to find a pattern or relation among these attributes that indicates whether an individual will SURVIVE or NOT SURVIVE a hurricane/cyclone in the near future.

Data Description
DATE OF BIRTH
ENDURANCE LEVEL
MARITAL STATUS
FAVOURITE SPORT
ANNUAL SALARY
FAVOURITE COLOUR
EDUCATION
SOURCE OF NEWS
EMPLOYMENT
DISTANCE FROM COAST
RELIGIOUS ORIENTATION
MONTHLY TRAVEL
FAVOURITE TV SHOW
PREFERRED GENRE OF MOVIES
PREFERRED BRAND OF CAR
FAVOURITE SUBJECT
GENDER
PREFERRED CHOICE OF ALCOHOL
FAVOURITE CUISINE
FAVOURITE SUPERHERO
FAVOURITE GENRE OF MUSIC
CLASS

Setup

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra) 
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(Deducer)
## Loading required package: JGR
## Loading required package: rJava
## Loading required package: JavaGD
## Loading required package: iplots
## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().
library(caret)
## Loading required package: lattice
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete

Functions

detect_outliers <- function(inp, na.rm=TRUE) {
  # Flag values lying more than 2.5 * IQR below Q1 or above Q3
  i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
  i.max <- 2.5 * IQR(inp, na.rm=na.rm)
  otp <- inp
  otp[inp < (i.qnt[1] - i.max)] <- NA
  otp[inp > (i.qnt[2] + i.max)] <- NA
  # Return the flagged values; note that original NAs in inp also surface
  # here, which is why columns with missing data show NA vectors below
  inp[is.na(otp)]
}

detect_na <- function(inp) {
  # Count the missing values in a vector
  sum(is.na(inp))
}

Graph_Boxplot <- function(input, na.rm=TRUE) {
  # Boxplot of a single column to visualise potential outliers
  ggplot(dfrModel, aes(x="", y=input)) +
    geom_boxplot(aes(fill=input), color="green") +
    labs(title="Outliers")
}
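
As a quick sanity check, here is what detect_outliers returns for a toy vector (illustrative only, not part of the original script):

# 100 lies above Q3 + 2.5 * IQR of c(1:10, 100), so it is the only value returned
detect_outliers(c(1:10, 100))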

Dataset

setwd("D:/Welingkar/Competitions/Cognetix")
dfrModel <- read.csv("./Training_Data.csv", header=T, stringsAsFactors=F)
head(dfrModel)
##   Age M_STATUS SALARY EDU_DATA EMP_DATA REL_ORIEN FAV_TV PREF_CAR GENDER
## 1  53        1      1        4        0         1      1        1      1
## 2  36        0      4        2        1         1      2        2      1
## 3  39        1      2        2        1         1      3        3      1
## 4  39        0      4        3        1         2      4        4      1
## 5  49        0      5        2        0         2      5        5      1
## 6  47        0      2        1        1         1      4        6      0
##   FAV_CUIS FAV_MUSIC ENDU_LEVEL FAV_SPORT FAV_COLR NEWS_SOURCE
## 1        1         1          3         1        1           1
## 2        2         2          5         2        2           2
## 3        1         1          3         2        3           3
## 4        3         1          2         3        1           4
## 5        4         1          1         3        4           3
## 6        5         3          3         4        3           5
##   DIST_FRM_COAST MNTLY_TRAVEL GEN_MOVIES FAV_SUBJ ALCOHOL FAV_SUPERHERO
## 1              2            2          1        1       1             1
## 2              5            3          2        2       2             2
## 3              2            1          3        3       3             3
## 4              1            1          4        4       4             4
## 5              2            1          2        5       5             2
## 6              1            1          1        1       5             2
##   Dist_Coast Class
## 1        462     0
## 2        269     0
## 3       1308     1
## 4       1039     0
## 5        272     0
## 6        603     0

Observation
The data is numeric only.
Since there are no alphanumeric columns, no type conversions are needed and no columns need to be dropped at this stage.
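
A minimal programmatic check of this claim (not in the original script) would be:

# Confirm that every column is numeric or integer
sapply(dfrModel, class)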

Outliers Data

#detect_outliers(dfrModel$Age)
lapply(dfrModel, FUN=detect_outliers)
## $Age
## integer(0)
## 
## $M_STATUS
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 
## $SALARY
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 
## $EDU_DATA
## [1] NA NA NA NA
## 
## $EMP_DATA
## integer(0)
## 
## $REL_ORIEN
## integer(0)
## 
## $FAV_TV
## integer(0)
## 
## $PREF_CAR
## integer(0)
## 
## $GENDER
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 
## $FAV_CUIS
## integer(0)
## 
## $FAV_MUSIC
## integer(0)
## 
## $ENDU_LEVEL
## integer(0)
## 
## $FAV_SPORT
## integer(0)
## 
## $FAV_COLR
## integer(0)
## 
## $NEWS_SOURCE
## integer(0)
## 
## $DIST_FRM_COAST
## integer(0)
## 
## $MNTLY_TRAVEL
## [1] NA NA NA NA NA
## 
## $GEN_MOVIES
## integer(0)
## 
## $FAV_SUBJ
## integer(0)
## 
## $ALCOHOL
## [1] NA NA NA
## 
## $FAV_SUPERHERO
## integer(0)
## 
## $Dist_Coast
## integer(0)
## 
## $Class
## integer(0)

Outliers Graph

lapply(dfrModel, FUN=Graph_Boxplot)
## $Age

## 
## $M_STATUS
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).

## 
## $SALARY
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).

## 
## $EDU_DATA
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

## 
## $EMP_DATA

## 
## $REL_ORIEN

## 
## $FAV_TV

## 
## $PREF_CAR

## 
## $GENDER
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

## 
## $FAV_CUIS

## 
## $FAV_MUSIC

## 
## $ENDU_LEVEL

## 
## $FAV_SPORT

## 
## $FAV_COLR

## 
## $NEWS_SOURCE

## 
## $DIST_FRM_COAST

## 
## $MNTLY_TRAVEL
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

## 
## $GEN_MOVIES

## 
## $FAV_SUBJ

## 
## $ALCOHOL
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

## 
## $FAV_SUPERHERO

## 
## $Dist_Coast

## 
## $Class

Observations
The NA vectors above come from columns with missing values (detect_outliers passes original NAs through), so there are only a few, if any, genuine outliers.
We will keep the outliers in the data.
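
If outlier treatment were preferred over keeping them, one option is to cap values at the same 2.5 * IQR fences that detect_outliers uses; a minimal sketch, with cap_outliers as a hypothetical helper:

# Cap extreme values at the 2.5 * IQR fences instead of keeping or dropping them
cap_outliers <- function(inp, na.rm=TRUE) {
  i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
  i.max <- 2.5 * IQR(inp, na.rm=na.rm)
  pmin(pmax(inp, i.qnt[1] - i.max), i.qnt[2] + i.max)
}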

Train and test Data

dim(dfrModel)
## [1] 5020   23

Observations
There are 5020 records and 23 columns in the dataset.
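
No explicit train/test split is performed here because a separate Test_Data.csv is scored later; if a stratified hold-out split were wanted, the already-loaded caret package could provide one (a sketch; dfrTrain and dfrTest are illustrative names):

# Stratified 80/20 split on the outcome Class
set.seed(123)
idx <- createDataPartition(factor(dfrModel$Class), p=0.8, list=FALSE)
dfrTrain <- dfrModel[idx, ]
dfrTest  <- dfrModel[-idx, ]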

Missing Data

#sum(is.na(dfrModel$Age))
lapply(dfrModel, FUN=detect_na)
## $Age
## [1] 0
## 
## $M_STATUS
## [1] 23
## 
## $SALARY
## [1] 18
## 
## $EDU_DATA
## [1] 4
## 
## $EMP_DATA
## [1] 0
## 
## $REL_ORIEN
## [1] 0
## 
## $FAV_TV
## [1] 0
## 
## $PREF_CAR
## [1] 0
## 
## $GENDER
## [1] 16
## 
## $FAV_CUIS
## [1] 0
## 
## $FAV_MUSIC
## [1] 0
## 
## $ENDU_LEVEL
## [1] 0
## 
## $FAV_SPORT
## [1] 0
## 
## $FAV_COLR
## [1] 0
## 
## $NEWS_SOURCE
## [1] 0
## 
## $DIST_FRM_COAST
## [1] 0
## 
## $MNTLY_TRAVEL
## [1] 5
## 
## $GEN_MOVIES
## [1] 0
## 
## $FAV_SUBJ
## [1] 0
## 
## $ALCOHOL
## [1] 3
## 
## $FAV_SUPERHERO
## [1] 0
## 
## $Dist_Coast
## [1] 0
## 
## $Class
## [1] 0
dfrModel <- dfrModel[complete.cases(dfrModel), ]
lapply(dfrModel, FUN=detect_na)
## $Age
## [1] 0
## 
## $M_STATUS
## [1] 0
## 
## $SALARY
## [1] 0
## 
## $EDU_DATA
## [1] 0
## 
## $EMP_DATA
## [1] 0
## 
## $REL_ORIEN
## [1] 0
## 
## $FAV_TV
## [1] 0
## 
## $PREF_CAR
## [1] 0
## 
## $GENDER
## [1] 0
## 
## $FAV_CUIS
## [1] 0
## 
## $FAV_MUSIC
## [1] 0
## 
## $ENDU_LEVEL
## [1] 0
## 
## $FAV_SPORT
## [1] 0
## 
## $FAV_COLR
## [1] 0
## 
## $NEWS_SOURCE
## [1] 0
## 
## $DIST_FRM_COAST
## [1] 0
## 
## $MNTLY_TRAVEL
## [1] 0
## 
## $GEN_MOVIES
## [1] 0
## 
## $FAV_SUBJ
## [1] 0
## 
## $ALCOHOL
## [1] 0
## 
## $FAV_SUPERHERO
## [1] 0
## 
## $Dist_Coast
## [1] 0
## 
## $Class
## [1] 0

Observations
Since only a small number of records (69 of 5020) had missing values, we removed them from the analysis.

dim(dfrModel)
## [1] 4951   23
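
As an alternative to dropping the 69 incomplete records, the already-loaded mice package could impute them in place of the complete.cases() filter above; a sketch (the m, method, and seed values are illustrative choices):

# Multiple imputation by chained equations; "pmm" = predictive mean matching
impObj <- mice(dfrModel, m=5, method="pmm", seed=123, printFlag=FALSE)
dfrImputed <- complete(impObj, 1)  # extract the first completed dataset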

Summary

#summary(dfrModel)
lapply(dfrModel, FUN=summary)
## $Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    22.0    31.0    40.0    39.7    48.0    58.0 
## 
## $M_STATUS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6393  1.0000  2.0000 
## 
## $SALARY
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.382   4.000   6.000 
## 
## $EDU_DATA
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   2.495   3.000   4.000 
## 
## $EMP_DATA
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.012   2.000   2.000 
## 
## $REL_ORIEN
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.588   2.000   3.000 
## 
## $FAV_TV
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   5.000   5.228   7.000  10.000 
## 
## $PREF_CAR
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   4.367   7.000  14.000 
## 
## $GENDER
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.6076  1.0000  2.0000 
## 
## $FAV_CUIS
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   4.261   6.000   9.000 
## 
## $FAV_MUSIC
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.604   6.000   8.000 
## 
## $ENDU_LEVEL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   3.000   4.094   6.000   8.000 
## 
## $FAV_SPORT
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.666   5.000   8.000 
## 
## $FAV_COLR
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   5.000   4.845   7.000   9.000 
## 
## $NEWS_SOURCE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   4.372   6.000   8.000 
## 
## $DIST_FRM_COAST
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.704   4.000   5.000 
## 
## $MNTLY_TRAVEL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    2.00    2.58    4.00    6.00 
## 
## $GEN_MOVIES
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   3.804   5.000   7.000 
## 
## $FAV_SUBJ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   3.000   5.000   5.123   8.000  10.000 
## 
## $ALCOHOL
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    5.00    4.44    6.00    7.00 
## 
## $FAV_SUPERHERO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.565   5.000   7.000 
## 
## $Dist_Coast
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    40.0   388.0   756.0   761.7  1126.0  1500.0 
## 
## $Class
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4866  1.0000  1.0000

Outliers Data

#detect_outliers(dfrModel$Age)
lapply(dfrModel, FUN=detect_outliers)
## $Age
## integer(0)
## 
## $M_STATUS
## integer(0)
## 
## $SALARY
## integer(0)
## 
## $EDU_DATA
## integer(0)
## 
## $EMP_DATA
## integer(0)
## 
## $REL_ORIEN
## integer(0)
## 
## $FAV_TV
## integer(0)
## 
## $PREF_CAR
## integer(0)
## 
## $GENDER
## integer(0)
## 
## $FAV_CUIS
## integer(0)
## 
## $FAV_MUSIC
## integer(0)
## 
## $ENDU_LEVEL
## integer(0)
## 
## $FAV_SPORT
## integer(0)
## 
## $FAV_COLR
## integer(0)
## 
## $NEWS_SOURCE
## integer(0)
## 
## $DIST_FRM_COAST
## integer(0)
## 
## $MNTLY_TRAVEL
## integer(0)
## 
## $GEN_MOVIES
## integer(0)
## 
## $FAV_SUBJ
## integer(0)
## 
## $ALCOHOL
## integer(0)
## 
## $FAV_SUPERHERO
## integer(0)
## 
## $Dist_Coast
## integer(0)
## 
## $Class
## integer(0)

Observation
No outliers remain, so we will continue with the data as-is.
The values flagged earlier were eliminated along with the incomplete records (they were the missing entries themselves), which should help produce better results.

Correlation

# Compute the correlation of every column with the outcome Class
vctCorr <- numeric(0)
for (i in names(dfrModel)) {
    cor.result <- cor(as.numeric(dfrModel$Class), as.numeric(dfrModel[, i]))
    vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
##            Age       M_STATUS         SALARY       EDU_DATA       EMP_DATA 
##  -0.0065161041   0.0041168084  -0.0033321482   0.0127697472   0.0098060088 
##      REL_ORIEN         FAV_TV       PREF_CAR         GENDER       FAV_CUIS 
##   0.0058868906  -0.0191763388   0.0103658628  -0.0004937748  -0.0096656525 
##      FAV_MUSIC     ENDU_LEVEL      FAV_SPORT       FAV_COLR    NEWS_SOURCE 
##  -0.0113882550  -0.0072796138  -0.0103530250  -0.0214460090  -0.0133743018 
## DIST_FRM_COAST   MNTLY_TRAVEL     GEN_MOVIES       FAV_SUBJ        ALCOHOL 
##   0.0182963567   0.0164257008  -0.0024826225  -0.0117980186   0.0078620834 
##  FAV_SUPERHERO     Dist_Coast          Class 
##  -0.0033929827   0.0101736822   1.0000000000
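
The loop above can also be collapsed into a single vectorised call; this one-liner is equivalent:

# Vectorised equivalent of the correlation loop
sapply(dfrModel, function(x) cor(as.numeric(dfrModel$Class), as.numeric(x)))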

Data For Visualization

dfrGraph <- gather(dfrModel, variable, value, -Class)
head(dfrGraph)
##   Class variable value
## 1     0      Age    53
## 2     0      Age    36
## 3     1      Age    39
## 4     0      Age    39
## 5     0      Age    49
## 6     0      Age    47

Data Visualization

ggplot(dfrGraph) +
    geom_jitter(aes(value,Class, colour=variable)) + 
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Class With Other Features")

Observation
Per the graph and the correlations above, each feature has only a very weak relationship (|r| < 0.03) with the outcome variable.

Find Best Multi Logistic Model
Choose the best logistic model using step(), which performs stepwise selection by AIC.

stpModel=step(glm(data=dfrModel, formula=Class~., family=binomial), trace=0, steps=100)
summary(stpModel)
## 
## Call:
## glm(formula = Class ~ NEWS_SOURCE + DIST_FRM_COAST, family = binomial, 
##     data = dfrModel)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.262  -1.154  -1.102   1.201   1.255  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -0.08724    0.07166  -1.217   0.2235  
## NEWS_SOURCE    -0.03126    0.01515  -2.064   0.0390 *
## DIST_FRM_COAST  0.06293    0.02807   2.242   0.0249 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6860  on 4950  degrees of freedom
## Residual deviance: 6854  on 4948  degrees of freedom
## AIC: 6860
## 
## Number of Fisher Scoring iterations: 3

Observation
The best results are given by Class ~ NEWS_SOURCE + DIST_FRM_COAST.
The null deviance has 4951 - 1 = 4950 degrees of freedom, since only the intercept is estimated when just the outcome is considered.
The residual deviance has 4951 - 3 = 4948 degrees of freedom, since the model estimates three parameters (the intercept plus the two predictors).
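
step() penalises by AIC (k = 2) by default; a BIC-style penalty, which favours sparser models at this sample size, only requires changing k (an illustrative variant; bicModel is a hypothetical name):

# BIC-style stepwise selection: penalty k = log(n) instead of the AIC default k = 2
bicModel <- step(glm(Class ~ ., data=dfrModel, family=binomial),
                 trace=0, k=log(nrow(dfrModel)))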

Make Final Multi Logistic Model

# make model
mgmModel <- glm(data=dfrModel, formula=Class ~ NEWS_SOURCE + DIST_FRM_COAST , family=binomial(link="logit"))
# print summary
summary(mgmModel)
## 
## Call:
## glm(formula = Class ~ NEWS_SOURCE + DIST_FRM_COAST, family = binomial(link = "logit"), 
##     data = dfrModel)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.262  -1.154  -1.102   1.201   1.255  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -0.08724    0.07166  -1.217   0.2235  
## NEWS_SOURCE    -0.03126    0.01515  -2.064   0.0390 *
## DIST_FRM_COAST  0.06293    0.02807   2.242   0.0249 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6860  on 4950  degrees of freedom
## Residual deviance: 6854  on 4948  degrees of freedom
## AIC: 6860
## 
## Number of Fisher Scoring iterations: 3
# McFadden pseudo R-squared computed directly from the deviances
pR2 = 1 - mgmModel$deviance / mgmModel$null.deviance
pR2
## [1] 0.0008634399
# The same value from the log-likelihoods of the fitted and null (intercept-only) models
mgmModel_null <- glm(Class ~ 1, family = binomial, data = dfrModel)
pR21 = 1 - logLik(mgmModel)/logLik(mgmModel_null)
pR21
## 'log Lik.' 0.0008634399 (df=3)
library(rcompanion)
nagelkerke(mgmModel)
## $Models
##                                                                                         
## Model: "glm, Class ~ NEWS_SOURCE + DIST_FRM_COAST, binomial(link = \"logit\"), dfrModel"
## Null:  "glm, Class ~ 1, binomial(link = \"logit\"), dfrModel"                           
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                           0.00086344
## Cox and Snell (ML)                 0.00119564
## Nagelkerke (Cragg and Uhler)       0.00159457
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq  p.value
##       -2     -2.9616 5.9232 0.051737
## 
## $Number.of.observations
##            
## Model: 4951
## Null:  4951
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
pR2(mgmModel)
##           llh       llhNull            G2      McFadden          r2ML 
## -3.427023e+03 -3.429985e+03  5.923172e+00  8.634399e-04  1.195643e-03 
##          r2CU 
##  1.594575e-03

Confusion Matrix

prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrModel$Class)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd    0    1
##   0 2095 1948
##   1  447  461
##                                           
##                Accuracy : 0.5163          
##                  95% CI : (0.5022, 0.5303)
##     No Information Rate : 0.5134          
##     P-Value [Acc > NIR] : 0.3506          
##                                           
##                   Kappa : 0.0158          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8242          
##             Specificity : 0.1914          
##          Pos Pred Value : 0.5182          
##          Neg Pred Value : 0.5077          
##              Prevalence : 0.5134          
##          Detection Rate : 0.4231          
##    Detection Prevalence : 0.8166          
##       Balanced Accuracy : 0.5078          
##                                           
##        'Positive' Class : 0               
## 

Observations
Accuracy is only around 0.52, barely above the No Information Rate of 0.5134 (P-Value [Acc > NIR] = 0.35), so the model predicts scarcely better than always choosing the majority class.
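
The headline accuracy can be cross-checked directly from the 2x2 table:

# Accuracy = correct predictions / total = (2095 + 461) / 4951, about 0.5163
sum(diag(cnfmtrx)) / sum(cnfmtrx)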

Regression Data

dfrPlot <- mutate(dfrModel, PrdVal=prdVal, POutcome=prdBln)
#write.csv(dfrPlot, "Train_Predict.csv")
head(dfrPlot)
##   Age M_STATUS SALARY EDU_DATA EMP_DATA REL_ORIEN FAV_TV PREF_CAR GENDER
## 1  53        1      1        4        0         1      1        1      1
## 2  36        0      4        2        1         1      2        2      1
## 3  39        1      2        2        1         1      3        3      1
## 4  39        0      4        3        1         2      4        4      1
## 5  49        0      5        2        0         2      5        5      1
## 6  47        0      2        1        1         1      4        6      0
##   FAV_CUIS FAV_MUSIC ENDU_LEVEL FAV_SPORT FAV_COLR NEWS_SOURCE
## 1        1         1          3         1        1           1
## 2        2         2          5         2        2           2
## 3        1         1          3         2        3           3
## 4        3         1          2         3        1           4
## 5        4         1          1         3        4           3
## 6        5         3          3         4        3           5
##   DIST_FRM_COAST MNTLY_TRAVEL GEN_MOVIES FAV_SUBJ ALCOHOL FAV_SUPERHERO
## 1              2            2          1        1       1             1
## 2              5            3          2        2       2             2
## 3              2            1          3        3       3             3
## 4              1            1          4        4       4             4
## 5              2            1          2        5       5             2
## 6              1            1          1        1       5             2
##   Dist_Coast Class    PrdVal POutcome
## 1        462     0 0.5018412        1
## 2        269     0 0.5411325        1
## 3       1308     1 0.4862125        0
## 4       1039     0 0.4627287        0
## 5        272     0 0.4862125        0
## 6        603     0 0.4549658        0

Regression Visualization

#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome))  + 
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
    labs(title="Binomial Regression Curve") +
    labs(x="") +
    labs(y="")

ROC Visualization

#rocplot(logistic.model,diag=TRUE,pred.prob.labels=FALSE,prob.label.digits=3,AUC=TRUE)
rocplot(mgmModel)

Observations
The closer the ROC curve gets to the optimal point of perfect prediction, the closer the AUC gets to 1.
Here the AUC is around 0.52, consistent with the model accuracy of around 0.52; both sit only marginally above 0.5, so the model has little discriminative power.
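
To read the AUC as a number rather than off the plot, the pROC package (assumed installed; it is not loaded in Setup) offers one way:

# Numeric AUC for the in-sample predicted probabilities
library(pROC)
rocObj <- roc(dfrModel$Class, prdVal)
auc(rocObj)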

Test Data

Dataset

setwd("D:/Welingkar/Competitions/Cognetix/CSV Files")
dfrTests <- read.csv("./Test_Data.csv", header=T, stringsAsFactors=F)
head(dfrTests)
##   Age M_STATUS Salary EDU_DATA EMP_DATA REL_ORIEN FAV_TV PREF_CAR GENDER
## 1  38        0      1        1        2         1      4        8      1
## 2  46        0      2        4        0         1      5        2      0
## 3  27        1      3        3        2         2      1       10      1
## 4  56        1      2        2        2         2      1        4      0
## 5  28        0      3       NA        0         2      8        1      1
## 6  31        1      2        3        0         1      3        1      0
##   FAV_CUIS FAV_MUSIC ENDU_LEVEL FAV_SPORT FAV_COLR NEWS_SOURCE
## 1        2         4          3         4        4           3
## 2        8         3          3         4        3           5
## 3        1         1          3         5        3           5
## 4        6         3          3         8        3           5
## 5        4         7          4         7        6           7
## 6        6         3          3         1        5           4
##   DIST_FRM_COAST MNTLY_TRAVEL GEN_MOVIES FAV_SUBJ ALCOHOL FAV_SUPERHERO
## 1              1            1          3        3       5             6
## 2              1            1          1        1       7             1
## 3              1            1          4        4       1             4
## 4              1            1          6        5       2             7
## 5              4            2          1        9       3             2
## 6              2            1          1        1       4             2
dim(dfrTests)
## [1] 3344   21

Missing Data

lapply(dfrTests, FUN=detect_na)
## $Age
## [1] 0
## 
## $M_STATUS
## [1] 12
## 
## $Salary
## [1] 13
## 
## $EDU_DATA
## [1] 47
## 
## $EMP_DATA
## [1] 16
## 
## $REL_ORIEN
## [1] 0
## 
## $FAV_TV
## [1] 0
## 
## $PREF_CAR
## [1] 0
## 
## $GENDER
## [1] 0
## 
## $FAV_CUIS
## [1] 1
## 
## $FAV_MUSIC
## [1] 0
## 
## $ENDU_LEVEL
## [1] 0
## 
## $FAV_SPORT
## [1] 0
## 
## $FAV_COLR
## [1] 1
## 
## $NEWS_SOURCE
## [1] 0
## 
## $DIST_FRM_COAST
## [1] 0
## 
## $MNTLY_TRAVEL
## [1] 1
## 
## $GEN_MOVIES
## [1] 0
## 
## $FAV_SUBJ
## [1] 2
## 
## $ALCOHOL
## [1] 3
## 
## $FAV_SUPERHERO
## [1] 0
dfrTests <- dfrTests[complete.cases(dfrTests), ]
lapply(dfrTests, FUN=detect_na)
## $Age
## [1] 0
## 
## $M_STATUS
## [1] 0
## 
## $Salary
## [1] 0
## 
## $EDU_DATA
## [1] 0
## 
## $EMP_DATA
## [1] 0
## 
## $REL_ORIEN
## [1] 0
## 
## $FAV_TV
## [1] 0
## 
## $PREF_CAR
## [1] 0
## 
## $GENDER
## [1] 0
## 
## $FAV_CUIS
## [1] 0
## 
## $FAV_MUSIC
## [1] 0
## 
## $ENDU_LEVEL
## [1] 0
## 
## $FAV_SPORT
## [1] 0
## 
## $FAV_COLR
## [1] 0
## 
## $NEWS_SOURCE
## [1] 0
## 
## $DIST_FRM_COAST
## [1] 0
## 
## $MNTLY_TRAVEL
## [1] 0
## 
## $GEN_MOVIES
## [1] 0
## 
## $FAV_SUBJ
## [1] 0
## 
## $ALCOHOL
## [1] 0
## 
## $FAV_SUPERHERO
## [1] 0
dim(dfrTests)
## [1] 3252   21

Predict

resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
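
Since Test_Data.csv has no Class column, a confusion matrix cannot be computed for the test set; tabulating the predicted classes is one simple sanity check:

# Distribution of predicted outcomes: 0 = not survive, 1 = survive
table(prdSur)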

Test Data Predictions

prdSur <- as.factor(prdSur)
dfrTests <- mutate(dfrTests, Result=resVal, Prd_Outcome=prdSur)
#write.csv(dfrTests, "Test_Predict.csv")
head(dfrTests)
##   Age M_STATUS Salary EDU_DATA EMP_DATA REL_ORIEN FAV_TV PREF_CAR GENDER
## 1  38        0      1        1        2         1      4        8      1
## 2  46        0      2        4        0         1      5        2      0
## 3  27        1      3        3        2         2      1       10      1
## 4  56        1      2        2        2         2      1        4      0
## 5  31        1      2        3        0         1      3        1      0
## 6  42        1      3        2        1         2      8        3      1
##   FAV_CUIS FAV_MUSIC ENDU_LEVEL FAV_SPORT FAV_COLR NEWS_SOURCE
## 1        2         4          3         4        4           3
## 2        8         3          3         4        3           5
## 3        1         1          3         5        3           5
## 4        6         3          3         8        3           5
## 5        6         3          3         1        5           4
## 6        4         7          4         6        7           6
##   DIST_FRM_COAST MNTLY_TRAVEL GEN_MOVIES FAV_SUBJ ALCOHOL FAV_SUPERHERO
## 1              1            1          3        3       5             6
## 2              1            1          1        1       7             1
## 3              1            1          4        4       1             4
## 4              1            1          6        5       2             7
## 5              2            1          1        1       4             2
## 6              3            2          6        8       1             7
##      Result Prd_Outcome
## 1 0.4705098           0
## 2 0.4549658           0
## 3 0.4549658           0
## 4 0.4549658           0
## 5 0.4784064           0
## 6 0.4785076           0