libraries

  library(caret)

## Warning: package 'caret' was built under R version 3.3.3

## Loading required package: lattice

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.3.3

  library(e1071)

## Warning: package 'e1071' was built under R version 3.3.3

  library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data

# Sheet names (without the ".csv" extension) of the training and test extracts.
trainsheet <- "life_Line_AME_sheet2_out"
testsheet <- "life_Line_AME_sheet8_out"
#Health <- function(trainsheet, testsheet) {
  # Read the training extract from the Relation_Output folder.
  train <- read.csv(
    file = paste0(
      "E:\\Chandu\\From\\RSA\\RSA_Health\\POC\\Output\\Relation_Output\\",
      trainsheet, ".csv"
    )
  )
  # Read the test extract from the same folder.
  test <- read.csv(
    file = paste0(
      "E:\\Chandu\\From\\RSA\\RSA_Health\\POC\\Output\\Relation_Output\\",
      testsheet, ".csv"
    )
  )

  # Preview the first rows of the training data.
  head(train)

##           PolicyID claim_status claims_count familysize Main_insurred
## 1 HC00041886000107            1            1          2             0
## 2 HC00041885000107            1            2          3          Self
## 3 HC00041882000107            0            0          3             0
## 4 HC00041866000107            0            0          2          Self
## 5 HC00041864000107            0            0          1          Self
## 6 HC00041857000107            0            0          1             0
##   Main_insurred_age age1 relationship1 age2 relationship2 age3
## 1                 0   64 Mother in Law   35       Brother    0
## 2                43   12           Son   33          Wife    0
## 3                 0   36       Husband   63        Father   56
## 4                48   46          Wife    0             0    0
## 5                57    0             0    0             0    0
## 6                 0    8      Daughter    0             0    0
##   relationship3 age4 relationship4 age5 relationship5 age6 relationship6
## 1             0    0             0    0             0    0             0
## 2             0    0             0    0             0    0             0
## 3        Mother    0             0    0             0    0             0
## 4             0    0             0    0             0    0             0
## 5             0    0             0    0             0    0             0
## 6             0    0             0    0             0    0             0
##   age7 relationship7 age8 relationship8 age9 relationship9 age10
## 1    0             0    0             0    0             0     0
## 2    0             0    0             0    0             0     0
## 3    0             0    0             0    0             0     0
## 4    0             0    0             0    0             0     0
## 5    0             0    0             0    0             0     0
## 6    0             0    0             0    0             0     0
##   relationship10 age11 relationship11 age12 relationship12 age13
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship13 age14 relationship14 age15 relationship15 age16
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship16 age17 relationship17 age18 relationship18 age19
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship19 age20 relationship20 age21 relationship21 age22
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship22 age23 relationship23 age24 relationship24
## 1              0     0              0     0              0
## 2              0     0              0     0              0
## 3              0     0              0     0              0
## 4              0     0              0     0              0
## 5              0     0              0     0              0
## 6              0     0              0     0              0

  # Preview the first rows of the test data.
  head(test)

##           PolicyID claim_status claims_count familysize Main_insurred
## 1 TQ00000128000101            0            0          2          Self
## 2 TQ00000127000101            0            0          3          Self
## 3 TQ00000124000101            0            0          4          Self
## 4 TQ00000123000101            0            0          3          Self
## 5 TQ00000123000100            0            0          3          Self
## 6 TQ00000122000101            0            0          4             0
##   Main_insurred_age age1 relationship1 age2 relationship2 age3
## 1                38   39          Wife    0             0    0
## 2                31   29          Wife    8      Daughter    0
## 3                37   36          Wife    8           Son    6
## 4                34   30          Wife    4           Son    0
## 5                33   29          Wife    3           Son    0
## 6                 0   42        Spouse   18        Father   13
##   relationship3 age4 relationship4 age5 relationship5 age6 relationship6
## 1             0    0             0    0             0    0             0
## 2             0    0             0    0             0    0             0
## 3           Son    0             0    0             0    0             0
## 4             0    0             0    0             0    0             0
## 5             0    0             0    0             0    0             0
## 6        Father   43        Spouse    0             0    0             0
##   age7 relationship7 age8 relationship8 age9 relationship9 age10
## 1    0             0    0             0    0             0     0
## 2    0             0    0             0    0             0     0
## 3    0             0    0             0    0             0     0
## 4    0             0    0             0    0             0     0
## 5    0             0    0             0    0             0     0
## 6    0             0    0             0    0             0     0
##   relationship10 age11 relationship11 age12 relationship12 age13
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship13 age14 relationship14 age15 relationship15 age16
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship16 age17 relationship17 age18 relationship18 age19
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship19 age20 relationship20 age21 relationship21 age22
## 1              0     0              0     0              0     0
## 2              0     0              0     0              0     0
## 3              0     0              0     0              0     0
## 4              0     0              0     0              0     0
## 5              0     0              0     0              0     0
## 6              0     0              0     0              0     0
##   relationship22 age23 relationship23 age24 relationship24
## 1              0     0              0     0              0
## 2              0     0              0     0              0
## 3              0     0              0     0              0
## 4              0     0              0     0              0
## 5              0     0              0     0              0
## 6              0     0              0     0              0

  # Drop columns 1 and 3 (PolicyID and claims_count in the previews above)
  # from both data sets; claim_status becomes the first column.
  test <- test[, -c(1, 3)]
  train <- train[, -c(1, 3)]

# converting every relationship into factor --------------------------------------------------

  # relationship1 ... relationship24 are converted column-wise in one pass
  # instead of looping over the index.
  rel_cols <- paste0("relationship", 1:24)
  train[rel_cols] <- lapply(train[rel_cols], as.factor)
  test[rel_cols] <- lapply(test[rel_cols], as.factor)

# levels combining --------------------------------------------------------
  
  
  # Combined factor levels of column `na` across the global `train` and `test`
  # data frames: training levels first, then any levels seen only in test.
  #
  # na: name of the column to inspect (character).
  # Returns a character vector of distinct levels (NULL when the column has
  # no levels in either data frame).
  le <- function(na) {
    unique(c(levels(train[, na]), levels(test[, na])))
  }
  
  # Give each relationship column the same level set in train and test, so
  # both data frames encode the categories identically.
  for (idx in seq_len(24)) {
    rel_col <- paste0("relationship", idx)
    shared_levels <- le(na = rel_col)

    test[, rel_col] <- factor(test[, rel_col], levels = shared_levels)
    train[, rel_col] <- factor(train[, rel_col], levels = shared_levels)
  }

 # Removing constant (single-valued) columns -------------------------------
  
  # Drop predictors that carry no information in the training data, then
  # restrict the test data to the columns that survived.
  #
  # vapply() guarantees a logical mask (sapply() can change type on unusual
  # input), and drop = FALSE keeps the result a data frame even when a single
  # column remains.

  # Factors with exactly one level.
  single_level <- vapply(train, function(col) nlevels(col) == 1, logical(1))
  train <- train[, !single_level, drop = FALSE]

  # Columns whose every value equals 0. na.rm = TRUE keeps missing values from
  # turning the mask into NA, which would make the column selection fail.
  has_nonzero <- colSums(train != 0, na.rm = TRUE) != 0
  train <- train[, has_nonzero, drop = FALSE]

  # Keep in test only the columns that are still present in train.
  test <- test[, names(test) %in% names(train), drop = FALSE]

 # svm ---------------------------------------------------------------------
  
  
  # Fit a support vector machine with claim_status as the response and every
  # other remaining column of `train` as a predictor, using default settings.
  # NOTE(review): the summary below reports SVM-Type "eps-regression", so this
  # is fitted as a regression, presumably because claim_status is numeric
  # rather than a factor -- confirm whether classification was intended.
  svm_model <- svm(claim_status ~ ., data = train)
  # Show the fitted model's type, kernel, parameters and support-vector count.
  summary(svm_model)

## 
## Call:
## svm(formula = claim_status ~ ., data = train)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01052632 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  451

  # Score the test set with the SVM (first column, claim_status, removed from
  # the predictors) and turn the continuous output into a 0/1 label.
  svm_predicted <- predict(svm_model, test[,-1], type = 'response')
  # NOTE(review): the 0.03 cut-off is not justified anywhere in this file --
  # confirm how it was chosen.
  svm_pred <- ifelse(svm_predicted > 0.03, 1, 0)
  # confusionMatrix() expects factors sharing one level set. Building both
  # with explicit levels 0/1 keeps the table 2x2 even when a class is never
  # predicted, and avoids relying on implicit coercion of numeric vectors.
  svm_output <- confusionMatrix(
    factor(svm_pred, levels = c(0, 1)),
    factor(test$claim_status, levels = c(0, 1))
  )
  svm_output

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 231   6
##          1   1   0
##                                           
##                Accuracy : 0.9706          
##                  95% CI : (0.9403, 0.9881)
##     No Information Rate : 0.9748          
##     P-Value [Acc > NIR] : 0.7458          
##                                           
##                   Kappa : -0.0073         
##  Mcnemar's Test P-Value : 0.1306          
##                                           
##             Sensitivity : 0.9957          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9747          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9748          
##          Detection Rate : 0.9706          
##    Detection Prevalence : 0.9958          
##       Balanced Accuracy : 0.4978          
##                                           
##        'Positive' Class : 0               
##

# glm ---------------------------------------------------------------------
  
  
  # Logistic regression of claim_status on every other remaining column of
  # `train`.
  glm_model <- glm(claim_status ~ ., family = binomial, data = train)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

  # Names of the training columns containing "relationship" (matched
  # case-insensitively, in column order).
  train_relation <- grep(
    "relationship",
    names(train),
    ignore.case = TRUE,
    value = TRUE
  )

  # Extend the levels recorded in the fitted model with the levels present in
  # the test data, so prediction does not stop on categories the model frame
  # did not record.
  for (rel_col in train_relation) {
    recorded_levels <- glm_model$xlevels[[rel_col]]
    glm_model$xlevels[[rel_col]] <-
      union(recorded_levels, levels(test[, rel_col]))
  }
  
  
  # Predicted claim probabilities for the test set; the first column
  # (claim_status) is left out of the predictors.
  glm_predicted <- predict(
    glm_model,
    newdata = test[, -1],
    type = "response"
  )

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

  # Label a policy as a claim (1) when its predicted probability exceeds 0.5.
  glm_predict <- ifelse(glm_predicted > 0.5, 1, 0)
  # Build both inputs as factors with the same explicit levels 0/1. When the
  # model never predicts 1, the prediction vector would otherwise carry a
  # single level and confusionMatrix() would have to re-level it (or reject
  # the non-factor input outright).
  glm_output <- confusionMatrix(
    factor(glm_predict, levels = c(0, 1)),
    factor(test$claim_status, levels = c(0, 1))
  )

## Warning in confusionMatrix.default(glm_predict, test$claim_status): Levels
## are not in the same order for reference and data. Refactoring data to
## match.

  # Print the confusion matrix and accuracy statistics for the GLM labels.
  glm_output

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 232   6
##          1   0   0
##                                           
##                Accuracy : 0.9748          
##                  95% CI : (0.9459, 0.9907)
##     No Information Rate : 0.9748          
##     P-Value [Acc > NIR] : 0.60631         
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : 0.04123         
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9748          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9748          
##          Detection Rate : 0.9748          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##

# Test data with the raw and thresholded predictions of both models placed
# in front of the original test columns.
test1 <- data.frame(
  svm_predicted,
  svm_pred,
  glm_predicted,
  glm_predict,
  test
)
  #return(list(svm_output, glm_output,test1))  #returning multiple objects
#}

#a<-suppressWarnings(Health(trainsheet = "life_Line_AME_sheet2_out", testsheet = "life_Line_AME_sheet8_out"))
#result<-a[[3]]

Levels mis-matching by GLM and SVM

sujith

April 10, 2017

libraries

data