Data Exploration

evaluation <- read.csv("C:/Users/91976/Downloads/insurance-evaluation-data.csv")
training <- read.csv("C:/Users/91976/Downloads/insurance_training_data.csv")

M <- rbind(training, evaluation) # Merged
n <- nrow(training); # training is M[1:n, 4:26]
m <- nrow(evaluation) # evaluation is M[(1+n):(m+n), ]
X <- data.frame("TARGET_FLAG" = rep(T, ncol(M)), 
                "TARGET_AMT" = rep(T, ncol(M)))
X[match(c("INDEX", "TARGET_AMT"), names(M)), "TARGET_FLAG"] <- F
X[match(c("INDEX", "TARGET_FLAG"), names(M)), "TARGET_AMT"] <- F

#Normalizing data for a more clear view

quantitative <- c(4:8, 10, 15, 17, 18, 21, 22, 24, 25)
names(M[quantitative])
##  [1] "KIDSDRIV" "AGE"      "HOMEKIDS" "YOJ"      "INCOME"   "HOME_VAL"
##  [7] "TRAVTIME" "BLUEBOOK" "TIF"      "OLDCLAIM" "CLM_FREQ" "MVR_PTS" 
## [13] "CAR_AGE"
categorical <- c(13, 14, 19)
names(M[categorical])
## [1] "EDUCATION" "JOB"       "CAR_TYPE"
binary <- c(9, 11, 12, 16, 20, 23, 26)
names(M[binary])
## [1] "PARENT1"    "MSTATUS"    "SEX"        "CAR_USE"    "RED_CAR"   
## [6] "REVOKED"    "URBANICITY"
Currency_Convert <- function(Field){
  Field <- as.numeric(gsub("\\$|,","", Field))
}

Binary_Convert <- function(Field, Neg, Pos) {
  Field <- as.character(Field)
  Field[which(Field == Neg)] <- 0
  Field[which(Field == Pos)] <- 1
  Field <- as.numeric(Field)
}


M$INCOME <- Currency_Convert(M$INCOME)
M$PARENT1 <- Binary_Convert(M$PARENT1, "No", "Yes")
M$HOME_VAL <- Currency_Convert(M$HOME_VAL)
M$MSTATUS <- Binary_Convert(M$MSTATUS, "z_No", "Yes")
M$SEX <- Binary_Convert(M$SEX, "M", "z_F")
M$CAR_USE <- Binary_Convert(M$CAR_USE, "Commercial", "Private")
M$BLUEBOOK <- Currency_Convert(M$BLUEBOOK)
M$RED_CAR <- Binary_Convert(M$RED_CAR, "no", "yes")
M$OLDCLAIM <- Currency_Convert(M$OLDCLAIM)
M$REVOKED <- Binary_Convert(M$REVOKED, "No", "Yes")
M$URBANICITY <- Binary_Convert(M$URBANICITY, "z_Highly Rural/ Rural", "Highly Urban/ Urban")

M$CAR_AGE[which(M$CAR_AGE < 0)] <- NA
M$HOME_VAL[which(M$HOME_VAL == 0)] <- NA

Using summary stats on the training data for representation

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(training [,-1], type = "text")
## 
## =======================================================
## Statistic     N     Mean    St. Dev.   Min      Max    
## -------------------------------------------------------
## TARGET_FLAG 8,161   0.264     0.441     0        1     
## TARGET_AMT  8,161 1,504.325 4,704.027 0.000 107,586.100
## KIDSDRIV    8,161   0.171     0.512     0        4     
## AGE         8,155  44.790     8.628    16       81     
## HOMEKIDS    8,161   0.721     1.116     0        5     
## YOJ         7,707  10.499     4.092     0       23     
## TRAVTIME    8,161  33.486    15.908     5       142    
## TIF         8,161   5.351     4.147     1       25     
## CLM_FREQ    8,161   0.799     1.158     0        5     
## MVR_PTS     8,161   1.696     2.147     0       13     
## CAR_AGE     7,651   8.328     5.701    -3       28     
## -------------------------------------------------------
#Visualising data

#Load ggplot2 library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
#Create scatterplot

ggplot(data = training, aes(x = INCOME, y = HOME_VAL)) +
  geom_point(color = "#4F94CD", size = 2.5) +
  labs(x = "Income (in thousands)", y = "Home Value (in thousands)",
       title = "Scatterplot of Income and Home Value") +
  theme(plot.title = element_text(hjust = 0.5),
        panel.background = element_rect(fill = "#F2F2F2", color = NA),
        panel.grid.major = element_line(color = "#E5E5E5"),
        axis.line = element_line(color = "black"))

Create boxplot

library(ggplot2)
library(reshape2)

# Melt the training data to long format
training_melted <- melt(training)
## Using INCOME, PARENT1, HOME_VAL, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, BLUEBOOK, CAR_TYPE, RED_CAR, OLDCLAIM, REVOKED, URBANICITY as id variables
# Create a ggplot with melted training data
ggplot(data = training_melted, aes(x = variable, y = value)) +
  geom_boxplot() +                     # Add a boxplot layer
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +   # Rotate x-axis labels by 90 degrees and align to the right
  labs(x = "Variable", y = "Value", title = "Boxplot of Variables")   # Add axis labels and a title
## Warning: Removed 970 rows containing non-finite values (`stat_boxplot()`).

Checking for correlation

library(ggplot2)
library(reshape2)
ggplot(data = melt(abs(cor(sapply(na.omit(training), as.numeric)))), aes(x=Var1, y=Var2, fill=value)) +
  scale_fill_gradient(low = 'black', high = 'red', name = "Absolute Value") +
  geom_tile() + labs(title = "Correlation Heatmap") +
  theme(axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1),
        plot.title = element_text(hjust = 0.5))
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion

Extracting important Information

PCA <- function(X) {
  Xpca <- prcomp(na.omit(X), center = T, scale. = T) 
  M <- as.matrix(na.omit(X)); R <- as.matrix(Xpca$rotation); score <- M %*% R
  print(list("Importance of Components" = summary(Xpca)$importance[ ,1:5], 
             "Rotation (Variable Loadings)" = Xpca$rotation[ ,1:5],
             "Correlation between X and PC" = cor(na.omit(X), score)[ ,1:5]))
  par(mfrow=c(2,3))
  barplot(Xpca$sdev^2, ylab = "Component Variance")
  barplot(cor(cbind(X)), ylab = "Correlations")
  barplot(Xpca$rotation, ylab = "Loadings")  
  biplot(Xpca); barplot(M); barplot(score)
}
PCA(M[1:n, quantitative])
## $`Importance of Components`
##                             PC1      PC2      PC3      PC4       PC5
## Standard deviation     1.645244 1.334995 1.270056 1.012738 0.9938183
## Proportion of Variance 0.208220 0.137090 0.124080 0.078900 0.0759700
## Cumulative Proportion  0.208220 0.345310 0.469390 0.548290 0.6242600
## 
## $`Rotation (Variable Loadings)`
##                  PC1         PC2         PC3         PC4          PC5
## KIDSDRIV  0.10774567 -0.17584383  0.50530195 -0.07331372  0.083069165
## AGE      -0.26771011  0.02800193 -0.33747972  0.09204957  0.398329669
## HOMEKIDS  0.22780686 -0.15800486  0.61071654 -0.03742552 -0.070548997
## YOJ      -0.14018772 -0.15622254  0.30537124  0.16303338  0.482290631
## INCOME   -0.53361654 -0.15978248  0.16457623 -0.05878797 -0.111702800
## HOME_VAL -0.54258303 -0.15297470  0.13345360 -0.05556924 -0.070094419
## TRAVTIME  0.03412860  0.01807301 -0.03461120 -0.66695073  0.612187440
## BLUEBOOK -0.34153090 -0.09792299  0.09498366 -0.05047749  0.005257043
## TIF       0.01113475  0.06077082  0.08346442  0.70417523  0.407479904
## OLDCLAIM  0.10846886 -0.53950381 -0.18816292  0.08137372  0.027275343
## CLM_FREQ  0.12435306 -0.57054584 -0.21337557  0.02350619  0.041684584
## MVR_PTS   0.12635248 -0.48204768 -0.13797155 -0.02119042 -0.055156727
## CAR_AGE  -0.32388952 -0.08617776 -0.04999174  0.02775244 -0.181923195
## 
## $`Correlation between X and PC`
##                  PC1          PC2         PC3         PC4         PC5
## KIDSDRIV  0.03279334  0.026553825 -0.03415518  0.03430623  0.03304671
## AGE      -0.21530718 -0.207800075  0.21108534 -0.21467167 -0.20309336
## HOMEKIDS  0.14930487  0.138457016 -0.14945042  0.15094329  0.14520501
## YOJ      -0.20724065 -0.209543450  0.20479944 -0.20430965 -0.20661069
## INCOME   -0.98252529 -0.961368639  0.98208694 -0.97955171 -0.98781092
## HOME_VAL -0.99539680 -0.973818412  0.99092055 -0.99071401 -0.99286141
## TRAVTIME  0.03450769  0.037350720 -0.03374552  0.03207236  0.03588214
## BLUEBOOK -0.43933895 -0.428786579  0.44044267 -0.45038023 -0.40712199
## TIF       0.02086626  0.026174257 -0.01892478  0.01915871  0.02069475
## OLDCLAIM  0.06486343 -0.155449228 -0.13116239  0.13800364  0.07289718
## CLM_FREQ  0.07562867 -0.034894080 -0.10831923  0.11178715  0.07912375
## MVR_PTS   0.07488695  0.009564432 -0.09370134  0.09613452  0.07548462
## CAR_AGE  -0.37196326 -0.364430487  0.37128491 -0.37062402 -0.37290793

Data Preparation

Fixing Missing Values

options(repos = "http://cran.rstudio.com/")


install.packages('VIM')
## package 'VIM' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(VIM)
## Warning: package 'VIM' was built under R version 4.2.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
all(complete.cases(training))
## [1] FALSE
aggr(M[1:n, 4:26], bars=F, sortVars=T)

## 
##  Variables sorted by number of missings: 
##    Variable       Count
##    HOME_VAL 0.337948781
##     CAR_AGE 0.062614876
##         YOJ 0.055630437
##      INCOME 0.054527631
##         AGE 0.000735204
##    KIDSDRIV 0.000000000
##    HOMEKIDS 0.000000000
##     PARENT1 0.000000000
##     MSTATUS 0.000000000
##         SEX 0.000000000
##   EDUCATION 0.000000000
##         JOB 0.000000000
##    TRAVTIME 0.000000000
##     CAR_USE 0.000000000
##    BLUEBOOK 0.000000000
##         TIF 0.000000000
##    CAR_TYPE 0.000000000
##     RED_CAR 0.000000000
##    OLDCLAIM 0.000000000
##    CLM_FREQ 0.000000000
##     REVOKED 0.000000000
##     MVR_PTS 0.000000000
##  URBANICITY 0.000000000

#We have notable amounts of missing values in the HOME_VAL, JOB, CAR_AGE, YOJ, and INCOME variables. There is also a small amount of missing values in AGE.

Imputing

Likely_Value <- function(Field_1, Field_2, Value) {
  # Mode for Field_1 for given Value of Field_2
  frequencies <- table(Field_1[which(Field_2 == Value)])
  most_frequent <- names(sort(frequencies, decreasing = TRUE)[1])
  return(most_frequent)
}

M$JOB[(is.na(M$JOB) & M$EDUCATION == "PhD")] <- Likely_Value(M$JOB, M$EDUCATION, "PhD")
M$JOB[(is.na(M$JOB) & M$EDUCATION == "Masters")] <- Likely_Value(M$JOB, M$EDUCATION, "Masters")
M$JOB[(is.na(M$JOB) & M$EDUCATION == "Bachelors")] <- Likely_Value(M$JOB, M$EDUCATION, "Bachelors")
M$JOB[(is.na(M$JOB) & M$EDUCATION == "z_High School")] <- Likely_Value(M$JOB, M$EDUCATION, "z_High School")
M$JOB[(is.na(M$JOB) & M$EDUCATION == "<High School")] <- Likely_Value(M$JOB, M$EDUCATION, "<High School")

##Assuming that education level can serve as a reasonable proxy for a person’s job, the likely JOB value for each sample given the EDUCATION value level is imputed for missing JOB values by looking at the predominant (mode) JOB value for each EDUCATION value. For example, if education level E is mostly employee in job J, then where there exists and education level E without missing job information, we assume job J

install.packages("mice")
## package 'mice' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
MICE <- mice(M[1:n, 4:26], predictorMatrix = quickpred(M[1:n, 4:26]), method = "mean", printFlag = F)
## Warning: Number of logged events: 2
M[1:n, 4:26] <- complete(MICE, action = 1)
MICE <- mice(M[(1+n):(m+n), ], predictorMatrix = quickpred(M[(1+n):(m+n), ]), method = "mean", printFlag = F)
## Warning: Number of logged events: 4
M[(1+n):(m+n), ] <- complete(MICE, action = 1)
M$CAR_AGE <- as.integer(M$CAR_AGE)
aggr(M[1:n, 4:26], bars=F, sortVars=T)

## 
##  Variables sorted by number of missings: 
##    Variable Count
##    KIDSDRIV     0
##         AGE     0
##    HOMEKIDS     0
##         YOJ     0
##      INCOME     0
##     PARENT1     0
##    HOME_VAL     0
##     MSTATUS     0
##         SEX     0
##   EDUCATION     0
##         JOB     0
##    TRAVTIME     0
##     CAR_USE     0
##    BLUEBOOK     0
##         TIF     0
##    CAR_TYPE     0
##     RED_CAR     0
##    OLDCLAIM     0
##    CLM_FREQ     0
##     REVOKED     0
##     MVR_PTS     0
##     CAR_AGE     0
##  URBANICITY     0
#Missing values were replaced with the mean value using Multivariate Imputation 
#by Chained Equations (MICE).

#Categorical Varibales

M$PHD <- ifelse(M$EDUCATION == "PhD", 1, 0)
M$MASTERS <- ifelse(M$EDUCATION == "Masters", 1, 0)
M$BACHELORS <- ifelse(M$EDUCATION == "Bachelors", 1, 0)
M$HS <- ifelse(M$EDUCATION == "z_High School", 1, 0)
M$NOHS <- ifelse(M$EDUCATION == "<High School", 1, 0)

M$CLERICAL <- ifelse(M$JOB == "Clerical", 1, 0)
M$DOCTOR <- ifelse(M$JOB == "Doctor", 1, 0)
M$HOME_MAKER <- ifelse(M$JOB == "Home Maker", 1, 0)
M$LAWYER <- ifelse(M$JOB == "Lawyer", 1, 0)
M$MANAGER <- ifelse(M$JOB == "Manager", 1, 0)
M$PROF <- ifelse(M$JOB == "Professional", 1, 0)
M$STUDENT <- ifelse(M$JOB == "Student", 1, 0)
M$BLUE_COLLAR <- ifelse(M$JOB == "z_Blue Collar", 1, 0)

M$MINIVAN <- ifelse(M$CAR_TYPE == "Minivan", 1, 0)
M$TRUCK <- ifelse(M$CAR_TYPE == "Panel Truck", 1, 0)
M$PICKUP <- ifelse(M$CAR_TYPE == "Pickup", 1, 0)
M$SPORTS <- ifelse(M$CAR_TYPE == "Sports Car", 1, 0)
M$VAN <- ifelse(M$CAR_TYPE == "Van", 1, 0)
M$SUV <- ifelse(M$CAR_TYPE == "z_SUV", 1, 0)

remove <- c("EDUCATION", "JOB", "CAR_TYPE")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)), 
                         "TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F

#Catergorial variables were changed to binary.

Correlations

library(reshape2)
Corr_XY <- function(X, Y) {
  corr <- data.frame(array(NA, dim = c(ncol(X), 5)))
  colnames(corr) <- c("Y", "X", "r","p","<0.05")
  for (i in 1:ncol(X)) {
    r <- cor.test(Y[, 1], X[, i])
    corr[i, 1] <- names(Y)
    corr[i, 2] <- names(X[i])
    corr[i, 3] <- r$estimate
    corr[i, 4] <- r$p.value
    corr[i, 5] <- corr[i, 4] < 0.05
  }
  return(corr)
}
Corr_XX <- function(X, threshold) {
  corr <- data.frame(array(NA, dim = c(choose(ncol(X), 2), 5)))
  colnames(corr) <- c("X1", "X2", "r","p","<0.05"); k = 1
  for (i in 1:(ncol(X) - 1)) {
    for (j in (i+1):ncol(X)) {
      r <- cor.test(X[,i], X[,j])
      corr[k, 1] <- names(X[i])
      corr[k, 2] <- names(X[j])
      corr[k, 3] <- r$estimate
      corr[k, 4] <- r$p.value
      corr[k, 5] <- corr[i, 4] < 0.05
      k = k + 1
    }
  }
  least <- corr[corr[,"<0.05"] == F, ]
  most <- corr[abs(corr[,"r"]) >= threshold, ]
  result <- list("Correlations" = corr, "Least_Correlated"= least, "Most_Correlated" = most)
  return(result)
}

#Between TARGET_AMT and X Variables #The specification M[1:n, -c(1:3, categorical)] creates a data frame excluding the INDEX, TARGET_FLAG, TARGET_AMT, and categorical variables. The specification M[1:n, 2, drop = FALSE] creates a data frame with the Y of interest and retains the column name

correlations <- Corr_XY(M[1:n, -c(1:3, categorical)], M[1:n, 3, drop = FALSE])
correlations
##             Y           X            r            p <0.05
## 1  TARGET_AMT    KIDSDRIV  0.055394177 5.520134e-07  TRUE
## 2  TARGET_AMT         AGE -0.041722748 1.631008e-04  TRUE
## 3  TARGET_AMT    HOMEKIDS  0.061988043 2.089491e-08  TRUE
## 4  TARGET_AMT         YOJ -0.020939639 5.854856e-02 FALSE
## 5  TARGET_AMT      INCOME -0.056639628 3.054521e-07  TRUE
## 6  TARGET_AMT     PARENT1  0.096965421 1.648791e-18  TRUE
## 7  TARGET_AMT    HOME_VAL -0.041320305 1.885921e-04  TRUE
## 8  TARGET_AMT     MSTATUS -0.087661194 2.135335e-15  TRUE
## 9  TARGET_AMT         SEX -0.011053614 3.180652e-01 FALSE
## 10 TARGET_AMT    TRAVTIME  0.027987016 1.145817e-02  TRUE
## 11 TARGET_AMT     CAR_USE -0.098613835 4.298495e-19  TRUE
## 12 TARGET_AMT    BLUEBOOK -0.004699523 6.712129e-01 FALSE
## 13 TARGET_AMT         TIF -0.046480831 2.661802e-05  TRUE
## 14 TARGET_AMT     RED_CAR  0.008091979 4.648309e-01 FALSE
## 15 TARGET_AMT    OLDCLAIM  0.070953287 1.390750e-10  TRUE
## 16 TARGET_AMT    CLM_FREQ  0.116419159 5.005267e-26  TRUE
## 17 TARGET_AMT     REVOKED  0.061385464 2.859309e-08  TRUE
## 18 TARGET_AMT     MVR_PTS  0.137865509 6.385648e-36  TRUE
## 19 TARGET_AMT     CAR_AGE -0.057600986 1.918161e-07  TRUE
## 20 TARGET_AMT  URBANICITY  0.120973821 5.495497e-28  TRUE
## 21 TARGET_AMT         PHD -0.024424437 2.735236e-02  TRUE
## 22 TARGET_AMT     MASTERS -0.035171011 1.484052e-03  TRUE
## 23 TARGET_AMT   BACHELORS -0.017277942 1.185859e-01 FALSE
## 24 TARGET_AMT          HS  0.042098024 1.422829e-04  TRUE
## 25 TARGET_AMT        NOHS  0.027676590 1.240704e-02  TRUE
## 26 TARGET_AMT    CLERICAL  0.007805255 4.808004e-01 FALSE
## 27 TARGET_AMT      DOCTOR -0.034750482 1.690833e-03  TRUE
## 28 TARGET_AMT  HOME_MAKER -0.007081752 5.223917e-01 FALSE
## 29 TARGET_AMT      LAWYER -0.029185515 8.371037e-03  TRUE
## 30 TARGET_AMT     MANAGER -0.064606496 5.168715e-09  TRUE
## 31 TARGET_AMT        PROF -0.004547087 6.812815e-01 FALSE
## 32 TARGET_AMT     STUDENT  0.024409854 2.744467e-02  TRUE
## 33 TARGET_AMT BLUE_COLLAR  0.061830058 2.269225e-08  TRUE
## 34 TARGET_AMT     MINIVAN -0.075267324 9.887179e-12  TRUE
## 35 TARGET_AMT       TRUCK  0.029468292 7.761229e-03  TRUE
## 36 TARGET_AMT      PICKUP  0.021906619 4.782277e-02  TRUE
## 37 TARGET_AMT      SPORTS  0.023294077 3.535147e-02  TRUE
## 38 TARGET_AMT         VAN  0.023479460 3.391668e-02  TRUE
## 39 TARGET_AMT         SUV  0.005942619 5.914276e-01 FALSE
print(correlations)
##             Y           X            r            p <0.05
## 1  TARGET_AMT    KIDSDRIV  0.055394177 5.520134e-07  TRUE
## 2  TARGET_AMT         AGE -0.041722748 1.631008e-04  TRUE
## 3  TARGET_AMT    HOMEKIDS  0.061988043 2.089491e-08  TRUE
## 4  TARGET_AMT         YOJ -0.020939639 5.854856e-02 FALSE
## 5  TARGET_AMT      INCOME -0.056639628 3.054521e-07  TRUE
## 6  TARGET_AMT     PARENT1  0.096965421 1.648791e-18  TRUE
## 7  TARGET_AMT    HOME_VAL -0.041320305 1.885921e-04  TRUE
## 8  TARGET_AMT     MSTATUS -0.087661194 2.135335e-15  TRUE
## 9  TARGET_AMT         SEX -0.011053614 3.180652e-01 FALSE
## 10 TARGET_AMT    TRAVTIME  0.027987016 1.145817e-02  TRUE
## 11 TARGET_AMT     CAR_USE -0.098613835 4.298495e-19  TRUE
## 12 TARGET_AMT    BLUEBOOK -0.004699523 6.712129e-01 FALSE
## 13 TARGET_AMT         TIF -0.046480831 2.661802e-05  TRUE
## 14 TARGET_AMT     RED_CAR  0.008091979 4.648309e-01 FALSE
## 15 TARGET_AMT    OLDCLAIM  0.070953287 1.390750e-10  TRUE
## 16 TARGET_AMT    CLM_FREQ  0.116419159 5.005267e-26  TRUE
## 17 TARGET_AMT     REVOKED  0.061385464 2.859309e-08  TRUE
## 18 TARGET_AMT     MVR_PTS  0.137865509 6.385648e-36  TRUE
## 19 TARGET_AMT     CAR_AGE -0.057600986 1.918161e-07  TRUE
## 20 TARGET_AMT  URBANICITY  0.120973821 5.495497e-28  TRUE
## 21 TARGET_AMT         PHD -0.024424437 2.735236e-02  TRUE
## 22 TARGET_AMT     MASTERS -0.035171011 1.484052e-03  TRUE
## 23 TARGET_AMT   BACHELORS -0.017277942 1.185859e-01 FALSE
## 24 TARGET_AMT          HS  0.042098024 1.422829e-04  TRUE
## 25 TARGET_AMT        NOHS  0.027676590 1.240704e-02  TRUE
## 26 TARGET_AMT    CLERICAL  0.007805255 4.808004e-01 FALSE
## 27 TARGET_AMT      DOCTOR -0.034750482 1.690833e-03  TRUE
## 28 TARGET_AMT  HOME_MAKER -0.007081752 5.223917e-01 FALSE
## 29 TARGET_AMT      LAWYER -0.029185515 8.371037e-03  TRUE
## 30 TARGET_AMT     MANAGER -0.064606496 5.168715e-09  TRUE
## 31 TARGET_AMT        PROF -0.004547087 6.812815e-01 FALSE
## 32 TARGET_AMT     STUDENT  0.024409854 2.744467e-02  TRUE
## 33 TARGET_AMT BLUE_COLLAR  0.061830058 2.269225e-08  TRUE
## 34 TARGET_AMT     MINIVAN -0.075267324 9.887179e-12  TRUE
## 35 TARGET_AMT       TRUCK  0.029468292 7.761229e-03  TRUE
## 36 TARGET_AMT      PICKUP  0.021906619 4.782277e-02  TRUE
## 37 TARGET_AMT      SPORTS  0.023294077 3.535147e-02  TRUE
## 38 TARGET_AMT         VAN  0.023479460 3.391668e-02  TRUE
## 39 TARGET_AMT         SUV  0.005942619 5.914276e-01 FALSE

##The predictor variables SEX, BLUEBOOK, RED_CAR, BACHELORS, CLERICAL, HOME_MAKER, PROF, LAWYER, YOJ, and SUV do not have statistically significant correlations with the response variable and are therefore not being considered for the model. The variable YOJ sits at the threshold of statistical viability, and will be left in

remove <- c("SEX", "BLUEBOOK", "RED_CAR", "BACHELORS", "CLERICAL", "HOME_MAKER", "PROF", "LAWYER", "YOJ", "SUV")
X[match(remove, names(M)), "TARGET_AMT"] <- F

##Between TARGETFLAG and X Variables ##The specification M[1:n, -c(1:3, categorical)] creates a data frame excluding the INDEX, TARGET_FLAG, TARGET_AMT, and categorical variables. The specification M[1:n, 2, drop = FALSE] creates a data frame with the Y of interest and retains the column name

correlations <- Corr_XY(M[1:n, -c(1:3, categorical)], M[1:n, 2, drop = FALSE])
print(correlations)
##              Y           X             r            p <0.05
## 1  TARGET_FLAG    KIDSDRIV  0.1036682963 6.052406e-21  TRUE
## 2  TARGET_FLAG         AGE -0.1031261224 9.659249e-21  TRUE
## 3  TARGET_FLAG    HOMEKIDS  0.1156210106 1.083837e-25  TRUE
## 4  TARGET_FLAG         YOJ -0.0684875222 5.889201e-10  TRUE
## 5  TARGET_FLAG      INCOME -0.1382427249 4.129718e-36  TRUE
## 6  TARGET_FLAG     PARENT1  0.1576222195 1.488738e-46  TRUE
## 7  TARGET_FLAG    HOME_VAL -0.0975878744 9.950630e-19  TRUE
## 8  TARGET_FLAG     MSTATUS -0.1351247571 1.460728e-34  TRUE
## 9  TARGET_FLAG         SEX  0.0210785602 5.689454e-02 FALSE
## 10 TARGET_FLAG    TRAVTIME  0.0483683103 1.234536e-05  TRUE
## 11 TARGET_FLAG     CAR_USE -0.1426736765 2.252692e-38  TRUE
## 12 TARGET_FLAG    BLUEBOOK -0.1033831893 7.741376e-21  TRUE
## 13 TARGET_FLAG         TIF -0.0823700498 9.145383e-14  TRUE
## 14 TARGET_FLAG     RED_CAR -0.0069472579 5.303220e-01 FALSE
## 15 TARGET_FLAG    OLDCLAIM  0.1380838297 4.962696e-36  TRUE
## 16 TARGET_FLAG    CLM_FREQ  0.2161960608 6.332803e-87  TRUE
## 17 TARGET_FLAG     REVOKED  0.1519390816 2.410252e-43  TRUE
## 18 TARGET_FLAG     MVR_PTS  0.2191970538 2.320264e-89  TRUE
## 19 TARGET_FLAG     CAR_AGE -0.0970693948 1.515753e-18  TRUE
## 20 TARGET_FLAG  URBANICITY  0.2242509434 1.512617e-93  TRUE
## 21 TARGET_FLAG         PHD -0.0654121132 3.325908e-09  TRUE
## 22 TARGET_FLAG     MASTERS -0.0762959857 5.147403e-12  TRUE
## 23 TARGET_FLAG   BACHELORS -0.0426525815 1.160501e-04  TRUE
## 24 TARGET_FLAG          HS  0.1097693656 2.653941e-23  TRUE
## 25 TARGET_FLAG        NOHS  0.0530418729 1.632022e-06  TRUE
## 26 TARGET_FLAG    CLERICAL  0.0273667617 1.342281e-02  TRUE
## 27 TARGET_FLAG      DOCTOR -0.0583769873 1.310526e-07  TRUE
## 28 TARGET_FLAG  HOME_MAKER  0.0112592910 3.091434e-01 FALSE
## 29 TARGET_FLAG      LAWYER -0.0617312643 2.389150e-08  TRUE
## 30 TARGET_FLAG     MANAGER -0.1053953267 1.343221e-21  TRUE
## 31 TARGET_FLAG        PROF -0.0385723360 4.915476e-04  TRUE
## 32 TARGET_FLAG     STUDENT  0.0770140270 3.247037e-12  TRUE
## 33 TARGET_FLAG BLUE_COLLAR  0.1017866167 3.033581e-20  TRUE
## 34 TARGET_FLAG     MINIVAN -0.1369991100 1.729612e-35  TRUE
## 35 TARGET_FLAG       TRUCK -0.0003423919 9.753283e-01 FALSE
## 36 TARGET_FLAG      PICKUP  0.0566433091 3.049128e-07  TRUE
## 37 TARGET_FLAG      SPORTS  0.0572528091 2.272140e-07  TRUE
## 38 TARGET_FLAG         VAN  0.0030204421 7.849914e-01 FALSE
## 39 TARGET_FLAG         SUV  0.0450322221 4.709927e-05  TRUE

##The predictor variables SEX, RED_CAR, HOME_MAKER, TRUCK, and VAN do not have statistically significant correlations with the response variable and are therefore not being considered for the model.

remove <- c("SEX", "RED_CAR", "HOME_MAKER", "TRUCK", "VAN")
X[match(remove, names(M)), "TARGET_FLAG"] <- F

#Betweeen all X variables


remove <- c("SEX", "RED_CAR", "HOME_MAKER", "TRUCK", "VAN")
X[match(remove, names(M)), "TARGET_FLAG"] <- F

#Betweeen all X variables

correlations <- Corr_XX(M[1:n,  (X[,"TARGET_AMT"] & X[,"TARGET_FLAG"])], 0.50)
print(correlations$Least_Correlated)
##              X1          X2             r             p <0.05
## 142     MSTATUS    TRAVTIME  0.0102482953  3.546042e-01 FALSE
## 143     MSTATUS     CAR_USE  0.0209315442  5.864616e-02 FALSE
## 144     MSTATUS         TIF -0.0007410648  9.466325e-01 FALSE
## 145     MSTATUS    OLDCLAIM -0.0459197532  3.326731e-05 FALSE
## 146     MSTATUS    CLM_FREQ -0.0693288825  3.618950e-10 FALSE
## 147     MSTATUS     REVOKED -0.0432305388  9.360415e-05 FALSE
## 148     MSTATUS     MVR_PTS -0.0479670481  1.456999e-05 FALSE
## 149     MSTATUS     CAR_AGE -0.0320848886  3.745996e-03 FALSE
## 150     MSTATUS  URBANICITY -0.0025618324  8.170069e-01 FALSE
## 151     MSTATUS         PHD -0.0373484866  7.390760e-04 FALSE
## 152     MSTATUS     MASTERS  0.0029389035  7.906587e-01 FALSE
## 153     MSTATUS          HS  0.0380637415  5.831527e-04 FALSE
## 154     MSTATUS        NOHS  0.0138148710  2.120751e-01 FALSE
## 155     MSTATUS      DOCTOR -0.0373295003  7.436993e-04 FALSE
## 156     MSTATUS     MANAGER  0.0019279928  8.617515e-01 FALSE
## 157     MSTATUS     STUDENT  0.0035687134  7.471929e-01 FALSE
## 158     MSTATUS BLUE_COLLAR  0.0045503896  6.810627e-01 FALSE
## 159     MSTATUS     MINIVAN  0.0009564217  9.311578e-01 FALSE
## 160     MSTATUS      PICKUP  0.0013592314  9.022878e-01 FALSE
## 161     MSTATUS      SPORTS  0.0112104575  3.112466e-01 FALSE
## 162    TRAVTIME     CAR_USE -0.0248053795  2.503410e-02 FALSE
## 163    TRAVTIME         TIF -0.0116046256  2.945390e-01 FALSE
## 164    TRAVTIME    OLDCLAIM -0.0192671689  8.177875e-02 FALSE
## 165    TRAVTIME    CLM_FREQ  0.0065602114  5.534799e-01 FALSE
## 166    TRAVTIME     REVOKED -0.0121152699  2.738038e-01 FALSE
## 167    TRAVTIME     MVR_PTS  0.0105985106  3.384001e-01 FALSE
## 168    TRAVTIME     CAR_AGE -0.0364222498  9.986376e-04 FALSE
## 169    TRAVTIME  URBANICITY -0.1660047341  1.640236e-51 FALSE
## 170    TRAVTIME         PHD -0.0429039572  1.057263e-04 FALSE
## 171    TRAVTIME     MASTERS -0.0375494718  6.917442e-04 FALSE
## 172    TRAVTIME          HS  0.0190208120  8.576030e-02 FALSE
## 173    TRAVTIME        NOHS  0.0273923554  1.333620e-02 FALSE
## 174    TRAVTIME      DOCTOR -0.0269177228  1.502532e-02 FALSE
## 175    TRAVTIME     MANAGER -0.0749253684  1.225975e-11 FALSE
## 176    TRAVTIME     STUDENT  0.0283659291  1.038762e-02 FALSE
## 177    TRAVTIME BLUE_COLLAR  0.0424745029  1.239311e-04 FALSE
## 178    TRAVTIME     MINIVAN -0.0083978230  4.481267e-01 FALSE
## 179    TRAVTIME      PICKUP -0.0084380748  4.459539e-01 FALSE
## 180    TRAVTIME      SPORTS  0.0112356765  3.101592e-01 FALSE
## 181     CAR_USE         TIF -0.0001160512  9.916365e-01 FALSE
## 182     CAR_USE    OLDCLAIM -0.0357676283  1.230458e-03 FALSE
## 183     CAR_USE    CLM_FREQ -0.0814906825  1.670027e-13 FALSE
## 184     CAR_USE     REVOKED -0.0168968510  1.269334e-01 FALSE
## 185     CAR_USE     MVR_PTS -0.0680837946  7.424192e-10 FALSE
## 186     CAR_USE     CAR_AGE  0.0676272022  9.632170e-10 FALSE
## 187     CAR_USE  URBANICITY  0.0204630452  6.452916e-02 FALSE
## 188     CAR_USE         PHD  0.0250940213  2.339268e-02 FALSE
## 189     CAR_USE     MASTERS  0.1282037052  2.971966e-31 FALSE
## 190     CAR_USE          HS -0.1596014265  1.062614e-47 FALSE
## 191     CAR_USE        NOHS  0.1269912794  1.082325e-30 FALSE
## 192     CAR_USE      DOCTOR  0.1354404558  1.021913e-34 FALSE
## 193     CAR_USE     MANAGER  0.0961826970  3.097201e-18 FALSE
## 194     CAR_USE     STUDENT -0.0806555709  2.941293e-13 FALSE
## 195     CAR_USE BLUE_COLLAR -0.4380537778  0.000000e+00 FALSE
## 196     CAR_USE     MINIVAN  0.2046295876  7.061299e-78 FALSE
## 197     CAR_USE      PICKUP -0.2257311630  8.587749e-95 FALSE
## 198     CAR_USE      SPORTS  0.1425417945  2.637087e-38 FALSE
## 199         TIF    OLDCLAIM -0.0219581980  4.730039e-02 FALSE
## 200         TIF    CLM_FREQ -0.0230229550  3.754291e-02 FALSE
## 201         TIF     REVOKED -0.0318415132  4.017359e-03 FALSE
## 202         TIF     MVR_PTS -0.0410457340  2.080836e-04 FALSE
## 203         TIF     CAR_AGE  0.0075594969  4.947226e-01 FALSE
## 204         TIF  URBANICITY  0.0071310133  5.195023e-01 FALSE
## 205         TIF         PHD -0.0078535466  4.780900e-01 FALSE
## 206         TIF     MASTERS  0.0181811177  1.005207e-01 FALSE
## 207         TIF          HS  0.0025818532  8.156024e-01 FALSE
## 208         TIF        NOHS -0.0008852708  9.362678e-01 FALSE
## 209         TIF      DOCTOR -0.0113072010  3.070891e-01 FALSE
## 210         TIF     MANAGER  0.0099573035  3.684344e-01 FALSE
## 211         TIF     STUDENT -0.0166644709  1.322436e-01 FALSE
## 212         TIF BLUE_COLLAR -0.0066761671  5.464903e-01 FALSE
## 213         TIF     MINIVAN -0.0093689420  3.974062e-01 FALSE
## 214         TIF      PICKUP  0.0048783681  6.594748e-01 FALSE
## 215         TIF      SPORTS -0.0074874653  4.988436e-01 FALSE
## 331      DOCTOR     MANAGER -0.0654289801  3.295165e-09 FALSE
## 332      DOCTOR     STUDENT -0.0545045933  8.360529e-07 FALSE
## 333      DOCTOR BLUE_COLLAR -0.0946162963  1.077181e-17 FALSE
## 334      DOCTOR     MINIVAN  0.0380040469  5.948895e-04 FALSE
## 335      DOCTOR      PICKUP -0.0340756591  2.078654e-03 FALSE
## 336      DOCTOR      SPORTS -0.0007753139  9.441700e-01 FALSE
## 346 BLUE_COLLAR     MINIVAN  0.0242686903  2.835221e-02 FALSE
## 347 BLUE_COLLAR      PICKUP  0.0378625555  6.235926e-04 FALSE
## 348 BLUE_COLLAR      SPORTS -0.0325862131  3.238823e-03 FALSE
## 349     MINIVAN      PICKUP -0.2704284523 9.198656e-137 FALSE
## 350     MINIVAN      SPORTS -0.2111419861  6.628612e-83 FALSE
## 351      PICKUP      SPORTS -0.1601428363  5.130616e-48 FALSE

##The specification M[1:n, (X[,“TARGET_AMT”] & X[,“TARGET_FLAG”])] creates a data frame excluding INDEX, TARGET_FLAG, TARGET_AMT, and the variables previously marked for removal due to statistically significant correlations with the response variable. There are strong statistically significant correlations between HOME_VAL & INCOME, PHD & DOCTOR, and MASTERS & LAWYER. From these paired correlated variables we find that HOME_VAL, DOCTOR, and LAWYER are least correlated to both TARGET_FLAG and TARGET_AMT. These three variables will therefore not be considered for the model. It is worth noting that the high correlation between PHD & DOCTOR and MASTERS & LAWYER is likely due to prior imputation.

remove <- c("HOME_VAL", "DOCTOR", "LAWYER")
X[match(remove, names(M)), ] <- F

Logarithmic Transformation

library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
columns <- c("INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "MVR_PTS")
fit_exp <- function(X, fields) {
  potential <- match(fields, names(X))
  lambda <- numeric(ncol(X))
  par(mfrow=c(2,3))
  for (i in potential) {
    shifted <- X[, i] - min(X[, i]) + 1e-32
    fit_exp <- fitdistr(shifted, "Exponential")
    lambda[i] <- fit_exp$estimate
    exp <- rexp(1000, lambda[i])
    hist(X[, i], prob=TRUE, col="grey", main =names(X[i]), 
         xlab=paste("Lambda =",fractions(lambda[i])))
    lines(density(exp), col="blue", lwd=2)
  }
  lambda <- data.frame("VARIABLE"=fields, "LAMBDA"=lambda[potential])
  return(lambda)
}
lambda <- fit_exp(M[1:n, ], columns)

lambda
##   VARIABLE       LAMBDA
## 1   INCOME 1.615559e-05
## 2 HOME_VAL 5.868625e-06
## 3 TRAVTIME 3.510530e-02
## 4 BLUEBOOK 7.037347e-05
## 5      TIF 2.298161e-01
## 6  MVR_PTS 5.897955e-01

#Five of the six potential variables lend themselves toward modeling with an exponential distribution. The variables were shifted to slightly above zero by subtracting the minimum value and then adding 1−32 to the modified value. This would also shift data with a negative minimum in the appropriate direction since subtracting the negative minimum value equates to adding the minimum value

M[, "log_INCOME"] <- log(M[, "INCOME"] - min(M[, "INCOME"]) + 1e-32, lambda[1,2])
M[, "log_TRAVTIME"] <- log(M[, "TRAVTIME"] - min(M[, "TRAVTIME"]) + 1e-32, lambda[3,2])
M[, "log_BLUEBOOK"] <- log(M[, "BLUEBOOK"] - min(M[, "BLUEBOOK"]) + 1e-32, lambda[4,2])
M[, "log_TIF"] <- log(M[, "TIF"] - min(M[, "TIF"]) + 1e-32, lambda[5,2])
M[, "log_MVR_PTS"] <- log(M[, "MVR_PTS"] - min(M[, "MVR_PTS"]) + 1e-32, lambda[6,2])
remove <- c("INCOME", "TRAVTIME", "BLUEBOOK", "TIF", "MVR_PTS")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)), 
                         "TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F

#Categorization of Multimodal Data

par(mfrow=c(2,3))
smoothScatter(M[1:n, "KIDSDRIV"], ylab = "KIDSDRIV")
smoothScatter(M[1:n, "HOMEKIDS"], ylab = "HOMEKIDS")
smoothScatter(M[1:n, "YOJ"], ylab = "YOJ")
smoothScatter(M[1:n, "OLDCLAIM"], ylab = "OLDCLAIM")
smoothScatter(M[1:n, "CLM_FREQ"], ylab = "CLM_FREQ")
smoothScatter(M[1:n, "CAR_AGE"], ylab = "CAR_AGE")

#The variables KIDSDRIV, HOMEKIDS, YOJ, OLDCLAIM, CLM_FREQ, and CAR_AGE have bimodal distributions. There are clear lines of demarcation in the values that we can use to bifurcate the variables into categories. The defining value for each of these variables is zero. Therefore, we can categorize the variables as zero if the value is equal to zero, and one otherwise.

M[,"cat_KIDSDRIV"] <- ifelse(M$KIDSDRIV == 0, 0, 1)
M[,"cat_HOMEKIDS"] <- ifelse(M$HOMEKIDS == 0, 0, 1)
M[,"cat_YOJ"] <- ifelse(M$YOJ == 0, 0, 1)
M[,"cat_OLDCLAIM"] <- ifelse(M$OLDCLAIM == 0, 0, 1)
M[,"cat_CLM_FREQ"] <- ifelse(M$CLM_FREQ == 0, 0, 1)
M[,"cat_CAR_AGE"] <- ifelse(M$CAR_AGE == 0, 0, 1)
remove <- c("KIDSDRIV", "HOMEKIDS", "YOJ", "OLDCLAIM", "CLM_FREQ", "CAR_AGE")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)), 
                         "TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F

Box-Cox Transformation

library(car)
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
potential <- match(c("AGE"), names(M))
box.cox.powers <- powerTransform(M[1:n, potential], family="bcPower")
summary(box.cox.powers)
## bcPower Transformation to Normality 
##                   Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## M[1:n, potential]    1.0391           1         0.95       1.1282
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                           LRT df       pval
## LR test, lambda = (0) 559.678  1 < 2.22e-16
## 
## Likelihood ratio test that no transformation is needed
##                             LRT df    pval
## LR test, lambda = (1) 0.7421689  1 0.38897

#The only unexamined variable that the Box-Cox Transformation could potentially be applied to is the quantitative variable AGE. However, upon examination, examination AGE returns an estimated power close to one which indicates that no transformation is necessary. This is further supported by the boundaries which include the value of one in the range.

Building Models

training_AMT <- M[1:n, X[,"TARGET_AMT"]]
training_FLAG <- M[1:n, X[,"TARGET_FLAG"]]


#Forward Selection


#Multiple Linear Regression

null <- lm(TARGET_AMT ~ 0, training_AMT)
full <- lm(TARGET_AMT ~ ., training_AMT)
aic_steps <- step(null, scope=list(lower=null, upper=full), direction="forward", k = 2, trace=F)
aic_steps$call
## lm(formula = TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 + 
##     MANAGER + CAR_USE + cat_CAR_AGE + MINIVAN + log_MVR_PTS + 
##     MSTATUS + cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED + 
##     SPORTS + log_INCOME + log_TRAVTIME - 1, data = training_AMT)

##The above model has the lowest AIC.

forward_AMT <- lm(aic_steps$call, training_AMT)
round(coef(summary(forward_AMT)), 6)
##                 Estimate Std. Error   t value Pr(>|t|)
## URBANICITY   1505.085315 136.313068 11.041387 0.000000
## cat_OLDCLAIM  552.793025 114.474396  4.828967 0.000001
## PARENT1       640.172763 177.545247  3.605688 0.000313
## MANAGER      -851.066664 162.431235 -5.239551 0.000000
## CAR_USE      -785.064181 111.841871 -7.019412 0.000000
## cat_CAR_AGE   761.221081 182.601816  4.168749 0.000031
## MINIVAN      -470.879470 121.461053 -3.876794 0.000107
## log_MVR_PTS    -2.769052   0.757178 -3.657068 0.000257
## MSTATUS      -585.805201 119.630701 -4.896780 0.000001
## cat_KIDSDRIV  692.326953 162.330292  4.264928 0.000020
## log_TIF         6.991248   2.130387  3.281680 0.001036
## NOHS          657.591808 153.642482  4.280013 0.000019
## HS            470.297182 120.908400  3.889698 0.000101
## REVOKED       471.513352 155.133315  3.039407 0.002378
## SPORTS        321.992205 168.356009  1.912567 0.055839
## log_INCOME     45.990011  25.543071  1.800489 0.071820
## log_TRAVTIME  -18.650708  11.105457 -1.679418 0.093109

#At a significance level of alpha=0.5,the 17 forward selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME.Removing those three insignificant variables yields a model with all significant variables

forward_AMT <- lm(TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 + 
                    MANAGER + CAR_USE + MINIVAN + log_MVR_PTS + MSTATUS + cat_CAR_AGE +
                    cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED - 1, training_AMT)
summary(forward_AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 + 
##     MANAGER + CAR_USE + MINIVAN + log_MVR_PTS + MSTATUS + cat_CAR_AGE + 
##     cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED - 1, data = training_AMT)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4997  -1694   -818    354 104984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## URBANICITY   1469.0432   135.8182  10.816  < 2e-16 ***
## cat_OLDCLAIM  572.1325   114.3497   5.003 5.75e-07 ***
## PARENT1       655.2860   177.4451   3.693 0.000223 ***
## MANAGER      -884.0253   162.0108  -5.457 5.00e-08 ***
## CAR_USE      -729.9259   109.4039  -6.672 2.69e-11 ***
## MINIVAN      -540.1926   117.7244  -4.589 4.53e-06 ***
## log_MVR_PTS    -2.8047     0.7574  -3.703 0.000214 ***
## MSTATUS      -581.0082   119.6464  -4.856 1.22e-06 ***
## cat_CAR_AGE   772.8248   181.7666   4.252 2.14e-05 ***
## cat_KIDSDRIV  683.0515   162.3725   4.207 2.62e-05 ***
## log_TIF         7.1675     2.1301   3.365 0.000769 ***
## NOHS          676.0476   153.2975   4.410 1.05e-05 ***
## HS            496.4631   120.3608   4.125 3.75e-05 ***
## REVOKED       474.5687   155.1805   3.058 0.002234 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared:  0.1488, Adjusted R-squared:  0.1473 
## F-statistic: 101.7 on 14 and 8147 DF,  p-value: < 2.2e-16

Binary Logistic Regression

null <- glm(TARGET_FLAG ~ 0, family = binomial(link = "logit"), training_FLAG)
full <- glm(TARGET_FLAG ~ ., family = binomial(link = "logit"), training_FLAG)
aic_steps <- step(null, scope=list(lower=null, upper=full), direction="forward", k = 2, trace=F)
aic_steps$aic
## [1] 7500.651

##The above model has the lowest AIC.

forward_FLAG <- glm(aic_steps$formula, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(forward_FLAG)), 6)
##               Estimate Std. Error   z value Pr(>|z|)
## AGE          -0.003424   0.003925 -0.872196 0.383101
## URBANICITY    2.224240   0.110518 20.125582 0.000000
## cat_OLDCLAIM  0.553906   0.060688  9.127162 0.000000
## CAR_USE      -0.729315   0.081633 -8.934078 0.000000
## cat_YOJ      -0.000101   0.283764 -0.000355 0.999717
## MSTATUS      -0.692176   0.074923 -9.238513 0.000000
## MANAGER      -0.712284   0.107968 -6.597173 0.000000
## MINIVAN      -0.418051   0.105548 -3.960787 0.000075
## REVOKED       0.729757   0.079316  9.200623 0.000000
## cat_KIDSDRIV  0.556026   0.096696  5.750221 0.000000
## HS            0.513970   0.075100  6.843797 0.000000
## NOHS          0.546576   0.097037  5.632669 0.000000
## cat_CAR_AGE  -2.817303   0.329517 -8.549807 0.000000
## log_TIF       0.007683   0.001190  6.459280 0.000000
## log_MVR_PTS  -0.002206   0.000437 -5.052844 0.000000
## cat_HOMEKIDS  0.224665   0.096231  2.334635 0.019563
## SPORTS        0.636551   0.119359  5.333084 0.000000
## log_TRAVTIME -0.028562   0.007067 -4.041575 0.000053
## SUV           0.384536   0.100159  3.839250 0.000123
## PHD          -0.314215   0.115890 -2.711326 0.006701
## log_INCOME    0.106774   0.037340  2.859542 0.004243
## CLERICAL      0.325682   0.097022  3.356792 0.000789
## PICKUP        0.223960   0.095602  2.342627 0.019149
## log_BLUEBOOK  0.061436   0.023980  2.561970 0.010408
## BLUE_COLLAR   0.188304   0.090050  2.091115 0.036518
## PARENT1       0.215201   0.118922  1.809608 0.070357
#At a significance level of alpha=0.5,the 26 forward selected variables yield a TARGET_FLAG BLR model with three insignificant variables: AGE, cat_YOJ, and PARENT1. Removing those three insignificant variables yields a model with all significant variables

forward_FLAG <- glm(TARGET_FLAG ~ URBANICITY + cat_OLDCLAIM + CAR_USE +
                      MSTATUS + MANAGER + MINIVAN + REVOKED + cat_KIDSDRIV + HS + 
                      NOHS + cat_CAR_AGE + log_TIF + log_MVR_PTS + cat_HOMEKIDS + 
                      SPORTS + log_TRAVTIME + SUV + PHD + log_INCOME + CLERICAL + 
                      PICKUP + log_BLUEBOOK + BLUE_COLLAR - 1, family = binomial(link = "logit"), training_FLAG)
summary(forward_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ URBANICITY + cat_OLDCLAIM + CAR_USE + 
##     MSTATUS + MANAGER + MINIVAN + REVOKED + cat_KIDSDRIV + HS + 
##     NOHS + cat_CAR_AGE + log_TIF + log_MVR_PTS + cat_HOMEKIDS + 
##     SPORTS + log_TRAVTIME + SUV + PHD + log_INCOME + CLERICAL + 
##     PICKUP + log_BLUEBOOK + BLUE_COLLAR - 1, family = binomial(link = "logit"), 
##     data = training_FLAG)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1946  -0.7273  -0.4224   0.6889   3.1402  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## URBANICITY    2.2185007  0.1101897  20.133  < 2e-16 ***
## cat_OLDCLAIM  0.5511070  0.0606483   9.087  < 2e-16 ***
## CAR_USE      -0.7312157  0.0815656  -8.965  < 2e-16 ***
## MSTATUS      -0.7827185  0.0585965 -13.358  < 2e-16 ***
## MANAGER      -0.7090159  0.1077946  -6.577 4.79e-11 ***
## MINIVAN      -0.4156174  0.1055195  -3.939 8.19e-05 ***
## REVOKED       0.7302799  0.0793086   9.208  < 2e-16 ***
## cat_KIDSDRIV  0.5302165  0.0941368   5.632 1.78e-08 ***
## HS            0.5146327  0.0749761   6.864 6.70e-12 ***
## NOHS          0.5475820  0.0968129   5.656 1.55e-08 ***
## cat_CAR_AGE  -2.9259596  0.1395232 -20.971  < 2e-16 ***
## log_TIF       0.0076797  0.0011890   6.459 1.06e-10 ***
## log_MVR_PTS  -0.0022271  0.0004362  -5.106 3.29e-07 ***
## cat_HOMEKIDS  0.3568313  0.0683715   5.219 1.80e-07 ***
## SPORTS        0.6357416  0.1191917   5.334 9.62e-08 ***
## log_TRAVTIME -0.0283289  0.0070487  -4.019 5.84e-05 ***
## SUV           0.3844424  0.1001565   3.838 0.000124 ***
## PHD          -0.3247954  0.1155157  -2.812 0.004928 ** 
## log_INCOME    0.1076736  0.0150112   7.173 7.34e-13 ***
## CLERICAL      0.3327295  0.0966097   3.444 0.000573 ***
## PICKUP        0.2252018  0.0956307   2.355 0.018527 *  
## log_BLUEBOOK  0.0618683  0.0239369   2.585 0.009748 ** 
## BLUE_COLLAR   0.1911011  0.0899125   2.125 0.033552 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11313.5  on 8161  degrees of freedom
## Residual deviance:  7452.6  on 8138  degrees of freedom
## AIC: 7498.6
## 
## Number of Fisher Scoring iterations: 5

Backward Elimination

#Backward stepwise subset elimination based on AIC Using k=2 degrees of freedom for the penalty gives the genuine AIC #Using k=log(n) is sometimes referred to as BIC or SBC

Multiple Linear Regression

null <- lm(TARGET_AMT ~ 0, training_AMT)
full <- lm(TARGET_AMT ~ ., training_AMT)
aic_steps <- step(full, scope=list(lower=null, upper=full), direction="backward", k = 2, trace=F)
aic_steps$call
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + HS + NOHS + MANAGER + MINIVAN + SPORTS + log_INCOME + 
##     log_TRAVTIME + log_TIF + log_MVR_PTS + cat_KIDSDRIV + cat_OLDCLAIM, 
##     data = training_AMT)
backward_AMT <- lm(aic_steps$call, training_AMT)
round(coef(summary(backward_AMT)), 6)
##                 Estimate Std. Error   t value Pr(>|t|)
## (Intercept)   762.784417 183.151125  4.164782 0.000031
## PARENT1       640.018132 177.569345  3.604328 0.000315
## MSTATUS      -586.551921 119.723526 -4.899220 0.000001
## CAR_USE      -785.707613 111.900046 -7.021513 0.000000
## REVOKED       470.841555 155.143313  3.034881 0.002414
## URBANICITY   1505.218992 136.333458 11.040716 0.000000
## HS            470.034608 120.940482  3.886495 0.000103
## NOHS          657.777887 153.639789  4.281299 0.000019
## MANAGER      -851.651499 162.435546 -5.243012 0.000000
## MINIVAN      -471.449326 121.477290 -3.880967 0.000105
## SPORTS        322.212013 168.353540  1.913901 0.055668
## log_INCOME     46.035671  25.544003  1.802210 0.071549
## log_TRAVTIME  -18.636693  11.105620 -1.678132 0.093360
## log_TIF         6.989195   2.130538  3.280483 0.001041
## log_MVR_PTS    -2.773052   0.757597 -3.660325 0.000253
## cat_KIDSDRIV  691.794967 162.331388  4.261622 0.000021
## cat_OLDCLAIM  552.008131 114.513722  4.820454 0.000001

#At a significance level of alpha=0.5 , the 17 backward selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME. Removing those three insignificant variables yields a model with all significant variables

backward_AMT <- lm(TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED + 
                     URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + 
                     log_MVR_PTS + cat_KIDSDRIV + cat_OLDCLAIM, training_AMT)
summary(backward_AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS + 
##     cat_KIDSDRIV + cat_OLDCLAIM, data = training_AMT)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4997  -1694   -819    354 104984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   774.2430   182.3082   4.247 2.19e-05 ***
## PARENT1       655.1922   177.4678   3.692 0.000224 ***
## MSTATUS      -581.7175   119.7388  -4.858 1.21e-06 ***
## CAR_USE      -730.5026   109.4617  -6.674 2.66e-11 ***
## REVOKED       473.8986   155.1903   3.054 0.002268 ** 
## URBANICITY   1469.2228   135.8413  10.816  < 2e-16 ***
## HS            496.2580   120.3904   4.122 3.79e-05 ***
## NOHS          676.2812   153.2939   4.412 1.04e-05 ***
## MANAGER      -884.6343   162.0158  -5.460 4.90e-08 ***
## MINIVAN      -540.8132   117.7404  -4.593 4.43e-06 ***
## log_TIF         7.1657     2.1302   3.364 0.000772 ***
## log_MVR_PTS    -2.8086     0.7578  -3.706 0.000212 ***
## cat_KIDSDRIV  682.5059   162.3736   4.203 2.66e-05 ***
## cat_OLDCLAIM  571.3661   114.3887   4.995 6.01e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared:  0.0617, Adjusted R-squared:  0.0602 
## F-statistic: 41.21 on 13 and 8147 DF,  p-value: < 2.2e-16

Binary Logsitic Regression

null <- glm(TARGET_FLAG ~ 0, family = binomial(link = "logit"), training_FLAG)
full <- glm(TARGET_FLAG ~ ., family = binomial(link = "logit"), training_FLAG)
aic_steps <- step(full, scope=list(lower=null, upper=full), direction="backward", k = 2, trace=F)
aic_steps$formula
## TARGET_FLAG ~ PARENT1 + MSTATUS + CAR_USE + REVOKED + URBANICITY + 
##     PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + STUDENT + 
##     BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
##     log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
##     cat_HOMEKIDS + cat_OLDCLAIM
backward_FLAG <- glm(aic_steps$formula, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(backward_FLAG)), 6)
##               Estimate Std. Error    z value Pr(>|z|)
## (Intercept)  -2.554923   0.160423 -15.926206 0.000000
## PARENT1       0.217757   0.119008   1.829758 0.067286
## MSTATUS      -0.694028   0.074400  -9.328352 0.000000
## CAR_USE      -0.688893   0.080778  -8.528279 0.000000
## REVOKED       0.733365   0.079366   9.240333 0.000000
## URBANICITY    2.249695   0.111388  20.196878 0.000000
## PHD          -0.791338   0.133765  -5.915870 0.000000
## MASTERS      -0.508002   0.105529  -4.813852 0.000001
## BACHELORS    -0.485927   0.077640  -6.258734 0.000000
## CLERICAL      0.382111   0.106668   3.582245 0.000341
## MANAGER      -0.690384   0.109689  -6.293993 0.000000
## STUDENT       0.184715   0.126539   1.459743 0.144361
## BLUE_COLLAR   0.250666   0.103458   2.422876 0.015398
## MINIVAN      -0.435240   0.105575  -4.122578 0.000037
## PICKUP        0.219309   0.096226   2.279105 0.022661
## SPORTS        0.616909   0.118944   5.186530 0.000000
## SUV           0.364174   0.099838   3.647647 0.000265
## log_INCOME    0.100999   0.015677   6.442715 0.000000
## log_TRAVTIME -0.028743   0.007072  -4.064134 0.000048
## log_BLUEBOOK  0.059508   0.024005   2.479025 0.013174
## log_TIF       0.007729   0.001190   6.493015 0.000000
## log_MVR_PTS  -0.002189   0.000437  -5.008679 0.000001
## cat_KIDSDRIV  0.542256   0.094714   5.725173 0.000000
## cat_HOMEKIDS  0.252943   0.087625   2.886646 0.003894
## cat_OLDCLAIM  0.557205   0.060751   9.171972 0.000000

#At a significance level of alpha=0.5 the 25 backward selected variables yield a TARGET_FLAG BLR model with three insignificant variables: PARENT1 and STUDENT. Removing those two insignificant variables yields a model with all significant variables

backward_FLAG <- glm(TARGET_FLAG ~ MSTATUS + CAR_USE + REVOKED + URBANICITY + 
                       PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + 
                       BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
                       log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
                       cat_HOMEKIDS + cat_OLDCLAIM, family = binomial(link = "logit"), training_FLAG)
summary(backward_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ MSTATUS + CAR_USE + REVOKED + URBANICITY + 
##     PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + BLUE_COLLAR + 
##     MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + log_TRAVTIME + 
##     log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS + 
##     cat_OLDCLAIM, family = binomial(link = "logit"), data = training_FLAG)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2016  -0.7231  -0.4212   0.6891   3.1351  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.4312711  0.1486779 -16.353  < 2e-16 ***
## MSTATUS      -0.7777777  0.0586439 -13.263  < 2e-16 ***
## CAR_USE      -0.7155003  0.0787814  -9.082  < 2e-16 ***
## REVOKED       0.7343615  0.0793348   9.256  < 2e-16 ***
## URBANICITY    2.2363335  0.1109238  20.161  < 2e-16 ***
## PHD          -0.8497653  0.1275975  -6.660 2.74e-11 ***
## MASTERS      -0.5639725  0.0979155  -5.760 8.42e-09 ***
## BACHELORS    -0.5101645  0.0752438  -6.780 1.20e-11 ***
## CLERICAL      0.3256558  0.0990172   3.289  0.00101 ** 
## MANAGER      -0.7098209  0.1082712  -6.556 5.53e-11 ***
## BLUE_COLLAR   0.1852830  0.0928333   1.996  0.04595 *  
## MINIVAN      -0.4147417  0.1048748  -3.955 7.67e-05 ***
## PICKUP        0.2317109  0.0958995   2.416  0.01568 *  
## SPORTS        0.6331507  0.1186074   5.338 9.39e-08 ***
## SUV           0.3802718  0.0993833   3.826  0.00013 ***
## log_INCOME    0.1067877  0.0151806   7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046  0.0070522  -4.042 5.30e-05 ***
## log_BLUEBOOK  0.0604612  0.0239746   2.522  0.01167 *  
## log_TIF       0.0077168  0.0011898   6.486 8.82e-11 ***
## log_MVR_PTS  -0.0021850  0.0004367  -5.004 5.63e-07 ***
## cat_KIDSDRIV  0.5297128  0.0942272   5.622 1.89e-08 ***
## cat_HOMEKIDS  0.3609836  0.0685862   5.263 1.42e-07 ***
## cat_OLDCLAIM  0.5555339  0.0607168   9.150  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7446.2  on 8138  degrees of freedom
## AIC: 7492.2
## 
## Number of Fisher Scoring iterations: 5

Creating a matrix

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
## Loaded glmnet 4.1-7
library(glmnet)
library(Matrix)
my_matrix <- as.matrix(training_AMT[, -1])
my_sparse_matrix <- as(my_matrix, "dgCMatrix")
result <- t(my_sparse_matrix) %*% my_sparse_matrix

Lasso Regression

#Lasso Regression 

library(glmnet)
# Set lambda sequence
lambda_seq <- 10^seq(10, -2, length = 100)
# Prepare training data
training_AMT <- M[1:n, X[,"TARGET_AMT"]]
training_FLAG <- M[1:n, X[,"TARGET_FLAG"]]
x_AMT <- model.matrix(TARGET_AMT ~ ., data = training_AMT)[,-1]
x_FLAG <- model.matrix(TARGET_FLAG ~ ., data = training_FLAG)[,-1]
y_AMT <- training_AMT$TARGET_AMT
y_FLAG <- training_FLAG$TARGET_FLAG
# Fit Lasso regression for TARGET_AMT
cv_AMT <- cv.glmnet(x_AMT, y_AMT, alpha = 1, lambda = lambda_seq)
best_AMT <- glmnet(x_AMT, y_AMT, alpha = 1, lambda = cv_AMT$lambda.min)
# Fit Lasso regression for TARGET_FLAG
cv_FLAG <- cv.glmnet(x_FLAG, y_FLAG, alpha = 1, lambda = lambda_seq, family = "binomial")
best_FLAG <- glmnet(x_FLAG, y_FLAG, alpha = 1, lambda = cv_FLAG$lambda.min, family = "binomial")
# Print coefficients
round(coef(best_AMT), 6)
## 30 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept)   683.212749
## AGE             2.996552
## PARENT1       481.144847
## MSTATUS      -620.166789
## CAR_USE      -719.826794
## REVOKED       437.760410
## URBANICITY   1473.526491
## PHD          -258.016305
## MASTERS       -94.320812
## HS            349.285622
## NOHS          508.319844
## MANAGER      -813.282712
## STUDENT         .       
## BLUE_COLLAR    78.957225
## MINIVAN      -476.355151
## TRUCK           .       
## PICKUP        -85.379281
## SPORTS        256.502591
## VAN            79.646690
## log_INCOME     45.478168
## log_TRAVTIME  -15.468947
## log_BLUEBOOK  -25.189019
## log_TIF         6.406495
## log_MVR_PTS    -2.683490
## cat_KIDSDRIV  553.361153
## cat_HOMEKIDS  196.950690
## cat_YOJ         .       
## cat_OLDCLAIM  542.017591
## cat_CLM_FREQ    0.000000
## cat_CAR_AGE     .
round(coef(best_FLAG), 6)
## 32 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## (Intercept)  -2.436792
## AGE          -0.000177
## PARENT1       0.242626
## MSTATUS      -0.483827
## CAR_USE      -0.406180
## REVOKED       0.583235
## URBANICITY    1.703912
## PHD          -0.201795
## MASTERS      -0.034306
## BACHELORS     .       
## HS            0.415456
## NOHS          0.344233
## CLERICAL      0.125212
## MANAGER      -0.567087
## PROF          .       
## STUDENT       0.063044
## BLUE_COLLAR   0.163277
## MINIVAN      -0.523709
## PICKUP        .       
## SPORTS        0.161783
## SUV           0.004017
## log_INCOME    0.077110
## log_TRAVTIME -0.010885
## log_BLUEBOOK  0.010419
## log_TIF       0.004613
## log_MVR_PTS  -0.001580
## cat_KIDSDRIV  0.358394
## cat_HOMEKIDS  0.209235
## cat_YOJ       .       
## cat_OLDCLAIM  0.527077
## cat_CLM_FREQ  0.000000
## cat_CAR_AGE   .

Adjusted R square

install.packages("leaps")
## package 'leaps' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
model_sum_AMT <- summary(regsubsets(TARGET_AMT ~ ., training_AMT, nvmax=ncol(training_AMT)))
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Reordering variables and trying again:
model_sum_FLAG <- summary(regsubsets(TARGET_FLAG ~ ., training_FLAG, nvmax=ncol(training_FLAG)))
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 2 linear dependencies found
## Reordering variables and trying again:
par(mfrow=c(1,2))
plot(model_sum_AMT$adjr2, xlab = "Number of Variables", ylab = "Adj R-squared", main="TARGET_AMT")
plot(model_sum_FLAG$adjr2, xlab = "Number of Variables", ylab = "Adj R-squared", main="TARGET_FLAG")

cbind(max(model_sum_AMT$adjr2), which.max(model_sum_AMT$adjr2))
##            [,1] [,2]
## [1,] 0.06123634   19
cbind(max(model_sum_FLAG$adjr2), which.max(model_sum_FLAG$adjr2))
##          [,1] [,2]
## [1,] 0.211793   24

Multiple Linear Regression

#The maximum Adjusted R2 of 0.0612363 for the model predicting TARGET_AMT is reached when the model contains 19 variables

model_sum_AMT$which[which.max(model_sum_AMT$adjr2), ]
##  (Intercept)          AGE      PARENT1      MSTATUS      CAR_USE      REVOKED 
##         TRUE        FALSE         TRUE         TRUE         TRUE         TRUE 
##   URBANICITY          PHD      MASTERS           HS         NOHS      MANAGER 
##         TRUE         TRUE        FALSE         TRUE         TRUE         TRUE 
##      STUDENT  BLUE_COLLAR      MINIVAN        TRUCK       PICKUP       SPORTS 
##        FALSE        FALSE         TRUE        FALSE         TRUE         TRUE 
##          VAN   log_INCOME log_TRAVTIME log_BLUEBOOK      log_TIF  log_MVR_PTS 
##        FALSE         TRUE         TRUE        FALSE         TRUE         TRUE 
## cat_KIDSDRIV cat_HOMEKIDS      cat_YOJ cat_OLDCLAIM cat_CLM_FREQ  cat_CAR_AGE 
##         TRUE         TRUE        FALSE        FALSE         TRUE        FALSE
adjustedr2_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + 
                       REVOKED + URBANICITY + PHD + HS + NOHS + MANAGER + MINIVAN + PICKUP + 
                       SPORTS + log_INCOME + log_TRAVTIME + log_TIF + log_MVR_PTS + 
                       cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ, training_AMT)
round(coef(summary(adjustedr2_AMT)), 6)
##                 Estimate Std. Error   t value Pr(>|t|)
## (Intercept)   855.350195 190.724913  4.484732 0.000007
## PARENT1       477.258229 217.200288  2.197319 0.028026
## MSTATUS      -647.893956 127.782128 -5.070302 0.000000
## CAR_USE      -803.418566 113.162687 -7.099677 0.000000
## REVOKED       464.971703 155.173002  2.996473 0.002739
## URBANICITY   1524.033915 136.731529 11.146178 0.000000
## PHD          -252.479571 185.347297 -1.362197 0.173173
## HS            425.653740 124.349425  3.423046 0.000622
## NOHS          616.658877 157.387319  3.918098 0.000090
## MANAGER      -843.520474 162.559687 -5.188989 0.000000
## MINIVAN      -514.629399 126.582870 -4.065553 0.000048
## PICKUP       -165.005969 145.517170 -1.133928 0.256858
## SPORTS        275.376716 171.928269  1.601695 0.109262
## log_INCOME     45.399087  25.548734  1.776960 0.075612
## log_TRAVTIME  -18.536784  11.106373 -1.669022 0.095151
## log_TIF         7.028143   2.130404  3.298971 0.000975
## log_MVR_PTS    -2.790571   0.757820 -3.682366 0.000233
## cat_KIDSDRIV  601.792137 177.410599  3.392087 0.000697
## cat_HOMEKIDS  179.571417 150.040877  1.196817 0.231413
## cat_CLM_FREQ  540.063343 114.635246  4.711146 0.000003

#At a significance level of alpha=0.5, the R2adj selected variables yield a TARGET_AMT MLR model with six insignificant variables: PHD, PICKUP, SPORTS, og_INCOME, log_TRAVTIME, and cat_HOMEKIDS. Removing those six insignificant variables yields a model with all significant variables

adjustedr2_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + 
                       REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + 
                       log_MVR_PTS + cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
summary(adjustedr2_AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS + 
##     cat_KIDSDRIV + cat_CLM_FREQ, data = training_AMT)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4997  -1694   -819    354 104984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   774.2430   182.3082   4.247 2.19e-05 ***
## PARENT1       655.1922   177.4678   3.692 0.000224 ***
## MSTATUS      -581.7175   119.7388  -4.858 1.21e-06 ***
## CAR_USE      -730.5026   109.4617  -6.674 2.66e-11 ***
## REVOKED       473.8986   155.1903   3.054 0.002268 ** 
## URBANICITY   1469.2228   135.8413  10.816  < 2e-16 ***
## HS            496.2580   120.3904   4.122 3.79e-05 ***
## NOHS          676.2812   153.2939   4.412 1.04e-05 ***
## MANAGER      -884.6343   162.0158  -5.460 4.90e-08 ***
## MINIVAN      -540.8132   117.7404  -4.593 4.43e-06 ***
## log_TIF         7.1657     2.1302   3.364 0.000772 ***
## log_MVR_PTS    -2.8086     0.7578  -3.706 0.000212 ***
## cat_KIDSDRIV  682.5059   162.3736   4.203 2.66e-05 ***
## cat_CLM_FREQ  571.3661   114.3887   4.995 6.01e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared:  0.0617, Adjusted R-squared:  0.0602 
## F-statistic: 41.21 on 13 and 8147 DF,  p-value: < 2.2e-16

#Model is osmewhat similar to Backward elimation Model

Binary Logsitic Regression

#The maximum Adjusted R^(2) of 0.211793 for the model predicting TARGET_FLAG is reached when the model contains 24 variables.

model_sum_FLAG$which[which.max(model_sum_FLAG$adjr2), ]
##  (Intercept)          AGE      PARENT1      MSTATUS      CAR_USE      REVOKED 
##         TRUE        FALSE         TRUE         TRUE         TRUE         TRUE 
##   URBANICITY          PHD      MASTERS    BACHELORS           HS         NOHS 
##         TRUE         TRUE         TRUE         TRUE        FALSE        FALSE 
##     CLERICAL      MANAGER         PROF      STUDENT  BLUE_COLLAR      MINIVAN 
##         TRUE         TRUE        FALSE         TRUE         TRUE         TRUE 
##       PICKUP       SPORTS          SUV   log_INCOME log_TRAVTIME log_BLUEBOOK 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##      log_TIF  log_MVR_PTS cat_KIDSDRIV cat_HOMEKIDS      cat_YOJ cat_OLDCLAIM 
##         TRUE         TRUE         TRUE         TRUE        FALSE        FALSE 
## cat_CLM_FREQ  cat_CAR_AGE 
##         TRUE        FALSE
adjustedr2_FLAG <- glm(TARGET_FLAG ~ 1 + PARENT1 + MSTATUS + CAR_USE + 
                         REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + 
                         STUDENT + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
                         log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
                         cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(adjustedr2_FLAG)), 6)
##               Estimate Std. Error    z value Pr(>|z|)
## (Intercept)  -2.554923   0.160423 -15.926206 0.000000
## PARENT1       0.217757   0.119008   1.829758 0.067286
## MSTATUS      -0.694028   0.074400  -9.328352 0.000000
## CAR_USE      -0.688893   0.080778  -8.528279 0.000000
## REVOKED       0.733365   0.079366   9.240333 0.000000
## URBANICITY    2.249695   0.111388  20.196878 0.000000
## PHD          -0.791338   0.133765  -5.915870 0.000000
## MASTERS      -0.508002   0.105529  -4.813852 0.000001
## BACHELORS    -0.485927   0.077640  -6.258734 0.000000
## CLERICAL      0.382111   0.106668   3.582245 0.000341
## MANAGER      -0.690384   0.109689  -6.293993 0.000000
## STUDENT       0.184715   0.126539   1.459743 0.144361
## BLUE_COLLAR   0.250666   0.103458   2.422876 0.015398
## MINIVAN      -0.435240   0.105575  -4.122578 0.000037
## PICKUP        0.219309   0.096226   2.279105 0.022661
## SPORTS        0.616909   0.118944   5.186530 0.000000
## SUV           0.364174   0.099838   3.647647 0.000265
## log_INCOME    0.100999   0.015677   6.442715 0.000000
## log_TRAVTIME -0.028743   0.007072  -4.064134 0.000048
## log_BLUEBOOK  0.059508   0.024005   2.479025 0.013174
## log_TIF       0.007729   0.001190   6.493015 0.000000
## log_MVR_PTS  -0.002189   0.000437  -5.008679 0.000001
## cat_KIDSDRIV  0.542256   0.094714   5.725173 0.000000
## cat_HOMEKIDS  0.252943   0.087625   2.886646 0.003894
## cat_CLM_FREQ  0.557205   0.060751   9.171972 0.000000

#At a significance level of alpha=0.5, the R^2adj selected variables yield a TARGET_FLAG BLR model with two insignificant variables: PARENT1 and STUDENT. Removing those three insignificant variables yields a model with all significant variables

adjustedr2_FLAG <- glm(TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED + 
                         URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + 
                         BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
                         log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
                         cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), training_FLAG)
summary(adjustedr2_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + 
##     BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
##     log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
##     cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), 
##     data = training_FLAG)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2016  -0.7231  -0.4212   0.6891   3.1351  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.4312711  0.1486779 -16.353  < 2e-16 ***
## MSTATUS      -0.7777777  0.0586439 -13.263  < 2e-16 ***
## CAR_USE      -0.7155003  0.0787814  -9.082  < 2e-16 ***
## REVOKED       0.7343615  0.0793348   9.256  < 2e-16 ***
## URBANICITY    2.2363335  0.1109238  20.161  < 2e-16 ***
## PHD          -0.8497653  0.1275975  -6.660 2.74e-11 ***
## MASTERS      -0.5639725  0.0979155  -5.760 8.42e-09 ***
## BACHELORS    -0.5101645  0.0752438  -6.780 1.20e-11 ***
## CLERICAL      0.3256558  0.0990172   3.289  0.00101 ** 
## MANAGER      -0.7098209  0.1082712  -6.556 5.53e-11 ***
## BLUE_COLLAR   0.1852830  0.0928333   1.996  0.04595 *  
## MINIVAN      -0.4147417  0.1048748  -3.955 7.67e-05 ***
## PICKUP        0.2317109  0.0958995   2.416  0.01568 *  
## SPORTS        0.6331507  0.1186074   5.338 9.39e-08 ***
## SUV           0.3802718  0.0993833   3.826  0.00013 ***
## log_INCOME    0.1067877  0.0151806   7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046  0.0070522  -4.042 5.30e-05 ***
## log_BLUEBOOK  0.0604612  0.0239746   2.522  0.01167 *  
## log_TIF       0.0077168  0.0011898   6.486 8.82e-11 ***
## log_MVR_PTS  -0.0021850  0.0004367  -5.004 5.63e-07 ***
## cat_KIDSDRIV  0.5297128  0.0942272   5.622 1.89e-08 ***
## cat_HOMEKIDS  0.3609836  0.0685862   5.263 1.42e-07 ***
## cat_CLM_FREQ  0.5555339  0.0607168   9.150  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7446.2  on 8138  degrees of freedom
## AIC: 7492.2
## 
## Number of Fisher Scoring iterations: 5

Estimating the size of the bias

par(mfrow=c(1,2))
plot(model_sum_AMT$cp, xlab = "Number of Variables", ylab = "bias Cp", main="TARGET_AMT")
plot(model_sum_FLAG$cp, xlab = "Number of Variables", ylab = "bias Cp", main="TARGET_FLAG")

cbind(min(model_sum_AMT$cp), which.min(model_sum_AMT$cp))
##          [,1] [,2]
## [1,] 12.81883   16
cbind(min(model_sum_FLAG$cp), which.min(model_sum_FLAG$cp))
##          [,1] [,2]
## [1,] 19.25552   24

Multiple Linear Regression

#The minimum bias 12.8188345 for the model predicting TARGET_AMT is reached when the model contains 16 variables.

model_sum_AMT$which[which.min(model_sum_AMT$cp), ]
##  (Intercept)          AGE      PARENT1      MSTATUS      CAR_USE      REVOKED 
##         TRUE        FALSE         TRUE         TRUE         TRUE         TRUE 
##   URBANICITY          PHD      MASTERS           HS         NOHS      MANAGER 
##         TRUE        FALSE        FALSE         TRUE         TRUE         TRUE 
##      STUDENT  BLUE_COLLAR      MINIVAN        TRUCK       PICKUP       SPORTS 
##        FALSE        FALSE         TRUE        FALSE        FALSE         TRUE 
##          VAN   log_INCOME log_TRAVTIME log_BLUEBOOK      log_TIF  log_MVR_PTS 
##        FALSE         TRUE         TRUE        FALSE         TRUE         TRUE 
## cat_KIDSDRIV cat_HOMEKIDS      cat_YOJ cat_OLDCLAIM cat_CLM_FREQ  cat_CAR_AGE 
##         TRUE        FALSE        FALSE        FALSE         TRUE        FALSE
bias_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + 
    REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + SPORTS + 
    log_INCOME + log_TRAVTIME + log_TIF + log_MVR_PTS + 
    cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
round(coef(summary(bias_AMT)), 6)
##                 Estimate Std. Error   t value Pr(>|t|)
## (Intercept)   762.784417 183.151125  4.164782 0.000031
## PARENT1       640.018132 177.569345  3.604328 0.000315
## MSTATUS      -586.551921 119.723526 -4.899220 0.000001
## CAR_USE      -785.707613 111.900046 -7.021513 0.000000
## REVOKED       470.841555 155.143313  3.034881 0.002414
## URBANICITY   1505.218992 136.333458 11.040716 0.000000
## HS            470.034608 120.940482  3.886495 0.000103
## NOHS          657.777887 153.639789  4.281299 0.000019
## MANAGER      -851.651499 162.435546 -5.243012 0.000000
## MINIVAN      -471.449326 121.477290 -3.880967 0.000105
## SPORTS        322.212013 168.353540  1.913901 0.055668
## log_INCOME     46.035671  25.544003  1.802210 0.071549
## log_TRAVTIME  -18.636693  11.105620 -1.678132 0.093360
## log_TIF         6.989195   2.130538  3.280483 0.001041
## log_MVR_PTS    -2.773052   0.757597 -3.660325 0.000253
## cat_KIDSDRIV  691.794967 162.331388  4.261622 0.000021
## cat_CLM_FREQ  552.008131 114.513722  4.820454 0.000001

#At a significance level of alpha=0.5 , the R2adj selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME. Removing those three insignificant variables yields a model with all significant variables.

bias_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + 
    REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + 
    log_MVR_PTS + cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
summary(bias_AMT)
## 
## Call:
## lm(formula = TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS + 
##     cat_KIDSDRIV + cat_CLM_FREQ, data = training_AMT)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4997  -1694   -819    354 104984 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   774.2430   182.3082   4.247 2.19e-05 ***
## PARENT1       655.1922   177.4678   3.692 0.000224 ***
## MSTATUS      -581.7175   119.7388  -4.858 1.21e-06 ***
## CAR_USE      -730.5026   109.4617  -6.674 2.66e-11 ***
## REVOKED       473.8986   155.1903   3.054 0.002268 ** 
## URBANICITY   1469.2228   135.8413  10.816  < 2e-16 ***
## HS            496.2580   120.3904   4.122 3.79e-05 ***
## NOHS          676.2812   153.2939   4.412 1.04e-05 ***
## MANAGER      -884.6343   162.0158  -5.460 4.90e-08 ***
## MINIVAN      -540.8132   117.7404  -4.593 4.43e-06 ***
## log_TIF         7.1657     2.1302   3.364 0.000772 ***
## log_MVR_PTS    -2.8086     0.7578  -3.706 0.000212 ***
## cat_KIDSDRIV  682.5059   162.3736   4.203 2.66e-05 ***
## cat_CLM_FREQ  571.3661   114.3887   4.995 6.01e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared:  0.0617, Adjusted R-squared:  0.0602 
## F-statistic: 41.21 on 13 and 8147 DF,  p-value: < 2.2e-16

Binary Logistic Regression

#The minimum bias of 19.2555192 for the model predicting TARGET_FLAG is reached when the model contains 24 variables

model_sum_FLAG$which[which.min(model_sum_FLAG$cp), ]
##  (Intercept)          AGE      PARENT1      MSTATUS      CAR_USE      REVOKED 
##         TRUE        FALSE         TRUE         TRUE         TRUE         TRUE 
##   URBANICITY          PHD      MASTERS    BACHELORS           HS         NOHS 
##         TRUE         TRUE         TRUE         TRUE        FALSE        FALSE 
##     CLERICAL      MANAGER         PROF      STUDENT  BLUE_COLLAR      MINIVAN 
##         TRUE         TRUE        FALSE         TRUE         TRUE         TRUE 
##       PICKUP       SPORTS          SUV   log_INCOME log_TRAVTIME log_BLUEBOOK 
##         TRUE         TRUE         TRUE         TRUE         TRUE         TRUE 
##      log_TIF  log_MVR_PTS cat_KIDSDRIV cat_HOMEKIDS      cat_YOJ cat_OLDCLAIM 
##         TRUE         TRUE         TRUE         TRUE        FALSE        FALSE 
## cat_CLM_FREQ  cat_CAR_AGE 
##         TRUE        FALSE
bias_FLAG <- glm(TARGET_FLAG ~ 1 + PARENT1 + MSTATUS + CAR_USE +
    REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL +
    MANAGER + STUDENT + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS +
    SUV + log_INCOME + log_TRAVTIME + log_BLUEBOOK + log_TIF + 
    log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ, 
    family = binomial(link = "logit"), training_FLAG)
round(coef(summary(bias_FLAG)), 6)
##               Estimate Std. Error    z value Pr(>|z|)
## (Intercept)  -2.554923   0.160423 -15.926206 0.000000
## PARENT1       0.217757   0.119008   1.829758 0.067286
## MSTATUS      -0.694028   0.074400  -9.328352 0.000000
## CAR_USE      -0.688893   0.080778  -8.528279 0.000000
## REVOKED       0.733365   0.079366   9.240333 0.000000
## URBANICITY    2.249695   0.111388  20.196878 0.000000
## PHD          -0.791338   0.133765  -5.915870 0.000000
## MASTERS      -0.508002   0.105529  -4.813852 0.000001
## BACHELORS    -0.485927   0.077640  -6.258734 0.000000
## CLERICAL      0.382111   0.106668   3.582245 0.000341
## MANAGER      -0.690384   0.109689  -6.293993 0.000000
## STUDENT       0.184715   0.126539   1.459743 0.144361
## BLUE_COLLAR   0.250666   0.103458   2.422876 0.015398
## MINIVAN      -0.435240   0.105575  -4.122578 0.000037
## PICKUP        0.219309   0.096226   2.279105 0.022661
## SPORTS        0.616909   0.118944   5.186530 0.000000
## SUV           0.364174   0.099838   3.647647 0.000265
## log_INCOME    0.100999   0.015677   6.442715 0.000000
## log_TRAVTIME -0.028743   0.007072  -4.064134 0.000048
## log_BLUEBOOK  0.059508   0.024005   2.479025 0.013174
## log_TIF       0.007729   0.001190   6.493015 0.000000
## log_MVR_PTS  -0.002189   0.000437  -5.008679 0.000001
## cat_KIDSDRIV  0.542256   0.094714   5.725173 0.000000
## cat_HOMEKIDS  0.252943   0.087625   2.886646 0.003894
## cat_CLM_FREQ  0.557205   0.060751   9.171972 0.000000

#At a significance level of alpha=0.5 , the R2adj selected variables yield a TARGET_FLAG BLR model with two insignificant variables: PARENT1 and STUDENT. Removing those three insignificant variables yields a model with all significant variables

bias_FLAG <- glm(TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + 
    REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL +
    MANAGER + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV +
    log_INCOME + log_TRAVTIME + log_BLUEBOOK + log_TIF +
    log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ, 
    family = binomial(link = "logit"), training_FLAG)
summary(bias_FLAG)
## 
## Call:
## glm(formula = TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED + 
##     URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + 
##     BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + 
##     log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + 
##     cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), 
##     data = training_FLAG)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2016  -0.7231  -0.4212   0.6891   3.1351  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.4312711  0.1486779 -16.353  < 2e-16 ***
## MSTATUS      -0.7777777  0.0586439 -13.263  < 2e-16 ***
## CAR_USE      -0.7155003  0.0787814  -9.082  < 2e-16 ***
## REVOKED       0.7343615  0.0793348   9.256  < 2e-16 ***
## URBANICITY    2.2363335  0.1109238  20.161  < 2e-16 ***
## PHD          -0.8497653  0.1275975  -6.660 2.74e-11 ***
## MASTERS      -0.5639725  0.0979155  -5.760 8.42e-09 ***
## BACHELORS    -0.5101645  0.0752438  -6.780 1.20e-11 ***
## CLERICAL      0.3256558  0.0990172   3.289  0.00101 ** 
## MANAGER      -0.7098209  0.1082712  -6.556 5.53e-11 ***
## BLUE_COLLAR   0.1852830  0.0928333   1.996  0.04595 *  
## MINIVAN      -0.4147417  0.1048748  -3.955 7.67e-05 ***
## PICKUP        0.2317109  0.0958995   2.416  0.01568 *  
## SPORTS        0.6331507  0.1186074   5.338 9.39e-08 ***
## SUV           0.3802718  0.0993833   3.826  0.00013 ***
## log_INCOME    0.1067877  0.0151806   7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046  0.0070522  -4.042 5.30e-05 ***
## log_BLUEBOOK  0.0604612  0.0239746   2.522  0.01167 *  
## log_TIF       0.0077168  0.0011898   6.486 8.82e-11 ***
## log_MVR_PTS  -0.0021850  0.0004367  -5.004 5.63e-07 ***
## cat_KIDSDRIV  0.5297128  0.0942272   5.622 1.89e-08 ***
## cat_HOMEKIDS  0.3609836  0.0685862   5.263 1.42e-07 ***
## cat_CLM_FREQ  0.5555339  0.0607168   9.150  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 7446.2  on 8138  degrees of freedom
## AIC: 7492.2
## 
## Number of Fisher Scoring iterations: 5

Selecting Models

sum1 <- summary(forward_AMT)
sum2 <- summary(backward_AMT)
sum3 <- summary(adjustedr2_AMT)
sum4 <- summary(bias_AMT)

#Multi-collinearity

library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
dwtest(forward_AMT)
## 
##  Durbin-Watson test
## 
## data:  forward_AMT
## DW = 1.988, p-value = 0.294
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(backward_AMT)
## 
##  Durbin-Watson test
## 
## data:  backward_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(adjustedr2_AMT)
## 
##  Durbin-Watson test
## 
## data:  adjustedr2_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(bias_AMT)
## 
##  Durbin-Watson test
## 
## data:  bias_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0

#The null hypothesis is that there does not exist multicollinearity. Since the p-valuea are large, we fail to reject the null hypothesis.

Mean squared error

summary(M[1:n, "TARGET_AMT"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    1504    1036  107586
data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "bias_AMT"),
  "MSE" = c(sum1$sigma^2, sum2$sigma^2, sum3$sigma^2, sum4$sigma^2),
  "RMSE" = c(sum1$sigma, sum2$sigma, sum3$sigma, sum4$sigma))
##            MODEL      MSE     RMSE
## 1    forward_AMT 20795666 4560.227
## 2   backward_AMT 20795771 4560.238
## 3 adjustedr2_AMT 20795771 4560.238
## 4       bias_AMT 20795771 4560.238

#Mean squared error is the sqaure of RMSE. We see that standard error of the mean (RMSE) is fairly large relative to the target variable. In these models, the standard deviation of the unexplained variance in TARGET_AMT are in the neighborhood of 4560 which is a large deviation from the 1504.325 average claim encountered in the data

R^2

data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "biascp_AMT"),
  "R.SQUARED" = c(sum1$r.squared, sum2$r.squared, sum3$r.squared, sum4$r.squared),
  "ADJ.R.SQUARED" = c(sum1$adj.r.squared, sum2$adj.r.squared, sum3$adj.r.squared, sum4$adj.r.squared))
##            MODEL  R.SQUARED ADJ.R.SQUARED
## 1    forward_AMT 0.14876730    0.14730452
## 2   backward_AMT 0.06169727    0.06020004
## 3 adjustedr2_AMT 0.06169727    0.06020004
## 4     biascp_AMT 0.06169727    0.06020004

#R^2 is fairly low for this model. R^2 however, is not an adequate performance measure for this model. Adjusted R^2 is more appropriate when models have multiple variables. It incorporates a penalty to account for the decrease in degrees of freedom (from additional variables).

F-statistic

data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "biascp_AMT"),
           rbind(sum1$fstatistic, sum2$fstatistic, sum3$fstatistic, sum4$fstatistic))
##            MODEL    value numdf dendf
## 1    forward_AMT 101.7019    14  8147
## 2   backward_AMT  41.2076    13  8147
## 3 adjustedr2_AMT  41.2076    13  8147
## 4     biascp_AMT  41.2076    13  8147

#The F-test evaluates the null hypothesis that all regression coefficients are equal to zero versus the alternative that at least one does not

Examine Residuals

par(mfrow = c(2,2))
plot(forward_AMT)

par(mfrow = c(2,2))
plot(backward_AMT)

par(mfrow = c(2,2))
plot(adjustedr2_AMT)

par(mfrow = c(2,2))
plot(bias_AMT)

#The Residuals vs Fitted plot shows that the residuals do not have a linear pattern. The Normal Q-Q plot shows that the residuals are also not normally distributed

Model Selection

#The model derived using Forward Selection has the lowest RMSE, although only by a minimal amount. When it comes to the F -statistics and Adjusted R^2 however, the forward_AMT model has values that are substantially higher. The chosen model for TARGET_AMT is therefore forward_AMT.

Binary Logistic Regression Model

Confusion Matrix

library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
training_FLAG[ ,"probability.forward"] <- predict(forward_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.forward"] <- ifelse(training_FLAG$probability.forward < 0.5, 0, 1)

training_FLAG$class.forward <- factor(training_FLAG$class.forward, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))

(cm1 <- confusionMatrix(training_FLAG$class.forward, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5533 1297
##          1  475  856
##                                           
##                Accuracy : 0.7829          
##                  95% CI : (0.7738, 0.7918)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.363           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3976          
##             Specificity : 0.9209          
##          Pos Pred Value : 0.6431          
##          Neg Pred Value : 0.8101          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1049          
##    Detection Prevalence : 0.1631          
##       Balanced Accuracy : 0.6593          
##                                           
##        'Positive' Class : 1               
## 
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5533 1297
##          1  475  856
##                                           
##                Accuracy : 0.7829          
##                  95% CI : (0.7738, 0.7918)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.363           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3976          
##             Specificity : 0.9209          
##          Pos Pred Value : 0.6431          
##          Neg Pred Value : 0.8101          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1049          
##    Detection Prevalence : 0.1631          
##       Balanced Accuracy : 0.6593          
##                                           
##        'Positive' Class : 1               
## 

#The model derived using Forward Selection has the following performance metrics: Accuracy of 0.7828697, Error Rate of 0.2171303, Precision of 0.6431255, Sensitivity of 0.3975848, Specificity of 0.9209387, and F1 Score of 0.4913892.

training_FLAG[ ,"probability.backward"] <- predict(backward_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.backward"] <- ifelse(training_FLAG$probability.backward < 0.5, 0, 1)

training_FLAG$class.backward <- factor(training_FLAG$class.backward, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))

(cm2 <- confusionMatrix(training_FLAG$class.backward, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5536 1295
##          1  472  858
##                                           
##                Accuracy : 0.7835          
##                  95% CI : (0.7744, 0.7924)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3647          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3985          
##             Specificity : 0.9214          
##          Pos Pred Value : 0.6451          
##          Neg Pred Value : 0.8104          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1051          
##    Detection Prevalence : 0.1630          
##       Balanced Accuracy : 0.6600          
##                                           
##        'Positive' Class : 1               
## 

#The model derived using Backward Elimination has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of 0.6451128, Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787

training_FLAG[ ,"probability.adjustedr2"] <- predict(adjustedr2_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.adjustedr2"] <- ifelse(training_FLAG$probability.adjustedr2 < 0.5, 0, 1)

training_FLAG$class.adjustedr2 <- factor(training_FLAG$class.adjustedr2, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))


(cm3 <- confusionMatrix(training_FLAG$class.adjustedr2, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5536 1295
##          1  472  858
##                                           
##                Accuracy : 0.7835          
##                  95% CI : (0.7744, 0.7924)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3647          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3985          
##             Specificity : 0.9214          
##          Pos Pred Value : 0.6451          
##          Neg Pred Value : 0.8104          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1051          
##    Detection Prevalence : 0.1630          
##       Balanced Accuracy : 0.6600          
##                                           
##        'Positive' Class : 1               
## 

#The model derived using Adjusted R^2 has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of , Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787. These metrics are identical to those from Backward Elimination since, as previously mentioned, both models identical.

training_FLAG[ ,"probability.bias"] <- predict(bias_FLAG, training_FLAG, type="response")

training_FLAG[ ,"class.bias"] <- ifelse(training_FLAG$probability.bias < 0.5, 0, 1)

training_FLAG$class.bias <- factor(training_FLAG$class.bias, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))

(cm4 <- confusionMatrix(training_FLAG$class.bias, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 5536 1295
##          1  472  858
##                                           
##                Accuracy : 0.7835          
##                  95% CI : (0.7744, 0.7924)
##     No Information Rate : 0.7362          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3647          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3985          
##             Specificity : 0.9214          
##          Pos Pred Value : 0.6451          
##          Neg Pred Value : 0.8104          
##              Prevalence : 0.2638          
##          Detection Rate : 0.1051          
##    Detection Prevalence : 0.1630          
##       Balanced Accuracy : 0.6600          
##                                           
##        'Positive' Class : 1               
## 

#The model derived using bias has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of , Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787. These metrics are identical to those from Backward Elimination since, as previously mentioned, both models identical

ROC Curve

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
par(mfrow=c(2,2))

training_FLAG$class.forward <- as.numeric(training_FLAG$class.forward)

plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$class.forward, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
training_FLAG$class.backward <- as.numeric(training_FLAG$class.backward)

plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$class.backward, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$probability.adjustedr2, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$probability.bias, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#The models with the highest Accuracy were those derived using Backward Elimination, Adjusted R2, and bias models yielded identical results. These models with the greatest Area Under the ROC Curve however, were the adjustedr2_AMT and bias_AMT models. As such, the backward_AMT is being eliminated and there is indifference between using the adjustedr2_AMT and bias_AMT models for TARGET_FLAG.

Predictions

validation <- M[(1+n):(m+n),]
probability <- predict(adjustedr2_FLAG, validation, type="response")
predict_FLAG <- ifelse(probability >= .5, 1, 0)
predict_AMT <- predict(forward_AMT, validation)
predict_AMT[predict_FLAG == 0] <- 0
predictions <- data.frame("predict_FLAG" = predict_FLAG, "predict_AMT" = predict_AMT)

head(predictions)
##      predict_FLAG predict_AMT
## 8162            0           0
## 8163            0           0
## 8164            0           0
## 8165            0           0
## 8166            0           0
## 8167            0           0
training_FLAG$TARGET_FLAG <- as.numeric(training_FLAG$TARGET_FLAG)

n <- sum(training_FLAG$TARGET_FLAG)
N <- nrow(training_FLAG)
m <- sum(predict_FLAG)
M <- length(predict_FLAG)
p <- m / M

as.numeric(predict_FLAG)
##    [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
##   [38] 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
##   [75] 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1
##  [112] 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0
##  [149] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1
##  [186] 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
##  [223] 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1
##  [260] 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0
##  [297] 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1
##  [334] 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
##  [371] 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [408] 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
##  [445] 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0
##  [482] 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1
##  [519] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
##  [556] 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
##  [593] 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0
##  [630] 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
##  [667] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
##  [704] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
##  [741] 1 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0
##  [778] 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [815] 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
##  [852] 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 0
##  [889] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
##  [926] 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
##  [963] 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## [1000] 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0
## [1037] 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
## [1074] 1 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0
## [1111] 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## [1148] 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0
## [1185] 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## [1222] 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0
## [1259] 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## [1296] 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0
## [1333] 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0
## [1370] 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0
## [1407] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## [1444] 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [1481] 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## [1518] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1
## [1555] 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [1592] 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0
## [1629] 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1
## [1666] 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
## [1703] 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [1740] 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
## [1777] 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1
## [1814] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
## [1851] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## [1888] 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [1925] 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
## [1962] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1
## [1999] 1 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
## [2036] 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## [2073] 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 0 0
## [2110] 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
as.numeric(training_FLAG$TARGET_FLAG)
##    [1] 1 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
##   [38] 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 2 2 1 2 1 2 1 1 1 1 1 1 1 2 1 2 1 2 1 1
##   [75] 1 1 1 2 1 1 1 1 1 1 1 2 1 2 2 1 1 2 2 1 2 1 1 1 2 1 1 1 1 1 2 1 1 1 2 2 1
##  [112] 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [149] 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1
##  [186] 2 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1
##  [223] 1 1 1 1 2 1 1 2 1 1 1 1 2 2 2 1 2 2 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 2 2 1
##  [260] 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 2 2 1 1 1 1
##  [297] 2 2 1 1 2 2 1 1 1 1 1 1 2 1 2 2 1 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1
##  [334] 2 2 1 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 1 2 1 2 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1
##  [371] 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1 2 1 1 2 1 1 1
##  [408] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 2 2 1 1 1 2 1
##  [445] 2 1 1 1 2 2 1 1 1 1 1 2 1 1 2 2 1 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 1
##  [482] 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1
##  [519] 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 1
##  [556] 2 1 2 1 2 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1
##  [593] 2 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1
##  [630] 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 2 2 1
##  [667] 2 2 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1
##  [704] 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 1 1 1 2 1 1 1 1 1
##  [741] 1 2 2 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1
##  [778] 2 1 2 1 1 1 1 2 2 1 1 2 2 1 1 2 1 2 1 1 1 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1
##  [815] 2 1 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1
##  [852] 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1
##  [889] 1 2 1 1 2 1 1 1 1 1 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2
##  [926] 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 2 2 1 2 2 2 1 1 2 2
##  [963] 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1 1
## [1000] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 2 1 1 1 1 2 1
## [1037] 1 2 2 1 1 2 2 2 1 1 2 1 2 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 2 1 1 2 1 2 2 1 2
## [1074] 2 1 2 2 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [1111] 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 1 1 1 2 1 1 1
## [1148] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 2 1 2 2 1
## [1185] 1 2 1 1 2 1 2 2 1 1 1 1 2 2 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 2 1 1 2
## [1222] 1 2 1 2 1 1 1 2 1 1 2 1 1 1 2 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1259] 1 1 1 1 2 1 1 2 1 2 1 1 2 1 2 2 1 1 1 1 2 1 1 1 1 2 1 2 2 1 2 1 1 1 1 1 1
## [1296] 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2
## [1333] 1 1 2 1 2 1 1 1 1 2 2 1 1 2 2 1 2 1 1 2 2 2 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2
## [1370] 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1
## [1407] 1 2 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2
## [1444] 2 1 2 2 1 1 1 1 1 2 1 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1
## [1481] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2
## [1518] 2 2 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 2 1 2 1 1 2 1 1 1 1 1
## [1555] 2 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2 1 1 2 2 2 2 1 1 2 1 1 1 1 1 2 1
## [1592] 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 2 2 2 1 2 1 2 1 1
## [1629] 1 1 1 1 2 1 1 1 1 2 2 2 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1
## [1666] 2 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 2 1 2 2 1 1 1 2 1 2 2 1 1 1 1 2 1 2 2 2 1
## [1703] 2 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [1740] 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 1
## [1777] 1 2 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2
## [1814] 2 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 2 2 2 1 1 2 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1
## [1851] 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1
## [1888] 1 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2
## [1925] 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1
## [1962] 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 1
## [1999] 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1
## [2036] 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1
## [2073] 2 1 2 2 2 1 1 2 1 1 1 1 1 2 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1
## [2110] 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 2 2 2 2 1 1
## [2147] 1 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 2
## [2184] 1 2 1 1 1 1 1 1 1 2 1 2 1 2 2 2 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1
## [2221] 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2258] 2 2 1 2 2 1 1 1 1 1 2 1 2 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1
## [2295] 1 1 1 1 1 1 2 2 1 1 2 1 2 1 1 2 1 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1 2 1 1 2 1
## [2332] 2 1 1 1 2 1 2 2 2 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1
## [2369] 1 2 2 1 2 1 2 1 2 1 1 1 1 2 2 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2
## [2406] 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1 1 2 1 2 1 1 2 2 1 2 1 1 1 2
## [2443] 1 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
## [2480] 1 1 2 1 1 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 2 2
## [2517] 1 2 1 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
## [2554] 1 1 2 1 1 2 2 1 1 2 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2
## [2591] 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 2 1 2 2 1 2 2 1 2 1 2 2 1 1
## [2628] 1 1 1 2 1 1 1 1 2 1 2 2 1 1 1 2 2 1 1 1 1 1 1 1 2 2 1 2 2 2 1 2 2 2 2 1 1
## [2665] 2 1 1 2 2 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 2 2 1 2 2 1 1 1 2 1 1 1 1 2 1 1
## [2702] 1 1 2 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [2739] 1 1 2 2 1 1 1 1 2 1 1 1 2 2 1 2 1 2 1 1 2 2 1 1 2 2 2 1 1 2 1 2 1 2 1 1 1
## [2776] 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2813] 1 2 1 1 1 2 2 1 1 2 1 1 1 1 2 1 1 2 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1 1 2 1
## [2850] 1 2 1 2 2 1 1 1 1 2 2 1 2 1 1 2 1 2 1 2 1 1 2 2 1 1 2 1 1 2 1 2 2 1 1 1 2
## [2887] 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1
## [2924] 1 1 2 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 2 1 2
## [2961] 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1
## [2998] 1 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 2 1 2 1 1 1
## [3035] 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1
## [3072] 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 2 1 1
## [3109] 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1
## [3146] 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1
## [3183] 1 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1 1
## [3220] 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1
## [3257] 2 2 1 2 1 1 1 1 2 2 2 1 2 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 2 1 2 1 1 2 1 1
## [3294] 1 1 1 1 2 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 2 1 1 1
## [3331] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## [3368] 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 2 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1
## [3405] 1 1 1 2 2 2 1 1 1 2 1 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 1 1 1
## [3442] 2 1 2 1 2 1 1 1 1 2 1 1 1 2 1 2 2 1 2 1 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1
## [3479] 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 2
## [3516] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 2 1 1 1
## [3553] 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 1 1 1 1 2 1 2 1 1 1 1
## [3590] 1 1 2 2 1 2 1 2 2 2 1 1 1 1 1 2 2 1 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1
## [3627] 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [3664] 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1
## [3701] 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 1 2 1 1 1 1 1 1 1 2 1 1
## [3738] 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 1 2 1 2 1 1 1
## [3775] 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 2 1 1 1 2 1 2 1 2 1 1 1 1 2 1
## [3812] 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2 1 1 1 2 1 2 1 2 2 1 1 1 1 1 2 2 1
## [3849] 1 1 1 2 2 2 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 2 2 1 1
## [3886] 2 2 1 1 1 1 1 2 2 1 2 1 2 1 1 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1
## [3923] 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 1 2 2 1 1 2 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1
## [3960] 1 2 2 1 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1
## [3997] 2 2 1 1 1 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1
## [4034] 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 1 2 2 1 1 1 1 1 1 1 1
## [4071] 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 1 1 2 1 1 1 2 2 1 2 1 1 2 1 1 1 1 1
## [4108] 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 1
## [4145] 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1
## [4182] 1 1 1 1 1 1 1 1 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 2
## [4219] 2 2 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 2 1 1 2 2
## [4256] 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1
## [4293] 1 1 1 1 2 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 2 2 1 1 2 1 2 2 1 1 1 1 1
## [4330] 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 2 2 1 1 1
## [4367] 1 2 2 1 1 1 1 1 2 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1
## [4404] 2 1 2 1 1 1 2 1 1 2 1 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 1 1 2 1 1 2 1 1 1 2
## [4441] 2 1 2 2 1 2 1 2 1 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2
## [4478] 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 2
## [4515] 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1
## [4552] 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
## [4589] 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 2 1 2 2 1 2 2 1 1 1 1 2 2 1
## [4626] 2 1 2 2 1 1 2 1 1 2 1 1 1 2 2 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 2 1 2 1 1
## [4663] 1 1 1 1 1 1 2 1 2 2 2 2 1 1 2 1 1 1 1 2 1 2 1 2 1 1 1 2 1 2 2 1 1 1 2 2 2
## [4700] 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 1 2 1 1 1 2 2 1 1 1 1
## [4737] 1 1 1 1 1 1 1 2 2 1 2 1 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1
## [4774] 1 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 1 1 2 2 1 1 2 1 1 1 2 2 2 2 1
## [4811] 2 1 1 1 2 2 2 1 1 1 2 1 2 1 1 1 2 1 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 2 1
## [4848] 2 1 1 2 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 2 2 1 1 1 1 1 1
## [4885] 1 1 1 1 2 2 1 1 1 2 2 1 2 2 1 1 1 2 1 1 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1 1 1
## [4922] 1 1 1 2 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## [4959] 1 1 1 1 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1
## [4996] 1 2 1 1 1 1 1 1 1 2 2 1 2 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1
## [5033] 2 1 2 1 2 2 2 1 1 1 1 2 1 2 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1
## [5070] 1 2 1 1 1 1 2 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1
## [5107] 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 2 2 1 1 1 2 1 1 1 2 1 1 1 2
## [5144] 1 1 1 2 1 2 1 2 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1
## [5181] 2 1 2 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 2 1 2 2 1
## [5218] 1 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 2 1 1 2 1 1 1
## [5255] 1 1 1 1 1 1 1 2 1 1 2 2 2 1 2 1 2 1 1 2 1 1 2 1 2 1 1 1 2 1 1 1 1 1 2 1 1
## [5292] 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 2
## [5329] 1 1 2 1 1 1 1 1 2 2 1 2 2 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1 1 2 1 2 2 1 1 1
## [5366] 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 2 2 2
## [5403] 1 2 1 2 2 1 2 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [5440] 2 2 1 1 2 2 1 1 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 2 2
## [5477] 1 1 1 1 2 1 1 2 1 1 1 1 1 1 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1
## [5514] 2 2 1 1 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 2
## [5551] 1 1 2 1 1 1 2 1 2 1 1 1 2 2 2 1 1 1 1 1 1 2 1 1 2 1 2 1 1 1 2 2 1 1 1 1 2
## [5588] 2 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 2 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 1
## [5625] 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 1
## [5662] 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 2 2 2 1 1 2 1 2
## [5699] 2 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 2 1 2 1 1 2 1 1 1 2 2 1 2
## [5736] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2
## [5773] 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 2 1 2 2 1 2 1 1 2 1 1 2 2 1 2 1 1 1 2 2 2 2
## [5810] 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 2
## [5847] 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1
## [5884] 1 1 2 1 1 1 1 2 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 2
## [5921] 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 1 1 2 1 1 2 2 1 1 2 2 1 1 2 1 1 2 1
## [5958] 2 2 2 1 1 2 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 2 2 1 1 2 1 1 2 1 1 1 1 1 1 2
## [5995] 2 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1
## [6032] 1 1 1 2 1 1 2 1 1 1 2 1 1 1 2 2 2 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2
## [6069] 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 1 1
## [6106] 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 2
## [6143] 1 1 1 1 1 2 1 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1
## [6180] 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 1 1
## [6217] 1 2 1 1 1 2 2 1 1 2 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 1 2 1 1 1 1 1 1 1 1 2
## [6254] 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 2 1 2 1 2
## [6291] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 1
## [6328] 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2
## [6365] 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1 2
## [6402] 1 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 2 1 1
## [6439] 1 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 1 1 2 1 1 1 2 1 2 1 2 1 1 1 1 1 2 2 1 1 2
## [6476] 1 2 1 2 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 1 1
## [6513] 1 1 2 2 1 2 2 2 2 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 2
## [6550] 1 1 2 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1
## [6587] 1 1 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 2 1 1 1 1 1 1
## [6624] 1 1 1 2 1 2 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 2 1 2
## [6661] 1 1 2 2 1 1 1 2 1 2 2 1 1 2 1 1 2 1 2 1 2 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1
## [6698] 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1
## [6735] 1 1 1 2 1 1 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1
## [6772] 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1
## [6809] 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1
## [6846] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1
## [6883] 2 1 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1
## [6920] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 1 1 1 2 1 1
## [6957] 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1
## [6994] 1 2 1 2 1 1 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [7031] 1 1 1 1 1 1 2 2 2 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1
## [7068] 1 1 1 1 2 2 1 1 2 1 2 1 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1 1 2 1 1 1 2 2 1 2 2
## [7105] 2 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1
## [7142] 1 2 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1
## [7179] 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 1 2 1 2 2 1 1 1 1 1 1 1 1 2
## [7216] 1 1 1 1 1 2 2 2 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1
## [7253] 2 1 1 2 1 2 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1
## [7290] 1 1 2 1 1 2 1 1 2 1 2 1 1 2 1 1 1 2 1 1 2 2 1 1 2 1 2 1 1 2 1 1 1 2 2 1 1
## [7327] 1 1 2 1 1 1 2 1 1 1 2 1 1 2 2 2 2 1 1 1 2 2 1 1 1 2 2 1 2 1 1 1 2 1 1 1 1
## [7364] 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 2 1 1 1 2 2 1 1 2 1 1 2
## [7401] 1 1 2 2 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 2 2 2 2 1 1 1
## [7438] 1 1 1 1 2 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2
## [7475] 1 2 2 2 1 1 2 2 1 1 1 2 2 2 1 1 1 1 2 2 1 1 2 1 2 1 1 1 1 1 2 1 1 2 2 1 2
## [7512] 1 1 2 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 2 1 1
## [7549] 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [7586] 1 1 1 2 1 2 2 1 1 2 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1
## [7623] 2 1 2 1 2 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 2 2 1 1 2 2 2 1 1 2 2 1 1 1
## [7660] 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 2 2
## [7697] 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1
## [7734] 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2 2 1
## [7771] 2 1 2 1 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1
## [7808] 1 1 2 2 1 2 1 1 1 1 1 2 2 1 1 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1
## [7845] 1 1 1 1 2 1 2 2 1 1 2 1 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2
## [7882] 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 2 2 2 1 1 1 1 2 1 1
## [7919] 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 2 2 2
## [7956] 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 2 1 1
## [7993] 2 1 1 2 1 2 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [8030] 1 2 1 2 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 2 1 1 1 2 2 2 1 1 1 1 2 1 1 2 1 1 1
## [8067] 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 2 1 2 2 1 1 2 1 1 2 1
## [8104] 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 2 2 2 1 1 2 1 1
## [8141] 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1
binom.test(m, M, p)
## 
##  Exact binomial test
## 
## data:  m and M
## number of successes = 383, number of trials = 2141, p-value = 1
## alternative hypothesis: true probability of success is not equal to 0.1788884
## 95 percent confidence interval:
##  0.1628687 0.1957921
## sample estimates:
## probability of success 
##              0.1788884
?`latexpdf-package`
## No documentation for 'latexpdf-package' in specified packages and libraries:
## you could try '??latexpdf-package'
library(latexpdf)

#The prevalence of the positive condition is 26.38% in the training data and 17.89% in the evaluation data results. Although there is some difference in these figures, the difference is not significant at an alpha=0.05 as can be seen in the above Binomial test