evaluation <- read.csv("C:/Users/91976/Downloads/insurance-evaluation-data.csv")
training <- read.csv("C:/Users/91976/Downloads/insurance_training_data.csv")
M <- rbind(training, evaluation) # Merged
n <- nrow(training) # training is M[1:n, 4:26]
m <- nrow(evaluation) # evaluation is M[(1+n):(m+n), ]
# X tracks, for each column of M, whether that column is still a candidate
# predictor for the TARGET_FLAG and TARGET_AMT models (TRUE = keep, FALSE = exclude)
X <- data.frame("TARGET_FLAG" = rep(T, ncol(M)),
                "TARGET_AMT" = rep(T, ncol(M)))
X[match(c("INDEX", "TARGET_AMT"), names(M)), "TARGET_FLAG"] <- F
X[match(c("INDEX", "TARGET_FLAG"), names(M)), "TARGET_AMT"] <- F
#Indexing the variables by type and normalizing their encodings for a clearer view
quantitative <- c(4:8, 10, 15, 17, 18, 21, 22, 24, 25)
names(M[quantitative])
## [1] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" "INCOME" "HOME_VAL"
## [7] "TRAVTIME" "BLUEBOOK" "TIF" "OLDCLAIM" "CLM_FREQ" "MVR_PTS"
## [13] "CAR_AGE"
categorical <- c(13, 14, 19)
names(M[categorical])
## [1] "EDUCATION" "JOB" "CAR_TYPE"
binary <- c(9, 11, 12, 16, 20, 23, 26)
names(M[binary])
## [1] "PARENT1" "MSTATUS" "SEX" "CAR_USE" "RED_CAR"
## [6] "REVOKED" "URBANICITY"
#Strip "$" and "," from currency strings and convert to numeric
Currency_Convert <- function(Field){
  as.numeric(gsub("\\$|,", "", Field))
}
#Recode a two-level field to 0 (Neg) / 1 (Pos)
Binary_Convert <- function(Field, Neg, Pos) {
  Field <- as.character(Field)
  Field[which(Field == Neg)] <- 0
  Field[which(Field == Pos)] <- 1
  as.numeric(Field)
}
M$INCOME <- Currency_Convert(M$INCOME)
M$PARENT1 <- Binary_Convert(M$PARENT1, "No", "Yes")
M$HOME_VAL <- Currency_Convert(M$HOME_VAL)
M$MSTATUS <- Binary_Convert(M$MSTATUS, "z_No", "Yes")
M$SEX <- Binary_Convert(M$SEX, "M", "z_F")
M$CAR_USE <- Binary_Convert(M$CAR_USE, "Commercial", "Private")
M$BLUEBOOK <- Currency_Convert(M$BLUEBOOK)
M$RED_CAR <- Binary_Convert(M$RED_CAR, "no", "yes")
M$OLDCLAIM <- Currency_Convert(M$OLDCLAIM)
M$REVOKED <- Binary_Convert(M$REVOKED, "No", "Yes")
M$URBANICITY <- Binary_Convert(M$URBANICITY, "z_Highly Rural/ Rural", "Highly Urban/ Urban")
M$CAR_AGE[which(M$CAR_AGE < 0)] <- NA # a negative vehicle age is impossible; treat as missing
M$HOME_VAL[which(M$HOME_VAL == 0)] <- NA # a home value of zero is treated as unrecorded
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(training[, -1], type = "text")
##
## =======================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------------
## TARGET_FLAG 8,161 0.264 0.441 0 1
## TARGET_AMT 8,161 1,504.325 4,704.027 0.000 107,586.100
## KIDSDRIV 8,161 0.171 0.512 0 4
## AGE 8,155 44.790 8.628 16 81
## HOMEKIDS 8,161 0.721 1.116 0 5
## YOJ 7,707 10.499 4.092 0 23
## TRAVTIME 8,161 33.486 15.908 5 142
## TIF 8,161 5.351 4.147 1 25
## CLM_FREQ 8,161 0.799 1.158 0 5
## MVR_PTS 8,161 1.696 2.147 0 13
## CAR_AGE 7,651 8.328 5.701 -3 28
## -------------------------------------------------------
#Visualising data
#Load ggplot2 library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
#Create scatterplot
ggplot(data = training, aes(x = INCOME, y = HOME_VAL)) +
geom_point(color = "#4F94CD", size = 2.5) +
labs(x = "Income (in thousands)", y = "Home Value (in thousands)",
title = "Scatterplot of Income and Home Value") +
theme(plot.title = element_text(hjust = 0.5),
panel.background = element_rect(fill = "#F2F2F2", color = NA),
panel.grid.major = element_line(color = "#E5E5E5"),
axis.line = element_line(color = "black"))
library(ggplot2)
library(reshape2)
# Melt the training data to long format
training_melted <- melt(training)
## Using INCOME, PARENT1, HOME_VAL, MSTATUS, SEX, EDUCATION, JOB, CAR_USE, BLUEBOOK, CAR_TYPE, RED_CAR, OLDCLAIM, REVOKED, URBANICITY as id variables
# Create a ggplot with melted training data
ggplot(data = training_melted, aes(x = variable, y = value)) +
geom_boxplot() + # Add a boxplot layer
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + # Rotate x-axis labels by 90 degrees and align to the right
labs(x = "Variable", y = "Value", title = "Boxplot of Variables") # Add axis labels and a title
## Warning: Removed 970 rows containing non-finite values (`stat_boxplot()`).
library(ggplot2)
library(reshape2)
#Note: as.numeric() coerces the character columns to NA (hence the warnings below), so their tiles are blank in the heatmap
ggplot(data = melt(abs(cor(sapply(na.omit(training), as.numeric)))), aes(x=Var1, y=Var2, fill=value)) +
scale_fill_gradient(low = 'black', high = 'red', name = "Absolute Value") +
geom_tile() + labs(title = "Correlation Heatmap") +
theme(axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1),
plot.title = element_text(hjust = 0.5))
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## (warning repeated 14 times, once per character column)
PCA <- function(X) {
  # Principal components on complete cases, centered and scaled
  Xpca <- prcomp(na.omit(X), center = T, scale. = T)
  # Scores computed on the raw (uncentered, unscaled) data matrix
  M <- as.matrix(na.omit(X)); R <- as.matrix(Xpca$rotation); score <- M %*% R
  print(list("Importance of Components" = summary(Xpca)$importance[ ,1:5],
             "Rotation (Variable Loadings)" = Xpca$rotation[ ,1:5],
             "Correlation between X and PC" = cor(na.omit(X), score)[ ,1:5]))
  # Diagnostic panels: component variances, correlations, loadings, biplot, data, scores
  par(mfrow=c(2,3))
  barplot(Xpca$sdev^2, ylab = "Component Variance")
  barplot(cor(cbind(X)), ylab = "Correlations")
  barplot(Xpca$rotation, ylab = "Loadings")
  biplot(Xpca); barplot(M); barplot(score)
}
PCA(M[1:n, quantitative])
## $`Importance of Components`
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.645244 1.334995 1.270056 1.012738 0.9938183
## Proportion of Variance 0.208220 0.137090 0.124080 0.078900 0.0759700
## Cumulative Proportion 0.208220 0.345310 0.469390 0.548290 0.6242600
##
## $`Rotation (Variable Loadings)`
## PC1 PC2 PC3 PC4 PC5
## KIDSDRIV 0.10774567 -0.17584383 0.50530195 -0.07331372 0.083069165
## AGE -0.26771011 0.02800193 -0.33747972 0.09204957 0.398329669
## HOMEKIDS 0.22780686 -0.15800486 0.61071654 -0.03742552 -0.070548997
## YOJ -0.14018772 -0.15622254 0.30537124 0.16303338 0.482290631
## INCOME -0.53361654 -0.15978248 0.16457623 -0.05878797 -0.111702800
## HOME_VAL -0.54258303 -0.15297470 0.13345360 -0.05556924 -0.070094419
## TRAVTIME 0.03412860 0.01807301 -0.03461120 -0.66695073 0.612187440
## BLUEBOOK -0.34153090 -0.09792299 0.09498366 -0.05047749 0.005257043
## TIF 0.01113475 0.06077082 0.08346442 0.70417523 0.407479904
## OLDCLAIM 0.10846886 -0.53950381 -0.18816292 0.08137372 0.027275343
## CLM_FREQ 0.12435306 -0.57054584 -0.21337557 0.02350619 0.041684584
## MVR_PTS 0.12635248 -0.48204768 -0.13797155 -0.02119042 -0.055156727
## CAR_AGE -0.32388952 -0.08617776 -0.04999174 0.02775244 -0.181923195
##
## $`Correlation between X and PC`
## PC1 PC2 PC3 PC4 PC5
## KIDSDRIV 0.03279334 0.026553825 -0.03415518 0.03430623 0.03304671
## AGE -0.21530718 -0.207800075 0.21108534 -0.21467167 -0.20309336
## HOMEKIDS 0.14930487 0.138457016 -0.14945042 0.15094329 0.14520501
## YOJ -0.20724065 -0.209543450 0.20479944 -0.20430965 -0.20661069
## INCOME -0.98252529 -0.961368639 0.98208694 -0.97955171 -0.98781092
## HOME_VAL -0.99539680 -0.973818412 0.99092055 -0.99071401 -0.99286141
## TRAVTIME 0.03450769 0.037350720 -0.03374552 0.03207236 0.03588214
## BLUEBOOK -0.43933895 -0.428786579 0.44044267 -0.45038023 -0.40712199
## TIF 0.02086626 0.026174257 -0.01892478 0.01915871 0.02069475
## OLDCLAIM 0.06486343 -0.155449228 -0.13116239 0.13800364 0.07289718
## CLM_FREQ 0.07562867 -0.034894080 -0.10831923 0.11178715 0.07912375
## MVR_PTS 0.07488695 0.009564432 -0.09370134 0.09613452 0.07548462
## CAR_AGE -0.37196326 -0.364430487 0.37128491 -0.37062402 -0.37290793
options(repos = "http://cran.rstudio.com/")
install.packages('VIM')
## package 'VIM' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(VIM)
## Warning: package 'VIM' was built under R version 4.2.3
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
all(complete.cases(training))
## [1] FALSE
aggr(M[1:n, 4:26], bars=F, sortVars=T)
##
## Variables sorted by number of missings:
## Variable Count
## HOME_VAL 0.337948781
## CAR_AGE 0.062614876
## YOJ 0.055630437
## INCOME 0.054527631
## AGE 0.000735204
## KIDSDRIV 0.000000000
## HOMEKIDS 0.000000000
## PARENT1 0.000000000
## MSTATUS 0.000000000
## SEX 0.000000000
## EDUCATION 0.000000000
## JOB 0.000000000
## TRAVTIME 0.000000000
## CAR_USE 0.000000000
## BLUEBOOK 0.000000000
## TIF 0.000000000
## CAR_TYPE 0.000000000
## RED_CAR 0.000000000
## OLDCLAIM 0.000000000
## CLM_FREQ 0.000000000
## REVOKED 0.000000000
## MVR_PTS 0.000000000
## URBANICITY 0.000000000
#We have notable amounts of missing values in the HOME_VAL, CAR_AGE, YOJ, and INCOME variables, and a small amount in AGE. The JOB variable also has missing entries, but they are recorded as blank strings rather than NA, so aggr() does not count them above.
Likely_Value <- function(Field_1, Field_2, Value) {
# Mode for Field_1 for given Value of Field_2
frequencies <- table(Field_1[which(Field_2 == Value)])
most_frequent <- names(sort(frequencies, decreasing = TRUE)[1])
return(most_frequent)
}
M$JOB[(is.na(M$JOB) | M$JOB == "") & M$EDUCATION == "PhD"] <- Likely_Value(M$JOB, M$EDUCATION, "PhD")
M$JOB[(is.na(M$JOB) | M$JOB == "") & M$EDUCATION == "Masters"] <- Likely_Value(M$JOB, M$EDUCATION, "Masters")
M$JOB[(is.na(M$JOB) | M$JOB == "") & M$EDUCATION == "Bachelors"] <- Likely_Value(M$JOB, M$EDUCATION, "Bachelors")
M$JOB[(is.na(M$JOB) | M$JOB == "") & M$EDUCATION == "z_High School"] <- Likely_Value(M$JOB, M$EDUCATION, "z_High School")
M$JOB[(is.na(M$JOB) | M$JOB == "") & M$EDUCATION == "<High School"] <- Likely_Value(M$JOB, M$EDUCATION, "<High School")
##Assuming that education level can serve as a reasonable proxy for a person's job, missing JOB values are imputed with the most likely JOB given the sample's EDUCATION level, namely the predominant (mode) JOB within each EDUCATION group. For example, if records with education level E most often hold job J, then a record with education level E and a missing JOB value is assigned job J
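#A minimal illustration of the mode lookup on hypothetical toy vectors (not the insurance data):
toy_job <- c("Doctor", "Doctor", "Lawyer", NA)
toy_edu <- c("PhD", "PhD", "PhD", "PhD")
Likely_Value(toy_job, toy_edu, "PhD") # "Doctor" -- the modal JOB among the toy PhD records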
install.packages("mice")
## package 'mice' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
MICE <- mice(M[1:n, 4:26], predictorMatrix = quickpred(M[1:n, 4:26]), method = "mean", printFlag = F)
## Warning: Number of logged events: 2
M[1:n, 4:26] <- complete(MICE, action = 1)
MICE <- mice(M[(1+n):(m+n), ], predictorMatrix = quickpred(M[(1+n):(m+n), ]), method = "mean", printFlag = F)
## Warning: Number of logged events: 4
M[(1+n):(m+n), ] <- complete(MICE, action = 1)
M$CAR_AGE <- as.integer(M$CAR_AGE)
aggr(M[1:n, 4:26], bars=F, sortVars=T)
##
## Variables sorted by number of missings:
## Variable Count
## KIDSDRIV 0
## AGE 0
## HOMEKIDS 0
## YOJ 0
## INCOME 0
## PARENT1 0
## HOME_VAL 0
## MSTATUS 0
## SEX 0
## EDUCATION 0
## JOB 0
## TRAVTIME 0
## CAR_USE 0
## BLUEBOOK 0
## TIF 0
## CAR_TYPE 0
## RED_CAR 0
## OLDCLAIM 0
## CLM_FREQ 0
## REVOKED 0
## MVR_PTS 0
## CAR_AGE 0
## URBANICITY 0
#Missing values were replaced with the mean value using Multivariate Imputation
#by Chained Equations (MICE).
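#For a single numeric column, method = "mean" amounts to this base-R sketch:
impute_mean <- function(x) { x[is.na(x)] <- mean(x, na.rm = TRUE); x }
#e.g., impute_mean(c(1, NA, 3)) returns c(1, 2, 3)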
#Categorical Variables
M$PHD <- ifelse(M$EDUCATION == "PhD", 1, 0)
M$MASTERS <- ifelse(M$EDUCATION == "Masters", 1, 0)
M$BACHELORS <- ifelse(M$EDUCATION == "Bachelors", 1, 0)
M$HS <- ifelse(M$EDUCATION == "z_High School", 1, 0)
M$NOHS <- ifelse(M$EDUCATION == "<High School", 1, 0)
M$CLERICAL <- ifelse(M$JOB == "Clerical", 1, 0)
M$DOCTOR <- ifelse(M$JOB == "Doctor", 1, 0)
M$HOME_MAKER <- ifelse(M$JOB == "Home Maker", 1, 0)
M$LAWYER <- ifelse(M$JOB == "Lawyer", 1, 0)
M$MANAGER <- ifelse(M$JOB == "Manager", 1, 0)
M$PROF <- ifelse(M$JOB == "Professional", 1, 0)
M$STUDENT <- ifelse(M$JOB == "Student", 1, 0)
M$BLUE_COLLAR <- ifelse(M$JOB == "z_Blue Collar", 1, 0)
M$MINIVAN <- ifelse(M$CAR_TYPE == "Minivan", 1, 0)
M$TRUCK <- ifelse(M$CAR_TYPE == "Panel Truck", 1, 0)
M$PICKUP <- ifelse(M$CAR_TYPE == "Pickup", 1, 0)
M$SPORTS <- ifelse(M$CAR_TYPE == "Sports Car", 1, 0)
M$VAN <- ifelse(M$CAR_TYPE == "Van", 1, 0)
M$SUV <- ifelse(M$CAR_TYPE == "z_SUV", 1, 0)
remove <- c("EDUCATION", "JOB", "CAR_TYPE")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)),
"TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F
#Categorical variables were changed to binary indicator (dummy) variables; a compact alternative is sketched below.
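#A sketch of the same dummy coding via model.matrix(); note that with several factors
#it keeps a reference level for every factor after the first, unlike the explicit ifelse() calls above:
dummies <- model.matrix(~ EDUCATION + JOB + CAR_TYPE - 1, data = M)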
library(reshape2)
Corr_XY <- function(X, Y) {
corr <- data.frame(array(NA, dim = c(ncol(X), 5)))
colnames(corr) <- c("Y", "X", "r","p","<0.05")
for (i in 1:ncol(X)) {
r <- cor.test(Y[, 1], X[, i])
corr[i, 1] <- names(Y)
corr[i, 2] <- names(X[i])
corr[i, 3] <- r$estimate
corr[i, 4] <- r$p.value
corr[i, 5] <- corr[i, 4] < 0.05
}
return(corr)
}
Corr_XX <- function(X, threshold) {
corr <- data.frame(array(NA, dim = c(choose(ncol(X), 2), 5)))
colnames(corr) <- c("X1", "X2", "r","p","<0.05"); k = 1
for (i in 1:(ncol(X) - 1)) {
for (j in (i+1):ncol(X)) {
r <- cor.test(X[,i], X[,j])
corr[k, 1] <- names(X[i])
corr[k, 2] <- names(X[j])
corr[k, 3] <- r$estimate
corr[k, 4] <- r$p.value
corr[k, 5] <- corr[k, 4] < 0.05 # index k (not i), so each pair is tested against its own p-value
k = k + 1
}
}
least <- corr[corr[,"<0.05"] == F, ]
most <- corr[abs(corr[,"r"]) >= threshold, ]
result <- list("Correlations" = corr, "Least_Correlated"= least, "Most_Correlated" = most)
return(result)
}
#Between TARGET_AMT and X Variables
#The specification M[1:n, -c(1:3, categorical)] creates a data frame excluding the INDEX, TARGET_FLAG, TARGET_AMT, and categorical variables. The specification M[1:n, 3, drop = FALSE] creates a data frame with the Y of interest (TARGET_AMT) and retains the column name
correlations <- Corr_XY(M[1:n, -c(1:3, categorical)], M[1:n, 3, drop = FALSE])
correlations
## Y X r p <0.05
## 1 TARGET_AMT KIDSDRIV 0.055394177 5.520134e-07 TRUE
## 2 TARGET_AMT AGE -0.041722748 1.631008e-04 TRUE
## 3 TARGET_AMT HOMEKIDS 0.061988043 2.089491e-08 TRUE
## 4 TARGET_AMT YOJ -0.020939639 5.854856e-02 FALSE
## 5 TARGET_AMT INCOME -0.056639628 3.054521e-07 TRUE
## 6 TARGET_AMT PARENT1 0.096965421 1.648791e-18 TRUE
## 7 TARGET_AMT HOME_VAL -0.041320305 1.885921e-04 TRUE
## 8 TARGET_AMT MSTATUS -0.087661194 2.135335e-15 TRUE
## 9 TARGET_AMT SEX -0.011053614 3.180652e-01 FALSE
## 10 TARGET_AMT TRAVTIME 0.027987016 1.145817e-02 TRUE
## 11 TARGET_AMT CAR_USE -0.098613835 4.298495e-19 TRUE
## 12 TARGET_AMT BLUEBOOK -0.004699523 6.712129e-01 FALSE
## 13 TARGET_AMT TIF -0.046480831 2.661802e-05 TRUE
## 14 TARGET_AMT RED_CAR 0.008091979 4.648309e-01 FALSE
## 15 TARGET_AMT OLDCLAIM 0.070953287 1.390750e-10 TRUE
## 16 TARGET_AMT CLM_FREQ 0.116419159 5.005267e-26 TRUE
## 17 TARGET_AMT REVOKED 0.061385464 2.859309e-08 TRUE
## 18 TARGET_AMT MVR_PTS 0.137865509 6.385648e-36 TRUE
## 19 TARGET_AMT CAR_AGE -0.057600986 1.918161e-07 TRUE
## 20 TARGET_AMT URBANICITY 0.120973821 5.495497e-28 TRUE
## 21 TARGET_AMT PHD -0.024424437 2.735236e-02 TRUE
## 22 TARGET_AMT MASTERS -0.035171011 1.484052e-03 TRUE
## 23 TARGET_AMT BACHELORS -0.017277942 1.185859e-01 FALSE
## 24 TARGET_AMT HS 0.042098024 1.422829e-04 TRUE
## 25 TARGET_AMT NOHS 0.027676590 1.240704e-02 TRUE
## 26 TARGET_AMT CLERICAL 0.007805255 4.808004e-01 FALSE
## 27 TARGET_AMT DOCTOR -0.034750482 1.690833e-03 TRUE
## 28 TARGET_AMT HOME_MAKER -0.007081752 5.223917e-01 FALSE
## 29 TARGET_AMT LAWYER -0.029185515 8.371037e-03 TRUE
## 30 TARGET_AMT MANAGER -0.064606496 5.168715e-09 TRUE
## 31 TARGET_AMT PROF -0.004547087 6.812815e-01 FALSE
## 32 TARGET_AMT STUDENT 0.024409854 2.744467e-02 TRUE
## 33 TARGET_AMT BLUE_COLLAR 0.061830058 2.269225e-08 TRUE
## 34 TARGET_AMT MINIVAN -0.075267324 9.887179e-12 TRUE
## 35 TARGET_AMT TRUCK 0.029468292 7.761229e-03 TRUE
## 36 TARGET_AMT PICKUP 0.021906619 4.782277e-02 TRUE
## 37 TARGET_AMT SPORTS 0.023294077 3.535147e-02 TRUE
## 38 TARGET_AMT VAN 0.023479460 3.391668e-02 TRUE
## 39 TARGET_AMT SUV 0.005942619 5.914276e-01 FALSE
##The predictor variables SEX, BLUEBOOK, RED_CAR, BACHELORS, CLERICAL, HOME_MAKER, PROF, LAWYER, YOJ, and SUV do not have statistically significant correlations with the response variable and are therefore not considered for the TARGET_AMT model. The variable YOJ sits right at the significance threshold (p = 0.059) and is removed along with the rest
remove <- c("SEX", "BLUEBOOK", "RED_CAR", "BACHELORS", "CLERICAL", "HOME_MAKER", "PROF", "LAWYER", "YOJ", "SUV")
X[match(remove, names(M)), "TARGET_AMT"] <- F
##Between TARGET_FLAG and X Variables
##The specification M[1:n, -c(1:3, categorical)] creates a data frame excluding the INDEX, TARGET_FLAG, TARGET_AMT, and categorical variables. The specification M[1:n, 2, drop = FALSE] creates a data frame with the Y of interest (TARGET_FLAG) and retains the column name
correlations <- Corr_XY(M[1:n, -c(1:3, categorical)], M[1:n, 2, drop = FALSE])
print(correlations)
## Y X r p <0.05
## 1 TARGET_FLAG KIDSDRIV 0.1036682963 6.052406e-21 TRUE
## 2 TARGET_FLAG AGE -0.1031261224 9.659249e-21 TRUE
## 3 TARGET_FLAG HOMEKIDS 0.1156210106 1.083837e-25 TRUE
## 4 TARGET_FLAG YOJ -0.0684875222 5.889201e-10 TRUE
## 5 TARGET_FLAG INCOME -0.1382427249 4.129718e-36 TRUE
## 6 TARGET_FLAG PARENT1 0.1576222195 1.488738e-46 TRUE
## 7 TARGET_FLAG HOME_VAL -0.0975878744 9.950630e-19 TRUE
## 8 TARGET_FLAG MSTATUS -0.1351247571 1.460728e-34 TRUE
## 9 TARGET_FLAG SEX 0.0210785602 5.689454e-02 FALSE
## 10 TARGET_FLAG TRAVTIME 0.0483683103 1.234536e-05 TRUE
## 11 TARGET_FLAG CAR_USE -0.1426736765 2.252692e-38 TRUE
## 12 TARGET_FLAG BLUEBOOK -0.1033831893 7.741376e-21 TRUE
## 13 TARGET_FLAG TIF -0.0823700498 9.145383e-14 TRUE
## 14 TARGET_FLAG RED_CAR -0.0069472579 5.303220e-01 FALSE
## 15 TARGET_FLAG OLDCLAIM 0.1380838297 4.962696e-36 TRUE
## 16 TARGET_FLAG CLM_FREQ 0.2161960608 6.332803e-87 TRUE
## 17 TARGET_FLAG REVOKED 0.1519390816 2.410252e-43 TRUE
## 18 TARGET_FLAG MVR_PTS 0.2191970538 2.320264e-89 TRUE
## 19 TARGET_FLAG CAR_AGE -0.0970693948 1.515753e-18 TRUE
## 20 TARGET_FLAG URBANICITY 0.2242509434 1.512617e-93 TRUE
## 21 TARGET_FLAG PHD -0.0654121132 3.325908e-09 TRUE
## 22 TARGET_FLAG MASTERS -0.0762959857 5.147403e-12 TRUE
## 23 TARGET_FLAG BACHELORS -0.0426525815 1.160501e-04 TRUE
## 24 TARGET_FLAG HS 0.1097693656 2.653941e-23 TRUE
## 25 TARGET_FLAG NOHS 0.0530418729 1.632022e-06 TRUE
## 26 TARGET_FLAG CLERICAL 0.0273667617 1.342281e-02 TRUE
## 27 TARGET_FLAG DOCTOR -0.0583769873 1.310526e-07 TRUE
## 28 TARGET_FLAG HOME_MAKER 0.0112592910 3.091434e-01 FALSE
## 29 TARGET_FLAG LAWYER -0.0617312643 2.389150e-08 TRUE
## 30 TARGET_FLAG MANAGER -0.1053953267 1.343221e-21 TRUE
## 31 TARGET_FLAG PROF -0.0385723360 4.915476e-04 TRUE
## 32 TARGET_FLAG STUDENT 0.0770140270 3.247037e-12 TRUE
## 33 TARGET_FLAG BLUE_COLLAR 0.1017866167 3.033581e-20 TRUE
## 34 TARGET_FLAG MINIVAN -0.1369991100 1.729612e-35 TRUE
## 35 TARGET_FLAG TRUCK -0.0003423919 9.753283e-01 FALSE
## 36 TARGET_FLAG PICKUP 0.0566433091 3.049128e-07 TRUE
## 37 TARGET_FLAG SPORTS 0.0572528091 2.272140e-07 TRUE
## 38 TARGET_FLAG VAN 0.0030204421 7.849914e-01 FALSE
## 39 TARGET_FLAG SUV 0.0450322221 4.709927e-05 TRUE
##The predictor variables SEX, RED_CAR, HOME_MAKER, TRUCK, and VAN do not have statistically significant correlations with the response variable and are therefore not being considered for the model.
remove <- c("SEX", "RED_CAR", "HOME_MAKER", "TRUCK", "VAN")
X[match(remove, names(M)), "TARGET_FLAG"] <- F
#Between all X variables
remove <- c("SEX", "RED_CAR", "HOME_MAKER", "TRUCK", "VAN")
X[match(remove, names(M)), "TARGET_FLAG"] <- F
#Betweeen all X variables
correlations <- Corr_XX(M[1:n, (X[,"TARGET_AMT"] & X[,"TARGET_FLAG"])], 0.50)
print(correlations$Least_Correlated)
## X1 X2 r p <0.05
## 142 MSTATUS TRAVTIME 0.0102482953 3.546042e-01 FALSE
## 143 MSTATUS CAR_USE 0.0209315442 5.864616e-02 FALSE
## 144 MSTATUS TIF -0.0007410648 9.466325e-01 FALSE
## 145 MSTATUS OLDCLAIM -0.0459197532 3.326731e-05 FALSE
## 146 MSTATUS CLM_FREQ -0.0693288825 3.618950e-10 FALSE
## 147 MSTATUS REVOKED -0.0432305388 9.360415e-05 FALSE
## 148 MSTATUS MVR_PTS -0.0479670481 1.456999e-05 FALSE
## 149 MSTATUS CAR_AGE -0.0320848886 3.745996e-03 FALSE
## 150 MSTATUS URBANICITY -0.0025618324 8.170069e-01 FALSE
## 151 MSTATUS PHD -0.0373484866 7.390760e-04 FALSE
## 152 MSTATUS MASTERS 0.0029389035 7.906587e-01 FALSE
## 153 MSTATUS HS 0.0380637415 5.831527e-04 FALSE
## 154 MSTATUS NOHS 0.0138148710 2.120751e-01 FALSE
## 155 MSTATUS DOCTOR -0.0373295003 7.436993e-04 FALSE
## 156 MSTATUS MANAGER 0.0019279928 8.617515e-01 FALSE
## 157 MSTATUS STUDENT 0.0035687134 7.471929e-01 FALSE
## 158 MSTATUS BLUE_COLLAR 0.0045503896 6.810627e-01 FALSE
## 159 MSTATUS MINIVAN 0.0009564217 9.311578e-01 FALSE
## 160 MSTATUS PICKUP 0.0013592314 9.022878e-01 FALSE
## 161 MSTATUS SPORTS 0.0112104575 3.112466e-01 FALSE
## 162 TRAVTIME CAR_USE -0.0248053795 2.503410e-02 FALSE
## 163 TRAVTIME TIF -0.0116046256 2.945390e-01 FALSE
## 164 TRAVTIME OLDCLAIM -0.0192671689 8.177875e-02 FALSE
## 165 TRAVTIME CLM_FREQ 0.0065602114 5.534799e-01 FALSE
## 166 TRAVTIME REVOKED -0.0121152699 2.738038e-01 FALSE
## 167 TRAVTIME MVR_PTS 0.0105985106 3.384001e-01 FALSE
## 168 TRAVTIME CAR_AGE -0.0364222498 9.986376e-04 FALSE
## 169 TRAVTIME URBANICITY -0.1660047341 1.640236e-51 FALSE
## 170 TRAVTIME PHD -0.0429039572 1.057263e-04 FALSE
## 171 TRAVTIME MASTERS -0.0375494718 6.917442e-04 FALSE
## 172 TRAVTIME HS 0.0190208120 8.576030e-02 FALSE
## 173 TRAVTIME NOHS 0.0273923554 1.333620e-02 FALSE
## 174 TRAVTIME DOCTOR -0.0269177228 1.502532e-02 FALSE
## 175 TRAVTIME MANAGER -0.0749253684 1.225975e-11 FALSE
## 176 TRAVTIME STUDENT 0.0283659291 1.038762e-02 FALSE
## 177 TRAVTIME BLUE_COLLAR 0.0424745029 1.239311e-04 FALSE
## 178 TRAVTIME MINIVAN -0.0083978230 4.481267e-01 FALSE
## 179 TRAVTIME PICKUP -0.0084380748 4.459539e-01 FALSE
## 180 TRAVTIME SPORTS 0.0112356765 3.101592e-01 FALSE
## 181 CAR_USE TIF -0.0001160512 9.916365e-01 FALSE
## 182 CAR_USE OLDCLAIM -0.0357676283 1.230458e-03 FALSE
## 183 CAR_USE CLM_FREQ -0.0814906825 1.670027e-13 FALSE
## 184 CAR_USE REVOKED -0.0168968510 1.269334e-01 FALSE
## 185 CAR_USE MVR_PTS -0.0680837946 7.424192e-10 FALSE
## 186 CAR_USE CAR_AGE 0.0676272022 9.632170e-10 FALSE
## 187 CAR_USE URBANICITY 0.0204630452 6.452916e-02 FALSE
## 188 CAR_USE PHD 0.0250940213 2.339268e-02 FALSE
## 189 CAR_USE MASTERS 0.1282037052 2.971966e-31 FALSE
## 190 CAR_USE HS -0.1596014265 1.062614e-47 FALSE
## 191 CAR_USE NOHS 0.1269912794 1.082325e-30 FALSE
## 192 CAR_USE DOCTOR 0.1354404558 1.021913e-34 FALSE
## 193 CAR_USE MANAGER 0.0961826970 3.097201e-18 FALSE
## 194 CAR_USE STUDENT -0.0806555709 2.941293e-13 FALSE
## 195 CAR_USE BLUE_COLLAR -0.4380537778 0.000000e+00 FALSE
## 196 CAR_USE MINIVAN 0.2046295876 7.061299e-78 FALSE
## 197 CAR_USE PICKUP -0.2257311630 8.587749e-95 FALSE
## 198 CAR_USE SPORTS 0.1425417945 2.637087e-38 FALSE
## 199 TIF OLDCLAIM -0.0219581980 4.730039e-02 FALSE
## 200 TIF CLM_FREQ -0.0230229550 3.754291e-02 FALSE
## 201 TIF REVOKED -0.0318415132 4.017359e-03 FALSE
## 202 TIF MVR_PTS -0.0410457340 2.080836e-04 FALSE
## 203 TIF CAR_AGE 0.0075594969 4.947226e-01 FALSE
## 204 TIF URBANICITY 0.0071310133 5.195023e-01 FALSE
## 205 TIF PHD -0.0078535466 4.780900e-01 FALSE
## 206 TIF MASTERS 0.0181811177 1.005207e-01 FALSE
## 207 TIF HS 0.0025818532 8.156024e-01 FALSE
## 208 TIF NOHS -0.0008852708 9.362678e-01 FALSE
## 209 TIF DOCTOR -0.0113072010 3.070891e-01 FALSE
## 210 TIF MANAGER 0.0099573035 3.684344e-01 FALSE
## 211 TIF STUDENT -0.0166644709 1.322436e-01 FALSE
## 212 TIF BLUE_COLLAR -0.0066761671 5.464903e-01 FALSE
## 213 TIF MINIVAN -0.0093689420 3.974062e-01 FALSE
## 214 TIF PICKUP 0.0048783681 6.594748e-01 FALSE
## 215 TIF SPORTS -0.0074874653 4.988436e-01 FALSE
## 331 DOCTOR MANAGER -0.0654289801 3.295165e-09 FALSE
## 332 DOCTOR STUDENT -0.0545045933 8.360529e-07 FALSE
## 333 DOCTOR BLUE_COLLAR -0.0946162963 1.077181e-17 FALSE
## 334 DOCTOR MINIVAN 0.0380040469 5.948895e-04 FALSE
## 335 DOCTOR PICKUP -0.0340756591 2.078654e-03 FALSE
## 336 DOCTOR SPORTS -0.0007753139 9.441700e-01 FALSE
## 346 BLUE_COLLAR MINIVAN 0.0242686903 2.835221e-02 FALSE
## 347 BLUE_COLLAR PICKUP 0.0378625555 6.235926e-04 FALSE
## 348 BLUE_COLLAR SPORTS -0.0325862131 3.238823e-03 FALSE
## 349 MINIVAN PICKUP -0.2704284523 9.198656e-137 FALSE
## 350 MINIVAN SPORTS -0.2111419861 6.628612e-83 FALSE
## 351 PICKUP SPORTS -0.1601428363 5.130616e-48 FALSE
##The specification M[1:n, (X[,"TARGET_AMT"] & X[,"TARGET_FLAG"])] creates a data frame excluding INDEX, TARGET_FLAG, TARGET_AMT, and the variables previously marked for removal due to statistically insignificant correlations with the response variables. There are strong statistically significant correlations between HOME_VAL & INCOME, PHD & DOCTOR, and MASTERS & LAWYER. Of these correlated pairs, HOME_VAL, DOCTOR, and LAWYER are the least correlated with both TARGET_FLAG and TARGET_AMT, so these three variables are not considered for the models. It is worth noting that the high correlation between PHD & DOCTOR and between MASTERS & LAWYER is likely due to the prior JOB imputation.
remove <- c("HOME_VAL", "DOCTOR", "LAWYER")
X[match(remove, names(M)), ] <- F
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
columns <- c("INCOME", "HOME_VAL", "TRAVTIME", "BLUEBOOK", "TIF", "MVR_PTS")
fit_exp <- function(X, fields) {
potential <- match(fields, names(X))
lambda <- numeric(ncol(X))
par(mfrow=c(2,3))
for (i in potential) {
shifted <- X[, i] - min(X[, i]) + 1e-32
fit_exp <- fitdistr(shifted, "Exponential")
lambda[i] <- fit_exp$estimate
exp <- rexp(1000, lambda[i])
hist(X[, i], prob=TRUE, col="grey", main =names(X[i]),
xlab=paste("Lambda =",fractions(lambda[i])))
lines(density(exp), col="blue", lwd=2)
}
lambda <- data.frame("VARIABLE"=fields, "LAMBDA"=lambda[potential])
return(lambda)
}
lambda <- fit_exp(M[1:n, ], columns)
lambda
## VARIABLE LAMBDA
## 1 INCOME 1.615559e-05
## 2 HOME_VAL 5.868625e-06
## 3 TRAVTIME 3.510530e-02
## 4 BLUEBOOK 7.037347e-05
## 5 TIF 2.298161e-01
## 6 MVR_PTS 5.897955e-01
#Five of the six potential variables lend themselves to modeling with an exponential distribution. Each variable was shifted to slightly above zero by subtracting its minimum value and then adding 1e-32. This also shifts data with a negative minimum in the appropriate direction, since subtracting a negative minimum is equivalent to adding its absolute value
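#A quick sketch of the shift on a hypothetical vector with a negative minimum:
x_toy <- c(-3, 0, 5)
x_toy - min(x_toy) + 1e-32 # c(0, 3, 8) plus a vanishing offset, so every value is positive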
M[, "log_INCOME"] <- log(M[, "INCOME"] - min(M[, "INCOME"]) + 1e-32, lambda[1,2])
M[, "log_TRAVTIME"] <- log(M[, "TRAVTIME"] - min(M[, "TRAVTIME"]) + 1e-32, lambda[3,2])
M[, "log_BLUEBOOK"] <- log(M[, "BLUEBOOK"] - min(M[, "BLUEBOOK"]) + 1e-32, lambda[4,2])
M[, "log_TIF"] <- log(M[, "TIF"] - min(M[, "TIF"]) + 1e-32, lambda[5,2])
M[, "log_MVR_PTS"] <- log(M[, "MVR_PTS"] - min(M[, "MVR_PTS"]) + 1e-32, lambda[6,2])
remove <- c("INCOME", "TRAVTIME", "BLUEBOOK", "TIF", "MVR_PTS")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)),
"TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F
#Categorization of Multimodal Data
par(mfrow=c(2,3))
smoothScatter(M[1:n, "KIDSDRIV"], ylab = "KIDSDRIV")
smoothScatter(M[1:n, "HOMEKIDS"], ylab = "HOMEKIDS")
smoothScatter(M[1:n, "YOJ"], ylab = "YOJ")
smoothScatter(M[1:n, "OLDCLAIM"], ylab = "OLDCLAIM")
smoothScatter(M[1:n, "CLM_FREQ"], ylab = "CLM_FREQ")
smoothScatter(M[1:n, "CAR_AGE"], ylab = "CAR_AGE")
#The variables KIDSDRIV, HOMEKIDS, YOJ, OLDCLAIM, CLM_FREQ, and CAR_AGE have bimodal distributions. There are clear lines of demarcation in the values that we can use to bifurcate the variables into categories. The defining value for each of these variables is zero. Therefore, we can categorize the variables as zero if the value is equal to zero, and one otherwise.
M[,"cat_KIDSDRIV"] <- ifelse(M$KIDSDRIV == 0, 0, 1)
M[,"cat_HOMEKIDS"] <- ifelse(M$HOMEKIDS == 0, 0, 1)
M[,"cat_YOJ"] <- ifelse(M$YOJ == 0, 0, 1)
M[,"cat_OLDCLAIM"] <- ifelse(M$OLDCLAIM == 0, 0, 1)
M[,"cat_CLM_FREQ"] <- ifelse(M$CLM_FREQ == 0, 0, 1)
M[,"cat_CAR_AGE"] <- ifelse(M$CAR_AGE == 0, 0, 1)
remove <- c("KIDSDRIV", "HOMEKIDS", "YOJ", "OLDCLAIM", "CLM_FREQ", "CAR_AGE")
X <- rbind(X, data.frame("TARGET_FLAG" = rep(T, ncol(M)-nrow(X)),
"TARGET_AMT" = rep(T, ncol(M)-nrow(X))))
X[match(remove, names(M)), ] <- F
library(car)
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
potential <- match(c("AGE"), names(M))
box.cox.powers <- powerTransform(M[1:n, potential], family="bcPower")
summary(box.cox.powers)
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## M[1:n, potential] 1.0391 1 0.95 1.1282
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 559.678 1 < 2.22e-16
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 0.7421689 1 0.38897
#The only unexamined variable to which a Box-Cox transformation could potentially be applied is the quantitative variable AGE. However, upon examination, AGE returns an estimated power close to one, which indicates that no transformation is necessary. This is further supported by the Wald bounds, which include the value of one.
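#For reference, a sketch of the Box-Cox transform whose power powerTransform() estimates:
box_cox <- function(x, lambda) if (lambda == 0) log(x) else (x^lambda - 1) / lambda
#With the estimated lambda of about 1, box_cox(x, 1) = x - 1, i.e., effectively no transformation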
training_AMT <- M[1:n, X[,"TARGET_AMT"]]
training_FLAG <- M[1:n, X[,"TARGET_FLAG"]]
#Forward Selection
#Multiple Linear Regression
null <- lm(TARGET_AMT ~ 0, training_AMT)
full <- lm(TARGET_AMT ~ ., training_AMT)
aic_steps <- step(null, scope=list(lower=null, upper=full), direction="forward", k = 2, trace=F)
aic_steps$call
## lm(formula = TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 +
## MANAGER + CAR_USE + cat_CAR_AGE + MINIVAN + log_MVR_PTS +
## MSTATUS + cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED +
## SPORTS + log_INCOME + log_TRAVTIME - 1, data = training_AMT)
##The above model has the lowest AIC.
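#For reference, step() ranks candidate lm fits by extractAIC(), which for lm is
#effectively n*log(RSS/n) + 2*edf; checking the selected model:
extractAIC(aic_steps) # returns c(edf, AIC)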
forward_AMT <- lm(formula(aic_steps), training_AMT)
round(coef(summary(forward_AMT)), 6)
## Estimate Std. Error t value Pr(>|t|)
## URBANICITY 1505.085315 136.313068 11.041387 0.000000
## cat_OLDCLAIM 552.793025 114.474396 4.828967 0.000001
## PARENT1 640.172763 177.545247 3.605688 0.000313
## MANAGER -851.066664 162.431235 -5.239551 0.000000
## CAR_USE -785.064181 111.841871 -7.019412 0.000000
## cat_CAR_AGE 761.221081 182.601816 4.168749 0.000031
## MINIVAN -470.879470 121.461053 -3.876794 0.000107
## log_MVR_PTS -2.769052 0.757178 -3.657068 0.000257
## MSTATUS -585.805201 119.630701 -4.896780 0.000001
## cat_KIDSDRIV 692.326953 162.330292 4.264928 0.000020
## log_TIF 6.991248 2.130387 3.281680 0.001036
## NOHS 657.591808 153.642482 4.280013 0.000019
## HS 470.297182 120.908400 3.889698 0.000101
## REVOKED 471.513352 155.133315 3.039407 0.002378
## SPORTS 321.992205 168.356009 1.912567 0.055839
## log_INCOME 45.990011 25.543071 1.800489 0.071820
## log_TRAVTIME -18.650708 11.105457 -1.679418 0.093109
#At a significance level of alpha = 0.05, the 17 forward-selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME. Removing those three insignificant variables yields a model with all significant variables
forward_AMT <- lm(TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 +
MANAGER + CAR_USE + MINIVAN + log_MVR_PTS + MSTATUS + cat_CAR_AGE +
cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED - 1, training_AMT)
summary(forward_AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ URBANICITY + cat_OLDCLAIM + PARENT1 +
## MANAGER + CAR_USE + MINIVAN + log_MVR_PTS + MSTATUS + cat_CAR_AGE +
## cat_KIDSDRIV + log_TIF + NOHS + HS + REVOKED - 1, data = training_AMT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4997 -1694 -818 354 104984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## URBANICITY 1469.0432 135.8182 10.816 < 2e-16 ***
## cat_OLDCLAIM 572.1325 114.3497 5.003 5.75e-07 ***
## PARENT1 655.2860 177.4451 3.693 0.000223 ***
## MANAGER -884.0253 162.0108 -5.457 5.00e-08 ***
## CAR_USE -729.9259 109.4039 -6.672 2.69e-11 ***
## MINIVAN -540.1926 117.7244 -4.589 4.53e-06 ***
## log_MVR_PTS -2.8047 0.7574 -3.703 0.000214 ***
## MSTATUS -581.0082 119.6464 -4.856 1.22e-06 ***
## cat_CAR_AGE 772.8248 181.7666 4.252 2.14e-05 ***
## cat_KIDSDRIV 683.0515 162.3725 4.207 2.62e-05 ***
## log_TIF 7.1675 2.1301 3.365 0.000769 ***
## NOHS 676.0476 153.2975 4.410 1.05e-05 ***
## HS 496.4631 120.3608 4.125 3.75e-05 ***
## REVOKED 474.5687 155.1805 3.058 0.002234 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared: 0.1488, Adjusted R-squared: 0.1473
## F-statistic: 101.7 on 14 and 8147 DF, p-value: < 2.2e-16
null <- glm(TARGET_FLAG ~ 0, family = binomial(link = "logit"), training_FLAG)
full <- glm(TARGET_FLAG ~ ., family = binomial(link = "logit"), training_FLAG)
aic_steps <- step(null, scope=list(lower=null, upper=full), direction="forward", k = 2, trace=F)
aic_steps$aic
## [1] 7500.651
##The above model has the lowest AIC.
forward_FLAG <- glm(aic_steps$formula, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(forward_FLAG)), 6)
## Estimate Std. Error z value Pr(>|z|)
## AGE -0.003424 0.003925 -0.872196 0.383101
## URBANICITY 2.224240 0.110518 20.125582 0.000000
## cat_OLDCLAIM 0.553906 0.060688 9.127162 0.000000
## CAR_USE -0.729315 0.081633 -8.934078 0.000000
## cat_YOJ -0.000101 0.283764 -0.000355 0.999717
## MSTATUS -0.692176 0.074923 -9.238513 0.000000
## MANAGER -0.712284 0.107968 -6.597173 0.000000
## MINIVAN -0.418051 0.105548 -3.960787 0.000075
## REVOKED 0.729757 0.079316 9.200623 0.000000
## cat_KIDSDRIV 0.556026 0.096696 5.750221 0.000000
## HS 0.513970 0.075100 6.843797 0.000000
## NOHS 0.546576 0.097037 5.632669 0.000000
## cat_CAR_AGE -2.817303 0.329517 -8.549807 0.000000
## log_TIF 0.007683 0.001190 6.459280 0.000000
## log_MVR_PTS -0.002206 0.000437 -5.052844 0.000000
## cat_HOMEKIDS 0.224665 0.096231 2.334635 0.019563
## SPORTS 0.636551 0.119359 5.333084 0.000000
## log_TRAVTIME -0.028562 0.007067 -4.041575 0.000053
## SUV 0.384536 0.100159 3.839250 0.000123
## PHD -0.314215 0.115890 -2.711326 0.006701
## log_INCOME 0.106774 0.037340 2.859542 0.004243
## CLERICAL 0.325682 0.097022 3.356792 0.000789
## PICKUP 0.223960 0.095602 2.342627 0.019149
## log_BLUEBOOK 0.061436 0.023980 2.561970 0.010408
## BLUE_COLLAR 0.188304 0.090050 2.091115 0.036518
## PARENT1 0.215201 0.118922 1.809608 0.070357
#At a significance level of alpha = 0.05, the 26 forward-selected variables yield a TARGET_FLAG BLR model with three insignificant variables: AGE, cat_YOJ, and PARENT1. Removing those three insignificant variables yields a model with all significant variables
forward_FLAG <- glm(TARGET_FLAG ~ URBANICITY + cat_OLDCLAIM + CAR_USE +
MSTATUS + MANAGER + MINIVAN + REVOKED + cat_KIDSDRIV + HS +
NOHS + cat_CAR_AGE + log_TIF + log_MVR_PTS + cat_HOMEKIDS +
SPORTS + log_TRAVTIME + SUV + PHD + log_INCOME + CLERICAL +
PICKUP + log_BLUEBOOK + BLUE_COLLAR - 1, family = binomial(link = "logit"), training_FLAG)
summary(forward_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ URBANICITY + cat_OLDCLAIM + CAR_USE +
## MSTATUS + MANAGER + MINIVAN + REVOKED + cat_KIDSDRIV + HS +
## NOHS + cat_CAR_AGE + log_TIF + log_MVR_PTS + cat_HOMEKIDS +
## SPORTS + log_TRAVTIME + SUV + PHD + log_INCOME + CLERICAL +
## PICKUP + log_BLUEBOOK + BLUE_COLLAR - 1, family = binomial(link = "logit"),
## data = training_FLAG)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1946 -0.7273 -0.4224 0.6889 3.1402
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## URBANICITY 2.2185007 0.1101897 20.133 < 2e-16 ***
## cat_OLDCLAIM 0.5511070 0.0606483 9.087 < 2e-16 ***
## CAR_USE -0.7312157 0.0815656 -8.965 < 2e-16 ***
## MSTATUS -0.7827185 0.0585965 -13.358 < 2e-16 ***
## MANAGER -0.7090159 0.1077946 -6.577 4.79e-11 ***
## MINIVAN -0.4156174 0.1055195 -3.939 8.19e-05 ***
## REVOKED 0.7302799 0.0793086 9.208 < 2e-16 ***
## cat_KIDSDRIV 0.5302165 0.0941368 5.632 1.78e-08 ***
## HS 0.5146327 0.0749761 6.864 6.70e-12 ***
## NOHS 0.5475820 0.0968129 5.656 1.55e-08 ***
## cat_CAR_AGE -2.9259596 0.1395232 -20.971 < 2e-16 ***
## log_TIF 0.0076797 0.0011890 6.459 1.06e-10 ***
## log_MVR_PTS -0.0022271 0.0004362 -5.106 3.29e-07 ***
## cat_HOMEKIDS 0.3568313 0.0683715 5.219 1.80e-07 ***
## SPORTS 0.6357416 0.1191917 5.334 9.62e-08 ***
## log_TRAVTIME -0.0283289 0.0070487 -4.019 5.84e-05 ***
## SUV 0.3844424 0.1001565 3.838 0.000124 ***
## PHD -0.3247954 0.1155157 -2.812 0.004928 **
## log_INCOME 0.1076736 0.0150112 7.173 7.34e-13 ***
## CLERICAL 0.3327295 0.0966097 3.444 0.000573 ***
## PICKUP 0.2252018 0.0956307 2.355 0.018527 *
## log_BLUEBOOK 0.0618683 0.0239369 2.585 0.009748 **
## BLUE_COLLAR 0.1911011 0.0899125 2.125 0.033552 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11313.5 on 8161 degrees of freedom
## Residual deviance: 7452.6 on 8138 degrees of freedom
## AIC: 7498.6
##
## Number of Fisher Scoring iterations: 5
#Backward stepwise subset elimination based on AIC
#Using k = 2 degrees of freedom for the penalty gives the genuine AIC;
#using k = log(n) instead is sometimes referred to as BIC or SBC (a BIC-flavored sketch follows the AIC run below)
null <- lm(TARGET_AMT ~ 0, training_AMT)
full <- lm(TARGET_AMT ~ ., training_AMT)
aic_steps <- step(full, scope=list(lower=null, upper=full), direction="backward", k = 2, trace=F)
aic_steps$call
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + HS + NOHS + MANAGER + MINIVAN + SPORTS + log_INCOME +
## log_TRAVTIME + log_TIF + log_MVR_PTS + cat_KIDSDRIV + cat_OLDCLAIM,
## data = training_AMT)
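#A BIC-flavored search (sketch) simply swaps the penalty to k = log(n):
bic_steps <- step(full, scope=list(lower=null, upper=full), direction="backward",
                  k = log(nrow(training_AMT)), trace=F)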
backward_AMT <- lm(formula(aic_steps), training_AMT)
round(coef(summary(backward_AMT)), 6)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 762.784417 183.151125 4.164782 0.000031
## PARENT1 640.018132 177.569345 3.604328 0.000315
## MSTATUS -586.551921 119.723526 -4.899220 0.000001
## CAR_USE -785.707613 111.900046 -7.021513 0.000000
## REVOKED 470.841555 155.143313 3.034881 0.002414
## URBANICITY 1505.218992 136.333458 11.040716 0.000000
## HS 470.034608 120.940482 3.886495 0.000103
## NOHS 657.777887 153.639789 4.281299 0.000019
## MANAGER -851.651499 162.435546 -5.243012 0.000000
## MINIVAN -471.449326 121.477290 -3.880967 0.000105
## SPORTS 322.212013 168.353540 1.913901 0.055668
## log_INCOME 46.035671 25.544003 1.802210 0.071549
## log_TRAVTIME -18.636693 11.105620 -1.678132 0.093360
## log_TIF 6.989195 2.130538 3.280483 0.001041
## log_MVR_PTS -2.773052 0.757597 -3.660325 0.000253
## cat_KIDSDRIV 691.794967 162.331388 4.261622 0.000021
## cat_OLDCLAIM 552.008131 114.513722 4.820454 0.000001
#At a significance level of alpha = 0.05, the 16 backward-selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME. Removing those three insignificant variables yields a model with all significant variables
backward_AMT <- lm(TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED +
URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF +
log_MVR_PTS + cat_KIDSDRIV + cat_OLDCLAIM, training_AMT)
summary(backward_AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ PARENT1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS +
## cat_KIDSDRIV + cat_OLDCLAIM, data = training_AMT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4997 -1694 -819 354 104984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 774.2430 182.3082 4.247 2.19e-05 ***
## PARENT1 655.1922 177.4678 3.692 0.000224 ***
## MSTATUS -581.7175 119.7388 -4.858 1.21e-06 ***
## CAR_USE -730.5026 109.4617 -6.674 2.66e-11 ***
## REVOKED 473.8986 155.1903 3.054 0.002268 **
## URBANICITY 1469.2228 135.8413 10.816 < 2e-16 ***
## HS 496.2580 120.3904 4.122 3.79e-05 ***
## NOHS 676.2812 153.2939 4.412 1.04e-05 ***
## MANAGER -884.6343 162.0158 -5.460 4.90e-08 ***
## MINIVAN -540.8132 117.7404 -4.593 4.43e-06 ***
## log_TIF 7.1657 2.1302 3.364 0.000772 ***
## log_MVR_PTS -2.8086 0.7578 -3.706 0.000212 ***
## cat_KIDSDRIV 682.5059 162.3736 4.203 2.66e-05 ***
## cat_OLDCLAIM 571.3661 114.3887 4.995 6.01e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared: 0.0617, Adjusted R-squared: 0.0602
## F-statistic: 41.21 on 13 and 8147 DF, p-value: < 2.2e-16
null <- glm(TARGET_FLAG ~ 0, family = binomial(link = "logit"), training_FLAG)
full <- glm(TARGET_FLAG ~ ., family = binomial(link = "logit"), training_FLAG)
aic_steps <- step(full, scope=list(lower=null, upper=full), direction="backward", k = 2, trace=F)
aic_steps$formula
## TARGET_FLAG ~ PARENT1 + MSTATUS + CAR_USE + REVOKED + URBANICITY +
## PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + STUDENT +
## BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
## log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
## cat_HOMEKIDS + cat_OLDCLAIM
backward_FLAG <- glm(aic_steps$formula, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(backward_FLAG)), 6)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.554923 0.160423 -15.926206 0.000000
## PARENT1 0.217757 0.119008 1.829758 0.067286
## MSTATUS -0.694028 0.074400 -9.328352 0.000000
## CAR_USE -0.688893 0.080778 -8.528279 0.000000
## REVOKED 0.733365 0.079366 9.240333 0.000000
## URBANICITY 2.249695 0.111388 20.196878 0.000000
## PHD -0.791338 0.133765 -5.915870 0.000000
## MASTERS -0.508002 0.105529 -4.813852 0.000001
## BACHELORS -0.485927 0.077640 -6.258734 0.000000
## CLERICAL 0.382111 0.106668 3.582245 0.000341
## MANAGER -0.690384 0.109689 -6.293993 0.000000
## STUDENT 0.184715 0.126539 1.459743 0.144361
## BLUE_COLLAR 0.250666 0.103458 2.422876 0.015398
## MINIVAN -0.435240 0.105575 -4.122578 0.000037
## PICKUP 0.219309 0.096226 2.279105 0.022661
## SPORTS 0.616909 0.118944 5.186530 0.000000
## SUV 0.364174 0.099838 3.647647 0.000265
## log_INCOME 0.100999 0.015677 6.442715 0.000000
## log_TRAVTIME -0.028743 0.007072 -4.064134 0.000048
## log_BLUEBOOK 0.059508 0.024005 2.479025 0.013174
## log_TIF 0.007729 0.001190 6.493015 0.000000
## log_MVR_PTS -0.002189 0.000437 -5.008679 0.000001
## cat_KIDSDRIV 0.542256 0.094714 5.725173 0.000000
## cat_HOMEKIDS 0.252943 0.087625 2.886646 0.003894
## cat_OLDCLAIM 0.557205 0.060751 9.171972 0.000000
#At a significance level of alpha = 0.05, the 24 backward-selected variables yield a TARGET_FLAG BLR model with two insignificant variables: PARENT1 and STUDENT. Removing those two insignificant variables yields a model with all significant variables
backward_FLAG <- glm(TARGET_FLAG ~ MSTATUS + CAR_USE + REVOKED + URBANICITY +
PHD + MASTERS + BACHELORS + CLERICAL + MANAGER +
BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
cat_HOMEKIDS + cat_OLDCLAIM, family = binomial(link = "logit"), training_FLAG)
summary(backward_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ MSTATUS + CAR_USE + REVOKED + URBANICITY +
## PHD + MASTERS + BACHELORS + CLERICAL + MANAGER + BLUE_COLLAR +
## MINIVAN + PICKUP + SPORTS + SUV + log_INCOME + log_TRAVTIME +
## log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS +
## cat_OLDCLAIM, family = binomial(link = "logit"), data = training_FLAG)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2016 -0.7231 -0.4212 0.6891 3.1351
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4312711 0.1486779 -16.353 < 2e-16 ***
## MSTATUS -0.7777777 0.0586439 -13.263 < 2e-16 ***
## CAR_USE -0.7155003 0.0787814 -9.082 < 2e-16 ***
## REVOKED 0.7343615 0.0793348 9.256 < 2e-16 ***
## URBANICITY 2.2363335 0.1109238 20.161 < 2e-16 ***
## PHD -0.8497653 0.1275975 -6.660 2.74e-11 ***
## MASTERS -0.5639725 0.0979155 -5.760 8.42e-09 ***
## BACHELORS -0.5101645 0.0752438 -6.780 1.20e-11 ***
## CLERICAL 0.3256558 0.0990172 3.289 0.00101 **
## MANAGER -0.7098209 0.1082712 -6.556 5.53e-11 ***
## BLUE_COLLAR 0.1852830 0.0928333 1.996 0.04595 *
## MINIVAN -0.4147417 0.1048748 -3.955 7.67e-05 ***
## PICKUP 0.2317109 0.0958995 2.416 0.01568 *
## SPORTS 0.6331507 0.1186074 5.338 9.39e-08 ***
## SUV 0.3802718 0.0993833 3.826 0.00013 ***
## log_INCOME 0.1067877 0.0151806 7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046 0.0070522 -4.042 5.30e-05 ***
## log_BLUEBOOK 0.0604612 0.0239746 2.522 0.01167 *
## log_TIF 0.0077168 0.0011898 6.486 8.82e-11 ***
## log_MVR_PTS -0.0021850 0.0004367 -5.004 5.63e-07 ***
## cat_KIDSDRIV 0.5297128 0.0942272 5.622 1.89e-08 ***
## cat_HOMEKIDS 0.3609836 0.0685862 5.263 1.42e-07 ***
## cat_OLDCLAIM 0.5555339 0.0607168 9.150 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7446.2 on 8138 degrees of freedom
## AIC: 7492.2
##
## Number of Fisher Scoring iterations: 5
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
## Loaded glmnet 4.1-7
# Demonstration of sparse storage with the Matrix package: the cross-product
# t(X) %*% X (Gram matrix) of the TARGET_AMT predictors
my_matrix <- as.matrix(training_AMT[, -1])
my_sparse_matrix <- as(my_matrix, "dgCMatrix")
result <- t(my_sparse_matrix) %*% my_sparse_matrix
#Lasso Regression
library(glmnet)
# Set lambda sequence
lambda_seq <- 10^seq(10, -2, length = 100)
# Prepare training data
training_AMT <- M[1:n, X[,"TARGET_AMT"]]
training_FLAG <- M[1:n, X[,"TARGET_FLAG"]]
x_AMT <- model.matrix(TARGET_AMT ~ ., data = training_AMT)[,-1]
x_FLAG <- model.matrix(TARGET_FLAG ~ ., data = training_FLAG)[,-1]
y_AMT <- training_AMT$TARGET_AMT
y_FLAG <- training_FLAG$TARGET_FLAG
# Fit Lasso regression for TARGET_AMT
cv_AMT <- cv.glmnet(x_AMT, y_AMT, alpha = 1, lambda = lambda_seq)
best_AMT <- glmnet(x_AMT, y_AMT, alpha = 1, lambda = cv_AMT$lambda.min)
# Fit Lasso regression for TARGET_FLAG
cv_FLAG <- cv.glmnet(x_FLAG, y_FLAG, alpha = 1, lambda = lambda_seq, family = "binomial")
best_FLAG <- glmnet(x_FLAG, y_FLAG, alpha = 1, lambda = cv_FLAG$lambda.min, family = "binomial")
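# cv.glmnet also reports a more conservative choice, lambda.1se (the largest lambda within
# one standard error of the minimum CV error); a sketch of refitting TARGET_AMT with it:
alt_AMT <- glmnet(x_AMT, y_AMT, alpha = 1, lambda = cv_AMT$lambda.1se)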
# Print coefficients
round(coef(best_AMT), 6)
## 30 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 683.212749
## AGE 2.996552
## PARENT1 481.144847
## MSTATUS -620.166789
## CAR_USE -719.826794
## REVOKED 437.760410
## URBANICITY 1473.526491
## PHD -258.016305
## MASTERS -94.320812
## HS 349.285622
## NOHS 508.319844
## MANAGER -813.282712
## STUDENT .
## BLUE_COLLAR 78.957225
## MINIVAN -476.355151
## TRUCK .
## PICKUP -85.379281
## SPORTS 256.502591
## VAN 79.646690
## log_INCOME 45.478168
## log_TRAVTIME -15.468947
## log_BLUEBOOK -25.189019
## log_TIF 6.406495
## log_MVR_PTS -2.683490
## cat_KIDSDRIV 553.361153
## cat_HOMEKIDS 196.950690
## cat_YOJ .
## cat_OLDCLAIM 542.017591
## cat_CLM_FREQ 0.000000
## cat_CAR_AGE .
round(coef(best_FLAG), 6)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.436792
## AGE -0.000177
## PARENT1 0.242626
## MSTATUS -0.483827
## CAR_USE -0.406180
## REVOKED 0.583235
## URBANICITY 1.703912
## PHD -0.201795
## MASTERS -0.034306
## BACHELORS .
## HS 0.415456
## NOHS 0.344233
## CLERICAL 0.125212
## MANAGER -0.567087
## PROF .
## STUDENT 0.063044
## BLUE_COLLAR 0.163277
## MINIVAN -0.523709
## PICKUP .
## SPORTS 0.161783
## SUV 0.004017
## log_INCOME 0.077110
## log_TRAVTIME -0.010885
## log_BLUEBOOK 0.010419
## log_TIF 0.004613
## log_MVR_PTS -0.001580
## cat_KIDSDRIV 0.358394
## cat_HOMEKIDS 0.209235
## cat_YOJ .
## cat_OLDCLAIM 0.527077
## cat_CLM_FREQ 0.000000
## cat_CAR_AGE .
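# A sketch of generating predictions from the fitted lasso models:
pred_AMT <- predict(best_AMT, newx = x_AMT) # fitted claim amounts
pred_FLAG <- predict(best_FLAG, newx = x_FLAG, type = "response") # predicted crash probabilities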
install.packages("leaps")
## package 'leaps' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\91976\AppData\Local\Temp\RtmpeATJFy\downloaded_packages
library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
model_sum_AMT <- summary(regsubsets(TARGET_AMT ~ ., training_AMT, nvmax=ncol(training_AMT)))
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 1 linear dependencies found
## Reordering variables and trying again:
model_sum_FLAG <- summary(regsubsets(TARGET_FLAG ~ ., training_FLAG, nvmax=ncol(training_FLAG)))
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 2 linear dependencies found
## Reordering variables and trying again:
par(mfrow=c(1,2))
plot(model_sum_AMT$adjr2, xlab = "Number of Variables", ylab = "Adj R-squared", main="TARGET_AMT")
plot(model_sum_FLAG$adjr2, xlab = "Number of Variables", ylab = "Adj R-squared", main="TARGET_FLAG")
cbind(max(model_sum_AMT$adjr2), which.max(model_sum_AMT$adjr2))
## [,1] [,2]
## [1,] 0.06123634 19
cbind(max(model_sum_FLAG$adjr2), which.max(model_sum_FLAG$adjr2))
## [,1] [,2]
## [1,] 0.211793 24
#The maximum adjusted R^2 of 0.0612363 for the model predicting TARGET_AMT is reached when the model contains 19 variables; a sketch of the criterion follows
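#For reference, a sketch of the adjusted R^2 criterion that regsubsets() reports:
adj_r2 <- function(r2, n, p) 1 - (1 - r2) * (n - 1) / (n - p - 1) # penalizes each added predictor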
model_sum_AMT$which[which.max(model_sum_AMT$adjr2), ]
## (Intercept) AGE PARENT1 MSTATUS CAR_USE REVOKED
## TRUE FALSE TRUE TRUE TRUE TRUE
## URBANICITY PHD MASTERS HS NOHS MANAGER
## TRUE TRUE FALSE TRUE TRUE TRUE
## STUDENT BLUE_COLLAR MINIVAN TRUCK PICKUP SPORTS
## FALSE FALSE TRUE FALSE TRUE TRUE
## VAN log_INCOME log_TRAVTIME log_BLUEBOOK log_TIF log_MVR_PTS
## FALSE TRUE TRUE FALSE TRUE TRUE
## cat_KIDSDRIV cat_HOMEKIDS cat_YOJ cat_OLDCLAIM cat_CLM_FREQ cat_CAR_AGE
## TRUE TRUE FALSE FALSE TRUE FALSE
adjustedr2_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + PHD + HS + NOHS + MANAGER + MINIVAN + PICKUP +
SPORTS + log_INCOME + log_TRAVTIME + log_TIF + log_MVR_PTS +
cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ, training_AMT)
round(coef(summary(adjustedr2_AMT)), 6)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 855.350195 190.724913 4.484732 0.000007
## PARENT1 477.258229 217.200288 2.197319 0.028026
## MSTATUS -647.893956 127.782128 -5.070302 0.000000
## CAR_USE -803.418566 113.162687 -7.099677 0.000000
## REVOKED 464.971703 155.173002 2.996473 0.002739
## URBANICITY 1524.033915 136.731529 11.146178 0.000000
## PHD -252.479571 185.347297 -1.362197 0.173173
## HS 425.653740 124.349425 3.423046 0.000622
## NOHS 616.658877 157.387319 3.918098 0.000090
## MANAGER -843.520474 162.559687 -5.188989 0.000000
## MINIVAN -514.629399 126.582870 -4.065553 0.000048
## PICKUP -165.005969 145.517170 -1.133928 0.256858
## SPORTS 275.376716 171.928269 1.601695 0.109262
## log_INCOME 45.399087 25.548734 1.776960 0.075612
## log_TRAVTIME -18.536784 11.106373 -1.669022 0.095151
## log_TIF 7.028143 2.130404 3.298971 0.000975
## log_MVR_PTS -2.790571 0.757820 -3.682366 0.000233
## cat_KIDSDRIV 601.792137 177.410599 3.392087 0.000697
## cat_HOMEKIDS 179.571417 150.040877 1.196817 0.231413
## cat_CLM_FREQ 540.063343 114.635246 4.711146 0.000003
#At a significance level of alpha = 0.05, the adjusted-R^2-selected variables yield a TARGET_AMT MLR model with six insignificant variables: PHD, PICKUP, SPORTS, log_INCOME, log_TRAVTIME, and cat_HOMEKIDS. Removing those six insignificant variables yields a model with all significant variables
adjustedr2_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF +
log_MVR_PTS + cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
summary(adjustedr2_AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS +
## cat_KIDSDRIV + cat_CLM_FREQ, data = training_AMT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4997 -1694 -819 354 104984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 774.2430 182.3082 4.247 2.19e-05 ***
## PARENT1 655.1922 177.4678 3.692 0.000224 ***
## MSTATUS -581.7175 119.7388 -4.858 1.21e-06 ***
## CAR_USE -730.5026 109.4617 -6.674 2.66e-11 ***
## REVOKED 473.8986 155.1903 3.054 0.002268 **
## URBANICITY 1469.2228 135.8413 10.816 < 2e-16 ***
## HS 496.2580 120.3904 4.122 3.79e-05 ***
## NOHS 676.2812 153.2939 4.412 1.04e-05 ***
## MANAGER -884.6343 162.0158 -5.460 4.90e-08 ***
## MINIVAN -540.8132 117.7404 -4.593 4.43e-06 ***
## log_TIF 7.1657 2.1302 3.364 0.000772 ***
## log_MVR_PTS -2.8086 0.7578 -3.706 0.000212 ***
## cat_KIDSDRIV 682.5059 162.3736 4.203 2.66e-05 ***
## cat_CLM_FREQ 571.3661 114.3887 4.995 6.01e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared: 0.0617, Adjusted R-squared: 0.0602
## F-statistic: 41.21 on 13 and 8147 DF, p-value: < 2.2e-16
#This model is essentially the same as the backward elimination model
#The maximum adjusted R^2 of 0.211793 for the model predicting TARGET_FLAG is reached when the model contains 24 variables.
model_sum_FLAG$which[which.max(model_sum_FLAG$adjr2), ]
## (Intercept) AGE PARENT1 MSTATUS CAR_USE REVOKED
## TRUE FALSE TRUE TRUE TRUE TRUE
## URBANICITY PHD MASTERS BACHELORS HS NOHS
## TRUE TRUE TRUE TRUE FALSE FALSE
## CLERICAL MANAGER PROF STUDENT BLUE_COLLAR MINIVAN
## TRUE TRUE FALSE TRUE TRUE TRUE
## PICKUP SPORTS SUV log_INCOME log_TRAVTIME log_BLUEBOOK
## TRUE TRUE TRUE TRUE TRUE TRUE
## log_TIF log_MVR_PTS cat_KIDSDRIV cat_HOMEKIDS cat_YOJ cat_OLDCLAIM
## TRUE TRUE TRUE TRUE FALSE FALSE
## cat_CLM_FREQ cat_CAR_AGE
## TRUE FALSE
adjustedr2_FLAG <- glm(TARGET_FLAG ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER +
STUDENT + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), training_FLAG)
round(coef(summary(adjustedr2_FLAG)), 6)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.554923 0.160423 -15.926206 0.000000
## PARENT1 0.217757 0.119008 1.829758 0.067286
## MSTATUS -0.694028 0.074400 -9.328352 0.000000
## CAR_USE -0.688893 0.080778 -8.528279 0.000000
## REVOKED 0.733365 0.079366 9.240333 0.000000
## URBANICITY 2.249695 0.111388 20.196878 0.000000
## PHD -0.791338 0.133765 -5.915870 0.000000
## MASTERS -0.508002 0.105529 -4.813852 0.000001
## BACHELORS -0.485927 0.077640 -6.258734 0.000000
## CLERICAL 0.382111 0.106668 3.582245 0.000341
## MANAGER -0.690384 0.109689 -6.293993 0.000000
## STUDENT 0.184715 0.126539 1.459743 0.144361
## BLUE_COLLAR 0.250666 0.103458 2.422876 0.015398
## MINIVAN -0.435240 0.105575 -4.122578 0.000037
## PICKUP 0.219309 0.096226 2.279105 0.022661
## SPORTS 0.616909 0.118944 5.186530 0.000000
## SUV 0.364174 0.099838 3.647647 0.000265
## log_INCOME 0.100999 0.015677 6.442715 0.000000
## log_TRAVTIME -0.028743 0.007072 -4.064134 0.000048
## log_BLUEBOOK 0.059508 0.024005 2.479025 0.013174
## log_TIF 0.007729 0.001190 6.493015 0.000000
## log_MVR_PTS -0.002189 0.000437 -5.008679 0.000001
## cat_KIDSDRIV 0.542256 0.094714 5.725173 0.000000
## cat_HOMEKIDS 0.252943 0.087625 2.886646 0.003894
## cat_CLM_FREQ 0.557205 0.060751 9.171972 0.000000
#At a significance level of alpha=0.05, the Adjusted R^2-selected variables yield a TARGET_FLAG BLR model with two insignificant variables: PARENT1 and STUDENT. Removing those two variables yields a model in which all remaining variables are significant.
adjustedr2_FLAG <- glm(TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED +
URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER +
BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"), training_FLAG)
summary(adjustedr2_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER +
## BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
## log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
## cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"),
## data = training_FLAG)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2016 -0.7231 -0.4212 0.6891 3.1351
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4312711 0.1486779 -16.353 < 2e-16 ***
## MSTATUS -0.7777777 0.0586439 -13.263 < 2e-16 ***
## CAR_USE -0.7155003 0.0787814 -9.082 < 2e-16 ***
## REVOKED 0.7343615 0.0793348 9.256 < 2e-16 ***
## URBANICITY 2.2363335 0.1109238 20.161 < 2e-16 ***
## PHD -0.8497653 0.1275975 -6.660 2.74e-11 ***
## MASTERS -0.5639725 0.0979155 -5.760 8.42e-09 ***
## BACHELORS -0.5101645 0.0752438 -6.780 1.20e-11 ***
## CLERICAL 0.3256558 0.0990172 3.289 0.00101 **
## MANAGER -0.7098209 0.1082712 -6.556 5.53e-11 ***
## BLUE_COLLAR 0.1852830 0.0928333 1.996 0.04595 *
## MINIVAN -0.4147417 0.1048748 -3.955 7.67e-05 ***
## PICKUP 0.2317109 0.0958995 2.416 0.01568 *
## SPORTS 0.6331507 0.1186074 5.338 9.39e-08 ***
## SUV 0.3802718 0.0993833 3.826 0.00013 ***
## log_INCOME 0.1067877 0.0151806 7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046 0.0070522 -4.042 5.30e-05 ***
## log_BLUEBOOK 0.0604612 0.0239746 2.522 0.01167 *
## log_TIF 0.0077168 0.0011898 6.486 8.82e-11 ***
## log_MVR_PTS -0.0021850 0.0004367 -5.004 5.63e-07 ***
## cat_KIDSDRIV 0.5297128 0.0942272 5.622 1.89e-08 ***
## cat_HOMEKIDS 0.3609836 0.0685862 5.263 1.42e-07 ***
## cat_CLM_FREQ 0.5555339 0.0607168 9.150 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7446.2 on 8138 degrees of freedom
## AIC: 7492.2
##
## Number of Fisher Scoring iterations: 5
par(mfrow=c(1,2))
plot(model_sum_AMT$cp, xlab = "Number of Variables", ylab = "Mallows' Cp", main="TARGET_AMT")
plot(model_sum_FLAG$cp, xlab = "Number of Variables", ylab = "Mallows' Cp", main="TARGET_FLAG")
cbind(min(model_sum_AMT$cp), which.min(model_sum_AMT$cp))
## [,1] [,2]
## [1,] 12.81883 16
cbind(min(model_sum_FLAG$cp), which.min(model_sum_FLAG$cp))
## [,1] [,2]
## [1,] 19.25552 24
#The minimum Mallows' Cp of 12.8188345 for the model predicting TARGET_AMT is reached when the model contains 16 variables.
model_sum_AMT$which[which.min(model_sum_AMT$cp), ]
## (Intercept) AGE PARENT1 MSTATUS CAR_USE REVOKED
## TRUE FALSE TRUE TRUE TRUE TRUE
## URBANICITY PHD MASTERS HS NOHS MANAGER
## TRUE FALSE FALSE TRUE TRUE TRUE
## STUDENT BLUE_COLLAR MINIVAN TRUCK PICKUP SPORTS
## FALSE FALSE TRUE FALSE FALSE TRUE
## VAN log_INCOME log_TRAVTIME log_BLUEBOOK log_TIF log_MVR_PTS
## FALSE TRUE TRUE FALSE TRUE TRUE
## cat_KIDSDRIV cat_HOMEKIDS cat_YOJ cat_OLDCLAIM cat_CLM_FREQ cat_CAR_AGE
## TRUE FALSE FALSE FALSE TRUE FALSE
bias_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + SPORTS +
log_INCOME + log_TRAVTIME + log_TIF + log_MVR_PTS +
cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
round(coef(summary(bias_AMT)), 6)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 762.784417 183.151125 4.164782 0.000031
## PARENT1 640.018132 177.569345 3.604328 0.000315
## MSTATUS -586.551921 119.723526 -4.899220 0.000001
## CAR_USE -785.707613 111.900046 -7.021513 0.000000
## REVOKED 470.841555 155.143313 3.034881 0.002414
## URBANICITY 1505.218992 136.333458 11.040716 0.000000
## HS 470.034608 120.940482 3.886495 0.000103
## NOHS 657.777887 153.639789 4.281299 0.000019
## MANAGER -851.651499 162.435546 -5.243012 0.000000
## MINIVAN -471.449326 121.477290 -3.880967 0.000105
## SPORTS 322.212013 168.353540 1.913901 0.055668
## log_INCOME 46.035671 25.544003 1.802210 0.071549
## log_TRAVTIME -18.636693 11.105620 -1.678132 0.093360
## log_TIF 6.989195 2.130538 3.280483 0.001041
## log_MVR_PTS -2.773052 0.757597 -3.660325 0.000253
## cat_KIDSDRIV 691.794967 162.331388 4.261622 0.000021
## cat_CLM_FREQ 552.008131 114.513722 4.820454 0.000001
#At a significance level of alpha=0.05, the Cp-selected variables yield a TARGET_AMT MLR model with three insignificant variables: SPORTS, log_INCOME, and log_TRAVTIME. Removing those three variables yields a model in which all remaining variables are significant.
bias_AMT <- lm(TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF +
log_MVR_PTS + cat_KIDSDRIV + cat_CLM_FREQ, training_AMT)
summary(bias_AMT)
##
## Call:
## lm(formula = TARGET_AMT ~ 1 + PARENT1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + HS + NOHS + MANAGER + MINIVAN + log_TIF + log_MVR_PTS +
## cat_KIDSDRIV + cat_CLM_FREQ, data = training_AMT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4997 -1694 -819 354 104984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 774.2430 182.3082 4.247 2.19e-05 ***
## PARENT1 655.1922 177.4678 3.692 0.000224 ***
## MSTATUS -581.7175 119.7388 -4.858 1.21e-06 ***
## CAR_USE -730.5026 109.4617 -6.674 2.66e-11 ***
## REVOKED 473.8986 155.1903 3.054 0.002268 **
## URBANICITY 1469.2228 135.8413 10.816 < 2e-16 ***
## HS 496.2580 120.3904 4.122 3.79e-05 ***
## NOHS 676.2812 153.2939 4.412 1.04e-05 ***
## MANAGER -884.6343 162.0158 -5.460 4.90e-08 ***
## MINIVAN -540.8132 117.7404 -4.593 4.43e-06 ***
## log_TIF 7.1657 2.1302 3.364 0.000772 ***
## log_MVR_PTS -2.8086 0.7578 -3.706 0.000212 ***
## cat_KIDSDRIV 682.5059 162.3736 4.203 2.66e-05 ***
## cat_CLM_FREQ 571.3661 114.3887 4.995 6.01e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4560 on 8147 degrees of freedom
## Multiple R-squared: 0.0617, Adjusted R-squared: 0.0602
## F-statistic: 41.21 on 13 and 8147 DF, p-value: < 2.2e-16
#The minimum Mallows' Cp of 19.2555192 for the model predicting TARGET_FLAG is reached when the model contains 24 variables.
model_sum_FLAG$which[which.min(model_sum_FLAG$cp), ]
## (Intercept) AGE PARENT1 MSTATUS CAR_USE REVOKED
## TRUE FALSE TRUE TRUE TRUE TRUE
## URBANICITY PHD MASTERS BACHELORS HS NOHS
## TRUE TRUE TRUE TRUE FALSE FALSE
## CLERICAL MANAGER PROF STUDENT BLUE_COLLAR MINIVAN
## TRUE TRUE FALSE TRUE TRUE TRUE
## PICKUP SPORTS SUV log_INCOME log_TRAVTIME log_BLUEBOOK
## TRUE TRUE TRUE TRUE TRUE TRUE
## log_TIF log_MVR_PTS cat_KIDSDRIV cat_HOMEKIDS cat_YOJ cat_OLDCLAIM
## TRUE TRUE TRUE TRUE FALSE FALSE
## cat_CLM_FREQ cat_CAR_AGE
## TRUE FALSE
bias_FLAG <- glm(TARGET_FLAG ~ 1 + PARENT1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL +
MANAGER + STUDENT + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS +
SUV + log_INCOME + log_TRAVTIME + log_BLUEBOOK + log_TIF +
log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ,
family = binomial(link = "logit"), training_FLAG)
round(coef(summary(bias_FLAG)), 6)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.554923 0.160423 -15.926206 0.000000
## PARENT1 0.217757 0.119008 1.829758 0.067286
## MSTATUS -0.694028 0.074400 -9.328352 0.000000
## CAR_USE -0.688893 0.080778 -8.528279 0.000000
## REVOKED 0.733365 0.079366 9.240333 0.000000
## URBANICITY 2.249695 0.111388 20.196878 0.000000
## PHD -0.791338 0.133765 -5.915870 0.000000
## MASTERS -0.508002 0.105529 -4.813852 0.000001
## BACHELORS -0.485927 0.077640 -6.258734 0.000000
## CLERICAL 0.382111 0.106668 3.582245 0.000341
## MANAGER -0.690384 0.109689 -6.293993 0.000000
## STUDENT 0.184715 0.126539 1.459743 0.144361
## BLUE_COLLAR 0.250666 0.103458 2.422876 0.015398
## MINIVAN -0.435240 0.105575 -4.122578 0.000037
## PICKUP 0.219309 0.096226 2.279105 0.022661
## SPORTS 0.616909 0.118944 5.186530 0.000000
## SUV 0.364174 0.099838 3.647647 0.000265
## log_INCOME 0.100999 0.015677 6.442715 0.000000
## log_TRAVTIME -0.028743 0.007072 -4.064134 0.000048
## log_BLUEBOOK 0.059508 0.024005 2.479025 0.013174
## log_TIF 0.007729 0.001190 6.493015 0.000000
## log_MVR_PTS -0.002189 0.000437 -5.008679 0.000001
## cat_KIDSDRIV 0.542256 0.094714 5.725173 0.000000
## cat_HOMEKIDS 0.252943 0.087625 2.886646 0.003894
## cat_CLM_FREQ 0.557205 0.060751 9.171972 0.000000
#At a significance level of alpha=0.05, the Cp-selected variables yield a TARGET_FLAG BLR model with two insignificant variables: PARENT1 and STUDENT. Removing those two variables yields a model in which all remaining variables are significant.
bias_FLAG <- glm(TARGET_FLAG ~ 1 + MSTATUS + CAR_USE +
REVOKED + URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL +
MANAGER + BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV +
log_INCOME + log_TRAVTIME + log_BLUEBOOK + log_TIF +
log_MVR_PTS + cat_KIDSDRIV + cat_HOMEKIDS + cat_CLM_FREQ,
family = binomial(link = "logit"), training_FLAG)
summary(bias_FLAG)
##
## Call:
## glm(formula = TARGET_FLAG ~ 1 + MSTATUS + CAR_USE + REVOKED +
## URBANICITY + PHD + MASTERS + BACHELORS + CLERICAL + MANAGER +
## BLUE_COLLAR + MINIVAN + PICKUP + SPORTS + SUV + log_INCOME +
## log_TRAVTIME + log_BLUEBOOK + log_TIF + log_MVR_PTS + cat_KIDSDRIV +
## cat_HOMEKIDS + cat_CLM_FREQ, family = binomial(link = "logit"),
## data = training_FLAG)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2016 -0.7231 -0.4212 0.6891 3.1351
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4312711 0.1486779 -16.353 < 2e-16 ***
## MSTATUS -0.7777777 0.0586439 -13.263 < 2e-16 ***
## CAR_USE -0.7155003 0.0787814 -9.082 < 2e-16 ***
## REVOKED 0.7343615 0.0793348 9.256 < 2e-16 ***
## URBANICITY 2.2363335 0.1109238 20.161 < 2e-16 ***
## PHD -0.8497653 0.1275975 -6.660 2.74e-11 ***
## MASTERS -0.5639725 0.0979155 -5.760 8.42e-09 ***
## BACHELORS -0.5101645 0.0752438 -6.780 1.20e-11 ***
## CLERICAL 0.3256558 0.0990172 3.289 0.00101 **
## MANAGER -0.7098209 0.1082712 -6.556 5.53e-11 ***
## BLUE_COLLAR 0.1852830 0.0928333 1.996 0.04595 *
## MINIVAN -0.4147417 0.1048748 -3.955 7.67e-05 ***
## PICKUP 0.2317109 0.0958995 2.416 0.01568 *
## SPORTS 0.6331507 0.1186074 5.338 9.39e-08 ***
## SUV 0.3802718 0.0993833 3.826 0.00013 ***
## log_INCOME 0.1067877 0.0151806 7.034 2.00e-12 ***
## log_TRAVTIME -0.0285046 0.0070522 -4.042 5.30e-05 ***
## log_BLUEBOOK 0.0604612 0.0239746 2.522 0.01167 *
## log_TIF 0.0077168 0.0011898 6.486 8.82e-11 ***
## log_MVR_PTS -0.0021850 0.0004367 -5.004 5.63e-07 ***
## cat_KIDSDRIV 0.5297128 0.0942272 5.622 1.89e-08 ***
## cat_HOMEKIDS 0.3609836 0.0685862 5.263 1.42e-07 ***
## cat_CLM_FREQ 0.5555339 0.0607168 9.150 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7446.2 on 8138 degrees of freedom
## AIC: 7492.2
##
## Number of Fisher Scoring iterations: 5
sum1 <- summary(forward_AMT)
sum2 <- summary(backward_AMT)
sum3 <- summary(adjustedr2_AMT)
sum4 <- summary(bias_AMT)
#Multicollinearity and residual autocorrelation checks
library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(forward_AMT)
##
## Durbin-Watson test
##
## data: forward_AMT
## DW = 1.988, p-value = 0.294
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(backward_AMT)
##
## Durbin-Watson test
##
## data: backward_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(adjustedr2_AMT)
##
## Durbin-Watson test
##
## data: adjustedr2_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0
dwtest(bias_AMT)
##
## Durbin-Watson test
##
## data: bias_AMT
## DW = 1.988, p-value = 0.2932
## alternative hypothesis: true autocorrelation is greater than 0
#The Durbin-Watson test's null hypothesis is that the residuals are not autocorrelated (it does not test multicollinearity directly). Since the p-values are large, we fail to reject the null hypothesis of no autocorrelation.
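#Since this section's concern is multicollinearity, variance inflation factors are the standard direct check; a minimal sketch, assuming the car package is installed:
library(car)
sort(vif(forward_AMT), decreasing = TRUE) #values above roughly 5-10 flag problematic collinearity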
summary(M[1:n, "TARGET_AMT"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 1504 1036 107586
data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "bias_AMT"),
"MSE" = c(sum1$sigma^2, sum2$sigma^2, sum3$sigma^2, sum4$sigma^2),
"RMSE" = c(sum1$sigma, sum2$sigma, sum3$sigma, sum4$sigma))
## MODEL MSE RMSE
## 1 forward_AMT 20795666 4560.227
## 2 backward_AMT 20795771 4560.238
## 3 adjustedr2_AMT 20795771 4560.238
## 4 bias_AMT 20795771 4560.238
#Mean squared error is the square of RMSE. The residual standard error (RMSE) is fairly large relative to the target variable: in these models, the standard deviation of the unexplained variation in TARGET_AMT is in the neighborhood of 4560, a large deviation from the 1504.325 average claim encountered in the data.
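#A quick scale check, as a sketch: compare each model's RMSE to the standard deviation of the target itself (assumes sum1..sum4 and training_AMT are in scope)
rmse <- sapply(list(sum1, sum2, sum3, sum4), `[[`, "sigma")
round(rmse / sd(training_AMT$TARGET_AMT), 3) #ratios near 1 mean little variance is explained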
#R^2
data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "biascp_AMT"),
"R.SQUARED" = c(sum1$r.squared, sum2$r.squared, sum3$r.squared, sum4$r.squared),
"ADJ.R.SQUARED" = c(sum1$adj.r.squared, sum2$adj.r.squared, sum3$adj.r.squared, sum4$adj.r.squared))
## MODEL R.SQUARED ADJ.R.SQUARED
## 1 forward_AMT 0.14876730 0.14730452
## 2 backward_AMT 0.06169727 0.06020004
## 3 adjustedr2_AMT 0.06169727 0.06020004
## 4 biascp_AMT 0.06169727 0.06020004
#R^2 is fairly low for these models. R^2, however, is not an adequate performance measure here; Adjusted R^2 is more appropriate when models have multiple variables, since it incorporates a penalty to account for the decrease in degrees of freedom from additional variables.
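#Sketch verifying the Adjusted R^2 penalty from a stored summary: adj.R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n - p - 1 is the residual df
adj_r2_check <- function(s) {
  p <- unname(s$fstatistic["numdf"]) #number of predictors
  df_res <- unname(s$fstatistic["dendf"]) #residual degrees of freedom
  1 - (1 - s$r.squared) * (p + df_res) / df_res #n - 1 = p + df_res
}
adj_r2_check(sum1) #should match sum1$adj.r.squared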
data.frame("MODEL" = c("forward_AMT", "backward_AMT", "adjustedr2_AMT", "biascp_AMT"),
rbind(sum1$fstatistic, sum2$fstatistic, sum3$fstatistic, sum4$fstatistic))
## MODEL value numdf dendf
## 1 forward_AMT 101.7019 14 8147
## 2 backward_AMT 41.2076 13 8147
## 3 adjustedr2_AMT 41.2076 13 8147
## 4 biascp_AMT 41.2076 13 8147
#The F-test evaluates the null hypothesis that all regression coefficients are equal to zero against the alternative that at least one is nonzero.
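#Sketch: the overall-F p-value can be recovered from the stored statistic (using sum1 from summary(forward_AMT))
f <- sum1$fstatistic
pf(f["value"], f["numdf"], f["dendf"], lower.tail = FALSE)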
par(mfrow = c(2,2))
plot(forward_AMT)
par(mfrow = c(2,2))
plot(backward_AMT)
par(mfrow = c(2,2))
plot(adjustedr2_AMT)
par(mfrow = c(2,2))
plot(bias_AMT)
#The Residuals vs Fitted plots suggest the residuals do not follow the expected flat, patternless band, and the Normal Q-Q plots show that the residuals are not normally distributed.
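#As a formal complement to these visual diagnostics, a heteroscedasticity test can be run on the chosen model; a minimal sketch using lmtest (already loaded above):
bptest(forward_AMT) #Breusch-Pagan test; a small p-value indicates non-constant error variance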
#The model derived using Forward Selection has the lowest RMSE, although only by a minimal amount. When it comes to the F-statistic and Adjusted R^2, however, the forward_AMT model has values that are substantially higher. The chosen model for TARGET_AMT is therefore forward_AMT.
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
training_FLAG[ ,"probability.forward"] <- predict(forward_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.forward"] <- ifelse(training_FLAG$probability.forward < 0.5, 0, 1)
training_FLAG$class.forward <- factor(training_FLAG$class.forward, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))
(cm1 <- confusionMatrix(training_FLAG$class.forward, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5533 1297
## 1 475 856
##
## Accuracy : 0.7829
## 95% CI : (0.7738, 0.7918)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.363
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3976
## Specificity : 0.9209
## Pos Pred Value : 0.6431
## Neg Pred Value : 0.8101
## Prevalence : 0.2638
## Detection Rate : 0.1049
## Detection Prevalence : 0.1631
## Balanced Accuracy : 0.6593
##
## 'Positive' Class : 1
##
#The model derived using Forward Selection has the following performance metrics: Accuracy of 0.7828697, Error Rate of 0.2171303, Precision of 0.6431255, Sensitivity of 0.3975848, Specificity of 0.9209387, and F1 Score of 0.4913892.
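#Sketch: these metrics can be pulled directly from the caret object rather than retyped (using cm1 from above)
acc <- unname(cm1$overall["Accuracy"])
prec <- unname(cm1$byClass["Pos Pred Value"])
sens <- unname(cm1$byClass["Sensitivity"])
spec <- unname(cm1$byClass["Specificity"])
c(Accuracy = acc, ErrorRate = 1 - acc, Precision = prec,
  Sensitivity = sens, Specificity = spec, F1 = 2 * prec * sens / (prec + sens))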
training_FLAG[ ,"probability.backward"] <- predict(backward_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.backward"] <- ifelse(training_FLAG$probability.backward < 0.5, 0, 1)
training_FLAG$class.backward <- factor(training_FLAG$class.backward, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))
(cm2 <- confusionMatrix(training_FLAG$class.backward, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5536 1295
## 1 472 858
##
## Accuracy : 0.7835
## 95% CI : (0.7744, 0.7924)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3647
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3985
## Specificity : 0.9214
## Pos Pred Value : 0.6451
## Neg Pred Value : 0.8104
## Prevalence : 0.2638
## Detection Rate : 0.1051
## Detection Prevalence : 0.1630
## Balanced Accuracy : 0.6600
##
## 'Positive' Class : 1
##
#The model derived using Backward Elimination has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of 0.6451128, Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787
training_FLAG[ ,"probability.adjustedr2"] <- predict(adjustedr2_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.adjustedr2"] <- ifelse(training_FLAG$probability.adjustedr2 < 0.5, 0, 1)
training_FLAG$class.adjustedr2 <- factor(training_FLAG$class.adjustedr2, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))
(cm3 <- confusionMatrix(training_FLAG$class.adjustedr2, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5536 1295
## 1 472 858
##
## Accuracy : 0.7835
## 95% CI : (0.7744, 0.7924)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3647
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3985
## Specificity : 0.9214
## Pos Pred Value : 0.6451
## Neg Pred Value : 0.8104
## Prevalence : 0.2638
## Detection Rate : 0.1051
## Detection Prevalence : 0.1630
## Balanced Accuracy : 0.6600
##
## 'Positive' Class : 1
##
#The model derived using Adjusted R^2 has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of 0.6451128, Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787. These metrics are identical to those from Backward Elimination since, as previously mentioned, both models are identical.
training_FLAG[ ,"probability.bias"] <- predict(bias_FLAG, training_FLAG, type="response")
training_FLAG[ ,"class.bias"] <- ifelse(training_FLAG$probability.bias < 0.5, 0, 1)
training_FLAG$class.bias <- factor(training_FLAG$class.bias, levels = c("0", "1"))
training_FLAG$TARGET_FLAG <- factor(training_FLAG$TARGET_FLAG, levels = c("0", "1"))
(cm4 <- confusionMatrix(training_FLAG$class.bias, training_FLAG$TARGET_FLAG, positive = "1"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5536 1295
## 1 472 858
##
## Accuracy : 0.7835
## 95% CI : (0.7744, 0.7924)
## No Information Rate : 0.7362
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3647
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3985
## Specificity : 0.9214
## Pos Pred Value : 0.6451
## Neg Pred Value : 0.8104
## Prevalence : 0.2638
## Detection Rate : 0.1051
## Detection Prevalence : 0.1630
## Balanced Accuracy : 0.6600
##
## 'Positive' Class : 1
##
#The model derived using Mallows' Cp has the following performance metrics: Accuracy of 0.7834824, Error Rate of 0.2165176, Precision of 0.6451128, Sensitivity of 0.3985137, Specificity of 0.9214381, and F1 Score of 0.4926787. These metrics are identical to those from Backward Elimination since, as previously mentioned, both models are identical.
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
par(mfrow=c(2,2))
training_FLAG$class.forward <- as.numeric(training_FLAG$class.forward)
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$class.forward, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
training_FLAG$class.backward <- as.numeric(training_FLAG$class.backward)
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$class.backward, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$probability.adjustedr2, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc(training_FLAG$TARGET_FLAG, training_FLAG$probability.bias, smooth=F), print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#The models with the highest Accuracy, derived using Backward Elimination, Adjusted R^2, and Mallows' Cp, yielded identical results. The models with the greatest Area Under the ROC Curve, however, were adjustedr2_FLAG and bias_FLAG; note that their ROC curves above were built from predicted probabilities, while the forward and backward curves were built from hard 0/1 class labels, which typically depresses AUC. As such, backward_FLAG is eliminated, and we are indifferent between the adjustedr2_FLAG and bias_FLAG models for TARGET_FLAG.
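#Sketch: for a like-for-like comparison, the forward model's AUC can be recomputed from its predicted probabilities instead of hard class labels
auc(roc(training_FLAG$TARGET_FLAG, training_FLAG$probability.forward))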
validation <- M[(1+n):(m+n),]
probability <- predict(adjustedr2_FLAG, validation, type="response")
predict_FLAG <- ifelse(probability >= .5, 1, 0)
predict_AMT <- predict(forward_AMT, validation)
predict_AMT[predict_FLAG == 0] <- 0
predictions <- data.frame("predict_FLAG" = predict_FLAG, "predict_AMT" = predict_AMT)
head(predictions)
## predict_FLAG predict_AMT
## 8162 0 0
## 8163 0 0
## 8164 0 0
## 8165 0 0
## 8166 0 0
## 8167 0 0
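#Sketch: the scored evaluation set could be written out for delivery (the file name here is an assumption)
write.csv(predictions, "insurance_predictions.csv", row.names = FALSE)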
#Note: as.numeric() on the 0/1 factor recodes its levels to 1 and 2, so n below is N plus the number of positives rather than a count of positives; this block also reuses the names n, m, and M from the earlier row-count setup
training_FLAG$TARGET_FLAG <- as.numeric(training_FLAG$TARGET_FLAG)
n <- sum(training_FLAG$TARGET_FLAG) #sums the recoded 1/2 labels
N <- nrow(training_FLAG)
m <- sum(predict_FLAG) #predicted positives in the evaluation set
M <- length(predict_FLAG) #evaluation-set size
p <- m / M #the evaluation sample proportion itself
#Preview of the predicted evaluation-set classes (full dump of all 2,141 values trimmed)
head(as.numeric(predict_FLAG), 37)
## [1] 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
#Preview of the recoded training labels (full dump of all 8,161 values trimmed; the 0/1 factor levels became 1/2 after as.numeric)
head(as.numeric(training_FLAG$TARGET_FLAG), 37)
## [1] 1 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1
binom.test(m, M, p)
##
## Exact binomial test
##
## data: m and M
## number of successes = 383, number of trials = 2141, p-value = 1
## alternative hypothesis: true probability of success is not equal to 0.1788884
## 95 percent confidence interval:
## 0.1628687 0.1957921
## sample estimates:
## probability of success
## 0.1788884
library(latexpdf)
#The prevalence of the positive condition is 26.38% in the training data and 17.89% in the evaluation-set predictions. Note, however, that the binomial test above uses p = m/M, the evaluation sample proportion itself, so its p-value of 1 is trivial and does not actually test the predictions against the training prevalence; a meaningful test would set the null proportion to the 26.38% training rate.
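#Sketch of the intended comparison: test the predicted evaluation prevalence against the training prevalence rather than against itself (p0 below is the 26.38% training rate)
p0 <- 0.2638
binom.test(m, M, p0)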