Assignment 6: Prudential Life Insurance Assessment

Step 1: Data preparation and data cleaning

Import the data, replacing empty values with NAs

# Load the training data; empty CSV fields are read as NA.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0. TRUE keeps text columns such as Product_Info_2 as factors,
# which is what the str() output below shows.
df <- read.csv(
  "https://michaschweizer.github.io/datasets/train.csv",
  na.strings = "",
  stringsAsFactors = TRUE
)

Look at structure of the data

# Print a compact per-column summary (type and first values) of the raw data
utils::str(df)
## 'data.frame':    59381 obs. of  128 variables:
##  $ Id                 : int  2 5 6 7 8 10 11 14 15 16 ...
##  $ Product_Info_1     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Product_Info_2     : Factor w/ 19 levels "A1","A2","A3",..: 17 1 19 18 16 16 8 16 17 19 ...
##  $ Product_Info_3     : int  10 26 26 10 26 26 10 26 26 21 ...
##  $ Product_Info_4     : num  0.0769 0.0769 0.0769 0.4872 0.2308 ...
##  $ Product_Info_5     : int  2 2 2 2 2 3 2 2 2 2 ...
##  $ Product_Info_6     : int  1 3 3 3 3 1 3 3 3 3 ...
##  $ Product_Info_7     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Ins_Age            : num  0.6418 0.0597 0.0299 0.1642 0.4179 ...
##  $ Ht                 : num  0.582 0.6 0.745 0.673 0.655 ...
##  $ Wt                 : num  0.149 0.132 0.289 0.205 0.234 ...
##  $ BMI                : num  0.323 0.272 0.429 0.352 0.424 ...
##  $ Employment_Info_1  : num  0.028 0 0.03 0.042 0.027 0.325 0.11 0.12 0.165 0.025 ...
##  $ Employment_Info_2  : int  12 1 9 9 9 15 1 12 9 1 ...
##  $ Employment_Info_3  : int  1 3 1 1 1 1 3 1 1 3 ...
##  $ Employment_Info_4  : num  0 0 0 0 0 0 NA 0 0 0 ...
##  $ Employment_Info_5  : int  3 2 2 3 2 2 3 2 2 3 ...
##  $ Employment_Info_6  : num  NA 0.0018 0.03 0.2 0.05 1 0.8 1 1 0.05 ...
##  $ InsuredInfo_1      : int  1 1 1 2 1 1 1 1 1 2 ...
##  $ InsuredInfo_2      : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ InsuredInfo_3      : int  6 6 8 8 6 8 3 6 3 3 ...
##  $ InsuredInfo_4      : int  3 3 3 3 3 3 3 3 2 3 ...
##  $ InsuredInfo_5      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ InsuredInfo_6      : int  2 2 1 2 2 1 2 1 1 2 ...
##  $ InsuredInfo_7      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_1: int  1 2 2 2 2 2 1 1 1 2 ...
##  $ Insurance_History_2: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Insurance_History_3: int  3 3 1 1 1 3 3 3 3 3 ...
##  $ Insurance_History_4: int  1 1 3 3 3 2 2 1 2 1 ...
##  $ Insurance_History_5: num  0.000667 0.000133 NA NA NA ...
##  $ Insurance_History_7: int  1 1 3 3 3 1 1 1 1 1 ...
##  $ Insurance_History_8: int  1 3 2 2 2 3 1 1 1 3 ...
##  $ Insurance_History_9: int  2 2 3 3 3 2 2 2 2 2 ...
##  $ Family_Hist_1      : int  2 2 3 3 2 2 3 2 3 3 ...
##  $ Family_Hist_2      : num  NA 0.188 0.304 0.42 0.464 ...
##  $ Family_Hist_3      : num  0.598 NA NA NA NA ...
##  $ Family_Hist_4      : num  NA 0.0845 0.2254 0.3521 0.4085 ...
##  $ Family_Hist_5      : num  0.527 NA NA NA NA ...
##  $ Medical_History_1  : int  4 5 10 0 NA 6 5 6 4 NA ...
##  $ Medical_History_2  : int  112 412 3 350 162 491 600 145 16 162 ...
##  $ Medical_History_3  : int  2 2 2 2 2 2 3 2 2 2 ...
##  $ Medical_History_4  : int  1 1 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_5  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_6  : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_7  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_8  : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_9  : int  1 1 2 2 2 2 1 1 1 2 ...
##  $ Medical_History_10 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_11 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_12 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_13 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_14 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_15 : int  240 0 NA NA NA NA NA NA NA NA ...
##  $ Medical_History_16 : int  3 1 1 1 1 1 1 1 1 3 ...
##  $ Medical_History_17 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_18 : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ Medical_History_19 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_20 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_21 : int  1 1 1 2 1 2 1 1 1 1 ...
##  $ Medical_History_22 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_23 : int  3 3 3 3 3 3 3 3 3 1 ...
##  $ Medical_History_24 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_25 : int  1 1 2 1 2 1 1 1 1 1 ...
##  $ Medical_History_26 : int  3 3 2 3 2 3 3 3 3 3 ...
##  $ Medical_History_27 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_28 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_29 : int  3 3 3 3 3 3 1 3 1 3 ...
##  $ Medical_History_30 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_31 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_32 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Medical_History_33 : int  1 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_34 : int  3 1 3 3 3 1 3 3 3 3 ...
##  $ Medical_History_35 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_36 : int  2 2 3 2 3 2 2 2 2 2 ...
##  $ Medical_History_37 : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Medical_History_38 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Medical_History_39 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_40 : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Medical_History_41 : int  3 1 1 1 1 3 3 1 3 1 ...
##  $ Medical_Keyword_1  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_2  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_3  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_4  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_5  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_6  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_7  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_8  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_9  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_10 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_11 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_12 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_13 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_14 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_15 : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Medical_Keyword_16 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_17 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_18 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_19 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Medical_Keyword_20 : int  0 0 0 0 0 0 0 0 1 0 ...
##   [list output truncated]

Convert Response–our target variable–to a factor

# Store the target as a factor so that it is modelled as a class label
df[["Response"]] <- factor(df[["Response"]])

Check for NAs

# Total number of missing cells: count the NAs per column, then add them up
sum(vapply(df, function(column) sum(is.na(column)), integer(1)))
## [1] 393103

The data has many NAs. Let us see which columns they are in

# Number of missing values in each column.
# vapply() guarantees a named integer vector, whereas the return type of
# sapply() depends on its input.
vapply(df, function(column) sum(is.na(column)), integer(1))
##                  Id      Product_Info_1      Product_Info_2 
##                   0                   0                   0 
##      Product_Info_3      Product_Info_4      Product_Info_5 
##                   0                   0                   0 
##      Product_Info_6      Product_Info_7             Ins_Age 
##                   0                   0                   0 
##                  Ht                  Wt                 BMI 
##                   0                   0                   0 
##   Employment_Info_1   Employment_Info_2   Employment_Info_3 
##                  19                   0                   0 
##   Employment_Info_4   Employment_Info_5   Employment_Info_6 
##                6779                   0               10854 
##       InsuredInfo_1       InsuredInfo_2       InsuredInfo_3 
##                   0                   0                   0 
##       InsuredInfo_4       InsuredInfo_5       InsuredInfo_6 
##                   0                   0                   0 
##       InsuredInfo_7 Insurance_History_1 Insurance_History_2 
##                   0                   0                   0 
## Insurance_History_3 Insurance_History_4 Insurance_History_5 
##                   0                   0               25396 
## Insurance_History_7 Insurance_History_8 Insurance_History_9 
##                   0                   0                   0 
##       Family_Hist_1       Family_Hist_2       Family_Hist_3 
##                   0               28656               34241 
##       Family_Hist_4       Family_Hist_5   Medical_History_1 
##               19184               41811                8889 
##   Medical_History_2   Medical_History_3   Medical_History_4 
##                   0                   0                   0 
##   Medical_History_5   Medical_History_6   Medical_History_7 
##                   0                   0                   0 
##   Medical_History_8   Medical_History_9  Medical_History_10 
##                   0                   0               58824 
##  Medical_History_11  Medical_History_12  Medical_History_13 
##                   0                   0                   0 
##  Medical_History_14  Medical_History_15  Medical_History_16 
##                   0               44596                   0 
##  Medical_History_17  Medical_History_18  Medical_History_19 
##                   0                   0                   0 
##  Medical_History_20  Medical_History_21  Medical_History_22 
##                   0                   0                   0 
##  Medical_History_23  Medical_History_24  Medical_History_25 
##                   0               55580                   0 
##  Medical_History_26  Medical_History_27  Medical_History_28 
##                   0                   0                   0 
##  Medical_History_29  Medical_History_30  Medical_History_31 
##                   0                   0                   0 
##  Medical_History_32  Medical_History_33  Medical_History_34 
##               58274                   0                   0 
##  Medical_History_35  Medical_History_36  Medical_History_37 
##                   0                   0                   0 
##  Medical_History_38  Medical_History_39  Medical_History_40 
##                   0                   0                   0 
##  Medical_History_41   Medical_Keyword_1   Medical_Keyword_2 
##                   0                   0                   0 
##   Medical_Keyword_3   Medical_Keyword_4   Medical_Keyword_5 
##                   0                   0                   0 
##   Medical_Keyword_6   Medical_Keyword_7   Medical_Keyword_8 
##                   0                   0                   0 
##   Medical_Keyword_9  Medical_Keyword_10  Medical_Keyword_11 
##                   0                   0                   0 
##  Medical_Keyword_12  Medical_Keyword_13  Medical_Keyword_14 
##                   0                   0                   0 
##  Medical_Keyword_15  Medical_Keyword_16  Medical_Keyword_17 
##                   0                   0                   0 
##  Medical_Keyword_18  Medical_Keyword_19  Medical_Keyword_20 
##                   0                   0                   0 
##  Medical_Keyword_21  Medical_Keyword_22  Medical_Keyword_23 
##                   0                   0                   0 
##  Medical_Keyword_24  Medical_Keyword_25  Medical_Keyword_26 
##                   0                   0                   0 
##  Medical_Keyword_27  Medical_Keyword_28  Medical_Keyword_29 
##                   0                   0                   0 
##  Medical_Keyword_30  Medical_Keyword_31  Medical_Keyword_32 
##                   0                   0                   0 
##  Medical_Keyword_33  Medical_Keyword_34  Medical_Keyword_35 
##                   0                   0                   0 
##  Medical_Keyword_36  Medical_Keyword_37  Medical_Keyword_38 
##                   0                   0                   0 
##  Medical_Keyword_39  Medical_Keyword_40  Medical_Keyword_41 
##                   0                   0                   0 
##  Medical_Keyword_42  Medical_Keyword_43  Medical_Keyword_44 
##                   0                   0                   0 
##  Medical_Keyword_45  Medical_Keyword_46  Medical_Keyword_47 
##                   0                   0                   0 
##  Medical_Keyword_48            Response 
##                   0                   0

Some columns have a very high number of NAs. Delete these columns

# Remove the listed columns from the training data; all other columns and all
# rows are kept.
# NOTE(review): the NA counts printed above show 0 missing values for
# Medical_History_25, yet it is dropped here. Confirm this is intended before
# changing the list: later steps select columns by position and drop the same
# set from the test data.
df <- df[, !(names(df) %in% c(
  "Employment_Info_4", "Employment_Info_6",
  "Insurance_History_5",
  "Family_Hist_2", "Family_Hist_3", "Family_Hist_4", "Family_Hist_5",
  "Medical_History_1", "Medical_History_10", "Medical_History_15",
  "Medical_History_24", "Medical_History_25", "Medical_History_32"
)), drop = FALSE]

Next, delete all rows with NAs. Another way to handle NAs would be to use one of various imputation methods, but we will not do this here

# Keep only the rows that have no missing value in any remaining column
dfnm <- stats::na.omit(df)

Step 2: Exploratory data analysis

library(ggplot2)

Examine target variable Response

# Bar chart with the number of observations per Response class
g <- ggplot(dfnm, mapping = aes(x = Response)) +
  geom_bar()
print(g)

Response values 3 and 4 are very rare, while 8 is by far the most common value

Two-way analyses of different predictors and the target variable. We do this to figure out which variables to include in the machine learning model. Here we show only some of the predictor variables: those that, based on the graphs, seem to have an impact on the response variable.

# Stacked density of BMI per Response class.
# xlim() drops observations outside [0.2, 0.8] before the density is
# estimated, which is what triggers the "Removed ... rows" warning.
g <- ggplot(data = dfnm, mapping = aes(x = BMI, fill = Response)) +
  geom_density(position = "stack") +
  xlim(0.2, 0.8)
print(g)
## Warning: Removed 1083 rows containing non-finite values (stat_density).

# Same BMI densities, rescaled so the classes sum to 1 at every BMI value.
# xlim() drops observations outside [0.2, 0.8] before the density is
# estimated, which is what triggers the "Removed ... rows" warning.
g <- ggplot(data = dfnm, mapping = aes(x = BMI, fill = Response)) +
  geom_density(position = "fill") +
  xlim(0.2, 0.8)
print(g)
## Warning: Removed 1083 rows containing non-finite values (stat_density).

BMI seems to be related to different risk classifications–which makes sense intuitively, as obesity has been linked to higher mortality

# Share of each Response class within every InsuredInfo_1 level.
# InsuredInfo_1 is treated as categorical inside the plot mapping only.
# Converting the column to a factor and back with as.numeric(as.character())
# would leave it as a double instead of the original integer, so dfnm is not
# modified here.
g <- ggplot(dfnm, aes(x = factor(InsuredInfo_1), fill = Response)) +
  geom_bar(position = "fill") +
  xlab("InsuredInfo_1")  # keep the plain column name as the axis title
g

InsuredInfo_1, one of the variables that provide information about the applicant, seems to be related to the risk classification

# Share of each Response class for Medical_Keyword_48 = 0 vs. 1.
# Medical_Keyword_48 is treated as categorical inside the plot mapping only.
# Converting the column to a factor and back with as.numeric(as.character())
# would leave it as a double instead of the original integer, so dfnm is not
# modified here.
g <- ggplot(dfnm, aes(x = factor(Medical_Keyword_48), fill = Response)) +
  geom_bar(position = "fill") +
  xlab("Medical_Keyword_48")  # keep the plain column name as the axis title
g

Medical keywords seem to be related to risk classification. This makes sense intuitively. If someone has had, for example, cancer, then mortality is higher and this will influence risk classification

Step 3: Modelling

We will use two different models here: decision tree and multinomial regression. Let us start with the decision tree

Decision tree

library(rpart)
library(rpart.plot)

# Fit a classification tree for Response on every remaining column except Id.
# Id appears to be a record identifier (2, 5, 6, 7, ... in the str() output),
# so it is excluded to keep the tree from splitting on it.
model.tree <- rpart(Response ~ . - Id, data = dfnm, method = "class")

# type = 1 labels all nodes, not just the leaves. extra = 102 shows the
# classification rate of each node plus the percentage of observations in it.
rpart.plot(model.tree, type = 1, extra = 102)

Now, let us get and prepare the test set

# Load the test data; empty CSV fields are read as NA.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0. TRUE reads text columns such as Product_Info_2 as factors.
df_test <- read.csv(
  "https://michaschweizer.github.io/datasets/test.csv",
  na.strings = "",
  stringsAsFactors = TRUE
)

# Remove the same columns that were removed from the training data, so the
# test set has the same predictors.
df_test <- df_test[, !(names(df_test) %in% c(
  "Employment_Info_4", "Employment_Info_6",
  "Insurance_History_5",
  "Family_Hist_2", "Family_Hist_3", "Family_Hist_4", "Family_Hist_5",
  "Medical_History_1", "Medical_History_10", "Medical_History_15",
  "Medical_History_24", "Medical_History_25", "Medical_History_32"
)), drop = FALSE]

# Number of missing values in each test-set column.
# vapply() guarantees a named integer vector, whereas the return type of
# sapply() depends on its input.
vapply(df_test, function(column) sum(is.na(column)), integer(1))
##                  Id      Product_Info_1      Product_Info_2 
##                   0                   0                   0 
##      Product_Info_3      Product_Info_4      Product_Info_5 
##                   0                   0                   0 
##      Product_Info_6      Product_Info_7             Ins_Age 
##                   0                   0                   0 
##                  Ht                  Wt                 BMI 
##                   0                   0                   0 
##   Employment_Info_1   Employment_Info_2   Employment_Info_3 
##                   3                   0                   0 
##   Employment_Info_5       InsuredInfo_1       InsuredInfo_2 
##                   0                   0                   0 
##       InsuredInfo_3       InsuredInfo_4       InsuredInfo_5 
##                   0                   0                   0 
##       InsuredInfo_6       InsuredInfo_7 Insurance_History_1 
##                   0                   0                   0 
## Insurance_History_2 Insurance_History_3 Insurance_History_4 
##                   0                   0                   0 
## Insurance_History_7 Insurance_History_8 Insurance_History_9 
##                   0                   0                   0 
##       Family_Hist_1   Medical_History_2   Medical_History_3 
##                   0                   0                   0 
##   Medical_History_4   Medical_History_5   Medical_History_6 
##                   0                   0                   0 
##   Medical_History_7   Medical_History_8   Medical_History_9 
##                   0                   0                   0 
##  Medical_History_11  Medical_History_12  Medical_History_13 
##                   0                   0                   0 
##  Medical_History_14  Medical_History_16  Medical_History_17 
##                   0                   0                   0 
##  Medical_History_18  Medical_History_19  Medical_History_20 
##                   0                   0                   0 
##  Medical_History_21  Medical_History_22  Medical_History_23 
##                   0                   0                   0 
##  Medical_History_26  Medical_History_27  Medical_History_28 
##                   0                   0                   0 
##  Medical_History_29  Medical_History_30  Medical_History_31 
##                   0                   0                   0 
##  Medical_History_33  Medical_History_34  Medical_History_35 
##                   0                   0                   0 
##  Medical_History_36  Medical_History_37  Medical_History_38 
##                   0                   0                   0 
##  Medical_History_39  Medical_History_40  Medical_History_41 
##                   0                   0                   0 
##   Medical_Keyword_1   Medical_Keyword_2   Medical_Keyword_3 
##                   0                   0                   0 
##   Medical_Keyword_4   Medical_Keyword_5   Medical_Keyword_6 
##                   0                   0                   0 
##   Medical_Keyword_7   Medical_Keyword_8   Medical_Keyword_9 
##                   0                   0                   0 
##  Medical_Keyword_10  Medical_Keyword_11  Medical_Keyword_12 
##                   0                   0                   0 
##  Medical_Keyword_13  Medical_Keyword_14  Medical_Keyword_15 
##                   0                   0                   0 
##  Medical_Keyword_16  Medical_Keyword_17  Medical_Keyword_18 
##                   0                   0                   0 
##  Medical_Keyword_19  Medical_Keyword_20  Medical_Keyword_21 
##                   0                   0                   0 
##  Medical_Keyword_22  Medical_Keyword_23  Medical_Keyword_24 
##                   0                   0                   0 
##  Medical_Keyword_25  Medical_Keyword_26  Medical_Keyword_27 
##                   0                   0                   0 
##  Medical_Keyword_28  Medical_Keyword_29  Medical_Keyword_30 
##                   0                   0                   0 
##  Medical_Keyword_31  Medical_Keyword_32  Medical_Keyword_33 
##                   0                   0                   0 
##  Medical_Keyword_34  Medical_Keyword_35  Medical_Keyword_36 
##                   0                   0                   0 
##  Medical_Keyword_37  Medical_Keyword_38  Medical_Keyword_39 
##                   0                   0                   0 
##  Medical_Keyword_40  Medical_Keyword_41  Medical_Keyword_42 
##                   0                   0                   0 
##  Medical_Keyword_43  Medical_Keyword_44  Medical_Keyword_45 
##                   0                   0                   0 
##  Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48 
##                   0                   0                   0
# Replace missing Employment_Info_1 values with the median of that column.
# Rows are selected with is.na() on the column itself, so only entries that
# are actually missing are overwritten. complete.cases() would also select
# rows whose NA sits in a different column.
# NOTE(review): the median comes from the test set, while training rows with
# missing values were dropped. Consider imputing with the training median.
df_test$Employment_Info_1[is.na(df_test$Employment_Info_1)] <-
  median(df_test$Employment_Info_1, na.rm = TRUE)

Next, we use the decision tree to predict the Response values for the test set

# Despite its name, `probs` holds the predicted Response class of every test
# row, because type = "class" returns class labels, not probabilities.
probs <- predict(object = model.tree, newdata = df_test, type = "class")

Then, create dataframe with predictions for export and export the file for upload

# Assemble the submission table: one row per test Id with its predicted class
predictions <- data.frame(Id = df_test$Id, Response = probs)

# Write the submission file. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(predictions, "myprediction_tree.csv", row.names = FALSE)

Multinomial regression

Build the model, based on the predictor variables that were found to be relevant in the exploratory data analysis

library(foreign)
library(nnet)

# Select the model columns by position. Per the column listing after the drop
# step these are BMI (12), InsuredInfo_1..7 (17:23), and the remaining
# Medical_History_* and Medical_Keyword_* columns plus Response (32:115).
# NOTE(review): the selection is taken from `df`, not the NA-free `dfnm`.
# None of the selected columns has missing values in the NA counts above.
dfnm_multi <- df[c(12, 17:23, 32:115)]

# Make class "1" the reference level. The argument name is spelled out as
# `ref` instead of relying on partial matching of `re`.
dfnm_multi$Response <- relevel(dfnm_multi$Response, ref = "1")

# Fit the multinomial logistic regression on all selected predictors.
# NOTE(review): the recorded trace ends with "stopped after 100 iterations",
# i.e. the iteration limit was reached. Consider raising `maxit`.
model.multinom <- multinom(Response ~ ., data = dfnm_multi)
## # weights:  744 (644 variable)
## initial  value 123479.318186 
## iter  10 value 108973.221407
## iter  20 value 100691.110691
## iter  30 value 94624.394589
## iter  40 value 93171.639891
## iter  50 value 92659.386672
## iter  60 value 91283.531698
## iter  70 value 89500.023199
## iter  80 value 87747.862564
## iter  90 value 87002.269314
## iter 100 value 86363.500196
## final  value 86363.500196 
## stopped after 100 iterations

Now, let us get and prepare the test set

# Load a fresh copy of the test data for the multinomial model; empty CSV
# fields are read as NA.
df_test_multi <- read.csv(
  file = "https://michaschweizer.github.io/datasets/test.csv",
  na.strings = ""
)

# Remove the same columns that were removed from the training data, so that
# the positional column selection used for the model lines up.
df_test_multi <- df_test_multi[, !(names(df_test_multi) %in% c(
  "Employment_Info_4", "Employment_Info_6",
  "Insurance_History_5",
  "Family_Hist_2", "Family_Hist_3", "Family_Hist_4", "Family_Hist_5",
  "Medical_History_1", "Medical_History_10", "Medical_History_15",
  "Medical_History_24", "Medical_History_25", "Medical_History_32"
)), drop = FALSE]

# Number of missing values in each test-set column.
# vapply() guarantees a named integer vector, whereas the return type of
# sapply() depends on its input.
vapply(df_test_multi, function(column) sum(is.na(column)), integer(1))
##                  Id      Product_Info_1      Product_Info_2 
##                   0                   0                   0 
##      Product_Info_3      Product_Info_4      Product_Info_5 
##                   0                   0                   0 
##      Product_Info_6      Product_Info_7             Ins_Age 
##                   0                   0                   0 
##                  Ht                  Wt                 BMI 
##                   0                   0                   0 
##   Employment_Info_1   Employment_Info_2   Employment_Info_3 
##                   3                   0                   0 
##   Employment_Info_5       InsuredInfo_1       InsuredInfo_2 
##                   0                   0                   0 
##       InsuredInfo_3       InsuredInfo_4       InsuredInfo_5 
##                   0                   0                   0 
##       InsuredInfo_6       InsuredInfo_7 Insurance_History_1 
##                   0                   0                   0 
## Insurance_History_2 Insurance_History_3 Insurance_History_4 
##                   0                   0                   0 
## Insurance_History_7 Insurance_History_8 Insurance_History_9 
##                   0                   0                   0 
##       Family_Hist_1   Medical_History_2   Medical_History_3 
##                   0                   0                   0 
##   Medical_History_4   Medical_History_5   Medical_History_6 
##                   0                   0                   0 
##   Medical_History_7   Medical_History_8   Medical_History_9 
##                   0                   0                   0 
##  Medical_History_11  Medical_History_12  Medical_History_13 
##                   0                   0                   0 
##  Medical_History_14  Medical_History_16  Medical_History_17 
##                   0                   0                   0 
##  Medical_History_18  Medical_History_19  Medical_History_20 
##                   0                   0                   0 
##  Medical_History_21  Medical_History_22  Medical_History_23 
##                   0                   0                   0 
##  Medical_History_26  Medical_History_27  Medical_History_28 
##                   0                   0                   0 
##  Medical_History_29  Medical_History_30  Medical_History_31 
##                   0                   0                   0 
##  Medical_History_33  Medical_History_34  Medical_History_35 
##                   0                   0                   0 
##  Medical_History_36  Medical_History_37  Medical_History_38 
##                   0                   0                   0 
##  Medical_History_39  Medical_History_40  Medical_History_41 
##                   0                   0                   0 
##   Medical_Keyword_1   Medical_Keyword_2   Medical_Keyword_3 
##                   0                   0                   0 
##   Medical_Keyword_4   Medical_Keyword_5   Medical_Keyword_6 
##                   0                   0                   0 
##   Medical_Keyword_7   Medical_Keyword_8   Medical_Keyword_9 
##                   0                   0                   0 
##  Medical_Keyword_10  Medical_Keyword_11  Medical_Keyword_12 
##                   0                   0                   0 
##  Medical_Keyword_13  Medical_Keyword_14  Medical_Keyword_15 
##                   0                   0                   0 
##  Medical_Keyword_16  Medical_Keyword_17  Medical_Keyword_18 
##                   0                   0                   0 
##  Medical_Keyword_19  Medical_Keyword_20  Medical_Keyword_21 
##                   0                   0                   0 
##  Medical_Keyword_22  Medical_Keyword_23  Medical_Keyword_24 
##                   0                   0                   0 
##  Medical_Keyword_25  Medical_Keyword_26  Medical_Keyword_27 
##                   0                   0                   0 
##  Medical_Keyword_28  Medical_Keyword_29  Medical_Keyword_30 
##                   0                   0                   0 
##  Medical_Keyword_31  Medical_Keyword_32  Medical_Keyword_33 
##                   0                   0                   0 
##  Medical_Keyword_34  Medical_Keyword_35  Medical_Keyword_36 
##                   0                   0                   0 
##  Medical_Keyword_37  Medical_Keyword_38  Medical_Keyword_39 
##                   0                   0                   0 
##  Medical_Keyword_40  Medical_Keyword_41  Medical_Keyword_42 
##                   0                   0                   0 
##  Medical_Keyword_43  Medical_Keyword_44  Medical_Keyword_45 
##                   0                   0                   0 
##  Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48 
##                   0                   0                   0
# Replace missing Employment_Info_1 values with the median of that column.
# Rows are selected with is.na() on the column itself, so only entries that
# are actually missing are overwritten. complete.cases() would also select
# rows whose NA sits in a different column.
df_test_multi$Employment_Info_1[is.na(df_test_multi$Employment_Info_1)] <-
  median(df_test_multi$Employment_Info_1, na.rm = TRUE)

# Keep the same predictor positions as in the training selection. The range
# stops at 114 because the test set has no Response column.
df_test_multi <- df_test_multi[, c(12, 17:23, 32:114)]

Next, we use the multinomial regression model to predict the Response values for the test set

# Predicted Response class for every test row from the multinomial model
model.multinom.fitted <- predict(
  object = model.multinom,
  newdata = df_test_multi,
  type = "class"
)

Then, create dataframe with predictions for export and export the file for upload

# Assemble the submission table: one row per test Id with its predicted class.
# Id is taken from df_test because the Id column is no longer part of
# df_test_multi after the positional column selection.
predictions <- data.frame(Id = df_test$Id, Response = model.multinom.fitted)

# Write the submission file. FALSE is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(predictions, "myprediction_multinom.csv", row.names = FALSE)