Import the data, replacing empty values with NAs
# Load the training set. Empty strings are read as NA. stringsAsFactors is set
# explicitly because its default changed to FALSE in R 4.0.0, while the str()
# output shown below lists Product_Info_2 as a factor.
df <- read.csv(
  "https://michaschweizer.github.io/datasets/train.csv",
  na.strings = "",
  stringsAsFactors = TRUE
)
Look at structure of the data
# Print a compact per-column overview (type and leading values) of the raw data.
str(object = df)
## 'data.frame': 59381 obs. of 128 variables:
## $ Id : int 2 5 6 7 8 10 11 14 15 16 ...
## $ Product_Info_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Product_Info_2 : Factor w/ 19 levels "A1","A2","A3",..: 17 1 19 18 16 16 8 16 17 19 ...
## $ Product_Info_3 : int 10 26 26 10 26 26 10 26 26 21 ...
## $ Product_Info_4 : num 0.0769 0.0769 0.0769 0.4872 0.2308 ...
## $ Product_Info_5 : int 2 2 2 2 2 3 2 2 2 2 ...
## $ Product_Info_6 : int 1 3 3 3 3 1 3 3 3 3 ...
## $ Product_Info_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Ins_Age : num 0.6418 0.0597 0.0299 0.1642 0.4179 ...
## $ Ht : num 0.582 0.6 0.745 0.673 0.655 ...
## $ Wt : num 0.149 0.132 0.289 0.205 0.234 ...
## $ BMI : num 0.323 0.272 0.429 0.352 0.424 ...
## $ Employment_Info_1 : num 0.028 0 0.03 0.042 0.027 0.325 0.11 0.12 0.165 0.025 ...
## $ Employment_Info_2 : int 12 1 9 9 9 15 1 12 9 1 ...
## $ Employment_Info_3 : int 1 3 1 1 1 1 3 1 1 3 ...
## $ Employment_Info_4 : num 0 0 0 0 0 0 NA 0 0 0 ...
## $ Employment_Info_5 : int 3 2 2 3 2 2 3 2 2 3 ...
## $ Employment_Info_6 : num NA 0.0018 0.03 0.2 0.05 1 0.8 1 1 0.05 ...
## $ InsuredInfo_1 : int 1 1 1 2 1 1 1 1 1 2 ...
## $ InsuredInfo_2 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ InsuredInfo_3 : int 6 6 8 8 6 8 3 6 3 3 ...
## $ InsuredInfo_4 : int 3 3 3 3 3 3 3 3 2 3 ...
## $ InsuredInfo_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ InsuredInfo_6 : int 2 2 1 2 2 1 2 1 1 2 ...
## $ InsuredInfo_7 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_1: int 1 2 2 2 2 2 1 1 1 2 ...
## $ Insurance_History_2: int 1 1 1 1 1 1 1 1 1 1 ...
## $ Insurance_History_3: int 3 3 1 1 1 3 3 3 3 3 ...
## $ Insurance_History_4: int 1 1 3 3 3 2 2 1 2 1 ...
## $ Insurance_History_5: num 0.000667 0.000133 NA NA NA ...
## $ Insurance_History_7: int 1 1 3 3 3 1 1 1 1 1 ...
## $ Insurance_History_8: int 1 3 2 2 2 3 1 1 1 3 ...
## $ Insurance_History_9: int 2 2 3 3 3 2 2 2 2 2 ...
## $ Family_Hist_1 : int 2 2 3 3 2 2 3 2 3 3 ...
## $ Family_Hist_2 : num NA 0.188 0.304 0.42 0.464 ...
## $ Family_Hist_3 : num 0.598 NA NA NA NA ...
## $ Family_Hist_4 : num NA 0.0845 0.2254 0.3521 0.4085 ...
## $ Family_Hist_5 : num 0.527 NA NA NA NA ...
## $ Medical_History_1 : int 4 5 10 0 NA 6 5 6 4 NA ...
## $ Medical_History_2 : int 112 412 3 350 162 491 600 145 16 162 ...
## $ Medical_History_3 : int 2 2 2 2 2 2 3 2 2 2 ...
## $ Medical_History_4 : int 1 1 2 2 2 2 2 2 2 2 ...
## $ Medical_History_5 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_6 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_7 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_8 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_9 : int 1 1 2 2 2 2 1 1 1 2 ...
## $ Medical_History_10 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_11 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_12 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_13 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_14 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_15 : int 240 0 NA NA NA NA NA NA NA NA ...
## $ Medical_History_16 : int 3 1 1 1 1 1 1 1 1 3 ...
## $ Medical_History_17 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_18 : int 1 1 1 1 1 2 1 1 1 1 ...
## $ Medical_History_19 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_20 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_21 : int 1 1 1 2 1 2 1 1 1 1 ...
## $ Medical_History_22 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_23 : int 3 3 3 3 3 3 3 3 3 1 ...
## $ Medical_History_24 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_25 : int 1 1 2 1 2 1 1 1 1 1 ...
## $ Medical_History_26 : int 3 3 2 3 2 3 3 3 3 3 ...
## $ Medical_History_27 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_28 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_29 : int 3 3 3 3 3 3 1 3 1 3 ...
## $ Medical_History_30 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_31 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_32 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Medical_History_33 : int 1 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_34 : int 3 1 3 3 3 1 3 3 3 3 ...
## $ Medical_History_35 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_36 : int 2 2 3 2 3 2 2 2 2 2 ...
## $ Medical_History_37 : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Medical_History_38 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Medical_History_39 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_40 : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Medical_History_41 : int 3 1 1 1 1 3 3 1 3 1 ...
## $ Medical_Keyword_1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_13 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_15 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Medical_Keyword_16 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_19 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Medical_Keyword_20 : int 0 0 0 0 0 0 0 0 1 0 ...
## [list output truncated]
Convert Response—our target variable—to a factor.
# Store the target as a factor so the models below treat it as a set of classes.
df[["Response"]] <- as.factor(df[["Response"]])
Check for NAs
# Total number of missing cells across the whole data frame.
length(which(is.na(df)))
## [1] 393103
Data has many NAs. See where they are
# Number of missing values in each column (named integer vector).
vapply(df, function(column) sum(is.na(column)), integer(1))
## Id Product_Info_1 Product_Info_2
## 0 0 0
## Product_Info_3 Product_Info_4 Product_Info_5
## 0 0 0
## Product_Info_6 Product_Info_7 Ins_Age
## 0 0 0
## Ht Wt BMI
## 0 0 0
## Employment_Info_1 Employment_Info_2 Employment_Info_3
## 19 0 0
## Employment_Info_4 Employment_Info_5 Employment_Info_6
## 6779 0 10854
## InsuredInfo_1 InsuredInfo_2 InsuredInfo_3
## 0 0 0
## InsuredInfo_4 InsuredInfo_5 InsuredInfo_6
## 0 0 0
## InsuredInfo_7 Insurance_History_1 Insurance_History_2
## 0 0 0
## Insurance_History_3 Insurance_History_4 Insurance_History_5
## 0 0 25396
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 0 0 0
## Family_Hist_1 Family_Hist_2 Family_Hist_3
## 0 28656 34241
## Family_Hist_4 Family_Hist_5 Medical_History_1
## 19184 41811 8889
## Medical_History_2 Medical_History_3 Medical_History_4
## 0 0 0
## Medical_History_5 Medical_History_6 Medical_History_7
## 0 0 0
## Medical_History_8 Medical_History_9 Medical_History_10
## 0 0 58824
## Medical_History_11 Medical_History_12 Medical_History_13
## 0 0 0
## Medical_History_14 Medical_History_15 Medical_History_16
## 0 44596 0
## Medical_History_17 Medical_History_18 Medical_History_19
## 0 0 0
## Medical_History_20 Medical_History_21 Medical_History_22
## 0 0 0
## Medical_History_23 Medical_History_24 Medical_History_25
## 0 55580 0
## Medical_History_26 Medical_History_27 Medical_History_28
## 0 0 0
## Medical_History_29 Medical_History_30 Medical_History_31
## 0 0 0
## Medical_History_32 Medical_History_33 Medical_History_34
## 58274 0 0
## Medical_History_35 Medical_History_36 Medical_History_37
## 0 0 0
## Medical_History_38 Medical_History_39 Medical_History_40
## 0 0 0
## Medical_History_41 Medical_Keyword_1 Medical_Keyword_2
## 0 0 0
## Medical_Keyword_3 Medical_Keyword_4 Medical_Keyword_5
## 0 0 0
## Medical_Keyword_6 Medical_Keyword_7 Medical_Keyword_8
## 0 0 0
## Medical_Keyword_9 Medical_Keyword_10 Medical_Keyword_11
## 0 0 0
## Medical_Keyword_12 Medical_Keyword_13 Medical_Keyword_14
## 0 0 0
## Medical_Keyword_15 Medical_Keyword_16 Medical_Keyword_17
## 0 0 0
## Medical_Keyword_18 Medical_Keyword_19 Medical_Keyword_20
## 0 0 0
## Medical_Keyword_21 Medical_Keyword_22 Medical_Keyword_23
## 0 0 0
## Medical_Keyword_24 Medical_Keyword_25 Medical_Keyword_26
## 0 0 0
## Medical_Keyword_27 Medical_Keyword_28 Medical_Keyword_29
## 0 0 0
## Medical_Keyword_30 Medical_Keyword_31 Medical_Keyword_32
## 0 0 0
## Medical_Keyword_33 Medical_Keyword_34 Medical_Keyword_35
## 0 0 0
## Medical_Keyword_36 Medical_Keyword_37 Medical_Keyword_38
## 0 0 0
## Medical_Keyword_39 Medical_Keyword_40 Medical_Keyword_41
## 0 0 0
## Medical_Keyword_42 Medical_Keyword_43 Medical_Keyword_44
## 0 0 0
## Medical_Keyword_45 Medical_Keyword_46 Medical_Keyword_47
## 0 0 0
## Medical_Keyword_48 Response
## 0 0
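Some columns have a very high number of NAs. Delete these columns. Note that the code below also drops Medical_History_25, which has no NAs according to the table above.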
# Remove the sparsely populated columns before modelling.
# NOTE(review): Medical_History_25 shows 0 NAs in the table above but is
# dropped as well; it stays in this list because later steps select columns
# by position.
df <- subset(
  df,
  select = -c(
    Employment_Info_4, Employment_Info_6,
    Insurance_History_5,
    Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5,
    Medical_History_1, Medical_History_10, Medical_History_15,
    Medical_History_24, Medical_History_25, Medical_History_32
  )
)
Next, delete all rows with NAs. Another way to handle NAs would be to use one of various imputation methods, but we will not do this here
# Keep only the rows that have no missing value in any remaining column.
dfnm <- na.omit(object = df)
library(ggplot2)
Examine target variable Response
# Bar chart of how often each Response class occurs.
g <- ggplot(dfnm, mapping = aes(x = Response)) +
  geom_bar()
g
Response values 3 and 4 are very rare; 8 is by far the most common value.
Two-way analyses of different predictors and the target variable. We do this to figure out which variables to include in the machine learning model. We will only show some of the predictor variables here that seem to have an impact on the response variable based on the graphs.
# Stacked BMI density per Response class, restricted to BMI in [0.2, 0.8];
# rows outside that range are removed (see the warning below).
g <- ggplot(data = dfnm, mapping = aes(x = BMI, fill = Response)) +
  geom_density(position = "stack") +
  xlim(0.2, 0.8)
g
## Warning: Removed 1083 rows containing non-finite values (stat_density).
# Same BMI density, but normalised so each BMI value shows the share of every
# Response class; rows outside [0.2, 0.8] are removed (see the warning below).
g <- ggplot(data = dfnm, mapping = aes(x = BMI, fill = Response)) +
  geom_density(position = "fill") +
  xlim(0.2, 0.8)
g
## Warning: Removed 1083 rows containing non-finite values (stat_density).
BMI seems to be related to different risk classifications—which makes sense intuitively, as obesity has been linked to higher mortality.
# Temporarily treat InsuredInfo_1 as categorical to draw proportion bars per
# level, then convert it back to numeric once the plot has been shown.
dfnm[["InsuredInfo_1"]] <- as.factor(dfnm[["InsuredInfo_1"]])
g <- ggplot(data = dfnm, mapping = aes(x = InsuredInfo_1, fill = Response)) +
  geom_bar(position = "fill")
g
dfnm[["InsuredInfo_1"]] <- as.numeric(as.character(dfnm[["InsuredInfo_1"]]))
InsuredInfo, which provides information about the applicant, seems to be related to the risk classification
# Temporarily treat Medical_Keyword_48 as categorical to draw proportion bars
# per level, then convert it back to numeric once the plot has been shown.
dfnm[["Medical_Keyword_48"]] <- as.factor(dfnm[["Medical_Keyword_48"]])
g <- ggplot(
  data = dfnm,
  mapping = aes(x = Medical_Keyword_48, fill = Response)
) +
  geom_bar(position = "fill")
g
dfnm[["Medical_Keyword_48"]] <- as.numeric(
  as.character(dfnm[["Medical_Keyword_48"]])
)
Medical keywords seem to be related to risk classification. This makes sense intuitively: if someone has had, for example, cancer, then mortality is higher and this will influence risk classification.
We will use two different models here: decision tree and multinomial regression. Let us start with the decision tree
library(rpart)
library(rpart.plot)
# Fit a classification tree on every remaining column except Id.
# `Response ~ .` on the full frame would also offer Id as a predictor; Id is
# used only as the row identifier in the exported predictions, so it is
# excluded from the training frame here.
model.tree <- rpart(
  Response ~ .,
  data = subset(dfnm, select = -Id),
  method = "class"
)
# Draw the fitted tree; see the rpart.plot documentation for the meaning of
# the `type` and `extra` display codes.
rpart.plot(model.tree, type = 1, extra = 102)
Now, let us get and prepare the test set
# Load the test set (empty strings become NA) and drop the same columns that
# were removed from the training data.
df_test <- read.csv(
  file = "https://michaschweizer.github.io/datasets/test.csv",
  na.strings = ""
)
df_test <- subset(
  df_test,
  select = -c(
    Employment_Info_4, Employment_Info_6,
    Insurance_History_5,
    Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5,
    Medical_History_1, Medical_History_10, Medical_History_15,
    Medical_History_24, Medical_History_25, Medical_History_32
  )
)
# Number of missing values in each remaining test-set column.
vapply(df_test, function(column) sum(is.na(column)), integer(1))
## Id Product_Info_1 Product_Info_2
## 0 0 0
## Product_Info_3 Product_Info_4 Product_Info_5
## 0 0 0
## Product_Info_6 Product_Info_7 Ins_Age
## 0 0 0
## Ht Wt BMI
## 0 0 0
## Employment_Info_1 Employment_Info_2 Employment_Info_3
## 3 0 0
## Employment_Info_5 InsuredInfo_1 InsuredInfo_2
## 0 0 0
## InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 0 0 0
## InsuredInfo_6 InsuredInfo_7 Insurance_History_1
## 0 0 0
## Insurance_History_2 Insurance_History_3 Insurance_History_4
## 0 0 0
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 0 0 0
## Family_Hist_1 Medical_History_2 Medical_History_3
## 0 0 0
## Medical_History_4 Medical_History_5 Medical_History_6
## 0 0 0
## Medical_History_7 Medical_History_8 Medical_History_9
## 0 0 0
## Medical_History_11 Medical_History_12 Medical_History_13
## 0 0 0
## Medical_History_14 Medical_History_16 Medical_History_17
## 0 0 0
## Medical_History_18 Medical_History_19 Medical_History_20
## 0 0 0
## Medical_History_21 Medical_History_22 Medical_History_23
## 0 0 0
## Medical_History_26 Medical_History_27 Medical_History_28
## 0 0 0
## Medical_History_29 Medical_History_30 Medical_History_31
## 0 0 0
## Medical_History_33 Medical_History_34 Medical_History_35
## 0 0 0
## Medical_History_36 Medical_History_37 Medical_History_38
## 0 0 0
## Medical_History_39 Medical_History_40 Medical_History_41
## 0 0 0
## Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3
## 0 0 0
## Medical_Keyword_4 Medical_Keyword_5 Medical_Keyword_6
## 0 0 0
## Medical_Keyword_7 Medical_Keyword_8 Medical_Keyword_9
## 0 0 0
## Medical_Keyword_10 Medical_Keyword_11 Medical_Keyword_12
## 0 0 0
## Medical_Keyword_13 Medical_Keyword_14 Medical_Keyword_15
## 0 0 0
## Medical_Keyword_16 Medical_Keyword_17 Medical_Keyword_18
## 0 0 0
## Medical_Keyword_19 Medical_Keyword_20 Medical_Keyword_21
## 0 0 0
## Medical_Keyword_22 Medical_Keyword_23 Medical_Keyword_24
## 0 0 0
## Medical_Keyword_25 Medical_Keyword_26 Medical_Keyword_27
## 0 0 0
## Medical_Keyword_28 Medical_Keyword_29 Medical_Keyword_30
## 0 0 0
## Medical_Keyword_31 Medical_Keyword_32 Medical_Keyword_33
## 0 0 0
## Medical_Keyword_34 Medical_Keyword_35 Medical_Keyword_36
## 0 0 0
## Medical_Keyword_37 Medical_Keyword_38 Medical_Keyword_39
## 0 0 0
## Medical_Keyword_40 Medical_Keyword_41 Medical_Keyword_42
## 0 0 0
## Medical_Keyword_43 Medical_Keyword_44 Medical_Keyword_45
## 0 0 0
## Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48
## 0 0 0
# Fill the missing Employment_Info_1 values with the column median.
# Rows are selected with is.na() on that column only: complete.cases() over
# the whole frame would also overwrite valid Employment_Info_1 values in any
# row that is incomplete because of a different column.
df_test$Employment_Info_1[is.na(df_test$Employment_Info_1)] <-
  median(df_test$Employment_Info_1, na.rm = TRUE)
Next, we use the decision tree to predict the Response values for the test set
# Predict one class label per test row. Despite its name, `probs` holds
# predicted classes (type = "class"), not probabilities.
probs <- predict(object = model.tree, newdata = df_test, type = "class")
Then, create dataframe with predictions for export and export the file for upload
# Assemble the submission table (Id plus predicted class) and write it to
# disk without row names.
predictions <- data.frame(Id = df_test$Id)
predictions$Response <- probs
write.csv(predictions, "myprediction_tree.csv", row.names = FALSE)
Build the model, based on the predictor variables that were found to be relevant in the exploratory data analysis
library(foreign)
library(nnet)
# Select the modelling columns by position. Per the column order after the
# drop step these are 12 (BMI), 17:23 (InsuredInfo_1 to InsuredInfo_7) and
# 32:115 (the Medical_History_* / Medical_Keyword_* columns plus Response).
# NOTE(review): this subsets `df`, not the NA-filtered `dfnm`; confirm that is
# intended (the only column with NAs left, Employment_Info_1, is not selected).
dfnm_multi <- df[c(12, 17:23, 32:115)]
# Use the first level of Response as the reference class. `ref` is spelled out
# in full instead of relying on partial argument matching (`re`).
dfnm_multi$Response <- relevel(dfnm_multi$Response, ref = 1)
# NOTE(review): the log below ends with "stopped after 100 iterations", i.e.
# the optimiser hit its iteration limit; consider raising `maxit`.
model.multinom <- multinom(Response ~ ., data = dfnm_multi)
## # weights: 744 (644 variable)
## initial value 123479.318186
## iter 10 value 108973.221407
## iter 20 value 100691.110691
## iter 30 value 94624.394589
## iter 40 value 93171.639891
## iter 50 value 92659.386672
## iter 60 value 91283.531698
## iter 70 value 89500.023199
## iter 80 value 87747.862564
## iter 90 value 87002.269314
## iter 100 value 86363.500196
## final value 86363.500196
## stopped after 100 iterations
Now, let us get and prepare the test set
# Load the test set again (empty strings become NA) and drop the same columns
# that were removed from the training data.
df_test_multi <- read.csv(
  file = "https://michaschweizer.github.io/datasets/test.csv",
  na.strings = ""
)
df_test_multi <- subset(
  df_test_multi,
  select = -c(
    Employment_Info_4, Employment_Info_6,
    Insurance_History_5,
    Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5,
    Medical_History_1, Medical_History_10, Medical_History_15,
    Medical_History_24, Medical_History_25, Medical_History_32
  )
)
# Number of missing values in each remaining test-set column.
vapply(df_test_multi, function(column) sum(is.na(column)), integer(1))
## Id Product_Info_1 Product_Info_2
## 0 0 0
## Product_Info_3 Product_Info_4 Product_Info_5
## 0 0 0
## Product_Info_6 Product_Info_7 Ins_Age
## 0 0 0
## Ht Wt BMI
## 0 0 0
## Employment_Info_1 Employment_Info_2 Employment_Info_3
## 3 0 0
## Employment_Info_5 InsuredInfo_1 InsuredInfo_2
## 0 0 0
## InsuredInfo_3 InsuredInfo_4 InsuredInfo_5
## 0 0 0
## InsuredInfo_6 InsuredInfo_7 Insurance_History_1
## 0 0 0
## Insurance_History_2 Insurance_History_3 Insurance_History_4
## 0 0 0
## Insurance_History_7 Insurance_History_8 Insurance_History_9
## 0 0 0
## Family_Hist_1 Medical_History_2 Medical_History_3
## 0 0 0
## Medical_History_4 Medical_History_5 Medical_History_6
## 0 0 0
## Medical_History_7 Medical_History_8 Medical_History_9
## 0 0 0
## Medical_History_11 Medical_History_12 Medical_History_13
## 0 0 0
## Medical_History_14 Medical_History_16 Medical_History_17
## 0 0 0
## Medical_History_18 Medical_History_19 Medical_History_20
## 0 0 0
## Medical_History_21 Medical_History_22 Medical_History_23
## 0 0 0
## Medical_History_26 Medical_History_27 Medical_History_28
## 0 0 0
## Medical_History_29 Medical_History_30 Medical_History_31
## 0 0 0
## Medical_History_33 Medical_History_34 Medical_History_35
## 0 0 0
## Medical_History_36 Medical_History_37 Medical_History_38
## 0 0 0
## Medical_History_39 Medical_History_40 Medical_History_41
## 0 0 0
## Medical_Keyword_1 Medical_Keyword_2 Medical_Keyword_3
## 0 0 0
## Medical_Keyword_4 Medical_Keyword_5 Medical_Keyword_6
## 0 0 0
## Medical_Keyword_7 Medical_Keyword_8 Medical_Keyword_9
## 0 0 0
## Medical_Keyword_10 Medical_Keyword_11 Medical_Keyword_12
## 0 0 0
## Medical_Keyword_13 Medical_Keyword_14 Medical_Keyword_15
## 0 0 0
## Medical_Keyword_16 Medical_Keyword_17 Medical_Keyword_18
## 0 0 0
## Medical_Keyword_19 Medical_Keyword_20 Medical_Keyword_21
## 0 0 0
## Medical_Keyword_22 Medical_Keyword_23 Medical_Keyword_24
## 0 0 0
## Medical_Keyword_25 Medical_Keyword_26 Medical_Keyword_27
## 0 0 0
## Medical_Keyword_28 Medical_Keyword_29 Medical_Keyword_30
## 0 0 0
## Medical_Keyword_31 Medical_Keyword_32 Medical_Keyword_33
## 0 0 0
## Medical_Keyword_34 Medical_Keyword_35 Medical_Keyword_36
## 0 0 0
## Medical_Keyword_37 Medical_Keyword_38 Medical_Keyword_39
## 0 0 0
## Medical_Keyword_40 Medical_Keyword_41 Medical_Keyword_42
## 0 0 0
## Medical_Keyword_43 Medical_Keyword_44 Medical_Keyword_45
## 0 0 0
## Medical_Keyword_46 Medical_Keyword_47 Medical_Keyword_48
## 0 0 0
# Fill the missing Employment_Info_1 values with the column median.
# Rows are selected with is.na() on that column only: complete.cases() over
# the whole frame would also overwrite valid values in any row that is
# incomplete because of a different column.
df_test_multi$Employment_Info_1[is.na(df_test_multi$Employment_Info_1)] <-
  median(df_test_multi$Employment_Info_1, na.rm = TRUE)
# Keep the same predictor positions used for training; the test set has no
# Response column, so the range stops at 114 instead of 115.
df_test_multi <- df_test_multi[c(12, 17:23, 32:114)]
Next, we use the multinomial regression model to predict the Response values for the test set
# Predict one class label per test row with the multinomial model.
model.multinom.fitted <- predict(
  object = model.multinom,
  newdata = df_test_multi,
  type = "class"
)
Then, create dataframe with predictions for export and export the file for upload
# Assemble the submission table (Id plus predicted class) and write it to
# disk without row names. Id comes from df_test because df_test_multi no
# longer carries an Id column after the positional subset; both frames are
# read from the same file and no rows are removed, so the row order matches.
predictions <- data.frame(Id = df_test$Id)
predictions$Response <- model.multinom.fitted
write.csv(predictions, "myprediction_multinom.csv", row.names = FALSE)