GOAL : To predict how likely individuals are to receive their H1N1 and seasonal flu vaccines. Specifically, you’ll be predicting two probabilities: one for h1n1_vaccine and one for seasonal_vaccine.
## Loading required package: lattice
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'mldr'
## The following objects are masked from 'package:caret':
##
## precision, recall
## Loading required package: parallel
## Loading required package: ROCR
##
## Attaching package: 'utiml'
## The following object is masked from 'package:caret':
##
## lift
data <- read.csv2("/Users/aashaysharma/Desktop/DS/Flu Shot Prediction/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv", sep = ",")
labels <- read.csv2("/Users/aashaysharma/Desktop/DS/Flu Shot Prediction/Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv", sep = ",")
new_data <- merge(labels, data)
summary(data)
## respondent_id h1n1_concern h1n1_knowledge behavioral_antiviral_meds
## Min. : 0 Min. :0.000 Min. :0.000 Min. :0.00000
## 1st Qu.: 6676 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.00000
## Median :13353 Median :2.000 Median :1.000 Median :0.00000
## Mean :13353 Mean :1.618 Mean :1.263 Mean :0.04884
## 3rd Qu.:20030 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.00000
## Max. :26706 Max. :3.000 Max. :2.000 Max. :1.00000
## NA's :92 NA's :116 NA's :71
## behavioral_avoidance behavioral_face_mask behavioral_wash_hands
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.0000
## Median :1.0000 Median :0.00000 Median :1.0000
## Mean :0.7256 Mean :0.06898 Mean :0.8256
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## NA's :208 NA's :19 NA's :42
## behavioral_large_gatherings behavioral_outside_home behavioral_touch_face
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.3586 Mean :0.3373 Mean :0.6773
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :87 NA's :82 NA's :128
## doctor_recc_h1n1 doctor_recc_seasonal chronic_med_condition
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2203 Mean :0.3297 Mean :0.2833
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :2160 NA's :2160 NA's :971
## child_under_6_months health_worker health_insurance
## Min. :0.0000 Min. :0.0000 Min. :0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.00
## Median :0.0000 Median :0.0000 Median :1.00
## Mean :0.0826 Mean :0.1119 Mean :0.88
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.00
## Max. :1.0000 Max. :1.0000 Max. :1.00
## NA's :820 NA's :804 NA's :12274
## opinion_h1n1_vacc_effective opinion_h1n1_risk opinion_h1n1_sick_from_vacc
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000
## Median :4.000 Median :2.000 Median :2.000
## Mean :3.851 Mean :2.343 Mean :2.358
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :391 NA's :388 NA's :395
## opinion_seas_vacc_effective opinion_seas_risk opinion_seas_sick_from_vacc
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:1.000
## Median :4.000 Median :2.000 Median :2.000
## Mean :4.026 Mean :2.719 Mean :2.118
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :462 NA's :514 NA's :537
## age_group education race sex
## Length:26707 Length:26707 Length:26707 Length:26707
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## income_poverty marital_status rent_or_own employment_status
## Length:26707 Length:26707 Length:26707 Length:26707
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## hhs_geo_region census_msa household_adults household_children
## Length:26707 Length:26707 Min. :0.0000 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Mode :character Mode :character Median :1.0000 Median :0.0000
## Mean :0.8865 Mean :0.5346
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :3.0000 Max. :3.0000
## NA's :249 NA's :249
## employment_industry employment_occupation
## Length:26707 Length:26707
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
NA_percent <- apply(data, 2, function(col)(sum(is.na(col))/26707)*100)
sort(NA_percent, decreasing = TRUE)
## health_insurance doctor_recc_h1n1
## 45.9579885 8.0877673
## doctor_recc_seasonal chronic_med_condition
## 8.0877673 3.6357509
## child_under_6_months health_worker
## 3.0703561 3.0104467
## opinion_seas_sick_from_vacc opinion_seas_risk
## 2.0107088 1.9245891
## opinion_seas_vacc_effective opinion_h1n1_sick_from_vacc
## 1.7298836 1.4790130
## opinion_h1n1_vacc_effective opinion_h1n1_risk
## 1.4640356 1.4528026
## household_adults household_children
## 0.9323398 0.9323398
## behavioral_avoidance behavioral_touch_face
## 0.7788220 0.4792751
## h1n1_knowledge h1n1_concern
## 0.4343431 0.3444790
## behavioral_large_gatherings behavioral_outside_home
## 0.3257573 0.3070356
## behavioral_antiviral_meds behavioral_wash_hands
## 0.2658479 0.1572621
## behavioral_face_mask respondent_id
## 0.0711424 0.0000000
## age_group education
## 0.0000000 0.0000000
## race sex
## 0.0000000 0.0000000
## income_poverty marital_status
## 0.0000000 0.0000000
## rent_or_own employment_status
## 0.0000000 0.0000000
## hhs_geo_region census_msa
## 0.0000000 0.0000000
## employment_industry employment_occupation
## 0.0000000 0.0000000
I personally think 25% should be the threshold for NA frequency in the column to decide wether to consider the column for analysis or not.
Sometimes columns with greater NA percentage have more semantic meaning and thus they cannot be ignored.
Here we can see health_insurance column with approx 46 % NA values and thus it cannot be used for analysis as nearly half of the data is missing.
data2 <- new_data[-c(18)]
data2 <- na.omit(data2)
Here are multiple variables which shows behavioral awareness or habits of people which can determine their likely hood of vaccination. So instead of treating all the behavioral variables separately we can combine them to a precaution score.
data2$precaution_score <- (as.numeric(data2$behavioral_antiviral_meds)) + (as.numeric(data2$behavioral_avoidance)) + (as.numeric(data2$behavioral_face_mask)) + (as.numeric(data2$behavioral_large_gatherings)) + (as.numeric(data2$behavioral_outside_home)) + (as.numeric(data2$behavioral_touch_face)) + (as.numeric(data2$behavioral_wash_hands))
a <- ggplot(data2, aes(precaution_score, fill = as.factor(h1n1_vaccine)))
a + geom_bar(position = "fill")
b <- ggplot(data2, aes(precaution_score, fill = as.factor(seasonal_vaccine)))
b + geom_bar(position = "fill") + scale_fill_manual(values = c("green", "blue"))
A person’s opinion about how effective is the vaccine can determine their chances of getting vaccinated themselves.
1 = Not at all effective; 2 = Not very effective; 3 = Don’t know; 4 = Somewhat effective; 5 = Very effective.
c <- ggplot(data2, aes(opinion_h1n1_vacc_effective, fill = as.factor(h1n1_vaccine)))
c + geom_bar(position = "dodge")
d <- ggplot(data2, aes(opinion_seas_vacc_effective, fill = as.factor(seasonal_vaccine)))
d + geom_bar(position = "dodge") + scale_fill_manual(values = c("green", "blue"))
Both the plots shows the trend.
1 = Very Low; 2 = Somewhat low; 3 = Don’t know; 4 = Somewhat high; 5 = Very high.
g <- ggplot(data2, aes(opinion_h1n1_risk, fill = as.factor(h1n1_vaccine)))
g + geom_bar(position = "fill")
h <- ggplot(data2, aes(opinion_seas_risk, fill = as.factor(seasonal_vaccine)))
h + geom_bar(position = "fill") + scale_fill_manual(values = c("green", "blue"))
People that fill that they are at high risk of getting h1n1 or seasonal flu without vaccination are more likely to get vaccinated.
1 = Not at all worried; 2 = Not very worried; 3 = Don’t know; 4 = Somewhat worried; 5 = Very worried.
i <- ggplot(data2, aes(opinion_h1n1_sick_from_vacc, fill = as.factor(h1n1_vaccine)))
i + geom_bar(position = "dodge")
j <- ggplot(data2, aes(opinion_seas_sick_from_vacc, fill = as.factor(seasonal_vaccine)))
j + geom_bar(position = "dodge") + scale_fill_manual(values = c("green", "blue"))
e <- ggplot(data2, aes(income_poverty, fill = as.factor(h1n1_vaccine)))
e + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
f <- ggplot(data2, aes(income_poverty, fill = as.factor(seasonal_vaccine)))
f + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + scale_fill_manual(values = c("green", "blue"))
The plot shows that people below poverty are less vaccinated compared relatively wealthier groups.
Other features are transformed as the below :
data2$race[data2$race == "White"] = 1
data2$race[data2$race == "Black"] = 2
data2$race[data2$race == "Other or Multiple"] = 3
data2$race[data2$race == "Hispanic"] = 4
data2$income_poverty[data2$income_poverty == "Below Poverty"] = 1
data2$income_poverty[data2$income_poverty == "<= $75,000, Above Poverty"] = 2
data2$income_poverty[data2$income_poverty == "> $75,000"] = 3
data2$income_poverty[data2$income_poverty == ""] = 4
data2$marital_status[data2$marital_status == "Not Married"] = 1
data2$marital_status[data2$marital_status == "Married"] = 2
data2$marital_status[data2$marital_status == ""] = 3
data2$rent_or_own[data2$rent_or_own == "Own"] = 1
data2$rent_or_own[data2$rent_or_own == "Rent"] = 2
data2$rent_or_own[data2$rent_or_own == ""] = 3
data2$employment_status[data2$employment_status == "Not in Labor Force"] = 3
data2$employment_status[data2$employment_status == "Employed"] = 1
data2$employment_status[data2$employment_status == "Unemployed"] = 2
data2$employment_status[data2$employment_status == ""] = 4
data2$household_adults <- as.factor(data2$household_adults)
data2$household_children <- as.factor(data2$household_children)
data2$census_msa[data2$census_msa == "Non-MSA"] = 1
data2$census_msa[data2$census_msa == "MSA, Not Principle City"] = 2
data2$census_msa[data2$census_msa == "MSA, Principle City"] = 3
cols <- colnames(data2[-c(1)])
data2[cols] <- lapply(data2[cols], factor)
data2$sex <- as.numeric(data2$sex)
data2_final <- data2[-c(6,7,8,9,10,11,12,25,32,36,37)]
data2_final_numeric <- data.frame(lapply(data2_final, as.numeric))
colnames(data2_final_numeric)
## [1] "respondent_id" "h1n1_vaccine"
## [3] "seasonal_vaccine" "h1n1_concern"
## [5] "h1n1_knowledge" "doctor_recc_h1n1"
## [7] "doctor_recc_seasonal" "chronic_med_condition"
## [9] "child_under_6_months" "health_worker"
## [11] "opinion_h1n1_vacc_effective" "opinion_h1n1_risk"
## [13] "opinion_h1n1_sick_from_vacc" "opinion_seas_vacc_effective"
## [15] "opinion_seas_risk" "opinion_seas_sick_from_vacc"
## [17] "age_group" "race"
## [19] "sex" "income_poverty"
## [21] "marital_status" "rent_or_own"
## [23] "employment_status" "census_msa"
## [25] "household_adults" "household_children"
## [27] "precaution_score"
I have transformed all the variables as numeric to make an mldr object (from mldr library) which will help in modeling and applying different multi label classification methods form the utiml package.
mldr_obj <- mldr_from_dataframe(data2_final_numeric[-c(1)], labelIndices = c(1, 2))
summary(mldr_obj)
## num.attributes num.instances num.inputs num.labels num.labelsets
## 1 26 22976 24 2 4
## num.single.labelsets max.frequency cardinality density meanIR scumble
## 1 0 11117 2.703778 1.351889 1.102979 -0.1432278
## scumble.cv tcs
## 1 -1.029687 5.257495
mldr_obj$labels
## index count freq IRLbl SCUMBLE SCUMBLE.CV
## h1n1_vaccine 1 28161 1.225670 1.205959 -0.1432278 -1.029687
## seasonal_vaccine 2 33961 1.478108 1.000000 -0.1432278 -1.029687
plot(mldr_obj, type = "LB")
In this particular competition the performance metric is ROC AUC score (macro averaging) so the model which yields the highest ROC AUC score will be selected.
utiml package provides all the necessary functions and strategies that are required for multi label classification such as binary relevance or classifier chains and much more.
set.seed(1231)
DS <- create_holdout_partition(mldr_obj, c(train=0.7, test=0.3))
rf_model <- br(DS$train, "RF")
pred <- predict(rf_model, DS$test)
res <- multilabel_evaluate(DS$test, pred, c("example-based", "macro-AUC"))
round(res, 4)
## accuracy F1 hamming-loss macro-AUC precision
## 1.0000 0.7984 0.0000 0.3403 0.8247
## recall subset-accuracy
## 0.7947 0.6636
This basic method provided an AUC score of 0.3504 on the test set submission made to driven data competition submission.
set.seed(1234)
xgb_model <- br(DS$train, "XGB")
pred <- predict(xgb_model, DS$test)
res <- multilabel_evaluate(DS$test, pred, c("example-based", "macro-AUC"))
round(res, 4)
## accuracy F1 hamming-loss macro-AUC precision
## 0.5639 0.5968 0.4361 0.8399 1.0000
## recall subset-accuracy
## 0.4352 0.0125
XGB model performed very well in the competition as it gave a score of 0.8240 on the test set submission made to driven data.