Remaining stuff to do
This notebook is not yet complete. Immediate next steps include:
- Filter for instances where TA8B is not missing
- Drop any variables which are all NA (or more than 10% NA)
- Do a quick search and remove any other variables which don’t make sense
- Impute medians for all remaining variables
- Figure out how to specify linear regression (lm) versus multinomial logit (mnl) in glmnet (perhaps it just assumes mnl if the y variable is a factor)
Install packages and load data
library(tidyverse)
library(mlbench)
library(caret)
library(rsample)
library(Metrics)
Attaching package: ‘Metrics’
The following objects are masked from ‘package:caret’:
precision, recall
library(sjmisc)
# Directory and file name of the Stata (.dta) file to load.
ihds_ind_dir <- "C:/Users/dougj/Documents/Data/IHDS/IHDS 2012/DS0001"
ind_file <- file.path(ihds_ind_dir, "36151-0001-Data.dta")
# read in just those variables that i need
# this is much faster than reading in everything and then selecting
# read_dta() comes from haven, which library(tidyverse) does not attach,
# so it is called with an explicit namespace to work in a fresh session.
df <- haven::read_dta(
  ind_file,
  col_select = c(
    STATEID,
    starts_with("RO"),
    starts_with("CS"),
    starts_with("TA"),
    starts_with("ED")
  )
)
# Store the state identifier as a factor column named `state` and drop the
# original STATEID column.
df <- df %>%
  mutate(state = factor(STATEID)) %>%
  select(-STATEID)
# df <- read_dta(ind_file, col_select = c(STATEID, PSUID, URBAN2011, HHID, HHSPLITID, PERSONID, IDPSU, WT, RO3, RO7, RO5, starts_with("CS"), starts_with("TA"), starts_with("ED")) )
Process dataset and split
# Keep only rows where the outcome TA8B is observed: rows with a missing
# outcome cannot be used to fit or to score the model.
# NOTE(review): predictors may still contain NAs, so this alone may not clear
# the "missing values in object" error raised by train() -- confirm.
model_df <- df %>% filter(!is.na(TA8B))

# Fix the RNG seed so the same rows land in train/test on every run.
set.seed(2012)

# 90/10 split into training and held-out test data.
data_split <- initial_split(model_df, prop = 0.9)
training_data <- training(data_split)
test_data <- testing(data_split)
Run multi logit and linear, both with LASSO
# Fit a glmnet model of TA8B on all other columns of the training data,
# choosing lambda by 10-fold cross-validation over a fixed grid.
# NOTE(review): caret decides between regression and classification from the
# class of TA8B; if a multinomial logit is intended, TA8B presumably has to be
# a factor -- confirm before interpreting the results.
model <- train(
  TA8B ~ .,
  training_data,
  method = "glmnet",
  tuneGrid = expand.grid(
    alpha = 1, # alpha = 1 selects the LASSO penalty in glmnet
    # 20 evenly spaced candidate lambdas; `length.out` is spelled out rather
    # than relying on partial matching of `length`.
    lambda = seq(0.0001, 1, length.out = 20)
  ),
  trControl = trainControl(
    method = "cv",
    number = 10
  )
)
Error in na.fail.default(list(TA8B = c(NA, NA, NA, NA, NA, NA, NA, NA, :
missing values in object
LS0tDQp0aXRsZTogIlByZWRpY3Rpb24gbW9kZWwgZm9yIElIRFMgQVNFUiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiMjIFN1bW1hcnkNClRoaXMgbm90ZWJvb2sgdXNlcyBtYWNoaW5lIGxlYXJuaW5nIHRvIGFzc2VzcyB3aGV0aGVyIGhvdXNlaG9sZCB2YXJpYWJsZXMgd291bGQgYmUgdXNlZnVsIGluIGltcHV0aW5nIGxlYXJuaW5nIG91dGNvbWVzIGRhdGEuICBJbiBwYXJ0aWN1bGFyLCBJIGF0dGVtcHQgdG8gYnVpbGQgYSBtb2RlbCB0byBwcmVkaWN0IEFTRVIgc2NvcmVzIGFuZCB0aGVuIHRlc3QgdGhlc2UgcHJlZGljdGlvbnMgb24gYSBoZWxkIG91dCBzYW1wbGUuDQoNCiMjIFdvcmtmbG93DQoxLiBTcGxpdCBkYXRhc2V0IGludG8gdHJhaW4gYW5kIHRlc3QgKDkwLzEwIHNwbGl0IHNpbmNlIHRoZSBkYXRhc2V0IGlzIGxhcmdlKQ0KMi4gUnVuIG11bHRpbm9taWFsIGxvZ2l0IGFuZCBsaW5lYXIgcmVncmVzc2lvbiB3aXRoIExBU1NPLiAodGhlcmUgaXMgbm8gb2J2aW91cyB3YXkgdG8gcnVuIG9yZGVyZWQgbG9naXQgd2l0aCBMQVNTTykNCjIuIFVzZSBjcm9zcyB2YWxpZGF0aW9uIHRvIHNlbGVjdCBiZXN0IGxldmVsIG9mIGxhbWJkYSBmb3IgZWFjaA0KMy4gUnVuIG11bHRpIGxvZ2l0IGFuZCBsaW5lYXIgcmVncmVzc2lvbiBvbiBmdWxsIHRyYWluaW5nIGRhdGFzZXQgZm9yIG9wdGltYWwgbGV2ZWwgb2YgbGFtYmRhLg0KNC4gR2V0IHRoZSBsaXN0IG9mIHZhcmlhYmxlcyBzZWxlY3RlZCBmcm9tIGVhY2ggbW9kZWwuDQo1LiBSdW4gb3JkZXJlZCBsb2dpdCB3aXRoIHRoZSB0d28gbGlzdHMgb2YgdmFyaWFibGVzLg0KNi4gT2YgdGhlIDMgbW9kZWxzIHJ1biBvbiB0aGUgZnVsbCB0cmFpbmluZyBkYXRhc2V0IChtdWx0aSBsb2dpdCwgbGluZWFyLCBhbmQgb3JkZXJlZCBsb2dpdCksIGdvIHdpdGggdGhlIG9uZSB3aXRoIHRoZSBoaWdoZXN0IGFjY3VyYWN5DQo3LiBUZXN0IA0KDQojIyBSZW1haW5pbmcgc3R1ZmYgdG8gZG8NClRoaXMgbm90ZWJvb2sgaXMgbm90IHlldCBjb21wbGV0ZS4gSW1tZWRpYXRlIG5leHQgc3RlcHMgaW5jbHVkZToNCg0KMS4gRmlsdGVyIGZvciBpbnN0YW5jZXMgd2hlcmUgVEE4QiBpcyBub3QgbWlzc2luZw0KMi4gRHJvcCBhbnkgdmFyaWFibGVzIHdoaWNoIGFyZSBhbGwgTkEgKG9yIG1vcmUgdGhhbiAxMCUgTkEpDQoyLiBEbyBhIHF1aWNrIHNlYXJjaCBhbmQgcmVtb3ZlIGFueSBvdGhlciB2YXJpYWJsZXMgd2hpY2ggZG9uJ3QgbWFrZSBzZW5zZQ0KMy4gSW1wdXRlIG1lZGlhbnMgZm9yIGFsbCByZW1haW5pbmcgdmFyaWFibGVzDQo0LiBGaWd1cmUgb3V0IGhvdyB0byBzcGVjaWZ5IGxtIGFuZCBtbmwgaW4gZ2xtbmV0IChwZXJoYXBzIGl0IGp1c3QgYXNzdW1lcyBtbmwgaWYgeSB2YXIgaXMgYSBmYWN0b3IpDQoNCg0KIyMjIEluc3RhbGwgcGFja2FnZXMgYW5kIGxvYWQgZGF0YQ0KDQpgYGB7ciBzZXR1cH0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShtbGJlbmNoKQ0KbGlicmFy
eShjYXJldCkNCmxpYnJhcnkocnNhbXBsZSkNCmxpYnJhcnkoTWV0cmljcykNCmxpYnJhcnkoc2ptaXNjKQ0KYGBgDQoNCmBgYHtyfQ0KaWhkc19pbmRfZGlyIDwtICJDOi9Vc2Vycy9kb3Vnai9Eb2N1bWVudHMvRGF0YS9JSERTL0lIRFMgMjAxMi9EUzAwMDEiDQppbmRfZmlsZSA8LSBmaWxlLnBhdGgoaWhkc19pbmRfZGlyLCAiMzYxNTEtMDAwMS1EYXRhLmR0YSIpDQojIHJlYWQgaW4ganVzdCB0aG9zZSB2YXJpYWJsZXMgdGhhdCBpIG5lZWQNCiMgdGhpcyBpcyBtdWNoIGZhc3RlciB0aGFuIHJlYWRpbmcgaW4gZXZlcnl0aGluZyBhbmQgdGhlbiBzZWxlY3RpbmcNCmRmIDwtIHJlYWRfZHRhKGluZF9maWxlLCBjb2xfc2VsZWN0ID0gYyhTVEFURUlELCBzdGFydHNfd2l0aCgiUk8iKSwgc3RhcnRzX3dpdGgoIkNTIiksIHN0YXJ0c193aXRoKCJUQSIpLCBzdGFydHNfd2l0aCgiRUQiKSkpDQpkZiA8LSBkZiAlPiUgbXV0YXRlKHN0YXRlID0gZmFjdG9yKFNUQVRFSUQpKSAlPiUgc2VsZWN0KC1TVEFURUlEKQ0KIyBkZiA8LSByZWFkX2R0YShpbmRfZmlsZSwgY29sX3NlbGVjdCA9IGMoU1RBVEVJRCwgUFNVSUQsIFVSQkFOMjAxMSwgSEhJRCwgSEhTUExJVElELCBQRVJTT05JRCwgSURQU1UsIFdULCBSTzMsIFJPNywgUk81LCBzdGFydHNfd2l0aCgiQ1MiKSwgc3RhcnRzX3dpdGgoIlRBIiksIHN0YXJ0c193aXRoKCJFRCIpKSApDQpgYGANCg0KIyMjIFByb2Nlc3MgZGF0YXNldCBhbmQgc3BsaXQNCg0KYGBge3J9DQoNCmRhdGFfc3BsaXQgPC0gaW5pdGlhbF9zcGxpdChkZiwgcHJvcCA9IC45KQ0KdHJhaW5pbmdfZGF0YSA8LSB0cmFpbmluZyhkYXRhX3NwbGl0KQ0KdGVzdF9kYXRhIDwtIHRlc3RpbmcoZGF0YV9zcGxpdCkNCmBgYA0KDQojIyMgUnVuIG11bHRpIGxvZ2l0IGFuZCBsaW5lYXIsIGJvdGggd2l0aCBMQVNTTw0KYGBge3J9DQptb2RlbCA8LSB0cmFpbigNCiAgVEE4QiB+LiwNCiAgdHJhaW5pbmdfZGF0YSwNCiAgbWV0aG9kID0gImdsbW5ldCIsDQogIHR1bmVHcmlkID0gZXhwYW5kLmdyaWQoDQogICAgYWxwaGEgPSAxLA0KICAgIGxhbWJkYSA9IHNlcSgwLjAwMDEsMSwgbGVuZ3RoID0gMjApDQogICksDQogIHRyQ29udHJvbCA9IHRyYWluQ29udHJvbCgNCiAgICBtZXRob2QgPSAiY3YiLA0KICAgIG51bWJlciA9IDEwDQogICkNCikNCmBgYA0K