require(RCurl); require(caret); library(glmnet)
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 3.1.2
## Loading required package: bitops
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
## Loading required package: ggplot2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.1.2
## Loaded glmnet 1.9-8
# DATA INGESTION
# Download the zipped diabetes dataset from the UCI archive, unpack it into the
# working directory and load diabetic_data.csv into `diabetes`.
dataUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"
zipPath <- "dataset_diabetes.zip"
# TLS peer verification is deliberately left at libcurl's default (enabled):
# turning it off would let a man-in-the-middle substitute the archive.
binData <- getBinaryURL(dataUrl)
conObj <- file(zipPath, open = "wb") # writing binary file
# transfer binary data; the connection is closed even if the write fails
invisible(tryCatch(writeBin(binData, conObj), finally = close(conObj)))
files <- unzip(zipPath)
# Select the data file by name instead of relying on the order in which
# unzip() lists the archive members.
csvPath <- grep("diabetic_data\\.csv$", files, value = TRUE)
if (length(csvPath) != 1L) {
  stop("Expected exactly one diabetic_data.csv inside ", zipPath, call. = FALSE)
}
diabetes <- read.csv(csvPath, stringsAsFactors = FALSE)
names(diabetes)
## [1] "encounter_id" "patient_nbr"
## [3] "race" "gender"
## [5] "age" "weight"
## [7] "admission_type_id" "discharge_disposition_id"
## [9] "admission_source_id" "time_in_hospital"
## [11] "payer_code" "medical_specialty"
## [13] "num_lab_procedures" "num_procedures"
## [15] "num_medications" "number_outpatient"
## [17] "number_emergency" "number_inpatient"
## [19] "diag_1" "diag_2"
## [21] "diag_3" "number_diagnoses"
## [23] "max_glu_serum" "A1Cresult"
## [25] "metformin" "repaglinide"
## [27] "nateglinide" "chlorpropamide"
## [29] "glimepiride" "acetohexamide"
## [31] "glipizide" "glyburide"
## [33] "tolbutamide" "pioglitazone"
## [35] "rosiglitazone" "acarbose"
## [37] "miglitol" "troglitazone"
## [39] "tolazamide" "examide"
## [41] "citoglipton" "insulin"
## [43] "glyburide.metformin" "glipizide.metformin"
## [45] "glimepiride.pioglitazone" "metformin.rosiglitazone"
## [47] "metformin.pioglitazone" "change"
## [49] "diabetesMed" "readmitted"
# data cleansing
# 1: remove the identifier columns, which carry no predictive information
diabetes <- subset(diabetes, select = -c(encounter_id, patient_nbr))
# 2: the raw file marks missing values with "?"; recode them as NA
diabetes[diabetes == "?"] <- NA
# 3: remove single-level variables (NA is counted as a level via exclude = NULL)
hasVariation <- vapply(
  diabetes,
  function(x) length(levels(factor(x, exclude = NULL))) > 1,
  logical(1)
)
diabetes <- diabetes[hasVariation]
# 4: binary representation of readmitted within 30 days (1 = "<30", 0 = otherwise)
diabetes$readmitted <- ifelse(diabetes$readmitted == "<30", 1, 0)
# diag_1, diag_2 and diag_3 are 3 fields with numerical representations of
# patient diagnoses. They are selected by name rather than by position so the
# count below stays correct if the column layout changes.
diagCols <- c("diag_1", "diag_2", "diag_3")
vapply(
  diagCols,
  function(x) length(unique(diabetes[[x]])),
  integer(1),
  USE.NAMES = FALSE
)
## [1] 717 749 790
# The diagnosis codes are labels, not magnitudes, so they must be treated as
# categorical values. Build a caret dummy-variable encoder over every column
# and apply it to the cleaned data to obtain a numeric design table.
dmy <- dummyVars(" ~ .", data = diabetes)
diabetes_dummy <- data.frame(predict(dmy, newdata = diabetes))
# Report rows x columns of the encoded table
dim(diabetes_dummy)
## [1] 101766 2462
# Replace every remaining NA cell in the encoded table with 0
diabetes_dummy[is.na(diabetes_dummy)] <- 0
# Predictors (Xi): every column except the outcome
predictorNames <- setdiff(names(diabetes_dummy), "readmitted")
# Partition the rows: 60% for training, the remainder for testing.
# The seed is fixed so the partition is reproducible.
# NOTE(review): readmitted is numeric 0/1 at this point; caret documents
# percentile-based grouping for numeric outcomes, so this split may not be
# stratified by class - confirm, and consider passing a factor if it matters.
set.seed(111)
inTrain <- createDataPartition(diabetes_dummy$readmitted, p = 0.6, list = FALSE)
objTrain <- diabetes_dummy[inTrain, ]
objTest <- diabetes_dummy[-inTrain, ]
# glmnet - Lasso and elastic-net regularized generalized linear models.
# glmnet accepts sparse matrices, so the predictors are converted to a sparse
# design matrix before fitting.
trainMatrix <- sparse.model.matrix(~ ., data = objTrain[, predictorNames])
# Cross-validated binomial fit, with lambda selected on AUC
glmnetModel <- cv.glmnet(
  x = trainMatrix,
  y = objTrain$readmitted,
  family = "binomial",
  type.measure = "auc"
)
# Score the held-out rows at the lambda with the best cross-validated measure
testMatrix <- sparse.model.matrix(~ ., data = objTest[, predictorNames])
glmnetPredict <- predict(glmnetModel, newx = testMatrix, s = "lambda.min")
# Area under curve: score on the test set
AUC <- auc(objTest$readmitted, glmnetPredict)
AUC
## [1] 0.6536718