require(RCurl); require(caret); library(glmnet)
## Loading required package: RCurl
## Warning: package 'RCurl' was built under R version 3.1.2
## Loading required package: bitops
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.1.2
## Loading required package: ggplot2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.1.2
## Loaded glmnet 1.9-8
# DATA INGESTION
# Download the UCI diabetes archive, unpack it into the working directory and
# load the encounter table into `diabetes`.
dataUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip"
zipPath <- "dataset_diabetes.zip"
# download.file() is used instead of a fetch with ssl.verifypeer=FALSE, so TLS
# certificate verification is not switched off. mode = "wb" writes the archive
# as binary without a hand-managed file connection that could be left open.
download.file(dataUrl, destfile = zipPath, mode = "wb")
files <- unzip(zipPath) # paths of the extracted files
# Select the data file by name rather than by its position in the archive.
csvPath <- files[basename(files) == "diabetic_data.csv"]
if (length(csvPath) != 1L) {
  stop("diabetic_data.csv not found in ", zipPath, call. = FALSE)
}
diabetes <- read.csv(csvPath, stringsAsFactors = FALSE)
names(diabetes)
##  [1] "encounter_id"             "patient_nbr"             
##  [3] "race"                     "gender"                  
##  [5] "age"                      "weight"                  
##  [7] "admission_type_id"        "discharge_disposition_id"
##  [9] "admission_source_id"      "time_in_hospital"        
## [11] "payer_code"               "medical_specialty"       
## [13] "num_lab_procedures"       "num_procedures"          
## [15] "num_medications"          "number_outpatient"       
## [17] "number_emergency"         "number_inpatient"        
## [19] "diag_1"                   "diag_2"                  
## [21] "diag_3"                   "number_diagnoses"        
## [23] "max_glu_serum"            "A1Cresult"               
## [25] "metformin"                "repaglinide"             
## [27] "nateglinide"              "chlorpropamide"          
## [29] "glimepiride"              "acetohexamide"           
## [31] "glipizide"                "glyburide"               
## [33] "tolbutamide"              "pioglitazone"            
## [35] "rosiglitazone"            "acarbose"                
## [37] "miglitol"                 "troglitazone"            
## [39] "tolazamide"               "examide"                 
## [41] "citoglipton"              "insulin"                 
## [43] "glyburide.metformin"      "glipizide.metformin"     
## [45] "glimepiride.pioglitazone" "metformin.rosiglitazone" 
## [47] "metformin.pioglitazone"   "change"                  
## [49] "diabetesMed"              "readmitted"
# Data cleansing
# 1: drop the encounter_id and patient_nbr columns (not used as predictors)
diabetes <- diabetes[, !(names(diabetes) %in% c("encounter_id", "patient_nbr")), drop = FALSE]
# 2: "?" marks a missing entry in the raw file; recode it as NA
diabetes[diabetes == "?"] <- NA
# 3: keep only columns with more than one distinct value (NA counts as a value)
diabetes <- Filter(function(x) nlevels(factor(x, exclude = NULL)) > 1, diabetes)
# 4: binary outcome - 1 when readmitted is "<30", 0 otherwise
diabetes$readmitted <- as.numeric(diabetes$readmitted == "<30")

# diag_1, diag_2 and diag_3 are three fields holding coded patient diagnoses.
# Count the distinct values in each one. The columns are selected by name so
# the result does not depend on column positions, which shift whenever the
# cleansing steps add or remove columns.
vapply(c("diag_1", "diag_2", "diag_3"),
       function(col) length(unique(diabetes[[col]])),
       integer(1), USE.NAMES = FALSE)
## [1] 717 749 790
# The diagnosis codes are labels, not quantities, so they must be handled as
# categorical values. Build a caret dummy-variable encoder over every column
# of `diabetes` and apply it to the same data to get the expanded table.
dmy <- dummyVars(" ~ .", data = diabetes)
# data.frame() (rather than as.data.frame()) is kept deliberately: it
# sanitises the generated column names.
diabetes_dummy <- data.frame(predict(dmy, newdata = diabetes))
dim(diabetes_dummy)
## [1] 101766   2462
# Any cell that is still NA after encoding is set to 0.
diabetes_dummy[is.na(diabetes_dummy)] <- 0

# Partition the rows: p = 0.6 goes to training, the remainder to testing.
# The fixed seed makes the partition (and the later random steps) repeatable.
set.seed(111)
inTrain <- createDataPartition(y = diabetes_dummy$readmitted, p = 0.6, list = FALSE)
objTrain <- diabetes_dummy[inTrain, ]
objTest <- diabetes_dummy[-inTrain, ]

# Predictors (Xi): every column except the outcome
predictorNames <- names(diabetes_dummy)[names(diabetes_dummy) != "readmitted"]
# glmnet: lasso / elastic-net regularized generalized linear models.
# The predictors are passed as sparse matrices, which glmnet accepts.
# cv.glmnet() is run on the training rows with a binomial family and AUC as
# the cross-validation measure.
glmnetModel <- cv.glmnet(
  x = sparse.model.matrix(~ ., data = objTrain[, predictorNames]),
  y = objTrain$readmitted,
  family = "binomial",
  type.measure = "auc"
)
# Predict for the held-out rows at the lambda stored as "lambda.min".
glmnetPredict <- predict(
  glmnetModel,
  newx = sparse.model.matrix(~ ., data = objTest[, predictorNames]),
  s = "lambda.min"
)
# Area under the curve on the test set.
# NOTE(review): auc() is not defined in this file; presumably it comes from one
# of the attached packages - confirm which one.
AUC <- auc(objTest$readmitted, glmnetPredict)
AUC
## [1] 0.6536718