A couple of rules that will save you much debugging time:
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00319/MHEALTHDATASET.zip",
destfile = "mHealth.zip")
unzip("mHealth.zip")
mHealth_subject1 <- read.delim("mHealth/mHealth_subject1.log", header = FALSE)
# read labels
mLabels <- readLines("mHealth/README.txt")[58:81]
# apply labels
mHealth_subject1 %<>% purrr::map2(.x = ., .y = mLabels, .f = function(.x, .y) {
attr(.x, "label") <- str_extract(.y, "(?<=Column\\s\\d{1,2}\\:\\s).*")
return(.x)
})
mHealth_subject1 %<>% as.data.frame()
# ----------------------- Sat Nov 10 21:14:08 2018 ------------------------# Add
# levels
Activities <- readLines("mHealth/README.txt")[37:48] %>% str_extract("(?<=\\:\\s).*")
names(Activities) <- readLines("mHealth/README.txt")[37:48] %>% str_extract("L\\d{1,}(?=\\:)")
mTest <- mHealth_subject1[not(mHealth_subject1$V24 == "0"), ]
unique(as.character(sort(mTest$V24)))
mTest$V24 %<>% factor(labels = gsub("^", "L", unique(as.character(sort(mTest$V24)))))
mTest$V24 %>% summary
mTrain <- mTest[caret::createDataPartition(mTest$V24, times = 1, p = 0.8) %>% .[[1]] %T>%
assign("trIndex", ., envir = .GlobalEnv), ] # Create a training set with equal lengths of each of the response variables categories
mTest <- mTest[trIndex %in% rownames(mTest) %>% not %>% which, ] # Create a test set that is not those rows
#https://youtu.be/aircAruvnKk?t=163 to 4:56 Overview
rv <- "V24" #enter response variable name
dv <- NULL # IF not using all vars, enter Dependent Variables as paste(c("var1","var2"),collapse="+")
data <- mTrain # Enter Data Variable name here
mod.packages <- c("caret","doParallel","iterators","parallel","foreach","caretEnsemble","nnet") # [package dependencies]
startPkgs <- Vectorize(FUN=function(pkg){suppressPackageStartupMessages(library(pkg,character.only = T))}) # Vectorized package init
invisible(startPkgs(mod.packages)) #suppress console output for packages
sysNN1.mod <- system.time({ #Time the model creation
data.train <- caret::createDataPartition(data[[rv]], times = 2, p=.85) # create .85 of each grouping category for training purposes twice
data.train <- caret::trainControl(method="repeatedcv", # repeated cross validation
index=data.train, # Index is training set
number=5, # number of iterations
repeats=1, # number of repeats
search = "grid", # Move through the combinations of parameters incrementally
allowParallel = T, # allow parallel processing
summaryFunction=caret::defaultSummary, #Use this summary function
classProbs=T, # retain raw probabilities of predicted vlues
savePredictions = "final", # Save final predictions for each iteration
returnResamp = "final", # reurn summary metrics from each iteration
returnData = F) # Do not return original data (recommended with large datasets)
if(!is.null(dv)){form <- as.formula(paste0(rv," ~ ",paste(dv,collapse="+")))}else {form <- as.formula(paste0(rv," ~ ."))} # create formula from dependent variables if provided, ortherwise use all
cl <- makeCluster(detectCores()-1) # Allocate the number of parallel workers (here all but one CPU)
registerDoParallel(cl) # register these CPUs
getDoParWorkers() # Make sure it worked
nnetmod <- tryCatch(
{
nnetmod <- caretEnsemble::caretList(form = form, #enter model name here
data = data,
trControl = data.train,
metric="Accuracy",
methodList = c("avNNet","nnet","pcaNNet"), #[method list]
tuneList = list("avNNet"=caretEnsemble::caretModelSpec( #[method specs]
method="avNNet", tuneGrid = tuneGrids),"nnet"=caretEnsemble::caretModelSpec(
method="nnet", tuneGrid = tuneGrids),"pcaNNet"=caretEnsemble::caretModelSpec(
method="pcaNNet", tuneGrid = tuneGrids)))
},
error=function(cond) {
message("Here's the original error message:")
message(cond)
gmailr::use_secret_file("googleusercontent.com.json")
msg <- gmailr::mime(
To = "7818797492@vtext.com",
From = "spiritsynaesthetic@gmail.com",
Subject = "Notification from R",
body = paste("Ensemble Build Error"))
gmailr::send_message(msg)
return(NA)
},
warning=function(cond) {
message("Here's the original warning message:")
message(cond)
# Choose a return value in case of warning
},
finally={
# NOTE:
# Here goes everything that should be executed at the end,
# regardless of success or error.
# If you want more than one expression to be executed, then you
# need to wrap them in curly brackets ({...}); otherwise you could
# just have written 'finally=<expression>'
stopCluster(cl)
registerDoSEQ() # Stop the parallel processing return to sequential
}
)
})
unloadPkgs <- Vectorize(FUN=function(pkg){invisible(detach(paste0("package:",pkg),character.only = T))})
invisible(unloadPkgs(mod.packages)) # Unload packages
save(nnetmod,file = "nnet.Rdata") # save the file (recommended for long runtimes/large files)
#rm(nnetmod) RM from global environment if large file for smoother running
gmailr::use_secret_file("googleusercontent.com.json") #Send a text message indicating completion
msg <- gmailr::mime(
To = "7818797492@vtext.com",
From = "spiritsynaesthetic@gmail.com",
Subject = "Notification from R",
body = paste("NNet 1 Complete"))
gmailr::send_message(msg)
load("nnet.Rdata")
best.Tunes <- lapply(nnetmod, FUN = function(x) {
purrr::pluck(x, list("bestTune")) # Take out the best tuning parameters from each iteration/repetition
})
tuneGrids <- purrr::map2(.x = names(best.Tunes), .y = nnetmod, bT = best.Tunes, function(.x,
.y, bT) {
# Get the tuning parameters
varnms <- names(bT[[.x]])
# Sort the results by accuracy
.y[["results"]] %<>% arrange(Accuracy)
# Determine the two rows with the best results
bestRes <- .y[["results"]][{
nrow(.y[["results"]]) - 1
}:nrow(.y[["results"]]), varnms]
whole.num <- lapply(bestRes[, unlist(lapply(bestRes, is.numeric))], sfsmisc::is.whole) %>%
lapply(any) %>% unlist %>% which %>% names
# Create a tuning grid from those values
Params <- lapply(bestRes, FUN = function(l) {
if (is.numeric(l)) {
out <- quantile(l) # if it's numeric take the quantile
} else {
out <- unique(l) # if non numeric take the unique obs
}
return(out)
}) %>% expand.grid()
Params[, whole.num] %<>% lapply(round) # if it needs to be a whole number round it to be such
return(Params)
})
tuneGrids %<>% split(rep(1:3, 2)) %>% lapply(FUN = function(l) {
out <- do.call("rbind", l)
out <- out[not(duplicated(out)), ]
}) # Create a tuning grid data frame and remove duplicates
names(tuneGrids) <- names(best.Tunes) %>% gsub("\\.\\d\\d?", "", .) %>% unique
# Remove name artifacts
rv <- "V24" #enter response variable name
iv <- NULL # IF not using all vars, enter independent Variables as paste(c("var1","var2"),collapse="+")
data <- mTrain # Enter Data Variable name here
mod.packages <- c("caret","doParallel","iterators","parallel","foreach","caretEnsemble","nnet") # [package dependencies]
startPkgs <- Vectorize(FUN=function(pkg){invisible(suppressPackageStartupMessages(library(pkg,character.only = T)))})
startPkgs(mod.packages)
registerDoParallel() # defaults to half of available
sysNN1.mod <- system.time({
data.train <- caret::createDataPartition(data[[rv]], times = 2, p=.85)
data.train <- caret::trainControl(method="repeatedcv",
index=data.train,
number=5,
repeats=0, # 0 repeats
search = "grid",
allowParallel = T,
summaryFunction=caret::defaultSummary,
classProbs=T,
savePredictions = "final",
returnResamp = "final",
returnData = F,
verboseIter = T # provide console output
)
if(!is.null(iv)){form <- as.formula(paste0(rv," ~ ",paste(iv,collapse="+")))}else {form <- as.formula(paste0(rv," ~ ."))}
nnetmod <- caretEnsemble::caretList(form = form,
data = data,
trControl = data.train,
metric="Accuracy",
methodList = c("avNNet","nnet","pcaNNet"), #[method list]
tuneList = list("avNNet"=caretEnsemble::caretModelSpec( #[method specs] use tunegrids
method="avNNet", tuneGrid = tuneGrids$avNNet),"nnet"=caretEnsemble::caretModelSpec(
method="nnet", tuneGrid = tuneGrids$nnet),"pcaNNet"=caretEnsemble::caretModelSpec(
method="pcaNNet", tuneGrid = tuneGrids$pcaNNet)))
})
registerDoSEQ()
unloadPkgs <- Vectorize(FUN=function(pkg){detach(pkg,character.only = T)})
unloadPkgs(mod.packages %>% gsub("^","package:",.))
save(nnetmod,sysNN1.mod,file = "nnetfinal.Rdata")
#rm(nnetmod)
gmailr::use_secret_file("googleusercontent.com.json")
msg <- gmailr::mime(
To = "7818797492@vtext.com",
From = "spiritsynaesthetic@gmail.com",
Subject = "Notification from R",
body = paste("NNet 1 Complete"))
gmailr::send_message(msg)
load("~/Northeastern/Fall 2018/HINF6400/Incentivized Insurance/cM.Rdata")
cM %>% print
## $positive
## NULL
##
## $table
## Reference
## Prediction L1 L2 L3 L4 L5 L6 L7 L8 L9 L10 L11 L12
## L1 3072 0 0 0 0 2 0 0 0 0 0 0
## L2 0 0 0 0 0 0 0 0 0 0 0 0
## L3 0 83 1438 0 0 0 0 0 0 5 1 0
## L4 0 2173 0 1005 0 39 2 6 1 12 1 0
## L5 0 0 0 1 0 50 6 39 2 2 6 0
## L6 0 0 0 0 0 0 0 0 0 0 0 0
## L7 0 0 0 5 0 11 716 65 0 0 0 0
## L8 0 0 0 1 0 2361 87 3261 0 7 1 0
## L9 0 0 0 0 0 0 0 0 710 4 0 0
## L10 0 0 0 0 0 0 0 0 0 2653 5 0
## L11 0 0 0 5 0 0 0 6 0 126 170 0
## L12 0 0 0 7 0 0 0 2 2 160 5 0
##
## $overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7111269 0.6682215 0.7045026 0.7176841 0.1844835
## AccuracyPValue McnemarPValue
## 0.0000000 NaN
##
## $byClass
## Sensitivity Specificity Pos Pred Value Neg Pred Value Precision
## Class: L1 1.0000000 0.9998688 0.9993494 1.0000000 0.9993494
## Class: L2 0.0000000 1.0000000 NaN 0.8768290 NA
## Class: L3 1.0000000 0.9947269 0.9417158 1.0000000 0.9417158
## Class: L4 0.9814453 0.8708073 0.3102810 0.9987398 0.3102810
## Class: L5 NA 0.9942127 NA NA 0.0000000
## Class: L6 0.0000000 1.0000000 NaN 0.8655274 NA
## Class: L7 0.8828607 0.9953728 0.8983689 0.9945773 0.8983689
## Class: L8 0.9650784 0.8355091 0.5703043 0.9906334 0.5703043
## Class: L9 0.9930070 0.9997727 0.9943978 0.9997159 0.9943978
## Class: L10 0.8935669 0.9996742 0.9981189 0.9798186 0.9981189
## Class: L11 0.8994709 0.9924422 0.5537459 0.9989450 0.5537459
## Class: L12 NA 0.9903909 NA NA 0.0000000
## Recall F1 Prevalence Detection Rate
## Class: L1 1.0000000 0.9996746 0.16772221 0.167722210
## Class: L2 0.0000000 NA 0.12317100 0.000000000
## Class: L3 1.0000000 0.9699831 0.07851059 0.078510592
## Class: L4 0.9814453 0.4714989 0.05590740 0.054870059
## Class: L5 NA NA 0.00000000 0.000000000
## Class: L6 0.0000000 NA 0.13447259 0.000000000
## Class: L7 0.8828607 0.8905473 0.04427823 0.039091505
## Class: L8 0.9650784 0.7169397 0.18448351 0.178041057
## Class: L9 0.9930070 0.9937019 0.03903691 0.038763922
## Class: L10 0.8935669 0.9429536 0.16209871 0.144846036
## Class: L11 0.8994709 0.6854839 0.01031885 0.009281503
## Class: L12 NA NA 0.00000000 0.000000000
## Detection Prevalence Balanced Accuracy
## Class: L1 0.167831404 0.9999344
## Class: L2 0.000000000 0.5000000
## Class: L3 0.083369731 0.9973634
## Class: L4 0.176839921 0.9261263
## Class: L5 0.005787290 NA
## Class: L6 0.000000000 0.5000000
## Class: L7 0.043513868 0.9391167
## Class: L8 0.312186067 0.9002938
## Class: L9 0.038982311 0.9963899
## Class: L10 0.145119022 0.9466205
## Class: L11 0.016761302 0.9459566
## Class: L12 0.009609085 NA
##
## $mode
## [1] "sens_spec"
##
## $dots
## list()
##
## attr(,"class")
## [1] "confusionMatrix"
cM[["table"]] %>% kableExtra::kable("html", booktabs = T) %>% kableExtra::kable_styling(position = "center")
L1 | L2 | L3 | L4 | L5 | L6 | L7 | L8 | L9 | L10 | L11 | L12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
L1 | 3072 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
L2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
L3 | 0 | 83 | 1438 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 1 | 0 |
L4 | 0 | 2173 | 0 | 1005 | 0 | 39 | 2 | 6 | 1 | 12 | 1 | 0 |
L5 | 0 | 0 | 0 | 1 | 0 | 50 | 6 | 39 | 2 | 2 | 6 | 0 |
L6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
L7 | 0 | 0 | 0 | 5 | 0 | 11 | 716 | 65 | 0 | 0 | 0 | 0 |
L8 | 0 | 0 | 0 | 1 | 0 | 2361 | 87 | 3261 | 0 | 7 | 1 | 0 |
L9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 710 | 4 | 0 | 0 |
L10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2653 | 5 | 0 |
L11 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 6 | 0 | 126 | 170 | 0 |
L12 | 0 | 0 | 0 | 7 | 0 | 0 | 0 | 2 | 2 | 160 | 5 | 0 |
cM[["overall"]] %>% kableExtra::kable("html", booktabs = T) %>% kableExtra::kable_styling(position = "center")
x | |
---|---|
Accuracy | 0.7111269 |
Kappa | 0.6682215 |
AccuracyLower | 0.7045026 |
AccuracyUpper | 0.7176841 |
AccuracyNull | 0.1844835 |
AccuracyPValue | 0.0000000 |
McnemarPValue | NaN |