# Pima Indians Diabetes Database
# Load the data ----
# PimaIndiansDiabetes ships with the mlbench package.
library(mlbench)
data("PimaIndiansDiabetes", package = "mlbench")

# First look: dimensions, levels of the response, leading rows.
dim(PimaIndiansDiabetes)
## [1] 768 9
levels(PimaIndiansDiabetes[["diabetes"]])
## [1] "neg" "pos"
head(PimaIndiansDiabetes, n = 6L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
# Show the column types: eight numeric predictors plus the two-level factor
# response `diabetes`.
# NOTE(review): the output below shows 0 values in pressure, triceps, insulin
# and mass; presumably these encode missing measurements -- confirm before
# relying on the scaled values.
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Standardise the predictors ----
# scale() centres and rescales every predictor column; the factor response
# `diabetes` is excluded from scaling and re-attached afterwards under the
# name `type`.
pima.scale <- as.data.frame(
  scale(PimaIndiansDiabetes[, names(PimaIndiansDiabetes) != "diabetes"])
)
pima.scale[["type"]] <- PimaIndiansDiabetes[["diabetes"]]
str(pima.scale)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 0.64 -0.844 1.233 -0.844 -1.141 ...
## $ glucose : num 0.848 -1.123 1.942 -0.998 0.504 ...
## $ pressure: num 0.15 -0.16 -0.264 -0.16 -1.504 ...
## $ triceps : num 0.907 0.531 -1.287 0.154 0.907 ...
## $ insulin : num -0.692 -0.692 -0.692 0.123 0.765 ...
## $ mass : num 0.204 -0.684 -1.103 -0.494 1.409 ...
## $ pedigree: num 0.468 -0.365 0.604 -0.92 5.481 ...
## $ age : num 1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
## $ type : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Persist the scaled data as CSV ----
# The file name is relative, so it lands in the current working directory
# (printed first for reference). Row names are omitted so the file contains
# only the nine data columns.
getwd()
## [1] "C:/Users/HP/Documents"
write.csv(
  pima.scale,
  file = "pimaScale.csv",
  row.names = FALSE
)
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
# Connect to the local H2O cluster and keep the connection handle (the output
# below shows an already-running instance on localhost:54321).
localH2O <- h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 11 minutes 27 seconds
## H2O cluster version: 3.10.5.3
## H2O cluster version age: 1 month and 6 days
## H2O cluster name: H2O_started_from_R_HP_ywg784
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.86 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 3.4.0 (2017-04-21)
# Print the details of the current H2O connection (IP, port, session id).
h2o.getConnection()
## IP Address: localhost
## Port : 54321
## Session ID: _sid_ac3b
## Key Count : 0
# Upload the scaled CSV into the H2O cluster ----
# h2o.uploadFile() pushes a client-side file to the cluster; the parsed frame
# is stored there under the key "pima.hex".
path <- "pimaScale.csv"
pima.hex <- h2o.uploadFile(path = path, destination_frame = "pima.hex")
# Progress output from the upload, kept as comments so the script still parses.
##
## |
## | | 0%
## |
## |=================================================================| 100%
# Confirm the upload produced an H2OFrame.
class(pima.hex)
## [1] "H2OFrame"
# str() on the H2OFrame lists its id, dimensions and per-column types, plus a
# small local preview of the data (see output below).
str(pima.hex)
## Class 'H2OFrame' <environment: 0x0000000007fd5680>
## - attr(*, "op")= chr "Parse"
## - attr(*, "id")= chr "pima.hex"
## - attr(*, "eval")= logi FALSE
## - attr(*, "nrow")= int 768
## - attr(*, "ncol")= int 9
## - attr(*, "types")=List of 9
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "real"
## ..$ : chr "enum"
## - attr(*, "data")='data.frame': 10 obs. of 9 variables:
## ..$ pregnant: num 0.64 -0.844 1.233 -0.844 -1.141 ...
## ..$ glucose : num 0.848 -1.123 1.942 -0.998 0.504 ...
## ..$ pressure: num 0.15 -0.16 -0.264 -0.16 -1.504 ...
## ..$ triceps : num 0.907 0.531 -1.287 0.154 0.907 ...
## ..$ insulin : num -0.692 -0.692 -0.692 0.123 0.765 ...
## ..$ mass : num 0.204 -0.684 -1.103 -0.494 1.409 ...
## ..$ pedigree: num 0.468 -0.365 0.604 -0.92 5.481 ...
## ..$ age : num 1.4251 -0.1905 -0.1055 -1.0409 -0.0205 ...
## ..$ type : Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2
# First rows of the frame as held by H2O; values should match pima.scale.
head(pima.hex)
## pregnant glucose pressure triceps insulin mass
## 1 0.6395305 0.8477713 0.1495433 0.9066791 -0.6924393 0.2038799
## 2 -0.8443348 -1.1226647 -0.1604412 0.5305558 -0.6924393 -0.6839762
## 3 1.2330766 1.9424580 -0.2637694 -1.2873733 -0.6924393 -1.1025370
## 4 -0.8443348 -0.9975577 -0.1604412 0.1544326 0.1232213 -0.4937213
## 5 -1.1411079 0.5037269 -1.5037073 0.9066791 0.7653372 1.4088275
## 6 0.3427574 -0.1530851 0.2528715 -1.2873733 -0.6924393 -0.8108128
## pedigree age type
## 1 0.4681869 1.42506672 pos
## 2 -0.3648230 -0.19054773 neg
## 3 0.6040037 -0.10551539 pos
## 4 -0.9201630 -1.04087112 neg
## 5 5.4813370 -0.02048305 pos
## 6 -0.8175458 -0.27558007 neg
# Per-column summary computed by H2O. Quantiles are approximated by default;
# the warning below names `exact_quantiles = TRUE` as the option for exact
# values.
summary(pima.hex)
## Warning in summary.H2OFrame(pima.hex): Approximated quantiles
## computed! If you are interested in exact quantiles, please pass the
## `exact_quantiles=TRUE` parameter.
## pregnant glucose pressure
## Min. :-1.141e+00 Min. :-3.781e+00 Min. :-3.570e+00
## 1st Qu.:-8.485e-01 1st Qu.:-6.878e-01 1st Qu.:-3.683e-01
## Median :-2.532e-01 Median :-1.277e-01 Median : 1.485e-01
## Mean :-8.559e-16 Mean :-1.331e-16 Mean : 1.115e-16
## 3rd Qu.: 6.348e-01 3rd Qu.: 6.021e-01 3rd Qu.: 5.582e-01
## Max. : 3.904e+00 Max. : 2.443e+00 Max. : 2.733e+00
## triceps insulin mass
## Min. :-1.287e+00 Min. :-6.924e-01 Min. :-4.058e+00
## 1st Qu.:-1.287e+00 1st Qu.:-6.924e-01 1st Qu.:-6.025e-01
## Median : 1.524e-01 Median :-4.318e-01 Median :-6.720e-03
## Mean :-1.567e-16 Mean : 5.584e-17 Mean :-4.787e-17
## 3rd Qu.: 7.172e-01 3rd Qu.: 4.105e-01 3rd Qu.: 5.805e-01
## Max. : 4.919e+00 Max. : 6.649e+00 Max. : 4.453e+00
## pedigree age type
## Min. :-1.189e+00 Min. :-1.041e+00 neg:500
## 1st Qu.:-6.940e-01 1st Qu.:-7.858e-01 pos:268
## Median :-3.017e-01 Median :-3.623e-01
## Mean : 3.524e-17 Mean :-2.259e-16
## 3rd Qu.: 4.600e-01 3rd Qu.: 6.581e-01
## Max. : 5.880e+00 Max. : 4.061e+00
# Train / test split ----
# Draw one seeded random number per row; rows with a draw of at most 0.7 form
# the training frame, the remaining rows the test frame. Each subset is
# registered in the cluster under an explicit key.
rand <- h2o.runif(pima.hex, seed = 123)
train <- h2o.assign(pima.hex[rand <= 0.7, ], key = "train")
test <- h2o.assign(pima.hex[rand > 0.7, ], key = "test")

# Class counts of the response (column 9, `type`) in each subset.
h2o.table(train[, 9])
## type Count
## 1 neg 341
## 2 pos 185
##
## [2 rows x 2 columns]
h2o.table(test[, 9])
## type Count
## 1 neg 159
## 2 pos 83
##
## [2 rows x 2 columns]
# Show the full argument list of h2o.deeplearning() together with its defaults.
args(h2o.deeplearning)
## function (x, y, training_frame, model_id = NULL, validation_frame = NULL,
## nfolds = 0, keep_cross_validation_predictions = FALSE, keep_cross_validation_fold_assignment = FALSE,
## fold_assignment = c("AUTO", "Random", "Modulo", "Stratified"),
## fold_column = NULL, ignore_const_cols = TRUE, score_each_iteration = FALSE,
## weights_column = NULL, offset_column = NULL, balance_classes = FALSE,
## class_sampling_factors = NULL, max_after_balance_size = 5,
## max_hit_ratio_k = 0, checkpoint = NULL, pretrained_autoencoder = NULL,
## overwrite_with_best_model = TRUE, use_all_factor_levels = TRUE,
## standardize = TRUE, activation = c("Tanh", "TanhWithDropout",
## "Rectifier", "RectifierWithDropout", "Maxout", "MaxoutWithDropout"),
## hidden = c(200, 200), epochs = 10, train_samples_per_iteration = -2,
## target_ratio_comm_to_comp = 0.05, seed = -1, adaptive_rate = TRUE,
## rho = 0.99, epsilon = 1e-08, rate = 0.005, rate_annealing = 1e-06,
## rate_decay = 1, momentum_start = 0, momentum_ramp = 1e+06,
## momentum_stable = 0, nesterov_accelerated_gradient = TRUE,
## input_dropout_ratio = 0, hidden_dropout_ratios = NULL, l1 = 0,
## l2 = 0, max_w2 = 3.4028235e+38, initial_weight_distribution = c("UniformAdaptive",
## "Uniform", "Normal"), initial_weight_scale = 1, initial_weights = NULL,
## initial_biases = NULL, loss = c("Automatic", "CrossEntropy",
## "Quadratic", "Huber", "Absolute", "Quantile"), distribution = c("AUTO",
## "bernoulli", "multinomial", "gaussian", "poisson", "gamma",
## "tweedie", "laplace", "quantile", "huber"), quantile_alpha = 0.5,
## tweedie_power = 1.5, huber_alpha = 0.9, score_interval = 5,
## score_training_samples = 10000, score_validation_samples = 0,
## score_duty_cycle = 0.1, classification_stop = 0, regression_stop = 1e-06,
## stopping_rounds = 5, stopping_metric = c("AUTO", "deviance",
## "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC", "lift_top_group",
## "misclassification", "mean_per_class_error"), stopping_tolerance = 0,
## max_runtime_secs = 0, score_validation_sampling = c("Uniform",
## "Stratified"), diagnostics = TRUE, fast_mode = TRUE,
## force_load_balance = TRUE, variable_importances = TRUE, replicate_training_data = TRUE,
## single_node_mode = FALSE, shuffle_training_data = FALSE,
## missing_values_handling = c("MeanImputation", "Skip"), quiet_mode = FALSE,
## autoencoder = FALSE, sparse = FALSE, col_major = FALSE, average_activation = 0,
## sparsity_beta = 0, max_categorical_features = 2147483647,
## reproducible = FALSE, export_weights_and_biases = FALSE,
## mini_batch_size = 1, categorical_encoding = c("AUTO", "Enum",
## "OneHotInternal", "OneHotExplicit", "Binary", "Eigen",
## "LabelEncoder", "SortByResponse", "EnumLimited"), elastic_averaging = FALSE,
## elastic_averaging_moving_rate = 0.9, elastic_averaging_regularization = 0.001)
## NULL
# Train a deep-learning classifier ----
# Predictors and response are given by name instead of by column position so
# the call keeps working if the column order of `train` changes.
# `reproducible = TRUE` is needed for `seed` to yield repeatable results: with
# the default (FALSE) H2O trains multi-threaded and runs differ despite the
# seed. Training becomes single-threaded, which is fine for a few hundred rows.
# NOTE(review): `test` serves as the validation frame here and is scored again
# with h2o.predict() afterwards. With overwrite_with_best_model = TRUE (the
# default) the model is selected on that same data, so the reported test
# metrics are presumably optimistic -- consider a separate validation split.
dlmodel <- h2o.deeplearning(
  x = setdiff(colnames(train), "type"),
  y = "type",
  training_frame = train,
  validation_frame = test,
  seed = 123,
  reproducible = TRUE,
  variable_importances = TRUE
)
# Progress output from training, kept as comments so the script still parses.
##
## |
## | | 0%
## |
## |============================================== | 70%
## |
## |=================================================================| 100%
# Print the fitted model: layer summary followed by the metrics on the
# training frame and on the validation frame.
dlmodel
## Model Details:
## ==============
##
## H2OBinomialModel: deeplearning
## Model ID: DeepLearning_model_R_1501960966284_7
## Status of Neuron Layers: predicting type, 2-class classification, bernoulli distribution, CrossEntropy loss, 42,402 weights/biases, 506.0 KB, 5,260 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms
## 1 1 8 Input 0.00 %
## 2 2 200 Rectifier 0.00 % 0.000000 0.000000 0.003426 0.001900
## 3 3 200 Rectifier 0.00 % 0.000000 0.000000 0.059771 0.143572
## 4 4 2 Softmax 0.000000 0.000000 0.001888 0.001131
## momentum mean_weight weight_rms mean_bias bias_rms
## 1
## 2 0.000000 -0.004742 0.092210 0.391177 0.052695
## 3 0.000000 -0.006082 0.070035 0.984823 0.012891
## 4 0.000000 -0.029659 0.378094 -0.000008 0.000365
##
##
## H2OBinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 0.1774354
## RMSE: 0.4212309
## LogLoss: 0.6171341
## Mean Per-Class Error: 0.1956804
## AUC: 0.8825632
## Gini: 0.7651264
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## neg pos Error Rate
## neg 261 80 0.234604 =80/341
## pos 29 156 0.156757 =29/185
## Totals 290 236 0.207224 =109/526
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.041928 0.741093 227
## 2 max f2 0.014685 0.829361 290
## 3 max f0point5 0.146641 0.763975 154
## 4 max accuracy 0.115444 0.823194 169
## 5 max precision 0.986608 1.000000 0
## 6 max recall 0.000438 1.000000 390
## 7 max specificity 0.986608 1.000000 0
## 8 max absolute_mcc 0.113692 0.606873 171
## 9 max min_per_class_accuracy 0.054234 0.797654 213
## 10 max mean_per_class_accuracy 0.041928 0.804320 227
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## H2OBinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
##
## MSE: 0.220593
## RMSE: 0.4696733
## LogLoss: 0.8821718
## Mean Per-Class Error: 0.2892703
## AUC: 0.7694173
## Gini: 0.5388346
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## neg pos Error Rate
## neg 90 69 0.433962 =69/159
## pos 12 71 0.144578 =12/83
## Totals 102 140 0.334711 =81/242
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.014637 0.636771 139
## 2 max f2 0.004220 0.772994 178
## 3 max f0point5 0.501319 0.646388 44
## 4 max accuracy 0.501319 0.752066 44
## 5 max precision 0.994303 1.000000 0
## 6 max recall 0.000051 1.000000 235
## 7 max specificity 0.994303 1.000000 0
## 8 max absolute_mcc 0.474066 0.415449 46
## 9 max min_per_class_accuracy 0.049525 0.698113 105
## 10 max mean_per_class_accuracy 0.021900 0.711525 130
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# List every parameter the model was built with, including the defaults that
# were not set explicitly in the training call.
dlmodel@allparameters
## $model_id
## [1] "DeepLearning_model_R_1501960966284_7"
##
## $training_frame
## [1] "train"
##
## $validation_frame
## [1] "test"
##
## $nfolds
## [1] 0
##
## $keep_cross_validation_predictions
## [1] FALSE
##
## $keep_cross_validation_fold_assignment
## [1] FALSE
##
## $fold_assignment
## [1] "AUTO"
##
## $ignore_const_cols
## [1] TRUE
##
## $score_each_iteration
## [1] FALSE
##
## $balance_classes
## [1] FALSE
##
## $max_after_balance_size
## [1] 5
##
## $max_confusion_matrix_size
## [1] 20
##
## $max_hit_ratio_k
## [1] 0
##
## $overwrite_with_best_model
## [1] TRUE
##
## $use_all_factor_levels
## [1] TRUE
##
## $standardize
## [1] TRUE
##
## $activation
## [1] "Rectifier"
##
## $hidden
## [1] 200 200
##
## $epochs
## [1] 10
##
## $train_samples_per_iteration
## [1] -2
##
## $target_ratio_comm_to_comp
## [1] 0.05
##
## $seed
## [1] 123
##
## $adaptive_rate
## [1] TRUE
##
## $rho
## [1] 0.99
##
## $epsilon
## [1] 1e-08
##
## $rate
## [1] 0.005
##
## $rate_annealing
## [1] 1e-06
##
## $rate_decay
## [1] 1
##
## $momentum_start
## [1] 0
##
## $momentum_ramp
## [1] 1e+06
##
## $momentum_stable
## [1] 0
##
## $nesterov_accelerated_gradient
## [1] TRUE
##
## $input_dropout_ratio
## [1] 0
##
## $l1
## [1] 0
##
## $l2
## [1] 0
##
## $max_w2
## [1] 3.402823e+38
##
## $initial_weight_distribution
## [1] "UniformAdaptive"
##
## $initial_weight_scale
## [1] 1
##
## $loss
## [1] "Automatic"
##
## $distribution
## [1] "AUTO"
##
## $quantile_alpha
## [1] 0.5
##
## $tweedie_power
## [1] 1.5
##
## $huber_alpha
## [1] 0.9
##
## $score_interval
## [1] 5
##
## $score_training_samples
## [1] 10000
##
## $score_validation_samples
## [1] 0
##
## $score_duty_cycle
## [1] 0.1
##
## $classification_stop
## [1] 0
##
## $regression_stop
## [1] 1e-06
##
## $stopping_rounds
## [1] 5
##
## $stopping_metric
## [1] "AUTO"
##
## $stopping_tolerance
## [1] 0
##
## $max_runtime_secs
## [1] 0
##
## $score_validation_sampling
## [1] "Uniform"
##
## $diagnostics
## [1] TRUE
##
## $fast_mode
## [1] TRUE
##
## $force_load_balance
## [1] TRUE
##
## $variable_importances
## [1] TRUE
##
## $replicate_training_data
## [1] TRUE
##
## $single_node_mode
## [1] FALSE
##
## $shuffle_training_data
## [1] FALSE
##
## $missing_values_handling
## [1] "MeanImputation"
##
## $quiet_mode
## [1] FALSE
##
## $autoencoder
## [1] FALSE
##
## $sparse
## [1] FALSE
##
## $col_major
## [1] FALSE
##
## $average_activation
## [1] 0
##
## $sparsity_beta
## [1] 0
##
## $max_categorical_features
## [1] 2147483647
##
## $reproducible
## [1] FALSE
##
## $export_weights_and_biases
## [1] FALSE
##
## $mini_batch_size
## [1] 1
##
## $categorical_encoding
## [1] "AUTO"
##
## $elastic_averaging
## [1] FALSE
##
## $elastic_averaging_moving_rate
## [1] 0.9
##
## $elastic_averaging_regularization
## [1] 0.001
##
## $x
## [1] "pregnant" "glucose" "pressure" "triceps" "insulin" "mass"
## [7] "pedigree" "age"
##
## $y
## [1] "type"
# Variable-importance table stored on the model (requested through
# variable_importances = TRUE in the training call).
dlmodel@model$variable_importances
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 glucose 1.000000 1.000000 0.131849
## 2 age 0.998628 0.998628 0.131669
## 3 pressure 0.970514 0.970514 0.127962
## 4 insulin 0.970048 0.970048 0.127900
## 5 mass 0.956604 0.956604 0.126128
## 6 pregnant 0.948564 0.948564 0.125068
## 7 pedigree 0.895442 0.895442 0.118064
## 8 triceps 0.844606 0.844606 0.111361
# Score the test frame ----
# The result holds the predicted label plus one probability column per class
# (`neg`, `pos`), one row per test observation.
# NOTE(review): the label is not simply the more probable class -- row 4 of the
# output is labelled "pos" with p(pos) = 0.034. Presumably H2O applies the
# F1-optimal threshold reported in the model metrics; confirm before using
# the `predict` column directly.
dlPredict <- h2o.predict(dlmodel, newdata = test)
# Progress output from scoring, kept as comments so the script still parses.
##
## |
## | | 0%
## |
## |=================================================================| 100%
dlPredict
## predict neg pos
## 1 neg 0.9998340 0.0001660238
## 2 pos 0.0626549 0.9373450986
## 3 neg 0.9998435 0.0001565213
## 4 pos 0.9659392 0.0340607757
## 5 pos 0.6810771 0.3189228946
## 6 neg 0.9941267 0.0058733392
##
## [242 rows x 3 columns]
# Pull the predictions out of the cluster into a local data.frame and show
# the first rows.
dlPred <- as.data.frame(dlPredict)
head(dlPred, n = 6L)
## predict neg pos
## 1 neg 0.9998340 0.0001660238
## 2 pos 0.0626549 0.9373450986
## 3 neg 0.9998435 0.0001565213
## 4 pos 0.9659392 0.0340607757
## 5 pos 0.6810771 0.3189228946
## 6 neg 0.9941267 0.0058733392