About Dataset
This dataset consists of 101 animals from a zoo.
There are 16 variables with various traits to describe the animals.
The 7 Class Types are: Mammal, Bird, Reptile, Fish, Amphibian, Bug and Invertebrate
The purpose of this dataset is to predict the classification of the animals based on these variables. It is a good dataset for those who are new to machine learning.
animal_name: Unique for each instance
hair Boolean
feathers Boolean
eggs Boolean
milk Boolean
airborne Boolean
aquatic Boolean
predator Boolean
toothed Boolean
backbone Boolean
breathes Boolean
venomous Boolean
fins Boolean
legs Numeric (set of values: {0,2,4,5,6,8})
tail Boolean
domestic Boolean
catsize Boolean
class_type Numeric (integer values in range [1,7])
| class_type | Class name |
|---|---|
| 1 | mammal |
| 2 | bird |
| 3 | reptile |
| 4 | fish |
| 5 | amphibian |
| 6 | bug |
| 7 | invertebrate |
# Build the modelling table from the raw `zoo` data: every integer column
# (the 0/1 trait flags and class_type) becomes a factor, while `legs` is kept
# as a genuine numeric leg count.
zoo_clean <- zoo %>%
  mutate_if(is.integer, as.factor) %>%
  # `legs` is a factor at this point. as.numeric() on a factor returns the
  # internal level codes (1..6), not the leg counts {0, 2, 4, 5, 6, 8}, so go
  # through character first to recover the real values.
  mutate(legs = as.numeric(as.character(legs)))
tibble(zoo_clean)
## # A tibble: 101 × 18
## animal_name hair feath…¹ eggs milk airbo…² aquatic preda…³ toothed backb…⁴
## <chr> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 aardvark 1 0 0 1 0 0 1 1 1
## 2 antelope 1 0 0 1 0 0 0 1 1
## 3 bass 0 0 1 0 0 1 1 1 1
## 4 bear 1 0 0 1 0 0 1 1 1
## 5 boar 1 0 0 1 0 0 1 1 1
## 6 buffalo 1 0 0 1 0 0 0 1 1
## 7 calf 1 0 0 1 0 0 0 1 1
## 8 carp 0 0 1 0 0 1 0 1 1
## 9 catfish 0 0 1 0 0 1 1 1 1
## 10 cavy 1 0 0 1 0 0 0 1 1
## # … with 91 more rows, 8 more variables: breathes <fct>, venomous <fct>,
## # fins <fct>, legs <dbl>, tail <fct>, domestic <fct>, catsize <fct>,
## # class_type <fct>, and abbreviated variable names ¹feathers, ²airborne,
## # ³predator, ⁴backbone
# Long-format copy of the factor-coded trait columns (animal_name and legs are
# dropped): one row per animal/trait pair, with the trait name in `Variable`
# and its level in `value`. class_type is kept as an id column for plotting.
zoo_untidy1 <- zoo_clean %>%
  select(-animal_name, -legs) %>%
  gather(key = "Variable", value = "value", -class_type) %>%
  mutate(value = as.factor(value))

# Same reshaping for the two columns left out above. gather() is kept here
# because it stacks the character animal_name and numeric legs columns into a
# single `value` column without extra arguments.
zoo_untidy2 <- zoo_clean %>%
  select(animal_name, legs, class_type) %>%
  gather(key = "Variable", value = "value", -class_type)

# library() has to be its own statement; appended directly after the closing
# parenthesis of the previous call it does not parse.
library(ggplot2)
# One bar-chart panel per trait: bars are class_type counts, stacked and
# coloured by the trait's level.
ggplot(zoo_untidy1) +
  aes(x = class_type, fill = value) +
  geom_bar() +
  scale_fill_viridis_d(option = "cividis", direction = 1) +
  theme_bw() +
  facet_wrap(vars(Variable))

# Second figure, built from the animal_name rows only: class_type counts,
# filled and faceted by animal name, then rendered interactively via plotly.
# The assignment must start on its own line; glued onto the end of the
# previous facet_wrap() call the script does not parse.
count <- zoo_untidy2 %>%
  filter(Variable %in% "animal_name") %>%
  ggplot() +
  aes(x = class_type, fill = value) +
  geom_bar() +
  scale_fill_hue(direction = 1) +
  labs(title = "animal count") +
  theme_minimal() +
  theme(plot.title = element_text(size = 15L, face = "bold", hjust = 0.5)) +
  facet_wrap(vars(value))
plotly::ggplotly(count)
## # A tibble: 101 × 17
## hair feathers eggs milk airborne aquatic predator toothed backbone breat…¹
## <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1 0 0 1 0 0 1 1 1 1
## 2 1 0 0 1 0 0 0 1 1 1
## 3 0 0 1 0 0 1 1 1 1 0
## 4 1 0 0 1 0 0 1 1 1 1
## 5 1 0 0 1 0 0 1 1 1 1
## 6 1 0 0 1 0 0 0 1 1 1
## 7 1 0 0 1 0 0 0 1 1 1
## 8 0 0 1 0 0 1 0 1 1 0
## 9 0 0 1 0 0 1 1 1 1 0
## 10 1 0 0 1 0 0 0 1 1 1
## # … with 91 more rows, 7 more variables: venomous <fct>, fins <fct>,
## # legs <dbl>, tail <fct>, domestic <fct>, catsize <fct>, class_type <fct>,
## # and abbreviated variable name ¹breathes
# Naive Bayes baseline: accuracy under 10-fold cross-validation repeated
# 10 times.
# NOTE(review): `zoo_clean2` is created outside the code shown here; the
# printed tibble above suggests it is zoo_clean without animal_name - confirm.
kfold <- makeResampleDesc("RepCV", folds = 10, reps = 10)
lrn.naivebayes <- makeLearner("classif.naiveBayes")
zooTask <- makeClassifTask(data = zoo_clean2, target = "class_type")
naivecv <- resample(
  learner = lrn.naivebayes,
  task = zooTask,
  resampling = kfold,
  measures = acc
)
naivecv
## Resample Result
## Task: zoo_clean2
## Learner: classif.naiveBayes
## Aggr perf: acc.test.mean=0.9608182
## Runtime: 1.04003
## predicted
## true 1 2 3 4 5 6 7 -err.-
## 1 410 0 0 0 0 0 0 0
## 2 0 200 0 0 0 0 0 0
## 3 0 6 22 10 8 0 4 28
## 4 0 0 0 130 0 0 0 0
## 5 0 0 0 0 40 0 0 0
## 6 0 0 0 0 0 80 0 0
## 7 0 0 10 0 0 2 88 12
## -err.- 0 6 10 10 8 2 4 40
# kNN experiment: the task uses `legs` as the only predictor of class_type.
zoo_clean3 <- zoo_clean2 %>%
  select(legs, class_type)
zooTaskknn <- makeClassifTask(data = zoo_clean3, target = "class_type")
lrn.knn <- makeLearner("classif.knn")

# Search space for the number of neighbours. The lower bound is 1 because a
# nearest-neighbour vote needs at least one neighbour; starting the grid at 0
# only adds an unusable candidate to every tuning run.
ps2 <- makeParamSet(
  makeIntegerParam("k", lower = 1, upper = 10)
)
kfold
## Resample description: repeated cross-validation with 100 iterations: 10 folds and 10 reps.
## Predict: test
## Stratification: FALSE
# Nested resampling for kNN: grid search over `ps2` inside the repeated-CV
# `kfold` scheme, wrapped in an outer 5-fold CV that reports accuracy.
kfold_outer <- makeResampleDesc("CV", iters = 5)
sc <- makeTuneControlGrid()
lrn.knn.tune <- makeTuneWrapper(
  learner = lrn.knn,
  resampling = kfold,
  par.set = ps2,
  control = sc
)
knncv <- resample(
  learner = lrn.knn.tune,
  task = zooTaskknn,
  resampling = kfold_outer,
  measures = acc
)
knncv
## Resample Result
## Task: zoo_clean3
## Learner: classif.knn.tuned
## Aggr perf: acc.test.mean=0.7219048
## Runtime: 13.5264
## Supervised task: zoo_clean2
## Type: classif
## Target: class_type
## Observations: 101
## Features:
## numerics factors ordered functionals
## 1 15 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 7
## 1 2 3 4 5 6 7
## 41 20 5 13 4 8 10
## Positive class: NA
## Type len Def
## type discrete - C-classifica...
## cost numeric - 1
## nu numeric - 0.5
## class.weights numericvector <NA> -
## kernel discrete - radial
## degree integer - 3
## coef0 numeric - 0
## gamma numeric - -
## cachesize numeric - 40
## tolerance numeric - 0.001
## shrinking logical - TRUE
## cross integer - 0
## fitted logical - TRUE
## scale logicalvector <NA> TRUE
## Constr Req Tunable Trafo
## type C-classification,nu-classification - TRUE -
## cost 0 to Inf Y TRUE -
## nu -Inf to Inf Y TRUE -
## class.weights 0 to Inf - TRUE -
## kernel linear,polynomial,radial,sigmoid - TRUE -
## degree 1 to Inf Y TRUE -
## coef0 -Inf to Inf Y TRUE -
## gamma 0 to Inf Y TRUE -
## cachesize -Inf to Inf - TRUE -
## tolerance 0 to Inf - TRUE -
## shrinking - - TRUE -
## cross 0 to Inf - FALSE -
## fitted - - FALSE -
## scale - - TRUE -
# Search space for the SVM tuning below: cost, kernel family, polynomial
# degree and gamma.
svm_kernels <- c("linear", "polynomial", "radial", "sigmoid")
ps3 <- makeParamSet(
  makeNumericParam("cost", lower = 0, upper = 20),
  makeDiscreteParam("kernel", values = svm_kernels),
  makeIntegerParam("degree", lower = 1, upper = 10),
  makeNumericParam("gamma", lower = 1, upper = 20)
)
kfold
## Resample description: repeated cross-validation with 100 iterations: 10 folds and 10 reps.
## Predict: test
## Stratification: FALSE
# Nested resampling for the SVM: 50 random draws from `ps3` are scored with
# the inner `kfold` scheme; the outer `kfold_outer` folds report accuracy.
# NOTE(review): `lrn.svm` is created outside the code shown here.
sc <- makeTuneControlRandom(maxit = 50)
lrn.svm.tune <- makeTuneWrapper(
  learner = lrn.svm,
  resampling = kfold,
  par.set = ps3,
  control = sc
)
svmcv <- resample(
  learner = lrn.svm.tune,
  task = zooTask,
  resampling = kfold_outer,
  measures = acc
)
svmcv
## Resample Result
## Task: zoo_clean2
## Learner: classif.svm.tuned
## Aggr perf: acc.test.mean=0.9514286
## Runtime: 256.168
## predicted
## true 1 2 3 4 5 6 7 -err.-
## 1 41 0 0 0 0 0 0 0
## 2 0 20 0 0 0 0 0 0
## 3 0 1 2 0 1 0 1 3
## 4 0 0 0 13 0 0 0 0
## 5 0 0 1 0 3 0 0 1
## 6 0 0 0 0 0 8 0 0
## 7 0 0 0 0 0 1 9 1
## -err.- 0 1 1 0 1 1 1 5
## Supervised task: zoo_clean3
## Type: classif
## Target: class_type
## Observations: 101
## Features:
## numerics factors ordered functionals
## 1 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 7
## 1 2 3 4 5 6 7
## 41 20 5 13 4 8 10
## Positive class: NA
## Resample description: repeated cross-validation with 100 iterations: 10 folds and 10 reps.
## Predict: test
## Stratification: FALSE
## Resample Result
## Task: zoo_clean3
## Learner: classif.lda
## Aggr perf: acc.test.mean=0.5847619
## Runtime: 0.024184
# Fit the LDA learner on the legs-only task, pull out the underlying model
# object and keep the `x` component of its predictions.
# NOTE(review): `lrn.lda` is created outside the code shown here; `x` is
# presumably the discriminant scores returned by the model's predict method -
# confirm.
train.lda <- train(learner = lrn.lda, task = zooTaskknn)
modeldata <- getLearnerModel(train.lda)
lda_prediction <- predict(modeldata)
ldapreds <- lda_prediction$x
## Supervised task: zoo_clean2
## Type: classif
## Target: class_type
## Observations: 101
## Features:
## numerics factors ordered functionals
## 1 15 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 7
## 1 2 3 4 5 6 7
## 41 20 5 13 4 8 10
## Positive class: NA
## Type len Def Constr Req Tunable Trafo
## minsplit integer - 20 1 to Inf - TRUE -
## minbucket integer - - 1 to Inf - TRUE -
## cp numeric - 0.01 0 to 1 - TRUE -
## maxcompete integer - 4 0 to Inf - TRUE -
## maxsurrogate integer - 5 0 to Inf - TRUE -
## usesurrogate discrete - 2 0,1,2 - TRUE -
## surrogatestyle discrete - 0 0,1 - TRUE -
## maxdepth integer - 30 1 to 30 - TRUE -
## xval integer - 10 0 to Inf - FALSE -
## parms untyped - - - - TRUE -
# Search space for the decision tree. The learner's parameter table printed
# above lists minsplit as "1 to Inf", so the range starts at 1; a lower bound
# of 0 lets the random search draw a value the learner does not accept.
ps.rpart <- makeParamSet(
  makeIntegerParam("minsplit", lower = 1, upper = 30),
  makeIntegerParam("minbucket", lower = 1, upper = 10),
  makeNumericParam("cp", lower = 0, upper = 1),
  makeIntegerParam("maxdepth", lower = 1, upper = 30)
)

# Inner resampling (5-fold CV) used to score each of the 50 random candidates.
kfold <- makeResampleDesc("CV", iters = 5)
sc.rpart <- makeTuneControlRandom(maxit = 50)

# Self-tuning learner for the nested-CV accuracy estimate.
lrn.rpart.tune <- makeTuneWrapper(
  learner = lrn.rpart,
  par.set = ps.rpart,
  control = sc.rpart,
  resampling = kfold
)

# Separate tuning run on the full task; its result (`tp.rpart$x`) supplies the
# hyperparameters for the final model.
tp.rpart <- tuneParams(
  learner = lrn.rpart,
  task = zooTask,
  par.set = ps.rpart,
  control = sc.rpart,
  resampling = kfold
)

# Outer loop: 3-fold CV around the tuning wrapper, reporting accuracy.
kfold_outer <- makeResampleDesc("CV", iters = 3)
rpartcv <- resample(
  task = zooTask,
  learner = lrn.rpart.tune,
  resampling = kfold_outer,
  measures = acc
)
rpartcv
## Resample Result
## Task: zoo_clean2
## Learner: classif.rpart.tuned
## Aggr perf: acc.test.mean=0.8514557
## Runtime: 8.63858
## Tune result:
## Op. pars: minsplit=7; minbucket=4; cp=0.0579; maxdepth=7
## mmce.test.mean=0.0995238
# Refit the tree on the full task with the tuned hyperparameters and draw it.
library(rpart.plot)
rpart.tuned <- setHyperPars(learner = lrn.rpart, par.vals = tp.rpart$x)
model.rpart <- train(learner = rpart.tuned, task = zooTask)
rpartdata <- getLearnerModel(model.rpart)
rpart.plot(
  rpartdata,
  type = 5,
  box.palette = "BuBn",
  roundint = FALSE
)
## predicted
## true 1 2 3 4 5 6 7 -err.-
## 1 41 0 0 0 0 0 0 0
## 2 0 20 0 0 0 0 0 0
## 3 0 0 3 0 0 0 2 2
## 4 0 0 0 13 0 0 0 0
## 5 0 0 3 0 0 0 1 4
## 6 0 0 0 0 0 6 2 2
## 7 0 0 0 0 0 7 3 7
## -err.- 0 0 3 0 0 7 5 15
## Supervised task: zoo_clean2
## Type: classif
## Target: class_type
## Observations: 101
## Features:
## numerics factors ordered functionals
## 1 15 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 7
## 1 2 3 4 5 6 7
## 41 20 5 13 4 8 10
## Positive class: NA
## Type len Def Constr Req Tunable Trafo
## ntree integer - 500 1 to Inf - TRUE -
## mtry integer - - 1 to Inf - TRUE -
## replace logical - TRUE - - TRUE -
## classwt numericvector <NA> - 0 to Inf - TRUE -
## cutoff numericvector <NA> - 0 to 1 - TRUE -
## strata untyped - - - - FALSE -
## sampsize integervector <NA> - 1 to Inf - TRUE -
## nodesize integer - 1 1 to Inf - TRUE -
## maxnodes integer - - 1 to Inf - TRUE -
## importance logical - FALSE - - TRUE -
## localImp logical - FALSE - - TRUE -
## proximity logical - FALSE - - FALSE -
## oob.prox logical - - - Y FALSE -
## norm.votes logical - TRUE - - FALSE -
## do.trace logical - FALSE - - FALSE -
## keep.forest logical - TRUE - - FALSE -
## keep.inbag logical - FALSE - - FALSE -
# Search space for the random forest. ntree is pinned at 200. mtry is capped at
# the number of features in the task (16 according to the task summary printed
# above): a forest cannot try more variables per split than exist, so a larger
# upper bound only fills the random search with redundant candidates.
ps.rand <- makeParamSet(
  makeIntegerParam("ntree", lower = 200, upper = 200),
  makeIntegerParam("mtry", lower = 1, upper = getTaskNFeats(zooTask)),
  makeIntegerParam("nodesize", lower = 1, upper = 5),
  makeIntegerParam("maxnodes", lower = 5, upper = 150)
)

# Inner resampling: 5-fold CV repeated 5 times, scoring 50 random candidates.
kfold <- makeResampleDesc("RepCV", folds = 5, reps = 5)
sc <- makeTuneControlRandom(maxit = 50)
lrn.random.tune <- makeTuneWrapper(
  learner = lrn.random,
  par.set = ps.rand,
  resampling = kfold,
  control = sc
)

# Outer loop reuses the `kfold_outer` description defined earlier.
randcv <- resample(
  learner = lrn.random.tune,
  task = zooTask,
  resampling = kfold_outer,
  measures = acc
)
randcv
## Resample Result
## Task: zoo_clean2
## Learner: classif.randomForest.tuned
## Aggr perf: acc.test.mean=0.9212715
## Runtime: 60.809
# Final random forest: tune once on the full task (accuracy as the measure),
# apply the selected hyperparameters and refit on all observations.
tp <- tuneParams(
  learner = lrn.random,
  task = zooTask,
  resampling = kfold,
  measures = acc,
  par.set = ps.rand,
  control = sc
)
lrn.random.tune.set <- setHyperPars(lrn.random, par.vals = tp$x)
model.random <- train(learner = lrn.random.tune.set, task = zooTask)
tp
## Tune result:
## Op. pars: ntree=200; mtry=12; nodesize=2; maxnodes=109
## acc.test.mean=0.9544762
## predicted
## true 1 2 3 4 5 6 7 -err.-
## 1 41 0 0 0 0 0 0 0
## 2 0 20 0 0 0 0 0 0
## 3 0 1 3 1 0 0 0 2
## 4 0 0 0 13 0 0 0 0
## 5 0 0 0 0 1 0 3 3
## 6 0 0 0 0 0 8 0 0
## 7 0 0 0 0 0 3 7 3
## -err.- 0 1 0 1 0 3 3 8
# Dataset for xgboost: the trait columns stay as integers, only the target
# becomes a factor, and the animal_name identifier is dropped.
zoo_clean4 <- zoo %>%
  mutate(class_type = as.factor(class_type)) %>%
  select(-animal_name)
tibble(zoo_clean4)
## # A tibble: 101 × 17
## hair feathers eggs milk airborne aquatic predator toothed backbone breat…¹
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 0 0 1 0 0 1 1 1 1
## 2 1 0 0 1 0 0 0 1 1 1
## 3 0 0 1 0 0 1 1 1 1 0
## 4 1 0 0 1 0 0 1 1 1 1
## 5 1 0 0 1 0 0 1 1 1 1
## 6 1 0 0 1 0 0 0 1 1 1
## 7 1 0 0 1 0 0 0 1 1 1
## 8 0 0 1 0 0 1 0 1 1 0
## 9 0 0 1 0 0 1 1 1 1 0
## 10 1 0 0 1 0 0 0 1 1 1
## # … with 91 more rows, 7 more variables: venomous <int>, fins <int>,
## # legs <int>, tail <int>, domestic <int>, catsize <int>, class_type <fct>,
## # and abbreviated variable name ¹breathes
# xgboost learner and its classification task on the integer-coded data;
# the learner's tunable parameters are listed below.
lrn.xg <- makeLearner("classif.xgboost")
zooTask2 <- makeClassifTask(data = zoo_clean4, target = "class_type")
getParamSet(lrn.xg)
## Type len Def
## booster discrete - gbtree
## watchlist untyped - <NULL>
## eta numeric - 0.3
## gamma numeric - 0
## max_depth integer - 6
## min_child_weight numeric - 1
## subsample numeric - 1
## colsample_bytree numeric - 1
## colsample_bylevel numeric - 1
## num_parallel_tree integer - 1
## lambda numeric - 1
## lambda_bias numeric - 0
## alpha numeric - 0
## objective untyped - binary:logistic
## eval_metric untyped - error
## base_score numeric - 0.5
## max_delta_step numeric - 0
## missing numeric -
## monotone_constraints integervector <NA> 0
## tweedie_variance_power numeric - 1.5
## nthread integer - -
## nrounds integer - -
## feval untyped - <NULL>
## verbose integer - 1
## print_every_n integer - 1
## early_stopping_rounds integer - <NULL>
## maximize logical - <NULL>
## sample_type discrete - uniform
## normalize_type discrete - tree
## rate_drop numeric - 0
## skip_drop numeric - 0
## scale_pos_weight numeric - 1
## refresh_leaf logical - TRUE
## feature_selector discrete - cyclic
## top_k integer - 0
## predictor discrete - cpu_predictor
## updater untyped - -
## sketch_eps numeric - 0.03
## one_drop logical - FALSE
## tree_method discrete - auto
## grow_policy discrete - depthwise
## max_leaves integer - 0
## max_bin integer - 256
## callbacks untyped - list()
## Constr Req Tunable Trafo
## booster gbtree,gblinear,dart - TRUE -
## watchlist - - FALSE -
## eta 0 to 1 - TRUE -
## gamma 0 to Inf - TRUE -
## max_depth 0 to Inf - TRUE -
## min_child_weight 0 to Inf - TRUE -
## subsample 0 to 1 - TRUE -
## colsample_bytree 0 to 1 - TRUE -
## colsample_bylevel 0 to 1 - TRUE -
## num_parallel_tree 1 to Inf - TRUE -
## lambda 0 to Inf - TRUE -
## lambda_bias 0 to Inf - TRUE -
## alpha 0 to Inf - TRUE -
## objective - - FALSE -
## eval_metric - - FALSE -
## base_score -Inf to Inf - FALSE -
## max_delta_step 0 to Inf - TRUE -
## missing -Inf to Inf - FALSE -
## monotone_constraints -1 to 1 - TRUE -
## tweedie_variance_power 1 to 2 Y TRUE -
## nthread 1 to Inf - FALSE -
## nrounds 1 to Inf - TRUE -
## feval - - FALSE -
## verbose 0 to 2 - FALSE -
## print_every_n 1 to Inf Y FALSE -
## early_stopping_rounds 1 to Inf - FALSE -
## maximize - - FALSE -
## sample_type uniform,weighted Y TRUE -
## normalize_type tree,forest Y TRUE -
## rate_drop 0 to 1 Y TRUE -
## skip_drop 0 to 1 Y TRUE -
## scale_pos_weight -Inf to Inf - TRUE -
## refresh_leaf - - TRUE -
## feature_selector cyclic,shuffle,random,greedy,thrifty - TRUE -
## top_k 0 to Inf - TRUE -
## predictor cpu_predictor,gpu_predictor - TRUE -
## updater - - TRUE -
## sketch_eps 0 to 1 - TRUE -
## one_drop - Y TRUE -
## tree_method auto,exact,approx,hist,gpu_hist Y TRUE -
## grow_policy depthwise,lossguide Y TRUE -
## max_leaves 0 to Inf Y TRUE -
## max_bin 2 to Inf Y TRUE -
## callbacks - - FALSE -
# Search space for xgboost; nrounds is pinned at 20 boosting rounds.
ps.xg <- makeParamSet(
  makeNumericParam("eta", lower = 0.3, upper = 1),
  makeNumericParam("gamma", lower = 0, upper = 5),
  makeIntegerParam("max_depth", lower = 6, upper = 10),
  makeNumericParam("min_child_weight", lower = 1, upper = 10),
  makeNumericParam("subsample", lower = 0, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0, upper = 1),
  makeIntegerParam("nrounds", lower = 20, upper = 20)
)

# Nested resampling: 100 random candidates scored by 5-fold CV on the inside,
# 3-fold CV on the outside reporting accuracy.
kfold <- makeResampleDesc("CV", iters = 5)
kfold_outer <- makeResampleDesc("CV", iters = 3)
sc <- makeTuneControlRandom(maxit = 100)
lrn.xg.tune <- makeTuneWrapper(
  learner = lrn.xg,
  resampling = kfold,
  par.set = ps.xg,
  control = sc
)
xgcv <- resample(
  learner = lrn.xg.tune,
  task = zooTask2,
  resampling = kfold_outer,
  measures = acc
)
## Resample Result
## Task: zoo_clean4
## Learner: classif.xgboost.tuned
## Aggr perf: acc.test.mean=0.8820559
## Runtime: 62.4176
# Final xgboost model: tune on the full task, then refit with the selected
# hyperparameters. The tuning run uses the inner `kfold` description, the same
# one the tune wrapper uses inside the nested CV and the same choice made for
# the rpart and random-forest final tunes. `kfold_outer` is reserved for the
# outer performance estimate.
tp <- tuneParams(
  learner = lrn.xg,
  task = zooTask2,
  resampling = kfold,
  control = sc,
  par.set = ps.xg
)
lrn.xg.tune.set <- setHyperPars(learner = lrn.xg, par.vals = tp$x)
model.xg <- train(lrn.xg.tune.set, zooTask2)
## predicted
## true 1 2 3 4 5 6 7 -err.-
## 1 41 0 0 0 0 0 0 0
## 2 0 20 0 0 0 0 0 0
## 3 0 0 2 1 2 0 0 3
## 4 0 0 0 13 0 0 0 0
## 5 0 0 1 0 3 0 0 1
## 6 0 0 0 0 0 8 0 0
## 7 0 0 0 3 0 5 2 8
## -err.- 0 0 1 4 2 5 0 12