#=======================================================================

It is free (as in libre) open source software.

It is licensed under the GNU General Public License,

Version 2. Rattle comes with ABSOLUTELY NO WARRANTY.

Rattle was written by Graham Williams with contributions

from others as acknowledged in ‘library(help=rattle)’.

Visit https://rattle.togaware.com/ for details.

#======================================================================= # Rattle timestamp: 2020-09-05 23:04:09 x86_64-w64-mingw32

Rattle version 5.4.0 user ‘prana’

This log captures interactions with Rattle as an R script.

For repeatability, export this activity log to a

file, like ‘model.R’ using the Export button or

through the Tools menu. The script can then serve as a

starting point for developing your own scripts.

After exporting to a file called ‘model.R’, for example,

you can type into a new R Console the command

“source(‘model.R’)” and so repeat all actions. Generally,

you will want to edit the file to suit your own needs.

You can also edit this log in place to record additional

information before exporting the script.

Note that saving/loading projects retains this log.

We begin most scripts by loading the required packages.

Here are some initial packages to load and others will be

identified as we proceed through the script. When writing

our own scripts we often collect together the library

commands at the beginning of the script here.

# Attach the packages used throughout the script. Each call sits on its own
# line so that a trailing comment cannot swallow the next statement.
library(rattle)   # Access the weather dataset and utilities.
library(magrittr) # Utilise %>% and %<>% pipeline operators.

This log generally records the process of building a model.

However, with very little effort the log can also be used

to score a new dataset. The logical variable ‘building’

is used to toggle between generating transformations,

when building a model and using the transformations,

when scoring a dataset.

# Mode switches: `building` is TRUE while a model is being built, and
# `scoring` is always its logical complement.
building <- TRUE
scoring <- !building

A pre-defined value is used to reset the random seed

so that results are repeatable.

# Store the seed on `crv`; later steps call set.seed(crv$seed) before
# sampling rows and before fitting the model.
# NOTE(review): `crv` is not created in this script -- presumably it is
# supplied by the rattle package; confirm.
crv$seed <- 42

#======================================================================= # Rattle timestamp: 2020-09-05 23:04:39 x86_64-w64-mingw32

Load a dataset from file.

# Read the CSV file into crs$dataset. The strings ".", "NA", "" and "?" are
# all read as missing values, and leading/trailing white space is stripped
# from unquoted fields.
# NOTE(review): the path is an absolute file:// URL on one user's desktop;
# adjust it before running the script elsewhere.
fname <- "file:///C:/Users/prana/Desktop/L6-diabetes-data-for-R-_csv_-_2__sample.csv"
crs$dataset <- read.csv(
  fname,
  na.strings = c(".", "NA", "", "?"),
  strip.white = TRUE,
  encoding = "UTF-8"
)

#======================================================================= # Rattle timestamp: 2020-09-05 23:04:40 x86_64-w64-mingw32

Action the user selections from the Data tab.

Build the train/validate/test datasets.

nobs=282 train=197 validate=42 test=43

# Partition the row indices of crs$dataset into training (70%), validation
# (15%) and testing (the remainder). Seeding first makes the split repeatable.
set.seed(crv$seed)

crs$nobs <- nrow(crs$dataset)

# Training rows, drawn from 1..nobs.
crs$train <- sample(crs$nobs, 0.7 * crs$nobs)

# Validation rows are drawn only from the rows not used for training.
crs$validate <- crs$nobs %>%
  seq_len() %>%
  setdiff(crs$train) %>%
  sample(0.15 * crs$nobs)

# Test rows: every row in neither the training nor the validation sample.
crs$test <- crs$nobs %>%
  seq_len() %>%
  setdiff(crs$train) %>%
  setdiff(crs$validate)

The following variable selections have been noted.

# Record the variable roles used by the modelling steps.
# NOTE(review): "X" and "id" look like row identifiers, and "dm", "TNM_dm"
# and "IMO_dm" look like they derive from the same variable as the target
# "TNM_IMO_dm" -- confirm they are really meant to be model inputs.

# Columns offered to the model as predictors.
crs$input <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "location", "age",
  "gender", "height", "weight", "frame", "bp.1s", "bp.1d", "bp.2s", "bp.2d",
  "waist", "hip", "time.ppn", "insurance", "fh", "smoking", "dm", "TNM_dm",
  "IMN_hdl", "IMN_ratio", "IMN_glyhb", "IMN_height", "IMN_weight",
  "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s", "IMN_bp.2d", "IMN_waist", "IMN_hip",
  "IMN_time.ppn", "IMO_dm"
)

# Inputs listed as numeric.
crs$numeric <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "age", "height",
  "weight", "bp.1s", "bp.1d", "bp.2s", "bp.2d", "waist", "hip", "time.ppn",
  "insurance", "fh", "smoking", "TNM_dm", "IMN_hdl", "IMN_ratio", "IMN_glyhb",
  "IMN_height", "IMN_weight", "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s",
  "IMN_bp.2d", "IMN_waist", "IMN_hip", "IMN_time.ppn"
)

# Inputs listed as categoric.
crs$categoric <- c("location", "gender", "frame", "dm", "IMO_dm")

# Target column; the remaining roles are left unset.
crs$target  <- "TNM_IMO_dm"
crs$risk    <- NULL
crs$ident   <- NULL
crs$ignore  <- NULL
crs$weights <- NULL

#======================================================================= # Rattle timestamp: 2020-09-05 23:05:43 x86_64-w64-mingw32

Action the user selections from the Data tab.

Build the train/validate/test datasets.

nobs=282 train=197 validate=0 test=85

# Re-partition the rows into training (70%) and testing (the remainder),
# with no validation set. This split is seeded with the literal 1 rather
# than crv$seed.
set.seed(1)

crs$nobs <- nrow(crs$dataset)

crs$train <- sample(crs$nobs, 0.7 * crs$nobs)
crs$validate <- NULL

# With crs$validate set to NULL, the second setdiff() removes nothing, so
# the test set is every row outside the training sample.
crs$test <- crs$nobs %>%
  seq_len() %>%
  setdiff(crs$train) %>%
  setdiff(crs$validate)

The following variable selections have been noted.

# Record the variable roles used by the modelling steps.
# NOTE(review): "X" and "id" look like row identifiers, and "dm", "TNM_dm"
# and "IMO_dm" look like they derive from the same variable as the target
# "TNM_IMO_dm" -- confirm they are really meant to be model inputs.

# Columns offered to the model as predictors.
crs$input <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "location", "age",
  "gender", "height", "weight", "frame", "bp.1s", "bp.1d", "bp.2s", "bp.2d",
  "waist", "hip", "time.ppn", "insurance", "fh", "smoking", "dm", "TNM_dm",
  "IMN_hdl", "IMN_ratio", "IMN_glyhb", "IMN_height", "IMN_weight",
  "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s", "IMN_bp.2d", "IMN_waist", "IMN_hip",
  "IMN_time.ppn", "IMO_dm"
)

# Inputs listed as numeric.
crs$numeric <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "age", "height",
  "weight", "bp.1s", "bp.1d", "bp.2s", "bp.2d", "waist", "hip", "time.ppn",
  "insurance", "fh", "smoking", "TNM_dm", "IMN_hdl", "IMN_ratio", "IMN_glyhb",
  "IMN_height", "IMN_weight", "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s",
  "IMN_bp.2d", "IMN_waist", "IMN_hip", "IMN_time.ppn"
)

# Inputs listed as categoric.
crs$categoric <- c("location", "gender", "frame", "dm", "IMO_dm")

# Target column; the remaining roles are left unset.
crs$target  <- "TNM_IMO_dm"
crs$risk    <- NULL
crs$ident   <- NULL
crs$ignore  <- NULL
crs$weights <- NULL

#======================================================================= # Rattle timestamp: 2020-09-05 23:05:50 x86_64-w64-mingw32

Action the user selections from the Data tab.

Build the train/validate/test datasets.

nobs=282 train=197 validate=0 test=85

# Re-partition the rows into training (70%) and testing (the remainder),
# with no validation set. This split is seeded with the literal 1 rather
# than crv$seed.
set.seed(1)

crs$nobs <- nrow(crs$dataset)

crs$train <- sample(crs$nobs, 0.7 * crs$nobs)
crs$validate <- NULL

# With crs$validate set to NULL, the second setdiff() removes nothing, so
# the test set is every row outside the training sample.
crs$test <- crs$nobs %>%
  seq_len() %>%
  setdiff(crs$train) %>%
  setdiff(crs$validate)

The following variable selections have been noted.

# Record the variable roles used by the modelling steps.
# NOTE(review): "X" and "id" look like row identifiers, and "dm", "TNM_dm"
# and "IMO_dm" look like they derive from the same variable as the target
# "TNM_IMO_dm" -- confirm they are really meant to be model inputs.

# Columns offered to the model as predictors.
crs$input <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "location", "age",
  "gender", "height", "weight", "frame", "bp.1s", "bp.1d", "bp.2s", "bp.2d",
  "waist", "hip", "time.ppn", "insurance", "fh", "smoking", "dm", "TNM_dm",
  "IMN_hdl", "IMN_ratio", "IMN_glyhb", "IMN_height", "IMN_weight",
  "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s", "IMN_bp.2d", "IMN_waist", "IMN_hip",
  "IMN_time.ppn", "IMO_dm"
)

# Inputs listed as numeric.
crs$numeric <- c(
  "X", "id", "chol", "stab.glu", "hdl", "ratio", "glyhb", "age", "height",
  "weight", "bp.1s", "bp.1d", "bp.2s", "bp.2d", "waist", "hip", "time.ppn",
  "insurance", "fh", "smoking", "TNM_dm", "IMN_hdl", "IMN_ratio", "IMN_glyhb",
  "IMN_height", "IMN_weight", "IMN_bp.1s", "IMN_bp.1d", "IMN_bp.2s",
  "IMN_bp.2d", "IMN_waist", "IMN_hip", "IMN_time.ppn"
)

# Inputs listed as categoric.
crs$categoric <- c("location", "gender", "frame", "dm", "IMO_dm")

# Target column; the remaining roles are left unset.
crs$target  <- "TNM_IMO_dm"
crs$risk    <- NULL
crs$ident   <- NULL
crs$ignore  <- NULL
crs$weights <- NULL

#======================================================================= # Rattle timestamp: 2020-09-05 23:06:34 x86_64-w64-mingw32

Decision Tree

The ‘rpart’ package provides the ‘rpart’ function.

# The 'rpart' package provides the 'rpart' function.
library(rpart, quietly = TRUE)

# Reset the random number seed to obtain the same results each time.
set.seed(crv$seed)

# Build the Decision Tree model: a classification tree for TNM_IMO_dm on
# all selected inputs, fitted on the training rows only, splitting on
# information and with surrogate splits disabled.
crs$rpart <- rpart(
  TNM_IMO_dm ~ .,
  data = crs$dataset[crs$train, c(crs$input, crs$target)],
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(usesurrogate = 0, maxsurrogate = 0),
  model = TRUE
)

# Generate a textual view of the Decision Tree model.
print(crs$rpart)
printcp(crs$rpart)
# NOTE(review): this prints an empty string, i.e. nothing; it may originally
# have emitted a newline separator -- confirm against the exported log.
cat("")

Time taken: 0.04 secs

#======================================================================= # Rattle timestamp: 2020-09-05 23:06:51 x86_64-w64-mingw32

Evaluate model performance on the testing dataset.

Generate an Error Matrix for the Decision Tree model.

Obtain the response from the Decision Tree model.

# Predict a class label for every test row with the fitted decision tree
# and keep the predictions in crs$pr.
crs$pr <- predict(
  crs$rpart,
  newdata = crs$dataset[crs$test, c(crs$input, crs$target)],
  type = "class"
)

Generate the confusion matrix showing counts.

# Generate the confusion matrix showing counts: actual TNM_IMO_dm values of
# the test rows against the predictions in crs$pr.
rattle::errorMatrix(
  crs$dataset[crs$test, c(crs$input, crs$target)]$TNM_IMO_dm,
  crs$pr,
  count = TRUE
)

# Generate the confusion matrix showing proportions. The outer parentheses
# print the matrix as well as storing it in `per`.
(per <- rattle::errorMatrix(
  crs$dataset[crs$test, c(crs$input, crs$target)]$TNM_IMO_dm,
  crs$pr
))

# Calculate the overall error percentage: 100 minus the sum of the diagonal
# of `per`.
cat(100 - sum(diag(per), na.rm = TRUE))

# Calculate the averaged class error percentage: the mean of the "Error"
# column of `per`.
cat(mean(per[, "Error"], na.rm = TRUE))