library(rpart)
library(rpart.plot)
data <- read.csv("D:/Rencana Tugas Akhir/Dataset3.csv", sep = ";")
dataset <- na.omit(data)  # remove rows with NA values
str(dataset)
## 'data.frame': 100 obs. of 5 variables:
## $ Y : int 1 0 0 0 1 0 0 1 0 0 ...
## $ D1: int 0 0 0 0 0 1 0 1 0 0 ...
## $ D2: int 1 1 1 1 1 0 0 0 1 1 ...
## $ D3: int 0 0 0 0 0 0 0 0 0 0 ...
## $ D4: int 0 0 0 0 0 1 0 0 0 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:2] 101 102
## ..- attr(*, "names")= chr [1:2] "101" "102"
Model <- rpart(Y ~ ., data = dataset, method = "class", control = rpart.control(minsplit = 10, cp = 0))
prp(Model, type = 5, extra = 6, box.col = 4)
summary(Model)
## Call:
## rpart(formula = Y ~ ., data = dataset, method = "class", control = rpart.control(minsplit = 10,
## cp = 0))
## n= 100
##
## CP nsplit rel error xerror xstd
## 1 0.08333333 0 1.0000000 1.000000 0.2134375
## 2 0.00000000 2 0.8333333 1.055556 0.2179449
##
## Variable importance
## D2 D4 D1
## 45 30 25
##
## Node number 1: 100 observations, complexity param=0.08333333
## predicted class=0 expected loss=0.18 P(node) =1
## class counts: 82 18
## probabilities: 0.820 0.180
## left son=2 (78 obs) right son=3 (22 obs)
## Primary splits:
## D4 < 0.5 to the left, improve=1.9022840, (0 missing)
## D1 < 0.5 to the left, improve=1.8632070, (0 missing)
## D2 < 0.5 to the right, improve=0.6438095, (0 missing)
## D3 < 0.5 to the left, improve=0.0437167, (0 missing)
##
## Node number 2: 78 observations
## predicted class=0 expected loss=0.1282051 P(node) =0.78
## class counts: 68 10
## probabilities: 0.872 0.128
##
## Node number 3: 22 observations, complexity param=0.08333333
## predicted class=0 expected loss=0.3636364 P(node) =0.22
## class counts: 14 8
## probabilities: 0.636 0.364
## left son=6 (13 obs) right son=7 (9 obs)
## Primary splits:
## D2 < 0.5 to the right, improve=2.797203, (0 missing)
## D1 < 0.5 to the left, improve=2.464171, (0 missing)
## Surrogate splits:
## D1 < 0.5 to the left, agree=0.818, adj=0.556, (0 split)
##
## Node number 6: 13 observations
## predicted class=0 expected loss=0.1538462 P(node) =0.13
## class counts: 11 2
## probabilities: 0.846 0.154
##
## Node number 7: 9 observations
## predicted class=1 expected loss=0.3333333 P(node) =0.09
## class counts: 3 6
## probabilities: 0.333 0.667
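In the CP table above, the cross-validated error (xerror) rises from 1.000000 at 0 splits to 1.055556 at 2 splits, so the extra splits do not appear to generalize and the full-data tree is likely overfitting. A minimal pruning sketch, using the cptable that rpart stores on the fitted model (best_cp and pruned_model are illustrative names):
# Sketch: prune back to the cp value with the lowest cross-validated error.
best_cp <- Model$cptable[which.min(Model$cptable[, "xerror"]), "CP"]
pruned_model <- prune(Model, cp = best_cp)
prp(pruned_model, type = 5, extra = 6, box.col = 4)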
set.seed(123)
n <- nrow(dataset)
n_train <- round(0.80 * n)
# Create a vector of indices which is an 80% random sample
train_indices <- sample(1:n, n_train)
# Subset the data frame to the training indices only
traini <- dataset[train_indices, ]
traini
## Y D1 D2 D3 D4
## 31 0 0 1 0 0
## 79 0 0 0 1 0
## 51 0 0 1 1 0
## 14 0 0 1 0 0
## 67 0 0 0 0 0
## 42 0 0 1 1 0
## 50 0 0 1 0 0
## 43 0 0 1 0 0
## 97 0 0 1 0 1
## 25 0 0 1 0 0
## 90 1 0 0 0 1
## 69 0 0 1 0 0
## 57 0 0 0 1 0
## 9 0 0 1 0 0
## 72 0 0 1 0 0
## 26 0 0 1 1 0
## 7 0 0 0 0 0
## 95 0 0 1 0 0
## 87 0 0 0 0 0
## 36 0 0 1 1 0
## 78 0 0 1 1 0
## 93 0 0 1 0 1
## 76 0 0 1 0 0
## 15 0 0 1 0 0
## 32 0 0 1 1 0
## 84 1 1 0 0 1
## 82 0 0 0 0 0
## 41 0 0 1 0 0
## 23 0 0 1 0 0
## 27 1 0 1 1 0
## 60 0 0 0 0 0
## 53 0 0 0 0 0
## 75 0 1 0 0 0
## 89 0 0 1 0 0
## 71 1 1 0 0 1
## 38 1 0 1 1 0
## 91 0 0 1 0 1
## 34 1 0 1 0 1
## 29 0 0 0 0 1
## 5 1 0 1 0 0
## 8 1 1 0 0 0
## 12 1 0 1 1 0
## 13 0 1 0 1 0
## 18 0 0 1 0 0
## 33 1 0 1 0 1
## 66 0 0 1 0 0
## 64 1 0 0 0 1
## 65 0 0 0 1 0
## 21 0 0 0 0 0
## 77 0 0 1 0 0
## 73 0 0 1 0 0
## 47 0 0 1 0 0
## 85 0 0 0 0 0
## 100 0 0 1 0 0
## 16 0 0 0 0 1
## 30 0 0 1 0 0
## 6 0 1 0 0 1
## 99 0 1 0 0 0
## 70 0 1 0 0 0
## 22 0 0 0 1 0
## 94 0 0 1 0 0
## 39 0 0 1 0 0
## 49 0 0 0 0 0
## 17 0 0 1 0 0
## 63 0 0 1 0 1
## 4 0 0 1 0 0
## 58 0 0 1 0 0
## 61 0 0 1 0 1
## 40 0 0 1 1 0
## 96 0 0 1 0 0
## 19 1 0 1 0 0
## 54 0 0 1 0 0
## 20 1 0 0 0 0
## 80 0 0 1 0 1
## 62 0 0 1 0 0
## 92 0 0 1 1 0
## 86 0 0 1 0 1
## 3 0 0 1 0 0
## 83 0 0 1 0 0
## 46 0 0 1 0 0
# Exclude the training indices to create the test set
test <- dataset[-train_indices, ]
test
## Y D1 D2 D3 D4
## 1 1 0 1 0 0
## 2 0 0 1 0 0
## 10 0 0 1 0 1
## 11 1 0 1 1 0
## 24 0 0 1 0 0
## 28 1 1 0 0 1
## 35 0 0 0 0 0
## 37 0 0 1 1 0
## 44 0 0 1 0 0
## 45 1 1 0 0 1
## 48 0 0 1 0 1
## 52 1 0 1 0 0
## 55 0 0 1 0 0
## 56 0 0 1 0 0
## 59 0 0 1 0 0
## 68 0 0 1 0 1
## 74 0 0 1 0 1
## 81 0 0 1 0 0
## 88 0 0 1 1 0
## 98 0 1 0 0 0
paste("train sample size: ", dim(traini)[1])
## [1] "train sample size: 80"
paste("test sample size: ", dim(test)[1])
## [1] "test sample size: 20"
model_tree <- rpart(Y ~ ., data = traini, method = "class", control = rpart.control(minsplit = 10, cp = 0))
prp(model_tree, type = 5, extra = 6, box.col = 4)
summary(model_tree)
## Call:
## rpart(formula = Y ~ ., data = traini, method = "class", control = rpart.control(minsplit = 10,
## cp = 0))
## n= 80
##
## CP nsplit rel error xerror xstd
## 1 0.03846154 0 1.0000000 1.000000 0.2538170
## 2 0.00000000 2 0.9230769 1.153846 0.2685431
##
## Variable importance
## D4 D2 D1
## 57 30 13
##
## Node number 1: 80 observations, complexity param=0.03846154
## predicted class=0 expected loss=0.1625 P(node) =1
## class counts: 67 13
## probabilities: 0.838 0.163
## left son=2 (64 obs) right son=3 (16 obs)
## Primary splits:
## D4 < 0.5 to the left, improve=1.8062500, (0 missing)
## D1 < 0.5 to the left, improve=0.8027778, (0 missing)
## D2 < 0.5 to the right, improve=0.3590456, (0 missing)
## D3 < 0.5 to the left, improve=0.0250000, (0 missing)
##
## Node number 2: 64 observations
## predicted class=0 expected loss=0.109375 P(node) =0.8
## class counts: 57 7
## probabilities: 0.891 0.109
##
## Node number 3: 16 observations, complexity param=0.03846154
## predicted class=0 expected loss=0.375 P(node) =0.2
## class counts: 10 6
## probabilities: 0.625 0.375
## left son=6 (9 obs) right son=7 (7 obs)
## Primary splits:
## D2 < 0.5 to the right, improve=0.9603175, (0 missing)
## D1 < 0.5 to the left, improve=0.6282051, (0 missing)
## Surrogate splits:
## D1 < 0.5 to the left, agree=0.75, adj=0.429, (0 split)
##
## Node number 6: 9 observations
## predicted class=0 expected loss=0.2222222 P(node) =0.1125
## class counts: 7 2
## probabilities: 0.778 0.222
##
## Node number 7: 7 observations
## predicted class=1 expected loss=0.4285714 P(node) =0.0875
## class counts: 3 4
## probabilities: 0.429 0.571
tree.pred <- predict(model_tree)  # no newdata supplied, so these are class probabilities for the training rows
tree.pred
## 0 1
## 31 0.8906250 0.1093750
## 79 0.8906250 0.1093750
## 51 0.8906250 0.1093750
## 14 0.8906250 0.1093750
## 67 0.8906250 0.1093750
## 42 0.8906250 0.1093750
## 50 0.8906250 0.1093750
## 43 0.8906250 0.1093750
## 97 0.7777778 0.2222222
## 25 0.8906250 0.1093750
## 90 0.4285714 0.5714286
## 69 0.8906250 0.1093750
## 57 0.8906250 0.1093750
## 9 0.8906250 0.1093750
## 72 0.8906250 0.1093750
## 26 0.8906250 0.1093750
## 7 0.8906250 0.1093750
## 95 0.8906250 0.1093750
## 87 0.8906250 0.1093750
## 36 0.8906250 0.1093750
## 78 0.8906250 0.1093750
## 93 0.7777778 0.2222222
## 76 0.8906250 0.1093750
## 15 0.8906250 0.1093750
## 32 0.8906250 0.1093750
## 84 0.4285714 0.5714286
## 82 0.8906250 0.1093750
## 41 0.8906250 0.1093750
## 23 0.8906250 0.1093750
## 27 0.8906250 0.1093750
## 60 0.8906250 0.1093750
## 53 0.8906250 0.1093750
## 75 0.8906250 0.1093750
## 89 0.8906250 0.1093750
## 71 0.4285714 0.5714286
## 38 0.8906250 0.1093750
## 91 0.7777778 0.2222222
## 34 0.7777778 0.2222222
## 29 0.4285714 0.5714286
## 5 0.8906250 0.1093750
## 8 0.8906250 0.1093750
## 12 0.8906250 0.1093750
## 13 0.8906250 0.1093750
## 18 0.8906250 0.1093750
## 33 0.7777778 0.2222222
## 66 0.8906250 0.1093750
## 64 0.4285714 0.5714286
## 65 0.8906250 0.1093750
## 21 0.8906250 0.1093750
## 77 0.8906250 0.1093750
## 73 0.8906250 0.1093750
## 47 0.8906250 0.1093750
## 85 0.8906250 0.1093750
## 100 0.8906250 0.1093750
## 16 0.4285714 0.5714286
## 30 0.8906250 0.1093750
## 6 0.4285714 0.5714286
## 99 0.8906250 0.1093750
## 70 0.8906250 0.1093750
## 22 0.8906250 0.1093750
## 94 0.8906250 0.1093750
## 39 0.8906250 0.1093750
## 49 0.8906250 0.1093750
## 17 0.8906250 0.1093750
## 63 0.7777778 0.2222222
## 4 0.8906250 0.1093750
## 58 0.8906250 0.1093750
## 61 0.7777778 0.2222222
## 40 0.8906250 0.1093750
## 96 0.8906250 0.1093750
## 19 0.8906250 0.1093750
## 54 0.8906250 0.1093750
## 20 0.8906250 0.1093750
## 80 0.7777778 0.2222222
## 62 0.8906250 0.1093750
## 92 0.8906250 0.1093750
## 86 0.7777778 0.2222222
## 3 0.8906250 0.1093750
## 83 0.8906250 0.1093750
## 46 0.8906250 0.1093750
nrow(tree.pred)
## [1] 80
summary(tree.pred)
## 0 1
## Min. :0.4286 Min. :0.1094
## 1st Qu.:0.8906 1st Qu.:0.1094
## Median :0.8906 Median :0.1094
## Mean :0.8375 Mean :0.1625
## 3rd Qu.:0.8906 3rd Qu.:0.1094
## Max. :0.8906 Max. :0.5714
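Because no newdata was supplied to predict(), the probabilities above are fitted values for the 80 training rows, not an out-of-sample check. A short sketch of how the held-out test set could be scored, using type = "class" to get predicted labels and a confusion matrix against the observed Y (test_pred, conf_mat, and accuracy are illustrative names):
# Sketch: evaluate the tree on the 20 held-out test rows.
test_pred <- predict(model_tree, newdata = test, type = "class")
conf_mat <- table(Predicted = test_pred, Actual = test$Y)
conf_mat
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy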