library(ROCR)
## Warning: package 'ROCR' was built under R version 4.1.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# Load the heart-disease dataset from CSV and preview its first rows.
# NOTE(review): the path is absolute and machine-specific, so this call only
# works where that exact file exists -- consider a project-relative path.
data <- read.csv("C:/Users/User/Documents/Kuliah/sms2/Data Mining/UAS/Soal/heartData.csv")
head(data)
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## 1 40 M ATA 140 289 0 Normal 172
## 2 49 F NAP 160 180 0 Normal 156
## 3 37 M ATA 130 283 0 ST 98
## 4 48 F ASY 138 214 0 Normal 108
## 5 54 M NAP 150 195 0 Normal 122
## 6 39 M NAP 120 339 0 Normal 170
## ExerciseAngina Oldpeak ST_Slope HeartDisease
## 1 N 0.0 Up 0
## 2 N 1.0 Flat 1
## 3 N 0.0 Up 0
## 4 Y 1.5 Flat 1
## 5 N 0.0 Up 0
## 6 N 0.0 Up 0
dim(data)
## [1] 918 12
colnames(data)
## [1] "Age" "Sex" "ChestPainType" "RestingBP"
## [5] "Cholesterol" "FastingBS" "RestingECG" "MaxHR"
## [9] "ExerciseAngina" "Oldpeak" "ST_Slope" "HeartDisease"
BasicSummary <- function(df, dgts = 3){
  # Build a one-row-per-column overview of a data frame.
  #
  # Args:
  #   df:   data frame (or matrix) whose columns are summarised.
  #   dgts: number of digits used when rounding the fraction columns.
  #
  # Returns:
  #   A data frame with one row per column of df and the columns
  #   variable, type, levels, topLevel, topCount, topFrac, missFreq, missFrac.
  #   "Missing" counts NA values as well as the strings "" and " ".
  m <- ncol(df)
  varNames <- colnames(df)
  varType <- vector("character", m)
  topLevel <- vector("character", m)
  topCount <- vector("numeric", m)
  missCount <- vector("numeric", m)
  levels <- vector("numeric", m)
  # seq_len() yields an empty sequence for a zero-column input (1:m would not).
  for (i in seq_len(m)){
    # [[ ]] always extracts the bare column from a data frame (also for
    # tibbles); matrices still need [, i].
    x <- if (is.data.frame(df)) df[[i]] else df[, i]
    # class() can return several entries (e.g. ordered factors); keep the first.
    varType[i] <- class(x)[1]
    xtab <- table(x, useNA = "ifany")
    levels[i] <- length(xtab)
    nums <- as.numeric(xtab)
    # A zero-row column has an empty table; leave the preallocated defaults.
    if (length(nums) > 0){
      topCount[i] <- max(nums)
      topLevel[i] <- names(xtab)[which.max(nums)]
    }
    missCount[i] <- length(which(is.na(x) | (x == "") | (x == " ")))
  }
  n <- nrow(df)
  topFrac <- round(topCount / n, digits = dgts)
  missFrac <- round(missCount / n, digits = dgts)
  data.frame(variable = varNames, type = varType,
             levels = levels, topLevel = topLevel,
             topCount = topCount, topFrac = topFrac,
             missFreq = missCount, missFrac = missFrac)
}
BasicSummary(data)
## variable type levels topLevel topCount topFrac missFreq missFrac
## 1 Age integer 50 54 51 0.056 0 0
## 2 Sex character 2 M 725 0.790 0 0
## 3 ChestPainType character 4 ASY 496 0.540 0 0
## 4 RestingBP integer 67 120 132 0.144 0 0
## 5 Cholesterol integer 222 0 172 0.187 0 0
## 6 FastingBS integer 2 0 704 0.767 0 0
## 7 RestingECG character 3 Normal 552 0.601 0 0
## 8 MaxHR integer 119 150 43 0.047 0 0
## 9 ExerciseAngina character 2 N 547 0.596 0 0
## 10 Oldpeak numeric 53 0 368 0.401 0 0
## 11 ST_Slope character 3 Flat 460 0.501 0 0
## 12 HeartDisease integer 2 1 508 0.553 0 0
Explanation
Dari data yang berasal dari dataset heartData, ditemukan bahwa ada 918 baris dan 12 kolom. Hal ini berarti ada 918 data dengan 12 atribut di dalamnya.
Atribut tersebut terdiri dari “Age”, “Sex”, “ChestPainType”, “RestingBP”, “Cholesterol”, “FastingBS”, “RestingECG”, “MaxHR”, “ExerciseAngina”, “Oldpeak”, “ST_Slope”, dan “HeartDisease”. Nama-nama dari kedua belas atribut tersebut cukup lugas sehingga tidak menimbulkan kebingungan yang berarti bagi yang mengolah data.
Dalam data ini, terdapat 5 atribut yang memiliki tipe data karakter, 6 atribut yang memiliki tipe data integer, dan 1 atribut yang memiliki tipe data numerik.
Setiap atribut masing-masing memiliki unique values. Untuk atribut “Age” terdapat 50 unique values, “Sex” terdapat 2, “ChestPainType” terdapat 4, “RestingBP” terdapat 67, “Cholesterol” terdapat 222, “FastingBS” terdapat 2, “RestingECG” terdapat 3, “MaxHR” terdapat 119, “ExerciseAngina” terdapat 2, “Oldpeak” terdapat 53, “ST_Slope” terdapat 3, dan “HeartDisease” terdapat 2.
str(data)
## 'data.frame': 918 obs. of 12 variables:
## $ Age : int 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : chr "M" "F" "M" "F" ...
## $ ChestPainType : chr "ATA" "NAP" "ATA" "ASY" ...
## $ RestingBP : int 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : int 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RestingECG : chr "Normal" "Normal" "ST" "Normal" ...
## $ MaxHR : int 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: chr "N" "N" "N" "Y" ...
## $ Oldpeak : num 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : chr "Up" "Flat" "Up" "Flat" ...
## $ HeartDisease : int 0 1 0 1 0 0 0 0 1 0 ...
colSums(is.na(data))
## Age Sex ChestPainType RestingBP Cholesterol
## 0 0 0 0 0
## FastingBS RestingECG MaxHR ExerciseAngina Oldpeak
## 0 0 0 0 0
## ST_Slope HeartDisease
## 0 0
sum(duplicated(data))
## [1] 0
describe(data)
## data
##
## 12 Variables 918 Observations
## --------------------------------------------------------------------------------
## Age
## n missing distinct Info Mean Gmd .05 .10
## 918 0 50 0.999 53.51 10.71 37 40
## .25 .50 .75 .90 .95
## 47 54 60 65 68
##
## lowest : 28 29 30 31 32, highest: 73 74 75 76 77
## --------------------------------------------------------------------------------
## Sex
## n missing distinct
## 918 0 2
##
## Value F M
## Frequency 193 725
## Proportion 0.21 0.79
## --------------------------------------------------------------------------------
## ChestPainType
## n missing distinct
## 918 0 4
##
## Value ASY ATA NAP TA
## Frequency 496 173 203 46
## Proportion 0.540 0.188 0.221 0.050
## --------------------------------------------------------------------------------
## RestingBP
## n missing distinct Info Mean Gmd .05 .10
## 918 0 67 0.993 132.4 20.09 106 110
## .25 .50 .75 .90 .95
## 120 130 140 160 160
##
## lowest : 0 80 92 94 95, highest: 180 185 190 192 200
## --------------------------------------------------------------------------------
## Cholesterol
## n missing distinct Info Mean Gmd .05 .10
## 918 0 222 0.993 198.8 116 0.0 0.0
## .25 .50 .75 .90 .95
## 173.2 223.0 267.0 305.0 331.3
##
## lowest : 0 85 100 110 113, highest: 491 518 529 564 603
## --------------------------------------------------------------------------------
## FastingBS
## n missing distinct Info Sum Mean Gmd
## 918 0 2 0.536 214 0.2331 0.3579
##
## --------------------------------------------------------------------------------
## RestingECG
## n missing distinct
## 918 0 3
##
## Value LVH Normal ST
## Frequency 188 552 178
## Proportion 0.205 0.601 0.194
## --------------------------------------------------------------------------------
## MaxHR
## n missing distinct Info Mean Gmd .05 .10
## 918 0 119 1 136.8 29.03 96 103
## .25 .50 .75 .90 .95
## 120 138 156 170 178
##
## lowest : 60 63 67 69 70, highest: 190 192 194 195 202
## --------------------------------------------------------------------------------
## ExerciseAngina
## n missing distinct
## 918 0 2
##
## Value N Y
## Frequency 547 371
## Proportion 0.596 0.404
## --------------------------------------------------------------------------------
## Oldpeak
## n missing distinct Info Mean Gmd .05 .10
## 918 0 53 0.934 0.8874 1.126 0.0 0.0
## .25 .50 .75 .90 .95
## 0.0 0.6 1.5 2.3 3.0
##
## lowest : -2.6 -2.0 -1.5 -1.1 -1.0, highest: 4.2 4.4 5.0 5.6 6.2
## --------------------------------------------------------------------------------
## ST_Slope
## n missing distinct
## 918 0 3
##
## Value Down Flat Up
## Frequency 63 460 395
## Proportion 0.069 0.501 0.430
## --------------------------------------------------------------------------------
## HeartDisease
## n missing distinct Info Sum Mean Gmd
## 918 0 2 0.741 508 0.5534 0.4948
##
## --------------------------------------------------------------------------------
Explanation
Berdasarkan isi dari tiap atribut dan tipe datanya, tidak ada kekeliruan dalam pengkategorian tipe data di tiap atribut. Sehingga kita tidak memerlukan perubahan jenis tipe data.
Dataset ini tidak memiliki missing value yang terdeteksi.
Dataset ini tidak ada data yang duplikat.
Secara keseluruhan, data ini sudah bersih dari awal.
ThreeSigma <- function(x, t = 3){
  # Three-sigma edit rule: limits lie t standard deviations either side of
  # the mean, both computed with missing values removed.
  #
  # Args:
  #   x: numeric vector.
  #   t: multiplier applied to the standard deviation.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit).
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  if (spread == 0){
    # Zero spread collapses both limits onto the mean; tell the caller.
    message("All non-missing x-values are identical")
  }
  halfWidth <- t * spread
  list(up = centre + halfWidth, down = centre - halfWidth)
}
Hampel <- function(x, t = 3){
  # Hampel identifier: limits lie t MAD scale estimates either side of the
  # median, both computed with missing values removed.
  #
  # Args:
  #   x: numeric vector.
  #   t: multiplier applied to the MAD scale estimate.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit).
  centre <- median(x, na.rm = TRUE)
  spread <- mad(x, na.rm = TRUE)
  if (spread == 0){
    # A zero MAD collapses both limits onto the median; tell the caller.
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  halfWidth <- t * spread
  list(up = centre + halfWidth, down = centre - halfWidth)
}
BoxplotRule <- function(x, t = 1.5){
  # Boxplot outlier rule: limits lie t interquartile distances outside the
  # quartiles, i.e. below the lower quartile and above the upper quartile.
  #
  # Args:
  #   x: numeric vector (missing values are ignored).
  #   t: multiplier applied to the interquartile distance.
  #
  # Returns:
  #   list(up = upper limit, down = lower limit).
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q <- xU - xL
  if (Q == 0){
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up <- xU + t * Q
  # The lower limit is measured from the LOWER quartile; measuring it from xU
  # put it too high and flagged ordinary low values as outliers.
  down <- xL - t * Q
  list(up = up, down = down)
}
ExtractDetails <- function(x, down, up){
  # Apply a pair of outlier limits to x and describe what falls outside them.
  #
  # Args:
  #   x:    numeric vector.
  #   down: lower limit; values strictly below it are flagged "L".
  #   up:   upper limit; values strictly above it are flagged "U".
  #
  # Returns:
  #   A list with nOut (number flagged), lowLim/upLim (the limits as given),
  #   minNom (smallest value >= down), maxNom (largest value <= up), and
  #   index/values/outClass for the flagged points, low ones listed first.
  below <- which(x < down)
  above <- which(x > up)
  flagged <- union(below, above)

  labels <- rep("N", length(x))
  labels[below] <- "L"
  labels[above] <- "U"

  largestInside <- max(x[which(x <= up)])
  smallestInside <- min(x[which(x >= down)])

  list(nOut = length(flagged), lowLim = down,
       upLim = up, minNom = smallestInside,
       maxNom = largestInside, index = flagged,
       values = x[flagged],
       outClass = labels[flagged])
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
  # Run the three outlier rules (ThreeSigma, Hampel, BoxplotRule) on x and
  # collect their results.
  #
  # Args:
  #   x:  numeric vector.
  #   t3: threshold passed to ThreeSigma.
  #   tH: threshold passed to Hampel.
  #   tb: threshold passed to BoxplotRule.
  #
  # Returns:
  #   A list with
  #     summary:     one row per method (n, nMiss, nOut, limits, and the
  #                  smallest/largest values inside the limits);
  #     threeSigma, Hampel, boxplotRule: one data frame per method listing
  #                  the index, value and type ("L"/"U") of each flagged point.

  # Limits are computed for all three rules before any of them is applied.
  sigmaLimits <- ThreeSigma(x, t = t3)
  hampelLimits <- Hampel(x, t = tH)
  boxLimits <- BoxplotRule(x, t = tb)

  nObs <- length(x)
  nNA <- length(which(is.na(x)))

  sigmaDetails <- ExtractDetails(x, sigmaLimits$down, sigmaLimits$up)
  hampelDetails <- ExtractDetails(x, hampelLimits$down, hampelLimits$up)
  boxDetails <- ExtractDetails(x, boxLimits$down, boxLimits$up)

  # One summary row for a single method.
  summaryRow <- function(label, details){
    data.frame(method = label, n = nObs,
               nMiss = nNA, nOut = details$nOut,
               lowLim = details$lowLim,
               upLim = details$upLim,
               minNom = details$minNom,
               maxNom = details$maxNom)
  }
  # Per-point listing of the values a single method flagged.
  outlierTable <- function(details){
    data.frame(index = details$index,
               values = details$values,
               type = details$outClass)
  }

  sumFrame <- rbind.data.frame(summaryRow("ThreeSigma", sigmaDetails),
                               summaryRow("Hampel", hampelDetails))
  sumFrame <- rbind.data.frame(sumFrame,
                               summaryRow("BoxplotRule", boxDetails))

  list(summary = sumFrame,
       threeSigma = outlierTable(sigmaDetails),
       Hampel = outlierTable(hampelDetails),
       boxplotRule = outlierTable(boxDetails))
}
fullSummaryBP <- FindOutliers(data$RestingBP)
fullSummaryBP$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 918 0 8 76.85405 187.939 80 185
## 2 Hampel 918 0 25 85.52200 174.478 92 174
## 3 BoxplotRule 918 0 80 110.00000 170.000 110 170
fullSummaryCh <- FindOutliers(data$Cholesterol)
fullSummaryCh$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 918 0 3 -129.3529 526.9520 0 518
## 2 Hampel 918 0 180 18.4012 427.5988 85 417
## 3 BoxplotRule 918 0 192 126.3750 407.6250 129 407
fullSummaryMx <- FindOutliers(data$MaxHR)
fullSummaryMx$summary
## method n nMiss nOut lowLim upLim minNom maxNom
## 1 ThreeSigma 918 0 1 60.42837 213.1904 63 202
## 2 Hampel 918 0 0 57.93960 218.0604 60 202
## 3 BoxplotRule 918 0 85 102.00000 210.0000 102 202
Explanation
Pada atribut “RestingBP” terdapat 8 outliers berdasarkan metode ThreeSigma, 25 outliers berdasarkan metode Hampel, dan 80 outliers berdasarkan metode BoxplotRule.
Pada atribut “Cholesterol” terdapat 3 outliers berdasarkan metode ThreeSigma, 180 outliers berdasarkan metode Hampel, dan 192 outliers berdasarkan metode BoxplotRule.
Pada atribut “MaxHR” terdapat 1 outlier berdasarkan metode ThreeSigma, tidak ada outliers berdasarkan metode Hampel, dan 85 outliers berdasarkan metode BoxplotRule.
# Keep the numeric columns and compute their pairwise Spearman correlations.
# Columns are selected by name rather than by position so the selection stays
# correct if the column order of the input file ever changes.
dataint <- subset(data, select = c(Age, RestingBP, Cholesterol, FastingBS,
                                   MaxHR, Oldpeak, HeartDisease))
rcorr(as.matrix(dataint), type = "spearman")
## Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease
## Age 1.00 0.28 -0.05 0.20 -0.37 0.30 0.29
## RestingBP 0.28 1.00 0.11 0.07 -0.11 0.18 0.11
## Cholesterol -0.05 0.11 1.00 -0.19 0.18 0.05 -0.14
## FastingBS 0.20 0.07 -0.19 1.00 -0.12 0.10 0.27
## MaxHR -0.37 -0.11 0.18 -0.12 1.00 -0.21 -0.40
## Oldpeak 0.30 0.18 0.05 0.10 -0.21 1.00 0.42
## HeartDisease 0.29 0.11 -0.14 0.27 -0.40 0.42 1.00
##
## n= 918
##
##
## P
## Age RestingBP Cholesterol FastingBS MaxHR Oldpeak HeartDisease
## Age 0.0000 0.1534 0.0000 0.0000 0.0000 0.0000
## RestingBP 0.0000 0.0009 0.0455 0.0011 0.0000 0.0005
## Cholesterol 0.1534 0.0009 0.0000 0.0000 0.1173 0.0000
## FastingBS 0.0000 0.0455 0.0000 0.0002 0.0027 0.0000
## MaxHR 0.0000 0.0011 0.0000 0.0002 0.0000 0.0000
## Oldpeak 0.0000 0.0000 0.1173 0.0027 0.0000 0.0000
## HeartDisease 0.0000 0.0005 0.0000 0.0000 0.0000 0.0000
Explanation
Dikarenakan pada data ini kita mempelajari sesuatu yang sulit diukur (heart disease), maka memiliki koefisien korelasi yang rendah bukanlah sesuatu yang mengejutkan lagi. Dengan kata lain koefisien korelasi di atas 0.4 dapat dikatakan korelasi yang relatif kuat.
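Untuk P-value, sebagian besar pasangan atribut menunjukkan nilai < 0.05, yang berarti korelasinya signifikan secara statistik (berbeda nyata dari nol). Kuat atau lemahnya korelasi tetap dilihat dari besar koefisien korelasinya, bukan dari P-value.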
# Fix the RNG seed so the random partition below is reproducible.
set.seed(1)
# Draw the row indices of the training sample (p = 0.8 of the rows);
# list = FALSE so the result can be used directly as a row index below.
# NOTE(review): HeartDisease is an integer 0/1 column here -- confirm whether
# it should be passed as a factor so the split is stratified by class.
dataPartition <- createDataPartition(data$HeartDisease, p=0.8, list=FALSE)
# Rows not drawn form the validation set; rows drawn form the training set.
validation <- data[-dataPartition,]
train <- data[dataPartition,]
Explanation
Dikarenakan kita sudah mengetahui bahwa dataset ini tidak memiliki missing value maupun data yang duplikat, kita akan langsung melakukan partisi data, dengan pembagian 80% untuk training dan 20% untuk validasi.
Menurut saya, beberapa atribut yang penting di dataset ini adalah “ChestPainType”, “FastingBS”, “Cholesterol”, “ExerciseAngina”, “Oldpeak”, dan “ST_Slope”, dikarenakan atribut-atribut ini merupakan faktor-faktor yang mempengaruhi terkenanya penyakit jantung.
# Model 1: logistic regression of HeartDisease on every other column.
# NOTE(review): fitted on the full `data`, not on `train`, so the rows held
# out in `validation` (a subset of `data`) are part of the fit.
logisticModel <- glm(HeartDisease ~ ., family = binomial(link = "logit"), data = data)
summary(logisticModel)
##
## Call:
## glm(formula = HeartDisease ~ ., family = binomial(link = "logit"),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6531 -0.3747 0.1745 0.4457 2.5778
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.163656 1.416003 -0.822 0.411197
## Age 0.016550 0.013197 1.254 0.209803
## SexM 1.466477 0.279834 5.241 1.60e-07 ***
## ChestPainTypeATA -1.830289 0.326293 -5.609 2.03e-08 ***
## ChestPainTypeNAP -1.685682 0.266001 -6.337 2.34e-10 ***
## ChestPainTypeTA -1.488392 0.432572 -3.441 0.000580 ***
## RestingBP 0.004194 0.006010 0.698 0.485296
## Cholesterol -0.004115 0.001087 -3.785 0.000154 ***
## FastingBS 1.136482 0.274999 4.133 3.59e-05 ***
## RestingECGNormal -0.177033 0.271925 -0.651 0.515022
## RestingECGST -0.268546 0.350020 -0.767 0.442945
## MaxHR -0.004288 0.005023 -0.854 0.393249
## ExerciseAnginaY 0.900292 0.244513 3.682 0.000231 ***
## Oldpeak 0.380643 0.118466 3.213 0.001313 **
## ST_SlopeFlat 1.453902 0.429086 3.388 0.000703 ***
## ST_SlopeUp -0.994101 0.450196 -2.208 0.027234 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.14 on 917 degrees of freedom
## Residual deviance: 594.19 on 902 degrees of freedom
## AIC: 626.19
##
## Number of Fisher Scoring iterations: 6
# Model 2: logistic regression on Sex, ChestPainType, Cholesterol, FastingBS,
# ExerciseAngina and ST_Slope only.
# NOTE(review): fitted on the full `data`, not on `train`, so the rows held
# out in `validation` (a subset of `data`) are part of the fit.
logisticModel1 <- glm(HeartDisease~ Sex + ChestPainType + Cholesterol + FastingBS + ExerciseAngina + ST_Slope, family = binomial(link = "logit"), data = data)
summary(logisticModel1)
##
## Call:
## glm(formula = HeartDisease ~ Sex + ChestPainType + Cholesterol +
## FastingBS + ExerciseAngina + ST_Slope, family = binomial(link = "logit"),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5484 -0.3858 0.1961 0.4701 2.6244
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.371629 0.516909 0.719 0.472175
## SexM 1.414102 0.273744 5.166 2.39e-07 ***
## ChestPainTypeATA -1.982943 0.315618 -6.283 3.33e-10 ***
## ChestPainTypeNAP -1.732316 0.257905 -6.717 1.86e-11 ***
## ChestPainTypeTA -1.389718 0.425028 -3.270 0.001077 **
## Cholesterol -0.003930 0.001035 -3.799 0.000146 ***
## FastingBS 1.194154 0.270405 4.416 1.00e-05 ***
## ExerciseAnginaY 1.132184 0.229381 4.936 7.98e-07 ***
## ST_SlopeFlat 0.961588 0.402423 2.389 0.016872 *
## ST_SlopeUp -1.740447 0.405451 -4.293 1.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.14 on 917 degrees of freedom
## Residual deviance: 612.71 on 908 degrees of freedom
## AIC: 632.71
##
## Number of Fisher Scoring iterations: 5
# Model 3: the Model 2 predictors plus Oldpeak.
# NOTE(review): fitted on the full `data`, not on `train`, so the rows held
# out in `validation` (a subset of `data`) are part of the fit; refit on
# `train` before evaluating this model on `validation`.
logisticModel2 <- glm(HeartDisease ~ Sex + ChestPainType + FastingBS + Cholesterol + ExerciseAngina + Oldpeak + ST_Slope, family = binomial(link = "logit"), data = data)
summary(logisticModel2)
##
## Call:
## glm(formula = HeartDisease ~ Sex + ChestPainType + FastingBS +
## Cholesterol + ExerciseAngina + Oldpeak + ST_Slope, family = binomial(link = "logit"),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7578 -0.3763 0.1775 0.4345 2.6728
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.481859 0.562252 -0.857 0.391436
## SexM 1.454586 0.278086 5.231 1.69e-07 ***
## ChestPainTypeATA -1.878771 0.322002 -5.835 5.39e-09 ***
## ChestPainTypeNAP -1.706720 0.260758 -6.545 5.94e-11 ***
## ChestPainTypeTA -1.458703 0.424979 -3.432 0.000598 ***
## FastingBS 1.193157 0.271642 4.392 1.12e-05 ***
## Cholesterol -0.004124 0.001026 -4.019 5.84e-05 ***
## ExerciseAnginaY 0.991359 0.235370 4.212 2.53e-05 ***
## Oldpeak 0.410094 0.115694 3.545 0.000393 ***
## ST_SlopeFlat 1.443532 0.425675 3.391 0.000696 ***
## ST_SlopeUp -1.060365 0.443634 -2.390 0.016840 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262.14 on 917 degrees of freedom
## Residual deviance: 599.61 on 907 degrees of freedom
## AIC: 621.61
##
## Number of Fisher Scoring iterations: 5
Explanation
Untuk model pertama, saya memasukkan semua atribut ke dalam model. Dalam model ini, skor AIC-nya sebesar 626.19. Dikarenakan skor AIC-nya terbilang cukup tinggi, saya membuat model lain untuk mencari alternatif model dengan AIC yang lebih rendah.
Pada model kedua, saya memasukkan atribut-atribut yang memiliki signifikansi tinggi dalam model, yaitu atribut “Sex”, “ChestPainType”, “Cholesterol”, “FastingBS”, “ExerciseAngina”, dan “ST_Slope”. Model ini menghasilkan skor AIC sebesar 632.71, yang lebih tinggi dari skor AIC model pertama. Sehingga, saya memutuskan untuk membuat model ketiga.
Pada model ketiga, saya memasukkan atribut-atribut yang menjadi faktor-faktor yang mempengaruhi terkenanya penyakit jantung, yaitu “ChestPainType”, “FastingBS”, “Cholesterol”, “ExerciseAngina”, “Oldpeak”, dan “ST_Slope”. Lalu saya menambahkan atribut “Sex” sebagai parameter. Model ini menghasilkan skor AIC yang lebih rendah dari model pertama, sehingga saya memutuskan untuk menggunakan model ini untuk dianalisis lebih lanjut.
# Score the validation rows with logisticModel2 on the response scale and
# evaluate the scores against the observed HeartDisease labels with ROCR.
# NOTE(review): logisticModel2 was fitted with data = data, which contains
# the validation rows, so the AUC below is not an out-of-sample estimate.
predictionLogistic <- predict(logisticModel2, validation, type ="response")
pd <- prediction(predictionLogistic, validation$HeartDisease)
# ROC curve: true-positive rate against false-positive rate.
rocCurve <- performance(pd, measure = "tpr", x.measure = "fpr")
plot(rocCurve)
# Area under the ROC curve; the number is pulled out of the y.values slot,
# overwriting the performance object held in `auc`.
auc <- performance(pd, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.9437831
Explanation
# Fit a classification tree (method = "class") for HeartDisease on the
# training partition, using all other columns as candidate predictors.
DTModel <- rpart(HeartDisease ~., data = train, method = "class")
# Print the fitted tree's node listing.
DTModel
## n= 735
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 735 326 1 (0.4435374 0.5564626)
## 2) ST_Slope=Up 307 60 0 (0.8045603 0.1954397)
## 4) Cholesterol>=42.5 268 33 0 (0.8768657 0.1231343)
## 8) ChestPainType=ATA,NAP,TA 192 9 0 (0.9531250 0.0468750) *
## 9) ChestPainType=ASY 76 24 0 (0.6842105 0.3157895)
## 18) ExerciseAngina=N 57 10 0 (0.8245614 0.1754386) *
## 19) ExerciseAngina=Y 19 5 1 (0.2631579 0.7368421) *
## 5) Cholesterol< 42.5 39 12 1 (0.3076923 0.6923077)
## 10) FastingBS< 0.5 20 8 0 (0.6000000 0.4000000) *
## 11) FastingBS>=0.5 19 0 1 (0.0000000 1.0000000) *
## 3) ST_Slope=Down,Flat 428 79 1 (0.1845794 0.8154206)
## 6) Sex=F 72 34 1 (0.4722222 0.5277778)
## 12) ChestPainType=ATA,NAP,TA 28 7 0 (0.7500000 0.2500000) *
## 13) ChestPainType=ASY 44 13 1 (0.2954545 0.7045455) *
## 7) Sex=M 356 45 1 (0.1264045 0.8735955) *
# Draw the fitted tree.
rpart.plot(DTModel)
# Predict a class label for every validation row.
predictionDT <- predict(DTModel, validation, type = "class")
# Confusion matrix: rows are the predicted classes, columns are the observed
# HeartDisease values.
cm <- table(predictionDT, validation$HeartDisease)
cm
##
## predictionDT 0 1
## 0 70 10
## 1 14 89
print(paste("Accuracy: ", sum(diag(cm))/sum(cm)))
## [1] "Accuracy: 0.868852459016393"
print(paste("Misclassification: ", 1 - sum(diag(cm))/sum(cm)))
## [1] "Misclassification: 0.131147540983607"
Explanation
Setelah dibuat model decision tree, dengan kelas positif HeartDisease = 1, diketahui bahwa ada 89 data yang termasuk True Positive, 14 data termasuk False Positive, 10 data termasuk False Negative, dan 70 data termasuk True Negative.
Kita dapat hitung tingkat akurasinya sebesar 87% dan misklasifikasinya sebesar 13%. Sehingga kita dapat mengatakan bahwa model ini merupakan model yang baik untuk memprediksi apakah seseorang memiliki penyakit jantung atau tidak.