library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 4.4.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
# Load the stroke dataset from CSV into a data frame.
Data1 <- read.csv("C:/Users/LENOVO/Documents/sm unnes 22/ghina tugas/SMST 6/data mining/healthcare-dataset-stroke-data.csv")
# View() opens a data viewer window, so only call it when a user is actually
# at the console; skip it in batch / knitted runs.
if (interactive()) {
  View(Data1)
}
# Print the column names and types of the loaded data.
str(Data1)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : chr "36.6" "N/A" "32.5" "34.4" ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
# Clean the `bmi` column. It is read as character because missing values are
# written as the text "N/A"; mark those entries as real NA values first.
is.na(Data1$bmi) <- Data1$bmi == "N/A"
# Convert the column from character to numeric.
Data1$bmi <- as.numeric(Data1$bmi)
# Mean of the observed bmi values (NA entries are ignored).
Mean_value <- mean(Data1$bmi, na.rm = TRUE)
# Mean imputation: every missing bmi is replaced by that mean.
Data1$bmi <- replace(Data1$bmi, is.na(Data1$bmi), Mean_value)
head(Data1$bmi, n = 20)
## [1] 36.60000 28.89324 32.50000 34.40000 24.00000 29.00000 27.40000 22.80000
## [9] 28.89324 24.20000 29.70000 36.80000 27.30000 28.89324 28.20000 30.90000
## [17] 37.50000 25.80000 37.80000 28.89324
# Confirm that no missing values remain anywhere in the data frame.
anyNA(Data1)
## [1] FALSE
# Drop the `id` column so it is not used as a model input.
Data1[["id"]] <- NULL
# Expand all remaining columns into a numeric design matrix: character columns
# become 0/1 indicator columns, and `- 1` removes the intercept column.
# The matrix is then converted to a data frame.
Dummy_data <- as.data.frame(model.matrix(~ . - 1, data = Data1))
# Show the first rows of the encoded data.
head(Dummy_data, n = 5)
## genderFemale genderMale genderOther age hypertension heart_disease
## 1 0 1 0 67 0 1
## 2 1 0 0 61 0 0
## 3 0 1 0 80 0 1
## 4 1 0 0 49 0 0
## 5 1 0 0 79 1 0
## ever_marriedYes work_typeGovt_job work_typeNever_worked work_typePrivate
## 1 1 0 0 1
## 2 1 0 0 0
## 3 1 0 0 1
## 4 1 0 0 1
## 5 1 0 0 0
## work_typeSelf-employed Residence_typeUrban avg_glucose_level bmi
## 1 0 1 228.69 36.60000
## 2 1 0 202.21 28.89324
## 3 0 0 105.92 32.50000
## 4 0 1 171.23 34.40000
## 5 1 0 174.12 24.00000
## smoking_statusnever smoked smoking_statussmokes smoking_statusUnknown stroke
## 1 0 0 0 1
## 2 1 0 0 1
## 3 1 0 0 1
## 4 0 1 0 1
## 5 1 0 0 1
# Standardise (z-score) the predictor columns only. The binary target `stroke`
# must keep its 0/1 coding: scaling it turns the labels into values such as
# -0.226 / 4.418, which breaks any later comparison against 0/1 predictions.
feature_cols <- setdiff(names(Dummy_data), "stroke")
scaled <- Dummy_data
scaled[feature_cols] <- as.data.frame(
  scale(Dummy_data[feature_cols], center = TRUE, scale = TRUE)
)
# NOTE(review): the means/SDs are computed on all rows, i.e. before the
# train/test split further down; fit them on the training rows only if strict
# separation between training and test data is required.
# NOTE: the console output recorded after this line was produced by an earlier
# version that also standardised `stroke`; re-run the script to refresh it.
head(scaled, 6)
## genderFemale genderMale genderOther age hypertension heart_disease
## 1 -1.1893935 1.1898733 -0.01398909 1.0513314 -0.3285697 4.1846225
## 2 0.8406001 -0.8402611 -0.01398909 0.7859932 -0.3285697 -0.2389234
## 3 -1.1893935 1.1898733 -0.01398909 1.6262309 -0.3285697 4.1846225
## 4 0.8406001 -0.8402611 -0.01398909 0.2553167 -0.3285697 -0.2389234
## 5 0.8406001 -0.8402611 -0.01398909 1.5820079 3.0428986 -0.2389234
## 6 -1.1893935 1.1898733 -0.01398909 1.6704540 -0.3285697 -0.2389234
## ever_marriedYes work_typeGovt_job work_typeNever_worked work_typePrivate
## 1 0.7238134 -0.3840731 -0.06574993 0.864212
## 2 0.7238134 -0.3840731 -0.06574993 -1.156897
## 3 0.7238134 -0.3840731 -0.06574993 0.864212
## 4 0.7238134 -0.3840731 -0.06574993 0.864212
## 5 0.7238134 -0.3840731 -0.06574993 -1.156897
## 6 0.7238134 -0.3840731 -0.06574993 0.864212
## work_typeSelf-employed Residence_typeUrban avg_glucose_level bmi
## 1 -0.4368378 0.9839834 2.706110617 1.00113604
## 2 2.2887313 -1.0160784 2.121350940 0.00000000
## 3 -0.4368378 -1.0160784 -0.005027809 0.46853140
## 4 -0.4368378 0.9839834 1.437217451 0.71534819
## 5 2.2887313 -1.0160784 1.501037522 -0.63564895
## 6 -0.4368378 0.9839834 1.768021830 0.01386891
## smoking_statusnever smoked smoking_statussmokes smoking_statusUnknown
## 1 -0.7666993 -0.4272714 -0.6579463
## 2 1.3040371 -0.4272714 -0.6579463
## 3 1.3040371 -0.4272714 -0.6579463
## 4 -0.7666993 2.3399745 -0.6579463
## 5 1.3040371 -0.4272714 -0.6579463
## 6 -0.7666993 -0.4272714 -0.6579463
## stroke
## 1 4.417948
## 2 4.417948
## 3 4.417948
## 4 4.417948
## 5 4.417948
## 6 4.417948
# Train/test partition: 60% of the rows (rounded down) are drawn at random for
# training; all remaining rows form the test set.
set.seed(80)
samplesize <- floor(nrow(scaled) * 0.6)
index <- sample.int(nrow(scaled), size = samplesize)
trainNN <- scaled[index, ]
testNN <- scaled[-index, ]
# Report the size of each partition.
cat("Jumlah data training:", nrow(trainNN), "\n")
## Jumlah data training: 3066
cat("Jumlah data testing :", nrow(testNN), "\n")
## Jumlah data testing : 2044
# (Optional) Peek at the first four columns of each partition.
head(trainNN[, seq_len(4)]) # training rows
## genderFemale genderMale genderOther age
## 4235 0.8406001 -0.8402611 -0.01398909 -1.0271515
## 3883 -1.1893935 1.1898733 -0.01398909 1.6704540
## 2335 -1.1893935 1.1898733 -0.01398909 1.3166696
## 4843 -1.1893935 1.1898733 -0.01398909 -1.8320109
## 486 0.8406001 -0.8402611 -0.01398909 -0.5406981
## 3172 0.8406001 -0.8402611 -0.01398909 -0.4522520
head(testNN[, seq_len(4)]) # test rows
## genderFemale genderMale genderOther age
## 1 -1.1893935 1.1898733 -0.01398909 1.0513314
## 3 -1.1893935 1.1898733 -0.01398909 1.6262309
## 6 -1.1893935 1.1898733 -0.01398909 1.6704540
## 9 0.8406001 -0.8402611 -0.01398909 0.6975471
## 10 0.8406001 -0.8402611 -0.01398909 1.5377848
## 11 0.8406001 -0.8402611 -0.01398909 1.6704540
# Compact overview of every column in the training partition.
glimpse(trainNN)
## Rows: 3,066
## Columns: 18
## $ genderFemale <dbl> 0.8406001, -1.1893935, -1.1893935, -1.189…
## $ genderMale <dbl> -0.8402611, 1.1898733, 1.1898733, 1.18987…
## $ genderOther <dbl> -0.01398909, -0.01398909, -0.01398909, -0…
## $ age <dbl> -1.02715152, 1.67045397, 1.31666964, -1.8…
## $ hypertension <dbl> -0.3285697, 3.0428986, -0.3285697, -0.328…
## $ heart_disease <dbl> -0.2389234, 4.1846225, -0.2389234, -0.238…
## $ ever_marriedYes <dbl> -1.3813012, 0.7238134, -1.3813012, -1.381…
## $ work_typeGovt_job <dbl> -0.3840731, 2.6031618, 2.6031618, -0.3840…
## $ work_typeNever_worked <dbl> -0.06574993, -0.06574993, -0.06574993, -0…
## $ work_typePrivate <dbl> 0.864212, -1.156897, -1.156897, -1.156897…
## $ `work_typeSelf-employed` <dbl> -0.4368378, -0.4368378, -0.4368378, -0.43…
## $ Residence_typeUrban <dbl> -1.0160784, -1.0160784, -1.0160784, 0.983…
## $ avg_glucose_level <dbl> -0.05515638, 2.46871762, -0.58647503, 1.0…
## $ bmi <dbl> -1.54497394, -0.62265859, 0.32563748, -1.…
## $ `smoking_statusnever smoked` <dbl> -0.7666993, -0.7666993, -0.7666993, -0.76…
## $ smoking_statussmokes <dbl> -0.4272714, -0.4272714, 2.3399745, -0.427…
## $ smoking_statusUnknown <dbl> -0.6579463, -0.6579463, -0.6579463, 1.519…
## $ stroke <dbl> -0.2263051, -0.2263051, -0.2263051, -0.22…
# ---- Train the neural network -------------------------------------------
# Predictor columns used by the model; the same vector selects the test-set
# inputs below so that names and order always match the formula.
model_inputs <- c(
  "ever_marriedYes", "age", "hypertension",
  "avg_glucose_level", "bmi", "smoking_statussmokes"
)

# Build the training frame with the original 0/1 labels from Dummy_data.
# trainNN is scaled[index, ], so Dummy_data[index, ] is row-aligned with it.
# This keeps the target binary regardless of how `scaled` was produced.
train_model <- trainNN[, model_inputs]
train_model$stroke <- Dummy_data[index, "stroke"]

set.seed(2)
model_ann <- neuralnet(
  stroke ~ ever_marriedYes + age + hypertension + avg_glucose_level + bmi +
    smoking_statussmokes,
  data = train_model,
  hidden = 3,
  # Binary classification: apply the (default logistic) activation to the
  # output neuron so the network returns values between 0 and 1.
  linear.output = FALSE
)
# neuralnet() only warns when training does not converge; fail early with a
# clear message instead of a confusing error in the plotting/prediction steps.
if (is.null(model_ann$weights)) {
  stop(
    "neuralnet() did not converge; increase `stepmax` or relax `threshold`.",
    call. = FALSE
  )
}

# ---- Visualise the fitted network ----------------------------------------
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 4.4.3
plotnet(model_ann)

plot(model_ann)

# ---- Predict on the test set ---------------------------------------------
# Select the inputs with the same names and order as used in the model.
input_test <- testNN[, model_inputs]
# Forward pass; neuralnet::compute is called explicitly because dplyr also
# exports a function named compute().
output <- neuralnet::compute(model_ann, input_test)
# Predicted probability of stroke for each test row.
results <- output$net.result
# Classify as 1 when the predicted probability reaches 0.5, otherwise 0.
roundedresults <- as.integer(results >= 0.5)

# Compare predictions with the true 0/1 labels (taken from the unscaled data;
# -index selects the same rows as testNN).
data_asli <- Dummy_data
hasil_akhir <- data.frame(
  Prediksi = roundedresults,
  Asli = data_asli[-index, "stroke"]
)
head(hasil_akhir)

# ---- Evaluate -------------------------------------------------------------
prediction <- factor(roundedresults, levels = c(0, 1))
actual <- factor(data_asli[-index, "stroke"], levels = c(0, 1))
# Report sensitivity/specificity with stroke (1) as the positive class.
# NOTE: the confusion-matrix output recorded below in this file came from the
# earlier version (standardised target, rounded linear output, class 0 as
# positive); re-run the script to refresh it.
confusionMatrix(prediction, actual, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1792 63
## 1 144 31
##
## Accuracy : 0.898
## 95% CI : (0.884, 0.9109)
## No Information Rate : 0.9537
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1811
##
## Mcnemar's Test P-Value : 2.692e-08
##
## Sensitivity : 0.9256
## Specificity : 0.3298
## Pos Pred Value : 0.9660
## Neg Pred Value : 0.1771
## Prevalence : 0.9537
## Detection Rate : 0.8828
## Detection Prevalence : 0.9138
## Balanced Accuracy : 0.6277
##
## 'Positive' Class : 0
##