LOAD LIBRARY
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(tidyr)
LOAD DATASET
# Read the raw hospital records from CSV and preview the first 15 rows
data_rs <- read.csv(file = "hospital_dataset.csv")
head(data_rs, n = 15)
## Nama Tanggal_Lahir Tensi Skin_Stiffness_N_per_mm
## 1 Michael Anderson 01/04/1957 112/67 0.69
## 2 N/A 20/09/1975 140 / 91 1.50
## 3 Tan Wei Ming 12/04/1965 134/72 0.76
## 4 Shen Yi-Ching 11/09/1980 120/79 1.92
## 5 Kung Mei-Lin 22/08/1985 99/77 0.81
## 6 Ho Chuan-Wei 10/08/1962 149/65 0.61
## 7 18/01/1994 110/71 1.04
## 8 Betty Lewis 02/08/1982 108/67 2.24
## 9 Joseph Garcia 06/12/1982 0.18
## 10 Ong Lay Kheng 26/02/1951 128/78 NA
## 11 Lin Mei-Ling 16/02/1944 113/75 0.25
## 12 Tan Ah Kow 113/68 0.87
## 13 Tan Wei Ming 03/10/1946 105/90 1.92
## 14 N/A 02/11/1957 128/62 1.07
## 15 Hsu Kuo-Chang 18/03/1973 102/80 0.38
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit
## 1 42.0 37.6 Non-Diabetic
## 2 41.9 36.5°C Non-Diabetic
## 3 26.3 37.5 Non-Diabetic
## 4 NA 37.0 Diabetic
## 5 25.5 36.0 Diabetic
## 6 42.2 36.8 Non-Diabetic
## 7 2.0 36.3 Diabetic
## 8 9.5 36.4 Diabetic
## 9 24.8 36.9 Non-Diabetic
## 10 40.9 36.6 Non-Diabetic
## 11 44.0 37.2celcius Non-Diabetic
## 12 23.1 36.4 Diabetic
## 13 6.5 37.1 Diabetic
## 14 20.0 37.1 Diabetic
## 15 53.5 36.5
## Peak_Plantar_Pressure_kPa
## 1 294.0
## 2 NA
## 3 431.8
## 4 577.5
## 5 502.3
## 6 201.4
## 7 512.8
## 8 327.7
## 9 NA
## 10 308.9
## 11 NA
## 12 327.8
## 13 623.0
## 14 513.7
## 15 254.2
CEK NILAI UNIK, SUMMARY, DAN JUMLAH N/A AWAL
# Data structure: type and leading values of every column
data_rs %>%
  str()
## 'data.frame': 700 obs. of 8 variables:
## $ Nama : chr "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
## $ Tanggal_Lahir : chr "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
## $ Tensi : chr "112/67" "140 / 91" "134/72" "120/79" ...
## $ Skin_Stiffness_N_per_mm : num 0.69 1.5 0.76 1.92 0.81 0.61 1.04 2.24 0.18 NA ...
## $ Microcirculation_PU : num 42 41.9 26.3 NA 25.5 42.2 2 9.5 24.8 40.9 ...
## $ Suhu_Tubuh_Celcius : chr "37.6" "36.5°C" "37.5" "37.0" ...
## $ Penyakit : chr "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
## $ Peak_Plantar_Pressure_kPa: num 294 NA 432 578 502 ...
- Inkonsistensi Data
# List the distinct values of the text columns to spot inconsistent entries
data_rs %>%
  select(Nama, Penyakit, Suhu_Tubuh_Celcius) %>%
  lapply(unique)
## $Nama
## [1] "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching"
## [5] "Kung Mei-Lin" "Ho Chuan-Wei" "" "Betty Lewis"
## [9] "Joseph Garcia" "Ong Lay Kheng" "Lin Mei-Ling" "Tan Ah Kow"
## [13] "Hsu Kuo-Chang" "Lee Siew Eng" "John Smith" "Karen Thompson"
## [17] "Chou Mei-Yu" "Barbara Taylor" "Cheng Shu-Fen" "Yen Kuo-Jung"
## [21] "Charles Clark" "Chang Chung-Wei" "Joseph Walker" "William Thomas"
## [25] "Fang Shu-Chen" "Tseng Wen-Liang" "Tung Li-Fang" "Hsieh Shu-Hui"
## [29] "Robert Wilson" "Pasien" "UNKNOWN" "Linda Martinez"
## [33] "Richard Martin" "Huang Li-Chen" "Nancy Robinson" "Jessica White"
## [37] "Helen Hall" "Susan Jackson" "Lu Hsiang-Ling" "???"
## [41] "Ng Boon Hua" "Wu Ming-Hui" "Tsai Chin-Lung" "Yang Hsiu-Mei"
## [45] "James Brown" "Patricia Davis" "Liao Chih-Cheng" "Wang Jie"
## [49] "Liu Hsiao-Fen" "Chiu Yu-Chin" "Pan Mei-Hsuan" "Mary Johnson"
## [53] "David Harris" "Chen Wei" "123456" "Kao Chin-Feng"
## [57] "unknown" "NULL" "."
##
## $Penyakit
## [1] "Non-Diabetic" "Diabetic" "" "Sehat" "Sakit"
## [6] "Tidak" "Yes" "No" "NON-DIABETIC" "Normal"
## [11] "DIABETIC" "DM" "diabetic" "1" "non-diabetic"
## [16] "Healthy"
##
## $Suhu_Tubuh_Celcius
## [1] "37.6" "36.5°C" "37.5" "37.0" "36.0"
## [6] "36.8" "36.3" "36.4" "36.9" "36.6"
## [11] "37.2celcius" "37.1" "36.5" "36.9 C" "36.7"
## [16] "37.4" "37.2" "35.7" "36.2" ""
## [21] "37.3" "36.1" "36.5 derajat" "37.8" "42.5"
## [26] "35.9" "36.9°C" "37.7" "35.5" "37.0 derajat"
## [31] "35.6" "99.9" "36.1 derajat" "36.7 derajat" "38.0"
## [36] "-1.0" "35.8" "37.4°C" "37.1celcius" "37.2°C"
## [41] "36.6 derajat"
# Summarise the numeric columns to reveal extreme values (outliers)
data_rs %>%
  select(where(is.numeric)) %>%
  summary()
## Skin_Stiffness_N_per_mm Microcirculation_PU Peak_Plantar_Pressure_kPa
## Min. : -2.180 Min. : -32.50 Min. : -100.0
## 1st Qu.: 0.700 1st Qu.: 18.00 1st Qu.: 268.6
## Median : 1.100 Median : 27.70 Median : 384.3
## Mean : 1.342 Mean : 35.58 Mean : 991.9
## 3rd Qu.: 1.595 3rd Qu.: 39.00 3rd Qu.: 508.5
## Max. :150.000 Max. :5000.00 Max. :99999.0
## NA's :37 NA's :50 NA's :43
- Missing Value
# Count the missing (NA) values per column before any cleaning
data_rs %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Tensi
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 50 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 43
- Duplikasi Data
# Report how many rows are exact repeats of an earlier row
data_rs %>%
  duplicated() %>%
  sum() %>%
  cat("Jumlah baris yang duplikat identik:", .)
## Jumlah baris yang duplikat identik: 2
# Flag every member of a group of identical rows: the repeats (forward scan)
# as well as the first occurrence (backward scan), then print those rows
is_repeated <- duplicated(data_rs) | duplicated(data_rs, fromLast = TRUE)
data_rs[is_repeated, ]
## Nama Tanggal_Lahir Tensi Skin_Stiffness_N_per_mm
## 108 Hsieh Shu-Hui 15/01/1967 96/73 1.43
## 277 Betty Lewis 23/05/1988 121/96 1.66
## 342 Betty Lewis 23/05/1988 121/96 1.66
## 556 Hsieh Shu-Hui 15/01/1967 96/73 1.43
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 108 36.9 37.0 Diabetic 393.2
## 277 14.6 36.6 Diabetic 475.8
## 342 14.6 36.6 Diabetic 475.8
## 556 36.9 37.0 Diabetic 393.2
# Keep a single copy of each fully identical row
data_rs <- distinct(data_rs)
PROSES CLEANING
- Kolom Nama
# Entries that stand in for a name without identifying a patient
nama_placeholder <- c(
  "???", "N/A", "123456", "Pasien", "UNKNOWN", "unknown", "NULL", ".", ""
)

# Missing names and placeholder entries become "Unknown"; real names are kept
data_rs <- data_rs %>%
  mutate(
    Nama = if_else(is.na(Nama) | Nama %in% nama_placeholder, "Unknown", Nama)
  )
- Kolom Tanggal_Lahir & Umur
# Reference year used to convert a birth year into an age
tahun_acuan <- 2026

data_rs <- data_rs %>%
  mutate(
    # The first run of four digits in the birth date is taken as the birth year
    Umur = tahun_acuan - as.numeric(str_extract(Tanggal_Lahir, "\\d{4}")),
    # Rows without a usable birth year receive the median age (whole years)
    Umur = as.integer(if_else(is.na(Umur), median(Umur, na.rm = TRUE), Umur)),
    # Normalise incomplete birth dates to the form 01/01/<year>:
    #  - empty or missing date: year derived back from the (imputed) age
    #  - year-only entry: that year
    Tanggal_Lahir = case_when(
      is.na(Tanggal_Lahir) | Tanggal_Lahir == "" ~
        paste0("01/01/", tahun_acuan - Umur),
      str_detect(Tanggal_Lahir, "^\\d{4}$") ~
        paste0("01/01/", Tanggal_Lahir),
      TRUE ~ Tanggal_Lahir
    )
  ) %>%
  # Place the age column directly after the birth date column
  relocate(Umur, .after = Tanggal_Lahir)
- Kolom Tensi
# Split the blood pressure text into two integer columns: the first number is
# the systolic reading and the second number (when present) the diastolic one.
# The original Tensi column is kept (remove = FALSE).
data_rs <- tidyr::extract(
  data_rs,
  col = Tensi,
  into = c("Sistolik", "Diastolik"),
  regex = "(\\d+)(?:\\D+(\\d+))?",
  remove = FALSE,
  convert = TRUE
)

# Medians, truncated to whole numbers, used to fill the missing readings
median_sistolik <- as.integer(median(data_rs$Sistolik, na.rm = TRUE))
median_diastolik <- as.integer(median(data_rs$Diastolik, na.rm = TRUE))

data_rs <- data_rs %>%
  mutate(
    Sistolik = replace_na(Sistolik, median_sistolik),
    Diastolik = replace_na(Diastolik, median_diastolik),
    # Rebuild Tensi for every row so empty or systolic-only entries are
    # completed and all rows share the "systolic/diastolic" format
    Tensi = paste0(Sistolik, "/", Diastolik)
  ) %>%
  # Place the two new columns directly after Tensi
  relocate(Sistolik, Diastolik, .after = Tensi)

data_rs %>%
  select(Sistolik, Diastolik) %>%
  summary()
## Sistolik Diastolik
## Min. : 78.0 Min. : 46.00
## 1st Qu.:113.0 1st Qu.: 72.00
## Median :121.0 Median : 78.00
## Mean :121.3 Mean : 78.25
## 3rd Qu.:131.0 3rd Qu.: 84.75
## Max. :164.0 Max. :113.00
- Kolom Skin_Stiffness_N_per_mm
# Readings accepted as plausible by this script: greater than 0 and at most 10.
# One mask drives both the median and the replacement, so the two steps cannot
# disagree at the boundary (previously the median excluded a reading of exactly
# 10 while the replacement rule kept it).
skin_valid <- !is.na(data_rs$Skin_Stiffness_N_per_mm) &
  data_rs$Skin_Stiffness_N_per_mm > 0 &
  data_rs$Skin_Stiffness_N_per_mm <= 10

# Median of the plausible readings only
median_skin <- median(data_rs$Skin_Stiffness_N_per_mm[skin_valid])

# Missing and out-of-range readings take the median; plausible ones are kept
data_rs <- data_rs %>%
  mutate(
    Skin_Stiffness_N_per_mm = replace(
      Skin_Stiffness_N_per_mm,
      !skin_valid,
      median_skin
    )
  )
summary(data_rs$Skin_Stiffness_N_per_mm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 0.720 1.100 1.136 1.560 2.900
- Kolom Microcirculation_PU
# Median of the plausible readings only (greater than 0 and at most 100)
median_micro <- with(
  data_rs,
  median(
    Microcirculation_PU[Microcirculation_PU > 0 & Microcirculation_PU <= 100],
    na.rm = TRUE
  )
)

# Keep plausible readings; missing and out-of-range readings take the median
data_rs <- data_rs %>%
  mutate(
    Microcirculation_PU = if_else(
      Microcirculation_PU > 0 & Microcirculation_PU <= 100,
      Microcirculation_PU,
      median_micro,
      missing = median_micro
    )
  )
summary(data_rs$Microcirculation_PU)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.82 28.00 28.46 37.75 77.30
- Kolom Suhu_Tubuh_Celcius
# Strip the unit text (celcius / derajat / °C / C, case-insensitive) and any
# whitespace, then convert the remaining text to a number
data_rs <- data_rs %>%
  mutate(
    Suhu_Tubuh_Celcius = as.numeric(
      str_remove_all(
        str_replace_all(Suhu_Tubuh_Celcius, "(?i)celcius|derajat|°C|C", ""),
        "\\s+"
      )
    )
  )

# Median of the temperatures inside the accepted 30-40 range
median_suhu <- with(
  data_rs,
  median(
    Suhu_Tubuh_Celcius[between(Suhu_Tubuh_Celcius, 30, 40)],
    na.rm = TRUE
  )
)

# Keep in-range temperatures; missing and out-of-range ones take the median
data_rs <- data_rs %>%
  mutate(
    Suhu_Tubuh_Celcius = if_else(
      between(Suhu_Tubuh_Celcius, 30, 40),
      Suhu_Tubuh_Celcius,
      median_suhu,
      missing = median_suhu
    )
  )
summary(data_rs$Suhu_Tubuh_Celcius)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.50 36.50 36.80 36.78 37.00 38.00
- Kolom Penyakit
# Label groups (compared after lower-casing and trimming):
# 1. Diabetic: sakit, yes, diabetic, dm, 1
# 2. Non-Diabetic: sehat, tidak, no, normal, non-diabetic, healthy
label_diabetic <- c("sakit", "yes", "diabetic", "dm", "1")

data_rs <- data_rs %>%
  mutate(
    # Normalise case and surrounding whitespace before matching
    Penyakit = str_trim(str_to_lower(as.character(Penyakit))),
    # Anything that is not a diabetic label becomes "Non-Diabetic"; this
    # includes the listed healthy labels, empty or missing entries, and any
    # label not listed above.
    # NOTE(review): missing/unrecognised labels are counted as Non-Diabetic -
    # confirm this is the intended handling.
    Penyakit = if_else(
      Penyakit %in% label_diabetic,
      "Diabetic",
      "Non-Diabetic"
    ),
    # Store as a factor with "Non-Diabetic" as the first level
    Penyakit = factor(Penyakit, levels = c("Non-Diabetic", "Diabetic"))
  )
summary(data_rs$Penyakit)
## Non-Diabetic Diabetic
## 391 307
- Kolom Peak_Plantar_Pressure_kPa
# Median of the readings inside the accepted 10-1000 range
median_pppk <- with(
  data_rs,
  median(
    Peak_Plantar_Pressure_kPa[
      Peak_Plantar_Pressure_kPa >= 10 & Peak_Plantar_Pressure_kPa <= 1000
    ],
    na.rm = TRUE
  )
)

# Missing, too-low and too-high readings take the median; the rest are kept
data_rs <- data_rs %>%
  mutate(
    Peak_Plantar_Pressure_kPa = case_when(
      is.na(Peak_Plantar_Pressure_kPa) ~ median_pppk,
      Peak_Plantar_Pressure_kPa < 10 ~ median_pppk,
      Peak_Plantar_Pressure_kPa > 1000 ~ median_pppk,
      TRUE ~ Peak_Plantar_Pressure_kPa
    )
  )
summary(data_rs$Peak_Plantar_Pressure_kPa)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 52.1 283.0 384.3 389.1 500.9 715.4
CEK DATA AKHIR
- Cek ringkasan data
# Summary of every column after cleaning
data_rs %>%
  summary()
## Nama Tanggal_Lahir Umur Tensi
## Length:698 Length:698 Min. :21.00 Length:698
## Class :character Class :character 1st Qu.:38.00 Class :character
## Mode :character Mode :character Median :53.00 Mode :character
## Mean :53.31
## 3rd Qu.:69.00
## Max. :86.00
## Sistolik Diastolik Skin_Stiffness_N_per_mm Microcirculation_PU
## Min. : 78.0 Min. : 46.00 Min. :0.100 Min. : 1.00
## 1st Qu.:113.0 1st Qu.: 72.00 1st Qu.:0.720 1st Qu.:19.82
## Median :121.0 Median : 78.00 Median :1.100 Median :28.00
## Mean :121.3 Mean : 78.25 Mean :1.136 Mean :28.46
## 3rd Qu.:131.0 3rd Qu.: 84.75 3rd Qu.:1.560 3rd Qu.:37.75
## Max. :164.0 Max. :113.00 Max. :2.900 Max. :77.30
## Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## Min. :35.50 Non-Diabetic:391 Min. : 52.1
## 1st Qu.:36.50 Diabetic :307 1st Qu.:283.0
## Median :36.80 Median :384.3
## Mean :36.78 Mean :389.1
## 3rd Qu.:37.00 3rd Qu.:500.9
## Max. :38.00 Max. :715.4
- Cek Missing value
# Count the missing (NA) values per column after cleaning
data_rs %>%
  is.na() %>%
  colSums()
## Nama Tanggal_Lahir Umur
## 0 0 0
## Tensi Sistolik Diastolik
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 0 0 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 0