LOAD LIBRARY

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(tidyr)

LOAD DATASET

# Read the raw hospital records from CSV and preview the first 15 rows
data_rs <- read.csv(file = "hospital_dataset.csv")
head(data_rs, n = 15)
##                Nama Tanggal_Lahir    Tensi Skin_Stiffness_N_per_mm
## 1  Michael Anderson    01/04/1957   112/67                    0.69
## 2               N/A    20/09/1975 140 / 91                    1.50
## 3      Tan Wei Ming    12/04/1965   134/72                    0.76
## 4     Shen Yi-Ching    11/09/1980   120/79                    1.92
## 5      Kung Mei-Lin    22/08/1985    99/77                    0.81
## 6      Ho Chuan-Wei    10/08/1962   149/65                    0.61
## 7                      18/01/1994   110/71                    1.04
## 8       Betty Lewis    02/08/1982   108/67                    2.24
## 9     Joseph Garcia    06/12/1982                             0.18
## 10    Ong Lay Kheng    26/02/1951   128/78                      NA
## 11     Lin Mei-Ling    16/02/1944   113/75                    0.25
## 12       Tan Ah Kow                 113/68                    0.87
## 13     Tan Wei Ming    03/10/1946   105/90                    1.92
## 14              N/A    02/11/1957   128/62                    1.07
## 15    Hsu Kuo-Chang    18/03/1973   102/80                    0.38
##    Microcirculation_PU Suhu_Tubuh_Celcius     Penyakit
## 1                 42.0               37.6 Non-Diabetic
## 2                 41.9             36.5°C Non-Diabetic
## 3                 26.3               37.5 Non-Diabetic
## 4                   NA               37.0     Diabetic
## 5                 25.5               36.0     Diabetic
## 6                 42.2               36.8 Non-Diabetic
## 7                  2.0               36.3     Diabetic
## 8                  9.5               36.4     Diabetic
## 9                 24.8               36.9 Non-Diabetic
## 10                40.9               36.6 Non-Diabetic
## 11                44.0        37.2celcius Non-Diabetic
## 12                23.1               36.4     Diabetic
## 13                 6.5               37.1     Diabetic
## 14                20.0               37.1     Diabetic
## 15                53.5               36.5             
##    Peak_Plantar_Pressure_kPa
## 1                      294.0
## 2                         NA
## 3                      431.8
## 4                      577.5
## 5                      502.3
## 6                      201.4
## 7                      512.8
## 8                      327.7
## 9                         NA
## 10                     308.9
## 11                        NA
## 12                     327.8
## 13                     623.0
## 14                     513.7
## 15                     254.2

CEK NILAI UNIK, SUMMARY, DAN JUMLAH N/A AWAL

# Inspect the column types and a sample of values for each variable
data_rs %>% str()
## 'data.frame':    700 obs. of  8 variables:
##  $ Nama                     : chr  "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
##  $ Tanggal_Lahir            : chr  "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
##  $ Tensi                    : chr  "112/67" "140 / 91" "134/72" "120/79" ...
##  $ Skin_Stiffness_N_per_mm  : num  0.69 1.5 0.76 1.92 0.81 0.61 1.04 2.24 0.18 NA ...
##  $ Microcirculation_PU      : num  42 41.9 26.3 NA 25.5 42.2 2 9.5 24.8 40.9 ...
##  $ Suhu_Tubuh_Celcius       : chr  "37.6" "36.5°C" "37.5" "37.0" ...
##  $ Penyakit                 : chr  "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
##  $ Peak_Plantar_Pressure_kPa: num  294 NA 432 578 502 ...

- Inkonsistensi Data

# List the distinct values of the free-text columns to spot inconsistent entries
data_rs %>%
  select(Nama, Penyakit, Suhu_Tubuh_Celcius) %>%
  lapply(unique)
## $Nama
##  [1] "Michael Anderson" "N/A"              "Tan Wei Ming"     "Shen Yi-Ching"   
##  [5] "Kung Mei-Lin"     "Ho Chuan-Wei"     ""                 "Betty Lewis"     
##  [9] "Joseph Garcia"    "Ong Lay Kheng"    "Lin Mei-Ling"     "Tan Ah Kow"      
## [13] "Hsu Kuo-Chang"    "Lee Siew Eng"     "John Smith"       "Karen Thompson"  
## [17] "Chou Mei-Yu"      "Barbara Taylor"   "Cheng Shu-Fen"    "Yen Kuo-Jung"    
## [21] "Charles Clark"    "Chang Chung-Wei"  "Joseph Walker"    "William Thomas"  
## [25] "Fang Shu-Chen"    "Tseng Wen-Liang"  "Tung Li-Fang"     "Hsieh Shu-Hui"   
## [29] "Robert Wilson"    "Pasien"           "UNKNOWN"          "Linda Martinez"  
## [33] "Richard Martin"   "Huang Li-Chen"    "Nancy Robinson"   "Jessica White"   
## [37] "Helen Hall"       "Susan Jackson"    "Lu Hsiang-Ling"   "???"             
## [41] "Ng Boon Hua"      "Wu Ming-Hui"      "Tsai Chin-Lung"   "Yang Hsiu-Mei"   
## [45] "James Brown"      "Patricia Davis"   "Liao Chih-Cheng"  "Wang Jie"        
## [49] "Liu Hsiao-Fen"    "Chiu Yu-Chin"     "Pan Mei-Hsuan"    "Mary Johnson"    
## [53] "David Harris"     "Chen Wei"         "123456"           "Kao Chin-Feng"   
## [57] "unknown"          "NULL"             "."               
## 
## $Penyakit
##  [1] "Non-Diabetic" "Diabetic"     ""             "Sehat"        "Sakit"       
##  [6] "Tidak"        "Yes"          "No"           "NON-DIABETIC" "Normal"      
## [11] "DIABETIC"     "DM"           "diabetic"     "1"            "non-diabetic"
## [16] "Healthy"     
## 
## $Suhu_Tubuh_Celcius
##  [1] "37.6"         "36.5°C"       "37.5"         "37.0"         "36.0"        
##  [6] "36.8"         "36.3"         "36.4"         "36.9"         "36.6"        
## [11] "37.2celcius"  "37.1"         "36.5"         "36.9 C"       "36.7"        
## [16] "37.4"         "37.2"         "35.7"         "36.2"         ""            
## [21] "37.3"         "36.1"         "36.5 derajat" "37.8"         "42.5"        
## [26] "35.9"         "36.9°C"       "37.7"         "35.5"         "37.0 derajat"
## [31] "35.6"         "99.9"         "36.1 derajat" "36.7 derajat" "38.0"        
## [36] "-1.0"         "35.8"         "37.4°C"       "37.1celcius"  "37.2°C"      
## [41] "36.6 derajat"
# Summarise the numeric columns to reveal extreme values (outliers)
data_rs %>%
  select(where(is.numeric)) %>%
  summary()
##  Skin_Stiffness_N_per_mm Microcirculation_PU Peak_Plantar_Pressure_kPa
##  Min.   : -2.180         Min.   : -32.50     Min.   : -100.0          
##  1st Qu.:  0.700         1st Qu.:  18.00     1st Qu.:  268.6          
##  Median :  1.100         Median :  27.70     Median :  384.3          
##  Mean   :  1.342         Mean   :  35.58     Mean   :  991.9          
##  3rd Qu.:  1.595         3rd Qu.:  39.00     3rd Qu.:  508.5          
##  Max.   :150.000         Max.   :5000.00     Max.   :99999.0          
##  NA's   :37              NA's   :50          NA's   :43

- Missing Value

# Count the NA entries per column before cleaning.
# Only true NA values are counted; placeholder strings such as "" or "N/A"
# in the character columns are not.
data_rs %>%
  is.na() %>%
  colSums()
##                      Nama             Tanggal_Lahir                     Tensi 
##                         0                         0                         0 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                        37                        50                         0 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                        43

- Duplikasi Data

# Report how many rows are exact copies of an earlier row
data_rs %>%
  duplicated() %>%
  sum() %>%
  cat("Jumlah baris yang duplikat identik:", .)
## Jumlah baris yang duplikat identik: 2
# Show every row involved in an exact duplication (first occurrences and copies)
data_rs[which(
  duplicated(data_rs) | duplicated(data_rs, fromLast = TRUE)
), , drop = FALSE]
##              Nama Tanggal_Lahir  Tensi Skin_Stiffness_N_per_mm
## 108 Hsieh Shu-Hui    15/01/1967  96/73                    1.43
## 277   Betty Lewis    23/05/1988 121/96                    1.66
## 342   Betty Lewis    23/05/1988 121/96                    1.66
## 556 Hsieh Shu-Hui    15/01/1967  96/73                    1.43
##     Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 108                36.9               37.0 Diabetic                     393.2
## 277                14.6               36.6 Diabetic                     475.8
## 342                14.6               36.6 Diabetic                     475.8
## 556                36.9               37.0 Diabetic                     393.2
# Drop rows that are exact duplicates of another row
data_rs <- distinct(data_rs)

PROSES CLEANING

- Kolom Nama

# Replace missing or placeholder names with "Unknown".
# The comparison runs on a trimmed, lower-cased copy of the name, so spelling
# variants of the placeholders ("unknown", "UNKNOWN", " N/A ", "Null", ...)
# are all caught. Genuine names are left exactly as recorded.
data_rs <- data_rs %>%
  mutate(
    Nama = if_else(
      is.na(Nama) |
        str_to_lower(str_trim(Nama)) %in%
          c("???", "n/a", "123456", "pasien", "unknown", "null", ".", ""),
      "Unknown",
      Nama
    )
  )

- Kolom Tanggal_Lahir & Umur

# Reference year for the age calculation. It is defined once so that the age
# and the back-filled birth year below always use the same value; update it
# here when the analysis is re-run for a different year.
tahun_acuan <- 2026

data_rs <- data_rs %>%
  mutate(
    # Birth year = first run of four digits found in Tanggal_Lahir
    Tahun_Extracted = as.numeric(str_extract(Tanggal_Lahir, "\\d{4}")),
    # Age from the birth year only (day and month are not used)
    Umur = tahun_acuan - Tahun_Extracted
  ) %>%
  mutate(
    # Missing ages take the median age, truncated to a whole number
    Umur = as.integer(coalesce(Umur, median(Umur, na.rm = TRUE))),
    # Rewrite empty or year-only birth dates as 01/01/<year>
    Tanggal_Lahir = case_when(
      is.na(Tanggal_Lahir) | Tanggal_Lahir == "" ~
        paste0("01/01/", tahun_acuan - Umur),
      str_detect(Tanggal_Lahir, "^\\d{4}$") ~
        paste0("01/01/", Tanggal_Lahir),
      TRUE ~ Tanggal_Lahir
    )
  ) %>%
  # Drop the helper column and place Umur right after Tanggal_Lahir
  select(-Tahun_Extracted) %>%
  relocate(Umur, .after = Tanggal_Lahir)

- Kolom Tensi

# Split Tensi into Sistolik (first number) and Diastolik (second number, when
# present), converting both to numbers; the original Tensi column is kept.
data_rs <- extract(
  data_rs,
  col = Tensi,
  into = c("Sistolik", "Diastolik"),
  regex = "(\\d+)(?:\\D+(\\d+))?",
  remove = FALSE,
  convert = TRUE
)

# Medians used to fill missing readings, truncated to whole numbers
median_sistolik <- data_rs %>%
  pull(Sistolik) %>%
  median(na.rm = TRUE) %>%
  as.integer()
median_diastolik <- data_rs %>%
  pull(Diastolik) %>%
  median(na.rm = TRUE) %>%
  as.integer()

data_rs <- data_rs %>%
  mutate(
    # Fill missing systolic / diastolic readings with their medians
    Sistolik = replace(Sistolik, is.na(Sistolik), median_sistolik),
    Diastolik = replace(Diastolik, is.na(Diastolik), median_diastolik),
    # Rebuild Tensi so empty or systolic-only entries become "sys/dia"
    Tensi = paste0(Sistolik, "/", Diastolik)
  ) %>%
  # Place Sistolik and Diastolik right after Tensi
  relocate(Sistolik, Diastolik, .after = Tensi)

# Check the resulting blood-pressure ranges
data_rs %>%
  select(Sistolik, Diastolik) %>%
  summary()
##     Sistolik       Diastolik     
##  Min.   : 78.0   Min.   : 46.00  
##  1st Qu.:113.0   1st Qu.: 72.00  
##  Median :121.0   Median : 78.00  
##  Mean   :121.3   Mean   : 78.25  
##  3rd Qu.:131.0   3rd Qu.: 84.75  
##  Max.   :164.0   Max.   :113.00

- Kolom Skin_Stiffness_N_per_mm

# Median of the plausible readings only (strictly between 0 and 10)
median_skin <- median(
  data_rs$Skin_Stiffness_N_per_mm[
    data_rs$Skin_Stiffness_N_per_mm > 0 &
      data_rs$Skin_Stiffness_N_per_mm < 10
  ],
  na.rm = TRUE
)

# Replace NA and implausible readings with that median.
# The upper test is `>= 10` so it is the exact complement of the `< 10` bound
# used for the median: a value excluded from the median is also replaced.
data_rs <- data_rs %>%
  mutate(
    Skin_Stiffness_N_per_mm = ifelse(
      is.na(Skin_Stiffness_N_per_mm) |
        Skin_Stiffness_N_per_mm <= 0 |
        Skin_Stiffness_N_per_mm >= 10,
      median_skin,
      Skin_Stiffness_N_per_mm
    )
  )
summary(data_rs$Skin_Stiffness_N_per_mm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.720   1.100   1.136   1.560   2.900

- Kolom Microcirculation_PU

# Median of the plausible readings only (greater than 0, at most 100)
median_micro <- data_rs %>%
  filter(Microcirculation_PU > 0, Microcirculation_PU <= 100) %>%
  pull(Microcirculation_PU) %>%
  median(na.rm = TRUE)

# Keep plausible readings; NA and out-of-range values take the median
data_rs <- data_rs %>%
  mutate(
    Microcirculation_PU = ifelse(
      !is.na(Microcirculation_PU) &
        Microcirculation_PU > 0 &
        Microcirculation_PU <= 100,
      Microcirculation_PU,
      median_micro
    )
  )
data_rs %>%
  pull(Microcirculation_PU) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.82   28.00   28.46   37.75   77.30

- Kolom Suhu_Tubuh_Celcius

# Convert the temperature text to a number.
# The first signed decimal number in each entry is extracted, so any unit
# suffix ("°C", "celcius", "derajat", " C", ...) is ignored whatever its
# spelling. Entries that contain no number (e.g. "") become NA.
data_rs <- data_rs %>%
  mutate(
    Suhu_Tubuh_Celcius = as.numeric(
      str_extract(Suhu_Tubuh_Celcius, "-?\\d+(?:\\.\\d+)?")
    )
  )

# Median of the readings accepted as plausible (30 to 40, inclusive)
median_suhu <- median(
  data_rs$Suhu_Tubuh_Celcius[
    data_rs$Suhu_Tubuh_Celcius >= 30 & data_rs$Suhu_Tubuh_Celcius <= 40
  ],
  na.rm = TRUE
)

# Replace NA and out-of-range readings with that median
data_rs <- data_rs %>%
  mutate(
    Suhu_Tubuh_Celcius = ifelse(
      is.na(Suhu_Tubuh_Celcius) |
        Suhu_Tubuh_Celcius < 30 |
        Suhu_Tubuh_Celcius > 40,
      median_suhu,
      Suhu_Tubuh_Celcius
    )
  )
summary(data_rs$Suhu_Tubuh_Celcius)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.50   36.50   36.80   36.78   37.00   38.00

- Kolom Penyakit

# Label groups (compared after lower-casing and trimming):
# 1. Diabetic:     sakit, yes, diabetic, dm, 1
# 2. Non-Diabetic: sehat, tidak, no, normal, non-diabetic, healthy
# 3. Missing:      NA, "", "null"  (treated as Non-Diabetic)
label_diabetic <- c("sakit", "yes", "diabetic", "dm", "1")
label_non_diabetic <- c(
  "sehat", "tidak", "no", "normal", "non-diabetic", "healthy"
)
label_kosong <- c("", "null")

# Normalise the raw labels: lower case, no surrounding whitespace
data_rs <- data_rs %>%
  mutate(Penyakit = str_trim(str_to_lower(as.character(Penyakit))))

# Any label outside the three groups is still mapped to "Non-Diabetic" below,
# but it is reported first so that new spellings are not absorbed silently.
label_asing <- unique(data_rs$Penyakit[
  !is.na(data_rs$Penyakit) &
    !(data_rs$Penyakit %in%
      c(label_diabetic, label_non_diabetic, label_kosong))
])
if (length(label_asing) > 0) {
  warning(
    "Unrecognised Penyakit labels mapped to 'Non-Diabetic': ",
    paste(label_asing, collapse = ", "),
    call. = FALSE
  )
}

# Everything that is not a known diabetic label becomes "Non-Diabetic"
# (`%in%` is FALSE for NA, so missing values land there as well)
data_rs <- data_rs %>%
  mutate(
    Penyakit = if_else(
      Penyakit %in% label_diabetic,
      "Diabetic",
      "Non-Diabetic"
    )
  )

# Store as a factor with Non-Diabetic as the first level
data_rs$Penyakit <- factor(
  data_rs$Penyakit,
  levels = c("Non-Diabetic", "Diabetic")
)
summary(data_rs$Penyakit)
## Non-Diabetic     Diabetic 
##          391          307

- Kolom Peak_Plantar_Pressure_kPa

# Median of the plausible readings only (10 to 1000, inclusive)
median_pppk <- with(
  data_rs,
  median(
    Peak_Plantar_Pressure_kPa[between(Peak_Plantar_Pressure_kPa, 10, 1000)],
    na.rm = TRUE
  )
)

# Keep plausible readings; NA and out-of-range values take the median
data_rs <- data_rs %>%
  mutate(
    Peak_Plantar_Pressure_kPa = case_when(
      between(Peak_Plantar_Pressure_kPa, 10, 1000) ~ Peak_Plantar_Pressure_kPa,
      TRUE ~ median_pppk
    )
  )

summary(data_rs[["Peak_Plantar_Pressure_kPa"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.1   283.0   384.3   389.1   500.9   715.4

CEK DATA AKHIR

- Cek ringkasan data

# Summarise every column of the cleaned data
data_rs %>% summary()
##      Nama           Tanggal_Lahir           Umur          Tensi          
##  Length:698         Length:698         Min.   :21.00   Length:698        
##  Class :character   Class :character   1st Qu.:38.00   Class :character  
##  Mode  :character   Mode  :character   Median :53.00   Mode  :character  
##                                        Mean   :53.31                     
##                                        3rd Qu.:69.00                     
##                                        Max.   :86.00                     
##     Sistolik       Diastolik      Skin_Stiffness_N_per_mm Microcirculation_PU
##  Min.   : 78.0   Min.   : 46.00   Min.   :0.100           Min.   : 1.00      
##  1st Qu.:113.0   1st Qu.: 72.00   1st Qu.:0.720           1st Qu.:19.82      
##  Median :121.0   Median : 78.00   Median :1.100           Median :28.00      
##  Mean   :121.3   Mean   : 78.25   Mean   :1.136           Mean   :28.46      
##  3rd Qu.:131.0   3rd Qu.: 84.75   3rd Qu.:1.560           3rd Qu.:37.75      
##  Max.   :164.0   Max.   :113.00   Max.   :2.900           Max.   :77.30      
##  Suhu_Tubuh_Celcius         Penyakit   Peak_Plantar_Pressure_kPa
##  Min.   :35.50      Non-Diabetic:391   Min.   : 52.1            
##  1st Qu.:36.50      Diabetic    :307   1st Qu.:283.0            
##  Median :36.80                         Median :384.3            
##  Mean   :36.78                         Mean   :389.1            
##  3rd Qu.:37.00                         3rd Qu.:500.9            
##  Max.   :38.00                         Max.   :715.4

- Cek Missing value

# Confirm that no NA values remain in any column
data_rs %>%
  is.na() %>%
  colSums()
##                      Nama             Tanggal_Lahir                      Umur 
##                         0                         0                         0 
##                     Tensi                  Sistolik                 Diastolik 
##                         0                         0                         0 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                         0                         0                         0 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                         0