Load Package

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import Data

# Read the raw hospital records from the working directory into a tibble.
# Column types are guessed by read_csv(); the printed spec below lists them.
# NOTE(review): the preview output shows literal "N/A" strings in Nama, so
# those placeholders come through as text rather than as NA.
data <- read_csv("hospital_dataset.csv")
## Rows: 700 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Nama, Tanggal_Lahir, Tensi, Suhu_Tubuh_Celcius, Penyakit
## dbl (3): Skin_Stiffness_N_per_mm, Microcirculation_PU, Peak_Plantar_Pressure...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the imported table.
head(data)
## # A tibble: 6 × 8
##   Nama            Tanggal_Lahir Tensi Skin_Stiffness_N_per…¹ Microcirculation_PU
##   <chr>           <chr>         <chr>                  <dbl>               <dbl>
## 1 Michael Anders… 01/04/1957    112/…                   0.69                42  
## 2 N/A             20/09/1975    140 …                   1.5                 41.9
## 3 Tan Wei Ming    12/04/1965    134/…                   0.76                26.3
## 4 Shen Yi-Ching   11/09/1980    120/…                   1.92                NA  
## 5 Kung Mei-Lin    22/08/1985    99/77                   0.81                25.5
## 6 Ho Chuan-Wei    10/08/1962    149/…                   0.61                42.2
## # ℹ abbreviated name: ¹​Skin_Stiffness_N_per_mm
## # ℹ 3 more variables: Suhu_Tubuh_Celcius <chr>, Penyakit <chr>,
## #   Peak_Plantar_Pressure_kPa <dbl>
# Compact overview: one line per column showing its type and first values.
glimpse(data)
## Rows: 700
## Columns: 8
## $ Nama                      <chr> "Michael Anderson", "N/A", "Tan Wei Ming", "…
## $ Tanggal_Lahir             <chr> "01/04/1957", "20/09/1975", "12/04/1965", "1…
## $ Tensi                     <chr> "112/67", "140 / 91", "134/72", "120/79", "9…
## $ Skin_Stiffness_N_per_mm   <dbl> 0.69, 1.50, 0.76, 1.92, 0.81, 0.61, 1.04, 2.…
## $ Microcirculation_PU       <dbl> 42.0, 41.9, 26.3, NA, 25.5, 42.2, 2.0, 9.5, …
## $ Suhu_Tubuh_Celcius        <chr> "37.6", "36.5°C", "37.5", "37.0", "36.0", "3…
## $ Penyakit                  <chr> "Non-Diabetic", "Non-Diabetic", "Non-Diabeti…
## $ Peak_Plantar_Pressure_kPa <dbl> 294.0, NA, 431.8, 577.5, 502.3, 201.4, 512.8…

CEK MISSING VALUE

# Count the NA entries in each column. Only true NA values are counted;
# placeholder text such as "N/A" is an ordinary string to is.na().
colSums(is.na(data))
##                      Nama             Tanggal_Lahir                     Tensi 
##                        40                        42                        47 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                        37                        50                        49 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                        45                        43

PERBAIKI NAMA KOLOM

# Standardise the column names in one pass: lower-case every name, then
# replace any space with an underscore.
names(data) <- gsub(" ", "_", tolower(names(data)))

# Print the resulting names to confirm the change.
names(data)
## [1] "nama"                      "tanggal_lahir"            
## [3] "tensi"                     "skin_stiffness_n_per_mm"  
## [5] "microcirculation_pu"       "suhu_tubuh_celcius"       
## [7] "penyakit"                  "peak_plantar_pressure_kpa"

Mengatasi MISSING VALUE

# Handle missing values in two steps:
#   1. Text columns: the raw file uses the literal string "N/A" as a
#      placeholder (visible in the name column), which is.na() does not
#      recognise, so turn it into a real NA.
#   2. Numeric columns: fill each NA with that column's median, computed
#      from the non-missing values.
# Text columns are not imputed; their NAs are kept as NA.
data <- data %>%
  mutate(
    across(where(is.character), \(col) na_if(col, "N/A")),
    across(where(is.numeric), \(col) coalesce(col, median(col, na.rm = TRUE)))
  )

Menghapus data duplikat

# Drop rows that are exact duplicates across every column, keeping the
# first occurrence of each.
data <- distinct(data)

# Number of rows left after de-duplication.
nrow(data)
## [1] 698

CEK DATA

# Per-column summary: min/quartiles/mean/max for numeric columns and
# length/class/mode for character columns.
summary(data)
##      nama           tanggal_lahir         tensi          
##  Length:698         Length:698         Length:698        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  skin_stiffness_n_per_mm microcirculation_pu suhu_tubuh_celcius
##  Min.   : -2.180         Min.   : -32.50     Length:698        
##  1st Qu.:  0.710         1st Qu.:  19.32     Class :character  
##  Median :  1.100         Median :  27.70     Mode  :character  
##  Mean   :  1.329         Mean   :  35.04                       
##  3rd Qu.:  1.560         3rd Qu.:  37.88                       
##  Max.   :150.000         Max.   :5000.00                       
##    penyakit         peak_plantar_pressure_kpa
##  Length:698         Min.   : -100.0          
##  Class :character   1st Qu.:  276.6          
##  Mode  :character   Median :  384.3          
##                     Mean   :  956.1          
##                     3rd Qu.:  502.6          
##                     Max.   :99999.0

CEK OUTLIER

# Keep only the numeric columns for the outlier check.
num_data <- data %>%
  select(where(is.numeric))

# Draw one box per numeric column; skip the plot when there are none.
if (ncol(num_data) >= 1L) {
  boxplot(num_data)
}

# Save the processed table as a CSV file in the working directory.
write_csv(x = data, file = "cleaned_hospital_dataset.csv")