library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
# Load the raw hospital records; column types are guessed by readr.
data <- read_csv(file = "hospital_dataset.csv")
## Rows: 700 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Nama, Tanggal_Lahir, Tensi, Suhu_Tubuh_Celcius, Penyakit
## dbl (3): Skin_Stiffness_N_per_mm, Microcirculation_PU, Peak_Plantar_Pressure...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column-wise preview: one line per column with its type and first values.
data %>% glimpse()
## Rows: 700
## Columns: 8
## $ Nama                      <chr> "Michael Anderson", "N/A", "Tan Wei Ming", "…
## $ Tanggal_Lahir             <chr> "1/4/1957", "20/09/1975", "12/4/1965", "11/9…
## $ Tensi                     <chr> "112/67", "140 / 91", "134/72", "120/79", "9…
## $ Skin_Stiffness_N_per_mm   <dbl> 0.69, 1.50, 0.76, 1.92, 0.81, 0.61, 1.04, 2.…
## $ Microcirculation_PU       <dbl> 42.0, 41.9, 26.3, NA, 25.5, 42.2, 2.0, 9.5, …
## $ Suhu_Tubuh_Celcius        <chr> "37.6", "36.5°C", "37.5", "37", "36", "36.8"…
## $ Penyakit                  <chr> "Non-Diabetic", "Non-Diabetic", "Non-Diabeti…
## $ Peak_Plantar_Pressure_kPa <dbl> 294.0, NA, 431.8, 577.5, 502.3, 201.4, 512.8…
# Per-column summary statistics of the raw data (numeric ranges, NA counts).
data %>% summary()
##      Nama           Tanggal_Lahir         Tensi          
##  Length:700         Length:700         Length:700        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
##  Min.   : -2.180         Min.   : -32.50     Length:700        
##  1st Qu.:  0.700         1st Qu.:  18.00     Class :character  
##  Median :  1.100         Median :  27.70     Mode  :character  
##  Mean   :  1.342         Mean   :  35.58                       
##  3rd Qu.:  1.595         3rd Qu.:  39.00                       
##  Max.   :150.000         Max.   :5000.00                       
##  NA's   :37              NA's   :50                            
##    Penyakit         Peak_Plantar_Pressure_kPa
##  Length:700         Min.   : -100.0          
##  Class :character   1st Qu.:  268.6          
##  Mode  :character   Median :  384.3          
##                     Mean   :  991.9          
##                     3rd Qu.:  508.5          
##                     Max.   :99999.0          
##                     NA's   :43
# Number of missing values in each raw column.
data %>%
  is.na() %>%
  colSums()
##                      Nama             Tanggal_Lahir                     Tensi 
##                        40                        42                        47 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                        37                        50                        49 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                        45                        43
# Frequency of each patient name, most frequent first.
count(data, Nama, sort = TRUE)
## # A tibble: 59 × 2
##    Nama                n
##    <chr>           <int>
##  1 <NA>               40
##  2 Lu Hsiang-Ling     22
##  3 Ong Lay Kheng      21
##  4 Richard Martin     20
##  5 Tung Li-Fang       20
##  6 Chang Chung-Wei    18
##  7 Hsu Kuo-Chang      18
##  8 Tan Wei Ming       18
##  9 Barbara Taylor     16
## 10 Pan Mei-Hsuan      16
## # ℹ 49 more rows
# Frequency of each raw diagnosis label, to reveal inconsistent spellings.
count(data, Penyakit, sort = TRUE)
## # A tibble: 16 × 2
##    Penyakit         n
##    <chr>        <int>
##  1 Non-Diabetic   334
##  2 Diabetic       294
##  3 <NA>            45
##  4 DM               5
##  5 DIABETIC         3
##  6 No               3
##  7 Tidak            3
##  8 1                2
##  9 Yes              2
## 10 diabetic         2
## 11 non-diabetic     2
## 12 Healthy          1
## 13 NON-DIABETIC     1
## 14 Normal           1
## 15 Sakit            1
## 16 Sehat            1
# List temperature entries that contain anything besides digits and dots
# (unit suffixes, minus signs, ...), most frequent first.
data %>%
  filter(
    !is.na(Suhu_Tubuh_Celcius),
    str_detect(Suhu_Tubuh_Celcius, "[^0-9.]")
  ) %>%
  count(Suhu_Tubuh_Celcius, sort = TRUE)
## # A tibble: 13 × 2
##    Suhu_Tubuh_Celcius     n
##    <chr>              <int>
##  1 36.5°C                 3
##  2 -1                     1
##  3 36.1 derajat           1
##  4 36.5 derajat           1
##  5 36.6 derajat           1
##  6 36.7 derajat           1
##  7 36.9 C                 1
##  8 36.9°C                 1
##  9 37.0 derajat           1
## 10 37.1celcius            1
## 11 37.2celcius            1
## 12 37.2°C                 1
## 13 37.4°C                 1
# List blood-pressure entries that are missing or not in the plain
# "<2-3 digits>/<2-3 digits>" form, most frequent first.
data %>%
  count(Tensi, sort = TRUE) %>%
  filter(
    is.na(Tensi) |
      str_detect(Tensi, "^\\d{2,3}/\\d{2,3}$", negate = TRUE)
  )
## # A tibble: 22 × 2
##    Tensi           n
##    <chr>       <int>
##  1 <NA>           47
##  2 101 / 96        1
##  3 102 / 71        1
##  4 103mmHg/81      1
##  5 107|60          1
##  6 110             1
##  7 114mmHg/67      1
##  8 114|71          1
##  9 122-71          1
## 10 126/63 mmHg     1
## # ℹ 12 more rows
# Build the cleaned table from the raw data:
#   * placeholder names become NA,
#   * diagnosis labels are collapsed to "Diabetic" / "Non-Diabetic",
#   * temperature is converted to a number,
#   * birth date is parsed to a date-time,
#   * blood pressure text is split into numeric Sistolik / Diastolik.
# The helper columns Tensi_bersih and tahun_dua_digit, and the raw Tensi
# column, are dropped at the end.
data_clean <- data %>%
  mutate(
    # Placeholder entries are not real patient names; treat them as missing.
    Nama = if_else(
      str_trim(Nama) %in% c(
        "N/A", "UNKNOWN", "Pasien", ".", "123456", "???", "unknown", "NULL"
      ),
      NA_character_,
      Nama
    ),

    # Map every observed spelling onto the two canonical labels; anything not
    # listed (including NA) is passed through unchanged.
    Penyakit = case_when(
      Penyakit %in% c("DIABETIC", "diabetic", "DM", "Yes", "1", "Sakit") ~
        "Diabetic",
      Penyakit %in% c(
        "No", "Tidak", "non-diabetic", "Healthy", "NON-DIABETIC", "Normal",
        "Sehat"
      ) ~ "Non-Diabetic",
      TRUE ~ Penyakit
    ),

    # Pull out the first (optionally signed) number and ignore unit suffixes
    # such as "°C", "derajat" or "celcius". Deleting every non-digit character
    # would also delete the minus sign and silently turn an invalid reading
    # like "-1" into a positive 1.
    Suhu_Tubuh_Celcius = as.numeric(
      str_extract(Suhu_Tubuh_Celcius, "-?\\d+(?:\\.\\d+)?")
    ),

    # Remember which raw dates end in a two-digit year; this must be computed
    # before the text column is overwritten by the parsed value.
    tahun_dua_digit = coalesce(
      str_detect(Tanggal_Lahir, "(?<!\\d)\\d{2}\\s*$"),
      FALSE
    ),
    Tanggal_Lahir = parse_date_time(Tanggal_Lahir, orders = c("dmy", "y")),
    # A two-digit year can be resolved to 20xx, which puts a birth date in the
    # future. A birth date cannot be later than today, so move those one
    # century back. Dates written with a four-digit year are left as parsed.
    Tanggal_Lahir = if_else(
      tahun_dua_digit & !is.na(Tanggal_Lahir) & Tanggal_Lahir > Sys.time(),
      Tanggal_Lahir - years(100),
      Tanggal_Lahir
    ),

    # Normalise the separators (" Dia:", "|", "-") to "/" and strip everything
    # that is neither a digit nor "/", e.g. "mmHg" and spaces.
    Tensi_bersih = str_replace_all(Tensi, " Dia:", "/"),
    Tensi_bersih = str_replace_all(Tensi_bersih, "[|-]", "/"),
    Tensi_bersih = str_replace_all(Tensi_bersih, "[^0-9/]", ""),

    # Text before the first "/" is systolic, the remainder is diastolic.
    # An entry with a single number yields an NA diastolic value.
    Sistolik = as.numeric(str_split_fixed(Tensi_bersih, "/", 2)[, 1]),
    Diastolik = as.numeric(str_split_fixed(Tensi_bersih, "/", 2)[, 2])
  ) %>%
  select(-Tensi, -Tensi_bersih, -tahun_dua_digit)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Tanggal_Lahir = parse_date_time(Tanggal_Lahir, orders =
##   c("dmy", "y"))`.
## Caused by warning:
## !  3 failed to parse.
# Check that the diagnosis labels collapsed to the expected categories.
count(data_clean, Penyakit, sort = TRUE)
## # A tibble: 3 × 2
##   Penyakit         n
##   <chr>        <int>
## 1 Non-Diabetic   346
## 2 Diabetic       309
## 3 <NA>            45
# Spot-check the first five rows of the blood-pressure split.
head(select(data_clean, Nama, Sistolik, Diastolik), n = 5)
## # A tibble: 5 × 3
##   Nama             Sistolik Diastolik
##   <chr>               <dbl>     <dbl>
## 1 Michael Anderson      112        67
## 2 <NA>                  140        91
## 3 Tan Wei Ming          134        72
## 4 Shen Yi-Ching         120        79
## 5 Kung Mei-Lin           99        77
# Summary statistics of the columns that were converted from text.
summary(
  data_clean[c("Suhu_Tubuh_Celcius", "Sistolik", "Diastolik", "Tanggal_Lahir")]
)
##  Suhu_Tubuh_Celcius    Sistolik       Diastolik     
##  Min.   : 1.00      Min.   : 78.0   Min.   : 46.00  
##  1st Qu.:36.50      1st Qu.:111.0   1st Qu.: 71.00  
##  Median :36.80      Median :121.0   Median : 78.00  
##  Mean   :36.84      Mean   :121.3   Mean   : 78.28  
##  3rd Qu.:37.10      3rd Qu.:131.0   3rd Qu.: 85.00  
##  Max.   :99.90      Max.   :164.0   Max.   :113.00  
##  NA's   :49         NA's   :47      NA's   :49      
##  Tanggal_Lahir                
##  Min.   :1940-02-21 00:00:00  
##  1st Qu.:1956-09-09 00:00:00  
##  Median :1973-10-18 00:00:00  
##  Mean   :1973-09-24 19:38:22  
##  3rd Qu.:1989-10-02 00:00:00  
##  Max.   :2067-02-02 00:00:00  
##  NA's   :45
# Drop rows lacking a diagnosis, birth date or either blood-pressure value,
# then fill the remaining gaps in the numeric measurements with each column's
# median (computed over the rows that survive the drop).
# NOTE(review): the medians are taken before implausible values are filtered
# out further down — confirm that this ordering is intended.
data_clean2 <- data_clean %>%
  drop_na(Penyakit, Tanggal_Lahir, Sistolik, Diastolik) %>%
  mutate(
    across(
      c(
        Suhu_Tubuh_Celcius,
        Skin_Stiffness_N_per_mm,
        Microcirculation_PU,
        Peak_Plantar_Pressure_kPa
      ),
      function(kolom) replace_na(kolom, median(kolom, na.rm = TRUE))
    )
  )

# Missing values remaining per column after dropping and imputing.
data_clean2 %>%
  is.na() %>%
  colSums()
##                      Nama             Tanggal_Lahir   Skin_Stiffness_N_per_mm 
##                        42                         0                         0 
##       Microcirculation_PU        Suhu_Tubuh_Celcius                  Penyakit 
##                         0                         0                         0 
## Peak_Plantar_Pressure_kPa                  Sistolik                 Diastolik 
##                         0                         0                         0
# Report how many rows are exact copies of an earlier row.
cat("Jumlah baris duplikat:", data_clean2 %>% duplicated() %>% sum(), "\n")
## Jumlah baris duplikat: 4
# Keep one copy of each distinct row, then report the remaining row count.
data_clean2 <- distinct(data_clean2)
cat("Baris setelah hapus duplikat:", data_clean2 %>% nrow(), "\n")
## Baris setelah hapus duplikat: 565
# Draw one boxplot per numeric measurement on a 2 x 3 grid to inspect
# outliers before filtering, then switch back to a single-panel layout.
par(mfrow = c(2, 3))

pwalk(
  list(
    kolom = c(
      "Sistolik", "Diastolik", "Suhu_Tubuh_Celcius",
      "Skin_Stiffness_N_per_mm", "Microcirculation_PU",
      "Peak_Plantar_Pressure_kPa"
    ),
    judul = c(
      "Sistolik", "Diastolik", "Suhu Tubuh",
      "Skin Stiffness", "Microcirculation", "Peak Plantar"
    ),
    warna = c("orange", "lightblue", "pink", "lightgreen", "yellow", "grey")
  ),
  function(kolom, judul, warna) {
    boxplot(data_clean2[[kolom]], main = judul, col = warna)
  }
)

par(mfrow = c(1, 1))
# Keep only rows whose measurements all lie strictly inside the accepted
# range for each column; every bound is exclusive.
data_final <- filter(
  data_clean2,
  Suhu_Tubuh_Celcius > 30, Suhu_Tubuh_Celcius < 45,
  Sistolik > 40, Sistolik < 250,
  Diastolik > 30, Diastolik < 150,
  Skin_Stiffness_N_per_mm > 0, Skin_Stiffness_N_per_mm < 100,
  Microcirculation_PU > 0, Microcirculation_PU < 1000,
  Peak_Plantar_Pressure_kPa > 0, Peak_Plantar_Pressure_kPa < 10000
)

# Redraw the six boxplots on the filtered data to confirm that the extreme
# values are gone.
par(mfrow = c(2, 3))
boxplot(data_final$Sistolik, main = "Sistolik", col = "orange")
boxplot(data_final$Diastolik, main = "Diastolik", col = "lightblue")
boxplot(data_final$Suhu_Tubuh_Celcius, main = "Suhu Tubuh", col = "pink")
boxplot(
  data_final$Skin_Stiffness_N_per_mm,
  main = "Skin Stiffness", col = "lightgreen"
)
boxplot(
  data_final$Microcirculation_PU,
  main = "Microcirculation", col = "yellow"
)
boxplot(
  data_final$Peak_Plantar_Pressure_kPa,
  main = "Peak Plantar", col = "grey"
)

# Return to a single-panel layout so later plots are not squeezed into the
# 2 x 3 grid, matching what is done after the pre-filter boxplots.
par(mfrow = c(1, 1))