R Markdown

Panggil library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

Import data

datars <- read_csv("hospital_dataset.csv")
## Rows: 700 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Nama, Tanggal_Lahir, Tensi, Suhu_Tubuh_Celcius, Penyakit
## dbl (3): Skin_Stiffness_N_per_mm, Microcirculation_PU, Peak_Plantar_Pressure...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Melihat data

head(datars)
## # A tibble: 6 × 8
##   Nama            Tanggal_Lahir Tensi Skin_Stiffness_N_per…¹ Microcirculation_PU
##   <chr>           <chr>         <chr>                  <dbl>               <dbl>
## 1 Michael Anders… 01/04/1957    112/…                   0.69                42  
## 2 N/A             20/09/1975    140 …                   1.5                 41.9
## 3 Tan Wei Ming    12/04/1965    134/…                   0.76                26.3
## 4 Shen Yi-Ching   11/09/1980    120/…                   1.92                NA  
## 5 Kung Mei-Lin    22/08/1985    99/77                   0.81                25.5
## 6 Ho Chuan-Wei    10/08/1962    149/…                   0.61                42.2
## # ℹ abbreviated name: ¹​Skin_Stiffness_N_per_mm
## # ℹ 3 more variables: Suhu_Tubuh_Celcius <chr>, Penyakit <chr>,
## #   Peak_Plantar_Pressure_kPa <dbl>

Struktur data

str(datars)
## spc_tbl_ [700 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Nama                     : chr [1:700] "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
##  $ Tanggal_Lahir            : chr [1:700] "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
##  $ Tensi                    : chr [1:700] "112/67" "140 / 91" "134/72" "120/79" ...
##  $ Skin_Stiffness_N_per_mm  : num [1:700] 0.69 1.5 0.76 1.92 0.81 0.61 1.04 2.24 0.18 NA ...
##  $ Microcirculation_PU      : num [1:700] 42 41.9 26.3 NA 25.5 42.2 2 9.5 24.8 40.9 ...
##  $ Suhu_Tubuh_Celcius       : chr [1:700] "37.6" "36.5°C" "37.5" "37.0" ...
##  $ Penyakit                 : chr [1:700] "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
##  $ Peak_Plantar_Pressure_kPa: num [1:700] 294 NA 432 578 502 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Nama = col_character(),
##   ..   Tanggal_Lahir = col_character(),
##   ..   Tensi = col_character(),
##   ..   Skin_Stiffness_N_per_mm = col_double(),
##   ..   Microcirculation_PU = col_double(),
##   ..   Suhu_Tubuh_Celcius = col_character(),
##   ..   Penyakit = col_character(),
##   ..   Peak_Plantar_Pressure_kPa = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Melihat struktur data dan tipe kolom

glimpse(datars)
## Rows: 700
## Columns: 8
## $ Nama                      <chr> "Michael Anderson", "N/A", "Tan Wei Ming", "…
## $ Tanggal_Lahir             <chr> "01/04/1957", "20/09/1975", "12/04/1965", "1…
## $ Tensi                     <chr> "112/67", "140 / 91", "134/72", "120/79", "9…
## $ Skin_Stiffness_N_per_mm   <dbl> 0.69, 1.50, 0.76, 1.92, 0.81, 0.61, 1.04, 2.…
## $ Microcirculation_PU       <dbl> 42.0, 41.9, 26.3, NA, 25.5, 42.2, 2.0, 9.5, …
## $ Suhu_Tubuh_Celcius        <chr> "37.6", "36.5°C", "37.5", "37.0", "36.0", "3…
## $ Penyakit                  <chr> "Non-Diabetic", "Non-Diabetic", "Non-Diabeti…
## $ Peak_Plantar_Pressure_kPa <dbl> 294.0, NA, 431.8, 577.5, 502.3, 201.4, 512.8…

Ringkasan statistik

summary(datars)
##      Nama           Tanggal_Lahir         Tensi          
##  Length:700         Length:700         Length:700        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
##  Min.   : -2.180         Min.   : -32.50     Length:700        
##  1st Qu.:  0.700         1st Qu.:  18.00     Class :character  
##  Median :  1.100         Median :  27.70     Mode  :character  
##  Mean   :  1.342         Mean   :  35.58                       
##  3rd Qu.:  1.595         3rd Qu.:  39.00                       
##  Max.   :150.000         Max.   :5000.00                       
##  NA's   :37              NA's   :50                            
##    Penyakit         Peak_Plantar_Pressure_kPa
##  Length:700         Min.   : -100.0          
##  Class :character   1st Qu.:  268.6          
##  Mode  :character   Median :  384.3          
##                     Mean   :  991.9          
##                     3rd Qu.:  508.5          
##                     Max.   :99999.0          
##                     NA's   :43

#melihat nama variabel

colnames(datars)
## [1] "Nama"                      "Tanggal_Lahir"            
## [3] "Tensi"                     "Skin_Stiffness_N_per_mm"  
## [5] "Microcirculation_PU"       "Suhu_Tubuh_Celcius"       
## [7] "Penyakit"                  "Peak_Plantar_Pressure_kPa"

#cek data kosong

colSums(is.na(datars))
##                      Nama             Tanggal_Lahir                     Tensi 
##                        40                        42                        47 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                        37                        50                        49 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                        45                        43

#DATA CLEANING

datars_hapus <- na.omit(datars)

#isi dengan median

datars$Skin_Stiffness_N_per_mm[is.na(datars$Skin_Stiffness_N_per_mm)] <- 
median(datars$Skin_Stiffness_N_per_mm, na.rm=TRUE)

datars$Microcirculation_PU[is.na(datars$Microcirculation_PU)] <- 
median(datars$Microcirculation_PU, na.rm=TRUE)

datars$Peak_Plantar_Pressure_kPa[is.na(datars$Peak_Plantar_Pressure_kPa)] <- 
median(datars$Peak_Plantar_Pressure_kPa, na.rm=TRUE)

#Deteksi outlier (Kuartil & IQR)

Q1 <- quantile(datars$Microcirculation_PU, 0.25, na.rm=TRUE)
Q3 <- quantile(datars$Microcirculation_PU, 0.75, na.rm=TRUE)
IQR <- Q3 - Q1

batas_bawah <- Q1 - 1.5*IQR
batas_atas  <- Q3 + 1.5*IQR

#INCONSISTENCY DATA FORMAT ANGKA

#Bersihkan suhu tubuh

datars$Suhu_Tubuh_Celcius <- as.numeric(gsub("[^0-9.]","",datars$Suhu_Tubuh_Celcius))
# Ubah ke numerik dulu agar aman
datars$Suhu_Tubuh_Celcius <- as.numeric(datars$Suhu_Tubuh_Celcius)

# Isi NA dengan nilai tengah (median)
datars$Suhu_Tubuh_Celcius[is.na(datars$Suhu_Tubuh_Celcius)] <- median(datars$Suhu_Tubuh_Celcius, na.rm=TRUE)

#pisahkan tensi

if("Tensi" %in% names(datars)){

datars <- datars %>%
mutate(Tensi = gsub(" ","",Tensi)) %>%
separate(
col = Tensi,
into = c("Sistolik","Diastolik"),
sep = "/",
remove = TRUE
)

}
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 11 rows [78, 137, 440,
## 480, 488, 535, 544, 559, 624, 639, 653].

#MENANGANI INKONSISTENSI TEKS (CASE & WHITESPACE)

# hapus spasi depan belakang
datars$Nama <- trimws(datars$Nama)
datars$Penyakit <- trimws(datars$Penyakit)

# samakan huruf kecil
datars$Penyakit <- tolower(datars$Penyakit)

# ubah ke format rapi
datars$Penyakit <- str_to_title(datars$Penyakit)
colSums(is.na(datars))
##                      Nama             Tanggal_Lahir                  Sistolik 
##                        40                        42                        47 
##                 Diastolik   Skin_Stiffness_N_per_mm       Microcirculation_PU 
##                        58                         0                         0 
##        Suhu_Tubuh_Celcius                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                        45                         0

#Isi kolom numerik dengan median

datars$Sistolik <- as.numeric(datars$Sistolik)
## Warning: NAs introduced by coercion
datars$Diastolik <- as.numeric(datars$Diastolik)
## Warning: NAs introduced by coercion
datars$Sistolik[is.na(datars$Sistolik)] <- median(datars$Sistolik, na.rm=TRUE)
datars$Diastolik[is.na(datars$Diastolik)] <- median(datars$Diastolik, na.rm=TRUE)

#isi kolom teks dengan modus

ModeFunc <- function(x){
names(sort(table(x), decreasing=TRUE))[1]
}

datars$Nama[is.na(datars$Nama)] <- ModeFunc(datars$Nama)

datars$Penyakit[is.na(datars$Penyakit)] <- ModeFunc(datars$Penyakit)

#isi tanggal lahir

datars$Tanggal_Lahir[is.na(datars$Tanggal_Lahir)] <- 
ModeFunc(datars$Tanggal_Lahir)
colSums(is.na(datars))
##                      Nama             Tanggal_Lahir                  Sistolik 
##                         0                         0                         0 
##                 Diastolik   Skin_Stiffness_N_per_mm       Microcirculation_PU 
##                         0                         0                         0 
##        Suhu_Tubuh_Celcius                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                         0                         0
summary(datars)
##      Nama           Tanggal_Lahir         Sistolik       Diastolik     
##  Length:700         Length:700         Min.   : 78.0   Min.   : 46.00  
##  Class :character   Class :character   1st Qu.:113.0   1st Qu.: 72.00  
##  Mode  :character   Mode  :character   Median :121.0   Median : 78.00  
##                                        Mean   :121.3   Mean   : 78.39  
##                                        3rd Qu.:130.0   3rd Qu.: 84.25  
##                                        Max.   :164.0   Max.   :113.00  
##  Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
##  Min.   : -2.180         Min.   : -32.50     Min.   : 1.00     
##  1st Qu.:  0.710         1st Qu.:  19.30     1st Qu.:36.50     
##  Median :  1.100         Median :  27.70     Median :36.80     
##  Mean   :  1.330         Mean   :  35.01     Mean   :36.83     
##  3rd Qu.:  1.562         3rd Qu.:  37.83     3rd Qu.:37.00     
##  Max.   :150.000         Max.   :5000.00     Max.   :99.90     
##    Penyakit         Peak_Plantar_Pressure_kPa
##  Length:700         Min.   : -100.0          
##  Class :character   1st Qu.:  276.8          
##  Mode  :character   Median :  384.3          
##                     Mean   :  954.6          
##                     3rd Qu.:  502.4          
##                     Max.   :99999.0
# ubah nilai tidak logis jadi NA

datars$Skin_Stiffness_N_per_mm[
datars$Skin_Stiffness_N_per_mm < 0 |
datars$Skin_Stiffness_N_per_mm > 10] <- NA

datars$Microcirculation_PU[
datars$Microcirculation_PU < 0 |
datars$Microcirculation_PU > 100] <- NA

datars$Suhu_Tubuh_Celcius[
datars$Suhu_Tubuh_Celcius < 30 |
datars$Suhu_Tubuh_Celcius > 45] <- NA

datars$Peak_Plantar_Pressure_kPa[
datars$Peak_Plantar_Pressure_kPa < 0 |
datars$Peak_Plantar_Pressure_kPa > 1000] <- NA
datars$Sistolik[datars$Sistolik > 250]
## numeric(0)
datars$Sistolik[
datars$Sistolik < 70 |
datars$Sistolik > 250
] <- NA
datars$Sistolik[is.na(datars$Sistolik)] <- 
median(datars$Sistolik, na.rm=TRUE)

#isi lagi dengan median

num <- c("Skin_Stiffness_N_per_mm",
"Microcirculation_PU",
"Suhu_Tubuh_Celcius",
"Peak_Plantar_Pressure_kPa")

for(i in num){
datars[[i]][is.na(datars[[i]])] <- median(datars[[i]], na.rm=TRUE)
}
datars$Penyakit <- as.factor(datars$Penyakit)
summary(datars)
##      Nama           Tanggal_Lahir         Sistolik       Diastolik     
##  Length:700         Length:700         Min.   : 78.0   Min.   : 46.00  
##  Class :character   Class :character   1st Qu.:113.0   1st Qu.: 72.00  
##  Mode  :character   Mode  :character   Median :121.0   Median : 78.00  
##                                        Mean   :121.3   Mean   : 78.39  
##                                        3rd Qu.:130.0   3rd Qu.: 84.25  
##                                        Max.   :164.0   Max.   :113.00  
##                                                                        
##  Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
##  Min.   :0.100           Min.   : 1.00       Min.   :35.5      
##  1st Qu.:0.720           1st Qu.:19.75       1st Qu.:36.5      
##  Median :1.100           Median :27.70       Median :36.8      
##  Mean   :1.137           Mean   :28.43       Mean   :36.8      
##  3rd Qu.:1.560           3rd Qu.:37.65       3rd Qu.:37.0      
##  Max.   :2.900           Max.   :77.30       Max.   :42.5      
##                                                                
##          Penyakit   Peak_Plantar_Pressure_kPa
##  Non-Diabetic:382   Min.   :  0.001          
##  Diabetic    :299   1st Qu.:282.650          
##  Dm          :  5   Median :384.300          
##  No          :  3   Mean   :388.101          
##  Tidak       :  3   3rd Qu.:499.850          
##  1           :  2   Max.   :715.400          
##  (Other)     :  6
# Bar chart penyakit
ggplot(datars, aes(Penyakit)) +
geom_bar(fill="steelblue")

# Histogram suhu tubuh
ggplot(datars, aes(Suhu_Tubuh_Celcius)) +
geom_histogram(fill="orange", bins=20)

# Boxplot tekanan kaki
ggplot(datars, aes(y=Peak_Plantar_Pressure_kPa)) +
geom_boxplot(fill="green")