#DATA CLEANING

# Load the raw hospital records from the CSV file in the working directory
# and preview the first six rows.
Hospital <- read.csv(file = "hospital_dataset.csv")
head(Hospital, n = 6L)
##               Nama Tanggal_Lahir    Tensi Skin_Stiffness_N_per_mm
## 1 Michael Anderson    01/04/1957   112/67                    0.69
## 2              N/A    20/09/1975 140 / 91                    1.50
## 3     Tan Wei Ming    12/04/1965   134/72                    0.76
## 4    Shen Yi-Ching    11/09/1980   120/79                    1.92
## 5     Kung Mei-Lin    22/08/1985    99/77                    0.81
## 6     Ho Chuan-Wei    10/08/1962   149/65                    0.61
##   Microcirculation_PU Suhu_Tubuh_Celcius     Penyakit Peak_Plantar_Pressure_kPa
## 1                42.0               37.6 Non-Diabetic                     294.0
## 2                41.9             36.5°C Non-Diabetic                        NA
## 3                26.3               37.5 Non-Diabetic                     431.8
## 4                  NA               37.0     Diabetic                     577.5
## 5                25.5               36.0     Diabetic                     502.3
## 6                42.2               36.8 Non-Diabetic                     201.4
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr)

# Missing values: count the NA entries in every column.
# Only real NA values are counted; text placeholders such as "N/A" are not.

vapply(Hospital, function(column) sum(is.na(column)), numeric(1))
##                      Nama             Tanggal_Lahir                     Tensi 
##                         0                         0                         0 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                        37                        50                         0 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                        43

# Replace NA with the column mean in Skin_Stiffness_N_per_mm,
# Microcirculation_PU and Peak_Plantar_Pressure_kPa.
#
# The mean of each column is computed from its non-missing values only
# (na.rm = TRUE). local() keeps the helper variables out of the global
# environment, so only `Hospital` is modified.
# NOTE(review): the mean is taken before outliers and negative readings are
# handled further down, so those values still influence the imputed number.
Hospital <- local({
  mean_filled <- c(
    "Skin_Stiffness_N_per_mm",
    "Microcirculation_PU",
    "Peak_Plantar_Pressure_kPa"
  )
  patched <- Hospital
  patched[mean_filled] <- lapply(patched[mean_filled], function(values) {
    replace(values, is.na(values), mean(values, na.rm = TRUE))
  })
  patched
})

# Check: recount the NA entries per column after the mean imputation.

vapply(Hospital, function(column) sum(is.na(column)), numeric(1))
##                      Nama             Tanggal_Lahir                     Tensi 
##                         0                         0                         0 
##   Skin_Stiffness_N_per_mm       Microcirculation_PU        Suhu_Tubuh_Celcius 
##                         0                         0                         0 
##                  Penyakit Peak_Plantar_Pressure_kPa 
##                         0                         0

# Remove duplicates: keep only the first occurrence of each fully identical
# row (all columns are compared).

Hospital <- Hospital %>% distinct()

# Outliers ----
#
# Cap (winsorize) extreme values in the three numeric measurement columns
# using Tukey's fences: values below Q1 - 1.5 * IQR are raised to that lower
# fence and values above Q3 + 1.5 * IQR are lowered to the upper fence.
# Each column is processed exactly once with its own quartiles.

# Cap the values of a numeric vector at its IQR fences.
#
# x: numeric vector; NA entries are ignored when computing the quartiles and
#    stay NA in the result.
# k: fence multiplier applied to the inter-quartile range (default 1.5).
#
# Returns a numeric vector of the same length as `x` in which every value
# lies inside [Q1 - k * IQR, Q3 + k * IQR].
cap_outliers_iqr <- function(x, k = 1.5) {
  # names = FALSE drops the "25%" / "75%" labels so the fences are plain
  # numbers; the difference of the two quartiles is what IQR() computes.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - k * spread
  upper <- quartiles[2] + k * spread

  # pmax() lifts values up to the lower fence, pmin() pulls them down to the
  # upper fence; both are vectorised and propagate NA.
  pmin(pmax(x, lower), upper)
}

outlier_cols <- c(
  "Skin_Stiffness_N_per_mm",
  "Microcirculation_PU",
  "Peak_Plantar_Pressure_kPa"
)

Hospital[outlier_cols] <- lapply(Hospital[outlier_cols], cap_outliers_iqr)

# Inconsistent data

# Print the type and first values of every column to spot columns that are
# stored with the wrong type or in mixed formats.
str(Hospital)
## 'data.frame':    698 obs. of  8 variables:
##  $ Nama                     : chr  "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
##  $ Tanggal_Lahir            : chr  "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
##  $ Tensi                    : chr  "112/67" "140 / 91" "134/72" "120/79" ...
##  $ Skin_Stiffness_N_per_mm  : num  0.69 1.5 0.76 1.92 0.81 ...
##  $ Microcirculation_PU      : num  42 41.9 26.3 35.6 25.5 ...
##  $ Suhu_Tubuh_Celcius       : chr  "37.6" "36.5°C" "37.5" "37.0" ...
##  $ Penyakit                 : chr  "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
##  $ Peak_Plantar_Pressure_kPa: num  294 903 432 578 502 ...
# Print per-column summary statistics (min, quartiles, mean, max for numeric
# columns) to check the value ranges of the measurements.
summary(Hospital)
##      Nama           Tanggal_Lahir         Tensi          
##  Length:698         Length:698         Length:698        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
##  Min.   :-0.565          Min.   :-8.50       Length:698        
##  1st Qu.: 0.710          1st Qu.:19.32       Class :character  
##  Median : 1.145          Median :29.00       Mode  :character  
##  Mean   : 1.137          Mean   :28.64                         
##  3rd Qu.: 1.560          3rd Qu.:37.88                         
##  Max.   : 2.835          Max.   :65.70                         
##    Penyakit         Peak_Plantar_Pressure_kPa
##  Length:698         Min.   :-99.11           
##  Class :character   1st Qu.:276.60           
##  Mode  :character   Median :396.85           
##                     Mean   :420.11           
##                     3rd Qu.:527.08           
##                     Max.   :902.79
# Mark negative readings as missing (NA) in the three measurement columns
# Skin_Stiffness_N_per_mm, Microcirculation_PU and Peak_Plantar_Pressure_kPa.
# local() keeps the loop variables out of the global environment.
Hospital <- local({
  cleaned <- Hospital
  for (measure in c(
    "Skin_Stiffness_N_per_mm",
    "Microcirculation_PU",
    "Peak_Plantar_Pressure_kPa"
  )) {
    is_negative <- cleaned[[measure]] < 0
    cleaned[[measure]][is_negative] <- NA
  }
  cleaned
})
# Clean the body temperature column (Suhu_Tubuh_Celcius).
#
# The column is read as text because some entries carry a unit suffix
# (e.g. "36.5°C") or may use a decimal comma. The text must be converted to
# numbers BEFORE the plausibility check: comparing character values with
# `<` / `>` is a lexicographic string comparison, not a numeric one.
#
# Steps:
#   1. normalise a decimal comma to a decimal point;
#   2. keep only the leading number, dropping any trailing unit text;
#   3. convert to numeric (entries without a leading number become NA and
#      still raise the usual coercion warning);
#   4. set readings outside the 30-45 range to NA.
Hospital$Suhu_Tubuh_Celcius <- local({
  temp_text <- gsub(",", ".", Hospital$Suhu_Tubuh_Celcius, fixed = TRUE)
  temp_text <- sub("^\\s*([-+]?[0-9]+\\.?[0-9]*).*$", "\\1", temp_text)
  temp_num <- as.numeric(temp_text)

  # !is.na() keeps already-missing entries out of the logical index.
  out_of_range <- !is.na(temp_num) & (temp_num < 30 | temp_num > 45)
  temp_num[out_of_range] <- NA
  temp_num
})
## Warning: NAs introduced by coercion
# Fill every remaining NA in the numeric columns with that column's median.
#
# num_cols is a logical mask with one entry per column of Hospital.
# vapply() is used instead of sapply() so the mask is always a logical
# vector, whatever the shape of the input.
# NOTE(review): this step uses the median, whereas the first imputation in
# this script uses the mean - confirm the mix is intended.
num_cols <- vapply(Hospital, is.numeric, logical(1))

Hospital[num_cols] <- lapply(Hospital[num_cols], function(x) {
  # The median is computed from the non-missing values only.
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
})