# Data cleaning ----
# Load the raw hospital records from CSV and preview the first six rows.
Hospital <- read.csv(file = "hospital_dataset.csv")
head(Hospital, n = 6L)
## Nama Tanggal_Lahir Tensi Skin_Stiffness_N_per_mm
## 1 Michael Anderson 01/04/1957 112/67 0.69
## 2 N/A 20/09/1975 140 / 91 1.50
## 3 Tan Wei Ming 12/04/1965 134/72 0.76
## 4 Shen Yi-Ching 11/09/1980 120/79 1.92
## 5 Kung Mei-Lin 22/08/1985 99/77 0.81
## 6 Ho Chuan-Wei 10/08/1962 149/65 0.61
## Microcirculation_PU Suhu_Tubuh_Celcius Penyakit Peak_Plantar_Pressure_kPa
## 1 42.0 37.6 Non-Diabetic 294.0
## 2 41.9 36.5°C Non-Diabetic NA
## 3 26.3 37.5 Non-Diabetic 431.8
## 4 NA 37.0 Diabetic 577.5
## 5 25.5 36.0 Diabetic 502.3
## 6 42.2 36.8 Non-Diabetic 201.4
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
# Missing values ----
# Count the NA entries in every column.
colSums(is.na(Hospital))
## Nama Tanggal_Lahir Tensi
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 37 50 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 43
# Replace the NA entries of the three measurement columns that contain
# missing values with the mean of the observed values in the same column.
mean_fill_cols <- c(
  "Skin_Stiffness_N_per_mm",
  "Microcirculation_PU",
  "Peak_Plantar_Pressure_kPa"
)
Hospital[mean_fill_cols] <- lapply(Hospital[mean_fill_cols], function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
})
# Check: every column should now report zero NA entries.
colSums(is.na(Hospital))
## Nama Tanggal_Lahir Tensi
## 0 0 0
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## 0 0 0
## Penyakit Peak_Plantar_Pressure_kPa
## 0 0
# Remove duplicates: keep a single copy of every fully identical row.
Hospital <- Hospital %>%
  distinct()
# Outliers ----
# Winsorise each continuous measurement with Tukey's fences: values below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are pulled back to the nearest
# fence, so extreme readings are capped rather than dropped.

#' Cap the values of a numeric vector at its IQR fences
#'
#' @param x A numeric vector.
#' @param k Multiplier applied to the IQR to position the fences
#'   (1.5 by default).
#' @return `x` with every value outside `[Q1 - k * IQR, Q3 + k * IQR]`
#'   replaced by the nearest fence. `NA`s are ignored when the quartiles
#'   are computed and stay `NA` in the result.
cap_outliers_iqr <- function(x, k = 1.5) {
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Same quantity as IQR(x, na.rm = TRUE): Q3 - Q1 with the default type.
  spread <- quartiles[2] - quartiles[1]
  pmin(pmax(x, quartiles[1] - k * spread), quartiles[2] + k * spread)
}

# Each column is listed, and therefore capped, exactly once.
outlier_cols <- c(
  "Skin_Stiffness_N_per_mm",
  "Microcirculation_PU",
  "Peak_Plantar_Pressure_kPa"
)
Hospital[outlier_cols] <- lapply(Hospital[outlier_cols], cap_outliers_iqr)
# Inconsistent data ----
# Inspect the column types and a few sample values, then the per-column
# summary statistics, to spot fields that need normalising.
str(object = Hospital)
## 'data.frame': 698 obs. of 8 variables:
## $ Nama : chr "Michael Anderson" "N/A" "Tan Wei Ming" "Shen Yi-Ching" ...
## $ Tanggal_Lahir : chr "01/04/1957" "20/09/1975" "12/04/1965" "11/09/1980" ...
## $ Tensi : chr "112/67" "140 / 91" "134/72" "120/79" ...
## $ Skin_Stiffness_N_per_mm : num 0.69 1.5 0.76 1.92 0.81 ...
## $ Microcirculation_PU : num 42 41.9 26.3 35.6 25.5 ...
## $ Suhu_Tubuh_Celcius : chr "37.6" "36.5°C" "37.5" "37.0" ...
## $ Penyakit : chr "Non-Diabetic" "Non-Diabetic" "Non-Diabetic" "Diabetic" ...
## $ Peak_Plantar_Pressure_kPa: num 294 903 432 578 502 ...
summary(object = Hospital)
## Nama Tanggal_Lahir Tensi
## Length:698 Length:698 Length:698
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Skin_Stiffness_N_per_mm Microcirculation_PU Suhu_Tubuh_Celcius
## Min. :-0.565 Min. :-8.50 Length:698
## 1st Qu.: 0.710 1st Qu.:19.32 Class :character
## Median : 1.145 Median :29.00 Mode :character
## Mean : 1.137 Mean :28.64
## 3rd Qu.: 1.560 3rd Qu.:37.88
## Max. : 2.835 Max. :65.70
## Penyakit Peak_Plantar_Pressure_kPa
## Length:698 Min. :-99.11
## Class :character 1st Qu.:276.60
## Mode :character Median :396.85
## Mean :420.11
## 3rd Qu.:527.08
## Max. :902.79
# Negative readings are treated as invalid for these three measurements:
# blank them out as NA so they can be imputed afterwards.
non_negative_cols <- c(
  "Skin_Stiffness_N_per_mm",
  "Microcirculation_PU",
  "Peak_Plantar_Pressure_kPa"
)
Hospital[non_negative_cols] <- lapply(
  Hospital[non_negative_cols],
  function(x) replace(x, x < 0, NA)
)
# Body temperature arrives as text (e.g. "37.6", "36.5°C"). Normalise the
# text and convert it to numeric BEFORE validating the range: comparing a
# character column with `<` / `>` compares strings, not numbers, so values
# such as "300" or "4.5" would slip through a 30-45 check.
temp_text <- gsub(",", ".", Hospital$Suhu_Tubuh_Celcius, fixed = TRUE)
# Drop unit suffixes, spaces and any other non-numeric characters so that
# readings like "36.5°C" are kept instead of being coerced to NA.
temp_text <- gsub("[^0-9.-]", "", temp_text)
temp_value <- as.numeric(temp_text)
# Readings outside 30-45 are treated as invalid and blanked out; entries
# that are already NA stay NA.
temp_value[!is.na(temp_value) & (temp_value < 30 | temp_value > 45)] <- NA
Hospital$Suhu_Tubuh_Celcius <- temp_value
# Fill the remaining gaps: in every numeric column, replace NA entries
# (such as the values invalidated by the checks above) with the median of
# that column's observed values.
# vapply() guarantees a logical mask whatever the shape of the data frame,
# whereas sapply() can return a list for empty input.
num_cols <- vapply(Hospital, is.numeric, logical(1))
Hospital[num_cols] <- lapply(Hospital[num_cols], function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
})