## QUESTION 2 ----
# Required libraries. These must be separate statements: text that follows a
# `#` on the same line is a comment and is never executed.
library(dplyr)
library(readr)

# Read the semicolon-delimited input file.
data <- read_delim("dirty_v3_path.csv", delim = ";")

# Structure, first rows and column names.
glimpse(data)
head(data)
names(data)

# Missing values per column and number of duplicated rows.
colSums(is.na(data))
sum(duplicated(data))

# Class of every column, table dimensions and per-column summary.
sapply(data, class)
dim(data)
summary(data)
## QUESTION 3 ----
# Libraries are attached as real statements (not as part of the header
# comment, where they would be ignored).
library(dplyr)
library(stringr)

glimpse(data)

# Start the cleaned copy by removing the two columns excluded from the
# analysis.
data_clean <- data %>%
  select(-random_notes, -noise_col)
#' Parse the first number found in each element of a vector
#'
#' Replaces decimal commas with dots, extracts the first run of digits
#' (optionally followed by a dot and more digits) and converts it to numeric.
#' Implemented with base R only so it works without any package attached.
#'
#' @param x A vector (character, numeric or factor); it is coerced to
#'   character before parsing.
#' @return A numeric vector the same length as `x`. Elements that are `NA` or
#'   contain no digits become `NA`.
fix_number <- function(x) {
  # Replace decimal commas with dots, e.g. "12,5" -> "12.5".
  x <- gsub(",", ".", as.character(x), fixed = TRUE)

  # Keep only the first number in each element. The dot is escaped inside the
  # R string ("\\.") so the regex engine sees a literal dot.
  hit <- regexpr("[0-9]+\\.?[0-9]*", x)
  found <- !is.na(hit) & hit > -1L

  out <- rep(NA_character_, length(x))
  out[found] <- regmatches(x, hit) # regmatches() returns the matches only
  as.numeric(out)
}
# Columns that are passed through fix_number(). Names containing spaces are
# given as strings and selected with all_of(), which avoids the backtick
# quoting that bare non-syntactic names would need inside mutate().
numeric_text_cols <- c(
  "Age", "Glucose", "Blood Pressure", "BMI", "Oxygen Saturation",
  "Cholesterol", "Triglycerides", "HbA1c", "Physical Activity",
  "Diet Score", "Stress Level", "Sleep Hours"
)

# Convert each listed column to numeric in place.
data_clean <- data_clean %>%
  mutate(across(all_of(numeric_text_cols), fix_number))
# Fill missing values:
#   * Age, Glucose, Blood Pressure -> the column median (computed ignoring NA)
#   * Gender, Medical Condition    -> the literal label "Unknown"
# Column names with spaces are passed as strings through all_of().
data_clean <- data_clean %>%
  mutate(
    across(
      all_of(c("Age", "Glucose", "Blood Pressure")),
      function(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
    ),
    across(
      all_of(c("Gender", "Medical Condition")),
      function(col) ifelse(is.na(col), "Unknown", col)
    )
  )

# Check the resulting structure and the remaining missing values per column.
str(data_clean)
colSums(is.na(data_clean))
## QUESTION 4 ----
# Libraries (separate statements so they are actually executed).
library(dplyr)
library(stringr)
library(ggplot2)

# Named logical vector: TRUE for every numeric column of data_clean.
numeric_cols <- vapply(data_clean, is.numeric, logical(1))

# Standardise (centre and scale) each numeric column. scale() returns a
# one-column matrix with attributes, so the result is flattened with
# as.numeric() to keep plain numeric columns in the data frame.
data_transformed <- data_clean %>%
  mutate(
    across(
      all_of(names(numeric_cols)[numeric_cols]),
      function(col) as.numeric(scale(col))
    )
  )

# Label rows by whether Glucose is above its median: a text label and the
# matching 0/1 indicator.
data_transformed <- data_transformed %>%
  mutate(
    RiskLevel = ifelse(Glucose > median(Glucose, na.rm = TRUE), "High", "Low"),
    RiskLevel_num = ifelse(Glucose > median(Glucose, na.rm = TRUE), 1, 0)
  )
# Two histograms side by side: Glucose parsed from the raw data versus
# Glucose in the cleaned data. par() returns the previous settings, which are
# restored afterwards so later plots are not forced into the 1x2 layout.
old_par <- par(mfrow = c(1, 2))
hist(
  fix_number(data$Glucose),
  main = "Distribusi Glucose (Sebelum Cleaning)",
  col = "tomato",
  xlab = "Glucose"
)
hist(
  data_clean$Glucose,
  main = "Distribusi Glucose (Sesudah Cleaning)",
  col = "skyblue",
  xlab = "Glucose"
)
par(old_par)

# Numeric summaries of the same two vectors.
summary(fix_number(data$Glucose))
summary(data_clean$Glucose)
# Baseline data set built from the raw table: only Glucose, Blood Pressure
# and BMI are parsed to numeric (no imputation, no scaling), then the 0/1
# target is derived from Glucose relative to its median.
# `Blood Pressure` needs backticks because the name contains a space.
data_raw_numeric <- data %>%
  mutate(
    Glucose = fix_number(Glucose),
    `Blood Pressure` = fix_number(`Blood Pressure`),
    BMI = fix_number(BMI)
  ) %>%
  select(Glucose, `Blood Pressure`, BMI) %>%
  mutate(
    RiskLevel_num = ifelse(Glucose > median(Glucose, na.rm = TRUE), 1, 0)
  )
# --- Model on the raw (un-imputed, un-scaled) data -------------------------
# 80/20 train/test split with a fixed seed. seq_len() is safe for an empty
# table and floor() makes the integer sample size explicit.
set.seed(123)
trainIndex_raw <- sample(
  seq_len(nrow(data_raw_numeric)),
  floor(0.8 * nrow(data_raw_numeric))
)
trainRaw <- data_raw_numeric[trainIndex_raw, ]
testRaw <- data_raw_numeric[-trainIndex_raw, ]

# Logistic regression of the 0/1 target on all remaining columns.
model_raw <- glm(RiskLevel_num ~ ., data = trainRaw, family = "binomial")

# Predicted probabilities on the held-out rows, thresholded at 0.5.
pred_raw <- predict(model_raw, testRaw, type = "response")
pred_raw_class <- ifelse(pred_raw > 0.5, 1, 0)

# The raw data still contains missing values, so some predictions/targets are
# NA; na.rm = TRUE scores the rows that can be evaluated instead of returning
# NA for the whole accuracy.
acc_raw <- mean(pred_raw_class == testRaw$RiskLevel_num, na.rm = TRUE)
# --- Model on the cleaned and scaled data ----------------------------------
# Same seed and 80/20 split as the raw model. seq_len() is safe for an empty
# table and floor() makes the integer sample size explicit.
set.seed(123)
trainIndex_clean <- sample(
  seq_len(nrow(data_transformed)),
  floor(0.8 * nrow(data_transformed))
)
trainData <- data_transformed[trainIndex_clean, ]
testData <- data_transformed[-trainIndex_clean, ]

# Logistic regression of the 0/1 target on the other columns. RiskLevel is
# excluded because it is the text form of the same target ("High"/"Low") and
# would hand the answer to the model.
# NOTE(review): Glucose itself still determines the target (it is the
# median-split variable) - confirm whether it should stay as a predictor.
model_clean <- glm(
  RiskLevel_num ~ . - RiskLevel,
  data = trainData,
  family = "binomial"
)

# Predicted probabilities on the held-out rows, thresholded at 0.5.
pred_clean <- predict(model_clean, testData, type = "response")
pred_clean_class <- ifelse(pred_clean > 0.5, 1, 0)

# Columns that were not imputed can still hold NA, which yields NA
# predictions; na.rm = TRUE keeps the accuracy defined.
acc_clean <- mean(pred_clean_class == testData$RiskLevel_num, na.rm = TRUE)
# Print the accuracy comparison. cat() does not append a newline, so each
# line ends with an explicit "\n"; the rule lines are built with strrep()
# from plain ASCII characters.
cat(strrep("=", 44), "\n", sep = "")
cat("Perbandingan Akurasi Model:\n")
cat(strrep("-", 44), "\n", sep = "")
cat("model SEBELUM preprocessing :", round(acc_raw, 3), "\n")
cat("model SESUDAH preprocessing :", round(acc_clean, 3), "\n")
cat(strrep("=", 44), "\n", sep = "")