## QUESTION 2 ----
# Required libraries
library(dplyr)
library(readr)

# Re-read the dataset, using ";" as the field delimiter
data <- read_delim("dirty_v3_path.csv", delim = ";")

# Inspect the structure of the data
glimpse(data)

# Show the first 5 rows (head() alone would return 6)
head(data, 5)

# List the variable names
names(data)

# 1. Number of missing values per column
colSums(is.na(data))

# 2. Number of duplicated rows
sum(duplicated(data))

# 3. Check for format errors (e.g. numeric values that were read as character).
# Only the first class of each column is reported so the result is always a
# plain named character vector.
vapply(data, function(col) class(col)[1L], character(1))

# Number of rows and columns
dim(data)

# Summary statistics
summary(data)

## QUESTION 3 ----
library(dplyr)
library(stringr)

# Re-check the initial structure
glimpse(data)

# Drop the irrelevant (noise) columns
data_clean <- data %>%
  select(-random_notes, -noise_col)

#' Repair badly formatted numeric text (e.g. "46.00.00" -> 46.00)
#'
#' Replaces every comma with a dot, keeps only the first run of digits
#' (optionally followed by a dot and more digits) and converts it to numeric.
#'
#' @param x A vector of values to clean; non-numeric input is converted with
#'   `as.character()` first.
#' @return A numeric vector the same length as `x`. Elements that are `NA` or
#'   contain no digits become `NA`. Input that is already numeric is returned
#'   as-is (as double).
fix_number <- function(x) {
  # Already-numeric input must not take the round trip through text:
  # as.character(100000) is "1e+05", from which only "1" would be extracted.
  if (is.numeric(x)) {
    return(as.numeric(x))
  }

  # Replace commas with dots
  text <- gsub(",", ".", as.character(x), fixed = TRUE)

  # Take only the first number found in each element
  hit <- regexpr("[0-9]+\\.?[0-9]*", text)
  found <- !is.na(hit) & hit > -1L

  first_number <- rep(NA_character_, length(text))
  # regmatches() returns the matched elements only, in order
  first_number[found] <- regmatches(text, hit)

  as.numeric(first_number)
}

# Apply fix_number() to the columns that should be numeric.
# Several names contain spaces, so they are given as strings through
# all_of() instead of as bare symbols.
data_clean <- data_clean %>%
  mutate(
    across(
      all_of(c(
        "Age", "Glucose", "Blood Pressure", "BMI", "Oxygen Saturation",
        "Cholesterol", "Triglycerides", "HbA1c", "Physical Activity",
        "Diet Score", "Stress Level", "Sleep Hours"
      )),
      fix_number
    )
  )

# Handle missing values.
# Age, Glucose and Blood Pressure are imputed with their own median;
# Gender and Medical Condition get the label "Unknown".
# Other columns are left untouched here.
data_clean <- data_clean %>%
  mutate(
    across(
      all_of(c("Age", "Glucose", "Blood Pressure")),
      function(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
    ),
    across(
      all_of(c("Gender", "Medical Condition")),
      function(col) ifelse(is.na(col), "Unknown", col)
    )
  )

# Confirm that every column now has the expected type
str(data_clean)

# Check again whether any NA values remain
colSums(is.na(data_clean))

## QUESTION 4 ----
# LIBRARIES
library(dplyr)
library(stringr)
library(ggplot2)

# DATA TRANSFORMATION
# Transformation: standardise the numeric variables with scale()
# (defaults: centre each column and scale it).
numeric_cols <- vapply(data_clean, is.numeric, logical(1))

data_transformed <- data_clean
if (any(numeric_cols)) {
  # Scale column by column and strip the matrix attributes scale() adds,
  # so each column stays a plain numeric vector.
  data_transformed[numeric_cols] <- lapply(
    data_clean[numeric_cols],
    function(col) as.numeric(scale(col))
  )
}

# Add a simple categorical target (e.g. disease risk).
# Two versions are created: RiskLevel (text) and RiskLevel_num (numeric 0/1).
# Rows above the Glucose median are "High"/1, the rest "Low"/0; a missing
# Glucose value gives NA in both columns.
data_transformed <- data_transformed %>%
  mutate(
    RiskLevel = ifelse(Glucose > median(Glucose, na.rm = TRUE), "High", "Low"),
    # Derived from RiskLevel so the threshold rule is written only once
    RiskLevel_num = ifelse(RiskLevel == "High", 1, 0)
  )

# DATA EXPLORATION BEFORE AND AFTER PREPROCESSING
# Side-by-side histograms to see how the distribution changed
old_par <- par(mfrow = c(1, 2))

hist(
  fix_number(data$Glucose),
  main = "Distribusi Glucose (Sebelum Cleaning)",
  col = "tomato",
  xlab = "Glucose"
)

hist(
  data_clean$Glucose,
  main = "Distribusi Glucose (Sesudah Cleaning)",
  col = "skyblue",
  xlab = "Glucose"
)

# Restore the previous layout so later plots are not forced into two panels
par(old_par)

# Compare the summary statistics before and after cleaning
summary(fix_number(data$Glucose))
summary(data_clean$Glucose)

# MODEL COMPARISON BEFORE AND AFTER PREPROCESSING

# --- MODEL 1: BEFORE PREPROCESSING ---
# Keep only the numeric columns that can be used.
# Values are parsed with fix_number() but not imputed, so NA values remain.
data_raw_numeric <- data %>%
  mutate(
    Glucose = fix_number(Glucose),
    `Blood Pressure` = fix_number(`Blood Pressure`),
    BMI = fix_number(BMI)
  ) %>%
  select(Glucose, `Blood Pressure`, BMI) %>%
  mutate(
    RiskLevel_num = ifelse(Glucose > median(Glucose, na.rm = TRUE), 1, 0)
  )

# Split the data into 80% training and 20% testing
set.seed(123)
trainIndex_raw <- sample(
  seq_len(nrow(data_raw_numeric)),
  floor(0.8 * nrow(data_raw_numeric))
)
trainRaw <- data_raw_numeric[trainIndex_raw, ]
testRaw <- data_raw_numeric[-trainIndex_raw, ]

# Fit the logistic regression model (raw data)
# NOTE(review): RiskLevel_num is defined from Glucose, which is also a
# predictor here, so the classes are (nearly) perfectly separable; confirm
# this is intended.
model_raw <- glm(RiskLevel_num ~ ., data = trainRaw, family = "binomial")

# Predict and compute the accuracy
pred_raw <- predict(model_raw, testRaw, type = "response")
pred_raw_class <- ifelse(pred_raw > 0.5, 1, 0)
# Test rows with missing predictors yield NA predictions; na.rm = TRUE keeps
# the accuracy from becoming NA (it is computed over the remaining rows).
acc_raw <- mean(pred_raw_class == testRaw$RiskLevel_num, na.rm = TRUE)

# --- MODEL 2: AFTER PREPROCESSING ---
# Use data_transformed, which has been cleaned and standardised
set.seed(123)
trainIndex_clean <- sample(
  seq_len(nrow(data_transformed)),
  floor(0.8 * nrow(data_transformed))
)
trainData <- data_transformed[trainIndex_clean, ]
testData <- data_transformed[-trainIndex_clean, ]

# Fit the logistic regression model (data after preprocessing).
# RiskLevel is the text copy of the target RiskLevel_num, so it is dropped
# from the predictors; keeping it would leak the answer into the model.
# NOTE(review): Glucose still defines the target and remains a predictor;
# confirm this is intended.
model_clean <- glm(
  RiskLevel_num ~ .,
  data = select(trainData, -RiskLevel),
  family = "binomial"
)

# Predict and compute the accuracy
pred_clean <- predict(model_clean, testData, type = "response")
pred_clean_class <- ifelse(pred_clean > 0.5, 1, 0)
# Columns that were not imputed may still hold NA and give NA predictions;
# na.rm = TRUE keeps the accuracy from becoming NA.
acc_clean <- mean(pred_clean_class == testData$RiskLevel_num, na.rm = TRUE)

# INTERPRETATION OF MODEL RESULTS
# Each cat() ends with "\n"; without it the whole report prints on one line.
cat("============================================\n")
cat("Perbandingan Akurasi Model:\n")
cat("--------------------------------------------\n")
cat("model SEBELUM preprocessing :", round(acc_raw, 3), "\n")
cat("model SESUDAH preprocessing :", round(acc_clean, 3), "\n")
cat("============================================\n")