###Analisis Faktor Risiko Stroke Menggunakan Regresi Logistik pada Data Kesehatan Individu###

# Load data ----
# Interactive alternative for a semicolon-separated file:
# data <- read.csv(file.choose(), header = TRUE, sep = ";")
data <- read.csv(
  "C:/Users/hehehe/Documents/Unimus/Porto Unimus/healthcare-dataset-stroke-data.csv",
  header = TRUE
)
head(data)
dim(data)

# Work on a copy so the raw import in `data` stays untouched.
df <- data

# Drop the id column.
df$id <- NULL

# Encoding ----
# Two-valued text columns become 0/1 indicators ("Yes" -> 1, "Urban" -> 1).
df$ever_married <- ifelse(df$ever_married == "Yes", 1, 0)
df$Residence_type <- ifelse(df$Residence_type == "Urban", 1, 0)

# Multi-category text columns become factors.
df$gender <- factor(df$gender)
df$work_type <- factor(df$work_type)
df$smoking_status <- factor(df$smoking_status)

# Force the continuous predictors to numeric; anything as.numeric() cannot
# parse turns into NA (with a coercion warning).
df$age <- as.numeric(df$age)
df$avg_glucose_level <- as.numeric(df$avg_glucose_level)
df$bmi <- as.numeric(df$bmi)

# Missing values ----
# Count the missing BMI values, then impute them with the median of the
# observed ones.
sum(is.na(df$bmi))
df$bmi[is.na(df$bmi)] <- median(df$bmi, na.rm = TRUE)

df$stroke <- as.numeric(df$stroke)
str(df)

# 0/1 columns become factors with an explicit level order, so 0 is always the
# first (baseline) level.
df$hypertension <- factor(df$hypertension, levels = c(0, 1))
df$heart_disease <- factor(df$heart_disease, levels = c(0, 1))
df$ever_married <- factor(df$ever_married, levels = c(0, 1))
df$Residence_type <- factor(df$Residence_type, levels = c(0, 1))
df$stroke <- factor(df$stroke, levels = c(0, 1))

# Reference categories used as the baseline for the model contrasts.
df$gender <- relevel(df$gender, ref = "Female")
df$smoking_status <- relevel(df$smoking_status, ref = "never smoked")
df$work_type <- relevel(df$work_type, ref = "Private")
df$Residence_type <- relevel(df$Residence_type, ref = "0") # Rural

# Outlier check ----
# Three boxplots side by side; the layout is reset to a single panel afterwards.
par(mfrow = c(1, 3))
boxplot(df$age, main = "Outlier Age", col = "lightblue")
boxplot(df$avg_glucose_level, main = "Outlier Glucose", col = "lightgreen")
boxplot(df$bmi, main = "Outlier BMI", col = "lightpink")
par(mfrow = c(1, 1))

# Analyst notes:
# - Age shows no extreme outliers and reflects a plausible population spread.
# - Extreme glucose values reflect particular clinical conditions and are kept
#   because they are medically relevant.
# - BMI outliers represent individuals with extreme obesity and are not removed
#   because they carry important risk information.

# Data exploration ----

# One row per continuous variable: mean, median, standard deviation, min, max.
desc_stats <- data.frame(
  Variable = c("Age", "Avg Glucose Level", "BMI"),
  Mean = c(mean(df$age), mean(df$avg_glucose_level), mean(df$bmi)),
  Median = c(median(df$age), median(df$avg_glucose_level), median(df$bmi)),
  SD = c(sd(df$age), sd(df$avg_glucose_level), sd(df$bmi)),
  Min = c(min(df$age), min(df$avg_glucose_level), min(df$bmi)),
  Max = c(max(df$age), max(df$avg_glucose_level), max(df$bmi))
)

# Rounded copy for display; only the numeric columns are rounded (0 decimals),
# the Variable label column is left alone.
desc_stats_round <- desc_stats
num_cols <- vapply(desc_stats_round, is.numeric, logical(1))
desc_stats_round[num_cols] <- round(desc_stats_round[num_cols], 0)

desc_stats_round

# Distribution histograms ----
# Three panels side by side; the red vertical line in each marks the mean.
par(mfrow = c(1, 3))

hist(
  df$age,
  breaks = 20,
  main = "Age Distribution",
  xlab = "Age",
  col = "lightblue",
  border = "white"
)
abline(v = mean(df$age), col = "red", lwd = 2)

hist(
  df$avg_glucose_level,
  breaks = 30,
  main = "Glucose Distribution",
  xlab = "Avg Glucose Level",
  col = "lightgreen",
  border = "white"
)
abline(v = mean(df$avg_glucose_level), col = "red", lwd = 2)

hist(
  df$bmi,
  breaks = 25,
  main = "BMI Distribution",
  xlab = "BMI",
  col = "lightpink",
  border = "white"
)
abline(v = mean(df$bmi), col = "red", lwd = 2)

# Back to a single-panel layout.
par(mfrow = c(1, 1))

# Variable selection ----
# Pearson correlations between the continuous predictors.
num_data <- df[, c("age", "avg_glucose_level", "bmi")]

cor_matrix <- cor(num_data, use = "complete.obs", method = "pearson")
round(cor_matrix, 2)

# Multicollinearity (VIF) ----
# Install car only when it is missing instead of reinstalling on every run.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# Auxiliary linear model used only to obtain VIFs for the right-hand-side
# terms; age is the response here, so age itself does not receive a VIF.
vif_model_full <- lm(
  age ~ avg_glucose_level + bmi + hypertension + heart_disease +
    ever_married + Residence_type,
  data = df
)

vif(vif_model_full)

# Model building ----
# Logistic regression of stroke on all candidate predictors.
logit_model <- glm(
  stroke ~ age + avg_glucose_level + bmi + hypertension + heart_disease +
    gender + ever_married + Residence_type + work_type + smoking_status,
  data = df,
  family = binomial(link = "logit")
)

# Model summary
summary(logit_model)

# Model evaluation ----
# Fitted probabilities from the full model, stored alongside the data.
df$prob_stroke <- predict(logit_model, type = "response")

# Distribution of the fitted probabilities
summary(df$prob_stroke)

# Class prediction with a 0.5 threshold
df$pred_class_05 <- ifelse(df$prob_stroke >= 0.5, 1, 0)

# Confusion table: predicted class (rows) against observed stroke (columns)
table(Predicted = df$pred_class_05, Actual = df$stroke)

# Interpretation of results ----
# Odds ratios: exponentiated coefficients of the full model.
OR <- exp(coef(logit_model))

# 95% confidence intervals, exponentiated onto the odds-ratio scale
# (confint() uses its default level).
CI <- exp(confint(logit_model))

# Combine odds ratios and interval bounds into one table
OR_table <- cbind(OR, CI)
round(OR_table, 3)

# Significant terms of the full model ----
# Take the model summary
model_summary <- summary(logit_model)

# Extract the coefficient table
coef_table <- as.data.frame(model_summary$coefficients)

# Keep the term names as a regular column and drop the row names
coef_table$Variable <- rownames(coef_table)
rownames(coef_table) <- NULL

# Check the result
coef_table

# Keep terms with p-value < 0.05. The p-value column is named "Pr(>|z|)",
# which is not a syntactic R name, so it is extracted with [[ ]] rather than $.
sig_vars <- coef_table[coef_table[["Pr(>|z|)"]] < 0.05, ]

# Sort from most to least significant
sig_vars <- sig_vars[order(sig_vars[["Pr(>|z|)"]]), ]

sig_vars

# Non-significant terms and final model ----
# Rebuild the coefficient table of the full model.
model_summary <- summary(logit_model)
coef_table <- as.data.frame(model_summary$coefficients)

# Keep the term names as a regular column and drop the row names
coef_table$Variable <- rownames(coef_table)
rownames(coef_table) <- NULL

# Check the result
coef_table

# Terms with p-value >= 0.05. "Pr(>|z|)" is not a syntactic R name, so the
# column is extracted with [[ ]] rather than $.
nonsig_vars <- coef_table[coef_table[["Pr(>|z|)"]] >= 0.05, ]
nonsig_vars

# Final model: the full-model formula without bmi. It is fitted once; the
# original script ran this identical glm() call twice.
logit_model_final <- glm(
  stroke ~ age + avg_glucose_level + hypertension + heart_disease +
    gender + ever_married + Residence_type + work_type + smoking_status,
  data = df,
  family = binomial
)

summary(logit_model_final)

# Odds ratios and confidence intervals for the final model ----
# Odds ratios: exponentiated coefficients.
OR <- exp(coef(logit_model_final))

# 95% confidence intervals, exponentiated onto the odds-ratio scale
# (confint() uses its default level).
CI <- exp(confint(logit_model_final))

# Combine OR and CI
OR_table <- cbind(OR, CI)

# Tidy output
round(OR_table, 3)

# Forest plot of the final-model odds ratios ----
# Prepare the odds-ratio data: point estimates and interval bounds.
OR <- exp(coef(logit_model_final))
CI <- exp(confint(logit_model_final))

# One row per model term
forest_df <- data.frame(
  Variable = rownames(CI),
  OR = OR,
  CI_low = CI[, 1],
  CI_high = CI[, 2]
)

# Drop the intercept
forest_df <- forest_df[forest_df$Variable != "(Intercept)", ]

# Sort by odds ratio
forest_df <- forest_df[order(forest_df$OR), ]

# Widen the left margin so the term labels fit; the previous settings are kept
# and restored once the plot is complete.
old_par <- par(mar = c(5, 9, 4, 2))

# Point estimates on a log-scaled x axis; the y axis is drawn separately below.
plot(
  forest_df$OR,
  seq_along(forest_df$OR),
  xlim = c(0.3, 6),
  ylim = c(1, nrow(forest_df)),
  xlab = "Odds Ratio (log scale)",
  ylab = "",
  yaxt = "n",
  pch = 19,
  log = "x"
)

# Confidence intervals as horizontal segments
segments(
  forest_df$CI_low,
  seq_along(forest_df$OR),
  forest_df$CI_high,
  seq_along(forest_df$OR)
)

# Reference line at OR = 1
abline(v = 1, lty = 2, col = "red")

# Term labels on the y axis
axis(
  2,
  at = seq_along(forest_df$OR),
  labels = forest_df$Variable,
  las = 1
)

title("Forest Plot Odds Ratio Stroke Risk")

# Restore the graphics settings that were in effect before this plot.
par(old_par)