###Analisis Faktor Risiko Stroke Menggunakan Regresi Logistik pada Data Kesehatan Individu###
# Load data ----
# Interactive alternative (pick a semicolon-separated file from a dialog):
# data <- read.csv(file.choose(), header = TRUE, sep = ";")
data <- read.csv(
  "C:/Users/hehehe/Documents/Unimus/Porto Unimus/healthcare-dataset-stroke-data.csv",
  header = TRUE
)
head(data)
dim(data)

# Copy the data ----
# The cleaning steps below modify `df`; the raw import stays in `data`.
df <- data

# Drop the id column ----
# `id` is not used by any later step of this script.
df$id <- NULL
# Encode variables ----
# Two-valued text columns become 0/1 indicators:
#   ever_married:   "Yes"   -> 1, anything else -> 0
#   Residence_type: "Urban" -> 1, anything else -> 0
df$ever_married <- ifelse(df$ever_married == "Yes", 1, 0)
df$Residence_type <- ifelse(df$Residence_type == "Urban", 1, 0)

# Multi-level categorical predictors are stored as factors.
df$gender <- factor(df$gender)
df$work_type <- factor(df$work_type)
df$smoking_status <- factor(df$smoking_status)

# Continuous predictors are forced to numeric.
df$age <- as.numeric(df$age)
df$avg_glucose_level <- as.numeric(df$avg_glucose_level)
# Go through character first: if bmi was read as a factor, a bare
# as.numeric() would return the internal level codes instead of the values.
# Entries that are not numbers become NA (with a coercion warning).
df$bmi <- as.numeric(as.character(df$bmi))

# Missing values ----
# Report how many bmi values are missing, then impute them with the median
# of the observed values.
sum(is.na(df$bmi))
df$bmi[is.na(df$bmi)] <- median(df$bmi, na.rm = TRUE)

df$stroke <- as.numeric(df$stroke)
str(df)

# Binary indicators (including the outcome) become factors with the level
# order fixed to 0, 1 so that 0 is always the first level.
df$hypertension <- factor(df$hypertension, levels = c(0, 1))
df$heart_disease <- factor(df$heart_disease, levels = c(0, 1))
df$ever_married <- factor(df$ever_married, levels = c(0, 1))
df$Residence_type <- factor(df$Residence_type, levels = c(0, 1))
df$stroke <- factor(df$stroke, levels = c(0, 1))

# Reference categories used as the baseline for the model coefficients.
df$gender <- relevel(df$gender, ref = "Female")
df$smoking_status <- relevel(df$smoking_status, ref = "never smoked")
df$work_type <- relevel(df$work_type, ref = "Private")
df$Residence_type <- relevel(df$Residence_type, ref = "0") # Rural
# Outlier check ----
# Three boxplots side by side; the previous panel layout is restored afterwards.
old_par <- par(mfrow = c(1, 3))
boxplot(df$age, main = "Outlier Age", col = "lightblue")
boxplot(df$avg_glucose_level, main = "Outlier Glucose", col = "lightgreen")
boxplot(df$bmi, main = "Outlier BMI", col = "lightpink")
par(old_par)

# Author's reading of the plots:
# - Age shows no extreme outliers and reflects a plausible population spread.
# - Extreme glucose values reflect particular clinical conditions and are kept
#   because they are medically relevant.
# - BMI outliers represent individuals with extreme obesity and are not removed
#   because they carry important risk information.
# Data exploration ----
# Summary statistics for the three continuous variables, one row per variable.
desc_stats <- data.frame(
  Variable = c("Age", "Avg Glucose Level", "BMI"),
  Mean = c(mean(df$age), mean(df$avg_glucose_level), mean(df$bmi)),
  Median = c(median(df$age), median(df$avg_glucose_level), median(df$bmi)),
  SD = c(sd(df$age), sd(df$avg_glucose_level), sd(df$bmi)),
  Min = c(min(df$age), min(df$avg_glucose_level), min(df$bmi)),
  Max = c(max(df$age), max(df$avg_glucose_level), max(df$bmi))
)

# Rounded copy for display: only the numeric columns are rounded (to whole
# numbers); the Variable label column is left untouched.
desc_stats_round <- desc_stats
num_cols <- vapply(desc_stats_round, is.numeric, logical(1))
desc_stats_round[num_cols] <- round(desc_stats_round[num_cols], 0)

desc_stats_round
# Distribution histograms ----
# One histogram per continuous variable, with a red vertical line at the mean.
# The previous panel layout is restored afterwards.
old_par <- par(mfrow = c(1, 3))

hist(
  df$age,
  breaks = 20,
  main = "Age Distribution",
  xlab = "Age",
  col = "lightblue",
  border = "white"
)
abline(v = mean(df$age), col = "red", lwd = 2)

hist(
  df$avg_glucose_level,
  breaks = 30,
  main = "Glucose Distribution",
  xlab = "Avg Glucose Level",
  col = "lightgreen",
  border = "white"
)
abline(v = mean(df$avg_glucose_level), col = "red", lwd = 2)

hist(
  df$bmi,
  breaks = 25,
  main = "BMI Distribution",
  xlab = "BMI",
  col = "lightpink",
  border = "white"
)
abline(v = mean(df$bmi), col = "red", lwd = 2)

par(old_par)
# Variable selection ----
# Pairwise Pearson correlations between the continuous predictors.
num_data <- df[, c("age", "avg_glucose_level", "bmi")]

cor_matrix <- cor(num_data, use = "complete.obs", method = "pearson")
round(cor_matrix, 2)

# Multicollinearity (VIF) ----
# Install car only when it is missing instead of re-installing on every run.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# In a linear model the VIFs depend only on the predictors, so the response is
# just a vehicle. Using the 0/1 stroke indicator as the response keeps `age`
# on the right-hand side, so it gets a VIF like every other predictor.
vif_model_full <- lm(
  as.numeric(as.character(stroke)) ~ age + avg_glucose_level + bmi +
    hypertension + heart_disease + ever_married + Residence_type,
  data = df
)

vif(vif_model_full)
# Model building ----
# Logistic regression of stroke on all candidate predictors.
logit_model <- glm(
  stroke ~ age + avg_glucose_level + bmi + hypertension + heart_disease +
    gender + ever_married + Residence_type + work_type + smoking_status,
  data = df,
  family = binomial(link = "logit")
)

summary(logit_model)

# Model evaluation ----
# Fitted probabilities for the rows the model was trained on.
df$prob_stroke <- predict(logit_model, type = "response")

summary(df$prob_stroke)

# Classify with a 0.5 probability cut-off.
df$pred_class_05 <- ifelse(df$prob_stroke >= 0.5, 1, 0)

# Confusion matrix. The predictions are tabulated as a factor with both levels
# so the table stays 2 x 2 even when no observation reaches the cut-off.
table(
  Predicted = factor(df$pred_class_05, levels = c(0, 1)),
  Actual = df$stroke
)
# Interpreting the results ----
# Odds ratios: exponentiated coefficients of the full model, together with
# exponentiated confidence limits from confint() (95% by default).
OR <- exp(coef(logit_model))

CI <- exp(confint(logit_model))

OR_table <- cbind(OR, CI)
round(OR_table, 3)

# Coefficient table as a data frame, with the term names moved from the row
# names into a regular `Variable` column.
model_summary <- summary(logit_model)

coef_table <- as.data.frame(model_summary$coefficients)

coef_table$Variable <- rownames(coef_table)

rownames(coef_table) <- NULL

coef_table

# Terms with p < 0.05, sorted from smallest to largest p-value.
# "Pr(>|z|)" is not a syntactic name, so it is accessed with [[ ]] instead of $.
sig_vars <- coef_table[coef_table[["Pr(>|z|)"]] < 0.05, ]

sig_vars <- sig_vars[order(sig_vars[["Pr(>|z|)"]]), ]

sig_vars
# Coefficient table of the full model, with the term names stored in a
# `Variable` column instead of the row names.
model_summary <- summary(logit_model)

coef_table <- as.data.frame(model_summary$coefficients)

coef_table$Variable <- rownames(coef_table)

rownames(coef_table) <- NULL

coef_table

# Terms with p >= 0.05.
# "Pr(>|z|)" is not a syntactic name, so it is accessed with [[ ]] instead of $.
nonsig_vars <- coef_table[coef_table[["Pr(>|z|)"]] >= 0.05, ]
nonsig_vars

# Final model: the same predictors as the full model except bmi
# (NOTE(review): presumably dropped for non-significance above; confirm).
# Fitted once; the script previously ran this identical fit twice.
logit_model_final <- glm(
  stroke ~ age + avg_glucose_level + hypertension + heart_disease + gender +
    ever_married + Residence_type + work_type + smoking_status,
  data = df,
  family = binomial
)

summary(logit_model_final)
# Odds ratios and confidence intervals (final model) ----
# Exponentiated coefficients of the final model.
OR <- exp(coef(logit_model_final))

# Exponentiated confint() limits (95% by default).
CI <- exp(confint(logit_model_final))

# One row per model term: odds ratio followed by its lower and upper limit.
OR_table <- cbind(OR, CI)

round(OR_table, 3)
# Forest plot of the odds ratios ----
# Prepare the data: odds ratios and confidence limits of the final model.
OR <- exp(coef(logit_model_final))
CI <- exp(confint(logit_model_final))

forest_df <- data.frame(
  Variable = rownames(CI),
  OR = OR,
  CI_low = CI[, 1],
  CI_high = CI[, 2]
)

# The intercept is not an odds ratio of a predictor, so it is not plotted.
forest_df <- forest_df[forest_df$Variable != "(Intercept)", ]

# Sort ascending by odds ratio so the largest effects end up at the top.
forest_df <- forest_df[order(forest_df$OR), ]

# Widen the left margin for the term labels; the previous margins are
# restored once the plot is complete.
old_par <- par(mar = c(5, 9, 4, 2))

row_pos <- seq_along(forest_df$OR)

# Point estimates on a logarithmic x axis; the default y axis is suppressed
# because the rows are labelled with the term names below.
plot(
  forest_df$OR,
  row_pos,
  xlim = c(0.3, 6),
  ylim = c(1, nrow(forest_df)),
  xlab = "Odds Ratio (log scale)",
  ylab = "",
  yaxt = "n",
  pch = 19,
  log = "x"
)

# Horizontal bars spanning each confidence interval.
segments(
  forest_df$CI_low,
  row_pos,
  forest_df$CI_high,
  row_pos
)

# Reference line at OR = 1 (no association).
abline(v = 1, lty = 2, col = "red")

axis(
  2,
  at = row_pos,
  labels = forest_df$Variable,
  las = 1
)

title("Forest Plot Odds Ratio Stroke Risk")

par(old_par)