Load Dataset
data <- read.csv("C:/Users/nurul afidah/Downloads/SEM4/health_activity_data_preprocessed.csv", header = TRUE, sep = ",")
head(data)
## ID Age Gender Height_cm Weight_kg BMI Daily_Steps Calories_Intake
## 1 1 56 0 164 81 30.72 5134 1796
## 2 2 69 0 156 82 20.86 12803 1650
## 3 3 46 1 158 65 30.93 16408 1756
## 4 4 32 0 197 87 31.19 18420 2359
## 5 5 60 0 157 63 29.37 17351 2556
## 6 6 25 1 199 85 31.14 5131 3256
## Hours_of_Sleep Heart_Rate Blood_Pressure Exercise_Hours_per_Week Smoker
## 1 8.6 102 137/72 8.1 0
## 2 4.5 103 129/65 3.7 0
## 3 4.3 74 127/68 3.2 1
## 4 4.1 116 125/86 8.5 0
## 5 5.1 111 100/64 8.5 1
## 6 6.5 104 106/79 3.6 0
## Alcohol_Consumption_per_Week Diabetic Heart_Disease
## 1 7 0 0
## 2 7 0 0
## 3 0 0 0
## 4 5 0 0
## 5 8 0 0
## 6 7 0 0
Preprocessing
data$Systolic <- as.numeric(sub("/.*", "", data$Blood_Pressure))
data$Diastolic <- as.numeric(sub(".*/", "", data$Blood_Pressure))
data$BMI <- data$Weight_kg / (data$Height_cm / 100)^2
head(data[, c("Blood_Pressure", "Systolic", "Diastolic", "BMI")])
## Blood_Pressure Systolic Diastolic BMI
## 1 137/72 137 72 30.11600
## 2 129/65 129 65 33.69494
## 3 127/68 127 68 26.03749
## 4 125/86 125 86 22.41748
## 5 100/64 100 64 25.55885
## 6 106/79 106 79 21.46410
EDA
str(data)
## 'data.frame': 1000 obs. of 18 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 56 69 46 32 60 25 78 38 56 75 ...
## $ Gender : int 0 0 1 0 0 1 0 0 0 0 ...
## $ Height_cm : int 164 156 158 197 157 199 172 178 167 180 ...
## $ Weight_kg : int 81 82 65 87 63 85 72 115 51 114 ...
## $ BMI : num 30.1 33.7 26 22.4 25.6 ...
## $ Daily_Steps : int 5134 12803 16408 18420 17351 5131 19532 13057 12349 12486 ...
## $ Calories_Intake : int 1796 1650 1756 2359 2556 3256 2216 3262 2038 2837 ...
## $ Hours_of_Sleep : num 8.6 4.5 4.3 4.1 5.1 6.5 9.6 6.7 5.7 7.4 ...
## $ Heart_Rate : int 102 103 74 116 111 104 95 73 95 71 ...
## $ Blood_Pressure : chr "137/72" "129/65" "127/68" "125/86" ...
## $ Exercise_Hours_per_Week : num 8.1 3.7 3.2 8.5 8.5 3.6 8.3 8.5 4.2 0.7 ...
## $ Smoker : int 0 0 1 0 1 0 1 1 0 1 ...
## $ Alcohol_Consumption_per_Week: int 7 7 0 5 8 7 2 0 1 1 ...
## $ Diabetic : int 0 0 0 0 0 0 1 0 0 1 ...
## $ Heart_Disease : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Systolic : num 137 129 127 125 100 106 96 136 104 102 ...
## $ Diastolic : num 72 65 68 86 64 79 86 84 73 70 ...
summary(data)
## ID Age Gender Height_cm
## Min. : 1.0 Min. :18.00 Min. :0.000 Min. :150.0
## 1st Qu.: 250.8 1st Qu.:35.00 1st Qu.:0.000 1st Qu.:162.0
## Median : 500.5 Median :50.00 Median :0.000 Median :175.0
## Mean : 500.5 Mean :49.86 Mean :0.477 Mean :174.7
## 3rd Qu.: 750.2 3rd Qu.:66.00 3rd Qu.:1.000 3rd Qu.:187.0
## Max. :1000.0 Max. :79.00 Max. :1.000 Max. :199.0
## Weight_kg BMI Daily_Steps Calories_Intake
## Min. : 50.00 Min. :12.75 Min. : 1016 Min. :1201
## 1st Qu.: 68.00 1st Qu.:21.56 1st Qu.: 6028 1st Qu.:1746
## Median : 84.00 Median :27.54 Median :10898 Median :2328
## Mean : 84.35 Mean :28.26 Mean :10717 Mean :2327
## 3rd Qu.:101.00 3rd Qu.:33.85 3rd Qu.:15253 3rd Qu.:2880
## Max. :119.00 Max. :51.51 Max. :19931 Max. :3498
## Hours_of_Sleep Heart_Rate Blood_Pressure Exercise_Hours_per_Week
## Min. : 4.00 Min. : 50.00 Length:1000 Min. : 0.000
## 1st Qu.: 5.30 1st Qu.: 67.00 Class :character 1st Qu.: 2.675
## Median : 6.90 Median : 84.00 Mode :character Median : 5.100
## Mean : 6.91 Mean : 84.71 Mean : 5.045
## 3rd Qu.: 8.40 3rd Qu.:103.00 3rd Qu.: 7.600
## Max. :10.00 Max. :119.00 Max. :10.000
## Smoker Alcohol_Consumption_per_Week Diabetic Heart_Disease
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :5.000 Median :0.000 Median :0.000
## Mean :0.191 Mean :4.573 Mean :0.155 Mean :0.093
## 3rd Qu.:0.000 3rd Qu.:7.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :9.000 Max. :1.000 Max. :1.000
## Systolic Diastolic
## Min. : 90.0 Min. :60.00
## 1st Qu.:102.0 1st Qu.:67.00
## Median :113.0 Median :75.00
## Mean :113.9 Mean :74.85
## 3rd Qu.:126.0 3rd Qu.:83.00
## Max. :139.0 Max. :89.00
colSums(is.na(data))
## ID Age
## 0 0
## Gender Height_cm
## 0 0
## Weight_kg BMI
## 0 0
## Daily_Steps Calories_Intake
## 0 0
## Hours_of_Sleep Heart_Rate
## 0 0
## Blood_Pressure Exercise_Hours_per_Week
## 0 0
## Smoker Alcohol_Consumption_per_Week
## 0 0
## Diabetic Heart_Disease
## 0 0
## Systolic Diastolic
## 0 0
hist(data$BMI, main = "Distribusi BMI", xlab = "BMI", col = "skyblue", breaks = 20)

hist(data$Systolic, main = "Distribusi Sistolik", xlab = "Sistolik", col = "lightblue", breaks = 20)

hist(data$Diastolic, main = "Distribusi Diastolik", xlab = "Diastolik", col = "lightgreen", breaks = 20)

boxplot(data$Systolic, main = "Boxplot Sistolik", ylab = "Sistolik (mmHg)", col = "lightcoral")

boxplot(data$Diastolic, main = "Boxplot Diastolik", ylab = "Diastolik (mmHg)", col = "lightblue")

library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data[, c("BMI", "Systolic", "Diastolic", "Daily_Steps")])

barplot(table(data$Gender), main = "Distribusi Gender", col = c("lightblue", "lightpink"))

barplot(table(data$Smoker), main = "Distribusi Smoker", col = c("lightgreen", "salmon"))

barplot(table(data$Diabetic), main = "Distribusi Diabetic", col = c("lightyellow", "salmon"))

barplot(table(data$Heart_Disease), main = "Distribusi Heart Disease", col = c("lightblue", "lightcoral"))

Perceptual Mapping - MDS & PCA
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
mds_data <- data[, c("BMI", "Systolic", "Diastolic", "Daily_Steps")]
distance_matrix <- dist(mds_data)
mds_result <- isoMDS(distance_matrix)
## initial value 0.004621
## final value 0.004621
## converged
plot(mds_result$points, main = "MDS - Multidimensional Scaling", xlab = "Dimension 1", ylab = "Dimension 2", pch = 19, col = "blue")

pca_result <- prcomp(mds_data, center = TRUE, scale. = TRUE)
biplot(pca_result, main = "PCA - Biplot")

Klasifikasi - Logistic Regression
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
set.seed(123)
data$Heart_Disease <- as.factor(data$Heart_Disease)
numeric_cols <- sapply(data, is.numeric)
data_norm <- as.data.frame(scale(data[, numeric_cols]))
data_norm$Heart_Disease <- data$Heart_Disease
trainIndex <- createDataPartition(data_norm$Heart_Disease, p = 0.8, list = FALSE)
train <- data_norm[trainIndex, ]
test <- data_norm[-trainIndex, ]
model_log <- glm(Heart_Disease ~ ., data = train, family = binomial)
pred <- predict(model_log, test, type = "response")
pred_class <- ifelse(pred > 0.5, 1, 0)
pred_class <- factor(pred_class, levels = levels(test$Heart_Disease))
confusionMatrix(pred_class, test$Heart_Disease)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 181 18
## 1 0 0
##
## Accuracy : 0.9095
## 95% CI : (0.8608, 0.9455)
## No Information Rate : 0.9095
## P-Value [Acc > NIR] : 0.5623
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 6.151e-05
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9095
## Neg Pred Value : NaN
## Prevalence : 0.9095
## Detection Rate : 0.9095
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
Clustering - KMeans
pca_scores <- as.data.frame(pca_result$x[, 1:2])
set.seed(123)
kmeans_result <- kmeans(pca_scores, centers = 3, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)
library(ggplot2)
ggplot(pca_scores, aes(PC1, PC2, color = Cluster)) +
geom_point(size = 2) +
labs(title = "K-Means Clustering Berdasarkan PCA") +
theme_minimal()

data_clustered <- cbind(data, Cluster = pca_scores$Cluster)
aggregate(cbind(BMI, Systolic, Diastolic, Daily_Steps) ~ Cluster, data = data_clustered, FUN = mean)
## Cluster BMI Systolic Diastolic Daily_Steps
## 1 1 23.15083 105.7762 73.20963 13859.915
## 2 2 25.52028 122.7576 78.73278 7084.488
## 3 3 38.09705 112.8415 71.93310 11453.574
Hierarchical Clustering
# Hitung jarak dari dua komponen PCA
dist_matrix <- dist(pca_scores[, 1:2])
# Hierarchical clustering dengan metode Ward
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
plot(hc, main = "Dendrogram - Hierarchical Clustering", xlab = "", sub = "", cex = 0.6)
# Tambahkan batas klaster
rect.hclust(hc, k = 3, border = "red")

MANOVA
manova_cluster <- manova(cbind(BMI, Systolic, Diastolic) ~ Cluster, data = data_clustered)
summary(manova_cluster, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## Cluster 2 0.28205 292.85 6 1990 < 2.2e-16 ***
## Residuals 997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
MANCOVA - Mengontrol usia
mancova_model <- manova(cbind(BMI, Systolic, Diastolic) ~ Gender + Age, data = data)
summary(mancova_model, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## Gender 1 0.99970 0.09898 3 995 0.9606
## Age 1 0.99719 0.93434 3 995 0.4234
## Residuals 997