Load Dataset

data <- read.csv("C:/Users/nurul  afidah/Downloads/SEM4/health_activity_data_preprocessed.csv", header = TRUE, sep = ",")
head(data)
##   ID Age Gender Height_cm Weight_kg   BMI Daily_Steps Calories_Intake
## 1  1  56      0       164        81 30.72        5134            1796
## 2  2  69      0       156        82 20.86       12803            1650
## 3  3  46      1       158        65 30.93       16408            1756
## 4  4  32      0       197        87 31.19       18420            2359
## 5  5  60      0       157        63 29.37       17351            2556
## 6  6  25      1       199        85 31.14        5131            3256
##   Hours_of_Sleep Heart_Rate Blood_Pressure Exercise_Hours_per_Week Smoker
## 1            8.6        102         137/72                     8.1      0
## 2            4.5        103         129/65                     3.7      0
## 3            4.3         74         127/68                     3.2      1
## 4            4.1        116         125/86                     8.5      0
## 5            5.1        111         100/64                     8.5      1
## 6            6.5        104         106/79                     3.6      0
##   Alcohol_Consumption_per_Week Diabetic Heart_Disease
## 1                            7        0             0
## 2                            7        0             0
## 3                            0        0             0
## 4                            5        0             0
## 5                            8        0             0
## 6                            7        0             0

Preprocessing

data$Systolic <- as.numeric(sub("/.*", "", data$Blood_Pressure))
data$Diastolic <- as.numeric(sub(".*/", "", data$Blood_Pressure))
data$BMI <- data$Weight_kg / (data$Height_cm / 100)^2
head(data[, c("Blood_Pressure", "Systolic", "Diastolic", "BMI")])
##   Blood_Pressure Systolic Diastolic      BMI
## 1         137/72      137        72 30.11600
## 2         129/65      129        65 33.69494
## 3         127/68      127        68 26.03749
## 4         125/86      125        86 22.41748
## 5         100/64      100        64 25.55885
## 6         106/79      106        79 21.46410

EDA

str(data)
## 'data.frame':    1000 obs. of  18 variables:
##  $ ID                          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                         : int  56 69 46 32 60 25 78 38 56 75 ...
##  $ Gender                      : int  0 0 1 0 0 1 0 0 0 0 ...
##  $ Height_cm                   : int  164 156 158 197 157 199 172 178 167 180 ...
##  $ Weight_kg                   : int  81 82 65 87 63 85 72 115 51 114 ...
##  $ BMI                         : num  30.1 33.7 26 22.4 25.6 ...
##  $ Daily_Steps                 : int  5134 12803 16408 18420 17351 5131 19532 13057 12349 12486 ...
##  $ Calories_Intake             : int  1796 1650 1756 2359 2556 3256 2216 3262 2038 2837 ...
##  $ Hours_of_Sleep              : num  8.6 4.5 4.3 4.1 5.1 6.5 9.6 6.7 5.7 7.4 ...
##  $ Heart_Rate                  : int  102 103 74 116 111 104 95 73 95 71 ...
##  $ Blood_Pressure              : chr  "137/72" "129/65" "127/68" "125/86" ...
##  $ Exercise_Hours_per_Week     : num  8.1 3.7 3.2 8.5 8.5 3.6 8.3 8.5 4.2 0.7 ...
##  $ Smoker                      : int  0 0 1 0 1 0 1 1 0 1 ...
##  $ Alcohol_Consumption_per_Week: int  7 7 0 5 8 7 2 0 1 1 ...
##  $ Diabetic                    : int  0 0 0 0 0 0 1 0 0 1 ...
##  $ Heart_Disease               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Systolic                    : num  137 129 127 125 100 106 96 136 104 102 ...
##  $ Diastolic                   : num  72 65 68 86 64 79 86 84 73 70 ...
summary(data)
##        ID              Age            Gender        Height_cm    
##  Min.   :   1.0   Min.   :18.00   Min.   :0.000   Min.   :150.0  
##  1st Qu.: 250.8   1st Qu.:35.00   1st Qu.:0.000   1st Qu.:162.0  
##  Median : 500.5   Median :50.00   Median :0.000   Median :175.0  
##  Mean   : 500.5   Mean   :49.86   Mean   :0.477   Mean   :174.7  
##  3rd Qu.: 750.2   3rd Qu.:66.00   3rd Qu.:1.000   3rd Qu.:187.0  
##  Max.   :1000.0   Max.   :79.00   Max.   :1.000   Max.   :199.0  
##    Weight_kg           BMI         Daily_Steps    Calories_Intake
##  Min.   : 50.00   Min.   :12.75   Min.   : 1016   Min.   :1201   
##  1st Qu.: 68.00   1st Qu.:21.56   1st Qu.: 6028   1st Qu.:1746   
##  Median : 84.00   Median :27.54   Median :10898   Median :2328   
##  Mean   : 84.35   Mean   :28.26   Mean   :10717   Mean   :2327   
##  3rd Qu.:101.00   3rd Qu.:33.85   3rd Qu.:15253   3rd Qu.:2880   
##  Max.   :119.00   Max.   :51.51   Max.   :19931   Max.   :3498   
##  Hours_of_Sleep    Heart_Rate     Blood_Pressure     Exercise_Hours_per_Week
##  Min.   : 4.00   Min.   : 50.00   Length:1000        Min.   : 0.000         
##  1st Qu.: 5.30   1st Qu.: 67.00   Class :character   1st Qu.: 2.675         
##  Median : 6.90   Median : 84.00   Mode  :character   Median : 5.100         
##  Mean   : 6.91   Mean   : 84.71                      Mean   : 5.045         
##  3rd Qu.: 8.40   3rd Qu.:103.00                      3rd Qu.: 7.600         
##  Max.   :10.00   Max.   :119.00                      Max.   :10.000         
##      Smoker      Alcohol_Consumption_per_Week    Diabetic     Heart_Disease  
##  Min.   :0.000   Min.   :0.000                Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:2.000                1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :5.000                Median :0.000   Median :0.000  
##  Mean   :0.191   Mean   :4.573                Mean   :0.155   Mean   :0.093  
##  3rd Qu.:0.000   3rd Qu.:7.000                3rd Qu.:0.000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :9.000                Max.   :1.000   Max.   :1.000  
##     Systolic       Diastolic    
##  Min.   : 90.0   Min.   :60.00  
##  1st Qu.:102.0   1st Qu.:67.00  
##  Median :113.0   Median :75.00  
##  Mean   :113.9   Mean   :74.85  
##  3rd Qu.:126.0   3rd Qu.:83.00  
##  Max.   :139.0   Max.   :89.00
colSums(is.na(data))
##                           ID                          Age 
##                            0                            0 
##                       Gender                    Height_cm 
##                            0                            0 
##                    Weight_kg                          BMI 
##                            0                            0 
##                  Daily_Steps              Calories_Intake 
##                            0                            0 
##               Hours_of_Sleep                   Heart_Rate 
##                            0                            0 
##               Blood_Pressure      Exercise_Hours_per_Week 
##                            0                            0 
##                       Smoker Alcohol_Consumption_per_Week 
##                            0                            0 
##                     Diabetic                Heart_Disease 
##                            0                            0 
##                     Systolic                    Diastolic 
##                            0                            0
hist(data$BMI, main = "Distribusi BMI", xlab = "BMI", col = "skyblue", breaks = 20)

hist(data$Systolic, main = "Distribusi Sistolik", xlab = "Sistolik", col = "lightblue", breaks = 20)

hist(data$Diastolic, main = "Distribusi Diastolik", xlab = "Diastolik", col = "lightgreen", breaks = 20)

boxplot(data$Systolic, main = "Boxplot Sistolik", ylab = "Sistolik (mmHg)", col = "lightcoral")

boxplot(data$Diastolic, main = "Boxplot Diastolik", ylab = "Diastolik (mmHg)", col = "lightblue")

library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(data[, c("BMI", "Systolic", "Diastolic", "Daily_Steps")])

barplot(table(data$Gender), main = "Distribusi Gender", col = c("lightblue", "lightpink"))

barplot(table(data$Smoker), main = "Distribusi Smoker", col = c("lightgreen", "salmon"))

barplot(table(data$Diabetic), main = "Distribusi Diabetic", col = c("lightyellow", "salmon"))

barplot(table(data$Heart_Disease), main = "Distribusi Heart Disease", col = c("lightblue", "lightcoral"))

Perceptual Mapping - MDS & PCA

library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
mds_data <- data[, c("BMI", "Systolic", "Diastolic", "Daily_Steps")]
distance_matrix <- dist(mds_data)
mds_result <- isoMDS(distance_matrix)
## initial  value 0.004621 
## final  value 0.004621 
## converged
plot(mds_result$points, main = "MDS - Multidimensional Scaling", xlab = "Dimension 1", ylab = "Dimension 2", pch = 19, col = "blue")

pca_result <- prcomp(mds_data, center = TRUE, scale. = TRUE)
biplot(pca_result, main = "PCA - Biplot")

Klasifikasi - Logistic Regression

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
set.seed(123)
data$Heart_Disease <- as.factor(data$Heart_Disease)
numeric_cols <- sapply(data, is.numeric)
data_norm <- as.data.frame(scale(data[, numeric_cols]))
data_norm$Heart_Disease <- data$Heart_Disease
trainIndex <- createDataPartition(data_norm$Heart_Disease, p = 0.8, list = FALSE)
train <- data_norm[trainIndex, ]
test <- data_norm[-trainIndex, ]
model_log <- glm(Heart_Disease ~ ., data = train, family = binomial)
pred <- predict(model_log, test, type = "response")
pred_class <- ifelse(pred > 0.5, 1, 0)
pred_class <- factor(pred_class, levels = levels(test$Heart_Disease))
confusionMatrix(pred_class, test$Heart_Disease)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 181  18
##          1   0   0
##                                           
##                Accuracy : 0.9095          
##                  95% CI : (0.8608, 0.9455)
##     No Information Rate : 0.9095          
##     P-Value [Acc > NIR] : 0.5623          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 6.151e-05       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9095          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9095          
##          Detection Rate : 0.9095          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 

Clustering - KMeans

pca_scores <- as.data.frame(pca_result$x[, 1:2])
set.seed(123)
kmeans_result <- kmeans(pca_scores, centers = 3, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)
library(ggplot2)
ggplot(pca_scores, aes(PC1, PC2, color = Cluster)) +
  geom_point(size = 2) +
  labs(title = "K-Means Clustering Berdasarkan PCA") +
  theme_minimal()

data_clustered <- cbind(data, Cluster = pca_scores$Cluster)
aggregate(cbind(BMI, Systolic, Diastolic, Daily_Steps) ~ Cluster, data = data_clustered, FUN = mean)
##   Cluster      BMI Systolic Diastolic Daily_Steps
## 1       1 23.15083 105.7762  73.20963   13859.915
## 2       2 25.52028 122.7576  78.73278    7084.488
## 3       3 38.09705 112.8415  71.93310   11453.574

Hierarchical Clustering

# Hitung jarak dari dua komponen PCA
dist_matrix <- dist(pca_scores[, 1:2])

# Hierarchical clustering dengan metode Ward
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(hc, main = "Dendrogram - Hierarchical Clustering", xlab = "", sub = "", cex = 0.6)

# Tambahkan batas klaster
rect.hclust(hc, k = 3, border = "red")

MANOVA

manova_cluster <- manova(cbind(BMI, Systolic, Diastolic) ~ Cluster, data = data_clustered)
summary(manova_cluster, test = "Wilks")
##            Df   Wilks approx F num Df den Df    Pr(>F)    
## Cluster     2 0.28205   292.85      6   1990 < 2.2e-16 ***
## Residuals 997                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

MANCOVA - Mengontrol usia

mancova_model <- manova(cbind(BMI, Systolic, Diastolic) ~ Gender + Age, data = data)
summary(mancova_model, test = "Wilks")
##            Df   Wilks approx F num Df den Df Pr(>F)
## Gender      1 0.99970  0.09898      3    995 0.9606
## Age         1 0.99719  0.93434      3    995 0.4234
## Residuals 997