R Markdown

Project Overview:

Objective: To perform a data science analysis on a real-world dementia dataset using R.

Scope: The dataset is obtained from a mobile healthcare service provided in collaboration with non-governmental organizations that manage elderly care centers in various districts of Hong Kong. The service was offered for free from 2008 to 2018.

Dataset Content: The dataset contains individual cases with various variables, including age, body height, body weight, education level, mobility, Mini Nutritional Assessment outcomes, and more.

Outcome Prediction Labels: The prediction labels are based on the categories of the Mini Mental State Exam class (MMSE_class_2). The MMSE is a widely used screening tool for assessing cognitive impairment and dementia in adults.

Source: Mention where the dataset was obtained from. In this case, it’s mentioned that the dataset is from a mobile healthcare service in collaboration with NGOs in Hong Kong.

Characteristics: Highlight any key characteristics or features of the dataset that are relevant to the analysis.

# load csv file
data <- read.csv("data.csv")
head(data)
##   X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 1 1  86            1        4       148.3       2        61.1     10.5  95.0
## 2 2  92            2        4       156.2       2        61.5     15.0  88.0
## 3 3  81            1        4       146.3       2        47.0     13.0  73.5
## 4 4  79            2        4       152.2       2        65.3     14.0  95.0
## 5 5  86            1        4       157.3       2        57.3     11.5  88.0
## 6 6  80            1        4       147.1       2        51.1     13.0  85.0
##     bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 1 27.76       12                                 1            0
## 2 25.19       14                                 0            0
## 3 21.96       13                                 0            0
## 4 28.17       14                                 0            0
## 5 23.16       13                                 1            0
## 6 23.59       14                                 1            1
summary(data)
##        X               Age          Education_ID      Mobility    
##  Min.   :   1.0   Min.   : 51.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 575.5   1st Qu.: 70.00   1st Qu.:1.000   1st Qu.:4.000  
##  Median :1150.0   Median : 77.00   Median :2.000   Median :4.000  
##  Mean   :1150.0   Mean   : 77.49   Mean   :1.994   Mean   :3.898  
##  3rd Qu.:1724.5   3rd Qu.: 84.00   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :2299.0   Max.   :104.00   Max.   :4.000   Max.   :4.000  
##   body_height       MNAa_q3       body_weight       MNAb_tot    
##  Min.   :109.0   Min.   :0.000   Min.   :21.10   Min.   : 3.00  
##  1st Qu.:148.5   1st Qu.:2.000   1st Qu.:50.05   1st Qu.:11.50  
##  Median :153.4   Median :2.000   Median :56.30   Median :12.50  
##  Mean   :154.2   Mean   :1.979   Mean   :57.19   Mean   :12.07  
##  3rd Qu.:160.0   3rd Qu.:2.000   3rd Qu.:63.80   3rd Qu.:13.00  
##  Max.   :184.0   Max.   :2.000   Max.   :98.00   Max.   :16.00  
##      waist             bmi           MNAa_tot    
##  Min.   : 57.50   Min.   :13.44   Min.   : 4.00  
##  1st Qu.: 80.00   1st Qu.:21.57   1st Qu.:12.00  
##  Median : 86.00   Median :23.82   Median :13.00  
##  Mean   : 86.48   Mean   :23.98   Mean   :12.85  
##  3rd Qu.: 93.00   3rd Qu.:26.20   3rd Qu.:14.00  
##  Max.   :120.00   Max.   :56.47   Max.   :14.00  
##  Endocrine.Disease_Hyperlipidaemia  MMSE_class_2   
##  Min.   :0.0000                    Min.   :0.0000  
##  1st Qu.:0.0000                    1st Qu.:0.0000  
##  Median :0.0000                    Median :0.0000  
##  Mean   :0.2623                    Mean   :0.1857  
##  3rd Qu.:1.0000                    3rd Qu.:0.0000  
##  Max.   :1.0000                    Max.   :1.0000
# Check for missing values
missing_values <- sum(is.na(data))
missing_values
## [1] 0
# Identify and handle outliers using techniques like Winsorizing, or removing outliers.
# Example: Identify and winsorize outliers in Age
q1 <- quantile(data$Age, 0.25)
q3 <- quantile(data$Age, 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
data$Age[data$Age < lower_bound] <- lower_bound
data$Age[data$Age > upper_bound] <- upper_bound
cat("Q1:", q1, "\n")
## Q1: 70
cat("Q3:", q3, "\n")
## Q3: 84
cat("IQR:", iqr, "\n")
## IQR: 14
cat("Lower Bound:", lower_bound, "\n")
## Lower Bound: 49
cat("Upper Bound:", upper_bound, "\n")
## Upper Bound: 105
# Remove outliers
data <- data[data$Age >= lower_bound & data$Age <= upper_bound, ]
head(data)
##   X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 1 1  86            1        4       148.3       2        61.1     10.5  95.0
## 2 2  92            2        4       156.2       2        61.5     15.0  88.0
## 3 3  81            1        4       146.3       2        47.0     13.0  73.5
## 4 4  79            2        4       152.2       2        65.3     14.0  95.0
## 5 5  86            1        4       157.3       2        57.3     11.5  88.0
## 6 6  80            1        4       147.1       2        51.1     13.0  85.0
##     bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 1 27.76       12                                 1            0
## 2 25.19       14                                 0            0
## 3 21.96       13                                 0            0
## 4 28.17       14                                 0            0
## 5 23.16       13                                 1            0
## 6 23.59       14                                 1            1
summary(data$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   51.00   70.00   77.00   77.49   84.00  104.00
summary(data$body_height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   109.0   148.5   153.4   154.2   160.0   184.0
summary(data$body_weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.10   50.05   56.30   57.19   63.80   98.00
summary(data$MNAa_tot)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   12.00   13.00   12.85   14.00   14.00
summary(data$MNAb_tot)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   11.50   12.50   12.07   13.00   16.00
summary(data$waist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57.50   80.00   86.00   86.48   93.00  120.00
summary(data$BMI)
## Length  Class   Mode 
##      0   NULL   NULL
library(ggplot2)
# Bar plot for Hyperlipidaemia
ggplot(data, aes(x = factor(Endocrine.Disease_Hyperlipidaemia))) +
  geom_bar() +
  labs(title = "Distribution of Hyperlipidaemia", x = "Hyperlipidaemia", y = "Frequency")

ggplot(data, aes(x = factor(Mobility), fill = factor(Mobility))) +
  geom_bar() +
  labs(title = "Distribution of Mobility",
       x = "Mobility",
       y = "Frequency")

ggplot(data, aes(x = factor(Education_ID), fill = factor(Education_ID))) +
  geom_bar() +
  labs(title = "Distribution of Education",
       x = "Education",
       y = "Frequency")

ggplot(data, aes(x = factor(MNAa_q3), fill = factor(MNAa_q3))) +
  geom_bar() +
  labs(title = "Distribution of MNAa_q3",
       x = "MNAa_q3",
       y = "Frequency")

ggplot(data, aes(x = factor(MMSE_class_2), fill = factor(MMSE_class_2))) +
  geom_bar() +
  labs(title = "Distribution of MMSE_class_2",
       x = "MMSE_class_2",
       y = "Frequency")

# Create histograms for continuous variables
ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Distribution of Age",
       x = "Age",
       y = "Frequency")

ggplot(data, aes(x = body_height)) +
  geom_histogram(binwidth = 5, fill = "green", color = "black") +
  labs(title = "Distribution of Body Height",
       x = "Body Height (cm)",
       y = "Frequency")

ggplot(data, aes(x = body_weight)) +
  geom_histogram(binwidth = 5, fill = "red", color = "black") +
  labs(title = "Distribution of Body Weight",
       x = "Body Weight (kg)",
       y = "Frequency")

ggplot(data, aes(x = MNAa_tot)) +
  geom_histogram(binwidth = 1, fill = "purple", color = "black") +
  labs(title = "Distribution of MNAa_tot",
       x = "MNAa_tot",
       y = "Frequency")

ggplot(data, aes(x = MNAb_tot)) +
  geom_histogram(binwidth = 1, fill = "orange", color = "black") +
  labs(title = "Distribution of MNAb_tot",
       x = "MNAb_tot",
       y = "Frequency")

ggplot(data, aes(x = bmi)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(title = "Distribution of BMI",
       x = "BMI",
       y = "Frequency")

# Example: Scatter plot of Age vs. BMI
ggplot(data, aes(x = Age, y = bmi)) +
  geom_point(color = "red") +
  labs(title = "Scatter plot of Age vs. BMI", x = "Age", y = "BMI")

# Correlation matrix
correlation_matrix <- cor(data[c("Age", "body_height", "body_weight", "MNAa_tot", "MNAb_tot", "waist", "bmi")])
print(correlation_matrix)
##                     Age body_height body_weight    MNAa_tot   MNAb_tot
## Age          1.00000000 -0.23968124  -0.1937651 -0.17962709 -0.1632691
## body_height -0.23968124  1.00000000   0.5584275  0.08264676  0.1574497
## body_weight -0.19376511  0.55842745   1.0000000  0.45395095  0.2446275
## MNAa_tot    -0.17962709  0.08264676   0.4539510  1.00000000  0.3886480
## MNAb_tot    -0.16326910  0.15744971   0.2446275  0.38864797  1.0000000
## waist        0.07347450  0.18027088   0.7863016  0.41440017  0.1641434
## bmi         -0.06046809 -0.05468356   0.7900238  0.48800362  0.1868191
##                 waist         bmi
## Age         0.0734745 -0.06046809
## body_height 0.1802709 -0.05468356
## body_weight 0.7863016  0.79002380
## MNAa_tot    0.4144002  0.48800362
## MNAb_tot    0.1641434  0.18681910
## waist       1.0000000  0.80725899
## bmi         0.8072590  1.00000000
# Example: Box plot of Age by Education level
ggplot(data, aes(x = factor(Education_ID), y = Age)) +
  geom_boxplot(fill = "blue", color = "black") +
  labs(title = "Age by Education Level", x = "Education Level", y = "Age")

# Splitting the data into training and testing sets
set.seed(123)
trainIndex <- sample(1:nrow(data), 0.8*nrow(data))
trainData <- data[trainIndex,]
testData <- data[-trainIndex,]
head(trainData)
##         X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot
## 2227 2227  69            1        4       144.4       2        53.5     13.0
## 526   526  73            2        4       163.2       2        48.5      9.5
## 195   195  90            2        4       163.6       2        55.7     13.5
## 1842 1842  70            2        4       151.0       2        58.6     12.0
## 1142 1142  69            4        4       164.5       2        68.5     13.0
## 1253 1253  64            2        4       150.0       2        57.7     10.0
##      waist   bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 2227    90 25.63       13                                 1            0
## 526     73 18.19       10                                 1            0
## 195     89 20.79       12                                 0            0
## 1842    97 25.68       14                                 1            0
## 1142    95 25.31       14                                 0            0
## 1253    89 25.64       12                                 0            0
head(testData)
##     X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 3   3  81            1        4       146.3       2        47.0     13.0  73.5
## 15 15  88            2        4       134.1       2        55.9     12.0 109.0
## 21 21  98            1        4       143.0       2        59.7     11.5 104.0
## 22 22  83            1        4       162.5       2        85.2     12.0 117.0
## 28 28  80            2        4       146.0       2        48.8     12.5  85.0
## 42 42  91            2        4       169.0       2        69.2     11.5  85.5
##      bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 3  21.96       13                                 0            0
## 15 31.06       14                                 1            1
## 21 29.19       14                                 0            1
## 22 32.27       13                                 0            1
## 28 22.87       13                                 0            0
## 42 24.23       14                                 0            0
# Fitting the Logistic Regression model
logistic_model <- glm(MMSE_class_2 ~ Age + body_height + body_weight + Education_ID + Mobility + MNAa_tot + MNAb_tot + waist + bmi, data = trainData, family = "binomial")
# Making predictions
logistic_predictions <- predict(logistic_model, newdata = testData, type = "response")
# Converting probabilities to binary predictions
logistic_predictions_binary <- ifelse(logistic_predictions > 0.5, 1, 0)

# Evaluating model performance
logistic_accuracy <- sum(logistic_predictions_binary == testData$MMSE_class_2) / length(testData$MMSE_class_2)
logistic_accuracy
## [1] 0.8130435
# Fitting the Random Forest model
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(123)
rf_model <- randomForest(factor(MMSE_class_2) ~ Age + body_height + body_weight + Education_ID + Mobility + MNAa_tot + MNAb_tot + waist + bmi, data = trainData)
# Making predictions
rf_predictions <- predict(rf_model, newdata = testData, type = "response")

rf_predictions_numeric <- as.numeric(as.character(rf_predictions))

rf_predictions_binary <- ifelse(rf_predictions_numeric > 0.5, 1, 0)
# Evaluating model performance
rf_accuracy <- sum(rf_predictions_binary == testData$MMSE_class_2) / length(testData$MMSE_class_2)
rf_accuracy
## [1] 0.8304348
# Calculating confusion matrix for Logistic Regression
conf_matrix_logistic <- table(logistic_predictions_binary, testData$MMSE_class_2)
conf_matrix_logistic
##                            
## logistic_predictions_binary   0   1
##                           0 340  75
##                           1  11  34
# Calculating confusion matrix for Random Forest
conf_matrix_rf <- table(rf_predictions_binary, testData$MMSE_class_2)
conf_matrix_rf
##                      
## rf_predictions_binary   0   1
##                     0 344  71
##                     1   7  38
# Define a function to calculate precision, recall, and F1-score
calculate_metrics <- function(conf_matrix) {
  tp <- conf_matrix[2,2]
  fp <- conf_matrix[1,2]
  fn <- conf_matrix[2,1]
  precision <- tp / (tp + fp)
  recall <- tp / (tp + fn)
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  metrics <- c(Precision = precision, Recall = recall, F1_Score = f1_score)
  return(metrics)
}

# Calculate metrics for Logistic Regression
metrics_logistic <- calculate_metrics(conf_matrix_logistic)

# Calculate metrics for Random Forest
metrics_rf <- calculate_metrics(conf_matrix_rf)
metrics_rf
## Precision    Recall  F1_Score 
## 0.3486239 0.8444444 0.4935065