Project Overview:
Objective: To perform a data science analysis on a real-world dementia dataset using R.
Scope: The dataset is obtained from a mobile healthcare service provided in collaboration with non-governmental organizations that manage elderly care centers in various districts of Hong Kong. The service was offered for free from 2008 to 2018.
Dataset Content: The dataset contains individual cases with various variables, including age, body height, body weight, education level, mobility, Mini Nutritional Assessment outcomes, and more.
Outcome Prediction Labels: The prediction labels are based on the categories of the Mini Mental State Exam class (MMSE_class_2). The MMSE is a widely used screening tool for assessing cognitive impairment and dementia in adults.
Source: Mention where the dataset was obtained from. In this case, it’s mentioned that the dataset is from a mobile healthcare service in collaboration with NGOs in Hong Kong.
Characteristics: Highlight any key characteristics or features of the dataset that are relevant to the analysis.
# load csv file
data <- read.csv("data.csv")
head(data)
## X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 1 1 86 1 4 148.3 2 61.1 10.5 95.0
## 2 2 92 2 4 156.2 2 61.5 15.0 88.0
## 3 3 81 1 4 146.3 2 47.0 13.0 73.5
## 4 4 79 2 4 152.2 2 65.3 14.0 95.0
## 5 5 86 1 4 157.3 2 57.3 11.5 88.0
## 6 6 80 1 4 147.1 2 51.1 13.0 85.0
## bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 1 27.76 12 1 0
## 2 25.19 14 0 0
## 3 21.96 13 0 0
## 4 28.17 14 0 0
## 5 23.16 13 1 0
## 6 23.59 14 1 1
summary(data)
## X Age Education_ID Mobility
## Min. : 1.0 Min. : 51.00 Min. :1.000 Min. :1.000
## 1st Qu.: 575.5 1st Qu.: 70.00 1st Qu.:1.000 1st Qu.:4.000
## Median :1150.0 Median : 77.00 Median :2.000 Median :4.000
## Mean :1150.0 Mean : 77.49 Mean :1.994 Mean :3.898
## 3rd Qu.:1724.5 3rd Qu.: 84.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :2299.0 Max. :104.00 Max. :4.000 Max. :4.000
## body_height MNAa_q3 body_weight MNAb_tot
## Min. :109.0 Min. :0.000 Min. :21.10 Min. : 3.00
## 1st Qu.:148.5 1st Qu.:2.000 1st Qu.:50.05 1st Qu.:11.50
## Median :153.4 Median :2.000 Median :56.30 Median :12.50
## Mean :154.2 Mean :1.979 Mean :57.19 Mean :12.07
## 3rd Qu.:160.0 3rd Qu.:2.000 3rd Qu.:63.80 3rd Qu.:13.00
## Max. :184.0 Max. :2.000 Max. :98.00 Max. :16.00
## waist bmi MNAa_tot
## Min. : 57.50 Min. :13.44 Min. : 4.00
## 1st Qu.: 80.00 1st Qu.:21.57 1st Qu.:12.00
## Median : 86.00 Median :23.82 Median :13.00
## Mean : 86.48 Mean :23.98 Mean :12.85
## 3rd Qu.: 93.00 3rd Qu.:26.20 3rd Qu.:14.00
## Max. :120.00 Max. :56.47 Max. :14.00
## Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.2623 Mean :0.1857
## 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
# Check for missing values
missing_values <- sum(is.na(data))
missing_values
## [1] 0
# Identify and handle outliers using techniques like Winsorizing, or removing outliers.
# Example: Identify and winsorize outliers in Age
q1 <- quantile(data$Age, 0.25)
q3 <- quantile(data$Age, 0.75)
iqr <- q3 - q1
lower_bound <- q1 - 1.5 * iqr
upper_bound <- q3 + 1.5 * iqr
data$Age[data$Age < lower_bound] <- lower_bound
data$Age[data$Age > upper_bound] <- upper_bound
cat("Q1:", q1, "\n")
## Q1: 70
cat("Q3:", q3, "\n")
## Q3: 84
cat("IQR:", iqr, "\n")
## IQR: 14
cat("Lower Bound:", lower_bound, "\n")
## Lower Bound: 49
cat("Upper Bound:", upper_bound, "\n")
## Upper Bound: 105
# Remove outliers
data <- data[data$Age >= lower_bound & data$Age <= upper_bound, ]
head(data)
## X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 1 1 86 1 4 148.3 2 61.1 10.5 95.0
## 2 2 92 2 4 156.2 2 61.5 15.0 88.0
## 3 3 81 1 4 146.3 2 47.0 13.0 73.5
## 4 4 79 2 4 152.2 2 65.3 14.0 95.0
## 5 5 86 1 4 157.3 2 57.3 11.5 88.0
## 6 6 80 1 4 147.1 2 51.1 13.0 85.0
## bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 1 27.76 12 1 0
## 2 25.19 14 0 0
## 3 21.96 13 0 0
## 4 28.17 14 0 0
## 5 23.16 13 1 0
## 6 23.59 14 1 1
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 51.00 70.00 77.00 77.49 84.00 104.00
summary(data$body_height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 109.0 148.5 153.4 154.2 160.0 184.0
summary(data$body_weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.10 50.05 56.30 57.19 63.80 98.00
summary(data$MNAa_tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 12.00 13.00 12.85 14.00 14.00
summary(data$MNAb_tot)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 11.50 12.50 12.07 13.00 16.00
summary(data$waist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57.50 80.00 86.00 86.48 93.00 120.00
summary(data$BMI)
## Length Class Mode
## 0 NULL NULL
library(ggplot2)
# Bar plot for Hyperlipidaemia
ggplot(data, aes(x = factor(Endocrine.Disease_Hyperlipidaemia))) +
geom_bar() +
labs(title = "Distribution of Hyperlipidaemia", x = "Hyperlipidaemia", y = "Frequency")
ggplot(data, aes(x = factor(Mobility), fill = factor(Mobility))) +
geom_bar() +
labs(title = "Distribution of Mobility",
x = "Mobility",
y = "Frequency")
ggplot(data, aes(x = factor(Education_ID), fill = factor(Education_ID))) +
geom_bar() +
labs(title = "Distribution of Education",
x = "Education",
y = "Frequency")
ggplot(data, aes(x = factor(MNAa_q3), fill = factor(MNAa_q3))) +
geom_bar() +
labs(title = "Distribution of MNAa_q3",
x = "MNAa_q3",
y = "Frequency")
ggplot(data, aes(x = factor(MMSE_class_2), fill = factor(MMSE_class_2))) +
geom_bar() +
labs(title = "Distribution of MMSE_class_2",
x = "MMSE_class_2",
y = "Frequency")
# Create histograms for continuous variables
ggplot(data, aes(x = Age)) +
geom_histogram(binwidth = 5, fill = "blue", color = "black") +
labs(title = "Distribution of Age",
x = "Age",
y = "Frequency")
ggplot(data, aes(x = body_height)) +
geom_histogram(binwidth = 5, fill = "green", color = "black") +
labs(title = "Distribution of Body Height",
x = "Body Height (cm)",
y = "Frequency")
ggplot(data, aes(x = body_weight)) +
geom_histogram(binwidth = 5, fill = "red", color = "black") +
labs(title = "Distribution of Body Weight",
x = "Body Weight (kg)",
y = "Frequency")
ggplot(data, aes(x = MNAa_tot)) +
geom_histogram(binwidth = 1, fill = "purple", color = "black") +
labs(title = "Distribution of MNAa_tot",
x = "MNAa_tot",
y = "Frequency")
ggplot(data, aes(x = MNAb_tot)) +
geom_histogram(binwidth = 1, fill = "orange", color = "black") +
labs(title = "Distribution of MNAb_tot",
x = "MNAb_tot",
y = "Frequency")
ggplot(data, aes(x = bmi)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = "Distribution of BMI",
x = "BMI",
y = "Frequency")
# Example: Scatter plot of Age vs. BMI
ggplot(data, aes(x = Age, y = bmi)) +
geom_point(color = "red") +
labs(title = "Scatter plot of Age vs. BMI", x = "Age", y = "BMI")
# Correlation matrix
correlation_matrix <- cor(data[c("Age", "body_height", "body_weight", "MNAa_tot", "MNAb_tot", "waist", "bmi")])
print(correlation_matrix)
## Age body_height body_weight MNAa_tot MNAb_tot
## Age 1.00000000 -0.23968124 -0.1937651 -0.17962709 -0.1632691
## body_height -0.23968124 1.00000000 0.5584275 0.08264676 0.1574497
## body_weight -0.19376511 0.55842745 1.0000000 0.45395095 0.2446275
## MNAa_tot -0.17962709 0.08264676 0.4539510 1.00000000 0.3886480
## MNAb_tot -0.16326910 0.15744971 0.2446275 0.38864797 1.0000000
## waist 0.07347450 0.18027088 0.7863016 0.41440017 0.1641434
## bmi -0.06046809 -0.05468356 0.7900238 0.48800362 0.1868191
## waist bmi
## Age 0.0734745 -0.06046809
## body_height 0.1802709 -0.05468356
## body_weight 0.7863016 0.79002380
## MNAa_tot 0.4144002 0.48800362
## MNAb_tot 0.1641434 0.18681910
## waist 1.0000000 0.80725899
## bmi 0.8072590 1.00000000
# Example: Box plot of Age by Education level
ggplot(data, aes(x = factor(Education_ID), y = Age)) +
geom_boxplot(fill = "blue", color = "black") +
labs(title = "Age by Education Level", x = "Education Level", y = "Age")
# Splitting the data into training and testing sets
set.seed(123)
trainIndex <- sample(1:nrow(data), 0.8*nrow(data))
trainData <- data[trainIndex,]
testData <- data[-trainIndex,]
head(trainData)
## X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot
## 2227 2227 69 1 4 144.4 2 53.5 13.0
## 526 526 73 2 4 163.2 2 48.5 9.5
## 195 195 90 2 4 163.6 2 55.7 13.5
## 1842 1842 70 2 4 151.0 2 58.6 12.0
## 1142 1142 69 4 4 164.5 2 68.5 13.0
## 1253 1253 64 2 4 150.0 2 57.7 10.0
## waist bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 2227 90 25.63 13 1 0
## 526 73 18.19 10 1 0
## 195 89 20.79 12 0 0
## 1842 97 25.68 14 1 0
## 1142 95 25.31 14 0 0
## 1253 89 25.64 12 0 0
head(testData)
## X Age Education_ID Mobility body_height MNAa_q3 body_weight MNAb_tot waist
## 3 3 81 1 4 146.3 2 47.0 13.0 73.5
## 15 15 88 2 4 134.1 2 55.9 12.0 109.0
## 21 21 98 1 4 143.0 2 59.7 11.5 104.0
## 22 22 83 1 4 162.5 2 85.2 12.0 117.0
## 28 28 80 2 4 146.0 2 48.8 12.5 85.0
## 42 42 91 2 4 169.0 2 69.2 11.5 85.5
## bmi MNAa_tot Endocrine.Disease_Hyperlipidaemia MMSE_class_2
## 3 21.96 13 0 0
## 15 31.06 14 1 1
## 21 29.19 14 0 1
## 22 32.27 13 0 1
## 28 22.87 13 0 0
## 42 24.23 14 0 0
# Fitting the Logistic Regression model
logistic_model <- glm(MMSE_class_2 ~ Age + body_height + body_weight + Education_ID + Mobility + MNAa_tot + MNAb_tot + waist + bmi, data = trainData, family = "binomial")
# Making predictions
logistic_predictions <- predict(logistic_model, newdata = testData, type = "response")
# Converting probabilities to binary predictions
logistic_predictions_binary <- ifelse(logistic_predictions > 0.5, 1, 0)
# Evaluating model performance
logistic_accuracy <- sum(logistic_predictions_binary == testData$MMSE_class_2) / length(testData$MMSE_class_2)
logistic_accuracy
## [1] 0.8130435
# Fitting the Random Forest model
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
rf_model <- randomForest(factor(MMSE_class_2) ~ Age + body_height + body_weight + Education_ID + Mobility + MNAa_tot + MNAb_tot + waist + bmi, data = trainData)
# Making predictions
rf_predictions <- predict(rf_model, newdata = testData, type = "response")
rf_predictions_numeric <- as.numeric(as.character(rf_predictions))
rf_predictions_binary <- ifelse(rf_predictions_numeric > 0.5, 1, 0)
# Evaluating model performance
rf_accuracy <- sum(rf_predictions_binary == testData$MMSE_class_2) / length(testData$MMSE_class_2)
rf_accuracy
## [1] 0.8304348
# Calculating confusion matrix for Logistic Regression
conf_matrix_logistic <- table(logistic_predictions_binary, testData$MMSE_class_2)
conf_matrix_logistic
##
## logistic_predictions_binary 0 1
## 0 340 75
## 1 11 34
# Calculating confusion matrix for Random Forest
conf_matrix_rf <- table(rf_predictions_binary, testData$MMSE_class_2)
conf_matrix_rf
##
## rf_predictions_binary 0 1
## 0 344 71
## 1 7 38
# Define a function to calculate precision, recall, and F1-score
calculate_metrics <- function(conf_matrix) {
tp <- conf_matrix[2,2]
fp <- conf_matrix[1,2]
fn <- conf_matrix[2,1]
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
f1_score <- 2 * (precision * recall) / (precision + recall)
metrics <- c(Precision = precision, Recall = recall, F1_Score = f1_score)
return(metrics)
}
# Calculate metrics for Logistic Regression
metrics_logistic <- calculate_metrics(conf_matrix_logistic)
# Calculate metrics for Random Forest
metrics_rf <- calculate_metrics(conf_matrix_rf)
metrics_rf
## Precision Recall F1_Score
## 0.3486239 0.8444444 0.4935065