# Load necessary libraries
# dplyr: data manipulation; ggplot2: plotting; corrplot: correlation plots.
library(dplyr)
library(ggplot2)
library(corrplot)

# Load the dataset
# The path is resolved relative to the current working directory.
data <- read.csv("diabetesprojectdata.csv")

# Glimpse of the data: column names, types and the first few values
glimpse(data)

# Summary statistics for all variables
# na.rm is set inside each lambda because forwarding extra arguments through
# across(...) is deprecated in dplyr >= 1.1.0.
data %>%
  summarise(
    across(
      everything(),
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE),
        min = ~ min(.x, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      )
    )
  )

# Check for missing values: number of NA entries in each column
data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))

# Histogram for numerical variables
# vapply() always returns a logical vector, even when `data` has no columns.
numeric_cols <- colnames(data)[vapply(data, is.numeric, logical(1))]

for (col in numeric_cols) {
  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    # .data[[col]] replaces the deprecated aes_string() for string column names.
    ggplot(data, aes(x = .data[[col]])) +
      # A bin count adapts to each variable's range; a fixed binwidth of 10
      # collapses small-range columns (e.g. the 0/1 Outcome) into a single bar.
      geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
      labs(title = paste("Distribution of", col), x = col, y = "Frequency")
  )
}

# Correlation matrix
# Correlations between the numeric columns; "complete.obs" drops every row
# that has a missing value in any of them.
cor_matrix <- cor(data %>% select(where(is.numeric)), use = "complete.obs")
print(cor_matrix)

# Visualize correlations
library(corrplot)
corrplot::corrplot(cor_matrix, method = "circle")

# Scatter plot between Glucose and BMI grouped by Outcome
# Outcome is converted to a factor so it maps to discrete colours.
ggplot(data, aes(x = Glucose, y = BMI, color = as.factor(Outcome))) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Glucose vs BMI by Outcome",
    x = "Glucose",
    y = "BMI",
    color = "Outcome" # readable legend title instead of "as.factor(Outcome)"
  )

# Categorize Age into groups
# Adds a character column AgeGroup to `data`. Rows matching none of the
# conditions (e.g. missing Age) get NA.
data <- data %>%
  mutate(
    AgeGroup = case_when(
      Age < 30 ~ "Under 30",
      Age >= 30 & Age < 50 ~ "30-49",
      Age >= 50 ~ "50 and above"
    )
  )

# Age group distribution by Outcome
data %>%
  # count() tallies rows per combination and returns an ungrouped result.
  count(AgeGroup, Outcome, name = "Count") %>%
  # Fix the level order so the bars appear youngest to oldest rather than
  # in alphabetical order.
  mutate(
    AgeGroup = factor(AgeGroup, levels = c("Under 30", "30-49", "50 and above"))
  ) %>%
  ggplot(aes(x = AgeGroup, y = Count, fill = as.factor(Outcome))) +
  geom_col(position = "dodge") +
  labs(
    title = "Age Group Distribution by Outcome",
    x = "Age Group",
    y = "Count",
    fill = "Outcome"
  )

# Load caret (provides createDataPartition(), train() and confusionMatrix())
library(caret)

# Select all columns except the target variable 'Outcome'
# drop = FALSE keeps X a data frame even if only one predictor column remains.
# NOTE(review): at this point `data` also contains the derived AgeGroup column.
X <- data[, setdiff(names(data), "Outcome"), drop = FALSE]

# Select the target variable 'Outcome'
y <- data$Outcome

# Set seed for reproducibility
set.seed(123)

# Stratified train-test split
# createDataPartition() samples within classes only when y is a factor; a
# numeric vector is binned by percentiles instead, which for a 0/1 column does
# not separate the classes. Passing a factor makes the 80/20 split stratified.
trainIndex <- createDataPartition(factor(data$Outcome), p = 0.8, list = FALSE)

# Create training and testing sets
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

# Recode the 0/1 outcome as a labelled factor, identically in both sets.
outcome_levels <- c(0, 1)
outcome_labels <- c("non.diabetic", "diabetic")
train$Outcome <- factor(
  train$Outcome,
  levels = outcome_levels,
  labels = outcome_labels
)
test$Outcome <- factor(
  test$Outcome,
  levels = outcome_levels,
  labels = outcome_labels
)

# Logistic regression of Outcome on all other columns of `train`
# Reduced formula left by the author as an alternative:
#   Pregnancies + Glucose + BMI + DiabetesPedigreeFunction + BloodPressure
logistic_model <- glm(Outcome ~ ., data = train, family = binomial)

# Model summary
summary(logistic_model)

# Predict probabilities and classes
# With a factor response, glm() treats the first level ("non.diabetic") as
# failure, so the predicted probability is that of "diabetic".
predicted_probs <- predict(logistic_model, newdata = test, type = "response")
# 0.5 cutoff: 1 = diabetic, 0 = non-diabetic.
predicted_classes <- ifelse(predicted_probs > 0.5, 1, 0)

library(rpart)

# Train a decision tree model
# method = "class" requests a classification tree for the factor Outcome.
tree_model <- rpart(Outcome ~ ., data = train, method = "class")

# View the summary of the model
summary(tree_model)

# To plot the decision tree
library(rpart.plot)
rpart.plot(tree_model)

# Load e1071 (provides svm())
library(e1071)

# Train an SVM model without tuning (default settings)
svm_model <- svm(
  Outcome ~ .,
  data = train,
  type = "C-classification",
  kernel = "radial"
)

# View the model summary
summary(svm_model)

# Predict on the test set
svm_predictions <- predict(svm_model, newdata = test)

# Confusion matrix to evaluate model performance
# Arguments are (predicted, reference).
confusionMatrix(svm_predictions, test$Outcome)

# Select the features and target variable
# We will predict BMI using other features
dataReg <- subset(data, select = -c(Outcome)) # Exclude the binary outcome column

# Split the data into training and testing sets
# Each statement sits on its own line; when they shared a line with the
# trailing comment, the split assignments were swallowed by that comment.
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(dataReg$BMI, p = 0.8, list = FALSE)
trainData <- dataReg[trainIndex, ]
testData <- dataReg[-trainIndex, ]

# Load the randomForest package
library(randomForest)

# Train a linear regression model
# Fitted through caret's train() with BMI as the response.
lm_model <- train(BMI ~ ., data = trainData, method = "lm")

# View the model summaries
# The trailing newline keeps the header separate from the summary output.
cat("Linear Regression Summary:\n")
summary(lm_model$finalModel)

# Make predictions on the test set
lm_pred <- predict(lm_model, newdata = testData)

# Train a random forest regression model
# mtry is tuned over 2 .. (number of columns in trainData - 1) with
# 5-fold cross-validation.
rf_model <- train(
  BMI ~ .,
  data = trainData,
  method = "rf",
  tuneGrid = expand.grid(.mtry = seq(2, ncol(trainData) - 1, by = 1)),
  trControl = trainControl(method = "cv", number = 5)
)

cat("Forest Model Parameters:\n")
print(rf_model$bestTune)

# Predict BMI on the held-out test set
rf_pred <- predict(rf_model, newdata = testData)

# Train an XGBoost regression model
# Grid search over nrounds, lambda, alpha and eta with 5-fold cross-validation.
xgb_model <- train(
  BMI ~ .,
  data = trainData,
  method = "xgbLinear",
  tuneGrid = expand.grid(
    .nrounds = seq(50, 200, by = 50),
    .lambda = c(0, 0.1, 1),
    .alpha = c(0, 0.1, 1),
    .eta = c(0.01, 0.1, 0.3)
  ),
  trControl = trainControl(method = "cv", number = 5)
)

cat("Model Best Parameters:\n")
print(xgb_model$bestTune)

# Predict BMI on the held-out test set
xgb_pred <- predict(xgb_model, newdata = testData)

# Project
#
# Hafizah
#
# 2025-01-12