library(dplyr)
library(ggplot2)
library(corrplot)
data <- read.csv("diabetesprojectdata.csv")
glimpse(data)
# Summary statistics per numeric column (passing na.rm through across()'s ... is deprecated;
# use lambdas instead)
data %>% summarise(across(where(is.numeric),
                          list(mean = ~mean(.x, na.rm = TRUE),
                               sd   = ~sd(.x, na.rm = TRUE),
                               min  = ~min(.x, na.rm = TRUE),
                               max  = ~max(.x, na.rm = TRUE))))
data %>% summarise(across(everything(), ~sum(is.na(.))))
# Histogram of each numeric column (aes_string() is deprecated; use .data[[col]])
numeric_cols <- colnames(data)[sapply(data, is.numeric)]
for (col in numeric_cols) {
  print(
    ggplot(data, aes(x = .data[[col]])) +
      geom_histogram(binwidth = 10, fill = "blue", color = "black", alpha = 0.7) +
      labs(title = paste("Distribution of", col), x = col, y = "Frequency")
  )
}
cor_matrix <- cor(data %>% select(where(is.numeric)), use = "complete.obs")
print(cor_matrix)
corrplot::corrplot(cor_matrix, method = "circle")
ggplot(data, aes(x = Glucose, y = BMI, color = as.factor(Outcome))) +
  geom_point(alpha = 0.7) +
  labs(title = "Glucose vs BMI by Outcome", x = "Glucose", y = "BMI")
data <- data %>% mutate(AgeGroup = case_when(
  Age < 30 ~ "Under 30",
  Age >= 30 & Age < 50 ~ "30-49",
  Age >= 50 ~ "50 and above"
))
data %>%
  group_by(AgeGroup, Outcome) %>%
  summarise(Count = n(), .groups = "drop") %>%
  ggplot(aes(x = AgeGroup, y = Count, fill = as.factor(Outcome))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Age Group Distribution by Outcome", x = "Age Group", y = "Count")
library(caret)
X <- data[, setdiff(names(data), "Outcome")]  # feature matrix (kept for reference; models below use the formula interface)
y <- data$Outcome
set.seed(123)
trainIndex <- createDataPartition(data$Outcome, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test  <- data[-trainIndex, ]
train$Outcome <- factor(train$Outcome, levels = c(0, 1), labels = c("non.diabetic", "diabetic"))
test$Outcome  <- factor(test$Outcome,  levels = c(0, 1), labels = c("non.diabetic", "diabetic"))
logistic_model <- glm(Outcome ~ ., data = train, family = binomial)
# alternative formula: Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction + BloodPressure
summary(logistic_model)
predicted_probs <- predict(logistic_model, newdata = test, type = "response")
# glm models the probability of the second factor level ("diabetic"), so the
# thresholded predictions must carry the same labels as test$Outcome
predicted_classes <- factor(ifelse(predicted_probs > 0.5, "diabetic", "non.diabetic"),
                            levels = levels(test$Outcome))
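# Not in the original script (which only evaluates the SVM): a minimal sketch
# of scoring the logistic model on the held-out set with caret's confusionMatrix().
confusionMatrix(predicted_classes, test$Outcome)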
library(rpart)
tree_model <- rpart(Outcome ~ ., data = train, method = “class”)
summary(tree_model)
library(rpart.plot)
rpart.plot(tree_model)
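# Sketch (assumption: the same train/test split is reused): classify the test
# set with the fitted tree and summarise its accuracy.
tree_predictions <- predict(tree_model, newdata = test, type = "class")
confusionMatrix(tree_predictions, test$Outcome)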
library(e1071)
svm_model <- svm(Outcome ~ ., data = train, type = "C-classification", kernel = "radial")
summary(svm_model)
svm_predictions <- predict(svm_model, newdata = test)
confusionMatrix(svm_predictions, test$Outcome)
dataReg <- subset(data, select = -c(Outcome)) # Exclude the binary outcome column
set.seed(123)  # for reproducibility
trainIndex <- createDataPartition(dataReg$BMI, p = 0.8, list = FALSE)
trainData <- dataReg[trainIndex, ]
testData  <- dataReg[-trainIndex, ]
library(randomForest)
lm_model <- train(BMI ~ ., data = trainData, method = "lm")
cat("Linear Regression Summary:\n")
summary(lm_model$finalModel)
lm_pred <- predict(lm_model, newdata = testData)
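# Sketch: quantify held-out error with RMSE. rmse() is a small helper defined
# here for illustration; it is not part of the original script.
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
cat("Linear Regression test RMSE:", rmse(testData$BMI, lm_pred), "\n")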
rf_model <- train(BMI ~ ., data = trainData, method = "rf",
                  tuneGrid = expand.grid(mtry = seq(2, ncol(trainData) - 1, by = 1)),
                  trControl = trainControl(method = "cv", number = 5))
cat("Random Forest Best Parameters:\n")
print(rf_model$bestTune)
rf_pred <- predict(rf_model, newdata = testData)
xgb_model <- train(BMI ~ ., data = trainData, method = "xgbLinear",
                   tuneGrid = expand.grid(nrounds = seq(50, 200, by = 50),
                                          lambda = c(0, 0.1, 1),
                                          alpha  = c(0, 0.1, 1),
                                          eta    = c(0.01, 0.1, 0.3)),
                   trControl = trainControl(method = "cv", number = 5))
cat("XGBoost Best Parameters:\n")
print(xgb_model$bestTune)
xgb_pred <- predict(xgb_model, newdata = testData)
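# Sketch: compare test RMSE across the three regression models, reusing the
# illustrative rmse() helper defined above (an assumption, not part of the original).
cat("Random Forest test RMSE:", rmse(testData$BMI, rf_pred), "\n")
cat("XGBoost test RMSE:", rmse(testData$BMI, xgb_pred), "\n")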