I completed this assignment based on the Week 6 lab notes.
# Load necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the dataset
obesity <- read.csv("C:\\Users\\saisr\\Downloads\\statistics using R\\estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition\\obesity.csv")
# View the first few rows of the dataset
head(obesity)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
The company classifies individuals into obesity categories (e.g., underweight, normal weight, overweight, or obese) based on demographic, physical, and lifestyle factors. This classification will help:

- Health and wellness companies aiming to provide personalized obesity management strategies.
- Dietitians and fitness trainers, who can use the data to tailor personalized dietary and workout plans for their clients.
- Health insurers, who can use the data to assess risks associated with obesity and design insurance policies accordingly.

The UCI Obesity Dataset includes attributes such as Gender, Age, Height, Weight, family_history_with_overweight, and eating and lifestyle habits (FAVC, FCVC, NCP, CAEC, SMOKE, CH2O, SCC, FAF, TUE, CALC, MTRANS), together with the obesity level label NObeyesdad.

Success will be measured by classification performance on a held-out test set, using metrics such as accuracy, precision, recall, F1-score, and ROC-AUC (see Section 3).

The lab uses basic models such as logistic regression or decision trees without deeper exploration. Potential shortcomings, addressed in the numbered sections below, are the lack of explicit feature selection, no handling of class imbalance, and sole reliance on accuracy for evaluation.
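Since class imbalance is one of the concerns addressed below, a quick look at the class counts is useful before modelling. This is a minimal sketch using the obesity data frame loaded above, not part of the original lab code:
# Distribution of the target used in this report (Gender) and of the obesity label
table(obesity$Gender)
table(obesity$NObeyesdad)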
## 1 Feature Selection and Engineering
# Load required libraries
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Convert target variable 'Gender' to a factor for classification
obesity$Gender <- as.factor(obesity$Gender)
# Define predictor variables (all columns except 'Gender')
predictors <- obesity[, !names(obesity) %in% "Gender"]
# Define the target variable
target <- obesity$Gender
# Fit a random forest model to determine feature importance
rf_model <- randomForest(predictors, target, importance = TRUE)
# Print the importance of each feature
print(rf_model$importance)
## Female Male MeanDecreaseAccuracy
## Age 0.0410611300 3.007294e-02 3.551229e-02
## Height 0.1931341222 1.576535e-01 1.750837e-01
## Weight 0.1842265194 1.097658e-01 1.465963e-01
## family_history_with_overweight 0.0124381994 8.890179e-03 1.063526e-02
## FAVC 0.0018143012 8.966803e-04 1.345695e-03
## FCVC 0.1173114228 4.547788e-02 8.090677e-02
## NCP 0.0237439892 1.278535e-02 1.821531e-02
## CAEC 0.0133175987 8.023460e-03 1.063173e-02
## SMOKE 0.0001596736 9.203189e-07 8.081875e-05
## CH2O 0.0218017250 2.156011e-02 2.167183e-02
## SCC 0.0010398854 1.101589e-03 1.069188e-03
## FAF 0.0324812515 1.547392e-02 2.391810e-02
## TUE 0.0275297370 9.668658e-03 1.849097e-02
## CALC 0.0120518016 6.638550e-03 9.300916e-03
## MTRANS 0.0123819071 4.373122e-03 8.325643e-03
## NObeyesdad 0.1200982264 6.355443e-02 9.147538e-02
## MeanDecreaseGini
## Age 66.558691
## Height 332.453392
## Weight 221.003224
## family_history_with_overweight 15.034904
## FAVC 6.894314
## FCVC 113.293256
## NCP 35.063217
## CAEC 18.638783
## SMOKE 2.098514
## CH2O 45.386595
## SCC 4.194708
## FAF 49.514057
## TUE 32.286566
## CALC 14.376022
## MTRANS 13.035739
## NObeyesdad 80.787493
# Optionally, plot the feature importance
varImpPlot(rf_model)
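One way to act on the importance scores above is to keep only the highest-ranked predictors and refit the model. The sketch below is a minimal illustration using the rf_model and predictors objects from the chunk above; the cutoff of eight features is an arbitrary choice for demonstration, not something prescribed by the lab.
# Rank features by mean decrease in accuracy and keep the top 8 (arbitrary cutoff)
imp <- importance(rf_model, type = 1)
top_features <- rownames(imp)[order(imp[, 1], decreasing = TRUE)][1:8]
print(top_features)
# Refit the random forest on the reduced predictor set
rf_reduced <- randomForest(predictors[, top_features], target, importance = TRUE)
print(rf_reduced)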
## 2 Addressing Class Imbalance
# Specify a CRAN mirror
install.packages("caret", repos = "https://cran.rstudio.com/")
## Installing package into 'C:/Users/saisr/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'caret'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\saisr\AppData\Local\R\win-library\4.4\00LOCK\caret\libs\x64\caret.dll
## to C:\Users\saisr\AppData\Local\R\win-library\4.4\caret\libs\x64\caret.dll:
## Permission denied
## Warning: restored 'caret'
##
## The downloaded binary packages are in
## C:\Users\saisr\AppData\Local\Temp\RtmpaMcX7b\downloaded_packages
# Load the caret package
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
# The obesity data frame was already loaded above with read.csv(), so no data() call is needed
# Set seed for reproducibility
set.seed(42)
# Split the dataset into training and testing sets (for example)
trainIndex <- createDataPartition(obesity$Gender, p = 0.8,
                                  list = FALSE, times = 1)
train_data <- obesity[ trainIndex,]
test_data <- obesity[-trainIndex,]
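# (Sanity check, not in the original lab) Confirm that the stratified split
# preserved the Gender proportions in the training and test sets
round(prop.table(table(train_data$Gender)), 3)
round(prop.table(table(test_data$Gender)), 3)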
# Oversample the minority class (up-sampling); predictors exclude the target 'Gender'
balanced_data_up <- upSample(x = train_data[, !names(train_data) %in% "Gender"],
                             y = train_data$Gender)
# Undersample the majority class (down-sampling)
balanced_data_down <- downSample(x = train_data[, !names(train_data) %in% "Gender"],
                                 y = train_data$Gender)
# View balanced dataset using oversampling
table(balanced_data_up$Class) # 'Class' is the new outcome variable
##
## Female Male
## 855 855
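For comparison, the same check on the down-sampled data, a one-line sketch using the balanced_data_down object created above:
# View balanced dataset using undersampling
table(balanced_data_down$Class)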
## 3 Enhanced Model Evaluation

- Issue: Sole reliance on accuracy can be misleading, especially in imbalanced datasets.
- Improvement: Evaluate models using additional metrics like precision, recall, F1-score, and ROC-AUC. For example:
  - Precision helps assess false positives.
  - Recall ensures true positives are captured effectively.
  - F1-score balances precision and recall.
  - ROC-AUC evaluates the overall performance of the classifier.
# Load necessary libraries
library(randomForest)
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Assuming 'obesity' is your dataset
set.seed(123) # For reproducibility
# Split the data into training and testing sets (70-30 split)
train_index <- createDataPartition(obesity$Gender, p = 0.7, list = FALSE)
train_data <- obesity[train_index, ]
test_data <- obesity[-train_index, ]
# Ensure factor levels for Gender are the same between training and testing data
train_data$Gender <- factor(train_data$Gender)
test_data$Gender <- factor(test_data$Gender, levels = levels(train_data$Gender))
# Train a Random Forest model
rf_model <- randomForest(Gender ~ ., data = train_data, ntree = 100)
# Make predictions on the test data (probabilities for the 'Male' class)
predictions <- predict(rf_model, test_data, type = "prob")[, "Male"]
# Generate the ROC curve
roc_curve <- roc(test_data$Gender, predictions)
## Setting levels: control = Female, case = Male
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, main = "ROC Curve", col = "blue", lwd = 2)
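The remaining metrics listed above can be computed from hard class predictions. The sketch below uses caret's confusionMatrix() with mode = "prec_recall", which reports precision, recall, and F1, and pROC's auc() for the area under the curve; it assumes the rf_model, test_data, and roc_curve objects from the chunk above.
# Hard class predictions on the test set
pred_class <- predict(rf_model, test_data, type = "response")
# Precision, recall, and F1-score, treating 'Male' as the positive class
confusionMatrix(pred_class, test_data$Gender, positive = "Male", mode = "prec_recall")
# Area under the ROC curve
auc(roc_curve)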