library(readr)
## Warning: le package 'readr' a été compilé avec la version R 4.3.1
# Load the diabetes CSV from its local path with readr.
diabetes <- read_csv(file = "C:/Users/hasan/Downloads/diabetes/diabetes.csv")
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the object returned by read_csv() into a base data.frame, then
# print its structure (column names, types and leading values).
data <- as.data.frame(diabetes)
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : num 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num 1 0 1 0 1 0 1 0 1 1 ...
# Recode zeros as missing (NA) in every column except the last one
# (Outcome), whose 0/1 values must be kept as they are.
# Done per column with an NA-safe mask, so the step can be re-run on data
# that already contains NA without failing inside an `if` condition.
# NOTE(review): this also turns Pregnancies == 0 into NA, although zero
# pregnancies looks like a legitimate value -- confirm this is intended.
feature_idx <- head(seq_along(data), -1)
data[feature_idx] <- lapply(data[feature_idx], function(column) {
  column[!is.na(column) & column == 0] <- NA
  column
})
# Summarise every column; the "NA's" row reports how many values are missing.
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 1.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 2.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 4.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 4.495 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 7.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :111 NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
##
# Draw one boxplot per feature column (every column except the last one),
# titled with the column name.
# Iterating over the names directly avoids `1:(ncol(data) - 1)`, which
# counts backwards (1, 0) when only a single column is present.
for (feature in head(colnames(data), -1)) {
  boxplot(data[[feature]], main = feature)
}
# Boxplot overview of the first eight columns of `data`.
feature_names <- names(data)[1:8]
# Arrange the plots in a 4 x 2 grid with tight margins. par() returns the
# previous settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))
# One horizontal boxplot per feature, titled with the column name.
for (name in feature_names) {
  boxplot(data[[name]], main = name, col = "lightblue", border = "black",
          horizontal = TRUE)
}
# Restore the graphics state so later plots are not squeezed into the grid.
par(old_par)
# Cap the outliers of a numeric vector.
#
# Fences are built from the `p` and `1 - p` quantiles (q_lo, q_hi) and their
# distance `spread = q_hi - q_lo`. Values below `q_lo - 1.5 * spread` are
# replaced by the 1st percentile, and values above `q_hi + 1.5 * spread` by
# the 99th percentile.
#
# Args:
#   x: vector to process; non-numeric input is returned unchanged.
#   p: lower quantile probability used to build the fences (default 0.25,
#      i.e. the quartiles).
# Returns:
#   `x` with out-of-fence values replaced; NA entries stay NA.
winsorize <- function(x, p = 0.25) {
  if (!is.numeric(x)) {
    return(x)
  }
  bounds <- quantile(x, probs = c(p, 1 - p), na.rm = TRUE, names = FALSE)
  spread <- bounds[2] - bounds[1]
  lower_fence <- bounds[1] - 1.5 * spread
  upper_fence <- bounds[2] + 1.5 * spread
  # Take both replacement values from the untouched data, so the upper cap
  # does not depend on the lower replacement having been applied first.
  caps <- quantile(x, probs = c(0.01, 0.99), na.rm = TRUE, names = FALSE)
  x[!is.na(x) & x < lower_fence] <- caps[1]
  x[!is.na(x) & x > upper_fence] <- caps[2]
  x
}
# Treat the class label as categorical, so it is skipped by the
# numeric-only winsorizing below.
data$Outcome <- as.factor(data$Outcome)
# Apply winsorizing to numeric columns only. vapply() always yields a
# logical mask, whereas sapply() can change its return type on empty input.
numeric_cols <- vapply(data, is.numeric, logical(1))
data[numeric_cols] <- lapply(data[numeric_cols], winsorize)
# Confirm the changes
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 1.000 Min. : 44.0 Min. : 40.00 Min. : 7.00
## 1st Qu.: 2.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 4.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 4.486 Mean :121.7 Mean : 72.42 Mean :29.04
## 3rd Qu.: 7.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :14.000 Max. :199.0 Max. :106.00 Max. :56.00
## NA's :111 NA's :5 NA's :35 NA's :227
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :159.89 Mean :32.40 Mean :0.4777 Mean :33.21
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :580.47 Max. :51.01 Max. :1.6983 Max. :67.00
## NA's :374 NA's :11
## Outcome
## 0:500
## 1:268
##
##
##
##
##
# Boxplot overview of the first eight columns of `data`, drawn again so the
# effect of the preceding outlier capping can be inspected.
feature_names <- names(data)[1:8]
# Arrange the plots in a 4 x 2 grid with tight margins. par() returns the
# previous settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))
# One horizontal boxplot per feature, titled with the column name.
for (name in feature_names) {
  boxplot(data[[name]], main = name, col = "lightblue", border = "black",
          horizontal = TRUE)
}
# Restore the graphics state so the histograms drawn later in the script
# are not squeezed into the 4 x 2 grid.
par(old_par)
#### Count the missing values in each column
vapply(data, function(column) sum(is.na(column)), numeric(1))
## Pregnancies Glucose BloodPressure
## 111 5 35
## SkinThickness Insulin BMI
## 227 374 11
## DiabetesPedigreeFunction Age Outcome
## 0 0 0
# Total number of missing cells across the whole data frame.
sum(is.na(data))
## [1] 763
library(ggplot2)
## Warning: le package 'ggplot2' a été compilé avec la version R 4.3.1
# Bar chart of how many rows fall into each Outcome class.
ggplot(data = data, mapping = aes(x = Outcome)) +
  geom_bar() +
  labs(title = "Target Distribution")
# Interactive pie chart of the Outcome class proportions in `data`
library(plotly)
## Warning: le package 'plotly' a été compilé avec la version R 4.3.1
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
# Donut-style pie chart of the share of each Outcome class.
colors <- c('gold', 'mediumturquoise')
values <- table(data$Outcome) / nrow(data)
# Take the labels from the table itself so they always line up with `values`.
labels <- names(values)
# Create a pie chart
fig <- plot_ly(
  labels = labels,
  values = as.numeric(values),
  type = "pie",
  hole = 0.2,
  # `colors` was defined but never passed to the trace; apply it here.
  marker = list(colors = colors)
) %>%
  layout(
    title = "Outcome",
    showlegend = FALSE
  )
fig
# Absolute number of rows in each Outcome class.
table(data$Outcome)
##
## 0 1
## 500 268
# Share of each Outcome class among all counted rows
proportions(table(data$Outcome))
##
## 0 1
## 0.6510417 0.3489583
# Return the most frequent value of `v`. When several values are tied, the
# one that appears first in `v` is returned.
getmode <- function(v) {
  candidates <- unique(v)
  # Map every element to the position of its distinct value, then count
  # how often each position occurs.
  frequency <- tabulate(match(v, candidates))
  candidates[which.max(frequency)]
}
# Impute missing Pregnancies with the most frequent observed value among
# the rows that share the same Age.
# Looping over the distinct ages fills each age group once, instead of
# repeating the same fill for every row of the data frame.
for (age in unique(data$Age)) {
  same_age <- data$Age == age
  missing_preg <- is.na(data$Pregnancies)
  modea <- getmode(data$Pregnancies[same_age & !missing_preg])
  data$Pregnancies[same_age & missing_preg] <- modea
}
hist(data$Pregnancies)
### Handling missing values for Glucose: fill with the mean of observed values
data$Glucose[is.na(data$Glucose)] <- mean(data$Glucose, na.rm = TRUE)
hist(data$Glucose)
### Handling missing values for BloodPressure: fill with the mean
data$BloodPressure[is.na(data$BloodPressure)] <-
  mean(data$BloodPressure, na.rm = TRUE)
hist(data$BloodPressure)
### Handling missing values for SkinThickness: fill with the mean
data$SkinThickness[is.na(data$SkinThickness)] <-
  mean(data$SkinThickness, na.rm = TRUE)
hist(data$SkinThickness)
### Handling missing values for Insulin: fill with the median (50% quantile)
# `probs` is spelled out in full; the previous `p =` only worked through
# partial argument matching.
data$Insulin[is.na(data$Insulin)] <-
  quantile(data$Insulin, probs = 0.5, na.rm = TRUE, names = FALSE)
hist(data$Insulin)
### Handling missing values for BMI: fill with the mean
data$BMI[is.na(data$BMI)] <- mean(data$BMI, na.rm = TRUE)
hist(data$BMI)
# Re-check the column summaries after the missing values have been filled.
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 1.000 Min. : 44.00 Min. : 40.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.75 1st Qu.: 64.00 1st Qu.:25.00
## Median : 3.000 Median :117.00 Median : 72.21 Median :29.04
## Mean : 4.142 Mean :121.69 Mean : 72.42 Mean :29.04
## 3rd Qu.: 6.000 3rd Qu.:140.25 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :14.000 Max. :199.00 Max. :106.00 Max. :56.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.0 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.:121.5 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.0 Median :32.40 Median :0.3725 Median :29.00
## Mean :142.9 Mean :32.40 Mean :0.4777 Mean :33.21
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :580.5 Max. :51.01 Max. :1.6983 Max. :67.00
## Outcome
## 0:500
## 1:268
##
##
##
##