Data visualization

library(readr)
## Warning: le package 'readr' a été compilé avec la version R 4.3.1
# Load the diabetes CSV with readr.
# NOTE(review): the path is absolute and user-specific, so this line only
# runs on the original author's machine -- confirm the intended location.
diabetes <- read_csv(
  file = "C:/Users/hasan/Downloads/diabetes/diabetes.csv"
)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Work on a plain base-R data frame copy of the imported table, then print
# its structure (column names, types and first values).
data <- as.data.frame(x = diabetes)
str(object = data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : num  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : num  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : num  1 0 1 0 1 0 1 0 1 1 ...

Replacing 0 with NA

# Treat zeros as missing values in every column except the last one
# (Outcome), then print per-column summaries including the NA counts.
# NOTE(review): this also blanks zeros in Pregnancies, Age and
# DiabetesPedigreeFunction, where 0 may be a genuine value -- confirm that
# this is intended.
feature_idx <- seq_len(max(ncol(data) - 1, 0))
data[feature_idx] <- lapply(data[feature_idx], function(column) {
  # which() drops NA comparisons, so values that are already missing are
  # left alone instead of breaking the test.
  column[which(column == 0)] <- NA
  column
})
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 1.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 2.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 4.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 4.495   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 7.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##  NA's   :111      NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000  
## 

Outlier visualization

# Draw one vertical boxplot per feature column (every column but the last),
# titled with the column name. Iterating over names avoids the 1:0 trap of
# `1:(ncol(data) - 1)` when there is only one column.
for (col_name in head(colnames(data), -1)) {
  boxplot(data[[col_name]], main = col_name)
}

# Horizontal boxplots of the first eight columns of `data`, one per panel.
feature_names <- names(data)[1:8]

# Lay the panels out on a 4 x 2 grid with tight margins. par() returns the
# previous settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))

# Loop through each feature
for (name in feature_names) {
  # Create a boxplot to visualize the distribution
  boxplot(
    data[[name]],
    main = name,
    col = c("lightblue"),
    border = "black",
    horizontal = TRUE
  )
}

# Restore the previous layout so later plots are not squeezed into the grid.
par(old_par)

Outlier winsorizing

# Cap outliers of a numeric vector.
#
# Values lying more than 1.5 * IQR outside the [p, 1 - p] quantile pair
# (the quartiles with the default p = 0.25) are replaced: low outliers by
# the 1st percentile, high outliers by the 99th percentile. Missing values
# are ignored when computing the quantiles and are left untouched.
#
# Arguments:
#   x  vector to process; non-numeric input is returned unchanged.
#   p  lower tail probability of the quantile pair that defines the fences.
#
# Returns a vector of the same length as `x`.
winsorize <- function(x, p = 0.25) {
  if (!is.numeric(x)) {
    return(x)
  }

  bounds <- quantile(x, probs = c(p, 1 - p), na.rm = TRUE, names = FALSE)
  spread <- bounds[2] - bounds[1]
  lower_fence <- bounds[1] - 1.5 * spread
  upper_fence <- bounds[2] + 1.5 * spread

  # Compute both replacement values from the untouched data, so the upper
  # cap does not depend on the lower tail having been overwritten first.
  caps <- quantile(x, probs = c(0.01, 0.99), na.rm = TRUE, names = FALSE)

  # which() discards NA comparisons, so missing values are never selected.
  x[which(x < lower_fence)] <- caps[1]
  x[which(x > upper_fence)] <- caps[2]
  x
}

# Make the target categorical first, so it is skipped by the numeric-only
# winsorizing step below.
data$Outcome <- as.factor(data$Outcome)

# Apply winsorizing to numeric columns only. vapply() always yields a
# logical vector, whereas sapply() can change its return type.
numeric_cols <- vapply(data, is.numeric, logical(1))

data[numeric_cols] <- lapply(data[numeric_cols], winsorize)

# Confirm the changes

summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 1.000   Min.   : 44.0   Min.   : 40.00   Min.   : 7.00  
##  1st Qu.: 2.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 4.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 4.486   Mean   :121.7   Mean   : 72.42   Mean   :29.04  
##  3rd Qu.: 7.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :14.000   Max.   :199.0   Max.   :106.00   Max.   :56.00  
##  NA's   :111      NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :159.89   Mean   :32.40   Mean   :0.4777           Mean   :33.21  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :580.47   Max.   :51.01   Max.   :1.6983           Max.   :67.00  
##  NA's   :374      NA's   :11                                              
##  Outcome
##  0:500  
##  1:268  
##         
##         
##         
##         
## 
# Redraw the horizontal boxplots of the first eight columns of `data`.
feature_names <- names(data)[1:8]

# Lay the panels out on a 4 x 2 grid with tight margins. par() returns the
# previous settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))

# Loop through each feature
for (name in feature_names) {
  # Create a boxplot to visualize the distribution
  boxplot(
    data[[name]],
    main = name,
    col = c("lightblue"),
    border = "black",
    horizontal = TRUE
  )
}

# Restore the previous layout so later plots are not squeezed into the grid.
par(old_par)

#### Check for missing values in each column

# is.na() marks each missing cell; colSums() then counts the marked cells
# in every column.
colSums(is.na(data))
##              Pregnancies                  Glucose            BloodPressure 
##                      111                        5                       35 
##            SkinThickness                  Insulin                      BMI 
##                      227                      374                       11 
## DiabetesPedigreeFunction                      Age                  Outcome 
##                        0                        0                        0

Check for missing values in the entire data frame

# Total number of missing cells across all columns.
sum(is.na(data))
## [1] 763

Checking Target Imbalance

library(ggplot2)
## Warning: le package 'ggplot2' a été compilé avec la version R 4.3.1
# Bar chart with one bar per Outcome level; bar height is the row count.
ggplot(data = data, mapping = aes(x = Outcome)) +
  labs(title = "Target Distribution") +
  geom_bar()

# Pie chart of the Outcome class proportions, built from the `data` frame
library(plotly)
## Warning: le package 'plotly' a été compilé avec la version R 4.3.1
## 
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     last_plot
## L'objet suivant est masqué depuis 'package:stats':
## 
##     filter
## L'objet suivant est masqué depuis 'package:graphics':
## 
##     layout
# Pie chart (with a centre hole) showing the share of each Outcome class.
colors <- c("gold", "mediumturquoise")
# Fraction of all rows that falls in each Outcome level.
values <- table(data$Outcome) / nrow(data)
# Take the labels from the table itself so they cannot drift out of step
# with the values.
labels <- names(values)

# Create a pie chart. The slice colours are passed through `marker`;
# previously `colors` was defined but never handed to the chart.
fig <- plot_ly(
  labels = labels,
  values = values,
  type = "pie",
  hole = 0.2,
  marker = list(colors = colors)
) %>%
  layout(
    title = "Outcome",
    showlegend = FALSE
  )

fig

Outcome probabilities

# Number of rows in each Outcome level.
table(data$Outcome)
## 
##   0   1 
## 500 268
# Calculate proportions: prop.table() divides each count by the total of
# the table, so the printed values sum to 1.
prop.table(table(data$Outcome))
## 
##         0         1 
## 0.6510417 0.3489583

Handling missing values for Pregnancies

# Return the most frequent value of `v`.
#
# When several values share the highest count, the one that appears first
# in `v` is returned. NA is counted like any other value. An empty input
# gives an empty result.
getmode <- function(v) {
  candidates <- unique(v)
  # Index of each element of `v` within the distinct values.
  slot <- match(v, candidates)
  freq <- tabulate(slot)
  candidates[which.max(freq)]
}


# Fill each missing Pregnancies value with the most frequent observed value
# among rows of the same Age. Each age group is handled once, rather than
# recomputing the same mode for every row.
ages_to_fill <- unique(data$Age[is.na(data$Pregnancies)])

for (age in ages_to_fill) {
  same_age <- which(data$Age == age)
  group_values <- data$Pregnancies[same_age]
  observed <- group_values[!is.na(group_values)]

  # An age group with no observed value has no mode. Leave those rows
  # missing instead of failing on a zero-length replacement.
  if (length(observed) == 0) {
    next
  }

  data$Pregnancies[same_age[is.na(group_values)]] <- getmode(observed)
}

if (anyNA(data$Pregnancies)) {
  warning(
    "Some Pregnancies values were not imputed: ",
    "no observed value exists for their Age.",
    call. = FALSE
  )
}


hist(data$Pregnancies)

### Handling missing values for Glucose

# Replace missing Glucose values with the mean of the observed values.
data$Glucose[is.na(data$Glucose)] <- mean(data$Glucose, na.rm = TRUE)

hist(data$Glucose)

### Handling missing values for BloodPressure

# Replace missing BloodPressure values with the mean of the observed values.
data$BloodPressure[is.na(data$BloodPressure)] <-
  mean(data$BloodPressure, na.rm = TRUE)

hist(data$BloodPressure)

### Handling missing values for SkinThickness

# Replace missing SkinThickness values with the mean of the observed values.
data$SkinThickness[is.na(data$SkinThickness)] <-
  mean(data$SkinThickness, na.rm = TRUE)

hist(data$SkinThickness)

### Handling missing values for Insulin

# Insulin is filled with the 0.5 quantile (the median) rather than the mean.
# `probs` is spelled out instead of relying on partial matching of `p`.
data$Insulin[is.na(data$Insulin)] <-
  quantile(data$Insulin, probs = 0.5, na.rm = TRUE, names = FALSE)

hist(data$Insulin)

### Handling missing values for BMI

# Replace missing BMI values with the mean of the observed values.
data$BMI[is.na(data$BMI)] <- mean(data$BMI, na.rm = TRUE)

hist(data$BMI)

# Print the summaries again to confirm that no NA counts remain.
summary(data)
##   Pregnancies        Glucose       BloodPressure    SkinThickness  
##  Min.   : 1.000   Min.   : 44.00   Min.   : 40.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.75   1st Qu.: 64.00   1st Qu.:25.00  
##  Median : 3.000   Median :117.00   Median : 72.21   Median :29.04  
##  Mean   : 4.142   Mean   :121.69   Mean   : 72.42   Mean   :29.04  
##  3rd Qu.: 6.000   3rd Qu.:140.25   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :14.000   Max.   :199.00   Max.   :106.00   Max.   :56.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.0   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:121.5   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.0   Median :32.40   Median :0.3725           Median :29.00  
##  Mean   :142.9   Mean   :32.40   Mean   :0.4777           Mean   :33.21  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :580.5   Max.   :51.01   Max.   :1.6983           Max.   :67.00  
##  Outcome
##  0:500  
##  1:268  
##         
##         
##         
##