library(mlbench)
library(readr)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library("knitr")
library(readr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the diabetes data set from a local CSV file; the first row of the file
# holds the column names.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists - consider a project-relative path.
df <- read.csv("C:\\Users\\huiwu\\OneDrive\\Desktop\\CAU\\Spring2025\\R_DataScience\\diabetes.csv", header = TRUE)
# Per-column summary statistics (min, quartiles, mean, max).
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Show the storage type and the first few values of every column.
str(df)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Preview the first rows of the raw data.
head(df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
## Check missing and data types
library(DataExplorer)
# Draw DataExplorer's introductory overview plot for the data set.
plot_intro(df)
# Number of rows and columns in the data set.
dim(df)
## [1] 768 9
# Compact column-wise preview: one line per column with its type and leading
# values.
glimpse(df)
## Rows: 768
## Columns: 9
## $ Pregnancies <int> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <int> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <int> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <int> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <int> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <int> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <int> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
# Print the per-column summary statistics again (same call as right after the
# data was loaded). Glucose, BloodPressure, SkinThickness, Insulin and BMI all
# have a minimum of 0.
# NOTE(review): these zeros presumably stand in for missing measurements -
# confirm before modelling.
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Store rows for partition.
# Fix the RNG seed so the same split is drawn on every run; without it the
# train/test rows (and every result derived from them) change between runs.
set.seed(123)
# Sample 70% of the row indices. Passing the outcome as a factor makes caret
# sample within each class, so both sets keep a similar class ratio.
partition <- caret::createDataPartition(
  y = factor(df$Outcome),
  times = 1,
  p = 0.7,
  list = FALSE
)
# Create training data set from the sampled rows.
train_set <- df[partition, ]
# Create testing data set from the rows that were not sampled (the remaining
# ~30% of the data).
test_set <- df[-partition, ]
# Check the size and column types of the training split.
str(train_set)
## 'data.frame': 538 obs. of 9 variables:
## $ Pregnancies : int 8 1 5 3 10 8 10 7 0 7 ...
## $ Glucose : int 183 89 116 78 115 125 139 100 118 107 ...
## $ BloodPressure : int 64 66 74 50 0 96 80 0 84 74 ...
## $ SkinThickness : int 0 23 0 32 0 0 0 0 47 0 ...
## $ Insulin : int 0 94 0 88 0 0 0 0 230 0 ...
## $ BMI : num 23.3 28.1 25.6 31 35.3 0 27.1 30 45.8 29.6 ...
## $ DiabetesPedigreeFunction: num 0.672 0.167 0.201 0.248 0.134 ...
## $ Age : int 32 21 30 26 29 54 57 32 31 31 ...
## $ Outcome : int 1 0 0 1 0 1 0 1 1 1 ...
# Check the size and column types of the testing split.
str(test_set)
## 'data.frame': 230 obs. of 9 variables:
## $ Pregnancies : int 6 1 0 2 4 10 1 5 7 10 ...
## $ Glucose : int 148 85 137 197 110 168 189 166 196 125 ...
## $ BloodPressure : int 72 66 40 70 92 74 60 72 90 70 ...
## $ SkinThickness : int 35 29 35 45 0 0 23 19 0 26 ...
## $ Insulin : int 0 0 168 543 0 0 846 175 0 115 ...
## $ BMI : num 33.6 26.6 43.1 30.5 37.6 38 30.1 25.8 39.8 31.1 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 2.288 0.158 0.191 ...
## $ Age : int 50 31 33 53 30 34 59 51 41 41 ...
## $ Outcome : int 1 0 1 1 0 1 1 1 1 1 ...
# Per-column summary statistics of the training split, for comparison with the
# summary of the full data set.
summary(train_set)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.:100.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.894 Mean :121.6 Mean : 69.42 Mean :20.56
## 3rd Qu.: 6.000 3rd Qu.:140.0 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :114.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.00 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.00 1st Qu.:27.30 1st Qu.:0.2442 1st Qu.:24.00
## Median : 36.00 Median :32.15 Median :0.3690 Median :29.00
## Mean : 83.02 Mean :32.05 Mean :0.4721 Mean :33.24
## 3rd Qu.:135.00 3rd Qu.:36.40 3rd Qu.:0.6138 3rd Qu.:41.00
## Max. :744.00 Max. :67.10 Max. :2.3290 Max. :81.00
## Outcome
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3587
## 3rd Qu.:1.0000
## Max. :1.0000
# Extended descriptive statistics for every column of the training split
# (mean, standard deviation, quartiles, skewness, kurtosis, valid counts).
summarytools::descr(train_set)
## Descriptive Statistics
## train_set
## N: 538
##
## Age BloodPressure BMI DiabetesPedigreeFunction Glucose Insulin
## ----------------- -------- --------------- -------- -------------------------- --------- ---------
## Mean 33.24 69.42 32.05 0.47 121.59 83.02
## Std.Dev 11.88 18.75 7.73 0.33 31.81 115.92
## Min 21.00 0.00 0.00 0.08 0.00 0.00
## Q1 24.00 62.00 27.30 0.24 100.00 0.00
## Median 29.00 72.00 32.15 0.37 117.00 36.00
## Q3 41.00 80.00 36.40 0.61 140.00 135.00
## Max 81.00 114.00 67.10 2.33 199.00 744.00
## MAD 10.38 11.86 6.75 0.25 28.17 53.37
## IQR 17.00 18.00 9.10 0.37 40.00 135.00
## CV 0.36 0.27 0.24 0.69 0.26 1.40
## Skewness 1.11 -1.84 -0.35 1.69 0.25 1.99
## SE.Skewness 0.11 0.11 0.11 0.11 0.11 0.11
## Kurtosis 0.57 5.43 3.44 3.98 0.60 5.12
## N.Valid 538.00 538.00 538.00 538.00 538.00 538.00
## N 538.00 538.00 538.00 538.00 538.00 538.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Outcome Pregnancies SkinThickness
## ----------------- --------- ------------- ---------------
## Mean 0.36 3.89 20.56
## Std.Dev 0.48 3.42 15.82
## Min 0.00 0.00 0.00
## Q1 0.00 1.00 0.00
## Median 0.00 3.00 23.00
## Q3 1.00 6.00 32.00
## Max 1.00 17.00 99.00
## MAD 0.00 2.97 17.79
## IQR 1.00 5.00 32.00
## CV 1.34 0.88 0.77
## Skewness 0.59 0.92 0.15
## SE.Skewness 0.11 0.11 0.11
## Kurtosis -1.66 0.16 -0.22
## N.Valid 538.00 538.00 538.00
## N 538.00 538.00 538.00
## Pct.Valid 100.00 100.00 100.00
##Diabetes Distribution
# Bar chart of the class counts in the training set.
# Outcome is stored as 0/1 integers; it is converted to a factor so ggplot
# treats it as a discrete variable for both the x axis and the fill colour
# (a numeric fill is dropped by the count statistic). The column is referenced
# by name inside aes() rather than through `train_set$`.
ggplot(train_set, aes(x = factor(Outcome), fill = factor(Outcome))) +
  geom_bar() +
  theme_bw() +
  labs(title = "Diabetes Classification", x = "Outcome", fill = "Outcome") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Use of `train_set$Outcome` is discouraged.
## ℹ Use `Outcome` instead.
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Pairwise correlations between all columns of the training set. No column is
# excluded: Outcome stays in the matrix so that each predictor's correlation
# with the target can be read from the same table.
cor_data <- cor(train_set)
# Numerical Correlation Matrix
cor_data
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.13409347 0.16051923 -0.07032423
## Glucose 0.13409347 1.00000000 0.18153369 0.09295568
## BloodPressure 0.16051923 0.18153369 1.00000000 0.19656682
## SkinThickness -0.07032423 0.09295568 0.19656682 1.00000000
## Insulin -0.05897094 0.31599876 0.09531117 0.43689402
## BMI 0.03836817 0.23896056 0.31897616 0.38406504
## DiabetesPedigreeFunction 0.02101428 0.08910634 0.03411150 0.19244913
## Age 0.55243749 0.26156883 0.24808459 -0.06575306
## Outcome 0.22972635 0.47711923 0.11997201 0.07536788
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.05897094 0.03836817 0.02101428
## Glucose 0.31599876 0.23896056 0.08910634
## BloodPressure 0.09531117 0.31897616 0.03411150
## SkinThickness 0.43689402 0.38406504 0.19244913
## Insulin 1.00000000 0.22311495 0.21955656
## BMI 0.22311495 1.00000000 0.06354955
## DiabetesPedigreeFunction 0.21955656 0.06354955 1.00000000
## Age -0.03881853 0.06181424 0.06873461
## Outcome 0.10070611 0.31361064 0.16875343
## Age Outcome
## Pregnancies 0.55243749 0.22972635
## Glucose 0.26156883 0.47711923
## BloodPressure 0.24808459 0.11997201
## SkinThickness -0.06575306 0.07536788
## Insulin -0.03881853 0.10070611
## BMI 0.06181424 0.31361064
## DiabetesPedigreeFunction 0.06873461 0.16875343
## Age 1.00000000 0.24491355
## Outcome 0.24491355 1.00000000
# Three views of the same correlation matrix:
# 1. the full matrix drawn with circles (corrplot's default method and type),
corrplot::corrplot(corr = cor_data, method = "circle", type = "full")
# 2. the lower triangle with the coefficients printed as numbers,
corrplot::corrplot(corr = cor_data, method = "number", type = "lower")
# 3. the lower triangle with each coefficient drawn as a pie.
corrplot::corrplot(corr = cor_data, method = "pie", type = "lower")
## We can see that there is a moderately high positive correlation between Age and Pregnancies (about 0.55).
##Univariate Analysis
#' Plot the density of one variable, overall and split by output class
#'
#' Draws two density plots side by side: the overall density of `univar`, and
#' the density of `univar` filled by the class in `output_var`. The figure
#' title shows the variable name and its skewness.
#'
#' @param univar_name Label used for the x axes and in the figure title.
#' @param univar Vector of values to plot; expected to hold one value per row
#'   of `data`.
#' @param data Data frame handed to `ggplot()` as the plot data.
#' @param output_var Vector of class labels aligned with `univar`. It is
#'   converted to a factor before plotting.
#' @return The value of `gridExtra::grid.arrange()`; the function is called for
#'   its side effect of drawing the combined figure.
univar_graph <- function(univar_name, univar, data, output_var) {
  # A numeric 0/1 label is a continuous variable to ggplot, which then drops
  # the fill aesthetic and draws a single curve. A factor gives one density
  # curve per class.
  output_class <- as.factor(output_var)

  # Overall density of the variable.
  g_1 <- ggplot(data, aes(x = univar)) +
    geom_density() +
    xlab(univar_name) +
    theme_bw()

  # Density of the variable per output class, semi-transparent so that the
  # overlapping curves stay visible.
  g_2 <- ggplot(data, aes(x = univar, fill = output_class)) +
    geom_density(alpha = 0.4) +
    xlab(univar_name) +
    labs(fill = "Outcome") +
    theme_bw()

  gridExtra::grid.arrange(
    g_1, g_2,
    ncol = 2,
    top = paste(
      univar_name, "variable", "/ [ Skew:", timeDate::skewness(univar), "]"
    )
  )
}
# Draw the univariate plots for every predictor. Predictors are selected by
# name (every column except Outcome) instead of by position, so the loop does
# not depend on Outcome being the last column and is safely empty when there
# are no predictors.
predictor_names <- setdiff(names(train_set), "Outcome")
for (predictor in predictor_names) {
  univar_graph(
    univar_name = predictor,
    univar = train_set[[predictor]],
    data = train_set,
    output_var = train_set[["Outcome"]]
  )
}
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
```