## PIMA Data Set
# NOTE: the workspace is deliberately not cleared here. `rm(list = ls())`
# deletes the caller's objects without resetting loaded packages or options,
# so it does not give a clean session; run the script in a fresh R session
# instead.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Read the PIMA data. Set the PIMA_DATA_PATH environment variable to point at
# the CSV on another machine; when it is unset or empty the original location
# is used.
pima_path <- Sys.getenv("PIMA_DATA_PATH", unset = "")
if (!nzchar(pima_path)) {
  pima_path <- "C:/Users/cu_dv/Documents/PIMA_DATA.csv"
}
PIMA <- read_csv(pima_path)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the first rows of the tibble.
print(PIMA)
## # A tibble: 768 × 9
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 7 3 78 50 32 88 31
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## 10 8 125 96 0 0 0
## # ℹ 758 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
# Number of rows and columns.
c(nrow(PIMA), ncol(PIMA))
## [1] 768 9
# Column names.
colnames(PIMA)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
# Variable Descriptions
# Pregnancies: Number of times pregnant
# Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# BloodPressure: Diastolic blood pressure (mm Hg)
# SkinThickness: Triceps skin fold thickness (mm)
# Insulin: 2-Hour serum insulin (mu U/ml)
# BMI: Body mass index (weight in kg/(height in m)^2)
# DiabetesPedigreeFunction: Diabetes pedigree function
# Age: Age (years)
# Outcome: Diabetes or No Diabetes
# Pairwise Pearson correlations between all columns of the data set.
# Outcome is still numeric (0/1) at this point, so it is included.
PIMA_Cor_Matrix <- PIMA %>%
  cor(method = "pearson")
print(PIMA_Cor_Matrix)
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Outcome 0.22189815 0.46658140 0.06506836 0.07475223
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Outcome 0.13054795 0.29269466 0.17384407
## Age Outcome
## Pregnancies 0.54434123 0.22189815
## Glucose 0.26351432 0.46658140
## BloodPressure 0.23952795 0.06506836
## SkinThickness -0.11397026 0.07475223
## Insulin -0.04216295 0.13054795
## BMI 0.03624187 0.29269466
## DiabetesPedigreeFunction 0.03356131 0.17384407
## Age 1.00000000 0.23835598
## Outcome 0.23835598 1.00000000
# Column-wise overview of the data, followed by summaries of Age,
# Pregnancies and BMI.
dplyr::glimpse(PIMA)
## Rows: 768
## Columns: 9
## $ Pregnancies <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
# Min / quartiles / mean / max of Age.
summary(PIMA[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 24.00 29.00 33.24 41.00 81.00
# Same summary for the number of pregnancies.
summary(PIMA[["Pregnancies"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.845 6.000 17.000
# Same summary for BMI (still numeric at this point).
summary(PIMA[["BMI"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.30 32.00 31.99 36.60 67.10
#table(PIMA$DiabetesPedigreeFunction)
# Recode Outcome as a factor: 1 = Diabetes, 0 = No Diabetes.
PIMA$Outcome <- factor(
  PIMA$Outcome,
  levels = c(1, 0),
  labels = c("Diabetes", "No_Diabetes")
)

# Bin BMI into three categories; this overwrites the numeric BMI column with a
# factor. Intervals are right-closed: (0, 18], (18, 25], (25, Inf).
# The last break must be Inf: with the previous upper break of 30, every BMI
# above 30 fell outside all bins and silently became NA, although the top
# label is ">25".
# A BMI of exactly 0 still maps to NA (presumably a placeholder for a missing
# measurement -- confirm against the data source).
PIMA$BMI <- cut(
  PIMA$BMI,
  breaks = c(0, 18, 25, Inf),
  labels = c("<18", "18-25", ">25")
)

# Print the binned BMI values.
# NOTE: any console output pasted after this statement was produced with the
# old breaks c(0, 18, 25, 30) and therefore shows NA for every BMI above 30.
PIMA$BMI
## [1] <NA> >25 18-25 >25 <NA> >25 <NA> <NA> <NA> <NA> <NA> <NA>
## [13] >25 <NA> >25 >25 <NA> >25 <NA> <NA> <NA> <NA> <NA> >25
## [25] <NA> <NA> <NA> 18-25 18-25 <NA> <NA> <NA> 18-25 18-25 >25 18-25
## [37] <NA> <NA> <NA> <NA> <NA> <NA> 18-25 <NA> >25 <NA> >25 >25
## [49] <NA> <NA> 18-25 18-25 18-25 <NA> <NA> 18-25 <NA> <NA> <NA> <NA>
## [61] <NA> <NA> 18-25 >25 <NA> >25 <NA> <NA> 18-25 >25 <NA> >25
## [73] <NA> <NA> <NA> 18-25 <NA> <NA> <NA> 18-25 18-25 <NA> >25 18-25
## [85] <NA> <NA> <NA> <NA> <NA> >25 18-25 <NA> <NA> 18-25 18-25 <NA>
## [97] <NA> 18-25 >25 <NA> <NA> >25 18-25 >25 <NA> >25 18-25 >25
## [109] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> >25 18-25
## [121] <NA> <NA> <NA> >25 <NA> <NA> <NA> <NA> <NA> >25 >25 <NA>
## [133] <NA> <NA> 18-25 <NA> <NA> >25 <NA> <NA> 18-25 <NA> <NA> <NA>
## [145] <NA> <NA> <NA> <NA> <NA> >25 <NA> 18-25 <NA> <NA> <NA> <NA>
## [157] 18-25 >25 >25 <NA> >25 <NA> <NA> >25 <NA> >25 <NA> >25
## [169] <NA> >25 <NA> <NA> >25 <NA> >25 <NA> <NA> <NA> <NA> <NA>
## [181] 18-25 <NA> >25 >25 >25 <NA> <NA> <NA> >25 <NA> 18-25 <NA>
## [193] <NA> <NA> 18-25 <NA> 18-25 18-25 <NA> <NA> <NA> <NA> >25 18-25
## [205] <NA> 18-25 <NA> <NA> <NA> <NA> >25 <NA> <NA> <NA> <NA> <NA>
## [217] <NA> >25 >25 <NA> <NA> <NA> >25 >25 18-25 <NA> <NA> <NA>
## [229] <NA> <NA> <NA> <NA> >25 <NA> >25 <NA> <NA> <NA> <NA> 18-25
## [241] >25 <NA> >25 >25 <NA> >25 <NA> <NA> <NA> <NA> <NA> >25
## [253] 18-25 <NA> >25 <NA> <NA> >25 >25 <NA> <NA> >25 <NA> <NA>
## [265] <NA> <NA> <NA> <NA> >25 >25 <NA> >25 18-25 <NA> <NA> <NA>
## [277] >25 >25 18-25 >25 <NA> <NA> <NA> <NA> >25 >25 <NA> <NA>
## [289] 18-25 <NA> <NA> <NA> <NA> <NA> 18-25 <NA> >25 <NA> <NA> 18-25
## [301] <NA> <NA> <NA> <NA> 18-25 <NA> >25 18-25 <NA> <NA> >25 <NA>
## [313] >25 >25 <NA> <NA> 18-25 <NA> <NA> 18-25 >25 <NA> >25 >25
## [325] <NA> >25 <NA> <NA> <NA> <NA> 18-25 <NA> <NA> 18-25 18-25 <NA>
## [337] <NA> <NA> <NA> <NA> >25 >25 <NA> <NA> <NA> <NA> >25 18-25
## [349] 18-25 <NA> <NA> <NA> <NA> >25 <NA> <NA> <NA> <NA> <NA> <NA>
## [361] <NA> >25 <NA> <NA> <NA> <NA> >25 18-25 >25 <NA> <NA> <NA>
## [373] <NA> <NA> <NA> <NA> >25 <NA> <NA> <NA> <NA> 18-25 >25 >25
## [385] 18-25 18-25 <NA> <NA> <NA> <NA> <NA> <NA> 18-25 18-25 <NA> >25
## [397] 18-25 <NA> 18-25 <NA> <NA> 18-25 <NA> <NA> <NA> <NA> >25 18-25
## [409] >25 <NA> <NA> <NA> <NA> >25 <NA> <NA> >25 <NA> 18-25 >25
## [421] <NA> >25 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 18-25 <NA>
## [433] >25 >25 18-25 <NA> <NA> >25 18-25 <NA> <NA> <NA> <NA> <NA>
## [445] >25 <NA> >25 <NA> <NA> <NA> 18-25 >25 <NA> 18-25 <NA> <NA>
## [457] >25 <NA> <NA> >25 18-25 18-25 <NA> >25 18-25 18-25 >25 <NA>
## [469] >25 <NA> <NA> <NA> <NA> >25 >25 >25 <NA> 18-25 >25 >25
## [481] <NA> <NA> >25 <NA> <NA> <NA> <NA> <NA> >25 >25 <NA> <NA>
## [493] <NA> >25 <NA> >25 >25 <NA> >25 >25 >25 <NA> <NA> <NA>
## [505] <NA> <NA> <NA> >25 <NA> 18-25 >25 18-25 18-25 >25 >25 <NA>
## [517] <NA> <NA> <NA> 18-25 18-25 <NA> <NA> <NA> <NA> 18-25 18-25 >25
## [529] <NA> 18-25 >25 <NA> <NA> >25 <NA> <NA> >25 18-25 <NA> <NA>
## [541] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> >25 >25 <NA>
## [553] >25 >25 <NA> >25 <NA> >25 <NA> <NA> <NA> <NA> <NA> >25
## [565] <NA> >25 <NA> <NA> <NA> <NA> <NA> 18-25 >25 <NA> <NA> <NA>
## [577] 18-25 <NA> >25 <NA> <NA> 18-25 >25 <NA> >25 18-25 <NA> 18-25
## [589] <NA> 18-25 <NA> <NA> <NA> >25 <NA> <NA> <NA> >25 <NA> 18-25
## [601] >25 18-25 >25 <NA> >25 <NA> <NA> 18-25 <NA> 18-25 <NA> <NA>
## [613] <NA> <NA> <NA> >25 >25 18-25 >25 <NA> <NA> 18-25 <NA> <NA>
## [625] <NA> <NA> 18-25 <NA> <NA> 18-25 >25 <NA> >25 >25 >25 <NA>
## [637] >25 <NA> <NA> 18-25 >25 <NA> >25 >25 >25 <NA> 18-25 <NA>
## [649] >25 >25 >25 <NA> <NA> >25 <NA> <NA> 18-25 <NA> <NA> <NA>
## [661] >25 <NA> <NA> <NA> <NA> <NA> <NA> >25 <NA> <NA> <NA> >25
## [673] <NA> <NA> <NA> <NA> 18-25 <NA> <NA> 18-25 18-25 <NA> <NA> <NA>
## [685] <NA> <NA> 18-25 >25 18-25 <NA> 18-25 <NA> <NA> <NA> 18-25 <NA>
## [697] >25 18-25 <NA> <NA> <NA> >25 <NA> <NA> >25 <NA> <NA> <NA>
## [709] <NA> <NA> <NA> >25 <NA> >25 >25 <NA> <NA> 18-25 <NA> <NA>
## [721] >25 <NA> >25 <NA> <NA> <NA> <NA> <NA> 18-25 <NA> >25 >25
## [733] <NA> >25 18-25 <NA> >25 <NA> <NA> <NA> <NA> <NA> >25 <NA>
## [745] <NA> >25 <NA> <NA> <NA> 18-25 <NA> <NA> >25 <NA> <NA> <NA>
## [757] <NA> <NA> <NA> <NA> >25 <NA> 18-25 <NA> <NA> >25 <NA> <NA>
## Levels: <18 18-25 >25
# Counts per Outcome level (Diabetes / No_Diabetes).
table(PIMA[["Outcome"]])
##
## Diabetes No_Diabetes
## 268 500
# Parallel coordinate plot of the first four columns, coloured by Outcome.
ggparcoord(
  data = PIMA,
  columns = 1:4,
  groupColumn = "Outcome"
)

# Bar chart: number of observations per Outcome level.
ggplot(data = PIMA, mapping = aes(x = Outcome)) +
  geom_bar() +
  labs(title = "Frequency of Outcome")

# Bar chart: number of observations per BMI category.
ggplot(data = PIMA, mapping = aes(x = BMI)) +
  geom_bar() +
  labs(title = "Frequency of BMI")

# Scatterplot of Age vs. number of pregnancies.
# The point layer has to be joined to the plot with `+`; without it ggplot()
# draws an empty panel and geom_point() is evaluated as a separate statement
# that only prints the layer object.
ggplot(PIMA, aes(x = Age, y = Pregnancies)) +
  geom_point(size = 1.2, color = "red", shape = 21)
# Scatterplot matrix of columns 2 to 8 (Glucose through Age).
# BMI was binned into a factor earlier in the script, so it is treated as a
# categorical variable here.
PIMA %>%
  select(Glucose:Age) %>%
  ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 476 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).
## Removed 476 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Parallel coordinate plot of columns 2 to 8, coloured by Outcome.
ggparcoord(
  data = PIMA,
  columns = 2:8,
  groupColumn = "Outcome"
)
