## PIMA Data Set
# NOTE: the former `rm(list = ls())` call was removed on purpose. It wiped
# every object in the caller's global environment without resetting loaded
# packages or options, so it destroyed unrelated work while still not giving
# a clean session. Restart R to get a fresh session; this script assigns
# every object it uses.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Load the Pima diabetes data set into `PIMA`.
# The CSV location can be overridden with the PIMA_DATA_PATH environment
# variable; when that variable is unset or empty the original location is
# used, so existing setups keep working unchanged.
pima_path <- Sys.getenv("PIMA_DATA_PATH")
if (!nzchar(pima_path)) {
  pima_path <- "C:/Users/cu_dv/Documents/PIMA_DATA.csv"
}
# Fail early with a readable message instead of a low-level reader error.
if (!file.exists(pima_path)) {
  stop("PIMA data file not found: ", pima_path, call. = FALSE)
}
PIMA <- read_csv(pima_path)
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows and the column types of the tibble
print(PIMA)
## # A tibble: 768 × 9
##    Pregnancies Glucose BloodPressure SkinThickness Insulin   BMI
##          <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
##  1           6     148            72            35       0  33.6
##  2           1      85            66            29       0  26.6
##  3           8     183            64             0       0  23.3
##  4           1      89            66            23      94  28.1
##  5           0     137            40            35     168  43.1
##  6           5     116            74             0       0  25.6
##  7           3      78            50            32      88  31  
##  8          10     115             0             0       0  35.3
##  9           2     197            70            45     543  30.5
## 10           8     125            96             0       0   0  
## # ℹ 758 more rows
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
# Number of rows, then number of columns
c(nrow(PIMA), ncol(PIMA))
## [1] 768   9
# Column names
colnames(PIMA)
## [1] "Pregnancies"              "Glucose"                 
## [3] "BloodPressure"            "SkinThickness"           
## [5] "Insulin"                  "BMI"                     
## [7] "DiabetesPedigreeFunction" "Age"                     
## [9] "Outcome"
# Variable Descriptions
# Pregnancies: Number of times pregnant
# Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
# BloodPressure: Diastolic blood pressure (mm Hg)
# SkinThickness: Triceps skin fold thickness (mm)
# Insulin: 2-Hour serum insulin (mu U/ml)
# BMI: Body mass index (weight in kg/(height in m)^2)
# DiabetesPedigreeFunction: Diabetes pedigree function
# Age: Age (years)
# Outcome: Diabetes or No Diabetes

# Pairwise Pearson correlations between all nine columns.
# This runs while every column (including Outcome) is still numeric.
PIMA_Cor_Matrix <- cor(x = PIMA, method = "pearson")
PIMA_Cor_Matrix
##                          Pregnancies    Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000 0.12945867    0.14128198   -0.08167177
## Glucose                   0.12945867 1.00000000    0.15258959    0.05732789
## BloodPressure             0.14128198 0.15258959    1.00000000    0.20737054
## SkinThickness            -0.08167177 0.05732789    0.20737054    1.00000000
## Insulin                  -0.07353461 0.33135711    0.08893338    0.43678257
## BMI                       0.01768309 0.22107107    0.28180529    0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730    0.04126495    0.18392757
## Age                       0.54434123 0.26351432    0.23952795   -0.11397026
## Outcome                   0.22189815 0.46658140    0.06506836    0.07475223
##                              Insulin        BMI DiabetesPedigreeFunction
## Pregnancies              -0.07353461 0.01768309              -0.03352267
## Glucose                   0.33135711 0.22107107               0.13733730
## BloodPressure             0.08893338 0.28180529               0.04126495
## SkinThickness             0.43678257 0.39257320               0.18392757
## Insulin                   1.00000000 0.19785906               0.18507093
## BMI                       0.19785906 1.00000000               0.14064695
## DiabetesPedigreeFunction  0.18507093 0.14064695               1.00000000
## Age                      -0.04216295 0.03624187               0.03356131
## Outcome                   0.13054795 0.29269466               0.17384407
##                                  Age    Outcome
## Pregnancies               0.54434123 0.22189815
## Glucose                   0.26351432 0.46658140
## BloodPressure             0.23952795 0.06506836
## SkinThickness            -0.11397026 0.07475223
## Insulin                  -0.04216295 0.13054795
## BMI                       0.03624187 0.29269466
## DiabetesPedigreeFunction  0.03356131 0.17384407
## Age                       1.00000000 0.23835598
## Outcome                   0.23835598 1.00000000
# Column-wise overview of the data, followed by summaries of
# Age, Pregnancies and BMI
glimpse(PIMA)
## Rows: 768
## Columns: 9
## $ Pregnancies              <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, …
## $ Glucose                  <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125…
## $ BloodPressure            <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92, 74…
## $ SkinThickness            <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, 0, …
## $ Insulin                  <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, 0, …
## $ BMI                      <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.…
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.2…
## $ Age                      <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 3…
## $ Outcome                  <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, …
summary(PIMA[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   24.00   29.00   33.24   41.00   81.00
summary(PIMA[["Pregnancies"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.845   6.000  17.000
# NOTE(review): the minimum BMI below is 0.00, which looks like a placeholder
# for a missing measurement rather than a real value -- confirm.
summary(PIMA[["BMI"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.30   32.00   31.99   36.60   67.10
#table(PIMA$DiabetesPedigreeFunction)

# Recode Outcome as a factor: 1 = Diabetes, 0 = No Diabetes
PIMA$Outcome <- factor(
  PIMA$Outcome,
  levels = c(1, 0),
  labels = c("Diabetes", "No_Diabetes")
)

# Bin BMI into three ordered categories.
# The top break must be Inf: with the previous upper break of 30, every BMI
# above 30 fell outside all intervals and was silently turned into NA, even
# though the last category is labelled ">25".
# Intervals are right-closed: (0, 18], (18, 25], (25, Inf).
# A BMI of exactly 0 still becomes NA because it lies outside (0, 18].
# NOTE(review): 0 looks like a placeholder for a missing measurement -- confirm.
PIMA$BMI <- cut(
  PIMA$BMI,
  breaks = c(0, 18, 25, Inf),
  labels = c("<18", "18-25", ">25")
)

# Frequency of each BMI category (an NA count is shown when NAs are present).
# This replaces a raw dump of all 768 values that was mislabelled as a
# histogram; the recorded dump was removed because it showed the old,
# incorrectly binned result.
table(PIMA$BMI, useNA = "ifany")
# Counts per Outcome class (Diabetes / No Diabetes)
table(PIMA[["Outcome"]])
## 
##    Diabetes No_Diabetes 
##         268         500
# Parallel coordinate plot of the first four columns, coloured by Outcome
ggparcoord(data = PIMA, columns = 1:4, groupColumn = "Outcome")

# Bar chart: number of observations per Outcome class
ggplot(data = PIMA, mapping = aes(x = Outcome)) +
  geom_bar() +
  labs(title = "Frequency of Outcome")

# Bar chart: number of observations per BMI category
ggplot(data = PIMA, mapping = aes(x = BMI)) +
  geom_bar() +
  labs(title = "Frequency of BMI")

# Scatterplot of Age vs. number of Pregnancies
# The `+` has to end the first line: without it R evaluates ggplot() and
# geom_point() as two separate statements, which draws an empty plot and
# prints the unused layer object to the console instead of adding the points.
ggplot(PIMA, aes(x = Age, y = Pregnancies)) +
  geom_point(size = 1.2, color = "red", shape = 21)
# Pairwise plot matrix of columns 2 through 8 (Glucose to Age);
# Pregnancies and Outcome are left out
ggpairs(select(PIMA, Glucose:Age))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 476 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).
## Removed 476 rows containing missing values or values outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Parallel coordinate plot of columns 2 through 8, one line per row,
# coloured by Outcome
ggparcoord(
  data = PIMA,
  columns = 2:8,
  groupColumn = "Outcome"
)