library(readr)
## Warning: package 'readr' was built under R version 4.0.3
library(treeheatr)
## Warning: package 'treeheatr' was built under R version 4.0.3
library(partykit)
## Warning: package 'partykit' was built under R version 4.0.3
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 4.0.3
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.0.3
library(explore)
## Warning: package 'explore' was built under R version 4.0.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(shiny)
## Warning: package 'shiny' was built under R version 4.0.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Path to the diabetes CSV. Defaults to the original author's local copy, but
# can be overridden by setting the DIABETES_CSV environment variable so the
# script is not tied to a single machine's directory layout.
diabetes_path <- Sys.getenv("DIABETES_CSV", unset = "")
if (!nzchar(diabetes_path)) {
  diabetes_path <- "C:/Users/Nikhil/Desktop/diabetes.csv"
}
# Column types are guessed by readr; the guessed specification is echoed in
# the captured output that follows.
diabetes <- read_csv(diabetes_path)
##
## -- Column specification --------------------------------------------------------
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
# Compact structural overview: one line per column with its type and the
# first few values.
str(object = diabetes)
## tibble [768 x 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Pregnancies : num [1:768] 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : num [1:768] 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : num [1:768] 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : num [1:768] 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : num [1:768] 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num [1:768] 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num [1:768] 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : num [1:768] 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : num [1:768] 1 0 1 0 1 0 1 0 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. Pregnancies = col_double(),
## .. Glucose = col_double(),
## .. BloodPressure = col_double(),
## .. SkinThickness = col_double(),
## .. Insulin = col_double(),
## .. BMI = col_double(),
## .. DiabetesPedigreeFunction = col_double(),
## .. Age = col_double(),
## .. Outcome = col_double()
## .. )
# Per-column summary statistics (min, quartiles, mean, max) for every
# variable in the data set.
summary(object = diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Report the size of the data set as (number of rows, number of columns).
c(nrow(diabetes), ncol(diabetes))
## [1] 768 9
# Outcome only takes the values 0 and 1, so store it as a categorical factor
# instead of a numeric column.
diabetes$Outcome <- as.factor(diabetes$Outcome)

# View() opens the spreadsheet-style data viewer. Only invoke it in an
# interactive session so the script still runs under Rscript or knitr.
if (interactive()) {
  View(diabetes)
}

# Drop rows that contain NA.
# NOTE(review): the summary() output above lists no NA's for any column, so
# this call currently removes nothing. The same output shows a minimum of 0
# for Glucose, BloodPressure, SkinThickness, Insulin and BMI; presumably those
# zeros stand for missing measurements -- confirm with the data source and, if
# so, recode them to NA before this step.
diabetes <- na.omit(diabetes)
## # A tibble: 9 x 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Pregnancies dbl 0 0 17 0 3.85 17
## 2 Glucose dbl 0 0 136 0 121. 199
## 3 BloodPressure dbl 0 0 47 0 69.1 122
## 4 SkinThickness dbl 0 0 51 0 20.5 99
## 5 Insulin dbl 0 0 186 0 79.8 846
## 6 BMI dbl 0 0 248 0 32.0 67.1
## 7 DiabetesPedigreeFunction dbl 0 0 517 0.08 0.47 2.42
## 8 Age dbl 0 0 52 21 33.2 81
## 9 Outcome fct 0 0 2 NA NA NA
## # A tibble: 1 x 8
## variable type na na_pct unique min mean max
## <chr> <chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Outcome fct 0 0 2 NA NA NA
## # A tibble: 0 x 8
## # ... with 8 variables: variable <chr>, type <chr>, na <int>, na_pct <dbl>,
## # unique <int>, min <dbl>, mean <dbl>, max <dbl>
# References: explore package vignette
# https://cran.r-project.org/web/packages/explore/vignettes/explore.html