library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
# Load the diabetes data set from a local CSV file into `diabetes`.
diabetes <- read_csv("C:/Users/HP/Documents/DataSet/Diabetes/diabetes.csv")
## Parsed with column specification:
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
## Note: replace the path above with the location of the file on your own machine. If in doubt, right-click the file, copy its path, and change the backslashes to forward slashes after pasting.
# Show the first six rows, then list every column with its type and leading values.
head(diabetes, n = 6L)
dplyr::glimpse(diabetes)
## Observations: 768
## Variables: 9
## $ Pregnancies <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, ...
## $ Glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, ...
## $ BloodPressure <dbl> 72, 66, 64, 66, 40, 74, 50, 0, 70, 96, 92,...
## $ SkinThickness <dbl> 35, 29, 0, 23, 35, 0, 32, 0, 45, 0, 0, 0, ...
## $ Insulin <dbl> 0, 0, 0, 94, 168, 0, 88, 0, 543, 0, 0, 0, ...
## $ BMI <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, ...
## $ DiabetesPedigreeFunction <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, ...
## $ Age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30...
## $ Outcome <dbl> 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, ...
# Per-column minimum, quartiles, median, mean and maximum.
diabetes %>% summary()
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Count the missing (NA) cells across the whole data set.
# NOTE(review): the summary output in this file shows a minimum of 0 for
# Glucose, BloodPressure, SkinThickness, Insulin and BMI. Those zeros may be
# unrecorded values rather than real measurements, so an NA count of zero does
# not by itself show the data is clean - confirm before modelling.
diabetes %>% is.na() %>% sum()
## [1] 0
Since our data is clean, the next step is to visualize the dataset. This will give us a first look at the relationships within the data.
# Bar chart of how often each Pregnancies value occurs, one panel per Outcome value.
diabetes %>%
  ggplot(aes(x = Pregnancies)) +
  geom_bar() +
  facet_wrap(~Outcome, ncol = 2)
# Bar chart of how often each Glucose value occurs, one panel per Outcome value.
ggplot(diabetes, aes(x = Glucose)) +
  geom_bar() +
  facet_wrap(~Outcome, ncol = 2)
# Pregnancies against Age with a fitted linear trend, in one stacked panel per
# Outcome value.
ggplot(data = diabetes, mapping = aes(x = Age, y = Pregnancies)) +
  geom_line() +
  stat_smooth(method = "lm") +
  facet_wrap(~Outcome, ncol = 1)
## `geom_smooth()` using formula 'y ~ x'
##coord_cartesian(xlim = c(min(wine$alcohol), quantile(wine$alcohol, .99)),
##ylim = c(min(wine$density), quantile(wine$density, .99))) +
Bivariate Relationships
# Glucose summary for the diabetic respondents (Outcome == 1), computed
# separately for each number of pregnancies.
cat("Diabetic Respondents: \n")
## Diabetic Respondents:
diabetes %>%
  dplyr::filter(Outcome == 1) %>%
  with(by(Glucose, Pregnancies, summary))
## Pregnancies: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 95.0 123.2 138.0 144.2 174.5 198.0
## ------------------------------------------------------------
## Pregnancies: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 88.0 122.0 133.0 143.8 172.0 199.0
## ------------------------------------------------------------
## Pregnancies: 2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 90.0 106.5 134.0 135.5 155.0 197.0
## ------------------------------------------------------------
## Pregnancies: 3
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 78.0 129.0 158.0 148.4 173.0 193.0
## ------------------------------------------------------------
## Pregnancies: 4
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 95.0 124.0 142.0 139.9 152.0 184.0
## ------------------------------------------------------------
## Pregnancies: 5
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 115.0 136.0 131.2 162.0 189.0
## ------------------------------------------------------------
## Pregnancies: 6
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 118.0 129.5 132.4 151.5 195.0
## ------------------------------------------------------------
## Pregnancies: 7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 97.0 114.0 152.0 148.8 181.0 196.0
## ------------------------------------------------------------
## Pregnancies: 8
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 121.0 152.5 150.0 180.5 197.0
## ------------------------------------------------------------
## Pregnancies: 9
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 102.0 124.0 148.5 144.9 164.0 184.0
## ------------------------------------------------------------
## Pregnancies: 10
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 90.0 108.8 120.0 125.6 143.2 168.0
## ------------------------------------------------------------
## Pregnancies: 11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 111.0 127.5 136.0 134.0 140.5 155.0
## ------------------------------------------------------------
## Pregnancies: 12
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 84.0 90.0 116.0 116.8 142.8 151.0
## ------------------------------------------------------------
## Pregnancies: 13
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 104.0 126.0 129.0 133.8 152.0 158.0
## ------------------------------------------------------------
## Pregnancies: 14
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 118.8 137.5 137.5 156.2 175.0
## ------------------------------------------------------------
## Pregnancies: 15
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 136 136 136 136 136 136
## ------------------------------------------------------------
## Pregnancies: 17
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 163 163 163 163 163 163
## Within the subset of the data where Outcome is 1, summarise Glucose separately for each value of Pregnancies.
# Print a header for the non-diabetic (Outcome == 0) section.
cat("Non-Diabetes Respondents: \n")
## Non-Diabetes Respondents:
# Fit Pregnancies ~ Outcome on the rows where Outcome == 0 and print the model
# summary.
# NOTE(review): Outcome is constant within this subset, so its coefficient
# cannot be estimated. The printed summary reports it as NA ("1 not defined
# because of singularities") and only the intercept is fitted. Confirm whether
# a per-Pregnancies Glucose summary like the diabetic section, or a model on
# the full data, was intended here.
summary(lm(formula = Pregnancies ~ Outcome, data = subset(diabetes, Outcome == 0)))
##
## Call:
## lm(formula = Pregnancies ~ Outcome, data = subset(diabetes, Outcome ==
## 0))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.298 -2.298 -1.298 1.702 9.702
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2980 0.1349 24.44 <2e-16 ***
## Outcome NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.017 on 499 degrees of freedom
# Pearson correlation matrix of the predictors for the non-diabetic respondents
# (Outcome == 0). The Outcome column is dropped because it is constant within
# this subset.
# `method` is an argument of cor(), so it is passed to cor() rather than inside
# the subset() call, where it would be silently ignored.
corr <- cor(
  subset(diabetes, Outcome == 0, select = -Outcome),
  method = "pearson"
)
# Plot absolute values so the chart shows the strength of each correlation;
# the signed values are printed below.
corrplot.mixed(abs(corr))
corr
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.09868452 0.13309586 -0.11833990
## Glucose 0.09868452 1.00000000 0.19279456 0.01601513
## BloodPressure 0.13309586 0.19279456 1.00000000 0.18707162
## SkinThickness -0.11833990 0.01601513 0.18707162 1.00000000
## Insulin -0.13198606 0.35295698 0.07462648 0.41278982
## BMI 0.01649507 0.13174900 0.36317809 0.43860594
## DiabetesPedigreeFunction -0.07995055 0.09554795 0.02729154 0.09518116
## Age 0.57277616 0.22801775 0.21469388 -0.16378832
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.13198606 0.01649507 -0.07995055
## Glucose 0.35295698 0.13174900 0.09554795
## BloodPressure 0.07462648 0.36317809 0.02729154
## SkinThickness 0.41278982 0.43860594 0.09518116
## Insulin 1.00000000 0.25420153 0.22738532
## BMI 0.25420153 1.00000000 0.07066436
## DiabetesPedigreeFunction 0.22738532 0.07066436 1.00000000
## Age -0.14923353 0.03606979 0.04166504
## Age
## Pregnancies 0.57277616
## Glucose 0.22801775
## BloodPressure 0.21469388
## SkinThickness -0.16378832
## Insulin -0.14923353
## BMI 0.03606979
## DiabetesPedigreeFunction 0.04166504
## Age 1.00000000
The end-user does not really understand your 1 and 0 outcomes. Using the previous tutorials, adjust the ‘Outcome’ feature so that 0 becomes ‘No’ and 1 becomes ‘Yes’, and then update your code accordingly.