# Load the dataset ----
# Path to the CSV file; the variable name `t` is kept unchanged in case
# other parts of the script refer to it.
t <- "C:/Users/Q-Anh\\Desktop/Prediction Model - NVT Prof/Dataset/Diabetes data.csv"
db <- read.csv(file = t, header = TRUE)

# Derive outcome variables ----
# diab: 1 when `diabetes` is "Yes", 0 for every other value.
db$diab <- ifelse(db$diabetes == "Yes", yes = 1, no = 0)
# group: text label corresponding to the 0/1 indicator.
db$group <- ifelse(db$diab == 1, "diabetes", "non-diabetes")

# Descriptive statistics ----
library(DescTools)
# NOTE(review): the recorded output below prints counts in scientific
# notation (e.g. 3e+03); this may be a consequence of digits = 1 -- confirm.
Desc(db, maxrows = 20, digits = 1)
## -------------------------------------------------------------------------
## Describe db (data.frame):
##
## data.frame: 3165 obs. of 16 variables
##
## Nr ColName Class NAs Levels
## 1 id factor . (973): 1-1, 2-1,000, 3-10,
## 4-100, 5-101, ...
## 2 age integer .
## 3 gender factor . (2): 1-Female, 2-Male
## 4 height numeric 7 (0.2%)
## 5 weight numeric 2 (0.1%)
## 6 waist numeric 11 (0.3%)
## 7 hip numeric 11 (0.3%)
## 8 sysbp integer 29 (0.9%)
## 9 diabp integer 29 (0.9%)
## 10 active integer .
## 11 hypertension integer .
## 12 bmi numeric 9 (0.3%)
## 13 whr numeric 11 (0.3%)
## 14 diabetes factor . (3): 1-IFG, 2-Normal,
## 3-Yes
## 15 diab numeric .
## 16 group character .
##
##
## -------------------------------------------------------------------------
## 1 - id (factor)
##
## length n NAs unique levels dupes
## 3e+03 3e+03 0 1e+03 1e+03 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 103 5e+00 0.2% 5e+00 0.2%
## 2 104 5e+00 0.2% 1e+01 0.3%
## 3 105 5e+00 0.2% 2e+01 0.5%
## 4 107 5e+00 0.2% 2e+01 0.6%
## 5 108 5e+00 0.2% 2e+01 0.8%
## 6 110 5e+00 0.2% 3e+01 0.9%
## 7 112 5e+00 0.2% 4e+01 1.1%
## 8 114 5e+00 0.2% 4e+01 1.3%
## 9 116 5e+00 0.2% 4e+01 1.4%
## 10 117 5e+00 0.2% 5e+01 1.6%
## 11 119 5e+00 0.2% 6e+01 1.7%
## 12 12 5e+00 0.2% 6e+01 1.9%
## 13 120 5e+00 0.2% 6e+01 2.1%
## 14 123 5e+00 0.2% 7e+01 2.2%
## 15 127 5e+00 0.2% 8e+01 2.4%
## 16 129 5e+00 0.2% 8e+01 2.5%
## 17 137 5e+00 0.2% 8e+01 2.7%
## 18 138 5e+00 0.2% 9e+01 2.8%
## 19 139 5e+00 0.2% 1e+02 3.0%
## 20 141 5e+00 0.2% 1e+02 3.2%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 2 - age (integer)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 0 6e+01 0 5.2e+01 5.2e+01
## 100.0% 0.0% 0.0% 5.3e+01
##
## .05 .10 .25 median .75 .90 .95
## 3.4e+01 3.7e+01 4.3e+01 5.1e+01 6.1e+01 6.9e+01 7.4e+01
##
## range sd vcoef mad IQR skew kurt
## 6.3e+01 1.2e+01 2.3e-01 1.3e+01 1.8e+01 3.5e-01 -4.9e-01
##
##
## level freq perc cumfreq cumperc
## 1 30 2e+01 0.8% 2e+01 0.8%
## 2 31 3e+01 0.9% 5e+01 1.7%
## 3 32 4e+01 1.3% 9e+01 3.0%
## 4 33 4e+01 1.4% 1e+02 4.4%
## 5 34 6e+01 1.8% 2e+02 6.2%
## 6 35 4e+01 1.2% 2e+02 7.4%
## 7 36 5e+01 1.5% 3e+02 8.9%
## 8 37 5e+01 1.6% 3e+02 10.5%
## 9 38 6e+01 1.8% 4e+02 12.3%
## 10 39 6e+01 2.0% 4e+02 14.2%
## 11 40 9e+01 2.7% 5e+02 16.9%
## 12 41 9e+01 2.8% 6e+02 19.7%
## 13 42 8e+01 2.5% 7e+02 22.2%
## 14 43 9e+01 2.8% 8e+02 25.0%
## 15 44 1e+02 3.2% 9e+02 28.2%
## 16 45 8e+01 2.7% 1e+03 30.9%
## 17 46 9e+01 3.0% 1e+03 33.8%
## 18 47 1e+02 3.9% 1e+03 37.7%
## 19 48 1e+02 3.6% 1e+03 41.3%
## 20 49 1e+02 3.3% 1e+03 44.6%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 3 - gender (factor - dichotomous)
##
## length n NAs unique
## 3e+03 3e+03 0 2e+00
## 100.0% 0.0%
##
## freq perc lci9.50e-01 uci9.50e-01'
## Female 2e+03 68.3% 66.6% 69.9%
## Male 1e+03 31.7% 30.1% 33.4%
##
## ' 95%-CI Wilson
## -------------------------------------------------------------------------
## 4 - height (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 7e+00 6e+01 0 1.6e+02 1.6e+02
## 99.8% 0.2% 0.0% 1.6e+02
##
## .05 .10 .25 median .75 .90 .95
## 1.5e+02 1.5e+02 1.5e+02 1.6e+02 1.6e+02 1.7e+02 1.7e+02
##
## range sd vcoef mad IQR skew kurt
## 4.8e+01 7.3e+00 4.7e-02 7.4e+00 1.0e+01 2.9e-01 -2.1e-01
##
##
## level freq perc cumfreq cumperc
## 1 132 1e+00 0.0% 1e+00 0.0%
## 2 133 1e+00 0.0% 2e+00 0.1%
## 3 135 1e+00 0.0% 3e+00 0.1%
## 4 137 1e+00 0.0% 4e+00 0.1%
## 5 138 2e+00 0.1% 6e+00 0.2%
## 6 139 5e+00 0.2% 1e+01 0.3%
## 7 140 2e+01 0.5% 3e+01 0.9%
## 8 141 6e+00 0.2% 3e+01 1.0%
## 9 142 1e+01 0.4% 4e+01 1.4%
## 10 143 3e+01 0.9% 8e+01 2.4%
## 11 144 2e+01 0.7% 1e+02 3.1%
## 12 144.5 1e+00 0.0% 1e+02 3.1%
## 13 145 6e+01 1.8% 2e+02 4.9%
## 14 146 5e+01 1.5% 2e+02 6.5%
## 15 147 8e+01 2.5% 3e+02 9.0%
## 16 147.5 1e+00 0.0% 3e+02 9.0%
## 17 148 1e+02 3.3% 4e+02 12.3%
## 18 149 1e+02 3.5% 5e+02 15.9%
## 19 150 2e+02 6.6% 7e+02 22.5%
## 20 151 1e+02 4.6% 9e+02 27.1%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 5 - weight (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 2e+00 1e+02 0 5.9e+01 5.8e+01
## 99.9% 0.1% 0.0% 5.9e+01
##
## .05 .10 .25 median .75 .90 .95
## 4.4e+01 4.6e+01 5.2e+01 5.8e+01 6.5e+01 7.2e+01 7.6e+01
##
## range sd vcoef mad IQR skew kurt
## 8.6e+01 1.0e+01 1.8e-01 1.0e+01 1.4e+01 6.0e-01 8.9e-01
##
##
## level freq perc cumfreq cumperc
## 1 29 1e+00 0.0% 1e+00 0.0%
## 2 31 1e+00 0.0% 2e+00 0.1%
## 3 32 1e+00 0.0% 3e+00 0.1%
## 4 34 1e+00 0.0% 4e+00 0.1%
## 5 35 4e+00 0.1% 8e+00 0.3%
## 6 36 7e+00 0.2% 2e+01 0.5%
## 7 36.5 1e+00 0.0% 2e+01 0.5%
## 8 37 3e+00 0.1% 2e+01 0.6%
## 9 38 7e+00 0.2% 3e+01 0.8%
## 10 38.5 1e+00 0.0% 3e+01 0.9%
## 11 39 2e+01 0.5% 4e+01 1.4%
## 12 40 3e+01 0.9% 7e+01 2.2%
## 13 40.5 3e+00 0.1% 7e+01 2.3%
## 14 41 1e+01 0.4% 9e+01 2.8%
## 15 41.5 1e+00 0.0% 9e+01 2.8%
## 16 42 2e+01 0.6% 1e+02 3.4%
## 17 42.5 4e+00 0.1% 1e+02 3.5%
## 18 43 3e+01 0.9% 1e+02 4.4%
## 19 43.5 3e+00 0.1% 1e+02 4.5%
## 20 44 4e+01 1.2% 2e+02 5.7%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 6 - waist (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 1e+01 8e+01 0 8.2e+01 8.2e+01
## 99.7% 0.3% 0.0% 8.2e+01
##
## .05 .10 .25 median .75 .90 .95
## 6.5e+01 6.9e+01 7.4e+01 8.2e+01 8.9e+01 9.5e+01 9.9e+01
##
## range sd vcoef mad IQR skew kurt
## 6.6e+01 1.0e+01 1.3e-01 1.2e+01 1.5e+01 2.0e-01 -8.1e-02
##
##
## level freq perc cumfreq cumperc
## 1 54 2e+00 0.1% 2e+00 0.1%
## 2 55 1e+00 0.0% 3e+00 0.1%
## 3 56 2e+00 0.1% 5e+00 0.2%
## 4 57 4e+00 0.1% 9e+00 0.3%
## 5 58 3e+00 0.1% 1e+01 0.4%
## 6 59 8e+00 0.3% 2e+01 0.6%
## 7 60 2e+01 0.5% 4e+01 1.1%
## 8 61 1e+01 0.4% 5e+01 1.6%
## 9 62 2e+01 0.5% 7e+01 2.1%
## 10 63 2e+01 0.7% 9e+01 2.8%
## 11 64 3e+01 1.0% 1e+02 3.7%
## 12 65 5e+01 1.6% 2e+02 5.3%
## 13 66 5e+01 1.5% 2e+02 6.8%
## 14 67 6e+01 1.7% 3e+02 8.5%
## 15 68 4e+01 1.4% 3e+02 9.9%
## 16 69 6e+01 1.7% 4e+02 11.7%
## 17 70 8e+01 2.6% 5e+02 14.3%
## 18 71 8e+01 2.4% 5e+02 16.7%
## 19 71.5 1e+00 0.0% 5e+02 16.7%
## 20 72 8e+01 2.6% 6e+02 19.3%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 7 - hip (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 1e+01 6e+01 0 9.4e+01 9.4e+01
## 99.7% 0.3% 0.0% 9.5e+01
##
## .05 .10 .25 median .75 .90 .95
## 8.3e+01 8.5e+01 8.9e+01 9.4e+01 1.0e+02 1.0e+02 1.1e+02
##
## range sd vcoef mad IQR skew kurt
## 7.4e+01 7.7e+00 8.1e-02 7.4e+00 1.1e+01 1.5e-01 6.4e-01
##
##
## level freq perc cumfreq cumperc
## 1 56 1e+00 0.0% 1e+00 0.0%
## 2 58 1e+00 0.0% 2e+00 0.1%
## 3 62 1e+00 0.0% 3e+00 0.1%
## 4 64 1e+00 0.0% 4e+00 0.1%
## 5 67 1e+00 0.0% 5e+00 0.2%
## 6 69 1e+00 0.0% 6e+00 0.2%
## 7 70 1e+00 0.0% 7e+00 0.2%
## 8 71 1e+00 0.0% 8e+00 0.3%
## 9 72 1e+00 0.0% 9e+00 0.3%
## 10 73 2e+00 0.1% 1e+01 0.3%
## 11 74 2e+00 0.1% 1e+01 0.4%
## 12 75 3e+00 0.1% 2e+01 0.5%
## 13 76 2e+00 0.1% 2e+01 0.6%
## 14 77 8e+00 0.3% 3e+01 0.8%
## 15 78 7e+00 0.2% 3e+01 1.0%
## 16 79 2e+01 0.5% 5e+01 1.6%
## 17 80 3e+01 1.1% 8e+01 2.6%
## 18 81 3e+01 0.9% 1e+02 3.5%
## 19 82 4e+01 1.4% 2e+02 4.9%
## 20 83 6e+01 1.9% 2e+02 6.8%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 8 - sysbp (integer)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 3e+01 6e+01 0 1.4e+02 1.4e+02
## 99.1% 0.9% 0.0% 1.4e+02
##
## .05 .10 .25 median .75 .90 .95
## 1.0e+02 1.1e+02 1.2e+02 1.4e+02 1.5e+02 1.6e+02 1.8e+02
##
## range sd vcoef mad IQR skew kurt
## 1.5e+02 2.3e+01 1.7e-01 3.0e+01 3.0e+01 4.3e-01 5.3e-02
##
##
## level freq perc cumfreq cumperc
## 1 90 3e+01 1.0% 3e+01 1.0%
## 2 95 5e+00 0.2% 4e+01 1.1%
## 3 100 2e+02 6.1% 2e+02 7.3%
## 4 102 1e+00 0.0% 2e+02 7.3%
## 5 104 1e+00 0.0% 2e+02 7.3%
## 6 105 2e+00 0.1% 2e+02 7.4%
## 7 110 3e+02 10.7% 6e+02 18.1%
## 8 112 1e+00 0.0% 6e+02 18.2%
## 9 115 6e+00 0.2% 6e+02 18.4%
## 10 116 1e+00 0.0% 6e+02 18.4%
## 11 117 2e+00 0.1% 6e+02 18.5%
## 12 120 6e+02 17.9% 1e+03 36.4%
## 13 121 1e+00 0.0% 1e+03 36.4%
## 14 122 1e+00 0.0% 1e+03 36.4%
## 15 124 3e+00 0.1% 1e+03 36.5%
## 16 125 2e+01 0.6% 1e+03 37.2%
## 17 126 2e+00 0.1% 1e+03 37.2%
## 18 128 1e+00 0.0% 1e+03 37.3%
## 19 129 1e+00 0.0% 1e+03 37.3%
## 20 130 4e+02 11.9% 2e+03 49.2%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 9 - diabp (integer)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 3e+01 4e+01 0 8.2e+01 8.1e+01
## 99.1% 0.9% 0.0% 8.2e+01
##
## .05 .10 .25 median .75 .90 .95
## 6.0e+01 7.0e+01 7.0e+01 8.0e+01 9.0e+01 1.0e+02 1.0e+02
##
## range sd vcoef mad IQR skew kurt
## 9.0e+01 1.2e+01 1.5e-01 1.5e+01 2.0e+01 4.3e-01 5.5e-01
##
##
## level freq perc cumfreq cumperc
## 1 50 2e+00 0.1% 2e+00 0.1%
## 2 57 1e+00 0.0% 3e+00 0.1%
## 3 60 3e+02 8.5% 3e+02 8.6%
## 4 65 7e+00 0.2% 3e+02 8.9%
## 5 67 4e+00 0.1% 3e+02 9.0%
## 6 70 6e+02 17.9% 8e+02 26.9%
## 7 71 1e+00 0.0% 8e+02 26.9%
## 8 72 2e+00 0.1% 8e+02 27.0%
## 9 73 2e+00 0.1% 8e+02 27.0%
## 10 75 1e+01 0.4% 9e+02 27.5%
## 11 76 3e+00 0.1% 9e+02 27.6%
## 12 77 2e+00 0.1% 9e+02 27.6%
## 13 78 3e+00 0.1% 9e+02 27.7%
## 14 80 1e+03 37.3% 2e+03 65.0%
## 15 82 1e+00 0.0% 2e+03 65.0%
## 16 83 1e+00 0.0% 2e+03 65.1%
## 17 84 3e+00 0.1% 2e+03 65.1%
## 18 85 3e+01 0.8% 2e+03 66.0%
## 19 86 4e+00 0.1% 2e+03 66.1%
## 20 87 3e+00 0.1% 2e+03 66.2%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 10 - active (integer - dichotomous)
##
## length n NAs unique
## 3e+03 3e+03 0 2e+00
## 100.0% 0.0%
##
## freq perc lci9.50e-01 uci9.50e-01'
## 0 1e+03 46.9% 45.2% 48.6%
## 1 2e+03 53.1% 51.4% 54.8%
##
## ' 95%-CI Wilson
## -------------------------------------------------------------------------
## 11 - hypertension (integer - dichotomous)
##
## length n NAs unique
## 3e+03 3e+03 0 2e+00
## 100.0% 0.0%
##
## freq perc lci9.50e-01 uci9.50e-01'
## 0 1e+03 42.0% 40.3% 43.7%
## 1 2e+03 58.0% 56.3% 59.7%
##
## ' 95%-CI Wilson
## -------------------------------------------------------------------------
## 12 - bmi (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 9e+00 8e+02 0 2.4e+01 2.4e+01
## 99.7% 0.3% 0.0% 2.4e+01
##
## .05 .10 .25 median .75 .90 .95
## 1.9e+01 2.0e+01 2.1e+01 2.4e+01 2.6e+01 2.9e+01 3.0e+01
##
## range sd vcoef mad IQR skew kurt
## 3.0e+01 3.6e+00 1.5e-01 3.5e+00 4.8e+00 5.2e-01 9.2e-01
##
##
## level freq perc cumfreq cumperc
## 1 13.34 1e+00 0.0% 1e+00 0.0%
## 2 13.42 1e+00 0.0% 2e+00 0.1%
## 3 13.6 1e+00 0.0% 3e+00 0.1%
## 4 13.62 1e+00 0.0% 4e+00 0.1%
## 5 13.96 1e+00 0.0% 5e+00 0.2%
## 6 14.33 1e+00 0.0% 6e+00 0.2%
## 7 14.84 1e+00 0.0% 7e+00 0.2%
## 8 15.06 1e+00 0.0% 8e+00 0.3%
## 9 15.38 1e+00 0.0% 9e+00 0.3%
## 10 15.58 1e+00 0.0% 1e+01 0.3%
## 11 15.61 1e+00 0.0% 1e+01 0.3%
## 12 15.65 1e+00 0.0% 1e+01 0.4%
## 13 15.79 2e+00 0.1% 1e+01 0.4%
## 14 15.82 1e+00 0.0% 2e+01 0.5%
## 15 15.89 1e+00 0.0% 2e+01 0.5%
## 16 15.99 1e+00 0.0% 2e+01 0.5%
## 17 16.02 1e+00 0.0% 2e+01 0.6%
## 18 16.03 1e+00 0.0% 2e+01 0.6%
## 19 16.07 1e+00 0.0% 2e+01 0.6%
## 20 16.16 1e+00 0.0% 2e+01 0.7%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 13 - whr (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 1e+01 5e+01 0 8.7e-01 8.7e-01
## 99.7% 0.3% 0.0% 8.7e-01
##
## .05 .10 .25 median .75 .90 .95
## 7.5e-01 7.7e-01 8.1e-01 8.7e-01 9.2e-01 9.7e-01 9.9e-01
##
## range sd vcoef mad IQR skew kurt
## 7.5e-01 7.7e-02 8.8e-02 7.4e-02 1.1e-01 2.4e-01 5.3e-01
##
##
## level freq perc cumfreq cumperc
## 1 0.61 1e+00 0.0% 1e+00 0.0%
## 2 0.64 2e+00 0.1% 3e+00 0.1%
## 3 0.65 1e+00 0.0% 4e+00 0.1%
## 4 0.66 3e+00 0.1% 7e+00 0.2%
## 5 0.67 1e+00 0.0% 8e+00 0.3%
## 6 0.68 2e+00 0.1% 1e+01 0.3%
## 7 0.69 6e+00 0.2% 2e+01 0.5%
## 8 0.7 1e+01 0.3% 3e+01 0.9%
## 9 0.71 1e+01 0.3% 4e+01 1.2%
## 10 0.72 4e+01 1.2% 8e+01 2.4%
## 11 0.73 3e+01 0.9% 1e+02 3.3%
## 12 0.74 5e+01 1.5% 2e+02 4.8%
## 13 0.75 5e+01 1.6% 2e+02 6.3%
## 14 0.76 8e+01 2.6% 3e+02 9.0%
## 15 0.77 9e+01 2.9% 4e+02 11.8%
## 16 0.78 9e+01 2.9% 5e+02 14.7%
## 17 0.79 1e+02 3.4% 6e+02 18.1%
## 18 0.8 1e+02 4.1% 7e+02 22.2%
## 19 0.81 1e+02 4.1% 8e+02 26.3%
## 20 0.82 1e+02 3.9% 1e+03 30.1%
## ... etc.
## [list output truncated]
## -------------------------------------------------------------------------
## 14 - diabetes (factor)
##
## length n NAs unique levels dupes
## 3e+03 3e+03 0 3e+00 3e+00 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Normal 3e+03 84.7% 3e+03 84.7%
## 2 IFG 2e+02 7.7% 3e+03 92.4%
## 3 Yes 2e+02 7.6% 3e+03 100.0%
## -------------------------------------------------------------------------
## 15 - diab (numeric)
##
## length n NAs unique 0s mean meanCI
## 3e+03 3e+03 0 2e+00 3e+03 7.6e-02 6.7e-02
## 100.0% 0.0% 92.4% 8.6e-02
##
## .05 .10 .25 median .75 .90 .95
## 0.0 0.0 0.0 0.0 0.0 0.0 1.0e+00
##
## range sd vcoef mad IQR skew kurt
## 1.0e+00 2.7e-01 3.5e+00 0.0 0.0 3.2e+00 8.2e+00
##
##
## level freq perc cumfreq cumperc
## 1 0 3e+03 92.4% 3e+03 92.4%
## 2 1 2e+02 7.6% 3e+03 100.0%
## -------------------------------------------------------------------------
## 16 - group (character - dichotomous)
##
## length n NAs unique
## 3e+03 3e+03 0 2e+00
## 100.0% 0.0%
##
## freq perc lci9.50e-01 uci9.50e-01'
## non-diabetes 3e+03 92.4% 91.4% 93.2%
## diabetes 2e+02 7.6% 6.8% 8.6%
##
## ' 95%-CI Wilson
# Box plot of BMI split by diabetes group. The formula interface is used so
# that BMI is compared between the groups; passing db$diab and db$bmi as two
# separate vectors would instead draw the 0/1 indicator and BMI as two
# unrelated boxes on one shared axis.
boxplot(bmi ~ group, data = db,
        xlab = "Diabetes status",
        ylab = "BMI")

# Scatter plot of WHR against BMI.
plot(db$bmi, db$whr,
     xlab = "BMI",
     ylab = "WHR")