Data preparation

# Loading packages

# suppressWarnings() silences warnings raised while the packages are attached;
# errors (e.g. a package that is not installed) still stop execution.
suppressWarnings({
  library(tidyverse)
  library(openintro)
})

Loading dataset

# Location of the raw CSV file (diabetes health indicators, BRFSS 2015) on GitHub
url <- "https://raw.githubusercontent.com/lburenkov/Diabetes/main/diabetes_012_health_indicators_BRFSS2015.csv"
# Download the CSV and parse it into a data frame
data <- read.csv(file = url)

Research question

Can we predict diabetes based on health indicators?

Cases

# dim() returns the shape of the dataset as c(rows, columns)
dimensions <- dim(data)

# Pull each count out of the shape vector by position
num_rows <- dimensions[1L]
num_columns <- dimensions[2L]

# Report both counts, one per line
cat("Number of rows:", num_rows, "\n")
## Number of rows: 253680
cat("Number of columns:", num_columns, "\n")
## Number of columns: 22

Data collection

The Behavioral Risk Factor Surveillance System (BRFSS) is a health-related telephone survey conducted annually by the CDC. Each year, the survey collects responses from over 400,000 Americans on health-related risk behaviors, chronic health conditions, and the use of preventive services. It has been conducted every year since 1984. The dataset used here is derived from the 2015 BRFSS survey and is available on Kaggle: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset

Type of study

This is an observational study.

Response

The response variable is `Diabetes_012`, which is categorical: it is stored as a number but takes only the codes 0, 1, and 2.

Explanatory

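The explanatory variables are the remaining 21 health-indicator columns. All of them are stored as numbers, but most are binary 0/1 indicators (e.g., `HighBP`, `Smoker`), several appear to be coded scales (e.g., `GenHlth`, `Age`, `Education`, `Income`), and only a few, such as `BMI`, are numerical measurements.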

Relevant summary statistics

# Print summary statistics for every column of the dataset.
# As the output below shows, each column is summarised as numeric
# (minimum, quartiles, median, mean, maximum).
summary(data)
##   Diabetes_012        HighBP         HighChol        CholCheck     
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :0.000   Median :0.0000   Median :1.0000  
##  Mean   :0.2969   Mean   :0.429   Mean   :0.4241   Mean   :0.9627  
##  3rd Qu.:0.0000   3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :2.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##       BMI            Smoker           Stroke        HeartDiseaseorAttack
##  Min.   :12.00   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000     
##  1st Qu.:24.00   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000     
##  Median :27.00   Median :0.0000   Median :0.00000   Median :0.00000     
##  Mean   :28.38   Mean   :0.4432   Mean   :0.04057   Mean   :0.09419     
##  3rd Qu.:31.00   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000     
##  Max.   :98.00   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000     
##   PhysActivity        Fruits          Veggies       HvyAlcoholConsump
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000   
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :0.0000   
##  Mean   :0.7565   Mean   :0.6343   Mean   :0.8114   Mean   :0.0562   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  AnyHealthcare     NoDocbcCost         GenHlth         MentHlth     
##  Min.   :0.0000   Min.   :0.00000   Min.   :1.000   Min.   : 0.000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:2.000   1st Qu.: 0.000  
##  Median :1.0000   Median :0.00000   Median :2.000   Median : 0.000  
##  Mean   :0.9511   Mean   :0.08418   Mean   :2.511   Mean   : 3.185  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:3.000   3rd Qu.: 2.000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :5.000   Max.   :30.000  
##     PhysHlth         DiffWalk           Sex              Age        
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 6.000  
##  Median : 0.000   Median :0.0000   Median :0.0000   Median : 8.000  
##  Mean   : 4.242   Mean   :0.1682   Mean   :0.4403   Mean   : 8.032  
##  3rd Qu.: 3.000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:10.000  
##  Max.   :30.000   Max.   :1.0000   Max.   :1.0000   Max.   :13.000  
##    Education        Income     
##  Min.   :1.00   Min.   :1.000  
##  1st Qu.:4.00   1st Qu.:5.000  
##  Median :5.00   Median :7.000  
##  Mean   :5.05   Mean   :6.054  
##  3rd Qu.:6.00   3rd Qu.:8.000  
##  Max.   :6.00   Max.   :8.000

LS0tDQp0aXRsZTogIlByb2plY3QgcHJvcG9zYWwiDQphdXRob3I6ICJMYXVyYSBQdWVibGEiDQpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCg0KIyMjIERhdGEgcHJlcGFyYXRpb24NCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9DQojTG9hZGluZyBwYWNrYWdlcw0KDQpzdXBwcmVzc1dhcm5pbmdzKHsNCiAgbGlicmFyeSh0aWR5dmVyc2UpDQogIGxpYnJhcnkob3BlbmludHJvKQ0KfSkNCg0KYGBgDQoNCkxvYWRpbmcgZGF0YXNldA0KYGBge3J9DQp1cmwgPC0gImh0dHBzOi8vcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbS9sYnVyZW5rb3YvRGlhYmV0ZXMvbWFpbi9kaWFiZXRlc18wMTJfaGVhbHRoX2luZGljYXRvcnNfQlJGU1MyMDE1LmNzdiIgICMgUmVwbGFjZSB3aXRoIHRoZSBVUkwgb2YgeW91ciBDU1YgZGF0YQ0KZGF0YSA8LSByZWFkLmNzdih1cmwpDQoNCmBgYA0KDQojIyMgUmVzZWFyY2ggcXVlc3Rpb24gDQpDYW4gd2UgcHJlZGljdCBkaWFiZXRlcyBiYXNlZCBvbiBoZWFsdGggaW5kaWNhdG9ycz8NCg0KIyMjIENhc2VzIA0KDQoNCmBgYHtyfQ0KIyBVc2luZyB0aGUgZGltKCkgZnVuY3Rpb24gdG8gZ2V0IHRoZSBudW1iZXIgb2Ygcm93cyBhbmQgY29sdW1ucw0KZGltZW5zaW9ucyA8LSBkaW0oZGF0YSkNCg0KIyBFeHRyYWN0aW5nIHRoZSBudW1iZXIgb2Ygcm93cyBhbmQgY29sdW1ucw0KbnVtX3Jvd3MgPC0gZGltZW5zaW9uc1sxXQ0KbnVtX2NvbHVtbnMgPC0gZGltZW5zaW9uc1syXQ0KDQojIFByaW50aW5nIHRoZSBudW1iZXIgb2Ygcm93cyBhbmQgY29sdW1ucw0KY2F0KCJOdW1iZXIgb2Ygcm93czoiLCBudW1fcm93cywgIlxuIikNCmNhdCgiTnVtYmVyIG9mIGNvbHVtbnM6IiwgbnVtX2NvbHVtbnMsICJcbiIpDQoNCmBgYA0KDQojIyMgRGF0YSBjb2xsZWN0aW9uIA0KVGhlIEJlaGF2aW9yYWwgUmlzayBGYWN0b3IgU3VydmVpbGxhbmNlIFN5c3RlbSAoQlJGU1MpIGlzIGEgaGVhbHRoLXJlbGF0ZWQgdGVsZXBob25lIHN1cnZleSB0aGF0IGlzIGNvbGxlY3RlZCBhbm51YWxseSBieSB0aGUgQ0RDLiBFYWNoIHllYXIsIHRoZSBzdXJ2ZXkgY29sbGVjdHMgcmVzcG9uc2VzIGZyb20gb3ZlciA0MDAsMDAwIEFtZXJpY2FucyBvbiBoZWFsdGgtcmVsYXRlZCByaXNrIGJlaGF2aW9ycywgY2hyb25pYyBoZWFsdGggY29uZGl0aW9ucywgYW5kIHRoZSB1c2Ugb2YgcHJldmVudGF0aXZlIHNlcnZpY2VzLiBJdCBoYXMgYmVlbiBjb25kdWN0ZWQgZXZlcnkgeWVhciBzaW5jZSAxOTg0Lg0KaHR0cHM6Ly93d3cua2FnZ2xlLmNvbS9kYXRhc2V0cy9hbGV4dGVib3VsL2RpYWJldGVzLWhlYWx0aC1pbmRpY2F0b3JzLWRhdGFzZXQNCg0KDQojIyMgVHlwZSBvZiBzdHVkeSANClRoaXMgaXMgYW4gb2JzZXJ2YXRpb25hbCBzdHVkeS4NCg0KIyMjIERhdGEgU291cmNlDQpEYXRhc2V0IGlzIGF2YWlsYWJsZSBpbiBLYWdnbGUgaHR0cHM6Ly93d3cua2FnZ2xlLmNv
bS9kYXRhc2V0cy9hbGV4dGVib3VsL2RpYWJldGVzLWhlYWx0aC1pbmRpY2F0b3JzLWRhdGFzZXQNCg0KIyMjIFJlc3BvbnNlDQpUaGUgcmVzcG9uc2UgdmFyaWFibGUgaXMgY2F0ZWdvcmljYWwuDQoNCg0KIyMjIEV4cGxhbmF0b3J5IA0KDQpUaGUgZXhwbGFuYXRvcnkgdmFyaWFibGUgaXMgbnVtZXJpY2FsLg0KDQojIyMgUmVsZXZhbnQgc3VtbWFyeSBzdGF0aXN0aWNzIA0KDQpgYGB7cn0NCiMgUHJpbnQgc3VtbWFyeSBzdGF0aXN0aWNzIGZvciB0aGUgZW50aXJlIGRhdGFzZXQNCnN1bW1hcnkoZGF0YSkNCmBgYA0KDQoNCg0KDQouLi4NCg0K