STROKE DATASET EDA

Packages needed for EDA

library(DataExplorer)
library(caTools)
library(tidyverse)

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Reading the dataset

# Load the stroke data. stringsAsFactors = TRUE is spelled out because R >= 4.0
# no longer converts text columns to factors by default, and the str() output
# shown below (gender, work_type, ... as Factor) relies on that conversion.
# NOTE(review): blank smoking_status entries are read as an empty factor level
# ("" in the str() output), not NA, so the NA checks below do not count them.
STROKE_DS <- read.csv("train_2v.csv", stringsAsFactors = TRUE)

Viewing structure of the dataset

# Open the data frame in the data viewer for a quick visual inspection
# (presumably only has an effect in an interactive session -- TODO confirm).
view(STROKE_DS)

# List the column names of the dataset.
names(STROKE_DS)

##  [1] "id"                "gender"            "age"              
##  [4] "hypertension"      "heart_disease"     "ever_married"     
##  [7] "work_type"         "Residence_type"    "avg_glucose_level"
## [10] "bmi"               "smoking_status"    "stroke"

names displays the names of the 12 variables in the stroke dataset.

The stroke dataset, openly available on Kaggle, has 43,400 instances and 12 variables.

# Number of rows (instances) and columns (variables).
dim(STROKE_DS)

## [1] 43400    12

The dimensions, as mentioned above, are 43,400 instances and 12 variables.

# Compact overview: the type of each column plus its first few values.
str(STROKE_DS)

## 'data.frame':    43400 obs. of  12 variables:
##  $ id               : int  30669 30468 16523 56543 46136 32257 52800 41413 15266 28674 ...
##  $ gender           : Factor w/ 3 levels "Female","Male",..: 2 2 1 1 2 1 1 1 1 1 ...
##  $ age              : num  3 58 8 70 14 47 52 75 32 74 ...
##  $ hypertension     : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ heart_disease    : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ ever_married     : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
##  $ work_type        : Factor w/ 5 levels "children","Govt_job",..: 1 4 4 4 3 4 4 5 4 5 ...
##  $ Residence_type   : Factor w/ 2 levels "Rural","Urban": 1 2 2 1 1 2 2 1 1 2 ...
##  $ avg_glucose_level: num  95.1 88 110.9 69 161.3 ...
##  $ bmi              : num  18 39.2 17.6 35.9 19.1 50.1 17.7 27 32.3 54.6 ...
##  $ smoking_status   : Factor w/ 4 levels "","formerly smoked",..: 1 3 1 2 1 1 2 3 4 3 ...
##  $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...

str gives the structure of the dataset. It indicates the data types of all 12 variables, in addition to a preview of the values contained in each variable.

# Print the first six rows to see what individual records look like.
head(STROKE_DS)

##      id gender age hypertension heart_disease ever_married    work_type
## 1 30669   Male   3            0             0           No     children
## 2 30468   Male  58            1             0          Yes      Private
## 3 16523 Female   8            0             0           No      Private
## 4 56543 Female  70            0             0          Yes      Private
## 5 46136   Male  14            0             0           No Never_worked
## 6 32257 Female  47            0             0          Yes      Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Rural             95.12 18.0                      0
## 2          Urban             87.96 39.2    never smoked      0
## 3          Urban            110.89 17.6                      0
## 4          Rural             69.04 35.9 formerly smoked      0
## 5          Rural            161.28 19.1                      0
## 6          Urban            210.95 50.1                      0

Checking for missing data in the dataset

# Plot the share of missing values for each column.
plot_missing(STROKE_DS)

# Total number of NA cells across the whole data frame.
sum(is.na(STROKE_DS))

## [1] 1462

# Count missing values per column. is.na() on a data frame already returns a
# logical matrix, so the sapply() wrapper is unnecessary (and it collapses to a
# vector for a one-row data frame, which would make colSums() fail).
colSums(is.na(STROKE_DS))

##                id            gender               age      hypertension 
##                 0                 0                 0                 0 
##     heart_disease      ever_married         work_type    Residence_type 
##                 0                 0                 0                 0 
## avg_glucose_level               bmi    smoking_status            stroke 
##                 0              1462                 0                 0

The total number of missing values is 1462. All of these missing values are in the bmi variable.

Visualizing structure of the dataset

# Draw the structure of the data frame (its columns and their types).
plot_str(STROKE_DS)

# Summary chart of basic dataset metrics, including the complete-row and
# missing-observation percentages discussed below.
plot_intro(STROKE_DS)

### 96.63% of all rows are complete (contain no missing values) ### 0.28% missing observations: although 3.37% of rows have a missing value, only 0.28% of all individual observations are missing

# Same missing-value profile as in the missing-data check earlier, repeated
# here next to the other structure plots.
plot_missing(STROKE_DS)

### BMI is the only variable with missing data

# Histograms of the numeric columns.
plot_histogram(STROKE_DS)

# NOTE(review): boxplot() receives every column here, including id and the
# factor columns, all on one shared axis -- confirm whether only the continuous
# measurements (age, avg_glucose_level, bmi) were intended.
boxplot(STROKE_DS)

# Correlation heatmap restricted to the continuous columns.
# NOTE(review): the geom_text warning printed after this call presumably comes
# from the NA values in bmi -- confirm before interpreting the bmi correlations.
plot_correlation(STROKE_DS, type = "continuous")

## Warning: Removed 12 rows containing missing values (geom_text).

# Density estimates of the numeric columns.
plot_density(STROKE_DS)

AML STROKE DATASET EDA

Alexander Karari

24/02/2020