library(DataExplorer)
library(caTools)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
### Load the stroke training data from the working directory.
### stringsAsFactors is set explicitly so that text columns are read as factors
### on every R version (the read.csv default became FALSE in R 4.0.0); the
### str() output recorded further down shows these columns as factors.
STROKE_DS <- read.csv("train_2v.csv", stringsAsFactors = TRUE)
### Open the data frame in the data viewer for a quick visual check.
view(STROKE_DS)
### Column names of the loaded data frame.
colnames(STROKE_DS)
## [1] "id" "gender" "age"
## [4] "hypertension" "heart_disease" "ever_married"
## [7] "work_type" "Residence_type" "avg_glucose_level"
## [10] "bmi" "smoking_status" "stroke"
### Number of rows and columns.
dim(STROKE_DS)
## [1] 43400 12
### Type of every column together with its first few values.
### NOTE(review): smoking_status lists an empty string "" among its levels, so
### blank entries in that column are stored as a level rather than as NA.
str(STROKE_DS)
## 'data.frame': 43400 obs. of 12 variables:
## $ id : int 30669 30468 16523 56543 46136 32257 52800 41413 15266 28674 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 1 1 2 1 1 1 1 1 ...
## $ age : num 3 58 8 70 14 47 52 75 32 74 ...
## $ hypertension : int 0 1 0 0 0 0 0 0 0 1 ...
## $ heart_disease : int 0 0 0 0 0 0 0 1 0 0 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 2 2 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 1 4 4 4 3 4 4 5 4 5 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 1 2 2 1 1 2 2 1 1 2 ...
## $ avg_glucose_level: num 95.1 88 110.9 69 161.3 ...
## $ bmi : num 18 39.2 17.6 35.9 19.1 50.1 17.7 27 32.3 54.6 ...
## $ smoking_status : Factor w/ 4 levels "","formerly smoked",..: 1 3 1 2 1 1 2 3 4 3 ...
## $ stroke : int 0 0 0 0 0 0 0 0 0 0 ...
### First six rows of the data.
head(STROKE_DS, n = 6L)
## id gender age hypertension heart_disease ever_married work_type
## 1 30669 Male 3 0 0 No children
## 2 30468 Male 58 1 0 Yes Private
## 3 16523 Female 8 0 0 No Private
## 4 56543 Female 70 0 0 Yes Private
## 5 46136 Male 14 0 0 No Never_worked
## 6 32257 Female 47 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Rural 95.12 18.0 0
## 2 Urban 87.96 39.2 never smoked 0
## 3 Urban 110.89 17.6 0
## 4 Rural 69.04 35.9 formerly smoked 0
## 5 Rural 161.28 19.1 0
## 6 Urban 210.95 50.1 0
### Missing-value profile: chart of missing values for each column.
plot_missing(STROKE_DS)
### Total number of NA cells in the whole data frame.
sum(is.na(STROKE_DS))
## [1] 1462
### NA count per column. is.na() applied to a data frame already returns a
### logical matrix with one column per variable, so colSums() can be used on it
### directly without an sapply() wrapper.
colSums(is.na(STROKE_DS))
## id gender age hypertension
## 0 0 0 0
## heart_disease ever_married work_type Residence_type
## 0 0 0 0
## avg_glucose_level bmi smoking_status stroke
## 0 1462 0 0
### bmi is the only column that contains NA values (1462 of them).
### NOTE(review): smoking_status has an empty-string level "" in the recorded
### str() output; those blanks are not counted by is.na() - confirm whether
### they should also be treated as missing.
### Diagram of the data frame structure.
plot_str(STROKE_DS)
### Summary chart of basic data-set metrics.
plot_intro(STROKE_DS)
### 96.63% of rows are complete (they contain no missing value), and only 0.28% of all observations are missing.
### Histograms of the continuous columns.
plot_histogram(STROKE_DS)
### Box plots of the continuous measurements only. Passing the whole data frame
### would also draw the id column, the 0/1 indicator columns and the integer
### codes of the factor columns on one shared axis, and would fail outright if
### any column were of type character.
boxplot(STROKE_DS[c("age", "avg_glucose_level", "bmi")])
### Correlation heatmap of the continuous columns. Pairwise-complete
### observations are used so that the NA values in bmi do not turn every
### correlation involving bmi into NA (which previously produced the warning
### "Removed 12 rows containing missing values (geom_text)").
plot_correlation(
  STROKE_DS,
  type = "continuous",
  cor_args = list(use = "pairwise.complete.obs")
)
### Density estimates of the continuous columns.
plot_density(STROKE_DS)