This report is a visualization of the impact of different medical conditions on death events. The comments below show step by step how this report was created.

First, packages for data analysis were loaded.

The dataset was loaded in RStudio.

# Load the heart-failure clinical records data set. The path is relative to
# the working directory (the project folder that getwd() reports), so the
# report does not depend on one user's absolute Windows path.
HFCR_DS <- read_csv("heart_failure_clinical_records_dataset.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   age = col_double(),
##   anaemia = col_logical(),
##   creatinine_phosphokinase = col_double(),
##   diabetes = col_logical(),
##   ejection_fraction = col_double(),
##   high_blood_pressure = col_logical(),
##   platelets = col_double(),
##   serum_creatinine = col_double(),
##   serum_sodium = col_double(),
##   sex = col_logical(),
##   smoking = col_logical(),
##   time = col_double(),
##   DEATH_EVENT = col_logical()
## )

Checking out information

Directory path checked out

getwd()
## [1] "C:/Users/yamid/OneDrive/Documentos/MASTER HS OFFENBURG/FIRST SEMESTER/DATA ENGINEERING NIRO/HeartFailurePrediction/HFCS_DE_Yamit_Ibarra"

Columns were checked out

# View() opens the interactive data viewer, so it is only called when the
# session is interactive; a non-interactive render skips it.
if (interactive()) {
  View(HFCR_DS)
}

#1. How many observations and variables has the data set?

#2. What is the type and scale of each variable?

Some variable units were consulted on internet because they were not available on the dataset. Dataset variables are shown below.

str(HFCR_DS)
## tibble [299 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age                     : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
##  $ anaemia                 : logi [1:299] FALSE FALSE FALSE TRUE TRUE TRUE ...
##  $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
##  $ diabetes                : logi [1:299] FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ ejection_fraction       : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
##  $ high_blood_pressure     : logi [1:299] TRUE FALSE FALSE FALSE FALSE TRUE ...
##  $ platelets               : num [1:299] 265000 263358 162000 210000 327000 ...
##  $ serum_creatinine        : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ serum_sodium            : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
##  $ sex                     : logi [1:299] TRUE TRUE TRUE TRUE FALSE TRUE ...
##  $ smoking                 : logi [1:299] FALSE FALSE TRUE FALSE FALSE TRUE ...
##  $ time                    : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT             : logi [1:299] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   anaemia = col_logical(),
##   ..   creatinine_phosphokinase = col_double(),
##   ..   diabetes = col_logical(),
##   ..   ejection_fraction = col_double(),
##   ..   high_blood_pressure = col_logical(),
##   ..   platelets = col_double(),
##   ..   serum_creatinine = col_double(),
##   ..   serum_sodium = col_double(),
##   ..   sex = col_logical(),
##   ..   smoking = col_logical(),
##   ..   time = col_double(),
##   ..   DEATH_EVENT = col_logical()
##   .. )

#3. Determine the range of the variables #4. Calculate the average, standard deviation, variance, median and the quantiles of the variables. Which of the variables are left-skewed, which right-skewed?

Data Analysis

Summary Statistics are shown below:

Statistics of the age of patients.

summary(HFCR_DS$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   40.00   51.00   60.00   60.83   70.00   95.00

Statistics of creatinine phosphokinase.

summary(HFCR_DS$creatinine_phosphokinase)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    23.0   116.5   250.0   581.8   582.0  7861.0

Statistics of platelets

summary(HFCR_DS$platelets)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25100  212500  262000  263358  303500  850000

Statistics of serum creatinine

summary(HFCR_DS$serum_creatinine)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.500   0.900   1.100   1.394   1.400   9.400

Statistics of serum sodium

summary(HFCR_DS$serum_sodium)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   113.0   134.0   137.0   136.6   140.0   148.0

Statistics of anaemia FALSE = No anaemia TRUE = Anaemic

summary(HFCR_DS$anaemia)
##    Mode   FALSE    TRUE 
## logical     170     129

Statistics of diabetes FALSE = Not diabetic TRUE = Diabetic

summary(HFCR_DS$diabetes)
##    Mode   FALSE    TRUE 
## logical     174     125

Statistics of blood pressure FALSE = No high blood pressure TRUE = High blood pressure

summary(HFCR_DS$high_blood_pressure)
##    Mode   FALSE    TRUE 
## logical     194     105

Statistics of smoking habits FALSE = Non-smoker TRUE = Smoker

summary(HFCR_DS$smoking)
##    Mode   FALSE    TRUE 
## logical     203      96

Statistics of Death Events FALSE = No death event TRUE = Death event occurred

summary(HFCR_DS$DEATH_EVENT)
##    Mode   FALSE    TRUE 
## logical     203      96

Standard deviation, average, median, variance and the quantiles of the age of patients are shown below.

Mean

# Arithmetic mean of patient age, stored for the skewness check and printed.
age_mean <- mean(HFCR_DS[["age"]])
print(age_mean)
## [1] 60.83389

Standard Deviation

# Sample standard deviation of patient age.
age_sd <- sd(HFCR_DS[["age"]])
print(age_sd)
## [1] 11.89481

Variance

# Sample variance of patient age.
age_var <- var(HFCR_DS[["age"]])
print(age_var)
## [1] 141.4865

Quantile 25

# Default quantiles of age (0%, 25%, 50%, 75%, 100%); the first quartile is
# picked by its name rather than by position.
age_quantiles <- quantile(HFCR_DS[["age"]])
age_q1 <- age_quantiles["25%"]
print(age_q1)
## 25% 
##  51

Quantile 50

# The 50% quantile is the median of age.
age_median <- age_quantiles["50%"]
print(age_median)
## 50% 
##  60

Quantile 75

# Third quartile (75% quantile) of age.
age_q3 <- age_quantiles["75%"]
print(age_q3)
## 75% 
##  70

Age of patients was analyzed for skewness by comparing its mean with its median. The result is shown below:

# Skew direction from the mean/median rule of thumb: a mean above the median
# is pulled up by a long right tail (right skew), a mean below the median
# indicates a left skew, and equal values give no direction.
age_skewing <- if (age_mean > age_median) {
  "right"
} else if (age_mean < age_median) {
  "left"
} else {
  "none"
}
age_skewing
## [1] "right"

Creatine Phosphokinase skew

 # Skew direction of creatinine phosphokinase from the mean/median rule of
 # thumb: mean above the median means right skew, mean below the median means
 # left skew, equal values give no direction. A plain if/else is used because
 # the comparison is a single value.
 creatinine_phosphokinase_mean <- mean(HFCR_DS$creatinine_phosphokinase)
 creatinine_phosphokinase_median <- median(HFCR_DS$creatinine_phosphokinase)
 creatinine_phosphokinase_skewing <-
   if (creatinine_phosphokinase_mean > creatinine_phosphokinase_median) {
     "right"
   } else if (creatinine_phosphokinase_mean < creatinine_phosphokinase_median) {
     "left"
   } else {
     "none"
   }
 creatinine_phosphokinase_skewing
## [1] "right"

Platelets skew

 # Skew direction of platelets from the mean/median rule of thumb: mean above
 # the median means right skew, mean below the median means left skew, equal
 # values give no direction. A plain if/else is used because the comparison
 # is a single value.
 platelets_mean <- mean(HFCR_DS$platelets)
 platelets_median <- median(HFCR_DS$platelets)
 platelets_skewing <- if (platelets_mean > platelets_median) {
   "right"
 } else if (platelets_mean < platelets_median) {
   "left"
 } else {
   "none"
 }
 platelets_skewing
## [1] "right"

Histograms were created to analyze the distribution and frequency of the dataset variables and their relation with death events.

A histogram of the Creatinine Phosphokinase level is shown in conjunction with its relation to death events.

  # Histogram of creatinine phosphokinase for all patients, with the patients
  # who had a death event drawn on top. The overlay reuses the bin edges
  # returned by the first hist() call and is added to the same plot, so both
  # layers share identical bins and one set of axes. The data set itself is
  # left unmodified.
  cpk_hist <- hist(HFCR_DS$creatinine_phosphokinase, breaks = 20,
                   xlab = "Creatine Phosphokinase (mcg/L)",
                   xlim = c(0, 10000), ylim = c(0, 200),
                   main = "Distribution & relation of C. Phosphokinase with death events")
  # Rows where DEATH_EVENT is TRUE.
  true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
  hist(true_DEATH_EVENT$creatinine_phosphokinase, breaks = cpk_hist$breaks,
       freq = TRUE, col = "#66FF99", add = TRUE)

Histogram of ejection fraction. Values of ejection fraction are right skewed. Compared to the distribution of the variable, it’s possible to infer that patients with a low ejection fraction have a higher possibility to have a death event.

 # Histogram of ejection fraction for all patients, with the patients who had
 # a death event drawn on top. The overlay reuses the bin edges returned by
 # the first hist() call and is added to the same plot, so both layers share
 # identical bins and one set of axes.
 ef_hist <- hist(HFCR_DS$ejection_fraction, breaks = 10,
                 xlab = "Ejection fraction (%)",
                 xlim = c(0, 100), ylim = c(0, 100),
                 main = "Distribution & relation of ejection fraction with death events")
 # Integer copy of the column, kept as its own column of the data set.
 HFCR_DS$ejection_integer <- as.integer(HFCR_DS$ejection_fraction)
 # Rows where DEATH_EVENT is TRUE.
 true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
 hist(true_DEATH_EVENT$ejection_integer, breaks = ef_hist$breaks,
      freq = TRUE, col = "#990066", add = TRUE)

Histogram of platelets.

# Histogram of platelets for all patients, with the patients who had a death
# event drawn on top. The overlay reuses the bin edges returned by the first
# hist() call and is added to the same plot, so both layers share identical
# bins and one set of axes. The platelets column is not converted to integer,
# so fractional values are not truncated for later analyses.
platelets_hist <- hist(HFCR_DS$platelets, breaks = 20,
                       xlab = "Platelets (kiloplatelets/mL)",
                       xlim = c(0, 1000000), ylim = c(0, 100),
                       main = "Distribution & relation of platelets with death events")
# Rows where DEATH_EVENT is TRUE.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
hist(true_DEATH_EVENT$platelets, breaks = platelets_hist$breaks,
     freq = TRUE, col = "#FF6699", add = TRUE)

Histogram of serum creatinine. Values higher than the mean of serum creatinine are related to a higher probability of death events.

 # Histogram of serum creatinine for all patients, with the patients who had
 # a death event drawn on top. The overlay reuses the bin edges returned by
 # the first hist() call and is added to the same plot, so both layers share
 # identical bins and one set of axes.
 creatinine_hist <- hist(HFCR_DS$serum_creatinine, breaks = 20,
                         xlab = "Serum creatinine (mg/dL)",
                         xlim = c(0, 10), ylim = c(0, 200),
                         main = "Distribution & Relation of serum creatinine with death event ")
 # Rows where DEATH_EVENT is TRUE.
 true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
 hist(true_DEATH_EVENT$serum_creatinine, breaks = creatinine_hist$breaks,
      freq = TRUE, col = "#CCFF33", add = TRUE)

Histogram of serum sodium. Values of serum sodium are left skewed.

# Histogram of serum sodium for all patients, with the patients who had a
# death event drawn on top. The overlay reuses the bin edges returned by the
# first hist() call and is added to the same plot, so both layers share
# identical bins and one set of axes. The data set itself is left unmodified.
sodium_hist <- hist(HFCR_DS$serum_sodium, breaks = 20,
                    xlab = "Serum sodium (mEq/L)",
                    xlim = c(110, 150), ylim = c(0, 100),
                    main = "Frequency of serum sodium & Relation with death event")
# Rows where DEATH_EVENT is TRUE.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
hist(true_DEATH_EVENT$serum_sodium, breaks = sodium_hist$breaks,
     freq = TRUE, col = "#FFFF33", add = TRUE)

Histogram of Age of Patients. Age of patients is right skewed.

 # Histogram of age for all patients, with the patients who had a death event
 # drawn on top. The overlay reuses the bin edges returned by the first hist()
 # call and is added to the same plot, so both layers share identical bins and
 # the same y-scale.
 age_hist <- hist(HFCR_DS$age, xlab = "Age (years)", xlim = c(40, 100),
                  main = "Distribution & relation of patients age with death events")
 # Integer copy of age, kept as its own column; the per-condition age
 # histograms read age_integer from the data set.
 HFCR_DS$age_integer <- as.integer(HFCR_DS$age)
 # Rows where DEATH_EVENT is TRUE.
 true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
 # The overlay plots the same age values as the base layer so that a patient
 # falls into the same bin in both layers.
 hist(true_DEATH_EVENT$age, breaks = age_hist$breaks,
      freq = TRUE, col = "#336699", add = TRUE)

On the histograms below, the frequency of age and medical conditions is shown.

Age & Anaemia

# Age histogram restricted to the rows where anaemia is TRUE.
true_anaemia <- HFCR_DS[HFCR_DS$anaemia, ]
hist(true_anaemia$age_integer, breaks = 10, freq = TRUE, col = "#336699",
     main = "Distribution of patients with anaemia",
     xlab = "Age range (years)", xlim = c(40, 100), ylim = c(0, 50))

Age & Diabetes

# Age histogram restricted to the rows where diabetes is TRUE.
true_diabetes <- HFCR_DS[HFCR_DS$diabetes, ]
hist(true_diabetes$age_integer, breaks = 10, freq = TRUE, col = "#336699",
     main = "Distribution of patients with diabetes",
     xlab = "Age range (years)", xlim = c(40, 100), ylim = c(0, 50))

Age & High Blood Pressure

 # Age histogram restricted to the rows where high_blood_pressure is TRUE.
 true_high_bp <- HFCR_DS[HFCR_DS$high_blood_pressure, ]
 hist(true_high_bp$age_integer, breaks = 10, freq = TRUE, col = "#336699",
      main = "Distribution of patients with high blood pressure",
      xlab = "Age range (years)", xlim = c(40, 100), ylim = c(0, 30))

Age & Smoking

 # Age histogram restricted to the rows where smoking is TRUE.
 true_smoking <- HFCR_DS[HFCR_DS$smoking, ]
 hist(true_smoking$age_integer, breaks = 10, freq = TRUE, col = "#336699",
      main = "Distribution of smokers",
      xlab = "Age range (years)", xlim = c(40, 100), ylim = c(0, 30))

Relation of medical conditions with death events. In the code below, some calculations are carried out in order to plot the impact of some medical conditions (logical variables) on the total number of death events.

Number of patients with death events

  # Total number of death events: TRUE counts as 1 when summing a logical.
  Tde <- sum(HFCR_DS[["DEATH_EVENT"]], na.rm = TRUE)
  print(Tde)
## [1] 96

Number of death events with anaemia as a medical condition

 # Rows where both DEATH_EVENT and anaemia are TRUE.
 TdeTan <- with(HFCR_DS, sum(DEATH_EVENT & anaemia, na.rm = TRUE))
 print(TdeTan)
## [1] 46

Number of death events with diabetes as a medical condition

  # Rows where both DEATH_EVENT and diabetes are TRUE.
  TdeTdi <- with(HFCR_DS, sum(DEATH_EVENT & diabetes, na.rm = TRUE))
  print(TdeTdi)
## [1] 40

Number of death events with high blood pressure as a medical condition

 # Rows where both DEATH_EVENT and high_blood_pressure are TRUE.
 TdeTHp <- with(HFCR_DS, sum(DEATH_EVENT & high_blood_pressure, na.rm = TRUE))
 print(TdeTHp)
## [1] 39

Number of death events related with smokers

  # Rows where both DEATH_EVENT and smoking are TRUE.
  TdeTsm <- with(HFCR_DS, sum(DEATH_EVENT & smoking, na.rm = TRUE))
  print(TdeTsm)
## [1] 30

Plot of impact of medical conditions on total of death events.

 # Grouped bar chart comparing, for each medical condition, the death events
 # among patients with that condition against the total number of death
 # events. library() is used so that a missing ggplot2 stops with an error at
 # this point.
 library(ggplot2)
 conditions <- c("Anaemia", "Diabetes", "High B. Pressure", "Smokers")
 # Long format, one row per (series, condition) pair: the per-condition counts
 # come first, followed by the total repeated once per condition.
 data <- data.frame(
   type = rep(c("Death_Events_per_Medical_conditions", "Total_Death_events"),
              each = length(conditions)),
   Death_Events_Vs_Death_Events_related_to_M.Condition =
     c(TdeTan, TdeTdi, TdeTHp, TdeTsm, rep(Tde, length(conditions))),
   Medical_Condition = rep(conditions, times = 2)
 )

 ggplot(data, aes(y = Death_Events_Vs_Death_Events_related_to_M.Condition,
                  x = Medical_Condition, fill = type)) +
   geom_bar(width = 1, stat = "identity", position = "dodge2") +
   scale_fill_manual("Legend",
                     values = c("Death_Events_per_Medical_conditions" = "#666666",
                                "Total_Death_events" = "#CCCCCC"))

Boxplot of each numerical variable showing outliers.

Age has no outliers, as shown below.

boxplot(HFCR_DS$age, ylab = "Age (years)", ylim = c(0,100))

Creatinine phosphokinase has outliers

 boxplot(HFCR_DS$creatinine_phosphokinase, ylab = "Creatine Phosphokinase (mcg/L)")

Ejection fraction has outliers

 boxplot(HFCR_DS$ejection_fraction, ylab = "Ejection fraction (%)")

Platelets have outliers

 boxplot(HFCR_DS$platelets, ylab = "Platelets (kiloplatelets/mL)")

Serum creatinine has outliers

 # Boxplot of serum creatinine; the axis label has balanced parentheses.
 boxplot(HFCR_DS$serum_creatinine, ylab = "Serum creatinine (mg/dL)")

Serum Sodium has outliers

 # Boxplot of serum sodium; the unit matches the mEq/L label used on the serum
 # sodium histogram.
 boxplot(HFCR_DS$serum_sodium, ylab = "Serum sodium (mEq/L)")

Visualization of correlations with scatter plots (base R plot()).

  plot(HFCR_DS$age,HFCR_DS$creatinine_phosphokinase, xlab ="Age (years)", ylab= "Creatinine Phosphokinase (mcg/L)")

Serum Creatinine & Creatinine Phosphokinase have a strong correlation since they are proportional.

plot(HFCR_DS$creatinine_phosphokinase,HFCR_DS$serum_creatinine, xlab ="Creatinine Phosphokinase (mcg/L)", ylab= "Serum Creatinine (mg/dL)")

Most of the average values of creatinine phosphokinase are proportionally related to the average values of serum sodium

# Scatter plot of serum sodium against creatinine phosphokinase; the y-axis
# unit matches the mEq/L label used on the serum sodium histogram.
plot(HFCR_DS$creatinine_phosphokinase, HFCR_DS$serum_sodium,
     xlab = "Creatinine Phosphokinase (mcg/L)", ylab = "Serum sodium (mEq/L)")

Serum creatinine values are weakly related to ejection fraction

 plot(HFCR_DS$ejection_fraction,HFCR_DS$serum_creatinine, xlab ="Ejection Fraction (%)", ylab= "Serum Creatinine (mg/dL) ")

Calculation of the correlation and covariance between the numerical variables.

Correlation of age with ejection fraction, and of ejection fraction with creatinine phosphokinase, respectively

 # Correlation between age and ejection fraction.
 age_cor <- with(HFCR_DS, cor(age, ejection_fraction))
 print(age_cor)
## [1] 0.06009836
 # Correlation between ejection fraction and creatinine phosphokinase.
 ejectionf_cor <- with(HFCR_DS, cor(ejection_fraction, creatinine_phosphokinase))
 print(ejectionf_cor)
## [1] -0.04407955

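var() called with two vectors: age with ejection fraction, and ejection fraction with creatinine phosphokinase, respectively. With two arguments var() returns the covariance, so these values are the same as in the covariance section below.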

 # NOTE(review): var(x, y) with two vectors returns the covariance of x and y,
 # not a variance, so both results here equal the cov() results computed in the
 # next section. This assignment also replaces the age_var value that was set
 # earlier from var(HFCR_DS$age). Confirm whether single-variable variances
 # were intended.
 age_var <- var(HFCR_DS$age,HFCR_DS$ejection_fraction)
  age_var
## [1] 8.460237
  # Same two-argument form for ejection fraction and creatinine phosphokinase.
  ejectionf_var <- var(HFCR_DS$ejection_fraction,HFCR_DS$creatinine_phosphokinase)
  ejectionf_var
## [1] -506.1745

Covariance of age with ejection fraction, and of ejection fraction with creatinine phosphokinase, respectively

  # Covariance between age and ejection fraction.
  age_cov <- with(HFCR_DS, cov(age, ejection_fraction))
  print(age_cov)
## [1] 8.460237
  # Covariance between ejection fraction and creatinine phosphokinase.
  ejectionf_cov <- with(HFCR_DS, cov(ejection_fraction, creatinine_phosphokinase))
  print(ejectionf_cov)
## [1] -506.1745