This report is a visualization of the impact of different medical conditions on death events. The comments below show, step by step, how this report was created.
First, packages for data analysis were loaded.
The dataset was loaded in RStudio.
# Read the heart-failure records from the project working directory.
# A relative path keeps the report reproducible on machines other than the
# author's; the working directory must be the folder that holds the CSV.
HFCR_DS <- read_csv("heart_failure_clinical_records_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## age = col_double(),
## anaemia = col_logical(),
## creatinine_phosphokinase = col_double(),
## diabetes = col_logical(),
## ejection_fraction = col_double(),
## high_blood_pressure = col_logical(),
## platelets = col_double(),
## serum_creatinine = col_double(),
## serum_sodium = col_double(),
## sex = col_logical(),
## smoking = col_logical(),
## time = col_double(),
## DEATH_EVENT = col_logical()
## )
Checking out information
Directory path checked out
# Show the directory that relative file paths are resolved against.
print(getwd())
## [1] "C:/Users/yamid/OneDrive/Documentos/MASTER HS OFFENBURG/FIRST SEMESTER/DATA ENGINEERING NIRO/HeartFailurePrediction/HFCS_DE_Yamit_Ibarra"
Columns were checked out
# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session; skip it when the report is rendered non-interactively.
if (interactive()) {
  View(HFCR_DS)
}
#1. How many observations and variables has the data set?
#2. What is the type and scale of each variable?
Some variable units were looked up on the internet because they were not available in the dataset. The dataset variables are shown below.
# Compact structure listing: dimensions plus the type of every column.
utils::str(HFCR_DS)
## tibble [299 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
## $ anaemia : logi [1:299] FALSE FALSE FALSE TRUE TRUE TRUE ...
## $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
## $ diabetes : logi [1:299] FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ ejection_fraction : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
## $ high_blood_pressure : logi [1:299] TRUE FALSE FALSE FALSE FALSE TRUE ...
## $ platelets : num [1:299] 265000 263358 162000 210000 327000 ...
## $ serum_creatinine : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ serum_sodium : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
## $ sex : logi [1:299] TRUE TRUE TRUE TRUE FALSE TRUE ...
## $ smoking : logi [1:299] FALSE FALSE TRUE FALSE FALSE TRUE ...
## $ time : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : logi [1:299] TRUE TRUE TRUE TRUE TRUE TRUE ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. anaemia = col_logical(),
## .. creatinine_phosphokinase = col_double(),
## .. diabetes = col_logical(),
## .. ejection_fraction = col_double(),
## .. high_blood_pressure = col_logical(),
## .. platelets = col_double(),
## .. serum_creatinine = col_double(),
## .. serum_sodium = col_double(),
## .. sex = col_logical(),
## .. smoking = col_logical(),
## .. time = col_double(),
## .. DEATH_EVENT = col_logical()
## .. )
#3. Determine the range of the variables #4. Calculate the average, standard deviation, variance, median and the quantiles of the variables. Which of the variables are left-skewed, which right-skewed?
Data Analysis
Summary Statistics are shown below:
Statistics of the age of patients.
# Five-number summary plus mean for patient age.
summary(HFCR_DS[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.00 51.00 60.00 60.83 70.00 95.00
Statistics of creatinine phosphokinase.
# Five-number summary plus mean for creatinine phosphokinase.
summary(HFCR_DS[["creatinine_phosphokinase"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.0 116.5 250.0 581.8 582.0 7861.0
Statistics of platelets
# Five-number summary plus mean for platelet counts.
summary(HFCR_DS[["platelets"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25100 212500 262000 263358 303500 850000
Statistics of serum creatinine
# Five-number summary plus mean for serum creatinine.
summary(HFCR_DS[["serum_creatinine"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.500 0.900 1.100 1.394 1.400 9.400
Statistics of serum sodium
# Five-number summary plus mean for serum sodium.
summary(HFCR_DS[["serum_sodium"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 113.0 134.0 137.0 136.6 140.0 148.0
Statistics of anaemia FALSE = No anaemia TRUE = Anaemic
# FALSE/TRUE tally of the anaemia flag.
summary(HFCR_DS[["anaemia"]])
## Mode FALSE TRUE
## logical 170 129
Statistics of diabetes FALSE = No diabetic TRUE = Diabetic
# FALSE/TRUE tally of the diabetes flag.
summary(HFCR_DS[["diabetes"]])
## Mode FALSE TRUE
## logical 174 125
Statistics of blood pressure FALSE = No high blood pressure TRUE = High blood pressure
# FALSE/TRUE tally of the high-blood-pressure flag.
summary(HFCR_DS[["high_blood_pressure"]])
## Mode FALSE TRUE
## logical 194 105
Statistics of smoking habits FALSE = No smoking TRUE = Smoke
# FALSE/TRUE tally of the smoking flag.
summary(HFCR_DS[["smoking"]])
## Mode FALSE TRUE
## logical 203 96
Statistics of death events: FALSE = no death event, TRUE = a death event occurred
# FALSE/TRUE tally of the death-event flag.
summary(HFCR_DS[["DEATH_EVENT"]])
## Mode FALSE TRUE
## logical 203 96
Standard deviation, average, median, variance and the quantiles of the age of patients are shown below.
Mean
# Arithmetic mean of patient age, stored for the skew check further down.
age_mean <- mean(HFCR_DS[["age"]])
print(age_mean)
## [1] 60.83389
Standard Deviation
# Sample standard deviation of patient age.
age_sd <- sd(HFCR_DS[["age"]])
print(age_sd)
## [1] 11.89481
Variance
# Sample variance of patient age.
age_var <- var(HFCR_DS[["age"]])
print(age_var)
## [1] 141.4865
Quantile 25
# quantile() with default probabilities returns a vector named by percentage;
# pick the first quartile by its name instead of its position.
age_quantiles <- quantile(HFCR_DS[["age"]])
age_q1 <- age_quantiles["25%"]
print(age_q1)
## 25%
## 51
Quantile 50
# The median is the 50% entry of the quantile vector computed above.
age_median <- age_quantiles["50%"]
print(age_median)
## 50%
## 60
Quantile 75
# Third quartile, selected by name from the quantile vector.
age_q3 <- age_quantiles["75%"]
print(age_q3)
## 75%
## 70
Age of patients was analyzed to see skewing. Because the mean is greater than the median, age is skewed to the right, as shown below:
# A mean above the median points to a longer upper tail, i.e. right
# (positive) skew; the opposite ordering points to left skew.
if (age_mean > age_median) {
  age_skewing <- "right"
} else {
  age_skewing <- "left"
}
age_skewing
## [1] "right"
Creatine Phosphokinase skew
# Compare mean and median of creatinine phosphokinase to label the skew.
creatinine_phosphokinase_mean <- mean(HFCR_DS$creatinine_phosphokinase)
creatinine_phosphokinase_median <- median(HFCR_DS$creatinine_phosphokinase)
# Scalar comparison, so a plain if/else is used instead of ifelse() with
# nested assignments. Mean above median means a longer right tail.
creatinine_phosphokinase_skewing <- if (creatinine_phosphokinase_mean > creatinine_phosphokinase_median) {
  "right"
} else {
  "left"
}
creatinine_phosphokinase_skewing
## [1] "right"
Platelets skew
# Compare mean and median of the platelet counts to label the skew.
platelets_mean <- mean(HFCR_DS$platelets)
platelets_median <- median(HFCR_DS$platelets)
# Scalar comparison, so a plain if/else is used instead of ifelse() with
# nested assignments. Mean above median means a longer right tail.
platelets_skewing <- if (platelets_mean > platelets_median) {
  "right"
} else {
  "left"
}
platelets_skewing
## [1] "right"
Histograms were created to analyze the distribution and frequency of the dataset variables and their relation to death events.
A histogram of creatinine phosphokinase levels is shown together with the subset of patients who had a death event.
# Histogram of all patients; the returned object is kept so the overlay can
# reuse exactly the same bin edges (breaks = 20 is only a suggestion and may
# resolve to different edges for a subset).
cpk_hist <- hist(
  HFCR_DS$creatinine_phosphokinase,
  breaks = 20,
  xlab = "Creatine Phosphokinase (mcg/L)",
  xlim = c(0, 10000),
  ylim = c(0, 200),
  main = "Distribution & relation of C. Phosphokinase with death events"
)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# add = TRUE draws on the existing axes, so both layers share one scale.
# The source column is no longer overwritten with an integer copy.
hist(
  true_DEATH_EVENT$creatinine_phosphokinase,
  breaks = cpk_hist$breaks,
  freq = TRUE,
  col = "#66FF99",
  add = TRUE
)
Histogram of ejection fraction. Values of ejection fraction are right skewed. Compared to the distribution of the variable, it’s possible to infer that patients with a low ejection fraction have a higher possibility to have a death event.
# Histogram of all patients; keep the object so the overlay reuses its bins.
ejection_hist <- hist(
  HFCR_DS$ejection_fraction,
  breaks = 10,
  xlab = "Ejection fraction (%)",
  xlim = c(0, 100),
  ylim = c(0, 100),
  main = "Distribution & relation of ejection fraction with death events"
)
# Integer copy of the column, still created here as in the original report.
HFCR_DS$ejection_integer <- as.integer(HFCR_DS$ejection_fraction)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# Overlay on the same axes and bin edges so bar heights are comparable.
hist(
  true_DEATH_EVENT$ejection_fraction,
  breaks = ejection_hist$breaks,
  freq = TRUE,
  col = "#990066",
  add = TRUE
)
Histogram of platelets.
# Histogram of all patients; keep the object so the overlay reuses its bins.
platelets_hist <- hist(
  HFCR_DS$platelets,
  breaks = 20,
  xlab = "Platelets (kiloplatelets/mL)",
  xlim = c(0, 1000000),
  ylim = c(0, 100),
  main = "Distribution & relation of platelets with death events"
)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# Overlay on the same axes and bin edges. The platelets column is left as
# read: converting it to integer would truncate any fractional counts.
hist(
  true_DEATH_EVENT$platelets,
  breaks = platelets_hist$breaks,
  freq = TRUE,
  col = "#FF6699",
  add = TRUE
)
Histogram of serum creatinine. Values above the mean of serum creatinine are related to a higher probability of death events.
# Histogram of all patients; keep the object so the overlay reuses its bins.
creatinine_hist <- hist(
  HFCR_DS$serum_creatinine,
  breaks = 20,
  xlab = "Serum creatinine (mg/dL)",
  xlim = c(0, 10),
  ylim = c(0, 200),
  main = "Distribution & Relation of serum creatinine with death event "
)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# Overlay on the same axes and bin edges so bar heights are comparable.
# The numeric -> character -> numeric round trip was dropped as redundant.
hist(
  true_DEATH_EVENT$serum_creatinine,
  breaks = creatinine_hist$breaks,
  freq = TRUE,
  col = "#CCFF33",
  add = TRUE
)
Histogram of serum sodium. Values of serum sodium are left skewed.
# Histogram of all patients; keep the object so the overlay reuses its bins.
sodium_hist <- hist(
  HFCR_DS$serum_sodium,
  breaks = 20,
  xlab = "Serum sodium (mEq/L)",
  xlim = c(110, 150),
  ylim = c(0, 100),
  main = "Frecuency of serum sodium & Relation with death event"
)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# Overlay on the same axes and bin edges so bar heights are comparable.
# The source column is no longer overwritten with an integer copy.
hist(
  true_DEATH_EVENT$serum_sodium,
  breaks = sodium_hist$breaks,
  freq = TRUE,
  col = "#FFFF33",
  add = TRUE
)
Histogram of Age of Patients. Age of patients is right skewed.
# Histogram of all patients; keep the object so the overlay reuses its bins.
age_hist <- hist(
  HFCR_DS$age,
  xlab = "Age (years)",
  xlim = c(40, 100),
  main = "Distribution & relation of patients age with death events"
)
# Integer copy of age; the age-by-condition histograms further down read it.
HFCR_DS$age_integer <- as.integer(HFCR_DS$age)
# Rows where a death event was recorded.
true_DEATH_EVENT <- HFCR_DS[HFCR_DS$DEATH_EVENT, ]
# Overlay on the existing axes: previously the second layer was drawn with
# its own ylim and its own breaks, so the two layers were on different scales.
hist(
  true_DEATH_EVENT$age,
  breaks = age_hist$breaks,
  freq = TRUE,
  col = "#336699",
  add = TRUE
)
The histograms below show the age distribution of patients with each medical condition.
Age & Anaemia
# Keep only the rows flagged as anaemic, then plot their age distribution.
true_anaemia <- HFCR_DS[HFCR_DS[["anaemia"]], ]
hist(
  true_anaemia$age_integer,
  breaks = 10,
  freq = TRUE,
  col = "#336699",
  main = "Distribution of patiens with anaemia",
  xlab = "Age range (years)",
  xlim = c(40, 100),
  ylim = c(0, 50)
)
Age & Diabetes
# Keep only the rows flagged as diabetic, then plot their age distribution.
true_diabetes <- HFCR_DS[HFCR_DS[["diabetes"]], ]
hist(
  true_diabetes$age_integer,
  breaks = 10,
  freq = TRUE,
  col = "#336699",
  main = "Distribution of patiens with diabetes",
  xlab = "Age range (years)",
  xlim = c(40, 100),
  ylim = c(0, 50)
)
Age & High Blood Pressure
# Keep only the rows flagged with high blood pressure, then plot their ages.
true_high_bp <- HFCR_DS[HFCR_DS[["high_blood_pressure"]], ]
hist(
  true_high_bp$age_integer,
  breaks = 10,
  freq = TRUE,
  col = "#336699",
  main = "Distribution of patiens with high blood pressure",
  xlab = "Age range (years)",
  xlim = c(40, 100),
  ylim = c(0, 30)
)
Age & Smoking
# Keep only the rows flagged as smokers, then plot their age distribution.
true_smoking <- HFCR_DS[HFCR_DS[["smoking"]], ]
hist(
  true_smoking$age_integer,
  breaks = 10,
  freq = TRUE,
  col = "#336699",
  main = "Distribution of smokers",
  xlab = "Age range (years)",
  xlim = c(40, 100),
  ylim = c(0, 30)
)
Relation of medical conditions with death events. On the codes below, some calculations are carried out in order to plot the impact of some medical conditions (logical variables) on the total of death events.
Number of patients with death events
# Summing a logical column counts its TRUE entries; missing values are ignored.
Tde <- sum(HFCR_DS[["DEATH_EVENT"]], na.rm = TRUE)
print(Tde)
## [1] 96
Number of death events with anaemia as a medical condition
# Count rows where both the death-event and anaemia flags are TRUE.
TdeTan <- with(HFCR_DS, sum(DEATH_EVENT & anaemia, na.rm = TRUE))
print(TdeTan)
## [1] 46
Number of death events with diabetes as a medical condition
# Count rows where both the death-event and diabetes flags are TRUE.
TdeTdi <- with(HFCR_DS, sum(DEATH_EVENT & diabetes, na.rm = TRUE))
print(TdeTdi)
## [1] 40
Number of death events with high blood pressure as a medical condition
# Count rows where both the death-event and high-blood-pressure flags are TRUE.
TdeTHp <- with(HFCR_DS, sum(DEATH_EVENT & high_blood_pressure, na.rm = TRUE))
print(TdeTHp)
## [1] 39
Number of death events related with smokers
# Count rows where both the death-event and smoking flags are TRUE.
TdeTsm <- with(HFCR_DS, sum(DEATH_EVENT & smoking, na.rm = TRUE))
print(TdeTsm)
## [1] 30
Plot of impact of medical conditions on total of death events.
# library() stops with an error when ggplot2 is missing; require() would only
# return FALSE and let the plot call fail later with a less helpful message.
library(ggplot2)

# One row per (series, condition): deaths with the condition next to the
# overall death total, in long format for a dodged bar chart.
conditions <- c("Anaemia", "Diabetes", "High B. Pressure", "Smokers")
data <- rbind(
  data.frame(
    type = "Death_Events_per_Medical_conditions",
    Death_Events_Vs_Death_Events_related_to_M.Condition =
      c(TdeTan, TdeTdi, TdeTHp, TdeTsm),
    Medical_Condition = conditions
  ),
  data.frame(
    type = "Total_Death_events",
    Death_Events_Vs_Death_Events_related_to_M.Condition =
      rep(Tde, length(conditions)),
    Medical_Condition = conditions
  )
)

# geom_col() plots the supplied values as bar heights.
ggplot(
  data,
  aes(
    y = Death_Events_Vs_Death_Events_related_to_M.Condition,
    x = Medical_Condition,
    fill = type
  )
) +
  geom_col(width = 1, position = "dodge2") +
  scale_fill_manual(
    "Legend",
    values = c(
      "Death_Events_per_Medical_conditions" = "#666666",
      "Total_Death_events" = "#CCCCCC"
    )
  )
Boxplot of each numerical variable showing outliers.
Age has no outliers, as shown below.
# Box-and-whisker plot of patient age on a fixed 0-100 axis.
boxplot(
  HFCR_DS[["age"]],
  ylim = c(0, 100),
  ylab = "Age (years)"
)
Creatinine phosphokinase has outliers
# Box-and-whisker plot of creatinine phosphokinase.
boxplot(
  HFCR_DS[["creatinine_phosphokinase"]],
  ylab = "Creatine Phosphokinase (mcg/L)"
)
Ejection fraction has outliers
# Box-and-whisker plot of ejection fraction.
boxplot(
  HFCR_DS[["ejection_fraction"]],
  ylab = "Ejection fraction (%)"
)
Platelets has outliers
# Box-and-whisker plot of platelet counts.
boxplot(
  HFCR_DS[["platelets"]],
  ylab = "Platelets (kiloplatelets/mL)"
)
Serum creatinine has outliers
# Box-and-whisker plot of serum creatinine.
# The axis label previously carried a stray closing parenthesis.
boxplot(HFCR_DS$serum_creatinine, ylab = "Serum creatinine (mg/dL)")
Serum Sodium has outliers
# Box-and-whisker plot of serum sodium.
# Label fixed: stray ")" removed and the unit aligned with the serum sodium
# histogram, which labels the same variable in mEq/L.
boxplot(HFCR_DS$serum_sodium, ylab = "Serum sodium (mEq/L)")
Visualization of correlations in a correlation plot (package corrplot).
# Scatter plot: age on the x-axis, creatinine phosphokinase on the y-axis.
with(
  HFCR_DS,
  plot(
    age,
    creatinine_phosphokinase,
    xlab = "Age (years)",
    ylab = "Creatinine Phosphokinase (mcg/L)"
  )
)
Serum Creatinine & Creatinine Phosphokinase have a strong correlation since are proportional.
# Scatter plot: creatinine phosphokinase (x) against serum creatinine (y).
with(
  HFCR_DS,
  plot(
    creatinine_phosphokinase,
    serum_creatinine,
    xlab = "Creatinine Phosphokinase (mcg/L)",
    ylab = "Serum Creatinine (mg/dL)"
  )
)
Most of the average values of creatinine phosphokinase are proportionally related to the average values of serum sodium
# Scatter plot: creatinine phosphokinase (x) against serum sodium (y).
# Label fixed: the closing parenthesis was missing, and the unit is aligned
# with the serum sodium histogram, which labels the same variable in mEq/L.
plot(
  HFCR_DS$creatinine_phosphokinase,
  HFCR_DS$serum_sodium,
  xlab = "Creatinine Phosphokinase (mcg/L)",
  ylab = "Serum sodium (mEq/L)"
)
Serum creatinine values are weakly related to ejection fraction
# Scatter plot: ejection fraction (x) against serum creatinine (y).
with(
  HFCR_DS,
  plot(
    ejection_fraction,
    serum_creatinine,
    xlab = "Ejection Fraction (%)",
    ylab = "Serum Creatinine (mg/dL) "
  )
)
Calculation of the correlation and covariance between the numerical variables.
Correlation of Age with Ejection Fraction, and of Ejection Fraction with Creatinine Phosphokinase, respectively
# Correlation coefficient between age and ejection fraction.
age_cor <- with(HFCR_DS, cor(age, ejection_fraction))
print(age_cor)
## [1] 0.06009836
# Correlation coefficient between ejection fraction and creatinine
# phosphokinase.
ejectionf_cor <- with(HFCR_DS, cor(ejection_fraction, creatinine_phosphokinase))
print(ejectionf_cor)
## [1] -0.04407955
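Two-argument `var()` of Age with Ejection Fraction, and of Ejection Fraction with Creatinine Phosphokinase, respectively. Note that `var(x, y)` with two vectors returns their covariance, so these values are identical to the covariances reported in the next section.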
# NOTE(review): var() called with two vectors returns their covariance, not a
# variance, and this assignment replaces the age variance stored in age_var
# earlier in the report. Confirm which quantity was intended.
age_var <- with(HFCR_DS, var(age, ejection_fraction))
print(age_var)
## [1] 8.460237
# NOTE(review): var() called with two vectors returns their covariance, not a
# variance. Confirm which quantity was intended.
ejectionf_var <- with(HFCR_DS, var(ejection_fraction, creatinine_phosphokinase))
print(ejectionf_var)
## [1] -506.1745
Covariance of Age with Ejection Fraction, and of Ejection Fraction with Creatinine Phosphokinase, respectively
# Covariance between age and ejection fraction.
age_cov <- with(HFCR_DS, cov(age, ejection_fraction))
print(age_cov)
## [1] 8.460237
# Covariance between ejection fraction and creatinine phosphokinase.
ejectionf_cov <- with(HFCR_DS, cov(ejection_fraction, creatinine_phosphokinase))
print(ejectionf_cov)
## [1] -506.1745