Final Project Quarto

Author

Telesphore E. L. Kabore

Loading Libraries

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(infer)
library(skimr)

Load data set

# Read the data by its full path instead of calling setwd(): changing the
# working directory is a session-wide side effect that alters how every later
# relative path resolves, and it makes the document harder to move or share.
# path.expand() resolves the leading "~" to the user's home directory.
data_dir <- path.expand(
  "~/Telesphore/Personnel/Etudes/Montgomery_College/Data_Sciences_Certificate_program/Math_217/Final_Project"
)
SeasonalEffect <- read_csv(file.path(data_dir, "SeasonalEffect.csv"))
Rows: 2919 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (14): Age, Female, Race, BMI, ASAstatus, Diabetes, ChronicRenalFailure, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data cleaning

# Print a compact, column-wise overview (name, type, first values) of the data
SeasonalEffect %>%
  glimpse()
Rows: 2,919
Columns: 14
$ Age                 <dbl> 44.0, 28.1, 39.7, 26.6, 69.0, 40.2, 69.0, 27.8, 57…
$ Female              <dbl> 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,…
$ Race                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ BMI                 <dbl> 24.3, 20.3, 21.6, 22.3, 16.0, 23.3, 19.9, 24.9, 30…
$ ASAstatus           <dbl> 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 3, 2, 3, 2, 2, 3,…
$ Diabetes            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ ChronicRenalFailure <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ PreopSteroids       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
$ Emergency           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
$ DurationSurgery     <dbl> 5.38, 2.78, 1.92, 3.45, 6.27, 4.33, 3.68, 3.52, 7.…
$ VitaminD            <dbl> NA, NA, NA, 37, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Season              <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
$ RBC                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 963, 0, 0, 0, 550, 0…
$ SSI                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
# List the variable names so they can be matched against their definitions
colnames(SeasonalEffect)
 [1] "Age"                 "Female"              "Race"               
 [4] "BMI"                 "ASAstatus"           "Diabetes"           
 [7] "ChronicRenalFailure" "PreopSteroids"       "Emergency"          
[10] "DurationSurgery"     "VitaminD"            "Season"             
[13] "RBC"                 "SSI"                
# Normalise the column headers in one step: lower-case every name, then strip
# any literal space characters from it
names(SeasonalEffect) <- gsub(
  " ", "", tolower(names(SeasonalEffect)),
  fixed = TRUE
)
# Show the first six rows to confirm the renamed headers
head(SeasonalEffect, n = 6)
# A tibble: 6 × 14
    age female  race   bmi asastatus diabetes chronicrenalfailure preopsteroids
  <dbl>  <dbl> <dbl> <dbl>     <dbl>    <dbl>               <dbl>         <dbl>
1  44        0     1  24.3         2        0                   0             0
2  28.1      0     1  20.3         2        0                   0             0
3  39.7      1     1  21.6         2        0                   0             1
4  26.6      1     1  22.3         2        0                   0             0
5  69        1     1  16           3        0                   0             0
6  40.2      1     1  23.3         2        0                   0             0
# ℹ 6 more variables: emergency <dbl>, durationsurgery <dbl>, vitamind <dbl>,
#   season <dbl>, rbc <dbl>, ssi <dbl>
# Keep only the rows that have a recorded vitamin D value.
# The column must be referenced by its bare, lower-cased name (headers were
# lower-cased above). Writing is.na("VitaminD") tests the string literal
# itself, which is never NA, so that condition was TRUE for every row and
# nothing was filtered out.
SeasonalEffect1 <- SeasonalEffect %>%
  filter(!is.na(vitamind))
# Preview the first rows of the filtered data
head(SeasonalEffect1)
# A tibble: 6 × 14
    age female  race   bmi asastatus diabetes chronicrenalfailure preopsteroids
  <dbl>  <dbl> <dbl> <dbl>     <dbl>    <dbl>               <dbl>         <dbl>
1  44        0     1  24.3         2        0                   0             0
2  28.1      0     1  20.3         2        0                   0             0
3  39.7      1     1  21.6         2        0                   0             1
4  26.6      1     1  22.3         2        0                   0             0
5  69        1     1  16           3        0                   0             0
6  40.2      1     1  23.3         2        0                   0             0
# ℹ 6 more variables: emergency <dbl>, durationsurgery <dbl>, vitamind <dbl>,
#   season <dbl>, rbc <dbl>, ssi <dbl>

Three (3) different plots of the SeasonalEffect1 data set

# Plot 1: histogram of patient age, grouped into 5-unit-wide bins
SeasonalEffect1 %>%
  ggplot(aes(x = age)) +
  geom_histogram(
    binwidth = 5,
    color = "black",
    fill = "darkblue"
  ) +
  labs(
    title = "Distribution of Age",
    x = "Age",
    y = "Count"
  )

# Plot 2: scatter plot of BMI against age, overlaid with a fitted linear
# trend line and its confidence band
SeasonalEffect1 %>%
  ggplot(aes(x = age, y = bmi)) +
  geom_point(color = "darkgreen") +
  geom_smooth(
    method = "lm",
    color = "red",
    se = TRUE
  ) +
  labs(
    title = "BMI vs Age",
    x = "Age",
    y = "BMI"
  )
`geom_smooth()` using formula = 'y ~ x'

# Plot 3: number of SSI cases by season, split by ASA status

# Convert the numeric ASA-status and season codes to factors so ggplot2
# treats them as discrete categories (discrete x axis and fill scale)
SeasonalEffect1$asastatus <- as.factor(SeasonalEffect1$asastatus)
SeasonalEffect1$season <- as.factor(SeasonalEffect1$season)

# Custom fill colors, keyed by ASA status level
my_colors <- c("1" = "#1f77b4", # Blue for ASA 1
               "2" = "#ff7f0e", # Orange for ASA 2
               "3" = "#2ca02c", # Green for ASA 3
               "4" = "#d62729") # Red for ASA 4

# Keep only the rows flagged as an SSI so the bars count SSI cases, as the
# title states. Without this step geom_bar() counts every patient and the
# ssi column is never used.
# NOTE(review): assumes ssi is coded 0 = no SSI, 1 = SSI - confirm against
# the data dictionary.
ssi_cases <- SeasonalEffect1 %>%
  filter(ssi == 1)

# Dodged bar chart: one bar per ASA status within each season.
# geom_bar() counts rows by default, so stat = "count" is not needed.
ggplot(ssi_cases, aes(x = season, fill = asastatus)) +
  geom_bar(position = "dodge") +
  labs(title = "SSI Distribution by Season and ASA Status",
       x = "season",
       y = "Count",
       fill = "ASA Status",
       caption = "Source: David Ngendahimana, “Season Effect Dataset”, TSHS Resources Portal (2016)") +
  # drop = FALSE keeps every ASA level in the legend even when a level has
  # no SSI cases
  scale_fill_manual(name = "ASA Status",
                    values = my_colors,
                    drop = FALSE) +
  theme_minimal()