# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(readxl)
# Import the workbook; clean_names() normalises the column headers
df <- clean_names(read_excel("~/project/R PROGRAM/healthcare/Dataset.xlsx"))
# Type fixes, applied in order:
#   1. admission date -> Date
#   2. every remaining character column -> factor
df <- df %>%
  mutate(date_of_admission = as.Date(date_of_admission, format = "%Y-%m-%d")) %>%
  mutate(across(where(is.character), as.factor))
# Inspect the resulting column types
str(df)
## tibble [1,656 × 12] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:1656] 6812 9043 1495 4309 8710 ...
## $ name : Factor w/ 1643 levels "Aaron Bradshaw",..: 157 1002 387 85 12 507 487 288 652 301 ...
## $ age : num [1:1656] 78 17 15 32 72 91 52 52 56 43 ...
## $ gender : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 1 2 1 2 1 ...
## $ city : Factor w/ 3 levels "Albuquerque",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ blood_type : Factor w/ 9 levels "A-","A+","AB-",..: 5 2 1 8 4 2 3 2 4 3 ...
## $ education : Factor w/ 3 levels "Bachelor","High School",..: 1 1 3 2 3 2 2 1 2 3 ...
## $ employment_status: Factor w/ 5 levels "Employed","Retired",..: 1 5 3 1 2 4 5 1 3 1 ...
## $ salary : num [1:1656] 80000 0 120000 45000 50000 0 0 65000 30000 100000 ...
## $ health_condition : Factor w/ 3 levels "Excellent","Good",..: 2 1 3 1 2 2 3 1 1 2 ...
## $ credit_score : Factor w/ 43 levels "380","400","420",..: 26 43 11 35 22 43 2 29 14 33 ...
## $ date_of_admission: Date[1:1656], format: "2024-01-31" "2019-08-20" ...
# Working copy without the identifier columns (not needed for analysis)
df1 <- df %>%
  select(-c(id, name))
# Convert credit score to numeric.
# credit_score is a factor at this point (see str(df) above). Calling
# as.numeric() directly on a factor returns the internal level codes
# (1, 2, 3, ...), not the labels such as "380" or "400". Going through
# as.character() first recovers the real values.
# NOTE(review): any level that is not a plain number becomes NA with a
# coercion warning -- confirm whether the source column contains such values.
df1$credit_score <- as.numeric(as.character(df1$credit_score))
# Should now list actual credit scores rather than factor level codes
str(df1$credit_score)
# Headline figures for the dataset: row count plus mean age and mean salary
# (missing values are ignored when averaging)
avg_salary <- mean(df1[["salary"]], na.rm = TRUE)
avg_age <- mean(df1[["age"]], na.rm = TRUE)
total_records <- nrow(df1)
# Assemble a one-row summary table and display it
summary_metrics <- data.frame(
  Total_Records = total_records,
  Average_Age = round(avg_age, digits = 1),
  Average_Salary = round(avg_salary, digits = 2)
)
print(summary_metrics)
## Total_Records Average_Age Average_Salary
## 1 1656 33 43663.67
# Bar chart: number of records per health condition, one colour per condition
ggplot(df1, aes(x = health_condition, fill = health_condition)) +
  geom_bar() +
  theme_minimal() +
  labs(
    title = "Health Condition Distribution",
    x = "Health Condition",
    y = "Frequency"
  )

# Box plots of credit score for each education level (legend suppressed,
# since the x axis already names the groups)
ggplot(df1) +
  geom_boxplot(
    aes(x = education, y = credit_score, fill = education),
    show.legend = FALSE
  ) +
  theme_minimal() +
  labs(
    title = "Education Level vs. Credit Score",
    x = "Education Level",
    y = "Credit Score"
  )

# Bin age into labelled groups.
# cut() builds right-closed intervals by default, i.e. (0, 18], (18, 35], ...
# Without include.lowest = TRUE an age of exactly 0 falls outside the first
# interval and becomes NA; with it the first bin is [0, 18], matching the
# "0-18" label.
df1 <- df1 %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(0, 18, 35, 50, 65, Inf),
      labels = c("0-18", "19-35", "36-50", "51-65", "65+"),
      include.lowest = TRUE
    )
  )
# Bar chart: record count in each age group, coloured by group
ggplot(df1) +
  geom_bar(aes(x = age_group, fill = age_group)) +
  theme_minimal() +
  labs(
    title = "Age Group Distribution",
    x = "Age Group",
    y = "Count"
  )

# Pie chart of record counts per city: a single stacked bar of the counts,
# wrapped into a circle by mapping y to the polar angle
df1 %>%
  count(city) %>%
  ggplot(aes(x = "", y = n, fill = city)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  theme_void() +
  labs(title = "City-wise Admission Counts")

# Monthly admission trend.
# df1$month is kept as a "YYYY-MM" character column, but plotting it directly
# would give a categorical x axis: months with no admissions vanish and the
# remaining points are spaced evenly regardless of the real time gap. For the
# plot, each month is therefore converted to the Date of its first day so the
# axis is a true time scale.
df1$month <- format(df1$date_of_admission, "%Y-%m")
df1 %>%
  filter(!is.na(month)) %>% # rows without an admission date cannot be placed
  count(month) %>%
  mutate(month_start = as.Date(paste0(month, "-01"), format = "%Y-%m-%d")) %>%
  ggplot(aes(x = month_start, y = n)) +
  geom_line(color = "blue") +
  geom_point() +
  # Label every sixth month to keep the axis readable over several years
  scale_x_date(date_breaks = "6 months", date_labels = "%Y-%m") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Monthly Admission Trends", x = "Month", y = "Admissions")
