# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(readxl)

# Import the healthcare workbook and clean its column names with janitor
df <- clean_names(
  read_excel("~/project/R PROGRAM/healthcare/Dataset.xlsx")
)

# Store the admission date as a Date (adjust the column name if the sheet differs)
df$date_of_admission <- as.Date(df$date_of_admission, format = "%Y-%m-%d")

# Turn every character column into a factor
df <- mutate(df, across(where(is.character), as.factor))

# Inspect the resulting column types
str(df)
## tibble [1,656 × 12] (S3: tbl_df/tbl/data.frame)
##  $ id               : num [1:1656] 6812 9043 1495 4309 8710 ...
##  $ name             : Factor w/ 1643 levels "Aaron Bradshaw",..: 157 1002 387 85 12 507 487 288 652 301 ...
##  $ age              : num [1:1656] 78 17 15 32 72 91 52 52 56 43 ...
##  $ gender           : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 1 2 1 2 1 ...
##  $ city             : Factor w/ 3 levels "Albuquerque",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ blood_type       : Factor w/ 9 levels "A-","A+","AB-",..: 5 2 1 8 4 2 3 2 4 3 ...
##  $ education        : Factor w/ 3 levels "Bachelor","High School",..: 1 1 3 2 3 2 2 1 2 3 ...
##  $ employment_status: Factor w/ 5 levels "Employed","Retired",..: 1 5 3 1 2 4 5 1 3 1 ...
##  $ salary           : num [1:1656] 80000 0 120000 45000 50000 0 0 65000 30000 100000 ...
##  $ health_condition : Factor w/ 3 levels "Excellent","Good",..: 2 1 3 1 2 2 3 1 1 2 ...
##  $ credit_score     : Factor w/ 43 levels "380","400","420",..: 26 43 11 35 22 43 2 29 14 33 ...
##  $ date_of_admission: Date[1:1656], format: "2024-01-31" "2019-08-20" ...
# Working copy without the identifier columns (not needed for the analysis)
df1 <- df %>%
  select(-c(id, name))

# Convert credit score to numeric.
# credit_score is a factor at this point (see the str(df) output), and
# as.numeric() applied directly to a factor returns the internal level codes
# (1, 2, 3, ...) rather than the values the labels spell out. Going through
# as.character() first recovers the real scores.
# NOTE(review): any label that is not a plain number becomes NA with a
# coercion warning -- confirm whether the sheet contains such entries.
df1$credit_score <- as.numeric(as.character(df1$credit_score))
str(df1$credit_score)
## (output from the earlier run removed: it showed level codes 26 43 11 ...
##  instead of credit scores; re-run to refresh)
# Headline figures for the dataset ----

# Row count of the analysis table
total_records <- nrow(df1)

# Mean age and mean salary, ignoring missing values
avg_age <- mean(df1[["age"]], na.rm = TRUE)
avg_salary <- mean(df1[["salary"]], na.rm = TRUE)

# One-row summary table: age rounded to 1 decimal place, salary to 2
summary_metrics <- data.frame(
  Total_Records = total_records,
  Average_Age = round(avg_age, digits = 1),
  Average_Salary = round(avg_salary, digits = 2)
)

print(summary_metrics)
##   Total_Records Average_Age Average_Salary
## 1          1656          33       43663.67
# Bar chart: number of records per health condition, one fill colour per level
ggplot(
  data = df1,
  mapping = aes(x = health_condition, fill = health_condition)
) +
  geom_bar() +
  labs(
    title = "Health Condition Distribution",
    x = "Health Condition",
    y = "Frequency"
  ) +
  theme_minimal()

# Box plot: spread of credit score within each education level.
# The fill colour repeats the x axis, so its legend is suppressed.
ggplot(data = df1, mapping = aes(x = education, y = credit_score)) +
  geom_boxplot(mapping = aes(fill = education), show.legend = FALSE) +
  labs(
    title = "Education Level vs. Credit Score",
    x = "Education Level",
    y = "Credit Score"
  ) +
  theme_minimal()

# Bin age into labelled groups.
# cut() builds right-closed intervals: (0,18], (18,35], (35,50], (50,65],
# (65,Inf]. include.lowest = TRUE closes the first interval on the left as
# well, so an age of exactly 0 lands in "0-18" instead of becoming NA.
# Note: an age of exactly 65 falls in "51-65"; the "65+" bin holds ages
# strictly above 65.
df1 <- df1 %>%
  mutate(
    age_group = cut(
      age,
      breaks = c(0, 18, 35, 50, 65, Inf),
      labels = c("0-18", "19-35", "36-50", "51-65", "65+"),
      include.lowest = TRUE
    )
  )

# Bar chart: number of records in each age group, coloured by group
ggplot(data = df1) +
  geom_bar(mapping = aes(x = age_group, fill = age_group)) +
  labs(title = "Age Group Distribution", x = "Age Group", y = "Count") +
  theme_minimal()

# Pie chart of record counts per city: a single stacked column of the
# per-city counts, wrapped into polar coordinates along the y axis
ggplot(
  data = count(df1, city),
  mapping = aes(x = "", y = n, fill = city)
) +
  geom_col(width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "City-wise Admission Counts") +
  theme_void()

# Year-month key ("YYYY-MM") for each admission; kept as a character column
df1$month <- format(df1$date_of_admission, "%Y-%m")

# Monthly admission counts drawn on a real date axis.
# Plotting the "YYYY-MM" strings directly gives a discrete axis: every month
# gets its own tick label and months without admissions are not drawn to
# scale. Converting each key to the first day of its month lets ggplot2
# place points by time and choose readable breaks.
df1 %>%
  filter(!is.na(month)) %>% # rows without an admission date have no month
  count(month) %>%
  mutate(month_start = as.Date(paste0(month, "-01"), format = "%Y-%m-%d")) %>%
  ggplot(aes(x = month_start, y = n)) +
  geom_line(color = "blue") +
  geom_point() +
  scale_x_date(date_labels = "%Y-%m") +
  theme_minimal() +
  labs(title = "Monthly Admission Trends", x = "Month", y = "Admissions")