# Installing Libraries for the Project
# install.packages("tidyverse")
# install.packages("readxl")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
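The data frame `df` used throughout this section is assumed to be read from the project's hospital-costs Excel workbook; a minimal loading sketch (the file name below is a placeholder, not the project's actual path) would be:
# Assumed loading step - replace the placeholder file name with the actual workbook
df <- read_excel("HospitalCosts.xlsx")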
# Checking Data Attributes
head(df) # Head of the Data
## # A tibble: 6 × 6
## AGE FEMALE LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17 1 2 1 2660 560
## 2 17 0 2 1 1689 753
## 3 17 1 7 1 20060 930
## 4 17 1 1 1 736 758
## 5 17 1 1 1 1194 754
## 6 17 0 0 1 3305 347
tail(df) # Tail of the Data
## # A tibble: 6 × 6
## AGE FEMALE LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 3 1 1886 640
## 2 0 1 6 1 5881 636
## 3 0 1 2 1 1171 640
## 4 0 1 2 1 1171 640
## 5 0 1 2 1 1086 640
## 6 0 0 4 1 4931 640
dim(df) # Shape of the Data - dimensions nrow(), ncol()
## [1] 500 6
length(df) # Length of the Data - no. of columns
## [1] 6
nrow(df) # No. of entries in the dataset
## [1] 500
str(df) # Data Information
## tibble [500 × 6] (S3: tbl_df/tbl/data.frame)
## $ AGE : num [1:500] 17 17 17 17 17 17 17 16 16 17 ...
## $ FEMALE: num [1:500] 1 0 1 1 1 0 1 1 1 1 ...
## $ LOS : num [1:500] 2 2 7 1 1 0 4 2 1 2 ...
## $ RACE : num [1:500] 1 1 1 1 1 1 1 1 1 1 ...
## $ TOTCHG: num [1:500] 2660 1689 20060 736 1194 ...
## $ APRDRG: num [1:500] 560 753 930 758 754 347 754 754 753 758 ...
glimpse(df) # From the dplyr package (part of the tidyverse)
## Rows: 500
## Columns: 6
## $ AGE <dbl> 17, 17, 17, 17, 17, 17, 17, 16, 16, 17, 17, 15, 15, 15, 15, 15,…
## $ FEMALE <dbl> 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ LOS <dbl> 2, 2, 7, 1, 1, 0, 4, 2, 1, 2, 2, 2, 2, 4, 7, 4, 1, 4, 3, 3, 1, …
## $ RACE <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ TOTCHG <dbl> 2660, 1689, 20060, 736, 1194, 3305, 2205, 1167, 532, 1363, 1245…
## $ APRDRG <dbl> 560, 753, 930, 758, 754, 347, 754, 754, 753, 758, 758, 753, 751…
col_names_vec <- colnames(df) # Finding the names of the columns
col_names_vec <- names(df) # names() is equivalent to colnames() for a data frame
col_names_list <- as.list(col_names_vec) # Converting the column names to a list
col_names_vec
## [1] "AGE" "FEMALE" "LOS" "RACE" "TOTCHG" "APRDRG"
col_names_list
## [[1]]
## [1] "AGE"
##
## [[2]]
## [1] "FEMALE"
##
## [[3]]
## [1] "LOS"
##
## [[4]]
## [1] "RACE"
##
## [[5]]
## [1] "TOTCHG"
##
## [[6]]
## [1] "APRDRG"
sapply(df, class) # Finding the class (data type) of each column
## AGE FEMALE LOS RACE TOTCHG APRDRG
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
sum(is.na(df)) # Count total missing (NA) values in the entire data frame
## [1] 1
# Summarising using Tidyverse dplyr
df %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 6
## AGE FEMALE LOS RACE TOTCHG APRDRG
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 1 0 0
# Count the number of rows that are exact duplicates
sum(duplicated(df))
## [1] 26
#### Removing Duplicate Entries
# View all rows that are exact duplicates (all columns match a prior entry)
df[duplicated(df), ]
## # A tibble: 26 × 6
## AGE FEMALE LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 2 1 1191 640
## 2 0 0 2 1 1280 640
## 3 0 1 2 1 1096 640
## 4 0 0 3 1 1669 640
## 5 0 1 2 1 1096 640
## 6 0 1 2 1 1096 640
## 7 0 0 2 1 1395 640
## 8 0 0 2 1 1286 640
## 9 0 1 2 1 1096 640
## 10 0 0 2 1 1280 640
## # ℹ 16 more rows
# Using dplyr::distinct() to remove duplicates (recommended step)
#library(dplyr)
df <- df %>%
distinct()
# Equivalent base R alternative, unique() (a no-op here since distinct() already removed the duplicates)
df <- unique(df)
#### Renaming Column Names
names(df) <- c("AGE","GENDER","LOS","RACE","TOTCHG","APRDRG")
names(df)
## [1] "AGE" "GENDER" "LOS" "RACE" "TOTCHG" "APRDRG"
str(df) # Confirming the column names have changed in the data frame
## tibble [474 × 6] (S3: tbl_df/tbl/data.frame)
## $ AGE : num [1:474] 17 17 17 17 17 17 17 16 16 17 ...
## $ GENDER: num [1:474] 1 0 1 1 1 0 1 1 1 1 ...
## $ LOS : num [1:474] 2 2 7 1 1 0 4 2 1 2 ...
## $ RACE : num [1:474] 1 1 1 1 1 1 1 1 1 1 ...
## $ TOTCHG: num [1:474] 2660 1689 20060 736 1194 ...
## $ APRDRG: num [1:474] 560 753 930 758 754 347 754 754 753 758 ...
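Had dplyr been used instead of base R, the same rename could be written as the sketch below (commented out because the column has already been renamed above):
# Equivalent dplyr alternative: rename() takes new_name = old_name
# df <- df %>% rename(GENDER = FEMALE)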
#### Missing Value (NA) Treatment
colSums(is.na(df)) # Count missing (NA) values per column
## AGE GENDER LOS RACE TOTCHG APRDRG
## 0 0 0 1 0 0
# Find all rows that contain at least one NA value
df[rowSums(is.na(df)) > 0, ]
## # A tibble: 1 × 6
## AGE GENDER LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 2 NA 1156 640
# Find rows where the RACE column has an NA value
df[is.na(df$RACE), ]
## # A tibble: 1 × 6
## AGE GENDER LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 2 NA 1156 640
# Dropping the row containing the NA with tidyr::drop_na() (tidyr is loaded as part of the tidyverse)
# library(tidyr)
df <- df %>% drop_na()
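Dropping the row is reasonable here because only a single record is affected. As an alternative sketch (not run), the missing RACE code could instead be imputed with the most frequent value:
# Alternative (not run): impute the single missing RACE with the modal race code
# mode_race <- as.numeric(names(which.max(table(df$RACE))))
# df$RACE[is.na(df$RACE)] <- mode_race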
#### Distribution Check - Summary of All Features
summary(df) # Statistical Summary of the Data
## AGE GENDER LOS RACE
## Min. : 0.000 Min. :0.0000 Min. : 0.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 2.000 1st Qu.:1.000
## Median : 0.000 Median :1.0000 Median : 2.000 Median :1.000
## Mean : 5.376 Mean :0.5053 Mean : 2.871 Mean :1.082
## 3rd Qu.:14.000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.000
## Max. :17.000 Max. :1.0000 Max. :41.000 Max. :6.000
## TOTCHG APRDRG
## Min. : 532 Min. : 21
## 1st Qu.: 1245 1st Qu.:640
## Median : 1620 Median :640
## Mean : 2863 Mean :615
## 3rd Qu.: 2582 3rd Qu.:753
## Max. :48388 Max. :952
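summary() reports only location statistics; a quick sketch of the spread, using the same across() pattern as the NA count above, would add the standard deviation of each column:
# Standard deviation of every column (complements the summary() above)
df %>%
  summarise(across(everything(), sd))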
# Get a vector of all column names
col_names_vec <- colnames(df)
# Loop through each column to create distribution plots
library(ggplot2)
#library(dplyr)
# Define the output directory where plots will be saved
output_path <- "C:/Users/pprad/PradM_WD/RStudioWD/My_Projects/P7_USHealthcare-Agency/outputs/figures"
# Check if the directory exists, if not, create it
if (!dir.exists(output_path)) {
dir.create(output_path, recursive = TRUE)
}
# Get a vector of all column names
col_names_vec <- names(df)
# Loop through each column to create distribution plots
for (col in col_names_vec) {
# Check if the column is numeric
if (is.numeric(df[[col]])) {
# Create the histogram plot object
hist_plot <- ggplot(df, aes(x = .data[[col]])) +
geom_histogram(fill = 'skyblue', color = 'black') +
labs(title = paste("Histogram of", col), x = col, y = "Count")
# Print the plot to the RStudio Plots panel
print(hist_plot)
# Save the plot to the specified directory
ggsave(filename = file.path(output_path, paste0(col, '_histogram.png')), plot = hist_plot)
# Create the density plot object
density_plot <- ggplot(df, aes(x = .data[[col]])) +
geom_density(fill = 'lightgreen', alpha = 0.5) +
labs(title = paste("Density Plot of", col), x = col)
# Print the plot to the RStudio Plots panel
print(density_plot)
# Save the plot to the specified directory
ggsave(filename = file.path(output_path, paste0(col, '_density.png')), plot = density_plot)
cat(paste("Saved histogram and density plot for", col, "\n"))
} else {
cat(paste(col, "is not a numeric variable. Skipping plots.\n"))
}
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for AGE
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for GENDER
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for LOS
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for RACE
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for TOTCHG
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Saving 7 x 5 in image
## Saved histogram and density plot for APRDRG
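The `stat_bin()` and `Saving 7 x 5 in` messages above are just defaults being reported; a hedged refinement, shown here for the AGE column only, would set the bin count explicitly and pass explicit dimensions to ggsave():
# Example refinement for one column: explicit bins and explicit output size
hist_age <- ggplot(df, aes(x = AGE)) +
  geom_histogram(bins = 30, fill = 'skyblue', color = 'black') +
  labs(title = "Histogram of AGE", x = "AGE", y = "Count")
ggsave(file.path(output_path, "AGE_histogram.png"), plot = hist_age,
       width = 7, height = 5, units = "in")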
# Find the age with the highest number of hospital visits
age_visits <- df %>%
count(AGE) %>%
arrange(desc(n))
print("Top 5 ages by hospital visits:")
## [1] "Top 5 ages by hospital visits:"
print(head(age_visits, 5))
## # A tibble: 5 × 2
## AGE n
## <dbl> <int>
## 1 0 280
## 2 17 38
## 3 15 29
## 4 16 29
## 5 14 25
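To see the same ranking visually, a minimal sketch plotting the visit counts per age (reusing the age_visits table built above) could be:
# Bar chart of hospital visits by age, from the age_visits counts above
visits_plot <- ggplot(age_visits, aes(x = factor(AGE), y = n)) +
  geom_col(fill = 'steelblue') +
  labs(title = "Hospital Visits by Age", x = "Age", y = "Number of Visits")
print(visits_plot)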
# Find the age with the highest total expenditure
age_expenditure <- df %>%
group_by(AGE) %>%
summarize(Total_Expenditure = sum(TOTCHG, na.rm = TRUE)) %>%
arrange(desc(Total_Expenditure))
print("Top 5 ages by total expenditure:")
## [1] "Top 5 ages by total expenditure:"
print(head(age_expenditure, 5))
## # A tibble: 5 × 2
## AGE Total_Expenditure
## <dbl> <dbl>
## 1 0 644905
## 2 17 174777
## 3 15 111747
## 4 16 69149
## 5 14 64643
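Both questions can also be answered in a single grouped pass; a sketch combining the visit count and total expenditure per age:
# Visits and total charges per age in one grouped summary
age_summary <- df %>%
  group_by(AGE) %>%
  summarise(Visits = n(), Total_Expenditure = sum(TOTCHG, na.rm = TRUE)) %>%
  arrange(desc(Visits))
head(age_summary, 5)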
print(df[df$AGE == 0, ], n = 300) # Inspect all admissions where AGE == 0 (infants)
## # A tibble: 280 × 6
## AGE GENDER LOS RACE TOTCHG APRDRG
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 41 1 29188 602
## 2 0 0 2 1 4717 138
## 3 0 0 12 1 15129 137
## 4 0 1 2 1 1085 640
## 5 0 0 3 1 1607 640
## 6 0 1 3 1 1499 640
## 7 0 1 3 1 7648 53
## 8 0 1 2 1 1527 640
## 9 0 0 2 1 1483 640
## 10 0 1 4 1 2844 640
## 11 0 1 3 1 3124 640
## 12 0 0 3 1 1760 640
## 13 0 1 2 1 1278 640
## 14 0 1 2 1 1620 640
## 15 0 1 2 1 1220 640
## 16 0 1 2 1 1134 640
## 17 0 0 3 1 1656 640
## 18 0 0 4 5 4072 639
## 19 0 0 2 5 1393 143
## 20 0 0 0 5 615 254
## 21 0 0 2 1 1385 640
## 22 0 0 2 1 1224 640
## 23 0 1 3 1 1779 640
## 24 0 0 2 1 1526 640
## 25 0 0 1 1 2075 581
## 26 0 0 17 1 12042 633
## 27 0 0 2 1 1309 640
## 28 0 0 2 1 1290 640
## 29 0 0 2 1 1280 640
## 30 0 0 3 1 1719 640
## 31 0 1 2 1 1102 640
## 32 0 1 3 1 1543 640
## 33 0 1 2 1 1174 640
## 34 0 1 2 1 1105 640
## 35 0 0 2 1 1335 640
## 36 0 0 2 1 1550 640
## 37 0 0 4 1 2473 640
## 38 0 0 2 1 1322 640
## 39 0 0 4 1 2553 640
## 40 0 1 2 1 1191 640
## 41 0 0 2 1 1439 640
## 42 0 1 2 1 1237 640
## 43 0 0 2 1 1265 640
## 44 0 1 4 1 2280 640
## 45 0 0 2 1 1096 640
## 46 0 1 2 1 1156 640
## 47 0 0 2 1 1199 640
## 48 0 1 4 1 2518 640
## 49 0 1 2 1 1246 640
## 50 0 1 3 1 1821 640
## 51 0 0 5 1 3101 626
## 52 0 1 2 1 1176 640
## 53 0 0 3 1 1891 640
## 54 0 0 1 1 935 640
## 55 0 0 2 1 1395 640
## 56 0 0 2 1 1561 640
## 57 0 1 7 1 6912 636
## 58 0 0 3 1 2197 640
## 59 0 0 4 1 2288 640
## 60 0 0 2 1 1320 640
## 61 0 1 2 1 1139 640
## 62 0 1 4 1 2134 639
## 63 0 0 2 1 1407 640
## 64 0 0 2 1 1982 640
## 65 0 0 4 1 2539 640
## 66 0 0 2 1 1528 640
## 67 0 1 2 1 1513 640
## 68 0 0 2 1 3977 139
## 69 0 1 2 1 1269 640
## 70 0 0 2 1 1501 640
## 71 0 1 2 1 1396 640
## 72 0 0 3 1 1777 640
## 73 0 1 1 1 833 640
## 74 0 1 1 1 715 640
## 75 0 0 2 1 1375 640
## 76 0 0 2 1 1330 640
## 77 0 0 2 1 1628 640
## 78 0 0 2 1 1368 640
## 79 0 0 39 1 26356 421
## 80 0 0 2 1 1305 640
## 81 0 0 2 1 1416 640
## 82 0 0 2 1 877 640
## 83 0 1 4 1 2082 640
## 84 0 1 2 1 1096 640
## 85 0 0 3 1 2028 640
## 86 0 0 2 1 1104 640
## 87 0 1 2 1 1093 640
## 88 0 1 4 1 2592 640
## 89 0 0 4 1 3609 640
## 90 0 0 3 1 2118 639
## 91 0 0 2 1 1805 640
## 92 0 0 1 1 2825 580
## 93 0 1 2 1 1299 640
## 94 0 0 2 1 1413 640
## 95 0 1 3 1 2354 640
## 96 0 1 2 1 1214 640
## 97 0 1 3 1 1843 640
## 98 0 1 4 1 2090 640
## 99 0 1 2 1 1111 640
## 100 0 1 3 1 1505 640
## 101 0 0 2 1 1277 640
## 102 0 0 3 1 1828 640
## 103 0 0 2 1 1581 640
## 104 0 1 2 1 1128 640
## 105 0 0 2 1 1976 640
## 106 0 0 3 1 2144 640
## 107 0 0 2 1 1283 640
## 108 0 1 3 1 1725 640
## 109 0 1 0 1 2847 581
## 110 0 1 3 1 1947 640
## 111 0 0 3 1 1669 640
## 112 0 1 2 1 1404 640
## 113 0 0 2 1 1272 640
## 114 0 1 2 1 1161 640
## 115 0 0 2 1 1286 640
## 116 0 0 2 1 1454 640
## 117 0 0 1 1 874 640
## 118 0 1 3 1 2129 626
## 119 0 1 3 1 2149 626
## 120 0 1 2 1 1317 640
## 121 0 1 2 1 1281 640
## 122 0 0 3 1 2330 640
## 123 0 0 8 1 5014 640
## 124 0 1 3 1 2029 640
## 125 0 1 3 1 1530 640
## 126 0 0 3 1 2220 626
## 127 0 0 2 1 1267 640
## 128 0 0 3 1 2057 640
## 129 0 0 2 1 1351 640
## 130 0 1 18 1 10649 626
## 131 0 1 3 1 2777 640
## 132 0 1 3 1 1915 640
## 133 0 0 2 1 1420 640
## 134 0 0 2 1 1337 640
## 135 0 0 3 1 2187 640
## 136 0 1 3 1 1748 640
## 137 0 1 2 1 1141 640
## 138 0 0 3 1 1795 640
## 139 0 1 2 1 1436 640
## 140 0 0 2 1 1156 640
## 141 0 0 2 1 1263 640
## 142 0 1 2 1 1324 640
## 143 0 0 3 1 1874 640
## 144 0 0 2 1 1538 640
## 145 0 0 4 1 3257 640
## 146 0 1 2 1 1576 633
## 147 0 0 15 1 8631 614
## 148 0 1 2 1 1173 640
## 149 0 0 3 1 2516 640
## 150 0 0 2 1 1446 640
## 151 0 0 3 1 1807 640
## 152 0 0 2 1 1307 640
## 153 0 0 3 1 1739 640
## 154 0 1 2 1 1298 640
## 155 0 0 1 1 921 640
## 156 0 0 7 1 8184 634
## 157 0 1 3 1 2051 640
## 158 0 1 3 1 1743 640
## 159 0 0 2 1 1491 640
## 160 0 1 3 1 1772 640
## 161 0 0 2 1 1437 640
## 162 0 1 4 1 2724 640
## 163 0 0 2 2 1291 640
## 164 0 1 2 1 1211 640
## 165 0 0 2 1 1383 640
## 166 0 1 3 1 1660 640
## 167 0 0 2 1 1458 640
## 168 0 0 2 1 1427 640
## 169 0 1 2 1 1217 640
## 170 0 1 2 1 1099 640
## 171 0 0 1 1 871 640
## 172 0 1 1 1 1688 138
## 173 0 1 3 1 2877 422
## 174 0 0 1 1 2805 138
## 175 0 0 2 1 4412 138
## 176 0 1 2 1 1818 249
## 177 0 0 5 1 4296 640
## 178 0 0 3 1 1887 640
## 179 0 1 2 1 1418 640
## 180 0 1 2 1 1136 640
## 181 0 1 2 1 1475 640
## 182 0 1 3 1 1538 640
## 183 0 0 1 1 1097 640
## 184 0 1 2 1 1362 640
## 185 0 0 3 1 3342 640
## 186 0 0 6 1 5568 640
## 187 0 1 7 1 4288 639
## 188 0 1 7 1 5788 614
## 189 0 1 0 1 2531 581
## 190 0 1 23 1 13112 614
## 191 0 0 3 1 3497 640
## 192 0 0 3 1 1832 640
## 193 0 1 3 1 1685 640
## 194 0 0 2 1 1135 640
## 195 0 1 2 1 1303 640
## 196 0 1 3 1 1890 640
## 197 0 0 3 1 1674 640
## 198 0 0 4 1 2595 640
## 199 0 0 3 1 1782 640
## 200 0 1 2 1 1251 640
## 201 0 0 3 1 1997 640
## 202 0 0 2 1 1758 640
## 203 0 0 2 1 1369 640
## 204 0 0 2 1 1477 640
## 205 0 0 2 1 1340 640
## 206 0 0 2 1 1236 640
## 207 0 0 5 1 4677 640
## 208 0 0 1 1 1051 640
## 209 0 1 4 3 3041 626
## 210 0 0 1 1 550 640
## 211 0 0 2 1 1039 640
## 212 0 1 3 1 1769 633
## 213 0 0 2 1 1794 640
## 214 0 0 2 1 1578 640
## 215 0 0 3 1 1883 640
## 216 0 0 2 6 1535 640
## 217 0 1 2 1 1200 640
## 218 0 1 3 1 1946 640
## 219 0 1 2 1 1875 640
## 220 0 1 3 1 1908 640
## 221 0 0 3 1 2080 640
## 222 0 0 2 1 1408 640
## 223 0 0 3 1 2008 640
## 224 0 0 2 1 1278 640
## 225 0 1 2 1 1083 640
## 226 0 0 3 1 1964 640
## 227 0 0 2 1 1231 640
## 228 0 0 3 1 1845 640
## 229 0 0 2 1 1428 640
## 230 0 1 7 1 5934 640
## 231 0 1 3 1 1888 640
## 232 0 1 3 1 1431 640
## 233 0 0 1 1 651 640
## 234 0 1 4 1 3969 640
## 235 0 0 2 1 1363 640
## 236 0 1 2 1 1118 640
## 237 0 1 2 1 1543 640
## 238 0 1 5 1 4304 640
## 239 0 0 3 1 1933 640
## 240 0 0 2 1 1120 640
## 241 0 0 2 1 1465 640
## 242 0 0 2 1 1768 634
## 243 0 0 2 1 1284 640
## 244 0 0 3 1 1952 640
## 245 0 0 4 1 2100 640
## 246 0 0 2 1 1281 640
## 247 0 1 2 1 1250 640
## 248 0 1 2 1 1178 640
## 249 0 1 24 1 13040 863
## 250 0 1 3 1 1517 640
## 251 0 0 3 4 3126 640
## 252 0 1 4 1 2049 640
## 253 0 0 2 1 1245 640
## 254 0 1 3 1 1603 640
## 255 0 1 3 1 2840 640
## 256 0 1 2 1 1411 640
## 257 0 1 2 1 1175 640
## 258 0 0 2 1 1689 640
## 259 0 1 3 1 2023 640
## 260 0 0 2 1 2540 640
## 261 0 0 7 1 10431 636
## 262 0 0 3 1 2204 633
## 263 0 0 3 1 2218 640
## 264 0 0 2 1 1382 640
## 265 0 1 2 1 1115 640
## 266 0 0 1 1 825 640
## 267 0 0 3 1 1701 640
## 268 0 0 2 1 1273 640
## 269 0 0 2 1 1270 640
## 270 0 1 2 1 1106 640
## 271 0 1 2 1 1065 640
## 272 0 0 2 1 1264 640
## 273 0 0 2 1 1282 640
## 274 0 0 2 1 1393 640
## 275 0 0 2 1 1266 640
## 276 0 1 3 1 1886 640
## 277 0 1 6 1 5881 636
## 278 0 1 2 1 1171 640
## 279 0 1 2 1 1086 640
## 280 0 0 4 1 4931 640
sum(df$TOTCHG[df$AGE == 0], na.rm = TRUE) # Total charges for age-0 admissions
## [1] 644905
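As a quick check on how dominant this group is, the same figure can be expressed as a share of all charges; a minimal sketch:
# Share of total hospital charges attributable to age-0 admissions
sum(df$TOTCHG[df$AGE == 0], na.rm = TRUE) / sum(df$TOTCHG, na.rm = TRUE)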