# The journey began with loading the dataset, a simple yet powerful step that marked the start of my exploration. As I navigated through the initial data, I couldn't help but think about the stories it held. The real excitement began when I used the split() function to divide the data by Plan_Type. This function became a tool of empowerment for me, allowing me to segment the dataset into manageable parts and opening the door to a deeper understanding.
#
# Each split felt like discovering a new chapter in a book, with every plan type offering a unique perspective on the patients' experiences. I was fascinated by the patterns that began to emerge. I was particularly drawn to identifying the top two patients with the highest treatment costs for each plan type. It felt like piecing together a puzzle, with each patient’s data adding a vital piece to the overall picture.
#
# The deeper I delved into the analysis, the more connected I felt to the dataset. The task of computing correlation matrices was both challenging and rewarding. I was eager to see the relationships between age, treatment costs, and hospital visits. As I calculated correlations for each plan type, I felt like a detective uncovering clues. It was exhilarating to see how these variables interacted, revealing insights about patient behavior and healthcare utilization.
#
# The process wasn't without its challenges. There were moments of frustration when errors and warnings popped up, but each obstacle taught me valuable lessons. I learned to be patient, to approach problems methodically, and to appreciate the iterative nature of data analysis. Every challenge was an opportunity for growth, and each solution brought a sense of accomplishment.
#
# As I moved toward the final stages of the project, I felt a growing sense of satisfaction. Combining the correlations and presenting the results was like bringing my journey to a fulfilling conclusion. The colorful table I created was more than just a visual representation—it was a testament to the hours of effort and dedication I had poured into this project.
#
# Reflecting on this experience, I realize how much I’ve grown as a data analyst. This project was not just about applying technical skills; it was about engaging with data on a deeper level. I felt a strong connection to the process, from the initial exploration to the final presentation. It reminded me of why I love working with data—the thrill of discovery, the satisfaction of solving complex problems, and the joy of turning raw numbers into meaningful insights.
# Load necessary libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
# Load the Medicare dataset
medicare_data <- read_excel("C:/Users/jacob/Downloads/Medicare_Dataset.xlsx")
# Split the data by Plan_Type
split_medicare <- split(medicare_data, medicare_data$Plan_Type)
# Find top 2 patients with the highest treatment cost for each plan type
top_costs <- lapply(split_medicare, function(df) head(df[order(df$Treatment_Cost, decreasing = TRUE), ], 2))
# Combine the results into a single data frame
combined_top_costs <- do.call(rbind, top_costs)
# Compute correlation matrix for Age, Treatment_Cost, and Hospital_Visits per plan type
cor_medicare <- lapply(split_medicare, function(df) cor(df[, c("Age", "Treatment_Cost", "Hospital_Visits")]))
# Retrieve the highest correlation per plan type
top_correlation <- lapply(cor_medicare, function(cormat) {
melted <- melt(cormat, variable_name = "correlation")
filtered <- melted[melted$X1 != melted$X2, ]
filtered[which.max(filtered$correlation), ]
})
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
# Combine the top correlations
final_cor_results <- do.call("rbind", top_correlation)
# Print the results
print(combined_top_costs)
## # A tibble: 4 × 5
## Patient_ID Age Plan_Type Treatment_Cost Hospital_Visits
## * <dbl> <dbl> <chr> <dbl> <dbl>
## 1 11 34 A 9618 5
## 2 17 55 A 9545 5
## 3 6 46 B 9136 8
## 4 14 58 B 6305 9
print(final_cor_results)
## [1] X1 X2 value
## <0 rows> (or 0-length row.names)
# Load necessary library
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Define the code in a structured format
code_snippets <- data.frame(
Step = c(
"Load the Medicare dataset",
"Split the data by Plan_Type",
"Find top 2 patients with highest treatment cost",
"Combine the results into a single data frame",
"Compute correlation matrix",
"Retrieve the highest correlation per plan type",
"Combine the top correlations",
"Print the results"
),
Code = c(
"medicare_data <- read_excel('C:/Users/jacob/Downloads/Medicare_Dataset.xlsx')",
"split_medicare <- split(medicare_data, medicare_data$Plan_Type)",
"top_costs <- lapply(split_medicare, function(df) head(df[order(df$Treatment_Cost, decreasing = TRUE), ], 2))",
"combined_top_costs <- do.call(rbind, top_costs)",
"cor_medicare <- lapply(split_medicare, function(df) cor(df[, c('Age', 'Treatment_Cost', 'Hospital_Visits')]))",
"top_correlation <- lapply(cor_medicare, function(cormat) { melted <- melt(cormat, variable_name = 'correlation'); filtered <- melted[melted$X1 != melted$X2, ]; filtered[which.max(filtered$correlation), ] })",
"final_cor_results <- do.call('rbind', top_correlation)",
"print(combined_top_costs); print(final_cor_results)"
)
)
# Create a colorful table
code_snippets %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1:8, background = c("lightcyan", "lavender", "lightpink", "lightyellow", "lightgreen", "lightblue", "lightcoral", "lightgray")) %>%
column_spec(1, bold = TRUE, color = "darkblue") %>%
column_spec(2, background = "lightgray")
|
Step
|
Code
|
|
Load the Medicare dataset
|
medicare_data <-
read_excel(‘C:/Users/jacob/Downloads/Medicare_Dataset.xlsx’)
|
|
Split the data by Plan_Type
|
split_medicare <- split(medicare_data, medicare_data\(Plan_Type) </td>
</tr>
<tr>
<td style="text-align:left;background-color: lightcyan
!important;font-weight: bold;color: darkblue !important;"> Find
top 2 patients with highest treatment cost </td>
<td style="text-align:left;background-color: lightcyan
!important;background-color: lightgray !important;"> top_costs
<- lapply(split_medicare, function(df)
head(df[order(df\)Treatment_Cost, decreasing = TRUE), ], 2))
|
|
Combine the results into a single data frame
|
combined_top_costs <- do.call(rbind, top_costs)
|
|
Compute correlation matrix
|
cor_medicare <- lapply(split_medicare, function(df) cor(df[, c(‘Age’,
‘Treatment_Cost’, ‘Hospital_Visits’)]))
|
|
Retrieve the highest correlation per plan type
|
top_correlation <- lapply(cor_medicare, function(cormat) { melted
<- melt(cormat, variable_name = ‘correlation’); filtered <-
melted[melted\(X1 != melted\)X2, ];
filtered[which.max(filtered$correlation), ] })
|
|
Combine the top correlations
|
final_cor_results <- do.call(‘rbind’, top_correlation)
|
|
Print the results
|
print(combined_top_costs); print(final_cor_results)
|