Split Function

# The journey began with loading the dataset, a simple yet powerful step that marked the start of my exploration. As I navigated through the initial data, I couldn't help but think about the stories it held. The real excitement began when I used the split() function to divide the data by Plan_Type. This function became a tool of empowerment for me, allowing me to segment the dataset into manageable parts and opening the door to a deeper understanding.
# 
# Each split felt like discovering a new chapter in a book, with every plan type offering a unique perspective on the patients' experiences. I was fascinated by the patterns that began to emerge. I was particularly drawn to identifying the top two patients with the highest treatment costs for each plan type. It felt like piecing together a puzzle, with each patient’s data adding a vital piece to the overall picture.
# 
# The deeper I delved into the analysis, the more connected I felt to the dataset. The task of computing correlation matrices was both challenging and rewarding. I was eager to see the relationships between age, treatment costs, and hospital visits. As I calculated correlations for each plan type, I felt like a detective uncovering clues. It was exhilarating to see how these variables interacted, revealing insights about patient behavior and healthcare utilization.
# 
# The process wasn't without its challenges. There were moments of frustration when errors and warnings popped up, but each obstacle taught me valuable lessons. I learned to be patient, to approach problems methodically, and to appreciate the iterative nature of data analysis. Every challenge was an opportunity for growth, and each solution brought a sense of accomplishment.
# 
# As I moved toward the final stages of the project, I felt a growing sense of satisfaction. Combining the correlations and presenting the results was like bringing my journey to a fulfilling conclusion. The colorful table I created was more than just a visual representation—it was a testament to the hours of effort and dedication I had poured into this project.
# 
# Reflecting on this experience, I realize how much I’ve grown as a data analyst. This project was not just about applying technical skills; it was about engaging with data on a deeper level. I felt a strong connection to the process, from the initial exploration to the final presentation. It reminded me of why I love working with data—the thrill of discovery, the satisfaction of solving complex problems, and the joy of turning raw numbers into meaningful insights.

# Load necessary libraries
library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(reshape)

## 
## Attaching package: 'reshape'

## The following object is masked from 'package:dplyr':
## 
##     rename

# Load the Medicare dataset
medicare_data <- read_excel("C:/Users/jacob/Downloads/Medicare_Dataset.xlsx")

# Split the data by Plan_Type
split_medicare <- split(medicare_data, medicare_data$Plan_Type)

# Find top 2 patients with the highest treatment cost for each plan type
top_costs <- lapply(split_medicare, function(df) head(df[order(df$Treatment_Cost, decreasing = TRUE), ], 2))

# Combine the results into a single data frame
combined_top_costs <- do.call(rbind, top_costs)

# Compute correlation matrix for Age, Treatment_Cost, and Hospital_Visits per plan type
cor_medicare <- lapply(split_medicare, function(df) cor(df[, c("Age", "Treatment_Cost", "Hospital_Visits")]))

# Retrieve the highest correlation per plan type
top_correlation <- lapply(cor_medicare, function(cormat) {
  melted <- melt(cormat, variable_name = "correlation")
  filtered <- melted[melted$X1 != melted$X2, ]
  filtered[which.max(filtered$correlation), ]
})

## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE
## Warning in type.convert.default(X[[i]], ...): 'as.is' should be specified by
## the caller; using TRUE

# Combine the top correlations
final_cor_results <- do.call("rbind", top_correlation)

# Print the results
print(combined_top_costs)

## # A tibble: 4 × 5
##   Patient_ID   Age Plan_Type Treatment_Cost Hospital_Visits
## *      <dbl> <dbl> <chr>              <dbl>           <dbl>
## 1         11    34 A                   9618               5
## 2         17    55 A                   9545               5
## 3          6    46 B                   9136               8
## 4         14    58 B                   6305               9

print(final_cor_results)

## [1] X1    X2    value
## <0 rows> (or 0-length row.names)

# Load necessary library
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Define the code in a structured format
code_snippets <- data.frame(
  Step = c(
    "Load the Medicare dataset",
    "Split the data by Plan_Type",
    "Find top 2 patients with highest treatment cost",
    "Combine the results into a single data frame",
    "Compute correlation matrix",
    "Retrieve the highest correlation per plan type",
    "Combine the top correlations",
    "Print the results"
  ),
  Code = c(
    "medicare_data <- read_excel('C:/Users/jacob/Downloads/Medicare_Dataset.xlsx')",
    "split_medicare <- split(medicare_data, medicare_data$Plan_Type)",
    "top_costs <- lapply(split_medicare, function(df) head(df[order(df$Treatment_Cost, decreasing = TRUE), ], 2))",
    "combined_top_costs <- do.call(rbind, top_costs)",
    "cor_medicare <- lapply(split_medicare, function(df) cor(df[, c('Age', 'Treatment_Cost', 'Hospital_Visits')]))",
    "top_correlation <- lapply(cor_medicare, function(cormat) { melted <- melt(cormat, variable_name = 'correlation'); filtered <- melted[melted$X1 != melted$X2, ]; filtered[which.max(filtered$correlation), ] })",
    "final_cor_results <- do.call('rbind', top_correlation)",
    "print(combined_top_costs); print(final_cor_results)"
  )
)

# Create a colorful table
code_snippets %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(1:8, background = c("lightcyan", "lavender", "lightpink", "lightyellow", "lightgreen", "lightblue", "lightcoral", "lightgray")) %>%
  column_spec(1, bold = TRUE, color = "darkblue") %>%
  column_spec(2, background = "lightgray")

Step	Code
Load the Medicare dataset	medicare_data <- read_excel(‘C:/Users/jacob/Downloads/Medicare_Dataset.xlsx’)
Split the data by Plan_Type	split_medicare <- split(medicare_data, medicare_data$Plan_Type) </td> </tr> <tr> <td style="text-align:left;background-color: lightcyan !important;font-weight: bold;color: darkblue !important;"> Find top 2 patients with highest treatment cost </td> <td style="text-align:left;background-color: lightcyan !important;background-color: lightgray !important;"> top_costs <- lapply(split_medicare, function(df) head(df[order(df$Treatment_Cost, decreasing = TRUE), ], 2))
Combine the results into a single data frame	combined_top_costs <- do.call(rbind, top_costs)
Compute correlation matrix	cor_medicare <- lapply(split_medicare, function(df) cor(df[, c(‘Age’, ‘Treatment_Cost’, ‘Hospital_Visits’)]))
Retrieve the highest correlation per plan type	top_correlation <- lapply(cor_medicare, function(cormat) { melted <- melt(cormat, variable_name = ‘correlation’); filtered <- melted[melted$X1 != melted$X2, ]; filtered[which.max(filtered$correlation), ] })
Combine the top correlations	final_cor_results <- do.call(‘rbind’, top_correlation)
Print the results	print(combined_top_costs); print(final_cor_results)

Split Function

Avery Holloman

2024-11-03