CAPSTONE PROJECT

# Load libraries
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3
library(ggplot2)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
library(caret)
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
## 
##     slice
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Set seed for reproducibility
set.seed(123)

# Load datasets
train <- read.csv("~/train.csv", header=FALSE)
View(train)
str(train)
## 'data.frame':    13174 obs. of  152 variables:
##  $ V1  : chr  "patient_id" "268700" "484983" "277055" ...
##  $ V2  : chr  "patient_race" "" "White" "" ...
##  $ V3  : chr  "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
##  $ V4  : chr  "patient_state" "AR" "IL" "CA" ...
##  $ V5  : chr  "patient_zip3" "724" "629" "925" ...
##  $ V6  : chr  "Region" "South" "Midwest" "West" ...
##  $ V7  : chr  "Division" "West South Central" "East North Central" "Pacific" ...
##  $ V8  : chr  "patient_age" "39" "55" "59" ...
##  $ V9  : chr  "patient_gender" "F" "F" "F" ...
##  $ V10 : chr  "bmi" "" "35.36" "" ...
##  $ V11 : chr  "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
##  $ V12 : chr  "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
##  $ V13 : chr  "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
##  $ V14 : chr  "metastatic_first_novel_treatment" "" "" "" ...
##  $ V15 : chr  "metastatic_first_novel_treatment_type" "" "" "" ...
##  $ V16 : chr  "population" "3924.87" "2745.39" "38343.18" ...
##  $ V17 : chr  "density" "82.63" "51.79" "700.34" ...
##  $ V18 : chr  "age_median" "42.58" "43.54" "36.28" ...
##  $ V19 : chr  "age_under_10" "11.61" "11.22" "13.27" ...
##  $ V20 : chr  "age_10_to_19" "13.03" "12.19" "15.66" ...
##  $ V21 : chr  "age_20s" "10.87" "11.45" "13.49" ...
##  $ V22 : chr  "age_30s" "11.80" "11.01" "13.45" ...
##  $ V23 : chr  "age_40s" "12.29" "11.35" "12.40" ...
##  $ V24 : chr  "age_50s" "13.22" "14.39" "11.58" ...
##  $ V25 : chr  "age_60s" "13.47" "14.15" "10.47" ...
##  $ V26 : chr  "age_70s" "10.07" "9.17" "6.38" ...
##  $ V27 : chr  "age_over_80" "3.64" "5.05" "3.28" ...
##  $ V28 : chr  "male" "51.43" "49.32" "49.99" ...
##  $ V29 : chr  "female" "48.57" "50.68" "50.01" ...
##  $ V30 : chr  "married" "51.05" "49.48" "48.81" ...
##  $ V31 : chr  "divorced" "16.72" "15.42" "11.90" ...
##  $ V32 : chr  "never_married" "23.57" "26.93" "34.35" ...
##  $ V33 : chr  "widowed" "8.66" "8.17" "4.95" ...
##  $ V34 : chr  "family_size" "3.01" "3.17" "3.80" ...
##  $ V35 : chr  "family_dual_income" "43.99" "41.41" "52.89" ...
##  $ V36 : chr  "income_household_median" "44483.35" "51796.79" "78696.87" ...
##  $ V37 : chr  "income_household_under_5" "2.21" "3.67" "2.59" ...
##  $ V38 : chr  "income_household_5_to_10" "3.97" "3.86" "1.81" ...
##  $ V39 : chr  "income_household_10_to_15" "8.52" "6.58" "3.16" ...
##  $ V40 : chr  "income_household_15_to_20" "7.08" "5.58" "3.71" ...
##  $ V41 : chr  "income_household_20_to_25" "7.67" "5.38" "3.23" ...
##  $ V42 : chr  "income_household_25_to_35" "13.82" "11.02" "7.40" ...
##  $ V43 : chr  "income_household_35_to_50" "15.14" "13.09" "10.42" ...
##  $ V44 : chr  "income_household_50_to_75" "17.51" "19.56" "16.83" ...
##  $ V45 : chr  "income_household_75_to_100" "11.26" "11.76" "13.45" ...
##  $ V46 : chr  "income_household_100_to_150" "8.90" "11.40" "19.21" ...
##  $ V47 : chr  "income_household_150_over" "3.93" "8.11" "18.23" ...
##  $ V48 : chr  "income_household_six_figure" "12.83" "19.51" "37.44" ...
##  $ V49 : chr  "income_individual_median" "24048.55" "28028.04" "32818.54" ...
##  $ V50 : chr  "home_ownership" "72.11" "76.71" "66.82" ...
##  $ V51 : chr  "housing_units" "1513.75" "1113.35" "10825.83" ...
##  $ V52 : chr  "home_value" "87384.33" "92026.84" "392600.40" ...
##  $ V53 : chr  "rent_median" "641.39" "638.60" "1631.64" ...
##  $ V54 : chr  "rent_burden" "27.52" "29.37" "35.56" ...
##  $ V55 : chr  "education_less_highschool" "16.55" "10.93" "16.25" ...
##  $ V56 : chr  "education_highschool" "41.83" "35.26" "27.55" ...
##  $ V57 : chr  "education_some_college" "28.31" "35.33" "33.88" ...
##  $ V58 : chr  "education_bachelors" "9.21" "12.46" "13.92" ...
##  $ V59 : chr  "education_graduate" "4.11" "6.04" "8.39" ...
##  $ V60 : chr  "education_college_or_above" "13.32" "18.49" "22.32" ...
##  $ V61 : chr  "education_stem_degree" "38.78" "36.35" "43.37" ...
##  $ V62 : chr  "labor_force_participation" "53.60" "52.51" "59.47" ...
##  $ V63 : chr  "unemployment_rate" "5.85" "7.45" "7.28" ...
##  $ V64 : chr  "self_employed" "11.82" "9.19" "13.21" ...
##  $ V65 : chr  "farmer" "5.31" "5.21" "0.44" ...
##  $ V66 : chr  "race_white" "92.95" "88.75" "53.95" ...
##  $ V67 : chr  "race_black" "1.73" "6.44" "6.41" ...
##  $ V68 : chr  "race_asian" "0.33" "0.53" "5.83" ...
##  $ V69 : chr  "race_native" "0.20" "0.19" "0.81" ...
##  $ V70 : chr  "race_pacific" "0.03" "0.05" "0.38" ...
##  $ V71 : chr  "race_other" "0.83" "0.61" "21.35" ...
##  $ V72 : chr  "race_multiple" "3.94" "3.42" "11.27" ...
##  $ V73 : chr  "hispanic" "3.03" "2.78" "46.88" ...
##  $ V74 : chr  "disabled" "22.24" "20.16" "12.83" ...
##  $ V75 : chr  "poverty" "19.27" "16.94" "12.72" ...
##  $ V76 : chr  "limited_english" "0.42" "0.43" "4.58" ...
##  $ V77 : chr  "commute_time" "25.35" "26.26" "37.07" ...
##  $ V78 : chr  "health_uninsured" "8.06" "6.93" "8.07" ...
##  $ V79 : chr  "veteran" "8.11" "9.71" "7.75" ...
##  $ V80 : chr  "Average of Jan-13" "38.55" "34.85" "53.14" ...
##  $ V81 : chr  "Average of Feb-13" "39.88" "36.15" "55.28" ...
##  $ V82 : chr  "Average of Mar-13" "42.75" "39.41" "64.75" ...
##  $ V83 : chr  "Average of Apr-13" "55.16" "54.63" "67.38" ...
##  $ V84 : chr  "Average of May-13" "65.17" "65.41" "73.31" ...
##  $ V85 : chr  "Average of Jun-13" "75.98" "73.89" "79.49" ...
##  $ V86 : chr  "Average of Jul-13" "76.75" "74.07" "84.01" ...
##  $ V87 : chr  "Average of Aug-13" "76.45" "74.37" "83.28" ...
##  $ V88 : chr  "Average of Sep-13" "73.67" "70.44" "79.88" ...
##  $ V89 : chr  "Average of Oct-13" "59.73" "57.37" "67.84" ...
##  $ V90 : chr  "Average of Nov-13" "45.18" "42.15" "61.92" ...
##  $ V91 : chr  "Average of Dec-13" "37.43" "33.16" "55.69" ...
##  $ V92 : chr  "Average of Jan-14" "31.67" "26.88" "60.56" ...
##  $ V93 : chr  "Average of Feb-14" "33.83" "28.36" "60.99" ...
##  $ V94 : chr  "Average of Mar-14" "42.35" "40.32" "65.16" ...
##  $ V95 : chr  "Average of Apr-14" "57.72" "56.85" "68.01" ...
##  $ V96 : chr  "Average of May-14" "67.35" "66.84" "74.24" ...
##  $ V97 : chr  "Average of Jun-14" "75.92" "75.12" "78.87" ...
##  $ V98 : chr  "Average of Jul-14" "74.28" "72.18" "84.65" ...
##  $ V99 : chr  "Average of Aug-14" "79.59" "77.08" "82.23" ...
##   [list output truncated]
#Removing Columns not being used in the data set
train<- train[,-c(80:151)]
#The column names are incorrect so the following code is to rename each column properly

col_names <- c("patient_id",
               "patient_race",
               "payer_type",
               "patient_state",
               "patient_zip3",
               "Region",
               "Division",
               "patient_age",
               "patient_gender",
               "bmi",
               "breast_cancer_diagnosis_code",
               "breast_cancer_diagnosis_desc",
               "metastatic_cancer_diagnosis_code",
               "metastatic_first_novel_treatment",
               "metastatic_first_novel_treatment_type",
               "population",
               "density",
               "age_median",
               "age_under_10",
               "age_10_to_19",
               "age_20s",
               "age_30s",
               "age_40s",
               "age_50s",
               "age_60s",
               "age_70s",
               "age_over_80",
               "male",
               "female",
               "married",
               "divorced",
               "never_married",
               "widowed",
               "family_size",
               "family_dual_income",
               "income_household_median",
               "income_household_under_5",
               "income_household_5_to_10",
               "income_household_10_to_15",
               "income_household_15_to_20",
               "income_household_20_to_25",
               "income_household_25_to_35",
               "income_household_35_to_50",
               "income_household_50_to_75",
               "income_household_75_to_100",
               "income_household_100_to_150",
               "income_household_150_over",
               "income_household_six_figure",
               "income_individual_median",
               "home_ownership","housing_units",
               "home_value",
               "rent_median",
               "rent_burden",
               "education_less_highschool",
               "education_highschool",
               "education_some_college",
               "education_bachelors",
               "education_graduate",
               "education_college_or_above",
               "education_stem_degree",
               "labor_force_participation",
               "unemployment_rate",
               "self_employed",
               "farmer",
               "race_white",
               "race_black",
               "race_asian",
               "race_native",
               "race_pacific",
               "race_other",
               "race_multiple",
               "hispanic",
               "disabled",
               "poverty",
               "limited_english",
               "commute_time",
               "health_uninsured",
               "veteran",
               "metastatic_diagnosis_period")
colnames(train) <- col_names

# Plot target distribution

# Check if the variable is continuous or categorical
if (is.numeric(train$metastatic_diagnosis_period) || is.integer(train$metastatic_diagnosis_period)) {
  # For continuous variable, use a histogram
  ggplot(train, aes(x = metastatic_diagnosis_period)) +
    geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
    ggtitle("Target Distribution") +
    theme_minimal()
} else {
  # If the variable is not numeric, convert it to numeric if possible
  if (is.character(train$metastatic_diagnosis_period) || is.factor(train$metastatic_diagnosis_period)) {
    train$metastatic_diagnosis_period <- as.numeric(as.character(train$metastatic_diagnosis_period))
  }
  
  # If conversion to numeric worked, use a histogram
  if (!any(is.na(train$metastatic_diagnosis_period))) {
    ggplot(train, aes(x = metastatic_diagnosis_period)) +
      geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
      ggtitle("Target Distribution") +
      theme_minimal()
  } else {
    # If the variable remains categorical, use a bar plot
    ggplot(train, aes(x = as.factor(metastatic_diagnosis_period))) +
      geom_bar(fill = "blue", alpha = 0.7) +
      ggtitle("Target Distribution") +
      theme_minimal()
  }
}
## Warning: NAs introduced by coercion

# Define target
y <- train$metastatic_diagnosis_period

# Combine train and test for preprocessing

train1 <- train %>%
  select(-patient_id, -metastatic_diagnosis_period)



# Convert character columns to factors and handle missing values
train1 <- train1 %>%
  mutate(across(where(is.character), ~ as.integer(as.factor(ifelse(is.na(.), "Missing", .)))))

# Split back into train and test
train_data <- train1[1:nrow(train), ]
test_data <- train1[(nrow(train) + 1):nrow(train1), ]

# Define features
features <- c("breast_cancer_diagnosis_code", "patient_age", "metastatic_cancer_diagnosis_code", 
              "patient_race", "payer_type", "breast_cancer_diagnosis_desc", "bmi", 
              "Division", "patient_state", "clust", "metastatic_first_novel_treatment_type")

# Convert to data.table if it's not already
train_data <- as.data.table(train_data)
test_data <- as.data.table(test_data)


train_data$clust <- ifelse(nchar(as.character(train_data$metastatic_cancer_diagnosis_code)) == 4, 1, 0)
test_data$clust <- ifelse(nchar(as.character(test_data$metastatic_cancer_diagnosis_code)) == 4, 1, 0)

sapply(train_data[, features, with = FALSE], class)
##          breast_cancer_diagnosis_code                           patient_age 
##                             "integer"                             "integer" 
##      metastatic_cancer_diagnosis_code                          patient_race 
##                             "integer"                             "integer" 
##                            payer_type          breast_cancer_diagnosis_desc 
##                             "integer"                             "integer" 
##                                   bmi                              Division 
##                             "integer"                             "integer" 
##                         patient_state                                 clust 
##                             "integer"                             "numeric" 
## metastatic_first_novel_treatment_type 
##                             "integer"
train_data[, features] <- lapply(train_data[, features, with = FALSE], function(col) {
  if (is.character(col) || is.factor(col)) {
    as.numeric(as.factor(col))
  } else {
    col
  }
})

test_data[, features] <- lapply(test_data[, features, with = FALSE], function(col) {
  if (is.character(col) || is.factor(col)) {
    as.numeric(as.factor(col))
  } else {
    col
  }
})
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    3.00   44.00   96.52  181.00  365.00       1
any(is.na(y))  # Check for NaN
## [1] TRUE
any(is.infinite(y))  # Check for Inf
## [1] FALSE
any(abs(y) > 1e6)  # Check for excessively large values
## [1] NA
y[is.na(y) | is.infinite(y)] <- 0  # Replace with 0 or another appropriate value

# Convert to matrix format for xgboost
dtrain <- xgb.DMatrix(data = as.matrix(train_data[, features, with = FALSE]), label = y)
dtest <- xgb.DMatrix(data = as.matrix(test_data[, features, with = FALSE]))

# Train the model using Stratified K-Fold Cross-Validation
kf <- createFolds(y, k = 5, list = TRUE)
errors <- c()
final_prediction <- matrix(0, nrow = nrow(test_data), ncol = 5)

for (i in 1:5) {
  train_index <- kf[[i]]
  val_index <- setdiff(1:length(y), train_index)
  
  dtrain_fold <- xgb.DMatrix(data = as.matrix(train_data[train_index, features, with = FALSE]), label = y[train_index])
  dval_fold <- xgb.DMatrix(data = as.matrix(train_data[val_index, features, with = FALSE]), label = y[val_index])
  
  params <- list(
    objective = "reg:squarederror",
    eval_metric = "rmse",
    max_depth = 6,
    eta = 0.1,
    subsample = 0.8,
    colsample_bytree = 0.8
  )
  
  model <- xgb.train(params = params, 
                     data = dtrain_fold, 
                     nrounds = 500, 
                     watchlist = list(train = dtrain_fold, val = dval_fold),
                     early_stopping_rounds = 50,
                     verbose = 0)
  
  preds <- predict(model, dval_fold)
  test_preds <- predict(model, dtest)
  
  final_prediction[, i] <- test_preds
  rmse <- sqrt(mean((preds - y[val_index])^2))
  print(paste("RMSE:", rmse))
  errors <- c(errors, rmse)
}
## [1] "RMSE: 84.4341690111795"
## [1] "RMSE: 83.6512041261253"
## [1] "RMSE: 83.1329975796444"
## [1] "RMSE: 83.6756786598318"
## [1] "RMSE: 84.2130460574534"
# Average RMSE
mean(errors)
## [1] 83.82142
# Average predictions across folds
prediction <- rowMeans(final_prediction)

print(prediction)
## [1] 116.5447  43.9148
print(final_prediction)
##          [,1]      [,2]     [,3]     [,4]     [,5]
## [1,] 133.7839 109.45089 97.78553 144.7750 96.92808
## [2,]  39.9494  38.07242 52.91059  47.2085 41.43311