CAPSTONE PROJECT
# Load libraries
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3
library(ggplot2)
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.3.3
library(caret)
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Set seed for reproducibility
set.seed(123)
# Load datasets
train <- read.csv("~/train.csv", header=FALSE)
View(train)
str(train)
## 'data.frame': 13174 obs. of 152 variables:
## $ V1 : chr "patient_id" "268700" "484983" "277055" ...
## $ V2 : chr "patient_race" "" "White" "" ...
## $ V3 : chr "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
## $ V4 : chr "patient_state" "AR" "IL" "CA" ...
## $ V5 : chr "patient_zip3" "724" "629" "925" ...
## $ V6 : chr "Region" "South" "Midwest" "West" ...
## $ V7 : chr "Division" "West South Central" "East North Central" "Pacific" ...
## $ V8 : chr "patient_age" "39" "55" "59" ...
## $ V9 : chr "patient_gender" "F" "F" "F" ...
## $ V10 : chr "bmi" "" "35.36" "" ...
## $ V11 : chr "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
## $ V12 : chr "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
## $ V13 : chr "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
## $ V14 : chr "metastatic_first_novel_treatment" "" "" "" ...
## $ V15 : chr "metastatic_first_novel_treatment_type" "" "" "" ...
## $ V16 : chr "population" "3924.87" "2745.39" "38343.18" ...
## $ V17 : chr "density" "82.63" "51.79" "700.34" ...
## $ V18 : chr "age_median" "42.58" "43.54" "36.28" ...
## $ V19 : chr "age_under_10" "11.61" "11.22" "13.27" ...
## $ V20 : chr "age_10_to_19" "13.03" "12.19" "15.66" ...
## $ V21 : chr "age_20s" "10.87" "11.45" "13.49" ...
## $ V22 : chr "age_30s" "11.80" "11.01" "13.45" ...
## $ V23 : chr "age_40s" "12.29" "11.35" "12.40" ...
## $ V24 : chr "age_50s" "13.22" "14.39" "11.58" ...
## $ V25 : chr "age_60s" "13.47" "14.15" "10.47" ...
## $ V26 : chr "age_70s" "10.07" "9.17" "6.38" ...
## $ V27 : chr "age_over_80" "3.64" "5.05" "3.28" ...
## $ V28 : chr "male" "51.43" "49.32" "49.99" ...
## $ V29 : chr "female" "48.57" "50.68" "50.01" ...
## $ V30 : chr "married" "51.05" "49.48" "48.81" ...
## $ V31 : chr "divorced" "16.72" "15.42" "11.90" ...
## $ V32 : chr "never_married" "23.57" "26.93" "34.35" ...
## $ V33 : chr "widowed" "8.66" "8.17" "4.95" ...
## $ V34 : chr "family_size" "3.01" "3.17" "3.80" ...
## $ V35 : chr "family_dual_income" "43.99" "41.41" "52.89" ...
## $ V36 : chr "income_household_median" "44483.35" "51796.79" "78696.87" ...
## $ V37 : chr "income_household_under_5" "2.21" "3.67" "2.59" ...
## $ V38 : chr "income_household_5_to_10" "3.97" "3.86" "1.81" ...
## $ V39 : chr "income_household_10_to_15" "8.52" "6.58" "3.16" ...
## $ V40 : chr "income_household_15_to_20" "7.08" "5.58" "3.71" ...
## $ V41 : chr "income_household_20_to_25" "7.67" "5.38" "3.23" ...
## $ V42 : chr "income_household_25_to_35" "13.82" "11.02" "7.40" ...
## $ V43 : chr "income_household_35_to_50" "15.14" "13.09" "10.42" ...
## $ V44 : chr "income_household_50_to_75" "17.51" "19.56" "16.83" ...
## $ V45 : chr "income_household_75_to_100" "11.26" "11.76" "13.45" ...
## $ V46 : chr "income_household_100_to_150" "8.90" "11.40" "19.21" ...
## $ V47 : chr "income_household_150_over" "3.93" "8.11" "18.23" ...
## $ V48 : chr "income_household_six_figure" "12.83" "19.51" "37.44" ...
## $ V49 : chr "income_individual_median" "24048.55" "28028.04" "32818.54" ...
## $ V50 : chr "home_ownership" "72.11" "76.71" "66.82" ...
## $ V51 : chr "housing_units" "1513.75" "1113.35" "10825.83" ...
## $ V52 : chr "home_value" "87384.33" "92026.84" "392600.40" ...
## $ V53 : chr "rent_median" "641.39" "638.60" "1631.64" ...
## $ V54 : chr "rent_burden" "27.52" "29.37" "35.56" ...
## $ V55 : chr "education_less_highschool" "16.55" "10.93" "16.25" ...
## $ V56 : chr "education_highschool" "41.83" "35.26" "27.55" ...
## $ V57 : chr "education_some_college" "28.31" "35.33" "33.88" ...
## $ V58 : chr "education_bachelors" "9.21" "12.46" "13.92" ...
## $ V59 : chr "education_graduate" "4.11" "6.04" "8.39" ...
## $ V60 : chr "education_college_or_above" "13.32" "18.49" "22.32" ...
## $ V61 : chr "education_stem_degree" "38.78" "36.35" "43.37" ...
## $ V62 : chr "labor_force_participation" "53.60" "52.51" "59.47" ...
## $ V63 : chr "unemployment_rate" "5.85" "7.45" "7.28" ...
## $ V64 : chr "self_employed" "11.82" "9.19" "13.21" ...
## $ V65 : chr "farmer" "5.31" "5.21" "0.44" ...
## $ V66 : chr "race_white" "92.95" "88.75" "53.95" ...
## $ V67 : chr "race_black" "1.73" "6.44" "6.41" ...
## $ V68 : chr "race_asian" "0.33" "0.53" "5.83" ...
## $ V69 : chr "race_native" "0.20" "0.19" "0.81" ...
## $ V70 : chr "race_pacific" "0.03" "0.05" "0.38" ...
## $ V71 : chr "race_other" "0.83" "0.61" "21.35" ...
## $ V72 : chr "race_multiple" "3.94" "3.42" "11.27" ...
## $ V73 : chr "hispanic" "3.03" "2.78" "46.88" ...
## $ V74 : chr "disabled" "22.24" "20.16" "12.83" ...
## $ V75 : chr "poverty" "19.27" "16.94" "12.72" ...
## $ V76 : chr "limited_english" "0.42" "0.43" "4.58" ...
## $ V77 : chr "commute_time" "25.35" "26.26" "37.07" ...
## $ V78 : chr "health_uninsured" "8.06" "6.93" "8.07" ...
## $ V79 : chr "veteran" "8.11" "9.71" "7.75" ...
## $ V80 : chr "Average of Jan-13" "38.55" "34.85" "53.14" ...
## $ V81 : chr "Average of Feb-13" "39.88" "36.15" "55.28" ...
## $ V82 : chr "Average of Mar-13" "42.75" "39.41" "64.75" ...
## $ V83 : chr "Average of Apr-13" "55.16" "54.63" "67.38" ...
## $ V84 : chr "Average of May-13" "65.17" "65.41" "73.31" ...
## $ V85 : chr "Average of Jun-13" "75.98" "73.89" "79.49" ...
## $ V86 : chr "Average of Jul-13" "76.75" "74.07" "84.01" ...
## $ V87 : chr "Average of Aug-13" "76.45" "74.37" "83.28" ...
## $ V88 : chr "Average of Sep-13" "73.67" "70.44" "79.88" ...
## $ V89 : chr "Average of Oct-13" "59.73" "57.37" "67.84" ...
## $ V90 : chr "Average of Nov-13" "45.18" "42.15" "61.92" ...
## $ V91 : chr "Average of Dec-13" "37.43" "33.16" "55.69" ...
## $ V92 : chr "Average of Jan-14" "31.67" "26.88" "60.56" ...
## $ V93 : chr "Average of Feb-14" "33.83" "28.36" "60.99" ...
## $ V94 : chr "Average of Mar-14" "42.35" "40.32" "65.16" ...
## $ V95 : chr "Average of Apr-14" "57.72" "56.85" "68.01" ...
## $ V96 : chr "Average of May-14" "67.35" "66.84" "74.24" ...
## $ V97 : chr "Average of Jun-14" "75.92" "75.12" "78.87" ...
## $ V98 : chr "Average of Jul-14" "74.28" "72.18" "84.65" ...
## $ V99 : chr "Average of Aug-14" "79.59" "77.08" "82.23" ...
## [list output truncated]
#Removing Columns not being used in the data set
train<- train[,-c(80:151)]
#The column names are incorrect so the following code is to rename each column properly
col_names <- c("patient_id",
"patient_race",
"payer_type",
"patient_state",
"patient_zip3",
"Region",
"Division",
"patient_age",
"patient_gender",
"bmi",
"breast_cancer_diagnosis_code",
"breast_cancer_diagnosis_desc",
"metastatic_cancer_diagnosis_code",
"metastatic_first_novel_treatment",
"metastatic_first_novel_treatment_type",
"population",
"density",
"age_median",
"age_under_10",
"age_10_to_19",
"age_20s",
"age_30s",
"age_40s",
"age_50s",
"age_60s",
"age_70s",
"age_over_80",
"male",
"female",
"married",
"divorced",
"never_married",
"widowed",
"family_size",
"family_dual_income",
"income_household_median",
"income_household_under_5",
"income_household_5_to_10",
"income_household_10_to_15",
"income_household_15_to_20",
"income_household_20_to_25",
"income_household_25_to_35",
"income_household_35_to_50",
"income_household_50_to_75",
"income_household_75_to_100",
"income_household_100_to_150",
"income_household_150_over",
"income_household_six_figure",
"income_individual_median",
"home_ownership","housing_units",
"home_value",
"rent_median",
"rent_burden",
"education_less_highschool",
"education_highschool",
"education_some_college",
"education_bachelors",
"education_graduate",
"education_college_or_above",
"education_stem_degree",
"labor_force_participation",
"unemployment_rate",
"self_employed",
"farmer",
"race_white",
"race_black",
"race_asian",
"race_native",
"race_pacific",
"race_other",
"race_multiple",
"hispanic",
"disabled",
"poverty",
"limited_english",
"commute_time",
"health_uninsured",
"veteran",
"metastatic_diagnosis_period")
colnames(train) <- col_names
# Plot target distribution
# Check if the variable is continuous or categorical
if (is.numeric(train$metastatic_diagnosis_period) || is.integer(train$metastatic_diagnosis_period)) {
# For continuous variable, use a histogram
ggplot(train, aes(x = metastatic_diagnosis_period)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
ggtitle("Target Distribution") +
theme_minimal()
} else {
# If the variable is not numeric, convert it to numeric if possible
if (is.character(train$metastatic_diagnosis_period) || is.factor(train$metastatic_diagnosis_period)) {
train$metastatic_diagnosis_period <- as.numeric(as.character(train$metastatic_diagnosis_period))
}
# If conversion to numeric worked, use a histogram
if (!any(is.na(train$metastatic_diagnosis_period))) {
ggplot(train, aes(x = metastatic_diagnosis_period)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
ggtitle("Target Distribution") +
theme_minimal()
} else {
# If the variable remains categorical, use a bar plot
ggplot(train, aes(x = as.factor(metastatic_diagnosis_period))) +
geom_bar(fill = "blue", alpha = 0.7) +
ggtitle("Target Distribution") +
theme_minimal()
}
}
## Warning: NAs introduced by coercion
# Define target
y <- train$metastatic_diagnosis_period
# Combine train and test for preprocessing
train1 <- train %>%
select(-patient_id, -metastatic_diagnosis_period)
# Convert character columns to factors and handle missing values
train1 <- train1 %>%
mutate(across(where(is.character), ~ as.integer(as.factor(ifelse(is.na(.), "Missing", .)))))
# Split back into train and test
train_data <- train1[1:nrow(train), ]
test_data <- train1[(nrow(train) + 1):nrow(train1), ]
# Define features
features <- c("breast_cancer_diagnosis_code", "patient_age", "metastatic_cancer_diagnosis_code",
"patient_race", "payer_type", "breast_cancer_diagnosis_desc", "bmi",
"Division", "patient_state", "clust", "metastatic_first_novel_treatment_type")
# Convert to data.table if it's not already
train_data <- as.data.table(train_data)
test_data <- as.data.table(test_data)
train_data$clust <- ifelse(nchar(as.character(train_data$metastatic_cancer_diagnosis_code)) == 4, 1, 0)
test_data$clust <- ifelse(nchar(as.character(test_data$metastatic_cancer_diagnosis_code)) == 4, 1, 0)
sapply(train_data[, features, with = FALSE], class)
## breast_cancer_diagnosis_code patient_age
## "integer" "integer"
## metastatic_cancer_diagnosis_code patient_race
## "integer" "integer"
## payer_type breast_cancer_diagnosis_desc
## "integer" "integer"
## bmi Division
## "integer" "integer"
## patient_state clust
## "integer" "numeric"
## metastatic_first_novel_treatment_type
## "integer"
train_data[, features] <- lapply(train_data[, features, with = FALSE], function(col) {
if (is.character(col) || is.factor(col)) {
as.numeric(as.factor(col))
} else {
col
}
})
test_data[, features] <- lapply(test_data[, features, with = FALSE], function(col) {
if (is.character(col) || is.factor(col)) {
as.numeric(as.factor(col))
} else {
col
}
})
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 3.00 44.00 96.52 181.00 365.00 1
any(is.na(y)) # Check for NaN
## [1] TRUE
any(is.infinite(y)) # Check for Inf
## [1] FALSE
any(abs(y) > 1e6) # Check for excessively large values
## [1] NA
y[is.na(y) | is.infinite(y)] <- 0 # Replace with 0 or another appropriate value
# Convert to matrix format for xgboost
dtrain <- xgb.DMatrix(data = as.matrix(train_data[, features, with = FALSE]), label = y)
dtest <- xgb.DMatrix(data = as.matrix(test_data[, features, with = FALSE]))
# Train the model using Stratified K-Fold Cross-Validation
kf <- createFolds(y, k = 5, list = TRUE)
errors <- c()
final_prediction <- matrix(0, nrow = nrow(test_data), ncol = 5)
for (i in 1:5) {
train_index <- kf[[i]]
val_index <- setdiff(1:length(y), train_index)
dtrain_fold <- xgb.DMatrix(data = as.matrix(train_data[train_index, features, with = FALSE]), label = y[train_index])
dval_fold <- xgb.DMatrix(data = as.matrix(train_data[val_index, features, with = FALSE]), label = y[val_index])
params <- list(
objective = "reg:squarederror",
eval_metric = "rmse",
max_depth = 6,
eta = 0.1,
subsample = 0.8,
colsample_bytree = 0.8
)
model <- xgb.train(params = params,
data = dtrain_fold,
nrounds = 500,
watchlist = list(train = dtrain_fold, val = dval_fold),
early_stopping_rounds = 50,
verbose = 0)
preds <- predict(model, dval_fold)
test_preds <- predict(model, dtest)
final_prediction[, i] <- test_preds
rmse <- sqrt(mean((preds - y[val_index])^2))
print(paste("RMSE:", rmse))
errors <- c(errors, rmse)
}
## [1] "RMSE: 84.4341690111795"
## [1] "RMSE: 83.6512041261253"
## [1] "RMSE: 83.1329975796444"
## [1] "RMSE: 83.6756786598318"
## [1] "RMSE: 84.2130460574534"
# Average RMSE
mean(errors)
## [1] 83.82142
# Average predictions across folds
prediction <- rowMeans(final_prediction)
print(prediction)
## [1] 116.5447 43.9148
print(final_prediction)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 133.7839 109.45089 97.78553 144.7750 96.92808
## [2,] 39.9494 38.07242 52.91059 47.2085 41.43311