We are developing models that predict disease progression in patients using electronic health record data. The project focuses on creating impactful healthcare solutions by analyzing patient trends and identifying potential high-risk cases.

First, we load the R packages needed to visualize and explore the data.

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(readxl)
# Read the raw training data. With header = FALSE the file's header line is
# read as the first data row, so every column comes in as character; the
# code further down renames the columns and drops that row.
# NOTE(review): header = TRUE with check.names = FALSE would avoid both
# workarounds, but the later steps would have to change with it.
train <- read.csv("~/train.csv", header = FALSE)

# View() opens a data viewer window, so only call it when someone is
# actually sitting at an interactive session.
if (interactive()) {
  View(train)
}
str(train)
## 'data.frame':    13174 obs. of  152 variables:
##  $ V1  : chr  "patient_id" "268700" "484983" "277055" ...
##  $ V2  : chr  "patient_race" "" "White" "" ...
##  $ V3  : chr  "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
##  $ V4  : chr  "patient_state" "AR" "IL" "CA" ...
##  $ V5  : chr  "patient_zip3" "724" "629" "925" ...
##  $ V6  : chr  "Region" "South" "Midwest" "West" ...
##  $ V7  : chr  "Division" "West South Central" "East North Central" "Pacific" ...
##  $ V8  : chr  "patient_age" "39" "55" "59" ...
##  $ V9  : chr  "patient_gender" "F" "F" "F" ...
##  $ V10 : chr  "bmi" "" "35.36" "" ...
##  $ V11 : chr  "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
##  $ V12 : chr  "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
##  $ V13 : chr  "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
##  $ V14 : chr  "metastatic_first_novel_treatment" "" "" "" ...
##  $ V15 : chr  "metastatic_first_novel_treatment_type" "" "" "" ...
##  $ V16 : chr  "population" "3924.87" "2745.39" "38343.18" ...
##  $ V17 : chr  "density" "82.63" "51.79" "700.34" ...
##  $ V18 : chr  "age_median" "42.58" "43.54" "36.28" ...
##  $ V19 : chr  "age_under_10" "11.61" "11.22" "13.27" ...
##  $ V20 : chr  "age_10_to_19" "13.03" "12.19" "15.66" ...
##  $ V21 : chr  "age_20s" "10.87" "11.45" "13.49" ...
##  $ V22 : chr  "age_30s" "11.80" "11.01" "13.45" ...
##  $ V23 : chr  "age_40s" "12.29" "11.35" "12.40" ...
##  $ V24 : chr  "age_50s" "13.22" "14.39" "11.58" ...
##  $ V25 : chr  "age_60s" "13.47" "14.15" "10.47" ...
##  $ V26 : chr  "age_70s" "10.07" "9.17" "6.38" ...
##  $ V27 : chr  "age_over_80" "3.64" "5.05" "3.28" ...
##  $ V28 : chr  "male" "51.43" "49.32" "49.99" ...
##  $ V29 : chr  "female" "48.57" "50.68" "50.01" ...
##  $ V30 : chr  "married" "51.05" "49.48" "48.81" ...
##  $ V31 : chr  "divorced" "16.72" "15.42" "11.90" ...
##  $ V32 : chr  "never_married" "23.57" "26.93" "34.35" ...
##  $ V33 : chr  "widowed" "8.66" "8.17" "4.95" ...
##  $ V34 : chr  "family_size" "3.01" "3.17" "3.80" ...
##  $ V35 : chr  "family_dual_income" "43.99" "41.41" "52.89" ...
##  $ V36 : chr  "income_household_median" "44483.35" "51796.79" "78696.87" ...
##  $ V37 : chr  "income_household_under_5" "2.21" "3.67" "2.59" ...
##  $ V38 : chr  "income_household_5_to_10" "3.97" "3.86" "1.81" ...
##  $ V39 : chr  "income_household_10_to_15" "8.52" "6.58" "3.16" ...
##  $ V40 : chr  "income_household_15_to_20" "7.08" "5.58" "3.71" ...
##  $ V41 : chr  "income_household_20_to_25" "7.67" "5.38" "3.23" ...
##  $ V42 : chr  "income_household_25_to_35" "13.82" "11.02" "7.40" ...
##  $ V43 : chr  "income_household_35_to_50" "15.14" "13.09" "10.42" ...
##  $ V44 : chr  "income_household_50_to_75" "17.51" "19.56" "16.83" ...
##  $ V45 : chr  "income_household_75_to_100" "11.26" "11.76" "13.45" ...
##  $ V46 : chr  "income_household_100_to_150" "8.90" "11.40" "19.21" ...
##  $ V47 : chr  "income_household_150_over" "3.93" "8.11" "18.23" ...
##  $ V48 : chr  "income_household_six_figure" "12.83" "19.51" "37.44" ...
##  $ V49 : chr  "income_individual_median" "24048.55" "28028.04" "32818.54" ...
##  $ V50 : chr  "home_ownership" "72.11" "76.71" "66.82" ...
##  $ V51 : chr  "housing_units" "1513.75" "1113.35" "10825.83" ...
##  $ V52 : chr  "home_value" "87384.33" "92026.84" "392600.40" ...
##  $ V53 : chr  "rent_median" "641.39" "638.60" "1631.64" ...
##  $ V54 : chr  "rent_burden" "27.52" "29.37" "35.56" ...
##  $ V55 : chr  "education_less_highschool" "16.55" "10.93" "16.25" ...
##  $ V56 : chr  "education_highschool" "41.83" "35.26" "27.55" ...
##  $ V57 : chr  "education_some_college" "28.31" "35.33" "33.88" ...
##  $ V58 : chr  "education_bachelors" "9.21" "12.46" "13.92" ...
##  $ V59 : chr  "education_graduate" "4.11" "6.04" "8.39" ...
##  $ V60 : chr  "education_college_or_above" "13.32" "18.49" "22.32" ...
##  $ V61 : chr  "education_stem_degree" "38.78" "36.35" "43.37" ...
##  $ V62 : chr  "labor_force_participation" "53.60" "52.51" "59.47" ...
##  $ V63 : chr  "unemployment_rate" "5.85" "7.45" "7.28" ...
##  $ V64 : chr  "self_employed" "11.82" "9.19" "13.21" ...
##  $ V65 : chr  "farmer" "5.31" "5.21" "0.44" ...
##  $ V66 : chr  "race_white" "92.95" "88.75" "53.95" ...
##  $ V67 : chr  "race_black" "1.73" "6.44" "6.41" ...
##  $ V68 : chr  "race_asian" "0.33" "0.53" "5.83" ...
##  $ V69 : chr  "race_native" "0.20" "0.19" "0.81" ...
##  $ V70 : chr  "race_pacific" "0.03" "0.05" "0.38" ...
##  $ V71 : chr  "race_other" "0.83" "0.61" "21.35" ...
##  $ V72 : chr  "race_multiple" "3.94" "3.42" "11.27" ...
##  $ V73 : chr  "hispanic" "3.03" "2.78" "46.88" ...
##  $ V74 : chr  "disabled" "22.24" "20.16" "12.83" ...
##  $ V75 : chr  "poverty" "19.27" "16.94" "12.72" ...
##  $ V76 : chr  "limited_english" "0.42" "0.43" "4.58" ...
##  $ V77 : chr  "commute_time" "25.35" "26.26" "37.07" ...
##  $ V78 : chr  "health_uninsured" "8.06" "6.93" "8.07" ...
##  $ V79 : chr  "veteran" "8.11" "9.71" "7.75" ...
##  $ V80 : chr  "Average of Jan-13" "38.55" "34.85" "53.14" ...
##  $ V81 : chr  "Average of Feb-13" "39.88" "36.15" "55.28" ...
##  $ V82 : chr  "Average of Mar-13" "42.75" "39.41" "64.75" ...
##  $ V83 : chr  "Average of Apr-13" "55.16" "54.63" "67.38" ...
##  $ V84 : chr  "Average of May-13" "65.17" "65.41" "73.31" ...
##  $ V85 : chr  "Average of Jun-13" "75.98" "73.89" "79.49" ...
##  $ V86 : chr  "Average of Jul-13" "76.75" "74.07" "84.01" ...
##  $ V87 : chr  "Average of Aug-13" "76.45" "74.37" "83.28" ...
##  $ V88 : chr  "Average of Sep-13" "73.67" "70.44" "79.88" ...
##  $ V89 : chr  "Average of Oct-13" "59.73" "57.37" "67.84" ...
##  $ V90 : chr  "Average of Nov-13" "45.18" "42.15" "61.92" ...
##  $ V91 : chr  "Average of Dec-13" "37.43" "33.16" "55.69" ...
##  $ V92 : chr  "Average of Jan-14" "31.67" "26.88" "60.56" ...
##  $ V93 : chr  "Average of Feb-14" "33.83" "28.36" "60.99" ...
##  $ V94 : chr  "Average of Mar-14" "42.35" "40.32" "65.16" ...
##  $ V95 : chr  "Average of Apr-14" "57.72" "56.85" "68.01" ...
##  $ V96 : chr  "Average of May-14" "67.35" "66.84" "74.24" ...
##  $ V97 : chr  "Average of Jun-14" "75.92" "75.12" "78.87" ...
##  $ V98 : chr  "Average of Jul-14" "74.28" "72.18" "84.65" ...
##  $ V99 : chr  "Average of Aug-14" "79.59" "77.08" "82.23" ...
##   [list output truncated]
# The data was read with header = FALSE, so the columns are called V1, V2, ...
# and the real column names sit in the first row of `train`. Take the names
# from that row instead of retyping them: the hand-written list labelled the
# December 2013 column "Average of Nov-13" a second time and supplied only
# 99 names for the 152 columns.
col_names <- as.character(unlist(train[1, ], use.names = FALSE))
colnames(train) <- col_names

# Report how many columns were read.
num_columns <- ncol(train)
print(num_columns)
## [1] 152
# Inspect the renamed data. View() opens a data viewer window, so only call
# it in an interactive session; str() prints the structure either way.
if (interactive()) {
  View(train)
}
str(train)
## 'data.frame':    13174 obs. of  152 variables:
##  $ patient_id                           : chr  "patient_id" "268700" "484983" "277055" ...
##  $ patient_race                         : chr  "patient_race" "" "White" "" ...
##  $ payer_type                           : chr  "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
##  $ patient_state                        : chr  "patient_state" "AR" "IL" "CA" ...
##  $ patient_zip3                         : chr  "patient_zip3" "724" "629" "925" ...
##  $ Region                               : chr  "Region" "South" "Midwest" "West" ...
##  $ Division                             : chr  "Division" "West South Central" "East North Central" "Pacific" ...
##  $ patient_age                          : chr  "patient_age" "39" "55" "59" ...
##  $ patient_gender                       : chr  "patient_gender" "F" "F" "F" ...
##  $ bmi                                  : chr  "bmi" "" "35.36" "" ...
##  $ breast_cancer_diagnosis_code         : chr  "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
##  $ breast_cancer_diagnosis_desc         : chr  "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
##  $ metastatic_cancer_diagnosis_code     : chr  "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
##  $ metastatic_first_novel_treatment     : chr  "metastatic_first_novel_treatment" "" "" "" ...
##  $ metastatic_first_novel_treatment_type: chr  "metastatic_first_novel_treatment_type" "" "" "" ...
##  $ population                           : chr  "population" "3924.87" "2745.39" "38343.18" ...
##  $ density                              : chr  "density" "82.63" "51.79" "700.34" ...
##  $ age_median                           : chr  "age_median" "42.58" "43.54" "36.28" ...
##  $ age_under_10                         : chr  "age_under_10" "11.61" "11.22" "13.27" ...
##  $ age_10_to_19                         : chr  "age_10_to_19" "13.03" "12.19" "15.66" ...
##  $ age_20s                              : chr  "age_20s" "10.87" "11.45" "13.49" ...
##  $ age_30s                              : chr  "age_30s" "11.80" "11.01" "13.45" ...
##  $ age_40s                              : chr  "age_40s" "12.29" "11.35" "12.40" ...
##  $ age_50s                              : chr  "age_50s" "13.22" "14.39" "11.58" ...
##  $ age_60s                              : chr  "age_60s" "13.47" "14.15" "10.47" ...
##  $ age_70s                              : chr  "age_70s" "10.07" "9.17" "6.38" ...
##  $ age_over_80                          : chr  "age_over_80" "3.64" "5.05" "3.28" ...
##  $ male                                 : chr  "male" "51.43" "49.32" "49.99" ...
##  $ female                               : chr  "female" "48.57" "50.68" "50.01" ...
##  $ married                              : chr  "married" "51.05" "49.48" "48.81" ...
##  $ divorced                             : chr  "divorced" "16.72" "15.42" "11.90" ...
##  $ never_married                        : chr  "never_married" "23.57" "26.93" "34.35" ...
##  $ widowed                              : chr  "widowed" "8.66" "8.17" "4.95" ...
##  $ family_size                          : chr  "family_size" "3.01" "3.17" "3.80" ...
##  $ family_dual_income                   : chr  "family_dual_income" "43.99" "41.41" "52.89" ...
##  $ income_household_median              : chr  "income_household_median" "44483.35" "51796.79" "78696.87" ...
##  $ income_household_under_5             : chr  "income_household_under_5" "2.21" "3.67" "2.59" ...
##  $ income_household_5_to_10             : chr  "income_household_5_to_10" "3.97" "3.86" "1.81" ...
##  $ income_household_10_to_15            : chr  "income_household_10_to_15" "8.52" "6.58" "3.16" ...
##  $ income_household_15_to_20            : chr  "income_household_15_to_20" "7.08" "5.58" "3.71" ...
##  $ income_household_20_to_25            : chr  "income_household_20_to_25" "7.67" "5.38" "3.23" ...
##  $ income_household_25_to_35            : chr  "income_household_25_to_35" "13.82" "11.02" "7.40" ...
##  $ income_household_35_to_50            : chr  "income_household_35_to_50" "15.14" "13.09" "10.42" ...
##  $ income_household_50_to_75            : chr  "income_household_50_to_75" "17.51" "19.56" "16.83" ...
##  $ income_household_75_to_100           : chr  "income_household_75_to_100" "11.26" "11.76" "13.45" ...
##  $ income_household_100_to_150          : chr  "income_household_100_to_150" "8.90" "11.40" "19.21" ...
##  $ income_household_150_over            : chr  "income_household_150_over" "3.93" "8.11" "18.23" ...
##  $ income_household_six_figure          : chr  "income_household_six_figure" "12.83" "19.51" "37.44" ...
##  $ income_individual_median             : chr  "income_individual_median" "24048.55" "28028.04" "32818.54" ...
##  $ home_ownership                       : chr  "home_ownership" "72.11" "76.71" "66.82" ...
##  $ housing_units                        : chr  "housing_units" "1513.75" "1113.35" "10825.83" ...
##  $ home_value                           : chr  "home_value" "87384.33" "92026.84" "392600.40" ...
##  $ rent_median                          : chr  "rent_median" "641.39" "638.60" "1631.64" ...
##  $ rent_burden                          : chr  "rent_burden" "27.52" "29.37" "35.56" ...
##  $ education_less_highschool            : chr  "education_less_highschool" "16.55" "10.93" "16.25" ...
##  $ education_highschool                 : chr  "education_highschool" "41.83" "35.26" "27.55" ...
##  $ education_some_college               : chr  "education_some_college" "28.31" "35.33" "33.88" ...
##  $ education_bachelors                  : chr  "education_bachelors" "9.21" "12.46" "13.92" ...
##  $ education_graduate                   : chr  "education_graduate" "4.11" "6.04" "8.39" ...
##  $ education_college_or_above           : chr  "education_college_or_above" "13.32" "18.49" "22.32" ...
##  $ education_stem_degree                : chr  "education_stem_degree" "38.78" "36.35" "43.37" ...
##  $ labor_force_participation            : chr  "labor_force_participation" "53.60" "52.51" "59.47" ...
##  $ unemployment_rate                    : chr  "unemployment_rate" "5.85" "7.45" "7.28" ...
##  $ self_employed                        : chr  "self_employed" "11.82" "9.19" "13.21" ...
##  $ farmer                               : chr  "farmer" "5.31" "5.21" "0.44" ...
##  $ race_white                           : chr  "race_white" "92.95" "88.75" "53.95" ...
##  $ race_black                           : chr  "race_black" "1.73" "6.44" "6.41" ...
##  $ race_asian                           : chr  "race_asian" "0.33" "0.53" "5.83" ...
##  $ race_native                          : chr  "race_native" "0.20" "0.19" "0.81" ...
##  $ race_pacific                         : chr  "race_pacific" "0.03" "0.05" "0.38" ...
##  $ race_other                           : chr  "race_other" "0.83" "0.61" "21.35" ...
##  $ race_multiple                        : chr  "race_multiple" "3.94" "3.42" "11.27" ...
##  $ hispanic                             : chr  "hispanic" "3.03" "2.78" "46.88" ...
##  $ disabled                             : chr  "disabled" "22.24" "20.16" "12.83" ...
##  $ poverty                              : chr  "poverty" "19.27" "16.94" "12.72" ...
##  $ limited_english                      : chr  "limited_english" "0.42" "0.43" "4.58" ...
##  $ commute_time                         : chr  "commute_time" "25.35" "26.26" "37.07" ...
##  $ health_uninsured                     : chr  "health_uninsured" "8.06" "6.93" "8.07" ...
##  $ veteran                              : chr  "veteran" "8.11" "9.71" "7.75" ...
##  $ Average of Jan-13                    : chr  "Average of Jan-13" "38.55" "34.85" "53.14" ...
##  $ Average of Feb-13                    : chr  "Average of Feb-13" "39.88" "36.15" "55.28" ...
##  $ Average of Mar-13                    : chr  "Average of Mar-13" "42.75" "39.41" "64.75" ...
##  $ Average of Apr-13                    : chr  "Average of Apr-13" "55.16" "54.63" "67.38" ...
##  $ Average of May-13                    : chr  "Average of May-13" "65.17" "65.41" "73.31" ...
##  $ Average of Jun-13                    : chr  "Average of Jun-13" "75.98" "73.89" "79.49" ...
##  $ Average of Jul-13                    : chr  "Average of Jul-13" "76.75" "74.07" "84.01" ...
##  $ Average of Aug-13                    : chr  "Average of Aug-13" "76.45" "74.37" "83.28" ...
##  $ Average of Sep-13                    : chr  "Average of Sep-13" "73.67" "70.44" "79.88" ...
##  $ Average of Oct-13                    : chr  "Average of Oct-13" "59.73" "57.37" "67.84" ...
##  $ Average of Nov-13                    : chr  "Average of Nov-13" "45.18" "42.15" "61.92" ...
##  $ Average of Nov-13                    : chr  "Average of Dec-13" "37.43" "33.16" "55.69" ...
##  $ Average of Jan-14                    : chr  "Average of Jan-14" "31.67" "26.88" "60.56" ...
##  $ Average of Feb-14                    : chr  "Average of Feb-14" "33.83" "28.36" "60.99" ...
##  $ Average of Mar-14                    : chr  "Average of Mar-14" "42.35" "40.32" "65.16" ...
##  $ Average of Apr-14                    : chr  "Average of Apr-14" "57.72" "56.85" "68.01" ...
##  $ Average of May-14                    : chr  "Average of May-14" "67.35" "66.84" "74.24" ...
##  $ Average of Jun-14                    : chr  "Average of Jun-14" "75.92" "75.12" "78.87" ...
##  $ Average of Jul-14                    : chr  "Average of Jul-14" "74.28" "72.18" "84.65" ...
##  $ Average of Aug-14                    : chr  "Average of Aug-14" "79.59" "77.08" "82.23" ...
##   [list output truncated]
# The first row holds the header text that was read in as data. Keep every
# row after it, discard any row that contains an NA, and show the result.
train <- na.omit(train[seq_len(nrow(train))[-1], ])
str(train)
## 'data.frame':    12767 obs. of  152 variables:
##  $ patient_id                           : chr  "268700" "484983" "277055" "320055" ...
##  $ patient_race                         : chr  "" "White" "" "Hispanic" ...
##  $ payer_type                           : chr  "COMMERCIAL" "" "COMMERCIAL" "MEDICAID" ...
##  $ patient_state                        : chr  "AR" "IL" "CA" "CA" ...
##  $ patient_zip3                         : chr  "724" "629" "925" "900" ...
##  $ Region                               : chr  "South" "Midwest" "West" "West" ...
##  $ Division                             : chr  "West South Central" "East North Central" "Pacific" "Pacific" ...
##  $ patient_age                          : chr  "39" "55" "59" "59" ...
##  $ patient_gender                       : chr  "F" "F" "F" "F" ...
##  $ bmi                                  : chr  "" "35.36" "" "" ...
##  $ breast_cancer_diagnosis_code         : chr  "C50912" "C50412" "1749" "C50911" ...
##  $ breast_cancer_diagnosis_desc         : chr  "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" "Malignant neoplasm of unsp site of right female breast" ...
##  $ metastatic_cancer_diagnosis_code     : chr  "C773" "C773" "C773" "C773" ...
##  $ metastatic_first_novel_treatment     : chr  "" "" "" "" ...
##  $ metastatic_first_novel_treatment_type: chr  "" "" "" "" ...
##  $ population                           : chr  "3924.87" "2745.39" "38343.18" "36054.12" ...
##  $ density                              : chr  "82.63" "51.79" "700.34" "5294.33" ...
##  $ age_median                           : chr  "42.58" "43.54" "36.28" "36.65" ...
##  $ age_under_10                         : chr  "11.61" "11.22" "13.27" "9.76" ...
##  $ age_10_to_19                         : chr  "13.03" "12.19" "15.66" "11.27" ...
##  $ age_20s                              : chr  "10.87" "11.45" "13.49" "17.23" ...
##  $ age_30s                              : chr  "11.80" "11.01" "13.45" "17.44" ...
##  $ age_40s                              : chr  "12.29" "11.35" "12.40" "13.09" ...
##  $ age_50s                              : chr  "13.22" "14.39" "11.58" "12.30" ...
##  $ age_60s                              : chr  "13.47" "14.15" "10.47" "9.41" ...
##  $ age_70s                              : chr  "10.07" "9.17" "6.38" "5.67" ...
##  $ age_over_80                          : chr  "3.64" "5.05" "3.28" "3.82" ...
##  $ male                                 : chr  "51.43" "49.32" "49.99" "50.51" ...
##  $ female                               : chr  "48.57" "50.68" "50.01" "49.49" ...
##  $ married                              : chr  "51.05" "49.48" "48.81" "33.48" ...
##  $ divorced                             : chr  "16.72" "15.42" "11.90" "11.30" ...
##  $ never_married                        : chr  "23.57" "26.93" "34.35" "50.46" ...
##  $ widowed                              : chr  "8.66" "8.17" "4.95" "4.77" ...
##  $ family_size                          : chr  "3.01" "3.17" "3.80" "3.44" ...
##  $ family_dual_income                   : chr  "43.99" "41.41" "52.89" "55.53" ...
##  $ income_household_median              : chr  "44483.35" "51796.79" "78696.87" "69266.69" ...
##  $ income_household_under_5             : chr  "2.21" "3.67" "2.59" "6.32" ...
##  $ income_household_5_to_10             : chr  "3.97" "3.86" "1.81" "2.95" ...
##  $ income_household_10_to_15            : chr  "8.52" "6.58" "3.16" "6.81" ...
##  $ income_household_15_to_20            : chr  "7.08" "5.58" "3.71" "4.18" ...
##  $ income_household_20_to_25            : chr  "7.67" "5.38" "3.23" "4.13" ...
##  $ income_household_25_to_35            : chr  "13.82" "11.02" "7.40" "7.84" ...
##  $ income_household_35_to_50            : chr  "15.14" "13.09" "10.42" "10.16" ...
##  $ income_household_50_to_75            : chr  "17.51" "19.56" "16.83" "14.42" ...
##  $ income_household_75_to_100           : chr  "11.26" "11.76" "13.45" "10.48" ...
##  $ income_household_100_to_150          : chr  "8.90" "11.40" "19.21" "13.73" ...
##  $ income_household_150_over            : chr  "3.93" "8.11" "18.23" "18.96" ...
##  $ income_household_six_figure          : chr  "12.83" "19.51" "37.44" "32.69" ...
##  $ income_individual_median             : chr  "24048.55" "28028.04" "32818.54" "36053.40" ...
##  $ home_ownership                       : chr  "72.11" "76.71" "66.82" "31.50" ...
##  $ housing_units                        : chr  "1513.75" "1113.35" "10825.83" "12949.12" ...
##  $ home_value                           : chr  "87384.33" "92026.84" "392600.40" "873756.00" ...
##  $ rent_median                          : chr  "641.39" "638.60" "1631.64" "1651.15" ...
##  $ rent_burden                          : chr  "27.52" "29.37" "35.56" "37.37" ...
##  $ education_less_highschool            : chr  "16.55" "10.93" "16.25" "22.92" ...
##  $ education_highschool                 : chr  "41.83" "35.26" "27.55" "18.24" ...
##  $ education_some_college               : chr  "28.31" "35.33" "33.88" "21.27" ...
##  $ education_bachelors                  : chr  "9.21" "12.46" "13.92" "23.89" ...
##  $ education_graduate                   : chr  "4.11" "6.04" "8.39" "13.69" ...
##  $ education_college_or_above           : chr  "13.32" "18.49" "22.32" "37.58" ...
##  $ education_stem_degree                : chr  "38.78" "36.35" "43.37" "41.75" ...
##  $ labor_force_participation            : chr  "53.60" "52.51" "59.47" "64.39" ...
##  $ unemployment_rate                    : chr  "5.85" "7.45" "7.28" "8.68" ...
##  $ self_employed                        : chr  "11.82" "9.19" "13.21" "21.23" ...
##  $ farmer                               : chr  "5.31" "5.21" "0.44" "0.01" ...
##  $ race_white                           : chr  "92.95" "88.75" "53.95" "42.82" ...
##  $ race_black                           : chr  "1.73" "6.44" "6.41" "12.22" ...
##  $ race_asian                           : chr  "0.33" "0.53" "5.83" "12.70" ...
##  $ race_native                          : chr  "0.20" "0.19" "0.81" "1.12" ...
##  $ race_pacific                         : chr  "0.03" "0.05" "0.38" "0.15" ...
##  $ race_other                           : chr  "0.83" "0.61" "21.35" "22.14" ...
##  $ race_multiple                        : chr  "3.94" "3.42" "11.27" "8.85" ...
##  $ hispanic                             : chr  "3.03" "2.78" "46.88" "45.53" ...
##  $ disabled                             : chr  "22.24" "20.16" "12.83" "11.90" ...
##  $ poverty                              : chr  "19.27" "16.94" "12.72" "20.76" ...
##  $ limited_english                      : chr  "0.42" "0.43" "4.58" "14.74" ...
##  $ commute_time                         : chr  "25.35" "26.26" "37.07" "30.71" ...
##  $ health_uninsured                     : chr  "8.06" "6.93" "8.07" "10.34" ...
##  $ veteran                              : chr  "8.11" "9.71" "7.75" "3.03" ...
##  $ Average of Jan-13                    : chr  "38.55" "34.85" "53.14" "57.88" ...
##  $ Average of Feb-13                    : chr  "39.88" "36.15" "55.28" "57.65" ...
##  $ Average of Mar-13                    : chr  "42.75" "39.41" "64.75" "60.86" ...
##  $ Average of Apr-13                    : chr  "55.16" "54.63" "67.38" "62.77" ...
##  $ Average of May-13                    : chr  "65.17" "65.41" "73.31" "67.07" ...
##  $ Average of Jun-13                    : chr  "75.98" "73.89" "79.49" "68.41" ...
##  $ Average of Jul-13                    : chr  "76.75" "74.07" "84.01" "70.69" ...
##  $ Average of Aug-13                    : chr  "76.45" "74.37" "83.28" "71.19" ...
##  $ Average of Sep-13                    : chr  "73.67" "70.44" "79.88" "72.74" ...
##  $ Average of Oct-13                    : chr  "59.73" "57.37" "67.84" "66.41" ...
##  $ Average of Nov-13                    : chr  "45.18" "42.15" "61.92" "65.09" ...
##  $ Average of Nov-13                    : chr  "37.43" "33.16" "55.69" "60.87" ...
##  $ Average of Jan-14                    : chr  "31.67" "26.88" "60.56" "64.30" ...
##  $ Average of Feb-14                    : chr  "33.83" "28.36" "60.99" "60.77" ...
##  $ Average of Mar-14                    : chr  "42.35" "40.32" "65.16" "63.01" ...
##  $ Average of Apr-14                    : chr  "57.72" "56.85" "68.01" "64.37" ...
##  $ Average of May-14                    : chr  "67.35" "66.84" "74.24" "69.73" ...
##  $ Average of Jun-14                    : chr  "75.92" "75.12" "78.87" "68.46" ...
##  $ Average of Jul-14                    : chr  "74.28" "72.18" "84.65" "73.62" ...
##  $ Average of Aug-14                    : chr  "79.59" "77.08" "82.23" "73.65" ...
##   [list output truncated]
##  - attr(*, "na.action")= 'omit' Named int [1:406] 51 54 75 85 86 88 201 247 261 299 ...
##   ..- attr(*, "names")= chr [1:406] "52" "55" "76" "86" ...
# Discard records whose patient_id is blank, then count any blank or missing
# ids that are left.
train <- train[!(train$patient_id == ""), ]
sum(train$patient_id == "" | is.na(train$patient_id))

# Same treatment for patient_race.
train <- train[!(train$patient_race == ""), ]
sum(train$patient_race == "" | is.na(train$patient_race))

# Total number of NA cells remaining in the whole data frame.
sum(is.na(train))
## [1] 0

Further cleaning of the data set: remove the columns that are not used in this research.

# Remove columns 101 through 152, which are not used in this analysis.
train <- train[, -(101:152)]
# Bar chart of the number of patients per race, with the count printed above
# each bar.
ggplot(train, aes(x = patient_race, fill = patient_race)) +
  geom_bar(color = "black") +  # Border color for bars
  # after_stat(count) replaces the `..count..` notation, which was deprecated
  # in ggplot2 3.4.0.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(
    title = "Distribution of Patient Race",
    x = "Patient Race",
    y = "Frequency"
  ) +
  # One distinct hue per race present in the data.
  scale_fill_manual(
    values = scales::hue_pal()(length(unique(train$patient_race)))
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Stacked bars scaled to full height, so each region shows the share of
# patients belonging to each race rather than raw counts.
ggplot(data = train, mapping = aes(Region, fill = patient_race)) +
  geom_bar(position = position_fill()) +
  labs(
    fill = "Patient Race",
    x = "Region",
    y = "Proportion",
    title = "Proportion of Patient Races by Region"
  ) +
  theme_minimal()

# Scatter plot of BMI against age, one panel per race, with a linear trend
# line in each panel.
#
# patient_age and bmi are stored as character in `train`, so plotting them
# directly gives discrete axes and a trend line that is not fitted to the
# numeric values. Convert both on a plotting copy so `train` itself is left
# unchanged.
bmi_age <- train[, c("patient_age", "bmi", "patient_race")]
bmi_age$patient_age <- as.numeric(bmi_age$patient_age)
bmi_age$bmi <- as.numeric(bmi_age$bmi)

# Blank BMI or age values become NA on conversion; drop those rows so only
# patients with both measurements are plotted.
bmi_age <- bmi_age[!is.na(bmi_age$patient_age) & !is.na(bmi_age$bmi), ]

ggplot(bmi_age, aes(x = patient_age, y = bmi, color = patient_race)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_color_brewer(palette = "Set1") +
  facet_wrap(~ patient_race) +
  labs(
    title = "BMI vs. Age Across Different Races",
    subtitle = "Scatter plot with linear trends",
    x = "Patient Age (Years)",
    y = "Body Mass Index (BMI)",
    color = "Race Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "bottom"
  )
## `geom_smooth()` using formula = 'y ~ x'