We are developing models that predict disease progression in patients using electronic health record data. The project focuses on creating impactful healthcare solutions by analyzing patient trends and identifying potential high-risk cases.
First, we load all the packages needed to visualize and understand the data in R.
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(readxl)
# Load the raw training data from the user's home directory.
# NOTE(review): header = FALSE keeps the CSV header line as the first data
# row, so every column is read as character (see the str() output below).
train <- read.csv(file = "~/train.csv", header = FALSE)
# Open the data in the interactive viewer, then print its structure.
View(train)
str(train)
## 'data.frame': 13174 obs. of 152 variables:
## $ V1 : chr "patient_id" "268700" "484983" "277055" ...
## $ V2 : chr "patient_race" "" "White" "" ...
## $ V3 : chr "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
## $ V4 : chr "patient_state" "AR" "IL" "CA" ...
## $ V5 : chr "patient_zip3" "724" "629" "925" ...
## $ V6 : chr "Region" "South" "Midwest" "West" ...
## $ V7 : chr "Division" "West South Central" "East North Central" "Pacific" ...
## $ V8 : chr "patient_age" "39" "55" "59" ...
## $ V9 : chr "patient_gender" "F" "F" "F" ...
## $ V10 : chr "bmi" "" "35.36" "" ...
## $ V11 : chr "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
## $ V12 : chr "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
## $ V13 : chr "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
## $ V14 : chr "metastatic_first_novel_treatment" "" "" "" ...
## $ V15 : chr "metastatic_first_novel_treatment_type" "" "" "" ...
## $ V16 : chr "population" "3924.87" "2745.39" "38343.18" ...
## $ V17 : chr "density" "82.63" "51.79" "700.34" ...
## $ V18 : chr "age_median" "42.58" "43.54" "36.28" ...
## $ V19 : chr "age_under_10" "11.61" "11.22" "13.27" ...
## $ V20 : chr "age_10_to_19" "13.03" "12.19" "15.66" ...
## $ V21 : chr "age_20s" "10.87" "11.45" "13.49" ...
## $ V22 : chr "age_30s" "11.80" "11.01" "13.45" ...
## $ V23 : chr "age_40s" "12.29" "11.35" "12.40" ...
## $ V24 : chr "age_50s" "13.22" "14.39" "11.58" ...
## $ V25 : chr "age_60s" "13.47" "14.15" "10.47" ...
## $ V26 : chr "age_70s" "10.07" "9.17" "6.38" ...
## $ V27 : chr "age_over_80" "3.64" "5.05" "3.28" ...
## $ V28 : chr "male" "51.43" "49.32" "49.99" ...
## $ V29 : chr "female" "48.57" "50.68" "50.01" ...
## $ V30 : chr "married" "51.05" "49.48" "48.81" ...
## $ V31 : chr "divorced" "16.72" "15.42" "11.90" ...
## $ V32 : chr "never_married" "23.57" "26.93" "34.35" ...
## $ V33 : chr "widowed" "8.66" "8.17" "4.95" ...
## $ V34 : chr "family_size" "3.01" "3.17" "3.80" ...
## $ V35 : chr "family_dual_income" "43.99" "41.41" "52.89" ...
## $ V36 : chr "income_household_median" "44483.35" "51796.79" "78696.87" ...
## $ V37 : chr "income_household_under_5" "2.21" "3.67" "2.59" ...
## $ V38 : chr "income_household_5_to_10" "3.97" "3.86" "1.81" ...
## $ V39 : chr "income_household_10_to_15" "8.52" "6.58" "3.16" ...
## $ V40 : chr "income_household_15_to_20" "7.08" "5.58" "3.71" ...
## $ V41 : chr "income_household_20_to_25" "7.67" "5.38" "3.23" ...
## $ V42 : chr "income_household_25_to_35" "13.82" "11.02" "7.40" ...
## $ V43 : chr "income_household_35_to_50" "15.14" "13.09" "10.42" ...
## $ V44 : chr "income_household_50_to_75" "17.51" "19.56" "16.83" ...
## $ V45 : chr "income_household_75_to_100" "11.26" "11.76" "13.45" ...
## $ V46 : chr "income_household_100_to_150" "8.90" "11.40" "19.21" ...
## $ V47 : chr "income_household_150_over" "3.93" "8.11" "18.23" ...
## $ V48 : chr "income_household_six_figure" "12.83" "19.51" "37.44" ...
## $ V49 : chr "income_individual_median" "24048.55" "28028.04" "32818.54" ...
## $ V50 : chr "home_ownership" "72.11" "76.71" "66.82" ...
## $ V51 : chr "housing_units" "1513.75" "1113.35" "10825.83" ...
## $ V52 : chr "home_value" "87384.33" "92026.84" "392600.40" ...
## $ V53 : chr "rent_median" "641.39" "638.60" "1631.64" ...
## $ V54 : chr "rent_burden" "27.52" "29.37" "35.56" ...
## $ V55 : chr "education_less_highschool" "16.55" "10.93" "16.25" ...
## $ V56 : chr "education_highschool" "41.83" "35.26" "27.55" ...
## $ V57 : chr "education_some_college" "28.31" "35.33" "33.88" ...
## $ V58 : chr "education_bachelors" "9.21" "12.46" "13.92" ...
## $ V59 : chr "education_graduate" "4.11" "6.04" "8.39" ...
## $ V60 : chr "education_college_or_above" "13.32" "18.49" "22.32" ...
## $ V61 : chr "education_stem_degree" "38.78" "36.35" "43.37" ...
## $ V62 : chr "labor_force_participation" "53.60" "52.51" "59.47" ...
## $ V63 : chr "unemployment_rate" "5.85" "7.45" "7.28" ...
## $ V64 : chr "self_employed" "11.82" "9.19" "13.21" ...
## $ V65 : chr "farmer" "5.31" "5.21" "0.44" ...
## $ V66 : chr "race_white" "92.95" "88.75" "53.95" ...
## $ V67 : chr "race_black" "1.73" "6.44" "6.41" ...
## $ V68 : chr "race_asian" "0.33" "0.53" "5.83" ...
## $ V69 : chr "race_native" "0.20" "0.19" "0.81" ...
## $ V70 : chr "race_pacific" "0.03" "0.05" "0.38" ...
## $ V71 : chr "race_other" "0.83" "0.61" "21.35" ...
## $ V72 : chr "race_multiple" "3.94" "3.42" "11.27" ...
## $ V73 : chr "hispanic" "3.03" "2.78" "46.88" ...
## $ V74 : chr "disabled" "22.24" "20.16" "12.83" ...
## $ V75 : chr "poverty" "19.27" "16.94" "12.72" ...
## $ V76 : chr "limited_english" "0.42" "0.43" "4.58" ...
## $ V77 : chr "commute_time" "25.35" "26.26" "37.07" ...
## $ V78 : chr "health_uninsured" "8.06" "6.93" "8.07" ...
## $ V79 : chr "veteran" "8.11" "9.71" "7.75" ...
## $ V80 : chr "Average of Jan-13" "38.55" "34.85" "53.14" ...
## $ V81 : chr "Average of Feb-13" "39.88" "36.15" "55.28" ...
## $ V82 : chr "Average of Mar-13" "42.75" "39.41" "64.75" ...
## $ V83 : chr "Average of Apr-13" "55.16" "54.63" "67.38" ...
## $ V84 : chr "Average of May-13" "65.17" "65.41" "73.31" ...
## $ V85 : chr "Average of Jun-13" "75.98" "73.89" "79.49" ...
## $ V86 : chr "Average of Jul-13" "76.75" "74.07" "84.01" ...
## $ V87 : chr "Average of Aug-13" "76.45" "74.37" "83.28" ...
## $ V88 : chr "Average of Sep-13" "73.67" "70.44" "79.88" ...
## $ V89 : chr "Average of Oct-13" "59.73" "57.37" "67.84" ...
## $ V90 : chr "Average of Nov-13" "45.18" "42.15" "61.92" ...
## $ V91 : chr "Average of Dec-13" "37.43" "33.16" "55.69" ...
## $ V92 : chr "Average of Jan-14" "31.67" "26.88" "60.56" ...
## $ V93 : chr "Average of Feb-14" "33.83" "28.36" "60.99" ...
## $ V94 : chr "Average of Mar-14" "42.35" "40.32" "65.16" ...
## $ V95 : chr "Average of Apr-14" "57.72" "56.85" "68.01" ...
## $ V96 : chr "Average of May-14" "67.35" "66.84" "74.24" ...
## $ V97 : chr "Average of Jun-14" "75.92" "75.12" "78.87" ...
## $ V98 : chr "Average of Jul-14" "74.28" "72.18" "84.65" ...
## $ V99 : chr "Average of Aug-14" "79.59" "77.08" "82.23" ...
## [list output truncated]
# The file was read with header = FALSE, so the columns are called V1, V2, ...
# and the real column names sit in the first row of `train`.
# Take the names directly from that row instead of retyping them by hand:
#   * the hand-typed list repeated "Average of Nov-13" where the data says
#     "Average of Dec-13", producing two columns with the same name;
#   * the hand-typed list had fewer entries than `train` has columns, which
#     left the remaining columns with NA names.
col_names <- as.character(unlist(train[1, ], use.names = FALSE))
colnames(train) <- col_names

# Report how many columns the data set has.
num_columns <- ncol(train)
print(num_columns)
## [1] 152
# Inspect the data again after renaming: open it in the viewer and print the
# structure so the new column names can be checked.
View(train)
str(train)
## 'data.frame': 13174 obs. of 152 variables:
## $ patient_id : chr "patient_id" "268700" "484983" "277055" ...
## $ patient_race : chr "patient_race" "" "White" "" ...
## $ payer_type : chr "payer_type" "COMMERCIAL" "" "COMMERCIAL" ...
## $ patient_state : chr "patient_state" "AR" "IL" "CA" ...
## $ patient_zip3 : chr "patient_zip3" "724" "629" "925" ...
## $ Region : chr "Region" "South" "Midwest" "West" ...
## $ Division : chr "Division" "West South Central" "East North Central" "Pacific" ...
## $ patient_age : chr "patient_age" "39" "55" "59" ...
## $ patient_gender : chr "patient_gender" "F" "F" "F" ...
## $ bmi : chr "bmi" "" "35.36" "" ...
## $ breast_cancer_diagnosis_code : chr "breast_cancer_diagnosis_code" "C50912" "C50412" "1749" ...
## $ breast_cancer_diagnosis_desc : chr "breast_cancer_diagnosis_desc" "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" ...
## $ metastatic_cancer_diagnosis_code : chr "metastatic_cancer_diagnosis_code" "C773" "C773" "C773" ...
## $ metastatic_first_novel_treatment : chr "metastatic_first_novel_treatment" "" "" "" ...
## $ metastatic_first_novel_treatment_type: chr "metastatic_first_novel_treatment_type" "" "" "" ...
## $ population : chr "population" "3924.87" "2745.39" "38343.18" ...
## $ density : chr "density" "82.63" "51.79" "700.34" ...
## $ age_median : chr "age_median" "42.58" "43.54" "36.28" ...
## $ age_under_10 : chr "age_under_10" "11.61" "11.22" "13.27" ...
## $ age_10_to_19 : chr "age_10_to_19" "13.03" "12.19" "15.66" ...
## $ age_20s : chr "age_20s" "10.87" "11.45" "13.49" ...
## $ age_30s : chr "age_30s" "11.80" "11.01" "13.45" ...
## $ age_40s : chr "age_40s" "12.29" "11.35" "12.40" ...
## $ age_50s : chr "age_50s" "13.22" "14.39" "11.58" ...
## $ age_60s : chr "age_60s" "13.47" "14.15" "10.47" ...
## $ age_70s : chr "age_70s" "10.07" "9.17" "6.38" ...
## $ age_over_80 : chr "age_over_80" "3.64" "5.05" "3.28" ...
## $ male : chr "male" "51.43" "49.32" "49.99" ...
## $ female : chr "female" "48.57" "50.68" "50.01" ...
## $ married : chr "married" "51.05" "49.48" "48.81" ...
## $ divorced : chr "divorced" "16.72" "15.42" "11.90" ...
## $ never_married : chr "never_married" "23.57" "26.93" "34.35" ...
## $ widowed : chr "widowed" "8.66" "8.17" "4.95" ...
## $ family_size : chr "family_size" "3.01" "3.17" "3.80" ...
## $ family_dual_income : chr "family_dual_income" "43.99" "41.41" "52.89" ...
## $ income_household_median : chr "income_household_median" "44483.35" "51796.79" "78696.87" ...
## $ income_household_under_5 : chr "income_household_under_5" "2.21" "3.67" "2.59" ...
## $ income_household_5_to_10 : chr "income_household_5_to_10" "3.97" "3.86" "1.81" ...
## $ income_household_10_to_15 : chr "income_household_10_to_15" "8.52" "6.58" "3.16" ...
## $ income_household_15_to_20 : chr "income_household_15_to_20" "7.08" "5.58" "3.71" ...
## $ income_household_20_to_25 : chr "income_household_20_to_25" "7.67" "5.38" "3.23" ...
## $ income_household_25_to_35 : chr "income_household_25_to_35" "13.82" "11.02" "7.40" ...
## $ income_household_35_to_50 : chr "income_household_35_to_50" "15.14" "13.09" "10.42" ...
## $ income_household_50_to_75 : chr "income_household_50_to_75" "17.51" "19.56" "16.83" ...
## $ income_household_75_to_100 : chr "income_household_75_to_100" "11.26" "11.76" "13.45" ...
## $ income_household_100_to_150 : chr "income_household_100_to_150" "8.90" "11.40" "19.21" ...
## $ income_household_150_over : chr "income_household_150_over" "3.93" "8.11" "18.23" ...
## $ income_household_six_figure : chr "income_household_six_figure" "12.83" "19.51" "37.44" ...
## $ income_individual_median : chr "income_individual_median" "24048.55" "28028.04" "32818.54" ...
## $ home_ownership : chr "home_ownership" "72.11" "76.71" "66.82" ...
## $ housing_units : chr "housing_units" "1513.75" "1113.35" "10825.83" ...
## $ home_value : chr "home_value" "87384.33" "92026.84" "392600.40" ...
## $ rent_median : chr "rent_median" "641.39" "638.60" "1631.64" ...
## $ rent_burden : chr "rent_burden" "27.52" "29.37" "35.56" ...
## $ education_less_highschool : chr "education_less_highschool" "16.55" "10.93" "16.25" ...
## $ education_highschool : chr "education_highschool" "41.83" "35.26" "27.55" ...
## $ education_some_college : chr "education_some_college" "28.31" "35.33" "33.88" ...
## $ education_bachelors : chr "education_bachelors" "9.21" "12.46" "13.92" ...
## $ education_graduate : chr "education_graduate" "4.11" "6.04" "8.39" ...
## $ education_college_or_above : chr "education_college_or_above" "13.32" "18.49" "22.32" ...
## $ education_stem_degree : chr "education_stem_degree" "38.78" "36.35" "43.37" ...
## $ labor_force_participation : chr "labor_force_participation" "53.60" "52.51" "59.47" ...
## $ unemployment_rate : chr "unemployment_rate" "5.85" "7.45" "7.28" ...
## $ self_employed : chr "self_employed" "11.82" "9.19" "13.21" ...
## $ farmer : chr "farmer" "5.31" "5.21" "0.44" ...
## $ race_white : chr "race_white" "92.95" "88.75" "53.95" ...
## $ race_black : chr "race_black" "1.73" "6.44" "6.41" ...
## $ race_asian : chr "race_asian" "0.33" "0.53" "5.83" ...
## $ race_native : chr "race_native" "0.20" "0.19" "0.81" ...
## $ race_pacific : chr "race_pacific" "0.03" "0.05" "0.38" ...
## $ race_other : chr "race_other" "0.83" "0.61" "21.35" ...
## $ race_multiple : chr "race_multiple" "3.94" "3.42" "11.27" ...
## $ hispanic : chr "hispanic" "3.03" "2.78" "46.88" ...
## $ disabled : chr "disabled" "22.24" "20.16" "12.83" ...
## $ poverty : chr "poverty" "19.27" "16.94" "12.72" ...
## $ limited_english : chr "limited_english" "0.42" "0.43" "4.58" ...
## $ commute_time : chr "commute_time" "25.35" "26.26" "37.07" ...
## $ health_uninsured : chr "health_uninsured" "8.06" "6.93" "8.07" ...
## $ veteran : chr "veteran" "8.11" "9.71" "7.75" ...
## $ Average of Jan-13 : chr "Average of Jan-13" "38.55" "34.85" "53.14" ...
## $ Average of Feb-13 : chr "Average of Feb-13" "39.88" "36.15" "55.28" ...
## $ Average of Mar-13 : chr "Average of Mar-13" "42.75" "39.41" "64.75" ...
## $ Average of Apr-13 : chr "Average of Apr-13" "55.16" "54.63" "67.38" ...
## $ Average of May-13 : chr "Average of May-13" "65.17" "65.41" "73.31" ...
## $ Average of Jun-13 : chr "Average of Jun-13" "75.98" "73.89" "79.49" ...
## $ Average of Jul-13 : chr "Average of Jul-13" "76.75" "74.07" "84.01" ...
## $ Average of Aug-13 : chr "Average of Aug-13" "76.45" "74.37" "83.28" ...
## $ Average of Sep-13 : chr "Average of Sep-13" "73.67" "70.44" "79.88" ...
## $ Average of Oct-13 : chr "Average of Oct-13" "59.73" "57.37" "67.84" ...
## $ Average of Nov-13 : chr "Average of Nov-13" "45.18" "42.15" "61.92" ...
## $ Average of Nov-13 : chr "Average of Dec-13" "37.43" "33.16" "55.69" ...
## $ Average of Jan-14 : chr "Average of Jan-14" "31.67" "26.88" "60.56" ...
## $ Average of Feb-14 : chr "Average of Feb-14" "33.83" "28.36" "60.99" ...
## $ Average of Mar-14 : chr "Average of Mar-14" "42.35" "40.32" "65.16" ...
## $ Average of Apr-14 : chr "Average of Apr-14" "57.72" "56.85" "68.01" ...
## $ Average of May-14 : chr "Average of May-14" "67.35" "66.84" "74.24" ...
## $ Average of Jun-14 : chr "Average of Jun-14" "75.92" "75.12" "78.87" ...
## $ Average of Jul-14 : chr "Average of Jul-14" "74.28" "72.18" "84.65" ...
## $ Average of Aug-14 : chr "Average of Aug-14" "79.59" "77.08" "82.23" ...
## [list output truncated]
# Row 1 still holds the header text that was read in as data; drop it, then
# discard every row that contains an NA in any column.
train <- na.omit(train[-1, ])
# Print the structure of the cleaned data.
str(train)
## 'data.frame': 12767 obs. of 152 variables:
## $ patient_id : chr "268700" "484983" "277055" "320055" ...
## $ patient_race : chr "" "White" "" "Hispanic" ...
## $ payer_type : chr "COMMERCIAL" "" "COMMERCIAL" "MEDICAID" ...
## $ patient_state : chr "AR" "IL" "CA" "CA" ...
## $ patient_zip3 : chr "724" "629" "925" "900" ...
## $ Region : chr "South" "Midwest" "West" "West" ...
## $ Division : chr "West South Central" "East North Central" "Pacific" "Pacific" ...
## $ patient_age : chr "39" "55" "59" "59" ...
## $ patient_gender : chr "F" "F" "F" "F" ...
## $ bmi : chr "" "35.36" "" "" ...
## $ breast_cancer_diagnosis_code : chr "C50912" "C50412" "1749" "C50911" ...
## $ breast_cancer_diagnosis_desc : chr "Malignant neoplasm of unspecified site of left female breast" "Malig neoplasm of upper-outer quadrant of left female breast" "Malignant neoplasm of breast (female), unspecified" "Malignant neoplasm of unsp site of right female breast" ...
## $ metastatic_cancer_diagnosis_code : chr "C773" "C773" "C773" "C773" ...
## $ metastatic_first_novel_treatment : chr "" "" "" "" ...
## $ metastatic_first_novel_treatment_type: chr "" "" "" "" ...
## $ population : chr "3924.87" "2745.39" "38343.18" "36054.12" ...
## $ density : chr "82.63" "51.79" "700.34" "5294.33" ...
## $ age_median : chr "42.58" "43.54" "36.28" "36.65" ...
## $ age_under_10 : chr "11.61" "11.22" "13.27" "9.76" ...
## $ age_10_to_19 : chr "13.03" "12.19" "15.66" "11.27" ...
## $ age_20s : chr "10.87" "11.45" "13.49" "17.23" ...
## $ age_30s : chr "11.80" "11.01" "13.45" "17.44" ...
## $ age_40s : chr "12.29" "11.35" "12.40" "13.09" ...
## $ age_50s : chr "13.22" "14.39" "11.58" "12.30" ...
## $ age_60s : chr "13.47" "14.15" "10.47" "9.41" ...
## $ age_70s : chr "10.07" "9.17" "6.38" "5.67" ...
## $ age_over_80 : chr "3.64" "5.05" "3.28" "3.82" ...
## $ male : chr "51.43" "49.32" "49.99" "50.51" ...
## $ female : chr "48.57" "50.68" "50.01" "49.49" ...
## $ married : chr "51.05" "49.48" "48.81" "33.48" ...
## $ divorced : chr "16.72" "15.42" "11.90" "11.30" ...
## $ never_married : chr "23.57" "26.93" "34.35" "50.46" ...
## $ widowed : chr "8.66" "8.17" "4.95" "4.77" ...
## $ family_size : chr "3.01" "3.17" "3.80" "3.44" ...
## $ family_dual_income : chr "43.99" "41.41" "52.89" "55.53" ...
## $ income_household_median : chr "44483.35" "51796.79" "78696.87" "69266.69" ...
## $ income_household_under_5 : chr "2.21" "3.67" "2.59" "6.32" ...
## $ income_household_5_to_10 : chr "3.97" "3.86" "1.81" "2.95" ...
## $ income_household_10_to_15 : chr "8.52" "6.58" "3.16" "6.81" ...
## $ income_household_15_to_20 : chr "7.08" "5.58" "3.71" "4.18" ...
## $ income_household_20_to_25 : chr "7.67" "5.38" "3.23" "4.13" ...
## $ income_household_25_to_35 : chr "13.82" "11.02" "7.40" "7.84" ...
## $ income_household_35_to_50 : chr "15.14" "13.09" "10.42" "10.16" ...
## $ income_household_50_to_75 : chr "17.51" "19.56" "16.83" "14.42" ...
## $ income_household_75_to_100 : chr "11.26" "11.76" "13.45" "10.48" ...
## $ income_household_100_to_150 : chr "8.90" "11.40" "19.21" "13.73" ...
## $ income_household_150_over : chr "3.93" "8.11" "18.23" "18.96" ...
## $ income_household_six_figure : chr "12.83" "19.51" "37.44" "32.69" ...
## $ income_individual_median : chr "24048.55" "28028.04" "32818.54" "36053.40" ...
## $ home_ownership : chr "72.11" "76.71" "66.82" "31.50" ...
## $ housing_units : chr "1513.75" "1113.35" "10825.83" "12949.12" ...
## $ home_value : chr "87384.33" "92026.84" "392600.40" "873756.00" ...
## $ rent_median : chr "641.39" "638.60" "1631.64" "1651.15" ...
## $ rent_burden : chr "27.52" "29.37" "35.56" "37.37" ...
## $ education_less_highschool : chr "16.55" "10.93" "16.25" "22.92" ...
## $ education_highschool : chr "41.83" "35.26" "27.55" "18.24" ...
## $ education_some_college : chr "28.31" "35.33" "33.88" "21.27" ...
## $ education_bachelors : chr "9.21" "12.46" "13.92" "23.89" ...
## $ education_graduate : chr "4.11" "6.04" "8.39" "13.69" ...
## $ education_college_or_above : chr "13.32" "18.49" "22.32" "37.58" ...
## $ education_stem_degree : chr "38.78" "36.35" "43.37" "41.75" ...
## $ labor_force_participation : chr "53.60" "52.51" "59.47" "64.39" ...
## $ unemployment_rate : chr "5.85" "7.45" "7.28" "8.68" ...
## $ self_employed : chr "11.82" "9.19" "13.21" "21.23" ...
## $ farmer : chr "5.31" "5.21" "0.44" "0.01" ...
## $ race_white : chr "92.95" "88.75" "53.95" "42.82" ...
## $ race_black : chr "1.73" "6.44" "6.41" "12.22" ...
## $ race_asian : chr "0.33" "0.53" "5.83" "12.70" ...
## $ race_native : chr "0.20" "0.19" "0.81" "1.12" ...
## $ race_pacific : chr "0.03" "0.05" "0.38" "0.15" ...
## $ race_other : chr "0.83" "0.61" "21.35" "22.14" ...
## $ race_multiple : chr "3.94" "3.42" "11.27" "8.85" ...
## $ hispanic : chr "3.03" "2.78" "46.88" "45.53" ...
## $ disabled : chr "22.24" "20.16" "12.83" "11.90" ...
## $ poverty : chr "19.27" "16.94" "12.72" "20.76" ...
## $ limited_english : chr "0.42" "0.43" "4.58" "14.74" ...
## $ commute_time : chr "25.35" "26.26" "37.07" "30.71" ...
## $ health_uninsured : chr "8.06" "6.93" "8.07" "10.34" ...
## $ veteran : chr "8.11" "9.71" "7.75" "3.03" ...
## $ Average of Jan-13 : chr "38.55" "34.85" "53.14" "57.88" ...
## $ Average of Feb-13 : chr "39.88" "36.15" "55.28" "57.65" ...
## $ Average of Mar-13 : chr "42.75" "39.41" "64.75" "60.86" ...
## $ Average of Apr-13 : chr "55.16" "54.63" "67.38" "62.77" ...
## $ Average of May-13 : chr "65.17" "65.41" "73.31" "67.07" ...
## $ Average of Jun-13 : chr "75.98" "73.89" "79.49" "68.41" ...
## $ Average of Jul-13 : chr "76.75" "74.07" "84.01" "70.69" ...
## $ Average of Aug-13 : chr "76.45" "74.37" "83.28" "71.19" ...
## $ Average of Sep-13 : chr "73.67" "70.44" "79.88" "72.74" ...
## $ Average of Oct-13 : chr "59.73" "57.37" "67.84" "66.41" ...
## $ Average of Nov-13 : chr "45.18" "42.15" "61.92" "65.09" ...
## $ Average of Nov-13 : chr "37.43" "33.16" "55.69" "60.87" ...
## $ Average of Jan-14 : chr "31.67" "26.88" "60.56" "64.30" ...
## $ Average of Feb-14 : chr "33.83" "28.36" "60.99" "60.77" ...
## $ Average of Mar-14 : chr "42.35" "40.32" "65.16" "63.01" ...
## $ Average of Apr-14 : chr "57.72" "56.85" "68.01" "64.37" ...
## $ Average of May-14 : chr "67.35" "66.84" "74.24" "69.73" ...
## $ Average of Jun-14 : chr "75.92" "75.12" "78.87" "68.46" ...
## $ Average of Jul-14 : chr "74.28" "72.18" "84.65" "73.62" ...
## $ Average of Aug-14 : chr "79.59" "77.08" "82.23" "73.65" ...
## [list output truncated]
## - attr(*, "na.action")= 'omit' Named int [1:406] 51 54 75 85 86 88 201 247 261 299 ...
## ..- attr(*, "names")= chr [1:406] "52" "55" "76" "86" ...
# Keep only the rows whose patient_id is not an empty string.
has_patient_id <- train$patient_id != ""
train <- train[has_patient_id, ]
# Check: number of remaining rows with an empty or NA patient_id.
sum(train$patient_id == "" | is.na(train$patient_id))
## [1] 0
# Keep only the rows whose patient_race is not an empty string.
has_patient_race <- train$patient_race != ""
train <- train[has_patient_race, ]
# Check: number of remaining rows with an empty or NA patient_race.
sum(train$patient_race == "" | is.na(train$patient_race))
## [1] 0
# Total number of NA cells left anywhere in the data frame
# (per-column NA counts, summed).
sum(colSums(is.na(train)))
## [1] 0
Further cleaning of the data set: removing the columns that are not used in this research.
# Drop columns 101 through 152 (selected by position), keeping the rest.
train <- train[, -(101:152)]
# Bar chart of the number of patients per race, with the count printed above
# each bar.
ggplot(train, aes(x = patient_race, fill = patient_race)) +
  geom_bar(color = "black") + # Border color for bars
  # after_stat(count) replaces the deprecated `..count..` notation
  # (deprecated since ggplot2 3.4.0).
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) + # Add text labels
  labs(title = "Distribution of Patient Race", x = "Patient Race", y = "Frequency") +
  scale_fill_manual(values = scales::hue_pal()(length(unique(train$patient_race)))) + # Automatically generate distinct colors
  theme_minimal() +
  # Tilt the x-axis labels so the race names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Stacked bar chart, one bar per Region, where each bar is scaled to 1 so the
# segments show the share of each patient race within that region.
ggplot(data = train, mapping = aes(x = Region, fill = patient_race)) +
  geom_bar(position = "fill") +
  theme_minimal() +
  labs(
    title = "Proportion of Patient Races by Region",
    x = "Region",
    y = "Proportion",
    fill = "Patient Race"
  )
# Scatter plot of BMI against age, one panel per race, with a linear trend
# line in each panel.
#
# patient_age and bmi are stored as character columns (the CSV was read with
# the header as a data row), and bmi contains blank strings. Plotted as-is,
# both axes are treated as discrete categories and the linear fit is
# meaningless. Convert both to numeric for this plot only and drop the rows
# where either value is missing; `train` itself is left unchanged.
train %>%
  mutate(
    patient_age = as.numeric(patient_age),
    bmi = as.numeric(bmi) # blank strings become NA
  ) %>%
  filter(!is.na(patient_age), !is.na(bmi)) %>%
  ggplot(aes(x = patient_age, y = bmi, color = patient_race)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_color_brewer(palette = "Set1") +
  facet_wrap(~ patient_race) +
  labs(
    title = "BMI vs. Age Across Different Races",
    subtitle = "Scatter plot with linear trends",
    x = "Patient Age (Years)",
    y = "Body Mass Index (BMI)",
    color = "Race Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "bottom"
  )
## `geom_smooth()` using formula = 'y ~ x'