This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load all our libraries
library(performance) # check model assumptions visually
## Warning: package 'performance' was built under R version 4.5.3
library(data.table)
library(ggplot2)
library(ggeffects)
library(GGally)
#library(ggrepel)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(scales)
library(ggthemes)
library(RColorBrewer)
library(see)
## Warning: package 'see' was built under R version 4.5.3
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#do a prediction
library(caTools)
## Warning: package 'caTools' was built under R version 4.5.3
library(rmarkdown)
#read dataset
m23<- read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Desktop/medicare2023.csv", header= TRUE)
m22<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare22/m22.csv",header=TRUE)
m21<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare21/m21.csv",header=TRUE)
m20<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare20/m20.csv",header=TRUE)
m19<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare19/m19.csv",header=TRUE)
m18<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare18/m18.csv",header=TRUE)
#drop tot_Asmt_Cnt that is found in m19 and m18 so all columns match up
m19 <- subset(m19, select = -TOT_ASMT_CNT)
m18 <- subset(m18, select = -TOT_ASMT_CNT)
#drop national total for the years
m23<-m23[-1,]
m22<-m22[-1,]
m21<-m21[-1,]
m20<-m20[-1,]
m19<-m19[-1,]
m18<-m18[-1,]
#now combine the years together
m6<-rbind(m23, m22, m21, m20, m19, m18)
View(m6)
#convert over to integers
m6<-m6%>%mutate(across(c(COTRT_SLP_MNTS, COTRT_OT_MNTS,TOT_SLP_MNTS,
INDVDL_SLP_MNTS,CNCRNT_GRP_SLP_MNTS, CNCRNT_GRP_PT_MNTS,
COTRT_PT_MNTS, TOT_OT_MNTS, INDVDL_OT_MNTS, CNCRNT_GRP_OT_MNTS,
PRMRY_DX_SXILLDEF_PCT, PRMRY_DX_INJPOIS_PCT, PRMRY_DX_HLTHSRV_PCT,
TOT_PT_MNTS, INDVDL_PT_MNTS, PRMRY_DX_DIGSYSTM_PCT,
PRMRY_DX_SKNMUSSYSTM_PCT, PRMRY_DX_GUSYSTM_PCT, PRMRY_DX_PRGPERICONG_PCT,
PRMRY_DX_NERVSYSTM_PCT,PRMRY_DX_ENTSYS_PCT, PRMRY_DX_CIRCSYSTM_PCT,
PRMRY_DX_RSPSYSTM_PCT, PRMRY_DX_INFCTN_PCT,
PRMRY_DX_NEOBLD_PCT, PRMRY_DX_ENDONUTRMET_PCT,
PRMRY_DX_MNTBEHNEUDIS_PCT, BENE_CC_PH_OSTEOPOROSIS_V2_PCT,
BENE_CC_PH_PARKINSON_V2_PCT, BENE_CC_PH_STROKE_TIA_V2_PCT,
BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT, BENE_CC_PH_HYPERTENSION_V2_PCT,
BENE_CC_PH_ISCHEMICHEART_V2_PCT,
BENE_CC_PH_CKD_V2_PCT, BENE_CC_PH_COPD_V2_PCT, BENE_CC_PH_DIABETES_V2_PCT,
BENE_CC_PH_HF_NONIHD_V2_PCT, BENE_CC_PH_AFIB_V2_PCT,
BENE_CC_PH_ARTHRITIS_V2_PCT, BENE_CC_PH_ASTHMA_V2_PCT, BENE_CC_PH_CANCER6_V2_PCT,
BENE_CC_BH_PD_V1_PCT, BENE_CC_BH_PTSD_V1_PCT, BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT,
BENE_CC_BH_TOBACCO_V1_PCT,BENE_CC_BH_ANXIETY_V1_PCT,
BENE_CC_BH_BIPOLAR_V1_PCT, BENE_CC_BH_DEPRESS_V1_PCT, BENE_CC_BH_MOOD_V2_PCT,
BENE_CC_BH_ADHD_OTHCD_V1_PCT, BENE_CC_BH_ALCOHOL_DRUG_V1_PCT,
BENE_CC_BH_ALZ_NONALZDEM_V2_PCT,BENE_RACE_HSPNC_PCT,
BENE_RACE_NATIND_PCT, BENE_RACE_UNK_PCT, BENE_RACE_OTHR_PCT, BENE_MALE_PCT,
BENE_FEML_PCT, BENE_RACE_WHT_PCT,BENE_RACE_BLACK_PCT, BENE_RACE_API_PCT,
BENE_DUAL_PCT, BENE_RRL_PCT), as.numeric))
## Warning: There were 63 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 62 remaining warnings.
#add columns
#length of stay
los<- m6$TOT_SRVC_DAYS/ m6$TOT_EPSD_STAY_CNT
m6$los<-los
#PT PER EPISODE
ptperep<-m6$TOT_PT_MNTS/m6$TOT_EPSD_STAY_CNT
m6$ptperep<-ptperep
#SLP per episode
slpperep<-m6$TOT_SLP_MNTS/m6$TOT_EPSD_STAY_CNT
m6$slpperep<-slpperep
#OT per episode
otperep<-m6$TOT_OT_MNTS/m6$TOT_EPSD_STAY_CNT
m6$otperep<-otperep
#replace na w 0
m6<- m6%>% replace(is.na(.),0)
summary(m6)
## YEAR YEAR_TYPE SMRY_CTGRY SRVC_CTGRY
## Min. :2018 Length:7022 Length:7022 Length:7022
## 1st Qu.:2019 Class :character Class :character Class :character
## Median :2021 Mode :character Mode :character Mode :character
## Mean :2021
## 3rd Qu.:2022
## Max. :2023
## PRVDR_ID PRVDR_NAME PRVDR_CITY STATE
## Length:7022 Length:7022 Length:7022 Length:7022
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PRVDR_ZIP BENE_DSTNCT_CNT TOT_EPSD_STAY_CNT TOT_SRVC_DAYS
## Length:7022 Min. : 11.0 Min. : 11 Min. : 86
## Class :character 1st Qu.: 121.0 1st Qu.: 128 1st Qu.: 1610
## Mode :character Median : 210.0 Median : 224 Median : 2847
## Mean : 596.7 Mean : 665 Mean : 8387
## 3rd Qu.: 467.0 3rd Qu.: 514 3rd Qu.: 6424
## Max. :54145.0 Max. :64561 Max. :793772
## TOT_CHRG_AMT TOT_ALOWD_AMT TOT_MDCR_PYMT_AMT
## Min. :1.480e+05 Min. :1.583e+05 Min. :1.556e+05
## 1st Qu.:6.580e+06 1st Qu.:2.893e+06 1st Qu.:2.854e+06
## Median :1.302e+07 Median :5.202e+06 Median :5.126e+06
## Mean :3.399e+07 Mean :1.510e+07 Mean :1.484e+07
## 3rd Qu.:2.528e+07 3rd Qu.:1.161e+07 3rd Qu.:1.141e+07
## Max. :3.413e+09 Max. :1.517e+09 Max. :1.481e+09
## TOT_MDCR_STDZD_PYMT_AMT TOT_OUTLIER_PYMT_AMT BENE_DUAL_PCT BENE_RRL_PCT
## Min. :1.600e+05 Min. : 0 Min. : 0.00 Min. : 0.000
## 1st Qu.:2.855e+06 1st Qu.: 23184 1st Qu.:12.00 1st Qu.: 0.000
## Median :5.007e+06 Median : 89723 Median :18.00 Median : 0.000
## Mean :1.458e+07 Mean : 480183 Mean :19.54 Mean : 7.228
## 3rd Qu.:1.119e+07 3rd Qu.: 281380 3rd Qu.:24.00 3rd Qu.: 11.000
## Max. :1.531e+09 Max. :52038264 Max. :92.00 Max. :100.000
## BENE_AVG_AGE BENE_MALE_PCT BENE_FEML_PCT BENE_RACE_WHT_PCT
## Min. :58.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:75.00 1st Qu.:43.00 1st Qu.:48.00 1st Qu.: 74.00
## Median :76.00 Median :47.00 Median :53.00 Median : 85.00
## Mean :75.85 Mean :46.82 Mean :51.62 Mean : 79.48
## 3rd Qu.:77.00 3rd Qu.:52.00 3rd Qu.:57.00 3rd Qu.: 92.00
## Max. :85.00 Max. :76.00 Max. :73.00 Max. :100.00
## BENE_RACE_BLACK_PCT BENE_RACE_API_PCT BENE_RACE_HSPNC_PCT BENE_RACE_NATIND_PCT
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 3.000 Median : 0.000 Median : 0.000 Median : 0.0000
## Mean : 8.495 Mean : 1.014 Mean : 4.806 Mean : 0.3703
## 3rd Qu.:13.000 3rd Qu.: 0.000 3rd Qu.: 4.000 3rd Qu.: 0.0000
## Max. :97.000 Max. :77.000 Max. :100.000 Max. :52.0000
## BENE_RACE_UNK_PCT BENE_RACE_OTHR_PCT BENE_AVG_RISK_SCRE
## Min. : 0.0000 Min. : 0.00000 Min. : 1.090
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 2.170
## Median : 0.0000 Median : 0.00000 Median : 2.450
## Mean : 0.1907 Mean : 0.06266 Mean : 2.515
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 2.750
## Max. :11.0000 Max. :15.00000 Max. :98.000
## BENE_CC_BH_ADHD_OTHCD_V1_PCT BENE_CC_BH_ALCOHOL_DRUG_V1_PCT
## Min. : 0.0000 Min. : 0.00
## 1st Qu.: 0.0000 1st Qu.:11.00
## Median : 0.0000 Median :14.00
## Mean : 0.3948 Mean :13.76
## 3rd Qu.: 0.0000 3rd Qu.:18.00
## Max. :30.0000 Max. :53.00
## BENE_CC_BH_ALZ_NONALZDEM_V2_PCT BENE_CC_BH_ANXIETY_V1_PCT
## Min. : 0 Min. : 0.00
## 1st Qu.:20 1st Qu.:38.00
## Median :25 Median :43.00
## Mean :25 Mean :42.59
## 3rd Qu.:30 3rd Qu.:49.00
## Max. :75 Max. :96.00
## BENE_CC_BH_BIPOLAR_V1_PCT BENE_CC_BH_DEPRESS_V1_PCT BENE_CC_BH_MOOD_V2_PCT
## Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:41.00 1st Qu.:45.00
## Median : 6.000 Median :46.00 Median :50.00
## Mean : 5.833 Mean :45.17 Mean :49.63
## 3rd Qu.: 9.000 3rd Qu.:51.00 3rd Qu.:55.00
## Max. :69.000 Max. :94.00 Max. :95.00
## BENE_CC_BH_PD_V1_PCT BENE_CC_BH_PTSD_V1_PCT BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT
## Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.000 Median : 0.0000 Median : 0.000
## Mean : 1.289 Mean : 0.8498 Mean : 2.857
## 3rd Qu.: 3.000 3rd Qu.: 2.0000 3rd Qu.: 5.000
## Max. :36.000 Max. :18.0000 Max. :76.000
## BENE_CC_BH_TOBACCO_V1_PCT BENE_CC_PH_AFIB_V2_PCT BENE_CC_PH_ARTHRITIS_V2_PCT
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:15.00 1st Qu.:36.00 1st Qu.:64.00
## Median :19.00 Median :40.00 Median :70.00
## Mean :18.82 Mean :39.21 Mean :67.31
## 3rd Qu.:23.00 3rd Qu.:44.00 3rd Qu.:75.00
## Max. :78.00 Max. :63.00 Max. :96.00
## BENE_CC_PH_ASTHMA_V2_PCT BENE_CC_PH_CANCER6_V2_PCT BENE_CC_PH_CKD_V2_PCT
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:11.00 1st Qu.:17.00 1st Qu.:41.00
## Median :14.00 Median :20.00 Median :47.00
## Mean :12.77 Mean :18.82 Mean :46.02
## 3rd Qu.:16.00 3rd Qu.:22.00 3rd Qu.:53.00
## Max. :42.00 Max. :39.00 Max. :85.00
## BENE_CC_PH_COPD_V2_PCT BENE_CC_PH_DIABETES_V2_PCT BENE_CC_PH_HF_NONIHD_V2_PCT
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.:29.00 1st Qu.:44.00 1st Qu.:38.00
## Median :35.00 Median :49.00 Median :44.00
## Mean :34.67 Mean :48.93 Mean :43.26
## 3rd Qu.:42.00 3rd Qu.:54.00 3rd Qu.:50.00
## Max. :73.00 Max. :86.00 Max. :84.00
## BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT BENE_CC_PH_HYPERTENSION_V2_PCT
## Min. : 0 Min. : 0.00
## 1st Qu.: 80 1st Qu.: 0.00
## Median : 85 Median : 86.00
## Mean : 74 Mean : 48.21
## 3rd Qu.: 88 3rd Qu.: 95.00
## Max. :100 Max. :100.00
## BENE_CC_PH_ISCHEMICHEART_V2_PCT BENE_CC_PH_OSTEOPOROSIS_V2_PCT
## Min. : 0.00 Min. : 0.00
## 1st Qu.:46.00 1st Qu.:18.00
## Median :53.00 Median :22.00
## Mean :51.61 Mean :21.78
## 3rd Qu.:59.00 3rd Qu.:26.00
## Max. :95.00 Max. :76.00
## BENE_CC_PH_PARKINSON_V2_PCT BENE_CC_PH_STROKE_TIA_V2_PCT PRMRY_DX_INFCTN_PCT
## Min. : 0.000 Min. : 0.00 Min. : 0.00000
## 1st Qu.: 0.000 1st Qu.:38.00 1st Qu.: 0.00000
## Median : 5.000 Median :43.00 Median : 0.00000
## Mean : 4.503 Mean :44.27 Mean : 0.05825
## 3rd Qu.: 8.000 3rd Qu.:51.00 3rd Qu.: 0.00000
## Max. :28.000 Max. :86.00 Max. :10.00000
## PRMRY_DX_NEOBLD_PCT PRMRY_DX_ENDONUTRMET_PCT PRMRY_DX_MNTBEHNEUDIS_PCT
## Min. : 0.0000 Min. : 0.00000 Min. :0.000000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.:0.000000
## Median : 0.0000 Median : 0.00000 Median :0.000000
## Mean : 0.1639 Mean : 0.08516 Mean :0.005981
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.:0.000000
## Max. :13.0000 Max. :16.00000 Max. :4.000000
## PRMRY_DX_NERVSYSTM_PCT PRMRY_DX_ENTSYS_PCT PRMRY_DX_CIRCSYSTM_PCT
## Min. : 0.00 Min. :0.0000000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.:0.0000000 1st Qu.:16.00
## Median :10.00 Median :0.0000000 Median :22.00
## Mean :12.27 Mean :0.0001424 Mean :23.59
## 3rd Qu.:19.00 3rd Qu.:0.0000000 3rd Qu.:30.00
## Max. :63.00 Max. :1.0000000 Max. :78.00
## PRMRY_DX_RSPSYSTM_PCT PRMRY_DX_DIGSYSTM_PCT PRMRY_DX_SKNMUSSYSTM_PCT
## Min. : 0.0000 Min. :0.00000 Min. : 0.000
## 1st Qu.: 0.0000 1st Qu.:0.00000 1st Qu.: 0.000
## Median : 0.0000 Median :0.00000 Median : 0.000
## Mean : 0.6992 Mean :0.06992 Mean : 2.985
## 3rd Qu.: 0.0000 3rd Qu.:0.00000 3rd Qu.: 5.000
## Max. :29.0000 Max. :9.00000 Max. :57.000
## PRMRY_DX_GUSYSTM_PCT PRMRY_DX_PRGPERICONG_PCT PRMRY_DX_SXILLDEF_PCT
## Min. : 0.00000 Min. :0 Min. : 0.00
## 1st Qu.: 0.00000 1st Qu.:0 1st Qu.: 0.00
## Median : 0.00000 Median :0 Median : 10.00
## Mean : 0.05882 Mean :0 Mean : 12.05
## 3rd Qu.: 0.00000 3rd Qu.:0 3rd Qu.: 18.00
## Max. :11.00000 Max. :0 Max. :100.00
## PRMRY_DX_INJPOIS_PCT PRMRY_DX_HLTHSRV_PCT TOT_PT_MNTS
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.:16.00 1st Qu.: 8.00 1st Qu.: 75596
## Median :22.00 Median : 15.00 Median : 140079
## Mean :21.19 Mean : 15.55 Mean : 420580
## 3rd Qu.:28.00 3rd Qu.: 22.00 3rd Qu.: 324245
## Max. :84.00 Max. :100.00 Max. :43320198
## INDVDL_PT_MNTS CNCRNT_GRP_PT_MNTS COTRT_PT_MNTS TOT_OT_MNTS
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 70562 1st Qu.: 0 1st Qu.: 0 1st Qu.: 74691
## Median : 132374 Median : 1136 Median : 45 Median : 139039
## Mean : 358574 Mean : 59117 Mean : 2889 Mean : 414448
## 3rd Qu.: 280852 3rd Qu.: 27853 3rd Qu.: 863 3rd Qu.: 317434
## Max. :35297371 Max. :7948190 Max. :242283 Max. :42495177
## INDVDL_OT_MNTS CNCRNT_GRP_OT_MNTS COTRT_OT_MNTS TOT_SLP_MNTS
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 69928 1st Qu.: 0 1st Qu.: 0 1st Qu.: 16161
## Median : 131216 Median : 1369 Median : 15 Median : 35295
## Mean : 359155 Mean : 52635 Mean : 2657 Mean : 101457
## 3rd Qu.: 280210 3rd Qu.: 24090 3rd Qu.: 628 3rd Qu.: 79550
## Max. :35084335 Max. :7364164 Max. :233261 Max. :8855320
## INDVDL_SLP_MNTS CNCRNT_GRP_SLP_MNTS COTRT_SLP_MNTS los
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 6.143
## 1st Qu.: 15924 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:11.695
## Median : 34308 Median : 0 Median : 0.0 Median :12.514
## Mean : 94980 Mean : 6178 Mean : 299.1 Mean :12.693
## 3rd Qu.: 73657 3rd Qu.: 1791 3rd Qu.: 30.0 3rd Qu.:13.467
## Max. :8392128 Max. :566577 Max. :70618.0 Max. :26.812
## ptperep slpperep otperep
## Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 569.6 1st Qu.: 98.93 1st Qu.: 556.0
## Median : 641.2 Median :147.65 Median : 634.2
## Mean : 625.0 Mean :161.55 Mean : 615.6
## 3rd Qu.: 704.8 3rd Qu.:209.74 3rd Qu.: 697.7
## Max. :1248.9 Max. :722.88 Max. :1202.5
# column names
colnames(m6)
## [1] "YEAR" "YEAR_TYPE"
## [3] "SMRY_CTGRY" "SRVC_CTGRY"
## [5] "PRVDR_ID" "PRVDR_NAME"
## [7] "PRVDR_CITY" "STATE"
## [9] "PRVDR_ZIP" "BENE_DSTNCT_CNT"
## [11] "TOT_EPSD_STAY_CNT" "TOT_SRVC_DAYS"
## [13] "TOT_CHRG_AMT" "TOT_ALOWD_AMT"
## [15] "TOT_MDCR_PYMT_AMT" "TOT_MDCR_STDZD_PYMT_AMT"
## [17] "TOT_OUTLIER_PYMT_AMT" "BENE_DUAL_PCT"
## [19] "BENE_RRL_PCT" "BENE_AVG_AGE"
## [21] "BENE_MALE_PCT" "BENE_FEML_PCT"
## [23] "BENE_RACE_WHT_PCT" "BENE_RACE_BLACK_PCT"
## [25] "BENE_RACE_API_PCT" "BENE_RACE_HSPNC_PCT"
## [27] "BENE_RACE_NATIND_PCT" "BENE_RACE_UNK_PCT"
## [29] "BENE_RACE_OTHR_PCT" "BENE_AVG_RISK_SCRE"
## [31] "BENE_CC_BH_ADHD_OTHCD_V1_PCT" "BENE_CC_BH_ALCOHOL_DRUG_V1_PCT"
## [33] "BENE_CC_BH_ALZ_NONALZDEM_V2_PCT" "BENE_CC_BH_ANXIETY_V1_PCT"
## [35] "BENE_CC_BH_BIPOLAR_V1_PCT" "BENE_CC_BH_DEPRESS_V1_PCT"
## [37] "BENE_CC_BH_MOOD_V2_PCT" "BENE_CC_BH_PD_V1_PCT"
## [39] "BENE_CC_BH_PTSD_V1_PCT" "BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT"
## [41] "BENE_CC_BH_TOBACCO_V1_PCT" "BENE_CC_PH_AFIB_V2_PCT"
## [43] "BENE_CC_PH_ARTHRITIS_V2_PCT" "BENE_CC_PH_ASTHMA_V2_PCT"
## [45] "BENE_CC_PH_CANCER6_V2_PCT" "BENE_CC_PH_CKD_V2_PCT"
## [47] "BENE_CC_PH_COPD_V2_PCT" "BENE_CC_PH_DIABETES_V2_PCT"
## [49] "BENE_CC_PH_HF_NONIHD_V2_PCT" "BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT"
## [51] "BENE_CC_PH_HYPERTENSION_V2_PCT" "BENE_CC_PH_ISCHEMICHEART_V2_PCT"
## [53] "BENE_CC_PH_OSTEOPOROSIS_V2_PCT" "BENE_CC_PH_PARKINSON_V2_PCT"
## [55] "BENE_CC_PH_STROKE_TIA_V2_PCT" "PRMRY_DX_INFCTN_PCT"
## [57] "PRMRY_DX_NEOBLD_PCT" "PRMRY_DX_ENDONUTRMET_PCT"
## [59] "PRMRY_DX_MNTBEHNEUDIS_PCT" "PRMRY_DX_NERVSYSTM_PCT"
## [61] "PRMRY_DX_ENTSYS_PCT" "PRMRY_DX_CIRCSYSTM_PCT"
## [63] "PRMRY_DX_RSPSYSTM_PCT" "PRMRY_DX_DIGSYSTM_PCT"
## [65] "PRMRY_DX_SKNMUSSYSTM_PCT" "PRMRY_DX_GUSYSTM_PCT"
## [67] "PRMRY_DX_PRGPERICONG_PCT" "PRMRY_DX_SXILLDEF_PCT"
## [69] "PRMRY_DX_INJPOIS_PCT" "PRMRY_DX_HLTHSRV_PCT"
## [71] "TOT_PT_MNTS" "INDVDL_PT_MNTS"
## [73] "CNCRNT_GRP_PT_MNTS" "COTRT_PT_MNTS"
## [75] "TOT_OT_MNTS" "INDVDL_OT_MNTS"
## [77] "CNCRNT_GRP_OT_MNTS" "COTRT_OT_MNTS"
## [79] "TOT_SLP_MNTS" "INDVDL_SLP_MNTS"
## [81] "CNCRNT_GRP_SLP_MNTS" "COTRT_SLP_MNTS"
## [83] "los" "ptperep"
## [85] "slpperep" "otperep"
###########
# only want to look at Irf factors:
summary(m6[, c("los","TOT_PT_MNTS","TOT_SLP_MNTS", "TOT_OT_MNTS", "BENE_DSTNCT_CNT", "TOT_EPSD_STAY_CNT" , "TOT_SRVC_DAYS", "TOT_MDCR_PYMT_AMT", "TOT_MDCR_STDZD_PYMT_AMT","TOT_CHRG_AMT" )])
## los TOT_PT_MNTS TOT_SLP_MNTS TOT_OT_MNTS
## Min. : 6.143 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:11.695 1st Qu.: 75596 1st Qu.: 16161 1st Qu.: 74691
## Median :12.514 Median : 140079 Median : 35295 Median : 139039
## Mean :12.693 Mean : 420580 Mean : 101457 Mean : 414448
## 3rd Qu.:13.467 3rd Qu.: 324245 3rd Qu.: 79550 3rd Qu.: 317434
## Max. :26.812 Max. :43320198 Max. :8855320 Max. :42495177
## BENE_DSTNCT_CNT TOT_EPSD_STAY_CNT TOT_SRVC_DAYS TOT_MDCR_PYMT_AMT
## Min. : 11.0 Min. : 11 Min. : 86 Min. :1.556e+05
## 1st Qu.: 121.0 1st Qu.: 128 1st Qu.: 1610 1st Qu.:2.854e+06
## Median : 210.0 Median : 224 Median : 2847 Median :5.126e+06
## Mean : 596.7 Mean : 665 Mean : 8387 Mean :1.484e+07
## 3rd Qu.: 467.0 3rd Qu.: 514 3rd Qu.: 6424 3rd Qu.:1.141e+07
## Max. :54145.0 Max. :64561 Max. :793772 Max. :1.481e+09
## TOT_MDCR_STDZD_PYMT_AMT TOT_CHRG_AMT
## Min. :1.600e+05 Min. :1.480e+05
## 1st Qu.:2.855e+06 1st Qu.:6.580e+06
## Median :5.007e+06 Median :1.302e+07
## Mean :1.458e+07 Mean :3.399e+07
## 3rd Qu.:1.119e+07 3rd Qu.:2.528e+07
## Max. :1.531e+09 Max. :3.413e+09
#--------------------------------------------------------------------------
#NEXT QUESTION
#--------------------------------------------------------------------------
# Can we predict the likelihood of high Medicare IRF utilization in a state
# based on demographic and healthcare infrastructure variables?
# using a log regression where it's high vs not high so i need a binary variable
#make a binary variable first
stateutilize<- m6 %>%
group_by(STATE, YEAR) %>%
summarise(utrate=sum(TOT_SRVC_DAYS)/sum(BENE_DSTNCT_CNT), # gives us utilization rate where it's use over number of ppl
BENE_AVG_AGE=mean(BENE_AVG_AGE),
BENE_DUAL_PCT=mean(BENE_DUAL_PCT), # this is both medicare+medicaid
BENE_MALE_PCT=mean(BENE_MALE_PCT),
BENE_RRL_PCT=mean(BENE_RRL_PCT), # those who are classified as rural
BENE_RACE_WHT_PCT=mean(BENE_RACE_WHT_PCT),
BENE_RACE_API_PCT=mean(BENE_RACE_API_PCT),
BENE_RACE_BLACK_PCT=mean(BENE_RACE_BLACK_PCT),
.groups='drop')
# BENE_RACE_HSPNC_PCT=mean(BENE_RACE_HSPNC_PCT),
# BENE_RACE_NATIND_PCT=mean(BENE_RACE_NATIND_PCT),
#BENE_RACE_OTHR_PCT=mean(BENE_RACE_OTHR_PCT),
# BENE_RACE_UNK_PCT=mean(BENE_RACE_UNK_PCT))
stateutilize %>%
group_by(YEAR) %>%
summarise(n=n())
summary(stateutilize)
## STATE YEAR utrate BENE_AVG_AGE
## Length:312 Min. :2018 Min. :11.06 Min. :71.33
## Class :character 1st Qu.:2019 1st Qu.:13.35 1st Qu.:74.99
## Mode :character Median :2020 Median :13.81 Median :75.77
## Mean :2020 Mean :13.84 Mean :75.65
## 3rd Qu.:2022 3rd Qu.:14.46 3rd Qu.:76.44
## Max. :2023 Max. :16.29 Max. :78.52
## BENE_DUAL_PCT BENE_MALE_PCT BENE_RRL_PCT BENE_RACE_WHT_PCT
## Min. : 0.00 Min. :26.33 Min. : 0.000 Min. : 0.2857
## 1st Qu.:13.32 1st Qu.:45.10 1st Qu.: 1.448 1st Qu.:75.4157
## Median :17.34 Median :47.63 Median : 7.260 Median :84.7752
## Mean :18.10 Mean :47.71 Mean : 9.108 Mean :80.3155
## 3rd Qu.:22.37 3rd Qu.:50.22 3rd Qu.:14.000 3rd Qu.:90.4821
## Max. :38.00 Max. :60.50 Max. :40.333 Max. :98.0000
## BENE_RACE_API_PCT BENE_RACE_BLACK_PCT
## Min. : 0.00000 Min. : 0.0000
## 1st Qu.: 0.00000 1st Qu.: 0.2167
## Median : 0.05882 Median : 3.5147
## Mean : 1.59770 Mean : 7.2074
## 3rd Qu.: 0.33621 3rd Qu.:11.2090
## Max. :56.00000 Max. :55.0000
# make it binary based on median
cutoff<-median(stateutilize$utrate)
cutoff
## [1] 13.80912
stateutilize<-stateutilize %>%
mutate(longstay=ifelse(utrate >= cutoff,1,0))
View(stateutilize)
# how many are 1 vs 0?
table(stateutilize$longstay) # note it's 156 for 0 and 1
##
## 0 1
## 156 156
#check for multicollinearity
numstate<-stateutilize %>%select(where(is.numeric))
numstate
# make a new window to see the plot
x11()
#visually see correlations
pairs(numstate, col='dodgerblue')
#see the correlation
round(cor(numstate), 2)
## YEAR utrate BENE_AVG_AGE BENE_DUAL_PCT BENE_MALE_PCT
## YEAR 1.00 0.06 0.34 -0.26 -0.06
## utrate 0.06 1.00 -0.10 0.33 0.17
## BENE_AVG_AGE 0.34 -0.10 1.00 -0.32 -0.49
## BENE_DUAL_PCT -0.26 0.33 -0.32 1.00 0.08
## BENE_MALE_PCT -0.06 0.17 -0.49 0.08 1.00
## BENE_RRL_PCT -0.02 -0.19 -0.14 0.05 0.08
## BENE_RACE_WHT_PCT 0.04 0.09 0.12 0.07 0.29
## BENE_RACE_API_PCT 0.00 -0.20 0.00 -0.13 0.03
## BENE_RACE_BLACK_PCT -0.08 0.22 -0.24 0.36 -0.10
## longstay 0.08 0.77 -0.05 0.26 0.18
## BENE_RRL_PCT BENE_RACE_WHT_PCT BENE_RACE_API_PCT
## YEAR -0.02 0.04 0.00
## utrate -0.19 0.09 -0.20
## BENE_AVG_AGE -0.14 0.12 0.00
## BENE_DUAL_PCT 0.05 0.07 -0.13
## BENE_MALE_PCT 0.08 0.29 0.03
## BENE_RRL_PCT 1.00 0.41 -0.20
## BENE_RACE_WHT_PCT 0.41 1.00 -0.47
## BENE_RACE_API_PCT -0.20 -0.47 1.00
## BENE_RACE_BLACK_PCT -0.25 -0.36 -0.08
## longstay -0.20 -0.02 -0.04
## BENE_RACE_BLACK_PCT longstay
## YEAR -0.08 0.08
## utrate 0.22 0.77
## BENE_AVG_AGE -0.24 -0.05
## BENE_DUAL_PCT 0.36 0.26
## BENE_MALE_PCT -0.10 0.18
## BENE_RRL_PCT -0.25 -0.20
## BENE_RACE_WHT_PCT -0.36 -0.02
## BENE_RACE_API_PCT -0.08 -0.04
## BENE_RACE_BLACK_PCT 1.00 0.19
## longstay 0.19 1.00
#start a log model
#use a stepwise regression to find the most sig. variables
statemodstart<-glm(longstay~1, data=stateutilize, family= binomial)
stepwisestatelog<-step(statemodstart, scope= longstay~BENE_AVG_AGE +BENE_DUAL_PCT+BENE_MALE_PCT+ BENE_RACE_API_PCT+BENE_RACE_BLACK_PCT +BENE_RACE_WHT_PCT+ BENE_RRL_PCT, direction='both' )
## Start: AIC=434.52
## longstay ~ 1
##
## Df Deviance AIC
## + BENE_DUAL_PCT 1 410.95 414.95
## + BENE_RACE_BLACK_PCT 1 419.95 423.95
## + BENE_RRL_PCT 1 420.27 424.27
## + BENE_MALE_PCT 1 422.50 426.50
## <none> 432.52 434.52
## + BENE_AVG_AGE 1 431.77 435.77
## + BENE_RACE_API_PCT 1 431.90 435.90
## + BENE_RACE_WHT_PCT 1 432.35 436.35
##
## Step: AIC=414.95
## longstay ~ BENE_DUAL_PCT
##
## Df Deviance AIC
## + BENE_RRL_PCT 1 395.85 401.85
## + BENE_MALE_PCT 1 401.82 407.82
## + BENE_RACE_BLACK_PCT 1 406.72 412.72
## <none> 410.95 414.95
## + BENE_RACE_WHT_PCT 1 410.44 416.44
## + BENE_AVG_AGE 1 410.49 416.49
## + BENE_RACE_API_PCT 1 410.91 416.91
## - BENE_DUAL_PCT 1 432.52 434.52
##
## Step: AIC=401.85
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT
##
## Df Deviance AIC
## + BENE_MALE_PCT 1 385.28 393.28
## <none> 395.85 401.85
## + BENE_RACE_API_PCT 1 394.96 402.96
## + BENE_RACE_BLACK_PCT 1 394.96 402.96
## + BENE_RACE_WHT_PCT 1 395.03 403.03
## + BENE_AVG_AGE 1 395.83 403.83
## - BENE_RRL_PCT 1 410.95 414.95
## - BENE_DUAL_PCT 1 420.27 424.27
##
## Step: AIC=393.28
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT
##
## Df Deviance AIC
## + BENE_AVG_AGE 1 379.60 389.60
## + BENE_RACE_BLACK_PCT 1 382.84 392.84
## <none> 385.28 393.28
## + BENE_RACE_API_PCT 1 384.05 394.05
## + BENE_RACE_WHT_PCT 1 385.23 395.23
## - BENE_MALE_PCT 1 395.85 401.85
## - BENE_RRL_PCT 1 401.82 407.82
## - BENE_DUAL_PCT 1 407.71 413.71
##
## Step: AIC=389.6
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE
##
## Df Deviance AIC
## + BENE_RACE_BLACK_PCT 1 374.59 386.59
## <none> 379.60 389.60
## + BENE_RACE_API_PCT 1 378.74 390.74
## + BENE_RACE_WHT_PCT 1 379.12 391.12
## - BENE_AVG_AGE 1 385.28 393.28
## - BENE_RRL_PCT 1 393.41 401.41
## - BENE_MALE_PCT 1 395.83 403.83
## - BENE_DUAL_PCT 1 407.59 415.59
##
## Step: AIC=386.59
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE +
## BENE_RACE_BLACK_PCT
##
## Df Deviance AIC
## <none> 374.59 386.59
## + BENE_RACE_API_PCT 1 374.10 388.10
## + BENE_RACE_WHT_PCT 1 374.59 388.59
## - BENE_RACE_BLACK_PCT 1 379.60 389.60
## - BENE_RRL_PCT 1 381.98 391.98
## - BENE_AVG_AGE 1 382.84 392.84
## - BENE_DUAL_PCT 1 394.29 404.29
## - BENE_MALE_PCT 1 394.89 404.89
summary(stepwisestatelog)
##
## Call:
## glm(formula = longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT +
## BENE_AVG_AGE + BENE_RACE_BLACK_PCT, family = binomial, data = stateutilize)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.36911 12.54050 -3.219 0.00129 **
## BENE_DUAL_PCT 0.09603 0.02330 4.121 3.77e-05 ***
## BENE_RRL_PCT -0.04169 0.01568 -2.659 0.00784 **
## BENE_MALE_PCT 0.17431 0.04211 4.139 3.48e-05 ***
## BENE_AVG_AGE 0.40208 0.14419 2.788 0.00530 **
## BENE_RACE_BLACK_PCT 0.03652 0.01707 2.140 0.03235 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 432.52 on 311 degrees of freedom
## Residual deviance: 374.59 on 306 degrees of freedom
## AIC: 386.59
##
## Number of Fisher Scoring iterations: 4
#do predictions
stateutilize$predicting<-predict(stepwisestatelog, type='response')
summary(stateutilize$predicting)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004917 0.340792 0.482776 0.500000 0.673681 0.945906
#likelihood ratio test
anova(stepwisestatelog, test='Chisq')
anova(stepwisestatelog, test = 'LRT')
#compute an odds ratio from the model
#get coefficents from model
coeff<-coef(stepwisestatelog)
coeff
## (Intercept) BENE_DUAL_PCT BENE_RRL_PCT BENE_MALE_PCT
## -40.36910746 0.09602547 -0.04168960 0.17431340
## BENE_AVG_AGE BENE_RACE_BLACK_PCT
## 0.40208121 0.03652397
#compute odds ratio
odds<- exp(coeff)
odds
## (Intercept) BENE_DUAL_PCT BENE_RRL_PCT BENE_MALE_PCT
## 2.937104e-18 1.100787e+00 9.591675e-01 1.190429e+00
## BENE_AVG_AGE BENE_RACE_BLACK_PCT
## 1.494933e+00 1.037199e+00
#get standard error for coefficient
se<-summary(stepwisestatelog)$coeff[,'Std. Error']
se
## (Intercept) BENE_DUAL_PCT BENE_RRL_PCT BENE_MALE_PCT
## 12.54050373 0.02329999 0.01568041 0.04211036
## BENE_AVG_AGE BENE_RACE_BLACK_PCT
## 0.14419334 0.01706666
#compute CI levels
oddslower<-odds*exp(-1.96*se)
oddsupper<-odds*exp(1.96*se)
#make into df
oddsdf<-data.frame(odds,ci_lower =oddslower, ci_upper=oddsupper)
print(oddsdf)
## odds ci_lower ci_upper
## (Intercept) 2.937104e-18 6.211938e-29 1.388710e-07
## BENE_DUAL_PCT 1.100787e+00 1.051647e+00 1.152223e+00
## BENE_RRL_PCT 9.591675e-01 9.301372e-01 9.891038e-01
## BENE_MALE_PCT 1.190429e+00 1.096120e+00 1.292851e+00
## BENE_AVG_AGE 1.494933e+00 1.126891e+00 1.983177e+00
## BENE_RACE_BLACK_PCT 1.037199e+00 1.003078e+00 1.072481e+00
#final log model
summary(stepwisestatelog)
##
## Call:
## glm(formula = longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT +
## BENE_AVG_AGE + BENE_RACE_BLACK_PCT, family = binomial, data = stateutilize)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -40.36911 12.54050 -3.219 0.00129 **
## BENE_DUAL_PCT 0.09603 0.02330 4.121 3.77e-05 ***
## BENE_RRL_PCT -0.04169 0.01568 -2.659 0.00784 **
## BENE_MALE_PCT 0.17431 0.04211 4.139 3.48e-05 ***
## BENE_AVG_AGE 0.40208 0.14419 2.788 0.00530 **
## BENE_RACE_BLACK_PCT 0.03652 0.01707 2.140 0.03235 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 432.52 on 311 degrees of freedom
## Residual deviance: 374.59 on 306 degrees of freedom
## AIC: 386.59
##
## Number of Fisher Scoring iterations: 4
# plot it
#make a df to see if it counts as high utilization or not
stateplot<-stateutilize %>%
group_by(STATE) %>%
summarise(predicting=mean(predicting),
highutilization=predicting >=0.5)
#plot state, probability and high utilization
ggplot(stateplot, aes(x=reorder(STATE, -predicting), y= predicting, fill=highutilization))+
geom_col(position='identity')+
#coord_flip()+
labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='High Utlization')+
theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
theme(
axis.text.y = element_text(lineheight = 4)
)
#this graph shows us the states and the chances that their medicare patients would have a long stay in the IRF based on the 3 factors.We can see that PR is the actual opposite where it's actually in the negative.
#pROC
#GET fitted probability values
predictions<-fitted.values(stepwisestatelog)
#create ROC curve and plot
roc_curve<-roc(stateutilize$longstay,predictions,
plot=TRUE, col='blue',lwd=2, print.auc=TRUE, main='ROC curve for the Medicare Utilization based on demographics',
ci=TRUE,legacy.axes=TRUE )
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
text(x=.1, y=.2, labels='model1- dual pct, rrl pct, male pct, black pct, avg age' , col='blue', cex=1, font=2)
# do another log regression to see for the better curve
fit2<-glm(longstay ~BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE, data=stateutilize, family=binomial())
predictions2<-fitted.values(fit2)
roc2<-roc(stateutilize$longstay, predictions2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc2, add=TRUE, col='black')
text(
x=1, y=1, labels=paste('AUC= ', round(auc(roc2), 3), 'model2- dual pct, rrl pct, male pct, avg age'), col='black', cex=1, font=2)
#add other models to compare for anova
fit3<-glm(longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT, data=stateutilize, family=binomial())
fitog<-glm(longstay ~ BENE_DUAL_PCT, data=stateutilize, family=binomial)
#anova test for mulitple models
anova(fitog, fit2, stepwisestatelog, test='Chisq')
anova( fit2, stepwisestatelog, test='LRT')
# The difference of the two models is that model two includes race as a predictor. Assuming that the null hypothesis is that the models are equal to another we can see that the p value is 0.025. This confirms that the model containing the race does contribute to the fit of the model.
#####
# optional ggplot certain years
stateplott<-stateutilize %>%
group_by(STATE, YEAR) %>%
summarise(predicting=mean(predicting),
highutilization=predicting >=0.5,
.groups='drop')
stateplot1<-stateplott %>%
filter(YEAR %in% c(2018, 2019, 2020))
ggplot(stateplot1, aes(x=reorder(STATE, predicting), y=predicting, fill=factor(highutilization)))+
geom_col() +
coord_flip()+
facet_wrap(~YEAR)+
labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='high utilization')+
theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
theme(
axis.text.y = element_text(lineheight = 4)
)
#certain years
stateplot2<-stateplott %>%
filter(YEAR %in% c(2021, 2022, 2023))
ggplot(stateplot2, aes(x=reorder(STATE, predicting), y=predicting, fill=factor(highutilization)))+
geom_col() +
coord_flip()+
facet_wrap(~YEAR)+
labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='high utilization')+
theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
theme(
axis.text.y = element_text(lineheight = 4)
)
#--------------------------------------------------------------------------
#NEXT QUESTION
#--------------------------------------------------------------------------
#--------------------------------------------------------------------------
#Do states cluster into groups based on Medicare payment ( a utilization factor)
#and being able to see the change over years
# k means clustering
#make a new df
Cm6 <- m6 %>%
group_by(STATE, YEAR) %>%
summarise(
TOT_SRVC_DAYS = sum(TOT_SRVC_DAYS),
BENE_DSTNCT_CNT = sum(BENE_DSTNCT_CNT),
TOT_MDCR_STDZD_PYMT_AMT = sum(TOT_MDCR_STDZD_PYMT_AMT),
.groups = "drop"
) %>%
mutate(utrate= TOT_SRVC_DAYS / BENE_DSTNCT_CNT)
# check to see if we have 52 rows per yr
Cm6 %>%
group_by(YEAR) %>%
summarise(n = n())
#visualize what the data looks like before and after log
#make a unlog dataset
nologCm6 <- m6 %>%
group_by(STATE, YEAR) %>%
summarise(
TOT_SRVC_DAYS = sum(TOT_SRVC_DAYS),
BENE_DSTNCT_CNT = sum(BENE_DSTNCT_CNT),
TOT_MDCR_STDZD_PYMT_AMT = sum(TOT_MDCR_STDZD_PYMT_AMT),
.groups = "drop"
) %>%
mutate(utrate= TOT_SRVC_DAYS / BENE_DSTNCT_CNT)
# gauge distribution to determine kurtosis/ skewness
prelog<-ggplot(nologCm6, aes(x=as.factor(ceiling(utrate)), y=TOT_MDCR_STDZD_PYMT_AMT))+
geom_boxplot() +
labs(title="Distribution of utilization rate before log transformation", x='utrate', y='Total Medicare Payment') +
theme_minimal()
prelog
# do log for medicare payments
Cm6<- Cm6 %>%
mutate(logmedicare=log(TOT_MDCR_STDZD_PYMT_AMT))
clusterm6 <- Cm6 %>% select(logmedicare, utrate)
#scale data have balance dataset to normalize it
clusterm6scale<-scale(clusterm6)
clusterm6scale<-as.data.frame(clusterm6scale) #it was reading as a list
postlog<-ggplot(clusterm6scale, aes(x=as.factor(ceiling(utrate)), y=logmedicare))+
geom_boxplot() +
labs(title="Distribution of utilization rate after log transformation", x='utrate', y='Total Medicare Payment (Log)') +
theme_minimal()
postlog
#step 1 pick num of clusters
#calcualte wss
set.seed(1997)
wss<-sapply(1:10,function(k){
kmeans(clusterm6scale, centers = k, nstart = 25)$tot.withinss
})
# plot
plot(1:10, wss, type='b', pch=10,
xlab='num of cluster')
#here we find its 3
km.out<-kmeans(clusterm6scale, centers = 3, nstart=100 )
print(km.out)
## K-means clustering with 3 clusters of sizes 155, 70, 87
##
## Cluster means:
## logmedicare utrate
## 1 0.8225746 0.2697857
## 2 -1.0331925 0.7984398
## 3 -0.6342021 -1.1230755
##
## Clustering vector:
## [1] 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 3 3 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 3 3
## [75] 3 3 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 3 3 2 2 3 3 1 1 1 1 1 1 3 2 2 2 2 2 1 1 1 1
## [149] 1 1 3 3 1 1 3 3 3 3 3 3 3 3 1 1 1 1 1 1 3 3 3 3 3 3 2 2 2 2 2 2 1 3 1 3 1
## [186] 3 1 1 1 1 1 1 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 1
## [223] 3 3 3 3 3 3 1 1 1 1 1 1 3 3 3 3 3 3 2 3 2 2 2 3 1 1 1 1 1 1 2 2 2 2 2 2 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 3 2 2 2 2 2 2 3 1 2 2 2 3 3
## [297] 3 3 3 3 1 1 1 1 1 1 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 108.08733 52.36677 81.31190
## (between_SS / total_SS = 61.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#visualize the cluster
km.cluster<-km.out$cluster
clusterm6scale$STATE<-Cm6$STATE
clusterm6scale$YEAR<-Cm6$YEAR
clusterm6scale$TOT_MDCR_STDZD_PYMT_AMT<-Cm6$TOT_MDCR_STDZD_PYMT_AMT
clusterm6scale$km.cluster<-km.out$cluster
#MAKE THE CLUSTER A FACTOR FIRST
is.factor(clusterm6scale$km.cluster)
## [1] FALSE
#make factor
clusterm6scale$km.cluster<-as.factor(clusterm6scale$km.cluster)
#check again
is.factor(clusterm6scale$km.cluster)
## [1] TRUE
#visualize
# states we want to emphasize from our log regression
highlightstates<- clusterm6scale %>%
filter(STATE %in% c('CA', 'MD', 'PR', 'MA', 'AK', 'DC'))
plot1<- ggplot(clusterm6scale, aes(x=logmedicare,y=utrate, group=STATE, color=km.cluster))+geom_point()+
facet_wrap(~YEAR)+
geom_point()+
labs(
x='Log of Tot. Medicare Payment',
y='Utilization Rate',
title = 'Comparing the Medicare payments over time and its effect on utilization',
color="Groups"
)
plot1+ geom_text(data = highlightstates, aes(label=STATE), vjust=-1.0, size=4)
# what does this mean?
km.out$centers
## logmedicare utrate
## 1 0.8225746 0.2697857
## 2 -1.0331925 0.7984398
## 3 -0.6342021 -1.1230755
#summary and analysis
summary(clusterm6scale)
## logmedicare utrate STATE YEAR
## Min. :-2.4057 Min. :-3.0876 Length:312 Min. :2018
## 1st Qu.:-0.7646 1st Qu.:-0.5492 Class :character 1st Qu.:2019
## Median : 0.1346 Median :-0.0351 Mode :character Median :2020
## Mean : 0.0000 Mean : 0.0000 Mean :2020
## 3rd Qu.: 0.7494 3rd Qu.: 0.6874 3rd Qu.:2022
## Max. : 2.3417 Max. : 2.7155 Max. :2023
## TOT_MDCR_STDZD_PYMT_AMT km.cluster
## Min. :8.388e+06 1:155
## 1st Qu.:6.448e+07 2: 70
## Median :1.971e+08 3: 87
## Mean :3.281e+08
## 3rd Qu.:4.232e+08
## Max. :3.062e+09
#--------------------------------------------------------------------------
#NEXT QUESTION
#--------------------------------------------------------------------------
# Using linear regression
# Does the average submitted charge predict the average Medicare payment?
# make df
pay6 <- m6 %>%
group_by(STATE, YEAR) %>%
summarise(avg_tot_charge= log(mean(TOT_CHRG_AMT)),
avg_tot_payment=log(mean(TOT_MDCR_PYMT_AMT)),
pptot=mean(TOT_CHRG_AMT),
ppcharge=mean(TOT_MDCR_PYMT_AMT),
.groups = "drop") %>%
mutate(percentpaid=(ppcharge/pptot)*100)
pay6
summary(pay6)
## STATE YEAR avg_tot_charge avg_tot_payment
## Length:312 Min. :2018 Min. :14.64 Min. :14.10
## Class :character 1st Qu.:2019 1st Qu.:16.69 1st Qu.:15.91
## Mode :character Median :2020 Median :17.07 Median :16.31
## Mean :2020 Mean :17.03 Mean :16.28
## 3rd Qu.:2022 3rd Qu.:17.36 3rd Qu.:16.63
## Max. :2023 Max. :18.77 Max. :17.54
## pptot ppcharge percentpaid
## Min. : 2274570 Min. : 1329725 Min. :24.36
## 1st Qu.: 17756813 1st Qu.: 8099371 1st Qu.:42.46
## Median : 26020160 Median :12093224 Median :46.41
## Mean : 29086136 Mean :13823244 Mean :48.17
## 3rd Qu.: 34750104 3rd Qu.:16761856 3rd Qu.:51.36
## Max. :141364177 Max. :41507395 Max. :81.54
# check to see if we have 52 rows per yr
pay6%>%
group_by(YEAR) %>%
summarise(n = n())
# see first few rows
head(pay6, n = 10)
#use correlation
coefficient <- cor.test(pay6$avg_tot_charge, pay6$avg_tot_payment)
coefficient
##
## Pearson's product-moment correlation
##
## data: pay6$avg_tot_charge and pay6$avg_tot_payment
## t = 46.38, df = 310, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9192950 0.9475724
## sample estimates:
## cor
## 0.9349015
coefficient$estimate
## cor
## 0.9349015
pay6logplot<-
ggplot(pay6,aes(y= avg_tot_payment)) +
labs( y='Avg tot payment', title = 'boxplot with a log transformation')+
geom_boxplot()
pay6logplot
# lets see it without log
pay6nolog <- m6 %>%
group_by(STATE, YEAR) %>%
summarise(avg_tot_charge= mean(TOT_CHRG_AMT),
avg_tot_payment=mean(TOT_MDCR_PYMT_AMT),
pptot=mean(TOT_CHRG_AMT),
ppcharge=mean(TOT_MDCR_PYMT_AMT),
.groups = "drop") %>%
mutate(percentpaid=(ppcharge/pptot)*100)
plotpaynolog <- ggplot(pay6nolog, aes(y=avg_tot_payment)) +
labs(
y = 'Avg tot payment',
title = 'Boxplot without a log transformation'
) +
geom_boxplot()
plotpaynolog
# combine the plots
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
(plotpaynolog+pay6logplot) +
plot_annotation(title = 'Boxplot of the submitted and actual payments before and after log transformation')
#do modeling
linmodelpay6<-lm(avg_tot_payment ~avg_tot_charge +factor(STATE) +factor(YEAR), data=pay6)
summary(linmodelpay6)
##
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge + factor(STATE) +
## factor(YEAR), data = pay6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.135833 -0.023402 0.002264 0.024121 0.205041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.253995 0.357021 -0.711 0.4775
## avg_tot_charge 0.941036 0.021441 43.890 < 2e-16 ***
## factor(STATE)AL 0.684404 0.030966 22.102 < 2e-16 ***
## factor(STATE)AR 0.617463 0.030228 20.427 < 2e-16 ***
## factor(STATE)AZ 0.578845 0.030950 18.703 < 2e-16 ***
## factor(STATE)CA 0.190852 0.036427 5.239 3.39e-07 ***
## factor(STATE)CO 0.295189 0.030067 9.818 < 2e-16 ***
## factor(STATE)CT 0.261938 0.028573 9.167 < 2e-16 ***
## factor(STATE)DC 0.409717 0.032522 12.598 < 2e-16 ***
## factor(STATE)DE 0.850261 0.032055 26.525 < 2e-16 ***
## factor(STATE)FL 0.413011 0.038197 10.813 < 2e-16 ***
## factor(STATE)GA 0.432210 0.030320 14.255 < 2e-16 ***
## factor(STATE)HI 0.835261 0.028038 29.791 < 2e-16 ***
## factor(STATE)IA 0.505913 0.028401 17.813 < 2e-16 ***
## factor(STATE)ID 0.572693 0.028148 20.346 < 2e-16 ***
## factor(STATE)IL 0.350671 0.032936 10.647 < 2e-16 ***
## factor(STATE)IN 0.547531 0.029011 18.874 < 2e-16 ***
## factor(STATE)KS 0.542394 0.028417 19.087 < 2e-16 ***
## factor(STATE)KY 0.587538 0.030723 19.124 < 2e-16 ***
## factor(STATE)LA 0.447936 0.028068 15.959 < 2e-16 ***
## factor(STATE)MA 1.009372 0.035078 28.775 < 2e-16 ***
## factor(STATE)MD 0.944077 0.035823 26.354 < 2e-16 ***
## factor(STATE)ME 0.773334 0.028049 27.570 < 2e-16 ***
## factor(STATE)MI 0.447507 0.028290 15.819 < 2e-16 ***
## factor(STATE)MN 0.393847 0.027984 14.074 < 2e-16 ***
## factor(STATE)MO 0.485235 0.029717 16.329 < 2e-16 ***
## factor(STATE)MS 0.514276 0.028839 17.832 < 2e-16 ***
## factor(STATE)MT 0.510638 0.029026 17.593 < 2e-16 ***
## factor(STATE)NC 0.467621 0.030376 15.394 < 2e-16 ***
## factor(STATE)ND 0.530296 0.028133 18.849 < 2e-16 ***
## factor(STATE)NE 0.302900 0.028024 10.809 < 2e-16 ***
## factor(STATE)NH 0.918090 0.032700 28.076 < 2e-16 ***
## factor(STATE)NJ 0.617530 0.041453 14.897 < 2e-16 ***
## factor(STATE)NM 0.459411 0.028493 16.123 < 2e-16 ***
## factor(STATE)NV 0.194835 0.046124 4.224 3.35e-05 ***
## factor(STATE)NY 0.191357 0.033227 5.759 2.43e-08 ***
## factor(STATE)OH 0.442562 0.029194 15.159 < 2e-16 ***
## factor(STATE)OK 0.498833 0.029083 17.152 < 2e-16 ***
## factor(STATE)OR 0.479349 0.029352 16.331 < 2e-16 ***
## factor(STATE)PA 0.428398 0.031513 13.595 < 2e-16 ***
## factor(STATE)PR 0.549030 0.046039 11.925 < 2e-16 ***
## factor(STATE)RI 0.412122 0.030270 13.615 < 2e-16 ***
## factor(STATE)SC 0.661725 0.031500 21.007 < 2e-16 ***
## factor(STATE)SD 0.221467 0.028065 7.891 8.95e-14 ***
## factor(STATE)TN 0.505551 0.029496 17.140 < 2e-16 ***
## factor(STATE)TX 0.471155 0.034422 13.688 < 2e-16 ***
## factor(STATE)UT 0.416135 0.028011 14.856 < 2e-16 ***
## factor(STATE)VA 0.582592 0.031136 18.711 < 2e-16 ***
## factor(STATE)VT 0.434881 0.028652 15.178 < 2e-16 ***
## factor(STATE)WA 0.404686 0.028088 14.408 < 2e-16 ***
## factor(STATE)WI 0.501523 0.028460 17.622 < 2e-16 ***
## factor(STATE)WV 0.855088 0.030572 27.970 < 2e-16 ***
## factor(STATE)WY 0.863529 0.028189 30.634 < 2e-16 ***
## factor(YEAR)2019 -0.013436 0.009545 -1.408 0.1604
## factor(YEAR)2020 -0.016183 0.009526 -1.699 0.0906 .
## factor(YEAR)2021 0.006848 0.009627 0.711 0.4776
## factor(YEAR)2022 -0.002697 0.009637 -0.280 0.7799
## factor(YEAR)2023 -0.011959 0.009982 -1.198 0.2320
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04847 on 254 degrees of freedom
## Multiple R-squared: 0.9945, Adjusted R-squared: 0.9932
## F-statistic: 800 on 57 and 254 DF, p-value: < 2.2e-16
nolinmodelpay6<-lm(avg_tot_payment ~avg_tot_charge, data=pay6)
summary(nolinmodelpay6)
##
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge, data = pay6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64001 -0.10044 -0.02547 0.08212 0.53863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.02702 0.35179 -0.077 0.939
## avg_tot_charge 0.95739 0.02064 46.380 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2092 on 310 degrees of freedom
## Multiple R-squared: 0.874, Adjusted R-squared: 0.8736
## F-statistic: 2151 on 1 and 310 DF, p-value: < 2.2e-16
#split dataset
split=sample.split(pay6$avg_tot_payment, SplitRatio=0.7)
trainingset=subset(pay6, split==TRUE)
testset=subset(pay6, split==FALSE)
SPLITlinmodelpay6<-lm(formula=avg_tot_payment ~avg_tot_charge, data=trainingset)
summary(SPLITlinmodelpay6)
##
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge, data = trainingset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62920 -0.08776 -0.02555 0.07287 0.54179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.22187 0.41587 0.534 0.594
## avg_tot_charge 0.94254 0.02443 38.582 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1995 on 216 degrees of freedom
## Multiple R-squared: 0.8733, Adjusted R-squared: 0.8727
## F-statistic: 1489 on 1 and 216 DF, p-value: < 2.2e-16
#We will split that data to test and train it. Here we did a split of 70/30.This will allow us to see the predict the actual payments based on the requested charges.
ggplot()+
geom_point(aes(x=trainingset$avg_tot_charge, y=trainingset$avg_tot_payment), color='red')+
geom_line(aes(x=trainingset$avg_tot_charge, y=predict(SPLITlinmodelpay6, newdata=trainingset)),
color='black')+
ggtitle('Avg charge amt vs Avg tot payment') +
xlab('avg tot charge') +ylab('avg total payment')
# see who got paid the most and least per year
hi_lo<-pay6 %>%
group_by(YEAR) %>%
filter(percentpaid==min(percentpaid)| percentpaid==max(percentpaid)) %>%
ungroup() %>%
data.frame()
testingplot<-ggplot(pay6, aes(x = avg_tot_charge, y =avg_tot_payment, color= percentpaid
)) +
geom_point() +
geom_smooth(method = "lm", se=TRUE) +
facet_wrap(~YEAR)+
scale_color_viridis_c(option = "magma") +
labs(x= "Avg. tot charge in LOG", y="Avg. tot payment in LOG",
color="% Paid", title = "Avg charge amt vs the Avg tot payment")+
theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
geom_point(data=hi_lo, aes(x=avg_tot_charge, y=avg_tot_payment, color=percentpaid),
shape=21, size=5)+
geom_text(data=hi_lo,aes(x = avg_tot_charge, y = avg_tot_payment, label = STATE),
vjust=-1.0, size=5, color='black')
testingplot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
#try this with the trained data
#redo the plot without years
hi_lotestnoyear<-testset %>%
filter(percentpaid==min(percentpaid)| percentpaid==max(percentpaid)) %>%
data.frame()
hi_lotestnoyear
ggplot() +
geom_point(data = testset,
aes(x = avg_tot_charge,
y = avg_tot_payment,
color = percentpaid)) +
geom_line(data=trainingset, aes(x=avg_tot_charge, y=predict(SPLITlinmodelpay6, newdata=trainingset)), color='black')+
scale_color_viridis_c(option = "magma") +
labs(x= "Avg. tot charge in LOG", y="Avg. tot payment in LOG",
color="% Paid", title = "Avg charge amt vs the Avg tot payment using our test set")+
theme_minimal()+theme(plot.title=element_text(hjust=0.5))
#predict values for test set
predictpay <-predict(SPLITlinmodelpay6, newdata=testset)
#calc RMSE to evaluate performance
actuals <-testset$avg_tot_payment
#remember that it is log
#pg 210 from applied stat
# need to use exp to see how it looks without log
rmseexp<-sqrt(mean((exp(actuals)-exp(predictpay))^2))
print(paste('RMSE:', rmseexp))
## [1] "RMSE: 4885853.65983674"
#to see the log
rmse<-sqrt(mean((actuals-predictpay)^2))
print(paste('RMSE:', rmse))
## [1] "RMSE: 0.230648677637856"
#--------------------------------------------------------------------------
#NEXT QUESTION
#--------------------------------------------------------------------------
# does PT, OT, and SLP therapy along with a specific condition have the ability to help predict the LOS at a state level?
# therapy minutes vs los
#removing the outliers and other extremes ex: 0 minutes
#making a new dataset to go off of
#start with tot_pt_mnts
Q1pt <- quantile(m6$TOT_PT_MNTS, 0.25)
Q3pt <- quantile(m6$TOT_PT_MNTS, 0.75)
IQR_valpt <- IQR(m6$TOT_PT_MNTS)
#ot
Q1ot <- quantile(m6$TOT_OT_MNTS, 0.25)
Q3ot <- quantile(m6$TOT_OT_MNTS, 0.75)
IQR_valot <- IQR(m6$TOT_OT_MNTS)
#slp
Q1slp <- quantile(m6$TOT_SLP_MNTS, 0.25)
Q3slp <- quantile(m6$TOT_SLP_MNTS, 0.75)
IQR_valslp <- IQR(m6$TOT_SLP_MNTS)
m6clean <- m6 %>%
filter(TOT_PT_MNTS > Q1pt - 1.5 * IQR_valpt,
TOT_PT_MNTS < Q3pt + 1.5 * IQR_valpt)%>%
filter(TOT_OT_MNTS > Q1ot - 1.5 * IQR_valot,
TOT_OT_MNTS < Q3ot + 1.5 * IQR_valot)%>%
filter(TOT_SLP_MNTS > Q1slp - 1.5 * IQR_valslp,
TOT_SLP_MNTS < Q3slp + 1.5 * IQR_valslp)%>%
filter(TOT_PT_MNTS >0)%>%
filter(TOT_SLP_MNTS >0) %>%
filter(TOT_OT_MNTS >0)
colnames(m6)
## [1] "YEAR" "YEAR_TYPE"
## [3] "SMRY_CTGRY" "SRVC_CTGRY"
## [5] "PRVDR_ID" "PRVDR_NAME"
## [7] "PRVDR_CITY" "STATE"
## [9] "PRVDR_ZIP" "BENE_DSTNCT_CNT"
## [11] "TOT_EPSD_STAY_CNT" "TOT_SRVC_DAYS"
## [13] "TOT_CHRG_AMT" "TOT_ALOWD_AMT"
## [15] "TOT_MDCR_PYMT_AMT" "TOT_MDCR_STDZD_PYMT_AMT"
## [17] "TOT_OUTLIER_PYMT_AMT" "BENE_DUAL_PCT"
## [19] "BENE_RRL_PCT" "BENE_AVG_AGE"
## [21] "BENE_MALE_PCT" "BENE_FEML_PCT"
## [23] "BENE_RACE_WHT_PCT" "BENE_RACE_BLACK_PCT"
## [25] "BENE_RACE_API_PCT" "BENE_RACE_HSPNC_PCT"
## [27] "BENE_RACE_NATIND_PCT" "BENE_RACE_UNK_PCT"
## [29] "BENE_RACE_OTHR_PCT" "BENE_AVG_RISK_SCRE"
## [31] "BENE_CC_BH_ADHD_OTHCD_V1_PCT" "BENE_CC_BH_ALCOHOL_DRUG_V1_PCT"
## [33] "BENE_CC_BH_ALZ_NONALZDEM_V2_PCT" "BENE_CC_BH_ANXIETY_V1_PCT"
## [35] "BENE_CC_BH_BIPOLAR_V1_PCT" "BENE_CC_BH_DEPRESS_V1_PCT"
## [37] "BENE_CC_BH_MOOD_V2_PCT" "BENE_CC_BH_PD_V1_PCT"
## [39] "BENE_CC_BH_PTSD_V1_PCT" "BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT"
## [41] "BENE_CC_BH_TOBACCO_V1_PCT" "BENE_CC_PH_AFIB_V2_PCT"
## [43] "BENE_CC_PH_ARTHRITIS_V2_PCT" "BENE_CC_PH_ASTHMA_V2_PCT"
## [45] "BENE_CC_PH_CANCER6_V2_PCT" "BENE_CC_PH_CKD_V2_PCT"
## [47] "BENE_CC_PH_COPD_V2_PCT" "BENE_CC_PH_DIABETES_V2_PCT"
## [49] "BENE_CC_PH_HF_NONIHD_V2_PCT" "BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT"
## [51] "BENE_CC_PH_HYPERTENSION_V2_PCT" "BENE_CC_PH_ISCHEMICHEART_V2_PCT"
## [53] "BENE_CC_PH_OSTEOPOROSIS_V2_PCT" "BENE_CC_PH_PARKINSON_V2_PCT"
## [55] "BENE_CC_PH_STROKE_TIA_V2_PCT" "PRMRY_DX_INFCTN_PCT"
## [57] "PRMRY_DX_NEOBLD_PCT" "PRMRY_DX_ENDONUTRMET_PCT"
## [59] "PRMRY_DX_MNTBEHNEUDIS_PCT" "PRMRY_DX_NERVSYSTM_PCT"
## [61] "PRMRY_DX_ENTSYS_PCT" "PRMRY_DX_CIRCSYSTM_PCT"
## [63] "PRMRY_DX_RSPSYSTM_PCT" "PRMRY_DX_DIGSYSTM_PCT"
## [65] "PRMRY_DX_SKNMUSSYSTM_PCT" "PRMRY_DX_GUSYSTM_PCT"
## [67] "PRMRY_DX_PRGPERICONG_PCT" "PRMRY_DX_SXILLDEF_PCT"
## [69] "PRMRY_DX_INJPOIS_PCT" "PRMRY_DX_HLTHSRV_PCT"
## [71] "TOT_PT_MNTS" "INDVDL_PT_MNTS"
## [73] "CNCRNT_GRP_PT_MNTS" "COTRT_PT_MNTS"
## [75] "TOT_OT_MNTS" "INDVDL_OT_MNTS"
## [77] "CNCRNT_GRP_OT_MNTS" "COTRT_OT_MNTS"
## [79] "TOT_SLP_MNTS" "INDVDL_SLP_MNTS"
## [81] "CNCRNT_GRP_SLP_MNTS" "COTRT_SLP_MNTS"
## [83] "los" "ptperep"
## [85] "slpperep" "otperep"
# MAKING A LOG TRANSFORMATION AND NEW DF
logm6<-m6clean %>%
group_by(STATE, YEAR) %>%
summarise(TOT_PT_MNTS=mean(TOT_PT_MNTS),
TOT_SLP_MNTS=mean(TOT_SLP_MNTS),
TOT_OT_MNTS=mean(TOT_OT_MNTS),
los=mean(los),
BENE_CC_BH_ADHD_OTHCD_V1_PCT= mean(BENE_CC_BH_ADHD_OTHCD_V1_PCT),
BENE_CC_BH_ALCOHOL_DRUG_V1_PCT=mean(BENE_CC_BH_ALCOHOL_DRUG_V1_PCT),
BENE_CC_BH_ALZ_NONALZDEM_V2_PCT=mean(BENE_CC_BH_ALZ_NONALZDEM_V2_PCT),
BENE_CC_BH_ANXIETY_V1_PCT=mean(BENE_CC_BH_ANXIETY_V1_PCT),
BENE_CC_BH_BIPOLAR_V1_PCT=mean(BENE_CC_BH_BIPOLAR_V1_PCT),
BENE_CC_BH_PD_V1_PCT=mean(BENE_CC_BH_PD_V1_PCT),
BENE_CC_BH_PTSD_V1_PCT=mean(BENE_CC_BH_PTSD_V1_PCT),
BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT=mean(BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT),
BENE_CC_BH_TOBACCO_V1_PCT=mean(BENE_CC_BH_TOBACCO_V1_PCT),
BENE_CC_PH_AFIB_V2_PCT=mean(BENE_CC_PH_AFIB_V2_PCT),
BENE_CC_PH_ARTHRITIS_V2_PCT=mean(BENE_CC_PH_ARTHRITIS_V2_PCT),
BENE_CC_PH_ASTHMA_V2_PCT=mean(BENE_CC_PH_ASTHMA_V2_PCT),
BENE_CC_PH_CANCER6_V2_PCT=mean(BENE_CC_PH_CANCER6_V2_PCT),
BENE_CC_PH_CKD_V2_PCT=mean(BENE_CC_PH_CKD_V2_PCT),
BENE_CC_PH_COPD_V2_PCT=mean(BENE_CC_PH_COPD_V2_PCT),
BENE_CC_PH_DIABETES_V2_PCT=mean(BENE_CC_PH_DIABETES_V2_PCT),
BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT=mean(BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT),
BENE_CC_PH_HYPERTENSION_V2_PCT=mean(BENE_CC_PH_HYPERTENSION_V2_PCT),
BENE_CC_PH_OSTEOPOROSIS_V2_PCT=mean(BENE_CC_PH_OSTEOPOROSIS_V2_PCT),
BENE_CC_PH_PARKINSON_V2_PCT=mean(BENE_CC_PH_PARKINSON_V2_PCT),
BENE_CC_PH_STROKE_TIA_V2_PCT= mean(BENE_CC_PH_STROKE_TIA_V2_PCT),
.groups='drop') %>%
mutate(ptlog =log(TOT_PT_MNTS),
otlog=log(TOT_OT_MNTS),
slplog=log(TOT_SLP_MNTS),
loglos=log(los))
# make a plot with the ones we want
dfm6clean<- logm6 %>%
group_by(STATE, YEAR) %>%
select(-TOT_PT_MNTS, -TOT_SLP_MNTS, -TOT_OT_MNTS, -los )
summary(dfm6clean)
## STATE YEAR BENE_CC_BH_ADHD_OTHCD_V1_PCT
## Length:307 Min. :2018 Min. :0.0000
## Class :character 1st Qu.:2019 1st Qu.:0.0000
## Mode :character Median :2021 Median :0.1250
## Mean :2021 Mean :0.2130
## 3rd Qu.:2022 3rd Qu.:0.3333
## Max. :2023 Max. :1.6667
## BENE_CC_BH_ALCOHOL_DRUG_V1_PCT BENE_CC_BH_ALZ_NONALZDEM_V2_PCT
## Min. : 0.75 Min. : 9.222
## 1st Qu.:11.42 1st Qu.:18.354
## Median :13.33 Median :22.929
## Mean :13.54 Mean :22.078
## 3rd Qu.:16.00 3rd Qu.:25.675
## Max. :25.00 Max. :33.071
## BENE_CC_BH_ANXIETY_V1_PCT BENE_CC_BH_BIPOLAR_V1_PCT BENE_CC_BH_PD_V1_PCT
## Min. :10.75 Min. : 0.000 Min. :0.0000
## 1st Qu.:36.10 1st Qu.: 3.000 1st Qu.:0.4286
## Median :41.17 Median : 4.625 Median :0.9388
## Mean :40.32 Mean : 4.876 Mean :1.0639
## 3rd Qu.:45.02 3rd Qu.: 6.500 3rd Qu.:1.3944
## Max. :59.33 Max. :17.600 Max. :6.0000
## BENE_CC_BH_PTSD_V1_PCT BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2624 1st Qu.:0.6667
## Median :0.6000 Median :1.5000
## Mean :0.8419 Mean :1.8253
## 3rd Qu.:1.1222 3rd Qu.:2.5000
## Max. :4.0000 Max. :7.5455
## BENE_CC_BH_TOBACCO_V1_PCT BENE_CC_PH_AFIB_V2_PCT BENE_CC_PH_ARTHRITIS_V2_PCT
## Min. : 0.7778 Min. : 9.556 Min. :40.67
## 1st Qu.:15.5861 1st Qu.:35.299 1st Qu.:61.12
## Median :18.7436 Median :38.750 Median :66.27
## Mean :18.5880 Mean :37.623 Mean :64.88
## 3rd Qu.:21.9066 3rd Qu.:41.099 3rd Qu.:69.42
## Max. :31.0000 Max. :47.115 Max. :76.00
## BENE_CC_PH_ASTHMA_V2_PCT BENE_CC_PH_CANCER6_V2_PCT BENE_CC_PH_CKD_V2_PCT
## Min. : 2.25 Min. : 5.444 Min. :17.50
## 1st Qu.:10.43 1st Qu.:16.500 1st Qu.:38.76
## Median :12.56 Median :18.556 Median :44.00
## Mean :12.29 Mean :18.203 Mean :42.60
## 3rd Qu.:14.12 3rd Qu.:20.246 3rd Qu.:47.59
## Max. :21.00 Max. :24.400 Max. :54.86
## BENE_CC_PH_COPD_V2_PCT BENE_CC_PH_DIABETES_V2_PCT
## Min. : 8.625 Min. :24.83
## 1st Qu.:27.354 1st Qu.:42.39
## Median :32.300 Median :47.00
## Mean :31.995 Mean :46.44
## 3rd Qu.:37.578 3rd Qu.:51.37
## Max. :51.000 Max. :66.29
## BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT BENE_CC_PH_HYPERTENSION_V2_PCT
## Min. :32.20 Min. : 0.00
## 1st Qu.:69.02 1st Qu.:35.34
## Median :74.38 Median :45.74
## Mean :73.65 Mean :49.69
## 3rd Qu.:79.60 3rd Qu.:63.60
## Max. :95.00 Max. :96.00
## BENE_CC_PH_OSTEOPOROSIS_V2_PCT BENE_CC_PH_PARKINSON_V2_PCT
## Min. : 9.00 Min. :0.000
## 1st Qu.:18.06 1st Qu.:2.367
## Median :21.00 Median :3.600
## Mean :20.87 Mean :3.604
## 3rd Qu.:23.95 3rd Qu.:4.787
## Max. :34.78 Max. :8.333
## BENE_CC_PH_STROKE_TIA_V2_PCT ptlog otlog slplog
## Min. :24.33 Min. :10.55 Min. :10.53 Min. : 9.343
## 1st Qu.:41.19 1st Qu.:11.76 1st Qu.:11.71 1st Qu.:10.467
## Median :45.39 Median :12.01 Median :12.00 Median :10.647
## Mean :45.53 Mean :12.00 Mean :11.98 Mean :10.655
## 3rd Qu.:49.32 3rd Qu.:12.25 3rd Qu.:12.25 3rd Qu.:10.856
## Max. :64.88 Max. :13.15 Max. :12.98 Max. :11.597
## loglos
## Min. :2.313
## 1st Qu.:2.501
## Median :2.539
## Mean :2.539
## 3rd Qu.:2.578
## Max. :2.753
#make state factor
dfm6clean$STATE <-as.factor(dfm6clean$STATE)
#high or low correlation
cor(logm6[, c("ptlog", "otlog", "slplog")], method = "pearson")
## ptlog otlog slplog
## ptlog 1.0000000 0.9890474 0.7584542
## otlog 0.9890474 1.0000000 0.7523555
## slplog 0.7584542 0.7523555 1.0000000
benecor<-cor(m6[,c('BENE_CC_BH_ADHD_OTHCD_V1_PCT', 'BENE_CC_BH_ALCOHOL_DRUG_V1_PCT',
'BENE_CC_BH_ALZ_NONALZDEM_V2_PCT', 'BENE_CC_BH_ANXIETY_V1_PCT', 'BENE_CC_BH_BIPOLAR_V1_PCT',
'BENE_CC_BH_DEPRESS_V1_PCT','BENE_CC_BH_MOOD_V2_PCT', 'BENE_CC_BH_PD_V1_PCT',
'BENE_CC_BH_PTSD_V1_PCT', 'BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT', 'BENE_CC_BH_TOBACCO_V1_PCT',
'BENE_CC_PH_AFIB_V2_PCT', 'BENE_CC_PH_ARTHRITIS_V2_PCT',
'BENE_CC_PH_ASTHMA_V2_PCT', 'BENE_CC_PH_CANCER6_V2_PCT', 'BENE_CC_PH_CKD_V2_PCT',
'BENE_CC_PH_COPD_V2_PCT', 'BENE_CC_PH_DIABETES_V2_PCT', 'BENE_CC_PH_HF_NONIHD_V2_PCT',
'BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT', 'BENE_CC_PH_HYPERTENSION_V2_PCT',
'BENE_CC_PH_ISCHEMICHEART_V2_PCT', 'BENE_CC_PH_OSTEOPOROSIS_V2_PCT',
'BENE_CC_PH_PARKINSON_V2_PCT', 'BENE_CC_PH_STROKE_TIA_V2_PCT'
)], method='pearson')
#see correlation between the bene
threshold <- 0.6
which(abs(benecor) > threshold & abs(benecor) < 1, arr.ind = TRUE)
## row col
## BENE_CC_BH_DEPRESS_V1_PCT 6 4
## BENE_CC_BH_MOOD_V2_PCT 7 4
## BENE_CC_BH_ANXIETY_V1_PCT 4 6
## BENE_CC_BH_MOOD_V2_PCT 7 6
## BENE_CC_BH_ANXIETY_V1_PCT 4 7
## BENE_CC_BH_DEPRESS_V1_PCT 6 7
## BENE_CC_PH_HF_NONIHD_V2_PCT 19 12
## BENE_CC_PH_ISCHEMICHEART_V2_PCT 22 12
## BENE_CC_PH_DIABETES_V2_PCT 18 16
## BENE_CC_PH_HF_NONIHD_V2_PCT 19 16
## BENE_CC_PH_ISCHEMICHEART_V2_PCT 22 16
## BENE_CC_PH_HF_NONIHD_V2_PCT 19 17
## BENE_CC_PH_ISCHEMICHEART_V2_PCT 22 17
## BENE_CC_PH_CKD_V2_PCT 16 18
## BENE_CC_PH_HF_NONIHD_V2_PCT 19 18
## BENE_CC_PH_AFIB_V2_PCT 12 19
## BENE_CC_PH_CKD_V2_PCT 16 19
## BENE_CC_PH_COPD_V2_PCT 17 19
## BENE_CC_PH_DIABETES_V2_PCT 18 19
## BENE_CC_PH_ISCHEMICHEART_V2_PCT 22 19
## BENE_CC_PH_AFIB_V2_PCT 12 22
## BENE_CC_PH_CKD_V2_PCT 16 22
## BENE_CC_PH_COPD_V2_PCT 17 22
## BENE_CC_PH_HF_NONIHD_V2_PCT 19 22
#doing pairs to understand relationship b/w variables
#use panel to add regression line
pairs(logm6[, c('los', 'ptlog','otlog','slplog')],
panel=function(x,y){points(x,y)
abline(lm(y~x), col='red')},
main= "pair plots of the LOS(length of stay) and the therapies( PT, OT, SLP")
ggpairs(logm6[, c('los', 'ptlog','otlog','slplog')],
panel=function(x,y){points(x,y)
abline(lm(y~x), col='red')},
main= "pair plots of the LOS(length of stay) and the therapies( PT, OT, SLP")
## Warning: The `...` argument of `ggpais()` is deprecated as of GGally 2.3.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#
#because of high correlation we will connect the therapies
dfm6clean$alltherapy<- dfm6clean$ptlog + dfm6clean$slplog + dfm6clean$otlog
# check to see if we have about 52 rows per yr
dfm6clean %>%
group_by(YEAR) %>%
summarise(n = n())
# make multiple linear regression for the 3 therapies w/ log transformation
nulltherapylog<- lm(loglos ~1, data=dfm6clean)
therapylogstep<- step(nulltherapylog, scope= loglos ~ YEAR +
BENE_CC_BH_ALZ_NONALZDEM_V2_PCT+
BENE_CC_PH_CANCER6_V2_PCT+
BENE_CC_PH_OSTEOPOROSIS_V2_PCT+
BENE_CC_PH_STROKE_TIA_V2_PCT+ alltherapy, direction='both')
## Start: AIC=-1681.53
## loglos ~ 1
##
## Df Sum of Sq RSS AIC
## + BENE_CC_PH_STROKE_TIA_V2_PCT 1 0.171642 1.1035 -1723.9
## + BENE_CC_PH_OSTEOPOROSIS_V2_PCT 1 0.126345 1.1488 -1711.6
## + alltherapy 1 0.083241 1.1919 -1700.2
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT 1 0.008750 1.2664 -1681.6
## <none> 1.2751 -1681.5
## + BENE_CC_PH_CANCER6_V2_PCT 1 0.003835 1.2713 -1680.5
## + YEAR 1 0.001358 1.2738 -1679.9
##
## Step: AIC=-1723.91
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT
##
## Df Sum of Sq RSS AIC
## + BENE_CC_PH_OSTEOPOROSIS_V2_PCT 1 0.045486 1.0580 -1734.8
## + alltherapy 1 0.038800 1.0647 -1732.9
## + YEAR 1 0.020177 1.0833 -1727.6
## + BENE_CC_PH_CANCER6_V2_PCT 1 0.015689 1.0878 -1726.3
## <none> 1.1035 -1723.9
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT 1 0.002225 1.1013 -1722.5
## - BENE_CC_PH_STROKE_TIA_V2_PCT 1 0.171642 1.2751 -1681.5
##
## Step: AIC=-1734.84
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT
##
## Df Sum of Sq RSS AIC
## + YEAR 1 0.025002 1.0330 -1740.2
## + alltherapy 1 0.015857 1.0421 -1737.5
## <none> 1.0580 -1734.8
## + BENE_CC_PH_CANCER6_V2_PCT 1 0.001992 1.0560 -1733.4
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT 1 0.000001 1.0580 -1732.8
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT 1 0.045486 1.1035 -1723.9
## - BENE_CC_PH_STROKE_TIA_V2_PCT 1 0.090783 1.1488 -1711.6
##
## Step: AIC=-1740.18
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT +
## YEAR
##
## Df Sum of Sq RSS AIC
## + alltherapy 1 0.009114 1.0239 -1740.9
## <none> 1.0330 -1740.2
## + BENE_CC_PH_CANCER6_V2_PCT 1 0.005336 1.0277 -1739.8
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT 1 0.000193 1.0328 -1738.2
## - YEAR 1 0.025002 1.0580 -1734.8
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT 1 0.050311 1.0833 -1727.6
## - BENE_CC_PH_STROKE_TIA_V2_PCT 1 0.106850 1.1398 -1712.0
##
## Step: AIC=-1740.9
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT +
## YEAR + alltherapy
##
## Df Sum of Sq RSS AIC
## <none> 1.0239 -1740.9
## - alltherapy 1 0.009114 1.0330 -1740.2
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT 1 0.002357 1.0215 -1739.6
## + BENE_CC_PH_CANCER6_V2_PCT 1 0.000718 1.0232 -1739.1
## - YEAR 1 0.018259 1.0421 -1737.5
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT 1 0.028355 1.0522 -1734.5
## - BENE_CC_PH_STROKE_TIA_V2_PCT 1 0.097496 1.1214 -1715.0
summary(therapylogstep)
##
## Call:
## lm(formula = loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT +
## YEAR + alltherapy, data = dfm6clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20366 -0.03648 -0.00334 0.03479 0.19471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.9689244 4.1738755 -1.670 0.09602 .
## BENE_CC_PH_STROKE_TIA_V2_PCT 0.0028763 0.0005364 5.363 1.63e-07 ***
## BENE_CC_PH_OSTEOPOROSIS_V2_PCT -0.0025195 0.0008712 -2.892 0.00411 **
## YEAR 0.0047616 0.0020518 2.321 0.02097 *
## alltherapy -0.0055173 0.0033651 -1.640 0.10213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05823 on 302 degrees of freedom
## Multiple R-squared: 0.197, Adjusted R-squared: 0.1864
## F-statistic: 18.53 on 4 and 302 DF, p-value: 1.25e-13
#alternative model w no therapy
alternativemodel<- lm(loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT +
YEAR, data=dfm6clean )
anova(alternativemodel,therapylogstep , test='Chisq')
#check multicollinearity
vif(therapylogstep)
## BENE_CC_PH_STROKE_TIA_V2_PCT BENE_CC_PH_OSTEOPOROSIS_V2_PCT
## 1.239843 1.394908
## YEAR alltherapy
## 1.115964 1.287262
plot(therapylogstep)
library(performance)
check_model(therapylogstep)
#plot it using predict to predict the los vs actual los
dfm6clean$predict<-predict(therapylogstep)
ggplot(dfm6clean, aes(predict, loglos))+
facet_wrap(~YEAR)+
geom_point()+
geom_smooth(method='lm', color='blue')+
labs(
x='Predicted LOS',
y='Actual LOS',
title = 'Comparing the predicted and actual LOS'
) +theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#plot it using predict to predict the los for the therapies
dfm6clean$predict<-predict(therapylogstep)
ggplot(dfm6clean, aes(alltherapy, predict))+
facet_wrap(~YEAR)+
geom_point()+
geom_smooth(method='lm', color='blue')+
labs(
x=' therapy',
y='pred LOS',
title = 'Comparing the predicted LOS with the therapy + condition'
) +theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#--------------------------------------------