R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load all our libraries
library(performance)  # check model assumptions visually
## Warning: package 'performance' was built under R version 4.5.3
library(data.table)
library(ggplot2)
library(ggeffects)
library(GGally)
#library(ggrepel)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(scales)
library(ggthemes)
library(RColorBrewer)
library(see)
## Warning: package 'see' was built under R version 4.5.3
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
#do a prediction
library(caTools)
## Warning: package 'caTools' was built under R version 4.5.3
library(rmarkdown)
#read dataset
m23<- read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Desktop/medicare2023.csv", header= TRUE)
m22<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare22/m22.csv",header=TRUE)
m21<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare21/m21.csv",header=TRUE)
m20<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare20/m20.csv",header=TRUE)
m19<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare19/m19.csv",header=TRUE)
m18<-read.csv("//apporto.com/dfs/LOYOLA/Users/jtong_loyola/Downloads/medicare18/m18.csv",header=TRUE)
#drop tot_Asmt_Cnt that is found in m19 and m18 so all columns match up
m19 <- subset(m19, select = -TOT_ASMT_CNT)
m18 <- subset(m18, select = -TOT_ASMT_CNT)
#drop national total for the years
m23<-m23[-1,]
m22<-m22[-1,]
m21<-m21[-1,]
m20<-m20[-1,]
m19<-m19[-1,]
m18<-m18[-1,]
#now combine the years together
m6<-rbind(m23, m22, m21, m20, m19, m18)
View(m6)
#convert over to integers
m6<-m6%>%mutate(across(c(COTRT_SLP_MNTS, COTRT_OT_MNTS,TOT_SLP_MNTS,
                         INDVDL_SLP_MNTS,CNCRNT_GRP_SLP_MNTS, CNCRNT_GRP_PT_MNTS,
                         COTRT_PT_MNTS, TOT_OT_MNTS, INDVDL_OT_MNTS, CNCRNT_GRP_OT_MNTS,
                         PRMRY_DX_SXILLDEF_PCT, PRMRY_DX_INJPOIS_PCT, PRMRY_DX_HLTHSRV_PCT,
                         TOT_PT_MNTS, INDVDL_PT_MNTS, PRMRY_DX_DIGSYSTM_PCT,
                         PRMRY_DX_SKNMUSSYSTM_PCT, PRMRY_DX_GUSYSTM_PCT, PRMRY_DX_PRGPERICONG_PCT,
                         PRMRY_DX_NERVSYSTM_PCT,PRMRY_DX_ENTSYS_PCT, PRMRY_DX_CIRCSYSTM_PCT,
                         PRMRY_DX_RSPSYSTM_PCT, PRMRY_DX_INFCTN_PCT,
                         PRMRY_DX_NEOBLD_PCT, PRMRY_DX_ENDONUTRMET_PCT,
                         PRMRY_DX_MNTBEHNEUDIS_PCT, BENE_CC_PH_OSTEOPOROSIS_V2_PCT,
                         BENE_CC_PH_PARKINSON_V2_PCT, BENE_CC_PH_STROKE_TIA_V2_PCT,
                         BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT, BENE_CC_PH_HYPERTENSION_V2_PCT,
                         BENE_CC_PH_ISCHEMICHEART_V2_PCT,
                         BENE_CC_PH_CKD_V2_PCT, BENE_CC_PH_COPD_V2_PCT, BENE_CC_PH_DIABETES_V2_PCT,
                         BENE_CC_PH_HF_NONIHD_V2_PCT, BENE_CC_PH_AFIB_V2_PCT,
                         BENE_CC_PH_ARTHRITIS_V2_PCT, BENE_CC_PH_ASTHMA_V2_PCT, BENE_CC_PH_CANCER6_V2_PCT,
                         BENE_CC_BH_PD_V1_PCT, BENE_CC_BH_PTSD_V1_PCT, BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT,
                         BENE_CC_BH_TOBACCO_V1_PCT,BENE_CC_BH_ANXIETY_V1_PCT,
                         BENE_CC_BH_BIPOLAR_V1_PCT, BENE_CC_BH_DEPRESS_V1_PCT, BENE_CC_BH_MOOD_V2_PCT,
                         BENE_CC_BH_ADHD_OTHCD_V1_PCT, BENE_CC_BH_ALCOHOL_DRUG_V1_PCT,
                         BENE_CC_BH_ALZ_NONALZDEM_V2_PCT,BENE_RACE_HSPNC_PCT,
                         BENE_RACE_NATIND_PCT, BENE_RACE_UNK_PCT, BENE_RACE_OTHR_PCT, BENE_MALE_PCT,
                         BENE_FEML_PCT, BENE_RACE_WHT_PCT,BENE_RACE_BLACK_PCT, BENE_RACE_API_PCT,
                         BENE_DUAL_PCT, BENE_RRL_PCT), as.numeric))
## Warning: There were 63 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 62 remaining warnings.
#add columns
#length of stay
los<- m6$TOT_SRVC_DAYS/ m6$TOT_EPSD_STAY_CNT
m6$los<-los
#PT PER EPISODE
ptperep<-m6$TOT_PT_MNTS/m6$TOT_EPSD_STAY_CNT
m6$ptperep<-ptperep
#SLP per episode
slpperep<-m6$TOT_SLP_MNTS/m6$TOT_EPSD_STAY_CNT
m6$slpperep<-slpperep
#OT per episode
otperep<-m6$TOT_OT_MNTS/m6$TOT_EPSD_STAY_CNT
m6$otperep<-otperep
#replace na w 0
m6<- m6%>% replace(is.na(.),0)
summary(m6)
##       YEAR       YEAR_TYPE          SMRY_CTGRY         SRVC_CTGRY       
##  Min.   :2018   Length:7022        Length:7022        Length:7022       
##  1st Qu.:2019   Class :character   Class :character   Class :character  
##  Median :2021   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2021                                                           
##  3rd Qu.:2022                                                           
##  Max.   :2023                                                           
##    PRVDR_ID          PRVDR_NAME         PRVDR_CITY           STATE          
##  Length:7022        Length:7022        Length:7022        Length:7022       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   PRVDR_ZIP         BENE_DSTNCT_CNT   TOT_EPSD_STAY_CNT TOT_SRVC_DAYS   
##  Length:7022        Min.   :   11.0   Min.   :   11     Min.   :    86  
##  Class :character   1st Qu.:  121.0   1st Qu.:  128     1st Qu.:  1610  
##  Mode  :character   Median :  210.0   Median :  224     Median :  2847  
##                     Mean   :  596.7   Mean   :  665     Mean   :  8387  
##                     3rd Qu.:  467.0   3rd Qu.:  514     3rd Qu.:  6424  
##                     Max.   :54145.0   Max.   :64561     Max.   :793772  
##   TOT_CHRG_AMT       TOT_ALOWD_AMT       TOT_MDCR_PYMT_AMT  
##  Min.   :1.480e+05   Min.   :1.583e+05   Min.   :1.556e+05  
##  1st Qu.:6.580e+06   1st Qu.:2.893e+06   1st Qu.:2.854e+06  
##  Median :1.302e+07   Median :5.202e+06   Median :5.126e+06  
##  Mean   :3.399e+07   Mean   :1.510e+07   Mean   :1.484e+07  
##  3rd Qu.:2.528e+07   3rd Qu.:1.161e+07   3rd Qu.:1.141e+07  
##  Max.   :3.413e+09   Max.   :1.517e+09   Max.   :1.481e+09  
##  TOT_MDCR_STDZD_PYMT_AMT TOT_OUTLIER_PYMT_AMT BENE_DUAL_PCT    BENE_RRL_PCT    
##  Min.   :1.600e+05       Min.   :       0     Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:2.855e+06       1st Qu.:   23184     1st Qu.:12.00   1st Qu.:  0.000  
##  Median :5.007e+06       Median :   89723     Median :18.00   Median :  0.000  
##  Mean   :1.458e+07       Mean   :  480183     Mean   :19.54   Mean   :  7.228  
##  3rd Qu.:1.119e+07       3rd Qu.:  281380     3rd Qu.:24.00   3rd Qu.: 11.000  
##  Max.   :1.531e+09       Max.   :52038264     Max.   :92.00   Max.   :100.000  
##   BENE_AVG_AGE   BENE_MALE_PCT   BENE_FEML_PCT   BENE_RACE_WHT_PCT
##  Min.   :58.00   Min.   : 0.00   Min.   : 0.00   Min.   :  0.00   
##  1st Qu.:75.00   1st Qu.:43.00   1st Qu.:48.00   1st Qu.: 74.00   
##  Median :76.00   Median :47.00   Median :53.00   Median : 85.00   
##  Mean   :75.85   Mean   :46.82   Mean   :51.62   Mean   : 79.48   
##  3rd Qu.:77.00   3rd Qu.:52.00   3rd Qu.:57.00   3rd Qu.: 92.00   
##  Max.   :85.00   Max.   :76.00   Max.   :73.00   Max.   :100.00   
##  BENE_RACE_BLACK_PCT BENE_RACE_API_PCT BENE_RACE_HSPNC_PCT BENE_RACE_NATIND_PCT
##  Min.   : 0.000      Min.   : 0.000    Min.   :  0.000     Min.   : 0.0000     
##  1st Qu.: 0.000      1st Qu.: 0.000    1st Qu.:  0.000     1st Qu.: 0.0000     
##  Median : 3.000      Median : 0.000    Median :  0.000     Median : 0.0000     
##  Mean   : 8.495      Mean   : 1.014    Mean   :  4.806     Mean   : 0.3703     
##  3rd Qu.:13.000      3rd Qu.: 0.000    3rd Qu.:  4.000     3rd Qu.: 0.0000     
##  Max.   :97.000      Max.   :77.000    Max.   :100.000     Max.   :52.0000     
##  BENE_RACE_UNK_PCT BENE_RACE_OTHR_PCT BENE_AVG_RISK_SCRE
##  Min.   : 0.0000   Min.   : 0.00000   Min.   : 1.090    
##  1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 2.170    
##  Median : 0.0000   Median : 0.00000   Median : 2.450    
##  Mean   : 0.1907   Mean   : 0.06266   Mean   : 2.515    
##  3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.: 2.750    
##  Max.   :11.0000   Max.   :15.00000   Max.   :98.000    
##  BENE_CC_BH_ADHD_OTHCD_V1_PCT BENE_CC_BH_ALCOHOL_DRUG_V1_PCT
##  Min.   : 0.0000              Min.   : 0.00                 
##  1st Qu.: 0.0000              1st Qu.:11.00                 
##  Median : 0.0000              Median :14.00                 
##  Mean   : 0.3948              Mean   :13.76                 
##  3rd Qu.: 0.0000              3rd Qu.:18.00                 
##  Max.   :30.0000              Max.   :53.00                 
##  BENE_CC_BH_ALZ_NONALZDEM_V2_PCT BENE_CC_BH_ANXIETY_V1_PCT
##  Min.   : 0                      Min.   : 0.00            
##  1st Qu.:20                      1st Qu.:38.00            
##  Median :25                      Median :43.00            
##  Mean   :25                      Mean   :42.59            
##  3rd Qu.:30                      3rd Qu.:49.00            
##  Max.   :75                      Max.   :96.00            
##  BENE_CC_BH_BIPOLAR_V1_PCT BENE_CC_BH_DEPRESS_V1_PCT BENE_CC_BH_MOOD_V2_PCT
##  Min.   : 0.000            Min.   : 0.00             Min.   : 0.00         
##  1st Qu.: 0.000            1st Qu.:41.00             1st Qu.:45.00         
##  Median : 6.000            Median :46.00             Median :50.00         
##  Mean   : 5.833            Mean   :45.17             Mean   :49.63         
##  3rd Qu.: 9.000            3rd Qu.:51.00             3rd Qu.:55.00         
##  Max.   :69.000            Max.   :94.00             Max.   :95.00         
##  BENE_CC_BH_PD_V1_PCT BENE_CC_BH_PTSD_V1_PCT BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT
##  Min.   : 0.000       Min.   : 0.0000        Min.   : 0.000                 
##  1st Qu.: 0.000       1st Qu.: 0.0000        1st Qu.: 0.000                 
##  Median : 0.000       Median : 0.0000        Median : 0.000                 
##  Mean   : 1.289       Mean   : 0.8498        Mean   : 2.857                 
##  3rd Qu.: 3.000       3rd Qu.: 2.0000        3rd Qu.: 5.000                 
##  Max.   :36.000       Max.   :18.0000        Max.   :76.000                 
##  BENE_CC_BH_TOBACCO_V1_PCT BENE_CC_PH_AFIB_V2_PCT BENE_CC_PH_ARTHRITIS_V2_PCT
##  Min.   : 0.00             Min.   : 0.00          Min.   : 0.00              
##  1st Qu.:15.00             1st Qu.:36.00          1st Qu.:64.00              
##  Median :19.00             Median :40.00          Median :70.00              
##  Mean   :18.82             Mean   :39.21          Mean   :67.31              
##  3rd Qu.:23.00             3rd Qu.:44.00          3rd Qu.:75.00              
##  Max.   :78.00             Max.   :63.00          Max.   :96.00              
##  BENE_CC_PH_ASTHMA_V2_PCT BENE_CC_PH_CANCER6_V2_PCT BENE_CC_PH_CKD_V2_PCT
##  Min.   : 0.00            Min.   : 0.00             Min.   : 0.00        
##  1st Qu.:11.00            1st Qu.:17.00             1st Qu.:41.00        
##  Median :14.00            Median :20.00             Median :47.00        
##  Mean   :12.77            Mean   :18.82             Mean   :46.02        
##  3rd Qu.:16.00            3rd Qu.:22.00             3rd Qu.:53.00        
##  Max.   :42.00            Max.   :39.00             Max.   :85.00        
##  BENE_CC_PH_COPD_V2_PCT BENE_CC_PH_DIABETES_V2_PCT BENE_CC_PH_HF_NONIHD_V2_PCT
##  Min.   : 0.00          Min.   : 0.00              Min.   : 0.00              
##  1st Qu.:29.00          1st Qu.:44.00              1st Qu.:38.00              
##  Median :35.00          Median :49.00              Median :44.00              
##  Mean   :34.67          Mean   :48.93              Mean   :43.26              
##  3rd Qu.:42.00          3rd Qu.:54.00              3rd Qu.:50.00              
##  Max.   :73.00          Max.   :86.00              Max.   :84.00              
##  BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT BENE_CC_PH_HYPERTENSION_V2_PCT
##  Min.   :  0                      Min.   :  0.00                
##  1st Qu.: 80                      1st Qu.:  0.00                
##  Median : 85                      Median : 86.00                
##  Mean   : 74                      Mean   : 48.21                
##  3rd Qu.: 88                      3rd Qu.: 95.00                
##  Max.   :100                      Max.   :100.00                
##  BENE_CC_PH_ISCHEMICHEART_V2_PCT BENE_CC_PH_OSTEOPOROSIS_V2_PCT
##  Min.   : 0.00                   Min.   : 0.00                 
##  1st Qu.:46.00                   1st Qu.:18.00                 
##  Median :53.00                   Median :22.00                 
##  Mean   :51.61                   Mean   :21.78                 
##  3rd Qu.:59.00                   3rd Qu.:26.00                 
##  Max.   :95.00                   Max.   :76.00                 
##  BENE_CC_PH_PARKINSON_V2_PCT BENE_CC_PH_STROKE_TIA_V2_PCT PRMRY_DX_INFCTN_PCT
##  Min.   : 0.000              Min.   : 0.00                Min.   : 0.00000   
##  1st Qu.: 0.000              1st Qu.:38.00                1st Qu.: 0.00000   
##  Median : 5.000              Median :43.00                Median : 0.00000   
##  Mean   : 4.503              Mean   :44.27                Mean   : 0.05825   
##  3rd Qu.: 8.000              3rd Qu.:51.00                3rd Qu.: 0.00000   
##  Max.   :28.000              Max.   :86.00                Max.   :10.00000   
##  PRMRY_DX_NEOBLD_PCT PRMRY_DX_ENDONUTRMET_PCT PRMRY_DX_MNTBEHNEUDIS_PCT
##  Min.   : 0.0000     Min.   : 0.00000         Min.   :0.000000         
##  1st Qu.: 0.0000     1st Qu.: 0.00000         1st Qu.:0.000000         
##  Median : 0.0000     Median : 0.00000         Median :0.000000         
##  Mean   : 0.1639     Mean   : 0.08516         Mean   :0.005981         
##  3rd Qu.: 0.0000     3rd Qu.: 0.00000         3rd Qu.:0.000000         
##  Max.   :13.0000     Max.   :16.00000         Max.   :4.000000         
##  PRMRY_DX_NERVSYSTM_PCT PRMRY_DX_ENTSYS_PCT PRMRY_DX_CIRCSYSTM_PCT
##  Min.   : 0.00          Min.   :0.0000000   Min.   : 0.00         
##  1st Qu.: 0.00          1st Qu.:0.0000000   1st Qu.:16.00         
##  Median :10.00          Median :0.0000000   Median :22.00         
##  Mean   :12.27          Mean   :0.0001424   Mean   :23.59         
##  3rd Qu.:19.00          3rd Qu.:0.0000000   3rd Qu.:30.00         
##  Max.   :63.00          Max.   :1.0000000   Max.   :78.00         
##  PRMRY_DX_RSPSYSTM_PCT PRMRY_DX_DIGSYSTM_PCT PRMRY_DX_SKNMUSSYSTM_PCT
##  Min.   : 0.0000       Min.   :0.00000       Min.   : 0.000          
##  1st Qu.: 0.0000       1st Qu.:0.00000       1st Qu.: 0.000          
##  Median : 0.0000       Median :0.00000       Median : 0.000          
##  Mean   : 0.6992       Mean   :0.06992       Mean   : 2.985          
##  3rd Qu.: 0.0000       3rd Qu.:0.00000       3rd Qu.: 5.000          
##  Max.   :29.0000       Max.   :9.00000       Max.   :57.000          
##  PRMRY_DX_GUSYSTM_PCT PRMRY_DX_PRGPERICONG_PCT PRMRY_DX_SXILLDEF_PCT
##  Min.   : 0.00000     Min.   :0                Min.   :  0.00       
##  1st Qu.: 0.00000     1st Qu.:0                1st Qu.:  0.00       
##  Median : 0.00000     Median :0                Median : 10.00       
##  Mean   : 0.05882     Mean   :0                Mean   : 12.05       
##  3rd Qu.: 0.00000     3rd Qu.:0                3rd Qu.: 18.00       
##  Max.   :11.00000     Max.   :0                Max.   :100.00       
##  PRMRY_DX_INJPOIS_PCT PRMRY_DX_HLTHSRV_PCT  TOT_PT_MNTS      
##  Min.   : 0.00        Min.   :  0.00       Min.   :       0  
##  1st Qu.:16.00        1st Qu.:  8.00       1st Qu.:   75596  
##  Median :22.00        Median : 15.00       Median :  140079  
##  Mean   :21.19        Mean   : 15.55       Mean   :  420580  
##  3rd Qu.:28.00        3rd Qu.: 22.00       3rd Qu.:  324245  
##  Max.   :84.00        Max.   :100.00       Max.   :43320198  
##  INDVDL_PT_MNTS     CNCRNT_GRP_PT_MNTS COTRT_PT_MNTS     TOT_OT_MNTS      
##  Min.   :       0   Min.   :      0    Min.   :     0   Min.   :       0  
##  1st Qu.:   70562   1st Qu.:      0    1st Qu.:     0   1st Qu.:   74691  
##  Median :  132374   Median :   1136    Median :    45   Median :  139039  
##  Mean   :  358574   Mean   :  59117    Mean   :  2889   Mean   :  414448  
##  3rd Qu.:  280852   3rd Qu.:  27853    3rd Qu.:   863   3rd Qu.:  317434  
##  Max.   :35297371   Max.   :7948190    Max.   :242283   Max.   :42495177  
##  INDVDL_OT_MNTS     CNCRNT_GRP_OT_MNTS COTRT_OT_MNTS     TOT_SLP_MNTS    
##  Min.   :       0   Min.   :      0    Min.   :     0   Min.   :      0  
##  1st Qu.:   69928   1st Qu.:      0    1st Qu.:     0   1st Qu.:  16161  
##  Median :  131216   Median :   1369    Median :    15   Median :  35295  
##  Mean   :  359155   Mean   :  52635    Mean   :  2657   Mean   : 101457  
##  3rd Qu.:  280210   3rd Qu.:  24090    3rd Qu.:   628   3rd Qu.:  79550  
##  Max.   :35084335   Max.   :7364164    Max.   :233261   Max.   :8855320  
##  INDVDL_SLP_MNTS   CNCRNT_GRP_SLP_MNTS COTRT_SLP_MNTS         los        
##  Min.   :      0   Min.   :     0      Min.   :    0.0   Min.   : 6.143  
##  1st Qu.:  15924   1st Qu.:     0      1st Qu.:    0.0   1st Qu.:11.695  
##  Median :  34308   Median :     0      Median :    0.0   Median :12.514  
##  Mean   :  94980   Mean   :  6178      Mean   :  299.1   Mean   :12.693  
##  3rd Qu.:  73657   3rd Qu.:  1791      3rd Qu.:   30.0   3rd Qu.:13.467  
##  Max.   :8392128   Max.   :566577      Max.   :70618.0   Max.   :26.812  
##     ptperep          slpperep         otperep      
##  Min.   :   0.0   Min.   :  0.00   Min.   :   0.0  
##  1st Qu.: 569.6   1st Qu.: 98.93   1st Qu.: 556.0  
##  Median : 641.2   Median :147.65   Median : 634.2  
##  Mean   : 625.0   Mean   :161.55   Mean   : 615.6  
##  3rd Qu.: 704.8   3rd Qu.:209.74   3rd Qu.: 697.7  
##  Max.   :1248.9   Max.   :722.88   Max.   :1202.5
# column names
colnames(m6)
##  [1] "YEAR"                             "YEAR_TYPE"                       
##  [3] "SMRY_CTGRY"                       "SRVC_CTGRY"                      
##  [5] "PRVDR_ID"                         "PRVDR_NAME"                      
##  [7] "PRVDR_CITY"                       "STATE"                           
##  [9] "PRVDR_ZIP"                        "BENE_DSTNCT_CNT"                 
## [11] "TOT_EPSD_STAY_CNT"                "TOT_SRVC_DAYS"                   
## [13] "TOT_CHRG_AMT"                     "TOT_ALOWD_AMT"                   
## [15] "TOT_MDCR_PYMT_AMT"                "TOT_MDCR_STDZD_PYMT_AMT"         
## [17] "TOT_OUTLIER_PYMT_AMT"             "BENE_DUAL_PCT"                   
## [19] "BENE_RRL_PCT"                     "BENE_AVG_AGE"                    
## [21] "BENE_MALE_PCT"                    "BENE_FEML_PCT"                   
## [23] "BENE_RACE_WHT_PCT"                "BENE_RACE_BLACK_PCT"             
## [25] "BENE_RACE_API_PCT"                "BENE_RACE_HSPNC_PCT"             
## [27] "BENE_RACE_NATIND_PCT"             "BENE_RACE_UNK_PCT"               
## [29] "BENE_RACE_OTHR_PCT"               "BENE_AVG_RISK_SCRE"              
## [31] "BENE_CC_BH_ADHD_OTHCD_V1_PCT"     "BENE_CC_BH_ALCOHOL_DRUG_V1_PCT"  
## [33] "BENE_CC_BH_ALZ_NONALZDEM_V2_PCT"  "BENE_CC_BH_ANXIETY_V1_PCT"       
## [35] "BENE_CC_BH_BIPOLAR_V1_PCT"        "BENE_CC_BH_DEPRESS_V1_PCT"       
## [37] "BENE_CC_BH_MOOD_V2_PCT"           "BENE_CC_BH_PD_V1_PCT"            
## [39] "BENE_CC_BH_PTSD_V1_PCT"           "BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT" 
## [41] "BENE_CC_BH_TOBACCO_V1_PCT"        "BENE_CC_PH_AFIB_V2_PCT"          
## [43] "BENE_CC_PH_ARTHRITIS_V2_PCT"      "BENE_CC_PH_ASTHMA_V2_PCT"        
## [45] "BENE_CC_PH_CANCER6_V2_PCT"        "BENE_CC_PH_CKD_V2_PCT"           
## [47] "BENE_CC_PH_COPD_V2_PCT"           "BENE_CC_PH_DIABETES_V2_PCT"      
## [49] "BENE_CC_PH_HF_NONIHD_V2_PCT"      "BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT"
## [51] "BENE_CC_PH_HYPERTENSION_V2_PCT"   "BENE_CC_PH_ISCHEMICHEART_V2_PCT" 
## [53] "BENE_CC_PH_OSTEOPOROSIS_V2_PCT"   "BENE_CC_PH_PARKINSON_V2_PCT"     
## [55] "BENE_CC_PH_STROKE_TIA_V2_PCT"     "PRMRY_DX_INFCTN_PCT"             
## [57] "PRMRY_DX_NEOBLD_PCT"              "PRMRY_DX_ENDONUTRMET_PCT"        
## [59] "PRMRY_DX_MNTBEHNEUDIS_PCT"        "PRMRY_DX_NERVSYSTM_PCT"          
## [61] "PRMRY_DX_ENTSYS_PCT"              "PRMRY_DX_CIRCSYSTM_PCT"          
## [63] "PRMRY_DX_RSPSYSTM_PCT"            "PRMRY_DX_DIGSYSTM_PCT"           
## [65] "PRMRY_DX_SKNMUSSYSTM_PCT"         "PRMRY_DX_GUSYSTM_PCT"            
## [67] "PRMRY_DX_PRGPERICONG_PCT"         "PRMRY_DX_SXILLDEF_PCT"           
## [69] "PRMRY_DX_INJPOIS_PCT"             "PRMRY_DX_HLTHSRV_PCT"            
## [71] "TOT_PT_MNTS"                      "INDVDL_PT_MNTS"                  
## [73] "CNCRNT_GRP_PT_MNTS"               "COTRT_PT_MNTS"                   
## [75] "TOT_OT_MNTS"                      "INDVDL_OT_MNTS"                  
## [77] "CNCRNT_GRP_OT_MNTS"               "COTRT_OT_MNTS"                   
## [79] "TOT_SLP_MNTS"                     "INDVDL_SLP_MNTS"                 
## [81] "CNCRNT_GRP_SLP_MNTS"              "COTRT_SLP_MNTS"                  
## [83] "los"                              "ptperep"                         
## [85] "slpperep"                         "otperep"
###########
# only want to look at Irf factors:
summary(m6[, c("los","TOT_PT_MNTS","TOT_SLP_MNTS", "TOT_OT_MNTS", "BENE_DSTNCT_CNT", "TOT_EPSD_STAY_CNT" , "TOT_SRVC_DAYS", "TOT_MDCR_PYMT_AMT", "TOT_MDCR_STDZD_PYMT_AMT","TOT_CHRG_AMT" )])
##       los          TOT_PT_MNTS        TOT_SLP_MNTS      TOT_OT_MNTS      
##  Min.   : 6.143   Min.   :       0   Min.   :      0   Min.   :       0  
##  1st Qu.:11.695   1st Qu.:   75596   1st Qu.:  16161   1st Qu.:   74691  
##  Median :12.514   Median :  140079   Median :  35295   Median :  139039  
##  Mean   :12.693   Mean   :  420580   Mean   : 101457   Mean   :  414448  
##  3rd Qu.:13.467   3rd Qu.:  324245   3rd Qu.:  79550   3rd Qu.:  317434  
##  Max.   :26.812   Max.   :43320198   Max.   :8855320   Max.   :42495177  
##  BENE_DSTNCT_CNT   TOT_EPSD_STAY_CNT TOT_SRVC_DAYS    TOT_MDCR_PYMT_AMT  
##  Min.   :   11.0   Min.   :   11     Min.   :    86   Min.   :1.556e+05  
##  1st Qu.:  121.0   1st Qu.:  128     1st Qu.:  1610   1st Qu.:2.854e+06  
##  Median :  210.0   Median :  224     Median :  2847   Median :5.126e+06  
##  Mean   :  596.7   Mean   :  665     Mean   :  8387   Mean   :1.484e+07  
##  3rd Qu.:  467.0   3rd Qu.:  514     3rd Qu.:  6424   3rd Qu.:1.141e+07  
##  Max.   :54145.0   Max.   :64561     Max.   :793772   Max.   :1.481e+09  
##  TOT_MDCR_STDZD_PYMT_AMT  TOT_CHRG_AMT      
##  Min.   :1.600e+05       Min.   :1.480e+05  
##  1st Qu.:2.855e+06       1st Qu.:6.580e+06  
##  Median :5.007e+06       Median :1.302e+07  
##  Mean   :1.458e+07       Mean   :3.399e+07  
##  3rd Qu.:1.119e+07       3rd Qu.:2.528e+07  
##  Max.   :1.531e+09       Max.   :3.413e+09
#--------------------------------------------------------------------------


#NEXT QUESTION

#--------------------------------------------------------------------------
# Can we predict the likelihood of high Medicare IRF utilization in a state
# based on demographic and healthcare infrastructure variables?
# using a log regression where it's high vs not high so i need a binary variable

#make a binary variable first 
stateutilize<- m6 %>%
  group_by(STATE, YEAR) %>%
  summarise(utrate=sum(TOT_SRVC_DAYS)/sum(BENE_DSTNCT_CNT), # gives us utilization rate where it's use over number of ppl
            BENE_AVG_AGE=mean(BENE_AVG_AGE),
            BENE_DUAL_PCT=mean(BENE_DUAL_PCT), # this is both medicare+medicaid
            BENE_MALE_PCT=mean(BENE_MALE_PCT),
            BENE_RRL_PCT=mean(BENE_RRL_PCT), # those who are classified as rural 
            BENE_RACE_WHT_PCT=mean(BENE_RACE_WHT_PCT),
            BENE_RACE_API_PCT=mean(BENE_RACE_API_PCT),
            BENE_RACE_BLACK_PCT=mean(BENE_RACE_BLACK_PCT),
            .groups='drop')
# BENE_RACE_HSPNC_PCT=mean(BENE_RACE_HSPNC_PCT),
#  BENE_RACE_NATIND_PCT=mean(BENE_RACE_NATIND_PCT), 
#BENE_RACE_OTHR_PCT=mean(BENE_RACE_OTHR_PCT),
#  BENE_RACE_UNK_PCT=mean(BENE_RACE_UNK_PCT))

stateutilize %>%
  group_by(YEAR) %>%
  summarise(n=n())
summary(stateutilize)
##     STATE                YEAR          utrate       BENE_AVG_AGE  
##  Length:312         Min.   :2018   Min.   :11.06   Min.   :71.33  
##  Class :character   1st Qu.:2019   1st Qu.:13.35   1st Qu.:74.99  
##  Mode  :character   Median :2020   Median :13.81   Median :75.77  
##                     Mean   :2020   Mean   :13.84   Mean   :75.65  
##                     3rd Qu.:2022   3rd Qu.:14.46   3rd Qu.:76.44  
##                     Max.   :2023   Max.   :16.29   Max.   :78.52  
##  BENE_DUAL_PCT   BENE_MALE_PCT    BENE_RRL_PCT    BENE_RACE_WHT_PCT
##  Min.   : 0.00   Min.   :26.33   Min.   : 0.000   Min.   : 0.2857  
##  1st Qu.:13.32   1st Qu.:45.10   1st Qu.: 1.448   1st Qu.:75.4157  
##  Median :17.34   Median :47.63   Median : 7.260   Median :84.7752  
##  Mean   :18.10   Mean   :47.71   Mean   : 9.108   Mean   :80.3155  
##  3rd Qu.:22.37   3rd Qu.:50.22   3rd Qu.:14.000   3rd Qu.:90.4821  
##  Max.   :38.00   Max.   :60.50   Max.   :40.333   Max.   :98.0000  
##  BENE_RACE_API_PCT  BENE_RACE_BLACK_PCT
##  Min.   : 0.00000   Min.   : 0.0000    
##  1st Qu.: 0.00000   1st Qu.: 0.2167    
##  Median : 0.05882   Median : 3.5147    
##  Mean   : 1.59770   Mean   : 7.2074    
##  3rd Qu.: 0.33621   3rd Qu.:11.2090    
##  Max.   :56.00000   Max.   :55.0000
# make it binary based on median
cutoff<-median(stateutilize$utrate)
cutoff
## [1] 13.80912
stateutilize<-stateutilize %>%
  mutate(longstay=ifelse(utrate >= cutoff,1,0))
View(stateutilize)

# how many are 1 vs 0?
table(stateutilize$longstay) # note it's 156 for 0 and 1  
## 
##   0   1 
## 156 156
#check for multicollinearity 
numstate<-stateutilize %>%select(where(is.numeric))
numstate
# make a new window to see the plot
x11()
#visually see correlations
pairs(numstate, col='dodgerblue')
#see the correlation 
round(cor(numstate), 2)
##                      YEAR utrate BENE_AVG_AGE BENE_DUAL_PCT BENE_MALE_PCT
## YEAR                 1.00   0.06         0.34         -0.26         -0.06
## utrate               0.06   1.00        -0.10          0.33          0.17
## BENE_AVG_AGE         0.34  -0.10         1.00         -0.32         -0.49
## BENE_DUAL_PCT       -0.26   0.33        -0.32          1.00          0.08
## BENE_MALE_PCT       -0.06   0.17        -0.49          0.08          1.00
## BENE_RRL_PCT        -0.02  -0.19        -0.14          0.05          0.08
## BENE_RACE_WHT_PCT    0.04   0.09         0.12          0.07          0.29
## BENE_RACE_API_PCT    0.00  -0.20         0.00         -0.13          0.03
## BENE_RACE_BLACK_PCT -0.08   0.22        -0.24          0.36         -0.10
## longstay             0.08   0.77        -0.05          0.26          0.18
##                     BENE_RRL_PCT BENE_RACE_WHT_PCT BENE_RACE_API_PCT
## YEAR                       -0.02              0.04              0.00
## utrate                     -0.19              0.09             -0.20
## BENE_AVG_AGE               -0.14              0.12              0.00
## BENE_DUAL_PCT               0.05              0.07             -0.13
## BENE_MALE_PCT               0.08              0.29              0.03
## BENE_RRL_PCT                1.00              0.41             -0.20
## BENE_RACE_WHT_PCT           0.41              1.00             -0.47
## BENE_RACE_API_PCT          -0.20             -0.47              1.00
## BENE_RACE_BLACK_PCT        -0.25             -0.36             -0.08
## longstay                   -0.20             -0.02             -0.04
##                     BENE_RACE_BLACK_PCT longstay
## YEAR                              -0.08     0.08
## utrate                             0.22     0.77
## BENE_AVG_AGE                      -0.24    -0.05
## BENE_DUAL_PCT                      0.36     0.26
## BENE_MALE_PCT                     -0.10     0.18
## BENE_RRL_PCT                      -0.25    -0.20
## BENE_RACE_WHT_PCT                 -0.36    -0.02
## BENE_RACE_API_PCT                 -0.08    -0.04
## BENE_RACE_BLACK_PCT                1.00     0.19
## longstay                           0.19     1.00
#start a log model

#use a stepwise regression to find the most sig. variables

statemodstart<-glm(longstay~1, data=stateutilize, family= binomial)
stepwisestatelog<-step(statemodstart, scope= longstay~BENE_AVG_AGE +BENE_DUAL_PCT+BENE_MALE_PCT+ BENE_RACE_API_PCT+BENE_RACE_BLACK_PCT +BENE_RACE_WHT_PCT+ BENE_RRL_PCT, direction='both' )
## Start:  AIC=434.52
## longstay ~ 1
## 
##                       Df Deviance    AIC
## + BENE_DUAL_PCT        1   410.95 414.95
## + BENE_RACE_BLACK_PCT  1   419.95 423.95
## + BENE_RRL_PCT         1   420.27 424.27
## + BENE_MALE_PCT        1   422.50 426.50
## <none>                     432.52 434.52
## + BENE_AVG_AGE         1   431.77 435.77
## + BENE_RACE_API_PCT    1   431.90 435.90
## + BENE_RACE_WHT_PCT    1   432.35 436.35
## 
## Step:  AIC=414.95
## longstay ~ BENE_DUAL_PCT
## 
##                       Df Deviance    AIC
## + BENE_RRL_PCT         1   395.85 401.85
## + BENE_MALE_PCT        1   401.82 407.82
## + BENE_RACE_BLACK_PCT  1   406.72 412.72
## <none>                     410.95 414.95
## + BENE_RACE_WHT_PCT    1   410.44 416.44
## + BENE_AVG_AGE         1   410.49 416.49
## + BENE_RACE_API_PCT    1   410.91 416.91
## - BENE_DUAL_PCT        1   432.52 434.52
## 
## Step:  AIC=401.85
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT
## 
##                       Df Deviance    AIC
## + BENE_MALE_PCT        1   385.28 393.28
## <none>                     395.85 401.85
## + BENE_RACE_API_PCT    1   394.96 402.96
## + BENE_RACE_BLACK_PCT  1   394.96 402.96
## + BENE_RACE_WHT_PCT    1   395.03 403.03
## + BENE_AVG_AGE         1   395.83 403.83
## - BENE_RRL_PCT         1   410.95 414.95
## - BENE_DUAL_PCT        1   420.27 424.27
## 
## Step:  AIC=393.28
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT
## 
##                       Df Deviance    AIC
## + BENE_AVG_AGE         1   379.60 389.60
## + BENE_RACE_BLACK_PCT  1   382.84 392.84
## <none>                     385.28 393.28
## + BENE_RACE_API_PCT    1   384.05 394.05
## + BENE_RACE_WHT_PCT    1   385.23 395.23
## - BENE_MALE_PCT        1   395.85 401.85
## - BENE_RRL_PCT         1   401.82 407.82
## - BENE_DUAL_PCT        1   407.71 413.71
## 
## Step:  AIC=389.6
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE
## 
##                       Df Deviance    AIC
## + BENE_RACE_BLACK_PCT  1   374.59 386.59
## <none>                     379.60 389.60
## + BENE_RACE_API_PCT    1   378.74 390.74
## + BENE_RACE_WHT_PCT    1   379.12 391.12
## - BENE_AVG_AGE         1   385.28 393.28
## - BENE_RRL_PCT         1   393.41 401.41
## - BENE_MALE_PCT        1   395.83 403.83
## - BENE_DUAL_PCT        1   407.59 415.59
## 
## Step:  AIC=386.59
## longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE + 
##     BENE_RACE_BLACK_PCT
## 
##                       Df Deviance    AIC
## <none>                     374.59 386.59
## + BENE_RACE_API_PCT    1   374.10 388.10
## + BENE_RACE_WHT_PCT    1   374.59 388.59
## - BENE_RACE_BLACK_PCT  1   379.60 389.60
## - BENE_RRL_PCT         1   381.98 391.98
## - BENE_AVG_AGE         1   382.84 392.84
## - BENE_DUAL_PCT        1   394.29 404.29
## - BENE_MALE_PCT        1   394.89 404.89
summary(stepwisestatelog)
## 
## Call:
## glm(formula = longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + 
##     BENE_AVG_AGE + BENE_RACE_BLACK_PCT, family = binomial, data = stateutilize)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -40.36911   12.54050  -3.219  0.00129 ** 
## BENE_DUAL_PCT         0.09603    0.02330   4.121 3.77e-05 ***
## BENE_RRL_PCT         -0.04169    0.01568  -2.659  0.00784 ** 
## BENE_MALE_PCT         0.17431    0.04211   4.139 3.48e-05 ***
## BENE_AVG_AGE          0.40208    0.14419   2.788  0.00530 ** 
## BENE_RACE_BLACK_PCT   0.03652    0.01707   2.140  0.03235 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 432.52  on 311  degrees of freedom
## Residual deviance: 374.59  on 306  degrees of freedom
## AIC: 386.59
## 
## Number of Fisher Scoring iterations: 4
#do predictions
stateutilize$predicting<-predict(stepwisestatelog, type='response')
summary(stateutilize$predicting)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.004917 0.340792 0.482776 0.500000 0.673681 0.945906
#likelihood ratio test
anova(stepwisestatelog, test='Chisq')
anova(stepwisestatelog, test = 'LRT')
#compute an odds ratio from the model
#get coefficents from model
coeff<-coef(stepwisestatelog)
coeff
##         (Intercept)       BENE_DUAL_PCT        BENE_RRL_PCT       BENE_MALE_PCT 
##        -40.36910746          0.09602547         -0.04168960          0.17431340 
##        BENE_AVG_AGE BENE_RACE_BLACK_PCT 
##          0.40208121          0.03652397
#compute odds ratio
odds<- exp(coeff)
odds
##         (Intercept)       BENE_DUAL_PCT        BENE_RRL_PCT       BENE_MALE_PCT 
##        2.937104e-18        1.100787e+00        9.591675e-01        1.190429e+00 
##        BENE_AVG_AGE BENE_RACE_BLACK_PCT 
##        1.494933e+00        1.037199e+00
#get standard error for coefficient
se<-summary(stepwisestatelog)$coeff[,'Std. Error']
se
##         (Intercept)       BENE_DUAL_PCT        BENE_RRL_PCT       BENE_MALE_PCT 
##         12.54050373          0.02329999          0.01568041          0.04211036 
##        BENE_AVG_AGE BENE_RACE_BLACK_PCT 
##          0.14419334          0.01706666
#compute CI levels 
oddslower<-odds*exp(-1.96*se)
oddsupper<-odds*exp(1.96*se)

#make into df
oddsdf<-data.frame(odds,ci_lower =oddslower, ci_upper=oddsupper)
print(oddsdf)
##                             odds     ci_lower     ci_upper
## (Intercept)         2.937104e-18 6.211938e-29 1.388710e-07
## BENE_DUAL_PCT       1.100787e+00 1.051647e+00 1.152223e+00
## BENE_RRL_PCT        9.591675e-01 9.301372e-01 9.891038e-01
## BENE_MALE_PCT       1.190429e+00 1.096120e+00 1.292851e+00
## BENE_AVG_AGE        1.494933e+00 1.126891e+00 1.983177e+00
## BENE_RACE_BLACK_PCT 1.037199e+00 1.003078e+00 1.072481e+00
#final log model 
summary(stepwisestatelog)
## 
## Call:
## glm(formula = longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + 
##     BENE_AVG_AGE + BENE_RACE_BLACK_PCT, family = binomial, data = stateutilize)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -40.36911   12.54050  -3.219  0.00129 ** 
## BENE_DUAL_PCT         0.09603    0.02330   4.121 3.77e-05 ***
## BENE_RRL_PCT         -0.04169    0.01568  -2.659  0.00784 ** 
## BENE_MALE_PCT         0.17431    0.04211   4.139 3.48e-05 ***
## BENE_AVG_AGE          0.40208    0.14419   2.788  0.00530 ** 
## BENE_RACE_BLACK_PCT   0.03652    0.01707   2.140  0.03235 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 432.52  on 311  degrees of freedom
## Residual deviance: 374.59  on 306  degrees of freedom
## AIC: 386.59
## 
## Number of Fisher Scoring iterations: 4
# plot it 

#make a df to see if it counts as high utilization or not
stateplot<-stateutilize %>%
  group_by(STATE) %>%
  summarise(predicting=mean(predicting),
            highutilization=predicting >=0.5)



#plot state, probability and high utilization
ggplot(stateplot, aes(x=reorder(STATE, -predicting), y= predicting, fill=highutilization))+
  geom_col(position='identity')+
  #coord_flip()+
  labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='High Utlization')+
  theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
  theme(
    axis.text.y = element_text(lineheight = 4)
  )

#this graph shows us the states and the chances that their medicare patients would have a long stay in the IRF based on the 3 factors.We can see that PR is the actual opposite where it's actually in the negative. 

#pROC
#GET fitted probability values
predictions<-fitted.values(stepwisestatelog)


#create ROC curve and plot
roc_curve<-roc(stateutilize$longstay,predictions,
               plot=TRUE, col='blue',lwd=2, print.auc=TRUE, main='ROC curve for the Medicare Utilization based on demographics',
               ci=TRUE,legacy.axes=TRUE )
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
text(x=.1, y=.2, labels='model1- dual pct, rrl pct, male pct, black pct, avg age' , col='blue', cex=1, font=2)



# do another log regression to see for the better curve 

fit2<-glm(longstay ~BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT + BENE_AVG_AGE, data=stateutilize, family=binomial())
predictions2<-fitted.values(fit2)
roc2<-roc(stateutilize$longstay, predictions2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc2, add=TRUE, col='black')
text(
  x=1, y=1, labels=paste('AUC= ', round(auc(roc2), 3), 'model2- dual pct, rrl pct, male pct, avg age'), col='black', cex=1, font=2)

#add other models to compare for anova
fit3<-glm(longstay ~ BENE_DUAL_PCT + BENE_RRL_PCT + BENE_MALE_PCT, data=stateutilize, family=binomial())
fitog<-glm(longstay ~ BENE_DUAL_PCT, data=stateutilize, family=binomial)


#anova test for mulitple models 
anova(fitog, fit2, stepwisestatelog, test='Chisq')
anova( fit2, stepwisestatelog, test='LRT')
# The difference of the two models is that model two includes race as a predictor. Assuming that the null hypothesis is that the models are equal to another we can see that the p value is 0.025. This confirms that the model containing the race does contribute to the fit of the model.

##### 
# optional ggplot certain years

stateplott<-stateutilize %>%
  group_by(STATE, YEAR) %>%
  summarise(predicting=mean(predicting),
            highutilization=predicting >=0.5,
            .groups='drop')



stateplot1<-stateplott %>%
  filter(YEAR %in% c(2018, 2019, 2020))


ggplot(stateplot1, aes(x=reorder(STATE, predicting), y=predicting, fill=factor(highutilization)))+ 
  geom_col() + 
  coord_flip()+
  facet_wrap(~YEAR)+
  labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='high utilization')+
  theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
  theme(
    axis.text.y = element_text(lineheight = 4)
  )

#certain years
stateplot2<-stateplott %>%
  filter(YEAR %in% c(2021, 2022, 2023))


ggplot(stateplot2, aes(x=reorder(STATE, predicting), y=predicting, fill=factor(highutilization)))+ 
  geom_col() + 
  coord_flip()+
  facet_wrap(~YEAR)+
  labs(x='State',y='Probability of a High Utilization', title='Prediction of Utilization Rate by State', fill='high utilization')+
  theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
  theme(
    axis.text.y = element_text(lineheight = 4)
  )

#--------------------------------------------------------------------------


#NEXT QUESTION

#--------------------------------------------------------------------------


#--------------------------------------------------------------------------
#Do states cluster into groups based on Medicare payment ( a utilization factor)
#and being able to see the change over years
# k means clustering
#make a new df


Cm6 <- m6 %>%
  group_by(STATE, YEAR) %>%
  summarise(
    TOT_SRVC_DAYS  = sum(TOT_SRVC_DAYS),
    BENE_DSTNCT_CNT = sum(BENE_DSTNCT_CNT),
    TOT_MDCR_STDZD_PYMT_AMT = sum(TOT_MDCR_STDZD_PYMT_AMT),
    .groups = "drop"
  ) %>%
  mutate(utrate= TOT_SRVC_DAYS / BENE_DSTNCT_CNT)

# check to see if we have 52 rows per yr
Cm6 %>%
  group_by(YEAR) %>%
  summarise(n = n())
#visualize what the data looks like before and after log
#make a unlog dataset
nologCm6 <- m6 %>%
  group_by(STATE, YEAR) %>%
  summarise(
    TOT_SRVC_DAYS  = sum(TOT_SRVC_DAYS),
    BENE_DSTNCT_CNT = sum(BENE_DSTNCT_CNT),
    TOT_MDCR_STDZD_PYMT_AMT = sum(TOT_MDCR_STDZD_PYMT_AMT),
    .groups = "drop"
  ) %>%
  mutate(utrate= TOT_SRVC_DAYS / BENE_DSTNCT_CNT)

# gauge distribution to determine kurtosis/ skewness

prelog<-ggplot(nologCm6, aes(x=as.factor(ceiling(utrate)), y=TOT_MDCR_STDZD_PYMT_AMT))+ 
  geom_boxplot() +
  labs(title="Distribution of utilization rate before log transformation", x='utrate', y='Total Medicare Payment') +
  theme_minimal()
prelog


# do log for medicare payments
Cm6<- Cm6 %>%
  mutate(logmedicare=log(TOT_MDCR_STDZD_PYMT_AMT))

clusterm6 <- Cm6 %>% select(logmedicare, utrate)

#scale data have balance dataset to normalize it
clusterm6scale<-scale(clusterm6)
clusterm6scale<-as.data.frame(clusterm6scale) #it was reading as a list

postlog<-ggplot(clusterm6scale, aes(x=as.factor(ceiling(utrate)), y=logmedicare))+
  geom_boxplot() +
  labs(title="Distribution of utilization rate after log transformation", x='utrate', y='Total Medicare Payment (Log)') +
  theme_minimal()
postlog

#step 1 pick num of clusters
#calcualte wss

set.seed(1997)
wss<-sapply(1:10,function(k){
  kmeans(clusterm6scale, centers = k, nstart = 25)$tot.withinss
})
# plot
plot(1:10, wss, type='b', pch=10,
     xlab='num of cluster')
#here we find its 3 

km.out<-kmeans(clusterm6scale, centers = 3, nstart=100 )
print(km.out)
## K-means clustering with 3 clusters of sizes 155, 70, 87
## 
## Cluster means:
##   logmedicare     utrate
## 1   0.8225746  0.2697857
## 2  -1.0331925  0.7984398
## 3  -0.6342021 -1.1230755
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 3 3 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 3 3
##  [75] 3 3 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 3 3 2 2 3 3 1 1 1 1 1 1 3 2 2 2 2 2 1 1 1 1
## [149] 1 1 3 3 1 1 3 3 3 3 3 3 3 3 1 1 1 1 1 1 3 3 3 3 3 3 2 2 2 2 2 2 1 3 1 3 1
## [186] 3 1 1 1 1 1 1 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 1 1
## [223] 3 3 3 3 3 3 1 1 1 1 1 1 3 3 3 3 3 3 2 3 2 2 2 3 1 1 1 1 1 1 2 2 2 2 2 2 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 1 1 1 1 1 3 2 2 2 2 2 2 3 1 2 2 2 3 3
## [297] 3 3 3 3 1 1 1 1 1 1 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 108.08733  52.36677  81.31190
##  (between_SS / total_SS =  61.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
#visualize the cluster 
km.cluster<-km.out$cluster
clusterm6scale$STATE<-Cm6$STATE
clusterm6scale$YEAR<-Cm6$YEAR
clusterm6scale$TOT_MDCR_STDZD_PYMT_AMT<-Cm6$TOT_MDCR_STDZD_PYMT_AMT
clusterm6scale$km.cluster<-km.out$cluster
#MAKE THE CLUSTER  A FACTOR FIRST
is.factor(clusterm6scale$km.cluster)
## [1] FALSE
#make factor 
clusterm6scale$km.cluster<-as.factor(clusterm6scale$km.cluster)

#check again
is.factor(clusterm6scale$km.cluster)
## [1] TRUE
#visualize

# states we want to emphasize from our log regression
highlightstates<- clusterm6scale %>%
  filter(STATE %in% c('CA', 'MD', 'PR', 'MA', 'AK', 'DC'))



plot1<- ggplot(clusterm6scale, aes(x=logmedicare,y=utrate, group=STATE, color=km.cluster))+geom_point()+
  facet_wrap(~YEAR)+
  geom_point()+
  labs(
    x='Log of Tot. Medicare Payment',
    y='Utilization Rate',
    title = 'Comparing the Medicare payments over time and its effect on utilization',
    color="Groups"
  )

plot1+ geom_text(data = highlightstates, aes(label=STATE),  vjust=-1.0, size=4)

# what does this mean? 
km.out$centers
##   logmedicare     utrate
## 1   0.8225746  0.2697857
## 2  -1.0331925  0.7984398
## 3  -0.6342021 -1.1230755
#summary and analysis
summary(clusterm6scale)
##   logmedicare          utrate           STATE                YEAR     
##  Min.   :-2.4057   Min.   :-3.0876   Length:312         Min.   :2018  
##  1st Qu.:-0.7646   1st Qu.:-0.5492   Class :character   1st Qu.:2019  
##  Median : 0.1346   Median :-0.0351   Mode  :character   Median :2020  
##  Mean   : 0.0000   Mean   : 0.0000                      Mean   :2020  
##  3rd Qu.: 0.7494   3rd Qu.: 0.6874                      3rd Qu.:2022  
##  Max.   : 2.3417   Max.   : 2.7155                      Max.   :2023  
##  TOT_MDCR_STDZD_PYMT_AMT km.cluster
##  Min.   :8.388e+06       1:155     
##  1st Qu.:6.448e+07       2: 70     
##  Median :1.971e+08       3: 87     
##  Mean   :3.281e+08                 
##  3rd Qu.:4.232e+08                 
##  Max.   :3.062e+09
#--------------------------------------------------------------------------


#NEXT QUESTION

#--------------------------------------------------------------------------


# Using linear regression
# Does the average submitted charge predict the average Medicare payment?

# make df 
pay6 <- m6 %>%
  group_by(STATE, YEAR) %>%
  summarise(avg_tot_charge= log(mean(TOT_CHRG_AMT)),
            avg_tot_payment=log(mean(TOT_MDCR_PYMT_AMT)),
            pptot=mean(TOT_CHRG_AMT),
            ppcharge=mean(TOT_MDCR_PYMT_AMT),
            .groups = "drop") %>%
  mutate(percentpaid=(ppcharge/pptot)*100)
pay6
summary(pay6)
##     STATE                YEAR      avg_tot_charge  avg_tot_payment
##  Length:312         Min.   :2018   Min.   :14.64   Min.   :14.10  
##  Class :character   1st Qu.:2019   1st Qu.:16.69   1st Qu.:15.91  
##  Mode  :character   Median :2020   Median :17.07   Median :16.31  
##                     Mean   :2020   Mean   :17.03   Mean   :16.28  
##                     3rd Qu.:2022   3rd Qu.:17.36   3rd Qu.:16.63  
##                     Max.   :2023   Max.   :18.77   Max.   :17.54  
##      pptot              ppcharge         percentpaid   
##  Min.   :  2274570   Min.   : 1329725   Min.   :24.36  
##  1st Qu.: 17756813   1st Qu.: 8099371   1st Qu.:42.46  
##  Median : 26020160   Median :12093224   Median :46.41  
##  Mean   : 29086136   Mean   :13823244   Mean   :48.17  
##  3rd Qu.: 34750104   3rd Qu.:16761856   3rd Qu.:51.36  
##  Max.   :141364177   Max.   :41507395   Max.   :81.54
# check to see if we have 52 rows per yr
pay6%>%
  group_by(YEAR) %>%
  summarise(n = n())
# see first few rows 
head(pay6, n = 10)
#use correlation
coefficient <- cor.test(pay6$avg_tot_charge, pay6$avg_tot_payment)
coefficient
## 
##  Pearson's product-moment correlation
## 
## data:  pay6$avg_tot_charge and pay6$avg_tot_payment
## t = 46.38, df = 310, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9192950 0.9475724
## sample estimates:
##       cor 
## 0.9349015
coefficient$estimate
##       cor 
## 0.9349015
pay6logplot<-
  ggplot(pay6,aes(y= avg_tot_payment)) +
  labs( y='Avg tot payment', title = 'boxplot with a log transformation')+
  geom_boxplot()
pay6logplot

# lets see it without log

pay6nolog <- m6 %>%
  group_by(STATE, YEAR) %>%
  summarise(avg_tot_charge= mean(TOT_CHRG_AMT),
            avg_tot_payment=mean(TOT_MDCR_PYMT_AMT),
            pptot=mean(TOT_CHRG_AMT),
            ppcharge=mean(TOT_MDCR_PYMT_AMT),
            .groups = "drop") %>%
  mutate(percentpaid=(ppcharge/pptot)*100)


plotpaynolog <- ggplot(pay6nolog, aes(y=avg_tot_payment)) +
  labs(
    y = 'Avg tot payment',
    title = 'Boxplot without a log transformation'
  ) +
  geom_boxplot()

plotpaynolog


# combine the plots

library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
(plotpaynolog+pay6logplot) + 
  plot_annotation(title = 'Boxplot of the submitted and actual payments before and after log transformation')



#do modeling  
linmodelpay6<-lm(avg_tot_payment ~avg_tot_charge +factor(STATE) +factor(YEAR), data=pay6)

summary(linmodelpay6) 
## 
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge + factor(STATE) + 
##     factor(YEAR), data = pay6)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.135833 -0.023402  0.002264  0.024121  0.205041 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.253995   0.357021  -0.711   0.4775    
## avg_tot_charge    0.941036   0.021441  43.890  < 2e-16 ***
## factor(STATE)AL   0.684404   0.030966  22.102  < 2e-16 ***
## factor(STATE)AR   0.617463   0.030228  20.427  < 2e-16 ***
## factor(STATE)AZ   0.578845   0.030950  18.703  < 2e-16 ***
## factor(STATE)CA   0.190852   0.036427   5.239 3.39e-07 ***
## factor(STATE)CO   0.295189   0.030067   9.818  < 2e-16 ***
## factor(STATE)CT   0.261938   0.028573   9.167  < 2e-16 ***
## factor(STATE)DC   0.409717   0.032522  12.598  < 2e-16 ***
## factor(STATE)DE   0.850261   0.032055  26.525  < 2e-16 ***
## factor(STATE)FL   0.413011   0.038197  10.813  < 2e-16 ***
## factor(STATE)GA   0.432210   0.030320  14.255  < 2e-16 ***
## factor(STATE)HI   0.835261   0.028038  29.791  < 2e-16 ***
## factor(STATE)IA   0.505913   0.028401  17.813  < 2e-16 ***
## factor(STATE)ID   0.572693   0.028148  20.346  < 2e-16 ***
## factor(STATE)IL   0.350671   0.032936  10.647  < 2e-16 ***
## factor(STATE)IN   0.547531   0.029011  18.874  < 2e-16 ***
## factor(STATE)KS   0.542394   0.028417  19.087  < 2e-16 ***
## factor(STATE)KY   0.587538   0.030723  19.124  < 2e-16 ***
## factor(STATE)LA   0.447936   0.028068  15.959  < 2e-16 ***
## factor(STATE)MA   1.009372   0.035078  28.775  < 2e-16 ***
## factor(STATE)MD   0.944077   0.035823  26.354  < 2e-16 ***
## factor(STATE)ME   0.773334   0.028049  27.570  < 2e-16 ***
## factor(STATE)MI   0.447507   0.028290  15.819  < 2e-16 ***
## factor(STATE)MN   0.393847   0.027984  14.074  < 2e-16 ***
## factor(STATE)MO   0.485235   0.029717  16.329  < 2e-16 ***
## factor(STATE)MS   0.514276   0.028839  17.832  < 2e-16 ***
## factor(STATE)MT   0.510638   0.029026  17.593  < 2e-16 ***
## factor(STATE)NC   0.467621   0.030376  15.394  < 2e-16 ***
## factor(STATE)ND   0.530296   0.028133  18.849  < 2e-16 ***
## factor(STATE)NE   0.302900   0.028024  10.809  < 2e-16 ***
## factor(STATE)NH   0.918090   0.032700  28.076  < 2e-16 ***
## factor(STATE)NJ   0.617530   0.041453  14.897  < 2e-16 ***
## factor(STATE)NM   0.459411   0.028493  16.123  < 2e-16 ***
## factor(STATE)NV   0.194835   0.046124   4.224 3.35e-05 ***
## factor(STATE)NY   0.191357   0.033227   5.759 2.43e-08 ***
## factor(STATE)OH   0.442562   0.029194  15.159  < 2e-16 ***
## factor(STATE)OK   0.498833   0.029083  17.152  < 2e-16 ***
## factor(STATE)OR   0.479349   0.029352  16.331  < 2e-16 ***
## factor(STATE)PA   0.428398   0.031513  13.595  < 2e-16 ***
## factor(STATE)PR   0.549030   0.046039  11.925  < 2e-16 ***
## factor(STATE)RI   0.412122   0.030270  13.615  < 2e-16 ***
## factor(STATE)SC   0.661725   0.031500  21.007  < 2e-16 ***
## factor(STATE)SD   0.221467   0.028065   7.891 8.95e-14 ***
## factor(STATE)TN   0.505551   0.029496  17.140  < 2e-16 ***
## factor(STATE)TX   0.471155   0.034422  13.688  < 2e-16 ***
## factor(STATE)UT   0.416135   0.028011  14.856  < 2e-16 ***
## factor(STATE)VA   0.582592   0.031136  18.711  < 2e-16 ***
## factor(STATE)VT   0.434881   0.028652  15.178  < 2e-16 ***
## factor(STATE)WA   0.404686   0.028088  14.408  < 2e-16 ***
## factor(STATE)WI   0.501523   0.028460  17.622  < 2e-16 ***
## factor(STATE)WV   0.855088   0.030572  27.970  < 2e-16 ***
## factor(STATE)WY   0.863529   0.028189  30.634  < 2e-16 ***
## factor(YEAR)2019 -0.013436   0.009545  -1.408   0.1604    
## factor(YEAR)2020 -0.016183   0.009526  -1.699   0.0906 .  
## factor(YEAR)2021  0.006848   0.009627   0.711   0.4776    
## factor(YEAR)2022 -0.002697   0.009637  -0.280   0.7799    
## factor(YEAR)2023 -0.011959   0.009982  -1.198   0.2320    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04847 on 254 degrees of freedom
## Multiple R-squared:  0.9945, Adjusted R-squared:  0.9932 
## F-statistic:   800 on 57 and 254 DF,  p-value: < 2.2e-16
nolinmodelpay6<-lm(avg_tot_payment ~avg_tot_charge, data=pay6)

summary(nolinmodelpay6) 
## 
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge, data = pay6)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.64001 -0.10044 -0.02547  0.08212  0.53863 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.02702    0.35179  -0.077    0.939    
## avg_tot_charge  0.95739    0.02064  46.380   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2092 on 310 degrees of freedom
## Multiple R-squared:  0.874,  Adjusted R-squared:  0.8736 
## F-statistic:  2151 on 1 and 310 DF,  p-value: < 2.2e-16
#split dataset 
split=sample.split(pay6$avg_tot_payment, SplitRatio=0.7)
trainingset=subset(pay6, split==TRUE)
testset=subset(pay6, split==FALSE)
SPLITlinmodelpay6<-lm(formula=avg_tot_payment ~avg_tot_charge, data=trainingset)

summary(SPLITlinmodelpay6)
## 
## Call:
## lm(formula = avg_tot_payment ~ avg_tot_charge, data = trainingset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62920 -0.08776 -0.02555  0.07287  0.54179 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.22187    0.41587   0.534    0.594    
## avg_tot_charge  0.94254    0.02443  38.582   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1995 on 216 degrees of freedom
## Multiple R-squared:  0.8733, Adjusted R-squared:  0.8727 
## F-statistic:  1489 on 1 and 216 DF,  p-value: < 2.2e-16
#We will split that data to test and train it. Here we did a split of 70/30.This will allow us to see the predict the actual payments based on the requested charges. 

ggplot()+
  geom_point(aes(x=trainingset$avg_tot_charge, y=trainingset$avg_tot_payment), color='red')+
  geom_line(aes(x=trainingset$avg_tot_charge, y=predict(SPLITlinmodelpay6, newdata=trainingset)),
            color='black')+
  ggtitle('Avg charge amt vs Avg tot payment') +
  xlab('avg tot charge') +ylab('avg total payment')



# see who got paid the most and least per year 
hi_lo<-pay6 %>%
  group_by(YEAR) %>%
  filter(percentpaid==min(percentpaid)| percentpaid==max(percentpaid)) %>%
  ungroup() %>%
  data.frame()



testingplot<-ggplot(pay6, aes(x = avg_tot_charge, y =avg_tot_payment, color= percentpaid
)) +
  geom_point() +
  geom_smooth(method = "lm", se=TRUE) +
  facet_wrap(~YEAR)+
  scale_color_viridis_c(option = "magma") +
  labs(x= "Avg. tot charge in LOG", y="Avg. tot payment in LOG",
       color="% Paid", title = "Avg charge amt vs the Avg tot payment")+
  theme_minimal()+theme(plot.title=element_text(hjust=0.5))+
  geom_point(data=hi_lo, aes(x=avg_tot_charge, y=avg_tot_payment, color=percentpaid),
             shape=21, size=5)+
  geom_text(data=hi_lo,aes(x = avg_tot_charge, y = avg_tot_payment, label = STATE),
            vjust=-1.0, size=5, color='black')
testingplot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
#try this with the trained data 

#redo the plot without years
hi_lotestnoyear<-testset %>%
  filter(percentpaid==min(percentpaid)| percentpaid==max(percentpaid)) %>%
  data.frame()
hi_lotestnoyear
ggplot() +
  geom_point(data = testset,
             aes(x = avg_tot_charge,
                 y = avg_tot_payment,
                 color = percentpaid)) +
  geom_line(data=trainingset, aes(x=avg_tot_charge, y=predict(SPLITlinmodelpay6, newdata=trainingset)), color='black')+
  scale_color_viridis_c(option = "magma") +
  labs(x= "Avg. tot charge in LOG", y="Avg. tot payment in LOG",
       color="% Paid", title = "Avg charge amt vs the Avg tot payment using our test set")+
  theme_minimal()+theme(plot.title=element_text(hjust=0.5))


#predict values for test set
predictpay <-predict(SPLITlinmodelpay6, newdata=testset)
#calc RMSE to evaluate performance
actuals <-testset$avg_tot_payment
#remember that it is log 

#pg 210 from applied stat

# need to use exp to see how it looks without log
rmseexp<-sqrt(mean((exp(actuals)-exp(predictpay))^2))
print(paste('RMSE:', rmseexp))
## [1] "RMSE: 4885853.65983674"
#to see the log
rmse<-sqrt(mean((actuals-predictpay)^2))
print(paste('RMSE:', rmse)) 
## [1] "RMSE: 0.230648677637856"
#--------------------------------------------------------------------------


#NEXT QUESTION

#--------------------------------------------------------------------------

# does PT, OT, and SLP therapy along with a specific condition have the ability to help predict the LOS at a state level?

# therapy minutes vs los
#removing the outliers and other extremes ex: 0 minutes

#making a new dataset to go off of

#start with tot_pt_mnts
Q1pt <- quantile(m6$TOT_PT_MNTS, 0.25)
Q3pt <- quantile(m6$TOT_PT_MNTS, 0.75)
IQR_valpt <- IQR(m6$TOT_PT_MNTS)

#ot
Q1ot <- quantile(m6$TOT_OT_MNTS, 0.25)
Q3ot <- quantile(m6$TOT_OT_MNTS, 0.75)
IQR_valot <- IQR(m6$TOT_OT_MNTS)

#slp
Q1slp <- quantile(m6$TOT_SLP_MNTS, 0.25)
Q3slp <- quantile(m6$TOT_SLP_MNTS, 0.75)
IQR_valslp <- IQR(m6$TOT_SLP_MNTS)


m6clean <- m6 %>%
  filter(TOT_PT_MNTS > Q1pt - 1.5 * IQR_valpt,
         TOT_PT_MNTS < Q3pt + 1.5 * IQR_valpt)%>%
  filter(TOT_OT_MNTS > Q1ot - 1.5 * IQR_valot,
         TOT_OT_MNTS < Q3ot + 1.5 * IQR_valot)%>%
  filter(TOT_SLP_MNTS > Q1slp - 1.5 * IQR_valslp,
         TOT_SLP_MNTS < Q3slp + 1.5 * IQR_valslp)%>%
  filter(TOT_PT_MNTS >0)%>%
  filter(TOT_SLP_MNTS >0) %>%
  filter(TOT_OT_MNTS >0)

colnames(m6)
##  [1] "YEAR"                             "YEAR_TYPE"                       
##  [3] "SMRY_CTGRY"                       "SRVC_CTGRY"                      
##  [5] "PRVDR_ID"                         "PRVDR_NAME"                      
##  [7] "PRVDR_CITY"                       "STATE"                           
##  [9] "PRVDR_ZIP"                        "BENE_DSTNCT_CNT"                 
## [11] "TOT_EPSD_STAY_CNT"                "TOT_SRVC_DAYS"                   
## [13] "TOT_CHRG_AMT"                     "TOT_ALOWD_AMT"                   
## [15] "TOT_MDCR_PYMT_AMT"                "TOT_MDCR_STDZD_PYMT_AMT"         
## [17] "TOT_OUTLIER_PYMT_AMT"             "BENE_DUAL_PCT"                   
## [19] "BENE_RRL_PCT"                     "BENE_AVG_AGE"                    
## [21] "BENE_MALE_PCT"                    "BENE_FEML_PCT"                   
## [23] "BENE_RACE_WHT_PCT"                "BENE_RACE_BLACK_PCT"             
## [25] "BENE_RACE_API_PCT"                "BENE_RACE_HSPNC_PCT"             
## [27] "BENE_RACE_NATIND_PCT"             "BENE_RACE_UNK_PCT"               
## [29] "BENE_RACE_OTHR_PCT"               "BENE_AVG_RISK_SCRE"              
## [31] "BENE_CC_BH_ADHD_OTHCD_V1_PCT"     "BENE_CC_BH_ALCOHOL_DRUG_V1_PCT"  
## [33] "BENE_CC_BH_ALZ_NONALZDEM_V2_PCT"  "BENE_CC_BH_ANXIETY_V1_PCT"       
## [35] "BENE_CC_BH_BIPOLAR_V1_PCT"        "BENE_CC_BH_DEPRESS_V1_PCT"       
## [37] "BENE_CC_BH_MOOD_V2_PCT"           "BENE_CC_BH_PD_V1_PCT"            
## [39] "BENE_CC_BH_PTSD_V1_PCT"           "BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT" 
## [41] "BENE_CC_BH_TOBACCO_V1_PCT"        "BENE_CC_PH_AFIB_V2_PCT"          
## [43] "BENE_CC_PH_ARTHRITIS_V2_PCT"      "BENE_CC_PH_ASTHMA_V2_PCT"        
## [45] "BENE_CC_PH_CANCER6_V2_PCT"        "BENE_CC_PH_CKD_V2_PCT"           
## [47] "BENE_CC_PH_COPD_V2_PCT"           "BENE_CC_PH_DIABETES_V2_PCT"      
## [49] "BENE_CC_PH_HF_NONIHD_V2_PCT"      "BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT"
## [51] "BENE_CC_PH_HYPERTENSION_V2_PCT"   "BENE_CC_PH_ISCHEMICHEART_V2_PCT" 
## [53] "BENE_CC_PH_OSTEOPOROSIS_V2_PCT"   "BENE_CC_PH_PARKINSON_V2_PCT"     
## [55] "BENE_CC_PH_STROKE_TIA_V2_PCT"     "PRMRY_DX_INFCTN_PCT"             
## [57] "PRMRY_DX_NEOBLD_PCT"              "PRMRY_DX_ENDONUTRMET_PCT"        
## [59] "PRMRY_DX_MNTBEHNEUDIS_PCT"        "PRMRY_DX_NERVSYSTM_PCT"          
## [61] "PRMRY_DX_ENTSYS_PCT"              "PRMRY_DX_CIRCSYSTM_PCT"          
## [63] "PRMRY_DX_RSPSYSTM_PCT"            "PRMRY_DX_DIGSYSTM_PCT"           
## [65] "PRMRY_DX_SKNMUSSYSTM_PCT"         "PRMRY_DX_GUSYSTM_PCT"            
## [67] "PRMRY_DX_PRGPERICONG_PCT"         "PRMRY_DX_SXILLDEF_PCT"           
## [69] "PRMRY_DX_INJPOIS_PCT"             "PRMRY_DX_HLTHSRV_PCT"            
## [71] "TOT_PT_MNTS"                      "INDVDL_PT_MNTS"                  
## [73] "CNCRNT_GRP_PT_MNTS"               "COTRT_PT_MNTS"                   
## [75] "TOT_OT_MNTS"                      "INDVDL_OT_MNTS"                  
## [77] "CNCRNT_GRP_OT_MNTS"               "COTRT_OT_MNTS"                   
## [79] "TOT_SLP_MNTS"                     "INDVDL_SLP_MNTS"                 
## [81] "CNCRNT_GRP_SLP_MNTS"              "COTRT_SLP_MNTS"                  
## [83] "los"                              "ptperep"                         
## [85] "slpperep"                         "otperep"
# MAKING A LOG TRANSFORMATION AND NEW DF
logm6<-m6clean %>%
  group_by(STATE, YEAR) %>%
  summarise(TOT_PT_MNTS=mean(TOT_PT_MNTS),
            TOT_SLP_MNTS=mean(TOT_SLP_MNTS),
            TOT_OT_MNTS=mean(TOT_OT_MNTS),
            los=mean(los),
            BENE_CC_BH_ADHD_OTHCD_V1_PCT= mean(BENE_CC_BH_ADHD_OTHCD_V1_PCT),
            BENE_CC_BH_ALCOHOL_DRUG_V1_PCT=mean(BENE_CC_BH_ALCOHOL_DRUG_V1_PCT),
            BENE_CC_BH_ALZ_NONALZDEM_V2_PCT=mean(BENE_CC_BH_ALZ_NONALZDEM_V2_PCT),
            BENE_CC_BH_ANXIETY_V1_PCT=mean(BENE_CC_BH_ANXIETY_V1_PCT), 
            BENE_CC_BH_BIPOLAR_V1_PCT=mean(BENE_CC_BH_BIPOLAR_V1_PCT),
            BENE_CC_BH_PD_V1_PCT=mean(BENE_CC_BH_PD_V1_PCT),
            BENE_CC_BH_PTSD_V1_PCT=mean(BENE_CC_BH_PTSD_V1_PCT),
            BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT=mean(BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT),
            BENE_CC_BH_TOBACCO_V1_PCT=mean(BENE_CC_BH_TOBACCO_V1_PCT),
            BENE_CC_PH_AFIB_V2_PCT=mean(BENE_CC_PH_AFIB_V2_PCT), 
            BENE_CC_PH_ARTHRITIS_V2_PCT=mean(BENE_CC_PH_ARTHRITIS_V2_PCT), 
            BENE_CC_PH_ASTHMA_V2_PCT=mean(BENE_CC_PH_ASTHMA_V2_PCT), 
            BENE_CC_PH_CANCER6_V2_PCT=mean(BENE_CC_PH_CANCER6_V2_PCT),
            BENE_CC_PH_CKD_V2_PCT=mean(BENE_CC_PH_CKD_V2_PCT),
            BENE_CC_PH_COPD_V2_PCT=mean(BENE_CC_PH_COPD_V2_PCT),
            BENE_CC_PH_DIABETES_V2_PCT=mean(BENE_CC_PH_DIABETES_V2_PCT),
            BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT=mean(BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT),
            BENE_CC_PH_HYPERTENSION_V2_PCT=mean(BENE_CC_PH_HYPERTENSION_V2_PCT), 
            BENE_CC_PH_OSTEOPOROSIS_V2_PCT=mean(BENE_CC_PH_OSTEOPOROSIS_V2_PCT), 
            BENE_CC_PH_PARKINSON_V2_PCT=mean(BENE_CC_PH_PARKINSON_V2_PCT),
            BENE_CC_PH_STROKE_TIA_V2_PCT= mean(BENE_CC_PH_STROKE_TIA_V2_PCT),
            .groups='drop') %>%
  mutate(ptlog =log(TOT_PT_MNTS),
         otlog=log(TOT_OT_MNTS),
         slplog=log(TOT_SLP_MNTS),
         loglos=log(los))



# make a plot with the ones we want 
dfm6clean<- logm6 %>% 
  group_by(STATE, YEAR) %>% 
  select(-TOT_PT_MNTS, -TOT_SLP_MNTS, -TOT_OT_MNTS, -los )

summary(dfm6clean)
##     STATE                YEAR      BENE_CC_BH_ADHD_OTHCD_V1_PCT
##  Length:307         Min.   :2018   Min.   :0.0000              
##  Class :character   1st Qu.:2019   1st Qu.:0.0000              
##  Mode  :character   Median :2021   Median :0.1250              
##                     Mean   :2021   Mean   :0.2130              
##                     3rd Qu.:2022   3rd Qu.:0.3333              
##                     Max.   :2023   Max.   :1.6667              
##  BENE_CC_BH_ALCOHOL_DRUG_V1_PCT BENE_CC_BH_ALZ_NONALZDEM_V2_PCT
##  Min.   : 0.75                  Min.   : 9.222                 
##  1st Qu.:11.42                  1st Qu.:18.354                 
##  Median :13.33                  Median :22.929                 
##  Mean   :13.54                  Mean   :22.078                 
##  3rd Qu.:16.00                  3rd Qu.:25.675                 
##  Max.   :25.00                  Max.   :33.071                 
##  BENE_CC_BH_ANXIETY_V1_PCT BENE_CC_BH_BIPOLAR_V1_PCT BENE_CC_BH_PD_V1_PCT
##  Min.   :10.75             Min.   : 0.000            Min.   :0.0000      
##  1st Qu.:36.10             1st Qu.: 3.000            1st Qu.:0.4286      
##  Median :41.17             Median : 4.625            Median :0.9388      
##  Mean   :40.32             Mean   : 4.876            Mean   :1.0639      
##  3rd Qu.:45.02             3rd Qu.: 6.500            3rd Qu.:1.3944      
##  Max.   :59.33             Max.   :17.600            Max.   :6.0000      
##  BENE_CC_BH_PTSD_V1_PCT BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT
##  Min.   :0.0000         Min.   :0.0000                 
##  1st Qu.:0.2624         1st Qu.:0.6667                 
##  Median :0.6000         Median :1.5000                 
##  Mean   :0.8419         Mean   :1.8253                 
##  3rd Qu.:1.1222         3rd Qu.:2.5000                 
##  Max.   :4.0000         Max.   :7.5455                 
##  BENE_CC_BH_TOBACCO_V1_PCT BENE_CC_PH_AFIB_V2_PCT BENE_CC_PH_ARTHRITIS_V2_PCT
##  Min.   : 0.7778           Min.   : 9.556         Min.   :40.67              
##  1st Qu.:15.5861           1st Qu.:35.299         1st Qu.:61.12              
##  Median :18.7436           Median :38.750         Median :66.27              
##  Mean   :18.5880           Mean   :37.623         Mean   :64.88              
##  3rd Qu.:21.9066           3rd Qu.:41.099         3rd Qu.:69.42              
##  Max.   :31.0000           Max.   :47.115         Max.   :76.00              
##  BENE_CC_PH_ASTHMA_V2_PCT BENE_CC_PH_CANCER6_V2_PCT BENE_CC_PH_CKD_V2_PCT
##  Min.   : 2.25            Min.   : 5.444            Min.   :17.50        
##  1st Qu.:10.43            1st Qu.:16.500            1st Qu.:38.76        
##  Median :12.56            Median :18.556            Median :44.00        
##  Mean   :12.29            Mean   :18.203            Mean   :42.60        
##  3rd Qu.:14.12            3rd Qu.:20.246            3rd Qu.:47.59        
##  Max.   :21.00            Max.   :24.400            Max.   :54.86        
##  BENE_CC_PH_COPD_V2_PCT BENE_CC_PH_DIABETES_V2_PCT
##  Min.   : 8.625         Min.   :24.83             
##  1st Qu.:27.354         1st Qu.:42.39             
##  Median :32.300         Median :47.00             
##  Mean   :31.995         Mean   :46.44             
##  3rd Qu.:37.578         3rd Qu.:51.37             
##  Max.   :51.000         Max.   :66.29             
##  BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT BENE_CC_PH_HYPERTENSION_V2_PCT
##  Min.   :32.20                    Min.   : 0.00                 
##  1st Qu.:69.02                    1st Qu.:35.34                 
##  Median :74.38                    Median :45.74                 
##  Mean   :73.65                    Mean   :49.69                 
##  3rd Qu.:79.60                    3rd Qu.:63.60                 
##  Max.   :95.00                    Max.   :96.00                 
##  BENE_CC_PH_OSTEOPOROSIS_V2_PCT BENE_CC_PH_PARKINSON_V2_PCT
##  Min.   : 9.00                  Min.   :0.000              
##  1st Qu.:18.06                  1st Qu.:2.367              
##  Median :21.00                  Median :3.600              
##  Mean   :20.87                  Mean   :3.604              
##  3rd Qu.:23.95                  3rd Qu.:4.787              
##  Max.   :34.78                  Max.   :8.333              
##  BENE_CC_PH_STROKE_TIA_V2_PCT     ptlog           otlog           slplog      
##  Min.   :24.33                Min.   :10.55   Min.   :10.53   Min.   : 9.343  
##  1st Qu.:41.19                1st Qu.:11.76   1st Qu.:11.71   1st Qu.:10.467  
##  Median :45.39                Median :12.01   Median :12.00   Median :10.647  
##  Mean   :45.53                Mean   :12.00   Mean   :11.98   Mean   :10.655  
##  3rd Qu.:49.32                3rd Qu.:12.25   3rd Qu.:12.25   3rd Qu.:10.856  
##  Max.   :64.88                Max.   :13.15   Max.   :12.98   Max.   :11.597  
##      loglos     
##  Min.   :2.313  
##  1st Qu.:2.501  
##  Median :2.539  
##  Mean   :2.539  
##  3rd Qu.:2.578  
##  Max.   :2.753
#make state factor
dfm6clean$STATE <-as.factor(dfm6clean$STATE)

#high or low correlation
cor(logm6[, c("ptlog", "otlog", "slplog")], method = "pearson")
##            ptlog     otlog    slplog
## ptlog  1.0000000 0.9890474 0.7584542
## otlog  0.9890474 1.0000000 0.7523555
## slplog 0.7584542 0.7523555 1.0000000
benecor<-cor(m6[,c('BENE_CC_BH_ADHD_OTHCD_V1_PCT', 'BENE_CC_BH_ALCOHOL_DRUG_V1_PCT',
                   'BENE_CC_BH_ALZ_NONALZDEM_V2_PCT', 'BENE_CC_BH_ANXIETY_V1_PCT', 'BENE_CC_BH_BIPOLAR_V1_PCT',
                   'BENE_CC_BH_DEPRESS_V1_PCT','BENE_CC_BH_MOOD_V2_PCT', 'BENE_CC_BH_PD_V1_PCT',
                   'BENE_CC_BH_PTSD_V1_PCT', 'BENE_CC_BH_SCHIZO_OTHPSY_V1_PCT', 'BENE_CC_BH_TOBACCO_V1_PCT',
                   'BENE_CC_PH_AFIB_V2_PCT', 'BENE_CC_PH_ARTHRITIS_V2_PCT', 
                   'BENE_CC_PH_ASTHMA_V2_PCT', 'BENE_CC_PH_CANCER6_V2_PCT', 'BENE_CC_PH_CKD_V2_PCT',
                   'BENE_CC_PH_COPD_V2_PCT', 'BENE_CC_PH_DIABETES_V2_PCT', 'BENE_CC_PH_HF_NONIHD_V2_PCT',
                   'BENE_CC_PH_HYPERLIPIDEMIA_V2_PCT', 'BENE_CC_PH_HYPERTENSION_V2_PCT', 
                   'BENE_CC_PH_ISCHEMICHEART_V2_PCT', 'BENE_CC_PH_OSTEOPOROSIS_V2_PCT', 
                   'BENE_CC_PH_PARKINSON_V2_PCT', 'BENE_CC_PH_STROKE_TIA_V2_PCT'
)], method='pearson')

#see correlation between the bene
threshold <- 0.6

which(abs(benecor) > threshold & abs(benecor) < 1, arr.ind = TRUE)
##                                 row col
## BENE_CC_BH_DEPRESS_V1_PCT         6   4
## BENE_CC_BH_MOOD_V2_PCT            7   4
## BENE_CC_BH_ANXIETY_V1_PCT         4   6
## BENE_CC_BH_MOOD_V2_PCT            7   6
## BENE_CC_BH_ANXIETY_V1_PCT         4   7
## BENE_CC_BH_DEPRESS_V1_PCT         6   7
## BENE_CC_PH_HF_NONIHD_V2_PCT      19  12
## BENE_CC_PH_ISCHEMICHEART_V2_PCT  22  12
## BENE_CC_PH_DIABETES_V2_PCT       18  16
## BENE_CC_PH_HF_NONIHD_V2_PCT      19  16
## BENE_CC_PH_ISCHEMICHEART_V2_PCT  22  16
## BENE_CC_PH_HF_NONIHD_V2_PCT      19  17
## BENE_CC_PH_ISCHEMICHEART_V2_PCT  22  17
## BENE_CC_PH_CKD_V2_PCT            16  18
## BENE_CC_PH_HF_NONIHD_V2_PCT      19  18
## BENE_CC_PH_AFIB_V2_PCT           12  19
## BENE_CC_PH_CKD_V2_PCT            16  19
## BENE_CC_PH_COPD_V2_PCT           17  19
## BENE_CC_PH_DIABETES_V2_PCT       18  19
## BENE_CC_PH_ISCHEMICHEART_V2_PCT  22  19
## BENE_CC_PH_AFIB_V2_PCT           12  22
## BENE_CC_PH_CKD_V2_PCT            16  22
## BENE_CC_PH_COPD_V2_PCT           17  22
## BENE_CC_PH_HF_NONIHD_V2_PCT      19  22
#doing pairs to understand relationship b/w variables 
#use panel to add regression line
pairs(logm6[, c('los', 'ptlog','otlog','slplog')], 
      panel=function(x,y){points(x,y) 
        abline(lm(y~x), col='red')},
      main= "pair plots of the LOS(length of stay) and the therapies( PT, OT, SLP")

ggpairs(logm6[, c('los', 'ptlog','otlog','slplog')], 
        panel=function(x,y){points(x,y) 
          abline(lm(y~x), col='red')},
        main= "pair plots of the LOS(length of stay) and the therapies( PT, OT, SLP")
## Warning: The `...` argument of `ggpais()` is deprecated as of GGally 2.3.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#  

#because of high correlation we will connect the therapies
dfm6clean$alltherapy<- dfm6clean$ptlog + dfm6clean$slplog + dfm6clean$otlog


# check to see if we have about 52 rows per yr
dfm6clean %>%
  group_by(YEAR) %>%
  summarise(n = n())
# make multiple linear regression for the 3 therapies w/ log transformation
nulltherapylog<- lm(loglos ~1, data=dfm6clean)


therapylogstep<- step(nulltherapylog, scope= loglos ~  YEAR +
                        BENE_CC_BH_ALZ_NONALZDEM_V2_PCT+
                        BENE_CC_PH_CANCER6_V2_PCT+
                        BENE_CC_PH_OSTEOPOROSIS_V2_PCT+
                        BENE_CC_PH_STROKE_TIA_V2_PCT+ alltherapy, direction='both')
## Start:  AIC=-1681.53
## loglos ~ 1
## 
##                                   Df Sum of Sq    RSS     AIC
## + BENE_CC_PH_STROKE_TIA_V2_PCT     1  0.171642 1.1035 -1723.9
## + BENE_CC_PH_OSTEOPOROSIS_V2_PCT   1  0.126345 1.1488 -1711.6
## + alltherapy                       1  0.083241 1.1919 -1700.2
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT  1  0.008750 1.2664 -1681.6
## <none>                                         1.2751 -1681.5
## + BENE_CC_PH_CANCER6_V2_PCT        1  0.003835 1.2713 -1680.5
## + YEAR                             1  0.001358 1.2738 -1679.9
## 
## Step:  AIC=-1723.91
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT
## 
##                                   Df Sum of Sq    RSS     AIC
## + BENE_CC_PH_OSTEOPOROSIS_V2_PCT   1  0.045486 1.0580 -1734.8
## + alltherapy                       1  0.038800 1.0647 -1732.9
## + YEAR                             1  0.020177 1.0833 -1727.6
## + BENE_CC_PH_CANCER6_V2_PCT        1  0.015689 1.0878 -1726.3
## <none>                                         1.1035 -1723.9
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT  1  0.002225 1.1013 -1722.5
## - BENE_CC_PH_STROKE_TIA_V2_PCT     1  0.171642 1.2751 -1681.5
## 
## Step:  AIC=-1734.84
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT
## 
##                                   Df Sum of Sq    RSS     AIC
## + YEAR                             1  0.025002 1.0330 -1740.2
## + alltherapy                       1  0.015857 1.0421 -1737.5
## <none>                                         1.0580 -1734.8
## + BENE_CC_PH_CANCER6_V2_PCT        1  0.001992 1.0560 -1733.4
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT  1  0.000001 1.0580 -1732.8
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT   1  0.045486 1.1035 -1723.9
## - BENE_CC_PH_STROKE_TIA_V2_PCT     1  0.090783 1.1488 -1711.6
## 
## Step:  AIC=-1740.18
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT + 
##     YEAR
## 
##                                   Df Sum of Sq    RSS     AIC
## + alltherapy                       1  0.009114 1.0239 -1740.9
## <none>                                         1.0330 -1740.2
## + BENE_CC_PH_CANCER6_V2_PCT        1  0.005336 1.0277 -1739.8
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT  1  0.000193 1.0328 -1738.2
## - YEAR                             1  0.025002 1.0580 -1734.8
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT   1  0.050311 1.0833 -1727.6
## - BENE_CC_PH_STROKE_TIA_V2_PCT     1  0.106850 1.1398 -1712.0
## 
## Step:  AIC=-1740.9
## loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT + 
##     YEAR + alltherapy
## 
##                                   Df Sum of Sq    RSS     AIC
## <none>                                         1.0239 -1740.9
## - alltherapy                       1  0.009114 1.0330 -1740.2
## + BENE_CC_BH_ALZ_NONALZDEM_V2_PCT  1  0.002357 1.0215 -1739.6
## + BENE_CC_PH_CANCER6_V2_PCT        1  0.000718 1.0232 -1739.1
## - YEAR                             1  0.018259 1.0421 -1737.5
## - BENE_CC_PH_OSTEOPOROSIS_V2_PCT   1  0.028355 1.0522 -1734.5
## - BENE_CC_PH_STROKE_TIA_V2_PCT     1  0.097496 1.1214 -1715.0
summary(therapylogstep)
## 
## Call:
## lm(formula = loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT + 
##     YEAR + alltherapy, data = dfm6clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.20366 -0.03648 -0.00334  0.03479  0.19471 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -6.9689244  4.1738755  -1.670  0.09602 .  
## BENE_CC_PH_STROKE_TIA_V2_PCT    0.0028763  0.0005364   5.363 1.63e-07 ***
## BENE_CC_PH_OSTEOPOROSIS_V2_PCT -0.0025195  0.0008712  -2.892  0.00411 ** 
## YEAR                            0.0047616  0.0020518   2.321  0.02097 *  
## alltherapy                     -0.0055173  0.0033651  -1.640  0.10213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05823 on 302 degrees of freedom
## Multiple R-squared:  0.197,  Adjusted R-squared:  0.1864 
## F-statistic: 18.53 on 4 and 302 DF,  p-value: 1.25e-13
#alternative model w no therapy
alternativemodel<- lm(loglos ~ BENE_CC_PH_STROKE_TIA_V2_PCT + BENE_CC_PH_OSTEOPOROSIS_V2_PCT + 
                        YEAR, data=dfm6clean )

anova(alternativemodel,therapylogstep , test='Chisq')
#check multicollinearity
vif(therapylogstep)
##   BENE_CC_PH_STROKE_TIA_V2_PCT BENE_CC_PH_OSTEOPOROSIS_V2_PCT 
##                       1.239843                       1.394908 
##                           YEAR                     alltherapy 
##                       1.115964                       1.287262
plot(therapylogstep)



library(performance)
check_model(therapylogstep)

#plot it using predict to predict the los vs actual los

dfm6clean$predict<-predict(therapylogstep)
ggplot(dfm6clean, aes(predict, loglos))+ 
  facet_wrap(~YEAR)+
  geom_point()+
  geom_smooth(method='lm', color='blue')+
  labs(
    x='Predicted LOS',
    y='Actual LOS',
    title = 'Comparing the predicted and actual LOS'
  )  +theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#plot it using predict to predict the los for the therapies 
dfm6clean$predict<-predict(therapylogstep)
ggplot(dfm6clean, aes(alltherapy, predict))+ 
  facet_wrap(~YEAR)+
  geom_point()+
  geom_smooth(method='lm', color='blue')+
  labs(
    x=' therapy',
    y='pred LOS',
    title = 'Comparing the predicted LOS with the therapy + condition'
  )  +theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#--------------------------------------------