getwd()
## [1] "C:/Users/Jennifer/Documents/MC Data Science/Data Science 110 Writing and Comm/R Data Files and Markdown files"

Centers for Medicare & Medicaid Services (CMS) Center for Medicare and Medicaid Innovation (The Innovation Center) dataset

https://data.cms.gov/Special-Programs-Initiatives-Speed-Adoption-of-Bes/CMS-Innovation-Center-Model-Participants/x8pc-u7ta
An analysis of CMS Innovation Center’s Model Participants.
For my job, I support one of the CMS Innovation Center’s voluntary payment model tests called Bundled Payments for Care Improvement (BPCI) Advanced. The Innovation Center tests models like this to evaluate incentives to improve quality care and lower healthcare costs for Medicare beneficiaries.

Install packages and dataset.

Before importing the dataset into R, I did some cleaning in Excel. For example:

  • I deleted columns with all blanks.
    • Phase 1
    • MSA Name
    • Map Display
  • I edited the record for Unique ID #7437 (Baptist Cancer Center - Tipton) to remove artifacts
  • I renamed the dataset to “CMSdata”

I will use functions from the dplyr package like filter(), group_by(), and summarize() to manipulate the data.

#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(ggplot2)
allparticipants<-read_csv("CMSdata.csv")
## Parsed with column specification:
## cols(
##   `Name of Initiative` = col_character(),
##   `Organization Name` = col_character(),
##   Notes = col_character(),
##   `Location 1` = col_character(),
##   `Street Address` = col_character(),
##   City = col_character(),
##   State = col_character(),
##   `State Based` = col_logical(),
##   `Phase 2` = col_character(),
##   Facebook = col_character(),
##   Twitter = col_character(),
##   Youtube = col_character(),
##   Website = col_character(),
##   Category = col_character(),
##   `Unique ID` = col_double()
## )
dim(allparticipants)
## [1] 7886   15

This dataset has 7,886 records with 15 variables. R is returning a problem note for 979 parsing failures in the column called “Phase 2”. These rows are the only records with values (strings) in this field. R is trying to read this column as a logical value. I will use the mutate command with as.character to convert these values to characters to dismiss this problem.

allparticipants <- allparticipants %>%
  mutate_if(is.logical, as.character) #this changes all the columns/fields in the dataset from logical (if applicable) to character
summary(allparticipants) #get a summary of the dataset
##  Name of Initiative Organization Name     Notes            Location 1       
##  Length:7886        Length:7886        Length:7886        Length:7886       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Street Address         City              State           State Based       
##  Length:7886        Length:7886        Length:7886        Length:7886       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Phase 2            Facebook           Twitter            Youtube         
##  Length:7886        Length:7886        Length:7886        Length:7886       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Website            Category           Unique ID   
##  Length:7886        Length:7886        Min.   :   1  
##  Class :character   Class :character   1st Qu.:1973  
##  Mode  :character   Mode  :character   Median :3970  
##                                        Mean   :3976  
##                                        3rd Qu.:5981  
##                                        Max.   :7976

Make all headings (column names) lowercase. Remove the spaces between words in the headings. Then look at the data with head command.

names(allparticipants) <- tolower(names(allparticipants)) #makes the column headings lowercase
names(allparticipants) <- gsub(" ","",names(allparticipants)) #removes the space in the headings, if applicable.
head(allparticipants)
## # A tibble: 6 x 15
##   nameofinitiative organizationname notes location1 streetaddress city  state
##   <chr>            <chr>            <chr> <chr>     <chr>         <chr> <chr>
## 1 Next Generation~ ProHealth Solut~ <NA>  N17 W241~ N17 W24100 R~ Wauk~ WI   
## 2 Comprehensive P~ Chelsea Family ~ Oper~ 14700 Ea~ 14700 East O~ Chel~ MI   
## 3 Comprehensive P~ Northeast Arkan~ Oper~ 8170 US ~ 8170 US Hwy ~ Broo~ AR   
## 4 Innovation Advi~ Anna Flattau MD~ <NA>  3335 Ste~ 3335 Steuben~ New ~ NY   
## 5 Transforming Cl~ VHQC             PTN ~ VA (37.5~ <NA>          <NA>  VA   
## 6 Federally Quali~ Pine Bluff Medi~ <NA>  1101 S T~ 1101 S Tenne~ Pine~ AR   
## # ... with 8 more variables: statebased <chr>, phase2 <chr>, facebook <chr>,
## #   twitter <chr>, youtube <chr>, website <chr>, category <chr>, uniqueid <dbl>

I cleaned up the Notes column using OpenRefine. I extracted the number of clinical episodes each participant selected in the BPCI initiatives into a separate column. http://127.0.0.1:3333/project?project=2432498327141

I used OpenRefine to:

  • Filter to select BPCI Initiatives
    • On Notes column - Edit column - Split column Notes into several columns
    • By separator
    • Separator :
    • Guess cell Type - checked
  • On Notes 2 column - Edit column - Split column Notes 2 into several columns
    • By separator
    • Separator //
  • Delete Notes 3 column
  • On Notes 2 column - Edit column - Split column Notes 2 several columns
    • By separator
    • Separator //
  • Delete Notes 2 column and Notes 2 2 column
  • Rename the column “Number of Clinical Episodes”
  • Remove the BPCI Initiatives filter.
  • Export the dataset and name it as “CMSdata2”

Pull the refined dataset into R.

allparticipants<-read_csv("CMSdata2.csv")
## Parsed with column specification:
## cols(
##   Initiative = col_character(),
##   `Organization Name` = col_character(),
##   `Number of Clinical Episodes` = col_double(),
##   `Location 1` = col_character(),
##   `Street Address` = col_character(),
##   City = col_character(),
##   State = col_character(),
##   `State Based` = col_logical(),
##   `Phase 2` = col_logical(),
##   Facebook = col_character(),
##   Twitter = col_character(),
##   Youtube = col_character(),
##   Website = col_character(),
##   Category = col_character(),
##   `Unique ID` = col_double()
## )
## Warning: 979 parsing failures.
##  row     col           expected                                                    actual           file
## 1889 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity            'CMSdata2.csv'
## 1890 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity            'CMSdata2.csv'
## 1891 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity            'CMSdata2.csv'
## 1892 Phase 2 1/0/T/F/TRUE/FALSE Major bowel procedure                                     'CMSdata2.csv'
## 1893 Phase 2 1/0/T/F/TRUE/FALSE Chronic obstructive pulmonary disease, bronchitis, asthma 'CMSdata2.csv'
## .... ....... .................. ......................................................... ..............
## See problems(...) for more details.
allparticipants <- allparticipants %>%
  mutate_if(is.logical, as.character) #this changes all the columns/fields in the dataset from logical (if applicable) to character

Adjust the variable names again.

names(allparticipants) <- tolower(names(allparticipants)) #makes the column headings lowercase
names(allparticipants) <- gsub(" ","",names(allparticipants)) #removes the space in the headings, if applicable.
head(allparticipants)
## # A tibble: 6 x 15
##   initiative organizationname numberofclinica~ location1 streetaddress city 
##   <chr>      <chr>                       <dbl> <chr>     <chr>         <chr>
## 1 Comprehen~ Brown Medicine ~               NA 375 Wamp~ 375 Wampanoa~ East~
## 2 Comprehen~ Brown Medicine ~               NA 375 Wamp~ 375 Wampanoa~ East~
## 3 Comprehen~ Brown Medicine ~               NA 375 Wamp~ 375 Wampanoa~ East~
## 4 Comprehen~ Brown Medicine ~               NA 909 Nort~ 909 North Ma~ Prov~
## 5 Comprehen~ Coastal Medical~               NA 11445 Wa~ 11445 Wampan~ East~
## 6 Comprehen~ LPG Primary Car~               NA 50 Memor~ 50 Memorial ~ Newp~
## # ... with 9 more variables: state <chr>, statebased <chr>, phase2 <chr>,
## #   facebook <chr>, twitter <chr>, youtube <chr>, website <chr>,
## #   category <chr>, uniqueid <dbl>

Look at the summary of the dataset again.

summary(allparticipants)
##   initiative        organizationname   numberofclinicalepisodes
##  Length:7886        Length:7886        Min.   : 1.000          
##  Class :character   Class :character   1st Qu.: 2.000          
##  Mode  :character   Mode  :character   Median : 4.000          
##                                        Mean   : 7.023          
##                                        3rd Qu.: 9.000          
##                                        Max.   :47.000          
##                                        NA's   :5612            
##   location1         streetaddress          city              state          
##  Length:7886        Length:7886        Length:7886        Length:7886       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   statebased           phase2            facebook           twitter         
##  Length:7886        Length:7886        Length:7886        Length:7886       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    youtube            website            category            uniqueid   
##  Length:7886        Length:7886        Length:7886        Min.   :   1  
##  Class :character   Class :character   Class :character   1st Qu.:1973  
##  Mode  :character   Mode  :character   Mode  :character   Median :3970  
##                                                           Mean   :3976  
##                                                           3rd Qu.:5981  
##                                                           Max.   :7976  
## 

Now I have a quantitative variable for the number of clinical episodes.

Group the dataframe by multiple columns with mean number of clinical episodes.

I want to see the mean number of clinical episodes for each of the BPCI models.

#this says to group the df by the initiative column
#take the variable called numberofclinicalepisodes and apply the function mean and remove the nas
#I assigned that summary to something called bpcimodels.
bpcimodels <- summarise_at(group_by(allparticipants,initiative),vars(numberofclinicalepisodes),funs(mean(.,na.rm=TRUE))) %>%
  filter(!is.na(numberofclinicalepisodes)) 
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
#filter here says to take all rows that do not have an 'na' value in the numberofclinicalepisodes variable.
#that leaves me with the 4 BPCI models with their mean number of clinical episodes selected.
bpcimodels
## # A tibble: 4 x 2
##   initiative               numberofclinicalepisodes
##   <chr>                                       <dbl>
## 1 BPCI Advanced                                5.95
## 2 BPCI Initiative: Model 2                     5.43
## 3 BPCI Initiative: Model 3                    10.5 
## 4 BPCI Initiative: Model 4                     6.5

Let’s try a plot

I’ll look at the average number of clinical episodes that participants chose within each iteration of the BPCI Models.

#install.packages("plotly")
#install.packages("ggthemes")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
bpciepisodes <- bpcimodels %>%
  ggplot(aes(x=initiative, y=numberofclinicalepisodes,fill=initiative)) +
  geom_bar(stat="identity") +
  theme_stata() +
  labs(x="Initiative", y="Average Number of Clinical Episodes Chosen", title="Average Number of Clinical Episodes within each Initiative")
ggplotly(bpciepisodes)

Let’s try another plot

First I will get some statistics on the variation.

#install.packages("rcompanion")
library(rcompanion)
bpciparticipants <- allparticipants %>%
  filter(!is.na(numberofclinicalepisodes))
Sum <- groupwiseMean(data=bpciparticipants, var="numberofclinicalepisodes", group="initiative",conf=0.95,digits=3)
Sum
##                 initiative    n  Mean Conf.level Trad.lower Trad.upper
## 1            BPCI Advanced 1295  5.95       0.95       5.63       6.27
## 2 BPCI Initiative: Model 2  402  5.43       0.95       4.83       6.03
## 3 BPCI Initiative: Model 3  575 10.50       0.95       9.65      11.40
## 4 BPCI Initiative: Model 4    2  6.50       0.95     -63.40      76.40
summary(Sum)
##   initiative              n               Mean          Conf.level  
##  Length:4           Min.   :   2.0   Min.   : 5.430   Min.   :0.95  
##  Class :character   1st Qu.: 302.0   1st Qu.: 5.820   1st Qu.:0.95  
##  Mode  :character   Median : 488.5   Median : 6.225   Median :0.95  
##                     Mean   : 568.5   Mean   : 7.095   Mean   :0.95  
##                     3rd Qu.: 755.0   3rd Qu.: 7.500   3rd Qu.:0.95  
##                     Max.   :1295.0   Max.   :10.500   Max.   :0.95  
##    Trad.lower        Trad.upper    
##  Min.   :-63.400   Min.   : 6.030  
##  1st Qu.:-12.227   1st Qu.: 6.210  
##  Median :  5.230   Median : 8.835  
##  Mean   :-10.822   Mean   :25.025  
##  3rd Qu.:  6.635   3rd Qu.:27.650  
##  Max.   :  9.650   Max.   :76.400

###trying to plot

Sum %>% ggplot(aes(x=initiative, y=Mean,fill=initiative)) +
  geom_bar(stat="identity") +
  geom_errorbar(aes(ymin=Trad.lower, ymax=Trad.upper,width=0.15))

I need to remove BPCI Initiative: Model 4 since it is skewing everything.

#install.packages(data.table)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
#Sum %>% outlier.replace(Sum,"numberofclinicalepisodes", which(numberofclinicalepisodes < 3), newValue=NA) 
#Sum %>% filter(!is.na(numberofclinicalepisodes))
statson3models <- Sum %>% filter(n >= 3) %>% ggplot(aes(x=initiative, y=Mean)) +
  geom_bar(stat="identity", aes(fill=initiative)) +
  geom_errorbar(aes(ymin=Trad.lower, ymax=Trad.upper,width=0.15))
ggplotly(statson3models)

Let’s make a plot

#plotse <- allparticipants %>% 
 # filter(!is.na(numberofclinicalepisodes)) +
#  ggplot(aes(x=initiative, y=numberofclinicalepisodes)) +
#  geom_bar(stat="identity",aes(fill=initiative)) +
#  geom_errorbar(aes(ymin=Trad.lower,ymax=Trad.upper,width=0.15))
#plotse
#bpcisum <- bpcimodels %>%
#  ggplot(Sum,aes(x=initiative, y=numberofclinicalepisodes,fill=initiative)) +
#  geom_bar(stat="identity",color="black") #+
  #geom_errorbar(aes(ymin=mean-se,ymax=mean+se),width=0.2,size=0.7,color="black") +
  #theme_stata() +
  #labs(x="Initiative", y="Number of Clinical Episodes Chosen", title="Number of Clinical Episodes within each Initiative")
#ggplotly(bpcisum)