getwd()
## [1] "C:/Users/Jennifer/Documents/MC Data Science/Data Science 110 Writing and Comm/R Data Files and Markdown files"
https://data.cms.gov/Special-Programs-Initiatives-Speed-Adoption-of-Bes/CMS-Innovation-Center-Model-Participants/x8pc-u7ta
An analysis of CMS Innovation Center’s Model Participants.
For my job, I support one of the CMS Innovation Center’s voluntary payment model tests called Bundled Payments for Care Improvement (BPCI) Advanced. The Innovation Center tests models like this to evaluate incentives to improve quality care and lower healthcare costs for Medicare beneficiaries.
I will use functions from the dplyr package like filter(), group_by(), and summarize() to manipulate the data.
#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(ggplot2)
allparticipants<-read_csv("CMSdata.csv")
## Parsed with column specification:
## cols(
## `Name of Initiative` = col_character(),
## `Organization Name` = col_character(),
## Notes = col_character(),
## `Location 1` = col_character(),
## `Street Address` = col_character(),
## City = col_character(),
## State = col_character(),
## `State Based` = col_logical(),
## `Phase 2` = col_character(),
## Facebook = col_character(),
## Twitter = col_character(),
## Youtube = col_character(),
## Website = col_character(),
## Category = col_character(),
## `Unique ID` = col_double()
## )
dim(allparticipants)
## [1] 7886 15
This dataset has 7,886 records with 15 variables. R is returning a problem note for 979 parsing failures in the column called “Phase 2”. These rows are the only records with values (strings) in this field. R is trying to read this column as a logical value. I will use the mutate command with as.character to convert these values to characters to dismiss this problem.
allparticipants <- allparticipants %>%
mutate_if(is.logical, as.character) #this changes all the columns/fields in the dataset from logical (if applicable) to character
summary(allparticipants) #get a summary of the dataset
## Name of Initiative Organization Name Notes Location 1
## Length:7886 Length:7886 Length:7886 Length:7886
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Street Address City State State Based
## Length:7886 Length:7886 Length:7886 Length:7886
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Phase 2 Facebook Twitter Youtube
## Length:7886 Length:7886 Length:7886 Length:7886
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Website Category Unique ID
## Length:7886 Length:7886 Min. : 1
## Class :character Class :character 1st Qu.:1973
## Mode :character Mode :character Median :3970
## Mean :3976
## 3rd Qu.:5981
## Max. :7976
names(allparticipants) <- tolower(names(allparticipants)) #makes the column headings lowercase
names(allparticipants) <- gsub(" ","",names(allparticipants)) #removes the space in the headings, if applicable.
head(allparticipants)
## # A tibble: 6 x 15
## nameofinitiative organizationname notes location1 streetaddress city state
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Next Generation~ ProHealth Solut~ <NA> N17 W241~ N17 W24100 R~ Wauk~ WI
## 2 Comprehensive P~ Chelsea Family ~ Oper~ 14700 Ea~ 14700 East O~ Chel~ MI
## 3 Comprehensive P~ Northeast Arkan~ Oper~ 8170 US ~ 8170 US Hwy ~ Broo~ AR
## 4 Innovation Advi~ Anna Flattau MD~ <NA> 3335 Ste~ 3335 Steuben~ New ~ NY
## 5 Transforming Cl~ VHQC PTN ~ VA (37.5~ <NA> <NA> VA
## 6 Federally Quali~ Pine Bluff Medi~ <NA> 1101 S T~ 1101 S Tenne~ Pine~ AR
## # ... with 8 more variables: statebased <chr>, phase2 <chr>, facebook <chr>,
## # twitter <chr>, youtube <chr>, website <chr>, category <chr>, uniqueid <dbl>
I cleaned up the Notes column using OpenRefine. I extracted the number of clinical episodes each participant selected in the BPCI initiatives into a separate column. http://127.0.0.1:3333/project?project=2432498327141
I used OpenRefine to:
Pull the refined dataset into R.
allparticipants<-read_csv("CMSdata2.csv")
## Parsed with column specification:
## cols(
## Initiative = col_character(),
## `Organization Name` = col_character(),
## `Number of Clinical Episodes` = col_double(),
## `Location 1` = col_character(),
## `Street Address` = col_character(),
## City = col_character(),
## State = col_character(),
## `State Based` = col_logical(),
## `Phase 2` = col_logical(),
## Facebook = col_character(),
## Twitter = col_character(),
## Youtube = col_character(),
## Website = col_character(),
## Category = col_character(),
## `Unique ID` = col_double()
## )
## Warning: 979 parsing failures.
## row col expected actual file
## 1889 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity 'CMSdata2.csv'
## 1890 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity 'CMSdata2.csv'
## 1891 Phase 2 1/0/T/F/TRUE/FALSE Major joint replacement of the lower extremity 'CMSdata2.csv'
## 1892 Phase 2 1/0/T/F/TRUE/FALSE Major bowel procedure 'CMSdata2.csv'
## 1893 Phase 2 1/0/T/F/TRUE/FALSE Chronic obstructive pulmonary disease, bronchitis, asthma 'CMSdata2.csv'
## .... ....... .................. ......................................................... ..............
## See problems(...) for more details.
allparticipants <- allparticipants %>%
mutate_if(is.logical, as.character) #this changes all the columns/fields in the dataset from logical (if applicable) to character
names(allparticipants) <- tolower(names(allparticipants)) #makes the column headings lowercase
names(allparticipants) <- gsub(" ","",names(allparticipants)) #removes the space in the headings, if applicable.
head(allparticipants)
## # A tibble: 6 x 15
## initiative organizationname numberofclinica~ location1 streetaddress city
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Comprehen~ Brown Medicine ~ NA 375 Wamp~ 375 Wampanoa~ East~
## 2 Comprehen~ Brown Medicine ~ NA 375 Wamp~ 375 Wampanoa~ East~
## 3 Comprehen~ Brown Medicine ~ NA 375 Wamp~ 375 Wampanoa~ East~
## 4 Comprehen~ Brown Medicine ~ NA 909 Nort~ 909 North Ma~ Prov~
## 5 Comprehen~ Coastal Medical~ NA 11445 Wa~ 11445 Wampan~ East~
## 6 Comprehen~ LPG Primary Car~ NA 50 Memor~ 50 Memorial ~ Newp~
## # ... with 9 more variables: state <chr>, statebased <chr>, phase2 <chr>,
## # facebook <chr>, twitter <chr>, youtube <chr>, website <chr>,
## # category <chr>, uniqueid <dbl>
summary(allparticipants)
## initiative organizationname numberofclinicalepisodes
## Length:7886 Length:7886 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000
## Mode :character Mode :character Median : 4.000
## Mean : 7.023
## 3rd Qu.: 9.000
## Max. :47.000
## NA's :5612
## location1 streetaddress city state
## Length:7886 Length:7886 Length:7886 Length:7886
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## statebased phase2 facebook twitter
## Length:7886 Length:7886 Length:7886 Length:7886
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## youtube website category uniqueid
## Length:7886 Length:7886 Length:7886 Min. : 1
## Class :character Class :character Class :character 1st Qu.:1973
## Mode :character Mode :character Mode :character Median :3970
## Mean :3976
## 3rd Qu.:5981
## Max. :7976
##
#this says to group the df by the initiative column
#take the variable called numberofclinicalepisodes and apply the function mean and remove the nas
#I assigned that summary to something called bpcimodels.
bpcimodels <- summarise_at(group_by(allparticipants,initiative),vars(numberofclinicalepisodes),funs(mean(.,na.rm=TRUE))) %>%
filter(!is.na(numberofclinicalepisodes))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
#filter here says to take all rows that do not have an 'na' value in the numberofclinicalepisodes variable.
#that leaves me with the 4 BPCI models with their mean number of clinical episodes selected.
bpcimodels
## # A tibble: 4 x 2
## initiative numberofclinicalepisodes
## <chr> <dbl>
## 1 BPCI Advanced 5.95
## 2 BPCI Initiative: Model 2 5.43
## 3 BPCI Initiative: Model 3 10.5
## 4 BPCI Initiative: Model 4 6.5
#install.packages("plotly")
#install.packages("ggthemes")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggthemes)
bpciepisodes <- bpcimodels %>%
ggplot(aes(x=initiative, y=numberofclinicalepisodes,fill=initiative)) +
geom_bar(stat="identity") +
theme_stata() +
labs(x="Initiative", y="Average Number of Clinical Episodes Chosen", title="Average Number of Clinical Episodes within each Initiative")
ggplotly(bpciepisodes)
#install.packages("rcompanion")
library(rcompanion)
bpciparticipants <- allparticipants %>%
filter(!is.na(numberofclinicalepisodes))
Sum <- groupwiseMean(data=bpciparticipants, var="numberofclinicalepisodes", group="initiative",conf=0.95,digits=3)
Sum
## initiative n Mean Conf.level Trad.lower Trad.upper
## 1 BPCI Advanced 1295 5.95 0.95 5.63 6.27
## 2 BPCI Initiative: Model 2 402 5.43 0.95 4.83 6.03
## 3 BPCI Initiative: Model 3 575 10.50 0.95 9.65 11.40
## 4 BPCI Initiative: Model 4 2 6.50 0.95 -63.40 76.40
summary(Sum)
## initiative n Mean Conf.level
## Length:4 Min. : 2.0 Min. : 5.430 Min. :0.95
## Class :character 1st Qu.: 302.0 1st Qu.: 5.820 1st Qu.:0.95
## Mode :character Median : 488.5 Median : 6.225 Median :0.95
## Mean : 568.5 Mean : 7.095 Mean :0.95
## 3rd Qu.: 755.0 3rd Qu.: 7.500 3rd Qu.:0.95
## Max. :1295.0 Max. :10.500 Max. :0.95
## Trad.lower Trad.upper
## Min. :-63.400 Min. : 6.030
## 1st Qu.:-12.227 1st Qu.: 6.210
## Median : 5.230 Median : 8.835
## Mean :-10.822 Mean :25.025
## 3rd Qu.: 6.635 3rd Qu.:27.650
## Max. : 9.650 Max. :76.400
###trying to plot
Sum %>% ggplot(aes(x=initiative, y=Mean,fill=initiative)) +
geom_bar(stat="identity") +
geom_errorbar(aes(ymin=Trad.lower, ymax=Trad.upper,width=0.15))
#install.packages(data.table)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
#Sum %>% outlier.replace(Sum,"numberofclinicalepisodes", which(numberofclinicalepisodes < 3), newValue=NA)
#Sum %>% filter(!is.na(numberofclinicalepisodes))
statson3models <- Sum %>% filter(n >= 3) %>% ggplot(aes(x=initiative, y=Mean)) +
geom_bar(stat="identity", aes(fill=initiative)) +
geom_errorbar(aes(ymin=Trad.lower, ymax=Trad.upper,width=0.15))
ggplotly(statson3models)
#plotse <- allparticipants %>%
# filter(!is.na(numberofclinicalepisodes)) +
# ggplot(aes(x=initiative, y=numberofclinicalepisodes)) +
# geom_bar(stat="identity",aes(fill=initiative)) +
# geom_errorbar(aes(ymin=Trad.lower,ymax=Trad.upper,width=0.15))
#plotse
#bpcisum <- bpcimodels %>%
# ggplot(Sum,aes(x=initiative, y=numberofclinicalepisodes,fill=initiative)) +
# geom_bar(stat="identity",color="black") #+
#geom_errorbar(aes(ymin=mean-se,ymax=mean+se),width=0.2,size=0.7,color="black") +
#theme_stata() +
#labs(x="Initiative", y="Number of Clinical Episodes Chosen", title="Number of Clinical Episodes within each Initiative")
#ggplotly(bpcisum)