This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#getting some packages
#install.packages("readxl")
library(readxl)
#library(xlsx)
#detach(package:readxl)
#devtools::install_github("hadley/readxl")
#getting the file
url='http://dashboard.healthit.gov/data/data/MU_REPORT.xlsx'
#a tempororary file; it has to have the right extension
file=tempfile(fileext = '.xlsx')
#to download the file, we have to use the right mode (otherwise the import will fail)
download.file(url,destfile = file,mode="wb")
#just checking what is the path to the file
file
## [1] "C:\\Users\\huserv\\AppData\\Local\\Temp\\1\\Rtmpw7mPFZ\\file23d8702b4700.xlsx"
#reading the actual data into data.frame
d=read_excel(file,sheet = 1)
#d=read.xlsx(file,sheetIndex = 1)
Now we have the data in variable d
#some quick lookups
nrow(d)
## [1] 1028716
table(d$`Provider Stage Number`)
##
## Stage 1 Stage 2
## 934142 94574
table(d$`Provider Type`)
##
## EP Hospital
## 965358 63358
table(d$`Product Setting`)
##
## Ambulatory Inpatient
## 932268 96448
#what vendors there are
vendors=as.data.frame(table(d$`Vendor Name`))
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
vendors %<>% arrange(desc(Freq))
#all vendor names (too long to output, commented out)
#unique(d$`Vendor Name`)
#http://stackoverflow.com/questions/3744178/ggplot2-sorting-a-plot
#plotting vendors
library(ggplot2)
#we have to order it for ggplot correctly
vendors <- transform(vendors, Var1=reorder(Var1, -Freq) )
ggplot(head(vendors,30),aes(y=Freq,x=Var1))+geom_point()+ geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#better way (more complex graph)
tta<-d %>% group_by(`Provider Type`,`Vendor Name`,`Attestation Year`) %>% summarise(count = n()) %>% arrange(-count) %>% filter(count>800) %>% rename(pt=`Provider Type`,year=`Attestation Year`)
tta
## Source: local data frame [142 x 4]
## Groups: pt, Vendor Name
##
## pt Vendor Name year count
## 1 EP ACOM Health 2014 937
## 2 EP Acumen Physician Solutions 2014 955
## 3 EP AllMeds Inc 2014 1340
## 4 EP Allscripts 2014 38033
## 5 EP Allscripts 2015 21387
## 6 EP Allscripts 2013 19476
## 7 EP Allscripts 2012 17215
## 8 EP Allscripts 2011 3732
## 9 EP Altos Solutions Inc 2014 925
## 10 EP AmazingCharts.com Inc 2014 1693
## .. .. ... ... ...
ggplot(tta,aes(y=count,x=`Vendor Name`))+geom_point()+ geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#color
ggplot(tta,aes(y=count,x=`Vendor Name`,color=pt))+geom_point()+ geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#facet
#backticks do not work for faceitn
p<-ggplot(tta,aes(y=count,x=`Vendor Name`)) +geom_point() + geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
p<-ggplot(tta,aes(y=count,x=`Vendor Name`)) +geom_point() + geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#p +facet_wrap(~pt )
#make the x axis different for each facet
p +facet_wrap(~pt,scales='free' )
#not so good
p + facet_grid(year~pt, scales = 'free')
#better for year will be color
p<-ggplot(tta,aes(y=count,x=`Vendor Name`,color=year)) +geom_point() + geom_point() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
p +facet_wrap(~pt,scales='free' )
#dplyr is a good data science library to use
library(dplyr)
#looking at one vendor data
d %>% filter(`Vendor Name`=='Allscripts')
## Source: local data frame [101,205 x 21]
##
## EHR Certification Number Vendor Name EHR Product CHP Id
## 1 A014E01O5FAMEAN Allscripts CHP-021796
## 2 30000004TQGMEAQ Allscripts CHP-008032
## 3 30000004TQGMEAQ Allscripts CHP-008032
## 4 1314E01P3YKUEAB Allscripts CHP-021796
## 5 A000001DYO2NEAT Allscripts CHP-009573
## 6 A000001DYO2NEAT Allscripts CHP-009573
## 7 30000001SVGHEAS Allscripts CHP-007427
## 8 1314E01OUSICEAV Allscripts CHP-023059
## 9 30000004QR1ZEAS Allscripts CHP-006453
## 10 30000004QR1ZEAS Allscripts CHP-006453
## .. ... ... ...
## Variables not shown: EHR Product Name (chr), EHR Product Version (chr),
## Product Classification (chr), Product Setting (chr), Product
## Certification Edition Yr (dbl), Attestation Month (dbl), Attestation
## Year (dbl), Business State/Territory (chr), Provider Type (chr),
## Specialty (chr), Program Year (dbl), Payment Year (dbl), Provider Stage
## Number (chr), Program Type (chr), Stage 2 Scheduled 2014 (dbl), MU
## Definition 2014 (dbl), Attestation_ID (dbl), NPI (chr)
#simpler view and only hospitals
oneVendor= d %>% filter(`Vendor Name`=='Allscripts' & `Provider Type`=='Hospital') %>% select(`Vendor Name`,`EHR Product Name`) %>% distinct()
#dplyr sometimes shortens the list to top 10, so we convert to traditional data.frame
as.data.frame(oneVendor)
## Vendor Name EHR Product Name
## 1 Allscripts Sunrise Acute Care
## 2 Allscripts Allscripts FollowMyHealth
## 3 Allscripts Sunrise Ambulatory Care
## 4 Allscripts Allscripts ED
## 5 Allscripts Sunrise Acute Care Module Set
## 6 Allscripts Sunrise Patient Portal and Sunrise Acute Care EHR
## 7 Allscripts Sunrise Emergency Care Module Set
## 8 Allscripts Sunrise Emergency Care
## 9 Allscripts dbMotion?
## 10 Allscripts dbMotion
#we see 138 products from this vendor
oneVendorData = d %>% filter(`Vendor Name`=='Allscripts' & `Provider Type`=='Hospital'& `EHR Product Name` =='Sunrise Acute Care') %>% arrange(`Attestation Year`)
oneVendorData
## Source: local data frame [686 x 21]
##
## EHR Certification Number Vendor Name EHR Product CHP Id
## 1 30000001SWXCEA0 Allscripts CHP-007983
## 2 30000001SWXCEA0 Allscripts CHP-007983
## 3 30000001SWXCEA0 Allscripts CHP-007983
## 4 30000001SWXCEA0 Allscripts CHP-007983
## 5 30000001SWXCEA0 Allscripts CHP-007983
## 6 30000001SWXCEA0 Allscripts CHP-007983
## 7 30000002EDF8EAA Allscripts CHP-007983
## 8 30000002EDF8EAA Allscripts CHP-007983
## 9 30000001SWXCEA0 Allscripts CHP-007983
## 10 30000001SWXCEA0 Allscripts CHP-007983
## .. ... ... ...
## Variables not shown: EHR Product Name (chr), EHR Product Version (chr),
## Product Classification (chr), Product Setting (chr), Product
## Certification Edition Yr (dbl), Attestation Month (dbl), Attestation
## Year (dbl), Business State/Territory (chr), Provider Type (chr),
## Specialty (chr), Program Year (dbl), Payment Year (dbl), Provider Stage
## Number (chr), Program Type (chr), Stage 2 Scheduled 2014 (dbl), MU
## Definition 2014 (dbl), Attestation_ID (dbl), NPI (chr)
library(gmodels)
CrossTable(d$`Payment Year`,d$`Provider Type`)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1028716
##
##
## | d$`Provider Type`
## d$`Payment Year` | EP | Hospital | Row Total |
## -----------------|-----------|-----------|-----------|
## 1 | 378702 | 18122 | 396824 |
## | 107.199 | 1633.338 | |
## | 0.954 | 0.046 | 0.386 |
## | 0.392 | 0.286 | |
## | 0.368 | 0.018 | |
## -----------------|-----------|-----------|-----------|
## 2 | 318385 | 22354 | 340739 |
## | 5.853 | 89.187 | |
## | 0.934 | 0.066 | 0.331 |
## | 0.330 | 0.353 | |
## | 0.309 | 0.022 | |
## -----------------|-----------|-----------|-----------|
## 3 | 212309 | 17262 | 229571 |
## | 45.268 | 689.735 | |
## | 0.925 | 0.075 | 0.223 |
## | 0.220 | 0.272 | |
## | 0.206 | 0.017 | |
## -----------------|-----------|-----------|-----------|
## 4 | 55962 | 5620 | 61582 |
## | 57.773 | 880.265 | |
## | 0.909 | 0.091 | 0.060 |
## | 0.058 | 0.089 | |
## | 0.054 | 0.005 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 965358 | 63358 | 1028716 |
## | 0.938 | 0.062 | |
## -----------------|-----------|-----------|-----------|
##
##
#nicer format
CrossTable(d$`Payment Year`,d$`Provider Type`,prop.chisq=F,format='SPSS',digits=1)
##
## Cell Contents
## |-------------------------|
## | Count |
## | Row Percent |
## | Column Percent |
## | Total Percent |
## |-------------------------|
##
## Total Observations in Table: 1028716
##
## | d$`Provider Type`
## d$`Payment Year` | EP | Hospital | Row Total |
## -----------------|-----------|-----------|-----------|
## 1 | 378702 | 18122 | 396824 |
## | 95.4% | 4.6% | 38.6% |
## | 39.2% | 28.6% | |
## | 36.8% | 1.8% | |
## -----------------|-----------|-----------|-----------|
## 2 | 318385 | 22354 | 340739 |
## | 93.4% | 6.6% | 33.1% |
## | 33.0% | 35.3% | |
## | 30.9% | 2.2% | |
## -----------------|-----------|-----------|-----------|
## 3 | 212309 | 17262 | 229571 |
## | 92.5% | 7.5% | 22.3% |
## | 22.0% | 27.2% | |
## | 20.6% | 1.7% | |
## -----------------|-----------|-----------|-----------|
## 4 | 55962 | 5620 | 61582 |
## | 90.9% | 9.1% | 6.0% |
## | 5.8% | 8.9% | |
## | 5.4% | 0.5% | |
## -----------------|-----------|-----------|-----------|
## Column Total | 965358 | 63358 | 1028716 |
## | 93.8% | 6.2% | |
## -----------------|-----------|-----------|-----------|
##
##