#### 1 Plot the 30-day mortality rates for heart attack

# Load the hospital outcome table; every column is read as character so that
# nothing is converted before it has been inspected.
outcome <- read.csv(
    file = "outcome-of-care-measures.csv",
    colClasses = "character"
)
# Show the first record to see which columns are available.
head(x = outcome, n = 1)
##   Provider.Number                    Hospital.Name              Address.1
## 1          010001 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE
##   Address.2 Address.3   City State ZIP.Code County.Name Phone.Number
## 1                     DOTHAN    AL    36301     HOUSTON   3347938701
##   Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                      14.3
##   Comparison.to.U.S..Rate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                No Different than U.S. National Rate
##   Lower.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                                                 12.1
##   Upper.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                                                 17.0
##   Number.of.Patients...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                                            666
##   Footnote...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack
## 1                                                                     
##   Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                       11.4
##   Comparison.to.U.S..Rate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                 No Different than U.S. National Rate
##   Lower.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                                                   9.5
##   Upper.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                                                  13.7
##   Number.of.Patients...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                                             741
##   Footnote...Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure
## 1                                                                      
##   Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                                   10.9
##   Comparison.to.U.S..Rate...Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                             No Different than U.S. National Rate
##   Lower.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                                                               8.6
##   Upper.Mortality.Estimate...Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                                                              13.7
##   Number.of.Patients...Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                                                         371
##   Footnote...Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia
## 1                                                                  
##   Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                                19.0
##   Comparison.to.U.S..Rate...Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                          No Different than U.S. National Rate
##   Lower.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                                                             16.6
##   Upper.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                                                             21.7
##   Number.of.Patients...Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                                                      728
##   Footnote...Hospital.30.Day.Readmission.Rates.from.Heart.Attack
## 1                                                               
##   Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                                 23.7
##   Comparison.to.U.S..Rate...Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                           No Different than U.S. National Rate
##   Lower.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                                                              21.3
##   Upper.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                                                              26.5
##   Number.of.Patients...Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                                                       891
##   Footnote...Hospital.30.Day.Readmission.Rates.from.Heart.Failure
## 1                                                                
##   Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1                                             17.1
##   Comparison.to.U.S..Rate...Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1                                       No Different than U.S. National Rate
##   Lower.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1                                                                          14.4
##   Upper.Readmission.Estimate...Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1                                                                          20.4
##   Number.of.Patients...Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1                                                                   400
##   Footnote...Hospital.30.Day.Readmission.Rates.from.Pneumonia
## 1
# Column 11 holds the 30-day heart-attack mortality rate; check its name.
names(outcome)[11]
## [1] "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack"
# Its values are still character strings at this point.
head(outcome[, 11])
## [1] "14.3"          "18.5"          "18.1"          "Not Available"
## [5] "Not Available" "Not Available"
# Map the "Not Available" placeholder to NA explicitly, so that as.numeric()
# only warns when the column contains some other, unexpected, non-numeric text.
outcome[outcome[, 11] %in% "Not Available", 11] <- NA
outcome[, 11] <- as.numeric(outcome[, 11])
# Give the plot a readable title and axis label instead of the default
# "Histogram of outcome[, 11]".
hist(
    outcome[, 11],
    main = "Hospital 30-Day Death (Mortality) Rates from Heart Attack",
    xlab = "30-day death rate"
)

#### 2 Finding the best hospital in a state

library(stringr)
#' Find the hospital with the lowest 30-day mortality rate in a state
#'
#' Reads "outcome-of-care-measures.csv" from the working directory, keeps the
#' hospitals located in `state`, and returns the one with the lowest 30-day
#' death rate for `outcome_item`. Hospitals without a numeric rate (the file
#' marks them as "Not Available") are ignored. Ties are broken by hospital
#' name in ascending order.
#'
#' Invalid arguments raise an error ("invalid outcome" or "invalid state")
#' instead of printing a message and returning NULL.
#'
#' @param state State abbreviation as it appears in the `State` column.
#' @param outcome_item One of "heart attack", "heart failure", "pneumonia".
#' @return A character scalar with the hospital name, or `NA_character_` when
#'   no hospital in the state reports a numeric rate for the outcome.
best <- function(state = "TX", outcome_item = "heart attack") {
    # Exact column name for each supported outcome; looking the column up by
    # name avoids regex matching and positional column selection.
    rate_columns <- c(
        "heart attack" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
        "heart failure" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
        "pneumonia" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
    )

    # Validate the outcome before touching the file: it is the cheap check.
    if (length(outcome_item) != 1L || !outcome_item %in% names(rate_columns)) {
        stop("invalid outcome", call. = FALSE)
    }

    outcome <- read.csv("outcome-of-care-measures.csv",
                        colClasses = "character")

    if (length(state) != 1L || !state %in% outcome$State) {
        stop("invalid state", call. = FALSE)
    }

    # which() drops NA comparisons, so no all-NA rows are created.
    rows <- which(outcome$State == state)
    hospital <- outcome$Hospital.Name[rows]
    rate_text <- outcome[[rate_columns[[outcome_item]]]][rows]

    # Convert the placeholder explicitly so as.numeric() stays silent unless
    # the column holds some other unexpected non-numeric value.
    rate_text[rate_text %in% "Not Available"] <- NA
    rate <- as.numeric(rate_text)

    usable <- !is.na(rate) & !is.na(hospital)
    hospital <- hospital[usable]
    rate <- rate[usable]

    # Lowest rate first; the hospital name breaks ties.
    # Indexing an empty vector with [1] yields NA_character_.
    hospital[order(rate, hospital)][1]
}
# Example calls; the `##` lines show the value printed for each call.
# data <- best("MD", "pneumonia")
best(state = "SC", outcome_item = "heart attack")
## [1] "MUSC MEDICAL CENTER"
best(state = "NY", outcome_item = "pneumonia")
## [1] "MAIMONIDES MEDICAL CENTER"
best(state = "AK", outcome_item = "pneumonia")
## [1] "YUKON KUSKOKWIM DELTA REG HOSPITAL"

#### 3 Ranking hospitals by outcome in a state

#' Return the hospital holding a given mortality rank within a state
#'
#' Reads "outcome-of-care-measures.csv" from the working directory and ranks
#' the hospitals of `state` by their 30-day death rate for `outcome_item`,
#' lowest rate first. Ties are broken by hospital name in ascending order.
#' Hospitals without a numeric rate (marked "Not Available" in the file) are
#' left out of the ranking.
#'
#' Invalid arguments raise an error ("invalid outcome" or "invalid state")
#' instead of printing a message and returning NULL.
#'
#' @param state State abbreviation as it appears in the `State` column.
#' @param outcome_item One of "heart attack", "heart failure", "pneumonia".
#' @param num "best", "worst", or the numeric rank to look up.
#' @return The hospital name at that rank. `NA_character_` when the rank
#'   exceeds the number of ranked hospitals or the state has no ranked
#'   hospital at all.
rankhospital <- function(state, outcome_item, num = "best") {
    # Exact column name for each supported outcome; looking the column up by
    # name avoids regex matching and positional column selection.
    rate_columns <- c(
        "heart attack" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
        "heart failure" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
        "pneumonia" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
    )

    if (length(outcome_item) != 1L || !outcome_item %in% names(rate_columns)) {
        stop("invalid outcome", call. = FALSE)
    }

    outcome <- read.csv("outcome-of-care-measures.csv",
                        colClasses = "character")

    if (length(state) != 1L || !state %in% outcome$State) {
        stop("invalid state", call. = FALSE)
    }

    # which() drops NA comparisons, so no all-NA rows are created.
    rows <- which(outcome$State == state)
    hospital <- outcome$Hospital.Name[rows]
    rate_text <- outcome[[rate_columns[[outcome_item]]]][rows]

    # Convert the placeholder explicitly so as.numeric() stays silent unless
    # the column holds some other unexpected non-numeric value.
    rate_text[rate_text %in% "Not Available"] <- NA
    rate <- as.numeric(rate_text)

    usable <- !is.na(rate) & !is.na(hospital)
    hospital <- hospital[usable]
    rate <- rate[usable]
    ranked <- hospital[order(rate, hospital)]

    # A state without any numeric rate has nothing to rank. The previous
    # implementation failed here because 1:nrow() produced c(1, 0).
    if (length(ranked) == 0L) {
        return(NA_character_)
    }

    if (identical(num, "best")) {
        ranked[1L]
    } else if (identical(num, "worst")) {
        ranked[length(ranked)]
    } else {
        # Indexing past the end yields NA, which is the wanted result for a
        # rank larger than the number of hospitals.
        ranked[num]
    }
}
# Example calls; the `##` lines show the value printed for each call.
# rankhospital("MN", "heart attack", 5000)
rankhospital(state = "NC", outcome_item = "heart attack", num = "worst")
## [1] "WAYNE MEMORIAL HOSPITAL"
rankhospital(state = "WA", outcome_item = "heart attack", num = 7)
## [1] "YAKIMA VALLEY MEMORIAL HOSPITAL"
rankhospital(state = "TX", outcome_item = "pneumonia", num = 10)
## [1] "SETON SMITHVILLE REGIONAL HOSPITAL"
rankhospital(state = "NY", outcome_item = "heart attack", num = 7)
## [1] "BELLEVUE HOSPITAL CENTER"

#### 4 Ranking hospitals in all states

library(stringr)
library(dplyr)
#' Return, for every state, the hospital holding a given mortality rank
#'
#' Reads "outcome-of-care-measures.csv" from the working directory and ranks
#' the hospitals of each state by their 30-day death rate for `outcome_item`,
#' lowest rate first. Ties are broken by hospital name in ascending order.
#' Hospitals without a numeric rate (marked "Not Available" in the file) are
#' left out of the ranking.
#'
#' Every state found in the file gets exactly one row in the result. A state
#' with fewer ranked hospitals than `num` is reported with an NA hospital
#' instead of being dropped. Rows are ordered by state.
#'
#' An unsupported `outcome_item` raises the error "invalid outcome".
#'
#' @param outcome_item One of "heart attack", "heart failure", "pneumonia".
#' @param num "best", "worst", or the numeric rank to look up in each state.
#' @return A data frame with character columns `hospital` and `state`.
rankall <- function(outcome_item, num = "best") {
    # Exact column name for each supported outcome; looking the column up by
    # name avoids regex matching and positional column selection.
    rate_columns <- c(
        "heart attack" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Attack",
        "heart failure" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Heart.Failure",
        "pneumonia" =
            "Hospital.30.Day.Death..Mortality..Rates.from.Pneumonia"
    )

    if (length(outcome_item) != 1L || !outcome_item %in% names(rate_columns)) {
        stop("invalid outcome", call. = FALSE)
    }

    outcome <- read.csv("outcome-of-care-measures.csv",
                        colClasses = "character")

    hospital <- outcome$Hospital.Name
    state <- outcome$State
    rate_text <- outcome[[rate_columns[[outcome_item]]]]

    # Convert the placeholder explicitly so as.numeric() stays silent unless
    # the column holds some other unexpected non-numeric value.
    rate_text[rate_text %in% "Not Available"] <- NA
    rate <- as.numeric(rate_text)

    # The state list is taken before filtering, so that states without any
    # numeric rate still appear in the result.
    all_states <- sort(unique(state[!is.na(state)]))

    usable <- !is.na(rate) & !is.na(hospital) & !is.na(state)
    hospital <- hospital[usable]
    state <- state[usable]
    rate <- rate[usable]

    # Sort once by rate, then name; split() keeps that order inside each
    # state, so position i of a group is the hospital ranked i there.
    by_rank <- order(rate, hospital)
    ranked_by_state <- split(
        hospital[by_rank],
        factor(state[by_rank], levels = all_states)
    )

    # Pick the requested rank from one state's ranked hospital names; any
    # rank that does not exist in that state yields NA.
    pick_rank <- function(ranked) {
        count <- length(ranked)
        position <- if (identical(num, "best")) {
            1L
        } else if (identical(num, "worst")) {
            count
        } else {
            num
        }
        is_valid <- is.numeric(position) && length(position) == 1L &&
            !is.na(position) && position %% 1 == 0 &&
            position >= 1 && position <= count
        if (!is_valid) {
            return(NA_character_)
        }
        ranked[position]
    }

    data.frame(
        hospital = vapply(ranked_by_state, pick_rank, character(1),
                          USE.NAMES = FALSE),
        state = all_states,
        stringsAsFactors = FALSE
    )
}
# Example calls; the `##` lines show the value printed for each lookup.
# data <- rankall("heart attack", 20)
r <- rankall(outcome_item = "heart attack", num = 4)
as.character(r$hospital[which(r$state == "HI")])
## [1] "CASTLE MEDICAL CENTER"
r <- rankall(outcome_item = "heart failure", num = 10)
as.character(r$hospital[which(r$state == "NV")])
## [1] "RENOWN SOUTH MEADOWS MEDICAL CENTER"
r <- rankall(outcome_item = "pneumonia", num = "worst")
# r
as.character(r$hospital[which(r$state == "NJ")])
## [1] "BERGEN REGIONAL MEDICAL CENTER"

# 参考文献 (References)
#
# 1. R语言实例-数据过滤grep正则表达式
#
# 2. R 语言中,数据框依据不同列进行排序
#
# 3. R语言rank函数详细解析
#
# 4. R语言扩展包dplyr——数据清洗和整理
#
# 5. R排序sort、order、rank、arrange