#
# Statistics Project 3 - Evaluate readmissions rates report 
#

# use some libraries etc
library(stats)
library(lsr)
library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# https://rstudio-pubs-static.s3.amazonaws.com/23230_b2e9b87251a2488da0fba51325e26040.html
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))

setwd("~/apps/sliderule/statistics_project3/statistics project 3")
hospital <- read.csv("clean_hospital_read_df.csv")

# add scatterplot, graphs to explore data (outliers, missing values, big relationships)
plot(hospital$Number.of.Discharges, hospital$Number.of.Readmissions)

# try subsetting by hospital size (based on suggested feature, #discharges)
small <- subset(hospital, hospital$Number.of.Discharges < 100)
large <- subset(hospital, hospital$Number.of.Discharges > 1000)
med <- subset(hospital, hospital$Number.of.Discharges < 1000 & hospital$Number.of.Discharges > 100 )
xlarge <- subset(hospital, hospital$Number.of.Discharges > 2000)

# BASIC THEME
theme.chart <- 
  theme(legend.position = "none") +
  theme(plot.title = element_text(size=26, family="Trebuchet MS", face="bold", hjust=0, color="#666666")) +
  theme(axis.title = element_text(size=18, family="Trebuchet MS", face="bold", color="#666666")) +
  theme(axis.title.y = element_text(angle=0)) 

# SCATTERPLOT THEME
theme.chart_SCATTER <- theme.chart +
  theme(axis.title.x = element_text(hjust=0, vjust=-.5))

# HISTOGRAM THEME
theme.chart_HIST <- theme.chart +
  theme(axis.title.x = element_text(hjust=0, vjust=-.5))

# SMALL MULTIPLE THEME
theme.chart_SMALLM <- theme.chart +
  theme(panel.grid.minor = element_blank()) +
  theme(strip.text.x = element_text(size=16, family="Trebuchet MS", face="bold", color="#666666"))  

# plot small hospitals discharges to readmissions
ggplot(data=small, aes(x=Number.of.Discharges, y=Number.of.Readmissions)) +
  geom_point(alpha=.4, size=4, color="#9ecae1") +
  ggtitle("Discharges vs Readmissions (small hospitals)") +
  labs(x="Discharges", y="Readmissions") +
  theme.chart_SCATTER
## Warning: Removed 81 rows containing missing values (geom_point).

# plot med hospitals discharges to readmissions
ggplot(data=med, aes(x=Number.of.Discharges, y=Number.of.Readmissions)) +
  geom_point(alpha=.4, size=4, color="#9ecae1") +
  ggtitle("Discharges vs Readmissions (med hospitals)") +
  labs(x="Discharges", y="Readmissions") +
  theme.chart_SCATTER

# plot large hospitals discharges to readmissions
ggplot(data=large, aes(x=Number.of.Discharges, y=Number.of.Readmissions)) +
  geom_point(alpha=.4, size=4, color="#9ecae1") +
  ggtitle("Discharges vs Readmissions (large hospitals)") +
  labs(x="Discharges", y="Readmissions") +
  theme.chart_SCATTER

# plot xlarge hospitals discharges to readmissions
ggplot(data=xlarge, aes(x=Number.of.Discharges, y=Number.of.Readmissions)) +
  geom_point(alpha=.4, size=4, color="#9ecae1") +
  ggtitle("Discharges vs Readmissions (xlarge hospitals)") +
  labs(x="Discharges", y="Readmissions") +
  theme.chart_SCATTER

# look at expected vs predicated readmission across all hospitals
ggplot(data=hospital, aes(x=Expected.Readmission.Rate, y=Predicted.Readmission.Rate)) +
  geom_point(alpha=.4, size=4, color="#9ecae1") +
  ggtitle("Expected Readmissions vs Predicted Readmissions") +
  labs(x="Expected Readmissions", y="Predicted Readmissions") +
  theme.chart_SCATTER
## Warning: Removed 81 rows containing missing values (geom_point).

# distribution of readmissions
ggplot(data=hospital, aes(x=Number.of.Discharges)) +
  geom_histogram(fill="#9ecae1") +  
  ggtitle("Histogram of Readmissions") +
  labs(x="Readmissions", y="Count\nof Records") +
  theme.chart_HIST
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# distribution of discharges
ggplot(data=hospital, aes(x=Number.of.Discharges)) +
  geom_histogram(fill="#9ecae1") +  
  ggtitle("Histogram of Discharges") +
  labs(x="Discharges", y="Count\nof Records") +
  theme.chart_HIST
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# distribution of smaller discharges (0..75)
hospital %>%
  filter(Number.of.Discharges >0 & Number.of.Discharges <75) %>%
  ggplot(aes(x= as.factor(Number.of.Discharges))) +
  geom_bar(fill="#9ecae1") +
  labs(x="Discharges") +
  theme.chart

# readmissions by state
ggplot(data=hospital, aes(x=Number.of.Readmissions)) +
  geom_histogram(fill="#9ecae1") +
  ggtitle("Histogram of Readmissions\nby State") +
  labs(x="Readmissions", y="Count\nof Records") +
  facet_wrap(~State) +
  theme.chart_SMALLM
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# readmissions by procedures
ggplot(data=hospital, aes(x=Number.of.Readmissions)) +
  geom_histogram(fill="#9ecae1") +
  ggtitle("Histogram of Readmissions\nby Procedure") +
  labs(x="Readmissions", y="Count\nof Records") +
  facet_wrap(~Measure.Name) +
  theme.chart_SMALLM
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# TO TEST ASSERTION: Rate of readmissions in trending down with increasing number of discharges#
#
# TRY: Looking for inverse (negative correlation) between #discharges and #readmissions
# 
# RESULTS: 
#   1) with #readmissions as a prectior of #discharges, significance with so-so R-squared with positive correlation
#   2) with #discharges as a predictor, significance with so-so R-squared with smaller positive correction
#



summary(lm(hospital$Number.of.Discharges~hospital$Number.of.Readmissions))
## 
## Call:
## lm(formula = hospital$Number.of.Discharges ~ hospital$Number.of.Readmissions)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -499.4  -86.6  -48.2   17.3 5630.0 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     104.36082    2.57723   40.49   <2e-16 ***
## hospital$Number.of.Readmissions   4.10325    0.02957  138.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 188.8 on 11495 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.6261, Adjusted R-squared:  0.6261 
## F-statistic: 1.925e+04 on 1 and 11495 DF,  p-value: < 2.2e-16
summary(lm(hospital$Number.of.Readmissions~hospital$Number.of.Discharges))
## 
## Call:
## lm(formula = hospital$Number.of.Readmissions ~ hospital$Number.of.Discharges)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -786.41   -8.27   -0.27   12.33  326.39 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     7.8676     0.5262   14.95   <2e-16 ***
## hospital$Number.of.Discharges   0.1526     0.0011  138.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.41 on 11495 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.6261, Adjusted R-squared:  0.6261 
## F-statistic: 1.925e+04 on 1 and 11495 DF,  p-value: < 2.2e-16
# In hospitals/facilities with number of discharges < 100, mean excess readmission rate is 1.023 and 63% have excess readmission rate greater than 1
# In hospitals/facilities with number of discharges > 1000, mean excess readmission rate is 0.978 and 44% have excess readmission rate greater than 1

# fit1 = lm(hospital$Number.of.Discharges~hospital$Number.of.Readmissions)
# summary(fit1)
# res.fit1 = resid(fit1)
# plot(hospital$Number.of.Readmissions, res.fit1, 
#      ylab="Residuals", xlab="Number of Readmissions", main="Number of Discharges") 
# abline(0, 0)                  

# TO TEST ASSERTION: In hospitals/facilities with number of discharges < 100, mean excess readmission rate is 1.023 and 63% have excess readmission rate greater than 1
# TO TEST ASSERTION: In hospitals/facilities with number of discharges > 1000, mean excess readmission rate is 0.978 and 44% have excess readmission rate greater than 1
small <- subset(hospital, hospital$Number.of.Discharges < 100)
plot(small$Number.of.Discharges, small$Number.of.Readmissions)

mean(small$Excess.Readmission.Ratio, na.rm = TRUE)
## [1] 1.022618
# 1.022618
nrow(subset(small,small$Excess.Readmission.Ratio > 1))/nrow(small)
## [1] 0.5918046
# 0.5918046 (less than 63% asserted)

large <- subset(hospital, hospital$Number.of.Discharges > 1000)
plot(large$Number.of.Discharges, large$Number.of.Readmissions)

# add small/large flag variable, plot both with color

mean(large$Excess.Readmission.Ratio, na.rm = TRUE)
## [1] 0.9783354
# 0.9783354
nrow(subset(large,large$Excess.Readmission.Ratio > 1))/nrow(large)
## [1] 0.4449244
# 0.4449244

# ASSERTION:  There is a significant correlation between hospital capacity (number of discharges) and readmission rates.
# VALIDATION: Found ~.62 adjusted R^2
summary(lm(hospital$Number.of.Discharges~hospital$Number.of.Readmissions))
## 
## Call:
## lm(formula = hospital$Number.of.Discharges ~ hospital$Number.of.Readmissions)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -499.4  -86.6  -48.2   17.3 5630.0 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     104.36082    2.57723   40.49   <2e-16 ***
## hospital$Number.of.Readmissions   4.10325    0.02957  138.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 188.8 on 11495 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.6261, Adjusted R-squared:  0.6261 
## F-statistic: 1.925e+04 on 1 and 11495 DF,  p-value: < 2.2e-16
aov.1 = aov(Number.of.Discharges~Number.of.Readmissions, data=data.table(hospital))
summary(aov.1)
##                           Df    Sum Sq   Mean Sq F value Pr(>F)    
## Number.of.Readmissions     1 686160392 686160392   19250 <2e-16 ***
## Residuals              11495 409746355     35646                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 81 observations deleted due to missingness
print(model.tables(aov.1,"means"), digits=3)
## Warning in replications(paste("~", xx), data = mf): non-factors ignored:
## Number.of.Readmissions
## Tables of means
## Grand mean
##          
## 365.4662 
## 
##  Number.of.Readmissions 
## Number.of.Readmissions
##   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25 
##  149  154  158  162  166  170  174  178  182  186  191  195  199  203  207 
##   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40 
##  211  215  219  223  227  232  236  240  244  248  252  256  260  264  268 
##   41   42   43   44   45   46   47   48   49   50   51   52   53   54   55 
##  273  277  281  285  289  293  297  301  305  310  314  318  322  326  330 
##   56   57   58   59   60   61   62   63   64   65   66   67   68   69   70 
##  334  338  342  346  351  355  359  363  367  371  375  379  383  387  392 
##   71   72   73   74   75   76   77   78   79   80   81   82   83   84   85 
##  396  400  404  408  412  416  420  424  429  433  437  441  445  449  453 
##   86   87   88   89   90   91   92   93   94   95   96   97   98   99  100 
##  457  461  465  470  474  478  482  486  490  494  498  502  506  511  515 
##  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115 
##  519  523  527  531  535  539  543  548  552  556  560  564  568  572  576 
##  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130 
##  580  584  589  593  597  601  605  609  613  617  621  625  630  634  638 
##  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145 
##  642  646  650  654  658  662  667  671  675  679  683  687  691  695  699 
##  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160 
##  703  708  712  716  720  724  728  732  736  740  744  749  753  757  761 
##  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175 
##  765  769  773  777  781  786  790  794  798  802  806  810  814  818  822 
##  176  177  178  179  180  181  182  183  184  185  186  187  188  189  190 
##  827  831  835  839  843  847  851  855  859  863  868  872  876  880  884 
##  191  192  193  194  195  196  197  198  199  200  201  202  203  204  205 
##  888  892  896  900  904  909  913  917  921  925  929  933  937  941  946 
##  206  207  208  209  210  211  212  213  214  215  216  217  218  219  220 
##  950  954  958  962  966  970  974  978  982  987  991  995  999 1003 1007 
##  221  223  224  225  226  227  228  229  230  231  232  233  234  235  236 
## 1011 1019 1023 1028 1032 1036 1040 1044 1048 1052 1056 1060 1065 1069 1073 
##  237  238  239  240  241  242  243  244  245  246  247  248  249  250  251 
## 1077 1081 1085 1089 1093 1097 1101 1106 1110 1114 1118 1122 1126 1130 1134 
##  252  253  255  256  257  258  259  260  261  262  263  264  265  267  268 
## 1138 1142 1151 1155 1159 1163 1167 1171 1175 1179 1184 1188 1192 1200 1204 
##  269  270  271  272  273  275  276  277  278  279  280  281  283  284  285 
## 1208 1212 1216 1220 1225 1233 1237 1241 1245 1249 1253 1257 1266 1270 1274 
##  288  291  292  295  296  298  299  300  301  302  304  305  307  308  309 
## 1286 1298 1303 1315 1319 1327 1331 1335 1339 1344 1352 1356 1364 1368 1372 
##  310  311  314  315  316  319  323  325  326  327  329  331  333  334  336 
## 1376 1380 1393 1397 1401 1413 1430 1438 1442 1446 1454 1463 1471 1475 1483 
##  337  341  342  344  345  346  349  351  352  353  354  357  360  361  368 
## 1487 1504 1508 1516 1520 1524 1536 1545 1549 1553 1557 1569 1582 1586 1614 
##  371  376  377  378  379  381  384  385  387  388  389  390  393  394  396 
## 1627 1647 1651 1655 1659 1668 1680 1684 1692 1696 1701 1705 1717 1721 1729 
##  399  414  417  426  431  434  436  442  445  446  456  459  486  516  533 
## 1742 1803 1815 1852 1873 1885 1893 1918 1930 1934 1975 1988 2099 2222 2291 
##  546  564  574  588  596  603  619  623  879 
## 2345 2419 2460 2517 2550 2579 2644 2661 3711
boxplot(Number.of.Discharges~Number.of.Readmissions, data=hospital)

# TODO why does this look so strange?

# Smaller hospitals/facilities may be lacking necessary resources to ensure quality care and prevent complications that lead to readmissions
# TODO find out what is separating lines
# TODO model dummy vars and interaction terms


# Setup an appropriate hypothesis test.
# H0 : there is no difference in excess readmission ratio between small and large hospitals
# HA : there is a significant difference in excess readmission ratio between small and large hospitals

# Compute and report the observed significance value (or p-value).
# Report statistical significance for α = .01.
# inference(small$Excess.Readmission.Ratio, large$Excess.Readmission.Ratio, type = "ht",
#           est = "mean", method = "theoretical", alternative = "greater", boot_method = "perc",
#           conflevel = 0.99, null = 0)

t.test(small$Excess.Readmission.Ratio, large$Excess.Readmission.Ratio, alternative = "two.sided", mu = 0)
## 
##  Welch Two Sample t-test
## 
## data:  small$Excess.Readmission.Ratio and large$Excess.Readmission.Ratio
## t = 7.6017, df = 548.11, p-value = 1.275e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03284015 0.05572570
## sample estimates:
## mean of x mean of y 
## 1.0226184 0.9783354
# Discuss statistical significance and practical significance
# At large enough sample sizes, significance will be found with even small differences
# Is there practical meaning to the significant difference?

# Look at differences by $State
# TODO Look at F-stat and p-value to evaluate model fit  
summary(lm(hospital$Number.of.Discharges~hospital$State)) #low R
## 
## Call:
## lm(formula = hospital$Number.of.Discharges ~ hospital$State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -634.4 -189.5  -73.9  105.9 6418.0 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       191.909     64.242   2.987 0.002821 ** 
## hospital$StateAL  134.054     66.781   2.007 0.044732 *  
## hospital$StateAR  144.114     68.273   2.111 0.034805 *  
## hospital$StateAZ  124.550     67.539   1.844 0.065190 .  
## hospital$StateCA   98.258     64.986   1.512 0.130566    
## hospital$StateCO   78.868     69.466   1.135 0.256253    
## hospital$StateCT  266.305     69.625   3.825 0.000132 ***
## hospital$StateDC  154.815     85.194   1.817 0.069210 .  
## hospital$StateDE  519.461     86.544   6.002 2.00e-09 ***
## hospital$StateFL  263.123     65.235   4.033 5.53e-05 ***
## hospital$StateGA  128.256     66.187   1.938 0.052674 .  
## hospital$StateHI   -1.558     81.123  -0.019 0.984680    
## hospital$StateIA  176.682     69.208   2.553 0.010696 *  
## hospital$StateID  109.291     79.981   1.366 0.171820    
## hospital$StateIL  215.353     65.533   3.286 0.001019 ** 
## hospital$StateIN  195.190     66.455   2.937 0.003319 ** 
## hospital$StateKS   88.078     68.792   1.280 0.200450    
## hospital$StateKY  216.099     67.042   3.223 0.001271 ** 
## hospital$StateLA   77.230     66.998   1.153 0.249050    
## hospital$StateMA  272.648     67.076   4.065 4.84e-05 ***
## hospital$StateMD  253.892     67.752   3.747 0.000180 ***
## hospital$StateME  125.277     73.649   1.701 0.088970 .  
## hospital$StateMI  286.505     66.150   4.331 1.50e-05 ***
## hospital$StateMN   53.042     69.007   0.769 0.442120    
## hospital$StateMO  194.697     66.745   2.917 0.003540 ** 
## hospital$StateMS  104.851     67.823   1.546 0.122143    
## hospital$StateMT   90.246     78.388   1.151 0.249643    
## hospital$StateNC  277.913     66.343   4.189 2.82e-05 ***
## hospital$StateND  225.884     85.194   2.651 0.008026 ** 
## hospital$StateNE  166.597     72.443   2.300 0.021483 *  
## hospital$StateNH  187.315     75.449   2.483 0.013054 *  
## hospital$StateNJ  344.956     66.701   5.172 2.36e-07 ***
## hospital$StateNM   13.155     72.740   0.181 0.856489    
## hospital$StateNV   88.573     72.078   1.229 0.219152    
## hospital$StateNY  183.061     65.425   2.798 0.005150 ** 
## hospital$StateOH  171.467     65.677   2.611 0.009046 ** 
## hospital$StateOK   91.741     67.337   1.362 0.173090    
## hospital$StateOR   61.628     70.482   0.874 0.381933    
## hospital$StatePA  154.856     65.519   2.364 0.018119 *  
## hospital$StateRI  154.741     79.981   1.935 0.053049 .  
## hospital$StateSC  173.809     67.650   2.569 0.010205 *  
## hospital$StateSD  147.463     78.985   1.867 0.061929 .  
## hospital$StateTN  184.007     66.414   2.771 0.005604 ** 
## hospital$StateTX  131.407     65.030   2.021 0.043331 *  
## hospital$StateUT   79.525     76.421   1.041 0.298076    
## hospital$StateVA  212.991     66.549   3.201 0.001375 ** 
## hospital$StateVT  188.662     91.927   2.052 0.040163 *  
## hospital$StateWA  161.829     67.916   2.383 0.017199 *  
## hospital$StateWI   82.112     67.122   1.223 0.221234    
## hospital$StateWV  177.155     70.428   2.515 0.011902 *  
## hospital$StateWY    9.424     88.939   0.106 0.915614    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 301.3 on 11527 degrees of freedom
## Multiple R-squared:  0.05426,    Adjusted R-squared:  0.05016 
## F-statistic: 13.23 on 50 and 11527 DF,  p-value: < 2.2e-16
summary(lm(hospital$Number.of.Readmissions~hospital$State)) #low R
## 
## Call:
## lm(formula = hospital$Number.of.Readmissions ~ hospital$State)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.93 -35.15 -14.57  18.32 797.37 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        26.409     12.217   2.162 0.030664 *  
## hospital$StateAL   28.846     12.703   2.271 0.023181 *  
## hospital$StateAR   29.950     12.983   2.307 0.021085 *  
## hospital$StateAZ   20.848     12.853   1.622 0.104814    
## hospital$StateCA   24.274     12.360   1.964 0.049570 *  
## hospital$StateCO    6.168     13.210   0.467 0.640583    
## hospital$StateCT   56.115     13.241   4.238 2.27e-05 ***
## hospital$StateDC   40.246     16.201   2.484 0.013001 *  
## hospital$StateDE   87.517     16.458   5.318 1.07e-07 ***
## hospital$StateFL   55.223     12.406   4.451 8.62e-06 ***
## hospital$StateGA   28.409     12.588   2.257 0.024035 *  
## hospital$StateHI    2.915     15.427   0.189 0.850122    
## hospital$StateIA   24.297     13.168   1.845 0.065044 .  
## hospital$StateID    6.091     15.210   0.400 0.688829    
## hospital$StateIL   48.163     12.465   3.864 0.000112 ***
## hospital$StateIN   39.771     12.642   3.146 0.001659 ** 
## hospital$StateKS   12.356     13.088   0.944 0.345147    
## hospital$StateKY   55.195     12.754   4.328 1.52e-05 ***
## hospital$StateLA   25.225     12.758   1.977 0.048047 *  
## hospital$StateMA   57.681     12.756   4.522 6.19e-06 ***
## hospital$StateMD   58.709     12.888   4.555 5.28e-06 ***
## hospital$StateME   22.905     14.006   1.635 0.101991    
## hospital$StateMI   63.731     12.581   5.066 4.13e-07 ***
## hospital$StateMN    9.851     13.129   0.750 0.453063    
## hospital$StateMO   40.382     12.693   3.181 0.001469 ** 
## hospital$StateMS   28.779     12.901   2.231 0.025719 *  
## hospital$StateMT    4.136     14.963   0.276 0.782210    
## hospital$StateNC   51.606     12.618   4.090 4.34e-05 ***
## hospital$StateND   22.970     16.201   1.418 0.156275    
## hospital$StateNE   19.085     13.776   1.385 0.165984    
## hospital$StateNH   29.367     14.348   2.047 0.040706 *  
## hospital$StateNJ   81.694     12.685   6.440 1.24e-10 ***
## hospital$StateNM    4.354     13.873   0.314 0.753638    
## hospital$StateNV   26.249     13.759   1.908 0.056434 .  
## hospital$StateNY   49.621     12.443   3.988 6.70e-05 ***
## hospital$StateOH   41.735     12.490   3.341 0.000836 ***
## hospital$StateOK   21.962     12.819   1.713 0.086679 .  
## hospital$StateOR    6.100     13.404   0.455 0.649036    
## hospital$StatePA   36.066     12.461   2.894 0.003807 ** 
## hospital$StateRI   37.841     15.210   2.488 0.012864 *  
## hospital$StateSC   33.740     12.868   2.622 0.008754 ** 
## hospital$StateSD   12.518     15.144   0.827 0.408492    
## hospital$StateTN   41.673     12.632   3.299 0.000974 ***
## hospital$StateTX   28.809     12.369   2.329 0.019867 *  
## hospital$StateUT    1.534     14.533   0.106 0.915922    
## hospital$StateVA   43.919     12.658   3.470 0.000523 ***
## hospital$StateVT   31.686     17.482   1.813 0.069932 .  
## hospital$StateWA   24.409     12.916   1.890 0.058798 .  
## hospital$StateWI   13.349     12.765   1.046 0.295675    
## hospital$StateWV   49.435     13.393   3.691 0.000224 ***
## hospital$StateWY    6.156     17.089   0.360 0.718668    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.3 on 11446 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.07778,    Adjusted R-squared:  0.07376 
## F-statistic: 19.31 on 50 and 11446 DF,  p-value: < 2.2e-16
# Look at differences procedure called $Measure.Name
summary(lm(hospital$Number.of.Discharges~hospital$Measure.Name)) #low R
## 
## Call:
## lm(formula = hospital$Number.of.Discharges ~ hospital$Measure.Name)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -533.5 -192.5  -76.6  106.2 6259.5 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                  262.824      7.049  37.283
## hospital$Measure.NameREADM-30-COPD-HRRP       65.761      9.082   7.240
## hospital$Measure.NameREADM-30-HF-HRRP        137.192      9.017  15.215
## hospital$Measure.NameREADM-30-HIP-KNEE-HRRP  270.631     10.685  25.329
## hospital$Measure.NameREADM-30-PN-HRRP         75.735      9.043   8.375
##                                             Pr(>|t|)    
## (Intercept)                                  < 2e-16 ***
## hospital$Measure.NameREADM-30-COPD-HRRP     4.76e-13 ***
## hospital$Measure.NameREADM-30-HF-HRRP        < 2e-16 ***
## hospital$Measure.NameREADM-30-HIP-KNEE-HRRP  < 2e-16 ***
## hospital$Measure.NameREADM-30-PN-HRRP        < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 299.7 on 11573 degrees of freedom
## Multiple R-squared:  0.06092,    Adjusted R-squared:  0.06059 
## F-statistic: 187.7 on 4 and 11573 DF,  p-value: < 2.2e-16
summary(lm(hospital$Number.of.Readmissions~hospital$Measure.Name)) #low R
## 
## Call:
## lm(formula = hospital$Number.of.Readmissions ~ hospital$Measure.Name)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79.75 -33.69 -11.83  17.25 788.25 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                   46.688      1.333  35.026
## hospital$Measure.NameREADM-30-COPD-HRRP       22.074      1.713  12.886
## hospital$Measure.NameREADM-30-HF-HRRP         44.063      1.700  25.914
## hospital$Measure.NameREADM-30-HIP-KNEE-HRRP  -18.855      2.020  -9.335
## hospital$Measure.NameREADM-30-PN-HRRP         12.783      1.706   7.495
##                                             Pr(>|t|)    
## (Intercept)                                  < 2e-16 ***
## hospital$Measure.NameREADM-30-COPD-HRRP      < 2e-16 ***
## hospital$Measure.NameREADM-30-HF-HRRP        < 2e-16 ***
## hospital$Measure.NameREADM-30-HIP-KNEE-HRRP  < 2e-16 ***
## hospital$Measure.NameREADM-30-PN-HRRP       7.11e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 56.19 on 11492 degrees of freedom
##   (81 observations deleted due to missingness)
## Multiple R-squared:  0.1097, Adjusted R-squared:  0.1094 
## F-statistic:   354 on 4 and 11492 DF,  p-value: < 2.2e-16
# Look at AOV 
aov <- aov(Excess.Readmission.Ratio~Number.of.Discharges*Number.of.Readmissions*State*Measure.Name, data=hospital)
aov
## Call:
##    aov(formula = Excess.Readmission.Ratio ~ Number.of.Discharges * 
##     Number.of.Readmissions * State * Measure.Name, data = hospital)
## 
## Terms:
##                 Number.of.Discharges Number.of.Readmissions     State
## Sum of Squares              0.922326               9.854469  6.867373
## Deg. of Freedom                    1                      1        50
##                 Measure.Name Number.of.Discharges:Number.of.Readmissions
## Sum of Squares     16.424253                                    0.317690
## Deg. of Freedom            4                                           1
##                 Number.of.Discharges:State Number.of.Readmissions:State
## Sum of Squares                    2.526706                     2.864135
## Deg. of Freedom                         50                           50
##                 Number.of.Discharges:Measure.Name
## Sum of Squares                          25.277323
## Deg. of Freedom                                 4
##                 Number.of.Readmissions:Measure.Name State:Measure.Name
## Sum of Squares                             9.002350           1.768707
## Deg. of Freedom                                   4                200
##                 Number.of.Discharges:Number.of.Readmissions:State
## Sum of Squares                                           0.364245
## Deg. of Freedom                                                50
##                 Number.of.Discharges:Number.of.Readmissions:Measure.Name
## Sum of Squares                                                  0.280962
## Deg. of Freedom                                                        4
##                 Number.of.Discharges:State:Measure.Name
## Sum of Squares                                 2.054828
## Deg. of Freedom                                     200
##                 Number.of.Readmissions:State:Measure.Name
## Sum of Squares                                   1.188367
## Deg. of Freedom                                       195
##                 Number.of.Discharges:Number.of.Readmissions:State:Measure.Name
## Sum of Squares                                                        1.179756
## Deg. of Freedom                                                            191
##                 Residuals
## Sum of Squares  16.333082
## Deg. of Freedom     10491
## 
## Residual standard error: 0.03945714
## 14 out of 1020 effects not estimable
## Estimated effects may be unbalanced
## 81 observations deleted due to missingness
# TODO: Can I use Tukey HSD to tell which groups differ?
# tuk <- TukeyHSD(aov)
# tuk
# Warning messages:
#   1: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges
# 2: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Readmissions
# 3: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, Number.of.Readmissions
# 4: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, State
# 5: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Readmissions, State
# 6: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, Measure.Name
# 7: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Readmissions, Measure.Name
# 8: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, Number.of.Readmissions, State
# 9: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, Number.of.Readmissions, Measure.Name
# 10: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, State, Measure.Name
# 11: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Readmissions, State, Measure.Name
# 12: In replications(paste("~", xx), data = mf) :
#   non-factors ignored: Number.of.Discharges, Number.of.Readmissions, State, Measure.Name