author: Foo C.Y. date: April 2017 autosize: true transition: none
alt text
Load data hosp.csv in module 4
hosp <- read.csv("F:/Dropbox/Conference.Seminar.Talks/NIH Monash R Course/06 Module 4/hosp.csv")
We are interest to see if ALOS of a hospital is related to their SMR
Run some descriptive stat.
summary(hosp$ALOS_NHEWS)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.680 3.000 3.197 3.633 7.000
summary(hosp$SMR)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.7394 1.0590 1.2460 1.5070 6.2170 5
library(ggplot2)
ggplot(data=hosp,aes(x=ALOS_NHEWS,y=SMR))+
geom_point()
ggplot(data=hosp,aes(x=ALOS_NHEWS,y=SMR))+
geom_point()+
stat_smooth(method = "lm")
ggplot(data=hosp,aes(x=ALOS_NHEWS,y=SMR))+
geom_point()+
stat_smooth()
plot of chunk unnamed-chunk-6
# a loess smoothed fit curve with confidence region
type:prompt
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.680 3.000 3.197 3.633 7.000
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.7394 1.0590 1.2460 1.5070 6.2170 5
type: subsection
Add the use
argument
cor(x=hosp$ALOS_NHEWS,y=hosp$SMR, use = "pairwise.complete.obs")
[1] -0.178427
type: prompt
?cor
cor(x, y = NULL, use = “everything”, method = c(“pearson”, “kendall”, “spearman”))
method
a character string indicating which correlation coefficient (or covariance) is to be computed. One of “pearson” (default), “kendall”, or “spearman”: can be abbreviated.
type: section
cor(x=hosp$ALOS_NHEWS,y=hosp$SMR, use = "pairwise.complete.obs",method = "spearman")
[1] -0.1446368
Prepare the data - use the with
function
cor.data <- with(hosp, data.frame(TotInPtAdm,edTotal,Red,Totaledv,TOTAL))
cor(cor.data)
TotInPtAdm edTotal Red Totaledv TOTAL
TotInPtAdm 1.0000000 0.8812196 0.4186901 0.8782979 0.9216846
edTotal 0.8812196 1.0000000 0.3960457 0.9988560 0.8560574
Red 0.4186901 0.3960457 1.0000000 0.3955063 0.3659505
Totaledv 0.8782979 0.9988560 0.3955063 1.0000000 0.8522218
TOTAL 0.9216846 0.8560574 0.3659505 0.8522218 1.0000000
round(cor(cor.data),1)
TotInPtAdm edTotal Red Totaledv TOTAL
TotInPtAdm 1.0 0.9 0.4 0.9 0.9
edTotal 0.9 1.0 0.4 1.0 0.9
Red 0.4 0.4 1.0 0.4 0.4
Totaledv 0.9 1.0 0.4 1.0 0.9
TOTAL 0.9 0.9 0.4 0.9 1.0
plot(cor.data)
install.packages("corrplot")
library(corrplot)
Run a correlation matrix and save it as cor.mat
cor.mat <- cor(cor.data,use="complete")
Try different visualization
corrplot(cor.mat)
corrplot(cor.mat,method = "square")
corrplot(cor.mat,method = "square",type = "upper")
corrplot(cor.mat,method = "square",type = "upper",diag = TRUE)
corrplot.mixed(cor.mat)
lm(SMR~SurLoad2,data=hosp)
Call:
lm(formula = SMR ~ SurLoad2, data = hosp)
Coefficients:
(Intercept) SurLoad2
1.6360 -0.9645
lm(SMR~SurLoad2+Year+IT,data=hosp)
Call:
lm(formula = SMR ~ SurLoad2 + Year + IT, data = hosp)
Coefficients:
(Intercept) SurLoad2 Year ITYes
487.59296 -0.82699 -0.24161 -0.09654
simply adding more variables to the formula
summary(lm(SMR~SurLoad2+Year*IT,data=hosp))
Call:
lm(formula = SMR ~ SurLoad2 + Year * IT, data = hosp)
Residuals:
Min 1Q Median 3Q Max
-1.6606 -0.4805 -0.0884 0.3287 4.6349
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 523.18962 68.98171 7.584 1.62e-13 ***
SurLoad2 -0.85923 0.25640 -3.351 0.000865 ***
Year -0.25930 0.03429 -7.561 1.91e-13 ***
ITYes -329.50435 209.27298 -1.575 0.115995
Year:ITYes 0.16377 0.10404 1.574 0.116101
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.8168 on 504 degrees of freedom
(5 observations deleted due to missingness)
Multiple R-squared: 0.1277, Adjusted R-squared: 0.1208
F-statistic: 18.44 on 4 and 504 DF, p-value: 3.729e-14
model.1 <- lm(SMR~SurLoad2+Year+IT,data=hosp)
check your Environment
summary(model.1)
Call:
lm(formula = SMR ~ SurLoad2 + Year + IT, data = hosp)
Residuals:
Min 1Q Median 3Q Max
-1.8565 -0.4820 -0.0807 0.3298 4.5851
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 487.59296 65.26473 7.471 3.53e-13 ***
SurLoad2 -0.82699 0.25596 -3.231 0.00131 **
Year -0.24161 0.03245 -7.446 4.18e-13 ***
ITYes -0.09654 0.12103 -0.798 0.42545
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.818 on 505 degrees of freedom
(5 observations deleted due to missingness)
Multiple R-squared: 0.1234, Adjusted R-squared: 0.1182
F-statistic: 23.69 on 3 and 505 DF, p-value: 2.316e-14
coef(model.1)
(Intercept) SurLoad2 Year ITYes
487.59296237 -0.82698740 -0.24161192 -0.09653957
confint(model.1)
2.5 % 97.5 %
(Intercept) 359.3691393 615.8167854
SurLoad2 -1.3298581 -0.3241167
Year -0.3053621 -0.1778618
ITYes -0.3343250 0.1412459
summary(model.1)$coefficients[,4]
(Intercept) SurLoad2 Year ITYes
3.528772e-13 1.314167e-03 4.184284e-13 4.254508e-01
layout(matrix(c(1,2,3,4),2,2)) # optional layout
plot(model.1)
head(predict(model.1))
1 2 3 4 5 6
1.5803052 1.3165459 1.0722654 0.8282688 1.4400447 1.1949931
# Assessing Outliers
library(car)
outlierTest(model.1)
leveragePlots(model.1)
# Influential Observations
influencePlot(model.1)
?influencePlot
influencePlot(model.1,id.method = "identify")
# Non-constant error variance
ncvTest(model.1)
# Test for Autocorrelated Errors
durbinWatsonTest(model.1)
capture.output(summary(model.1),
file = "./lm_output.txt")
capture.output(outlierTest(model.1),
append = T,
split = TRUE,
file = "./lm_output.txt")
install.packages("stargazer")
library(stargazer)
stargazer(model.1,type="html")
stargazer(model.1,title="Table 3: Three Linear Models",type="text")
alt text
1st - Compare IT, Bed & Total by TrainingCenter (tableone) 2nd - Run a Logistic Regression - Regress “TrainingCenter” on IT, Bed & Total
3rd - Use Stargazer to present the model output
hosp2<-with(hosp,data.frame(TrainingCenter,IT,InPtWard_TotBed,TOTAL))
hosp2$TOTAL <- hosp2$TOTAL/100000
library(tableone)
CreateTableOne(var=c("IT","InPtWard_TotBed","TOTAL"),data=hosp2,strata = "TrainingCenter")
Stratified by TrainingCenter
No Yes p test
n 426 88
IT = Yes (%) 27 (6.3) 33 (37.5) <0.001
InPtWard_TotBed (mean (sd)) 145.67 (120.48) 739.89 (230.24) <0.001
TOTAL (mean (sd)) 338.40 (338.85) 2550.66 (899.58) <0.001
LR <- glm(TrainingCenter~IT+InPtWard_TotBed+TOTAL,data=hosp2,family = "binomial")
summary(LR)
Call:
glm(formula = TrainingCenter ~ IT + InPtWard_TotBed + TOTAL,
family = "binomial", data = hosp2)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.06554 -0.08080 -0.06327 -0.05417 2.51556
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.1486185 0.9159187 -7.805 5.96e-15 ***
ITYes -0.2214081 0.7056664 -0.314 0.754
InPtWard_TotBed 0.0022965 0.0026500 0.867 0.386
TOTAL 0.0041224 0.0009966 4.136 3.53e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 470.612 on 513 degrees of freedom
Residual deviance: 89.258 on 510 degrees of freedom
AIC: 97.258
Number of Fisher Scoring iterations: 8
stargazer(LR,type = "text")