** DATA_605_Discussion_12_Regression_2 **
** Post Operative Patient Decisioning **
http://rpubs.com/danthonn/381978
data: https://archive.ics.uci.edu/ml/datasets.html
if (!require(RCurl)) install.packages("RCurl")
## Loading required package: RCurl
## Loading required package: bitops
if (!require(stringr)) install.packages("stringr")
## Loading required package: stringr
library(RCurl)
library(stringr)
if (!require(tidyr)) install.packages("tidyr")
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
if (!require(dplyr)) install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(dplyr)
if (!require(gvlma)) install.packages("gvlma")
## Loading required package: gvlma
library(gvlma)
** Discussion_12_Regression_Post_Op_Patient_Data **
Scope: check regression on Post Operative Patient Data
Fields: Attribute Information: 1. L-CORE (patient’s internal temperature in C): high (> 37), mid (>= 36 and <= 37), low (< 36) 2. L-SURF (patient’s surface temperature in C): high (> 36.5), mid (>= 36.5 and <= 35), low (< 35) 3. L-O2 (oxygen saturation in %): excellent (>= 98), good (>= 90 and < 98), fair (>= 80 and < 90), poor (< 80) 4. L-BP (last measurement of blood pressure): high (> 130/90), mid (<= 130/90 and >= 90/70), low (< 90/70) 5. SURF-STBL (stability of patient’s surface temperature): stable, mod-stable, unstable 6. CORE-STBL (stability of patient’s core temperature) stable, mod-stable, unstable 7. BP-STBL (stability of patient’s blood pressure) stable, mod-stable, unstable 8. COMFORT (patient’s perceived comfort at discharge, measured as an integer between 0 and 20) 9. decision ADM-DECS (discharge decision): I (patient sent to Intensive Care Unit), S (patient prepared to go home), A (patient sent to general hospital floor)
# Load and check Post Operative Patient data (post_op)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/postoperative-patient-data/post-operative.data'
post_op <- read.csv(url, header = F, stringsAsFactors = TRUE)
head(post_op)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 mid low excellent mid stable stable stable 15 A
## 2 mid high excellent high stable stable stable 10 S
## 3 high low excellent high stable stable mod-stable 10 A
## 4 mid low good high stable unstable mod-stable 15 A
## 5 mid mid excellent high stable stable stable 10 A
## 6 high low good mid stable stable unstable 15 S
# ncol(post_op)
# nrow(post_op)
str(post_op)
## 'data.frame': 90 obs. of 9 variables:
## $ V1: Factor w/ 3 levels "high","low","mid": 3 3 1 3 3 1 3 1 3 3 ...
## $ V2: Factor w/ 3 levels "high","low","mid": 2 1 2 2 3 2 2 3 1 2 ...
## $ V3: Factor w/ 2 levels "excellent","good": 1 1 1 2 1 2 1 1 2 1 ...
## $ V4: Factor w/ 3 levels "high","low","mid": 3 1 1 1 1 3 1 3 3 3 ...
## $ V5: Factor w/ 2 levels "stable","unstable": 1 1 1 1 1 1 1 2 1 2 ...
## $ V6: Factor w/ 3 levels "mod-stable","stable",..: 2 2 2 3 2 2 2 3 2 2 ...
## $ V7: Factor w/ 3 levels "mod-stable","stable",..: 2 2 1 1 2 3 1 2 2 1 ...
## $ V8: Factor w/ 5 levels "?","05","07",..: 5 4 4 5 4 5 2 4 4 4 ...
## $ V9: Factor w/ 4 levels "A","A ","I","S": 1 4 1 2 1 4 4 4 4 4 ...
#post_op
# Remove extra spaces
post_op[] <- lapply(post_op, function(x) if (is.factor(x)) factor(sub(" +$", "", x)) else x)
# label the columns of the data frame for easy readability
# head(post_op)
# names(post_op)
names(post_op)[1]<-"L_CORE"
names(post_op)[2]<-"L_SURF"
names(post_op)[3]<-"L_O2"
names(post_op)[4]<-"L_BP"
names(post_op)[5]<-"SURF_STABLE"
names(post_op)[6]<-"CORE_STABLE"
names(post_op)[7]<-"BP_STABLE"
names(post_op)[8]<-"COMFORT"
names(post_op)[9]<-"DECISION"
# Factors: and numbering from left to right
# 'data.frame': 90 obs. of 9 variables:
# $ L_CORE : Factor w/ 3 levels "high","low","mid": 3 3 1 3 3 1 3 1 3 3 ...
# $ L_SURF : Factor w/ 3 levels "high","low","mid": 2 1 2 2 3 2 2 3 1 2 ...
# $ L_O2 : Factor w/ 2 levels "excellent","good": 1 1 1 2 1 2 1 1 2 1 ...
# $ L_BP : Factor w/ 3 levels "high","low","mid": 3 1 1 1 1 3 1 3 3 3 ...
# $ SURF_STABLE: Factor w/ 2 levels "stable","unstable": 1 1 1 1 1 1 1 2 1 2 ...
# $ cORE_STABLE: Factor w/ 3 levels "mod-stable","stable",..: 2 2 2 3 2 2 2 3 2 2 ...
# $ BP_STABLE : Factor w/ 3 levels "mod-stable","stable",..: 2 2 1 1 2 3 1 2 2 1 ...
# $ COMFORT : Factor w/ 5 levels "?","05","07",..: 5 4 4 5 4 5 2 4 4 4 ...
# $ DECISION : Factor w/ 3 levels "A","I","S": 1 3 1 1 1 3 3 3 3 3 ...
# Note type of terms:
# dichotomous variable: SURF_STABLE with two values: stable or unstable
# quadratic term: create a quadratic term: BP_STABLE_SQ (BP_STABLE^2)
# one dichotomous vs. quantitative interaction term: SURF_STABLE vs L_SURF (quantitative as this is measured)
post_op$L_CORE = as.numeric(post_op$L_CORE)
post_op$L_SURF = as.numeric(post_op$L_SURF)
post_op$L_O2 = as.numeric(post_op$L_O2)
post_op$L_BP = as.numeric(post_op$L_BP)
post_op$SURF_STABLE = as.numeric(post_op$SURF_STABLE)
post_op$CORE_STABLE = as.numeric(post_op$CORE_STABLE)
post_op$BP_STABLE = as.numeric(post_op$BP_STABLE)
post_op$COMFORT = as.numeric(post_op$COMFORT)
post_op$DECISION = as.numeric(post_op$DECISION)
post_op$BP_STABLE_SQ = (post_op$BP_STABLE)^2
# post_op
# str(post_op)
# create a mult-factor model
attach(post_op)
#po_mod.lm <- lm(DECISION ~ L_CORE + L_SURV + L_O2 + L_BP + SURF_STABLE + cORE_STABLE + BP_STABLE + COMFORT)
# post_op.lm <- lm(DECISION ~ L_CORE,data=post_op,na.action=na.omit)
post_op.lm <- lm(DECISION ~ L_SURF + L_O2 + L_BP + SURF_STABLE + CORE_STABLE + BP_STABLE + COMFORT + BP_STABLE_SQ,na.action=na.omit)
post_op.lm
##
## Call:
## lm(formula = DECISION ~ L_SURF + L_O2 + L_BP + SURF_STABLE +
## CORE_STABLE + BP_STABLE + COMFORT + BP_STABLE_SQ, na.action = na.omit)
##
## Coefficients:
## (Intercept) L_SURF L_O2 L_BP SURF_STABLE
## -0.48285 0.05268 0.12403 0.10421 -0.05599
## CORE_STABLE BP_STABLE COMFORT BP_STABLE_SQ
## 0.49280 0.78636 -0.03225 -0.19703
# Call:
# lm(formula = DECISION ~ L_SURV + L_O2 + L_BP + SURF_STABLE +
# CORE_STABLE + BP_STABLE + COMFORT + BP_STABLE_SQ, na.action = na.omit)
#
# Coefficients:
# (Intercept) L_SURV L_O2 L_BP SURF_STABLE CORE_STABLE BP_STABLE COMFORT
# -0.48285 0.05268 0.12403 0.10421 -0.05599 0.49280 0.78636 -0.03225
# BP_STABLE_SQ
# -0.19703
# Plot all data
# Plot patient post op data
postop_vars <- c('DECISION', 'L_SURF', 'L_O2', 'L_BP', 'SURF_STABLE', 'CORE_STABLE', 'BP_STABLE', 'COMFORT','BP_STABLE_SQ')
Patients <- post_op[postop_vars]
str(Patients)
## 'data.frame': 90 obs. of 9 variables:
## $ DECISION : num 1 3 1 1 1 3 3 3 3 3 ...
## $ L_SURF : num 2 1 2 2 3 2 2 3 1 2 ...
## $ L_O2 : num 1 1 1 2 1 2 1 1 2 1 ...
## $ L_BP : num 3 1 1 1 1 3 1 3 3 3 ...
## $ SURF_STABLE : num 1 1 1 1 1 1 1 2 1 2 ...
## $ CORE_STABLE : num 2 2 2 3 2 2 2 3 2 2 ...
## $ BP_STABLE : num 2 2 1 1 2 3 1 2 2 1 ...
## $ COMFORT : num 5 4 4 5 4 5 2 4 4 4 ...
## $ BP_STABLE_SQ: num 4 4 1 1 4 9 1 4 4 1 ...
plot(Patients)
# note in plot that the relationships of DECISION to 'BP_STABLE', 'CORR_STABLE' appear to be better in correspondance with the plot but still not good.
summary(post_op.lm)
##
## Call:
## lm(formula = DECISION ~ L_SURF + L_O2 + L_BP + SURF_STABLE +
## CORE_STABLE + BP_STABLE + COMFORT + BP_STABLE_SQ, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9791 -0.6125 -0.4627 0.8661 1.7662
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.48285 1.14729 -0.421 0.675
## L_SURF 0.05268 0.12911 0.408 0.684
## L_O2 0.12403 0.19942 0.622 0.536
## L_BP 0.10421 0.10645 0.979 0.331
## SURF_STABLE -0.05599 0.19718 -0.284 0.777
## CORE_STABLE 0.49280 0.36000 1.369 0.175
## BP_STABLE 0.78636 0.80981 0.971 0.334
## COMFORT -0.03225 0.12669 -0.255 0.800
## BP_STABLE_SQ -0.19703 0.19720 -0.999 0.321
##
## Residual standard error: 0.9033 on 81 degrees of freedom
## Multiple R-squared: 0.05877, Adjusted R-squared: -0.0342
## F-statistic: 0.6322 on 8 and 81 DF, p-value: 0.7485
# in the sample the values of the facotrs are > .05 which is not significatn. The adjusted R value is -0.0342 which is not a good correlation.
# the sample model is not linear across the entire range, but is more linear in two ranges, one less than zero and the other greater than one.
qqnorm(resid(post_op.lm))
qqline(resid(post_op.lm))
hist(post_op.lm$residuals)
# the histogram of residuals shows two different plots, one in the negative range, the other in the positive showing different patterns in the data and not consistent.
plot(fitted(post_op.lm),resid(post_op.lm))
# the plot of the residuals shows two diffent ranges and not consistent.
# Check dichotomous variable: SURF_STABLE with two values: stable or unstable
# check single variable model - SURF_STABLE
post_op2.lm <- lm(DECISION ~ SURF_STABLE)
post_op2.lm
##
## Call:
## lm(formula = DECISION ~ SURF_STABLE)
##
## Coefficients:
## (Intercept) SURF_STABLE
## 1.556e+00 5.149e-16
summary(post_op2.lm)
##
## Call:
## lm(formula = DECISION ~ SURF_STABLE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5556 -0.5556 -0.5556 1.4444 1.4444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.556e+00 2.978e-01 5.224 1.16e-06 ***
## SURF_STABLE 5.149e-16 1.883e-01 0.000 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8933 on 88 degrees of freedom
## Multiple R-squared: 1.411e-30, Adjusted R-squared: -0.01136
## F-statistic: 1.241e-28 on 1 and 88 DF, p-value: 1
# The adjusted R value is -0.01136 which is not a good correlation.
# correlation on the plot does not show a linear relationship.
plot(SURF_STABLE,DECISION)
abline(post_op2.lm)
# the sample model is not a consistent fit.
qqnorm(resid(post_op2.lm))
qqline(resid(post_op2.lm))
hist(post_op2.lm$residuals)
# the histogram of residuals does not show a normal type pattern, therefor linear model not good.
# plot does not show linearity
plot(fitted(post_op2.lm),resid(post_op2.lm))
## Warning in plot.window(...): relative range of values = 35 * EPS, is small
## (axis 1)
gvlma(post_op2.lm)
##
## Call:
## lm(formula = DECISION ~ SURF_STABLE)
##
## Coefficients:
## (Intercept) SURF_STABLE
## 1.556e+00 5.149e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = post_op2.lm)
##
## Value p-value Decision
## Global Stat 18.8261 0.0008503 Assumptions NOT satisfied!
## Skewness 14.7603 0.0001221 Assumptions NOT satisfied!
## Kurtosis 3.6051 0.0576020 Assumptions acceptable.
## Link Function 0.0209 0.8850650 Assumptions acceptable.
## Heteroscedasticity 0.4397 0.5072630 Assumptions acceptable.
# Check one dichotomous vs. quantitative interaction term: L_SURF vs SURF_STABLE
post_op3.lm <- lm(L_SURF ~ SURF_STABLE)
post_op3.lm
##
## Call:
## lm(formula = L_SURF ~ SURF_STABLE)
##
## Coefficients:
## (Intercept) SURF_STABLE
## 2.644 -0.200
summary(post_op3.lm)
##
## Call:
## lm(formula = L_SURF ~ SURF_STABLE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4444 -0.4444 0.5556 0.7056 0.7556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6444 0.2597 10.182 <2e-16 ***
## SURF_STABLE -0.2000 0.1643 -1.218 0.227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7791 on 88 degrees of freedom
## Multiple R-squared: 0.01657, Adjusted R-squared: 0.005392
## F-statistic: 1.483 on 1 and 88 DF, p-value: 0.2266
# Adjusted R-squared is not significant at -0.01116
# not a good linear fit per plot
plot(L_CORE,SURF_STABLE)
abline(post_op3.lm)
# not a good linear fit per residuals
qqnorm(resid(post_op3.lm))
qqline(resid(post_op3.lm))
# not a good linear fit per historgram of residuals
hist(post_op3.lm$residuals)
# not a good linear fit per plot of residuals
plot(fitted(post_op3.lm),resid(post_op3.lm))
gvlma(post_op3.lm)
##
## Call:
## lm(formula = L_SURF ~ SURF_STABLE)
##
## Coefficients:
## (Intercept) SURF_STABLE
## 2.644 -0.200
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = post_op3.lm)
##
## Value p-value Decision
## Global Stat 1.142e+01 0.02224 Assumptions NOT satisfied!
## Skewness 5.736e+00 0.01663 Assumptions NOT satisfied!
## Kurtosis 4.333e+00 0.03737 Assumptions NOT satisfied!
## Link Function -4.333e-13 1.00000 Assumptions acceptable.
## Heteroscedasticity 1.350e+00 0.24526 Assumptions acceptable.
Conclusion: This post operative patient data does not show a linear relationship between patient factors and the descison of where to send the patient after the operation. We cannot predict this decision on the patient factors as demostrated with the lack of linearity and relationship per the above models, plots, residuals, summaries of model and analysis.
This data first appeared to have the potential for a linear relationship, so it was important to note that the linear relationship did not exist.
** END **