ST301 - Regression Analysis

P.I.N.Kehelbedda, Department of Statistics and Computer Science, Faculty of Science, University of Peradeniya

2020-09-28

Example 1: Outlying Observations.

library(ggplot2)
library(plotly) 
library(dplyr)
library(readr)
infectionRisk <- read_csv("ST301-infectionRisk.csv")

infectionRisk <- infectionRisk[,1:4]


pairs(infectionRisk[,-1])

lm.infecRisk <- lm(InfctRsk ~ . , data=infectionRisk[,-1])

infMeasures <- data.frame(ID=infectionRisk[1:112,1],hatValues = hatvalues(lm.infecRisk),cookDist = cooks.distance(lm.infecRisk))


gpStayIR <- ggplot(infectionRisk, aes(x= InfctRsk, y= Stay)) 
gpStayIR <- gpStayIR + geom_point() +geom_text(aes(label=ID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpStayIR

gpAgeIR <- ggplot(infectionRisk, aes(x= InfctRsk, y= Age)) 
gpAgeIR <- gpAgeIR + geom_point() +geom_text(aes(label=ID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpAgeIR

plot_ly(x=infectionRisk$Stay, y=infectionRisk$InfctRsk, z=infectionRisk$Age, type="scatter3d", mode="markers") %>%
layout(scene = list(xaxis = list(title = 'Stay'),
                    yaxis = list(title = 'Infction Risk'),
                    zaxis = list(title = 'Age')))
gpHatVal <- ggplot(infMeasures, aes(x= ID, y= hatValues , label=ID)) 
gpHatVal <- gpHatVal +  geom_point() +geom_text(aes(label=ID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpHatVal <- gpHatVal    +   geom_hline(yintercept=2*mean(infMeasures$hatValues), linetype="dashed",     color = "red")
gpHatVal

gpCookDist <-  ggplot(infMeasures, aes(x= ID, y= cookDist , label=ID)) 
gpCookDist <- gpCookDist +  geom_point() +geom_text(aes(label=ID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpCookDist

#### After removing outlying observarions 

infectionRisk_2 <- infectionRisk[c(-47,-112),1:4]

lm.infecRisk_2 <- lm(InfctRsk ~ . , data = infectionRisk_2[,-1])


summary(lm.infecRisk_2)
## 
## Call:
## lm(formula = InfctRsk ~ ., data = infectionRisk_2[, -1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.74426 -0.61422  0.07069  0.61723  2.71618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.88189    1.45512   1.293    0.199    
## Stay         0.42412    0.06669   6.360 5.09e-09 ***
## Age         -0.03007    0.02555  -1.177    0.242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.134 on 107 degrees of freedom
## Multiple R-squared:  0.2767, Adjusted R-squared:  0.2631 
## F-statistic: 20.46 on 2 and 107 DF,  p-value: 2.983e-08
lm.infecRisk_Stay <- lm(InfctRsk ~ Stay , data = infectionRisk_2[,-1])

Example 2:

We are going to select best fitted model for data on age (X) and plasma level of a polyamine (Y ).

Let’s put the data,

assig <- data.frame(x=c(0,0,1,1,2,2,3,3,4,4,5,5,6,6), 
                     y=c(17.0,11.2,9.2,12.6,7.4,10.5,8.3,5.8,4.6,6.5,5.3,3.8,3.2,4.5))

Note that our Assumptions are, 1.The Y values(e errors) are independent. 2.The Y values can be expressed as a linear f^n of the X variables. 3.Variation of obs^ns around the regression line(residual) is constant(homoscedasticity)

library(ggplot2)
gpassig <- ggplot(assig, aes(x=x, y=y)) + geom_point() + theme_bw()
gpassig <- gpassig + xlab("age") + ylab("plasma level of a polyamine")

gpassig

Now create the Linear Model with the Residuals,

lmassig <- lm(y ~ x, data = assig)

summary(lmassig)
## 
## Call:
## lm(formula = y ~ x, data = assig)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1232 -1.6464  0.3464  0.9317  4.1304 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.8696     0.9048  14.224 7.12e-09 ***
## x            -1.6732     0.2509  -6.668 2.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.878 on 12 degrees of freedom
## Multiple R-squared:  0.7875, Adjusted R-squared:  0.7698 
## F-statistic: 44.46 on 1 and 12 DF,  p-value: 2.3e-05
par(mfrow=c(2,2))
plot(lmassig)

par(mfrow=c(1,1))

Here we can see a clean pattern

But let’s consider some Transformations,

  • Transformation-1
assigT1 <- data.frame(x = assig$x, y = sqrt(assig$y))

gpassigT1 <- ggplot(assigT1, aes(x=x, y=y)) + geom_point() + theme_bw()
gpassigT1 <- gpassigT1 +  xlab("age") + ylab("(plasma level of a polyamine)^1/2")

gpassigT1

lmassigT1 <- lm(y ~ x, data = assigT1)

summary(lmassigT1)
## 
## Call:
## lm(formula = y ~ x, data = assigT1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31729 -0.28037  0.03671  0.20607  0.50781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.61530    0.13697  26.395 5.36e-12 ***
## x           -0.29656    0.03799  -7.807 4.83e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2843 on 12 degrees of freedom
## Multiple R-squared:  0.8355, Adjusted R-squared:  0.8218 
## F-statistic: 60.94 on 1 and 12 DF,  p-value: 4.826e-06
par(mfrow=c(2,2))
plot(lmassigT1)

par(mfrow=c(1,1))
  • Transformation-2
assigT2 <- data.frame(x = assig$x, y = 1/(assig$y))

gpassigT2 <- ggplot(assigT1, aes(x=x, y=y)) + geom_point() + theme_bw()
gpassigT2 <- gpassigT1 +  xlab("age") + ylab("1/(plasma level of a polyamine)")

gpassigT2

lmassigT2 <- lm(y ~ x, data = assigT2)

summary(lmassigT2)
## 
## Call:
## lm(formula = y ~ x, data = assigT2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.03789 -0.03319  0.00390  0.02368  0.05618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.060432   0.015918   3.796  0.00255 ** 
## x           0.032647   0.004415   7.395 8.33e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03304 on 12 degrees of freedom
## Multiple R-squared:   0.82,  Adjusted R-squared:  0.805 
## F-statistic: 54.68 on 1 and 12 DF,  p-value: 8.332e-06
par(mfrow=c(2,2))
plot(lmassigT2)

par(mfrow=c(1,1))
  • Transformation-3
assigT3 <- data.frame(x = assig$x, y = log10(assig$y))

gpassigT3 <- ggplot(assigT1, aes(x=x, y=y)) + geom_point() + theme_bw()
gpassigT3 <- gpassigT1 +  xlab("age") + ylab("log10(plasma level of a polyamine)")

gpassigT3

lmassigT3 <- lm(y ~ x, data = assigT3)

summary(lmassigT3)
## 
## Call:
## lm(formula = y ~ x, data = assigT3)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.089394 -0.076368  0.001502  0.070914  0.099807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.13064    0.04016  28.153 2.50e-12 ***
## x           -0.09462    0.01114  -8.495 2.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08335 on 12 degrees of freedom
## Multiple R-squared:  0.8574, Adjusted R-squared:  0.8455 
## F-statistic: 72.17 on 1 and 12 DF,  p-value: 2.023e-06
par(mfrow=c(2,2))
plot(lmassigT3)

par(mfrow=c(1,1))

Finally compare the summary(of linear models),

summary(lmassig)
## 
## Call:
## lm(formula = y ~ x, data = assig)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1232 -1.6464  0.3464  0.9317  4.1304 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.8696     0.9048  14.224 7.12e-09 ***
## x            -1.6732     0.2509  -6.668 2.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.878 on 12 degrees of freedom
## Multiple R-squared:  0.7875, Adjusted R-squared:  0.7698 
## F-statistic: 44.46 on 1 and 12 DF,  p-value: 2.3e-05
summary(lmassigT1)
## 
## Call:
## lm(formula = y ~ x, data = assigT1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31729 -0.28037  0.03671  0.20607  0.50781 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.61530    0.13697  26.395 5.36e-12 ***
## x           -0.29656    0.03799  -7.807 4.83e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2843 on 12 degrees of freedom
## Multiple R-squared:  0.8355, Adjusted R-squared:  0.8218 
## F-statistic: 60.94 on 1 and 12 DF,  p-value: 4.826e-06
summary(lmassigT2)
## 
## Call:
## lm(formula = y ~ x, data = assigT2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.03789 -0.03319  0.00390  0.02368  0.05618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.060432   0.015918   3.796  0.00255 ** 
## x           0.032647   0.004415   7.395 8.33e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03304 on 12 degrees of freedom
## Multiple R-squared:   0.82,  Adjusted R-squared:  0.805 
## F-statistic: 54.68 on 1 and 12 DF,  p-value: 8.332e-06
summary(lmassigT3)
## 
## Call:
## lm(formula = y ~ x, data = assigT3)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.089394 -0.076368  0.001502  0.070914  0.099807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.13064    0.04016  28.153 2.50e-12 ***
## x           -0.09462    0.01114  -8.495 2.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08335 on 12 degrees of freedom
## Multiple R-squared:  0.8574, Adjusted R-squared:  0.8455 
## F-statistic: 72.17 on 1 and 12 DF,  p-value: 2.023e-06

So to interpret the best result or best fitted model I’m going to consider about two statistics, 1. Comparing the R-Squred values 2. Comparing the p-value but we can also consider other statistics F-statistic, Coefficient and Resdual standard error as well.

Now let’s check the results,

read.delim(file = "table.txt", header = T)
##            X R.squred  p.value F.satatistic.1.12df RSE.with.12df
## 1      Mod 1   0.7875 2.30e-05               44.46       1.87800
## 2 Mod 2 (T1)   0.8355 4.83e-06               60.94       0.28430
## 3 Mod 3 (T2)   0.8200 8.33e-06               54.68       0.03304
## 4 Mod 4 (T3)   0.8574 2.02e-06               72.17       0.08335

From the above table we can clearly see highest R-squred value and lowest p-value has been taken by mod4 which is the transformtion 3. i.e. The best fitted model is Transformation model 3.

Another tricky simple way to do,

plot(assig$x, assig$y)
mod <- lm(y ~ x, data = assig)
summary(mod)
## 
## Call:
## lm(formula = y ~ x, data = assig)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1232 -1.6464  0.3464  0.9317  4.1304 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.8696     0.9048  14.224 7.12e-09 ***
## x            -1.6732     0.2509  -6.668 2.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.878 on 12 degrees of freedom
## Multiple R-squared:  0.7875, Adjusted R-squared:  0.7698 
## F-statistic: 44.46 on 1 and 12 DF,  p-value: 2.3e-05
abline(mod)

par(mfrow=c(2,2))
plot(mod)

par(mfrow=c(1,1))

Example 3:

# help(read.table)
dataset <- read.table(file = "BloodPressure.txt", header = TRUE, sep = "\t")
bloodpressure <- dataset[,c(2,3,4,5,6,7,8)]
attach(bloodpressure)
names(bloodpressure)
## [1] "BP"     "Age"    "Weight" "BSA"    "Dur"    "Pulse"  "Stress"
########################################
# Do Forward selection,
# then at iteration 4 do Backward elimination(drop1())
# then see for significance difference of two models(FM vs RM)
########################################

## 1.step backward,
full.model <- lm(BP ~ ., data = bloodpressure)
# help(step)
step(full.model , direction = "backward")
## Start:  AIC=-30.55
## BP ~ Age + Weight + BSA + Dur + Pulse + Stress
## 
##          Df Sum of Sq    RSS     AIC
## <none>                 2.156 -30.551
## - Dur     1     0.330  2.486 -29.698
## - Stress  1     0.442  2.598 -28.820
## - Pulse   1     0.444  2.600 -28.802
## - BSA     1     0.947  3.103 -25.267
## - Age     1    33.331 35.486  23.468
## - Weight  1    39.172 41.328  26.516
## 
## Call:
## lm(formula = BP ~ Age + Weight + BSA + Dur + Pulse + Stress, 
##     data = bloodpressure)
## 
## Coefficients:
## (Intercept)          Age       Weight          BSA          Dur        Pulse  
##  -12.870476     0.703259     0.969920     3.776491     0.068383    -0.084485  
##      Stress  
##    0.005572
## 2.step forward,
step(lm(BP ~ 1, data = bloodpressure), direction = "forward", 
     scope = ~Age+Weight+BSA+Dur+Pulse+Stress)
## Start:  AIC=68.64
## BP ~ 1
## 
##          Df Sum of Sq    RSS    AIC
## + Weight  1    505.47  54.53 24.060
## + BSA     1    419.86 140.14 42.938
## + Pulse   1    291.44 268.56 55.946
## + Age     1    243.27 316.73 59.247
## <none>                560.00 68.644
## + Dur     1     48.02 511.98 68.851
## + Stress  1     15.04 544.96 70.099
## 
## Step:  AIC=24.06
## BP ~ Weight
## 
##          Df Sum of Sq    RSS     AIC
## + Age     1    49.704  4.824 -22.443
## + Stress  1     9.660 44.868  22.160
## + Pulse   1     8.940 45.588  22.478
## + Dur     1     6.095 48.433  23.689
## <none>                54.528  24.060
## + BSA     1     2.814 51.714  25.000
## 
## Step:  AIC=-22.44
## BP ~ Weight + Age
## 
##          Df Sum of Sq    RSS     AIC
## + BSA     1   1.76778 3.0561 -29.572
## + Pulse   1   0.49557 4.3284 -22.611
## <none>                4.8239 -22.443
## + Dur     1   0.17835 4.6456 -21.196
## + Stress  1   0.16286 4.6611 -21.130
## 
## Step:  AIC=-29.57
## BP ~ Weight + Age + BSA
## 
##          Df Sum of Sq    RSS     AIC
## + Dur     1   0.33510 2.7210 -29.894
## <none>                3.0561 -29.572
## + Stress  1   0.21774 2.8384 -29.050
## + Pulse   1   0.04111 3.0150 -27.842
## 
## Step:  AIC=-29.89
## BP ~ Weight + Age + BSA + Dur
## 
##          Df Sum of Sq    RSS     AIC
## <none>                2.7210 -29.894
## + Pulse   1   0.12307 2.5980 -28.820
## + Stress  1   0.12077 2.6003 -28.802
## 
## Call:
## lm(formula = BP ~ Weight + Age + BSA + Dur, data = bloodpressure)
## 
## Coefficients:
## (Intercept)       Weight          Age          BSA          Dur  
##   -12.85206      0.89701      0.68335      4.86037      0.06653
## 3.stepwise
# 3.1 Method 1
step(lm(BP ~ ., data = bloodpressure), direction = "both")
## Start:  AIC=-30.55
## BP ~ Age + Weight + BSA + Dur + Pulse + Stress
## 
##          Df Sum of Sq    RSS     AIC
## <none>                 2.156 -30.551
## - Dur     1     0.330  2.486 -29.698
## - Stress  1     0.442  2.598 -28.820
## - Pulse   1     0.444  2.600 -28.802
## - BSA     1     0.947  3.103 -25.267
## - Age     1    33.331 35.486  23.468
## - Weight  1    39.172 41.328  26.516
## 
## Call:
## lm(formula = BP ~ Age + Weight + BSA + Dur + Pulse + Stress, 
##     data = bloodpressure)
## 
## Coefficients:
## (Intercept)          Age       Weight          BSA          Dur        Pulse  
##  -12.870476     0.703259     0.969920     3.776491     0.068383    -0.084485  
##      Stress  
##    0.005572
# 3.2 Method 2
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:plotly':
## 
##     select
stepAIC(lm(BP ~ ., data = bloodpressure), direction = "both")
## Start:  AIC=-30.55
## BP ~ Age + Weight + BSA + Dur + Pulse + Stress
## 
##          Df Sum of Sq    RSS     AIC
## <none>                 2.156 -30.551
## - Dur     1     0.330  2.486 -29.698
## - Stress  1     0.442  2.598 -28.820
## - Pulse   1     0.444  2.600 -28.802
## - BSA     1     0.947  3.103 -25.267
## - Age     1    33.331 35.486  23.468
## - Weight  1    39.172 41.328  26.516
## 
## Call:
## lm(formula = BP ~ Age + Weight + BSA + Dur + Pulse + Stress, 
##     data = bloodpressure)
## 
## Coefficients:
## (Intercept)          Age       Weight          BSA          Dur        Pulse  
##  -12.870476     0.703259     0.969920     3.776491     0.068383    -0.084485  
##      Stress  
##    0.005572
# so this has some problems,
# it allows us to not to think about the actual problem.

Example 4:

library(ggplot2)
library(plotly)

# we analasis our rediused model(RM)
# RM(BP) = b0 + b1*Weight + b2*Age +b3*BSA

## wee need data set with idNO.

dataset <- read.table(file = "BloodPressure.txt", header = TRUE, sep = "\t")
dataset
##    PtID  BP Age Weight  BSA  Dur Pulse Stress
## 1     1 105  47   85.4 1.75  5.1    63     33
## 2     2 115  49   94.2 2.10  3.8    70     14
## 3     3 116  49   95.3 1.98  8.2    72     10
## 4     4 117  50   94.7 2.01  5.8    73     99
## 5     5 112  51   89.4 1.89  7.0    72     95
## 6     6 121  48   99.5 2.25  9.3    71     10
## 7     7 121  49   99.8 2.25  2.5    69     42
## 8     8 110  47   90.9 1.90  6.2    66      8
## 9     9 110  49   89.2 1.83  7.1    69     62
## 10   10 114  48   92.7 2.07  5.6    64     35
## 11   11 114  47   94.4 2.07  5.3    74     90
## 12   12 115  49   94.1 1.98  5.6    71     21
## 13   13 114  50   91.6 2.05 10.2    68     47
## 14   14 106  45   87.1 1.92  5.6    67     80
## 15   15 125  52  101.3 2.19 10.0    76     98
## 16   16 114  46   94.5 1.98  7.4    69     95
## 17   17 106  46   87.0 1.87  3.6    62     18
## 18   18 113  46   94.5 1.90  4.3    70     12
## 19   19 110  48   90.5 1.88  9.0    71     99
## 20   20 122  56   95.7 2.09  7.0    75     99
bloodpressure <- dataset[c(1,2,3,4,5)]
attach(bloodpressure)
## The following objects are masked from bloodpressure (pos = 4):
## 
##     Age, BP, BSA, Weight
names(bloodpressure)
## [1] "PtID"   "BP"     "Age"    "Weight" "BSA"
bloodpressure
##    PtID  BP Age Weight  BSA
## 1     1 105  47   85.4 1.75
## 2     2 115  49   94.2 2.10
## 3     3 116  49   95.3 1.98
## 4     4 117  50   94.7 2.01
## 5     5 112  51   89.4 1.89
## 6     6 121  48   99.5 2.25
## 7     7 121  49   99.8 2.25
## 8     8 110  47   90.9 1.90
## 9     9 110  49   89.2 1.83
## 10   10 114  48   92.7 2.07
## 11   11 114  47   94.4 2.07
## 12   12 115  49   94.1 1.98
## 13   13 114  50   91.6 2.05
## 14   14 106  45   87.1 1.92
## 15   15 125  52  101.3 2.19
## 16   16 114  46   94.5 1.98
## 17   17 106  46   87.0 1.87
## 18   18 113  46   94.5 1.90
## 19   19 110  48   90.5 1.88
## 20   20 122  56   95.7 2.09
bloodpressure[,-1]
##     BP Age Weight  BSA
## 1  105  47   85.4 1.75
## 2  115  49   94.2 2.10
## 3  116  49   95.3 1.98
## 4  117  50   94.7 2.01
## 5  112  51   89.4 1.89
## 6  121  48   99.5 2.25
## 7  121  49   99.8 2.25
## 8  110  47   90.9 1.90
## 9  110  49   89.2 1.83
## 10 114  48   92.7 2.07
## 11 114  47   94.4 2.07
## 12 115  49   94.1 1.98
## 13 114  50   91.6 2.05
## 14 106  45   87.1 1.92
## 15 125  52  101.3 2.19
## 16 114  46   94.5 1.98
## 17 106  46   87.0 1.87
## 18 113  46   94.5 1.90
## 19 110  48   90.5 1.88
## 20 122  56   95.7 2.09
lm.RM <- lm(BP ~ ., data = bloodpressure[,-1])
hatvalues(lm.RM)
##          1          2          3          4          5          6          7 
## 0.23358742 0.11464284 0.14024962 0.08206011 0.19832771 0.28393997 0.24727290 
##          8          9         10         11         12         13         14 
## 0.09076553 0.15693197 0.13763868 0.10950828 0.07916255 0.18751614 0.30224526 
##         15         16         17         18         19         20 
## 0.28152669 0.17661546 0.19174974 0.35688026 0.09474927 0.53462960
cooks.distance(lm.RM)
##            1            2            3            4            5            6 
## 2.897873e-02 1.100156e-01 9.856423e-03 3.235619e-02 1.012834e-02 1.458255e-01 
##            7            8            9           10           11           12 
## 1.573275e-01 2.784216e-02 1.202892e-04 4.712645e-02 2.851731e-02 1.539220e-03 
##           13           14           15           16           17           18 
## 5.976127e-03 7.942169e-02 5.971442e-02 1.354558e-01 1.725834e-03 2.668880e-07 
##           19           20 
## 7.140561e-02 1.058192e-03
infbdMes <- data.frame(PtID=bloodpressure[1:20,1], hatVal=hatvalues(lm.RM), cookdist=cooks.distance(lm.RM))
infbdMes
##    PtID     hatVal     cookdist
## 1     1 0.23358742 2.897873e-02
## 2     2 0.11464284 1.100156e-01
## 3     3 0.14024962 9.856423e-03
## 4     4 0.08206011 3.235619e-02
## 5     5 0.19832771 1.012834e-02
## 6     6 0.28393997 1.458255e-01
## 7     7 0.24727290 1.573275e-01
## 8     8 0.09076553 2.784216e-02
## 9     9 0.15693197 1.202892e-04
## 10   10 0.13763868 4.712645e-02
## 11   11 0.10950828 2.851731e-02
## 12   12 0.07916255 1.539220e-03
## 13   13 0.18751614 5.976127e-03
## 14   14 0.30224526 7.942169e-02
## 15   15 0.28152669 5.971442e-02
## 16   16 0.17661546 1.354558e-01
## 17   17 0.19174974 1.725834e-03
## 18   18 0.35688026 2.668880e-07
## 19   19 0.09474927 7.140561e-02
## 20   20 0.53462960 1.058192e-03
## for age,
gpAge <- ggplot(bloodpressure, aes(x = BP, y = Age)) 
gpAge <- gpAge + geom_point() +geom_text(aes(label=PtID),hjust=-0.5, vjust=0.1) + theme_classic()
gpAge

## for weight,
gpWeight <- ggplot(bloodpressure, aes(x = BP, y = Weight)) 
gpWeight <- gpWeight + geom_point() +geom_text(aes(label=PtID),hjust=-0.5, vjust=0.1) + theme_classic()
gpWeight

## for bsa,
gpBSA <- ggplot(bloodpressure, aes(x = BP, y = BSA)) 
gpBSA <- gpBSA + geom_point() +geom_text(aes(label=PtID),hjust=-0.5, vjust=0.1) + theme_classic()
gpBSA

## now the 3D build part,
## but we cannt buid a 3D model because this has 4 variable,

gpbdHatval <- ggplot(infbdMes, aes(x= PtID, y= hatVal , label=PtID)) 
gpbdHatval <- gpbdHatval +  geom_point() +geom_text(aes(label=PtID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpbdHatval <- gpbdHatval    +   geom_hline(yintercept=2*mean(infbdMes$hatVal), linetype="dashed",     color = "red")
gpbdHatval

gpbdCookdis <-  ggplot(infbdMes, aes(x= PtID, y= cookdist , label=PtID)) 
gpbdCookdis <- gpbdCookdis +  geom_point() +geom_text(aes(label=PtID),hjust=-0.5, vjust=0.1) + theme_classic() 
gpbdCookdis

# 2,6,7,14,15,16,19

## After removing influential points/ Outlyers,
after_data <- bloodpressure[c(-2,-6,-7,-14,-15,-16,-19), 1:5]
after_data
##    PtID  BP Age Weight  BSA
## 1     1 105  47   85.4 1.75
## 3     3 116  49   95.3 1.98
## 4     4 117  50   94.7 2.01
## 5     5 112  51   89.4 1.89
## 8     8 110  47   90.9 1.90
## 9     9 110  49   89.2 1.83
## 10   10 114  48   92.7 2.07
## 11   11 114  47   94.4 2.07
## 12   12 115  49   94.1 1.98
## 13   13 114  50   91.6 2.05
## 17   17 106  46   87.0 1.87
## 18   18 113  46   94.5 1.90
## 20   20 122  56   95.7 2.09
lm.afterData <- lm(BP ~ ., data = after_data[,-1])
summary(lm.afterData)
## 
## Call:
## lm(formula = BP ~ ., data = after_data[, -1])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41242 -0.13224 -0.03031  0.16507  0.50658 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -13.18019    2.64098  -4.991 0.000748 ***
## Age           0.73160    0.03716  19.686 1.04e-08 ***
## Weight        0.87407    0.04300  20.328 7.86e-09 ***
## BSA           5.13414    1.42006   3.615 0.005612 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3052 on 9 degrees of freedom
## Multiple R-squared:  0.9966, Adjusted R-squared:  0.9954 
## F-statistic: 873.3 on 3 and 9 DF,  p-value: 2.076e-11
## so the model is adequate.//