tdat<- read.csv(file.choose())
head(tdat)
## adi leptin cfpwv wmhv age sbp
## 1 9482.7 5343.7 6.7 1.985805e-04 42 115
## 2 7213.2 9319.1 5.8 1.108385e-03 45 126
## 3 14482.3 4961.0 6.4 5.816304e-04 46 111
## 4 7031.4 12789.4 6.3 3.078742e-04 38 107
## 5 6230.2 21878.0 5.7 6.447901e-05 34 103
## 6 19129.8 18686.6 NA NA 41 110
g<- lm(cfpwv ~ sbp, tdat)
summary(g)
##
## Call:
## lm(formula = cfpwv ~ sbp, data = tdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9953 -0.7390 -0.1401 0.5829 10.5530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.796121 0.234125 3.40 0.000687 ***
## sbp 0.054026 0.002012 26.85 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.172 on 1836 degrees of freedom
## (99 observations deleted due to missingness)
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2815
## F-statistic: 720.7 on 1 and 1836 DF, p-value: < 2.2e-16
# At the alpha level of 0.05, we reject the null that beta is zero (F test = 720.7 df = (1,1836), p -value= <0.05)
#there is an association between sbp and dfpwv in this dataset
library(car)
## Warning: package 'car' was built under R version 3.4.2
##NORMALITY##
par(mfrow=c(2,2))
plot(fitted(g), residuals(g), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g), ylab="Residuals")
qqline(residuals(g))
qqPlot(residuals(g))
hist(g$residuals)

plot(g)

shapiro.test(g$residuals)
##
## Shapiro-Wilk normality test
##
## data: g$residuals
## W = 0.90759, p-value < 2.2e-16
##The null is that the residuals are normal; the alternative is the residuals are not normally distributed.
#By the Shapiro-Wilk normality test, we reject the null (W = 0.90759, p-value < 2.2e-16) and claim that the residuals are not normally distributed.
#Before Trans
hist(tdat$cfpwv)

#right-skewed thus not symmetric
#Trans by inverse
inversecfpwv<- 1/(tdat$cfpwv)
hist(inversecfpwv)

#it is symmetric
g1<- lm (inversecfpwv ~ sbp, tdat)
#CHECK NORMALITY AGAIN UPON TRANS
par(mfrow=c(2,2))
plot(fitted(g1), residuals(g1), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g1), ylab="Residuals")
qqline(residuals(g1))
qqPlot(residuals(g1))
hist(g1$residuals)

plot(g1)##LEVERAGE POINT ?

#OUTLIER
#using g1 model
jack <- rstudent(g1)
jack[which.max(abs(jack))]
## 971
## -4.426165
n <- length(jack)
pprime <- 2
rstud2 <- jack[which.max(abs(jack))]**2
ti <- jack[which.max(abs(jack))]*sqrt((n-pprime-1)/(n-pprime-rstud2))
ti
## 971
## -4.448758
2*pt(ti, df =n-pprime-1)
## 971
## 9.154894e-06
2*pt(ti, df =n-pprime-1)*n ##Bonf Cor
## 971
## 0.0168267
library(faraway)
## Warning: package 'faraway' was built under R version 3.4.2
##
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
##
## logit, vif
cook<- cooks.distance(g1)
influenceIndexPlot(g1)

patients <-row.names(tdat)
halfnorm (cook, 2, labs= patients, ylab = "Cooks' D" )

##Model With influiencial point removed (i.e. removing patient 1172)
g2<- lm (inversecfpwv ~ sbp, tdat, subset = (patients != "1172"))
# this should generates the same result as if you were to determine the model by using the cook<max(cook) method
g3<- lm (inversecfpwv ~ sbp, tdat, subset = (cook <max(cook)))
#SUMMARY
summary(g2)
##
## Call:
## lm(formula = inversecfpwv ~ sbp, data = tdat, subset = (patients !=
## "1172"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.092847 -0.014405 0.000195 0.013977 0.083923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.625e-01 4.211e-03 62.34 <2e-16 ***
## sbp -1.002e-03 3.619e-05 -27.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02109 on 1835 degrees of freedom
## (99 observations deleted due to missingness)
## Multiple R-squared: 0.2944, Adjusted R-squared: 0.2941
## F-statistic: 765.8 on 1 and 1835 DF, p-value: < 2.2e-16
#You can also use plot(g2)
par(mfrow=c(2,2))
plot(fitted(g2), residuals(g2), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g2), ylab="Residuals")
qqline(residuals(g2))
qqPlot(residuals(g2))
hist(g2$residuals)
