tdat<- read.csv(file.choose())
head(tdat)
##       adi  leptin cfpwv         wmhv age sbp
## 1  9482.7  5343.7   6.7 1.985805e-04  42 115
## 2  7213.2  9319.1   5.8 1.108385e-03  45 126
## 3 14482.3  4961.0   6.4 5.816304e-04  46 111
## 4  7031.4 12789.4   6.3 3.078742e-04  38 107
## 5  6230.2 21878.0   5.7 6.447901e-05  34 103
## 6 19129.8 18686.6    NA           NA  41 110
g<- lm(cfpwv ~ sbp, tdat)
summary(g)
## 
## Call:
## lm(formula = cfpwv ~ sbp, data = tdat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9953 -0.7390 -0.1401  0.5829 10.5530 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.796121   0.234125    3.40 0.000687 ***
## sbp         0.054026   0.002012   26.85  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.172 on 1836 degrees of freedom
##   (99 observations deleted due to missingness)
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2815 
## F-statistic: 720.7 on 1 and 1836 DF,  p-value: < 2.2e-16
# At the alpha level of 0.05, we reject the null that beta is zero (F test = 720.7 df = (1,1836), p -value= <0.05)
#there is an association between sbp and dfpwv in this dataset
library(car)
## Warning: package 'car' was built under R version 3.4.2
##NORMALITY##
par(mfrow=c(2,2))
plot(fitted(g), residuals(g), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g), ylab="Residuals")
qqline(residuals(g))
qqPlot(residuals(g))
hist(g$residuals)

plot(g)

shapiro.test(g$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  g$residuals
## W = 0.90759, p-value < 2.2e-16
##The null is that the residuals are normal; the alternative is the residuals are not normally distributed.
#By the Shapiro-Wilk normality test, we reject the null (W = 0.90759, p-value < 2.2e-16) and claim that the residuals are not normally distributed.
#Before Trans
hist(tdat$cfpwv)

#right-skewed thus not symmetric

#Trans by inverse
inversecfpwv<- 1/(tdat$cfpwv)
hist(inversecfpwv)

#it is symmetric
g1<- lm (inversecfpwv ~ sbp, tdat)
#CHECK NORMALITY AGAIN UPON TRANS
par(mfrow=c(2,2))
plot(fitted(g1), residuals(g1), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g1), ylab="Residuals")
qqline(residuals(g1))
qqPlot(residuals(g1))
hist(g1$residuals)

plot(g1)##LEVERAGE POINT ? 

#OUTLIER 
#using g1 model
jack <- rstudent(g1)
jack[which.max(abs(jack))]
##       971 
## -4.426165
n <- length(jack)
pprime <- 2
rstud2 <- jack[which.max(abs(jack))]**2
ti <- jack[which.max(abs(jack))]*sqrt((n-pprime-1)/(n-pprime-rstud2))
ti
##       971 
## -4.448758
2*pt(ti, df =n-pprime-1)
##          971 
## 9.154894e-06
2*pt(ti, df =n-pprime-1)*n ##Bonf Cor
##       971 
## 0.0168267
library(faraway)
## Warning: package 'faraway' was built under R version 3.4.2
## 
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
## 
##     logit, vif
cook<- cooks.distance(g1)
influenceIndexPlot(g1)

patients <-row.names(tdat)
halfnorm (cook, 2, labs= patients, ylab = "Cooks' D" )

##Model With influiencial point removed (i.e. removing patient 1172)
g2<- lm (inversecfpwv ~ sbp, tdat, subset = (patients != "1172"))
# this should generates the same result as if you were to determine the model by using the cook<max(cook) method
g3<- lm (inversecfpwv ~ sbp, tdat, subset = (cook <max(cook)))
#SUMMARY
summary(g2)
## 
## Call:
## lm(formula = inversecfpwv ~ sbp, data = tdat, subset = (patients != 
##     "1172"))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.092847 -0.014405  0.000195  0.013977  0.083923 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.625e-01  4.211e-03   62.34   <2e-16 ***
## sbp         -1.002e-03  3.619e-05  -27.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02109 on 1835 degrees of freedom
##   (99 observations deleted due to missingness)
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.2941 
## F-statistic: 765.8 on 1 and 1835 DF,  p-value: < 2.2e-16
#You can also use plot(g2)
par(mfrow=c(2,2))
plot(fitted(g2), residuals(g2), xlab="Fitted", ylab="Residuals")
qqnorm(residuals (g2), ylab="Residuals")
qqline(residuals(g2))
qqPlot(residuals(g2))
hist(g2$residuals)