library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
pima <- read.csv("~/Downloads/diabetes.csv")
str(pima)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
pima$bmi_squared <- pima$BMI^2
pima$age_outcome_interaction <- pima$Age * pima$Outcome
# Fit the model
model <- lm(Outcome ~ Glucose + BloodPressure + BMI + bmi_squared + Age + age_outcome_interaction, data = pima)
summary(model)
##
## Call:
## lm(formula = Outcome ~ Glucose + BloodPressure + BMI + bmi_squared +
## Age + age_outcome_interaction, data = pima)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50700 -0.08253 -0.03425 0.07941 0.38294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.478e-01 4.275e-02 3.457 0.000577 ***
## Glucose 8.580e-04 1.790e-04 4.793 1.97e-06 ***
## BloodPressure -8.738e-04 2.801e-04 -3.120 0.001876 **
## BMI 8.994e-05 2.304e-03 0.039 0.968871
## bmi_squared 6.155e-05 3.610e-05 1.705 0.088624 .
## Age -6.931e-03 4.852e-04 -14.285 < 2e-16 ***
## age_outcome_interaction 2.462e-02 3.246e-04 75.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1387 on 761 degrees of freedom
## Multiple R-squared: 0.9161, Adjusted R-squared: 0.9154
## F-statistic: 1384 on 6 and 761 DF, p-value: < 2.2e-16
# residual analysis
par(mfrow=c(2,2))
plot(model)
# Normality test for residuals
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.95154, p-value = 3.405e-15
# Breusch-Pagan test for heteroscedasticity
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 162.96, df = 6, p-value < 2.2e-16
# Reset par to default
par(mfrow=c(1,1))
The linear model used with the Pima Indians Diabetes dataset does not appear to be appropriate. The residual diagnostics indicate a non-linear relationship between the variables, with patterns in the Residuals vs. Fitted plot and a ‘fan’ shape in the Scale-Location plot, which would suggest heteroscedasticity. The normal Q-Q plot and the Shapiro-Wilk test show that the residuals are not normally distributed.