library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
pima <- read.csv("~/Downloads/diabetes.csv")

str(pima)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
pima$bmi_squared <- pima$BMI^2

pima$age_outcome_interaction <- pima$Age * pima$Outcome

# Fit the model
model <- lm(Outcome ~ Glucose + BloodPressure + BMI + bmi_squared + Age + age_outcome_interaction, data = pima)

summary(model)
## 
## Call:
## lm(formula = Outcome ~ Glucose + BloodPressure + BMI + bmi_squared + 
##     Age + age_outcome_interaction, data = pima)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50700 -0.08253 -0.03425  0.07941  0.38294 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.478e-01  4.275e-02   3.457 0.000577 ***
## Glucose                  8.580e-04  1.790e-04   4.793 1.97e-06 ***
## BloodPressure           -8.738e-04  2.801e-04  -3.120 0.001876 ** 
## BMI                      8.994e-05  2.304e-03   0.039 0.968871    
## bmi_squared              6.155e-05  3.610e-05   1.705 0.088624 .  
## Age                     -6.931e-03  4.852e-04 -14.285  < 2e-16 ***
## age_outcome_interaction  2.462e-02  3.246e-04  75.863  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1387 on 761 degrees of freedom
## Multiple R-squared:  0.9161, Adjusted R-squared:  0.9154 
## F-statistic:  1384 on 6 and 761 DF,  p-value: < 2.2e-16
# residual analysis
par(mfrow=c(2,2))
plot(model)

# Normality test for residuals
shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.95154, p-value = 3.405e-15
# Breusch-Pagan test for heteroscedasticity
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 162.96, df = 6, p-value < 2.2e-16
# Reset par to default
par(mfrow=c(1,1))

The linear model used with the Pima Indians Diabetes dataset does not appear to be appropriate. The residual diagnostics indicate a non-linear relationship between the variables, with patterns in the Residuals vs. Fitted plot and a ‘fan’ shape in the Scale-Location plot, which would suggest heteroscedasticity. The normal Q-Q plot and the Shapiro-Wilk test show that the residuals are not normally distributed.