Preparing the data

#setting random seed
set.seed(1)
#creating Age variable with 1000 random samples between 0 and 99
Age <- sample(0:99, size = 1000, replace = T)
#creating BMI variable with 1000 random samples between 10 and 40
BMI <- sample(10:40, size = 1000, replace = T)
#creating binary variable Sex (1 = Male and 0 = Female) with 1000 random samples
Sex <- sample(x = c("0", "1"), size = 1000, replace = T)
#data frame df containing four variables/features: Serial Number, BMI, Age and Sex
df <- data.frame(S.N = c(1:1000), BMI, Age, Sex)
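Note that Sex is stored here as a character vector; lm() will silently coerce it to a factor when fitting, which is why the model summary below reports a term named "Sex1". A minimal sketch of making that encoding explicit (optional; the fit is unchanged):

#optional sketch: encode Sex explicitly as a factor, with "0" (Female) as the reference level
df$Sex <- factor(df$Sex, levels = c("0", "1"))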

Splitting the data into “train” and “test” sets using an 80-20 partition

set.seed(1)
index <- sample(2, nrow(df), replace = T, prob = c(0.8, 0.2))
train_data <- df[index == 1, ]
test_data <- df[index == 2, ]
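Note that sampling group labels with prob = c(0.8, 0.2) gives only an approximately 80-20 split (here 802 train and 198 test rows). A sketch of an exact 80-20 split in base R, if that is preferred:

#sketch: exact 80-20 split (800/200 rows) via sampled row indices
train_index <- sample(seq_len(nrow(df)), size = 0.8 * nrow(df))
exact_train <- df[train_index, ]
exact_test <- df[-train_index, ]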

Fit a linear regression model with BMI as the dependent variable and Age and Sex as predictors, using the train data samples

linear_model <- lm(BMI~Age+Sex,data=train_data)
summary(linear_model)
## 
## Call:
## lm(formula = BMI ~ Age + Sex, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.8061  -7.2599  -0.3709   7.4977  15.6787 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.165717   0.679635  35.557   <2e-16 ***
## Age          0.006765   0.010438   0.648    0.517    
## Sex1         0.997760   0.611281   1.632    0.103    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.655 on 799 degrees of freedom
## Multiple R-squared:  0.003811,   Adjusted R-squared:  0.001318 
## F-statistic: 1.528 on 2 and 799 DF,  p-value: 0.2175
print(linear_model)
## 
## Call:
## lm(formula = BMI ~ Age + Sex, data = train_data)
## 
## Coefficients:
## (Intercept)          Age         Sex1  
##   24.165717     0.006765     0.997760
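As a quick sanity check, the printed coefficients give the fitted equation BMI = 24.165717 + 0.006765 * Age + 0.997760 * Sex1. A worked example for a 50-year-old male (Sex = 1):

#worked example using the coefficients above
24.165717 + 0.006765 * 50 + 0.997760 * 1  #about 25.50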

Conduct residual analysis of the fitted model with graphs (suggestive) and tests (confirmatory)

#Graphical (suggestive)
res <- linear_model$residuals  #extract residuals (avoids masking base::residuals)
#residuals (y-axis) vs fitted values (x-axis), with a LOESS smoother
plot(linear_model, which = 1, col = c("red"))

Here, the LOESS line lies roughly along the zero line of the y-axis, so we can say the linearity assumption holds (the residuals show no systematic pattern against the fitted values).

#Test (confirmatory)
summary(linear_model$residuals)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -15.8061  -7.2599  -0.3709   0.0000   7.4977  15.6787

Since the mean of the residuals is zero, this is consistent with linearity. (Strictly, OLS residuals have mean exactly zero by construction whenever the model includes an intercept, so the plot above is the more informative check.)

#Graphical (suggestive): autocorrelation function (ACF) plot
acf(linear_model$residuals)

Here, nearly all bars beyond lag 0 stay inside the dashed confidence bounds, so there is no evidence of autocorrelation in the residuals.

#Calculation (confirmatory)
library(car)
durbinWatsonTest(linear_model)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.03164665      2.060889   0.386
##  Alternative hypothesis: rho != 0

p-value > 0.05, so there is no autocorrelation present.
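As a cross-check, the Durbin-Watson statistic can also be computed directly from the residuals; values close to 2 indicate no lag-1 autocorrelation. A sketch, assuming linear_model from above:

#sketch: Durbin-Watson statistic by hand, DW = sum(diff(e)^2) / sum(e^2)
e <- linear_model$residuals
sum(diff(e)^2) / sum(e^2)  #matches the 2.060889 reported above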

Normality of residuals

#Graphical (suggestive): normal Q-Q plot of the residuals
plot(linear_model, which = 2, col = c("red"))

Here, points that deviate from the straight reference line suggest departures from normality.

#Calculation (confirmatory)
shapiro.test(linear_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  linear_model$residuals
## W = 0.96159, p-value = 1.16e-13

p-value < 0.05, so the residuals do not follow a normal distribution. (With about 800 observations, the Shapiro-Wilk test detects even small departures from normality.)

Equal variance of residuals

#Graphical (suggestive): scale-location plot
plot(linear_model, which = 3, col = c("red"))

Here the points are scattered randomly, with no trend in their spread across fitted values, so there is homoscedasticity.

#Calculation (confirmatory)
library(lmtest)
bptest(linear_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  linear_model
## BP = 0.17184, df = 2, p-value = 0.9177

p-value > 0.05, so the residual variances are equal (homoscedasticity).
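An alternative confirmatory check is the score test for non-constant error variance from the car package (already loaded above); a sketch:

#sketch: car's non-constant variance score test as a cross-check of bptest()
ncvTest(linear_model)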

Use the fitted model to predict the held-out test data samples

library(dplyr)
library(caret)
predictions <- linear_model %>%
  predict(test_data)
predict_test_data <- data.frame(R2 = R2(predictions, test_data$BMI),
                                RMSE = RMSE(predictions, test_data$BMI),
                                MAE = MAE(predictions, test_data$BMI))
#training-set metrics from summary(linear_model) above:
#Multiple R-squared 0.003811, residual standard error 8.655
#model accuracy on the test dataset
print.data.frame(predict_test_data)
##             R2     RMSE     MAE
## 1 0.0006508965 8.872329 7.51177
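For transparency, the same three metrics can be computed by hand; a sketch using only base R, assuming predictions and test_data from above:

#sketch: R2, RMSE and MAE without caret
cor(predictions, test_data$BMI)^2  #R2 (squared correlation, as caret computes it)
sqrt(mean((test_data$BMI - predictions)^2))  #RMSE
mean(abs(test_data$BMI - predictions))  #MAE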

R2 is 0.003811 on the train data versus 0.0006508965 on the test data, so the model explains almost none of the variance in either set. RMSE is 8.655 on the train data versus 8.872329 on the test data; a lower RMSE is better, and the two values are almost identical, so the model is at least not overfitting. The poor fit is expected here, because BMI, Age and Sex were generated independently at random, so there is no real relationship for the model to learn.
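To put these numbers in context, a baseline that predicts the training-mean BMI for every test row should give almost the same RMSE, since R2 is near zero; a sketch:

#sketch: baseline RMSE from predicting the training mean for every test row
sqrt(mean((test_data$BMI - mean(train_data$BMI))^2))  #expected to be close to the model's 8.87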