# Simulate a data set, split it roughly 80/20 into training and test rows,
# and fit a linear model of BMI on Age and Sex using the training rows only.

# Number of simulated observations
n_obs <- 1000L
# Fix the random seed so the simulated data are reproducible
set.seed(1)
# Age: n_obs random integers between 0 and 99, drawn with replacement
Age <- sample(0:99, size = n_obs, replace = TRUE)
# BMI: n_obs random integers between 10 and 40, drawn with replacement
BMI <- sample(10:40, size = n_obs, replace = TRUE)
# Sex: binary variable stored as character codes ("1" = Male, "0" = Female)
Sex <- sample(x = c("0", "1"), size = n_obs, replace = TRUE)
# Data frame df with four variables: serial number (S.N), BMI, Age and Sex
df <- data.frame(S.N = seq_len(n_obs), BMI, Age, Sex)
# Re-seed so the train/test assignment below is reproducible on its own
set.seed(1)
# Label every row 1 (training, probability 0.8) or 2 (test, probability 0.2)
index <- sample(2, nrow(df), replace = TRUE, prob = c(0.8, 0.2))
train_data <- df[index == 1, ]
test_data <- df[index == 2, ]
# Fit BMI ~ Age + Sex on the training rows; Sex is a character column, so it
# enters the model as a categorical term (reported as coefficient "Sex1")
linear_model <- lm(BMI ~ Age + Sex, data = train_data)
summary(linear_model)
##
## Call:
## lm(formula = BMI ~ Age + Sex, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8061 -7.2599 -0.3709 7.4977 15.6787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.165717 0.679635 35.557 <2e-16 ***
## Age 0.006765 0.010438 0.648 0.517
## Sex1 0.997760 0.611281 1.632 0.103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.655 on 799 degrees of freedom
## Multiple R-squared: 0.003811, Adjusted R-squared: 0.001318
## F-statistic: 1.528 on 2 and 799 DF, p-value: 0.2175
# Print the model call and its coefficient estimates
print(linear_model)
##
## Call:
## lm(formula = BMI ~ Age + Sex, data = train_data)
##
## Coefficients:
## (Intercept) Age Sex1
## 24.165717 0.006765 0.997760
# Graphical (suggestive) check of linearity
# Keep the residuals of the fitted model in their own vector
residuals <- resid(linear_model)
# Diagnostic plot 1: residuals (y-axis) against predicted values (x-axis),
# drawn in red together with a smoothed trend line
plot(linear_model, which = 1, col = "red")
Here, the LOESS line lies close to the zero line of the y-axis, so the residuals show no systematic pattern and the linearity assumption appears reasonable.
# Numerical check: five-number summary and mean of the model residuals
summary(resid(linear_model))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -15.8061 -7.2599 -0.3709 0.0000 7.4977 15.6787
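The mean of the residuals is zero. Note that for a least-squares model with an intercept this is always the case, so it is consistent with the linearity assumption but does not confirm it by itself.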
# Graphical (suggestive) check of independence: autocorrelation function (ACF)
# plot of the model residuals
acf(linear_model$residuals)
Here, the bars in the plot alternate above and below zero along the x-axis without any sustained pattern, so there is no sign of autocorrelation.
# Calculation (confirmative): Durbin-Watson test for autocorrelation of the
# residuals, provided by the car package
# NOTE(review): the reported p-value may vary slightly between runs if car
# computes it by simulation -- confirm, and set a seed if it must be reproducible
library(car)
durbinWatsonTest(linear_model)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.03164665 2.060889 0.386
## Alternative hypothesis: rho != 0
p-value > 0.05, so there is no evidence of autocorrelation in the residuals.

* Normality of Residuals
# Graphical (suggestive) check of normality: diagnostic plot 2 of the model,
# drawn in red
plot(linear_model, which = 2, col = "red")
# Calculation (confirmative): Shapiro-Wilk normality test on the residuals
shapiro.test(linear_model$residuals)
##
## Shapiro-Wilk normality test
##
## data: linear_model$residuals
## W = 0.96159, p-value = 1.16e-13
p-value < 0.05, so the residuals do not follow a normal distribution.

* Equal variance of residuals
# Graphical (suggestive) check of equal variance: diagnostic plot 3 of the
# model, drawn in red
plot(linear_model, which = 3, col = "red")
Here, the points are scattered randomly in the plot with no clear trend, so homoscedasticity appears reasonable.
# Calculation (confirmative): studentized Breusch-Pagan test of the fitted
# model, provided by the lmtest package
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.1.2
bptest(linear_model)
##
## studentized Breusch-Pagan test
##
## data: linear_model
## BP = 0.17184, df = 2, p-value = 0.9177
p-value > 0.05, so there is no evidence against equal residual variances (homoscedasticity).
library(dplyr)
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Warning: package 'lattice' was built under R version 4.1.2
# Predict BMI for the test rows using the model fitted on the training rows
predictions <- predict(linear_model, test_data)
# One-row data frame of test-set metrics, each comparing the predictions
# (pred) against the observed BMI values (obs)
predict_test_data <- data.frame(
  R2 = R2(pred = predictions, obs = test_data$BMI),
  RMSE = RMSE(pred = predictions, obs = test_data$BMI),
  MAE = MAE(pred = predictions, obs = test_data$BMI)
)
# Display the summary of the model fitted on train_data (training-side figures)
summary(linear_model)
##
## Call:
## lm(formula = BMI ~ Age + Sex, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8061 -7.2599 -0.3709 7.4977 15.6787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.165717 0.679635 35.557 <2e-16 ***
## Age 0.006765 0.010438 0.648 0.517
## Sex1 0.997760 0.611281 1.632 0.103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.655 on 799 degrees of freedom
## Multiple R-squared: 0.003811, Adjusted R-squared: 0.001318
## F-statistic: 1.528 on 2 and 799 DF, p-value: 0.2175
# Model accuracy on the test data set (R2, RMSE and MAE)
print(predict_test_data)
## R2 RMSE MAE
## 1 0.0006508965 8.872329 7.51177
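R2 is 0.003811 on the training data versus 0.0006508965 on the test data, so the fit is not better on the test data. The RMSE is 8.655 on the training data (the residual standard error reported by summary()) versus 8.872329 on the test data; a lower RMSE is better, and the two values are almost the same.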