#creating age variable with 0 to 99 random samples of size 1000
Age<-sample(0:99,size = 1000,replace=TRUE)
#creating BMI variable with 10 to 40 random samples of size 1000
BMI<-sample(10:40, size=1000,replace = TRUE)
#creating sex binary variable of size 1000
sex<-sample(x=c(1,0),size = 1000, replace = TRUE)
#creating a data frame of four variables
df<-data.frame(Serialno=c(1:1000), BMI, Age, sex)
head(df)
## Serialno BMI Age sex
## 1 1 31 96 1
## 2 2 12 18 1
## 3 3 11 54 1
## 4 4 15 56 0
## 5 5 12 92 1
## 6 6 16 15 1
#Setting random seed
set.seed(36)
#Splitting data into 2 parts
data<- sample(2, nrow(df), replace = TRUE,prob=c(0.8,0.2))
train<-df[data==1,]
test<-df[data==2,]
#fitting a linear regression model
lr<-lm(BMI~Age+sex,data=train)
summary(lr)
##
## Call:
## lm(formula = BMI ~ Age + sex, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9517 -7.7424 0.1493 7.4708 15.7234
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.287757 0.701760 36.035 <2e-16 ***
## Age 0.007217 0.011088 0.651 0.515
## sex -1.011188 0.625780 -1.616 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.886 on 806 degrees of freedom
## Multiple R-squared: 0.003897, Adjusted R-squared: 0.001425
## F-statistic: 1.576 on 2 and 806 DF, p-value: 0.2073
#residual analysis of the fitted model
plot(lr, which=1, col="blue")
summary(lr$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -15.9517 -7.7424 0.1493 0.0000 7.4708 15.7234
acf(lr$residuals)
#calculation
library(car)
## Loading required package: carData
durbinWatsonTest(lr)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.0439445 2.086932 0.242
## Alternative hypothesis: rho != 0
Here, the Dp-value obtained i.e. 0.396 > 0.05, so we reject null hypothesis of this test. Therefore there is autocorrelation between residuals
# Q-Q plot of linear model
plot(lr, which = 2, col = "pink")
# test
shapiro.test(lr$residuals)
##
## Shapiro-Wilk normality test
##
## data: lr$residuals
## W = 0.95522, p-value = 5.493e-15
Here, from the obtained Q-Q plot we can see the residuals are not normally distributed. Also, from the Shapiro-Wilk test the obtained p-value i.e. 6.433e-15 is more than the predefined significance level i.e. 0.05 which confirms that the residuals follows the normal distribution. So the fitted model failed to pass the normality test.
# scatter plot of standardized residuals
plot(lr, which = 3, col = "blue")
# bptest
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(lr)
##
## studentized Breusch-Pagan test
##
## data: lr
## BP = 0.14698, df = 2, p-value = 0.9291
Since from the obtained plot, the values are distributed randomly and from the bptest, obtained p-value i.e. 0.5599 > 0.05, so we accept null hypothesis. Therefore the residual variances are equal (homoscedasticity). As the fitted linear regression model passes the necessary condition, we assume this is a valid model and it can be further used to make predictions.
# library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.2
## Loading required package: lattice
# predictions using fitted model
predictions <- lr %>%
predict(test)
# calculation of R2, RMSE and MAE
results <- data.frame(R2 = R2(predictions, test$BMI),
RMSE = RMSE(predictions, test$BMI),
MAE = MAE(predictions, test$BMI)
)
# results of testing
summary(lr)
##
## Call:
## lm(formula = BMI ~ Age + sex, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.9517 -7.7424 0.1493 7.4708 15.7234
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.287757 0.701760 36.035 <2e-16 ***
## Age 0.007217 0.011088 0.651 0.515
## sex -1.011188 0.625780 -1.616 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.886 on 806 degrees of freedom
## Multiple R-squared: 0.003897, Adjusted R-squared: 0.001425
## F-statistic: 1.576 on 2 and 806 DF, p-value: 0.2073