Simple Linear Regression

Problem Defination

Check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Setup

Loading Libraries

library(ggplot2)
library(corrgram)
library(gridExtra)

Creating Dataset from the given vectors

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# Checking for number of records for hght vector
length(hght)
## [1] 20
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)
# Checking for number of records for hght vector
length(wght)
## [1] 20
# Creating a data frame for the above two vectors
dfrBodyMass <- data.frame(hght,wght)
names(dfrBodyMass) <- c("Height_cm","Weight_kg")
class(dfrBodyMass)
## [1] "data.frame"
head(dfrBodyMass)
##   Height_cm Weight_kg
## 1       151        63
## 2       174        81
## 3       138        56
## 4       186        91
## 5       128        47
## 6       136        57

Exploratory Analysis

# Check for Height_cm & Weight_kg
summary(dfrBodyMass$Height_cm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(dfrBodyMass$Weight_kg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
nrow(dfrBodyMass)
## [1] 20

Checking for outliers for the column Height_cm

Height_cm.qnt <- quantile(dfrBodyMass$Height_cm, probs=c(.25, .75))
Height_cm.max <- 1.5 * IQR(dfrBodyMass$Height_cm)  
Height_cm.out <- dfrBodyMass$Height_cm
Height_cm.out[dfrBodyMass$Height_cm < (Height_cm.qnt[1] - Height_cm.max)] <- NA
Height_cm.out[dfrBodyMass$Height_cm > (Height_cm.qnt[2] + Height_cm.max)] <- NA
print(dfrBodyMass$Height_cm[is.na(Height_cm.out)])
## numeric(0)

Checking for outliers for the column Weight_kg

Weight_kg.qnt <- quantile(dfrBodyMass$Weight_kg, probs=c(.25, .75))
Weight_kg.max <- 1.5 * IQR(dfrBodyMass$Weight_kg)
Weight_kg.out <- dfrBodyMass$Weight_kg
Weight_kg.out[dfrBodyMass$Weight_kg < (Weight_kg.qnt[1] - Weight_kg.max)] <- NA
Weight_kg.out[dfrBodyMass$Weight_kg > (Weight_kg.qnt[2] + Weight_kg.max)] <- NA
print(dfrBodyMass$Weight_kg[is.na(Weight_kg.out)])
## numeric(0)

Plotting outliers plot

# check outliers in Weight
WeightPlot <- ggplot(dfrBodyMass, aes(x="", y=Weight_kg)) +
            geom_boxplot(aes(fill=Weight_kg), color="green") +
            labs(title="Weight Outliers")
                              
# check outliers in Height
HeightPlot <- ggplot(dfrBodyMass, aes(x="", y=Height_cm)) +
            geom_boxplot(aes(fill=Height_cm), color="blue") +
            labs(title="Height Outliers")
# show plot
grid.arrange(WeightPlot, HeightPlot, nrow=1, ncol=2)

Observation 1. There are no outliers in the given data for both the parameters i.e. Height and Weight

Correlation

cor.test(dfrBodyMass$Height_cm, dfrBodyMass$Weight_kg)
## 
##  Pearson's product-moment correlation
## 
## data:  dfrBodyMass$Height_cm and dfrBodyMass$Weight_kg
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644

Observation 1. The correlation coefficient value (r) = 0.944. 2. This indicates that there exist strong positive correlation between the parameters.

Correlation Visualization

pairs(dfrBodyMass)

corrgram(dfrBodyMass)

Observation 1. There exist strong positive correlation between the parameters Height and Weight.

Plot Graph

# base chart
plot(dfrBodyMass$Weight_kg,dfrBodyMass$Height_cm, col="blue", main="Regression",
abline(lm(dfrBodyMass$Height_cm~dfrBodyMass$Weight_kg)), cex=1, pch=16, xlab="Height in cm", ylab="Weight in kg")

# ggplot
ggplot(dfrBodyMass, aes(x=Height_cm, y=Weight_kg)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Weight & Height Regression") +
    labs(x="Height in cm") +
    labs(y="Weight in kg")

Observation
1. It seen that as the weight increases height also increases

Linear Model

y <- dfrBodyMass$Weight_kg
x <- dfrBodyMass$Height_cm
lmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

summary(lmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
1. R-squared: Ideal Scenario:- Should be as much closer to 1 as possible Present Scenario:- R-Square being more than 0.89 indicates strong relationship 2. P-Value : Ideal Scenario:- Should be less than 0.05 Present Scenario:- P-Value of Weight (x) is less than 0.05 3. Model is acceptable and we can use this for predictive analytics

Test Data

# Find the weight for the given values of heights
dfrReqData <- data.frame(x=c(160, 170, 180))
dfrReqData
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict

result <-  predict(lmModel, dfrReqData)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Observation
Prediction is on expected lines.