Simple Linear Regression

Problem Defination

To check if there is a good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Setup

Loading Libraries

library(ggplot2)
library(corrgram)
library(gridExtra)

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)

dfrModel <- data.frame(hght,wght)
head(dfrModel)
##   hght wght
## 1  151   63
## 2  174   81
## 3  138   56
## 4  186   91
## 5  128   47
## 6  136   57

Exploratory Analysis

# to check out height & weight
summary(dfrModel$hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(dfrModel$wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
nrow(dfrModel)
## [1] 20

Finding out whether outliers are present in given heights

# ?quantile()
hght.qnt <- quantile(dfrModel$hght, probs=c(.25, .75))
# ?IQR()
hght.max <- 1.5 * IQR(dfrModel$hght)
hght.out <- dfrModel$hght
hght.out[dfrModel$hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[dfrModel$hght > (hght.qnt[2] + hght.max)] <- NA
print(dfrModel$hght[is.na(hght.out)])
## numeric(0)

Finding out whether outliers are present in given weights

# ?quantile()
wght.qnt <- quantile(dfrModel$wght, probs=c(.25, .75))
# ?IQR()
wght.max <- 1.5 * IQR(dfrModel$wght)
wght.out <- dfrModel$wght
wght.out[dfrModel$wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[dfrModel$wght > (wght.qnt[2] + wght.max)] <- NA
print(dfrModel$wght[is.na(wght.out)])
## numeric(0)

visualizing outliers

# checking outliers in height
hghtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
            geom_boxplot(aes(fill=hght), color="green") +
            labs(title="Height Outliers")
 hghtPlot                             

# checking outliers in weight
wghtPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
            geom_boxplot(aes(fill=wght), color="blue") +
            labs(title="Weight Outliers")
wghtPlot

# show plot
grid.arrange(hghtPlot, wghtPlot, nrow=1, ncol=2)

Observation
No outliers are present in hght & wght.
Zero outlier count

Correlation

# correlation coefficient
cor(dfrModel$wght, dfrModel$hght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(dfrModel$wght, dfrModel$hght)
## 
##  Pearson's product-moment correlation
## 
## data:  dfrModel$wght and dfrModel$hght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))

Observation
Strong positive correlation is obeserved between hght & wght

Correlation Visualization

# visualize correlation
pairs(dfrModel)

#plot(dfrModel)

Observation
Strong positive correlation is obeserved between hght & wght

Plot Graph

# base chart
plot(dfrModel$hght,dfrModel$wght, col="blue", main="Regression",
abline(lm(dfrModel$wght~dfrModel$hght)), cex=1, pch=16, xlab="height in cms", ylab="Weight in KG Tonnes")

# ggplot
ggplot(dfrModel, aes(x=hght, y=wght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Height & Weight Regression") +
    labs(x="Height in Cms") +
    labs(y="Weight in KG Tones")

Observation
It is seen that as height increases, weight also increases

Making a Linear Model

x <- dfrModel$hght
y <- dfrModel$wght
slmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
## R-squared: Should be as much closer to 1 as possible Adjusted R-Square value here is 88 ,which being more than 75 is good ## P-Value: Should be less than 0.05 P-Value of height (x) is less than 0.05 … Model is acceptable and we can use this for predictive analytics

Test Data

# find mpg of a car with weight 3.0
dfrTest <- data.frame(x=c(160,170,180))

#names(dfrTest) <- c("x")
dfrTest 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict

result <-  predict(slmModel, dfrTest)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Observation
Prediction is as expected.