Simple Linear Regression

Problem Defination

The data set consists of height(in cms) of certain people and weights (in kgs). We need check if there is good correlation in the above dataset and if it can be used for regression model. If yes, predict weight for the following heights 160, 170, 180

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)

Setup

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.3.3

Dataset

dfrModel <- data.frame(hght, wght)
names(dfrModel) <- c("hght","wght")
head(dfrModel)
##   hght wght
## 1  151   63
## 2  174   81
## 3  138   56
## 4  186   91
## 5  128   47
## 6  136   57

Exploratory Analysis

# check ut hght & wght
summary(dfrModel$hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(dfrModel$wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
nrow(dfrModel)
## [1] 20
length(dfrModel$hght)
## [1] 20
length(dfrModel$wght)
## [1] 20
# ?quantile()
wght.qnt <- quantile(dfrModel$wght, probs=c(.25, .75))
# ?IQR()
wght.max <- 1.5 * IQR(dfrModel$wght) #1.5 is thumb rule
wght.out <- dfrModel$wght
wght.out[dfrModel$wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[dfrModel$wght > (wght.qnt[2] + wght.max)] <- NA
#print(dfrModel$wght)
#print(wght.out)
print(dfrModel$wght[is.na(wght.out)])
## numeric(0)
hght.qnt <- quantile(dfrModel$hght, probs=c(.25, .75))
hght.max <- 1.5 * IQR(dfrModel$hght) 
hght.out <- dfrModel$hght
hght.out[dfrModel$hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[dfrModel$hght > (hght.qnt[2] + hght.max)] <- NA
#print(dfrModel$hght)
#print(hght.out)
print(dfrModel$hght[is.na(hght.out)])
## numeric(0)
# check outliers in Weight
wghtPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
            geom_boxplot(aes(fill=wght), color="green") +
            labs(title="Weight Outliers")

# check out Height
hghtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
            geom_boxplot(aes(fill=hght), color="blue") +
            labs(title="Height Outliers")
# show plot 
grid.arrange(hghtPlot, wghtPlot, nrow=1, ncol=2)

Observation
No outliers present in height and weight

Correlation

# correlation coefficient

cor(dfrModel$hght, dfrModel$wght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(dfrModel$hght, dfrModel$wght)
## 
##  Pearson's product-moment correlation
## 
## data:  dfrModel$hght and dfrModel$wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))

Observation
The Correlation Co-efficient is 0.944644.
This indicates that there is a Strong Positive correlation is between height & weight

Correlation Visualization

# visualize correlation
pairs(dfrModel)

Observation
Positive correlation is observed between weight & height

# visualize correlation
# http://www.statmethods.net/advgraphs/correlograms.html
corrgram(dfrModel)

Observation
Strong Positive correlation is observed between height and weight

Plot Graph

# base chart
plot(dfrModel$wght,dfrModel$hght, col="blue", main="Regression",
abline(lm(dfrModel$hght~dfrModel$wght)), cex=1, pch=16, xlab="height in cms", ylab="weight in kgs")

# ggplot
ggplot(dfrModel, aes(x=hght, y=wght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Weight & height Regression") +
    labs(x="height in cms") +
    labs(y="weight in kgs")

Observation
It is seen that height increases as weight increases

Linear Model

x <- dfrModel$hght
y <- dfrModel$wght
slmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
The P value is less than 0.05, thus, this model is good for predicting the weights based on any given height value

R-squared: Should be as much closer to 1 as possible R-Square being more than 75 is good

P-Value: Should be less than 0.05
P-Value of hght (x) is less than 0.05.Therefore model is acceptable and we can use this for predictive analytics

Test Data

# find wght of a person with weight 160,170,180
dfrTest <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict
To predict the weights of people with height 160, 170 and 180 cms respectively

result <-  predict(slmModel, dfrTest)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Results
The weights of people with heights 160, 170 and 180 cms have been predicted respectively as above.