Problem Definition

Check whether there is a correlation between weight and height, and whether that relationship can be used as a regression model.
If yes, predict the weight (kg) for the following heights (cm): 160, 170, 180.

Dataset

# Observed heights in centimetres (20 observations).
hght <- c(
  151, 174, 138, 186, 128, 136, 179, 163, 152, 131,
  153, 177, 148, 189, 138, 146, 199, 167, 153, 130
)
# Observed weights in kilograms, paired element-wise with hght.
wght <- c(
  63, 81, 56, 91, 47, 57, 76, 72, 62, 48,
  65, 84, 59, 93, 49, 55, 79, 75, 66, 49
)

Setup

Load Libraries

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
#install.packages("corrgram")
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
#install.packages("gridExtra")
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.3.3

Dataset

# Assemble the modelling data frame: one row per observation, weight first.
dfrModel <- data.frame(wght = wght, hght = hght)
# Preview the first rows.
head(dfrModel)
##   wght hght
## 1   63  151
## 2   81  174
## 3   56  138
## 4   91  186
## 5   47  128
## 6   57  136

Exploratory Analysis

# Descriptive statistics for each variable, followed by the sample size.
summary(dfrModel[["wght"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
summary(dfrModel[["hght"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
nrow(dfrModel)
## [1] 20

Check for outliers in Height

# Flag heights lying outside the 1.5 * IQR fences around the quartiles.
# hght.qnt holds the 1st and 3rd quartiles; hght.max is the fence width.
hght.qnt <- quantile(dfrModel$hght, probs = c(0.25, 0.75))
hght.max <- 1.5 * IQR(dfrModel$hght)
# Copy the heights, replacing any value beyond either fence with NA.
hght.out <- replace(
  dfrModel$hght,
  dfrModel$hght < (hght.qnt[1] - hght.max) |
    dfrModel$hght > (hght.qnt[2] + hght.max),
  NA
)
# Show the flagged heights (an empty vector means no outliers were found).
print(dfrModel$hght[is.na(hght.out)])
## numeric(0)

Check for outliers in Weight

# Flag weights lying outside the 1.5 * IQR fences around the quartiles.
# wght.qnt holds the 1st and 3rd quartiles; wght.max is the fence width.
wght.qnt <- quantile(dfrModel$wght, probs = c(0.25, 0.75))
wght.max <- 1.5 * IQR(dfrModel$wght)
# Copy the weights, replacing any value beyond either fence with NA.
wght.out <- replace(
  dfrModel$wght,
  dfrModel$wght < (wght.qnt[1] - wght.max) |
    dfrModel$wght > (wght.qnt[2] + wght.max),
  NA
)
# Show the flagged weights (an empty vector means no outliers were found).
print(dfrModel$wght[is.na(wght.out)])
## numeric(0)

Observation
No outliers are present in the hght and wght data (using the 1.5 x IQR rule).
Thus extreme individual values will not distort the model.

Correlation between Height and Weight

# Pearson correlation coefficient between weight and height.
with(dfrModel, cor(wght, hght))
## [1] 0.944644
# Significance test of the same correlation (two-sided alternative).
cor.test(dfrModel$hght, dfrModel$wght, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  dfrModel$hght and dfrModel$wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644

Observation
A strong positive correlation (r = 0.94, p < 0.001) is observed between height and weight.

Correlation Visualization

# visualize correlation
# Base-graphics plot of the whole data frame (both columns).
plot(dfrModel)

# Correlogram of the same data frame, drawn by the corrgram package.
corrgram(dfrModel)

Observation
A strong positive correlation is observed between height and weight.

Plot Correlation Graph

# Scatter plot of weight against height with a fitted linear-model smooth.
ggplot(dfrModel, aes(x = hght, y = wght)) +
  geom_point(shape = 19, colour = "blue", fill = "blue") +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Weight & Height Regression",
    x = "Height in Cms",
    y = "Weight in Kgs"
  )

Observation
It is observed that as Height increases the Weight tends to increase.

Linear Model

# Simple linear regression of weight (response) on height (predictor).
# The variables are bound to the plain names x and y, so the model's
# predictor term is called "x"; new data for predict() must use that name.
y <- dfrModel[["wght"]]
x <- dfrModel[["hght"]]
slmModel <- lm(y ~ x)

Observation
No errors found. As a result the model is successfully created.

Show Model

# print summary
# Shows residual quantiles, coefficient estimates with their p-values,
# R-squared and the overall F-test for the fitted model.
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
R-squared is found to be 0.8924, which indicates a good fit. The p-value of hght (x) is less than 0.05. Thus, the model is acceptable and we can use it for predictive analytics.

Test Data for prediction

# Heights (cm) for which weight (kg) is to be predicted: 160, 170, 180.
# The single column is named x to match the predictor name in the model.
dfrTest <- data.frame(x = c(160, 170, 180))
print(dfrTest)
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predictive Analytics Test

# Predict weight (kg) for each test height from the fitted model.
result <- predict(slmModel, dfrTest)
# Round to two decimal places. The previous round(result) rounded to whole
# kilograms first, so the digits = 2 given to print() had no decimals to show.
print(round(result, digits = 2))
##     1     2     3 
## 68.32 74.69 81.06

Observation
The weight for a height of 160 cm is predicted to be 68.32 kg, for 170 cm it is 74.69 kg, and for 180 cm it is 81.06 kg.