Problem Definition

Check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Data Description

The data comprises of heights and weights for 20 people. A data frame with 20 observations on 2 variables
[, 1] height in cms
[, 2] weight in kgs

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)

Setup

library(ggplot2)
library(corrgram)
library(gridExtra)

Dataset

dfrModel <- data.frame(hght, wght)
names(dfrModel) <- c("hght","wght")
head(dfrModel)
##   hght wght
## 1  151   63
## 2  174   81
## 3  138   56
## 4  186   91
## 5  128   47
## 6  136   57

Exploratory Analysis

# check hght & wght
summary(dfrModel$hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(dfrModel$wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
nrow(dfrModel)
## [1] 20
mpg.qnt <- quantile(wght, probs=c(.25, .75))
mpg.max <- 1.5 * IQR(wght)
mpg.out <- wght
mpg.out[wght < (mpg.qnt[1] - mpg.max)] <- NA
mpg.out[wght > (mpg.qnt[2] + mpg.max)] <- NA
print(wght[is.na(mpg.out)])
## numeric(0)
wt.qnt <- quantile(hght, probs=c(.25, .75))
wt.max <- 1.5 * IQR(hght)
wt.out <- hght
wt.out[hght < (wt.qnt[1] - wt.max)] <- NA
wt.out[hght > (wt.qnt[2] + wt.max)] <- NA
print(hght[is.na(wt.out)])
## numeric(0)
# check outliers in wght
mpgPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
            geom_boxplot(aes(fill=wght), color="green") +
            labs(title="Weight Outliers")
                              
# check outliers in hght
wtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
            geom_boxplot(aes(fill=hght), color="blue") +
            labs(title="Height Outliers")
# show plot
grid.arrange(wtPlot, mpgPlot, nrow=1, ncol=2)

Observation
No outliers present in hght & wght.

Correlation

# correlation coefficient
cor(hght, wght)
## [1] 0.944644
# correlation test
cor.test(hght, wght)
## 
##  Pearson's product-moment correlation
## 
## data:  hght and wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644

Correlation Visualization

# visualize correlation
pairs(dfrModel)

plot(dfrModel)

# ggplot
ggplot(dfrModel, aes(x=hght, y=wght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Weight & Height Regression") +
    labs(x="Height") +
    labs(y="Weight")

Linear Model

x <- hght
y <- wght
slmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
R-Square being .89 is very good. P-Value of hght (x) is less than 0.05. Model is acceptable and we can use this for predictive analytics.

Test Data

# find wght
dfrTest <- data.frame(x=c(160, 170, 180))
dfrTest 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict

result <-  predict(slmModel, dfrTest)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Observation
Prediction is on expected lines.