Simple Linear Regression

Problem Defination

Check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Setup

library(ggplot2)
library(corrgram)
library(gridExtra)

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)

Dataset

dfrModel <- data.frame(wght, hght)
names(dfrModel) <- c("wght","hght")
head(dfrModel)
##   wght hght
## 1   63  151
## 2   81  174
## 3   56  138
## 4   91  186
## 5   47  128
## 6   57  136

Exploratory Analysis

# check ut mpg & wt
summary(dfrModel$wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
summary(dfrModel$hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
length(dfrModel$wght)
## [1] 20
length(dfrModel$hght)
## [1] 20
# ?quantile()
wght.qnt <- quantile(dfrModel$wght, probs=c(.25, .75))
# ?IQR()
wght.max <- 1.5 * IQR(dfrModel$wght)
wght.out <- dfrModel$wght
wght.out[dfrModel$wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[dfrModel$wght > (wght.qnt[2] + wght.max)] <- NA
#print(dfrModel$wght)
#print(wght.out)
print(dfrModel$wght[is.na(wght.out)])
## numeric(0)
hght.qnt <- quantile(dfrModel$hght, probs=c(.25, .75))
hght.max <- 1.5 * IQR(dfrModel$hght)
hght.out <- dfrModel$hght
hght.out[dfrModel$hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[dfrModel$hght > (hght.qnt[2] + hght.max)] <- NA
#print(dfrModel$hght)
#print(hght.out)
print(dfrModel$hght[is.na(hght.out)])
## numeric(0)
# check outliers in wght
wghtPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
            geom_boxplot(aes(fill=wght), color="green") +
            labs(title="Weight Outliers")
wghtPlot  

# check out hght
hghtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
            geom_boxplot(aes(fill=hght), color="blue") +
            labs(title="Height Outliers")
hghtPlot

# show plot
#grid.arrange(wghtPlot, hghtPlot, nrow=1, ncol=2)

Observation
No Outliers present in wght & hght.
For this model we will work without the outliers.

Correlation

# correlation coefficient
#cor(dfrModel$wght, dfrModel$hght)
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(dfrModel$wght, dfrModel$hght)
## 
##  Pearson's product-moment correlation
## 
## data:  dfrModel$wght and dfrModel$hght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))

Observation
Decent posiive correlation is obeserved between wght & hght

Correlation Visualization

# visualize correlation
pairs(dfrModel)

#plot(dfrModel)
# visualize correlation
# http://www.statmethods.net/advgraphs/correlograms.html
corrgram(dfrModel)

Observation
Decent positive correlation is obeserved between wght & hght

Plot Graph

# base chart
plot(dfrModel$wght,dfrModel$hght, col="blue", main="Regression",
abline(lm(dfrModel$hght~dfrModel$wght)), cex=1, pch=16, xlab="Weight in Kg Tones", ylab="Heights in cms")

# ggplot
ggplot(dfrModel, aes(x=wght, y=hght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Weight & height Regression") +
    labs(x="Weight in Kg Tons") +
    labs(y="Height in cms")

Observation
It seen that as hght increases wght also increases. This shows a positive relation between the two.

Linear Model

x <- dfrModel$hght
y <- dfrModel$wght
slmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
## R-squared: Should be as much closer to 1 as possible R-Square being more than 75 is good ## P-Value: Should be less than 0.05 P-Value of wt (x) is less than 0.05 … Model is acceptable and we can use this for predictive analytics.

Test Data

# find wght when hght is 160,170 and 180
dfrTest <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict

result <-  predict(slmModel, dfrTest)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Observation
Prediction is on expected lines. The weight increases with increase in the height. This shows the positive regressive model.