Simple Linear Regression

Problem Defination

We have a data set available with us us, which includes the heights (in cms) of certain people and their respective weights (in kgs).
Here, we need to check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)

Setup

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.3.3

Dataset

dfrModel <- data.frame(hght, wght)
names(dfrModel) <- c("hght","wght")
head(dfrModel)
##   hght wght
## 1  151   63
## 2  174   81
## 3  138   56
## 4  186   91
## 5  128   47
## 6  136   57

Exploratory Analysis

# check ut mpg & wt
summary(hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
# ?quantile()
wght.qnt <- quantile(wght, probs=c(.25, .75))
# ?IQR()
wght.max <- 1.5 * IQR(wght)
wght.out <- wght
wght.out[wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[wght > (wght.qnt[2] + wght.max)] <- NA
#print(dfrModel$mpg)
#print(mpg.out)
print(wght[is.na(wght.out)])
## numeric(0)
hght.qnt <- quantile(hght, probs=c(.25, .75))
hght.max <- 1.5 * IQR(hght)
hght.out <- dfrModel$wt
hght.out[hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[hght > (hght.qnt[2] + hght.max)] <- NA
#print(dfrModel$wt)
#print(wt.out)
print(hght[is.na(hght.out)])
##  [1] 151 174 138 186 128 136 179 163 152 131 153 177 148 189 138 146 199
## [18] 167 153 130
# check outliers in weight
wghtPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
            geom_boxplot(aes(fill=wght), color="green") +
            labs(title="Weight Outliers")
                              
# check out height
hghtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
            geom_boxplot(aes(fill=hght), color="blue") +
            labs(title="Height Outliers")
# show plot
grid.arrange(hghtPlot, wghtPlot, nrow=1, ncol=2)

Correlation

# correlation coefficient
cor(dfrModel$hght, dfrModel$wght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(dfrModel$hght, dfrModel$wght)
## 
##  Pearson's product-moment correlation
## 
## data:  dfrModel$hght and dfrModel$wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))

Observation
The Correlation Co-efficient is 0.944644.
This indicates that there is a strong positive correlation between the Height of a person and his/her weight.

# visualize correlation
#pairs(dfrModel)
plot(dfrModel)

# visualize correlation
# http://www.statmethods.net/advgraphs/correlograms.html
corrgram(dfrModel)

Plot Graph

# base chart
plot(dfrModel$wght,dfrModel$hght, col="blue", main="Regression",
abline(lm(dfrModel$hght~dfrModel$wght)), cex=1, pch=16, xlab="Height in Cm", ylab="Weight in Kg")

# ggplot
ggplot(dfrModel, aes(x=hght, y=wght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="Height and Weight Regression") +
    labs(x="Height in Cms") +
    labs(y="Weight in Kgs")

Linear Model

x <- dfrModel$hght
y <- dfrModel$wght
slmModel <- lm(y~x)

Observation
No errors. Model successfully created.

Show Model

# print summary
summary(slmModel)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation
The P value is less than 0.05, thus, this model is good for predicting the weights based on any given height value

R-squared: Should be as much closer to 1 as possible
R-Square being more than 75 is good
P-Value: Should be less than 0.05
P-Value of wt (x) is less than 0.05 …
Model is acceptable and we can use this for predictive analytics

Test Data

# find Weight of a person with heoghts 160, 170 and 180
dfrTest <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

Predict
To predict the weights of people with height 160, 170 and 180 cms respectively, using the above model.

result <-  predict(slmModel, dfrTest)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902

Results
The weights of people with heights 160, 170 and 180 cms have been predicted respectively as above.