To check if there is a good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180
Setup
Loading Libraries
library(ggplot2)
library(corrgram)
library(gridExtra)
# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)
dfrModel <- data.frame(hght,wght)
head(dfrModel)
## hght wght
## 1 151 63
## 2 174 81
## 3 138 56
## 4 186 91
## 5 128 47
## 6 136 57
Exploratory Analysis
# to check out height & weight
summary(dfrModel$hght)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 128.0 138.0 152.5 156.9 174.8 199.0
summary(dfrModel$wght)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.00 55.75 64.00 66.35 76.75 93.00
nrow(dfrModel)
## [1] 20
Finding out whether outliers are present in given heights
# ?quantile()
hght.qnt <- quantile(dfrModel$hght, probs=c(.25, .75))
# ?IQR()
hght.max <- 1.5 * IQR(dfrModel$hght)
hght.out <- dfrModel$hght
hght.out[dfrModel$hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[dfrModel$hght > (hght.qnt[2] + hght.max)] <- NA
print(dfrModel$hght[is.na(hght.out)])
## numeric(0)
Finding out whether outliers are present in given weights
# ?quantile()
wght.qnt <- quantile(dfrModel$wght, probs=c(.25, .75))
# ?IQR()
wght.max <- 1.5 * IQR(dfrModel$wght)
wght.out <- dfrModel$wght
wght.out[dfrModel$wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[dfrModel$wght > (wght.qnt[2] + wght.max)] <- NA
print(dfrModel$wght[is.na(wght.out)])
## numeric(0)
visualizing outliers
# checking outliers in height
hghtPlot <- ggplot(dfrModel, aes(x="", y=hght)) +
geom_boxplot(aes(fill=hght), color="green") +
labs(title="Height Outliers")
hghtPlot
# checking outliers in weight
wghtPlot <- ggplot(dfrModel, aes(x="", y=wght)) +
geom_boxplot(aes(fill=wght), color="blue") +
labs(title="Weight Outliers")
wghtPlot
# show plot
grid.arrange(hghtPlot, wghtPlot, nrow=1, ncol=2)
Observation
No outliers are present in hght & wght.
Zero outlier count
Correlation
# correlation coefficient
cor(dfrModel$wght, dfrModel$hght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(dfrModel$wght, dfrModel$hght)
##
## Pearson's product-moment correlation
##
## data: dfrModel$wght and dfrModel$hght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8627911 0.9782375
## sample estimates:
## cor
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))
Observation
Strong positive correlation is obeserved between hght & wght
Correlation Visualization
# visualize correlation
pairs(dfrModel)
#plot(dfrModel)
Observation
Strong positive correlation is obeserved between hght & wght
Plot Graph
# base chart
plot(dfrModel$hght,dfrModel$wght, col="blue", main="Regression",
abline(lm(dfrModel$wght~dfrModel$hght)), cex=1, pch=16, xlab="height in cms", ylab="Weight in KG Tonnes")
# ggplot
ggplot(dfrModel, aes(x=hght, y=wght)) +
geom_point(shape=19, colour="blue", fill="blue") +
geom_smooth(method='lm', formula=y~x) +
labs(title="Height & Weight Regression") +
labs(x="Height in Cms") +
labs(y="Weight in KG Tones")
Observation
It is seen that as height increases, weight also increases
Making a Linear Model
x <- dfrModel$hght
y <- dfrModel$wght
slmModel <- lm(y~x)
Observation
No errors. Model successfully created.
Show Model
# print summary
summary(slmModel)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1573 -1.7267 0.7701 2.6045 6.2102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.55669 8.25032 -4.067 0.000723 ***
## x 0.63675 0.05213 12.215 3.79e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared: 0.8924, Adjusted R-squared: 0.8864
## F-statistic: 149.2 on 1 and 18 DF, p-value: 3.788e-10
Observation
## R-squared: Should be as much closer to 1 as possible Adjusted R-Square value here is 88 ,which being more than 75 is good ## P-Value: Should be less than 0.05 P-Value of height (x) is less than 0.05 … Model is acceptable and we can use this for predictive analytics
Test Data
# find mpg of a car with weight 3.0
dfrTest <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest
## x
## 1 160
## 2 170
## 3 180
Observation
Test Data successfully created.
Predict
result <- predict(slmModel, dfrTest)
print(result)
## 1 2 3
## 68.32394 74.69148 81.05902
Observation
Prediction is as expected.