Check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180
# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)
library(ggplot2)
library(corrgram)
library(gridExtra)
Setup
Dataset
dfrModel1 <- data.frame(hght, wght)
names(dfrModel1) <- c("hght","wght")
head(dfrModel1)
## hght wght
## 1 151 63
## 2 174 81
## 3 138 56
## 4 186 91
## 5 128 47
## 6 136 57
Exploratory Analysis
# check ut hght & wght
summary(hght)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 128.0 138.0 152.5 156.9 174.8 199.0
summary(wght)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.00 55.75 64.00 66.35 76.75 93.00
# ?quantile()
hght.qnt <- quantile(hght, probs=c(.25, .75))
# ?IQR()
hght.max <- 1.5 * IQR(hght)
hght.out <- hght
hght.out[hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[hght > (hght.qnt[2] + hght.max)] <- NA
#print(hght)
#print(hght.out)
print(hght[is.na(hght.out)])
## numeric(0)
wght.qnt <- quantile(wght, probs=c(.25, .75))
wght.max <- 1.5 * IQR(wght)
wght.out <- wght
wght.out[wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[wght > (wght.qnt[2] + wght.max)] <- NA
#print(wght)
#print(wght.out)
print(wght[is.na(wght.out)])
## numeric(0)
# correlation coefficient
cor(hght,wght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(hght,wght)
##
## Pearson's product-moment correlation
##
## data: hght and wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8627911 0.9782375
## sample estimates:
## cor
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))
Observation
Good Positive correlation is obeserved between hght & wght
pairs(dfrModel1)
plot(dfrModel1)
corrgram(dfrModel1)
Observation
Decent Positive correlation is obeserved between hght & wght
Plot Graph
# ggplot
ggplot(dfrModel1, aes(x=hght, y=wght)) +
geom_point(shape=19, colour="blue", fill="blue") +
geom_smooth(method='lm', formula=y~x) +
labs(title="hght & wght regression") +
labs(x="hght") +
labs(y="wght")
Observation
It is seen that as hght increases wght increases
Linear Model
x <- hght
y <- wght
slmModel1 <- lm(y~x)
# print summary
summary(slmModel1)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1573 -1.7267 0.7701 2.6045 6.2102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.55669 8.25032 -4.067 0.000723 ***
## x 0.63675 0.05213 12.215 3.79e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared: 0.8924, Adjusted R-squared: 0.8864
## F-statistic: 149.2 on 1 and 18 DF, p-value: 3.788e-10
Observation
R-Square being more than 75 is good P-Value of hght (x) is less than 0.05 … Model is acceptable and we can use this for predictive analytics
Test Data
# find mpg of a car with weight 3.0
dfrTest1 <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest1
## x
## 1 160
## 2 170
## 3 180
Observation
Test Data successfully created.
result <- predict(slmModel1, dfrTest1)
print(result)
## 1 2 3
## 68.32394 74.69148 81.05902