Simple Linear Regression

Problem Defination

Check if there is good correlation in the above dataset and if it can be used for regression model
If yes, predict weight for the following heights 160, 170, 180

Dataset

# height in cms
hght <- c(151, 174, 138, 186, 128, 136, 179, 163, 152, 131, 153, 177, 148, 189, 138, 146, 199, 167, 153, 130)
# weight in kgs
wght <- c(63, 81, 56, 91, 47, 57, 76, 72, 62, 48, 65, 84, 59, 93, 49, 55, 79, 75, 66, 49)
library(ggplot2)
library(corrgram)
library(gridExtra)

Setup

Dataset

dfrModel1 <- data.frame(hght, wght)
names(dfrModel1) <- c("hght","wght")
head(dfrModel1)
##   hght wght
## 1  151   63
## 2  174   81
## 3  138   56
## 4  186   91
## 5  128   47
## 6  136   57

Exploratory Analysis

# check ut hght & wght
summary(hght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   128.0   138.0   152.5   156.9   174.8   199.0
summary(wght)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.00   55.75   64.00   66.35   76.75   93.00
# ?quantile()
hght.qnt <- quantile(hght, probs=c(.25, .75))
# ?IQR()
hght.max <- 1.5 * IQR(hght)
hght.out <- hght
hght.out[hght < (hght.qnt[1] - hght.max)] <- NA
hght.out[hght > (hght.qnt[2] + hght.max)] <- NA
#print(hght)
#print(hght.out)
print(hght[is.na(hght.out)])
## numeric(0)
wght.qnt <- quantile(wght, probs=c(.25, .75))
wght.max <- 1.5 * IQR(wght)
wght.out <- wght
wght.out[wght < (wght.qnt[1] - wght.max)] <- NA
wght.out[wght > (wght.qnt[2] + wght.max)] <- NA
#print(wght)
#print(wght.out)
print(wght[is.na(wght.out)])
## numeric(0)
# correlation coefficient
cor(hght,wght)
## [1] 0.944644
#cor(x, y, method = c("pearson", "kendall", "spearman"))
# correlation test
cor.test(hght,wght)
## 
##  Pearson's product-moment correlation
## 
## data:  hght and wght
## t = 12.215, df = 18, p-value = 3.788e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8627911 0.9782375
## sample estimates:
##      cor 
## 0.944644
#cor.test(x, y, method=c("pearson", "kendall", "spearman"))

Observation
Good Positive correlation is obeserved between hght & wght

visualize correlation

pairs(dfrModel1)

plot(dfrModel1)

corrgram(dfrModel1)

Observation
Decent Positive correlation is obeserved between hght & wght

Plot Graph

# ggplot
ggplot(dfrModel1, aes(x=hght, y=wght)) +
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method='lm', formula=y~x) +
    labs(title="hght & wght regression") +
    labs(x="hght") +
    labs(y="wght")

Observation
It is seen that as hght increases wght increases

Linear Model

x <- hght
y <- wght
slmModel1 <- lm(y~x)
# print summary
summary(slmModel1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.1573  -1.7267   0.7701   2.6045   6.2102 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.55669    8.25032  -4.067 0.000723 ***
## x             0.63675    0.05213  12.215 3.79e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.846 on 18 degrees of freedom
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8864 
## F-statistic: 149.2 on 1 and 18 DF,  p-value: 3.788e-10

Observation

R-Square being more than 75 is good P-Value of hght (x) is less than 0.05 … Model is acceptable and we can use this for predictive analytics

Test Data

# find mpg of a car with weight 3.0
dfrTest1 <- data.frame(x=c(160,170,180))
#names(dfrTest) <- c("x")
dfrTest1 
##     x
## 1 160
## 2 170
## 3 180

Observation
Test Data successfully created.

result <-  predict(slmModel1, dfrTest1)
print(result)
##        1        2        3 
## 68.32394 74.69148 81.05902