We begin by loading packages.
setwd("~/Desktop/Portfolio/Height-Weight")
library(ggplot2)
library(reshape2)
library(plyr)
library(dplyr)
library(tidyr)
library(caTools)
library(VIM)
z10 <- read.table("HT&WT.txt", stringsAsFactors=FALSE)
z11 <- z10[,-1]
z11 <- z11[-1, ]
colnames(z11) <- c("height", "weight")
head(z11)
## height weight
## 2 65.78331 112.9925
## 3 71.51521 136.4873
## 4 69.39874 153.0269
## 5 68.2166 142.3354
## 6 67.78781 144.2971
## 7 68.69784 123.3024
z11$height <- as.numeric(z11$height)
z11$weight <- as.numeric(z11$weight)
## Warning: NAs introduced by coercion
z11 <- na.omit(z11)
set.seed(123)
split <- sample(seq_len(nrow(z11)), size = floor(0.75 * nrow(z11)))
trainData <- z11[split, ]
testData <- z11[-split, ]
str(trainData)
## 'data.frame': 18749 obs. of 2 variables:
## $ height: num 69.9 65.6 65.5 68.8 68.9 ...
## $ weight: num 128 107 105 126 125 ...
## - attr(*, "na.action")=Class 'omit' Named int 24982
## .. ..- attr(*, "names")= chr "24983"
summary(trainData)
## height weight
## Min. :60.28 Min. : 78.01
## 1st Qu.:66.70 1st Qu.:119.40
## Median :68.00 Median :127.22
## Mean :67.99 Mean :127.14
## 3rd Qu.:69.28 3rd Qu.:134.95
## Max. :75.15 Max. :170.92
str(testData)
## 'data.frame': 6250 obs. of 2 variables:
## $ height: num 68.7 67.8 68.4 70.8 69.5 ...
## $ weight: num 123 141 130 142 103 ...
## - attr(*, "na.action")=Class 'omit' Named int 24982
## .. ..- attr(*, "names")= chr "24983"
summary(testData)
## height weight
## Min. :60.86 Min. : 86.5
## 1st Qu.:66.73 1st Qu.:119.1
## Median :67.98 Median :126.9
## Mean :68.00 Mean :126.9
## 3rd Qu.:69.25 3rd Qu.:134.7
## Max. :74.53 Max. :168.9
Overlapping Plots – Scatter plot & Regression best-fit line
library(ggplot2)
sp1 <- ggplot(data = z11, aes(x = height, y = weight, ylim))
sp1 + geom_point(position = "jitter", color = "blue", alpha = 0.1) + geom_smooth(method = "lm", color = "red") + geom_smooth( color = "orange")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
It really does look like there is a linear relationship. Execute an lm fit to see.
prediction_model <- lm(weight ~ height, data = trainData)
prediction_model
##
## Call:
## lm(formula = weight ~ height, data = trainData)
##
## Coefficients:
## (Intercept) height
## -82.693 3.086
summary(prediction_model)
##
## Call:
## lm(formula = weight ~ height, data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.359 -6.750 -0.053 6.874 38.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -82.69303 2.62365 -31.52 <2e-16 ***
## height 3.08609 0.03857 80.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.08 on 18747 degrees of freedom
## Multiple R-squared: 0.2545, Adjusted R-squared: 0.2545
## F-statistic: 6401 on 1 and 18747 DF, p-value: < 2.2e-16
confint(prediction_model, levels = 0.95)
## 2.5 % 97.5 %
## (Intercept) -87.835627 -77.550441
## height 3.010486 3.161698
Tesing of Prediction Model
## Predicted Values
prediction <- predict(prediction_model, newdata = testData)
head(prediction)
## 7 22 25 28 32 34
## 129.3148 126.6483 128.4631 135.9329 131.8622 126.5943
## ActualValuss
head(testData$weight)
## [1] 123.3024 141.2807 129.5027 142.4235 103.3016 125.7886