setwd("~/Desktop/Portfolio/Height-Weight")
library(ggplot2)
library(reshape2)
library(plyr)
library(dplyr)
library(tidyr)
library(caTools)
library(VIM)
library(magrittr)
z10 <- read.table("HT&WT.txt", stringsAsFactors=FALSE)
z11 <- z10[, -1] ## Removed index
Z11 <- colnames(z11) <- c("height", "weight")
head(z11)
## height weight
## 1 Height(Inches) Weight(Pounds)
## 2 65.78331 112.9925
## 3 71.51521 136.4873
## 4 69.39874 153.0269
## 5 68.2166 142.3354
## 6 67.78781 144.2971
z11$height <- as.numeric(z11$height)
## Warning: NAs introduced by coercion
height <- as.numeric(z11$height)
z11$weight <- as.numeric(z11$weight)
## Warning: NAs introduced by coercion
weight <- as.numeric(z11$weight)
z11 <- na.omit(z11) ## Removes row/observation with excessive missing value
set.seed(123)
split <- sample(seq_len(nrow(z11)), size = floor(0.75 * nrow(z11)))
split2 <- sample(seq_len(nrow(z11)), size = floor(0.25 * nrow(z11)))
trainData <- z11[split, ]
testData <- z11[split2, ]
str(trainData)
## 'data.frame': 18749 obs. of 2 variables:
## $ height: num 69.9 65.6 65.5 68.8 68.9 ...
## $ weight: num 128 107 105 126 125 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:2] 1 24983
## .. ..- attr(*, "names")= chr [1:2] "1" "24983"
summary(trainData)
## height weight
## Min. :60.28 Min. : 78.01
## 1st Qu.:66.70 1st Qu.:119.40
## Median :68.00 Median :127.22
## Mean :67.99 Mean :127.14
## 3rd Qu.:69.28 3rd Qu.:134.95
## Max. :75.15 Max. :170.92
str(testData)
## 'data.frame': 6249 obs. of 2 variables:
## $ height: num 68.9 68.7 69.7 69.4 71.1 ...
## $ weight: num 132 149 145 112 135 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:2] 1 24983
## .. ..- attr(*, "names")= chr [1:2] "1" "24983"
summary(testData)
## height weight
## Min. :61.91 Min. : 78.01
## 1st Qu.:66.73 1st Qu.:119.21
## Median :68.03 Median :127.11
## Mean :67.99 Mean :127.00
## 3rd Qu.:69.25 3rd Qu.:134.75
## Max. :74.85 Max. :168.23
library(ggplot2)
## WEIGHT TO HEIGHT RELATIONSHIP
sp1 <- ggplot(data = z11, aes(x = height, y = weight)) + ylab("weight -- pounds") + xlab("height--inches")
sp1 + geom_point(position = "jitter", color = "blue", alpha = 0.1) + geom_smooth(method = "lm", color = "red") + geom_smooth(color = "orange")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## DENITY GRAPHS SHOWING DISTRIBUTION OF WEIGHT & HEIGHT IN DATASET
ggplot(z11, aes(x = weight)) + geom_density(fill = "yellow") + xlab("weight -- pounds")
ggplot(z11, aes(x = height)) + geom_density(fill = "red")+ xlab("height -- inches")
predWeight_model <- lm(weight ~ height, data = trainData)
cor(z11$weight, z11$height) ##3 correlation
## [1] 0.5028351
head(predWeight_model$residuals, 20) ## 20 sample residuals
## 7191 19708 10225 22073 23508 1140
## -5.2438727 -13.0379206 -14.1011964 -4.0420285 -4.7571259 16.1326566
## 13200 22305 13782 11412 23912 11329
## 0.2819167 7.3448544 -0.4759996 -1.0835996 -7.1723067 0.7170434
## 16932 14309 2573 22483 6150 1052
## -6.6014104 1.5501176 -10.0210291 12.4137565 -3.5355182 -10.2325670
## 8193 23845
## 0.7077147 -3.6545638
mean(predWeight_model$residuals) ## maen of residuals near
## [1] -7.906255e-17
summary(predWeight_model)
##
## Call:
## lm(formula = weight ~ height, data = trainData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.359 -6.750 -0.053 6.874 38.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -82.69303 2.62365 -31.52 <2e-16 ***
## height 3.08609 0.03857 80.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.08 on 18747 degrees of freedom
## Multiple R-squared: 0.2545, Adjusted R-squared: 0.2545
## F-statistic: 6401 on 1 and 18747 DF, p-value: < 2.2e-16
dim(trainData)
## [1] 18749 2
par(mfrow=c(2,2))
plot(predWeight_model)
Evaluation shows, thus far, contrsdictory signals as to model suitability.
– Very kow p-value ( 2.2e-16) indicates high significance/relation between height and weight.
– However, the R-Squared statistic show that our model explains only 25% of the variance in the data.
– And also, mean of the residual (-0.045) sesm to converge towards zero which suggests a near (but not perfect) fit of the modal to the data - some predictive value.
confint(predWeight_model) ## confidance interval
## 2.5 % 97.5 %
## (Intercept) -87.835627 -77.550441
## height 3.010486 3.161698
prediction <- predict(predWeight_model, newdata = testData)
head(prediction)
## 4384 3123 10058 19622 446 19571
## 130.0230 129.4010 132.5045 131.5324 136.7398 114.6026
prediction <- as.numeric(prediction)
head(testData$weight)
## [1] 131.7141 148.7980 144.7280 112.3215 134.7804 112.9760
actual <- as.numeric(testData$weight)
predictdataB <- cbind(actual, prediction)
predictdataB <- as.data.frame(predictdataB)
predictdataB <- dplyr::mutate(predictdataB, residual = prediction - actual)
## 20 observatiions
head(predictdataB, 20)
## actual prediction residual
## 1 131.7141 130.0230 -1.691142
## 2 148.7980 129.4010 -19.396982
## 3 144.7280 132.5045 -12.223546
## 4 112.3215 131.5324 19.210897
## 5 134.7804 136.7398 1.959437
## 6 112.9760 114.6026 1.626561
## 7 114.2239 130.4802 16.256263
## 8 125.1168 123.8093 -1.307533
## 9 111.5448 129.5808 18.035983
## 10 133.0581 121.6350 -11.423108
## 11 147.5404 132.9607 -14.579699
## 12 127.6106 128.8800 1.269424
## 13 131.1966 123.2691 -7.927522
## 14 130.6425 136.4735 5.831007
## 15 124.0167 127.4889 3.472207
## 16 119.8945 124.6474 4.752950
## 17 153.3295 131.2546 -22.074852
## 18 118.7652 126.0393 7.274108
## 19 129.9915 127.2305 -2.760992
## 20 119.2446 138.6589 19.414323
mean(predictdataB$residual)
## [1] 0.1444634
SST <- sum((testData$weight - mean(testData$weight)) ^ 2)
SSE <- sum((testData$weight - prediction) ^ 2)
1 - SSE/SST
## [1] 0.2507801