Rashbir Singh Kohli (s3810585)
24/05/2020
bdims.csv is in csv(comma seperated value) format that is read using fread() function from data.table library.GatherDf <- df %>% gather(che.di, hgt, key = 'Parameter', value = 'value')
knitr::kable(GatherDf %>% group_by(GatherDf$Parameter) %>% summarise(Min = min(value,na.rm = TRUE),
Max = max(value, na.rm = TRUE),
n = n(),
Missing = sum(is.na(value)),
Q1 = quantile(value ,probs = .25,na.rm = TRUE),
Median = median(value, na.rm = TRUE),
Q3 = quantile(value, probs = .75,na.rm = TRUE),
Mean = mean(value, na.rm = TRUE),
SD = sd(value, na.rm = TRUE),
IQR = IQR(value ,na.rm = TRUE),
Corr = cor(df$che.di, df$hgt)), "html", caption = "Table 1: Descriptive Statistics", align = "llllllllll", col.names = c("Peer Groups", "Minimum", "Maximum", "Sample Size", "Missing Count","First Quartile", "Median", "Third Quartile", "Mean", "Standard Deviation", "IQR", "Correlation"), digits = 2) %>% kable_styling(latex_options = "HOLD_position") %>% column_spec(1, bold = TRUE) %>% column_spec(c(2,4,6,8,10,12), color = 'white', background = 'black')| Peer Groups | Minimum | Maximum | Sample Size | Missing Count | First Quartile | Median | Third Quartile | Mean | Standard Deviation | IQR | Correlation |
|---|---|---|---|---|---|---|---|---|---|---|---|
| che.di | 22.2 | 35.6 | 507 | 0 | 25.65 | 27.8 | 29.95 | 27.97 | 2.74 | 4.3 | 0.63 |
| hgt | 147.2 | 198.1 | 507 | 0 | 163.80 | 170.3 | 177.80 | 171.14 | 9.41 | 14.0 | 0.63 |
scatterplot() function from the car library is much more informative.hist(df$hgt, breaks = 20, probability = TRUE, xlab = 'Height(cm)', ylab = 'Frequency', main = "Histogram for person's Height in cm")
abline(v = mean(df$hgt), col="red", lwd=2, lty=2)
abline(v = median(df$hgt), col="orange", lwd=2, lty=2)
text(x=174, y=0.0505, labels= 'μ = 171.14', cex = 0.72)
text(x=167, y=0.048, labels= 'Median = 170.3', cex = 0.73)
lines(density(df$hgt), col = 'Blue', lwd=2)
curve(dnorm(x, mean=mean(df$hgt), sd=sd(df$hgt)), yaxt="n", lty="dotted", col="darkgreen", lwd=4, add=TRUE)
legend("topright", legend = c("Density Curve for Female Sample", "Normal Curve", 'Mean', 'Median'), bty = "n", text.col = "black", horiz = F, pch=c(15,15, 15, 15), col = c('Blue', "darkgreen", 'red', 'orange'))hist(df$che.di, breaks = 20, probability = TRUE, xlab = 'Chest Diameter(cm)', ylab = 'Frequency', main = "Histogram for person's Chest Diameter in cm")
abline(v = mean(df$che.di), col="red", lwd=2, lty=2)
abline(v = median(df$che.di), col="orange", lwd=2, lty=2)
text(x=28.7, y=0.16, labels= 'μ = 27.92', cex = 0.75)
text(x=27, y=0.17, labels= 'Median = 27.8', cex = 0.73)
lines(density(df$che.di), col = 'Blue', lwd=2)
curve(dnorm(x, mean=mean(df$che.di), sd=sd(df$che.di)), yaxt="n", lty="dotted", col="darkgreen", lwd=4, add=TRUE)
legend("topright", legend = c("Density Curve for Female Sample", "Normal Curve", 'Mean', 'Median'), bty = "n", text.col = "black", horiz = F, pch=c(15,15, 15, 15), col = c('Blue', "darkgreen", 'red', 'orange'))p1 <- ggqqplot(df$che.di, size = 0.5) + ggtitle("QQ Plot for the person's chest") + theme(plot.title = element_text(hjust = 0.5))
p2 <- ggqqplot(df$hgt, size = 0.5) + ggtitle("QQ Plot for the person's height") + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, nrow = 1)sum_x <- sum(df$hgt) # raw sum for x
sum_y <- sum(df$che.di) # raw sum for y
sum_x_sq <- sum(df$hgt^2) # raw sum of squares for x
sum_y_sq <- sum(df$che.di^2) # raw sum of squares for y
sum_xy <- sum(df$hgt*df$che.di) # raw sum of the cross product
n <- length(df$hgt) # Number of variables
Lxx <- sum_x_sq-((sum_x^2)/n) # squared deviation from the mean x
Lyy <- sum_y_sq-((sum_y^2)/n) # squared deviation from the mean y
Lxy = sum_xy - (((sum_x)*(sum_y))/n) # corrected sum of the cross products
b = Lxy/Lxx # slope
a = mean(df$che.di - b*mean(df$hgt)) # Interceptabline() function.plot(df$che.di ~ df$hgt, data = df, xlab = "Height", ylab = "Chest")
abline(a = a, b = b, col= "red")rcorr() function from the Hmisc library.## [1] 0.6268931
## che.di hgt
## che.di 1.00 0.63
## hgt 0.63 1.00
##
## n= 507
##
##
## P
## che.di hgt
## che.di 0
## hgt 0
## [1] 0.000000000000000000000000000000000000000000000000000000009623577
## [1] 0.5709813 0.6770164
##
## Call:
## lm(formula = che.di ~ hgt, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3102 -1.4326 -0.0696 1.4168 6.8929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2947 1.7319 -1.902 0.0577 .
## hgt 0.1827 0.0101 18.082 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.138 on 505 degrees of freedom
## Multiple R-squared: 0.393, Adjusted R-squared: 0.3918
## F-statistic: 327 on 1 and 505 DF, p-value: < 0.00000000000000022
## Estimate Std. Error t value
## (Intercept) -3.2946566 1.7318756 -1.902363
## hgt 0.1827027 0.0101042 18.081859
## Pr(>|t|)
## (Intercept) 0.05769227415322646101980552657551015727221965789794921875000000
## hgt 0.00000000000000000000000000000000000000000000000000000001017694
## 2.5 % 97.5 %
## (Intercept) -6.6972252 0.1079121
## hgt 0.1628512 0.2025541
[1] “Exploring Relationships in Body Dimensions”, Journal of Statistics Education, [Online]. Available: https://ww2.amstat.org/publications/jse/v11n2/datasets.heinz.html [Accessed: 24-May-2020].
[2] “Esting Two-Sided Hypotheses Concerning the Slope Coefficient” , Econometrics With R , [Online]. Available: https://www.econometrics-with-r.org/5-1-testing-two-sided-hypotheses-concerning-the-slope-coefficient.html [Accessed: 24-May-2020].