MATH1324 Assignment 3

Test to see if there is any statistical significant relationship between a person’s chest diameter (che.di) and height (hgt)

Rashbir Singh Kohli (s3810585)

24/05/2020

Introduction

Problem Statement

Data

df <- fread('bdims.csv')
df <- df[,c('che.di','hgt')]

Descriptive Statistics and Visualisation

GatherDf <- df %>% gather(che.di, hgt, key = 'Parameter', value = 'value')
knitr::kable(GatherDf %>% group_by(GatherDf$Parameter) %>% summarise(Min = min(value,na.rm = TRUE),
                                        Max = max(value, na.rm = TRUE),
                                        n = n(),
                                        Missing = sum(is.na(value)),
                                        Q1 = quantile(value ,probs = .25,na.rm = TRUE),
                                        Median = median(value, na.rm = TRUE),
                                        Q3 = quantile(value, probs = .75,na.rm = TRUE),
                                        Mean = mean(value, na.rm = TRUE),
                                        SD = sd(value, na.rm = TRUE),
                                        IQR = IQR(value ,na.rm = TRUE),
                                        Corr = cor(df$che.di, df$hgt)), "html", caption = "Table 1: Descriptive Statistics", align = "llllllllll", col.names = c("Peer Groups", "Minimum", "Maximum", "Sample Size", "Missing Count","First Quartile", "Median", "Third Quartile", "Mean", "Standard Deviation", "IQR", "Correlation"), digits = 2) %>% kable_styling(latex_options = "HOLD_position") %>% column_spec(1, bold = TRUE) %>% column_spec(c(2,4,6,8,10,12), color = 'white', background = 'black')
Table 1: Descriptive Statistics
Peer Groups Minimum Maximum Sample Size Missing Count First Quartile Median Third Quartile Mean Standard Deviation IQR Correlation
che.di 22.2 35.6 507 0 25.65 27.8 29.95 27.97 2.74 4.3 0.63
hgt 147.2 198.1 507 0 163.80 170.3 177.80 171.14 9.41 14.0 0.63

Descriptive Statistics and Visualisation (Cont.)

ggplot(data = GatherDf, aes(x=Parameter, y=value)) + geom_boxplot(aes(fill=Parameter)) + facet_wrap( ~ Parameter, scales="free") + scale_y_continuous(name = 'Size in cm\n') + ggtitle("Side by side Boxplot for person's Height and Chest Size in cm\n (on different scales)\n") + theme(plot.title = element_text(family="Tahoma", hjust = 0.5), axis.title = element_text(size = 14)); ggplot(GatherDf, aes(x=Parameter, y=value)) + geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1.5 ,fill='#4271AE', color="#1F3552") + theme_economist() + theme(plot.title = element_text(family="Tahoma", hjust = 0.5), text = element_text(family="Tahoma"), axis.title = element_text(size = 12)) + scale_x_discrete(name = "\nParameter")+ ggtitle("Boxplot for person's Height and Chest Size in cm\n") + scale_y_continuous(name = 'Size in cm\n') 

Descriptive Statistics and Visualisation (Cont.)

scatterplot(x = df$hgt, y = df$che.di, xlab = "Height(cm)", ylab = "Chest Diameter(cm)", main = "Scatter plot for relation between person's Chest and Height in cm\n")

Descriptive Statistics and Visualisation (Cont.)

hist(df$hgt, breaks = 20, probability = TRUE, xlab = 'Height(cm)', ylab = 'Frequency', main = "Histogram for person's Height  in cm")
abline(v = mean(df$hgt), col="red", lwd=2, lty=2)
abline(v = median(df$hgt), col="orange", lwd=2, lty=2)
text(x=174, y=0.0505, labels= 'μ = 171.14', cex = 0.72)
text(x=167, y=0.048, labels= 'Median = 170.3', cex = 0.73)
lines(density(df$hgt), col = 'Blue', lwd=2)
curve(dnorm(x, mean=mean(df$hgt), sd=sd(df$hgt)), yaxt="n", lty="dotted", col="darkgreen", lwd=4, add=TRUE)
legend("topright", legend = c("Density Curve for Female Sample", "Normal Curve", 'Mean', 'Median'), bty = "n", text.col = "black", horiz = F, pch=c(15,15, 15, 15), col = c('Blue', "darkgreen", 'red', 'orange'))

Descriptive Statistics and Visualisation (Cont.)

hist(df$che.di, breaks = 20, probability = TRUE, xlab = 'Chest Diameter(cm)', ylab = 'Frequency', main = "Histogram for person's Chest Diameter in cm")
abline(v = mean(df$che.di), col="red", lwd=2, lty=2)
abline(v = median(df$che.di), col="orange", lwd=2, lty=2)
text(x=28.7, y=0.16, labels= 'μ = 27.92', cex = 0.75)
text(x=27, y=0.17, labels= 'Median = 27.8', cex = 0.73)
lines(density(df$che.di), col = 'Blue', lwd=2)
curve(dnorm(x, mean=mean(df$che.di), sd=sd(df$che.di)), yaxt="n", lty="dotted", col="darkgreen", lwd=4, add=TRUE)
legend("topright", legend = c("Density Curve for Female Sample", "Normal Curve", 'Mean', 'Median'), bty = "n", text.col = "black", horiz = F, pch=c(15,15, 15, 15), col = c('Blue', "darkgreen", 'red', 'orange'))

Descriptive Statistics and Visualisation (Cont.)

p1 <- ggqqplot(df$che.di, size = 0.5) + ggtitle("QQ Plot for the person's chest") + theme(plot.title = element_text(hjust = 0.5))
p2 <- ggqqplot(df$hgt, size = 0.5) +  ggtitle("QQ Plot for the person's height") + theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, nrow = 1)

Hypothesis Testing

sum_x <- sum(df$hgt) # raw sum for x
sum_y <- sum(df$che.di) # raw sum for y
sum_x_sq <- sum(df$hgt^2) # raw sum of squares for x
sum_y_sq <- sum(df$che.di^2) # raw sum of squares for y
sum_xy <- sum(df$hgt*df$che.di) # raw sum of the cross product
n <- length(df$hgt) # Number of variables
Lxx <- sum_x_sq-((sum_x^2)/n) # squared deviation from the mean x
Lyy <- sum_y_sq-((sum_y^2)/n) # squared deviation from the mean y
Lxy = sum_xy - (((sum_x)*(sum_y))/n) # corrected sum of the cross products
b = Lxy/Lxx # slope
a = mean(df$che.di - b*mean(df$hgt)) # Intercept

Hypothesis Testing (Cont.)

plot(df$che.di ~ df$hgt, data = df, xlab = "Height", ylab = "Chest")
abline(a = a, b = b, col= "red")

Hypothesis Testing (Cont.)

print(cor(df$che.di, df$hgt))
## [1] 0.6268931
rcorr(as.matrix(dplyr::select(df, che.di, hgt)), type = "pearson")
##        che.di  hgt
## che.di   1.00 0.63
## hgt      0.63 1.00
## 
## n= 507 
## 
## 
## P
##        che.di hgt
## che.di         0 
## hgt     0
r = 0.627
n = 507

Hypothesis Testing (Cont.)

t = r*(sqrt((n-2)/(1-(r)^2)))
2*pt(q = t,df = n - 2,lower.tail=FALSE)
## [1] 0.000000000000000000000000000000000000000000000000000000009623577
CIr(r = cor(df$che.di, df$hgt), n = n, level = 0.95)
## [1] 0.5709813 0.6770164

Hypothesis Testing (Cont.)

Hypothesis Testing (Cont.)

LinearModel <- lm(che.di ~ hgt, data = df) 
LinearModel %>% summary()
## 
## Call:
## lm(formula = che.di ~ hgt, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3102 -1.4326 -0.0696  1.4168  6.8929 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  -3.2947     1.7319  -1.902              0.0577 .  
## hgt           0.1827     0.0101  18.082 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.138 on 505 degrees of freedom
## Multiple R-squared:  0.393,  Adjusted R-squared:  0.3918 
## F-statistic:   327 on 1 and 505 DF,  p-value: < 0.00000000000000022

Hypothesis Testing (Cont.)

LinearModel %>% summary() %>% coef()
##               Estimate Std. Error   t value
## (Intercept) -3.2946566  1.7318756 -1.902363
## hgt          0.1827027  0.0101042 18.081859
##                                                                     Pr(>|t|)
## (Intercept) 0.05769227415322646101980552657551015727221965789794921875000000
## hgt         0.00000000000000000000000000000000000000000000000000000001017694
LinearModel %>% confint()
##                  2.5 %    97.5 %
## (Intercept) -6.6972252 0.1079121
## hgt          0.1628512 0.2025541

Discussion

plot(LinearModel)

References

[1] “Exploring Relationships in Body Dimensions”, Journal of Statistics Education, [Online]. Available: https://ww2.amstat.org/publications/jse/v11n2/datasets.heinz.html [Accessed: 24-May-2020].

[2] “Esting Two-Sided Hypotheses Concerning the Slope Coefficient” , Econometrics With R , [Online]. Available: https://www.econometrics-with-r.org/5-1-testing-two-sided-hypotheses-concerning-the-slope-coefficient.html [Accessed: 24-May-2020].