library(readxl)
library(dplyr)
library(performance)
library(see)
library(ggplot2)
library(kableExtra)
library(knitr)
library(here)
library(countrycode)

haw <- read_xlsx(here( "Height and weight.xlsx"))
data<-as.data.frame(haw)
names(data) <- make.names(names(data))

1 Exploratory Data Analysis

Exploratory Data Analysis (EDA) is conducted to gain an initial understanding of the dataset and to identify key patterns, trends, and potential anomalies in the variables of interest. In this study, EDA focuses on examining the distribution and relationship between height and weight, as well as how these variables vary across socio-demographic profiles such as gender and geographic location.

1.1 Distribution of Countries, grouped into their respective continents

To begin the analysis, the distribution of respondents by country is examined. However, given the large number of unique countries present in the dataset, analyzing country-level differences may not provide meaningful insights due to fragmentation.

To address this issue, countries are grouped into their respective continents. This transformation reduces complexity and allows for more interpretable comparisons across broader geographic regions, which will be used in further analyses.

Freq <- data %>%
  group_by(`Country.Team`) %>%
  summarise(frequency = n()) %>%
 mutate(percentage = round((frequency / sum(frequency)) * 100, 2))
data$Country.Team[data$Country.Team == "Team GB"] <- "United Kingdom"
data$Country.Team[data$Country.Team == "Team USA"] <- "United States"
data$Country.Team[data$Country.Team == "Australian Olympic Team"] <- "Australia"
data$Country.Team[data$Country.Team == "Russian Federation"] <- "Russia"
data$Country.Team[data$Country.Team == "Chinese Taipei"] <- "China"
data$Country.Team[data$Country.Team == "Independent Olympic Athletes"] <- "No Data"
data$Country.Team[data$Country.Team == "American Virgin Islands"] <- "United States"
data$Country.Team[data$Country.Team == "Micronesia"] <- "Micronesia (Federated States of)"
data$Country.Team[data$Country.Team == "São Tomé and Príncipe"] <- "Portugal"




data$Continent <- countrycode(data$Country.Team,
                             origin = "country.name",
                             destination = "continent")
Freq.con <- data %>%
  group_by(`Continent`) %>%
  summarise(frequency = n()) %>%
 mutate(percentage = round((frequency / sum(frequency)) * 100, 2))

kable(Freq.con, title="Count of respondents per continent")%>%
  kable_styling(full_width = TRUE)
Continent frequency percentage
Africa 95 5.99
Americas 194 12.22
Asia 215 13.55
Europe 1008 63.52
Oceania 75 4.73

As shown in the table, the respondents were successfully grouped into their respective continents, with the majority coming from Europe, followed by Asia and the Americas. This indicates that the dataset is heavily represented by European participants.

ggplot(Freq.con, aes(x = "", y = frequency, fill = Continent)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  geom_text(aes(label = frequency),
            position = position_stack(vjust = 0.5),
            color = "white", size = 3) +
  theme_void() +
  labs(title = "Frequency of Respondents by Continent")

1.2 Distribution of Gender

Freq.g <- data %>%
  group_by(Gender) %>%
  summarise(frequency = n()) %>%
 mutate(percentage = round((frequency / sum(frequency)) * 100, 2))

kable(Freq.g,title="Frequency count of Male and Female") %>%
  kable_styling(full_width = TRUE)
Gender frequency percentage
M 869 54.76
W 718 45.24

The table shows the frequency count and percentage of male and female in the dataset. It shows that there are more male respondents (54.76 %) than female (45.24%).

ggplot(Freq.g, aes(x = "", y = frequency, fill = Gender)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
   geom_text(aes(label = frequency),
            position = position_stack(vjust = 0.5),
            color = "white", size = 3)+
  theme_void() + labs(title="Frequency of respondents by Gender")

The main focus of these analyses is to gain insights into the height (cm) and weight (kg) of the respondents according to their respective demographic profiles.

1.3 Summary Statistics of Height in centimeters

1.3.1 Total distribution of Height

summary_height <- data.frame(
  Parameter = names(summary(data$Height.cm.)),
  Value = as.numeric(summary(data$Height.cm.))
)

kable(summary_height, caption = "Summary Statistics for Height") %>%
  kable_styling(full_width = TRUE)
Summary Statistics for Height
Parameter Value
Min. 136.0000
1st Qu. 170.0000
Median 177.0000
Mean 177.4512
3rd Qu. 185.0000
Max. 219.0000

The height of the respondents ranged from 136 cm to 219 cm, with a mean height of 177.45 cm and a median height of 177.00 cm. This indicates that the respondents generally fall within the average adult height range. This also can provide us an insight of the shape of distribution of our data.

  • If Mean < Median, the distribution is negatively skewed, the tail extends more to the left.

  • If Mean > Median, the distribution is positively skewed, the tail extends more to the right.

  • If Mean = Median, the distribution is approximately symmetric, and may follow a normal distribution.

The mean of the height distribution is greater than the median, this means that the tail extends more to the right. This means that height is more focused on the lower values.

ggplot(data, aes(x = Height.cm.)) +
  
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 5,
                 color = "black",
                 fill = "skyblue") +
  
  geom_density(color = "red",
               linewidth = 1.5) +
  
  geom_vline(aes(xintercept = mean(Height.cm., na.rm = TRUE)),
             color = "darkred",
             linewidth = 1.2,
             linetype = "dashed") +
  
  labs(title = "Distribution of Height",
       x = "Height (cm)",
       y = "Density") +
  
  theme_minimal()

The histogram and density curve suggest that the height data are approximately normally distributed.

1.3.2 Distribution of Height grouped to Gender

avg_height.gen <- data %>%
  group_by(Gender) %>%
  summarise( 
    Min = round(min(Height.cm.,na.rm = TRUE), 3),
    Max = round(max(Height.cm.,na.rm=TRUE), 3),
    Mean = round(mean(Height.cm., na.rm = TRUE),3),
    "Median" = round(median(Height.cm., na.rm=TRUE),3),
    "Standard Deviation" = round(sd(Height.cm., na.rm = TRUE),3),
    Count = n()
  )
kable(avg_height.gen, caption = "Summary Statistics for Height grouped according to their Genders") %>%
  kable_styling(full_width = FALSE)
Summary Statistics for Height grouped according to their Genders
Gender Min Max Mean Median Standard Deviation Count
M 140 219 182.992 183 10.278 869
W 136 194 170.745 170 8.469 718

The height of the respondents per gender are ranged from 140 cm to 219 cm for the male, and 136 cm to 194 cm for the female. The average height for the male is 182.992 cm which is taller than the average height for the female which is 170.745 cm. This supports the fact that on average, Male are taller than Female which is primarily attributed to sex hormones, and genetics that influences growth patterns. In terms of the distribution, for male, the mean < median, this means distributions extends more to the left, and for female, mean > median, this means that distributions extends more to the right. The range are not that far, and we are expected to see a symmetric distribution.

ggplot(data, aes(x = Height.cm., fill=Gender)) +
  
  geom_histogram(binwidth = 7, color="black") +
  facet_wrap(~Gender)+
  labs(title = "Height Distribution by Gender",
       x = "Height (cm)",
       y = "Frequency") +
  theme_minimal()

The height distributions for both males and females appear approximately normal. However, males generally exhibit higher height values than females.

ggplot(avg_height.gen, aes(x = reorder(Gender, Mean), y = Mean, fill = Gender)) +
  geom_col() +
  geom_errorbar(aes(ymin = Mean - `Standard Deviation`,
                    ymax = Mean + `Standard Deviation`),
                width = 0.5,
                linewidth = 0.5) +
  labs(title = "Average Height by Gender",
       x = "Continent",
       y = "Average Height (cm)") +
  theme_minimal()

1.3.3 Distribution of Height grouped to their Continents

avg_height.con <- data %>%
  group_by(Continent) %>%
  summarise(
     Min = round(min(Height.cm.,na.rm = TRUE), 3),
    Max = round(max(Height.cm.,na.rm=TRUE), 3),
    Mean = round(mean(Height.cm., na.rm = TRUE),3),
    Median = round(median(Height.cm., na.rm=TRUE),3),
    "Standard Deviation" = round(sd(Height.cm., na.rm = TRUE),3),
    Count = n()
  )

kable(avg_height.con, caption = "Summary Statistics for Height grouped according to their Continent") %>%
  kable_styling(full_width = FALSE)
Summary Statistics for Height grouped according to their Continent
Continent Min Max Mean Median Standard Deviation Count
Africa 153 205 174.926 173 10.177 95
Americas 150 206 175.531 175 11.420 194
Asia 136 219 173.814 174 11.254 215
Europe 149 212 178.847 178 10.977 1008
Oceania 140 212 177.280 180 13.040 75

The height of the respondents according to continent ranged from 153 cm to 205 cm for Africa, 150 cm to 206 cm for the Americas, 136 cm to 219 cm for Asia, 149 cm to 212 cm for Europe, and 140 cm to 212 cm for Oceania. Among the continents, Europe has the highest average height with 178.847 cm, followed by Oceania with 177.280 cm, the Americas with 175.531 cm, Africa with 174.926 cm, and Asia with the lowest average height of 173.814 cm. This suggests that respondents from Europe and Oceania tend to be taller on average compared to respondents from other continents.

In terms of the distribution, Africa, the Americas, and Asia have mean values that are slightly greater than their median values, indicating that the distributions extend slightly to the right. On the other hand, Europe has a mean value slightly greater than the median as well, suggesting a mildly right-skewed distribution, while Oceania has a mean value lower than the median, indicating that the distribution extends slightly to the left. However, the differences between the mean and median values across all continents are relatively small, suggesting that the distributions are approximately symmetric and close to normal.

ggplot(data, aes(x = Height.cm., fill = Continent)) +
  geom_histogram(binwidth = 5, color = "black") +
  facet_wrap(~Continent) +
  labs(title = "Height Distribution by Continent",
       x = "Height (cm)",
       y = "Frequency") +
  theme_minimal()

The histogram shows that the height distributions across all continents are approximately bell-shaped and symmetric, suggesting that the data are approximately normally distributed. Europe and Oceania generally exhibit higher height values compared to Africa, the Americas, and Asia. Although slight skewness is observed in some continent groups, no severe skewness or extreme outliers are present.

ggplot(avg_height.con, aes(x = reorder(Continent, Mean), y = Mean, fill = Continent)) +
  geom_col() +
  geom_errorbar(aes(ymin = Mean - `Standard Deviation`,
                    ymax = Mean + `Standard Deviation`),
                width = 0.5,
                linewidth = 0.5) +
  
  labs(title = "Average Height by Continent",
       x = "Continent",
       y = "Average Height (cm)") +
  theme_minimal()

1.4 Summary Statistics of Weight in kilograms

1.4.1 Total Distribution of Weight in kilograms

summary_weight <- data.frame(
  Parameter = names(summary(data$Weight.kg.)),
  Value = as.numeric(summary(data$Weight.kg.))
)

kable(summary_weight, caption = "Summary Statistics for Height") %>%
  kable_styling(full_width = TRUE)
Summary Statistics for Height
Parameter Value
Min. 30.00000
1st Qu. 61.00000
Median 70.00000
Mean 72.65343
3rd Qu. 81.00000
Max. 218.00000
ggplot(data, aes(x = Weight.kg.)) +
  geom_histogram(binwidth = 5, color = "black", fill = "skyblue") +
  labs(title = "Distribution of Weight",
       x = "Height (cm)",
       y = "Frequency") +
  theme_minimal()

1.4.2 Distribution of Weight grouped to Gender

avg_weight.gen <- data %>%
  group_by(Gender) %>%
  summarise(
    Mean = round(mean(Weight.kg., na.rm = TRUE),3),
    "Median" = round(median(Weight.kg., na.rm=TRUE),3),
    "Standard Deviation" = round(sd(Weight.kg., na.rm = TRUE),3),
    Count = n()
  )
kable(avg_weight.gen, caption = "Summary Statistics for Weight grouped according to their Genders") %>%
  kable_styling(full_width = FALSE)
Summary Statistics for Weight grouped according to their Genders
Gender Mean Median Standard Deviation Count
M 80.512 79 15.684 869
W 63.142 62 10.184 718
ggplot(data, aes(x = Weight.kg., fill = Gender)) +
  geom_histogram(position = "identity", binwidth = 5) +
  facet_wrap(~Gender)+
  labs(title = "Weight Distribution by Gender",
       x = "Height (cm)",
       y = "Frequency") +
  theme_minimal()

1.4.3 Distribution of Weight grouped according to their continent

avg_weight.con <- data %>%
  group_by(Continent) %>%
  summarise(
    Mean = round(mean(Weight.kg., na.rm = TRUE),3),
    Median = round(median(Weight.kg.,na.rm=TRUE),3),
    "Standard Deviation" = round(sd(Weight.kg., na.rm = TRUE),3),
    count = n()
  )

kable(avg_weight.con, caption = "Summary Statistics for Weight grouped according to their Continent") %>%
  kable_styling(full_width = FALSE)
Summary Statistics for Weight grouped according to their Continent
Continent Mean Median Standard Deviation count
Africa 69.979 68 15.427 95
Americas 71.108 70 13.755 194
Asia 69.628 66 16.908 215
Europe 73.673 71 15.555 1008
Oceania 75.013 73 22.726 75
ggplot(data, aes(x = Weight.kg., fill = Continent)) +
  geom_histogram(binwidth = 5, color = "black") +
  facet_wrap(~Continent) +
  labs(title = "Weight Distribution by Continent",
       x = "Weight (kg)",
       y = "Frequency") +
  theme_minimal()

ggplot(avg_weight.con, aes(x = reorder(Continent, Mean), y = Mean, fill = Continent)) +
  geom_col() +
  labs(title = "Average Weight by Continent",
       x = "Continent",
       y = "Average Weight(kg)") +
  theme_minimal()

1.4.4 Distribution of Gender Across Continents

Gen <- data %>%
  group_by(Continent) %>%
  count(Gender)
kable(Gen, title="Gender count per Continent")%>%
  kable_styling(full_width = TRUE)
Continent Gender n
Africa M 57
Africa W 38
Americas M 109
Americas W 85
Asia M 131
Asia W 84
Europe M 528
Europe W 480
Oceania M 44
Oceania W 31
ggplot(Gen, aes(x = Continent, y = n, fill = Gender)) +
  geom_col() +
  coord_flip() +
  labs(title = "Gender Distribution Across Continents",
       x = "Continent",
       y = "Count") +
     geom_text(aes(label = n),
            position = position_stack(vjust = 0.5),
            color = "white", size = 3) +
  theme_minimal()

2 Analysis of Variances

2.1 Independent T-test between the height and weight of male and female

2.1.1 Assumption 1: Identifying extreme outliers (Height)

library(rstatix)
kable(data %>%
  group_by(Gender) %>%
  identify_outliers(Height.cm.),title="Identifying Outliers") %>%
  kable_styling(full_width = TRUE)
Gender Second.name First.name Height.cm. Weight.kg. Country.Team Continent is.outlier is.extreme
M Andersen David 212 102 Australia Oceania TRUE FALSE
M Archibald Robert 212 115 United Kingdom Europe TRUE FALSE
M Bakhet Mohammed Abduh 155 50 Qatar Asia TRUE FALSE
M Boateng Eric 211 114 United Kingdom Europe TRUE FALSE
M Bohme Marcus 211 115 Germany Europe TRUE FALSE
M Clark Dan 210 109 United Kingdom Europe TRUE FALSE
M Eko Yuli Irawan 157 62 Indonesia Asia TRUE FALSE
M Freeland Joel 211 111 United Kingdom Europe TRUE FALSE
M Horton Jonathan 157 59 United States Americas TRUE FALSE
M Hristov Valentin 156 56 Azerbaijan Asia TRUE FALSE
M Jackson Ashley 155 71 United Kingdom Europe TRUE FALSE
M Kaun Sasha 211 114 Russia Europe TRUE FALSE
M Lapua Lapua Tuau 140 62 Tuvalu Oceania TRUE FALSE
M Nabarrete Zanetti Arthur 156 62 Brazil Americas TRUE FALSE
M Om Yun Chol 152 56 North Korea Asia TRUE FALSE
M Setiadi Jadi 152 56 Indonesia Asia TRUE FALSE
M Zhang Zhaoxu 219 110 China Asia TRUE FALSE
W Fagbenle Temi 193 79 United Kingdom Europe TRUE FALSE
W Houghton Frances 193 81 United Kingdom Europe TRUE FALSE
W Michel Ciara 194 70 United Kingdom Europe TRUE FALSE
W Miyake Hiromi 146 48 Japan Asia TRUE FALSE
W Rasic Milena 193 75 Serbia Europe TRUE FALSE
W Sliskovic Iva 193 82 Croatia Europe TRUE FALSE
W Stewart Azania 194 89 United Kingdom Europe TRUE FALSE
W Teramoto Asuka 136 30 Japan Asia TRUE FALSE
W Thornley Victoria 193 76 United Kingdom Europe TRUE FALSE
W Tsurumi Koko 141 34 Japan Asia TRUE FALSE

The table shows the outliers and extreme outliers in the data. It appears that our data have no extreme outliers.

2.1.2 Assumption 2: Homogeneity of Variance (Height)

library(car)
kable(leveneTest(Height.cm.~Gender, data=data),title="Levene's Test of Homogenience") %>%
  kable_styling(full_width = TRUE)
Df F value Pr(>F)
group 1 24.50489 8e-07
1585 NA NA

Since the p-value (0.0000008) is fairly less than 0.05 hence, there is an issue of homogeneity of variance in the male and female. The assumption of homogeneity is being violated. Even though the homogeneity is violated by the observation, we car resolve this issue by using Welch one way test which assumes that the variances among the groups are not equal.

2.1.3 Assumption 3: Checking Independence (Height)

dw.test.hgen <-durbinWatsonTest(aov(Height.cm.~Gender, data = data))
dw_df <- data.frame(
  Test = "Durbin Wtason Test for independence",
  Statistic = dw.test.hgen$dw,
  p_value = dw.test.hgen$p
)

kable(dw_df, digits = 3, caption = "Durbin-Watson Test") %>%
  kable_styling(full_width = TRUE)
Durbin-Watson Test
Test Statistic p_value
Durbin Wtason Test for independence 1.949 0.294

The result clearly indicates that we fail to reject the null hypothesis since the p-value (0.296) is far greater than 0.05. Therefore, we can conclude that the data have no issue with this assumption. The residuals are fairly independent of each other.

2.1.4 Assumption 4: Normality of Residuals (Height)

qqnorm(resid(lm(Height.cm. ~ Gender, data = data)))
qqline(resid(lm(Height.cm. ~ Gender, data = data)))

The normal Q-Q plot showed that the residuals are generally follow the reference line, although there were slight deviations at the tails, indicating the data are approximately normal. To confirm our assessment using the Q-Q plot, we will use Shapiro Wilk Normality test in the residuals. If the p-value is greater than 0.05, then the data is normally distributed, otherwise not normal.

library(ggpubr)
ggqqplot(data,x  = "Height.cm.", facet.by = "Gender")

From the output above, we can conclude that the data of the two groups are approximately normally distributed.

shapiro.test(resid(lm(Height.cm.~ Gender, data = data)))
  
    Shapiro-Wilk normality test
  
  data:  resid(lm(Height.cm. ~ Gender, data = data))
  W = 0.99605, p-value = 0.0003816

The Shapiro-Wilk normality test revealed that the residuals were not normally distributed, however, the Normal Q-Q plot shows an approximately normally dsitributed residuals. With the homogeniety of variance being violated, and data were approximately normal, indicating a slight violation, hence we we will use Welch’s Test.

2.2 Result (Height)

t.test(Height.cm.~Gender,data=data)
  
    Welch Two Sample t-test
  
  data:  Height.cm. by Gender
  t = 26.025, df = 1585, p-value < 2.2e-16
  alternative hypothesis: true difference in means between group M and group W is not equal to 0
  95 percent confidence interval:
   11.32381 13.16983
  sample estimates:
  mean in group M mean in group W 
         182.9919        170.7451

“An independent samples t-test was conducted to compare the Height between Male and Female. The result show that there was a significant difference in the scores for Male (Mean = 182.992) and Female (Mean = 170.745), t(1585) = 26.025, p < 0.05.

boxplot(data$Height.cm.~data$Gender,main ="Height in cm grouped according to Genders",xlab = "Gender",ylab="Height (cm)")

The boxplot clearly shows that Male and Female have different averages in height.

2.2.1 Assumption 1: Identifying extreme outliers (Height)

kable(data %>%
  group_by(Gender) %>%
  identify_outliers(Weight.kg.),title="Identifying outliers") %>%
  kable_styling(full_width = TRUE)
Gender Second.name First.name Height.cm. Weight.kg. Country.Team Continent is.outlier is.extreme
M Baroev Khasan 188 120 Russia Europe TRUE FALSE
M Blas Jr Ricardo 185 218 Guam Oceania TRUE TRUE
M Brooks Lance 198 123 United States Americas TRUE FALSE
M Buhari Abdul 192 125 United Kingdom Europe TRUE FALSE
M Cadee Erik 201 120 Netherlands Europe TRUE FALSE
M Chintoan Rares Daniel 189 120 Romania Europe TRUE FALSE
M Eltrabily Abdelrahman 190 120 Egypt Africa TRUE FALSE
M Hoshina Tomohiko 180 125 Philippines Asia TRUE FALSE
M Jargalsaikhan Chuluunbat 184 116 Mongolia Asia TRUE FALSE
M Macek Bostjan 174 118 Slovenia Europe TRUE FALSE
M Majewski Tomasz 204 142 Poland Europe TRUE FALSE
M Mandembo Kebika Cedric 180 120 DR Congo Africa TRUE FALSE
M Myerscough Carl 208 160 United Kingdom Europe TRUE TRUE
M Nikfar Amin 198 130 Iran Asia TRUE FALSE
M Okoye Lawrence 197 129 United Kingdom Europe TRUE FALSE
M Ota Kazuomi 183 145 Japan Asia TRUE TRUE
M Padar Martin 194 140 Estonia Europe TRUE FALSE
M Salman Ali Nadhim 190 120 Iraq Asia TRUE FALSE
M Sherrington Christopher 194 135 United Kingdom Europe TRUE FALSE
M Udachyn Artem 186 160 Ukraine Europe TRUE TRUE
M Yushkov Ivan 192 130 Russia Europe TRUE FALSE
M Zhang Jun 186 120 China Asia TRUE FALSE
W Avdeeva Anna 175 100 Russia Europe TRUE FALSE
W Bingson Amanda 165 91 United States Americas TRUE FALSE
W Bryant Karina 184 103 United Kingdom Europe TRUE FALSE
W Casanova Elisa 185 100 Italy Europe TRUE FALSE
W FOKINA-SEMENOVA Natalya 178 90 Ukraine Europe TRUE FALSE
W Gallardo Karen 175 95 Chile Americas TRUE FALSE
W Gong Lijiao 175 108 China Asia TRUE FALSE
W Horasan Yasemin 186 92 Turkey Asia TRUE FALSE
W Ivashchenko Elena 184 103 Russia Europe TRUE FALSE
W Kleinert Nadine 190 94 Germany Europe TRUE FALSE
W Marton Anita 172 90 Hungary Europe TRUE FALSE
W Moniz Adysangela 166 105 Cape Verde Africa TRUE FALSE
W Sendriute Zinaida 188 94 Lithuania Europe TRUE FALSE
W Shimamoto Mami 165 103 Japan Asia TRUE FALSE
W Stewart Azania 194 89 United Kingdom Europe TRUE FALSE
W Teramoto Asuka 136 30 Japan Asia TRUE FALSE
W Tsurumi Koko 141 34 Japan Asia TRUE FALSE
W Tunney Rebecca 149 35 United Kingdom Europe TRUE FALSE

The table shows the outliers and extreme outliers in the data, it appears that our data have extreme outliers, however, outliers identified in the dataset were retained because they represented plausible observations rather than data entry errors.

2.2.2 Assumption 2: Homogeneity of Variance (Weight)

kable(leveneTest(Weight.kg.~Gender, data=data),title="Levene's Test of Homogenience") %>%
  kable_styling(full_width = TRUE)
Df F value Pr(>F)
group 1 60.97902 0
1585 NA NA

Since the p-value is fairly less than 0.05 hence, there is an issue of homogeneity of variance in the weight of male and female. The assumption of homogeneity is being violated. Even though the homogeneity is violated by the observation, we car resolve this issue by using Welch one way test which assumes that the variances among the groups are not equal.

2.2.3 Assumption 3: Checking Independence (Weight)

dw.test.wgen <-durbinWatsonTest(aov(Weight.kg.~Gender, data = data))
dh_df <- data.frame(
  Test = "Durbin Wtason Test for independence",
  Statistic = dw.test.wgen$dw,
  p_value = dw.test.wgen$p
)

# Display as table
kable(dh_df, digits = 3, caption = "Durbin-Watson Test") %>%
  kable_styling(full_width = TRUE)
Durbin-Watson Test
Test Statistic p_value
Durbin Wtason Test for independence 1.988 0.856

The result clearly indicates that we fail to reject the null hypothesis since the p-value (0.812) is far greater than 0.05. Therefore, we can conclude that the data have no issue with this assumption. The residuals are fairly independent of each other.

2.2.4 Assumption 4: Normality of Residuals (Weight)

qqnorm(resid(lm(Weight.kg. ~ Gender, data = data)))
qqline(resid(lm(Weight.kg. ~ Gender, data = data)))

The normal Q-Q plot showed that the residuals are generally do not follow the reference line with deviations at the tails, indicating the data are not approximately normal. To confirm our assessment using the Q-Q plot, we will use Shapiro Wilk Normality test in the residuals. If the p-value is greater than 0.05, then the data is normally distributed, otherwise not normal.

ggqqplot(data,x  = "Weight.kg.", facet.by = "Gender")

From the output above, we can conclude that the data of the two groups are not normally distributed.

shapiro.test(resid(lm(Weight.kg.~ Gender, data = data)))
  
    Shapiro-Wilk normality test
  
  data:  resid(lm(Weight.kg. ~ Gender, data = data))
  W = 0.92117, p-value < 2.2e-16

The Shapiro–Wilk normality test revealed that the residuals were not normally distributed, confirming the earlier assessment from the Normal Q-Q plot. Furthermore, the assumption of homogeneity of variances was violated, and extreme outliers were identified in the dataset. Therefore, the Wilcoxon rank-sum test was employed as a nonparametric alternative to compare the groups.

2.3 Result (Weight)

wilcox.test(Weight.kg. ~ Gender, data = data)
  
    Wilcoxon rank sum test with continuity correction
  
  data:  Weight.kg. by Gender
  W = 528001, p-value < 2.2e-16
  alternative hypothesis: true location shift is not equal to 0

A Wilcoxon rank-sum test revealed a significant difference in weight between male and female respondents, W=528001, p<.05. Since Wilcoxon is nonparametric, medians are better to report than means.

kable(aggregate(Weight.kg. ~ Gender,
          data = data,
          median), title="Median weights of Male and Female") %>%
  kable_styling(full_width = TRUE)
Gender Weight.kg.
M 79
W 62

The results showed that the median height of male respondents was greater than that of female respondents.

2.3.1 Analysis of Variance between the height and weight across different continents

kable(data %>%
  group_by(Continent) %>%
  get_summary_stats(Height.cm., type = "mean_sd"), main = "Summary Statistics per continent in  Height") %>%
  kable_styling(full_width = TRUE)
Continent variable n mean sd
Africa Height.cm. 95 174.926 10.177
Americas Height.cm. 194 175.531 11.420
Asia Height.cm. 215 173.814 11.254
Europe Height.cm. 1008 178.847 10.977
Oceania Height.cm. 75 177.280 13.040
ggboxplot(data, x = "Continent", y = "Height.cm.")

2.3.2 Assumption 1: Identifying extreme outliers (Height)

kable(data %>%
  group_by(Continent) %>%
  identify_outliers(Height.cm.), title="Identifying extreme outliers") %>%
  kable_styling(full_width = TRUE)
Continent Second.name First.name Height.cm. Weight.kg. Country.Team Gender is.outlier is.extreme
Africa Ghayaza Mokhtar 205 106 Tunisia M TRUE FALSE
Africa Maoua Atef 200 100 Tunisia M TRUE FALSE
Asia Teramoto Asuka 136 30 Japan W TRUE FALSE
Asia Zhang Zhaoxu 219 110 China M TRUE FALSE
Europe Archibald Robert 212 115 United Kingdom M TRUE FALSE

The table shows the identified outliers and extreme outliers in the dataset.It shows that the data do not extreme outliers.

2.3.3 Assumption 2: Homogeneity of Variance (Height)

kable(leveneTest(Height.cm.~Continent, data=data),title="Levene's Test of Homogenience") %>%
  kable_styling(full_width = TRUE)
Df F value Pr(>F)
group 4 1.603232 0.1709401
1582 NA NA

The assumption of homogeneity of variances is not violated.

2.3.4 Assumption 3: Checking Independence (Height)

dw.test.hgen <-durbinWatsonTest(aov(Height.cm.~Continent, data = data))
dw_df <- data.frame(
  Test = "Durbin Wtason Test for independence",
  Statistic = dw.test.hgen$dw,
  p_value = dw.test.hgen$p
)

kable(dw_df, digits = 3, caption = "Durbin-Watson Test") %>%
  kable_styling(full_width = TRUE)
Durbin-Watson Test
Test Statistic p_value
Durbin Wtason Test for independence 1.96 0.406

The result clearly indicates that we fail to reject the null hypothesis since the p-value (0.296) is far greater than 0.05. Therefore, we can conclude that the data have no issue with this assumption. The residuals are fairly independent of each other.

2.3.5 Assumption 4: Normality of Residuals (Height)

model  <- lm(Height.cm. ~ Continent, data =data)

ggqqplot(residuals(model))

kable(shapiro_test(residuals(model)),title="Shapiro Wilk Normality Test") %>%
  kable_styling(full_width = TRUE)
variable statistic p.value
residuals(model) 0.9958123 0.0002234

In the QQ plot, as most of the points fall approximately along the reference line, we can assume an approximate normality. This conclusion is supported by the Shapiro-Wilk test. The p-value is not significant (p = 0.0002), the assumption of normality is violated.

kab <- data %>%
  group_by(Continent) %>%
  filter(n() >= 3 & n() <= 5000) %>%
  shapiro_test(Height.cm.)

kable(kab, caption = "Shapiro-Wilk Normality Test per Continent") %>%
  kable_styling(full_width = TRUE)
Shapiro-Wilk Normality Test per Continent
Continent variable statistic p
Africa Height.cm. 0.9700153 0.0280158
Americas Height.cm. 0.9909473 0.2645250
Asia Height.cm. 0.9872017 0.0505911
Europe Height.cm. 0.9932799 0.0001619
Oceania Height.cm. 0.9795728 0.2667922
ggqqplot(data, "Height.cm.", facet.by = "Continent")

Most of the points fall approximately along the reference line, for each cell. But the Shapiro Wilk normality test says otherwise for Africa, and Europe. So the normality of the data is violated. Since the residuals are not normally distributed, we will use Kruskal-Wallis rank sum test as a non-parametric alternative to One-Way ANOVA.

2.4 Result (Height)

kruskal.test(Height.cm. ~ Continent, data = data)
  
    Kruskal-Wallis rank sum test
  
  data:  Height.cm. by Continent
  Kruskal-Wallis chi-squared = 44.49, df = 4, p-value = 5.075e-09

A Kruskal-Wallis test showed that there was a statistically significant difference in Height in cm among the five continents \(\chi^2(4)=44.49,p=5.075e-09\).Hence, Dunn’s post hoc test with Bonferroni adjustment was conducted to determine which specific continent pairs differed significantly.

kable(dunn_test(data,
          Height.cm. ~ Continent,
          p.adjust.method = "bonferroni"),title="Dunn's Test - identifies which specific continent pairs differ.") %>%
  kable_styling(full_width = TRUE)
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
Height.cm. Africa Americas 95 194 0.8348311 0.4038128 1.0000000 ns
Height.cm. Africa Asia 95 215 -0.3522088 0.7246817 1.0000000 ns
Height.cm. Africa Europe 95 1008 3.5046461 0.0004572 0.0045721 **
Height.cm. Africa Oceania 95 75 1.6425663 0.1004727 1.0000000 ns
Height.cm. Americas Asia 194 215 -1.4938928 0.1352036 1.0000000 ns
Height.cm. Americas Europe 194 1008 3.4641314 0.0005319 0.0053195 **
Height.cm. Americas Oceania 194 75 1.0971466 0.2725773 1.0000000 ns
Height.cm. Asia Europe 215 1008 5.5845952 0.0000000 0.0000002 ****
Height.cm. Asia Oceania 215 75 2.2154902 0.0267264 0.2672644 ns
Height.cm. Europe Oceania 1008 75 -1.0227486 0.3064267 1.0000000 ns
dunn_res <- data %>%
  dunn_test(
    Height.cm. ~ Continent,
    p.adjust.method = "bonferroni"
  ) %>%
  add_y_position()

ggboxplot(data,
          x = "Continent",
          y = "Height.cm.",
          color = "Continent") +
  stat_pvalue_manual(
    dunn_res,
    label = "p.adj.signif",
    hide.ns = TRUE
  )

Dunn’s post hoc test with Bonferroni adjustment was conducted to identify which continent pairs differed significantly in height. The results revealed significant differences between several continent pairs, as indicated by the significance annotations in the boxplot. Specifically, Europe showed significantly higher height distributions compared to Africa and the Americas. These findings suggest that height varies significantly across certain continents.

kable(data %>%
  group_by(Continent) %>%
  get_summary_stats(Weight.kg., type = "mean_sd"), main = "Summary Statistics per continent in  Weight") %>%
  kable_styling(full_width = TRUE)
Continent variable n mean sd
Africa Weight.kg. 95 69.979 15.427
Americas Weight.kg. 194 71.108 13.755
Asia Weight.kg. 215 69.628 16.908
Europe Weight.kg. 1008 73.673 15.555
Oceania Weight.kg. 75 75.013 22.726
ggboxplot(data, x = "Continent", y = "Weight.kg.")

2.4.0.1 Assumption 1: Identifying extreme outliers (Weight)
kable(data %>%
  group_by(Continent) %>%
  identify_outliers(Weight.kg.), title="Identifying extreme outliers") %>%
  kable_styling(full_width = TRUE)
Continent Second.name First.name Height.cm. Weight.kg. Country.Team Gender is.outlier is.extreme
Africa Eltrabily Abdelrahman 190 120 Egypt M TRUE FALSE
Africa Ghayaza Mokhtar 205 106 Tunisia M TRUE FALSE
Africa Mandembo Kebika Cedric 180 120 DR Congo M TRUE FALSE
Americas Brooks Lance 198 123 United States M TRUE FALSE
Asia Gong Lijiao 175 108 China W TRUE FALSE
Asia Hoshina Tomohiko 180 125 Philippines M TRUE FALSE
Asia Jargalsaikhan Chuluunbat 184 116 Mongolia M TRUE FALSE
Asia Nikfar Amin 198 130 Iran M TRUE FALSE
Asia Ota Kazuomi 183 145 Japan M TRUE TRUE
Asia Salman Ali Nadhim 190 120 Iraq M TRUE FALSE
Asia Zhang Jun 186 120 China M TRUE FALSE
Asia Zhang Zhaoxu 219 110 China M TRUE FALSE
Europe Archibald Robert 212 115 United Kingdom M TRUE FALSE
Europe Baroev Khasan 188 120 Russia M TRUE FALSE
Europe Boateng Eric 211 114 United Kingdom M TRUE FALSE
Europe Bohme Marcus 211 115 Germany M TRUE FALSE
Europe Buhari Abdul 192 125 United Kingdom M TRUE FALSE
Europe Buric Damir 205 115 Croatia M TRUE FALSE
Europe Cadee Erik 201 120 Netherlands M TRUE FALSE
Europe Chintoan Rares Daniel 189 120 Romania M TRUE FALSE
Europe Chioveanu Mihnea 198 115 Romania M TRUE FALSE
Europe Kaun Sasha 211 114 Russia M TRUE FALSE
Europe Macek Bostjan 174 118 Slovenia M TRUE FALSE
Europe Majewski Tomasz 204 142 Poland M TRUE FALSE
Europe Myerscough Carl 208 160 United Kingdom M TRUE TRUE
Europe Okoye Lawrence 197 129 United Kingdom M TRUE FALSE
Europe Padar Martin 194 140 Estonia M TRUE FALSE
Europe Sherrington Christopher 194 135 United Kingdom M TRUE FALSE
Europe Smith Alexander 183 115 United Kingdom M TRUE FALSE
Europe Udachyn Artem 186 160 Ukraine M TRUE TRUE
Europe Yushkov Ivan 192 130 Russia M TRUE FALSE
Oceania Blas Jr Ricardo 185 218 Guam M TRUE TRUE

The table shows the identified outliers and extreme outliers in the dataset. Although several observations were classified as extreme outliers, these values were retained in the analysis because they represented plausible observations rather than data entry errors or measurement inaccuracies. Moreover, the number of extreme outliers constituted only a small proportion.

2.4.1 Assumption 2: Homogeneity of Variance (Weight)

kable(leveneTest(Weight.kg.~Continent, data=data),title="Levene's Test of Homogenience") %>%
  kable_styling(full_width = TRUE)
Df F value Pr(>F)
group 4 1.462206 0.2112841
1582 NA NA

The assumption of homogeneity of variances is not violated.

2.4.2 Assumption 3: Checking Independence (Height)

dw.test.hgen <-durbinWatsonTest(aov(Weight.kg.~Continent, data = data))
dw_df <- data.frame(
  Test = "Durbin Wtason Test for independence",
  Statistic = dw.test.hgen$dw,
  p_value = dw.test.hgen$p
)

kable(dw_df, digits = 3, caption = "Durbin-Watson Test") %>%
  kable_styling(full_width = TRUE)
Durbin-Watson Test
Test Statistic p_value
Durbin Wtason Test for independence 1.987 0.858

The result clearly indicates that we fail to reject the null hypothesis since the p-value (0.708) is far greater than 0.05. Therefore, we can conclude that the data have no issue with this assumption. The residuals are fairly independent of each other.

2.4.3 Assumption 4: Normality of Residuals (Height)

model  <- lm(Weight.kg. ~ Continent, data =data)

ggqqplot(residuals(model))

kable(shapiro_test(residuals(model)),title="Shapiro Wilk Normality Test") %>%
  kable_styling(full_width = TRUE)
variable statistic p.value
residuals(model) 0.9307511 0

In the QQ plot, as most of the points fall approximately along the reference line, we can assume an approximate normality. This conclusion is supported by the Shapiro-Wilk test. The p-value is not significant (p = 0.0002), the assumption of normality is violated.

data %>%
  group_by(Continent) %>%
  summarise(n = sum(!is.na(Height.cm.)))
  # A tibble: 5 × 2
    Continent     n
    <chr>     <int>
  1 Africa       95
  2 Americas    194
  3 Asia        215
  4 Europe     1008
  5 Oceania      75
kab <- data %>%
  group_by(Continent) %>%
  filter(n() >= 3 & n() <= 5000) %>%
  shapiro_test(Height.cm.)

kable(kab, caption = "Shapiro-Wilk Normality Test per Continent") %>%
  kable_styling(full_width = TRUE)
Shapiro-Wilk Normality Test per Continent
Continent variable statistic p
Africa Height.cm. 0.9700153 0.0280158
Americas Height.cm. 0.9909473 0.2645250
Asia Height.cm. 0.9872017 0.0505911
Europe Height.cm. 0.9932799 0.0001619
Oceania Height.cm. 0.9795728 0.2667922
ggqqplot(data, "Height.cm.", facet.by = "Continent")

Most of the points fall approximately along the reference line, for each cell. But the Shapiro Wilk normality test says otherwise for Africa, and Europe. So the normality of the data is violated. With the extreme outliers being present, and residuals are not normally distributed, we will use Kruskal-Wallis rank sum test as a non-parametric alternative to One-Way ANOVA.

2.5 Result (Height)

kruskal.test(Height.cm. ~ Continent, data = data)
  
    Kruskal-Wallis rank sum test
  
  data:  Height.cm. by Continent
  Kruskal-Wallis chi-squared = 44.49, df = 4, p-value = 5.075e-09

A Kruskal-Wallis test showed that there was a statistically significant difference in Height in cm among the five continents \(\chi^2(4)=44.49,p=5.075e-09\).Hence, Dunn’s post hoc test with Bonferroni adjustment was conducted to determine which specific continent pairs differed significantly.

kable(dunn_test(data,
          Height.cm. ~ Continent,
          p.adjust.method = "bonferroni"),title="Dunn's Test - identifies which specific continent pairs differ.") %>%
  kable_styling(full_width = TRUE)
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
Height.cm. Africa Americas 95 194 0.8348311 0.4038128 1.0000000 ns
Height.cm. Africa Asia 95 215 -0.3522088 0.7246817 1.0000000 ns
Height.cm. Africa Europe 95 1008 3.5046461 0.0004572 0.0045721 **
Height.cm. Africa Oceania 95 75 1.6425663 0.1004727 1.0000000 ns
Height.cm. Americas Asia 194 215 -1.4938928 0.1352036 1.0000000 ns
Height.cm. Americas Europe 194 1008 3.4641314 0.0005319 0.0053195 **
Height.cm. Americas Oceania 194 75 1.0971466 0.2725773 1.0000000 ns
Height.cm. Asia Europe 215 1008 5.5845952 0.0000000 0.0000002 ****
Height.cm. Asia Oceania 215 75 2.2154902 0.0267264 0.2672644 ns
Height.cm. Europe Oceania 1008 75 -1.0227486 0.3064267 1.0000000 ns
dunn_res <- data %>%
  dunn_test(
    Height.cm. ~ Continent,
    p.adjust.method = "bonferroni"
  ) %>%
  add_y_position()


ggboxplot(data,
          x = "Continent",
          y = "Height.cm.",
          color = "Continent") +
  stat_pvalue_manual(
    dunn_res,
    label = "p.adj.signif",
    hide.ns = TRUE
  )

Dunn’s post hoc test with Bonferroni adjustment was conducted to identify which continent pairs differed significantly in height. The results revealed significant differences between several continent pairs, as indicated by the significance annotations in the boxplot. Specifically, Europe showed significantly higher height distributions compared to Africa and the Americas. These findings suggest that height varies significantly across certain continents.

3 Correlation analysis of Height and Weight

Before using the Pearson r Correlation Coefficient, we need to check the assumptions first for Pearson r Correlation Coefficient. It is a parametric test that identifies the relationship between the two variables. If at least one of the assumptions are violated, then we need to proceed with a non-parametric test.

Key Assumptions:

  • No outliers: The data should not contain any extreme outliers that could disproportionately influence the results.
  • Linear relationship: The relationship between the two variables should be approximately linear, which can be checked with a scatterplot.

  • Normality: Both variables should be approximately normally distributed. This is a key assumption for the statistical significance tests, and if violated, a non-parametric test like Spearman’s rho may be more appropriate.

3.1 Assessing Linear relationship

ggplot(data=data, aes(x=Height.cm.,y=Weight.kg.)) +
  geom_point(color = "black",alpha=0.5,size=1) +
  labs(title= "Scatterplot between Height and Weight", x="Height", y= "Weight") + 
  geom_smooth(method = "lm", se=FALSE, color="red") +
  theme_lucid()

The result shows a linear relationship between the Height in centimeters and Weight in kilograms of the respondents in the data. We can also observe points far from the main cluster, which tells us a potential extreme outlier. To assess outliers, the Mahalanobis distance was computed using height and weight variables. Observations with Mahalanobis distance values exceeding the chi-square critical value at a specified significance level were identified as potential outliers.

3.2 Assessing the extreme outliers

md <- mahalanobis(
  data[, c("Height.cm.", "Weight.kg.")],
  colMeans(data[, c("Height.cm.", "Weight.kg.")]),
  cov(data[, c("Height.cm.", "Weight.kg.")])
)


cutoff <- qchisq(0.999, df = 2)


outliers <- which(md > cutoff)

outliers
   [1]  142  185  500  601  777  845  856  862  948  988 1010 1037 1051 1061 1279
  [16] 1282 1439 1569 1578 1579
haw[outliers, ]
  # A tibble: 20 × 6
     `Second name`   `First name` `Height(cm)` `Weight(kg)` `Country/Team`  Gender
     <chr>           <chr>               <dbl>        <dbl> <chr>           <chr> 
   1 Blas Jr         Ricardo               185          218 Guam            M     
   2 Buhari          Abdul                 192          125 Team GB         M     
   3 Gong            Lijiao                175          108 China           W     
   4 Hoshina         Tomohiko              180          125 Philippines     M     
   5 Lapua Lapua     Tuau                  140           62 Tuvalu          M     
   6 Macek           Bostjan               174          118 Slovenia        M     
   7 Majewski        Tomasz                204          142 Poland          M     
   8 Mandembo Kebika Cedric                180          120 DR Congo        M     
   9 Moniz           Adysangela            166          105 Cape Verde      W     
  10 Myerscough      Carl                  208          160 Team GB         M     
  11 Nikfar          Amin                  198          130 Iran            M     
  12 Okoye           Lawrence              197          129 Team GB         M     
  13 Ota             Kazuomi               183          145 Japan           M     
  14 Padar           Martin                194          140 Estonia         M     
  15 Sherrington     Christopher           194          135 Team GB         M     
  16 Shimamoto       Mami                  165          103 Japan           W     
  17 Udachyn         Artem                 186          160 Ukraine         M     
  18 Yushkov         Ivan                  192          130 Russian Federa… M     
  19 Zhang           Jun                   186          120 China           M     
  20 Zhang           Zhaoxu                219          110 China           M
data$outlier <- md > cutoff

ggplot(data, aes(x = Height.cm., y = Weight.kg., color = outlier)) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  theme_classic()

A scatterplot of height and weight was examined to assess linearity and detect potential outliers. The plot indicated a positive linear relationship between the variables.

Multivariate outliers were identified using Mahalanobis distance, with observations exceeding the chi-square critical value flagged as extreme cases. As shown in the plot, several observations were identified as outliers, appearing distant from the main cluster of data points.

Despite the presence of these outliers, the overall linear pattern between height and weight remained evident.

3.3 Assessing the normality of variables

normality.height <- shapiro.test(data$Height.cm.)
normality.weight <- shapiro.test(data$Weight.kg.)
summary_normality.height <- data.frame(
  Parameter = c("Method","Statistic","p-value"),
  Height= c(normality.height$method,normality.height$statistic,normality.height$p.value),
  Weight= c(normality.weight$method,normality.weight$statistic,normality.weight$p.value)
)
kable(summary_normality.height,title="Shapiro Wilk Normality test of Height and Weight")%>%
  kable_styling(full_width = TRUE)
Parameter Height Weight
Method Shapiro-Wilk normality test Shapiro-Wilk normality test
Statistic 0.996004518548028 0.93153775256766
p-value 0.000346469434004617 2.34748362486449e-26

The Shapiro-Wilk normality test revealed that both Height and Weight significantly deviated from a normal distribution. For Height, the test yielded a statistic of W=0.9960 with a p-value of 0.00035, while Weight obtained W=0.9315 with a p-value less than 0.001. Since the p-values for both variables were less than the 0.05 level of significance. This indicates that the distributions of Height and Weight are not normally distributed.

3.4 Correlation Analysis

The table in (Anesthesia & Analgesia 126(5):p 1763-1768, May 2018. | DOI: 10.1213/ANE.0000000000002864) was used to determine the strength of relationship.

Correlation Coefficient Interpretation
0.00-0.10 Negligible correlation
0.10-0.39 Weak correlation
0.40-0.69 Moderate correlation
0.70-0.89 Strong correlation
0.90-1.00 Very strong correlation
correlation <- cor.test(data$Height.cm.,data$Weight.kg., method="spearman")


cor_table <- data.frame(
  Parameter = c( "Method","Correlation (r)", "p-value"),
  Value = c( correlation$method ,correlation$estimate, correlation$p.value)
)

kable(cor_table,title ="Spearman Rank Correlation Coefficient between Height in cm and Weight in cm") %>%
  kable_styling(full_width = TRUE)
Parameter Value
Method Spearman’s rank correlation rho
Correlation (r) 0.829417829182253
p-value 0
qplot(Height.cm., Weight.kg.,data=data) + 
  geom_smooth(color="red") +
  labs(title= "Scatterplot between Height and Weight", x="Height", y= "Weight") +
  theme_minimal()

data$rank_Height <- rank(data$Height.cm.)
data$rank_Weight <- rank(data$Weight.kg.)

# Scatter plot for ranks 
ggplot(data, aes(x = rank_Weight, y = rank_Height)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(title = "Scatter Plot of Ranked Variables",
       x = "Rank of Height of the respondents",
       y = "Rank of Weight of the respondents") +
  theme_minimal()

The Spearman rank correlation analysis revealed a statistically significant strong positive relationship between Height and Weight (rho =0.8295, p<0.001). This indicates that as height increases, weight also tends to increase. Based on the magnitude of the correlation coefficient, the relationship between the two variables can be interpreted as strong.

Spearman’s rank correlation coefficient was used to determine the relationship between Height and Weight because the variables were found to be non-normally distributed based on the Shapiro-Wilk normality test. Unlike Pearson correlation, Spearman correlation is a non-parametric statistical method that does not require the assumption of normality. The procedure involves converting the raw values of Height and Weight into ranks and then measuring the degree to which the ranked variables move together in a monotonic manner. After ranking the observations, the differences between the ranks of each paired observation were calculated and used to compute the Spearman correlation coefficient. The resulting coefficient indicates both the strength and direction of the relationship between the two variables.

4 Regression Analysis

In this analysis, regression analysis was used to determine whether Height significantly predicts Weight. In the present study, Height served as the independent variable, while Weight served as the dependent variable. Height was treated as the predictor variable because variations in height may help explain differences in body weight. On the other hand, Weight cannot directly determine the height of a person, since individuals with higher body weight may still be either short or tall. Therefore, the regression model was structured to examine the extent to which Height predicts Weight.

Prior to conducting the regression analysis, the assumptions of simple linear regression were assessed. These included normality of residuals, absence of influential outliers, independence of errors, homoscedasticity, and linearity. Diagnostic tests and graphical assessments were conducted using the performance package in RStudio.

library(performance)
model_reg<-lm((Weight.kg.) ~ (Height.cm.), data=data)

check_normality(model_reg)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_reg)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_reg)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.752).
check_heteroscedasticity(model_reg)
  Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).

Because violations in normality and homoscedasticity were observed, several data transformations were explored in an attempt to improve the distribution of the residuals and stabilize the variance of the errors. These transformations included logarithmic, Box-Cox, and Yeo-Johnson transformations. The transformed models were subsequently reassessed to determine whether the regression assumptions improved and whether the transformed model provided a better fit to the data.

4.1 Log Transformation

model_log<-lm(log(Weight.kg.) ~ (Height.cm.), data=data)

check_normality(model_log)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_log)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_log)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.598).
check_heteroscedasticity(model_log)
  OK: Error variance appears to be homoscedastic (p = 0.409).

After applying the logarithmic transformation, the assumptions of independence of residuals, absence of influential outliers, and homoscedasticity were satisfied. However, the normality of residuals assumption remained violated (p<0.001). Despite this, the transformation improved the model by stabilizing the variance of the residuals and reducing heteroscedasticity, making the regression model more acceptable for further analysis. Nevertheless, another transformation will be explored to determine whether the normality of residuals can still be improved further.

4.2 Square root transformation

model_sqrt<-lm(sqrt(Weight.kg.) ~ (Height.cm.), data=data)

check_normality(model_sqrt)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_sqrt)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_sqrt)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.720).
check_heteroscedasticity(model_sqrt)
  Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).

4.3 Inverse Transformation

model_inv<-lm(1/(Weight.kg.) ~ (Height.cm.), data=data)

check_normality(model_inv)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_inv)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_inv)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.758).
check_heteroscedasticity(model_inv)
  Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).

After applying the square root and inverse transformation, the assumptions of independence of residuals and absence of influential outliers were satisfied. However, the assumptions of normality of residuals (p<0.001) and homoscedasticity (p<0.001) remained violated. Compared to the logarithmic transformation, the square root and inverse transformation did not sufficiently improve the regression assumptions. Therefore, another transformation will be explored to further improve the residual distribution and stabilize the variance of the errors.

4.4 Box Cox Transformation

library(MASS)

model_bc <- lm(Weight.kg. ~ Height.cm., data=data)

bc_final<- boxcox(model_bc, lambda = seq(-2, 2, 0.1))

lambda_opt <- bc_final$x[which.max(bc_final$y)]

data$weight_bc <-
  if (lambda_opt == 0) {log(data$Weight.kg.)
    } else {(data$Weight.kg.^lambda_opt - 1) / lambda_opt
      }

The Box-Cox transformation plot suggests that the optimal lambda (λ) value is approximately close to 0, indicating that a logarithmic transformation may be the most appropriate transformation for the data. This is because a Box-Cox lambda near zero commonly corresponds to the natural logarithm transformation.

model_bc<-lm((weight_bc) ~ (Height.cm.), data=data)

check_normality(model_bc)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_bc)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_bc)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.706).
check_heteroscedasticity(model_bc)
  Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).

4.5 Yeo-Johnson transformation

library(bestNormalize)

yj <- yeojohnson(data$Weight.kg.)


data$Weight_YJ <- predict(yj)


model_yj <- lm(Weight_YJ ~ Height.cm., data = data)

check_normality(model_yj)
  Warning: Non-normality of residuals detected (p < .001).
check_outliers(model_yj)
  OK: No outliers detected.
  - Based on the following method and threshold: cook (0.7).
  - For variable: (Whole model)
check_autocorrelation(model_yj)
  OK: Residuals appear to be independent and not autocorrelated (p = 0.678).
check_heteroscedasticity(model_yj)
  Warning: Heteroscedasticity (non-constant error variance) detected (p < .001).

The Yeo-Johnson transformation did not substantially improve the regression assumptions. Although the assumptions of independence of residuals and absence of influential outliers were satisfied, the assumptions of normality of residuals and homoscedasticity remained violated.

Since the assumptions of normality and homoscedasticity remained violated even after several transformations, HC3 robust standard errors were employed. This method adjusts the standard errors to account for heteroscedasticity and mild violations of normality, allowing for more reliable statistical inference.

4.6 Regression Analysis with HC3 Robust Standard Errors

library(lmtest)
library(sandwich)

model <- lm(log(Weight.kg.) ~ Height.cm., data = data)

hc3.model <- coeftest(model, vcov = vcovHC(model, type = "HC3"))
hc3.model
  
  t test of coefficients:
  
                Estimate Std. Error t value  Pr(>|t|)    
  (Intercept) 1.61838489 0.05081462  31.849 < 2.2e-16 ***
  Height.cm.  0.01490579 0.00028533  52.240 < 2.2e-16 ***
  ---
  Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data, aes(x = Height.cm., y = log(Weight.kg.))) +
  geom_point() +
  geom_smooth(method = "lm",
              se = TRUE,
              color = "red") +
  labs(
    title = "Scatterplot between Height and Log-Transformed Weight",
    x = "Height (cm)",
    y = "Log Weight (kg)"
  ) +
  theme_minimal()

The regression analysis using HC3 robust standard errors revealed that Height significantly predicts the logarithm of Weight (β=0.0149, t=52.240, p<0.001). This indicates that taller individuals tend to have greater body weight. Since the p-value was less than 0.05, this indicates that Height is a significant predictor of Weight. The scatterplot indicates a positive linear relationship between Height and the logarithm of Weight. As Height increases, the log-transformed Weight also tends to increase. The points generally follow the upward trend of the regression line, suggesting that Height is a significant predictor of Weight.