# Load libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.2
# Load the athlete workbook into `ais`.
# The location can be overridden with the AIS_DATA_PATH environment variable;
# when it is unset (or empty) the original hard-coded path is used.
ais_path <- Sys.getenv("AIS_DATA_PATH", unset = "")
if (!nzchar(ais_path)) {
  ais_path <- "C:/Users/Dell/Downloads/aic.xlsx"
}
# Fail early with a readable message instead of a low-level reader error.
if (!file.exists(ais_path)) {
  stop("Data workbook not found: ", ais_path, call. = FALSE)
}
ais <- read_excel(ais_path)
head(ais)
summary(ais)
## rownames rcc wcc hc
## Min. : 1.00 Min. :3.800 Min. : 3.300 Min. :35.90
## 1st Qu.: 51.25 1st Qu.:4.372 1st Qu.: 5.900 1st Qu.:40.60
## Median :101.50 Median :4.755 Median : 6.850 Median :43.50
## Mean :101.50 Mean :4.719 Mean : 7.109 Mean :43.09
## 3rd Qu.:151.75 3rd Qu.:5.030 3rd Qu.: 8.275 3rd Qu.:45.58
## Max. :202.00 Max. :6.720 Max. :14.300 Max. :59.70
## hg ferr bmi ssf
## Min. :11.60 Min. : 8.00 Min. :16.75 Min. : 28.00
## 1st Qu.:13.50 1st Qu.: 41.25 1st Qu.:21.08 1st Qu.: 43.85
## Median :14.70 Median : 65.50 Median :22.72 Median : 58.60
## Mean :14.57 Mean : 76.88 Mean :22.96 Mean : 69.02
## 3rd Qu.:15.57 3rd Qu.: 97.00 3rd Qu.:24.46 3rd Qu.: 90.35
## Max. :19.20 Max. :234.00 Max. :34.42 Max. :200.80
## pcBfat lbm ht wt
## Min. : 5.630 Min. : 34.36 Min. :148.9 Min. : 37.80
## 1st Qu.: 8.545 1st Qu.: 54.67 1st Qu.:174.0 1st Qu.: 66.53
## Median :11.650 Median : 63.03 Median :179.7 Median : 74.40
## Mean :13.507 Mean : 64.87 Mean :180.1 Mean : 75.01
## 3rd Qu.:18.080 3rd Qu.: 74.75 3rd Qu.:186.2 3rd Qu.: 84.12
## Max. :35.520 Max. :106.00 Max. :209.4 Max. :123.20
## sex sport
## Length:202 Length:202
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Central tendency of hemoglobin concentration (hg): mean, median, and mode.
# NA values are dropped for the mean and median; table() skips NA by default,
# so the mode is also based on observed values only.
mean_hg <- mean(ais[["hg"]], na.rm = TRUE)
median_hg <- median(ais[["hg"]], na.rm = TRUE)
# Mode: tabulate the values, order the counts from largest to smallest, and
# convert the label of the first entry back to a number.
hg_frequencies <- sort(table(ais[["hg"]]), decreasing = TRUE)
mode_hg <- as.numeric(names(hg_frequencies)[1])
cat("Mean:", mean_hg, "\n")
## Mean: 14.56634
cat("Median:", median_hg, "\n")
## Median: 14.7
cat("Mode:", mode_hg, "\n")
## Mode: 15.9
# Spread of hemoglobin concentration: standard deviation and variance,
# computed with missing values removed.
std_dev_hg <- sd(ais[["hg"]], na.rm = TRUE)
variance_hg <- var(ais[["hg"]], na.rm = TRUE)
cat("Standard Deviation:", std_dev_hg, "\n")
## Standard Deviation: 1.362451
cat("Variance:", variance_hg, "\n")
## Variance: 1.856274
# Shape of the distribution: skewness and kurtosis, using the e1071
# implementations loaded at the top of the script.
skewness_hg <- e1071::skewness(ais[["hg"]], na.rm = TRUE)
kurtosis_hg <- e1071::kurtosis(ais[["hg"]], na.rm = TRUE)
cat("Skewness:", skewness_hg, "\n")
## Skewness: 0.1746303
cat("Kurtosis:", kurtosis_hg, "\n")
## Kurtosis: -0.02164488
# Correlation between hemoglobin and BMI. cor() uses the Pearson method by
# default; "complete.obs" drops rows where either variable is missing.
correlation_hg_bmi <- with(ais, cor(hg, bmi, use = "complete.obs"))
cat("Correlation between Hemoglobin and BMI:", correlation_hg_bmi, "\n")
## Correlation between Hemoglobin and BMI: 0.3825241
# Rank-based (Spearman) correlation for the same pair of variables.
correlation_hg_bmi_spearman <- with(
  ais,
  cor(hg, bmi, use = "complete.obs", method = "spearman")
)
cat("Spearman correlation between Hemoglobin and BMI:", correlation_hg_bmi_spearman, "\n")
## Spearman correlation between Hemoglobin and BMI: 0.3703866
# Simple linear regression with hemoglobin (hg) as the response and BMI as
# the single predictor; the fitted object is reused for predictions below.
model <- lm(formula = hg ~ bmi, data = ais)
summary(model)
##
## Call:
## lm(formula = hg ~ bmi, data = ais)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0038 -0.9061 0.0997 0.8650 4.2963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.38889 0.71899 14.449 < 2e-16 ***
## bmi 0.18198 0.03108 5.855 1.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.262 on 200 degrees of freedom
## Multiple R-squared: 0.1463, Adjusted R-squared: 0.1421
## F-statistic: 34.28 on 1 and 200 DF, p-value: 1.931e-08
# Predicted hemoglobin from the simple regression at BMI = 25, 30, and 35.
# The column name must match the predictor name used in the model formula.
new_bmi <- data.frame(bmi = seq(from = 25, to = 35, by = 5))
predictions <- predict(object = model, newdata = new_bmi)
cat("Predictions for new BMI values:", predictions, "\n")
## Predictions for new BMI values: 14.93832 15.8482 16.75809
# Multiple linear regression: hemoglobin on BMI, weight (wt), and height (ht).
# NOTE(review): BMI is presumably derived from wt and ht, so these predictors
# may be strongly collinear — confirm before interpreting the coefficients.
model_multi <- lm(formula = hg ~ bmi + wt + ht, data = ais)
summary(model_multi)
##
## Call:
## lm(formula = hg ~ bmi + wt + ht, data = ais)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6778 -0.6638 0.0369 0.7280 4.6356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.18848 10.58077 1.719 0.0872 .
## bmi -0.15513 0.23680 -0.655 0.5132
## wt 0.09282 0.07341 1.264 0.2075
## ht -0.03900 0.05948 -0.656 0.5128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.221 on 198 degrees of freedom
## Multiple R-squared: 0.209, Adjusted R-squared: 0.197
## F-statistic: 17.44 on 3 and 198 DF, p-value: 4.357e-10
# Predicted hemoglobin from the multiple regression for two new observations;
# each column supplies one predictor and each row is one observation.
new_data <- data.frame(
  bmi = c(25, 30),
  wt = c(70, 75),
  ht = c(170, 180)
)
predictions_multi <- predict(object = model_multi, newdata = new_data)
cat("Predictions for new data:", predictions_multi, "\n")
## Predictions for new data: 14.17841 13.47692
# One-way ANOVA: does mean hemoglobin concentration differ between sports?
# `sport` is the single grouping factor; the fit is reused for Tukey's HSD.
anova_result <- aov(formula = hg ~ sport, data = ais)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## sport 9 131.7 14.628 11.63 1.62e-14 ***
## Residuals 192 241.5 1.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey's HSD: pairwise comparisons of mean hemoglobin between sports, based
# on the one-way ANOVA fit.
tukey_result <- TukeyHSD(anova_result)
# A TukeyHSD object has print and plot methods but no summary method, so
# summary() only reports the underlying list's length/class/mode. Print the
# object to show the table of differences, intervals, and adjusted p-values.
print(tukey_result)
# Chi-square test of association between sport and sex.
# With the default asymptotic test R warns that the chi-squared approximation
# may be incorrect for this table (small expected cell counts), so the p-value
# is obtained by Monte Carlo simulation instead. The seed makes the simulated
# p-value reproducible between runs.
sport_by_sex <- table(ais$sport, ais$sex)
set.seed(123)
chi_square_result <- chisq.test(
  sport_by_sex,
  simulate.p.value = TRUE,
  B = 10000
)
cat("Chi-Square Test p-value:", chi_square_result$p.value, "\n")
# Principal component analysis of five measurements. Each variable is centred
# and scaled to unit variance before the decomposition.
pca_columns <- c("hg", "bmi", "wt", "ht", "pcBfat")
pca <- prcomp(ais[, pca_columns], center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.6289 1.1846 0.8254 0.5084 0.06167
## Proportion of Variance 0.5306 0.2807 0.1362 0.0517 0.00076
## Cumulative Proportion 0.5306 0.8113 0.9475 0.9992 1.00000
# Plot the first two components, colouring each observation by sex.
autoplot(pca, data = ais, colour = "sex")

# K-means clustering on hemoglobin, BMI, and body fat percentage (3 clusters).
# The three variables are measured on different scales, so they are
# standardised first; otherwise the Euclidean distance is dominated by the
# variable with the largest spread. This also matches the scaled PCA above.
# nstart = 25 reruns the algorithm from several random starts and keeps the
# best solution, making the result less sensitive to one initialisation.
set.seed(123) # For reproducibility
cluster_features <- scale(ais[, c("hg", "bmi", "pcBfat")])
kmeans_result <- kmeans(cluster_features, centers = 3, nstart = 25)
# Store the assignment as a factor so ggplot treats it as a discrete colour.
ais$cluster <- as.factor(kmeans_result$cluster)
# Plot clusters on the original (unscaled) BMI and hemoglobin axes.
ggplot(ais, aes(x = bmi, y = hg, color = cluster)) +
  geom_point() +
  labs(title = "K-means Clustering of Hemoglobin, BMI, and Body Fat Percentage") +
  theme_minimal()

# Matrix multiplication and inverse
# matrix() fills column by column, so matrix_a is [1 3; 2 4] and matrix_b is
# [5 7; 6 8].
matrix_a <- matrix(c(1, 2, 3, 4), nrow = 2)
matrix_b <- matrix(c(5, 6, 7, 8), nrow = 2)
product <- matrix_a %*% matrix_b
inverse <- solve(matrix_a)
# cat() flattens a matrix into a column-major vector, which hides its
# row/column layout; print() shows the matrices with their shape intact.
cat("Matrix Product:\n")
print(product)
cat("Inverse of Matrix A:\n")
print(inverse)
# Per-sport summary of hemoglobin: mean, median, and standard deviation,
# one row per sport, each computed with missing values removed.
summary_by_sport <- summarise(
  group_by(ais, sport),
  mean_hg = mean(hg, na.rm = TRUE),
  median_hg = median(hg, na.rm = TRUE),
  std_dev_hg = sd(hg, na.rm = TRUE)
)
print(summary_by_sport)
## # A tibble: 10 × 4
## sport mean_hg median_hg std_dev_hg
## <chr> <dbl> <dbl> <dbl>
## 1 B_Ball 14.1 14 1.35
## 2 Field 15.5 15.5 1.01
## 3 Gym 13.6 13.7 0.860
## 4 Netball 12.8 12.7 0.567
## 5 Row 14.6 14.7 0.985
## 6 Swim 14.7 15.1 1.15
## 7 T_400m 14.7 14.8 1.17
## 8 T_Sprnt 15.7 15.2 1.58
## 9 Tennis 14.3 14.3 1.59
## 10 W_Polo 15.5 15.6 0.718
# Box-Cox transformation for the multiple-regression formula.
# boxcox() returns the lambda grid in `x` and the matching profile
# log-likelihood values in `y`.
boxcox_result <- boxcox(hg ~ bmi + wt + ht, data = ais)

# The reported lambda is the grid value with the highest log-likelihood.
cat("Optimal lambda for Box-Cox transformation:", with(boxcox_result, x[which.max(y)]), "\n")
## Optimal lambda for Box-Cox transformation: 0.3434343
# Log-transformed hemoglobin concentration, stored as a new column.
# The +1 offset guards against log(0).
# NOTE(review): the summary output near the top of this script shows
# hg >= 11.6, so the offset may be unnecessary — confirm whether plain
# log(hg) was intended.
ais[["log_hg"]] <- log(1 + ais[["hg"]])
summary(ais[["log_hg"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.534 2.674 2.754 2.741 2.808 3.006
# Pairwise correlation matrix for six measurements, using only rows with no
# missing value in any of them. Reused by the heatmaps further down.
cor_columns <- c("hg", "bmi", "wt", "ht", "pcBfat", "ssf")
cor_matrix <- cor(ais[, cor_columns], use = "complete.obs")
# Base-graphics boxplot of hemoglobin concentration, one box per sport.
# (Title typo "Hemoglobinii" corrected.)
boxplot(
  hg ~ sport,
  data = ais,
  main = "Hemoglobin Concentration by sport type",
  xlab = "Sport type",
  ylab = "Hemoglobin Concentration (g/dL)",
  col = "lightblue",
  border = "darkblue"
)

# Body fat percentage by sport, with a separately coloured box per sex.
ggplot(data = ais, mapping = aes(x = sport, y = pcBfat, colour = sex)) +
  geom_boxplot() +
  labs(
    title = "Body Fat Percentage by Gender and Sport Type",
    x = "Sport Type",
    y = "Body Fat Percentage"
  ) +
  # Rotate the sport labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation heatmap of selected variables (upper triangle, colour tiles).
# corrplot() draws with zero plot margins by default, which clips the `main`
# title; `mar` reserves space at the top so the title is fully visible.
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.cex = 0.8,
  main = "Correlation Heatmap of Hemoglobin and Selected Variables",
  mar = c(0, 0, 2, 0)
)

# Scatterplot matrix (GGally::ggpairs) of hemoglobin and four body measures.
pair_plot_columns <- c("hg", "bmi", "wt", "ht", "pcBfat")
ggpairs(
  ais[, pair_plot_columns],
  title = "Pair Plot of Hemoglobin and Selected Variables"
)

# Distribution of hemoglobin concentration as a histogram (0.5 g/dL bins).
ggplot(data = ais, mapping = aes(x = hg)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "skyblue",
    colour = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Hemoglobin Concentration",
    x = "Hemoglobin Concentration (g/dL)",
    y = "Frequency"
  ) +
  theme_minimal()

# The same distribution as a smoothed kernel density estimate.
ggplot(data = ais, mapping = aes(x = hg)) +
  geom_density(fill = "lightgreen", alpha = 0.5) +
  labs(
    title = "Density Plot of Hemoglobin Concentration",
    x = "Hemoglobin Concentration (g/dL)",
    y = "Density"
  ) +
  theme_minimal()

# BMI against hemoglobin concentration, with a fitted straight line
# (method = "lm") and its confidence band drawn over the points.
ggplot(data = ais, mapping = aes(x = bmi, y = hg)) +
  geom_point(colour = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", colour = "red") +
  labs(
    title = "BMI vs Hemoglobin Concentration",
    x = "Body Mass Index (BMI)",
    y = "Hemoglobin Concentration (g/dL)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Mean hemoglobin concentration per sport as bars; the "summary" stat applies
# mean() to the hg values within each sport.
ggplot(data = ais, mapping = aes(x = sport, y = hg, fill = sport)) +
  geom_bar(
    stat = "summary",
    fun = "mean",
    colour = "black",
    position = "dodge"
  ) +
  labs(
    title = "Average Hemoglobin Concentration by Sport Type",
    x = "Sport Type",
    y = "Average Hemoglobin Concentration (g/dL)"
  ) +
  theme_minimal()

# Full distribution of hemoglobin concentration per sport as violins.
ggplot(data = ais, mapping = aes(x = sport, y = hg, fill = sport)) +
  geom_violin() +
  labs(
    title = "Violin Plot of Hemoglobin Concentration by Sport Type",
    x = "Sport Type",
    y = "Hemoglobin Concentration (g/dL)"
  ) +
  theme_minimal()

# Body fat percentage by sport, split into one panel (column) per sex.
ggplot(data = ais, mapping = aes(x = sport, y = pcBfat, colour = sex)) +
  geom_boxplot() +
  labs(
    title = "Body Fat Percentage by Gender and Sport Type",
    x = "Sport Type",
    y = "Body Fat Percentage"
  ) +
  facet_grid(cols = vars(sex)) +
  # Rotate the sport labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatterplot matrix of hemoglobin and four body measures.
# This repeats the pair plot drawn earlier in the script with the same
# columns and title.
ggpairs(
  data = ais[, c("hg", "bmi", "wt", "ht", "pcBfat")],
  title = "Pair Plot of Hemoglobin and Selected Variables"
)

# Correlation heatmap built with ggplot2.
# melt() reshapes the square matrix into long form: one row per pair of
# variables (Var1, Var2) with the correlation in `value`.
cor_matrix_melted <- melt(cor_matrix)
ggplot(
  data = cor_matrix_melted,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  # Diverging palette centred on zero: red for negative, blue for positive.
  scale_fill_gradient2(
    low = "red",
    mid = "white",
    high = "blue",
    midpoint = 0
  ) +
  theme_minimal() +
  labs(title = "Correlation Heatmap", x = "Variables", y = "Variables")

# Bubble plot: BMI against hemoglobin concentration, with point size mapped
# to body weight (wt).
ggplot(data = ais, mapping = aes(x = bmi, y = hg, size = wt)) +
  geom_point(colour = "purple", alpha = 0.6) +
  labs(
    title = "Bubble Plot of Hemoglobin vs BMI (Weight as Size)",
    x = "BMI",
    y = "Hemoglobin Concentration (g/dL)"
  ) +
  theme_minimal()

# Second ggplot2 correlation heatmap, this time with the palette reversed
# (blue for low values, red for high values).
cor_matrix_melted <- melt(cor_matrix)
ggplot(
  data = cor_matrix_melted,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  labs(title = "Correlation Heatmap", x = "Variables", y = "Variables")

# Hemoglobin concentration by sport and sex, using theme_light() plus a
# rotated x axis as an example of theme customisation.
ggplot(data = ais, mapping = aes(x = sport, y = hg, colour = sex)) +
  geom_boxplot() +
  labs(
    title = "Hemoglobin Concentration by Sport Type and Gender",
    x = "Sport Type",
    y = "Hemoglobin Concentration (g/dL)"
  ) +
  theme_light() +
  # Applied after theme_light() so the rotation is not reset by it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
