# Load libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.4.2
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.2
# Load the data set into `ais`.
# The workbook location can be overridden with the AIS_XLSX_PATH environment
# variable so the script is not tied to one machine; when the variable is
# unset or empty, the original path is used.
ais_path <- Sys.getenv("AIS_XLSX_PATH", unset = NA)
if (is.na(ais_path) || !nzchar(ais_path)) {
  ais_path <- "C:/Users/Dell/Downloads/aic.xlsx"
}
# Stop early with an explicit message when the workbook is missing.
if (!file.exists(ais_path)) {
  stop("Input workbook not found: ", ais_path, call. = FALSE)
}
ais <- read_excel(ais_path)
head(ais)
summary(ais)
##     rownames           rcc             wcc               hc       
##  Min.   :  1.00   Min.   :3.800   Min.   : 3.300   Min.   :35.90  
##  1st Qu.: 51.25   1st Qu.:4.372   1st Qu.: 5.900   1st Qu.:40.60  
##  Median :101.50   Median :4.755   Median : 6.850   Median :43.50  
##  Mean   :101.50   Mean   :4.719   Mean   : 7.109   Mean   :43.09  
##  3rd Qu.:151.75   3rd Qu.:5.030   3rd Qu.: 8.275   3rd Qu.:45.58  
##  Max.   :202.00   Max.   :6.720   Max.   :14.300   Max.   :59.70  
##        hg             ferr             bmi             ssf        
##  Min.   :11.60   Min.   :  8.00   Min.   :16.75   Min.   : 28.00  
##  1st Qu.:13.50   1st Qu.: 41.25   1st Qu.:21.08   1st Qu.: 43.85  
##  Median :14.70   Median : 65.50   Median :22.72   Median : 58.60  
##  Mean   :14.57   Mean   : 76.88   Mean   :22.96   Mean   : 69.02  
##  3rd Qu.:15.57   3rd Qu.: 97.00   3rd Qu.:24.46   3rd Qu.: 90.35  
##  Max.   :19.20   Max.   :234.00   Max.   :34.42   Max.   :200.80  
##      pcBfat            lbm               ht              wt        
##  Min.   : 5.630   Min.   : 34.36   Min.   :148.9   Min.   : 37.80  
##  1st Qu.: 8.545   1st Qu.: 54.67   1st Qu.:174.0   1st Qu.: 66.53  
##  Median :11.650   Median : 63.03   Median :179.7   Median : 74.40  
##  Mean   :13.507   Mean   : 64.87   Mean   :180.1   Mean   : 75.01  
##  3rd Qu.:18.080   3rd Qu.: 74.75   3rd Qu.:186.2   3rd Qu.: 84.12  
##  Max.   :35.520   Max.   :106.00   Max.   :209.4   Max.   :123.20  
##      sex               sport          
##  Length:202         Length:202        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
# Central tendency of hemoglobin concentration (hg): mean, median and mode.
mean_hg <- mean(ais[["hg"]], na.rm = TRUE)
median_hg <- median(ais[["hg"]], na.rm = TRUE)
# Mode = most frequent observed value. table() stores the values as
# character names, so the winning name is converted back to numeric.
mode_hg <- as.numeric(names(which.max(table(ais[["hg"]]))))

cat("Mean:", mean_hg, "\n")
## Mean: 14.56634
cat("Median:", median_hg, "\n")
## Mode: 15.9
# Spread of hemoglobin concentration: sample variance and standard deviation.
variance_hg <- var(ais[["hg"]], na.rm = TRUE)
std_dev_hg <- sd(ais[["hg"]], na.rm = TRUE)

cat("Standard Deviation:", std_dev_hg, "\n")
## Standard Deviation: 1.362451
cat("Variance:", variance_hg, "\n")
## Variance: 1.856274
# Shape of the hemoglobin distribution: skewness and kurtosis (both from the
# e1071 package), ignoring missing values.
skewness_hg <- e1071::skewness(ais[["hg"]], na.rm = TRUE)
kurtosis_hg <- e1071::kurtosis(ais[["hg"]], na.rm = TRUE)

cat("Skewness:", skewness_hg, "\n")
## Skewness: 0.1746303
cat("Kurtosis:", kurtosis_hg, "\n")
## Kurtosis: -0.02164488
# Pearson (linear) correlation between hemoglobin and BMI, computed on the
# complete observations only.
correlation_hg_bmi <- cor(
  x = ais[["hg"]],
  y = ais[["bmi"]],
  use = "complete.obs",
  method = "pearson"
)
cat("Correlation between Hemoglobin and BMI:", correlation_hg_bmi, "\n")
## Correlation between Hemoglobin and BMI: 0.3825241
# Spearman (rank-based) correlation for the same pair of variables
correlation_hg_bmi_spearman <- cor(
  x = ais[["hg"]],
  y = ais[["bmi"]],
  use = "complete.obs",
  method = "spearman"
)
cat("Spearman correlation between Hemoglobin and BMI:", correlation_hg_bmi_spearman, "\n")
## Spearman correlation between Hemoglobin and BMI: 0.3703866
# Simple linear regression: hemoglobin (hg) as a function of BMI.
# The fitted model is kept in `model` and its coefficient table is printed.
model <- lm(formula = hg ~ bmi, data = ais)
summary(model)
## 
## Call:
## lm(formula = hg ~ bmi, data = ais)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0038 -0.9061  0.0997  0.8650  4.2963 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.38889    0.71899  14.449  < 2e-16 ***
## bmi          0.18198    0.03108   5.855 1.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.262 on 200 degrees of freedom
## Multiple R-squared:  0.1463, Adjusted R-squared:  0.1421 
## F-statistic: 34.28 on 1 and 200 DF,  p-value: 1.931e-08
# Predicted hemoglobin from the simple model at BMI values 25, 30 and 35.
new_bmi <- data.frame(bmi = seq(from = 25, to = 35, by = 5))
predictions <- predict(object = model, newdata = new_bmi)
cat("Predictions for new BMI values:", predictions, "\n")
## Predictions for new BMI values: 14.93832 15.8482 16.75809
# Multiple linear regression: hemoglobin (hg) on BMI, weight (wt) and
# height (ht).
# NOTE(review): BMI is presumably derived from wt and ht, so these predictors
# are likely collinear -- confirm before interpreting single coefficients.
model_multi <- lm(formula = hg ~ bmi + wt + ht, data = ais)
summary(model_multi)
## 
## Call:
## lm(formula = hg ~ bmi + wt + ht, data = ais)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6778 -0.6638  0.0369  0.7280  4.6356 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 18.18848   10.58077   1.719   0.0872 .
## bmi         -0.15513    0.23680  -0.655   0.5132  
## wt           0.09282    0.07341   1.264   0.2075  
## ht          -0.03900    0.05948  -0.656   0.5128  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.221 on 198 degrees of freedom
## Multiple R-squared:  0.209,  Adjusted R-squared:  0.197 
## F-statistic: 17.44 on 3 and 198 DF,  p-value: 4.357e-10
# Predicted hemoglobin from the multiple regression for two new observations
# (one row per observation: BMI, weight, height).
new_data <- data.frame(
  bmi = c(25, 30),
  wt = c(70, 75),
  ht = c(170, 180)
)
predictions_multi <- predict(object = model_multi, newdata = new_data)
cat("Predictions for new data:", predictions_multi, "\n")
## Predictions for new data: 14.17841 13.47692
# One-way ANOVA: does mean hemoglobin concentration differ between sports?
# The fitted object is stored because the Tukey HSD step below reuses it.
anova_result <- aov(formula = hg ~ sport, data = ais)
summary(anova_result)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## sport         9  131.7  14.628   11.63 1.62e-14 ***
## Residuals   192  241.5   1.258                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey's HSD test for pairwise comparison of the sport groups.
# summary() on a TukeyHSD object only lists the object's structure
# (Length / Class / Mode); print() shows the pairwise mean differences,
# their confidence bounds and the adjusted p-values.
tukey_result <- TukeyHSD(anova_result)
print(tukey_result)
# Chi-square test of association between sport and sex, run on their
# contingency table.
# NOTE(review): the recorded warning below suggests some expected cell counts
# are small; simulate.p.value = TRUE or fisher.test() may be more appropriate
# -- confirm.
chi_square_result <- chisq.test(table(ais$sport, ais$sex))
## Warning in chisq.test(table(ais$sport, ais$sex)): Chi-squared approximation may
## be incorrect
cat("Chi-Square Test p-value:", chi_square_result[["p.value"]], "\n")
## Chi-Square Test p-value: 2.716978e-08
# Principal component analysis on five numeric measurements.
# Variables are centred and scaled to unit variance before the decomposition.
pca_vars <- c("hg", "bmi", "wt", "ht", "pcBfat")
pca <- prcomp(ais[, pca_vars], center = TRUE, scale. = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.6289 1.1846 0.8254 0.5084 0.06167
## Proportion of Variance 0.5306 0.2807 0.1362 0.0517 0.00076
## Cumulative Proportion  0.5306 0.8113 0.9475 0.9992 1.00000
# Plot the PCA result, colouring each observation by sex
autoplot(object = pca, data = ais, colour = "sex")

# K-means clustering: Clustering based on Hemoglobin, BMI, and Body Fat Percentage
# k-means uses Euclidean distance, and the three variables have very
# different ranges (see the summary of `ais` above), so they are standardised
# first; otherwise the widest-ranging variable dominates the clustering.
set.seed(123)  # For reproducibility
cluster_features <- scale(ais[, c("hg", "bmi", "pcBfat")])
# nstart > 1 tries several random initialisations and keeps the best fit,
# which makes the result less dependent on a single random start.
kmeans_result <- kmeans(cluster_features, centers = 3, nstart = 25)
ais$cluster <- as.factor(kmeans_result$cluster)

# Plot clusters on the original (unscaled) BMI / hemoglobin axes
ggplot(ais, aes(x = bmi, y = hg, color = cluster)) +
  geom_point() +
  labs(title = "K-means Clustering of Hemoglobin, BMI, and Body Fat Percentage") +
  theme_minimal()

# Matrix multiplication and inverse
# matrix() fills column-wise, so matrix_a is [1 3; 2 4] and matrix_b is
# [5 7; 6 8].
matrix_a <- matrix(c(1, 2, 3, 4), nrow = 2)
matrix_b <- matrix(c(5, 6, 7, 8), nrow = 2)
product <- matrix_a %*% matrix_b
inverse <- solve(matrix_a)

# cat() flattens a matrix into a plain vector and loses the row/column
# layout, so the matrices are shown with print() instead.
cat("Matrix Product:\n")
## Matrix Product:
print(product)
##      [,1] [,2]
## [1,]   23   31
## [2,]   34   46
cat("Inverse of Matrix A:\n")
## Inverse of Matrix A:
print(inverse)
##      [,1] [,2]
## [1,]   -2  1.5
## [2,]    1 -0.5
# Per-sport summary of hemoglobin: mean, median and standard deviation,
# one row per sport.
hg_by_sport <- group_by(ais, sport)
summary_by_sport <- summarise(
  hg_by_sport,
  mean_hg = mean(hg, na.rm = TRUE),
  median_hg = median(hg, na.rm = TRUE),
  std_dev_hg = sd(hg, na.rm = TRUE)
)
print(summary_by_sport)
## # A tibble: 10 × 4
##    sport   mean_hg median_hg std_dev_hg
##    <chr>     <dbl>     <dbl>      <dbl>
##  1 B_Ball     14.1      14        1.35 
##  2 Field      15.5      15.5      1.01 
##  3 Gym        13.6      13.7      0.860
##  4 Netball    12.8      12.7      0.567
##  5 Row        14.6      14.7      0.985
##  6 Swim       14.7      15.1      1.15 
##  7 T_400m     14.7      14.8      1.17 
##  8 T_Sprnt    15.7      15.2      1.58 
##  9 Tennis     14.3      14.3      1.59 
## 10 W_Polo     15.5      15.6      0.718
# Box-Cox transformation for the regression of hg on bmi, wt and ht.
# The returned list holds the candidate lambdas in `x` and their
# log-likelihoods in `y`.
boxcox_result <- boxcox(hg ~ bmi + wt + ht, data = ais)

# The reported lambda is the grid value with the highest log-likelihood
best_lambda <- boxcox_result$x[which.max(boxcox_result$y)]
cat("Optimal lambda for Box-Cox transformation:", best_lambda, "\n")
## Optimal lambda for Box-Cox transformation: 0.3434343
# Log-transformed hemoglobin concentration, stored as a new column `log_hg`.
# One is added before taking the log so that a zero value would not give -Inf.
# NOTE(review): the summary of `ais` above reports a minimum hg of 11.6, so
# the shift may not be needed -- confirm whether it is intended.
ais[["log_hg"]] <- log(ais[["hg"]] + 1)
summary(ais[["log_hg"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.534   2.674   2.754   2.741   2.808   3.006
# Pairwise correlation matrix of the selected numeric variables, using only
# rows that are complete across all of them. Reused by the heatmaps below.
cor_vars <- c("hg", "bmi", "wt", "ht", "pcBfat", "ssf")
cor_matrix <- cor(ais[, cor_vars], use = "complete.obs")
# Boxplot for Hemoglobin Concentration by Sport Type (base graphics).
# The title previously read "Hemoglobinii ... by sport type"; the typo is
# fixed and the capitalisation now matches the other plot titles.
boxplot(hg ~ sport, data = ais,
        main = "Hemoglobin Concentration by Sport Type",
        xlab = "Sport type", ylab = "Hemoglobin Concentration (g/dL)",
        col = "lightblue", border = "darkblue")

# Body fat percentage per sport as boxplots, with one coloured box per sex.
# X-axis labels are rotated 45 degrees so the sport names do not overlap.
ggplot(data = ais, mapping = aes(x = sport, y = pcBfat, colour = sex)) +
  geom_boxplot() +
  ggtitle("Body Fat Percentage by Gender and Sport Type") +
  xlab("Sport Type") +
  ylab("Body Fat Percentage") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation heatmap of selected variables (upper triangle only).
# corrplot() draws with zero plot margins by default, which clips the `main`
# title; `mar` reserves space at the top so the title is fully visible.
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8,
         main = "Correlation Heatmap of Hemoglobin and Selected Variables",
         mar = c(0, 0, 2, 0))

# Pair plot (scatterplot matrix) of hemoglobin and the other selected
# numeric variables.
pair_vars <- c("hg", "bmi", "wt", "ht", "pcBfat")
ggpairs(
  data = ais[, pair_vars],
  title = "Pair Plot of Hemoglobin and Selected Variables"
)

# Distribution of hemoglobin concentration, first as a histogram with
# 0.5-wide bins ...
ggplot(data = ais, mapping = aes(x = hg)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "skyblue",
    colour = "black",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Hemoglobin Concentration") +
  xlab("Hemoglobin Concentration (g/dL)") +
  ylab("Frequency") +
  theme_minimal()

# ... and then as a smoothed density curve
ggplot(data = ais, mapping = aes(x = hg)) +
  geom_density(fill = "lightgreen", alpha = 0.5) +
  ggtitle("Density Plot of Hemoglobin Concentration") +
  xlab("Hemoglobin Concentration (g/dL)") +
  ylab("Density") +
  theme_minimal()

# BMI against hemoglobin concentration: raw points plus a fitted
# linear-model line (geom_smooth reports the formula it picks).
ggplot(data = ais, mapping = aes(x = bmi, y = hg)) +
  geom_point(colour = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("BMI vs Hemoglobin Concentration") +
  xlab("Body Mass Index (BMI)") +
  ylab("Hemoglobin Concentration (g/dL)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Mean hemoglobin concentration per sport as bars; the mean is computed by
# the summary stat inside the layer.
ggplot(data = ais, mapping = aes(x = sport, y = hg, fill = sport)) +
  geom_bar(
    stat = "summary",
    fun = "mean",
    colour = "black",
    position = "dodge"
  ) +
  ggtitle("Average Hemoglobin Concentration by Sport Type") +
  xlab("Sport Type") +
  ylab("Average Hemoglobin Concentration (g/dL)") +
  theme_minimal()

# Full distribution of hemoglobin concentration per sport as violins
ggplot(data = ais, mapping = aes(x = sport, y = hg, fill = sport)) +
  geom_violin() +
  ggtitle("Violin Plot of Hemoglobin Concentration by Sport Type") +
  xlab("Sport Type") +
  ylab("Hemoglobin Concentration (g/dL)") +
  theme_minimal()

# Body fat percentage per sport, split into one panel (column) per sex.
# X-axis labels are rotated 45 degrees so the sport names do not overlap.
ggplot(data = ais, mapping = aes(x = sport, y = pcBfat, colour = sex)) +
  geom_boxplot() +
  ggtitle("Body Fat Percentage by Gender and Sport Type") +
  xlab("Sport Type") +
  ylab("Body Fat Percentage") +
  facet_grid(cols = vars(sex)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Pair plot for the selected variables (same columns and title as the pair
# plot drawn earlier in this script).
ggpairs(
  data = ais[, c("hg", "bmi", "wt", "ht", "pcBfat")],
  title = "Pair Plot of Hemoglobin and Selected Variables"
)

# Correlation heatmap drawn with ggplot2.
# melt() reshapes the correlation matrix to long form (Var1, Var2, value);
# the diverging fill runs from red (low) through white at 0 to blue (high).
cor_matrix_melted <- reshape2::melt(cor_matrix)
ggplot(
  data = cor_matrix_melted,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0) +
  theme_minimal() +
  ggtitle("Correlation Heatmap") +
  xlab("Variables") +
  ylab("Variables")

# Bubble plot: BMI on x, hemoglobin on y, point size mapped to weight
ggplot(data = ais, mapping = aes(x = bmi, y = hg, size = wt)) +
  geom_point(colour = "purple", alpha = 0.6) +
  ggtitle("Bubble Plot of Hemoglobin vs BMI (Weight as Size)") +
  xlab("BMI") +
  ylab("Hemoglobin Concentration (g/dL)") +
  theme_minimal()

# Second ggplot2 correlation heatmap, with the fill colours reversed:
# blue for low values, white in the middle, red for high values.
cor_matrix_melted <- reshape2::melt(cor_matrix)
ggplot(
  data = cor_matrix_melted,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
  theme_minimal() +
  ggtitle("Correlation Heatmap") +
  xlab("Variables") +
  ylab("Variables")

# Hemoglobin concentration per sport, coloured by sex, using a customised
# theme. theme_light() is applied first so the rotated x-axis labels set in
# theme() afterwards are not overwritten by it.
ggplot(data = ais, mapping = aes(x = sport, y = hg, colour = sex)) +
  geom_boxplot() +
  ggtitle("Hemoglobin Concentration by Sport Type and Gender") +
  xlab("Sport Type") +
  ylab("Hemoglobin Concentration (g/dL)") +
  theme_light() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))