knitr::opts_chunk$set(warning = FALSE,echo = TRUE,message = FALSE)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(tidyr)
library(gridExtra)
library(mice)
library(cluster)
library(factoextra)
library(pls)
library(RColorBrewer)
dat=read.csv("E:/UCD/study/spring/MultivariateAnalysis/Milk_MIR_Traits_data_2023.csv")
dim(dat)
## [1] 431 582
set.seed(22200868)
# generate a random row index between 1 and n, and delete that observation (row) from the dataset
d <- sample(1:nrow(dat), 1);d
## [1] 411
dat=dat[-d,]
df <- dat[!is.na(dat$beta_lactoglobulin_b), ] # delete rows with missing beta_lactoglobulin_b
mean = mean(df$beta_lactoglobulin_b); sd = sd(df$beta_lactoglobulin_b)
df <- df %>%
  filter((beta_lactoglobulin_b > mean - 3 * sd) & (beta_lactoglobulin_b < mean + 3 * sd))
# Remove any observations with β more than 3 standard deviations from the mean of the trait.
X=df[,-c(1:51)]#MIR spectra
X=scale(X)#normalization
df$Breed=factor(df$Breed)
df$Parity=factor(df$Parity)
df$Milking_Time=factor(df$Milking_Time)
wavelength = as.numeric(gsub("X", "", colnames(X)))
tail(wavelength)
## [1] 3803 3807 3811 3815 3818 3822
class <- df$Milking_Time
table(class)
## class
## 1 2
## 189 114
col <- c("darkorange2", "deepskyblue3") # set colors according to classes
cols <- col[class]
# plot the spectra over the wavelengths
matplot(x = wavelength, y = t(X), type = "l", lty = 1,
        col = adjustcolor(cols, 0.5), xlab = "Wavelength (nm)", ylab = "Intensity")
legend("topleft", fill = col, legend = levels(class), bty = "n")
beta = as.data.frame(df$beta_lactoglobulin_b)
# Histogram with kernel density overlay
ggplot(beta, aes(x = df$beta_lactoglobulin_b)) +
  geom_histogram(aes(y = after_stat(density)),
                 colour = 1, fill = "white") +
  geom_density(lwd = 1, colour = 4,
               fill = 4, alpha = 0.25)
summary(beta)
## df$beta_lactoglobulin_b
## Min. :0.0000
## 1st Qu.:0.9833
## Median :2.3748
## Mean :2.3876
## 3rd Qu.:3.3629
## Max. :7.4363
The x-axis shows the wavelength (nm) and the y-axis shows the intensity of the spectrum at that wavelength. Most observations follow a similar profile, neighbouring wavelengths are highly correlated, and the spectra contain a fair amount of noise. After standardising the spectra and removing observations whose β Lactoglobulin B lies more than 3 standard deviations from the mean of the trait, most of the standardised intensities fall between -5 and 10. "beta_lactoglobulin_b" has many observations equal to 0, which may be an anomaly in the data collection. Otherwise the trait is centred around 2.38 (the mean), and the mean and median are about the same.
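A quick check of the spike at zero (a minimal sketch using the df object created above):
# count exact zeros in beta_lactoglobulin_b and their share of the retained observations
n_zero <- sum(df$beta_lactoglobulin_b == 0)
c(n_zero = n_zero, proportion = n_zero / nrow(df))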
df_spec = df[, -c(1:51)] # keep only the MIR spectra columns
df_spec=as.data.frame(scale(df_spec))#scale dataset
#colSums(is.na(df_spec))#no missing value
hc1 = hclust(dist(df_spec), method="average")
# Perform clustering
plot(hc1)
hc2 = hclust(dist(df_spec), method="single")
# Perform clustering
plot(hc2)
dist_matrix <- dist(df_spec, method = "euclidean")
cl.complete = hclust(dist_matrix, method="complete")
plot(cl.complete,xlab="complete linkage")
hcl = cutree(cl.complete, k = 3)
table(hcl)
## hcl
## 1 2 3
## 177 118 8
#df_hcl <- cbind(df_spec, hcl)
#df_hcl
# Elbow method(k vs total within sum of squares)
fviz_nbclust(df_spec, kmeans, method = "wss")
# Silhouette method
fviz_nbclust(df_spec, kmeans, method = "silhouette")+labs(subtitle = "Silhouette method")
# Gap statistic
set.seed(123)
fviz_nbclust(df_spec, kmeans, nstart = 25, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")#try 25 different random initializations
k <- 3 # best fit
# Perform k-means clustering
set.seed(2678)
km <- kmeans(df_spec, centers = k,nstart = 25)
#km$centers
#km$cluster
table(km$cluster)
##
## 1 2 3
## 128 8 167
km$betweenss/km$totss
## [1] 0.5764217
km[["withinss"]]
## [1] 26062.814 2735.051 39127.999
fviz_cluster(km, data = df_spec)
#find best k by package
library(parameters)
n_clust <- n_clusters(df_spec,
package = c("easystats", "NbClust", "mclust"),
standardize = FALSE
)
n_clust
## # Method Agreement Procedure:
##
## The choice of 3 clusters is supported by 9 (42.86%) methods out of 21 (Silhouette, Ch, Hartigan, DB, Ratkowsky, Ball, PtBiserial, Mcclain, SDindex).
Hierarchical clustering was carried out with the "average", "single" and "complete" linkage methods. The three linkages give broadly similar dendrograms, and "complete" linkage gives the most balanced result, suggesting the MIR spectra fall into three clusters. Cutting the complete-linkage dendrogram at k = 3 and labelling each observation with its cluster, the first cluster contains 177 observations, the second 118 and the third 8.
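One additional, hedged way to compare the linkages numerically is the cophenetic correlation, i.e. how faithfully each dendrogram preserves the original pairwise distances (a sketch using the objects fitted above):
# cophenetic correlation for each linkage: higher = dendrogram closer to the original distances
sapply(list(average = hc1, single = hc2, complete = cl.complete),
       function(h) cor(dist_matrix, cophenetic(h)))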
K-means clustering was run with 25 different random initialisations (nstart = 25). The elbow method and the gap statistic both point towards a small number of clusters (k = 3 or 4 would be reasonable); I chose k = 3. With k = 3, the first cluster contains 128 observations, the second 8 and the third 167. The quality of the partition, measured as the ratio of the between-cluster sum of squares to the total sum of squares, is 57.64% (the higher, the better).
Comparing hierarchical clustering with k-means clustering, the sizes of the three clusters are broadly similar; in particular, the small cluster of 8 observations is recovered by both methods.
The within-cluster sums of squares of the three k-means clusters are approximately 26063, 2735 and 39128.
The fviz_cluster plot projects the observations onto two dimensions (the first two principal components); the three clusters are displayed with little overlap.
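To compare the two partitions directly (a minimal sketch using the hcl and km objects above):
# cross-tabulate the hierarchical (k = 3) and k-means (k = 3) cluster labels
table(hierarchical = hcl, kmeans = km$cluster)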
corr_matrix=cor(df_spec)#correlation matrix
y = eigen(corr_matrix) # list containing the eigenvalues and eigenvectors
#y$values
#y$vectors
#get_eig(pca)
sum(y$values[1:10])/sum(y$values) # proportion of variance explained by the top 10 principal components
## [1] 0.9981032
pca <- prcomp(df_spec, scale. = TRUE, center = TRUE)
fviz_screeplot(pca, addlabels = TRUE)#scree plot
#pca$sdev#standard deviation of each principal component
pca.var <- pca$sdev^2#variance explained by each principal component
pve <- pca.var[1:3] / sum(pca.var)
sum(pve)
## [1] 0.9488622
The variance of each principal component equals the corresponding eigenvalue of the correlation matrix.
The top 10 principal components explain 99.8% of the variance of the spectral data.
The first principal component explains 52.7% of the variance, the second 30.3% and the third 12%. I therefore choose the first 3 principal components to represent the spectral data; together they account for about 95% of the total variance.
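The same figures can be read off the cumulative proportion of variance explained (a short sketch based on the prcomp fit above):
# cumulative proportion of variance explained by the leading principal components
cum_pve <- cumsum(pca$sdev^2) / sum(pca$sdev^2)
round(cum_pve[1:10], 4)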
# Select the first eigenvector
comp1 <- y$vectors[,1]
comp1=as.matrix(comp1)
# Compute the principal component scores
pc_scores1 <- (as.matrix(df_spec) )%*% comp1
# choose the first 3 principal components and repeat to obtain the scores used in the plots
comp <- y$vectors[,1:3]
comp=as.matrix(comp)
# Compute the principal component scores
pc_scores <- (as.matrix(df_spec) )%*% comp
pc_scores=as.data.frame(pc_scores)
pc_scores[c(1:10),]
## V1 V2 V3
## 1 -9.4100381 -3.3875467 -1.3481005
## 2 -8.7167003 0.6455594 -5.0269717
## 3 -3.6643165 8.1445727 -2.5359467
## 4 -2.1210604 -0.4598011 1.0902765
## 5 -1.4683492 14.5332760 -5.5730029
## 6 -7.0532628 -3.5927216 -1.8731947
## 7 4.0819343 -1.9829788 0.8671145
## 8 5.0410879 -3.9478251 -1.3286470
## 9 -2.8351012 3.8519670 -1.4993405
## 10 -0.6809999 0.1939222 -0.1543304
PCAcolors <- c("#fc8d62")
#PCAcolors <- c("#66c2a5","#fc8d62")[as.integer(df$Milking_Time)]
par(mfrow = c(1,3))
plot(pc_scores[, 1], pc_scores[, 2],
pch=21, # point shape
col=PCAcolors, # point border color
bg=PCAcolors, # point color
cex=1.5, # point size
xlab = "PC1", ylab = "PC2",
main = "Principal Component Scores")
plot(pc_scores[, 1], pc_scores[, 3],
pch=21, # point shape
col=PCAcolors, # point border color
bg=PCAcolors, # point color
cex=1.5, # point size
xlab = "PC1", ylab = "PC3",
main = "Principal Component Scores")
plot(pc_scores[, 2], pc_scores[, 3],
pch=21, # point shape
col=PCAcolors, # point border color
bg=PCAcolors, # point color
cex=1.5, # point size
xlab = "PC2", ylab = "PC3",
main = "Principal Component Scores")
The first few scores on the first principal component are -9.41, -8.72, -3.66, -2.12, -1.47 and -7.05 (the first six rows of the score matrix). The score matrix has as many rows as the original data and as many columns as retained principal components; each entry is the score of that observation on that component.
In the first plot (PC1 vs PC2), some observations take large negative values on both components, so the scores are more spread out on the negative side.
The table above shows the scores on the first three components for the first 10 rows: the PC1 scores are mostly negative for these rows, the PC2 and PC3 scores take both positive and negative values, and the PC3 scores are smaller in magnitude than those of the other two components.
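As a sanity check, the hand-computed scores should agree with the scores returned by prcomp up to the sign of each component (a minimal sketch using the objects above):
# correlations between the manual scores and prcomp's scores; values of +/-1 indicate agreement up to sign
round(cor(as.matrix(pc_scores), pca$x[, 1:3]), 3)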
PCR aims to reduce the number of predictors and to improve the stability and predictive power of the model, particularly when there is a large number of correlated predictors. Principal component regression is a dimension-reduction method for regression analysis, since it reduces the number of coefficients that have to be estimated, and it can lead to improvements in model performance compared with standard linear regression.
Principal component regression (PCR) serves the same goal as standard linear regression, but provides a simple way to perform the regression using M < p derived predictors. The difference is that PCR uses the principal components, rather than the original features, as the predictor variables, so PCA must be applied to the data first.
1). Apply PCA to the predictor variables to generate the principal components.
2). Keep the first M principal components that explain most of the variance (M < p); the number of components M is typically chosen by cross-validation.
3). Fit a linear regression model on these M principal components (a minimal sketch of these steps is given below).
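A rough sketch of these steps, using the pca fit and df objects defined above; M = 4 is chosen purely for illustration, and the pls::pcr() fit later in the report is the version actually used:
# PCR by hand: ordinary least squares on the first M principal component scores
M <- 4 # illustrative choice, not tuned
pcr_df <- data.frame(beta = df$beta_lactoglobulin_b, pca$x[, 1:M])
pcr_hand <- lm(beta ~ ., data = pcr_df)
summary(pcr_hand)$r.squared # proportion of variance in beta explained by the first M scores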
Using PCR involves several choices. The main one is the number of principal components to include in the regression model, which can be determined by cross-validation or other methods that identify the optimal number of components. Another choice is the method used to estimate the regression coefficients, such as ordinary least squares or partial least squares.
The advantages of PCR include its ability to handle high-dimensional data and correlated predictors, as well as improved stability and predictive power. Its disadvantages include reliance on assumptions of normality and linearity, potential overfitting if too many principal components are included, and the fact that it does not perform feature selection, so it may not be suitable when the goal is to identify the specific predictors most strongly associated with the response variable.
# Train-test split
N=nrow(df_spec)
n1=round(N/3)
set.seed(2678)
test_indices=sample(1:N,n1)
#Split data into training and validation(for test acc)
df_test=df_spec[test_indices,]
df_train=df_spec[-test_indices,]
beta_test=df[test_indices,]$beta_lactoglobulin_b
beta_train <- df[-test_indices,]$beta_lactoglobulin_b
training=cbind(df_train,beta_train)
testing=cbind(df_test,beta_test)
pcr_fit = pcr(beta_train~., ncomp = 10 ,data =training, scale = TRUE, validation = "CV")
summary(pcr_fit)
## Data: X dimension: 202 531
## Y dimension: 202 1
## Fit method: svdpc
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.606 1.624 1.630 1.616 1.607 1.606 1.609
## adjCV 1.606 1.623 1.629 1.613 1.604 1.603 1.606
## 7 comps 8 comps 9 comps 10 comps
## CV 1.622 1.628 1.634 1.613
## adjCV 1.618 1.624 1.630 1.608
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
## X 51.409690 80.98974 94.140 97.931 98.714 99.252 99.523
## beta_train 0.002371 0.07352 2.829 4.588 4.824 5.386 5.466
## 8 comps 9 comps 10 comps
## X 99.640 99.75 99.805
## beta_train 5.704 5.71 9.757
#visualize cross-validation plots
validationplot(pcr_fit, val.type = "MSEP")
#validationplot(pcr_fit, val.type="R2")
predplot(pcr_fit)
coefplot(pcr_fit)
pcr_pred <- predict(pcr_fit, testing, ncomp = 4)
head(pcr_pred)
## , , 4 comps
##
## beta_train
## 92 2.590504
## 164 2.095311
## 170 2.757731
## 62 2.598836
## 21 2.716386
## 273 2.231886
#calculate RMSE
sqrt(mean((pcr_pred - beta_test)^2))
## [1] 1.73033
VALIDATION: RMSEP:
The cross-validated RMSEP fluctuates as the number of principal components increases, and adding further components eventually increases, rather than decreases, the error. The RMSEP plot points to roughly 4-6 components as appropriate; 4 principal components appear to be a sensible choice for the final model.
TRAINING: % variance explained:
The first 4 principal components explain a large proportion of the variance in X (97.9%), and little is gained beyond 5 components. The response variable, however, is explained relatively poorly by the components (under 10% even with 10 components).
The test RMSE with ncomp = 4 is 1.73, i.e. the typical deviation between the predicted and observed β Lactoglobulin B values in the test set.
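The choice of the number of components can also be cross-checked with the component-selection helper in the pls package (a sketch, assuming selectNcomp() is available in the installed version of pls; the "onesigma" heuristic picks the smallest number of components whose cross-validated error is within one standard error of the minimum):
# heuristic choice of ncomp from the cross-validation results stored in pcr_fit
selectNcomp(pcr_fit, method = "onesigma", plot = TRUE)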
colnames(df[,3:13])
## [1] "Parity" "Milking_Time" "DaysInMilk"
## [4] "Protein_content" "kappa_casein" "alpha_s2_casein"
## [7] "alpha_s1_casein" "beta_casein" "alpha_lactalbumin"
## [10] "beta_lactoglobulin_a" "beta_lactoglobulin_b"
df_protein=dat[,which(names(dat)%in% c("alpha_s2_casein","alpha_s1_casein","beta_casein",
"kappa_casein","beta_lactoglobulin_a","beta_lactoglobulin_b",
"alpha_lactalbumin"))]
dim(df_protein)
## [1] 430 7
colSums(is.na(df_protein))
## kappa_casein alpha_s2_casein alpha_s1_casein
## 124 124 124
## beta_casein alpha_lactalbumin beta_lactoglobulin_a
## 124 124 124
## beta_lactoglobulin_b
## 125
df_protein_comp=na.omit(df_protein)#deal with NA
dim(df_protein_comp)
## [1] 305 7
index=which(df_protein_comp$beta_lactoglobulin_b==0);index
## [1] 4 8 12 19 22 25 28 34 39 43 52 62 66 92 101 105 136 196 197
## [20] 207 208 209 211 219 220 232 234 236 243 245 260 274
#find index of beta=0
df_protein_pca = df_protein_comp[-index,] # remove rows with beta = 0 (treated as missing)
pca=prcomp(df_protein_pca,center = TRUE, scale. = TRUE)
imputed_data <- predict(pca, newdata = df_protein_comp[index,])[, 1] # PC1 score of each beta = 0 row, used directly as the imputed value
imputed_data
## 4 8 12 20 24 27
## 3.17605741 0.51177304 -1.29224469 -0.02604674 0.03971935 0.05217773
## 30 36 41 45 54 64
## 0.69874611 1.19716963 0.22948211 0.35630691 -1.13032735 2.43429219
## 68 94 103 112 240 300
## 1.93513851 -0.20118834 -0.27028799 2.34409741 -1.50370834 -1.18447040
## 301 312 313 314 316 324
## -0.77117901 0.30326343 0.38765393 -0.38812970 -0.10645637 -0.48684530
## 325 357 359 361 368 370
## 0.33824603 -0.16530115 0.26697343 -0.76283965 1.37821985 0.02855176
## 385 399
## 1.10359383 0.68824504
# Replace imputed values in original data frame
df_protein_comp_copy=df_protein_comp
df_protein_comp_copy[index, "beta_lactoglobulin_b"] <- imputed_data
# compare imputed and observed values
summary(df_protein_comp_copy[index,]$beta_lactoglobulin_b) # imputed values
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5037 -0.2997 0.1408 0.2869 0.6909 3.1761
summary(df_protein_comp_copy[-index,]$beta_lactoglobulin_b)#observed value
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08449 1.59415 2.51106 2.71591 3.59079 9.70154
To impute missing values with principal component analysis (PCA) I assume that "beta_lactoglobulin_b" is linearly related to the other six milk protein traits and that the zero values are missing at random. If the relationship between the missing values and the other variables is non-linear, PCA may not capture it and the imputed values may be inaccurate, which can lead to biased estimates.
The 32 missing values are imputed using only the first principal component, so the predictions are based solely on the information captured by that component. Comparing the observed and imputed values shows some differences in range: the observed values range from 0.08449 to 9.70154 with mean 2.71591, while the imputed values range from -1.5037 to 3.1761 with mean 0.2869, so the imputed values are noticeably lower.
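An alternative to using the PC1 score directly (not the approach taken above) is to reconstruct the protein matrix from the first component and read off the beta_lactoglobulin_b column, which returns the imputation to the original scale of the trait; a rough sketch using the pca fit and index defined above:
# rank-1 reconstruction: project the beta = 0 rows onto PC1, map back, then undo the scaling and centering
sc <- predict(pca, newdata = df_protein_comp[index, ])[, 1, drop = FALSE]   # PC1 scores
recon_scaled <- sc %*% t(pca$rotation[, 1, drop = FALSE])                   # back to the scaled variables
recon <- sweep(sweep(recon_scaled, 2, pca$scale, "*"), 2, pca$center, "+")  # original units
recon[, "beta_lactoglobulin_b"]                                             # candidate imputed values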
#non-zero value of β Lactoglobulin B
df1 = df[, -c(1:12, 14:51)] # keep beta_lactoglobulin_b and the MIR spectra (observed values only)
# find the rows where beta_lactoglobulin_b is 0 (32 rows)
index=which(df1$beta_lactoglobulin_b==0);index
## [1] 4 8 12 19 22 25 28 34 39 43 52 61 65 90 99 103 134 194 195
## [20] 205 206 207 209 217 218 230 232 234 241 243 258 272
df1_complete=df1[-index,]
N=nrow(df1);N
## [1] 303
n1=round(N/3);n1
## [1] 101
set.seed(2678)
test_indices=sample(1:(N-length(index)),n1)
#Split data into training and validation(for test acc)
# the three models share the same test data (no 0 values of beta)
df_test=as.data.frame(df1_complete[test_indices,])# dim=101*532
df1_train=as.data.frame(df1_complete[-test_indices,])
pcr_fit1 = pcr(beta_lactoglobulin_b~., ncomp = 10 ,data =df1_train, scale = TRUE, validation = "CV")
summary(pcr_fit1)
## Data: X dimension: 170 531
## Y dimension: 170 1
## Fit method: svdpc
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.458 1.441 1.442 1.433 1.385 1.390 1.385
## adjCV 1.458 1.440 1.441 1.432 1.383 1.388 1.382
## 7 comps 8 comps 9 comps 10 comps
## CV 1.395 1.403 1.416 1.398
## adjCV 1.391 1.399 1.412 1.393
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 53.766 83.421 95.050 98.32 98.99 99.40
## beta_lactoglobulin_b 2.927 3.504 6.056 14.30 14.44 15.64
## 7 comps 8 comps 9 comps 10 comps
## X 99.57 99.69 99.77 99.83
## beta_lactoglobulin_b 16.10 16.31 16.36 18.91
pcr_pred1 <- predict(pcr_fit1,df_test, ncomp = 4)
head(pcr_pred1)
## , , 4 comps
##
## beta_lactoglobulin_b
## 108 1.874151
## 181 1.259660
## 187 2.046250
## 75 2.934118
## 27 2.289911
## 6 2.582123
#calculate RMSE
sqrt(mean((pcr_pred1 - df_test$beta_lactoglobulin_b)^2))
## [1] 1.587219
#0 values of β Lactoglobulin B are imputed using the observed mean
#calculate mean of observed
mean_beta <- mean(subset(df1$beta_lactoglobulin_b, df1$beta_lactoglobulin_b != 0));mean_beta
## [1] 2.669547
#the same test_indices as df1
df2_b0=df1[index,]
#impute by mean
df2_b0$beta_lactoglobulin_b=mean_beta
df2_train=rbind(df2_b0,df1_train)
pcr_fit2 = pcr(beta_lactoglobulin_b~., ncomp = 10 ,data =df2_train, scale = TRUE, validation = "CV")
summary(pcr_fit2)
## Data: X dimension: 202 531
## Y dimension: 202 1
## Fit method: svdpc
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.337 1.328 1.324 1.316 1.279 1.284 1.282
## adjCV 1.337 1.328 1.323 1.315 1.277 1.282 1.279
## 7 comps 8 comps 9 comps 10 comps
## CV 1.284 1.288 1.296 1.292
## adjCV 1.281 1.285 1.293 1.288
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 52.810 83.525 94.998 98.29 98.97 99.35
## beta_lactoglobulin_b 2.396 3.344 5.755 12.96 13.09 14.14
## 7 comps 8 comps 9 comps 10 comps
## X 99.54 99.66 99.75 99.81
## beta_lactoglobulin_b 14.32 14.58 14.58 16.55
pcr_pred2 <- predict(pcr_fit2,df_test, ncomp = 4)
head(pcr_pred2)
## , , 4 comps
##
## beta_lactoglobulin_b
## 108 1.957048
## 181 1.299003
## 187 2.101357
## 75 2.894630
## 27 2.337222
## 6 2.567976
#calculate RMSE
sqrt(mean((pcr_pred2 - df_test$beta_lactoglobulin_b)^2))
## [1] 1.593233
# 0 values of β Lactoglobulin B are imputed using principal components analysis
# impute beta_lactoglobulin_b by PCA (same imputed values as in question 8)
df2_b0$beta_lactoglobulin_b=imputed_data
df3_train=rbind(df2_b0,df1_train)
pcr_fit3 = pcr(beta_lactoglobulin_b~., ncomp = 10 ,data =df3_train, scale = TRUE, validation = "CV")
summary(pcr_fit3)
## Data: X dimension: 202 531
## Y dimension: 202 1
## Fit method: svdpc
## Number of components considered: 10
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 1.638 1.622 1.628 1.625 1.595 1.601 1.598
## adjCV 1.638 1.621 1.627 1.624 1.594 1.599 1.596
## 7 comps 8 comps 9 comps 10 comps
## CV 1.591 1.599 1.613 1.633
## adjCV 1.588 1.596 1.610 1.627
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## X 52.81 83.525 94.998 98.288 98.974 99.347
## beta_lactoglobulin_b 2.18 2.263 3.831 7.753 7.771 8.769
## 7 comps 8 comps 9 comps 10 comps
## X 99.54 99.66 99.75 99.81
## beta_lactoglobulin_b 10.36 10.47 10.49 11.20
pcr_pred3 <- predict(pcr_fit3,df_test, ncomp = 4)
head(pcr_pred3)
## , , 4 comps
##
## beta_lactoglobulin_b
## 108 1.552243
## 181 1.114056
## 187 1.799181
## 75 2.496207
## 27 1.790603
## 6 2.196051
#calculate RMSE
sqrt(mean((pcr_pred3 - df_test$beta_lactoglobulin_b)^2))
## [1] 1.683506
In each PCR fit, around 4 principal components give (close to) the lowest cross-validated RMSEP, which matches the choice made for the question 7 model and can be seen in the validation plots. The data are split into 1/3 test and 2/3 training data, and cross-validation is performed within the training set. The PCA-imputed values are the same ones generated in question 8. The three models share the same test set, whose observations contain no 0 values.
(a) RMSE = 1.587219
(b) RMSE = 1.593233
(c) RMSE = 1.683506
(a) and (b) give similar regression results, and (a) is the best fit.
(c) Imputing the zeros with the first principal component is intended to recover the missing information, but the PCA imputation is not accurate enough (in question 8 the imputed values deviated noticeably from the observed ones), so it performs worst. (b) Imputing with the observed mean works about as well as simply deleting the 0 values.
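For completeness, the three test errors can be recomputed side by side with a small helper function (a sketch using the prediction objects defined above):
# root mean squared error on the shared test set for each strategy
rmse <- function(pred, obs) sqrt(mean((pred - obs)^2))
sapply(list(a_zeros_removed = pcr_pred1, b_mean_imputed = pcr_pred2, c_pca_imputed = pcr_pred3),
       rmse, obs = df_test$beta_lactoglobulin_b)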