# Load data and format accordingly
#setwd("C:/Users/isaaa/Dropbox/R stuff/ratOA") # Windows
setwd("~/Dropbox/R stuff/ratOA") # MAC
library(ggplot2)
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
NIR = I(as.matrix(read.csv("NIR.csv"))) # convert to as-is for easy merging
wavenumber <- as.matrix((read.csv("NIR.csv", header=F))[1,]) # wavenumber range
data = read.csv("OAdata.csv") # meta- and reference data
allData = data.frame(data, NIR) # merge all data together as one for easy manipulation
rm(data,NIR) # free up some memory right-off-the-bat
# attributes(allData) # confirm data attributes
# Split the data according to compartments
s = split(allData, allData$compart) # split data by compartment
# str(s) # summary structure view



# Function for calculating standard error
# stder <- function(x){
# mean(x)/sqrt(length(x))
# }
# split and compute mean mankin score of lateral data by class (normal and oa)
s1 = split(s$lateral, s$lateral$class)
sapply(s1, function(x) mean(x[,"mankin"]))
## normal oa
## 0.400000 4.818182
# split and compute mean mankin score of each week in oa group of lateral
s2 = split(s1$oa, s1$oa$time) # split oa data by time (week 1, 2, 4 and 6)
sapply(s2, function(x) mean(x[,"mankin"]))
## week1 week2 week4 week6
## 2.000000 3.333333 5.333333 7.666667
p2 <- ggplot(s1$oa, aes(time, mankin))
p2 + geom_boxplot(aes(fill = time)) + theme_bw()# using factor as fill

mankinMean <- sapply(s2, function(x) mean(x[,"mankin"]))
mankinSD <- sapply(s2, function(x) sd(x[,"mankin"]))
df <- data.frame(mankinMean,mankinSD)
#mankinErr <- sapply(s2, function(x) stder(x[,"mankin"]))
limits <- aes(ymax = mankinMean + mankinSD, ymin=mankinMean - mankinSD)
dodge <- position_dodge(width=0.1)
p21 <- ggplot(s1$oa, aes(fill=time, y=mankin, x=time))
p21 + geom_bar(position="dodge", stat="identity") + theme_bw()

nweek = levels(s1$oa$time)
p22 <- ggplot(df, aes(fill=factor(nweek), y=mankinMean,
x=factor(nweek)))
p22 + geom_bar(position=dodge, stat="identity") + geom_errorbar(limits, position=dodge, width=0.25) + ylab("Mankin score") + xlab("Time") + theme_bw()

# PCA Analyses for raw NIR spectra
# 1. For lateral OA samples only
latOAnir <- s1$oa$NIR
lateralOApca = prcomp(latOAnir, center = TRUE, scale = FALSE)
explained_lateralOApca = cumsum(lateralOApca$sdev[1:10]^2/sum(lateralOApca$sdev^2))*100
explained_lateralOApca
## [1] 80.48376 96.73553 99.49896 99.81011 99.93729 99.97771 99.98771
## [8] 99.99472 99.99799 100.00000
# 2. For lateral OA and normal samples only
latNIR <- s$lateral$NIR
lateralpca = prcomp(latNIR, center = TRUE, scale = FALSE)
explained_lateralpca = cumsum(lateralpca$sdev[1:10]^2/sum(lateralpca$sdev^2))*100
explained_lateralpca
## [1] 81.08149 97.00544 99.36061 99.70730 99.86336 99.92471 99.95426
## [8] 99.97077 99.98030 99.98426



## [1] 79.66689 95.65624 97.61909 98.63839 99.04120 99.35891 99.58146
## [8] 99.75045 99.89778 100.00000


## [1] 85.70626 94.48027 97.14253 97.87154 98.44877 98.91986 99.31922
## [8] 99.60404 99.83219 100.00000

# PLS Analyses using raw, 1-D and 2-D NIR spectra
mankin <- s$lateral$mankin
time <- s$lateral$time
ggplotRegression <- function (fit) {
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(color = time), size=5) +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj_r2 = ",signif(summary(fit)$adj.r.squared, 3),
"; c =",signif(fit$coef[[1]],3),
"; slope =",signif(fit$coef[[2]], 3),
"; p =",signif(summary(fit)$coef[2,4], 3)))
}
latNIRd1 <- t(diff(t(latNIR), differences = 1, lag = 10)) # first derivative
latNIRd2 <- t(diff(t(latNIR), differences = 2, lag = 10)) # first derivative
PLSlatNIR <- plsr(mankin ~ latNIR, 10, validation = "LOO")
PLSlatNIRd1 <- plsr(mankin ~ latNIRd1, 10, validation = "LOO")
PLSlatNIRd2 <- plsr(mankin ~ latNIRd2, 10, validation = "LOO")
# PLOTS
# RAW
fitData <- data.frame(matrix(as.numeric(PLSlatNIR$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
fitData$cmp5[i] <- round(fitData$cmp5[i])
if (fitData$cmp5[i] < 0){
fitData$cmp5[i] <- 0
}
}
measPred <- cbind(time,mankin,fitData)
p7 = ggplotRegression(lm(cmp5 ~ mankin, data = measPred))
p7 + ylab("Predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()

#D1
fitData1 <- data.frame(matrix(as.numeric(PLSlatNIRd1$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData1) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
fitData1$cmp5[i] <- round(fitData1$cmp5[i])
if (fitData1$cmp5[i] < 0){
fitData1$cmp5[i] <- 0
}
}
measPred1 <- cbind(time,mankin,fitData1)
p8 = ggplotRegression(lm(cmp5 ~ mankin, data = measPred1))
p8 + ylab("1-D predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()

#D2
fitData2 <- data.frame(matrix(as.numeric(PLSlatNIRd2$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData2) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
fitData2$cmp4[i] <- round(fitData2$cmp4[i])
if (fitData2$cmp4[i] < 0){
fitData2$cmp4[i] <- 0
}
}
measPred2 <- cbind(time,mankin,fitData2)
p9 = ggplotRegression(lm(cmp4 ~ mankin, data = measPred2))
p9 + ylab("2-D predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()
