# Load data and format accordingly
# setwd("C:/Users/isaaa/Dropbox/R stuff/ratOA") # Windows
setwd("~/Dropbox/R stuff/ratOA") # MAC
library(ggplot2)
library(pls)
## 
## Attaching package: 'pls'
## 
## The following object is masked from 'package:stats':
## 
##     loadings
NIR = I(as.matrix(read.csv("NIR.csv")))  # convert to as-is for easy merging
wavenumber <- as.matrix((read.csv("NIR.csv", header=F))[1,]) # wavenumber range
data = read.csv("OAdata.csv") # meta- and reference data

allData = data.frame(data, NIR) # merge all data together as one for easy manipulation
rm(data,NIR)    # free up some memory right-off-the-bat

# attributes(allData) # confirm data attributes
# Split the data according to compartments
s = split(allData, allData$compart) # split data by compartment
# str(s) # summary structure view

# Function for calculating standard error
# stder <- function(x){
#         mean(x)/sqrt(length(x))
# }

# split and compute mean mankin score of lateral data by class (normal and oa)
s1 = split(s$lateral, s$lateral$class) 
sapply(s1, function(x) mean(x[,"mankin"])) 
##   normal       oa 
## 0.400000 4.818182
# split and compute mean mankin score of each week in oa group of lateral
s2 = split(s1$oa, s1$oa$time)  # split oa data by time (week 1, 2, 4 and 6)
sapply(s2, function(x) mean(x[,"mankin"])) 
##    week1    week2    week4    week6 
## 2.000000 3.333333 5.333333 7.666667
p2 <- ggplot(s1$oa, aes(time, mankin))
p2 + geom_boxplot(aes(fill = time)) + theme_bw()# using factor as fill

mankinMean <- sapply(s2, function(x) mean(x[,"mankin"]))
mankinSD <- sapply(s2, function(x) sd(x[,"mankin"]))
df <- data.frame(mankinMean,mankinSD)
#mankinErr <- sapply(s2, function(x) stder(x[,"mankin"])) 
limits <- aes(ymax = mankinMean + mankinSD, ymin=mankinMean - mankinSD)
dodge <- position_dodge(width=0.1)
p21 <- ggplot(s1$oa, aes(fill=time, y=mankin, x=time))
p21 + geom_bar(position="dodge", stat="identity") + theme_bw()

nweek = levels(s1$oa$time)
p22 <- ggplot(df, aes(fill=factor(nweek), y=mankinMean, 
                      x=factor(nweek)))
p22 + geom_bar(position=dodge, stat="identity") + geom_errorbar(limits, position=dodge, width=0.25) + ylab("Mankin score") + xlab("Time") + theme_bw() 

# PCA Analyses for raw NIR spectra
# 1. For lateral OA samples only
latOAnir <- s1$oa$NIR
lateralOApca = prcomp(latOAnir, center = TRUE, scale = FALSE)
explained_lateralOApca = cumsum(lateralOApca$sdev[1:10]^2/sum(lateralOApca$sdev^2))*100
explained_lateralOApca
##  [1]  80.48376  96.73553  99.49896  99.81011  99.93729  99.97771  99.98771
##  [8]  99.99472  99.99799 100.00000
# 2. For lateral OA and normal samples only
latNIR <- s$lateral$NIR
lateralpca = prcomp(latNIR, center = TRUE, scale = FALSE)
explained_lateralpca = cumsum(lateralpca$sdev[1:10]^2/sum(lateralpca$sdev^2))*100
explained_lateralpca
##  [1] 81.08149 97.00544 99.36061 99.70730 99.86336 99.92471 99.95426
##  [8] 99.97077 99.98030 99.98426

##  [1]  79.66689  95.65624  97.61909  98.63839  99.04120  99.35891  99.58146
##  [8]  99.75045  99.89778 100.00000

##  [1]  85.70626  94.48027  97.14253  97.87154  98.44877  98.91986  99.31922
##  [8]  99.60404  99.83219 100.00000

# PLS Analyses using raw, 1-D and 2-D NIR spectra
mankin <- s$lateral$mankin
time <- s$lateral$time

ggplotRegression <- function (fit) {
        ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
                geom_point(aes(color = time), size=5) +
                stat_smooth(method = "lm", col = "red") +
                labs(title = paste("Adj_r2 = ",signif(summary(fit)$adj.r.squared, 3),
                                   ";  c =",signif(fit$coef[[1]],3),
                                   ";  slope =",signif(fit$coef[[2]], 3),
                                   ";  p =",signif(summary(fit)$coef[2,4], 3)))
}

latNIRd1 <- t(diff(t(latNIR), differences = 1, lag = 10)) # first derivative
latNIRd2 <- t(diff(t(latNIR), differences = 2, lag = 10)) # first derivative


PLSlatNIR <- plsr(mankin ~ latNIR, 10, validation = "LOO")
PLSlatNIRd1 <- plsr(mankin ~ latNIRd1, 10, validation = "LOO")
PLSlatNIRd2 <- plsr(mankin ~ latNIRd2, 10, validation = "LOO")

# PLOTS 
# RAW
fitData <- data.frame(matrix(as.numeric(PLSlatNIR$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
        fitData$cmp5[i] <- round(fitData$cmp5[i])
        if (fitData$cmp5[i] < 0){
                fitData$cmp5[i] <- 0
        }
}
measPred <- cbind(time,mankin,fitData)

p7 = ggplotRegression(lm(cmp5 ~ mankin, data = measPred))
p7 + ylab("Predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()

#D1
fitData1 <- data.frame(matrix(as.numeric(PLSlatNIRd1$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData1) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
        fitData1$cmp5[i] <- round(fitData1$cmp5[i])
        if (fitData1$cmp5[i] < 0){
                fitData1$cmp5[i] <- 0
        }
}
measPred1 <- cbind(time,mankin,fitData1)

p8 = ggplotRegression(lm(cmp5 ~ mankin, data = measPred1))
p8 + ylab("1-D predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()

#D2
fitData2 <- data.frame(matrix(as.numeric(PLSlatNIRd2$fitted.values),nrow = length(mankin), ncol = 10))
colnames(fitData2) <- c("cmp1","cmp2","cmp3","cmp4","cmp5","cmp6","cmp7","cmp8","cmp9","cmp10")
for (i in 1:length(mankin)){
        fitData2$cmp4[i] <- round(fitData2$cmp4[i])
        if (fitData2$cmp4[i] < 0){
                fitData2$cmp4[i] <- 0
        }
}
measPred2 <- cbind(time,mankin,fitData2)

p9 = ggplotRegression(lm(cmp4 ~ mankin, data = measPred2))
p9 + ylab("2-D predicted Mankin score") + xlab("Measured Mankin score") + theme_bw()