library(dplyr)
library(lavaan)
library(semPlot)
library(data.table)
library(rmarkdown)
library(knitr)

Data pre-processing

Load data

#reload data so that the category blocks are not standardized
setwd('/Users/tzhu9/Google Drive/ExecFcnBattery/Analysis')
df<-read.csv('overall.badremoved.chanceremoved.DATA.csv')
df<-select(df, 
             "Subj", 
             "Antisaccade", 
             #"Stroop", 
             "StroopPerf", 
             "SSRTavg", 
             "LetMem", 
             "KeepTrack", 
             "SpatialBack", 
             "NumLett", 
             #"NumLetPerf", 
             "ColShape", 
             #"ColShapePerf", 
             "CatSwitch", 
             #"CatSwitchPerf", 
             "FR1", "FR2", "FR3", "FR4", 
             "SD1", "SD2", "SD3", "SD4")

Data reshaping and manipulation

library(ggplot2)
library(tidyr)

#only subsetting the 2 types of category performance across 4 blocks
testset<-df[11:18]

#fill NA with column mean
library(zoo)
testset<-na.aggregate(testset)

#transform data from wide to long format
data_long <- gather(testset, category, performance, FR1:SD4,
                    factor_key=TRUE)

#Split column 'category' names into 3 columns 
library(stringr)
split<-str_split_fixed(data_long$category,"", 3)
#Bind these splitted columns back to the original dataframe
data_long<-cbind(split,data_long)

#Combine first 2 splitted columns to form a new column 
#in order to isolate the digit to form 'block' variable
data_long<-unite(data_long, type, c('1', '2'), remove=T)
head(data_long)

Renaming and convert data type

#rename columns
colnames(data_long)[colnames(data_long)=="1"] <- "type"
colnames(data_long)[colnames(data_long)=="3"] <- "block"

data_long$type=as.factor(data_long$type)
data_long$block=as.numeric(data_long$block)
data_long<-data_long[ , c("type","block","performance")]

head(data_long)

Create summary table for graph

library(Rmisc)
summary <- summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))
kable(summary)
type block N performance sd se ci
F_R 1 176 0.6330925 0.0702514 0.0052954 0.0104511
F_R 2 176 0.6715318 0.0821730 0.0061940 0.0122246
F_R 3 176 0.6838873 0.0875436 0.0065988 0.0130236
F_R 4 176 0.6955425 0.0970524 0.0073156 0.0144382
S_D 1 176 0.6663589 0.1279153 0.0096420 0.0190295
S_D 2 176 0.7679913 0.1367238 0.0103059 0.0203399
S_D 3 176 0.7845376 0.1227922 0.0092558 0.0182674
S_D 4 176 0.7895954 0.1296291 0.0097712 0.0192845

Graph Performance for SD and FR categories

ggplot(summary, aes(x=block, y=performance, colour=type)) + 
    geom_errorbar(aes(ymin=performance-se, ymax=performance+se),
                  width=.1) +
    geom_line() +
    geom_point()

Testing some Machine Learning code

Simulate 3 more blocks to avoid DF issues

Simulation for SD category

Only run the simulation blocks once to get designated N

rtnorm <- function(n, mean = 0, sd = 1, min = 0.65, max = 0.95) {
    bounds <- pnorm(c(min, max), mean, sd)
    u <- runif(n, bounds[1], bounds[2])
    qnorm(u, mean, sd)
}

#simulate 3 extra blocks with increasing performance
sdblock_5 <- rtnorm(176, .82, .13)
sdblock_6<- rtnorm(176, .86, .13)
sdblock_7 <- rtnorm(176, .92, .13)


sim<-cbind(sdblock_5, sdblock_6, sdblock_7)
#stack the 3 blocks and rename new cols
#stack function only work with dataframes
sim<-as.data.frame(sim)
sim<-stack(sim)
colnames(sim)[colnames(sim)=="values"] <- "performance"
colnames(sim)[colnames(sim)=="ind"] <- "block"

#concatenate a column filled with 'S_D'
type <- rep("S_D",length(sim))
sim<- cbind(type, sim)

#split by delimiter '_'
split2<-str_split_fixed(sim$block,"_", 2)
#Bind these splitted columns back to the original dataframe
sim<-cbind(split2,sim)
#drop the excessive columns 
sim<-sim[ , -which(names(sim) %in% c('1','block'))]

colnames(sim)[colnames(sim)=="2"] <- "block"

#join simulated blocks with data_long 
data_long <- rbind(data_long, sim)
data_long$block=as.numeric(data_long$block)
summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))

Simulation for FR category

frblock_5 <- rtnorm(176, .63, .082)
frblock_6<- rtnorm(176, .64, .070)
frblock_7 <- rtnorm(176, .65, .078)

#stack the 3 blocks and rename new cols
sim2<-cbind(frblock_5, frblock_6, frblock_7)
#stack function only work with dataframes
sim2<-as.data.frame(sim2)
sim2<-stack(sim2)
colnames(sim2)[colnames(sim2)=="values"] <- "performance"
colnames(sim2)[colnames(sim2)=="ind"] <- "block"

#concatenate a column filled with 'F_R'
type <- rep("F_R",length(sim2))
sim2<- cbind(type, sim2)

#split by delimiter '_'
split3<-str_split_fixed(sim2$block,"_", 2)
#Bind these splitted columns back to the original dataframe
sim2<-cbind(split3,sim2)
#drop the excessive columns 
sim2<-sim2[ , -which(names(sim2) %in% c('1','block'))]

colnames(sim2)[colnames(sim2)=="2"] <- "block"

#join simulated blocks with data_long 
data_long <- rbind(data_long, sim2)
data_long$block=as.numeric(data_long$block)

#to get an overview of the 7 blocks across the 2 types of categories
summary_7block <- summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))
kable(summary_7block)
type block N performance sd se ci
F_R 1 176 0.6330925 0.0702514 0.0052954 0.0104511
F_R 2 176 0.6715318 0.0821730 0.0061940 0.0122246
F_R 3 176 0.6838873 0.0875436 0.0065988 0.0130236
F_R 4 176 0.6955425 0.0970524 0.0073156 0.0144382
F_R 5 176 0.7083804 0.0450728 0.0033975 0.0067053
F_R 6 176 0.7007382 0.0381022 0.0028721 0.0056683
F_R 7 176 0.7122595 0.0467828 0.0035264 0.0069597
S_D 1 176 0.6663589 0.1279153 0.0096420 0.0190295
S_D 2 176 0.7679913 0.1367238 0.0103059 0.0203399
S_D 3 176 0.7845376 0.1227922 0.0092558 0.0182674
S_D 4 176 0.7895954 0.1296291 0.0097712 0.0192845
S_D 5 176 0.8059197 0.0808092 0.0060912 0.0120217
S_D 6 176 0.8114663 0.0751776 0.0056667 0.0111839
S_D 7 176 0.8356334 0.0769306 0.0057989 0.0114447

Fitting linear model for FR category

#fitting linear model
lin_reg= lm(formula= performance ~ block, 
           data= summary_7block[c(1:7),])
summary(lin_reg)
## 
## Call:
## lm(formula = performance ~ block, data = summary_7block[c(1:7), 
##     ])
## 
## Residuals:
##         1         2         3         4         5         6         7 
## -0.019069  0.007928  0.008840  0.009052  0.010447 -0.008638 -0.008560 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.640718   0.010978  58.365 2.79e-08 ***
## block       0.011443   0.002455   4.662  0.00552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01299 on 5 degrees of freedom
## Multiple R-squared:  0.813,  Adjusted R-squared:  0.7755 
## F-statistic: 21.73 on 1 and 5 DF,  p-value: 0.005524

Graph linear model

options(repr.plot.width=4, repr.plot.height=3)

ggplot()+ 
   #observed
    geom_point(aes(x= summary_7block[c(1:7),]$block, 
                   y= summary_7block[c(1:7),]$performance),
              col= 'red') +
   #model predicted
    geom_line(aes(x= summary_7block[c(1:7),]$block, 
                  y= predict(lin_reg, 
                             newdata= summary_7block[c(1:7),])),
              col= 'blue') +
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Fitting polynomial regression model for FR category

summary_7block$block2 = summary_7block$block^2
summary_7block$block3 = summary_7block$block^3
summary_7block$block4 = summary_7block$block^4

newdata<-summary_7block[c('block','block2','block3','block4','performance')]
newdata[c(1:7),]
poly_reg= lm(formula= performance ~., 
            data= newdata[c(1:7),])
summary(poly_reg)
## 
## Call:
## lm(formula = performance ~ ., data = newdata[c(1:7), ])
## 
## Residuals:
##          1          2          3          4          5          6 
## -0.0009177  0.0034872 -0.0036698 -0.0018376  0.0064262 -0.0045897 
##          7 
##  0.0011015 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  5.802e-01  4.141e-02  14.011  0.00506 **
## block        6.549e-02  6.102e-02   1.073  0.39546   
## block2      -1.266e-02  2.833e-02  -0.447  0.69860   
## block3       9.687e-04  5.161e-03   0.188  0.86844   
## block4      -1.633e-05  3.213e-04  -0.051  0.96408   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.006835 on 2 degrees of freedom
## Multiple R-squared:  0.9793, Adjusted R-squared:  0.9379 
## F-statistic: 23.64 on 4 and 2 DF,  p-value: 0.041

Graph polynomial model for FR category

options(repr.plot.width=4, repr.plot.height=3)

ggplot()+ 
    geom_point(aes(x= newdata[c(1:7),]$block, 
                   y= newdata[c(1:7),]$performance),
              col= 'red') +
    geom_line(aes(x= newdata[c(1:7),]$block,
                  y= predict(poly_reg, newdata= newdata[c(1:7),])),
              col= 'blue') +
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Graph polynomial model with smoother curve(smaller predictor interval)

x_grid= seq(min(newdata[c(1:7),]$block), max(newdata[c(1:7),]$block), 0.1)

df2=data.frame(block= x_grid,
             block2= x_grid^2,
             block3 = x_grid^3,
             block4 = x_grid^4)

ggplot()+ 
    geom_point(aes(x= newdata[c(1:7),]$block, 
                   y= newdata[c(1:7),]$performance),
              col= 'red') +
              geom_line(aes(x= x_grid,
                  y= predict(poly_reg,
                             newdata=df2)), col= 'blue')+
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Create csv file with simulated FR and SD blocks

#this is a separate task from the ones above
#certainly that simulation can be done so easily
#here we use the above simulated column data to create a df for SD 
add_sd<-cbind(sdblock_5, sdblock_6, sdblock_7)
colnames(add_sd)[colnames(add_sd)=="sdblock_5"] <- "SD5"
colnames(add_sd)[colnames(add_sd)=="sdblock_6"] <- "SD6"
colnames(add_sd)[colnames(add_sd)=="sdblock_7"] <- "SD7"

#here we use the above simulated column data to create a df for FR 
add_fr<-cbind(frblock_5, frblock_6, frblock_7)
colnames(add_fr)[colnames(add_fr)=="frblock_5"] <- "FR5"
colnames(add_fr)[colnames(add_fr)=="frblock_6"] <- "FR6"
colnames(add_fr)[colnames(add_fr)=="frblock_7"] <- "FR7"

#bind SD and FR dfs
sd_fr<-cbind(add_sd,add_fr)

#append simulated df to original dataset
df_7block<-cbind(sd_fr,df)
write.csv(df_7block, 'data_7block.csv')