library(dplyr)
library(lavaan)
library(semPlot)
library(data.table)
library(rmarkdown)
library(knitr)

Data pre-processing

Load data

#reload data so that the category blocks are not standardized
setwd('/Users/tzhu9/Google Drive/ExecFcnBattery/Analysis')
df<-read.csv('overall.badremoved.chanceremoved.DATA.csv')
df<-select(df, 
             "Subj", 
             "Antisaccade", 
             #"Stroop", 
             "StroopPerf", 
             "SSRTavg", 
             "LetMem", 
             "KeepTrack", 
             "SpatialBack", 
             "NumLett", 
             #"NumLetPerf", 
             "ColShape", 
             #"ColShapePerf", 
             "CatSwitch", 
             #"CatSwitchPerf", 
             "FR1", "FR2", "FR3", "FR4", 
             "SD1", "SD2", "SD3", "SD4")

Data reshaping and manipulation

library(ggplot2)
library(tidyr)

#only subsetting the 2 types of category performance across 4 blocks
testset<-df[11:18]

#fill NA with column mean
library(zoo)
testset<-na.aggregate(testset)

#transform data from wide to long format
data_long <- gather(testset, category, performance, FR1:SD4,
                    factor_key=TRUE)

#Split column 'category' names into 3 columns 
library(stringr)
split<-str_split_fixed(data_long$category,"", 3)
#Bind these splitted columns back to the original dataframe
data_long<-cbind(split,data_long)

#Combine first 2 splitted columns to form a new column 
#in order to isolate the digit to form 'block' variable
data_long<-unite(data_long, type, c('1', '2'), remove=T)
head(data_long)

Renaming and convert data type

#rename columns
colnames(data_long)[colnames(data_long)=="1"] <- "type"
colnames(data_long)[colnames(data_long)=="3"] <- "block"

data_long$type=as.factor(data_long$type)
data_long$block=as.numeric(data_long$block)
data_long<-data_long[ , c("type","block","performance")]

head(data_long)

Create summary table for graph

library(Rmisc)
summary <- summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))
kable(summary)

type	block	N	performance	sd	se	ci
F_R	1	176	0.6330925	0.0702514	0.0052954	0.0104511
F_R	2	176	0.6715318	0.0821730	0.0061940	0.0122246
F_R	3	176	0.6838873	0.0875436	0.0065988	0.0130236
F_R	4	176	0.6955425	0.0970524	0.0073156	0.0144382
S_D	1	176	0.6663589	0.1279153	0.0096420	0.0190295
S_D	2	176	0.7679913	0.1367238	0.0103059	0.0203399
S_D	3	176	0.7845376	0.1227922	0.0092558	0.0182674
S_D	4	176	0.7895954	0.1296291	0.0097712	0.0192845

Graph Performance for SD and FR categories

ggplot(summary, aes(x=block, y=performance, colour=type)) + 
    geom_errorbar(aes(ymin=performance-se, ymax=performance+se),
                  width=.1) +
    geom_line() +
    geom_point()

Testing some Machine Learning code

Simulate 3 more blocks to avoid DF issues

Simulation for SD category

Only run the simulation blocks once to get designated N

rtnorm <- function(n, mean = 0, sd = 1, min = 0.65, max = 0.95) {
    bounds <- pnorm(c(min, max), mean, sd)
    u <- runif(n, bounds[1], bounds[2])
    qnorm(u, mean, sd)
}

#simulate 3 extra blocks with increasing performance
sdblock_5 <- rtnorm(176, .82, .13)
sdblock_6<- rtnorm(176, .86, .13)
sdblock_7 <- rtnorm(176, .92, .13)


sim<-cbind(sdblock_5, sdblock_6, sdblock_7)
#stack the 3 blocks and rename new cols
#stack function only work with dataframes
sim<-as.data.frame(sim)
sim<-stack(sim)
colnames(sim)[colnames(sim)=="values"] <- "performance"
colnames(sim)[colnames(sim)=="ind"] <- "block"

#concatenate a column filled with 'S_D'
type <- rep("S_D",length(sim))
sim<- cbind(type, sim)

#split by delimiter '_'
split2<-str_split_fixed(sim$block,"_", 2)
#Bind these splitted columns back to the original dataframe
sim<-cbind(split2,sim)
#drop the excessive columns 
sim<-sim[ , -which(names(sim) %in% c('1','block'))]

colnames(sim)[colnames(sim)=="2"] <- "block"

#join simulated blocks with data_long 
data_long <- rbind(data_long, sim)
data_long$block=as.numeric(data_long$block)
summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))

Simulation for FR category

frblock_5 <- rtnorm(176, .63, .082)
frblock_6<- rtnorm(176, .64, .070)
frblock_7 <- rtnorm(176, .65, .078)

#stack the 3 blocks and rename new cols
sim2<-cbind(frblock_5, frblock_6, frblock_7)
#stack function only work with dataframes
sim2<-as.data.frame(sim2)
sim2<-stack(sim2)
colnames(sim2)[colnames(sim2)=="values"] <- "performance"
colnames(sim2)[colnames(sim2)=="ind"] <- "block"

#concatenate a column filled with 'F_R'
type <- rep("F_R",length(sim2))
sim2<- cbind(type, sim2)

#split by delimiter '_'
split3<-str_split_fixed(sim2$block,"_", 2)
#Bind these splitted columns back to the original dataframe
sim2<-cbind(split3,sim2)
#drop the excessive columns 
sim2<-sim2[ , -which(names(sim2) %in% c('1','block'))]

colnames(sim2)[colnames(sim2)=="2"] <- "block"

#join simulated blocks with data_long 
data_long <- rbind(data_long, sim2)
data_long$block=as.numeric(data_long$block)

#to get an overview of the 7 blocks across the 2 types of categories
summary_7block <- summarySE(data_long, 
                     measurevar="performance",
                     groupvars=c("type","block"))
kable(summary_7block)

type	block	N	performance	sd	se	ci
F_R	1	176	0.6330925	0.0702514	0.0052954	0.0104511
F_R	2	176	0.6715318	0.0821730	0.0061940	0.0122246
F_R	3	176	0.6838873	0.0875436	0.0065988	0.0130236
F_R	4	176	0.6955425	0.0970524	0.0073156	0.0144382
F_R	5	176	0.7083804	0.0450728	0.0033975	0.0067053
F_R	6	176	0.7007382	0.0381022	0.0028721	0.0056683
F_R	7	176	0.7122595	0.0467828	0.0035264	0.0069597
S_D	1	176	0.6663589	0.1279153	0.0096420	0.0190295
S_D	2	176	0.7679913	0.1367238	0.0103059	0.0203399
S_D	3	176	0.7845376	0.1227922	0.0092558	0.0182674
S_D	4	176	0.7895954	0.1296291	0.0097712	0.0192845
S_D	5	176	0.8059197	0.0808092	0.0060912	0.0120217
S_D	6	176	0.8114663	0.0751776	0.0056667	0.0111839
S_D	7	176	0.8356334	0.0769306	0.0057989	0.0114447

Fitting linear model for FR category

#fitting linear model
lin_reg= lm(formula= performance ~ block, 
           data= summary_7block[c(1:7),])
summary(lin_reg)

## 
## Call:
## lm(formula = performance ~ block, data = summary_7block[c(1:7), 
##     ])
## 
## Residuals:
##         1         2         3         4         5         6         7 
## -0.019069  0.007928  0.008840  0.009052  0.010447 -0.008638 -0.008560 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.640718   0.010978  58.365 2.79e-08 ***
## block       0.011443   0.002455   4.662  0.00552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01299 on 5 degrees of freedom
## Multiple R-squared:  0.813,  Adjusted R-squared:  0.7755 
## F-statistic: 21.73 on 1 and 5 DF,  p-value: 0.005524

Graph linear model

options(repr.plot.width=4, repr.plot.height=3)

ggplot()+ 
   #observed
    geom_point(aes(x= summary_7block[c(1:7),]$block, 
                   y= summary_7block[c(1:7),]$performance),
              col= 'red') +
   #model predicted
    geom_line(aes(x= summary_7block[c(1:7),]$block, 
                  y= predict(lin_reg, 
                             newdata= summary_7block[c(1:7),])),
              col= 'blue') +
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Fitting polynomial regression model for FR category

summary_7block$block2 = summary_7block$block^2
summary_7block$block3 = summary_7block$block^3
summary_7block$block4 = summary_7block$block^4

newdata<-summary_7block[c('block','block2','block3','block4','performance')]
newdata[c(1:7),]

poly_reg= lm(formula= performance ~., 
            data= newdata[c(1:7),])
summary(poly_reg)

## 
## Call:
## lm(formula = performance ~ ., data = newdata[c(1:7), ])
## 
## Residuals:
##          1          2          3          4          5          6 
## -0.0009177  0.0034872 -0.0036698 -0.0018376  0.0064262 -0.0045897 
##          7 
##  0.0011015 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  5.802e-01  4.141e-02  14.011  0.00506 **
## block        6.549e-02  6.102e-02   1.073  0.39546   
## block2      -1.266e-02  2.833e-02  -0.447  0.69860   
## block3       9.687e-04  5.161e-03   0.188  0.86844   
## block4      -1.633e-05  3.213e-04  -0.051  0.96408   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.006835 on 2 degrees of freedom
## Multiple R-squared:  0.9793, Adjusted R-squared:  0.9379 
## F-statistic: 23.64 on 4 and 2 DF,  p-value: 0.041

Graph polynomial model for FR category

options(repr.plot.width=4, repr.plot.height=3)

ggplot()+ 
    geom_point(aes(x= newdata[c(1:7),]$block, 
                   y= newdata[c(1:7),]$performance),
              col= 'red') +
    geom_line(aes(x= newdata[c(1:7),]$block,
                  y= predict(poly_reg, newdata= newdata[c(1:7),])),
              col= 'blue') +
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Graph polynomial model with smoother curve(smaller predictor interval)

x_grid= seq(min(newdata[c(1:7),]$block), max(newdata[c(1:7),]$block), 0.1)

df2=data.frame(block= x_grid,
             block2= x_grid^2,
             block3 = x_grid^3,
             block4 = x_grid^4)

ggplot()+ 
    geom_point(aes(x= newdata[c(1:7),]$block, 
                   y= newdata[c(1:7),]$performance),
              col= 'red') +
              geom_line(aes(x= x_grid,
                  y= predict(poly_reg,
                             newdata=df2)), col= 'blue')+
    ggtitle('Learning Curve') +
    xlab('Time') +
    ylab('Performance')

Create csv file with simulated FR and SD blocks

#this is a separate task from the ones above
#certainly that simulation can be done so easily
#here we use the above simulated column data to create a df for SD 
add_sd<-cbind(sdblock_5, sdblock_6, sdblock_7)
colnames(add_sd)[colnames(add_sd)=="sdblock_5"] <- "SD5"
colnames(add_sd)[colnames(add_sd)=="sdblock_6"] <- "SD6"
colnames(add_sd)[colnames(add_sd)=="sdblock_7"] <- "SD7"

#here we use the above simulated column data to create a df for FR 
add_fr<-cbind(frblock_5, frblock_6, frblock_7)
colnames(add_fr)[colnames(add_fr)=="frblock_5"] <- "FR5"
colnames(add_fr)[colnames(add_fr)=="frblock_6"] <- "FR6"
colnames(add_fr)[colnames(add_fr)=="frblock_7"] <- "FR7"

#bind SD and FR dfs
sd_fr<-cbind(add_sd,add_fr)

#append simulated df to original dataset
df_7block<-cbind(sd_fr,df)
write.csv(df_7block, 'data_7block.csv')

Data Manipulation and Simulation

Data pre-processing

Load data

Data reshaping and manipulation

Renaming and convert data type

Create summary table for graph

Graph Performance for SD and FR categories

Testing some Machine Learning code

Simulate 3 more blocks to avoid DF issues

Simulation for SD category

Simulation for FR category

Fitting linear model for FR category

Graph linear model

Fitting polynomial regression model for FR category

Graph polynomial model for FR category

Graph polynomial model with smoother curve(smaller predictor interval)

Create csv file with simulated FR and SD blocks