Data pre-processing
Load data
#reload data so that the category blocks are not standardized
setwd('/Users/tzhu9/Google Drive/ExecFcnBattery/Analysis')
df<-read.csv('overall.badremoved.chanceremoved.DATA.csv')
library(dplyr)
df<-select(df,
"Subj",
"Antisaccade",
#"Stroop",
"StroopPerf",
"SSRTavg",
"LetMem",
"KeepTrack",
"SpatialBack",
"NumLett",
#"NumLetPerf",
"ColShape",
#"ColShapePerf",
"CatSwitch",
#"CatSwitchPerf",
"FR1", "FR2", "FR3", "FR4",
"SD1", "SD2", "SD3", "SD4")
Data reshaping and manipulation
library(ggplot2)
library(tidyr)
#subset only the 2 category-learning performance measures (FR and SD) across the 4 blocks
testset<-df[ , c(paste0("FR",1:4), paste0("SD",1:4))]
#fill NA with column mean
library(zoo)
testset<-na.aggregate(testset)
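As a quick check (illustrative, not part of the original pipeline), we can confirm that no missing values remain after the column-mean imputation:
#illustrative: every column should report zero NAs after na.aggregate()
colSums(is.na(testset))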
#transform data from wide to long format
data_long <- gather(testset, category, performance, FR1:SD4,
factor_key=TRUE)
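gather() still works but is superseded in current tidyr; an equivalent reshape with pivot_longer() would look like the sketch below (assuming tidyr >= 1.0; data_long2 is just an illustrative name, and category comes back as character rather than factor):
#not run: newer tidyr interface for the same wide-to-long reshape
data_long2 <- pivot_longer(testset, cols = FR1:SD4,
                           names_to = "category",
                           values_to = "performance")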
#split the 'category' labels (e.g. "FR1") into 3 single-character columns
library(stringr)
split<-str_split_fixed(data_long$category,"", 3)
#bind the split columns back onto the long data frame
data_long<-cbind(split,data_long)
#combine the first 2 split columns into a single 'type' column,
#leaving the digit column to become the 'block' variable
data_long<-unite(data_long, type, c('1', '2'), remove=T)
head(data_long)
Rename columns and convert data types
#rename the leftover digit column; unite() above already created the 'type' column
colnames(data_long)[colnames(data_long)=="3"] <- "block"
data_long$type=as.factor(data_long$type)
data_long$block=as.numeric(data_long$block)
data_long<-data_long[ , c("type","block","performance")]
head(data_long)
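The split/unite/rename sequence above can also be collapsed into a single tidyr::extract() call; a sketch (not run; it yields type labels "FR"/"SD" rather than "F_R"/"S_D", and data_long_alt is an illustrative name):
#not run: one-step alternative to the split/unite/rename sequence
data_long_alt <- extract(gather(testset, category, performance, FR1:SD4),
                         category, into = c("type", "block"),
                         regex = "([A-Z]+)([0-9])", convert = TRUE)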
Create summary table for graph
library(Rmisc)
summary <- summarySE(data_long,
measurevar="performance",
groupvars=c("type","block"))
library(knitr)
kable(summary)
| type | block |   N | performance |        sd |        se |        ci |
|:-----|------:|----:|------------:|----------:|----------:|----------:|
| F_R  |     1 | 176 |   0.6330925 | 0.0702514 | 0.0052954 | 0.0104511 |
| F_R  |     2 | 176 |   0.6715318 | 0.0821730 | 0.0061940 | 0.0122246 |
| F_R  |     3 | 176 |   0.6838873 | 0.0875436 | 0.0065988 | 0.0130236 |
| F_R  |     4 | 176 |   0.6955425 | 0.0970524 | 0.0073156 | 0.0144382 |
| S_D  |     1 | 176 |   0.6663589 | 0.1279153 | 0.0096420 | 0.0190295 |
| S_D  |     2 | 176 |   0.7679913 | 0.1367238 | 0.0103059 | 0.0203399 |
| S_D  |     3 | 176 |   0.7845376 | 0.1227922 | 0.0092558 | 0.0182674 |
| S_D  |     4 | 176 |   0.7895954 | 0.1296291 | 0.0097712 | 0.0192845 |
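summarySE() comes from Rmisc; the same table (N, group mean, sd, se, and the 95% CI half-width) can be reproduced with dplyr if Rmisc is unavailable. A sketch, assuming dplyr >= 1.0 (the dplyr:: prefixes guard against masking by plyr, which Rmisc loads):
#illustrative dplyr equivalent of Rmisc::summarySE()
data_long %>%
  dplyr::group_by(type, block) %>%
  dplyr::summarise(N = dplyr::n(),
                   sd = sd(performance),
                   performance = mean(performance),
                   se = sd / sqrt(N),
                   ci = se * qt(0.975, N - 1),
                   .groups = "drop")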
Testing some Machine Learning code
Simulate 3 more blocks to avoid degrees-of-freedom issues
Simulation for SD category
Run the simulation chunks below only once; re-running them would rbind() the simulated rows onto data_long again and inflate N beyond the designated 176 per block
rtnorm <- function(n, mean = 0, sd = 1, min = 0.65, max = 0.95) {
bounds <- pnorm(c(min, max), mean, sd)
u <- runif(n, bounds[1], bounds[2])
qnorm(u, mean, sd)
}
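A quick sanity check (illustrative; the seed and values are arbitrary) that the inverse-CDF truncation keeps draws inside the requested bounds:
set.seed(1)                               #arbitrary seed, only for this check
x <- rtnorm(10000, mean = 0.8, sd = 0.13)
range(x); mean(x)                         #range should stay within [0.65, 0.95]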
#simulate 3 extra blocks with increasing performance
sdblock_5 <- rtnorm(176, .82, .13)
sdblock_6<- rtnorm(176, .86, .13)
sdblock_7 <- rtnorm(176, .92, .13)
sim<-cbind(sdblock_5, sdblock_6, sdblock_7)
#stack the 3 blocks and rename new cols
#stack() only works with data frames
sim<-as.data.frame(sim)
sim<-stack(sim)
colnames(sim)[colnames(sim)=="values"] <- "performance"
colnames(sim)[colnames(sim)=="ind"] <- "block"
#add a 'type' column filled with 'S_D' (one entry per row)
type <- rep("S_D", nrow(sim))
sim<- cbind(type, sim)
#split by delimiter '_'
split2<-str_split_fixed(sim$block,"_", 2)
#bind the split columns back onto the simulated data frame
sim<-cbind(split2,sim)
#drop the redundant columns
sim<-sim[ , -which(names(sim) %in% c('1','block'))]
colnames(sim)[colnames(sim)=="2"] <- "block"
#join simulated blocks with data_long
data_long <- rbind(data_long, sim)
data_long$block=as.numeric(as.character(data_long$block)) #as.character guards against factor levels
summarySE(data_long,
measurevar="performance",
groupvars=c("type","block"))
Simulation for FR category
frblock_5 <- rtnorm(176, .63, .082)
frblock_6<- rtnorm(176, .64, .070)
frblock_7 <- rtnorm(176, .65, .078)
#stack the 3 blocks and rename new cols
sim2<-cbind(frblock_5, frblock_6, frblock_7)
#stack() only works with data frames
sim2<-as.data.frame(sim2)
sim2<-stack(sim2)
colnames(sim2)[colnames(sim2)=="values"] <- "performance"
colnames(sim2)[colnames(sim2)=="ind"] <- "block"
#add a 'type' column filled with 'F_R' (one entry per row)
type <- rep("F_R", nrow(sim2))
sim2<- cbind(type, sim2)
#split by delimiter '_'
split3<-str_split_fixed(sim2$block,"_", 2)
#bind the split columns back onto the simulated data frame
sim2<-cbind(split3,sim2)
#drop the redundant columns
sim2<-sim2[ , -which(names(sim2) %in% c('1','block'))]
colnames(sim2)[colnames(sim2)=="2"] <- "block"
#join simulated blocks with data_long
data_long <- rbind(data_long, sim2)
data_long$block=as.numeric(as.character(data_long$block)) #as.character guards against factor levels
#to get an overview of the 7 blocks across the 2 types of categories
summary_7block <- summarySE(data_long,
measurevar="performance",
groupvars=c("type","block"))
kable(summary_7block)
| type | block |   N | performance |        sd |        se |        ci |
|:-----|------:|----:|------------:|----------:|----------:|----------:|
| F_R  |     1 | 176 |   0.6330925 | 0.0702514 | 0.0052954 | 0.0104511 |
| F_R  |     2 | 176 |   0.6715318 | 0.0821730 | 0.0061940 | 0.0122246 |
| F_R  |     3 | 176 |   0.6838873 | 0.0875436 | 0.0065988 | 0.0130236 |
| F_R  |     4 | 176 |   0.6955425 | 0.0970524 | 0.0073156 | 0.0144382 |
| F_R  |     5 | 176 |   0.7083804 | 0.0450728 | 0.0033975 | 0.0067053 |
| F_R  |     6 | 176 |   0.7007382 | 0.0381022 | 0.0028721 | 0.0056683 |
| F_R  |     7 | 176 |   0.7122595 | 0.0467828 | 0.0035264 | 0.0069597 |
| S_D  |     1 | 176 |   0.6663589 | 0.1279153 | 0.0096420 | 0.0190295 |
| S_D  |     2 | 176 |   0.7679913 | 0.1367238 | 0.0103059 | 0.0203399 |
| S_D  |     3 | 176 |   0.7845376 | 0.1227922 | 0.0092558 | 0.0182674 |
| S_D  |     4 | 176 |   0.7895954 | 0.1296291 | 0.0097712 | 0.0192845 |
| S_D  |     5 | 176 |   0.8059197 | 0.0808092 | 0.0060912 | 0.0120217 |
| S_D  |     6 | 176 |   0.8114663 | 0.0751776 | 0.0056667 | 0.0111839 |
| S_D  |     7 | 176 |   0.8356334 | 0.0769306 | 0.0057989 | 0.0114447 |
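For a visual overview of both categories across the seven blocks, the summary table can be plotted directly (an illustrative sketch using the ci column returned by summarySE):
#illustrative: learning curves for both categories with 95% CI error bars
ggplot(summary_7block, aes(x = block, y = performance, colour = type)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = performance - ci, ymax = performance + ci),
                width = 0.2) +
  ggtitle('Learning Curves by Category') +
  xlab('Block') +
  ylab('Performance')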
Fitting linear model for FR category
#fit a linear model to the 7 FR block means (rows 1-7 of the summary table)
lin_reg= lm(formula= performance ~ block,
data= summary_7block[c(1:7),])
summary(lin_reg)
##
## Call:
## lm(formula = performance ~ block, data = summary_7block[c(1:7),
## ])
##
## Residuals:
## 1 2 3 4 5 6 7
## -0.019069 0.007928 0.008840 0.009052 0.010447 -0.008638 -0.008560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.640718 0.010978 58.365 2.79e-08 ***
## block 0.011443 0.002455 4.662 0.00552 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01299 on 5 degrees of freedom
## Multiple R-squared: 0.813, Adjusted R-squared: 0.7755
## F-statistic: 21.73 on 1 and 5 DF, p-value: 0.005524
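The fitted slope implies roughly a 0.011 gain in performance per block for the FR category. For reference, the coefficients and a hypothetical one-block-ahead prediction (block 8 is purely an extrapolation for illustration) can be pulled out directly:
coef(lin_reg)                                     #intercept and per-block slope
predict(lin_reg, newdata = data.frame(block = 8)) #hypothetical extrapolation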
Graph linear model
options(repr.plot.width=4, repr.plot.height=3)
ggplot()+
#observed
geom_point(aes(x= summary_7block[c(1:7),]$block,
y= summary_7block[c(1:7),]$performance),
col= 'red') +
#model predicted
geom_line(aes(x= summary_7block[c(1:7),]$block,
y= predict(lin_reg,
newdata= summary_7block[c(1:7),])),
col= 'blue') +
ggtitle('Learning Curve') +
xlab('Time') +
ylab('Performance')
(plot: Learning Curve; observed FR block means in red, linear fit in blue)
Fitting polynomial regression model for FR category
summary_7block$block2 = summary_7block$block^2
summary_7block$block3 = summary_7block$block^3
summary_7block$block4 = summary_7block$block^4
newdata<-summary_7block[c('block','block2','block3','block4','performance')]
newdata[c(1:7),]
poly_reg= lm(formula= performance ~.,
data= newdata[c(1:7),])
summary(poly_reg)
##
## Call:
## lm(formula = performance ~ ., data = newdata[c(1:7), ])
##
## Residuals:
## 1 2 3 4 5 6
## -0.0009177 0.0034872 -0.0036698 -0.0018376 0.0064262 -0.0045897
## 7
## 0.0011015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.802e-01 4.141e-02 14.011 0.00506 **
## block 6.549e-02 6.102e-02 1.073 0.39546
## block2 -1.266e-02 2.833e-02 -0.447 0.69860
## block3 9.687e-04 5.161e-03 0.188 0.86844
## block4 -1.633e-05 3.213e-04 -0.051 0.96408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.006835 on 2 degrees of freedom
## Multiple R-squared: 0.9793, Adjusted R-squared: 0.9379
## F-statistic: 23.64 on 4 and 2 DF, p-value: 0.041
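Both models were fit to the same seven FR block means and the linear model is nested in the quartic one, so an F-test (and AIC) comparison shows whether the extra polynomial terms are worth their degrees of freedom. A minimal sketch:
#compare the nested linear and quartic fits on the 7 FR block means
anova(lin_reg, poly_reg)
AIC(lin_reg, poly_reg)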
Graph polynomial model for FR category
options(repr.plot.width=4, repr.plot.height=3)
ggplot()+
geom_point(aes(x= newdata[c(1:7),]$block,
y= newdata[c(1:7),]$performance),
col= 'red') +
geom_line(aes(x= newdata[c(1:7),]$block,
y= predict(poly_reg, newdata= newdata[c(1:7),])),
col= 'blue') +
ggtitle('Learning Curve') +
xlab('Time') +
ylab('Performance')
(plot: Learning Curve; observed FR block means in red, quartic polynomial fit in blue)
Graph polynomial model with a smoother curve (finer prediction grid)
x_grid= seq(min(newdata[c(1:7),]$block), max(newdata[c(1:7),]$block), 0.1)
df2=data.frame(block= x_grid,
block2= x_grid^2,
block3 = x_grid^3,
block4 = x_grid^4)
ggplot()+
geom_point(aes(x= newdata[c(1:7),]$block,
y= newdata[c(1:7),]$performance),
col= 'red') +
geom_line(aes(x= x_grid,
y= predict(poly_reg,
newdata=df2)), col= 'blue')+
ggtitle('Learning Curve') +
xlab('Time') +
ylab('Performance')
(plot: Learning Curve; observed FR block means in red, quartic fit evaluated on the fine grid in blue)
Create csv file with simulated FR and SD blocks
#this is a separate task from the ones above
#the simulation itself could certainly be done more concisely;
#here we use the above simulated column data to create a df for SD
add_sd<-cbind(sdblock_5, sdblock_6, sdblock_7)
colnames(add_sd)<-c("SD5","SD6","SD7")
#here we use the above simulated column data to create a df for FR
add_fr<-cbind(frblock_5, frblock_6, frblock_7)
colnames(add_fr)<-c("FR5","FR6","FR7")
#bind SD and FR dfs
sd_fr<-cbind(add_sd,add_fr)
#append the simulated blocks to the original dataset
df_7block<-cbind(df,sd_fr)
write.csv(df_7block, 'data_7block.csv', row.names=FALSE)
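As a final check (illustrative), the exported file can be read back to confirm that all seven FR and SD blocks are present:
#illustrative round-trip check of the exported file
check <- read.csv('data_7block.csv')
str(check[ , c(paste0("FR", 1:7), paste0("SD", 1:7))])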