# merge tables
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
#write.xlsx(depression, file="d.xlsx", sheetName="Sheet1")

# Read the input table; the first line of d.csv supplies the column names.
t5 <- read.csv("d.csv", header = TRUE)

# Derive the three-level label `b` from the `sum` column:
#   sum < 37        -> "one"
#   37 <= sum < 46  -> "two"
#   sum >= 46       -> "three"
# Rows whose `sum` is NA match none of the masks and keep an NA label.
total_score <- t5$sum
t5$b[total_score < 37] <- "one"
t5$b[total_score >= 37 & total_score < 46] <- "two"
t5$b[total_score >= 46] <- "three"

# Column positions and the row index used by the normalisation below.
feature_cols <- 10:73
reference_row <- 670

# Divide every feature column by its own value in the reference row, then
# remove that row so it does not take part in scaling or model fitting.
# NOTE(review): row 670 looks like a per-column reference/total row - confirm
# against d.csv; the index is hard-coded.
t5[feature_cols] <- lapply(
  t5[feature_cols],
  function(col) col / col[reference_row]
)
t5 <- t5[-reference_row, ]

# Centre and scale (scale() defaults) the feature columns and columns 83-85.
t5[, feature_cols] <- scale(t5[, feature_cols])
t5[, 83:85] <- scale(t5[, 83:85])

# Training step 1: separate label "three" from everything else.
trainset1 <- t5

# Conditional random forest on the logical response `b == "three"`.
# Each predictor is listed once (the original formula repeated relativ, incl
# and certain; the repeats are dropped, keeping first-occurrence order).
# NOTE(review): no set.seed() precedes this call, so the fitted forest and the
# accuracy reported at the end of the script vary from run to run.
fit1 <- cforest(
  (b == "three") ~ relativ + past + time + certain + cogmech + negemo +
    negate + funct + ppron + pronoun + incl + social + tentat + leisure +
    sad + they + bio + affect + body + excl + feel + filler + humans +
    age + s_all + s_sad + gender,
  data = trainset1,
  controls = cforest_unbiased(ntree = 500, mtry = 1)
)

# Out-of-bag predictions, thresholded at 0.5, against the true indicator.
# Both margins are factors with fixed FALSE/TRUE levels so the table is always
# 2 x 2 (rows = predicted, columns = actual). Without this, table() drops a
# level that never occurs and the later table1[2, 2] lookup fails with
# "subscript out of bounds".
pred_is_three <- as.vector(
  predict(fit1, OOB = TRUE, type = "response") > 0.5
)
table1 <- table(
  factor(pred_is_three, levels = c(FALSE, TRUE)),
  factor(trainset1$b == "three", levels = c(FALSE, TRUE))
)

# Training step 2: among the rows that are not "three", separate "two" from
# "one".

# Keep only rows with a known label other than "three". The explicit is.na()
# guard matters: indexing with `b != "three"` alone turns every NA label into
# an all-NA row in the subset.
is_nonthree <- !is.na(trainset1$b) & trainset1$b != "three"
nonthree <- trainset1[is_nonthree, ]

# Recode the label as a logical response: TRUE = "two", FALSE = "one".
nonthree$b <- nonthree$b == "two"

# Each predictor is listed once (the original formula repeated space, they
# and we; the repeats are dropped, keeping first-occurrence order).
# NOTE(review): as in step 1, no set.seed() precedes this call.
fit2 <- cforest(
  b ~ incl + leisure + negemo + posemo + social + they + humans + percept +
    sexual + space + family + body + affect + motion + negate + sad +
    tentat + conj + ipron + we + article + auxverb + funct + i + ppron +
    preps + pronoun + shehe + you + future + time +
    age + s_all + s_sad + gender,
  data = nonthree,
  controls = cforest_unbiased(ntree = 500, mtry = 1)
)

# The cut-off here is stricter than the 0.5 used in step 1: only rows whose
# out-of-bag predicted response exceeds 0.7 are classified as "two".
# (Author's note on class sizes: two = 521, one = 121.)
# The margins are factors with fixed FALSE/TRUE levels so the table is always
# 2 x 2 (rows = predicted, columns = actual) and table2[1, 1] / table2[2, 2]
# can be indexed safely.
pred_is_two <- as.vector(
  predict(fit2, OOB = TRUE, type = "response") > 0.7
)
table2 <- table(
  factor(pred_is_two, levels = c(FALSE, TRUE)),
  factor(nonthree$b, levels = c(FALSE, TRUE))
)




# the equation for the test set accuracy is then:
#total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100 

# Overall training accuracy in percent: correctly identified "three" rows from
# step 1 plus both correct cells of the step-2 table, over all training rows.
# NOTE(review): table2 is built from every row whose true label is not
# "three", regardless of what step 1 predicted for it, so rows that step 1
# wrongly flags as "three" can still be counted as correct here.
stage1_hits <- table1[2, 2]
stage2_hits <- table2[1, 1] + table2[2, 2]
n_train <- nrow(trainset1)
total_percentage_train <- (stage1_hits + stage2_hits) / n_train * 100
total_percentage_train
## [1] 60.98655