require(party)
## Loading required package: party
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
# merge tables
liwc <- read.csv("liwc.csv", header = TRUE)
smi <- read.csv("smileys2.csv", header = TRUE)

# total smiley count across all smile columns (simle12 / simle13 are misspelled column
# names that exist in the data and are included here as well)
smi$s_all <- rowSums(smi[, c(paste0("smile", 1:28), "simle12", "simle13")])

# total sad-smiley count (sad3 is not included in this sum)
smi$s_sad <- rowSums(smi[, paste0("sad", c(1:2, 4:18))])

smi$s_kiss <- smi$kiss+smi$kiss.1+smi$kiss.2

smi <- smi[, !(colnames(smi) %in% c(paste0("g.", c(1:50, 52:88)), "gg", "e", "g", "X..",
                                    "kiss", "kiss.1", "kiss.2"))]

smi <- smi[, !(colnames(smi) %in% c(paste0("smile", 1:28), "simle12", "simle13"))]

smi <- smi[, !(colnames(smi) %in% paste0("sad", 1:18))]



s_merged <- merge(liwc, smi, by.x="userid", by.y="userid", all.x=TRUE)
s_merged$X <- NULL
p1 <- s_merged[complete.cases(s_merged),]
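# Purely diagnostic sketch: count how many merged rows complete.cases() drops above.
# Note that the 6000-row sample drawn below is unseeded, so it is not reproducible.
sum(!complete.cases(s_merged))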
p2 <- p1[sample(nrow(p1), 6000), ]

# Write the sampled data to Excel; the feature weights are then added by hand as the last
# row (row 6001) of p2.csv, which is read back in below and used to rescale columns 2:65.

require(xlsx)  # write.xlsx() comes from the xlsx package
write.xlsx(p2, file = "p2.xlsx", sheetName = "Sheet1")
t5 <- read.csv("p2.csv", header = TRUE)


# divide each feature column by the weight stored in the extra last row (row 6001)
t5[2:65] <- data.frame(lapply(t5[2:65], function(X) X/X[6001]))
## Warning in Ops.factor(X, X[6001]): '/' not meaningful for factors
t5 <- t5[-6001, ]                  # drop the weights row
t5[, 2:65] <- scale(t5[, 2:65])    # standardise the feature columns
t5[, 75:77] <- scale(t5[, 75:77])
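# Diagnostic sketch for the warning above: some of columns 2:65 were read in as factors, so
# the division turned them into NA. This counts how many columns were affected (assumption:
# an all-NA column at this point stems from that warning).
sum(sapply(t5[2:65], function(x) all(is.na(x))))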

t5$b[t5$neu < 2.7 ] <- "one"  
t5$b[t5$neu >=  2.7 & t5$neu < 3.5 ] <- "two"  
t5$b[t5$neu >= 3.5] <- "three" 
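# Equivalent binning in a single call with cut(); right = FALSE reproduces the ">=" cutpoints
# used above, and as.character() keeps b as character rather than factor (a minimal sketch,
# assuming neu has no missing values, which complete.cases() above should guarantee):
t5$b <- as.character(cut(t5$neu, breaks = c(-Inf, 2.7, 3.5, Inf),
                         labels = c("one", "two", "three"), right = FALSE))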

#length(t5$b[t5$b == "three"])  # one is the biggest group

ind = sample(2, nrow(t5), replace = TRUE, prob=c(0.7, 0.3))
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]

fit1 <- cforest((b == 'one')~ affect + article + assent + bio + body + cause + certain + relig
                 +cogmech + death + discrep + excl + family + feel + filler + friend + home +anger
                 +humans + i + incl + ingest + insight + leisure + money + motion + negate + negemo
                 +percept + posemo + ppron + quant + relativ + sad + sexual + auxverb + conj + funct 
                 +ipron + preps+ pronoun + shehe+ they + age + s_all + s_sad + s_kiss + gender + past + time
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'one') 
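# Quick check (not used further below): step-1 OOB accuracy as the diagonal of table1 over its total.
sum(diag(table1)) / sum(table1)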

# Training Step 2. 

nonone <- trainset1[trainset1$b != 'one',]
nonone$b <- nonone$b == 'two'
fit2 <- cforest(b~ affect + article + assent + bio + body + cause + certain + 
                cogmech + death + discrep + excl + family + feel + filler + friend + home + 
                humans+ i + incl + ingest + insight + leisure + money + motion + negate + negemo
                + percept + posemo + ppron + quant + relativ + sad + sexual + social + space +swear
                + tentat + they +age  + quant + auxverb + conj + funct + i + ipron 
                + preps+ pronoun + shehe+ we + you + future + present + anger
                + s_all + s_sad + gender
                ,  data = nonone,
                controls=cforest_unbiased(ntree=1000, mtry = 1))

### here I adjust the probability threshold: only p > 0.60 will be classified as "two"  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.60, nonone$b)
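# Sketch of how the 0.60 cutoff could be sanity-checked against nearby values on the same OOB
# responses (oob_step2 is a name introduced here only for illustration; the 0.60 above is kept):
oob_step2 <- predict(fit2, OOB = TRUE, type = 'response')
sapply(seq(0.50, 0.70, by = 0.05), function(th) mean((oob_step2 > th) == nonone$b))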

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'three')~  affect + article  + bio + body
                 +  cogmech + excl + i + insight + negemo+ sad + anger + death
                + posemo + ppron + relativ  + social + space +swear + friend +home
                + age + they + quant + auxverb + conj + funct + i + ipron + preps+ pronoun + shehe+ we 
                + you+ future + past + present + time + s_all + s_sad + gender + s_kiss
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))
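# A conditional forest also gives permutation variable importance; a quick look at the ten
# strongest predictors in the step-1 model (varimp() is from party and can be slow with ntree = 1000):
head(sort(varimp(fit1), decreasing = TRUE), 10)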

# Testing with the separate test set (testset1)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="one")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.60, testset2$b=="two")
step3_index<-test_predict2<=0.60
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#check how many 'two' (below the cutoff) and 'three' (above it) would be classified correctly
#here I adjusted the threshold to 0.20, because the response in this step ranged from 0.1-0.3
test_table3a<-table(test_predict3 <= 0.20, testset3$b=="two")
test_table3b<-table(test_predict3 > 0.20, testset3$b=="three")

# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 47.04545
#testset 50%
#train 58%
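# For context, the majority-class baseline on the same test set (always predicting the most
# frequent bin) would be:
max(table(testset1$b)) / nrow(testset1) * 100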
#ext
t5$b[t5$ext < 3.5] <- "one"  
t5$b[t5$ext >= 3.5 & t5$ext < 4.3] <- "two"  
t5$b[t5$ext >= 4.3] <- "three" 

# rebuild the train/test sets so they pick up the new extraversion bins in b
# (the earlier split indicator `ind` is reused, so the same rows stay in each set)
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]


fit1 <- cforest((b == 'one')~   affect + negemo +posemo + swear + negate 
                + article + auxverb + conj + funct + i+ excl+ social + bio + body 
                +ipron + ppron+  preps+ pronoun + shehe+ they + we + you +age
                + s_all + s_sad + s_kiss + gender + future + past + present + time
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'one') 

# Training Step 2. 

nonone <- trainset1[trainset1$b != 'one',]
nonone$b <- nonone$b == 'two'
fit2 <- cforest(b~ affect + bio  + body+ assent
                +cogmech + excl + discrep + humans + ingest + insight 
                + leisure + motion + negate + negemo
                +percept + posemo + ppron + social + space + swear  + conj + funct + i
                +ppron + pronoun + we 
                + age + s_all + s_sad + s_kiss + gender + past  + future + present
                ,  data = nonone,
                controls=cforest_unbiased(ntree=1000, mtry = 1))

### here I adjust the probability threshold: only p > 0.6 will be classified as "two"  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.6, nonone$b)

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'three')~  affect  + bio
                +cogmech + discrep + excl + family  + incl + negemo + posemo +  ppron  + relativ 
                + social + space + swear + article + auxverb + conj + funct + i
                +ipron + ppron+  preps+ pronoun  + you
                + age + s_all + s_sad + s_kiss + gender + past + time + future 
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

# Testing with the separate test set (testset1)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="one")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.6, testset2$b=="two")

step3_index<-test_predict2<=0.6
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#check how many 'two' (below the cutoff) and 'three' (above it) would be classified correctly
#here I adjusted the threshold to 0.20, because the response in this step ranged from 0.1-0.3
test_table3a<-table(test_predict3 <= 0.2, testset3$b=="two")
test_table3b<-table(test_predict3 > 0.20, testset3$b=="three")

# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 49.03409
#test: 50.56
#train: 59
#agr
t5$b[t5$agr < 3.5] <- "one"  
t5$b[t5$agr >=  3.5 & t5$agr < 4.3 ] <- "two"  
t5$b[t5$agr >= 4.3 ] <- "three" 

# rebuild the train/test sets so they pick up the new agreeableness bins in b
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]

fit1 <- cforest((b == 'one')~   affect + assent + bio  + cause + certain + family + friend
                +cogmech + death + discrep + excl  + filler + friend + home 
                +humans + i  + insight + leisure  + motion + negate + negemo 
                + posemo + ppron + quant + relativ + sad + sexual+ tentat + i + we + funct
                + age + s_all + s_sad + s_kiss + gender + future + present + time
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'one') 

# Training Step 2. 

nonone <- trainset1[trainset1$b != 'one',]
nonone$b <- nonone$b == 'two'
fit2 <- cforest(b~ affect + article  + body 
                +cogmech + excl + family + feel + friend + incl + ingest + insight 
                + leisure + negemo + space + tentat
                +percept + posemo + ppron + quant + relativ + sad + sexual + auxverb + conj + funct + i
                +ipron + ppron+  preps+ pronoun + shehe+ they + we + you
                + age + s_all + s_sad + s_kiss + gender + past + time + future + present
                ,  data = nonone,
                controls=cforest_unbiased(ntree=1000, mtry = 1))

### here I adjust the probability threshold: only p > 0.6 will be classified as "two"  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.6, nonone$b)

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'one')~  affect + article  + body 
                +cogmech + excl + family + feel + friend + incl + ingest + insight 
                + leisure + negemo + space + tentat
                +percept + posemo + ppron + quant + relativ + sad + sexual + auxverb + conj + funct + i
                +ipron + ppron+  preps+ pronoun + shehe+ they + we + you
                + age + s_all + s_sad + s_kiss + gender + past + time + future + present
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

# Testing with the separate test set (testset1)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="one")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.6, testset2$b=="two")

step3_index<-test_predict2<=0.6
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#check how many 'two' (below the cutoff) and 'three' (above it) would be classified correctly
#the cutoff is set to 0.40 for this trait
test_table3a<-table(test_predict3 <= 0.40, testset3$b=="two")
test_table3b<-table(test_predict3 > 0.40, testset3$b=="three")

# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 48.52273
#test 51.52
#train 57.67
#ope
t5$b[t5$ope < 3.8] <- "one"  
t5$b[t5$ope >=  3.8 & t5$ope < 4.5 ] <- "two"  
t5$b[t5$ope >= 4.5 ] <- "three" 

# rebuild the train/test sets so they pick up the new openness bins in b
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]

fit1 <- cforest((b == 'one')~  article + assent + cause 
                +cogmech + death + family + home + incl
                + i  + insight + leisure + percept 
                + posemo + ppron  + relativ + social + space+ tentat + i + we + funct
                + conj + funct +ipron + ppron+ pronoun + shehe + you
                + age + s_all + s_sad + s_kiss + gender + future + present 
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'one') 

# Training Step 2. 

nonone <- trainset1[trainset1$b != 'one',]
nonone$b <- nonone$b == 'two'
fit2 <- cforest(b~  body + certain + discrep 
                +cogmech + excl + family  + friend + incl  + negemo 
                + posemo + quant + relativ  + sexual + social + tentat + article + conj + funct + i
                +ipron + ppron+  preps+ pronoun  + we + you
                + age + s_all + s_sad + s_kiss + gender  + time + future + present
                ,  data = nonone,
                controls=cforest_unbiased(ntree=1000, mtry = 1))

### here I adjust the probability threshold: only p > 0.6 will be classified as "two"  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.6, nonone$b)

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'one')~  affect + article  + social + tentat
                +cogmech + excl  + ppron  + relativ  + article + auxverb + funct + i
                +ipron + ppron+  preps+ pronoun  + we + you
                + age + s_all + s_sad + s_kiss + gender + past + time + future + present
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

# Testing with the separate test set (testset1)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="one")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.6, testset2$b=="two")

step3_index<-test_predict2<=0.6
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#check how many 'two' (below the cutoff) and 'three' (above it) would be classified correctly
#the cutoff is set to 0.40 for this trait
test_table3a<-table(test_predict3 <= 0.40, testset3$b=="two")
test_table3b<-table(test_predict3 > 0.40, testset3$b=="three")

# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 50.39773
#test 48.47
#train 55.87
#con
t5$b[t5$con < 3.5 ] <- "one"  
t5$b[t5$con >=  3.5 & t5$con < 4.2 ] <- "two"  
t5$b[t5$con >= 4.2] <- "three" 

length(t5$b[t5$b == "one"])
## [1] 2709
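# Checking the size of all three conscientiousness bins at once, not just "one":
table(t5$b)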
ind = sample(2, nrow(t5), replace = TRUE, prob=c(0.7, 0.3))
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]

fit1 <- cforest((b == 'one')~ affect + article + assent + bio + body + certain 
                         + death + discrep + excl + filler + friend 
                        + incl +  insight +  money + motion + negate + negemo
                + percept + posemo + ppron + quant + relativ  + sexual + social  +swear
                + tentat + they +age  + auxverb + conj + funct + i + ipron 
                + preps+ pronoun + shehe+ we + you + future + present + past
                + s_all + s_sad + gender
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.45, trainset1$b == 'one') 

# Training Step 2. 

nonone <- trainset1[trainset1$b != 'one',]
nonone$b <- nonone$b == 'two'
fit2 <- cforest(b~ affect + article + assent + bio + body + cause + certain + 
                        cogmech + death + discrep + excl + family + feel + filler + friend + home + 
                        humans+ i + incl + ingest + insight + leisure + money + motion + negate + negemo
                + percept + posemo + ppron + quant + relativ + sad + sexual + social + space +swear
                + tentat + they +age  + auxverb + conj + funct + i + ipron 
                + preps+ pronoun + shehe+ we + you + future + present + past
                + s_all + s_sad + gender
                ,  data = nonone,
                controls=cforest_unbiased(ntree=1000, mtry = 1))

### here I adjust the probability threshold: only p > 0.60 will be classified as "two"  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.60, nonone$b)

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'three')~  affect + article +incl+motion + negemo+ posemo 
                + ppron + relativ  + social + space + ppron 
                + auxverb + conj + funct + i + ipron + preps+ pronoun 
                +future + past  + time + s_all + s_sad + gender + s_kiss
                , data = trainset1, 
                controls=cforest_unbiased(ntree=1000, mtry= 1))

# Testing with the separate test set (testset1)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.45, testset1$b=="one")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.45

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.60, testset2$b=="two")
step3_index<-test_predict2<=0.60
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#find out how many 'one' would be correctly classified
#here I adjusted the threshold to 0.23, because the response in this step ranged from 0.1-0.3
test_table3a<-table(test_predict3 <= 0.23, testset3$b=="one")
test_table3b<-table(test_predict3 > 0.23, testset3$b=="three")

# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 47.33595
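# The three-step test cascade above is repeated almost verbatim for every trait. A minimal
# sketch of a helper that wraps it; the function and argument names are illustrative and not
# part of the original script, and step 3 is compared against "two", mirroring most blocks above.
cascade_accuracy <- function(f1, f2, f3, test, th1 = 0.5, th2 = 0.6, th3 = 0.2) {
  pred1 <- predict(f1, newdata = test, type = 'response')
  tab1  <- table(pred1 > th1, test$b == "one")
  rest2 <- test[pred1 <= th1, ]                 # cases passed on to step 2
  pred2 <- predict(f2, newdata = rest2, type = 'response')
  tab2  <- table(pred2 > th2, rest2$b == "two")
  rest3 <- rest2[pred2 <= th2, ]                # cases passed on to step 3
  pred3 <- predict(f3, newdata = rest3, type = 'response')
  tab3  <- table(pred3 <= th3, rest3$b == "two")
  (tab1[2, 2] + tab2[2, 2] + tab3[1, 1] + tab3[2, 2]) / nrow(test) * 100
}
# usage: cascade_accuracy(fit1, fit2, fit3, testset1)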