library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
merge the tables
liwc <- read.csv("liwc.csv", header = TRUE)
smi <- read.csv("smileys2.csv", header = TRUE)
swl <- read.csv("swl.csv", header = TRUE)
s <- merge(liwc, swl, by.x="userid", by.y="userid", all.x=TRUE)
s$X <- NULL
s <- s[complete.cases(s),]
s_merged <- merge(s, smi, by.x="userid", by.y="userid", all.x=TRUE)
t6 <- s_merged[complete.cases(s_merged),]
t6 <- s_merged
sum up all the happy, sad and kiss smileys
smi <- t6
smi$s_all <- smi$smile1+smi$smile2+smi$smile3+smi$smile4+smi$smile5+smi$smile6+smi$smile7+smi$smile8+smi$smile9+smi$smile10+smi$smile11+smi$smile12+smi$smile13+smi$smile14+smi$smile15+smi$smile16+smi$smile17+smi$smile18+smi$smile19+smi$smile20+smi$smile21+smi$smile22++smi$smile23+smi$smile24+smi$smile25+smi$smile26+smi$smile27+smi$smile28+smi$simle13+smi$simle12
smi$s_sad <- smi$sad1 + smi$sad2 + smi$sad4 + smi$sad5 + smi$sad6 + smi$sad7 + smi$sad8 + smi$sad9 + smi$sad10 + smi$sad11 + smi$sad12 + smi$sad13 + smi$sad14 + smi$sad15 + smi$sad16 + smi$sad17 + smi$sad18
smi$s_kiss <- smi$kiss+smi$kiss.1+smi$kiss.2
#delete these columns
smi <- smi[, !(colnames(smi) %in% c("g.1","g.2","g.3", "g.4","g.5","g.6","g.7","g.8","g.9","g.10","g.11","g.12","g.13","g.14","g.15","g.16","g.17","g.18","g.19","g.20","g.21","g.21","g.21","g.21","g.22","g.23","g.24","g.25", "X..","g.26","g.27","g.28","g.29", "g.30","g.31","g.32","g.33","g.34","g.35","g.36","g.37","g.38","g.39","g.40", "g.41","g.42","g.43", "g.44", "g.45", "g.46", "g.47", "g.48", "g.49","g.40","gg","g.50","g.52","g.53","g.54","g.55","g.56","g.57","g.58","g.59","g.60","g.61","g.62","g.63","g.64","g.65","g.66","g.67","g.68","g.69","g.70","g.71","g.72","g.73","g.74","g.75","g.76","g.77","g.78","g.79","g.80","g.81","g.82","g.83","g.84","g.85","g.86","g.87","g.88","e","g","kiss","kiss.1","kiss.2"))]
smi <- smi[, !(colnames(smi) %in% c("smile1","smile2","smile3","smile4","smile5","smile6","smile7","smile8",
"smile9","smile10","smile11","smile12","smile13","smile14","smile15","smile16","smile17","smile18","smile19","smile20","smile21","smile22","smile23", "smile24","smile25","smile26","smile27","smile28","simle12","simle13"))]
smi <- smi[, !(colnames(smi) %in% c("sad1","sad2","sad3","sad4","sad5","sad6","sad7","sad8","sad9","sad10",
"sad11","sad12","sad13","sad14","sad15","sad16","sad17","sad18"))]
write smi as excel document and add the weights at the last row.
write.xlsx(smi, file="smi.xlsx", sheetName="Sheet1")
summary(swl$swl)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.400 4.600 4.379 5.400 7.000
sd(swl$swl)
## [1] 1.369576
t5 <- read.csv("smi.csv", header = TRUE)
t5[3:66]<- data.frame(lapply(t5[3:66], function(X) X/X[2292]))
t5 = t5[-2292,]
t5[,3:66]<-scale(t5[,3:66])
t5[,70:72]<-scale(t5[,70:72])
t5$b[t5$swl >= 1 & t5$swl < 2.8 ] <- "one"
t5$b[t5$swl >= 2.8 & t5$swl < 4.2 ] <- "two"
t5$b[t5$swl >= 4.2 & t5$swl <= 7] <- "three"
ind = sample(2, nrow(t5), replace = TRUE, prob=c(0.7, 0.3))
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]
training model:
fit1 <- cforest((b == 'three')~ affect + negemo+ future+swear+sad
+negate+sexual+death + filler+leisure + conj+ funct +discrep + i
+future + past + bio + body + cogmech + death + cause + quant
+ future +incl + motion + sad + tentat + excl+insight +percept +posemo
+ppron +quant + relativ + space + article + age + s_all + s_sad + gender
, data = trainset1,
controls=cforest_unbiased(ntree=500, mtry= 1))
table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'three')
table1
##
## FALSE TRUE
## FALSE 187 120
## TRUE 541 793
# Training Step 2.
nonthree <- trainset1[trainset1$b != 'three',]
nonthree$b <- nonthree$b == 'two'
fit2 <- cforest(b~ affect +bio+body + cogmech + discrep + excl
+ i + insight + negemo + article + relativ +space + swear + percept
+ posemo + ppron + ipron + preps + pronoun + motion
+ tentat + we + you+ conj+ funct +age + shehe + they
+ quant + s_all + s_sad + gender
+ future+swear+sad + ingest
+sexual+death + filler+leisure
+incl + motion + quant + sad + tentat
+ family + cause + see + hear + feel+ verb + past + present
+ humans
, data = nonthree,
controls=cforest_unbiased(ntree=500, mtry = 1))
### here I adjust the probablity, only p> 0.6 will be classified as a two (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.6, nonthree$b)
table2
##
## FALSE TRUE
## FALSE 108 151
## TRUE 172 297
#training step 3, predict "one" and "three" after step two
fit3 <- cforest((b == 'one')~ affect + negemo+swear+sad
+negate+sexual+death + past + bio + body + cogmech + death + cause + quant
+incl + motion + sad + tentat + excl+insight +percept
+ age + s_all + s_sad + gender
, data = trainset1,
controls=cforest_unbiased(ntree=500, mtry= 1))
# Testing with separate test set (testset2)
#run the test set through step1
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="three")
#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5
#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.6, testset2$b=="two")
step3_index<-test_predict2<=0.6
testset3<-testset2[step3_index,]
test_predict3<-predict(fit3, newdata=testset3, type='response')
#find out how many 'one' would be correctly classified
#here I justed the threshold to 0.23, because the response in this step ranged from 0.1-0.3
test_table3a<-table(test_predict3 <= 0.23, testset3$b=="three")
test_table3b<-table(test_predict3 > 0.23, testset3$b=="one")
result
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3b[2,2]+test_table3a[2,2])/nrow(testset1)*100
total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test
## [1] 58.76923