cri_PILL

library(party)

## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich

library(pspearman)

write smi as excel document and add the weights at the last row.

write.xlsx(smi, file="smi.xlsx", sheetName="Sheet1")

t5 <- read.csv("smi.csv", header = TRUE)


t5[3:66]<- data.frame(lapply(t5[3:66], function(X) X/X[2292]))
t5 = t5[-2292,]
t5[,3:66]<-scale(t5[,3:66]) 
t5[,70:72]<-scale(t5[,70:72]) 

t5$b[t5$swl >= 1 & t5$swl < 2.8 ] <- "one"  
t5$b[t5$swl >=  2.8 & t5$swl < 4.2 ] <- "two"  
t5$b[t5$swl >= 4.2  & t5$swl <= 7] <- "three" 

ind = sample(2, nrow(t5), replace = TRUE, prob=c(0.7, 0.3))
trainset1 = t5[ind == 1,]
testset1 = t5[ind == 2,]

training model: Please repeat the training model for more than 10 times and test the criterion validity when ACC > 0.58

fit1 <- cforest((b == 'three')~   affect + negemo+ future+swear+sad
                +negate+sexual+death + filler+leisure + conj+ funct +discrep + i
                +future + past + bio + body + cogmech + death + cause + quant
                + future +incl + motion  + sad + tentat + excl+insight +percept +posemo
                +ppron +quant + relativ + space + article + age + s_all + s_sad + gender
                , data = trainset1, 
                controls=cforest_unbiased(ntree=500, mtry= 1))

table1 <- table(predict(fit1, OOB=TRUE, type = 'response') > 0.5, trainset1$b == 'three') 

# Training Step 2. 

nonthree <- trainset1[trainset1$b != 'three',]
nonthree$b <- nonthree$b == 'two'
fit2 <- cforest(b~ affect +bio+body + cogmech + discrep + excl
                + i + insight + negemo + article + relativ +space + swear + percept
                + posemo + ppron + ipron + preps + pronoun + motion 
                + tentat + we + you+ conj+ funct +age + shehe + they
                + quant + s_all + s_sad + gender
                + future+swear+sad + ingest
                +sexual+death + filler+leisure 
                +incl + motion + quant + sad + tentat
                + family + cause + see + hear + feel+ verb + past + present
                + humans
                ,  data = nonthree,
                controls=cforest_unbiased(ntree=500, mtry = 1))

### here I adjust the probablity, only p> 0.6 will be classified as a two  (two = 521, one = 121)
table2<-table(predict(fit2, OOB=TRUE, type = 'response') > 0.6, nonthree$b)

#training step 3, predict "one" and "three" after step two

fit3 <- cforest((b == 'one')~  affect + negemo+ future+swear+sad
                +negate+sexual+death + filler+leisure + conj+ funct +discrep + i
                +future + past + bio + body + cogmech + death + cause + quant
                +incl + motion  + sad + tentat + excl+insight +percept +posemo
                +ppron +quant + relativ + space + article + age + s_all + s_sad + gender 
                , data = trainset1, 
                controls=cforest_unbiased(ntree=500, mtry= 1))

# Testing with separate test set (testset2)

#run the test set through step1 
test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="three")

#get the data that is classed as 'other', this will be passed to step 2
step2_index<-test_predict1<=0.5

#create the dataset that would be passed to step 2 (this will probably contain 'one', 'two' and 'three' data)
testset2<-testset1[step2_index,]
# run the second step on test set 2
test_predict2<-predict(fit2, newdata=testset2, type='response')
#find out how many 'two' would be correctly classified
test_table2a<-table(test_predict2>0.6, testset2$b=="two")

step3_index<-test_predict2<=0.6
testset3<-testset2[step3_index,]

test_predict3<-predict(fit3, newdata=testset3, type='response')
#find out how many 'one' would be correctly classified
#here I justed the threshold to 0.23, because the response in this step ranged from 0.1-0.3
test_table3a<-table(test_predict3 <= 0.23, testset3$b=="three")


# the equation for the test set accuracy is then:
total_percentage_test<-(test_table1[2,2]+test_table2a[2,2]+test_table3a[1,1]+test_table3a[2,2])/nrow(testset1)*100 

total_percentage_train<-(table1[2,2]+table2[2,2]+table2[1,1])/nrow(trainset1)*100
total_percentage_test

## [1] 56.03053

t5 <- read.csv("t16.csv", header = TRUE)

t5 <- read.csv('t16.csv', header=TRUE, stringsAsFactors=FALSE,
       sep=",", na.strings="\\N")



t5[10:73]<- data.frame(lapply(t5[10:73], function(X) X/X[811]))
t5 = t5[-811,]
t5[,10:73]<-scale(t5[,10:73]) 
t5[,10:73]<-scale(t5[,10:73]) 

t5$b[t5$swl >= 1 & t5$swl < 2.8 ] <- "1"  
t5$b[t5$swl >=  2.8 & t5$swl < 4.2 ] <- "2"  
t5$b[t5$swl >= 4.2  & t5$swl <= 7] <- "3" 
t5$b <- as.numeric(t5$b)  


t5<- t5[complete.cases(t5),]


testset2 <- t5

testset2$pre_swl<-predict(fit1, newdata=testset2 , type='response')
testset2$pre_swl[testset2$pre_swl > 0.5] <- 3

test_predict1<-predict(fit1, newdata=testset1, type='response')
#produce the table for splitting 'three' and 'other'
test_table1<-table(test_predict1>0.5, testset1$b=="3")


#get the data that is classed as 'other', this will be passed to step 2
step2_index<-testset2$pre_swl<= 0.5
testset3<-testset2[step2_index,]

testset3$pre_swl3<-predict(fit2, newdata=testset3, type='response')
testset3$pre_swl3[testset3$pre_swl3> 0.6] <- 2

step3_index<-test_predict2<= 0.6
testset4<-testset3[step3_index,]

test_predict2<-predict(fit2, newdata=testset2, type='response')
test_table2a<-table(test_predict2>0.6, testset2$b=="2")


#Step three 
testset4$pre_swl4<-predict(fit3, newdata=testset4, type='response')
testset4$pre_swl4[testset4$pre_swl4 <= 0.23] <- 3
testset4$pre_swl4[testset4$pre_swl4 > 0.23 & testset4$pre_swl4 < 3 ] <- 1

testset5 <- merge(testset2, testset3, by = "userid", all.x=T)
testset5$swl2 <- ifelse(is.na(testset5$pre_swl3), testset5$pre_swl.x, testset5$pre_swl3)
testset6 <- merge(testset5, testset4, by = "userid", all.x=T)
testset6$swl3 <- ifelse(is.na(testset6$pre_swl4), testset6$swl2, testset6$pre_swl4)
testset6$swl3[testset6$swl3 < 1 ] <- 2



#machine
cor1 <- cor.test(testset6$PILL.x, testset6$swl3, method="spearman", exact=FALSE)
cor2 <- cor.test(testset6$days_activity_restricted.x,testset6$swl3, method="spearman", exact=FALSE)
cor3 <- cor.test(testset6$days_sick.x, testset6$swl3, method="spearman", exact=FALSE)

cor1

## 
##  Spearman's rank correlation rho
## 
## data:  testset6$PILL.x and testset6$swl3
## S = 87427000, p-value = 0.4229
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.02838837

cor2

## 
##  Spearman's rank correlation rho
## 
## data:  testset6$days_activity_restricted.x and testset6$swl3
## S = 77652000, p-value = 0.01435
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.08659173

cor3

## 
##  Spearman's rank correlation rho
## 
## data:  testset6$days_sick.x and testset6$swl3
## S = 73415000, p-value = 0.0001095
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1364293

#self-report

cor1 <- cor.test(testset2$PILL, testset2$b, method="spearman", exact=FALSE)
cor2 <- cor.test(testset2$days_activity_restricted,testset2$b, method="spearman", exact=FALSE)
cor3 <- cor.test(testset2$days_sick, testset2$b, method="spearman", exact=FALSE)

cor1

## 
##  Spearman's rank correlation rho
## 
## data:  testset2$PILL and testset2$b
## S = 73934000, p-value = 3.869e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2034825

cor2

## 
##  Spearman's rank correlation rho
## 
## data:  testset2$days_activity_restricted and testset2$b
## S = 66479000, p-value = 0.02787
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.08213442

cor3

## 
##  Spearman's rank correlation rho
## 
## data:  testset2$days_sick and testset2$b
## S = 63360000, p-value = 0.4017
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## -0.03136633

cri_PILL

Lushi Chen

22 October, 2015