# Método
###Estudo Transversal
require(RcmdrMisc)
require(dplyr)
library(colorspace, pos=33)

#notifcações match
banco <- read.csv("C:/Users/edson/Downloads/Base_de_dados_match  (1).csv", comment.char="#")
banco$recorrencia2<-0
banco$recorrencia2[banco$recorrencia!="1" & banco$diff<365]<-1
banco$recorrencia2<-as.factor(banco$recorrencia2)

banco$CS_ESCOL_N<-as.factor(banco$CS_ESCOL_N)
#repetição

## Load survival package
library(survival)

#remoção de duplicatas

dataset    <- banco
dataset$NU_IDADE_N<-dataset$NU_IDADE_N-4000
#dataset$recorrencia2<-0
#dataset$recorrencia2[dataset$recorrencia>2 & dataset$diff<60]<-1
#banco$recorrencia2<-as.factor(banco$recorrencia2)

dataset<-arrange(dataset,desc(dataset$DT_NOTIFIC))

dataset<-dplyr::distinct(dataset,dataset$study_ID,.keep_all=T)




tabela <- within(dataset, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
recorrencia2 <- as.factor(recorrencia2)

})

dataset$TRA_ESQUEM<-as.factor(dataset$TRA_ESQUEM)

#criar variável recorrência

tabela<-subset(dataset, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES     ,      recorrencia2,
diff,        LVC ,        Tempo_class))
#teste de hiotese em banco bruto as.it is
require(gtsummary)

gtsummary::tbl_summary(tabela)
Characteristic N = 6,2341
NU_IDADE_N 36 (27, 48)
CS_SEXO
F 1,374 (22%)
M 4,860 (78%)
CS_GESTANT
1 9 (0.1%)
2 18 (0.3%)
3 9 (0.1%)
4 2 (<0.1%)
5 971 (16%)
6 5,079 (81%)
9 146 (2.3%)
CS_RACA
1 2,859 (47%)
2 313 (5.1%)
3 50 (0.8%)
4 2,419 (40%)
5 125 (2.0%)
9 354 (5.8%)
Unknown 114
CS_ESCOL_N
0 85 (1.5%)
1 571 (10.0%)
2 271 (4.7%)
3 721 (13%)
4 514 (9.0%)
5 424 (7.4%)
6 869 (15%)
7 181 (3.2%)
8 629 (11%)
9 1,285 (22%)
10 165 (2.9%)
Unknown 519
RESULT
4 5,841 (94%)
5 272 (4.4%)
6 121 (1.9%)
PMM 501 (118, 4,088)
Unknown 1,887
PCRUZ
1 1,192 (19%)
2 759 (12%)
3 935 (15%)
4 2,577 (41%)
5 581 (9.3%)
6 190 (3.0%)
TRA_ESQUEM
1 4,499 (75%)
2 45 (0.8%)
3 80 (1.3%)
4 65 (1.1%)
5 19 (0.3%)
6 13 (0.2%)
7 18 (0.3%)
8 5 (<0.1%)
9 36 (0.6%)
10 34 (0.6%)
11 43 (0.7%)
12 21 (0.4%)
99 1,096 (18%)
Unknown 260
MES 7 (4, 10)
Unknown 64
recorrencia2
0 5,542 (89%)
1 692 (11%)
diff 0 (0, 0)
LVC
LVC 635 (10%)
Ñ_LVC 5,599 (90%)
Tempo_class
maior_60_dias 475 (7.6%)
menor_igual_60_dias 5,759 (92%)

1 Median (IQR); n (%)

gtsummary::tbl_summary(tabela,by="recorrencia2")%>%add_p(simulate.p.value=TRUE)
Characteristic 0, N = 5,5421 1, N = 6921 p-value2
NU_IDADE_N 36 (26, 48) 37 (28, 50) 0.037
CS_SEXO 0.017
F 1,246 (22%) 128 (18%)
M 4,296 (78%) 564 (82%)
CS_GESTANT 0.7
1 9 (0.2%) 0 (0%)
2 17 (0.3%) 1 (0.1%)
3 9 (0.2%) 0 (0%)
4 2 (<0.1%) 0 (0%)
5 873 (16%) 98 (14%)
6 4,499 (81%) 580 (84%)
9 133 (2.4%) 13 (1.9%)
CS_RACA 0.3
1 2,534 (47%) 325 (48%)
2 272 (5.0%) 41 (6.0%)
3 48 (0.9%) 2 (0.3%)
4 2,151 (40%) 268 (39%)
5 116 (2.1%) 9 (1.3%)
9 315 (5.8%) 39 (5.7%)
Unknown 106 8
CS_ESCOL_N 0.3
0 80 (1.6%) 5 (0.8%)
1 509 (10%) 62 (9.6%)
2 244 (4.8%) 27 (4.2%)
3 647 (13%) 74 (12%)
4 447 (8.8%) 67 (10%)
5 385 (7.6%) 39 (6.1%)
6 756 (15%) 113 (18%)
7 161 (3.2%) 20 (3.1%)
8 554 (11%) 75 (12%)
9 1,138 (22%) 147 (23%)
10 151 (3.0%) 14 (2.2%)
Unknown 470 49
RESULT 0.4
4 5,189 (94%) 652 (94%)
5 248 (4.5%) 24 (3.5%)
6 105 (1.9%) 16 (2.3%)
PMM 500 (100, 4,000) 501 (200, 5,158) 0.003
Unknown 1,649 238
PCRUZ 0.022
1 1,089 (20%) 103 (15%)
2 684 (12%) 75 (11%)
3 829 (15%) 106 (15%)
4 2,258 (41%) 319 (46%)
5 512 (9.2%) 69 (10.0%)
6 170 (3.1%) 20 (2.9%)
TRA_ESQUEM
1 4,077 (77%) 422 (64%)
2 45 (0.8%) 0 (0%)
3 68 (1.3%) 12 (1.8%)
4 56 (1.1%) 9 (1.4%)
5 19 (0.4%) 0 (0%)
6 13 (0.2%) 0 (0%)
7 15 (0.3%) 3 (0.5%)
8 3 (<0.1%) 2 (0.3%)
9 30 (0.6%) 6 (0.9%)
10 16 (0.3%) 18 (2.7%)
11 43 (0.8%) 0 (0%)
12 21 (0.4%) 0 (0%)
99 908 (17%) 188 (28%)
Unknown 228 32
MES 7 (4, 10) 7 (4, 10) 0.082
Unknown 58 6
diff 0 (0, 0) 70 (38, 111) <0.001
LVC <0.001
LVC 387 (7.0%) 248 (36%)
Ñ_LVC 5,155 (93%) 444 (64%)
Tempo_class <0.001
maior_60_dias 65 (1.2%) 410 (59%)
menor_igual_60_dias 5,477 (99%) 282 (41%)

1 Median (IQR); n (%)

2 Wilcoxon rank sum test; Pearson's Chi-squared test; Fisher's exact test

#levando em conta subset de recorrências
#Script de Cox Exaustivo

options(digits=3)





dataset <- within(dataset, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
TRA_ESQUEM <- as.factor(TRA_ESQUEM)
})


## Create outcome variable, then survival object
dataset <- within(dataset, {
  status.dichotomous <- recorrencia2 != 0
  survival.vector    <- Surv(diff, recorrencia2)
})

dataset$TRA_ESQUEM2<-0
dataset$TRA_ESQUEM2[dataset$TRA_ESQUEM=="4"]<-1
dataset$TRA_ESQUEM2<-as.factor(dataset$TRA_ESQUEM2)

dataset$gestante2<-0
dataset$gestante2[dataset$CS_GESTANT!=1]<-1
dataset$gestante2<-as.factor(dataset$gestante2)

dataset$raca2<-0
dataset$raca2[dataset$CS_RACA!=1]<-1
dataset$raca2<-as.factor(dataset$raca2)

dataset$escola2<-0
dataset$escola2[dataset$CS_ESCOL_N=="7"|dataset$CS_ESCOL_N=="8"|dataset$CS_ESCOL_N=="9"|dataset$CS_ESCOL_N=="10"]<-1
dataset$escola2<-as.factor(dataset$escola2)


## Create vectors for outcome and predictors
outcome    <- c("survival.vector")
#predictors <- c("esquema2","cd8_cd4", "idade","cv","COR","gon","Grupo6")
predictors <- c('NU_IDADE_N' , 'CS_SEXO',  'raca2',     'escola2',  'RESULT',     'PMM'       ,  'PCRUZ'   ,    'TRA_ESQUEM2'   ,   'LVC','TRA_ESQUEM','CS_GESTANT')





## The lines below should not need modification.

## Create list of models
list.of.models <- lapply(seq_along((predictors)), function(n) {
  
  left.hand.side  <- outcome
  right.hand.side <- apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ")
  
  paste(left.hand.side, right.hand.side, sep = "  ~  ")
})

## Convert to a 

vector.of.models <- unlist(list.of.models)

## Fit coxph to all models
list.of.fits <- lapply(vector.of.models, function(x) {
  
  formula    <- as.formula(x)
  fit        <- coxph(formula, data = dataset, id=study_ID)
  suma<- summary(fit)
  result.AIC <- extractAIC(fit)
  
  data.frame(num.predictors = result.AIC[1],
             AIC            = result.AIC[2],
             model          = x,
             suma= suma[["coefficients"]])
})


## Collapse to a data frame
result <- do.call(rbind, list.of.fits)
result$ciup<-exp(result$suma.coef + 2*result$suma.se.coef.)
result$cidown<-exp(result$suma.coef - 2*result$suma.se.coef.)
result$var<- row.names(result)


## Sort and print
library(doBy)
a<-orderBy(~ AIC + model, result)
a<-as.data.frame(a, digits=2)
result$pvalorsig<-0
result$pvalorsig[result$suma.Pr...z..<0.05]<-1

f<-result %>%
group_by(model) %>%
summarise(somamodelo= sum(pvalorsig))
a<-merge(a,f)

a$dif<-a$num.predictors- a$somamodelo




#filtrar modelos com todas as var em todos os fatores < 0.05
modelocomtudosig<-subset(a,a$dif<1 & a$suma.Pr...z..<0.05)
#filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas as var em todos os fatores < 0.05
modelomulticomtudosig<-subset(a,a$dif<1 & a$num.predictors>1)
##filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas -1 as var em todos os fatores < 0.05, para pegar multinomial
modelomulticomtudosigmenos1<-subset(a,a$dif<2 & a$num.predictors>1)
modelomulticomtudosigmenos3<-subset(a,a$dif<2 & a$num.predictors>4)
modelo_uni<-subset(a,a$num.predictors==1)

htmlTable::htmlTable(modelo_uni)
model num.predictors AIC suma.coef suma.exp.coef. suma.se.coef. suma.z suma.Pr…z.. ciup cidown var somamodelo dif
7 survival.vector ~ CS_SEXO 1 8113.55292267763 -0.103139783866269 0.902000879498455 0.0979299138115787 -1.05319998611164 0.292249344846739 1.09715453700676 0.741559697538738 CS_SEXOM 0 1
8199 survival.vector ~ escola2 1 8107.9681115605 0.205578300869994 1.22823514907753 0.0788376332971222 2.6076163409829 0.00911750812918678 1.43800044314189 1.04906892666419 escola21 1 0
10119 survival.vector ~ LVC 1 8087.40690013913 -0.429209114383688 0.651023776509997 0.0804735255129662 -5.33354431345912 9.63140449580789e-08 0.764707353064944 0.554240724746037 LVCÑ_LVC 1 0
10159 survival.vector ~ NU_IDADE_N 1 8113.75140960879 -0.000205568896110861 0.999794452231727 0.000204435357995372 -1.0055447263458 0.314634624068129 1.0002033224871 0.999385749117261 NU_IDADE_N 0 1
27295 survival.vector ~ PMM 1 4929.10360520894 3.30684913161125e-09 1.00000000330685 5.00462821210673e-09 0.660758200501614 0.50876739291489 1.00000001331611 0.999999993297593 PMM 0 1
27727 survival.vector ~ raca2 1 8111.86850534762 -0.127091866259103 0.880652766605711 0.0762807088131721 -1.66610756817138 0.0956919914192787 1.02579667168878 0.756045829290417 raca21 0 1
32685 survival.vector ~ TRA_ESQUEM2 1 8109.34327382061 -0.687735578891047 0.50271313541725 0.335879502956417 -2.04756638269852 0.0406025061529627 0.984150375497512 0.256790529997294 TRA_ESQUEM21 1 0
htmlTable::htmlTable(modelomulticomtudosig)
model num.predictors AIC suma.coef suma.exp.coef. suma.se.coef. suma.z suma.Pr…z.. ciup cidown var somamodelo dif
8207 survival.vector ~ escola2 + LVC 2 8082.66182637341 0.206802330960217 1.22973946633484 0.0788617230066802 2.62234101761509 0.00873280009688099 1.43983104518785 1.05030320058434 escola218 2 0
8208 survival.vector ~ escola2 + LVC 2 8082.66182637341 -0.42994122992774 0.650547326313245 0.0805008409165198 -5.34082905262559 9.25224727302636e-08 0.764189450960208 0.553804849362346 LVCÑ_LVC4 2 0
10027 survival.vector ~ escola2 + TRA_ESQUEM2 2 8104.66023155941 0.205785445001466 1.22848959713353 0.0788401135718815 2.61016170168049 0.00904994334530299 1.4383054821161 1.04928105262097 escola217 2 0
10028 survival.vector ~ escola2 + TRA_ESQUEM2 2 8104.66023155941 -0.688532574587842 0.502312634831422 0.335897141135911 -2.04983160100564 0.0403808667018732 0.983401014562777 0.256576899326737 TRA_ESQUEM214 2 0
#htmlTable::htmlTable(modelomulticomtudosig)

#dataset

.Survfit <- survfit(Surv(diff, recorrencia2) ~ LVC , 
              error="greenwood", 
                    data=dataset,id=study_ID)
plot(.Survfit)

table(dataset$LVC)
## 
##   LVC Ñ_LVC 
##   635  5599
#a<-dataset$diff[dataset$recorrencia3==1]
#edu::resumosnumericos(a)
#a<-as.data.frame(a)
#edu::resumosnumericos(a)
#a<-dataset$diff[dataset$recorrencia2==1]
#a<-as.data.frame(a)
#table(dataset$recorrencia3)
#table(dataset$recorrencia3)
#a<-dataset$diff[dataset$recorrencia2==1]
#edu::resumosnumericos(a)
#a<-dataset$diff[dataset$recorrencia2==1]
#a<-as.data.frame(a)
#edu::resumosnumericos(a)