# Método
###Estudo Transversal
require(RcmdrMisc)
require(dplyr)
library(colorspace, pos=33)

#notifcações match
banco <- read.csv("C:/Users/edson/Downloads/Base_de_dados_match  (1).csv", comment.char="#")
banco$recorrencia2<-0
banco$recorrencia2[banco$recorrencia!="1" & banco$diff<60]<-1
banco$recorrencia2<-as.factor(banco$recorrencia2)
#repetição

table(banco$recorrencia)
## 
##    1    2    3    4    5 
## 6234  820  142   28    6
banco<-arrange(banco,desc(banco$DT_NOTIFIC))
banco_umavez<-dplyr::distinct(banco, banco$study_ID,.keep_all=T)
banco_repete<-subset(banco,banco$recorrencia>1)
banco_repete<-dplyr::arrange(banco_repete,banco_repete$DT_NOTIFIC, banco_repete$study_ID)
banco_repete2<-dplyr::distinct(banco_repete, banco_repete$study_ID,.keep_all=T)


banco3<-dplyr::full_join(banco_umavez,banco_repete2)
tabela<-subset(banco_umavez, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES  ,       ANO   ,      recorrencia,
diff,        LVC ,        Tempo_class))

tabela <- within(tabela, {
ANO <- as.factor(ANO)
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
recorrencia <- as.factor(recorrencia)
RESULT <- as.factor(RESULT)
TRA_ESQUEM <- as.factor(TRA_ESQUEM)
})

#descritiva apenas 
edu::resumonumericooucategorico(tabela)
Characteristic N = 6,2341
NU_IDADE_N 4,036 (4,027, 4,048)
CS_SEXO
F 1,374 (22.04%)
M 4,860 (77.96%)
CS_GESTANT
1 9 (0.14%)
2 18 (0.29%)
3 9 (0.14%)
4 2 (0.03%)
5 971 (15.58%)
6 5,079 (81.47%)
9 146 (2.34%)
CS_RACA
1 2,859 (46.72%)
2 313 (5.11%)
3 50 (0.82%)
4 2,419 (39.53%)
5 125 (2.04%)
9 354 (5.78%)
Unknown 114
CS_ESCOL_N
0 85 (1.49%)
1 571 (9.99%)
2 271 (4.74%)
3 721 (12.62%)
4 514 (8.99%)
5 424 (7.42%)
6 869 (15.21%)
7 181 (3.17%)
8 629 (11.01%)
9 1,285 (22.48%)
10 165 (2.89%)
Unknown 519
RESULT
4 5,841 (93.70%)
5 272 (4.36%)
6 121 (1.94%)
PMM 501 (118, 4,088)
Unknown 1,887
PCRUZ
1 1,192 (19.12%)
2 759 (12.18%)
3 935 (15.00%)
4 2,577 (41.34%)
5 581 (9.32%)
6 190 (3.05%)
TRA_ESQUEM
1 4,499 (75.31%)
2 45 (0.75%)
3 80 (1.34%)
4 65 (1.09%)
5 19 (0.32%)
6 13 (0.22%)
7 18 (0.30%)
8 5 (0.08%)
9 36 (0.60%)
10 34 (0.57%)
11 43 (0.72%)
12 21 (0.35%)
99 1,096 (18.35%)
Unknown 260
MES
1 521 (8.44%)
2 437 (7.08%)
3 533 (8.64%)
4 510 (8.27%)
5 524 (8.49%)
6 515 (8.35%)
7 495 (8.02%)
8 521 (8.44%)
9 525 (8.51%)
10 547 (8.87%)
11 536 (8.69%)
12 506 (8.20%)
Unknown 64
ANO
1920 2 (0.03%)
1922 1 (0.02%)
1923 1 (0.02%)
1925 4 (0.06%)
1926 2 (0.03%)
1927 4 (0.06%)
1928 1 (0.02%)
1929 1 (0.02%)
1930 5 (0.08%)
1931 1 (0.02%)
1932 3 (0.05%)
1933 7 (0.11%)
1934 13 (0.21%)
1935 9 (0.15%)
1936 10 (0.16%)
1937 15 (0.24%)
1938 16 (0.26%)
1939 12 (0.19%)
1940 22 (0.36%)
1941 15 (0.24%)
1942 26 (0.42%)
1943 22 (0.36%)
1944 26 (0.42%)
1945 27 (0.44%)
1946 39 (0.63%)
1947 41 (0.66%)
1948 30 (0.49%)
1949 34 (0.55%)
1950 50 (0.81%)
1951 49 (0.79%)
1952 68 (1.10%)
1953 72 (1.17%)
1954 60 (0.97%)
1955 77 (1.25%)
1956 69 (1.12%)
1957 90 (1.46%)
1958 87 (1.41%)
1959 118 (1.91%)
1960 89 (1.44%)
1961 90 (1.46%)
1962 120 (1.94%)
1963 125 (2.03%)
1964 110 (1.78%)
1965 127 (2.06%)
1966 128 (2.07%)
1967 113 (1.83%)
1968 129 (2.09%)
1969 130 (2.11%)
1970 131 (2.12%)
1971 133 (2.16%)
1972 129 (2.09%)
1973 125 (2.03%)
1974 118 (1.91%)
1975 148 (2.40%)
1976 142 (2.30%)
1977 156 (2.53%)
1978 169 (2.74%)
1979 169 (2.74%)
1980 148 (2.40%)
1981 159 (2.58%)
1982 179 (2.90%)
1983 171 (2.77%)
1984 159 (2.58%)
1985 165 (2.67%)
1986 147 (2.38%)
1987 154 (2.50%)
1988 125 (2.03%)
1989 117 (1.90%)
1990 104 (1.69%)
1991 85 (1.38%)
1992 92 (1.49%)
1993 60 (0.97%)
1994 64 (1.04%)
1995 66 (1.07%)
1996 41 (0.66%)
1997 44 (0.71%)
1998 41 (0.66%)
1999 43 (0.70%)
2000 32 (0.52%)
2001 30 (0.49%)
2002 25 (0.41%)
2003 22 (0.36%)
2004 26 (0.42%)
2005 29 (0.47%)
2006 19 (0.31%)
2007 19 (0.31%)
2008 22 (0.36%)
2009 14 (0.23%)
2010 15 (0.24%)
2011 9 (0.15%)
2012 6 (0.10%)
2013 7 (0.11%)
2014 6 (0.10%)
2015 5 (0.08%)
2016 2 (0.03%)
2017 3 (0.05%)
2018 2 (0.03%)
2019 3 (0.05%)
Unknown 64
recorrencia
1 5,477 (87.86%)
2 620 (9.95%)
3 111 (1.78%)
4 20 (0.32%)
5 6 (0.10%)
diff 0 (0, 0)
LVC
LVC 635 (10.19%)
Ñ_LVC 5,599 (89.81%)
Tempo_class
maior_60_dias 475 (7.62%)
menor_igual_60_dias 5,759 (92.38%)

1 Median (IQR); n (%)

#notificações
dim(banco)
## [1] 7230   98
#casos
dim(banco3)
## [1] 6434  100
table(banco$recorrencia2)
## 
##    0    1 
## 6784  446
table(banco$recorrencia)
## 
##    1    2    3    4    5 
## 6234  820  142   28    6
tabela<-subset(banco, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES,
diff ,        Tempo_class, recorrencia2))

tabela <- within(tabela, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
recorrencia2 <- as.factor(recorrencia2)

})

#criar variável recorrência


#teste de hiotese em banco bruto as.it is
require(gtsummary)
gtsummary::tbl_summary(tabela,by="recorrencia2")%>%add_p(simulate.p.value=TRUE)
Characteristic 0, N = 6,7841 1, N = 4461 p-value2
NU_IDADE_N 4,036 (4,027, 4,049) 4,035 (4,027, 4,050) 0.8
CS_SEXO 0.3
F 1,471 (21.68%) 88 (19.73%)
M 5,313 (78.32%) 358 (80.27%)
CS_GESTANT 0.3
1 10 (0.15%) 1 (0.22%)
2 22 (0.32%) 0 (0.00%)
3 10 (0.15%) 1 (0.22%)
4 3 (0.04%) 1 (0.22%)
5 1,024 (15.09%) 58 (13.00%)
6 5,556 (81.90%) 375 (84.08%)
9 159 (2.34%) 10 (2.24%)
CS_RACA
1 3,140 (47.16%) 190 (43.28%)
2 339 (5.09%) 28 (6.38%)
3 56 (0.84%) 1 (0.23%)
4 2,616 (39.29%) 176 (40.09%)
5 135 (2.03%) 5 (1.14%)
9 372 (5.59%) 39 (8.88%)
Unknown 126 7
CS_ESCOL_N 0.006
0 89 (1.43%) 2 (0.49%)
1 627 (10.08%) 36 (8.78%)
2 312 (5.02%) 16 (3.90%)
3 780 (12.54%) 41 (10.00%)
4 555 (8.93%) 41 (10.00%)
5 464 (7.46%) 17 (4.15%)
6 955 (15.36%) 69 (16.83%)
7 200 (3.22%) 9 (2.20%)
8 680 (10.94%) 45 (10.98%)
9 1,384 (22.26%) 115 (28.05%)
10 172 (2.77%) 19 (4.63%)
Unknown 566 36
RESULT 0.3
4 6,354 (93.66%) 424 (95.07%)
5 298 (4.39%) 18 (4.04%)
6 132 (1.95%) 4 (0.90%)
PMM 501 (135, 4,500) 600 (200, 7,220) 0.017
Unknown 2,076 159
PCRUZ 0.11
1 1,266 (18.66%) 70 (15.70%)
2 804 (11.85%) 46 (10.31%)
3 1,017 (14.99%) 55 (12.33%)
4 2,831 (41.73%) 208 (46.64%)
5 654 (9.64%) 51 (11.43%)
6 212 (3.12%) 16 (3.59%)
TRA_ESQUEM 1 (1, 1) 1 (1, 99) <0.001
Unknown 280 32
MES 0.059
1 558 (8.31%) 20 (4.52%)
2 467 (6.95%) 34 (7.69%)
3 573 (8.53%) 32 (7.24%)
4 558 (8.31%) 42 (9.50%)
5 575 (8.56%) 48 (10.86%)
6 567 (8.44%) 39 (8.82%)
7 544 (8.10%) 34 (7.69%)
8 566 (8.43%) 27 (6.11%)
9 583 (8.68%) 40 (9.05%)
10 609 (9.07%) 52 (11.76%)
11 573 (8.53%) 33 (7.47%)
12 545 (8.11%) 41 (9.28%)
Unknown 66 4
diff 0 (0, 0) 16 (1, 43) <0.001
Tempo_class <0.001
maior_60_dias 547 (8.06%) 0 (0.00%)
menor_igual_60_dias 6,237 (91.94%) 446 (100.00%)

1 Median (IQR); n (%)

2 Wilcoxon rank sum test; Pearson's Chi-squared test; Fisher's exact test

banco <- within(banco, {
  recorrencia2 <- as.factor(recorrencia2)
})
with(banco, piechart(recorrencia2, xlab="", ylab="", main="recorrencia2", col=palette()[2:3],
   scale="percent"))

#Casos por ano

library(ggplot2)
library(dplyr)
library(plotly)
library(hrbrthemes)

data<-banco
data$DT_NOTIFIC <- as.Date(data$DT_NOTIFIC)
a<-table(data$DT_NOTIFIC,data$NU_NOTIFIC)
a<-as.data.frame(a)
a<-subset(a,a$Freq==1)
a$Var1<-as.Date(a$Var1)
data<-a
# Usual area chart
p <- data %>%
  ggplot( aes(x=Var1, y=Freq)) +
    geom_area(fill="#69b3a4", alpha=0.5) +
    geom_line(color="#69b3a4") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
p
#tempo de recorrencia

data<-banco
data<-subset(data, banco$recorrencia2==1)
Boxplot( ~ diff, data=data, id=list(method="y"))

#recorrencia

# Load dataset from github



data$DT_NOTIFIC <- as.Date(data$DT_NOTIFIC)
a<-table(data$DT_NOTIFIC,data$NU_NOTIFIC)
a<-as.data.frame(a)
a<-subset(a,a$Freq==1)
a$Var1<-as.Date(a$Var1)
data<-a
# Usual area chart
p <- data %>%
  ggplot( aes(x=Var1, y=Freq)) +
    geom_area(fill="#69b3a4", alpha=0.5) +
    geom_line(color="#69b3a4") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
p
## Load survival package
library(survival)

#remoção de duplicatas

dataset    <- banco

#dataset$recorrencia2<-0
#dataset$recorrencia2[dataset$recorrencia>2 & dataset$diff<60]<-1
#banco$recorrencia2<-as.factor(banco$recorrencia2)

dataset<-arrange(dataset,desc(dataset$DT_NOTIFIC))

dataset<-dplyr::distinct(dataset,dataset$study_ID,.keep_all=T)


#levando em conta subset de recorrências
#Script de Cox Exaustivo

options(digits=3)


dataset<-subset(dataset, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES     ,      recorrencia2,
diff,        LVC ,        Tempo_class,study_ID))


dataset <- within(dataset, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
TRA_ESQUEM <- as.factor(TRA_ESQUEM)
})
dataset$diff[dataset$diff>60]<-NA


## Create outcome variable, then survival object
dataset <- within(dataset, {
  status.dichotomous <- recorrencia2 != 0
  survival.vector    <- Surv(diff, recorrencia2)
})

dataset$TRA_ESQUEM2<-0
dataset$TRA_ESQUEM2[dataset$TRA_ESQUEM=="4"]<-1
dataset$TRA_ESQUEM2<-as.factor(dataset$TRA_ESQUEM2)

dataset$gestante2<-0
dataset$gestante2[dataset$CS_GESTANT!=1]<-1
dataset$gestante2<-as.factor(dataset$gestante2)

dataset$raca2<-0
dataset$raca2[dataset$CS_RACA!=1]<-1
dataset$raca2<-as.factor(dataset$raca2)

dataset$escola2<-0
dataset$escola2[dataset$CS_ESCOL_N=="7"|dataset$CS_ESCOL_N=="8"|dataset$CS_ESCOL_N=="9"|dataset$CS_ESCOL_N=="10"]<-1
dataset$escola2<-as.factor(dataset$escola2)


## Create vectors for outcome and predictors
outcome    <- c("survival.vector")
#predictors <- c("esquema2","cd8_cd4", "idade","cv","COR","gon","Grupo6")
predictors <- c('NU_IDADE_N' , 'CS_SEXO',  'raca2',     'escola2',  'RESULT',     'PMM'       ,  'PCRUZ'   ,    'TRA_ESQUEM2'   ,   'LVC')






## The lines below should not need modification.

## Create list of models
list.of.models <- lapply(seq_along((predictors)), function(n) {
  
  left.hand.side  <- outcome
  right.hand.side <- apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ")
  
  paste(left.hand.side, right.hand.side, sep = "  ~  ")
})

## Convert to a 

vector.of.models <- unlist(list.of.models)

## Fit coxph to all models
list.of.fits <- lapply(vector.of.models, function(x) {
  
  formula    <- as.formula(x)
  fit        <- coxph(formula, data = dataset, id=study_ID)
  suma<- summary(fit)
  result.AIC <- extractAIC(fit)
  
  data.frame(num.predictors = result.AIC[1],
             AIC            = result.AIC[2],
             model          = x,
             suma= suma[["coefficients"]])
})


## Collapse to a data frame
result <- do.call(rbind, list.of.fits)
result$ciup<-exp(result$suma.coef + 2*result$suma.se.coef.)
result$cidown<-exp(result$suma.coef - 2*result$suma.se.coef.)
result$var<- row.names(result)


## Sort and print
library(doBy)
a<-orderBy(~ AIC + model, result)
a<-as.data.frame(a, digits=2)
result$pvalorsig<-0
result$pvalorsig[result$suma.Pr...z..<0.05]<-1

f<-result %>%
group_by(model) %>%
summarise(somamodelo= sum(pvalorsig))
a<-merge(a,f)

a$dif<-a$num.predictors- a$somamodelo




#filtrar modelos com todas as var em todos os fatores < 0.05
modelocomtudosig<-subset(a,a$dif<1 & a$suma.Pr...z..<0.05)
#filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas as var em todos os fatores < 0.05
modelomulticomtudosig<-subset(a,a$dif<1 & a$num.predictors>1)
##filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas -1 as var em todos os fatores < 0.05, para pegar multinomial
modelomulticomtudosigmenos1<-subset(a,a$dif<2 & a$num.predictors>1)
modelomulticomtudosigmenos3<-subset(a,a$dif<2 & a$num.predictors>4)
modelo_uni<-subset(a,a$num.predictors==1)

htmlTable::htmlTable(modelo_uni)
model num.predictors AIC suma.coef suma.exp.coef. suma.se.coef. suma.z suma.Pr…z.. ciup cidown var somamodelo dif
1 survival.vector ~ CS_SEXO 1 2624.51240894881 -0.198627956944199 0.81985485790608 0.153000915033901 -1.29821417669423 0.194213747926069 1.1133504284787 0.603729042392027 CS_SEXOM 0 1
897 survival.vector ~ escola2 1 2625.58204930766 0.0905067280371467 1.09472887299339 0.12193085558174 0.742279118811506 0.457918240603139 1.39705777961419 0.857825154301308 escola21 0 1
1089 survival.vector ~ LVC 1 2617.78600216494 0.350111564027484 1.41922587431581 0.122157881973692 2.86605791104731 0.0041561820825042 1.81199297009523 1.11159486574698 LVCÑ_LVC 1 0
1090 survival.vector ~ NU_IDADE_N 1 2625.26491675674 -0.000427473246744527 0.999572618106926 0.000403955166237912 -1.05821953145356 0.289955368791973 1.0003805094611 0.998765379192936 NU_IDADE_N 0 1
3034 survival.vector ~ PMM 1 1455.48734693847 -7.51547722450545e-09 0.999999992484523 8.70143307734639e-09 -0.863705685914141 0.387749589345111 1.00000000988739 0.999999975081657 PMM 0 1
3070 survival.vector ~ raca2 1 2624.67023605154 0.146247352870244 1.15748245925289 0.121291573317825 1.20575031611658 0.227913777770747 1.47525447377728 0.908159010728327 raca21 0 1
3582 survival.vector ~ TRA_ESQUEM2 1 2624.56849671888 1.04018013365329 2.82972669745246 0.712677982580516 1.45953734937472 0.144417268576137 11.7697902315402 0.680331002061313 TRA_ESQUEM21 0 1
#htmlTable::htmlTable(modelomulticomtudosig)

#dataset

.Survfit <- survfit(Surv(diff, recorrencia2) ~ LVC , 
              error="greenwood", 
                    data=dataset,id=study_ID)
plot(.Survfit)

table(dataset$LVC)
## 
##   LVC Ñ_LVC 
##   635  5599