karina

# Método
###Estudo Transversal
require(RcmdrMisc)
require(dplyr)
library(colorspace, pos=33)

#notifcações match
banco <- read.csv("C:/Users/edson/Downloads/Base_de_dados_match  (1).csv", comment.char="#")
banco$recorrencia2<-0
banco$recorrencia2[banco$recorrencia!="1" & banco$diff<60]<-1
banco$recorrencia2<-as.factor(banco$recorrencia2)
#repetição

table(banco$recorrencia)

## 
##    1    2    3    4    5 
## 6234  820  142   28    6

banco<-arrange(banco,desc(banco$DT_NOTIFIC))
banco_umavez<-dplyr::distinct(banco, banco$study_ID,.keep_all=T)
banco_repete<-subset(banco,banco$recorrencia>1)
banco_repete<-dplyr::arrange(banco_repete,banco_repete$DT_NOTIFIC, banco_repete$study_ID)
banco_repete2<-dplyr::distinct(banco_repete, banco_repete$study_ID,.keep_all=T)


banco3<-dplyr::full_join(banco_umavez,banco_repete2)
tabela<-subset(banco_umavez, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES  ,       ANO   ,      recorrencia,
diff,        LVC ,        Tempo_class))

tabela <- within(tabela, {
ANO <- as.factor(ANO)
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
recorrencia <- as.factor(recorrencia)
RESULT <- as.factor(RESULT)
TRA_ESQUEM <- as.factor(TRA_ESQUEM)
})

#descritiva apenas 
edu::resumonumericooucategorico(tabela)

Characteristic	N = 6,234¹
NU_IDADE_N	4,036 (4,027, 4,048)
CS_SEXO
F	1,374 (22.04%)
M	4,860 (77.96%)
CS_GESTANT
1	9 (0.14%)
2	18 (0.29%)
3	9 (0.14%)
4	2 (0.03%)
5	971 (15.58%)
6	5,079 (81.47%)
9	146 (2.34%)
CS_RACA
1	2,859 (46.72%)
2	313 (5.11%)
3	50 (0.82%)
4	2,419 (39.53%)
5	125 (2.04%)
9	354 (5.78%)
Unknown	114
CS_ESCOL_N
0	85 (1.49%)
1	571 (9.99%)
2	271 (4.74%)
3	721 (12.62%)
4	514 (8.99%)
5	424 (7.42%)
6	869 (15.21%)
7	181 (3.17%)
8	629 (11.01%)
9	1,285 (22.48%)
10	165 (2.89%)
Unknown	519
RESULT
4	5,841 (93.70%)
5	272 (4.36%)
6	121 (1.94%)
PMM	501 (118, 4,088)
Unknown	1,887
PCRUZ
1	1,192 (19.12%)
2	759 (12.18%)
3	935 (15.00%)
4	2,577 (41.34%)
5	581 (9.32%)
6	190 (3.05%)
TRA_ESQUEM
1	4,499 (75.31%)
2	45 (0.75%)
3	80 (1.34%)
4	65 (1.09%)
5	19 (0.32%)
6	13 (0.22%)
7	18 (0.30%)
8	5 (0.08%)
9	36 (0.60%)
10	34 (0.57%)
11	43 (0.72%)
12	21 (0.35%)
99	1,096 (18.35%)
Unknown	260
MES
1	521 (8.44%)
2	437 (7.08%)
3	533 (8.64%)
4	510 (8.27%)
5	524 (8.49%)
6	515 (8.35%)
7	495 (8.02%)
8	521 (8.44%)
9	525 (8.51%)
10	547 (8.87%)
11	536 (8.69%)
12	506 (8.20%)
Unknown	64
ANO
1920	2 (0.03%)
1922	1 (0.02%)
1923	1 (0.02%)
1925	4 (0.06%)
1926	2 (0.03%)
1927	4 (0.06%)
1928	1 (0.02%)
1929	1 (0.02%)
1930	5 (0.08%)
1931	1 (0.02%)
1932	3 (0.05%)
1933	7 (0.11%)
1934	13 (0.21%)
1935	9 (0.15%)
1936	10 (0.16%)
1937	15 (0.24%)
1938	16 (0.26%)
1939	12 (0.19%)
1940	22 (0.36%)
1941	15 (0.24%)
1942	26 (0.42%)
1943	22 (0.36%)
1944	26 (0.42%)
1945	27 (0.44%)
1946	39 (0.63%)
1947	41 (0.66%)
1948	30 (0.49%)
1949	34 (0.55%)
1950	50 (0.81%)
1951	49 (0.79%)
1952	68 (1.10%)
1953	72 (1.17%)
1954	60 (0.97%)
1955	77 (1.25%)
1956	69 (1.12%)
1957	90 (1.46%)
1958	87 (1.41%)
1959	118 (1.91%)
1960	89 (1.44%)
1961	90 (1.46%)
1962	120 (1.94%)
1963	125 (2.03%)
1964	110 (1.78%)
1965	127 (2.06%)
1966	128 (2.07%)
1967	113 (1.83%)
1968	129 (2.09%)
1969	130 (2.11%)
1970	131 (2.12%)
1971	133 (2.16%)
1972	129 (2.09%)
1973	125 (2.03%)
1974	118 (1.91%)
1975	148 (2.40%)
1976	142 (2.30%)
1977	156 (2.53%)
1978	169 (2.74%)
1979	169 (2.74%)
1980	148 (2.40%)
1981	159 (2.58%)
1982	179 (2.90%)
1983	171 (2.77%)
1984	159 (2.58%)
1985	165 (2.67%)
1986	147 (2.38%)
1987	154 (2.50%)
1988	125 (2.03%)
1989	117 (1.90%)
1990	104 (1.69%)
1991	85 (1.38%)
1992	92 (1.49%)
1993	60 (0.97%)
1994	64 (1.04%)
1995	66 (1.07%)
1996	41 (0.66%)
1997	44 (0.71%)
1998	41 (0.66%)
1999	43 (0.70%)
2000	32 (0.52%)
2001	30 (0.49%)
2002	25 (0.41%)
2003	22 (0.36%)
2004	26 (0.42%)
2005	29 (0.47%)
2006	19 (0.31%)
2007	19 (0.31%)
2008	22 (0.36%)
2009	14 (0.23%)
2010	15 (0.24%)
2011	9 (0.15%)
2012	6 (0.10%)
2013	7 (0.11%)
2014	6 (0.10%)
2015	5 (0.08%)
2016	2 (0.03%)
2017	3 (0.05%)
2018	2 (0.03%)
2019	3 (0.05%)
Unknown	64
recorrencia
1	5,477 (87.86%)
2	620 (9.95%)
3	111 (1.78%)
4	20 (0.32%)
5	6 (0.10%)
diff	0 (0, 0)
LVC
LVC	635 (10.19%)
Ñ_LVC	5,599 (89.81%)
Tempo_class
maior_60_dias	475 (7.62%)
menor_igual_60_dias	5,759 (92.38%)
¹ Median (IQR); n (%)

#notificações
dim(banco)

## [1] 7230   98

#casos
dim(banco3)

## [1] 6434  100

table(banco$recorrencia2)

## 
##    0    1 
## 6784  446

table(banco$recorrencia)

## 
##    1    2    3    4    5 
## 6234  820  142   28    6

tabela<-subset(banco, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES,
diff ,        Tempo_class, recorrencia2))

tabela <- within(tabela, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
recorrencia2 <- as.factor(recorrencia2)

})

#criar variável recorrência


#teste de hiotese em banco bruto as.it is
require(gtsummary)
gtsummary::tbl_summary(tabela,by="recorrencia2")%>%add_p(simulate.p.value=TRUE)

Characteristic	0, N = 6,784¹	1, N = 446¹	p-value²
NU_IDADE_N	4,036 (4,027, 4,049)	4,035 (4,027, 4,050)	0.8
CS_SEXO			0.3
F	1,471 (21.68%)	88 (19.73%)
M	5,313 (78.32%)	358 (80.27%)
CS_GESTANT			0.3
1	10 (0.15%)	1 (0.22%)
2	22 (0.32%)	0 (0.00%)
3	10 (0.15%)	1 (0.22%)
4	3 (0.04%)	1 (0.22%)
5	1,024 (15.09%)	58 (13.00%)
6	5,556 (81.90%)	375 (84.08%)
9	159 (2.34%)	10 (2.24%)
CS_RACA
1	3,140 (47.16%)	190 (43.28%)
2	339 (5.09%)	28 (6.38%)
3	56 (0.84%)	1 (0.23%)
4	2,616 (39.29%)	176 (40.09%)
5	135 (2.03%)	5 (1.14%)
9	372 (5.59%)	39 (8.88%)
Unknown	126	7
CS_ESCOL_N			0.006
0	89 (1.43%)	2 (0.49%)
1	627 (10.08%)	36 (8.78%)
2	312 (5.02%)	16 (3.90%)
3	780 (12.54%)	41 (10.00%)
4	555 (8.93%)	41 (10.00%)
5	464 (7.46%)	17 (4.15%)
6	955 (15.36%)	69 (16.83%)
7	200 (3.22%)	9 (2.20%)
8	680 (10.94%)	45 (10.98%)
9	1,384 (22.26%)	115 (28.05%)
10	172 (2.77%)	19 (4.63%)
Unknown	566	36
RESULT			0.3
4	6,354 (93.66%)	424 (95.07%)
5	298 (4.39%)	18 (4.04%)
6	132 (1.95%)	4 (0.90%)
PMM	501 (135, 4,500)	600 (200, 7,220)	0.017
Unknown	2,076	159
PCRUZ			0.11
1	1,266 (18.66%)	70 (15.70%)
2	804 (11.85%)	46 (10.31%)
3	1,017 (14.99%)	55 (12.33%)
4	2,831 (41.73%)	208 (46.64%)
5	654 (9.64%)	51 (11.43%)
6	212 (3.12%)	16 (3.59%)
TRA_ESQUEM	1 (1, 1)	1 (1, 99)	<0.001
Unknown	280	32
MES			0.059
1	558 (8.31%)	20 (4.52%)
2	467 (6.95%)	34 (7.69%)
3	573 (8.53%)	32 (7.24%)
4	558 (8.31%)	42 (9.50%)
5	575 (8.56%)	48 (10.86%)
6	567 (8.44%)	39 (8.82%)
7	544 (8.10%)	34 (7.69%)
8	566 (8.43%)	27 (6.11%)
9	583 (8.68%)	40 (9.05%)
10	609 (9.07%)	52 (11.76%)
11	573 (8.53%)	33 (7.47%)
12	545 (8.11%)	41 (9.28%)
Unknown	66	4
diff	0 (0, 0)	16 (1, 43)	<0.001
Tempo_class			<0.001
maior_60_dias	547 (8.06%)	0 (0.00%)
menor_igual_60_dias	6,237 (91.94%)	446 (100.00%)
¹ Median (IQR); n (%) ² Wilcoxon rank sum test; Pearson's Chi-squared test; Fisher's exact test

banco <- within(banco, {
  recorrencia2 <- as.factor(recorrencia2)
})
with(banco, piechart(recorrencia2, xlab="", ylab="", main="recorrencia2", col=palette()[2:3],
   scale="percent"))

#Casos por ano

library(ggplot2)
library(dplyr)
library(plotly)
library(hrbrthemes)

data<-banco
data$DT_NOTIFIC <- as.Date(data$DT_NOTIFIC)
a<-table(data$DT_NOTIFIC,data$NU_NOTIFIC)
a<-as.data.frame(a)
a<-subset(a,a$Freq==1)
a$Var1<-as.Date(a$Var1)
data<-a
# Usual area chart
p <- data %>%
  ggplot( aes(x=Var1, y=Freq)) +
    geom_area(fill="#69b3a4", alpha=0.5) +
    geom_line(color="#69b3a4") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
p

#tempo de recorrencia

data<-banco
data<-subset(data, banco$recorrencia2==1)
Boxplot( ~ diff, data=data, id=list(method="y"))

#recorrencia

# Load dataset from github



data$DT_NOTIFIC <- as.Date(data$DT_NOTIFIC)
a<-table(data$DT_NOTIFIC,data$NU_NOTIFIC)
a<-as.data.frame(a)
a<-subset(a,a$Freq==1)
a$Var1<-as.Date(a$Var1)
data<-a
# Usual area chart
p <- data %>%
  ggplot( aes(x=Var1, y=Freq)) +
    geom_area(fill="#69b3a4", alpha=0.5) +
    geom_line(color="#69b3a4") +
    theme_ipsum()

# Turn it interactive with ggplotly
p <- ggplotly(p)
p

## Load survival package
library(survival)

#remoção de duplicatas

dataset    <- banco

#dataset$recorrencia2<-0
#dataset$recorrencia2[dataset$recorrencia>2 & dataset$diff<60]<-1
#banco$recorrencia2<-as.factor(banco$recorrencia2)

dataset<-arrange(dataset,desc(dataset$DT_NOTIFIC))

dataset<-dplyr::distinct(dataset,dataset$study_ID,.keep_all=T)


#levando em conta subset de recorrências
#Script de Cox Exaustivo

options(digits=3)


dataset<-subset(dataset, select=c(NU_IDADE_N , CS_SEXO,     CS_GESTANT,  CS_RACA,     CS_ESCOL_N,  RESULT,     
PMM       ,  PCRUZ   ,    TRA_ESQUEM , MES     ,      recorrencia2,
diff,        LVC ,        Tempo_class,study_ID))


dataset <- within(dataset, {
CS_ESCOL_N <- as.factor(CS_ESCOL_N)
CS_GESTANT <- as.factor(CS_GESTANT)
CS_RACA <- as.factor(CS_RACA)
MES <- as.factor(MES)
PCRUZ <- as.factor(PCRUZ)
RESULT <- as.factor(RESULT)
TRA_ESQUEM <- as.factor(TRA_ESQUEM)
})
dataset$diff[dataset$diff>60]<-NA


## Create outcome variable, then survival object
dataset <- within(dataset, {
  status.dichotomous <- recorrencia2 != 0
  survival.vector    <- Surv(diff, recorrencia2)
})

dataset$TRA_ESQUEM2<-0
dataset$TRA_ESQUEM2[dataset$TRA_ESQUEM=="4"]<-1
dataset$TRA_ESQUEM2<-as.factor(dataset$TRA_ESQUEM2)

dataset$gestante2<-0
dataset$gestante2[dataset$CS_GESTANT!=1]<-1
dataset$gestante2<-as.factor(dataset$gestante2)

dataset$raca2<-0
dataset$raca2[dataset$CS_RACA!=1]<-1
dataset$raca2<-as.factor(dataset$raca2)

dataset$escola2<-0
dataset$escola2[dataset$CS_ESCOL_N=="7"|dataset$CS_ESCOL_N=="8"|dataset$CS_ESCOL_N=="9"|dataset$CS_ESCOL_N=="10"]<-1
dataset$escola2<-as.factor(dataset$escola2)


## Create vectors for outcome and predictors
outcome    <- c("survival.vector")
#predictors <- c("esquema2","cd8_cd4", "idade","cv","COR","gon","Grupo6")
predictors <- c('NU_IDADE_N' , 'CS_SEXO',  'raca2',     'escola2',  'RESULT',     'PMM'       ,  'PCRUZ'   ,    'TRA_ESQUEM2'   ,   'LVC')






## The lines below should not need modification.

## Create list of models
list.of.models <- lapply(seq_along((predictors)), function(n) {
  
  left.hand.side  <- outcome
  right.hand.side <- apply(X = combn(predictors, n), MARGIN = 2, paste, collapse = " + ")
  
  paste(left.hand.side, right.hand.side, sep = "  ~  ")
})

## Convert to a 

vector.of.models <- unlist(list.of.models)

## Fit coxph to all models
list.of.fits <- lapply(vector.of.models, function(x) {
  
  formula    <- as.formula(x)
  fit        <- coxph(formula, data = dataset, id=study_ID)
  suma<- summary(fit)
  result.AIC <- extractAIC(fit)
  
  data.frame(num.predictors = result.AIC[1],
             AIC            = result.AIC[2],
             model          = x,
             suma= suma[["coefficients"]])
})


## Collapse to a data frame
result <- do.call(rbind, list.of.fits)
result$ciup<-exp(result$suma.coef + 2*result$suma.se.coef.)
result$cidown<-exp(result$suma.coef - 2*result$suma.se.coef.)
result$var<- row.names(result)


## Sort and print
library(doBy)
a<-orderBy(~ AIC + model, result)
a<-as.data.frame(a, digits=2)
result$pvalorsig<-0
result$pvalorsig[result$suma.Pr...z..<0.05]<-1

f<-result %>%
group_by(model) %>%
summarise(somamodelo= sum(pvalorsig))
a<-merge(a,f)

a$dif<-a$num.predictors- a$somamodelo




#filtrar modelos com todas as var em todos os fatores < 0.05
modelocomtudosig<-subset(a,a$dif<1 & a$suma.Pr...z..<0.05)
#filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas as var em todos os fatores < 0.05
modelomulticomtudosig<-subset(a,a$dif<1 & a$num.predictors>1)
##filtrar modelos com mais de um preditor, vai ser todos uni se binária ou numérica todas -1 as var em todos os fatores < 0.05, para pegar multinomial
modelomulticomtudosigmenos1<-subset(a,a$dif<2 & a$num.predictors>1)
modelomulticomtudosigmenos3<-subset(a,a$dif<2 & a$num.predictors>4)
modelo_uni<-subset(a,a$num.predictors==1)

htmlTable::htmlTable(modelo_uni)

	model	num.predictors	AIC	suma.coef	suma.exp.coef.	suma.se.coef.	suma.z	suma.Pr…z..	ciup	cidown	var	somamodelo	dif
1	survival.vector ~ CS_SEXO	1	2624.51240894881	-0.198627956944199	0.81985485790608	0.153000915033901	-1.29821417669423	0.194213747926069	1.1133504284787	0.603729042392027	CS_SEXOM	0	1
897	survival.vector ~ escola2	1	2625.58204930766	0.0905067280371467	1.09472887299339	0.12193085558174	0.742279118811506	0.457918240603139	1.39705777961419	0.857825154301308	escola21	0	1
1089	survival.vector ~ LVC	1	2617.78600216494	0.350111564027484	1.41922587431581	0.122157881973692	2.86605791104731	0.0041561820825042	1.81199297009523	1.11159486574698	LVCÑ_LVC	1	0
1090	survival.vector ~ NU_IDADE_N	1	2625.26491675674	-0.000427473246744527	0.999572618106926	0.000403955166237912	-1.05821953145356	0.289955368791973	1.0003805094611	0.998765379192936	NU_IDADE_N	0	1
3034	survival.vector ~ PMM	1	1455.48734693847	-7.51547722450545e-09	0.999999992484523	8.70143307734639e-09	-0.863705685914141	0.387749589345111	1.00000000988739	0.999999975081657	PMM	0	1
3070	survival.vector ~ raca2	1	2624.67023605154	0.146247352870244	1.15748245925289	0.121291573317825	1.20575031611658	0.227913777770747	1.47525447377728	0.908159010728327	raca21	0	1
3582	survival.vector ~ TRA_ESQUEM2	1	2624.56849671888	1.04018013365329	2.82972669745246	0.712677982580516	1.45953734937472	0.144417268576137	11.7697902315402	0.680331002061313	TRA_ESQUEM21	0	1

#htmlTable::htmlTable(modelomulticomtudosig)

#dataset

.Survfit <- survfit(Surv(diff, recorrencia2) ~ LVC , 
              error="greenwood", 
                    data=dataset,id=study_ID)
plot(.Survfit)

table(dataset$LVC)

## 
##   LVC Ñ_LVC 
##   635  5599

karina

Eduardo Peixoto

13/07/2021