Modulos
library(readxl)
library(readr)
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#install.packages("Methy1IT")
library("FactoMineR")
## Warning: package 'FactoMineR' was built under R version 4.0.5
#install.packages("viridis")
library(viridis)
## Warning: package 'viridis' was built under R version 4.0.5
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.0.5
#install.packages("paletteer")
library(paletteer)
## Warning: package 'paletteer' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.5
library(foreign)
library(nnet)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("report")
library(report)
## Warning: package 'report' was built under R version 4.0.5
library(mlogit)
## Warning: package 'mlogit' was built under R version 4.0.5
## Loading required package: dfidx
## Warning: package 'dfidx' was built under R version 4.0.5
##
## Attaching package: 'dfidx'
## The following object is masked from 'package:stats':
##
## filter
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.5
library(caret)
# if (!require(remotes)) {
# install.packages("remotes")
# }
# remotes::install_github('jorvlan/raincloudplots')
library(raincloudplots)
Leitura dos dados
# snv_data = as.data.frame(read.csv("snv_data.csv"))
# #snv_data[is.na(snv_data)] = 0
# snv_data = snv_data[,-c(1)]
# snv_data[is.na(snv_data)] = 0
# Load the complete SNV table from "snv_data.csv" and prepare it for analysis.
# read.csv() already returns a data.frame, so no further conversion is needed.
snv_data_complete <- read.csv("snv_data.csv")
# Discard the first column (presumably an exported row index -- TODO confirm).
snv_data_complete <- snv_data_complete[, -1]
# Treat the clade label as a categorical variable.
snv_data_complete$clade <- as.factor(snv_data_complete$clade)
# Fill missing values with 0. Note that `clade` is already a factor at this
# point, so a missing clade label would not be filled by this assignment.
snv_data_complete[is.na(snv_data_complete)] <- 0
# Load the reduced SNV table. The author tagged this input "> 200"
# (NOTE(review): what that threshold refers to is not visible here -- confirm).
snv_data <- read.csv("snv_data_r.csv") # > 200
# Fill missing values with 0 while every column still has its raw type.
snv_data[is.na(snv_data)] <- 0
# Drop the first two columns. Column index lists left behind by the author,
# apparently further candidates for removal (not applied):
#   7,10,11,15,16,17,18,19,20,21,22
#   3,4,5,6,7,8,9,10,14,15,16,17,18,19,21
snv_data <- snv_data[, -(1:2)]
# Print the table for inspection.
snv_data
# Alternative input kept for reference (tagged "> 5000" by the author):
# snv_data <- read.csv("snv_data_r2.csv")
# # snv_data[is.na(snv_data)] <- 0
# snv_data <- snv_data[, -(1:2)]
# snv_data
# Treat the clade label as a categorical variable.
snv_data$clade <- as.factor(snv_data$clade)
# Sample counts per clade in the reduced table. Some clades/variants are very
# sparsely represented: the plan is to remove every clade with fewer than 200
# samples/patients, except 19A, which is kept because it is the reference
# category for the multinomial logistic regression.
table(snv_data[["clade"]])
##
## 19A 19B 20A 20B 20C
## 137 60 1073 1042 71
## 20D 20E (EU1) 20G 20H (Beta, V2) 20I (Alpha, V1)
## 120 1273 16 117 4998
## 20J (Gamma, V3) 21A (Delta) 21B (Kappa) 21D (Eta) 21F (Iota)
## 198 116 9 25 2
## 21G (Lambda) 21H (Mu) 21I (Delta) 21J (Delta)
## 2 24 290 10537
# Sample counts per clade in the complete table. Clades marked for removal
# (below 200 samples): 19B, 20C, 20D, 20G, 20H, 20J, 21A, 21B, 21D, 21F, 21G,
# 21H, 21I.
# NOTE(review): per the counts below, 21I (Delta) has 290 samples, which is not
# below 200, and 20J (198 samples) is missing from the `clades_sub_200` vector
# used for the actual removal -- confirm which clades are meant to be dropped.
table(snv_data_complete[["clade"]])
##
## 19A 19B 20A 20B 20C
## 137 60 1073 1042 71
## 20D 20E (EU1) 20G 20H (Beta, V2) 20I (Alpha, V1)
## 120 1273 16 117 4998
## 20J (Gamma, V3) 21A (Delta) 21B (Kappa) 21D (Eta) 21F (Iota)
## 198 116 9 25 2
## 21G (Lambda) 21H (Mu) 21I (Delta) 21J (Delta)
## 2 24 290 10537
# Clades to drop from the complete table before modelling.
# NOTE(review): the stated rule is "fewer than 200 samples", but this list
# includes "21I (Delta)" (290 samples) and omits "20J (Gamma, V3)"
# (198 samples). The list is left unchanged here because the recorded results
# below were produced with it -- confirm the intended selection.
clades_sub_200 <- c(
  "19B", "20C", "20D", "20G", "20H (Beta, V2)", "21A (Delta)",
  "21B (Kappa)", "21D (Eta)", "21F (Iota)", "21G (Lambda)", "21H (Mu)",
  "21I (Delta)"
)
# Remove all listed clades with one vectorised filter instead of re-subsetting
# the whole data frame once per clade. `%in%` never returns NA, so a row with
# a missing clade label is kept as-is rather than being turned into an all-NA
# row, which is what indexing with the NA produced by `==` would do.
snv_data_complete <- snv_data_complete[
  !(snv_data_complete$clade %in% clades_sub_200),
]
# The factor levels of the removed clades are still present, so they are
# listed with a count of 0.
table(snv_data_complete$clade)
##
## 19A 19B 20A 20B 20C
## 137 0 1073 1042 0
## 20D 20E (EU1) 20G 20H (Beta, V2) 20I (Alpha, V1)
## 0 1273 0 0 4998
## 20J (Gamma, V3) 21A (Delta) 21B (Kappa) 21D (Eta) 21F (Iota)
## 198 0 0 0 0
## 21G (Lambda) 21H (Mu) 21I (Delta) 21J (Delta)
## 0 0 0 10537
# Print the filtered table: removing the clades listed above takes the data
# from 20110 samples down to 19258.
snv_data_complete