DATA MANAGEMENT
Delete rows whose variables are all NA
# Keep the rows where at least one of var1, var2, var3 is not NA, i.e. drop
# the rows where the three variables are all missing. rowSums() on the logical
# matrix returned by is.na() avoids apply(), which would first coerce the data
# frame to a matrix.
df <- df[rowSums(!is.na(df[, c("var1", "var2", "var3")])) > 0, ]
# Variant over every column: drop the rows that are entirely NA. This is only
# equivalent to the line above when df contains no other column.
df <- df[rowSums(is.na(df)) != ncol(df), ]
Sum a variable by group
Using aggregate
# Sum of Var1 within each level of Var2 (one output row per group)
aggregate(
  x = df$Var1,
  by = list(Var2 = df$Var2),
  FUN = sum
)
Groupement selon les modalités de Var2 et somme de la Var1. Multiple grouping dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be combined via cbind:
# Sum several metrics of the same type at once by binding them with cbind().
# The comma between the cbind() call and `by` is required (it was missing),
# and every column is now taken from the same data frame as the grouping
# variable. Naming the cbind() columns gives readable output column names.
aggregate(
  cbind(Frequency = df$Frequency, Metric2 = df$Metric2, Metric3 = df$Metric3),
  by = list(Var2 = df$Var2),
  FUN = sum
)
Exportation de données
# Writes a df into a file
{utils} write.table(df, file, append, quote, sep, na, row.names, col.names, ...)
{utils} write.csv2(df, file, quote, na, row.names, ...) # sep = ";" and dec = "," are fixed; append, sep and col.names cannot be changed
# Writes an ASCII text representation into an R object
{base} dput(df, file, control)
Création de données
rep(x,times)
rep(x,times) répète times fois la valeur x; utilisez each=n pour répéter n fois chaque élément de x
rep(c(1, 2, 3), times = 2) # whole vector twice: 1 2 3 1 2 3
rep(c(1, 2, 3), each = 2)  # each element twice: 1 1 2 2 3 3
{base} seq(from, to, by, length.out) {base} rep(x, times, each, length.out)
# {base} Integer sequence 1..100 (the c() wrapper around 1:100 was redundant)
df$var1 <- 1:100
# {base} Each value is repeated 4 times in turn ("val1" x4, NA x4, "val2" x4)
# and the pattern is recycled or truncated to exactly nrow(df) elements.
# `length.out` fixes the length of the result, so the former `time = 3`
# (a partial match of `times`) had no effect and is dropped.
df$var2 <- rep(
  c("val1", NA_character_, "val2"),
  each = 4,
  length.out = nrow(df)
)
Création de data frames
# {base} Build a data frame column by column; the length-1 value "A" is
# recycled to every row.
data.frame(v = 1:4, ch = c("a", "b", "c", "d"), lettre = "A")
# {base} All-NA data frame of 100 rows x 10 columns. The result must be
# assigned before it can be renamed; columns are named var1 ... var10.
df <- as.data.frame(matrix(nrow = 100, ncol = 10))
names(df) <- paste0("var", seq_len(ncol(df)))
# From another data frame: keep a subset of its columns
df2 <- df1[, c("var1", "var2", "var3")]
Remplacement de valeurs dans df
# {base} Set var2 to "val2" on the rows where var1 equals "val1".
# %in% yields FALSE (never NA) when var1 is missing, so those rows are left
# untouched, exactly as with which().
df[df$var1 %in% "val1", "var2"] <- "val2"
Indexer des vecteurs
vec[n]                 # n-th element
vec[-n]                # everything except the n-th element
vec[1:n]               # first n elements
vec[-(1:n)]            # everything except the first n elements
vec[c(1, 2, 4)]        # elements 1, 2 and 4
vec["val1"]            # element named "val1"
vec[vec > 3]           # elements greater than 3
vec[vec > 3 & vec < 5] # elements greater than 3 and lower than 5
# {base} For each element of vec1, is it present in vec2? (logical vector)
vec1 %in% vec2
Manipulation Dates et Durées
Reshaping data with the ‘tidyr’ package
# Four rows of the built-in USArrests data set, with the state names (stored
# as row names) copied into a leading `state` column.
my_data <- local({
  picked <- USArrests[c(1, 10, 20, 30), ]
  cbind(state = rownames(picked), picked)
})
my_data
Gather()
# Long format: one row per (state, arrest attribute) pair.
# pivot_longer() is tidyr's current replacement for the superseded gather();
# the former call was
#   gather(my_data, key = "arrest_attribute", value = "test", -state)
# Note: rows now come out grouped by state (gather() grouped them by
# attribute) and the result is a tibble.
my_data2 <- tidyr::pivot_longer(
  my_data,
  cols = -state,
  names_to = "arrest_attribute",
  values_to = "test"
)
my_data2
# Entrer manuellement des données dans data frame
Extraction de données
#### Éliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément
# Show the rows where at least one of var1, var2, var3 is not NA
df[rowSums(!is.na(df[, c("var1", "var2", "var3")])) > 0, ]
STATS DESCRIPTIVES
Variables qualitatives
# One-way tabulation of SEXE with tab1() from the epiDisplay package
epiDisplay::tab1(drep$SEXE)
Variables Quantitatives
# Single variable:
# epi.descriptives(drep$AGE)
# age <- epi.descriptives(drep$AGE)$arithmetic
#
# Table of variables: one row of arithmetic descriptive statistics
# (n, mean, sd, q25, q50, q75, lower, upper, min, max, na) per quantitative
# variable, with the variable name in the first column.
# Everything is built inside local() so that only `tab` is left in the
# workspace and base::colnames() is never masked by a helper vector.
tab <- local({
  varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
  # `$arithmetic` is spelled out instead of relying on partial matching of `$a`
  stats <- lapply(
    varquan,
    function(v) epi.descriptives(drep[[v]])$arithmetic
  )
  # Bind all rows in one call rather than growing the table inside a loop
  data.frame(
    varquan = varquan,
    do.call(rbind, stats),
    stringsAsFactors = FALSE
  )
})
tab
Graphiques
Fonction hist
# Create the variable PAM = (PAS + 2 * PAD) / 3
drep <- transform(drep, PAM = (PAS + 2 * PAD) / 3)
# Histogram of counts
hist(
  drep$PAM,
  col = "grey", border = "white",
  xlim = c(50, 120), xlab = "Pression artérielle moyenne (mmHg)",
  ylim = c(0, 50), ylab = "Effectifs",
  main = NULL
)
# Histogram on the density scale (freq = FALSE replaces the partially matched
# `prob = TRUE`)
hist(
  drep$PAM,
  freq = FALSE,
  col = "grey", border = "white",
  xlim = c(50, 120), xlab = "Pression artérielle moyenne (mmHg)",
  ylim = c(0, 0.05), ylab = "Densités",
  main = NULL
)
# Kernel density estimate of the distribution
lines(density(drep$PAM, na.rm = TRUE), lwd = 2, col = "orange")
# Normal density with the sample mean and standard deviation.
# sort() drops the NAs; x[order(x)] kept them, which made mean() and sd()
# return NA, so the curve was not drawn whenever PAM had a missing value.
PAMord <- sort(drep$PAM)
lines(PAMord, dnorm(PAMord, mean = mean(PAMord), sd = sd(PAMord)), col = "red")
# Add a text label with the number of non-missing values
text(110, 0.04, paste("N =", sum(!is.na(drep$PAM))), cex = 0.9)
Histogramme : 1 variable continue
# Histogram of imc (counts) with a stat_density() layer, titled with the
# number of rows of `dataset`.
# The former par(mfrow = c(3, 3)) call was removed: par() settings only apply
# to base graphics and do not lay out ggplot2 plots.
# NOTE(review): stat_density() puts a density on the y axis while the
# histogram shows counts - confirm both layers are meant to share one scale.
a <- ggplot(dataset, aes(imc))
a +
  geom_histogram(color = "grey", fill = "ivory3", linetype = 1) +
  # geom_freqpoly(color = "salmon") +
  stat_density() +
  labs(
    x = expression(paste("IMC en ", kg/m^2)),
    y = "Effectif",
    title = paste("Répartition des", nrow(dataset), "individus selon leur IMC")
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(
      color = "grey20", size = 15, face = "bold.italic", hjust = 0.5
    )
  )
1 Variable discrète et 1 variable continue
# Box plot of imc by sexe. The y-axis label now uses kg/m^2, the unit already
# used for IMC on the histogram's axis; the former "cm2" was wrong.
a <- ggplot(dataset, aes(sexe, imc))
a +
  theme_bw() +
  geom_boxplot() +
  xlab("Sexe") +
  ylab(expression(paste("IMC en ", kg/m^2)))
TESTS STATISTIQUES
Packages nécessaires : epiDisplay, epiR et/ou epicalc
# library() attaches one package per call: in
# library(epiDisplay, epiR, epicalc) the second and third names were matched
# to the `help` and `pos` arguments instead of being loaded.
library(epiDisplay)
library(epiR)
# epicalc is optional ("et/ou"): attach it only when it is installed
if (requireNamespace("epicalc", quietly = TRUE)) {
  library(epicalc)
}
Variables qualitatives : test du Chi2
- Tableau de contingence observé
tabpct
# Observed contingency table of SEXE by TYPEHB
epiDisplay::tabpct(drep$SEXE, drep$TYPEHB)
- Tableau de contingence théorique
# Expected counts of the SEXE x TYPEHB table, taken from the chi-squared test
stats::chisq.test(x = drep$SEXE, y = drep$TYPEHB)[["expected"]]
