DATA MANAGEMENT

Delete rows whose variables are NA

# Keep only the rows where at least one of var1/var2/var3 is observed
df <- df[rowSums(!is.na(df[, c("var1", "var2", "var3")])) > 0, ]
# or: the same rule applied across every column of df
df <- df[rowSums(!is.na(df)) > 0, ]

Sum a variable by group

Using aggregate

# Total of Var1 within each level of Var2
aggregate(
  x = df$Var1,
  by = list(Var2 = df$Var2),
  FUN = sum
)

Groupement selon les modalités de Var2 et somme de Var1. Multiple grouping dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be combined via cbind:

# Sum several metrics per level of Var2 in a single call. cbind() binds the
# metrics into one matrix; the names given inside cbind() become the column
# names of the result. The metrics and the grouping vector are taken from the
# same data frame so that their rows are guaranteed to line up.
aggregate(
  cbind(
    Frequency = df$Frequency,
    Metric2 = df$Metric2,
    Metric3 = df$Metric3
  ),
  by = list(Var2 = df$Var2),
  FUN = sum
)

Extract first n characters from a string

# Leading n characters of each element of x
substr(x, start = 1, stop = n)

Extract last n characters from String

# Number of trailing characters to keep
n_last <- n
# Slice from position (length - n_last + 1) through the end of each string
substr(x, start = nchar(x) - n_last + 1, stop = nchar(x))

Importation de données

Données texte : read.table()

# Read a whitespace-delimited text file that has a header row and uses "."
# as the decimal mark
d <- read.table(
  file = "donnees.txt",
  header = TRUE,
  sep = "",
  dec = "."
)

Remarques : l’argument dec= indique le séparateur décimal ; header= vaut TRUE ou FALSE selon que le fichier contient ou non une ligne d’en-tête.

# One reader per line; pick the one that matches the file format.
# NOTE(review): the separator literal of the original note was lost to quote
# mangling -- "," is assumed here; adjust it to the actual file.
df <- read.table(file, sep = ",", header = TRUE, na.strings = "")
df <- read.csv2(file)   # ";" as field separator, "," as decimal mark
df <- read.delim(file)  # tab as field separator, "." as decimal mark
# read.xls() is not part of base R (presumably gdata::read.xls -- confirm)
df <- read.xls(file)

Exportation de données

# Write a data frame to a text file  {utils}
# (argument sketch: each name stands for the value to pass; pass them by name)
write.table(df, file, append, quote, sep, na, row.names, col.names, ...)
# Same, as ";"-separated CSV with "," as decimal mark  {utils}
# write.csv2() ignores append, sep and col.names (with a warning)
write.csv2(df, file, quote, na, row.names, ...)

# Write an ASCII text representation of an R object to a file  {base}
dput(df, file, control)

Création de données

seq(from, to)

# Integers from 5 to 10 inclusive
seq(from = 5, to = 10)

rep(x,times)

rep(x,times) répète times fois la valeur x; utilisez each=n pour répéter n fois chaque élément de x

rep(c(1, 2, 3), times = 2)  # whole vector twice: 1 2 3 1 2 3
rep(c(1, 2, 3), each = 2)   # each element twice: 1 1 2 2 3 3

# Argument sketches, one call per line  {base}
# (pass the arguments by name; the sequence-length argument is length.out)
seq(from, to, by, length.out)
rep(x, times, each)

df$var1 <- 1:100  # {base}
# Each value is repeated 4 times in turn, and the resulting pattern is
# recycled until every row of df is filled. `times` is not passed: rep()
# ignores it whenever length.out is given.
df$var2 <- rep(  # {base}
  c("val1", NA_character_, "val2"),
  each = 4,
  length.out = nrow(df)
)

Création de data frames

# Build a data frame column by column; the scalar "A" is recycled  {base}
data.frame(v = 1:4, ch = c("a", "b", "c", "d"), lettre = "A")
# Empty (all-NA) 100 x 10 data frame, then name its columns  {base}
df <- as.data.frame(matrix(ncol = 10, nrow = 100))
names(df) <- paste0("var", 1:10)
# From another data frame
df2 <- df1[, c("var1", "var2", "var3")]

Remplacement de valeurs dans df

# Set var2 to "val2" on the rows where var1 equals "val1"; rows where var1
# is NA are left untouched  {base}
df[!is.na(df$var1) & df$var1 == "val1", "var2"] <- "val2"

Indexer des vecteurs

vec[n]                  # n-th element
vec[-n]                 # everything except the n-th element
vec[1:n]                # first n elements
vec[-(1:n)]             # everything except the first n elements
vec[c(1, 2, 4)]         # elements at positions 1, 2 and 4
vec["val1"]             # element named "val1"
vec[vec > 3]            # elements greater than 3
vec[vec > 3 & vec < 5]  # elements strictly between 3 and 5
vec1 %in% vec2          # TRUE where an element of vec1 occurs in vec2  {base}

Information sur les objets

is.na(vec); is.null(x); length(vec); dim(df); nrow(df)  # x: a value or a vector
class(x); unclass(x)
which.min(vec); which.max(vec)  # position of the first minimum / maximum
rev(vec); sort(vec); order(vec)
z <- setdiff(vec1, vec2)  # elements of vec1 that are not in vec2

Manipulation Dates et Durées

Transformation d’un jeu de données

# Add a derived column; column names are looked up inside df
transform(df, newvar = var1 + var2)
# Conditional column: ifelse(test, yes, no) is evaluated element-wise
# (illustrative condition -- replace it with the rule you need)
transform(df, newvar = ifelse(var1 > var2, var1, var2))

Reshaping data ‘tidyr’ package

# Take four states from the built-in USArrests data set
my_data <- USArrests[c(1, 10, 20, 30), , drop = FALSE]
# Promote the row names to a regular first column called `state`
my_data <- cbind(state = row.names(my_data), my_data)
print(my_data)

Gather()

# Stack every column except `state` into key/value pairs (long format)
my_data2 <- gather(
  data = my_data,
  key = "arrest_attribute",
  value = "test",
  -state
)
print(my_data2)
# Manually entering data into a data frame

Extraction de données

Éliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément

# Rows of df where at least one of var1/var2/var3 is not NA
df[rowSums(!is.na(df[, c("var1", "var2", "var3")])) > 0, ]

STATS DESCRIPTIVES

Variables qualitatives

# Tabulate the categorical variable SEXE of drep
tab1(drep$SEXE) # {epiDisplay}

Variables Quantitatives

# Single-variable version, kept for reference:
#epi.descriptives(drep$AGE)
#age <- epi.descriptives(drep$AGE)$a
# Table of descriptive statistics, one row per quantitative variable
varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
# Intended column labels. NOTE(review): this variable has the same name as
# base::colnames(); the `colnames(tab) <-` call below still finds the function.
colnames <- c("n", "mean", "sd", "q25", "q50", "q75", "lower", "upper", "min", "max", "na")
# Start from an empty 0-row, 11-column data frame carrying those labels
tab <- as.data.frame(matrix(ncol = 11, nrow = 0 )); colnames(tab) <- colnames;
for (var in varquan) {
  # Append the `$a` component of epi.descriptives() for this variable
  # (presumably a one-row summary from epiR -- confirm its shape and columns)
  x <- epi.descriptives(drep[,var])$a; tab <- rbind(tab, x); rm(x)
}
# Add the variable names as a new column, move it to the front (position 12
# assumes the summary has 11 columns), then remove the temporaries
tab$varquan <- varquan; tab <- tab %>% dplyr::select(12, 1:11); rm(varquan, var, colnames)
tab

Graphiques

Fonction hist

# Derived variable: mean arterial pressure, (PAS + 2 * PAD) / 3
drep <- transform(drep, PAM = (PAS + 2 * PAD) / 3)
# Histogram of counts
hist(drep$PAM, col = "grey", border = "white", xlim = c(50, 120),
     xlab = "Pression artérielle moyenne (mmHg)", ylim = c(0, 50),
     ylab = "Effectifs", main = NULL)
# Histogram on the density scale
hist(drep$PAM, freq = FALSE, col = "grey", border = "white", xlim = c(50, 120),
     xlab = "Pression artérielle moyenne (mmHg)", ylim = c(0, 0.05),
     ylab = "Densités", main = NULL)
# Kernel density estimate of the observed distribution
lines(density(drep$PAM, na.rm = TRUE), lwd = 2, col = "orange")
# Normal density using the sample mean and standard deviation.
# sort() drops missing values, so a single NA in PAM no longer turns mean()
# and sd() into NA (which silently suppressed the whole curve).
PAMord <- sort(drep$PAM)
lines(PAMord, dnorm(PAMord, mean = mean(PAMord), sd = sd(PAMord)), col = "red")
# Annotate the plot with the number of non-missing values
text(110, 0.04, paste("N =", sum(complete.cases(drep$PAM))), cex = 0.9)

Histogramme : 1 variable continue

# Histogram of IMC with a density layer on top.
# No par(mfrow = ...) call here: it only affects base graphics, has no effect
# on ggplot2 output (drawn with grid), and would leave a 3 x 3 layout active
# for any later base plot.
a <- ggplot(dataset, aes(imc))
a + geom_histogram(color = "grey", fill = "ivory3", linetype = 1) +
  #geom_freqpoly(color = "salmon") +
  # NOTE(review): stat_density() draws on the density scale while the
  # histogram shows counts -- confirm both layers are meant to share one axis
  stat_density() +
  labs(
    x = expression(paste("IMC en ", kg/m^2)),
    y = "Effectif",
    title = paste("Répartition des", nrow(dataset), "individus selon leur IMC")
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(
      color = "grey20", size = 15, face = "bold.italic", hjust = 0.5
    )
  )

1 Variable discrète et 1 variable continue

# Box plot of IMC by sex. The y label uses kg/m^2, the unit of a body mass
# index (the previous label said cm2).
a <- ggplot(dataset, aes(sexe, imc))
a + theme_bw() + geom_boxplot() + xlab("Sexe") +
  ylab(expression(paste("IMC en ", kg/m^2)))

TESTS STATISTIQUES

Packages nécessaires : epiDisplay, epiR et/ou epicalc

# library() attaches one package per call: its second and third positional
# arguments are `help` and `pos`, not additional packages.
library(epiDisplay)
library(epiR)
# library(epicalc)  # optional alternative; uncomment if it is installed

Variables qualitatives : test du Chi2

  1. Tableau de contingence observé tabpct
# Observed cross-tabulation of SEXE by TYPEHB
# (tabpct is presumably epiDisplay::tabpct -- confirm)
tabpct(drep$SEXE, drep$TYPEHB)
  2. Tableau de contingence théorique
# Expected counts under independence, read from the chi-squared test result
chisq.test(x = drep$SEXE, y = drep$TYPEHB)[["expected"]]
