DATA MANAGEMENT
Delete rows whose variables are NA
df <- df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]
# ou
df <- df[rowSums(is.na(df)) != ncol(df),]
Sum a variable by group
Using aggregate
aggregate(df$Var1, by = list(Var2 = df$Var2), FUN=sum)
Groupement selon les modalités de Var2 et somme de Var1. Multiple grouping dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be combined via cbind:
aggregate(cbind(x$Frequency, x$Metric2, x$Metric3) by = list(Var2 = df$Var2), FUN=sum)
Exportation de données
# Writes a df into a file
{utils} write.table(df, file, append, quote, sep, na, rownames, colnames, ...)
{utils} write.csv2(df, file, append, quote, sep, na, rownames, colnames, ...)
# Writes an ASCII text representation into an R object
{base} dput(df, file, control)
Création de données
rep(x,times)
rep(x,times) répète times fois la valeur x; utilisez each=n pour répéter n fois chaque élément de x
rep(c(1,2,3),2)
rep(c(1,2,3),each=2)
{base} seq(from, to, by, length) {base} rep(x, times, each)
df$var1 <- c(1:100) #{base}
{base} df$var2 <- rep(c("val1", NA_character_, "val2"), time = 3, each = 4, length.out = nrow(df))
Création de data frames
{base} data.frame(v=1:4, ch=c("a", "b", "c", "d"), lettre="A")
{base} as.data.frame(matrix(ncol = 10, nrow = 100)); names(df) <- c("var1", ...)
# From another data frame
df2 <- df1[, c("var1", "var2", "var3")]
Remplacement de valeurs dans df
df[which(df$var1 == "val1"), "var2"] <- "val2" # {base}
Indexer des vecteurs
vec[n]; vec[-n]; vec[1:n]; vec[-(1:n)]; vec[c(1,2,4)]; vec["val1"]; vec[vec>3]; vec[vec>3 & vec<5]
{base} vec1 %in% vec2
Manipulation Dates et Durées
Reshaping data ‘tidyr’ package
my_data <- USArrests[c(1, 10, 20, 30), ]
my_data <- cbind(state = rownames(my_data), my_data)
my_data
Gather()
my_data2 <- gather(my_data, key = "arrest_attribute", value = "test", -state)
my_data2
# Entrer manuellement des données dans data frame
Extraction de données #### Eliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément
df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]
STATS DESCRIPTIVES
Variables qualitatives
tab1(drep$SEXE) # {epiDisplay}
Variables Quantitatives
#epi.descriptives(drep$AGE)
#age <- epi.descriptives(drep$AGE)$a
# Tableau de variables
varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
colnames <- c("n", "mean", "sd", "q25", "q50", "q75", "lower", "upper", "min", "max", "na")
tab <- as.data.frame(matrix(ncol = 11, nrow = 0 )); colnames(tab) <- colnames;
for (var in varquan) {
x <- epi.descriptives(drep[,var])$a; tab <- rbind(tab, x); rm(x)
}
tab$varquan <- varquan; tab <- tab %>% dplyr::select(12, 1:11); rm(varquan, var, colnames)
tab
Graphiques
Fonction hist
# Création de la variable
drep <- transform(drep, PAM=(PAS+2*PAD)/3)
# Histogramme Effectifs
hist(drep$PAM, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,50), ylab="Effectifs", main=NULL)
# Histogramme Densité
hist(drep$PAM, prob=TRUE, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,0.05), ylab="Densités", main=NULL)
# Courbe de densité de la distribution
lines(density(drep$PAM,na.rm=TRUE),lwd=2,col="orange")
# Courbe de densité de la loi normale
PAMord <- drep$PAM[order(drep$PAM)]
lines(PAMord, dnorm(PAMord, mean=mean(PAMord), sd=sd(PAMord)), col="red")
# Rajout d'un texte
text(110,0.04,paste("N =",sum(complete.cases(drep$PAM))),cex=0.9)
Histogramme : 1 variable continue
par(mfrow=c(3,3))
a <- ggplot(dataset, aes(imc))
a + geom_histogram(color = "grey", fill="ivory3", linetype = 1) +
#geom_freqpoly(color = "salmon") +
stat_density() +
labs(x = expression(paste("IMC en ", kg/m^2)), y="Effectif", title=paste("Répartition des", nrow(dataset),"individus selon leur IMC")) +
theme_classic() +
theme(plot.title = element_text(color="grey20", size=15, face="bold.italic", hjust =0.5))
1 Variable discrète et 1 variable continue
a <- ggplot(dataset, aes(sexe, imc))
a + theme_bw() + geom_boxplot() + xlab("Sexe") + ylab("IMC en cm2")
TESTS STATISTIQUES
Packages nécessaires : epiDisplay, epiR et/ou epicalc
library(epiDisplay, epiR, epicalc)
Variables qualitatives : test du Chi2
- Tableau de contingence observé
tabpct
tabpct(drep$SEXE, drep$TYPEHB)
- Tableau de contingence théorique
chisq.test(drep$SEXE, drep$TYPEHB)$expected
---
title: "R - Aide Mémoire"
output:
  html_notebook:
    code_folding: none
    fig_caption: yes
    theme: yeti
    toc: yes
    toc_float: yes
  html_document:
    df_print: paged
    toc: yes
  pdf_document:
    toc: yes
---

```{css echo=F, eval=T}
/* Custom styles for the HTML output. Declarations that browsers silently
   dropped as invalid were repaired: `font-style:bold` / `font-styles:bold`
   -> `font-weight:bold`, `font-styles:italic` -> `font-style:italic`, and
   empty declarations were removed. */
body      {/* Normal */  font-family:"Calibri"; font-size:14px; color:#333333 /* was "greyblack", not a CSS colour; dark grey assumed - TODO confirm */ }
td        {/* Table */   font-family:"Calibri"; font-size:12px; }

h1.title  {/* Title */   font-family:"Calibri"; font-size:40px; font-weight:bold; color:red }
h1.subtitle {/* Subtitle */   font-family:"Calibri"; font-size:50px; font-weight:bold; color:red }
h1        {/* Header 1 */  font-family:"Calibri"; font-size:22px; font-weight:bold; color:blue;
                           margin-top: 6px; margin-bottom: 12px  }
h2        {/* Header 2 */  font-family:"Calibri"; font-size:18px; font-style:italic; color:#333333 /* was "blackgrey", not a CSS colour - TODO confirm */;
                           margin-top: 0px; margin-bottom: 6px }
h3        {/* Header 3 */  font-family:"Calibri"; font-size:16px; font-weight:bold; color:green;
                           margin-top: 0px; margin-bottom: 6px}

code.r    {/* Code block */ font-family:"Calibri"; font-size:50px; color: red}
pre       {/* Code block - determines code spacing between lines */ font-size: 12px; }

p:first-of-type {/* Define a margin after every first p elements  */margin-bottom: 12px;}
```


```{r, out.width="0.3\\linewidth", include=T, fig.align="center", fig.cap=c("your caption"), echo=F}
# Embed the base-R cheat sheet PDF as a figure.
# NOTE(review): this is an absolute path on one user's machine, and this chunk
# sits before the chunk that sets the global `eval` option, so it is presumably
# evaluated when knitting - confirm, and prefer a project-relative path.
knitr::include_graphics("/Users/Elisee/GB_OwnCloud/R/CheatSheets/base-r.pdf")
```


```{r echo=FALSE, include = FALSE}
# Global chunk defaults: show the code of every following chunk but do not run
# it (the snippets of this cheat sheet rely on placeholder objects).
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
knitr::opts_chunk$set(echo = TRUE, eval = FALSE, include = TRUE)
```

# DATA MANAGEMENT  
## Delete rows whose variables are NA  
```{r}
# Keep the rows where at least one of var1, var2, var3 is not NA.
# rowSums() counts the non-missing cells per row directly on the data frame,
# instead of looping over rows with apply().
key_vars <- c("var1", "var2", "var3")
df <- df[rowSums(!is.na(df[, key_vars])) > 0, ]
# or, to drop only the rows that are NA in *every* column of df
df <- df[rowSums(is.na(df)) != ncol(df), ]
```

## Sum a variable by group 
Using `aggregate`
```{r }
# Sum Var1 within each level of Var2; the name given inside list() ("Var2")
# becomes the name of the grouping column in the result.
aggregate(df$Var1, by = list(Var2 = df$Var2), FUN=sum)
```
Groupement selon les modalités de Var2 et somme de Var1. Multiple grouping dimensions can be specified in 
the `list`. Multiple aggregated metrics of the same data type can be combined via `cbind`:
```{r}
# Sum several numeric columns at once within each level of Var2.
# cbind() builds the matrix of metrics; the comma separating it from `by` was
# missing, and the metrics are now read from the same data frame (df) as the
# grouping variable so that the rows line up.
aggregate(
  cbind(df$Frequency, df$Metric2, df$Metric3),
  by = list(Var2 = df$Var2),
  FUN = sum
)
```

## Extract first n characters from a string
```{r}
# First n characters of x: character positions 1 through n
substr(x, start = 1, stop = n)
```

## Extract last n characters from a string
```{r}
n_last <- n                            # number of trailing characters to extract
x_len <- nchar(x)                      # length of x, computed once
substr(x, x_len - n_last + 1, x_len)   # last n_last characters of x
```







### Importation de données   
#### Données texte : read.table()
```{r}
# Read a delimited text file whose first line holds the column names
d <- read.table(
  file = "donnees.txt",
  header = TRUE,
  sep = "",
  dec = "."
)
```
Remarques : l'argument `dec=` fixe le séparateur décimal ; `header = TRUE` ou `FALSE` indique si le fichier contient une
ligne d'en-tête.

```{r}
df <- read.table(file, sep="\t", header=T, na.strings="")
df <- read.csv2(file)
df <- read.delim(file)
df <- read.xls()
```


## Exportation de données
```{r}
# Writes a df into a file
{utils} write.table(df, file, append, quote, sep, na, rownames, colnames, ...)
{utils} write.csv2(df, file, append, quote, sep, na, rownames, colnames, ...)

# Writes an ASCII text representation into an R object
{base} dput(df, file, control)
```
### Création de données
#### seq(from, to)
```{r}
# Integers from 5 up to 10, in steps of 1
seq(from = 5, to = 10)
```
#### rep(x,times)
`rep(x, times)` répète `times` fois la valeur `x` ; utilisez
`each = n` pour répéter n fois chaque élément de `x`.
```{r}
# Repeat the whole vector twice: 1 2 3 1 2 3
rep(c(1, 2, 3), times = 2)
# Repeat each element twice before moving on: 1 1 2 2 3 3
rep(c(1, 2, 3), each = 2)
```


{base} `seq(from, to, by, length.out)`  
{base} `rep(x, times, each, length.out)`
```{r}
# Sequential identifier, one value per row of df ({base})
df$var1 <- seq_len(nrow(df))
# Each value is repeated 4 times in turn, and the pattern is recycled or
# truncated to nrow(df) elements ({base}). The former `time = 3` was dropped:
# it partially matched `times`, which rep() ignores when `length.out` is given.
df$var2 <- rep(c("val1", NA_character_, "val2"), each = 4, length.out = nrow(df))
```
## Création de data frames
```{r}
# {base} Build a data frame column by column; the single value "A" is
# recycled to the 4 rows
data.frame(v = 1:4, ch = c("a", "b", "c", "d"), lettre = "A")
# {base} All-NA data frame of 100 rows x 10 columns, stored in df so that its
# columns can then be named var1 ... var10
df <- as.data.frame(matrix(nrow = 100, ncol = 10))
names(df) <- paste0("var", seq_len(ncol(df)))
# From another data frame: keep a subset of its columns
df2 <- df1[, c("var1", "var2", "var3")]
```
## Remplacement de valeurs dans df
```{r}
# {base} Set var2 to "val2" on the rows where var1 equals "val1". Rows whose
# var1 is NA are excluded explicitly so that the row selector contains no NA.
is_val1 <- !is.na(df$var1) & df$var1 == "val1"
df[is_val1, "var2"] <- "val2"
```

## Indexer des vecteurs
```{r}
vec[n]                  # n-th element
vec[-n]                 # everything except the n-th element
vec[1:n]                # elements 1 to n
vec[-(1:n)]             # everything except elements 1 to n
vec[c(1, 2, 4)]         # elements 1, 2 and 4
vec["val1"]             # element named "val1"
vec[vec > 3]            # elements greater than 3
vec[vec > 3 & vec < 5]  # elements strictly between 3 and 5
vec1 %in% vec2          # {base} for each element of vec1: is it found in vec2?
```

## Information sur les objets
```{r}
# Missing values, NULL test, length and dimensions
is.na(vec); is.null(val); is.null(vec); length(vec); dim(df); nrow(df)
# Class attribute of an object, and the object with that attribute removed
class(x); unclass(x)
# Position of the smallest and of the largest element
which.min(vec); which.max(vec)
# Reversed vector, sorted vector, and the permutation that sorts it
rev(vec); sort(vec); order(vec)
# Elements of vec1 that are not in vec2
z <- setdiff(vec1, vec2)
```



## Manipulation Dates et Durées
```{r, echo=FALSE, include=FALSE}
# Extract the year from a date stored as "dd/mm/yyyy" text
#format(as.Date(df$Date, format="%d/%m/%Y"),"%Y")
# Convert a day count to a date, counting from a chosen origin
as.Date(41149, origin = "1900-01-01")
# Difference between two dates
Sys.Date() - as.Date("1970-01-01")
difftime(Sys.Date(), as.Date("1970-01-01"), units = "days")
# Internal numeric representation of a date
unclass(Sys.Date())
# Convert "mm:ss" durations to decimal minutes.
# vapply() guarantees one numeric value per input string, whatever the input.
vec <- c("4:30", "2:20", "34:10")
vapply(
  strsplit(vec, ":"),
  function(parts) {
    parts <- as.numeric(parts)
    parts[1] + parts[2] / 60
  },
  numeric(1)
)
```
## Transformation d'un jeu de données 
```{r}
# Add a column computed from existing ones
transform(df, newvar = var1 + var2)
# Conditional recoding: ifelse() needs its three arguments (test, yes, no);
# the condition and the two values below are only an illustration
transform(df, newvar = ifelse(var1 > var2, "val1", "val2"))
```
## Reshaping data 'tidyr' package
```{r}
# Four rows of the built-in USArrests data set (rows 1, 10, 20 and 30)
my_data <- USArrests[c(1, 10, 20, 30), ]
# Copy the row names into a leading `state` column
my_data <- data.frame(state = rownames(my_data), my_data, check.names = FALSE)
my_data
```
Gather()
```{r}
# Wide -> long: every column except `state` becomes a (name, value) pair.
# pivot_longer() is the current replacement for the superseded gather(); the
# legacy call was
#   gather(my_data, key = "arrest_attribute", value = "test", -state)
# Unlike gather(), the rows come out grouped by state rather than by attribute.
# The function is namespace-qualified so the chunk works without library(tidyr).
my_data2 <- tidyr::pivot_longer(
  my_data,
  cols = -state,
  names_to = "arrest_attribute",
  values_to = "test"
)
my_data2
```
```{r}
# Entrer manuellement des données dans data frame

```

```{r, echo=FALSE, include=FALSE}
# Group df by the column var3. The column is passed as a bare name: the quoted
# "var3" was treated as a constant, which put every row in a single group.
df2 <- df %>% dplyr::group_by(var3)
df2
```

## Extraction de données
#### Eliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément
```{r}
# Rows where var1, var2 and var3 are not all NA at once.
# rowSums() counts the non-missing cells per row directly on the data frame,
# instead of looping over rows with apply().
df[rowSums(!is.na(df[, c("var1", "var2", "var3")])) > 0, ]
```





# STATS DESCRIPTIVES
## Variables qualitatives
```{r}
# Summary of the qualitative variable SEXE of drep, using tab1() from the
# epiDisplay package
tab1(drep$SEXE) # {epiDisplay}
```
## Variables Quantitatives
```{r}
#epi.descriptives(drep$AGE)
#age <- epi.descriptives(drep$AGE)$a
# Summary table: one row of descriptive statistics per quantitative variable
varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
# Collect the `$a` element of epi.descriptives() for every variable, then bind
# all rows in one call. This avoids growing `tab` with rbind() inside a loop
# and no longer creates a variable named `colnames` that masks the base
# function. Column names come from the epi.descriptives() output itself.
# NOTE(review): `$a` presumably partial-matches a longer element name - confirm
# and spell it out.
stats_rows <- lapply(varquan, function(v) epi.descriptives(drep[[v]])$a)
# Variable name first, followed by the statistics
tab <- cbind(varquan = varquan, do.call(rbind, stats_rows))
rm(varquan, stats_rows)
tab
```

## Graphiques
### Fonction `hist`
```{r}
# Création de la variable
drep <- transform(drep, PAM=(PAS+2*PAD)/3)
# Histogramme Effectifs
hist(drep$PAM, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,50), ylab="Effectifs", main=NULL)
# Histogramme Densité
hist(drep$PAM, prob=TRUE, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,0.05), ylab="Densités", main=NULL)
# Courbe de densité de la distribution
lines(density(drep$PAM,na.rm=TRUE),lwd=2,col="orange")
# Courbe de densité de la loi normale
PAMord <- drep$PAM[order(drep$PAM)]
lines(PAMord, dnorm(PAMord, mean=mean(PAMord), sd=sd(PAMord)), col="red")
# Rajour d'un texte
text(110,0.04,paste("N =",sum(complete.cases(drep$PAM))),cex=0.9)
```

### Histogramme : 1 variable continue
```{r}
# Histogram of imc with a density layer.
# The former par(mfrow = c(3, 3)) call was removed: it configures base
# graphics only, has no effect on ggplot2 output, and left the base graphics
# device split into a 3 x 3 grid for any later base plot.
# NOTE(review): stat_density() is drawn on the density scale while the bars are
# counts - confirm that this overlay is what is wanted.
a <- ggplot(dataset, aes(imc))
a + geom_histogram(color = "grey", fill = "ivory3", linetype = 1) +
  #geom_freqpoly(color = "salmon") +
  stat_density() +
  labs(
    x = expression(paste("IMC en ", kg/m^2)),
    y = "Effectif",
    title = paste("Répartition des", nrow(dataset), "individus selon leur IMC")
  ) +
  theme_classic() +
  theme(plot.title = element_text(color = "grey20", size = 15, face = "bold.italic", hjust = 0.5))
```

### 1 Variable discrète et 1 variable continue
```{r}
# Box plot of imc for each level of sexe
a <- ggplot(dataset, aes(sexe, imc))
# The y label used "cm2"; it now shows kg/m^2, the unit already used for IMC in
# the histogram of the same variable.
a + theme_bw() + geom_boxplot() + xlab("Sexe") +
  ylab(expression(paste("IMC en ", kg/m^2)))
```

```{r echo=FALSE, include=FALSE}
# Two alternative views of the imc distribution, built on the same base plot
a <- ggplot(data = dataset, mapping = aes(x = imc))
# Gaussian kernel density curve
a + geom_density(kernel = "gaussian")
# Frequency polygon
a + geom_freqpoly()
```

# TESTS STATISTIQUES
Packages nécessaires : epiDisplay, epiR et/ou epicalc
```{r}
# library() attaches one package per call; extra positional arguments are
# taken as `help` and `pos`, not as further packages.
library(epiDisplay)
library(epiR)
# library(epicalc)  # optional ("et/ou"); NOTE(review): epicalc appears to be
#                   # archived on CRAN - confirm before enabling
```
## Variables qualitatives : test du Chi2
1) Tableau de contingence observé `tabpct`
```{r}
# Observed contingency table of SEXE by TYPEHB
# (tabpct() is presumably provided by epiDisplay, loaded above - confirm)
tabpct(drep$SEXE, drep$TYPEHB)
```
2) Tableau de contingence théorique
```{r}
# Expected counts for SEXE by TYPEHB, read from the "expected" element of the
# chi-squared test result
chisq.test(x = drep$SEXE, y = drep$TYPEHB)[["expected"]]
```
3) 







