DATA MANAGEMENT

Delete rows whose variables are NA

df <- df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]
# ou
df <- df[rowSums(is.na(df)) != ncol(df),]

Sum a variable by group

Using aggregate

aggregate(df$Var1, by = list(Var2 = df$Var2), FUN=sum)

Groupement selon les modalités de Var2 et somme de la Var1. multiple dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be incorporated via cbind:

# Sum several metrics at once by group: cbind() binds the metric columns and
# `by` lists the grouping variable(s). A comma is required between the two.
# NOTE(review): metrics come from `x`, the grouping from `df` — confirm both
# refer to the same rows.
aggregate(cbind(x$Frequency, x$Metric2, x$Metric3), by = list(Var2 = df$Var2), FUN = sum)

Extract first n characters from a string

substr(x, 1, n)

Extract last n characters from String

n_last <- n                                # Specify number of characters to extract
substr(x, nchar(x) - n_last + 1, nchar(x)) # Extract last three characters

Importation de données

Données texte : read.table()

d <- read.table("donnees.txt", header = TRUE, sep = "", dec = ".")

Remarques : l’argument dec= indique le séparateur décimal ; header= vaut TRUE si le fichier contient une ligne d’en-tête, FALSE sinon.

df <- read.table(file, sep="\t", header=T, na.strings="")
df <- read.csv2(file)
df <- read.delim(file)
df <- read.xls()

Exportation de données

# Writes a df into a file
{utils} write.table(df, file, append, quote, sep, na, rownames, colnames, ...)
{utils} write.csv2(df, file, append, quote, sep, na, rownames, colnames, ...)

# Writes an ASCII text representation into an R object
{base} dput(df, file, control)

Création de données

seq(from, to)

seq(5, 10)

rep(x,times)

rep(x,times) répète times fois la valeur x; utilisez each=n pour répéter n fois chaque élément de x

rep(c(1,2,3),2)
rep(c(1,2,3),each=2)

{base} seq(from, to, by, length) {base} rep(x, times, each)

df$var1 <- c(1:100) #{base}
{base} df$var2 <- rep(c("val1", NA_character_, "val2"), time = 3, each = 4, length.out = nrow(df))

Création de data frames

{base} data.frame(v=1:4, ch=c("a", "b", "c", "d"), lettre="A")
{base} as.data.frame(matrix(ncol = 10, nrow = 100)); names(df) <- c("var1", ...)
# From another data frame
df2 <- df1[, c("var1", "var2", "var3")]

Remplacement de valeurs dans df

df[which(df$var1 == "val1"), "var2"] <- "val2" # {base}

Indexer des vecteurs

vec[n]; vec[-n]; vec[1:n]; vec[-(1:n)]; vec[c(1,2,4)]; vec["val1"]; vec[vec>3]; vec[vec>3 & vec<5] 
{base} vec1 %in% vec2

Information sur les objects

# Missing-value test, NULL test (x may be a single value or a vector),
# length of a vector, dimensions and row count of a data frame
is.na(vec); is.null(x); length(vec); dim(df); nrow(df)
# Class of an object, and the object with its class attribute removed
class(x); unclass(x)
# Position of the first maximum and of the first minimum
which.max(vec); which.min(vec)
# Reversed vector, sorted vector, and the permutation that sorts it
rev(vec); sort(vec); order(vec)
# Elements of vec1 that are not in vec2
z <- setdiff(vec1, vec2)

Manipulation Dates et Durées

Transformation d’un jeu de données

transform(df, newvar = var1 + var2)
transform(df, newvar = ifelse())

Reshaping data ‘tidyr’ package

my_data <- USArrests[c(1, 10, 20, 30), ]
my_data <- cbind(state = rownames(my_data), my_data)
my_data

Gather()

my_data2 <- gather(my_data, key = "arrest_attribute", value = "test", -state)
my_data2
# Entrer manuellement des données dans data frame

Extraction de données #### Éliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément

df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]

STATS DESCRIPTIVES

Variables qualitatives

tab1(drep$SEXE) # {epiDisplay}

Variables Quantitatives

#epi.descriptives(drep$AGE)
#age <- epi.descriptives(drep$AGE)$a
# Tableau de variables
varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
colnames <- c("n", "mean", "sd", "q25", "q50", "q75", "lower", "upper", "min", "max", "na")
tab <- as.data.frame(matrix(ncol = 11, nrow = 0 )); colnames(tab) <- colnames;
for (var in varquan) {
  x <- epi.descriptives(drep[,var])$a; tab <- rbind(tab, x); rm(x)
}
tab$varquan <- varquan; tab <- tab %>% dplyr::select(12, 1:11); rm(varquan, var, colnames)
tab

Graphiques

Fonction hist

# Création de la variable
drep <- transform(drep, PAM=(PAS+2*PAD)/3)
# Histogramme Effectifs
hist(drep$PAM, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,50), ylab="Effectifs", main=NULL)
# Histogramme Densité
hist(drep$PAM, prob=TRUE, col = "grey", border = "white", xlim=c(50, 120), xlab = "Pression artérielle moyenne (mmHg)", ylim=c(0,0.05), ylab="Densités", main=NULL)
# Courbe de densité de la distribution
lines(density(drep$PAM,na.rm=TRUE),lwd=2,col="orange")
# Normal density curve using the sample mean and standard deviation.
# sort() drops NA values, so mean() and sd() no longer return NA (which left
# the curve undrawn) when PAM has missing values.
PAMord <- sort(drep$PAM)
lines(PAMord, dnorm(PAMord, mean = mean(PAMord), sd = sd(PAMord)), col = "red")
# Add a text annotation: number of non-missing PAM values
text(110, 0.04, paste("N =", sum(complete.cases(drep$PAM))), cex = 0.9)

Histogramme : 1 variable continue

par(mfrow=c(3,3))
a <- ggplot(dataset, aes(imc))
a + geom_histogram(color = "grey", fill="ivory3", linetype = 1) +
  #geom_freqpoly(color = "salmon") +
  stat_density() +
  labs(x = expression(paste("IMC en ", kg/m^2)), y="Effectif",   title=paste("Répartition des", nrow(dataset),"individus selon leur IMC")) +
  theme_classic() +
  theme(plot.title = element_text(color="grey20", size=15, face="bold.italic", hjust =0.5))

1 Variable discrète et 1 variable continue

# Box plot of imc by sexe.
# The y label uses kg/m^2, the same unit as the imc histogram label
# (the previous "cm2" was not a unit of IMC).
a <- ggplot(dataset, aes(sexe, imc))
a + theme_bw() + geom_boxplot() + xlab("Sexe") + ylab(expression(paste("IMC en ", kg/m^2)))

TESTS STATISTIQUES

Packages nécessaires : epiDisplay, epiR et/ou epicalc

# library() attaches one package per call: its 2nd and 3rd positional
# arguments are `help` and `pos`, not further package names.
# NOTE(review): epicalc may no longer be installable from CRAN — confirm.
library(epiDisplay)
library(epiR)
library(epicalc)

Variables qualitatives : test du Chi2

  1. Tableau de contingence observé tabpct
tabpct(drep$SEXE, drep$TYPEHB)
  2. Tableau de contingence théorique
chisq.test(drep$SEXE, drep$TYPEHB)$expected
---
title: "R - Aide Mémoire"
output:
  html_notebook:
    code_folding: none
    fig_caption: yes
    theme: yeti
    toc: yes
    toc_float: yes
  html_document:
    df_print: paged
    toc: yes
  pdf_document:
    toc: yes
---

```{css echo=F, eval=T}
/* Typography for the rendered notebook.
   Bold is set with font-weight (font-style only accepts normal/italic/oblique)
   and the misspelled "font-styles" property is corrected to a valid one.
   NOTE(review): "greyblack" and "blackgrey" are not CSS colour names, so
   browsers ignore those two declarations — replace with the intended colour. */
body      {/* Normal */  font-family:"Calibri"; font-size:14px; color:greyblack  }
td        {/* Table */   font-family:"Calibri"; font-size:12px; }

h1.title  {/* Title */   font-family:"Calibri"; font-size:40px; font-weight:bold; color:red }
h1.subtitle {/* Subtitle */   font-family:"Calibri"; font-size:50px; font-weight:bold; color:red }
h1        {/* Header 1 */  font-family:"Calibri"; font-size:22px; font-weight:bold; color:blue;
                           margin-top: 6px; margin-bottom: 12px  }
h2        {/* Header 2 */  font-family:"Calibri"; font-size:18px; font-style:italic; color:blackgrey;
                           margin-top: 0px; margin-bottom: 6px }
h3        {/* Header 3 */  font-family:"Calibri"; font-size:16px; font-weight:bold; color:green;
                           margin-top: 0px; margin-bottom: 6px}

code.r    {/* Code block */ font-family:"Calibri"; font-size:50px; color: red}
pre       {/* Code block - determines code spacing between lines */ font-size: 12px; }

p:first-of-type {/* Define a margin after every first p elements  */margin-bottom: 12px;}
```


```{r, out.width="0.3\\linewidth", include=T, fig.align="center", fig.cap=c("your caption"), echo=F}
knitr::include_graphics("/Users/Elisee/GB_OwnCloud/R/CheatSheets/base-r.pdf")
```


```{r echo=FALSE, include = FALSE}
# Default chunk options: show the code, do not evaluate it, keep the chunk in
# the output. TRUE/FALSE are spelled out because T and F can be reassigned.
knitr::opts_chunk$set(echo = TRUE, eval = FALSE, include = TRUE)
```

# DATA MANAGEMENT  
## Delete rows whose variables are NA  
```{r}
df <- df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]
# ou
df <- df[rowSums(is.na(df)) != ncol(df),]
```

## Sum a variable by group 
Using `aggregate`
```{r }
aggregate(df$Var1, by = list(Var2 = df$Var2), FUN=sum)
```
Groupement selon les modalités de Var2 et somme de la Var1. Multiple dimensions can be specified in
the `list`. Multiple aggregated metrics of the same data type can be incorporated via `cbind`:
```{r}
# Sum several metrics at once by group: cbind() binds the metric columns and
# `by` lists the grouping variable(s). A comma is required between the two.
# NOTE(review): metrics come from `x`, the grouping from `df` — confirm both
# refer to the same rows.
aggregate(cbind(x$Frequency, x$Metric2, x$Metric3), by = list(Var2 = df$Var2), FUN = sum)
```

## Extract first n characters from a string
```{r}
substr(x, 1, n)
```

## Extract last n characters from String
```{r}
n_last <- n                                # Number of characters to extract
substr(x, nchar(x) - n_last + 1, nchar(x)) # Extract the last n_last characters
```







### Importation de données   
#### Données texte : read.table()
```{r}
d <- read.table("donnees.txt", header = TRUE, sep = "", dec = ".")
```
Remarques : l'argument `dec=` indique le séparateur décimal ; `header=` vaut `TRUE` si le fichier contient une
ligne d'en-tête, `FALSE` sinon.

```{r}
df <- read.table(file, sep="\t", header=T, na.strings="")
df <- read.csv2(file)
df <- read.delim(file)
df <- read.xls()
```


## Exportation de données
```{r}
# Write a data frame to a text file {utils}.
# The row/column-name arguments are spelled row.names and col.names; the
# values shown are the documented defaults.
write.table(df, file, append = FALSE, quote = TRUE, sep = " ", na = "NA",
            row.names = TRUE, col.names = TRUE)
# Same, as CSV with ";" separator and "," decimal mark {utils}.
# write.csv2() sets sep, dec and col.names itself and ignores append (with a
# warning), so those arguments are not passed.
write.csv2(df, file, quote = TRUE, na = "NA", row.names = TRUE)

# Write an ASCII text representation of an R object to a file {base}
dput(df, file, control = c("keepNA", "keepInteger", "niceNames", "showAttributes"))
```
### Création de données
#### seq(from, to)
```{r}
seq(5, 10)
```
#### rep(x,times)
rep(x,times) répète times fois la valeur x; utilisez
each=n pour répéter n fois chaque élément de x
```{r}
rep(c(1,2,3),2)
rep(c(1,2,3),each=2)
```


- `seq(from, to, by, length.out)` {base}
- `rep(x, times, each)` {base}
```{r}
# Fill a column with the integers 1 to 100 {base}
df$var1 <- 1:100
# Repeat each value 4 times, cycling until there is one value per row {base}.
# rep() ignores `times` when `length.out` is given, so it is not passed.
df$var2 <- rep(c("val1", NA_character_, "val2"), each = 4, length.out = nrow(df))
```
## Création de data frames
```{r}
# Build a data frame from vectors; the single value "A" is recycled {base}
data.frame(v = 1:4, ch = c("a", "b", "c", "d"), lettre = "A")
# Build an all-NA data frame of 100 rows and 10 columns, then name its
# columns {base}. The result must be assigned before names() can be set.
df <- as.data.frame(matrix(ncol = 10, nrow = 100))
names(df) <- paste0("var", 1:10)
# From another data frame: keep a subset of its columns
df2 <- df1[, c("var1", "var2", "var3")]
```
## Remplacement de valeurs dans df
```{r}
df[which(df$var1 == "val1"), "var2"] <- "val2" # {base}
```

## Indexer des vecteurs
```{r}
vec[n]; vec[-n]; vec[1:n]; vec[-(1:n)]; vec[c(1,2,4)]; vec["val1"]; vec[vec>3]; vec[vec>3 & vec<5] 
{base} vec1 %in% vec2
```

## Information sur les objets
```{r}
# Missing-value test, NULL test (x may be a single value or a vector),
# length of a vector, dimensions and row count of a data frame
is.na(vec); is.null(x); length(vec); dim(df); nrow(df)
# Class of an object, and the object with its class attribute removed
class(x); unclass(x)
# Position of the first maximum and of the first minimum
which.max(vec); which.min(vec)
# Reversed vector, sorted vector, and the permutation that sorts it
rev(vec); sort(vec); order(vec)
# Elements of vec1 that are not in vec2
z <- setdiff(vec1, vec2)
```



## Manipulation Dates et Durées
```{r, echo=FALSE, include=FALSE}
# Extract the year from a date
#format(as.Date(df$Date, format="%d/%m/%Y"),"%Y")
# Convert a day count to a date using a chosen origin
as.Date(41149, origin = "1900-01-01")
# Difference between two dates
Sys.Date() - as.Date("1970-01-01")
difftime(Sys.Date(), as.Date("1970-01-01"), units = "days")
# Internal numeric representation of a date
unclass(Sys.Date())
# Convert "mm:ss" strings to decimal minutes.
# vapply() is used instead of sapply() so the result is always a numeric
# vector, whatever the input.
vec <- c("4:30", "2:20", "34:10")
vapply(
  strsplit(vec, ":"),
  function(parts) {
    parts <- as.numeric(parts)
    parts[1] + parts[2] / 60
  },
  numeric(1)
)
```
## Transformation d'un jeu de données 
```{r}
# Add a column computed from existing columns
transform(df, newvar = var1 + var2)
# Add a column computed conditionally. ifelse() needs its three arguments
# (test, yes, no); called with none it stops with an error.
transform(df, newvar = ifelse(var1 > var2, var1, var2))
```
## Reshaping data 'tidyr' package
```{r}
my_data <- USArrests[c(1, 10, 20, 30), ]
my_data <- cbind(state = rownames(my_data), my_data)
my_data
```
Gather()
```{r}
# Reshape my_data from wide to long: every column except `state` is stacked
# into the key column "arrest_attribute" and the value column "test".
# NOTE(review): tidyr documents gather() as superseded by pivot_longer().
my_data2 <- gather(my_data, key = "arrest_attribute", value = "test", -state)
my_data2
```
```{r}
# Entrer manuellement des données dans data frame

```

```{r, echo=FALSE, include=FALSE}
# Group df by the column var3.
# The column is given unquoted: group_by("var3") would group by a constant
# string instead of by the column's values.
df2 <- df %>% group_by(var3)
df2
```

## Extraction de données
#### Éliminer les lignes pour lesquelles var1, var2 et var3 sont vides simultanément
```{r}
df[apply(df[,c("var1", "var2", "var3")], 1, function(y) !all(is.na(y))),]
```





# STATS DESCRIPTIVES
## Variables qualitatives
```{r}
tab1(drep$SEXE) # {epiDisplay}
```
## Variables Quantitatives
```{r}
#epi.descriptives(drep$AGE)
#age <- epi.descriptives(drep$AGE)$a
# Table of descriptive statistics, one row per quantitative variable
varquan <- c("AGE", "POIDS", "TAILLE", "PAS")
# Compute one row of statistics per variable. The list element is named in
# full ($arithmetic) instead of relying on partial matching of `$a`.
stats <- lapply(varquan, function(v) epi.descriptives(drep[[v]])$arithmetic)
# Bind all rows in one call (no rbind() inside a loop) and put the variable
# name in the first column
tab <- data.frame(varquan = varquan, do.call(rbind, stats), stringsAsFactors = FALSE)
rm(varquan, stats)
tab
```

## Graphiques
### Fonction `hist`
```{r}
# Create the variable PAM from PAS and PAD
drep <- transform(drep, PAM = (PAS + 2 * PAD) / 3)
# Histogram of counts
hist(drep$PAM, col = "grey", border = "white", xlim = c(50, 120),
     xlab = "Pression artérielle moyenne (mmHg)", ylim = c(0, 50),
     ylab = "Effectifs", main = NULL)
# Histogram of densities (freq = FALSE replaces the partially matched
# `prob = TRUE`)
hist(drep$PAM, freq = FALSE, col = "grey", border = "white", xlim = c(50, 120),
     xlab = "Pression artérielle moyenne (mmHg)", ylim = c(0, 0.05),
     ylab = "Densités", main = NULL)
# Kernel density curve of the observed distribution
lines(density(drep$PAM, na.rm = TRUE), lwd = 2, col = "orange")
# Normal density curve using the sample mean and standard deviation.
# sort() drops NA values, so mean() and sd() no longer return NA (which left
# the curve undrawn) when PAM has missing values.
PAMord <- sort(drep$PAM)
lines(PAMord, dnorm(PAMord, mean = mean(PAMord), sd = sd(PAMord)), col = "red")
# Add a text annotation: number of non-missing PAM values
text(110, 0.04, paste("N =", sum(complete.cases(drep$PAM))), cex = 0.9)
```

### Histogramme : 1 variable continue
```{r}
# Histogram of imc with a density layer, classic theme and a centred title
# that reports the number of rows in dataset.
# NOTE(review): par(mfrow) configures base graphics; it presumably has no
# effect on the ggplot2 plot below — confirm and drop if so.
par(mfrow=c(3,3))
a <- ggplot(dataset, aes(imc))
a + geom_histogram(color = "grey", fill="ivory3", linetype = 1) +
  #geom_freqpoly(color = "salmon") +
  stat_density() +
  labs(x = expression(paste("IMC en ", kg/m^2)), y="Effectif",   title=paste("Répartition des", nrow(dataset),"individus selon leur IMC")) +
  theme_classic() +
  theme(plot.title = element_text(color="grey20", size=15, face="bold.italic", hjust =0.5))
```

### 1 Variable discrète et 1 variable continue
```{r}
# Box plot of imc by sexe.
# The y label uses kg/m^2, the same unit as the imc histogram label
# (the previous "cm2" was not a unit of IMC).
a <- ggplot(dataset, aes(sexe, imc))
a + theme_bw() + geom_boxplot() + xlab("Sexe") + ylab(expression(paste("IMC en ", kg/m^2)))
```

```{r echo=FALSE, include=FALSE}
a <- ggplot(dataset, aes(imc))
a + geom_density(kernel="gaussian")
a + geom_freqpoly()
```

# TESTS STATISTIQUES
Packages nécessaires : epiDisplay, epiR et/ou epicalc
```{r}
# library() attaches one package per call: its 2nd and 3rd positional
# arguments are `help` and `pos`, not further package names.
# NOTE(review): epicalc may no longer be installable from CRAN — confirm.
library(epiDisplay)
library(epiR)
library(epicalc)
```
## Variables qualitatives : test du Chi2
1) Tableau de contingence observé `tabpct`
```{r}
tabpct(drep$SEXE, drep$TYPEHB)
```
2) Tableau de contingence théorique
```{r}
chisq.test(drep$SEXE, drep$TYPEHB)$expected
```
3) 







