Atelier R : Traitement des données

Bibliothèques nécessaires

Importer / Exporter

Depuis un fichier

au format texte

OPTION au format R (.RData)

Depuis un ensemble de fichiers

Depuis une base de données

Analyser

Sélectionner

sélectionner des colonnes d'un data.frame

filtrer les données sur un seuil

Trier

trier un data frame selon une ou plusieurs colonnes

trier les niveaux d'un facteur

Traiter

changer le type de données d'une colonne

ajouter des colonnes (informations, variables élaborées…)

changer la disposition du tableau

appliquer une fonction par modalité d'un facteur : {split - apply - combine}

Bibliothèques nécessaires

Documentations en ligne pour ggplot2 et plyr

# traitement de données
library(plyr)
library(reshape)

## Attaching package: 'reshape'

## The following object(s) are masked from 'package:plyr':
## 
## rename, round_any

# graphiques
library(ggplot2)

Importer / Exporter

Depuis un fichier

au format texte

# Clear every object from the current workspace.
# NOTE(review): this does not reset loaded packages or options; restarting the
# R session is the more reliable way to start from a clean state.
rm(list = ls())

# base directory used to build the data file paths below (machine-specific
# absolute path; no setwd() is called)
wd <- "/home/casadebaig/Documents/Travail/Documentation/Seminaires/2012_Atelier_R/01_Donnees_2012/rscript/"

# read: semicolon-separated text file with a header row and "." as decimal mark
d <- read.table(
  file = paste0(wd, "../data/BLA_1975-2011.csv"),
  header = TRUE,  # TRUE rather than T: T is an ordinary variable that can be reassigned
  sep = ";",
  dec = "."
)

# write: export the same table without the row-name column
write.table(
  d,
  file = paste0(wd, "../data/export.csv"),
  sep = ";",
  dec = ".",
  row.names = FALSE
)

OPTION au format R (.RData)

# path of the .RData file, built once and used for both the write and the read
rdata_file <- paste0(wd, "../data/climate.RData")

# write: store the object d in R's own format, bzip2-compressed
save(d, file = rdata_file, compress = "bzip2")

# read: restore the saved object(s) into the current environment
load(file = rdata_file)

Depuis un ensemble de fichiers

Automatisation de la lecture pour importer des séries de fichiers

Depuis une base de données

Plus tard

Analyser

Sélectionner

sélectionner des colonnes d'un data.frame

# selection based on column names; first display the structure of d
str(d)

## 'data.frame':    13510 obs. of  8 variables:
##  $ Code.MF: int  31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
##  $ Date   : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
##  $ TX     : num  10 6.3 8.4 11.2 3 2 8.1 11.5 7.1 13.7 ...
##  $ TN     : num  -2.9 -4 -4.1 -1.3 1.2 0.4 0.5 6.2 5.8 2.6 ...
##  $ TM     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RR     : num  0 0 0 0 0 0.2 0 0 0 0 ...
##  $ GLOT   : num  114 114 114 114 114 114 114 114 114 114 ...
##  $ ETP    : num  0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...

# logical mask over the columns of d: TRUE for every column whose name is not
# TX, TN or TM (negation written with `!` instead of comparing to TRUE)
s <- !(colnames(d) %in% c("TX", "TN", "TM"))
str(d[, s])

## 'data.frame':    13510 obs. of  5 variables:
##  $ Code.MF: int  31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
##  $ Date   : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
##  $ RR     : num  0 0 0 0 0 0.2 0 0 0 0 ...
##  $ GLOT   : num  114 114 114 114 114 114 114 114 114 114 ...
##  $ ETP    : num  0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...


# selection based on column positions: drop columns 5 to 7
# (TM, RR and GLOT according to the str() output of d).
# NOTE(review): the name-based examples drop TX, TN and TM instead.
drop_idx <- 5:7
str(d[, -drop_idx])

## 'data.frame':    13510 obs. of  5 variables:
##  $ Code.MF: int  31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
##  $ Date   : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
##  $ TX     : num  10 6.3 8.4 11.2 3 2 8.1 11.5 7.1 13.7 ...
##  $ TN     : num  -2.9 -4 -4.1 -1.3 1.2 0.4 0.5 6.2 5.8 2.6 ...
##  $ ETP    : num  0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...

# column names converted to positions with match(), then dropped by negative index
temp_idx <- match(c("TN", "TX", "TM"), colnames(d))
str(d[, -temp_idx])

## 'data.frame':    13510 obs. of  5 variables:
##  $ Code.MF: int  31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
##  $ Date   : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
##  $ RR     : num  0 0 0 0 0 0.2 0 0 0 0 ...
##  $ GLOT   : num  114 114 114 114 114 114 114 114 114 114 ...
##  $ ETP    : num  0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...

filtrer les données sur un seuil

# logical AND: keep the rows with 0 <= TM <= 10.
# subset() drops the rows where the condition is NA (TM is missing for part of
# the series), whereas d[cond, ] returns an all-NA row for each of them.
ggplot(data = subset(d, TM >= 0 & TM <= 10), aes(x = TM)) +
  geom_histogram(binwidth = 0.1)

plot of chunk filter

# logical OR: keep the rows where TX >= 30 or TN <= -5.
# subset() is used so that a condition evaluating to NA never yields an
# all-NA row, which d[cond, ] would produce.
ggplot(data = subset(d, TX >= 30 | TN <= -5), aes(x = TM)) +
  geom_histogram(binwidth = 0.5)

plot of chunk filter

Trier

Trier un data frame selon une ou plusieurs colonnes

# alternatives: plyr::arrange, or a data.table structure
# row order by decreasing TX (negating the numeric key reverses the sort)
tx_desc <- order(-d$TX)
head(d[tx_desc, ])

##        Code.MF       Date   TX   TN   TM RR GLOT  ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7  0 2725 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7  0 2591 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7  0 2430 9.00
## 2744  31069001 08/07/1982 40.2 19.5   NA  0  525 5.73
## 10448 31069001 13/08/2003 40.2 22.3 31.2  0 2344 9.00
## 3131  31069001 30/07/1983 40.1 19.3   NA  0  523 5.10

# decreasing TX, ties broken by decreasing TM
tx_tm_desc <- order(-d$TX, -d$TM)
head(d[tx_tm_desc, ])

##        Code.MF       Date   TX   TN   TM RR GLOT  ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7  0 2725 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7  0 2430 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7  0 2591 9.00
## 10448 31069001 13/08/2003 40.2 22.3 31.2  0 2344 9.00
## 2744  31069001 08/07/1982 40.2 19.5   NA  0  525 5.73
## 3131  31069001 30/07/1983 40.1 19.3   NA  0  523 5.10

# with() evaluates order() inside d, so the columns need no d$ prefix
head(d[with(d, order(-TX, -TM)), ])

##        Code.MF       Date   TX   TN   TM RR GLOT  ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7  0 2725 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7  0 2430 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7  0 2591 9.00
## 10448 31069001 13/08/2003 40.2 22.3 31.2  0 2344 9.00
## 2744  31069001 08/07/1982 40.2 19.5   NA  0  525 5.73
## 3131  31069001 30/07/1983 40.1 19.3   NA  0  523 5.10

Trier les niveaux d'un facteur

# factor built from the first four lower-case letters; no levels are given,
# so factor() derives them from the values
f <- factor(c("a", "b", "c", "d"))
print(f)

## [1] a b c d
## Levels: a b c d

# rebuild the factor with an explicit level order; the values are unchanged
level_order <- c("b", "a", "c", "d")
f <- factor(f, levels = level_order)
print(f)

## [1] a b c d
## Levels: b a c d

Traiter

changer le type de données d'une colonne

# column as read from the file (dd/mm/yyyy text)
str(d$Date)

##  Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...

# Parse the dd/mm/yyyy text into class Date.
# strptime() returns POSIXlt, a list-based class that is awkward as a data
# frame column; the values carry no time of day, so Date is sufficient and
# format(Date, "%j" / "%Y" / "%m" / "%d") keeps working on it.
# NOTE: the rendered str() output in this document was produced with the
# POSIXlt version and therefore still prints "POSIXlt".
d$Date <- as.Date(d$Date, format = "%d/%m/%Y")
str(d$Date)

##  POSIXlt[1:13510], format: "1975-01-01" "1975-01-02" "1975-01-03" "1975-01-04" ...

ajouter des colonnes (informations, variables élaborées…)

# add a column: PAR computed from GLOT
par_values <- d$GLOT / 100 * 0.48
d <- cbind(d, PAR = par_values)

# delete a column: assigning NULL removes it
d$departement <- NULL

# add several columns at once with transform()
# alternative: plyr::mutate, which lets a new column reuse the ones defined
# before it in the same call
d <- transform(
  d,
  JD = as.numeric(format(Date, format = "%j")),  # day of the year
  Y = as.numeric(format(Date, format = "%Y")),   # year
  M = as.numeric(format(Date, format = "%m")),   # month
  D = as.numeric(format(Date, format = "%d"))    # day of the month
)
str(d)

## 'data.frame':    13510 obs. of  13 variables:
##  $ Code.MF: int  31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
##  $ Date   : POSIXlt, format: "1975-01-01" "1975-01-02" ...
##  $ TX     : num  10 6.3 8.4 11.2 3 2 8.1 11.5 7.1 13.7 ...
##  $ TN     : num  -2.9 -4 -4.1 -1.3 1.2 0.4 0.5 6.2 5.8 2.6 ...
##  $ TM     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RR     : num  0 0 0 0 0 0.2 0 0 0 0 ...
##  $ GLOT   : num  114 114 114 114 114 114 114 114 114 114 ...
##  $ ETP    : num  0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
##  $ PAR    : num  0.547 0.547 0.547 0.547 0.547 ...
##  $ JD     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Y      : num  1975 1975 1975 1975 1975 ...
##  $ M      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ D      : num  1 2 3 4 5 6 7 8 9 10 ...


# one histogram of GLOT per year (default binning)
ggplot(data = d, aes(x = GLOT)) +
  geom_histogram() +
  facet_wrap(~Y)

plot of chunk add_column


# joins (combining tables): one-row lookup table keyed by Code.MF
postes <- data.frame(
  Code.MF = 31069001,
  site = "Blagnac",
  region = "MP",
  departement = 31
)
print(postes)

##    Code.MF    site region departement
## 1 31069001 Blagnac     MP          31

# join on the shared key, named explicitly instead of relying on merge()'s
# default of joining on every column name the two tables have in common
head(merge(postes, d, by = "Code.MF"))

##    Code.MF    site region departement       Date   TX   TN TM  RR GLOT
## 1 31069001 Blagnac     MP          31 1975-01-01 10.0 -2.9 NA 0.0  114
## 2 31069001 Blagnac     MP          31 1975-01-02  6.3 -4.0 NA 0.0  114
## 3 31069001 Blagnac     MP          31 1975-01-03  8.4 -4.1 NA 0.0  114
## 4 31069001 Blagnac     MP          31 1975-01-04 11.2 -1.3 NA 0.0  114
## 5 31069001 Blagnac     MP          31 1975-01-05  3.0  1.2 NA 0.0  114
## 6 31069001 Blagnac     MP          31 1975-01-06  2.0  0.4 NA 0.2  114
##    ETP    PAR JD    Y M D
## 1 0.37 0.5472  1 1975 1 1
## 2 0.37 0.5472  2 1975 1 2
## 3 0.37 0.5472  3 1975 1 3
## 4 0.37 0.5472  4 1975 1 4
## 5 0.37 0.5472  5 1975 1 5
## 6 0.37 0.5472  6 1975 1 6

changer la disposition du tableau

# long ("database") layout to wide ("analysis") layout
# base::xtabs (simple): RR cross-tabulated by JD (rows) and Y (columns)
head(
  xtabs(
    formula = RR ~ JD + Y,
    data = d[, c("JD", "Y", "RR")]
  )
)

##    Y
## JD  1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
##   1  0.0  0.0  5.5  0.0  1.0 18.3  1.6 10.4    0  0.0  0.1  9.3  1.2  0.0
##   2  0.0  0.0  2.4  1.3  0.0  0.0  0.0  0.0    0  0.0  0.0  1.5  2.6  0.8
##   3  0.0  0.4  0.0  0.0 10.8  0.0  0.0  0.0    0  8.0  0.0  1.1  0.6  2.2
##   4  0.0  0.0  0.0  1.8  0.0  6.0  3.2  0.0    0  1.5  0.0  0.0  0.0  0.0
##   5  0.0  0.0  0.4  0.0  0.2  2.0  0.0  0.0    0  0.5  0.0  5.7  0.6  0.0
##   6  0.2  0.0  2.2  0.0  0.0  0.3  1.8  3.0    0  4.0  0.8  1.7  3.4  0.4
##    Y
## JD  1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
##   1  0.0    0  0.0    0    0  6.6  0.4  2.6  2.2  0.0  0.2  0.0  0.0    0
##   2  0.0    0  0.0    0    0  1.4  0.2  1.2  1.1  3.8  2.4  0.0  0.2    0
##   3  0.0    0  0.0    0    0  0.6  0.0  0.0 32.0  6.2  0.0  0.0  5.6    0
##   4  0.8    0  1.6    0    0  0.0  0.0  0.0  4.0  0.4  0.2  0.2  1.0    0
##   5  0.1    0  0.0    0    0  0.0  8.0  9.6  0.0  0.0  0.0  0.0  6.0    0
##   6 16.8    0  1.0    0    0  1.8  1.8  1.8  0.4  0.0  0.0  0.0  7.6    0
##    Y
## JD  2003 2004 2005 2006 2007 2008 2009 2010 2011
##   1  3.4  2.8  0.0  8.0  3.2  0.0    0  1.6  0.0
##   2  0.8  0.2  1.2  1.0  2.6  0.2    0  0.6  0.0
##   3  0.2  0.0  0.0  0.4  0.0  1.2    0  4.0  0.0
##   4  2.4  0.0  0.0  0.0  1.2  0.0    0  0.2  0.8
##   5  0.6  0.0  0.0  0.2  0.0  6.6    0  0.2  0.2
##   6  1.2  0.0  0.2  0.0  0.6  0.6    0  0.0  0.0

# reshape::cast (group-wise operations can be combined with the reshaping)
# value = "RR" names the measured column explicitly instead of letting cast()
# guess it from the columns left over by the formula
head(cast(d[, c("JD", "Y", "RR")], JD ~ Y, value = "RR"))

##   JD 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## 1  1  0.0  0.0  5.5  0.0  1.0 18.3  1.6 10.4    0  0.0  0.1  9.3  1.2  0.0
## 2  2  0.0  0.0  2.4  1.3  0.0  0.0  0.0  0.0    0  0.0  0.0  1.5  2.6  0.8
## 3  3  0.0  0.4  0.0  0.0 10.8  0.0  0.0  0.0    0  8.0  0.0  1.1  0.6  2.2
## 4  4  0.0  0.0  0.0  1.8  0.0  6.0  3.2  0.0    0  1.5  0.0  0.0  0.0  0.0
## 5  5  0.0  0.0  0.4  0.0  0.2  2.0  0.0  0.0    0  0.5  0.0  5.7  0.6  0.0
## 6  6  0.2  0.0  2.2  0.0  0.0  0.3  1.8  3.0    0  4.0  0.8  1.7  3.4  0.4
##   1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
## 1  0.0    0  0.0    0    0  6.6  0.4  2.6  2.2  0.0  0.2  0.0  0.0    0
## 2  0.0    0  0.0    0    0  1.4  0.2  1.2  1.1  3.8  2.4  0.0  0.2    0
## 3  0.0    0  0.0    0    0  0.6  0.0  0.0 32.0  6.2  0.0  0.0  5.6    0
## 4  0.8    0  1.6    0    0  0.0  0.0  0.0  4.0  0.4  0.2  0.2  1.0    0
## 5  0.1    0  0.0    0    0  0.0  8.0  9.6  0.0  0.0  0.0  0.0  6.0    0
## 6 16.8    0  1.0    0    0  1.8  1.8  1.8  0.4  0.0  0.0  0.0  7.6    0
##   2003 2004 2005 2006 2007 2008 2009 2010 2011
## 1  3.4  2.8  0.0  8.0  3.2  0.0    0  1.6  0.0
## 2  0.8  0.2  1.2  1.0  2.6  0.2    0  0.6  0.0
## 3  0.2  0.0  0.0  0.4  0.0  1.2    0  4.0  0.0
## 4  2.4  0.0  0.0  0.0  1.2  0.0    0  0.2  0.8
## 5  0.6  0.0  0.0  0.2  0.0  6.6    0  0.2  0.2
## 6  1.2  0.0  0.2  0.0  0.6  0.6    0  0.0  0.0

# base::reshape (its arguments are hard to remember)
head(
  reshape(
    d[, c("JD", "Y", "RR")],
    direction = "wide",
    idvar = "JD",
    timevar = "Y"
  )
)

##   JD RR.1975 RR.1976 RR.1977 RR.1978 RR.1979 RR.1980 RR.1981 RR.1982
## 1  1     0.0     0.0     5.5     0.0     1.0    18.3     1.6    10.4
## 2  2     0.0     0.0     2.4     1.3     0.0     0.0     0.0     0.0
## 3  3     0.0     0.4     0.0     0.0    10.8     0.0     0.0     0.0
## 4  4     0.0     0.0     0.0     1.8     0.0     6.0     3.2     0.0
## 5  5     0.0     0.0     0.4     0.0     0.2     2.0     0.0     0.0
## 6  6     0.2     0.0     2.2     0.0     0.0     0.3     1.8     3.0
##   RR.1983 RR.1984 RR.1985 RR.1986 RR.1987 RR.1988 RR.1989 RR.1990 RR.1991
## 1       0     0.0     0.1     9.3     1.2     0.0     0.0       0     0.0
## 2       0     0.0     0.0     1.5     2.6     0.8     0.0       0     0.0
## 3       0     8.0     0.0     1.1     0.6     2.2     0.0       0     0.0
## 4       0     1.5     0.0     0.0     0.0     0.0     0.8       0     1.6
## 5       0     0.5     0.0     5.7     0.6     0.0     0.1       0     0.0
## 6       0     4.0     0.8     1.7     3.4     0.4    16.8       0     1.0
##   RR.1992 RR.1993 RR.1994 RR.1995 RR.1996 RR.1997 RR.1998 RR.1999 RR.2000
## 1       0       0     6.6     0.4     2.6     2.2     0.0     0.2     0.0
## 2       0       0     1.4     0.2     1.2     1.1     3.8     2.4     0.0
## 3       0       0     0.6     0.0     0.0    32.0     6.2     0.0     0.0
## 4       0       0     0.0     0.0     0.0     4.0     0.4     0.2     0.2
## 5       0       0     0.0     8.0     9.6     0.0     0.0     0.0     0.0
## 6       0       0     1.8     1.8     1.8     0.4     0.0     0.0     0.0
##   RR.2001 RR.2002 RR.2003 RR.2004 RR.2005 RR.2006 RR.2007 RR.2008 RR.2009
## 1     0.0       0     3.4     2.8     0.0     8.0     3.2     0.0       0
## 2     0.2       0     0.8     0.2     1.2     1.0     2.6     0.2       0
## 3     5.6       0     0.2     0.0     0.0     0.4     0.0     1.2       0
## 4     1.0       0     2.4     0.0     0.0     0.0     1.2     0.0       0
## 5     6.0       0     0.6     0.0     0.0     0.2     0.0     6.6       0
## 6     7.6       0     1.2     0.0     0.2     0.0     0.6     0.6       0
##   RR.2010 RR.2011
## 1     1.6     0.0
## 2     0.6     0.0
## 3     4.0     0.0
## 4     0.2     0.8
## 5     0.2     0.2
## 6     0.0     0.0


# the reverse: wide ("analysis") layout back to long ("database") layout,
# which is very convenient for plotting
# alternative: stack
dw <- xtabs(
  formula = RR ~ JD + Y,
  data = d[, c("JD", "Y", "RR")]
)
head(melt(dw))  # reshape::melt

##   JD    Y value
## 1  1 1975   0.0
## 2  2 1975   0.0
## 3  3 1975   0.0
## 4  4 1975   0.0
## 5  5 1975   0.0
## 6  6 1975   0.2

# one panel per year: melted value against JD
ggplot(data = melt(dw), aes(x = JD, y = value)) +
  geom_line() +
  facet_wrap(~Y)

plot of chunk reshape

appliquer une fonction par modalité d'un facteur : {split - apply - combine}

# pivot table with plyr: GLOT (radiation) summed per year
em <- ddply(.data = d, .variables = "Y", .fun = summarise, s = sum(GLOT))
head(em)

##      Y      s
## 1 1975 105679
## 2 1976 117201
## 3 1977 103814
## 4 1978 105746
## 5 1979 108346
## 6 1980 111569

# yearly sum s against year Y
ggplot(data = em, aes(x = Y, y = s)) +
  geom_point()

plot of chunk split

# correction applied to part of a column: GLOT for the years 1975-1991 is
# multiplied by 4.1868 (presumably a calorie -> joule unit conversion; confirm).
# The row mask is computed once and applied to the column directly, instead of
# evaluating the same row subset of d on both sides of the assignment.
# NOTE: not idempotent -- running this chunk twice applies the factor twice.
early_years <- d$Y %in% 1975:1991
d$GLOT[early_years] <- d$GLOT[early_years] * 4.1868

# alternative: aggregate() through its formula interface -- GLOT summed per year
head(aggregate(GLOT ~ Y, data = d, FUN = sum))

##      Y   GLOT
## 1 1975 442457
## 2 1976 490697
## 3 1977 434648
## 4 1978 442737
## 5 1979 453623
## 6 1980 467117

# GLOT summed per year and month
head(aggregate(GLOT ~ Y + M, data = d, FUN = sum))

##      Y M  GLOT
## 1 1975 1 14985
## 2 1976 1 15286
## 3 1977 1 13846
## 4 1978 1 12925
## 5 1979 1 14030
## 6 1980 1 15236

# several response columns at once: cbind() on the left-hand side of the formula
head(aggregate(cbind(GLOT, RR) ~ Y + M, data = d, FUN = sum))

##      Y M  GLOT   RR
## 1 1975 1 14985 59.8
## 2 1976 1 15286 18.8
## 3 1977 1 13846 47.6
## 4 1978 1 12925 91.8
## 5 1979 1 14030 74.4
## 6 1980 1 15236 77.7


# alternative with plyr (simpler when several summaries are wanted at once):
# per year, sum of GLOT (s), sum of RR (p) and mean of the daily (TN + TX) / 2 (t)
dm <- ddply(
  .data = d,
  .variables = "Y",
  .fun = summarise,
  s = sum(GLOT),
  p = sum(RR),
  t = mean((TN + TX) / 2)
)
ggplot(data = dm, aes(x = t, y = p)) +
  geom_text(aes(label = Y), size = 3)

plot of chunk split


# custom functions: writing a series of files (alternatives: apply, lapply)

#' Write one group of rows to "<dir><Y>.csv".
#'
#' @param x Data frame whose `Y` column holds a single distinct value; that
#'   value becomes the file name.
#' @param dir Output directory, ending with "/". The default is a relative
#'   path, resolved against the current working directory. The directory is
#'   created when it does not exist.
#' @return `NULL`, invisibly; the function is called for its side effect.
ecriture <- function(x, dir = "../data/serie/") {
  year <- unique(x$Y)
  # a group mixing several years would produce several file names
  stopifnot(length(year) == 1)
  # write.table() does not create missing directories
  dir.create(dir, recursive = TRUE, showWarnings = FALSE)
  write.table(
    x,
    file = paste0(dir, year, ".csv"),
    sep = ";",
    dec = ".",
    row.names = FALSE
  )
  invisible(NULL)
}
# call ecriture() once per year; it returns NULL, so ddply() has nothing to
# combine and yields an empty data frame
ddply(.data = d, .variables = "Y", .fun = ecriture)

## data frame with 0 columns and 0 rows


# custom functions: building a list of statistical models

#' Fit the linear model ETP ~ TX + RR + GLOT on one subset of the data.
#'
#' @param x Data frame with the columns ETP, TX, RR and GLOT.
#' @return The fitted `lm` object.
model <- function(x) lm(formula = ETP ~ TX + RR + GLOT, data = x)
models <- dlply(d, "Y", model)  # list of fitted models, one per year
models.coefs <- ldply(models, coef)  # one row of coefficients per year
# drop the intercept by name rather than by position (previously `[, -2]`), so
# the selection does not depend on the column order
is_intercept <- grepl("Intercept", colnames(models.coefs), fixed = TRUE)
ggplot(
  data = melt(models.coefs[, !is_intercept], id.vars = "Y"),
  aes(x = variable, y = value, group = Y)
) +
  geom_line()

plot of chunk split