# Documentation en ligne pour ggplot2 et plyr
# traitement de données
library(plyr)
library(reshape)
## Attaching package: 'reshape'
## The following object(s) are masked from 'package:plyr':
##
## rename, round_any
# graphiques
library(ggplot2)
# Remove every object from the workspace.
# NOTE(review): rm(list = ls()) only empties the global environment; loaded
# packages and options are untouched. Running the script in a fresh R session
# is the safer way to get a clean state.
rm(list = ls())
# Base directory used to build the file paths below (the R working directory
# itself is not changed)
wd <- "/home/casadebaig/Documents/Travail/Documentation/Seminaires/2012_Atelier_R/01_Donnees_2012/rscript/"
# Read the semicolon-separated source file (first line holds the column names)
d <- read.table(
  file = paste0(wd, "../data/BLA_1975-2011.csv"),
  header = TRUE, sep = ";", dec = "."
)
# Write the table back out as a semicolon-separated text file
write.table(
  d,
  file = paste0(wd, "../data/export.csv"),
  sep = ";", dec = ".", row.names = FALSE
)
# Save the object in R's binary format
save(d, file = paste0(wd, "../data/climate.RData"), compress = "bzip2")
# Reload it from the binary file
load(file = paste0(wd, "../data/climate.RData"))
# Automatisation de la lecture pour importer des séries de fichiers : à traiter plus tard
# Column selection based on column names
str(d)
## 'data.frame': 13510 obs. of 8 variables:
## $ Code.MF: int 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
## $ Date : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
## $ TX : num 10 6.3 8.4 11.2 3 2 8.1 11.5 7.1 13.7 ...
## $ TN : num -2.9 -4 -4.1 -1.3 1.2 0.4 0.5 6.2 5.8 2.6 ...
## $ TM : num NA NA NA NA NA NA NA NA NA NA ...
## $ RR : num 0 0 0 0 0 0.2 0 0 0 0 ...
## $ GLOT : num 114 114 114 114 114 114 114 114 114 114 ...
## $ ETP : num 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
# Logical selection vector: TRUE for every column except TX, TN and TM.
# Negating the %in% result directly avoids the redundant "!= TRUE" comparison.
s <- !(colnames(d) %in% c("TX", "TN", "TM"))
str(d[, s])
## 'data.frame': 13510 obs. of 5 variables:
## $ Code.MF: int 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
## $ Date : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
## $ RR : num 0 0 0 0 0 0.2 0 0 0 0 ...
## $ GLOT : num 114 114 114 114 114 114 114 114 114 114 ...
## $ ETP : num 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
# Column selection based on indices
# TX, TN and TM sit in columns 3 to 5 (see the str(d) listing); a negative
# index drops them, giving the same result as the name-based selections
str(d[, -(3:5)])
## 'data.frame': 13510 obs. of 5 variables:
## $ Code.MF: int 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
## $ Date : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
## $ RR : num 0 0 0 0 0 0.2 0 0 0 0 ...
## $ GLOT : num 114 114 114 114 114 114 114 114 114 114 ...
## $ ETP : num 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
# Same selection, with the column positions looked up from their names
str(d[, -match(c("TN", "TX", "TM"), colnames(d))])
## 'data.frame': 13510 obs. of 5 variables:
## $ Code.MF: int 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
## $ Date : Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
## $ RR : num 0 0 0 0 0 0.2 0 0 0 0 ...
## $ GLOT : num 114 114 114 114 114 114 114 114 114 114 ...
## $ ETP : num 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
# Logical AND: keep rows with TM between 0 and 10.
# which() keeps only the rows where the condition is TRUE; a raw logical index
# would also return one all-NA row for every NA in the condition (TM has NAs).
ggplot(data = d[which(d$TM >= 0 & d$TM <= 10), ], aes(x = TM)) +
  geom_histogram(binwidth = 0.1)
# Logical OR: keep rows with TX of at least 30 or TN of at most -5
ggplot(data = d[which(d$TX >= 30 | d$TN <= -5), ], aes(x = TM)) +
  geom_histogram(binwidth = 0.5)
# Alternatives: plyr::arrange, or a data.table structure.
# First six rows once the table is sorted by decreasing TX.
head(d[order(-d[["TX"]]), , drop = FALSE], n = 6L)
## Code.MF Date TX TN TM RR GLOT ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7 0 2725 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7 0 2591 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7 0 2430 9.00
## 2744 31069001 08/07/1982 40.2 19.5 NA 0 525 5.73
## 10448 31069001 13/08/2003 40.2 22.3 31.2 0 2344 9.00
## 3131 31069001 30/07/1983 40.1 19.3 NA 0 523 5.10
# Sort by decreasing TX, ties broken by decreasing TM
head(d[order(-d[["TX"]], -d[["TM"]]), , drop = FALSE], n = 6L)
## Code.MF Date TX TN TM RR GLOT ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7 0 2725 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7 0 2430 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7 0 2591 9.00
## 10448 31069001 13/08/2003 40.2 22.3 31.2 0 2344 9.00
## 2744 31069001 08/07/1982 40.2 19.5 NA 0 525 5.73
## 3131 31069001 30/07/1983 40.1 19.3 NA 0 523 5.10
# with() evaluates the index expression inside d, so the column names do not
# have to be prefixed with "d$"
head(with(d, d[order(-TX, -TM), , drop = FALSE]), n = 6L)
## Code.MF Date TX TN TM RR GLOT ETP
## 10439 31069001 04/08/2003 40.7 22.8 31.7 0 2725 9.00
## 10447 31069001 12/08/2003 40.3 23.2 31.7 0 2430 9.00
## 10440 31069001 05/08/2003 40.3 21.1 30.7 0 2591 9.00
## 10448 31069001 13/08/2003 40.2 22.3 31.2 0 2344 9.00
## 2744 31069001 08/07/1982 40.2 19.5 NA 0 525 5.73
## 3131 31069001 30/07/1983 40.1 19.3 NA 0 523 5.10
# Build a factor from the first four lower-case letters; levels default to
# alphabetical order
f <- factor(c("a", "b", "c", "d"))
print(f)
## [1] a b c d
## Levels: a b c d
# Re-create the factor with an explicit level order: the values are unchanged,
# only the order of the levels differs
f <- factor(f, levels = c("b", "a", "c", "d"))
print(f)
## [1] a b c d
## Levels: b a c d
# The dates are read as text in day/month/year form
str(d$Date)
## Factor w/ 13510 levels "01/01/1975","01/01/1976",..: 1 445 889 1333 1777 2221 2665 3109 3553 3997 ...
# Convert to class Date. strptime() would return a POSIXlt object, which is a
# list of date components and a poor fit for a data-frame column; Date is a
# plain atomic vector. as.character() makes the conversion work whether the
# column was read as a factor or as character.
d$Date <- as.Date(as.character(d$Date), format = "%d/%m/%Y")
str(d$Date)
## Date[1:13510], format: "1975-01-01" "1975-01-02" "1975-01-03" "1975-01-04" "1975-01-05" ...
# Add a column derived from GLOT
d$PAR <- d$GLOT / 100 * 0.48
# Delete a column (a no-op when the column does not exist)
d$departement <- NULL
# Add several columns at once: day of year, year, month and day of month.
# Alternative: plyr::mutate, which lets later columns reuse earlier ones.
d <- transform(
  d,
  JD = as.numeric(format(Date, format = "%j")),
  Y = as.numeric(format(Date, format = "%Y")),
  M = as.numeric(format(Date, format = "%m")),
  D = as.numeric(format(Date, format = "%d"))
)
str(d)
## 'data.frame': 13510 obs. of 13 variables:
## $ Code.MF: int 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 31069001 ...
## $ Date : Date, format: "1975-01-01" "1975-01-02" ...
## $ TX : num 10 6.3 8.4 11.2 3 2 8.1 11.5 7.1 13.7 ...
## $ TN : num -2.9 -4 -4.1 -1.3 1.2 0.4 0.5 6.2 5.8 2.6 ...
## $ TM : num NA NA NA NA NA NA NA NA NA NA ...
## $ RR : num 0 0 0 0 0 0.2 0 0 0 0 ...
## $ GLOT : num 114 114 114 114 114 114 114 114 114 114 ...
## $ ETP : num 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 0.37 ...
## $ PAR : num 0.547 0.547 0.547 0.547 0.547 ...
## $ JD : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Y : num 1975 1975 1975 1975 1975 ...
## $ M : num 1 1 1 1 1 1 1 1 1 1 ...
## $ D : num 1 2 3 4 5 6 7 8 9 10 ...
# One GLOT histogram per year (Y), with geom_histogram()'s default binning
ggplot(data = d, mapping = aes(x = GLOT)) +
  geom_histogram() +
  facet_wrap(~Y)
# Joins (combining tables)
# Small lookup table describing the station
postes <- data.frame(
  Code.MF = 31069001,
  site = "Blagnac",
  region = "MP",
  departement = 31
)
postes
## Code.MF site region departement
## 1 31069001 Blagnac MP 31
# Name the join key explicitly: by default merge() joins on every column name
# the two tables share, which silently changes if a column is added to either
head(merge(postes, d, by = "Code.MF"))
## Code.MF site region departement Date TX TN TM RR GLOT
## 1 31069001 Blagnac MP 31 1975-01-01 10.0 -2.9 NA 0.0 114
## 2 31069001 Blagnac MP 31 1975-01-02 6.3 -4.0 NA 0.0 114
## 3 31069001 Blagnac MP 31 1975-01-03 8.4 -4.1 NA 0.0 114
## 4 31069001 Blagnac MP 31 1975-01-04 11.2 -1.3 NA 0.0 114
## 5 31069001 Blagnac MP 31 1975-01-05 3.0 1.2 NA 0.0 114
## 6 31069001 Blagnac MP 31 1975-01-06 2.0 0.4 NA 0.2 114
## ETP PAR JD Y M D
## 1 0.37 0.5472 1 1975 1 1
## 2 0.37 0.5472 2 1975 1 2
## 3 0.37 0.5472 3 1975 1 3
## 4 0.37 0.5472 4 1975 1 4
## 5 0.37 0.5472 5 1975 1 5
## 6 0.37 0.5472 6 1975 1 6
# Long (database) format to wide (analysis) format
# base::xtabs (simple): RR summed into a JD-by-Y table
head(
  xtabs(RR ~ JD + Y, data = d[, c("JD", "Y", "RR")]),
  n = 6L
)
## Y
## JD 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## 1 0.0 0.0 5.5 0.0 1.0 18.3 1.6 10.4 0 0.0 0.1 9.3 1.2 0.0
## 2 0.0 0.0 2.4 1.3 0.0 0.0 0.0 0.0 0 0.0 0.0 1.5 2.6 0.8
## 3 0.0 0.4 0.0 0.0 10.8 0.0 0.0 0.0 0 8.0 0.0 1.1 0.6 2.2
## 4 0.0 0.0 0.0 1.8 0.0 6.0 3.2 0.0 0 1.5 0.0 0.0 0.0 0.0
## 5 0.0 0.0 0.4 0.0 0.2 2.0 0.0 0.0 0 0.5 0.0 5.7 0.6 0.0
## 6 0.2 0.0 2.2 0.0 0.0 0.3 1.8 3.0 0 4.0 0.8 1.7 3.4 0.4
## Y
## JD 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
## 1 0.0 0 0.0 0 0 6.6 0.4 2.6 2.2 0.0 0.2 0.0 0.0 0
## 2 0.0 0 0.0 0 0 1.4 0.2 1.2 1.1 3.8 2.4 0.0 0.2 0
## 3 0.0 0 0.0 0 0 0.6 0.0 0.0 32.0 6.2 0.0 0.0 5.6 0
## 4 0.8 0 1.6 0 0 0.0 0.0 0.0 4.0 0.4 0.2 0.2 1.0 0
## 5 0.1 0 0.0 0 0 0.0 8.0 9.6 0.0 0.0 0.0 0.0 6.0 0
## 6 16.8 0 1.0 0 0 1.8 1.8 1.8 0.4 0.0 0.0 0.0 7.6 0
## Y
## JD 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 1 3.4 2.8 0.0 8.0 3.2 0.0 0 1.6 0.0
## 2 0.8 0.2 1.2 1.0 2.6 0.2 0 0.6 0.0
## 3 0.2 0.0 0.0 0.4 0.0 1.2 0 4.0 0.0
## 4 2.4 0.0 0.0 0.0 1.2 0.0 0 0.2 0.8
## 5 0.6 0.0 0.0 0.2 0.0 6.6 0 0.2 0.2
## 6 1.2 0.0 0.2 0.0 0.6 0.6 0 0.0 0.0
# reshape::cast (group-wise operations can be combined with the reshaping).
# value = "RR" names the measured column explicitly instead of letting cast()
# guess it from the column layout.
head(cast(d[, c("JD", "Y", "RR")], JD ~ Y, value = "RR"))
## JD 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## 1 1 0.0 0.0 5.5 0.0 1.0 18.3 1.6 10.4 0 0.0 0.1 9.3 1.2 0.0
## 2 2 0.0 0.0 2.4 1.3 0.0 0.0 0.0 0.0 0 0.0 0.0 1.5 2.6 0.8
## 3 3 0.0 0.4 0.0 0.0 10.8 0.0 0.0 0.0 0 8.0 0.0 1.1 0.6 2.2
## 4 4 0.0 0.0 0.0 1.8 0.0 6.0 3.2 0.0 0 1.5 0.0 0.0 0.0 0.0
## 5 5 0.0 0.0 0.4 0.0 0.2 2.0 0.0 0.0 0 0.5 0.0 5.7 0.6 0.0
## 6 6 0.2 0.0 2.2 0.0 0.0 0.3 1.8 3.0 0 4.0 0.8 1.7 3.4 0.4
## 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002
## 1 0.0 0 0.0 0 0 6.6 0.4 2.6 2.2 0.0 0.2 0.0 0.0 0
## 2 0.0 0 0.0 0 0 1.4 0.2 1.2 1.1 3.8 2.4 0.0 0.2 0
## 3 0.0 0 0.0 0 0 0.6 0.0 0.0 32.0 6.2 0.0 0.0 5.6 0
## 4 0.8 0 1.6 0 0 0.0 0.0 0.0 4.0 0.4 0.2 0.2 1.0 0
## 5 0.1 0 0.0 0 0 0.0 8.0 9.6 0.0 0.0 0.0 0.0 6.0 0
## 6 16.8 0 1.0 0 0 1.8 1.8 1.8 0.4 0.0 0.0 0.0 7.6 0
## 2003 2004 2005 2006 2007 2008 2009 2010 2011
## 1 3.4 2.8 0.0 8.0 3.2 0.0 0 1.6 0.0
## 2 0.8 0.2 1.2 1.0 2.6 0.2 0 0.6 0.0
## 3 0.2 0.0 0.0 0.4 0.0 1.2 0 4.0 0.0
## 4 2.4 0.0 0.0 0.0 1.2 0.0 0 0.2 0.8
## 5 0.6 0.0 0.0 0.2 0.0 6.6 0 0.2 0.2
## 6 1.2 0.0 0.2 0.0 0.6 0.6 0 0.0 0.0
# base::reshape (the arguments are hard to remember): one row per JD, one
# RR.<year> column per value of Y
head(
  reshape(
    data = d[, c("JD", "Y", "RR")],
    direction = "wide",
    idvar = "JD",
    timevar = "Y"
  ),
  n = 6L
)
## JD RR.1975 RR.1976 RR.1977 RR.1978 RR.1979 RR.1980 RR.1981 RR.1982
## 1 1 0.0 0.0 5.5 0.0 1.0 18.3 1.6 10.4
## 2 2 0.0 0.0 2.4 1.3 0.0 0.0 0.0 0.0
## 3 3 0.0 0.4 0.0 0.0 10.8 0.0 0.0 0.0
## 4 4 0.0 0.0 0.0 1.8 0.0 6.0 3.2 0.0
## 5 5 0.0 0.0 0.4 0.0 0.2 2.0 0.0 0.0
## 6 6 0.2 0.0 2.2 0.0 0.0 0.3 1.8 3.0
## RR.1983 RR.1984 RR.1985 RR.1986 RR.1987 RR.1988 RR.1989 RR.1990 RR.1991
## 1 0 0.0 0.1 9.3 1.2 0.0 0.0 0 0.0
## 2 0 0.0 0.0 1.5 2.6 0.8 0.0 0 0.0
## 3 0 8.0 0.0 1.1 0.6 2.2 0.0 0 0.0
## 4 0 1.5 0.0 0.0 0.0 0.0 0.8 0 1.6
## 5 0 0.5 0.0 5.7 0.6 0.0 0.1 0 0.0
## 6 0 4.0 0.8 1.7 3.4 0.4 16.8 0 1.0
## RR.1992 RR.1993 RR.1994 RR.1995 RR.1996 RR.1997 RR.1998 RR.1999 RR.2000
## 1 0 0 6.6 0.4 2.6 2.2 0.0 0.2 0.0
## 2 0 0 1.4 0.2 1.2 1.1 3.8 2.4 0.0
## 3 0 0 0.6 0.0 0.0 32.0 6.2 0.0 0.0
## 4 0 0 0.0 0.0 0.0 4.0 0.4 0.2 0.2
## 5 0 0 0.0 8.0 9.6 0.0 0.0 0.0 0.0
## 6 0 0 1.8 1.8 1.8 0.4 0.0 0.0 0.0
## RR.2001 RR.2002 RR.2003 RR.2004 RR.2005 RR.2006 RR.2007 RR.2008 RR.2009
## 1 0.0 0 3.4 2.8 0.0 8.0 3.2 0.0 0
## 2 0.2 0 0.8 0.2 1.2 1.0 2.6 0.2 0
## 3 5.6 0 0.2 0.0 0.0 0.4 0.0 1.2 0
## 4 1.0 0 2.4 0.0 0.0 0.0 1.2 0.0 0
## 5 6.0 0 0.6 0.0 0.0 0.2 0.0 6.6 0
## 6 7.6 0 1.2 0.0 0.2 0.0 0.6 0.6 0
## RR.2010 RR.2011
## 1 1.6 0.0
## 2 0.6 0.0
## 3 4.0 0.0
## 4 0.2 0.8
## 5 0.2 0.2
## 6 0.0 0.0
# The reverse: wide (analysis) format back to long (database) format, which is
# very convenient for plotting. Alternative: stack.
dw <- xtabs(RR ~ JD + Y, data = d[, c("JD", "Y", "RR")])
# reshape::melt returns one row per (JD, Y) cell with its value
head(melt(dw), n = 6L)
## JD Y value
## 1 1 1975 0.0
## 2 2 1975 0.0
## 3 3 1975 0.0
## 4 4 1975 0.0
## 5 5 1975 0.0
## 6 6 1975 0.2
# Line plot of the melted table: value against JD, one panel per year
ggplot(data = melt(dw), mapping = aes(x = JD, y = value)) +
  geom_line() +
  facet_wrap(~Y)
# Pivot-table style summary with plyr
# Radiation (GLOT) summed per year
em <- ddply(.data = d, .variables = "Y", .fun = summarise, s = sum(GLOT))
head(em, n = 6L)
## Y s
## 1 1975 105679
## 2 1976 117201
## 3 1977 103814
## 4 1978 105746
## 5 1979 108346
## 6 1980 111569
# Yearly sums plotted against the year
ggplot(data = em, mapping = aes(x = Y, y = s)) +
  geom_point()
# Correct a portion of a column: rescale GLOT for the years 1975 to 1991.
# NOTE(review): 4.1868 looks like a calorie-to-joule conversion factor -- confirm.
early_years <- d$Y %in% 1975:1991
d$GLOT[early_years] <- d$GLOT[early_years] * 4.1868
# PAR was derived from GLOT before this correction, so refresh the same rows
# to keep the two columns consistent
d$PAR[early_years] <- d$GLOT[early_years] / 100 * 0.48
# Alternative: aggregate, through its formula interface
# GLOT summed per year
head(aggregate(GLOT ~ Y, data = d, FUN = sum), n = 6L)
## Y GLOT
## 1 1975 442457
## 2 1976 490697
## 3 1977 434648
## 4 1978 442737
## 5 1979 453623
## 6 1980 467117
# GLOT summed per year and month
head(aggregate(GLOT ~ Y + M, data = d, FUN = sum), n = 6L)
## Y M GLOT
## 1 1975 1 14985
## 2 1976 1 15286
## 3 1977 1 13846
## 4 1978 1 12925
## 5 1979 1 14030
## 6 1980 1 15236
# Several response columns at once: cbind() on the left-hand side of the formula
head(aggregate(cbind(GLOT, RR) ~ Y + M, data = d, FUN = sum), n = 6L)
## Y M GLOT RR
## 1 1975 1 14985 59.8
## 2 1976 1 15286 18.8
## 3 1977 1 13846 47.6
## 4 1978 1 12925 91.8
## 5 1979 1 14030 74.4
## 6 1980 1 15236 77.7
# Alternative: plyr (simpler when several different outputs are needed)
# Per year: GLOT sum (s), RR sum (p) and mean of the daily (TN + TX) / 2 (t)
dm <- ddply(
  .data = d,
  .variables = "Y",
  .fun = summarise,
  s = sum(GLOT),
  p = sum(RR),
  t = mean((TN + TX)/2)
)
# Each year is drawn as its own label at position (t, p)
ggplot(data = dm, mapping = aes(x = t, y = p)) +
  geom_text(mapping = aes(label = Y), size = 3)
# Custom functions: writing a series of files (alternatives: apply, lapply)
#
# ecriture() writes one group of rows to "<dir><Y>.csv" as a semicolon-separated
# file without row names.
#   x   : data frame whose column Y holds a single distinct value; that value
#         becomes the file name
#   dir : output directory, including its trailing separator (default
#         "../data/serie/", relative to the current working directory)
# Returns NULL invisibly; the function is called for its side effect.
ecriture <- function(x, dir = "../data/serie/") {
  year <- unique(x$Y)
  # Several years would produce several file names for a single write
  if (length(year) != 1L) {
    stop("'x' must contain exactly one distinct value of Y", call. = FALSE)
  }
  # write.table() cannot create a missing directory itself
  if (!dir.exists(dir)) {
    dir.create(dir, recursive = TRUE)
  }
  write.table(
    x,
    file = paste0(dir, year, ".csv"),
    sep = ";", dec = ".", row.names = FALSE
  )
  invisible(NULL)
}
# d_ply() calls ecriture once per year purely for its side effect and discards
# the results, so no empty data frame is built or printed
d_ply(d, "Y", ecriture)
# Custom functions: building a list of statistical models
#
# model() fits a linear model of ETP on TX, RR and GLOT.
#   x : data frame providing the columns ETP, TX, RR and GLOT
# Returns the fitted "lm" object.
model <- function(x) lm(ETP ~ TX + RR + GLOT, data = x)
models <- dlply(d, "Y", model) # list of fitted models, one per year
# One row of coefficients per year
models.coefs <- ldply(models, coef)
# Drop the intercept by name rather than by column position, so the selection
# stays correct if the column layout changes
slope_columns <- setdiff(names(models.coefs), "(Intercept)")
# One line per year across the remaining coefficients
ggplot(
  data = melt(models.coefs[, slope_columns], id.vars = "Y"),
  aes(x = variable, y = value, group = Y)
) +
  geom_line()