Importation des données dans R

Fichier .txt

  • Lecture de donnĂ©es avec read.table() : elle permet d’importer les donnĂ©es format .txt

donnees <- read.table(file=file.choose(),header=T,sep="\t", dec=".",row.names=1)

 

 

> banques <- read.table("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/banques.txt",header = T,sep = '\t',dec = ",")
  • Utiliser readLines pour avoir une libre lecture du document Ă  importer
> readLines("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/banques.txt",n=4)
[1] "Classe_PV\tAge_PV\tCadre\tNon_Cadre\tDiplome\tAge_Moy\tAnc_Moy\tSurface_\tType_concep\tNbr_reclam\tAge_RPV\tAnc_RPV\tAnc_RPV_PV\tQualt_client"
[2] "3\t29\t14\t10\t10\t41,71\t5,54\t12,75\tA\t16\t38\t21\t20\t0,37"                                                                               
[3] "3\t21\t6\t14\t6\t34,2\t4,7\t11,8\tA\t7\t41\t28\t9\t0,53"                                                                                      
[4] "5\t13\t3\t5\t3\t31,13\t6,63\t12,25\tA\t7\t46\t66\t24\t0,49"                                                                                   


 

read.csv(file.choose())  # Pour lire des donn?es separ?es par
                         # des virgules
                         # (avec le . pour s?parateur d?cimal).
read.csv2(file.choose()) # Pour lire des donnees separ?es par
                         # des points-virgules
                         # (avec la , pour separateur decimal).

Importation des fichiers SPSS, SAS, STATA, Excel dans R,

Logiciel Package Fonction R Extension du fichier
SPSS foreign read.spss .sav
Stata foreign read.dta .dta
SAS foreign read. xport .xpt
Matlab R.Matlab readMat .mat
Excel gdata read.xls .xls/xlxs

Résumé des données

On considère les données suivantes

> data <- read.table(header=TRUE, text='
+  subject sex condition before after change
+        1   F   placebo   10.1   6.9   -3.2
+        2   F   placebo    6.3   4.2   -2.1
+        3   M   aspirin   12.4   6.3   -6.1
+        4   F   placebo    8.1   6.1   -2.0
+        5   M   aspirin   15.2   9.9   -5.3
+        6   F   aspirin   10.9   7.0   -3.9
+        7   F   aspirin   11.6   8.5   -3.1
+        8   M   aspirin    9.5   3.0   -6.5
+        9   F   placebo   11.5   9.0   -2.5
+       10   M   placebo   11.9  11.0   -0.9
+       11   F   aspirin   11.4   8.0   -3.4
+       12   M   aspirin   10.0   4.4   -5.6
+       13   M   aspirin   12.5   5.4   -7.1
+       14   M   placebo   10.6  10.6    0.0
+       15   M   aspirin    9.1   4.3   -4.8
+       16   F   placebo   12.1  10.2   -1.9
+       17   F   placebo   11.0   8.8   -2.2
+       18   F   placebo   11.9  10.2   -1.7
+       19   M   aspirin    9.1   3.6   -5.5
+       20   M   placebo   13.5  12.4   -1.1
+       21   M   aspirin   12.0   7.5   -4.5
+       22   F   placebo    9.1   7.6   -1.5
+       23   M   placebo    9.9   8.0   -1.9
+       24   F   placebo    7.6   5.2   -2.4
+       25   F   placebo   11.8   9.7   -2.1
+       26   F   placebo   11.8  10.7   -1.1
+       27   F   aspirin   10.1   7.9   -2.2
+       28   M   aspirin   11.6   8.3   -3.3
+       29   F   aspirin   11.3   6.8   -4.5
+       30   F   placebo   10.3   8.3   -2.0
+  ')
> library(plyr)
> cdata <- ddply(data, c("sex", "condition"), summarise,
+                N    = length(change),
+                mean = mean(change),
+                sd   = sd(change),
+                se   = sd / sqrt(N)
+ )
> cdata
  sex condition  N      mean        sd        se
1   F   aspirin  5 -3.420000 0.8642916 0.3865230
2   F   placebo 12 -2.058333 0.5247655 0.1514867
3   M   aspirin  9 -5.411111 1.1307569 0.3769190
4   M   placebo  4 -0.975000 0.7804913 0.3902456

D’abord installer le package printr

> install.packages(
+   'printr',
+   type = 'source',
+   repos = c('http://yihui.name/xran', 'http://cran.rstudio.com')
+ )
> install.packages(
+   'printr',
+   type = 'source',
+   repos = c('http://yihui.name/xran', 'http://cran.rstudio.com')
+ )
> library(printr)
> colnames(cdata)=c("Genre","Condition","N","Moyenne","Ecart-type","Erreur Standard")
> cdata=format(cdata,digits = 3,decimal.mark = ".")
> cdata
Genre Condition N Moyenne Ecart-type Erreur Standard
F aspirin 5 -3.420 0.864 0.387
F placebo 12 -2.058 0.525 0.151
M aspirin 9 -5.411 1.131 0.377
M placebo 4 -0.975 0.780 0.390
> # On met quelques données manquantes.
> dataNA <- data
> dataNA$change[11:14] <- NA
> cdata <- ddply(dataNA, c("sex", "condition"), summarise,
+                N    = sum(!is.na(change)),
+                N.na = sum(is.na(change)==T),
+                mean = mean(change, na.rm=TRUE),
+                sd   = sd(change, na.rm=TRUE),
+                se   = sd / sqrt(N)
+ )
> colnames(cdata)=c("Genre","Condition","N"," NAs","Moyenne","Ecart-type","Erreur Standard")
> cdata=format(cdata,digits = 3,decimal.mark = ".")
> cdata
Genre Condition N NAs Moyenne Ecart-type Erreur Standard
F aspirin 4 1 -3.42 0.998 0.499
F placebo 12 0 -2.06 0.525 0.151
M aspirin 7 2 -5.14 1.067 0.403
M placebo 3 1 -1.30 0.529 0.306
> df=data.frame(data[,4:6])
> tmp <- do.call(data.frame, 
+            list(mean = apply(df, 2, mean),
+                 sd = apply(df, 2, sd),
+                 median = apply(df, 2, median),
+                 min = apply(df, 2, min),
+                 max = apply(df, 2, max),
+                 n = apply(df, 2, length)))
> tmp=format(tmp,digits = 3,decimal.mark = ".")
> tmp
mean sd median min max n
before 10.81 1.79 11.15 6.3 15.2 30
after 7.66 2.41 7.95 3.0 12.4 30
change -3.15 1.84 -2.45 -7.1 0.0 30
> df=data.frame(data[,2:3])
> library(sjPlot)
> tmp=sjt.frq(df,alternateRowColors = T,showSummary = F, no.output = TRUE)
> cat(tmp$knitr)
sex
value N raw % valid % cumulative %
F 17 56.67 56.67 56.67
M 13 43.33 43.33 100.00
missings 0 0.00

 

condition
value N raw % valid % cumulative %
aspirin 14 46.67 46.67 46.67
placebo 16 53.33 53.33 100.00
missings 0 0.00

 

Quelques Graphiques

2 Variables discrètes

Considérons le tableau de contigence suivant :

> require(graphics)
> M <- as.table( cbind( c( 18,28,14), c( 20,51,28) , c( 12,25,9))) 
> dimnames( M) <- list( Husband = c(" Tall", "Medium", "Short"), Wife = c(" Tall"," Medium", "Short"))
> M
Husband/Wife Tall Medium Short
Tall 18 20 12
Medium 28 51 25
Short 14 28 9

Graphe en mosaiques

> mosaicplot( M, col = c(" green", "red"),main = "Husband x Wife")

ou encore

> library(vcd)
Loading required package: grid
> mosaic(M, shade=T,main = "Husband x Wife")

Graphe en araigné.

On considère une base de données sur le pourcentage d’abondan scolaire dans 6 gouvernorats en Tunisie

> library(ggplot2)
>  source('~/Documents/Important_R_functions/CreatRadialPlot.R')
> df <- read.csv("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/drop_out_school_tunisia.csv")
> df <- df[,-1]
> tmp=format(df,digits = 3,decimal.mark = ".")
> tmp
group bizerte siliana monastir mahdia tunis sfax National
Female 7.58 5.95 11.8 6.57 12.33 3.96 8.25
Male 14.29 6.36 23.0 6.21 6.02 5.34 11.47
All 11.59 6.19 17.9 6.38 8.97 4.74 10.02
> CreateRadialPlot(df,grid.label.size = 5,axis.label.size = 4,group.line.width = 2,plot.extent.x.sf = 1.5,
+                  background.circle.colour = 'gray', grid.max = 26,grid.mid = round(df[3,8],1),grid.min = 4.5,axis.line.colour = 'black',legend.title = '')

Graphe avec googleVis

> library(googleVis)

Welcome to googleVis version 0.5.10

Please read the Google API Terms of Use
before you start using the package:
https://developers.google.com/terms/

Note, the plot method of googleVis will by default use
the standard browser to display its output.

See the googleVis package vignettes for more details,
or visit http://github.com/mages/googleVis.

To suppress this message use:
suppressPackageStartupMessages(library(googleVis))
> op <- options(gvis.plot.tag="chart")
> df1=t(df[,-1])
> df1=cbind.data.frame(colnames(df[,-1]),df1)
> 
> colnames(df1)=c("Gouvernorat","Female","Male","All")
> df1[,-1]=round(df1[,-1],2)
> Bar <- gvisBarChart(df1,xvar="Gouvernorat",yvar=c("Female","Male","All"),
+                     options=list(width=1250, height=700,title="Drop out School Rate",titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",bar="{groupWidth:'100%'}",hAxis="{format:'##,##%'}"))
> plot(Bar)
> Bar <- gvisLineChart(df1[,-4],xvar="Gouvernorat",
+                     options=list(width=1250, height=700,title="Drop out School Rate",titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",bar="{groupWidth:'100%'}",vAxis="{format:'##,##%'}"))
> plot(Bar)

Variables quantitatives

Boxplot

Imaginons le cas oĂ¹ on veut comparer la concentration dans l’air de certains polluants entre diffĂ©rentes grandes villes

  • On va d’abord simuler des donnĂ©es (crĂ©er notre base de donnĂ©es)
> set.seed(25)
> sd1 = 3
> co2.ny = rnorm(120, 7, sd1)
> co2.london = rnorm(120, 9, sd1)
> co2.la = rnorm(120, 10.5, sd1)
> ch4.ny = c(rnorm(100, 12, sd1), rep(NA, 20))
> ch4.london = c(rnorm(100, 15, sd1), rep(NA, 20)) 
> ch4.la = c(rnorm(100, 18, sd1), rep(NA, 20))

Remarquer qu’on ajouter des données manquantes. + On ajoute des localités et des poluants

> location = c('New York', 'London', 'Los Angeles', 'New York', 'London', 'Los Angeles')
> pollutant = c('CO2', 'CO2', 'CO2', 'CH4', 'CH4', 'CH4')
  • On combine le tout dans une mĂªme base de donnĂ©es (6 lignes et 120 colonnes)
> all.data = data.frame(rbind(co2.ny, co2.london, co2.la, ch4.ny, ch4.london, ch4.la))
  • On ajoute les locations et pollutants to the data
> all.data$location = location
> all.data$pollutant = pollutant
  • Les donnĂ©es
> head(all.data[,c(1:3,121:122)])
X1 X2 X3 location pollutant
co2.ny 6.364499 3.875227 3.540077 New York CO2
co2.london 10.627675 12.222713 6.985466 London CO2
co2.la 14.839187 17.808999 7.963327 Los Angeles CO2
ch4.ny 9.752869 13.481911 12.440884 New York CH4
ch4.london 15.726980 10.992601 20.837115 London CH4
ch4.la 16.812641 16.468553 19.941190 Los Angeles CH4
  • Maintenant on construit une base de donnĂ©es qui contient trois colonnes : (concentration du polluant, nom du polluant, ville)
> library(reshape2)
> stacked.data = melt(all.data, id = c('location', 'pollutant'))
> head(stacked.data)
location pollutant variable value
New York CO2 X1 6.364499
London CO2 X1 10.627675
Los Angeles CO2 X1 14.839187
New York CH4 X1 9.752869
London CH4 X1 15.726980
Los Angeles CH4 X1 16.812641
  • On supprime la colonne 3
> stacked.data=stacked.data[,-3]
  • Tracer le Boxplot
> boxplots.triple = boxplot(value~location + pollutant, data = stacked.data, at = c(1, 1.8, 2.6, 6, 6.8, 7.6), xaxt='n', ylim = c(min(0, min(co2.ny, co2.london, co2.la)), max(ch4.ny, ch4.london, ch4.la, na.rm = T)), col = c('white', 'white', 'gray'))
> 
> axis(side=1, at=c(1.8, 6.8), labels=c('Methane (ppb)\nNumber of Collections = 100', 'Carbon Dioxide (ppb)\nNumber of Collections = 120'), line=0.5, lwd=0)
> title('Comparing Pollution\nin London, Los Angeles, and New York')
> 
> ### Pour mettre en évidence le rectangle de Los Angeles
> rect(c(1.4, 6.4), boxplots.triple$stats[2, c(2, 5)], c(2.2, 7.2), boxplots.triple$stats[4, c(2, 5)], density=12, angle=45)
> 
> ### On ajoute les noms des villes
> text(c(1, 1.8, 2.6, 6, 6.8, 7.6), c(6, 8, 1, 17.75, 20, 15.5), c('London', 'Los\nAngeles', 'New\nYork', 'London', 'Los\nAngeles', 'New\nYork'))

  • L’object boxplots.triple
> boxplots.triple
$stats
          [,1]     [,2]      [,3]      [,4]      [,5]        [,6]
[1,]  6.913498 12.49495  5.222066  3.257579  2.825742 -0.03473925
[2,] 13.002184 16.30169  9.888339  7.150818  8.155106  4.03761211
[3,] 14.670203 18.12857 11.830681  9.257937 10.491617  6.03218343
[4,] 17.227927 20.03807 13.728101 11.244151 12.500854  8.08651344
[5,] 23.121217 24.24172 17.976904 16.045698 17.808999 14.10329422

$n
[1] 100 100 100 120 120 120

$conf
         [,1]     [,2]     [,3]     [,4]      [,5]     [,6]
[1,] 14.00254 17.53822 11.22400 8.667541  9.864814 5.448196
[2,] 15.33787 18.71891 12.43736 9.848333 11.118420 6.616171

$out
[1]  6.473599 10.288849 19.546686 17.623021  0.767745

$group
[1] 1 2 3 4 4

$names
[1] "London.CH4"      "Los Angeles.CH4" "New York.CH4"    "London.CO2"     
[5] "Los Angeles.CO2" "New York.CO2"   

Graphe de la densité ggplot2

> library(ggplot2)
> gr<-ggplot(data=stacked.data,aes(x=value,fill=location,color=location))+geom_density(alpha=.4)
> gr+xlab('Polution')+facet_wrap(~pollutant,ncol=1)

Matrice de corrélations : le package corrplot

Le package coroplot permet de tracer des matrices de corrélations. Il contient aussi des algorithmes qui permettent de réordonner les variables selon leurs corrélations.

Différentes méthodes de visualisation

En cercles.

> library(corrplot)
> data(mtcars)
> head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> M <- cor(mtcars)
> corrplot(M, method = "circle")

En carrés.

> corrplot(M, method = "square")

En ellipses.

> corrplot(M, method = "ellipse")

En nombres.

> corrplot(M, method = "number")

Avec des camemberts.

> corrplot(M, method = "pie")

Just la partie supérieure de la matrice

> corrplot(M, type = "upper")

Une représentation mixte

> corrplot.mixed(M, lower = "ellipse", upper = "number")

Réordonner la matrice de corrélations.

  • Il y a quatre mĂ©thodes appelĂ©es : AOE, FPC, ’hclustet 'alphabet. On peut aussi rĂ©ordonner la matrice manuellement utilisant la commande corrMatorder()

  • AOE est basĂ©e sur les angles entre les vecteurs propres de la matrice de corrĂ©lations

\[ a_i = \left\{\begin{array}{ll} \mbox{arctan}(e_{i2}/e_{i1}) & \mbox{si} e_{i1}>0 \\ \mbox{arctan} (e_{i2}/e_{i1}) + \pi, & \mbox{sinon.} \end{array}\right. \] oĂ¹ \(e_1\) et \(e_2\) sont les deux premiers vecteurs propres. + FPC basĂ©e sur la corrĂ©lation avec la 1ère composante principale.

  • hclust basĂ©e sur differentes methodes de classification hierarchiques.

  • alphabet ordonne les variables selon l’ordre alphabĂ©tique

> corrplot(M, order = "hclust")

  • On peut ajouter des rectangles.
> corrplot(M, order = "hclust",addrect = 3)

Quelques autres facons pour personaliser le graphe

  • Couleurs
> mycol <- colorRampPalette(c("red", "white", "blue"))
> corrplot(M, order = "hclust",addrect = 2,col=mycol(50))

  • Changer le fond
> wb <- c("white", "black")
> corrplot(M, order = "hclust", addrect = 2, col = wb, bg = "gold2")

Ajouter un test d’indépendance

> cor.mtest <- function(mat, conf.level = 0.95) {
+     mat <- as.matrix(mat)
+     n <- ncol(mat)
+     p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
+     diag(p.mat) <- 0
+     diag(lowCI.mat) <- diag(uppCI.mat) <- 1
+     for (i in 1:(n - 1)) {
+         for (j in (i + 1):n) {
+             tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level)
+             p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
+             lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
+             uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
+         }
+     }
+     return(list(p.mat, lowCI.mat, uppCI.mat))
+ }
> 
> res1 <- cor.mtest(mtcars, 0.95)
> res1
[[1]]
              [,1]         [,2]         [,3]         [,4]         [,5]
 [1,] 0.000000e+00 6.112687e-10 9.380327e-10 1.787835e-07 1.776240e-05
 [2,] 6.112687e-10 0.000000e+00 1.803002e-12 3.477861e-09 8.244636e-06
 [3,] 9.380327e-10 1.803002e-12 0.000000e+00 7.142679e-08 5.282022e-06
 [4,] 1.787835e-07 3.477861e-09 7.142679e-08 0.000000e+00 9.988772e-03
 [5,] 1.776240e-05 8.244636e-06 5.282022e-06 9.988772e-03 0.000000e+00
 [6,] 1.293959e-10 1.217567e-07 1.222311e-11 4.145827e-05 4.784260e-06
 [7,] 1.708199e-02 3.660533e-04 1.314404e-02 5.766253e-06 6.195826e-01
 [8,] 3.415937e-05 1.843018e-08 5.235012e-06 2.940896e-06 1.167553e-02
 [9,] 2.850207e-04 2.151207e-03 3.662114e-04 1.798309e-01 4.726790e-06
[10,] 5.400948e-03 4.173297e-03 9.635921e-04 4.930119e-01 8.360110e-06
[11,] 1.084446e-03 1.942340e-03 2.526789e-02 7.827810e-07 6.211834e-01
              [,6]         [,7]         [,8]         [,9]        [,10]
 [1,] 1.293959e-10 1.708199e-02 3.415937e-05 2.850207e-04 5.400948e-03
 [2,] 1.217567e-07 3.660533e-04 1.843018e-08 2.151207e-03 4.173297e-03
 [3,] 1.222311e-11 1.314404e-02 5.235012e-06 3.662114e-04 9.635921e-04
 [4,] 4.145827e-05 5.766253e-06 2.940896e-06 1.798309e-01 4.930119e-01
 [5,] 4.784260e-06 6.195826e-01 1.167553e-02 4.726790e-06 8.360110e-06
 [6,] 0.000000e+00 3.388683e-01 9.798492e-04 1.125440e-05 4.586601e-04
 [7,] 3.388683e-01 0.000000e+00 1.029669e-06 2.056621e-01 2.425344e-01
 [8,] 9.798492e-04 1.029669e-06 0.000000e+00 3.570439e-01 2.579439e-01
 [9,] 1.125440e-05 2.056621e-01 3.570439e-01 0.000000e+00 5.834043e-08
[10,] 4.586601e-04 2.425344e-01 2.579439e-01 5.834043e-08 0.000000e+00
[11,] 1.463861e-02 4.536949e-05 6.670496e-04 7.544526e-01 1.290291e-01
             [,11]
 [1,] 1.084446e-03
 [2,] 1.942340e-03
 [3,] 2.526789e-02
 [4,] 7.827810e-07
 [5,] 6.211834e-01
 [6,] 1.463861e-02
 [7,] 4.536949e-05
 [8,] 6.670496e-04
 [9,] 7.544526e-01
[10,] 1.290291e-01
[11,] 0.000000e+00

[[2]]
             [,1]       [,2]        [,3]       [,4]       [,5]        [,6]
 [1,]  1.00000000 -0.9257694 -0.92335937 -0.8852686  0.4360484 -0.93382641
 [2,] -0.92576936  1.0000000  0.80724418  0.6816016 -0.8429083  0.59657947
 [3,] -0.92335937  0.8072442  1.00000000  0.6106794 -0.8487237  0.78115863
 [4,] -0.88526861  0.6816016  0.61067938  1.0000000 -0.6895522  0.40251134
 [5,]  0.43604838 -0.8429083 -0.84872374 -0.6895522  1.0000000 -0.84997951
 [6,] -0.93382641  0.5965795  0.78115863  0.4025113 -0.8499795  1.00000000
 [7,]  0.08195487 -0.7792781 -0.67961513 -0.8475998 -0.2659470 -0.49335358
 [8,]  0.41036301 -0.9039393 -0.84883771 -0.8559675  0.1081948 -0.75711174
 [9,]  0.31755830 -0.7369979 -0.77926901 -0.5456270  0.4843991 -0.83867523
[10,]  0.15806177 -0.7180260 -0.75751468 -0.4544774  0.4641440 -0.77446381
[11,] -0.75464796  0.2184331  0.05367539  0.5431200 -0.4259976  0.09273981
             [,7]       [,8]       [,9]       [,10]       [,11]
 [1,]  0.08195487  0.4103630  0.3175583  0.15806177 -0.75464796
 [2,] -0.77927809 -0.9039393 -0.7369979 -0.71802597  0.21843307
 [3,] -0.67961513 -0.8488377 -0.7792690 -0.75751468  0.05367539
 [4,] -0.84759984 -0.8559675 -0.5456270 -0.45447743  0.54311998
 [5,] -0.26594700  0.1081948  0.4843991  0.46414402 -0.42599760
 [6,] -0.49335358 -0.7571117 -0.8386752 -0.77446381  0.09273981
 [7,]  1.00000000  0.5346428 -0.5356240 -0.52261830 -0.81780480
 [8,]  0.53464277  1.0000000 -0.1915957 -0.15371324 -0.76613289
 [9,] -0.53562398 -0.1915957  1.0000000  0.61589632 -0.29712041
[10,] -0.52261830 -0.1537132  0.6158963  1.00000000 -0.08250603
[11,] -0.81780480 -0.7661329 -0.2971204 -0.08250603  1.00000000

[[3]]
            [,1]       [,2]       [,3]       [,4]       [,5]       [,6]
 [1,]  1.0000000 -0.7163171 -0.7081376 -0.5860994  0.8322010 -0.7440872
 [2,] -0.7163171  1.0000000  0.9514607  0.9154223 -0.4646481  0.8887052
 [3,] -0.7081376  0.9514607  1.0000000  0.8932775 -0.4805193  0.9442902
 [4,] -0.5860994  0.9154223  0.8932775  1.0000000 -0.1186280  0.8192573
 [5,]  0.8322010 -0.4646481 -0.4805193 -0.1186280  1.0000000 -0.4839784
 [6,] -0.7440872  0.8887052  0.9442902  0.8192573 -0.4839784  1.0000000
 [7,]  0.6696186 -0.3055388 -0.1001493 -0.4774331  0.4263400  0.1852649
 [8,]  0.8223262 -0.6442689 -0.4808327 -0.5006318  0.6839680 -0.2556982
 [9,]  0.7844520 -0.2126675 -0.3055178  0.1152646  0.8501319 -0.4532461
[10,]  0.7100628 -0.1738615 -0.2565810  0.2332119  0.8427222 -0.2944887
[11,] -0.2503183  0.7397479  0.6536467  0.8708249  0.2663358  0.6755700
            [,7]       [,8]       [,9]      [,10]      [,11]
 [1,]  0.6696186  0.8223262  0.7844520  0.7100628 -0.2503183
 [2,] -0.3055388 -0.6442689 -0.2126675 -0.1738615  0.7397479
 [3,] -0.1001493 -0.4808327 -0.3055178 -0.2565810  0.6536467
 [4,] -0.4774331 -0.5006318  0.1152646  0.2332119  0.8708249
 [5,]  0.4263400  0.6839680  0.8501319  0.8427222  0.2663358
 [6,]  0.1852649 -0.2556982 -0.4532461 -0.2944887  0.6755700
 [7,]  1.0000000  0.8679076  0.1291876  0.1469065 -0.3988165
 [8,]  0.8679076  1.0000000  0.4883712  0.5175379 -0.2756654
 [9,]  0.1291876  0.4883712  1.0000000  0.8949546  0.3982389
[10,]  0.1469065  0.5175379  0.8949546  1.0000000  0.5684422
[11,] -0.3988165 -0.2756654  0.3982389  0.5684422  1.0000000
> res2 <- cor.mtest(mtcars, 0.99)
> corrplot(M, p.mat = res1[[1]], sig.level = 0.1)

ou

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01)

Ecrire les p-valeurs

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "p-value")

Mettre du blanc Ă  la place.

> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "blank")

Etude de cas : (ANOVA) Comparaison entre variétés de huiles.

Objectif

L’analyse de la variance (ANOVA) est une méthode qui permet d’étudier la variantion de la moyenne \(\mu\) du phénoméne étudié \(Y\) (variable quantitative) selon l’influence d’un ou de plusieurs facteurs d’expérience qualitatifs (traitements…).

Dans le cas oĂ¹ la moyenne n’est influencĂ©e que par un seul facteur (notĂ© \(X\)), il s’agit d’une analyse de la variance Ă  un facteur (one way ANOVA). Un facteur est une variable qualitative prĂ©sentant un nombre restreint de modalitĂ©s.

Le nombre de modalités (c’est-a-dire de niveaux) du facteur \(X\) sera noté \(I\). On suppose que \(Y\) suit une loi normale \({\mathcal N}(\mu_i, \sigma^2)\) sur chaque sous-population \(i\) définie par les modalités de \(X.\)

L’objectif est de tester les l’égalité des moyennes de ces \(I\) populations, à savoir tester l’hypothése nulle : \[H_0\,:\, \mu_1=\ldots=\mu_I,\;\;\mbox{ vs}\;\;H_1\;:\;\exists \,i\not j \mu_i\not=\mu_j.\]

Les données

Pour chaque type de population \(i\) (ou modalité \(i\) de \(X\) ou groupe \(i\)), on dispose d’un échantillon de \(n_i\) observations de la variable quantitative \(Y\) : \[y_{i,1},\ldots,y_{i,n_i}\]

Exemple

On considère une base de données sur des scores de préference de deux variétés de huiles d’olive. Ces scores sont donnés par des panels d’expert.

> sensoHuile <- read.csv("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/sensoHuile.csv")
> head(sensoHuile)
Echant Product Variety locality System Jury Fusty Moldy Viney Muddy Métali Rancid Fruitty Bitter Pungent
2 cheml,bouz,sp cheml bouzC sp A 0 0 0 0 0 0 2.0 1.23 0.50
2 cheml,bouz,sp cheml bouzC sp B 0 0 0 0 0 0 2.5 1.55 0.70
2 cheml,bouz,sp cheml bouzC sp C 0 0 0 0 0 0 3.0 1.50 0.55
2 cheml,bouz,sp cheml bouzC sp D 0 0 0 0 0 0 2.8 1.30 0.80
2 cheml,bouz,sp cheml bouzC sp E 0 0 0 0 0 0 3.0 1.46 0.50
2 cheml,bouz,sp cheml bouzC sp F 0 0 0 0 0 0 4.5 2.12 1.50
> attach(sensoHuile)

On veut déterminer s’il y a une différence significative dans le goût fruité entre les différentes variétés.

  • Une reprĂ©sentation graphique s’impose
  1. Une matrice résumant les statistiques par variété
> library(doBy)
Loading required package: survival
> cdata <- summaryBy(Fruitty ~ Variety, data=sensoHuile, FUN=c(length,mean,sd))
> colnames(cdata)=c("Variety","N","Mean","sd")
> cdata=format(cdata,digits=3)
> cdata
Variety N Mean sd
arbeq 144 2.89 1.34
cheml 136 3.22 1.07
  1. Le graphe des boxplots

  1. Test d’ANOVA
> mon.aov <- aov(Fruitty~Variety)
> summary(mon.aov)
             Df Sum Sq Mean Sq F value Pr(>F)  
Variety       1    7.5   7.483   5.052 0.0254 *
Residuals   278  411.8   1.481                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Validation des hypothéses : Egalité de la variance

> bartlett.test(Fruitty~Variety)

    Bartlett test of homogeneity of variances

data:  Fruitty by Variety
Bartlett's K-squared = 6.8875, df = 1, p-value = 0.00868

Deux solutions sont possibles

  1. Transformation de la variable `Fruitty
> LFruitty=log(Fruitty)
> bartlett.test(LFruitty~Variety)

    Bartlett test of homogeneity of variances

data:  LFruitty by Variety
Bartlett's K-squared = 11.383, df = 1, p-value = 0.0007412

Ou on peut utiliser un test (le test de Levene) plus robuste (moins sensible à la condition de normalité)

> library(car)
> leveneTest(LFruitty,Variety)
Df F value Pr(>F)
group 1 13.22624 0.0003289
278 NA NA
  1. Utiliser un autre test qui tient compte de l’inégalité des variances (test de Welch).
> oneway.test(Fruitty~Variety, var.equal = F)

    One-way analysis of means (not assuming equal variances)

data:  Fruitty and Variety
F = 5.1162, num df = 1.00, denom df = 270.65, p-value = 0.0245

Comparaison entre deux facteurs.

On va s’intéresser maintenant aux deux facteurs System et Variety

> library(doBy)
> int.conf=function(x){
+   z=t.test(x)
+   return(z$conf.int)
+ }
> cdata <- summaryBy(Fruitty ~ Variety+System, data=sensoHuile, FUN=c(length,mean,sd,int.conf))
> colnames(cdata)=c("Variety","System","N","Mean","sd","Lower","Upper")
> cdata1=format(cdata,digits=3)
> cdata1
Variety System N Mean sd Lower Upper
arbeq 2p 48 2.92 1.269 2.55 3.28
arbeq 3p 48 3.79 1.017 3.50 4.09
arbeq sp 48 1.98 1.065 1.67 2.28
cheml 2p 48 3.72 0.977 3.44 4.00
cheml 3p 48 3.71 0.527 3.56 3.86
cheml sp 40 2.04 0.669 1.82 2.25
Un graphique des intervalles de confiances
> p1<-ggplot(cdata, aes(x=System, y=Mean)) + 
+     geom_errorbar(aes(ymin=Lower, ymax=Upper), width=.2) +
+     geom_line(size=8) +
+     geom_point(size=8)+facet_wrap(~Variety)
> p1

ou

> pd <- position_dodge(0.4)
> p1<-ggplot(cdata, aes(x=System, y=Mean,group=Variety,color=Variety)) + 
+     geom_errorbar(aes(y=Mean,ymin=Lower, ymax=Upper), width=.2,position=pd) +
+     geom_point(size=4,position=pd)
> p1
ymax not defined: adjusting position using y instead

L’ANOVA à deux facteurs
> mon.aov <- aov(Fruitty~Variety+System)
> summary(mon.aov)
             Df Sum Sq Mean Sq F value  Pr(>F)    
Variety       1   7.48    7.48   7.825 0.00551 ** 
System        2 147.86   73.93  77.315 < 2e-16 ***
Residuals   276 263.92    0.96                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Comparaison deux Ă  deux

On veut effectuer toutes les comparaisons deux à deux en proposant plusieurs méthodes de correction du risque afin de tenir compte du problème de la multiplicité des tests.

> TukeyHSD(mon.aov)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Fruitty ~ Variety + System)

$Variety
                 diff        lwr       upr    p adj
cheml-arbeq 0.3270874 0.09690877 0.5572661 0.005514

$System
            diff         lwr        upr     p adj
3p-2p  0.4315625  0.09896904  0.7641560 0.0068994
sp-2p -1.3001608 -1.64022920 -0.9600924 0.0000000
sp-3p -1.7317233 -2.07179170 -1.3916549 0.0000000
Comparaison deux Ă  deux (par Variety)
  1. Deux ANOVA
> mon.aov1 <- aov(Fruitty~System,data=sensoHuile[Variety=='cheml',])
> summary(mon.aov1)
             Df Sum Sq Mean Sq F value Pr(>F)    
System        2  79.42   39.71   70.03 <2e-16 ***
Residuals   133  75.41    0.57                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> mon.aov2 <- aov(Fruitty~System,data=sensoHuile[Variety=='arbeq',])
> summary(mon.aov2)
             Df Sum Sq Mean Sq F value   Pr(>F)    
System        2  79.24   39.62   31.44 5.13e-12 ***
Residuals   141 177.71    1.26                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Comparaison deux Ă  deux
> TukeyHSD(mon.aov1)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == "cheml", ])

$System
           diff       lwr       upr     p adj
3p-2p -0.012500 -0.376813  0.351813 0.9963604
sp-2p -1.683333 -2.065428 -1.301239 0.0000000
sp-3p -1.670833 -2.052928 -1.288739 0.0000000
> TukeyHSD(mon.aov2)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == "arbeq", ])

$System
            diff        lwr        upr     p adj
3p-2p  0.8756250  0.3328071  1.4184429 0.0005793
sp-2p -0.9410417 -1.4838596 -0.3982238 0.0001992
sp-3p -1.8166667 -2.3594846 -1.2738488 0.0000000
  1. Un graphique pour finir.
> x1=TukeyHSD(mon.aov1)$System
> x2=TukeyHSD(mon.aov2)$System
> cdata=rbind.data.frame(x1,x2)
> cdata=cbind.data.frame(c(rep('Chemlali',3),rep('Arbequina',3)),rownames(cdata),cdata)
> colnames(cdata)[1:2]=c('Variety','System')
> cdata$System[4:6]=cdata$System[1:3]
> cdata
Variety System diff lwr upr p adj
3p-2p Chemlali 3p-2p -0.0125000 -0.3768130 0.3518130 0.9963604
sp-2p Chemlali sp-2p -1.6833333 -2.0654280 -1.3012386 0.0000000
sp-3p Chemlali sp-3p -1.6708333 -2.0529280 -1.2887386 0.0000000
3p-2p1 Arbequina 3p-2p 0.8756250 0.3328071 1.4184429 0.0005793
sp-2p1 Arbequina sp-2p -0.9410417 -1.4838596 -0.3982238 0.0001992
sp-3p1 Arbequina sp-3p -1.8166667 -2.3594846 -1.2738488 0.0000000
> p1<-ggplot(cdata, aes(x=System, y=diff)) + 
+     geom_errorbar(aes(ymin=lwr, ymax=upr), width=.2) +
+     geom_line(size=8) +
+     geom_point(size=8)+facet_wrap(~Variety)+geom_hline(y=0,type=2,colour='gray')
> p1
geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?

Utilisant la commande pairwise.t.test()

La fonction pairwise.t.test() permet d’effectuer toutes les comparaisons deux é deux en proposant plusieurs méthodes de correction du risque \(\alpha\) afin de tenir compte du probléme de la multiplicité des tests

> pair1=pairwise.t.test(Fruitty[Variety=='cheml'],System[Variety=='cheml'],p.adjust="bonf",pool.sd = F)
> pair1

    Pairwise comparisons using t tests with non-pooled SD 

data:  Fruitty[Variety == "cheml"] and System[Variety == "cheml"] 

   2p      3p     
3p 1       -      
sp 1.6e-14 < 2e-16

P value adjustment method: bonferroni 
> pair2=pairwise.t.test(Fruitty[Variety=='arbeq'],System[Variety=='arbeq'],p.adjust="bonf",pool.sd = F)
> pair2

    Pairwise comparisons using t tests with non-pooled SD 

data:  Fruitty[Variety == "arbeq"] and System[Variety == "arbeq"] 

   2p      3p     
3p 0.00101 -      
sp 0.00049 6.9e-13

P value adjustment method: bonferroni 

Le package agricolae

> library(agricolae)
> HSD.test(mon.aov1,"System", group=TRUE,console = T)

Study: mon.aov1 ~ "System"

HSD Test for Fruitty 

Mean Square Error:  0.5669893 

System,  means

    Fruitty       std  r Min Max
2p 3.720833 0.9773952 48 1.7 5.1
3p 3.708333 0.5274722 48 2.5 4.8
sp 2.037500 0.6685950 40 1.1 4.5

alpha: 0.05 ; Df Error: 133 
Critical Value of Studentized Range: 3.352029 

Harmonic Mean of Cell Sizes  45
Honestly Significant Difference: 0.3762608 

Means with the same letter are not significantly different.

Groups, Treatments and means
a    2p      3.721 
a    3p      3.708 
b    sp      2.038 
> HSD.test(mon.aov2,"System", group=TRUE,console = T)

Study: mon.aov2 ~ "System"

HSD Test for Fruitty 

Mean Square Error:  1.260349 

System,  means

    Fruitty      std  r Min Max
2p 2.916042 1.269298 48   1 5.0
3p 3.791667 1.017262 48   2 5.3
sp 1.975000 1.065414 48   1 4.5

alpha: 0.05 ; Df Error: 141 
Critical Value of Studentized Range: 3.349881 

Honestly Significant Difference: 0.5428179 

Means with the same letter are not significantly different.

Groups, Treatments and means
a    3p      3.792 
b    2p      2.916 
c    sp      1.975 

ou encore

> library(multcomp)
Loading required package: mvtnorm
Loading required package: TH.data
> tuk1 <- glht(mon.aov1, linfct = mcp(System = "Tukey"))
> summary(tuk1)         

     Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == 
    "cheml", ])

Linear Hypotheses:
             Estimate Std. Error t value Pr(>|t|)    
3p - 2p == 0  -0.0125     0.1537  -0.081    0.996    
sp - 2p == 0  -1.6833     0.1612 -10.442   <1e-06 ***
sp - 3p == 0  -1.6708     0.1612 -10.365   <1e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Adjusted p values reported -- single-step method)
> tuk1.cld <- cld(tuk1)
> tuk2 <- glht(mon.aov2, linfct = mcp(System = "Tukey"))
> summary(tuk2)         

     Simultaneous Tests for General Linear Hypotheses

Multiple Comparisons of Means: Tukey Contrasts


Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == 
    "arbeq", ])

Linear Hypotheses:
             Estimate Std. Error t value Pr(>|t|)    
3p - 2p == 0   0.8756     0.2292   3.821 0.000578 ***
sp - 2p == 0  -0.9410     0.2292  -4.106 0.000231 ***
sp - 3p == 0  -1.8167     0.2292  -7.927  < 1e-04 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Adjusted p values reported -- single-step method)
> tuk1.cld <- cld(tuk1)
> tuk2.cld <- cld(tuk2)
> opar <- par(mai=c(1,1,1.5,1),mfrow = c(1,2))
> cols1=as.numeric(factor(tuk1.cld$mcletters$Letters))+1
> cols2=as.numeric(factor(tuk2.cld$mcletters$Letters))+1
> plot(tuk1.cld,col=cols1,xlab="Chemlali")
> plot(tuk2.cld,col=cols2,xlab="Arbequina")

> par(opar)