R
read.table()
: elle permet d’importer les données format .txt
donnees <- read.table(file=file.choose(),header=T,sep="\t", dec=".",row.names=1)
Â
Â
> banques <- read.table("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/banques.txt",header = T,sep = '\t',dec = ",")
readLines
pour avoir une libre lecture du document Ă importer> readLines("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/banques.txt",n=4)
[1] "Classe_PV\tAge_PV\tCadre\tNon_Cadre\tDiplome\tAge_Moy\tAnc_Moy\tSurface_\tType_concep\tNbr_reclam\tAge_RPV\tAnc_RPV\tAnc_RPV_PV\tQualt_client"
[2] "3\t29\t14\t10\t10\t41,71\t5,54\t12,75\tA\t16\t38\t21\t20\t0,37"
[3] "3\t21\t6\t14\t6\t34,2\t4,7\t11,8\tA\t7\t41\t28\t9\t0,53"
[4] "5\t13\t3\t5\t3\t31,13\t6,63\t12,25\tA\t7\t46\t66\t24\t0,49"
Â
read.csv(file.choose()) # Pour lire des donn?es separ?es par
# des virgules
# (avec le . pour s?parateur d?cimal).
read.csv2(file.choose()) # Pour lire des donnees separ?es par
# des points-virgules
# (avec la , pour separateur decimal).
R
,Logiciel | Package | Fonction R |
Extension du fichier |
---|---|---|---|
SPSS | foreign |
read.spss |
.sav |
Stata | foreign |
read.dta |
.dta |
SAS | foreign |
read. xport |
.xpt |
Matlab | R.Matlab |
readMat |
.mat |
Excel | gdata |
read.xls |
.xls/xlxs |
On considère les données suivantes
> data <- read.table(header=TRUE, text='
+ subject sex condition before after change
+ 1 F placebo 10.1 6.9 -3.2
+ 2 F placebo 6.3 4.2 -2.1
+ 3 M aspirin 12.4 6.3 -6.1
+ 4 F placebo 8.1 6.1 -2.0
+ 5 M aspirin 15.2 9.9 -5.3
+ 6 F aspirin 10.9 7.0 -3.9
+ 7 F aspirin 11.6 8.5 -3.1
+ 8 M aspirin 9.5 3.0 -6.5
+ 9 F placebo 11.5 9.0 -2.5
+ 10 M placebo 11.9 11.0 -0.9
+ 11 F aspirin 11.4 8.0 -3.4
+ 12 M aspirin 10.0 4.4 -5.6
+ 13 M aspirin 12.5 5.4 -7.1
+ 14 M placebo 10.6 10.6 0.0
+ 15 M aspirin 9.1 4.3 -4.8
+ 16 F placebo 12.1 10.2 -1.9
+ 17 F placebo 11.0 8.8 -2.2
+ 18 F placebo 11.9 10.2 -1.7
+ 19 M aspirin 9.1 3.6 -5.5
+ 20 M placebo 13.5 12.4 -1.1
+ 21 M aspirin 12.0 7.5 -4.5
+ 22 F placebo 9.1 7.6 -1.5
+ 23 M placebo 9.9 8.0 -1.9
+ 24 F placebo 7.6 5.2 -2.4
+ 25 F placebo 11.8 9.7 -2.1
+ 26 F placebo 11.8 10.7 -1.1
+ 27 F aspirin 10.1 7.9 -2.2
+ 28 M aspirin 11.6 8.3 -3.3
+ 29 F aspirin 11.3 6.8 -4.5
+ 30 F placebo 10.3 8.3 -2.0
+ ')
plyr
.> library(plyr)
> cdata <- ddply(data, c("sex", "condition"), summarise,
+ N = length(change),
+ mean = mean(change),
+ sd = sd(change),
+ se = sd / sqrt(N)
+ )
> cdata
sex condition N mean sd se
1 F aspirin 5 -3.420000 0.8642916 0.3865230
2 F placebo 12 -2.058333 0.5247655 0.1514867
3 M aspirin 9 -5.411111 1.1307569 0.3769190
4 M placebo 4 -0.975000 0.7804913 0.3902456
html
.D’abord installer le package printr
> install.packages(
+ 'printr',
+ type = 'source',
+ repos = c('http://yihui.name/xran', 'http://cran.rstudio.com')
+ )
> install.packages(
+ 'printr',
+ type = 'source',
+ repos = c('http://yihui.name/xran', 'http://cran.rstudio.com')
+ )
> library(printr)
> colnames(cdata)=c("Genre","Condition","N","Moyenne","Ecart-type","Erreur Standard")
> cdata=format(cdata,digits = 3,decimal.mark = ".")
> cdata
Genre | Condition | N | Moyenne | Ecart-type | Erreur Standard |
---|---|---|---|---|---|
F | aspirin | 5 | -3.420 | 0.864 | 0.387 |
F | placebo | 12 | -2.058 | 0.525 | 0.151 |
M | aspirin | 9 | -5.411 | 1.131 | 0.377 |
M | placebo | 4 | -0.975 | 0.780 | 0.390 |
> # On met quelques données manquantes.
> dataNA <- data
> dataNA$change[11:14] <- NA
> cdata <- ddply(dataNA, c("sex", "condition"), summarise,
+ N = sum(!is.na(change)),
+ N.na = sum(is.na(change)==T),
+ mean = mean(change, na.rm=TRUE),
+ sd = sd(change, na.rm=TRUE),
+ se = sd / sqrt(N)
+ )
> colnames(cdata)=c("Genre","Condition","N"," NAs","Moyenne","Ecart-type","Erreur Standard")
> cdata=format(cdata,digits = 3,decimal.mark = ".")
> cdata
Genre | Condition | N | NAs | Moyenne | Ecart-type | Erreur Standard |
---|---|---|---|---|---|---|
F | aspirin | 4 | 1 | -3.42 | 0.998 | 0.499 |
F | placebo | 12 | 0 | -2.06 | 0.525 | 0.151 |
M | aspirin | 7 | 2 | -5.14 | 1.067 | 0.403 |
M | placebo | 3 | 1 | -1.30 | 0.529 | 0.306 |
> df=data.frame(data[,4:6])
> tmp <- do.call(data.frame,
+ list(mean = apply(df, 2, mean),
+ sd = apply(df, 2, sd),
+ median = apply(df, 2, median),
+ min = apply(df, 2, min),
+ max = apply(df, 2, max),
+ n = apply(df, 2, length)))
> tmp=format(tmp,digits = 3,decimal.mark = ".")
> tmp
mean | sd | median | min | max | n | |
---|---|---|---|---|---|---|
before | 10.81 | 1.79 | 11.15 | 6.3 | 15.2 | 30 |
after | 7.66 | 2.41 | 7.95 | 3.0 | 12.4 | 30 |
change | -3.15 | 1.84 | -2.45 | -7.1 | 0.0 | 30 |
> df=data.frame(data[,2:3])
> library(sjPlot)
> tmp=sjt.frq(df,alternateRowColors = T,showSummary = F, no.output = TRUE)
> cat(tmp$knitr)
value | N | raw % | valid % | cumulative % |
---|---|---|---|---|
F | 17 | 56.67 | 56.67 | 56.67 |
M | 13 | 43.33 | 43.33 | 100.00 |
missings | 0 | 0.00 |
Â
value | N | raw % | valid % | cumulative % |
---|---|---|---|---|
aspirin | 14 | 46.67 | 46.67 | 46.67 |
placebo | 16 | 53.33 | 53.33 | 100.00 |
missings | 0 | 0.00 |
Â
Considérons le tableau de contigence suivant :
> require(graphics)
> M <- as.table( cbind( c( 18,28,14), c( 20,51,28) , c( 12,25,9)))
> dimnames( M) <- list( Husband = c(" Tall", "Medium", "Short"), Wife = c(" Tall"," Medium", "Short"))
> M
Husband/Wife | Tall | Medium | Short |
---|---|---|---|
Tall | 18 | 20 | 12 |
Medium | 28 | 51 | 25 |
Short | 14 | 28 | 9 |
> mosaicplot( M, col = c(" green", "red"),main = "Husband x Wife")
ou encore
> library(vcd)
Loading required package: grid
> mosaic(M, shade=T,main = "Husband x Wife")
On considère une base de données sur le pourcentage d’abondan scolaire dans 6 gouvernorats en Tunisie
> library(ggplot2)
> source('~/Documents/Important_R_functions/CreatRadialPlot.R')
> df <- read.csv("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/drop_out_school_tunisia.csv")
> df <- df[,-1]
> tmp=format(df,digits = 3,decimal.mark = ".")
> tmp
group | bizerte | siliana | monastir | mahdia | tunis | sfax | National |
---|---|---|---|---|---|---|---|
Female | 7.58 | 5.95 | 11.8 | 6.57 | 12.33 | 3.96 | 8.25 |
Male | 14.29 | 6.36 | 23.0 | 6.21 | 6.02 | 5.34 | 11.47 |
All | 11.59 | 6.19 | 17.9 | 6.38 | 8.97 | 4.74 | 10.02 |
> CreateRadialPlot(df,grid.label.size = 5,axis.label.size = 4,group.line.width = 2,plot.extent.x.sf = 1.5,
+ background.circle.colour = 'gray', grid.max = 26,grid.mid = round(df[3,8],1),grid.min = 4.5,axis.line.colour = 'black',legend.title = '')
googleVis
> library(googleVis)
Welcome to googleVis version 0.5.10
Please read the Google API Terms of Use
before you start using the package:
https://developers.google.com/terms/
Note, the plot method of googleVis will by default use
the standard browser to display its output.
See the googleVis package vignettes for more details,
or visit http://github.com/mages/googleVis.
To suppress this message use:
suppressPackageStartupMessages(library(googleVis))
> op <- options(gvis.plot.tag="chart")
> df1=t(df[,-1])
> df1=cbind.data.frame(colnames(df[,-1]),df1)
>
> colnames(df1)=c("Gouvernorat","Female","Male","All")
> df1[,-1]=round(df1[,-1],2)
> Bar <- gvisBarChart(df1,xvar="Gouvernorat",yvar=c("Female","Male","All"),
+ options=list(width=1250, height=700,title="Drop out School Rate",titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",bar="{groupWidth:'100%'}",hAxis="{format:'##,##%'}"))
> plot(Bar)
> Bar <- gvisLineChart(df1[,-4],xvar="Gouvernorat",
+ options=list(width=1250, height=700,title="Drop out School Rate",titleTextStyle="{color:'red',fontName:'Courier',fontSize:16}",bar="{groupWidth:'100%'}",vAxis="{format:'##,##%'}"))
> plot(Bar)
Imaginons le cas oĂ¹ on veut comparer la concentration dans l’air de certains polluants entre diffĂ©rentes grandes villes
> set.seed(25)
> sd1 = 3
> co2.ny = rnorm(120, 7, sd1)
> co2.london = rnorm(120, 9, sd1)
> co2.la = rnorm(120, 10.5, sd1)
> ch4.ny = c(rnorm(100, 12, sd1), rep(NA, 20))
> ch4.london = c(rnorm(100, 15, sd1), rep(NA, 20))
> ch4.la = c(rnorm(100, 18, sd1), rep(NA, 20))
Remarquer qu’on ajouter des données manquantes. + On ajoute des localités et des poluants
> location = c('New York', 'London', 'Los Angeles', 'New York', 'London', 'Los Angeles')
> pollutant = c('CO2', 'CO2', 'CO2', 'CH4', 'CH4', 'CH4')
> all.data = data.frame(rbind(co2.ny, co2.london, co2.la, ch4.ny, ch4.london, ch4.la))
> all.data$location = location
> all.data$pollutant = pollutant
> head(all.data[,c(1:3,121:122)])
X1 | X2 | X3 | location | pollutant | |
---|---|---|---|---|---|
co2.ny | 6.364499 | 3.875227 | 3.540077 | New York | CO2 |
co2.london | 10.627675 | 12.222713 | 6.985466 | London | CO2 |
co2.la | 14.839187 | 17.808999 | 7.963327 | Los Angeles | CO2 |
ch4.ny | 9.752869 | 13.481911 | 12.440884 | New York | CH4 |
ch4.london | 15.726980 | 10.992601 | 20.837115 | London | CH4 |
ch4.la | 16.812641 | 16.468553 | 19.941190 | Los Angeles | CH4 |
> library(reshape2)
> stacked.data = melt(all.data, id = c('location', 'pollutant'))
> head(stacked.data)
location | pollutant | variable | value |
---|---|---|---|
New York | CO2 | X1 | 6.364499 |
London | CO2 | X1 | 10.627675 |
Los Angeles | CO2 | X1 | 14.839187 |
New York | CH4 | X1 | 9.752869 |
London | CH4 | X1 | 15.726980 |
Los Angeles | CH4 | X1 | 16.812641 |
> stacked.data=stacked.data[,-3]
> boxplots.triple = boxplot(value~location + pollutant, data = stacked.data, at = c(1, 1.8, 2.6, 6, 6.8, 7.6), xaxt='n', ylim = c(min(0, min(co2.ny, co2.london, co2.la)), max(ch4.ny, ch4.london, ch4.la, na.rm = T)), col = c('white', 'white', 'gray'))
>
> axis(side=1, at=c(1.8, 6.8), labels=c('Methane (ppb)\nNumber of Collections = 100', 'Carbon Dioxide (ppb)\nNumber of Collections = 120'), line=0.5, lwd=0)
> title('Comparing Pollution\nin London, Los Angeles, and New York')
>
> ### Pour mettre en évidence le rectangle de Los Angeles
> rect(c(1.4, 6.4), boxplots.triple$stats[2, c(2, 5)], c(2.2, 7.2), boxplots.triple$stats[4, c(2, 5)], density=12, angle=45)
>
> ### On ajoute les noms des villes
> text(c(1, 1.8, 2.6, 6, 6.8, 7.6), c(6, 8, 1, 17.75, 20, 15.5), c('London', 'Los\nAngeles', 'New\nYork', 'London', 'Los\nAngeles', 'New\nYork'))
boxplots.triple
> boxplots.triple
$stats
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 6.913498 12.49495 5.222066 3.257579 2.825742 -0.03473925
[2,] 13.002184 16.30169 9.888339 7.150818 8.155106 4.03761211
[3,] 14.670203 18.12857 11.830681 9.257937 10.491617 6.03218343
[4,] 17.227927 20.03807 13.728101 11.244151 12.500854 8.08651344
[5,] 23.121217 24.24172 17.976904 16.045698 17.808999 14.10329422
$n
[1] 100 100 100 120 120 120
$conf
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 14.00254 17.53822 11.22400 8.667541 9.864814 5.448196
[2,] 15.33787 18.71891 12.43736 9.848333 11.118420 6.616171
$out
[1] 6.473599 10.288849 19.546686 17.623021 0.767745
$group
[1] 1 2 3 4 4
$names
[1] "London.CH4" "Los Angeles.CH4" "New York.CH4" "London.CO2"
[5] "Los Angeles.CO2" "New York.CO2"
ggplot2
> library(ggplot2)
> gr<-ggplot(data=stacked.data,aes(x=value,fill=location,color=location))+geom_density(alpha=.4)
> gr+xlab('Polution')+facet_wrap(~pollutant,ncol=1)
package
corrplot
Le package
coroplot
permet de tracer des matrices de corrélations. Il contient aussi des algorithmes qui permettent de réordonner les variables selon leurs corrélations.
> library(corrplot)
> data(mtcars)
> head(mtcars)
mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
> M <- cor(mtcars)
> corrplot(M, method = "circle")
> corrplot(M, method = "square")
> corrplot(M, method = "ellipse")
> corrplot(M, method = "number")
> corrplot(M, method = "pie")
> corrplot(M, type = "upper")
> corrplot.mixed(M, lower = "ellipse", upper = "number")
Il y a quatre méthodes appelées : AOE
, FPC
, ’hclustet 'alphabet
. On peut aussi réordonner la matrice manuellement utilisant la commande corrMatorder()
AOE
est basée sur les angles entre les vecteurs propres de la matrice de corrélations
\[ a_i = \left\{\begin{array}{ll} \mbox{arctan}(e_{i2}/e_{i1}) & \mbox{si} e_{i1}>0 \\
\mbox{arctan} (e_{i2}/e_{i1}) + \pi, & \mbox{sinon.} \end{array}\right. \] oĂ¹ \(e_1\) et \(e_2\) sont les deux premiers vecteurs propres. + FPC
basée sur la corrélation avec la 1ère composante principale.
hclust
basée sur differentes methodes de classification hierarchiques.
alphabet
ordonne les variables selon l’ordre alphabétique
> corrplot(M, order = "hclust")
> corrplot(M, order = "hclust",addrect = 3)
> mycol <- colorRampPalette(c("red", "white", "blue"))
> corrplot(M, order = "hclust",addrect = 2,col=mycol(50))
> wb <- c("white", "black")
> corrplot(M, order = "hclust", addrect = 2, col = wb, bg = "gold2")
> cor.mtest <- function(mat, conf.level = 0.95) {
+ mat <- as.matrix(mat)
+ n <- ncol(mat)
+ p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
+ diag(p.mat) <- 0
+ diag(lowCI.mat) <- diag(uppCI.mat) <- 1
+ for (i in 1:(n - 1)) {
+ for (j in (i + 1):n) {
+ tmp <- cor.test(mat[, i], mat[, j], conf.level = conf.level)
+ p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
+ lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
+ uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
+ }
+ }
+ return(list(p.mat, lowCI.mat, uppCI.mat))
+ }
>
> res1 <- cor.mtest(mtcars, 0.95)
> res1
[[1]]
[,1] [,2] [,3] [,4] [,5]
[1,] 0.000000e+00 6.112687e-10 9.380327e-10 1.787835e-07 1.776240e-05
[2,] 6.112687e-10 0.000000e+00 1.803002e-12 3.477861e-09 8.244636e-06
[3,] 9.380327e-10 1.803002e-12 0.000000e+00 7.142679e-08 5.282022e-06
[4,] 1.787835e-07 3.477861e-09 7.142679e-08 0.000000e+00 9.988772e-03
[5,] 1.776240e-05 8.244636e-06 5.282022e-06 9.988772e-03 0.000000e+00
[6,] 1.293959e-10 1.217567e-07 1.222311e-11 4.145827e-05 4.784260e-06
[7,] 1.708199e-02 3.660533e-04 1.314404e-02 5.766253e-06 6.195826e-01
[8,] 3.415937e-05 1.843018e-08 5.235012e-06 2.940896e-06 1.167553e-02
[9,] 2.850207e-04 2.151207e-03 3.662114e-04 1.798309e-01 4.726790e-06
[10,] 5.400948e-03 4.173297e-03 9.635921e-04 4.930119e-01 8.360110e-06
[11,] 1.084446e-03 1.942340e-03 2.526789e-02 7.827810e-07 6.211834e-01
[,6] [,7] [,8] [,9] [,10]
[1,] 1.293959e-10 1.708199e-02 3.415937e-05 2.850207e-04 5.400948e-03
[2,] 1.217567e-07 3.660533e-04 1.843018e-08 2.151207e-03 4.173297e-03
[3,] 1.222311e-11 1.314404e-02 5.235012e-06 3.662114e-04 9.635921e-04
[4,] 4.145827e-05 5.766253e-06 2.940896e-06 1.798309e-01 4.930119e-01
[5,] 4.784260e-06 6.195826e-01 1.167553e-02 4.726790e-06 8.360110e-06
[6,] 0.000000e+00 3.388683e-01 9.798492e-04 1.125440e-05 4.586601e-04
[7,] 3.388683e-01 0.000000e+00 1.029669e-06 2.056621e-01 2.425344e-01
[8,] 9.798492e-04 1.029669e-06 0.000000e+00 3.570439e-01 2.579439e-01
[9,] 1.125440e-05 2.056621e-01 3.570439e-01 0.000000e+00 5.834043e-08
[10,] 4.586601e-04 2.425344e-01 2.579439e-01 5.834043e-08 0.000000e+00
[11,] 1.463861e-02 4.536949e-05 6.670496e-04 7.544526e-01 1.290291e-01
[,11]
[1,] 1.084446e-03
[2,] 1.942340e-03
[3,] 2.526789e-02
[4,] 7.827810e-07
[5,] 6.211834e-01
[6,] 1.463861e-02
[7,] 4.536949e-05
[8,] 6.670496e-04
[9,] 7.544526e-01
[10,] 1.290291e-01
[11,] 0.000000e+00
[[2]]
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.00000000 -0.9257694 -0.92335937 -0.8852686 0.4360484 -0.93382641
[2,] -0.92576936 1.0000000 0.80724418 0.6816016 -0.8429083 0.59657947
[3,] -0.92335937 0.8072442 1.00000000 0.6106794 -0.8487237 0.78115863
[4,] -0.88526861 0.6816016 0.61067938 1.0000000 -0.6895522 0.40251134
[5,] 0.43604838 -0.8429083 -0.84872374 -0.6895522 1.0000000 -0.84997951
[6,] -0.93382641 0.5965795 0.78115863 0.4025113 -0.8499795 1.00000000
[7,] 0.08195487 -0.7792781 -0.67961513 -0.8475998 -0.2659470 -0.49335358
[8,] 0.41036301 -0.9039393 -0.84883771 -0.8559675 0.1081948 -0.75711174
[9,] 0.31755830 -0.7369979 -0.77926901 -0.5456270 0.4843991 -0.83867523
[10,] 0.15806177 -0.7180260 -0.75751468 -0.4544774 0.4641440 -0.77446381
[11,] -0.75464796 0.2184331 0.05367539 0.5431200 -0.4259976 0.09273981
[,7] [,8] [,9] [,10] [,11]
[1,] 0.08195487 0.4103630 0.3175583 0.15806177 -0.75464796
[2,] -0.77927809 -0.9039393 -0.7369979 -0.71802597 0.21843307
[3,] -0.67961513 -0.8488377 -0.7792690 -0.75751468 0.05367539
[4,] -0.84759984 -0.8559675 -0.5456270 -0.45447743 0.54311998
[5,] -0.26594700 0.1081948 0.4843991 0.46414402 -0.42599760
[6,] -0.49335358 -0.7571117 -0.8386752 -0.77446381 0.09273981
[7,] 1.00000000 0.5346428 -0.5356240 -0.52261830 -0.81780480
[8,] 0.53464277 1.0000000 -0.1915957 -0.15371324 -0.76613289
[9,] -0.53562398 -0.1915957 1.0000000 0.61589632 -0.29712041
[10,] -0.52261830 -0.1537132 0.6158963 1.00000000 -0.08250603
[11,] -0.81780480 -0.7661329 -0.2971204 -0.08250603 1.00000000
[[3]]
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.0000000 -0.7163171 -0.7081376 -0.5860994 0.8322010 -0.7440872
[2,] -0.7163171 1.0000000 0.9514607 0.9154223 -0.4646481 0.8887052
[3,] -0.7081376 0.9514607 1.0000000 0.8932775 -0.4805193 0.9442902
[4,] -0.5860994 0.9154223 0.8932775 1.0000000 -0.1186280 0.8192573
[5,] 0.8322010 -0.4646481 -0.4805193 -0.1186280 1.0000000 -0.4839784
[6,] -0.7440872 0.8887052 0.9442902 0.8192573 -0.4839784 1.0000000
[7,] 0.6696186 -0.3055388 -0.1001493 -0.4774331 0.4263400 0.1852649
[8,] 0.8223262 -0.6442689 -0.4808327 -0.5006318 0.6839680 -0.2556982
[9,] 0.7844520 -0.2126675 -0.3055178 0.1152646 0.8501319 -0.4532461
[10,] 0.7100628 -0.1738615 -0.2565810 0.2332119 0.8427222 -0.2944887
[11,] -0.2503183 0.7397479 0.6536467 0.8708249 0.2663358 0.6755700
[,7] [,8] [,9] [,10] [,11]
[1,] 0.6696186 0.8223262 0.7844520 0.7100628 -0.2503183
[2,] -0.3055388 -0.6442689 -0.2126675 -0.1738615 0.7397479
[3,] -0.1001493 -0.4808327 -0.3055178 -0.2565810 0.6536467
[4,] -0.4774331 -0.5006318 0.1152646 0.2332119 0.8708249
[5,] 0.4263400 0.6839680 0.8501319 0.8427222 0.2663358
[6,] 0.1852649 -0.2556982 -0.4532461 -0.2944887 0.6755700
[7,] 1.0000000 0.8679076 0.1291876 0.1469065 -0.3988165
[8,] 0.8679076 1.0000000 0.4883712 0.5175379 -0.2756654
[9,] 0.1291876 0.4883712 1.0000000 0.8949546 0.3982389
[10,] 0.1469065 0.5175379 0.8949546 1.0000000 0.5684422
[11,] -0.3988165 -0.2756654 0.3982389 0.5684422 1.0000000
> res2 <- cor.mtest(mtcars, 0.99)
> corrplot(M, p.mat = res1[[1]], sig.level = 0.1)
ou
> corrplot(M, p.mat = res1[[1]], sig.level = 0.01)
Ecrire les p-valeurs
> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "p-value")
Mettre du blanc Ă la place.
> corrplot(M, p.mat = res1[[1]], sig.level = 0.01,insig = "blank")
L’analyse de la variance (ANOVA) est une méthode qui permet d’étudier la variantion de la moyenne \(\mu\) du phénoméne étudié \(Y\) (variable quantitative) selon l’influence d’un ou de plusieurs facteurs d’expérience qualitatifs (traitements…).
Dans le cas oĂ¹ la moyenne n’est influencĂ©e que par un seul facteur (notĂ© \(X\)), il s’agit d’une analyse de la variance Ă un facteur (one way ANOVA). Un facteur est une variable qualitative prĂ©sentant un nombre restreint de modalitĂ©s.
Le nombre de modalités (c’est-a-dire de niveaux) du facteur \(X\) sera noté \(I\). On suppose que \(Y\) suit une loi normale \({\mathcal N}(\mu_i, \sigma^2)\) sur chaque sous-population \(i\) définie par les modalités de \(X.\)
L’objectif est de tester les l’égalité des moyennes de ces \(I\) populations, à savoir tester l’hypothése nulle : \[H_0\,:\, \mu_1=\ldots=\mu_I,\;\;\mbox{ vs}\;\;H_1\;:\;\exists \,i\not j \mu_i\not=\mu_j.\]
Pour chaque type de population \(i\) (ou modalité \(i\) de \(X\) ou groupe \(i\)), on dispose d’un échantillon de \(n_i\) observations de la variable quantitative \(Y\) : \[y_{i,1},\ldots,y_{i,n_i}\]
On considère une base de données sur des scores de préference de deux variétés de huiles d’olive. Ces scores sont donnés par des panels d’expert.
> sensoHuile <- read.csv("/Users/dhafermalouche/Documents/Teaching/CoursDataMining_1516/Visualisation_avec_R/sensoHuile.csv")
> head(sensoHuile)
Echant | Product | Variety | locality | System | Jury | Fusty | Moldy | Viney | Muddy | Métali | Rancid | Fruitty | Bitter | Pungent |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | cheml,bouz,sp | cheml | bouzC | sp | A | 0 | 0 | 0 | 0 | 0 | 0 | 2.0 | 1.23 | 0.50 |
2 | cheml,bouz,sp | cheml | bouzC | sp | B | 0 | 0 | 0 | 0 | 0 | 0 | 2.5 | 1.55 | 0.70 |
2 | cheml,bouz,sp | cheml | bouzC | sp | C | 0 | 0 | 0 | 0 | 0 | 0 | 3.0 | 1.50 | 0.55 |
2 | cheml,bouz,sp | cheml | bouzC | sp | D | 0 | 0 | 0 | 0 | 0 | 0 | 2.8 | 1.30 | 0.80 |
2 | cheml,bouz,sp | cheml | bouzC | sp | E | 0 | 0 | 0 | 0 | 0 | 0 | 3.0 | 1.46 | 0.50 |
2 | cheml,bouz,sp | cheml | bouzC | sp | F | 0 | 0 | 0 | 0 | 0 | 0 | 4.5 | 2.12 | 1.50 |
> attach(sensoHuile)
On veut déterminer s’il y a une différence significative dans le goût fruité entre les différentes variétés.
> library(doBy)
Loading required package: survival
> cdata <- summaryBy(Fruitty ~ Variety, data=sensoHuile, FUN=c(length,mean,sd))
> colnames(cdata)=c("Variety","N","Mean","sd")
> cdata=format(cdata,digits=3)
> cdata
Variety | N | Mean | sd |
---|---|---|---|
arbeq | 144 | 2.89 | 1.34 |
cheml | 136 | 3.22 | 1.07 |
> mon.aov <- aov(Fruitty~Variety)
> summary(mon.aov)
Df Sum Sq Mean Sq F value Pr(>F)
Variety 1 7.5 7.483 5.052 0.0254 *
Residuals 278 411.8 1.481
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> bartlett.test(Fruitty~Variety)
Bartlett test of homogeneity of variances
data: Fruitty by Variety
Bartlett's K-squared = 6.8875, df = 1, p-value = 0.00868
Fruitty
> LFruitty=log(Fruitty)
> bartlett.test(LFruitty~Variety)
Bartlett test of homogeneity of variances
data: LFruitty by Variety
Bartlett's K-squared = 11.383, df = 1, p-value = 0.0007412
Ou on peut utiliser un test (le test de Levene) plus robuste (moins sensible à la condition de normalité)
> library(car)
> leveneTest(LFruitty,Variety)
Df | F value | Pr(>F) | |
---|---|---|---|
group | 1 | 13.22624 | 0.0003289 |
278 | NA | NA |
> oneway.test(Fruitty~Variety, var.equal = F)
One-way analysis of means (not assuming equal variances)
data: Fruitty and Variety
F = 5.1162, num df = 1.00, denom df = 270.65, p-value = 0.0245
On va s’intéresser maintenant aux deux facteurs System
et Variety
> library(doBy)
> int.conf=function(x){
+ z=t.test(x)
+ return(z$conf.int)
+ }
> cdata <- summaryBy(Fruitty ~ Variety+System, data=sensoHuile, FUN=c(length,mean,sd,int.conf))
> colnames(cdata)=c("Variety","System","N","Mean","sd","Lower","Upper")
> cdata1=format(cdata,digits=3)
> cdata1
Variety | System | N | Mean | sd | Lower | Upper |
---|---|---|---|---|---|---|
arbeq | 2p | 48 | 2.92 | 1.269 | 2.55 | 3.28 |
arbeq | 3p | 48 | 3.79 | 1.017 | 3.50 | 4.09 |
arbeq | sp | 48 | 1.98 | 1.065 | 1.67 | 2.28 |
cheml | 2p | 48 | 3.72 | 0.977 | 3.44 | 4.00 |
cheml | 3p | 48 | 3.71 | 0.527 | 3.56 | 3.86 |
cheml | sp | 40 | 2.04 | 0.669 | 1.82 | 2.25 |
> p1<-ggplot(cdata, aes(x=System, y=Mean)) +
+ geom_errorbar(aes(ymin=Lower, ymax=Upper), width=.2) +
+ geom_line(size=8) +
+ geom_point(size=8)+facet_wrap(~Variety)
> p1
ou
> pd <- position_dodge(0.4)
> p1<-ggplot(cdata, aes(x=System, y=Mean,group=Variety,color=Variety)) +
+ geom_errorbar(aes(y=Mean,ymin=Lower, ymax=Upper), width=.2,position=pd) +
+ geom_point(size=4,position=pd)
> p1
ymax not defined: adjusting position using y instead
> mon.aov <- aov(Fruitty~Variety+System)
> summary(mon.aov)
Df Sum Sq Mean Sq F value Pr(>F)
Variety 1 7.48 7.48 7.825 0.00551 **
System 2 147.86 73.93 77.315 < 2e-16 ***
Residuals 276 263.92 0.96
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
On veut effectuer toutes les comparaisons deux à deux en proposant plusieurs méthodes de correction du risque afin de tenir compte du problème de la multiplicité des tests.
> TukeyHSD(mon.aov)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Fruitty ~ Variety + System)
$Variety
diff lwr upr p adj
cheml-arbeq 0.3270874 0.09690877 0.5572661 0.005514
$System
diff lwr upr p adj
3p-2p 0.4315625 0.09896904 0.7641560 0.0068994
sp-2p -1.3001608 -1.64022920 -0.9600924 0.0000000
sp-3p -1.7317233 -2.07179170 -1.3916549 0.0000000
Variety
)> mon.aov1 <- aov(Fruitty~System,data=sensoHuile[Variety=='cheml',])
> summary(mon.aov1)
Df Sum Sq Mean Sq F value Pr(>F)
System 2 79.42 39.71 70.03 <2e-16 ***
Residuals 133 75.41 0.57
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> mon.aov2 <- aov(Fruitty~System,data=sensoHuile[Variety=='arbeq',])
> summary(mon.aov2)
Df Sum Sq Mean Sq F value Pr(>F)
System 2 79.24 39.62 31.44 5.13e-12 ***
Residuals 141 177.71 1.26
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> TukeyHSD(mon.aov1)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == "cheml", ])
$System
diff lwr upr p adj
3p-2p -0.012500 -0.376813 0.351813 0.9963604
sp-2p -1.683333 -2.065428 -1.301239 0.0000000
sp-3p -1.670833 -2.052928 -1.288739 0.0000000
> TukeyHSD(mon.aov2)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety == "arbeq", ])
$System
diff lwr upr p adj
3p-2p 0.8756250 0.3328071 1.4184429 0.0005793
sp-2p -0.9410417 -1.4838596 -0.3982238 0.0001992
sp-3p -1.8166667 -2.3594846 -1.2738488 0.0000000
> x1=TukeyHSD(mon.aov1)$System
> x2=TukeyHSD(mon.aov2)$System
> cdata=rbind.data.frame(x1,x2)
> cdata=cbind.data.frame(c(rep('Chemlali',3),rep('Arbequina',3)),rownames(cdata),cdata)
> colnames(cdata)[1:2]=c('Variety','System')
> cdata$System[4:6]=cdata$System[1:3]
> cdata
Variety | System | diff | lwr | upr | p adj | |
---|---|---|---|---|---|---|
3p-2p | Chemlali | 3p-2p | -0.0125000 | -0.3768130 | 0.3518130 | 0.9963604 |
sp-2p | Chemlali | sp-2p | -1.6833333 | -2.0654280 | -1.3012386 | 0.0000000 |
sp-3p | Chemlali | sp-3p | -1.6708333 | -2.0529280 | -1.2887386 | 0.0000000 |
3p-2p1 | Arbequina | 3p-2p | 0.8756250 | 0.3328071 | 1.4184429 | 0.0005793 |
sp-2p1 | Arbequina | sp-2p | -0.9410417 | -1.4838596 | -0.3982238 | 0.0001992 |
sp-3p1 | Arbequina | sp-3p | -1.8166667 | -2.3594846 | -1.2738488 | 0.0000000 |
> p1<-ggplot(cdata, aes(x=System, y=diff)) +
+ geom_errorbar(aes(ymin=lwr, ymax=upr), width=.2) +
+ geom_line(size=8) +
+ geom_point(size=8)+facet_wrap(~Variety)+geom_hline(y=0,type=2,colour='gray')
> p1
geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
pairwise.t.test()
La fonction pairwise.t.test()
permet d’effectuer toutes les comparaisons deux é deux en proposant plusieurs méthodes de correction du risque \(\alpha\) afin de tenir compte du probléme de la multiplicité des tests
> pair1=pairwise.t.test(Fruitty[Variety=='cheml'],System[Variety=='cheml'],p.adjust="bonf",pool.sd = F)
> pair1
Pairwise comparisons using t tests with non-pooled SD
data: Fruitty[Variety == "cheml"] and System[Variety == "cheml"]
2p 3p
3p 1 -
sp 1.6e-14 < 2e-16
P value adjustment method: bonferroni
> pair2=pairwise.t.test(Fruitty[Variety=='arbeq'],System[Variety=='arbeq'],p.adjust="bonf",pool.sd = F)
> pair2
Pairwise comparisons using t tests with non-pooled SD
data: Fruitty[Variety == "arbeq"] and System[Variety == "arbeq"]
2p 3p
3p 0.00101 -
sp 0.00049 6.9e-13
P value adjustment method: bonferroni
Le package agricolae
> library(agricolae)
> HSD.test(mon.aov1,"System", group=TRUE,console = T)
Study: mon.aov1 ~ "System"
HSD Test for Fruitty
Mean Square Error: 0.5669893
System, means
Fruitty std r Min Max
2p 3.720833 0.9773952 48 1.7 5.1
3p 3.708333 0.5274722 48 2.5 4.8
sp 2.037500 0.6685950 40 1.1 4.5
alpha: 0.05 ; Df Error: 133
Critical Value of Studentized Range: 3.352029
Harmonic Mean of Cell Sizes 45
Honestly Significant Difference: 0.3762608
Means with the same letter are not significantly different.
Groups, Treatments and means
a 2p 3.721
a 3p 3.708
b sp 2.038
> HSD.test(mon.aov2,"System", group=TRUE,console = T)
Study: mon.aov2 ~ "System"
HSD Test for Fruitty
Mean Square Error: 1.260349
System, means
Fruitty std r Min Max
2p 2.916042 1.269298 48 1 5.0
3p 3.791667 1.017262 48 2 5.3
sp 1.975000 1.065414 48 1 4.5
alpha: 0.05 ; Df Error: 141
Critical Value of Studentized Range: 3.349881
Honestly Significant Difference: 0.5428179
Means with the same letter are not significantly different.
Groups, Treatments and means
a 3p 3.792
b 2p 2.916
c sp 1.975
ou encore
> library(multcomp)
Loading required package: mvtnorm
Loading required package: TH.data
> tuk1 <- glht(mon.aov1, linfct = mcp(System = "Tukey"))
> summary(tuk1)
Simultaneous Tests for General Linear Hypotheses
Multiple Comparisons of Means: Tukey Contrasts
Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety ==
"cheml", ])
Linear Hypotheses:
Estimate Std. Error t value Pr(>|t|)
3p - 2p == 0 -0.0125 0.1537 -0.081 0.996
sp - 2p == 0 -1.6833 0.1612 -10.442 <1e-06 ***
sp - 3p == 0 -1.6708 0.1612 -10.365 <1e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Adjusted p values reported -- single-step method)
> tuk1.cld <- cld(tuk1)
> tuk2 <- glht(mon.aov2, linfct = mcp(System = "Tukey"))
> summary(tuk2)
Simultaneous Tests for General Linear Hypotheses
Multiple Comparisons of Means: Tukey Contrasts
Fit: aov(formula = Fruitty ~ System, data = sensoHuile[Variety ==
"arbeq", ])
Linear Hypotheses:
Estimate Std. Error t value Pr(>|t|)
3p - 2p == 0 0.8756 0.2292 3.821 0.000578 ***
sp - 2p == 0 -0.9410 0.2292 -4.106 0.000231 ***
sp - 3p == 0 -1.8167 0.2292 -7.927 < 1e-04 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Adjusted p values reported -- single-step method)
> tuk1.cld <- cld(tuk1)
> tuk2.cld <- cld(tuk2)
> opar <- par(mai=c(1,1,1.5,1),mfrow = c(1,2))
> cols1=as.numeric(factor(tuk1.cld$mcletters$Letters))+1
> cols2=as.numeric(factor(tuk2.cld$mcletters$Letters))+1
> plot(tuk1.cld,col=cols1,xlab="Chemlali")
> plot(tuk2.cld,col=cols2,xlab="Arbequina")
> par(opar)