# getwd() + setwd()
R.home()
## [1] "/Library/Frameworks/R.framework/Resources"
# install.packages(c("rpart", "chron", "Hmisc", "Design",
# "Matrix", "lme4", "coda", "e1071", "zipfR", "ape",
# "languageR"), repos = "http://cran.r-project.org") # Load package in one line
library(languageR)
head(verbs,10) # R index begins with 1
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 1 NP feed animate inanimate 2.6390573
## 2 NP give animate inanimate 1.0986123
## 3 NP give animate inanimate 2.5649494
## 4 NP give animate inanimate 1.6094379
## 5 NP offer animate inanimate 1.0986123
## 6 NP give animate inanimate 1.3862944
## 7 NP pay animate inanimate 1.3862944
## 8 NP bring animate inanimate 0.0000000
## 9 NP teach animate inanimate 2.3978953
## 10 NP give animate inanimate 0.6931472
exp(2.639) # with e as base
## [1] 13.9992
log(14) # again with e as index
## [1] 2.639057
write.table(verbs,"delete.txt")
verbs = read.table("delete.txt")
head(verbs$LengthOfTheme,5) # get a column
## [1] 2.639057 1.098612 2.564949 1.609438 1.098612
rs = c(638, 799, 390, 569, 567) # not very useful though
verbs[rs, ] # Several rows
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 638 PP pay animate inanimate 0.6931472
## 799 PP sell animate inanimate 1.3862944
## 390 NP lend animate animate 0.6931472
## 569 PP sell animate inanimate 1.6094379
## 567 PP send inanimate inanimate 1.3862944
verbs[rs, 1:3] # Display several columns
## RealizationOfRec Verb AnimacyOfRec
## 638 PP pay animate
## 799 PP sell animate
## 390 NP lend animate
## 569 PP sell animate
## 567 PP send inanimate
# \several columns
verbs[rs, c("RealizationOfRec" , "Verb" , "AnimacyOfRec" )]
## RealizationOfRec Verb AnimacyOfRec
## 638 PP pay animate
## 799 PP sell animate
## 390 NP lend animate
## 569 PP sell animate
## 567 PP send inanimate
# get rows and columns with condition (filtering)
verbs[verbs$AnimacyOfTheme == "animate" , ]
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 58 NP give animate animate 1.0986123
## 100 NP give animate animate 2.8903718
## 143 NP give inanimate animate 2.6390573
## 390 NP lend animate animate 0.6931472
## 506 NP give animate animate 1.9459101
## 736 PP trade animate animate 1.6094379
subset(verbs, AnimacyOfTheme == "animate" )
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 58 NP give animate animate 1.0986123
## 100 NP give animate animate 2.8903718
## 143 NP give inanimate animate 2.6390573
## 390 NP lend animate animate 0.6931472
## 506 NP give animate animate 1.9459101
## 736 PP trade animate animate 1.6094379
subset(verbs, AnimacyOfTheme == "animate" & verbs$LengthOfTheme >2)
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 100 NP give animate animate 2.890372
## 143 NP give inanimate animate 2.639057
# logical operators Get a subset
# convention to subset
verbs.rs = verbs[rs,]
verbs.rs$AnimacyOfRec = as.character(verbs.rs$AnimacyOfRec)
verbs.rs$AnimacyOfRec # now with quotes after casting
## [1] "animate" "animate" "animate" "animate" "inanimate"
verbs.rs$AnimacyOfRec = as.factor(verbs.rs$AnimacyOfRec)
# eliminate redundant levels verbs.rs2$AnimacyofRec[drop=TRUE]
verbs.rs[order(verbs.rs$RealizationOfRec), ] # ordering
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 390 NP lend animate animate 0.6931472
## 638 PP pay animate inanimate 0.6931472
## 799 PP sell animate inanimate 1.3862944
## 569 PP sell animate inanimate 1.6094379
## 567 PP send inanimate inanimate 1.3862944
verbs.rs[order(verbs.rs$Verb, verbs.rs$LengthOfTheme), ] # ordering
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 390 NP lend animate animate 0.6931472
## 638 PP pay animate inanimate 0.6931472
## 799 PP sell animate inanimate 1.3862944
## 569 PP sell animate inanimate 1.6094379
## 567 PP send inanimate inanimate 1.3862944
order(verbs.rs$Verb) # the no.3 item goes the first in the array
## [1] 3 1 2 4 5
verbs.rs
## RealizationOfRec Verb AnimacyOfRec AnimacyOfTheme LengthOfTheme
## 638 PP pay animate inanimate 0.6931472
## 799 PP sell animate inanimate 1.3862944
## 390 NP lend animate animate 0.6931472
## 569 PP sell animate inanimate 1.6094379
## 567 PP send inanimate inanimate 1.3862944
v = c("pay" , "sell" , " lend" , "sell" , "send" , "sell" , "give" , "give" , "pay" , "cost")
order(v)
## [1] 3 10 7 8 1 9 2 4 6 5
v[order(v)] # ici parce qu'il y a un espace devant lend
## [1] " lend" "cost" "give" "give" "pay" "pay" "sell" "sell"
## [9] "sell" "send"
v = c("pay" , "sell" , "lend" , "sell" , "send" , "sell" , "give" , "give" , "pay" , "cost")
v[order(v)] # voila
## [1] "cost" "give" "give" "lend" "pay" "pay" "sell" "sell" "sell" "send"
# another way
sort(v)
## [1] "cost" "give" "give" "lend" "pay" "pay" "sell" "sell" "sell" "send"
nchar(c(" antidisestablishmentarianém" , " a" ))# il compte l'espace
## [1] 28 2
nchar(c("一只大猪" , "a" )) # le chinois marche aussi en Mac mais pas en windows,windows de merde
## [1] 4 1
# When R is installed, it checks the System encoding. On Linux and Mac OS, the system encoding is usually UTF-8, and R uses that. However Windows is different
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reshape_0.8.7 foreign_0.8-69 ggplot2_2.2.1 DSUR.noof_0.1.1
## [5] languageR_1.4.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 knitr_1.17 magrittr_1.5 munsell_0.4.3
## [5] colorspace_1.3-2 rlang_0.1.4 stringr_1.2.0 plyr_1.8.4
## [9] tools_3.4.3 grid_3.4.3 gtable_0.2.0 htmltools_0.3.6
## [13] lazyeval_0.2.1 digest_0.6.15 rprojroot_1.2 tibble_1.3.4
## [17] evaluate_0.10.1 rmarkdown_1.8 stringi_1.1.6 compiler_3.4.3
## [21] scales_0.5.0 backports_1.1.1
verbs.rs$Length = nchar(as.character(verbs.rs$Verb)) # Add a column
verbs.rs[1:4, c("Verb" , "Length" )]
## Verb Length
## 638 pay 3
## 799 sell 4
## 390 lend 4
## 569 sell 4
xtabs(~RealizationOfRec + AnimacyOfRec, data = verbs)
## AnimacyOfRec
## RealizationOfRec animate inanimate
## NP 521 34
## PP 301 47
# Matrice de confusion
# dependent variable ∼ predictor 1 + predictor 2 + . . .
# more than 2 factore are possible
xtabs(~ AnimacyOfRec + AnimacyOfTheme + RealizationOfRec , data = verbs)
## , , RealizationOfRec = NP
##
## AnimacyOfTheme
## AnimacyOfRec animate inanimate
## animate 4 517
## inanimate 1 33
##
## , , RealizationOfRec = PP
##
## AnimacyOfTheme
## AnimacyOfRec animate inanimate
## animate 1 300
## inanimate 0 47
# Poll subjects according to woman or man for example
# Subset revisited
verbs.xtabs = xtabs(~ AnimacyOfRec + RealizationOfRec , data = verbs,subset = AnimacyOfTheme != "animate")
verbs.xtabs # quand anime y a plus de chance que ce soit un NP (ou l'inverse, enfin correlation quoi)
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 517 300
## inanimate 33 47
# allez on va vers la proportion sur toutes les entrees
verbs.xtabs/sum(verbs.xtabs)
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 0.57636566 0.33444816
## inanimate 0.03678930 0.05239688
# et pourcentage
verbs.xtabs/sum(verbs.xtabs) * 100
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 57.636566 33.444816
## inanimate 3.678930 5.239688
prop.table(verbs.xtabs, 1) # with respect to row proportion table
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 0.6328029 0.3671971
## inanimate 0.4125000 0.5875000
prop.table(verbs.xtabs, 2) # with respect to column
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 0.9400000 0.8645533
## inanimate 0.0600000 0.1354467
# For animate rec, the NP realisation is more likely
# tapply(verbs$LengthOfTheme, verbs$AnimacyOfRec, mean) # erreur bizarre mais bon
tapply(verbs$LengthOfTheme, verbs$AnimacyOfRec, mean)[1]
## animate
## 1.540278
tapply(verbs$LengthOfTheme, verbs$AnimacyOfRec, mean)[2]
## inanimate
## 1.07113
# filtering
# two factors tapply instead of only AnimacyOfRec
with(verbs, tapply(LengthOfTheme, list(AnimacyOfRec, AnimacyOfTheme), mean))
## animate inanimate
## animate 1.647496 1.539622
## inanimate 2.639057 1.051531
heid[1:5, ] # log reaction times for 26 subjects to 40 words we expect that neologisms with a higher base frequency elicit shorter reaction times.
## Subject Word RT BaseFrequency
## 1 pp1 basaalheid 6.69 3.56
## 2 pp1 markantheid 6.81 5.16
## 3 pp1 ontroerdheid 6.51 5.55
## 4 pp1 contentheid 6.58 4.50
## 5 pp1 riantheid 6.86 4.53
# Psycholinguistic studies often report two analyses, one for reaction times averaged over subjects, and one for reaction times averaged over words. So let's see.
heid2 = aggregate(heid$RT, list(heid$Word), mean) # a list
heid3 = tapply(heid$RT,heid$Word,mean)
head(heid3,5)
## aftandsheid antiekheid banaalheid basaalheid bebrildheid
## 6.705000 6.542353 6.587727 6.585714 6.673333
heid2[1:5,]
## Group.1 x
## 1 aftandsheid 6.705000
## 2 antiekheid 6.542353
## 3 banaalheid 6.587727
## 4 basaalheid 6.585714
## 5 bebrildheid 6.673333
aggregate(verbs$LengthOfTheme, list(verbs$AnimacyOfRec), mean) # fait attention a list()
## Group.1 x
## 1 animate 1.540278
## 2 inanimate 1.071130
colnames(heid2) = c("Word" , "MeanRT" )
items = heid[, c("Word" , "BaseFrequency" )]
head(items, 5) # trop de rows
## Word BaseFrequency
## 1 basaalheid 3.56
## 2 markantheid 5.16
## 3 ontroerdheid 5.55
## 4 contentheid 4.50
## 5 riantheid 4.53
items = unique(items) # ok !
heid2 = merge(heid2, items, by.x = "Word" , by.y = "Word" ) # merge two dataframes by keyword
head(heid2, n= 4) # c'est beau
## Word MeanRT BaseFrequency
## 1 aftandsheid 6.705000 4.20
## 2 antiekheid 6.542353 6.75
## 3 banaalheid 6.587727 5.74
## 4 basaalheid 6.585714 3.56
heid3 = aggregate(heid$RT, list(heid$Word, heid$BaseFrequency), mean) # tout en une seule commande, mais notez l'usage de merge
# how a measure varies depending on two factors
colnames(heid3) = c("Word" , "BaseFrequency" , "MeanRT" )
head(heid3[order(heid3$Word),], 4)
## Word BaseFrequency MeanRT
## 20 aftandsheid 4.20 6.705000
## 39 antiekheid 6.75 6.542353
## 34 banaalheid 5.74 6.587727
## 15 basaalheid 3.56 6.585714
# heid3$sort(heid3$Word) # ici ca ne marche pas car the $ signmarche pas en recursion
objects()
## [1] "heid2" "heid3" "items" "rs" "v"
## [6] "verbs" "verbs.rs" "verbs.xtabs"
# rm(verbs.rs) q() if answered no then dead, no objects available next time
colnames(spanishMeta)
## [1] "Author" "YearOfBirth" "TextName" "PubDate" "Nwords"
## [6] "FullName"
nrow(spanishMeta)
## [1] 15
spanishMeta[16,] = spanishMeta[15,] # just a test
aggregate(spanishMeta$TextName,list(spanishMeta$Author),length)
## Group.1 x
## 1 C 5
## 2 M 5
## 3 V 6
with(spanishMeta,aggregate(TextName,list(Author),length))
## Group.1 x
## 1 C 5
## 2 M 5
## 3 V 6
# with(spanishMeta,tapply(TextName,liPust(Author),length)) # without column
head(spanishMeta)
## Author YearOfBirth TextName PubDate Nwords FullName
## 1 C 1916 X14458gll 1983 2972 Cela
## 2 C 1916 X14459gll 1951 3040 Cela
## 3 C 1916 X14460gll 1956 3066 Cela
## 4 C 1916 X14461gll 1948 3044 Cela
## 5 C 1916 X14462gll 1942 3053 Cela
## 6 M 1943 X14463gll 1986 3013 Mendoza
spanishMeta[order(spanishMeta$YearOfBirth,spanishMeta$Nwords),]
## Author YearOfBirth TextName PubDate Nwords FullName
## 1 C 1916 X14458gll 1983 2972 Cela
## 2 C 1916 X14459gll 1951 3040 Cela
## 4 C 1916 X14461gll 1948 3044 Cela
## 5 C 1916 X14462gll 1942 3053 Cela
## 3 C 1916 X14460gll 1956 3066 Cela
## 14 V 1936 X14475gll 1987 3016 VargasLLosa
## 13 V 1936 X14474gll 1977 3020 VargasLLosa
## 11 V 1936 X14472gll 1965 3037 VargasLLosa
## 15 V 1936 X14476gll 1981 3054 VargasLLosa
## 16 V 1936 X14476gll 1981 3054 VargasLLosa
## 12 V 1936 X14473gll 1963 3067 VargasLLosa
## 6 M 1943 X14463gll 1986 3013 Mendoza
## 9 M 1943 X14466gll 1982 3039 Mendoza
## 8 M 1943 X14465gll 1989 3042 Mendoza
## 10 M 1943 X14467gll 2002 3045 Mendoza
## 7 M 1943 X14464gll 1992 3049 Mendoza
PublicationData = spanishMeta$PubDate
sort(PublicationData)
## [1] 1942 1948 1951 1956 1963 1965 1977 1981 1981 1982 1983 1986 1987 1989
## [15] 1992 2002
sort(PublicationData,decreasing = T)
## [1] 2002 1992 1989 1987 1986 1983 1982 1981 1981 1977 1965 1963 1956 1951
## [15] 1948 1942
?sort
rownames(spanishMeta)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16"
(spanishMeta[spanishMeta$PubDate<=1980,]) # just a test
## Author YearOfBirth TextName PubDate Nwords FullName
## 2 C 1916 X14459gll 1951 3040 Cela
## 3 C 1916 X14460gll 1956 3066 Cela
## 4 C 1916 X14461gll 1948 3044 Cela
## 5 C 1916 X14462gll 1942 3053 Cela
## 11 V 1936 X14472gll 1965 3037 VargasLLosa
## 12 V 1936 X14473gll 1963 3067 VargasLLosa
## 13 V 1936 X14474gll 1977 3020 VargasLLosa
mean(spanishMeta$PubDate)
## [1] 1974.062
sum(spanishMeta$PubDate)/length(spanishMeta$PubDate)
## [1] 1974.062
composer = data.frame(Author = c("Cela","Mendoza","VargasLLosa"),Favorite = c("Stravinsky","Bach","Villa-Lobos"))
composer
## Author Favorite
## 1 Cela Stravinsky
## 2 Mendoza Bach
## 3 VargasLLosa Villa-Lobos
merge(spanishMeta,composer,by.x = "FullName", by.y = "Author" )
## FullName Author YearOfBirth TextName PubDate Nwords Favorite
## 1 Cela C 1916 X14458gll 1983 2972 Stravinsky
## 2 Cela C 1916 X14459gll 1951 3040 Stravinsky
## 3 Cela C 1916 X14460gll 1956 3066 Stravinsky
## 4 Cela C 1916 X14461gll 1948 3044 Stravinsky
## 5 Cela C 1916 X14462gll 1942 3053 Stravinsky
## 6 Mendoza M 1943 X14463gll 1986 3013 Bach
## 7 Mendoza M 1943 X14464gll 1992 3049 Bach
## 8 Mendoza M 1943 X14465gll 1989 3042 Bach
## 9 Mendoza M 1943 X14466gll 1982 3039 Bach
## 10 Mendoza M 1943 X14467gll 2002 3045 Bach
## 11 VargasLLosa V 1936 X14472gll 1965 3037 Villa-Lobos
## 12 VargasLLosa V 1936 X14473gll 1963 3067 Villa-Lobos
## 13 VargasLLosa V 1936 X14474gll 1977 3020 Villa-Lobos
## 14 VargasLLosa V 1936 X14475gll 1987 3016 Villa-Lobos
## 15 VargasLLosa V 1936 X14476gll 1981 3054 Villa-Lobos
## 16 VargasLLosa V 1936 X14476gll 1981 3054 Villa-Lobos
xtabs(~ratings$Length) # grouped into bins
## ratings$Length
## 3 4 5 6 7 8 9 10
## 8 10 19 17 9 10 5 3
ratings$Length
## [1] 6 3 5 7 9 7 6 6 3 6 3 8 10 9 8 5 9 5 6 3 6 7 5
## [24] 9 8 3 7 6 5 8 8 3 4 7 4 5 5 4 8 5 4 4 5 7 4 6
## [47] 5 4 6 5 5 8 7 5 6 3 7 6 4 3 6 9 6 6 8 5 5 5 6
## [70] 5 8 5 10 4 6 8 7 6 4 5 10
# barplot for discrete values
barplot(xtabs(~ratings$Length),xlab="Word length", col = "grey")

barplot(ratings$Length,xlab="Word length", col = "grey") # without xtabs

library(MASS) # histogram for continous variable and proportions
# detach(package:Mass) detach a package
truehist(ratings$Length, xlab=" word length" , col=" grey" )

par(mfrow = c(1, 2)) # doesn't work on Jupyter # correction : yes it works needs to put it into blocks
truehist(ratings$Length, xlab=" word length" , col=" grey" )
jpeg("barplot.jpeg",width = 400,height = 420) # doesn't work
barplot(ratings$Length,xlab="Word length", col = "grey") # without xtabs
# The function truehist() that we used above has defaults that are chosen to minimize the risk of obtaining a rather arbitrarily shaped histogram (see also (Haerdle, 1991; Venables and Ripley, 2003)).
# But Problems with real numbers # Not smooth enough
# here for the density curve
h = hist(lexdec$RT, freq = FALSE, plot = FALSE) # make an object without plotting FALSe = total area = 1
## Warning in hist.default(lexdec$RT, freq = FALSE, plot = FALSE): argument
## 'freq' is not made use of
d = density(lexdec$RT)
xlimit = range(h$breaks, d$x) # breaks = edges of bars
print(xlimit) # 5.8 ~ 7.6
## [1] 5.697477 7.718780
ylimit = range(0, h$density, d$y)
hist(lexdec$RT, freq=FALSE, xlim=xlimit, ylim=ylimit, main=" " , xlab=" log RT" , ylab=" " , col=" lightgrey" , border=" darkgrey" , breaks = seq(5.8, 7.6, by = 0.1))
lines(d$x, d$y)
lines(d)
plot(h)
plot(d)
plot(sort(lexdec$RT), ylab = " log RT" ) # x = rank (which row in the ordered vector)
plot(quantile(lexdec$RT), xaxt = "n" , xlab = " Quartiles" , ylab = " log RT" )
mtext(c("0%" , "25%" , "50%" , "75%" , "100%" ), side = 1, at = 1:5, line = 1, cex = 0.7)
plot(quantile(lexdec$RT, seq(0, 1, 0.1)), xaxt = "n" , xlab = "Deciles" , ylab = "log RT" )
mtext(paste(seq(0, 100, 10), rep("%" , 11), sep = " " ), side = 1, at = 1:11, line = 1, cex = 0.7, las = 2)
quantile(lexdec$RT, seq(0, 1, 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70%
## 5.828946 6.122493 6.188264 6.248816 6.297109 6.345636 6.395262 6.459904
## 80% 90% 100%
## 6.553933 6.721907 7.587311
paste(" a" , " b" , " c" , sep = "" )
## [1] " a b c"
paste(seq(0, 100, 10), rep(" %" , 11), sep = "" )
## [1] "0 %" "10 %" "20 %" "30 %" "40 %" "50 %" "60 %" "70 %"
## [9] "80 %" "90 %" "100 %"
boxplot(exp(lexdec$RT))# boxplot first median third and 1.5 * Interquartile range + potential outliers
boxplot(lexdec$RT) # the distribution is no longer the same less outliers and the box is more centered
# How to read a boxplot http://informationandvisualization.de/blog/box-plot
# How to read a boxplot http://informationandvisualization.de/blog/box-plot#
# Without the logarithmic transformation, just a few extreme outliers might dominate the outcome, partially or even completely obscuring the main trends characterizing the majority of data points.
# In order to facilitate interpretation, it is useful to use as transformation −1000/RT
# Plot two or more variables contingency table
verbs.xtabs = xtabs( ~ AnimacyOfRec + RealizationOfRec, data = verbs[verbs$AnimacyOfTheme!= "animate",])
verbs.xtabs
## RealizationOfRec
## AnimacyOfRec NP PP
## animate 517 300
## inanimate 33 47
par(mfrow = c(1, 2))
barplot(verbs.xtabs, legend.text=c("anim" , "inanim" ))
barplot(verbs.xtabs, beside = T, legend.text = rownames(verbs.xtabs))
par(mfrow = c(1, 1)) # useful for four different speeds, the second one is better
# Three variables contingency table
verbs.xtabs = xtabs(~AnimacyOfRec ++ AccessOfRec + RealizationOfRecipient, data = dative)
verbs.xtabs
## , , RealizationOfRecipient = NP
##
## AccessOfRec
## AnimacyOfRec accessible given new
## animate 290 1931 78
## inanimate 11 99 5
##
## , , RealizationOfRecipient = PP
##
## AccessOfRec
## AnimacyOfRec accessible given new
## animate 259 239 227
## inanimate 55 33 36
mosaicplot(verbs.xtabs, main = " dative" ) # pas facile a interpreter, j'aime pas. Pour des connards ca
verbs.xtabs
## , , RealizationOfRecipient = NP
##
## AccessOfRec
## AnimacyOfRec accessible given new
## animate 290 1931 78
## inanimate 11 99 5
##
## , , RealizationOfRecipient = PP
##
## AccessOfRec
## AnimacyOfRec accessible given new
## animate 259 239 227
## inanimate 55 33 36