# Example counts: one row per category, one column per facility.
# Values are laid out column by column (first the "Cemetery" column,
# then the "Cremetorium" column).
example_data <- matrix(
  c(3, 5, 9, 3, 5, 9,
    5, 6, 3, 4, 3, 4),
  ncol = 2,
  dimnames = list(
    c("Dogs", "Cats", "Fish", "Gerbils", "Horses", "Rocks"),
    c("Cemetery", "Cremetorium")
  )
)
# Draw a dot chart of a two-dimensional table of numbers.
#
# Every row of `data` gets a dashed horizontal guide line at y = row index,
# and every column is drawn as one series of points along those lines
# (via matplot()).
#
# Arguments:
#   data       Matrix (or other two-dimensional object that as.matrix()
#              accepts). Anything without two dimensions is rejected with a
#              printed message.
#   main, xlab, ylab, xlim, ylim, pch, cex
#              Forwarded unchanged to matplot(); NULL keeps matplot()'s
#              own defaults.
#   normalize  TRUE rescales all values to [0, 1] using the overall minimum
#              and maximum of `data`. NULL or FALSE plots the raw values.
#   col        Colours for the column series. NULL or FALSE picks random
#              colours from colors(), one per column.
#   labels     Optional y-axis labels, one per row. NULL uses
#              rownames(data), falling back to the row numbers.
#
# Returns the value of the final axis() call, or invisible NULL when
# `data` is not two-dimensional.
mydotchart <- function(data, main = NULL, xlab = NULL, ylab = NULL,
                       xlim = NULL, ylim = NULL, normalize = NULL,
                       col = NULL, pch = NULL, cex = NULL, labels = NULL) {
  if (length(dim(data)) != 2L) {
    cat('You need to pass a matrix')
    return(invisible(NULL))
  }

  values <- as.matrix(data)
  rows <- seq_len(nrow(values))

  row_labels <- if (is.null(labels)) rownames(values) else labels
  if (is.null(row_labels)) {
    row_labels <- rows
  }
  if (length(row_labels) != length(rows)) {
    stop("'labels' must supply one label per row of 'data'", call. = FALSE)
  }

  # `normalize == F` and `col == F` are unsafe inside if(): they yield a
  # zero-length condition for NULL and a multi-element condition for a
  # colour vector. isTRUE()/is.null()/identical() always return one value.
  do_normalize <- isTRUE(normalize)
  if (is.null(col) || identical(col, FALSE)) {
    n_series <- ncol(values)
    col <- sample(colors(), n_series, replace = n_series > length(colors()))
  }

  if (do_normalize) {
    bounds <- range(values, na.rm = TRUE)
    span <- bounds[2] - bounds[1]
    # A constant matrix has zero span; map it to 0 instead of dividing by 0.
    values <- if (is.finite(span) && span > 0) {
      (values - bounds[1]) / span
    } else {
      values - bounds[1]
    }
    guide_end <- 1
  } else {
    guide_end <- max(values, na.rm = TRUE)
  }

  matplot(values, rows,
          main = main, xlab = xlab, ylab = ylab, xlim = xlim, ylim = ylim,
          col = col, pch = pch, cex = cex, yaxt = "n", cex.axis = 0.9)
  # Guide lines and axis labels sit at the row positions used for the points.
  segments(0, rows, guide_end, rows, lty = 2)
  axis(2, at = rows, labels = row_labels, las = 1)
}
# Four variants on a 2 x 2 grid: normalized vs. raw values, random vs.
# explicit colours.
par(mfrow = c(2, 2))
mydotchart(
  example_data,
  main = " normalized & w/random colors",
  xlim = c(0, 1), ylim = c(0, 6),
  xlab = "number of animals", ylab = "",
  normalize = TRUE, col = FALSE, pch = 16, cex = 1.2
)
mydotchart(
  example_data,
  main = "normalized w/ red and yellow",
  xlim = c(0, 1), ylim = c(0, 6),
  xlab = "number of animals", ylab = "",
  normalize = TRUE, col = c("red", "yellow"), pch = 16, cex = 1.2
)
## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used
## Warning in if (col == F) {: the condition has length > 1 and only the first
## element will be used
mydotchart(
  example_data,
  main = "no normalization & w/ red and yellow",
  xlim = c(0, 6), ylim = c(0, 6),
  xlab = "number of animals", ylab = "",
  normalize = FALSE, col = c("red", "yellow"), pch = 16, cex = 1.2
)
## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used
## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used
mydotchart(
  example_data,
  main = "no normalization and random colors",
  xlim = c(0, 6), ylim = c(0, 6),
  xlab = "number of animals", ylab = "",
  normalize = FALSE, col = FALSE, pch = 16, cex = 1.2
)
# What is this warning? Does it come from if ((normalize == F & col == F)),
# as if R could only use the first element of the comparison and not the
# rest? Am I right?
then
# Simulated survey: five questions (q1..q5), 100 answers each, drawn with
# replacement from the letters a..j. The seed makes the draws reproducible;
# the questions are sampled in order q1, q2, ..., q5.
set.seed(100)
data2 <- data.frame(
  lapply(
    setNames(nm = paste0("q", 1:5)),
    function(question) sample(letters[1:10], 100, replace = TRUE)
  )
)
# Tabulate the answers of every question (one table per column).
datatable2 <- apply(X = data2, MARGIN = 2, FUN = table)
# Raw counts with random colours; axis limits are derived from the first
# column of the table.
mydotchart(
  datatable2,
  main = "no normalization and random colors",
  xlim = c(0, max(datatable2[, 1])),
  ylim = c(0, length(datatable2[, 1])),
  xlab = "values", ylab = "",
  normalize = FALSE, col = FALSE, pch = 16, cex = 1.2
)
# Calls that are currently disabled:
# mydotchart(datatable2)
# mydotchart(datatable2[, 1:2])
# mydotchart(as.matrix(datatable2[, 1]))
# mydotchart(datatable2[, 1:3])
# mydotchart(datatable2, col = 1:5)
# mydotchart(datatable2, col = 1:5, pch = 16)
# mydotchart(datatable2, col = 1:5, pch = 16, cex = 2.5, main = "Everything",
#            xlab = "Value", ylab = "Category")
# The calls above all fail the same way: execution stops because the
# condition in if (normalize == F & col == F) has length 0. I tried
# if (is.null(normalize) & is.null(col)) instead; that works for
# mydotchart(datatable2) but no longer works for the previous charts.
# What should I do?
mydotchart(datatable2[, 1])
## You need to pass a matrix
# These two calls work:
mydotchart(
  datatable2,
  col = 1:5, pch = 16, cex = 2.5,
  main = "Everything normalized",
  xlab = "Value", ylab = "Category",
  normalize = TRUE
)
## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used
## Warning in if (col == F) {: the condition has length > 1 and only the first
## element will be used
#2. Correlating word frequency with SCRABBLE scores
The following data frame specifies, for each letter, its frequency in English, the points it earns in Scrabble, and the number of Scrabble tiles bearing it.
# Per-letter reference data, in alphabetical order (A..Z).

# Letter frequencies, entered as percentages and converted to proportions.
lf <- c(
  8.167, 1.492, 2.782, 4.253, 12.702, 2.228, 2.015,
  6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749,
  7.507, 1.929, 0.095, 5.987, 6.327, 9.056, 2.758,
  0.978, 2.36, 0.15, 1.974, 0.074
) / 100

# Scrabble points awarded for each letter.
pts <- c(
  1, 3, 3, 2, 1, 4, 2,
  4, 1, 8, 5, 1, 3, 1,
  1, 3, 10, 1, 1, 1, 1,
  4, 4, 8, 4, 10
)

# Number of Scrabble tiles for each letter.
tiles <- c(
  9, 2, 2, 4, 12, 2, 3,
  2, 9, 1, 1, 4, 2, 6,
  8, 2, 1, 6, 4, 6, 4,
  2, 2, 1, 2, 1
)

# One row per letter; the first column is named "LETTERS".
lf.table <- data.frame(
  LETTERS = LETTERS,
  freq = lf,
  points = pts,
  ntiles = tiles
)
For any word, you can split it into its letters and then compute some statistics based on this scoring. The following function computes the sum of the inverse letter frequencies of the letters, the total Scrabble points, the mean number of tiles of the letters in the word, and the length of the word:
# Compute letter-based statistics for a single word.
#
# The word is upper-cased and split into single characters, and each
# character is looked up in the global `lf.table` (rows in LETTERS order).
#
# Arguments:
#   word  A single word; case is ignored. Only the letters A-Z are
#         supported.
#
# Returns a list with:
#   suminvfreq  sum of 1 / frequency over the word's letters
#   points      total of the letters' `points` values
#   meantiles   mean of the letters' `ntiles` values (NaN for an empty word)
#   length      number of characters in the word
#
# Stops with an error if the word contains a character that is not in
# LETTERS.
scoreme <- function(word) {
  # Base toupper() replaces the splus2R dependency.
  lets <- strsplit(toupper(word), "")[[1]]

  # match() resolves every letter to its row in one vectorized step.
  index <- match(lets, LETTERS)
  if (anyNA(index)) {
    stop("'word' may only contain the letters A-Z: ", word, call. = FALSE)
  }

  list(
    suminvfreq = sum(1 / lf.table$freq[index]),
    points = sum(lf.table$points[index]),
    meantiles = mean(lf.table$ntiles[index]),
    length = length(lets)
  )
}
then
# Score a single word and inspect the resulting list.
horses <- scoreme(word = "HORSES")
horses
## $suminvfreq
## [1] 85.91667
##
## $points
## [1] 9
##
## $meantiles
## [1] 6
##
## $length
## [1] 6

# Individual statistics can be extracted by name.
print(horses[["points"]])
## [1] 9
then
# Sample of words with their corpus rank and frequency. The inline table is
# parsed with a header row, then the columns are put in the order
# word, rank, frequency.
test <- read.table(text='rank word frequency
1081 CUP 1441306
2310 FOUND 573305
5285 BUTTERFLY 171410
7371 brew 94904
11821 CUMBERSOME 39698
17331 useable 17790
18526 WHITTLE 15315
25416 SPINY 7207
27381 uppercase 5959
37281 halfnaked 2459
47381 bellhop 1106
57351 tetherball 425
7309 attic 2711
17311 tearful 542
27303 tailgate 198
37310 hydraulically 78
47309 unsparing 35
57309 embryogenesis 22 ', header = TRUE)[, c("word", "rank", "frequency")]

# Placeholder columns for the per-word statistics (columns 4 to 7).
test[c("meantiles", "suminvfreq", "points", "length")] <- NA
then
# Fill the statistic columns of `test`, one word (row) at a time.
# Each word is scored once and the four results are stored by column name,
# so the loop does not depend on the position of the columns.
for (i in seq_len(nrow(test))) {
  word_score <- scoreme(test[i, 1])
  test$meantiles[i] <- word_score$meantiles
  test$suminvfreq[i] <- word_score$suminvfreq
  test$points[i] <- word_score$points
  test$length[i] <- word_score$length
}
# 2 x 4 grid of scatter plots; each title shows the Pearson correlation
# rounded to three decimals.
par(mfrow = c(2, 4))

# Top row: each word statistic against the word's rank position.
invisible(lapply(
  c("length", "meantiles", "suminvfreq", "points"),
  function(stat) {
    plot(
      test$rank, test[[stat]],
      main = sprintf("r = %g", round(cor(test$rank, test[[stat]]), 3)),
      xlab = "word rank position",
      ylab = stat
    )
  }
))

# Bottom row: word frequency (scaled by 1/10000) against each statistic.
invisible(lapply(
  c("length", "meantiles", "suminvfreq", "points"),
  function(stat) {
    plot(
      test[[stat]], test$frequency / 10000,
      main = sprintf("r = %g", round(cor(test$frequency, test[[stat]]), 3)),
      xlab = stat,
      ylab = "frequency / 10000"
    )
  }
))
Here, we see that the higher the frequency, the shorter the word; probably also, the longer the word, the lower the frequency –> fewer points. We can also see that, whereas frequency is always negatively correlated with our statistics, rank is not. This is because the higher the frequency of a certain event (or word, in this case), the lower its rank, as in Zipf’s law. In our plots, the order in which all our statistical parameters are displayed is easy to see.
# Word frequency (scaled by 1/10000) against rank, drawn as connected
# points, with the correlation in the title and an annotation inside the plot.
with(test, plot(
  rank, frequency / 10000,
  main = sprintf("r = %g", round(cor(rank, frequency), 3)),
  xlab = "word rank position",
  ylab = "word frequency / 10000",
  type = "o"
))
text(
  x = 30000, y = 100,
  labels = "frequency is inversely proportional to rank as in Zipf's law"
)