Problem 3

# Small demonstration matrix: one row per category, one column per series.
# Values are listed column by column (matrix() fills column-wise by default).
example_data <- matrix(
  c(3, 5, 9, 3, 5, 9,
    5, 6, 3, 4, 3, 4),
  ncol = 2,
  dimnames = list(
    c("Dogs", "Cats", "Fish", "Gerbils", "Horses", "Rocks"),
    c("Cemetery", "Cremetorium")
  )
)


# Draw a dot chart of a two-dimensional table with base graphics.
#
# Every row of `data` is a category placed on its own dashed horizontal guide
# line (y = 1, 2, ..., nrow) and every column is a series drawn in its own
# colour.
#
# Arguments:
#   data       Matrix-like object with two dimensions holding numeric values.
#   main, xlim, ylim, pch, cex
#              Passed unchanged to matplot(); NULL keeps matplot()'s default.
#   xlab, ylab Axis titles; NULL gives an empty title so that the names of
#              internal variables never leak into the plot.
#   normalize  TRUE rescales all values to [0, 1] using the overall minimum
#              and maximum; NULL or FALSE plots the raw values.
#   col        Colours, recycled over the columns by matplot(). NULL or a
#              single FALSE picks random named colours, one per column.
#   labels     Row labels for the y axis. Defaults to rownames(data), or to
#              the row numbers when there are no row names.
#
# Value:
#   Invisibly, whatever axis() returns for the y axis. If `data` does not have
#   two dimensions, a message is printed and NULL is returned without plotting.
mydotchart <- function(data, main = NULL, xlab = NULL, ylab = NULL,
                       xlim = NULL, ylim = NULL, normalize = NULL, col = NULL,
                       pch = NULL, cex = NULL, labels = NULL) {
  if (length(dim(data)) != 2L) {
    cat('You need to pass a matrix')
    return(invisible(NULL))
  }

  values <- as.matrix(data)
  row_pos <- seq_len(nrow(values))
  n_series <- ncol(values)

  # The flags are tested with length-safe helpers: `normalize == F` is a
  # zero-length condition when the argument is NULL, and `col == F` is a
  # length > 1 condition when several colours are supplied; if() rejects both.
  do_normalize <- isTRUE(as.logical(normalize[1]))
  random_col <- is.null(col) ||
    (length(col) == 1L && identical(as.logical(col), FALSE))

  if (do_normalize) {
    value_range <- range(values, na.rm = TRUE)
    span <- value_range[2] - value_range[1]
    # A constant matrix has span 0; shift only, to avoid dividing by zero.
    values <- if (is.finite(span) && span > 0) {
      (values - value_range[1]) / span
    } else {
      values - value_range[1]
    }
  }

  if (random_col) {
    known_colors <- grDevices::colors()
    col <- sample(known_colors, n_series,
                  replace = n_series > length(known_colors))
  }

  if (is.null(labels)) {
    labels <- rownames(values)
  }
  if (is.null(labels)) {
    labels <- row_pos
  }
  if (is.null(xlab)) {
    xlab <- ""
  }
  if (is.null(ylab)) {
    ylab <- ""
  }

  matplot(values, row_pos, main = main, xlab = xlab, ylab = ylab,
          xlim = xlim, ylim = ylim, col = col, pch = pch, cex = cex,
          yaxt = "n", cex.axis = 0.9)

  # One guide line per row, running from 0 to the largest plotted value.
  guide_end <- if (do_normalize) 1 else max(values, na.rm = TRUE)
  segments(0, row_pos, guide_end, row_pos, lty = 2)

  # Labels sit exactly on the rows (las = 1 keeps them horizontal).
  axis(2, at = row_pos, labels = labels, las = 1)
}

# Four demonstration charts on a 2 x 2 grid: every combination of
# normalized / raw values and random / fixed colours.
par(mfrow = c(2, 2))

mydotchart(
  example_data,
  main = ' normalized & w/random colors',
  xlab = 'number of animals', ylab = '',
  xlim = c(0, 1), ylim = c(0, 6),
  normalize = TRUE, col = FALSE, pch = 16, cex = 1.2
)

mydotchart(
  example_data,
  main = 'normalized w/ red and yellow',
  xlab = 'number of animals', ylab = '',
  xlim = c(0, 1), ylim = c(0, 6),
  normalize = TRUE, col = c('red', 'yellow'), pch = 16, cex = 1.2
)

## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used

## Warning in if (col == F) {: the condition has length > 1 and only the first
## element will be used

mydotchart(
  example_data,
  main = 'no normalization & w/ red and yellow',
  xlab = 'number of animals', ylab = '',
  xlim = c(0, 6), ylim = c(0, 6),
  normalize = FALSE, col = c('red', 'yellow'), pch = 16, cex = 1.2
)

## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used

## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used

mydotchart(
  example_data,
  main = 'no normalization and random colors',
  xlab = 'number of animals', ylab = '',
  xlim = c(0, 6), ylim = c(0, 6),
  normalize = FALSE, col = FALSE, pch = 16, cex = 1.2
)

# What is this warning? Does it result from this: if ((normalize == F & col == F)), as if R could only take the first identity and not the second one... Am I right?

then

# Simulated survey: five questions (q1..q5), 100 answers each, every answer
# drawn with replacement from the letters a..j. The seed makes it repeatable.
set.seed(100)
data2 <- do.call(
  data.frame,
  sapply(
    paste0("q", 1:5),
    function(question) sample(letters[1:10], 100, replace = TRUE),
    simplify = FALSE
  )
)

# Count how often each answer occurs in every column.
datatable2 <- apply(data2, 2, table)


# Raw counts with random colours; the axes are sized from the first column.
mydotchart(
  datatable2,
  main = 'no normalization and random colors',
  xlab = 'values', ylab = '',
  xlim = c(0, max(datatable2[, 1])),
  ylim = c(0, nrow(datatable2)),
  normalize = FALSE, col = FALSE, pch = 16, cex = 1.2
)

#mydotchart(datatable2)
#mydotchart(datatable2[,1:2])
#mydotchart(as.matrix(datatable2[,1]))
#mydotchart(datatable2[,1:3])
#mydotchart(datatable2,col=1:5)
#mydotchart(datatable2,col=1:5,pch=16)
#mydotchart(datatable2,col=1:5,pch=16,cex=2.5,main="Everything",xlab="Value", ylab="Category")
# These are the same. Here it does not let me move forward, because the
# condition in if (normalize == F & col == F) has length equal to 0. So I tried
# if(is.null(normalize) & is.null(col)) instead: it now works for
# mydotchart(datatable2), but it won't work for the previous charts!
# What should I do?


mydotchart(datatable2[, 1])

## You need to pass a matrix

# These two work.
mydotchart(
  datatable2,
  main = "Everything normalized",
  xlab = "Value", ylab = "Category",
  normalize = TRUE, col = 1:5, pch = 16, cex = 2.5
)

## Warning in if ((normalize == F & col == F)) {: the condition has length > 1
## and only the first element will be used

## Warning in if (col == F) {: the condition has length > 1 and only the first
## element will be used

#2. Correlating word frequency with SCRABBLE scores

The following data frame specifies, for each letter, its frequency in English, the points it earns in Scrabble, and its number of Scrabble tiles.

# Per-letter reference data, listed in alphabetical order (A..Z).
# Letter frequencies are typed as percentages and stored as proportions.
lf <- c(
  8.167, 1.492, 2.782, 4.253, 12.702, 2.228, 2.015,  # A-G
  6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749,   # H-N
  7.507, 1.929, 0.095, 5.987, 6.327, 9.056, 2.758,   # O-U
  0.978, 2.36, 0.15, 1.974, 0.074                    # V-Z
) / 100

# Scrabble points per letter.
pts <- c(
  1, 3, 3, 2, 1, 4, 2,
  4, 1, 8, 5, 1, 3, 1,
  1, 3, 10, 1, 1, 1, 1,
  4, 4, 8, 4, 10
)

# Number of Scrabble tiles per letter.
tiles <- c(
  9, 2, 2, 4, 12, 2, 3,
  2, 9, 1, 1, 4, 2, 6,
  8, 2, 1, 6, 4, 6, 4,
  2, 2, 1, 2, 1
)

# One row per letter, in the same order as the built-in LETTERS constant.
lf.table <- data.frame(LETTERS = LETTERS, freq = lf, points = pts, ntiles = tiles)

For any word, you can split it into its letters and then compute some statistics based on this scoring. The following function computes the sum of the inverse letter frequencies of the letters, the total Scrabble points, the mean number of tiles of the letters in the word, and the length of the word:

# Compute letter-based statistics for a single word.
#
# The word is upper-cased, split into single characters, and each character is
# looked up by its position in LETTERS; that position is used as the row of
# `lf.table`, which is built with one row per letter in LETTERS order.
#
# Arguments:
#   word  A word; only its first element is scored. Case is ignored.
#
# Value: a list with
#   suminvfreq  sum of 1 / letter frequency over the letters of the word
#   points      total Scrabble points
#   meantiles   mean number of tiles of the letters (NaN for an empty word)
#   length      number of letters
#
# Stops with an error if the word contains anything other than the letters
# A-Z, because such characters have no row in `lf.table`.
scoreme <- function(word) {
  # Base toupper() is enough here; no add-on package is needed for upper-casing.
  lets <- strsplit(toupper(as.character(word)), "")[[1]]

  # match() looks up every letter at once and returns NA for unknown characters.
  index <- match(lets, LETTERS)
  if (anyNA(index)) {
    stop("word contains characters other than the letters A-Z: ",
         paste(unique(lets[is.na(index)]), collapse = " "),
         call. = FALSE)
  }

  list(suminvfreq = sum(1 / lf.table$freq[index]),
       points = sum(lf.table$points[index]),
       meantiles = mean(lf.table$ntiles[index]),
       length = length(lets))
}

then

# Score one example word and show the full result.
horses <- scoreme("HORSES")
print(horses)

## $suminvfreq
## [1] 85.91667
## 
## $points
## [1] 9
## 
## $meantiles
## [1] 6
## 
## $length
## [1] 6

# A single statistic can be pulled out of the list by name.
print(horses[["points"]])

## [1] 9

then

# Sample of words with their rank and corpus frequency, parsed from inline
# text. The columns are then reordered so that the word comes first.
test <- read.table(
  header = TRUE,
  text = 'rank word frequency
1081  CUP      1441306
2310  FOUND     573305
5285  BUTTERFLY 171410
7371    brew       94904    
11821 CUMBERSOME 39698
17331 useable        17790 
18526 WHITTLE    15315
25416 SPINY       7207
27381 uppercase         5959
37281     halfnaked         2459
47381     bellhop       1106 
57351     tetherball        425
7309        attic       2711    
17311     tearful       542 
27303     tailgate  198 
37310     hydraulically         78  
47309     unsparing         35  
57309     embryogenesis     22 '
)[, c("word", "rank", "frequency")]

# Placeholder columns for the per-word statistics, filled in later.
test[c("meantiles", "suminvfreq", "points", "length")] <- NA

then

# Fill the statistic columns of `test` from scoreme().
# Each word is scored exactly once, and every statistic is written to the
# column of the same name, so the result does not depend on column positions.
word_scores <- lapply(as.character(test$word), scoreme)
for (stat in c("meantiles", "suminvfreq", "points", "length")) {
  test[[stat]] <- vapply(
    word_scores,
    function(score) as.numeric(score[[stat]]),
    numeric(1)
  )
}

# Two rows of four scatter plots; each title shows the Pearson correlation
# rounded to three decimals.
par(mfrow = c(2, 4))

stat_names <- c("length", "meantiles", "suminvfreq", "points")

# Top row: each statistic against word rank.
for (stat in stat_names) {
  r_value <- round(cor(test$rank, test[[stat]]), 3)
  plot(test$rank, test[[stat]],
       main = sprintf('r = %g', r_value),
       xlab = 'word rank position',
       ylab = stat)
}

# Bottom row: word frequency (in units of 10000) against each statistic.
for (stat in stat_names) {
  r_value <- round(cor(test$frequency, test[[stat]]), 3)
  plot(test[[stat]], test$frequency / 10000,
       main = sprintf('r = %g', r_value),
       xlab = stat,
       ylab = 'frequency / 10000')
}

Here, we see that the higher the frequency, the shorter the word; probably also, the longer the word, the lower the frequency –> fewer points. We can also see that, whereas frequency is always negatively correlated with our statistics, rank is not. This is because the higher the frequency of a certain event (or word, in this case), the lower its rank, as in Zipf’s law. In our plots, we can easily see the order in which all our statistical parameters are displayed.

# Word frequency (in units of 10000) against rank, drawn as connected points,
# with the correlation in the title and an explanatory note inside the plot.
rank_freq_r <- round(cor(test$rank, test$frequency), 3)
plot(
  x = test$rank,
  y = test$frequency / 10000,
  type = 'o',
  main = sprintf('r = %g', rank_freq_r),
  xlab = 'word rank position',
  ylab = 'word frequency / 10000'
)
text(x = 30000, y = 100,
     labels = "frequency is inversely proportional to rank as in Zipf's law")

Problem 3

aserrano@mtu.edu

20/9/2020