This problem set covers materials in chapters 2 and 3.

1. Reading Data and aggregating

# knitr::opts_chunk$set(fig.width=18, fig.height=9)  # The x-axis labels of the histograms get cut off; I tried enlarging the figures with this option, but the result looked even worse, so it is left disabled.


# Load the delegate counts and flag, row by row, whether the Democratic count
# exceeds the Republican count.
data <- read.csv('C:/Users/Andrea/Desktop/PSY5210/R files/Sample Data/delegates.csv')
data$Majority <- data$Democrat > data$Republican # TRUE --> more Democratic than Republican delegates

# Scatter plot of Republican (x) against Democratic (y) delegates.
# Majority is logical, so Majority + 1 is 1 (FALSE) or 2 (TRUE): red marks
# Republican-majority rows and blue marks Democratic-majority rows.
plot(data$Republican, data$Democrat, col = c('red', 'blue')[data$Majority + 1])
# Labels must be placed with the same (x, y) order as plot(); the call stays
# disabled because the labels overlap heavily:
#text(data$Republican, data$Democrat, labels = data$Abbreviation)
abline(-5, 2) # reference line with intercept -5 and slope 2
legend(130, 200, legend = c('Republicans', 'Democrats'), col = c('red', 'blue'), pch = 1, cex = 0.8)

# Democrat-to-Republican delegate ratio: how many times larger one count is
# than the other (values above 1 mean more Democratic delegates).
data$Ratio <- data$Democrat / data$Republican
# Total delegates per abbreviation. In aggregate(x, by, FUN) the values to
# summarise come first and the grouping variables go in `by`; FUN has to reduce
# each group to a single number, so sum() is the right reducer. sort() would
# hand back a whole vector for any group that has more than one row.
total_delegates <- aggregate(data$Democrat + data$Republican,
                             by = list(data$Abbreviation), FUN = sum)

#votes_distribution = aggregate(data$Majority,list(data$Democrat,data$Republican),FUN=sum) # this is just weird and I don't really need the number of times a certain amount of votes has been reached by both parties

# Average Democrat/Republican ratio over all rows. The author reports a value
# of about 1.63: a row whose ratio is higher leans more Democratic than average.
mean_ration_US <- mean(data$Ratio)

# Region label for each row of `data`, listed in row order.
geography <- c(
  'east', 'north', 'west', 'west', 'east', 'west', 'center', 'east',
  'east', 'east', 'south', 'south', 'west', 'west', 'east', 'center',
  'east', 'center', 'center', 'east', 'west', 'north', 'east', 'east',
  'north', 'center', 'east', 'center', 'north', 'center', 'west', 'east',
  'east', 'south', 'east', 'east', 'north', 'east', 'east', 'center',
  'west', 'east', 'south', 'east', 'east', 'center', 'east', 'center',
  'west', 'east', 'east', 'east', 'west', 'east', 'north', 'west'
)

data$Location <- geography

# Mean ratio per region (rows) split by the Majority flag (columns).
tapply(data$Ratio, INDEX = list(data$Location, data$Majority), FUN = mean)
##            FALSE     TRUE
## center 0.8631276 2.054955
## east   0.8375000 1.789796
## north  0.7120811 1.999889
## south         NA 1.995699
## west   0.6431034 1.695049
# now find party's majority of voters in center,east,west, and south.

# Distribution of the Democrat/Republican ratio. Explicit labels replace the
# defaults, which would otherwise show the raw expression `data$Ratio`.
# `breaks = 10` is a suggested number of bins, not an exact count.
hist(data$Ratio, breaks = 10,
     main = 'Distribution of the Democrat/Republican ratio',
     xlab = 'Democrat / Republican ratio')

2. Filtering and Sorting

# Load the 2014 Senate data set.
data2 <- read.csv(file = 'C:/Users/Andrea/Desktop/PSY5210/R files/Sample Data/senate-2014.csv')
# Count senators by first name (rows) and gender (columns).
# NOTE(review): the printed table lists "John" on two separate rows, which
# suggests stray whitespace or a similar invisible difference in FirstName;
# cleaning the column (e.g. with trimws()) may be needed - TODO confirm.
table(data2$FirstName, data2$Gender)
##            
##             F M
##   Al        0 1
##   Amy       1 0
##   Angus     0 1
##   Barbara   2 0
##   Ben       0 1
##   Bernie    0 1
##   Bill      0 1
##   Bob       0 3
##   Brian     0 1
##   Carl      0 1
##   Chris     0 2
##   Chuck     0 2
##   Claire    1 0
##   Dan       0 1
##   David     0 1
##   Dean      0 1
##   Deb       1 0
##   Debbie    1 0
##   Dianne    1 0
##   Dick      0 1
##   Ed        0 1
##   Elizabeth 1 0
##   Harry     0 1
##   Heidi     1 0
##   Jack      0 1
##   Jay       0 1
##   Jeanne    1 0
##   Jeff      0 3
##   Jeffrey   0 1
##   Jerry     0 1
##   Jim       0 2
##   Joe       0 2
##   John      0 5
##   John      0 1
##   Johnny    0 1
##   Jon       0 1
##   Kay       1 0
##   Kelly     1 0
##   Kirsten   1 0
##   Lamar     0 1
##   Lindsey   0 1
##   Lisa      1 0
##   Marco     0 1
##   Maria     1 0
##   Mark      0 4
##   Martin    0 1
##   Mary      1 0
##   Max       0 1
##   Mazie     1 0
##   Michael   0 1
##   Mike      0 5
##   Mitch     0 1
##   Orrin     0 1
##   Pat       0 2
##   Patrick   0 1
##   Patty     1 0
##   Rand      0 1
##   Richard   0 3
##   Rob       0 1
##   Roger     0 1
##   Ron       0 2
##   Roy       0 1
##   Saxby     0 1
##   Sheldon   0 1
##   Sherrod   0 1
##   Susan     1 0
##   Tammy     1 0
##   Ted       0 1
##   Thad      0 1
##   Tim       0 3
##   Tom       0 4
# TRUE for senators with strictly more than 10 years served; 10 or fewer
# (including exactly 10) is FALSE.
data2$Served10years <- data2$YearsServed > 10
# Three-way count: tenure flag (rows) by gender (columns), with one layer per
# party affiliation.
table(data2$Served10years, data2$Gender, data2$Affiliation)
## , ,  = D
## 
##        
##          F  M
##   FALSE  8 22
##   TRUE   7 13
## 
## , ,  = DFL
## 
##        
##          F  M
##   FALSE  1  1
##   TRUE   0  0
## 
## , ,  = I
## 
##        
##          F  M
##   FALSE  0  2
##   TRUE   0  0
## 
## , ,  = R
## 
##        
##          F  M
##   FALSE  2 30
##   TRUE   2 12
# Mean DOB for every affiliation/gender combination present in the data,
# returned in long form (one row per combination).
aggregate(data2$DOB, by = list(data2$Affiliation, data2$Gender), FUN = mean)
##   Group.1 Group.2        x
## 1       D       F 1950.267
## 2     DFL       F 1960.000
## 3       R       F 1957.000
## 4       D       M 1951.571
## 5     DFL       M 1951.000
## 6       I       M 1942.500
## 7       R       M 1951.071
# Same means as a cross-table: affiliation in rows, gender in columns.
# Combinations with no senators show up as NA.
tapply(X = data2$DOB, INDEX = list(data2$Affiliation, data2$Gender), FUN = mean)
##            F        M
## D   1950.267 1951.571
## DFL 1960.000 1951.000
## I         NA 1942.500
## R   1957.000 1951.071
#sort(data2$YearsServed) I didn't get the question in the problem
#sort(data2$AssumedOffice)

# Scatter plot of age (x) against the year each senator assumed office (y).
# `type` takes a single character code; the earlier 'pl' was cut down to its
# first character 'p' (points) with a warning, so 'p' is now requested directly.
# matplot() draws "1"s because its default plotting symbols are the digits,
# one per column; pass pch = 1 there to get circles instead.
plot(data2$Age, data2$AssumedOffice, type = 'p')

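#plot(data2$Age[order(data2$Age)],data2$YearsServed[order(data2$YearsServed)]) # This was meant to plot age, in increasing order, against years served, but sorting the two columns separately breaks the pairing between each senator's age and years served. To keep the pairs, reorder both columns with the same index, e.g. o = order(data2$Age); plot(data2$Age[o], data2$YearsServed[o])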
#plot(data2$Age,data2$YearsServed)

# Age at which each senator took office: year of assuming office minus DOB.
data2$Age_at_Assumedoffice <- data2$AssumedOffice - data2$DOB
# Plot that age against the year of assuming office. Served10years is logical,
# so adding 1 indexes the colour pair: green for FALSE, purple for TRUE.
plot(
  data2$Age_at_Assumedoffice,
  data2$AssumedOffice,
  col = c('green', 'purple')[data2$Served10years + 1]
)
# Legend anchored at x = 21, y = 2000 in plot coordinates.
legend(
  21, 2000,
  legend = c('> 10y of Service', '< 10y of Service'),
  col = c('purple', 'green'),
  pch = 1,
  cex = 0.8
)

3. Writing a function in R

# Return the first `x` uppercase letters of the alphabet.
#
# x: how many letters to return (a single non-negative whole number).
# Returns a character vector of length `x`. Positions past 26 come back as NA
# because LETTERS has only 26 entries.
DoLetters <- function(x) {
  # seq_len(0) is empty, whereas 1:0 counts down to c(1, 0) and would wrongly
  # return "A" when zero letters are requested.
  LETTERS[seq_len(x)]
}

#DoLetter2 = function(x,y)
#{
#  if(0<x<26) and (0<y<26){return(LETTERS[x:y])
#    
#}else{
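#      print(paste('Insert two numbers between 0 and 26'))}  # Why is this not working?
# Answer: R has no chained comparison such as 0<x<26 and no `and` keyword. Write each test separately and join them with &&, all inside one pair of parentheses, e.g. if (x > 0 && x <= 26 && y > 0 && y <= 26) { ... }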
#}

# Uppercase letters from alphabet position `x` through position `y`, in the
# order produced by x:y (so a larger `x` than `y` counts downwards).
DoLetters <- function(x, y) {
  positions <- x:y
  LETTERS[positions]
}


# Return the uppercase letters from alphabet position `x` through position `y`.
#
# x, y: positions in the alphabet; each must lie between 1 and 26.
# Returns the character vector LETTERS[x:y] when both positions are valid.
# Otherwise a warning naming the offending argument is raised (the call does
# not stop with an error) and NULL is returned invisibly.
DoLetters <- function(x, y) {
  # Each bound gets its own branch so the warning can say which argument is
  # out of range and in which direction. Position 0 is rejected too: LETTERS[0]
  # selects nothing, so 0:5 would silently drop an element.
  if (x < 1) {
    warning('the first value is too small!')
  } else if (x > 26) {
    warning('the first value is too large!')
  } else if (y < 1) {
    warning('the second value is too small!')
  } else if (y > 26) {
    warning('the second value is too large!')
  } else {
    return(LETTERS[x:y])
  }
  invisible(NULL)
}
  
DoLetters(1,18)
##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R"
DoLetters(-2,15)
## Warning in DoLetters(-2, 15): the first value is too small!
DoLetters(3,29)
## Warning in DoLetters(3, 29): the second value is too large!

4. Conditional Logic.

# Draw a frequency plot for `x`, choosing the plot type from its class.
#
# x:     data to plot; a non-empty numeric vector or a non-empty factor.
# xlab1: x-axis label used when `x` is numeric (histogram).
# xlab2: x-axis label used when `x` is a factor (bar plot of level counts).
# ylab:  y-axis label for either plot.
# ...:   further arguments passed on to hist() or barplot(), for example
#        xlim = c(0, 5); an axis range must be a length-two c(lower, upper).
#
# Returns, invisibly, the value of hist() or barplot(). For empty or
# unsupported input a warning is raised and NULL is returned invisibly.
goodhist <- function(x,
                     xlab1 = 'this was extracted from numerical value',
                     xlab2 = 'this was extracted from a factor',
                     ylab = 'frequency!',
                     ...) {
  # Empty input is reported on its own instead of being mislabelled as
  # "not numeric" or being handed to a plotting function.
  if (length(x) == 0) {
    warning('data is empty')
    return(invisible(NULL))
  }
  if (is.numeric(x)) {
    invisible(hist(x, xlab = xlab1, ylab = ylab, ...))
  } else if (is.factor(x)) {
    invisible(barplot(table(x), xlab = xlab2, ylab = ylab, ...))
  } else {
    warning('data is not numeric')
    invisible(NULL)
  }
}

# Demo inputs: 1000 uniform draws rescaled to lie between 3 and 4.5, and a
# factor built from 20 draws (with replacement) from 1:10.
x <- (runif(1000) + 2) * 3/2
y <- as.factor(sample(1:10, 20, replace = TRUE)) # TRUE rather than T, which can be reassigned

# Show both plots side by side, then put the previous layout back so later
# plots are not forced into the two-panel grid.
old_par <- par(mfrow = c(1, 2))
goodhist(x)
goodhist(y)
par(old_par)

#goodhist = function(x,xlim = max(x)/2,xlab = 'this was extracted from a factor',ylab = 'frequency!'){
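# I wanted to use this one, but a strange outcome occurs if I run goodhist(x): hist(x) works, but it stops working once I set xlim. Why? Because xlim must be a length-two range c(lower, upper); a single number such as max(x)/2 is not a valid axis range.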

5. Testing and filtering values

# Reproducible example data: each column adds a fresh uniform draw to the
# previous one, so within every row a <= b <= c.
set.seed(100)
a <- runif(100)
b <- runif(100) + a
c <- runif(100) + b
mat <- cbind(a, b, c)

# One-by-one NA placeholder; matrix(NA) is what the all-defaults call
# matrix(,,) produces.
mat2 <- matrix(NA)


#for (i in 1:nrow(mat)){         
#   if (mat[i+1,3] - mat[i,1]  > 0.2){
  # print(paste(mat[i,1:3]))
   #}else{warning('lol')}
 #  }}

# Print, one row at a time, whether the gap between column 3 and column 1 is
# below 0.2; paste() turns each logical into the string "TRUE" or "FALSE".
# seq_len() is used so that a matrix with zero rows is skipped cleanly
# (1:nrow(mat) would count 1, 0 and index rows that do not exist).
for (i in seq_len(nrow(mat))) {
  print(paste(mat[i, 3] - mat[i, 1] < 0.2)) # this prints out a vector of truth that I could save and filter out my matrix
}
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "TRUE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "TRUE"
## [1] "TRUE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
# this is another way of doing it
# One logical per row: TRUE where column 3 exceeds column 1 by less than 0.2.
vector_of_truth <- (mat[, 3] - mat[, 1]) < 0.2
# Keep whole rows with the logical mask. drop = FALSE keeps the result a
# matrix even for a single match, and unname() removes the column names so the
# result has plain [,1] [,2] [,3] headers.
mat2 <- unname(mat[vector_of_truth, , drop = FALSE])
mat2
##           [,1]      [,2]      [,3]
## [1,] 0.5465586 0.5581041 0.6305252
## [2,] 0.8842270 0.9275829 0.9685956
## [3,] 0.2077139 0.2273132 0.2884547