This problem set covers materials in chapters 2 and 3.
#knitr::opts_chunk$set(fig.width=18, fig.height=9) the histograms xlabels are cut and I wanted to include something like this but if I do it looks even worse!
# Load the per-state delegate counts.
# NOTE(review): absolute, machine-specific path; the script only runs where this file exists.
data <- read.csv('C:/Users/Andrea/Desktop/PSY5210/R files/Sample Data/delegates.csv')

# TRUE where a state has more Democratic than Republican delegates.
data$Majority <- data$Democrat > data$Republican

# Scatter plot of both counts; FALSE + 1 = 1 selects 'red', TRUE + 1 = 2 selects 'blue'.
plot(data$Republican, data$Democrat, col = c('red', 'blue')[data$Majority + 1])
#text(data$Democrat,data$Republican,labels = data$Abbreviation) # this is very messy so I didn't include it
abline(-5, 2) # reference line with intercept -5 and slope 2
legend(130, 200, legend = c('Republicans', 'Democrats'), col = c('red', 'blue'), pch = 1, cex = 0.8)

# How many times larger one party's count is than the other's.
data$Ratio <- data$Democrat / data$Republican

# Total delegates per state. aggregate() takes the values to summarise first and
# the grouping variable(s) as a list second; FUN = sum adds the combined counts
# within each abbreviation (the earlier FUN = sort only reordered them).
total_delegates <- aggregate(data$Democrat + data$Republican, list(data$Abbreviation), FUN = sum)
#votes_distribution = aggregate(data$Majority,list(data$Democrat,data$Republican),FUN=sum) # this is just weird and I don't really need the number of times a certain amount of votes has been reached by both parties

# Average Democrat/Republican ratio over all rows. The author reports about 1.63:
# a state above that value leans more Democratic than the average.
mean_ration_US <- mean(data$Ratio)

# Hand-assigned region for every row of data, in row order (56 labels).
geography <- c('east','north','west','west','east','west','center','east','east','east','south','south','west','west','east','center','east','center','center','east','west','north','east','east','north','center','east','center','north','center','west','east','east','south','east','east','north','east','east','center','west','east','south','east','east','center','east','center','west','east','east','east','west','east','north','west')
data$Location <- geography

# Mean ratio for every Location x Majority combination (locations in rows,
# FALSE/TRUE majority in columns); combinations with no states come out as NA.
tapply(data$Ratio, list(data$Location, data$Majority), FUN = mean)
## FALSE TRUE
## center 0.8631276 2.054955
## east 0.8375000 1.789796
## north 0.7120811 1.999889
## south NA 1.995699
## west 0.6431034 1.695049
# now find party's majority of voters in center,east,west, and south.
# Distribution of the Democrat/Republican delegate ratio, requesting about 10 bins.
hist(data$Ratio, breaks = 10)

# Load the 2014 Senate roster.
# NOTE(review): absolute, machine-specific path; the script only runs where this file exists.
data2 <- read.csv("C:/Users/Andrea/Desktop/PSY5210/R files/Sample Data/senate-2014.csv")

# Cross-tabulate first names (rows) against gender (columns).
# NOTE(review): the recorded output lists 'John' on two separate rows, presumably
# because one CSV entry carries stray whitespace; confirm against the data.
table(data2$FirstName, data2$Gender)
##
## F M
## Al 0 1
## Amy 1 0
## Angus 0 1
## Barbara 2 0
## Ben 0 1
## Bernie 0 1
## Bill 0 1
## Bob 0 3
## Brian 0 1
## Carl 0 1
## Chris 0 2
## Chuck 0 2
## Claire 1 0
## Dan 0 1
## David 0 1
## Dean 0 1
## Deb 1 0
## Debbie 1 0
## Dianne 1 0
## Dick 0 1
## Ed 0 1
## Elizabeth 1 0
## Harry 0 1
## Heidi 1 0
## Jack 0 1
## Jay 0 1
## Jeanne 1 0
## Jeff 0 3
## Jeffrey 0 1
## Jerry 0 1
## Jim 0 2
## Joe 0 2
## John 0 5
## John 0 1
## Johnny 0 1
## Jon 0 1
## Kay 1 0
## Kelly 1 0
## Kirsten 1 0
## Lamar 0 1
## Lindsey 0 1
## Lisa 1 0
## Marco 0 1
## Maria 1 0
## Mark 0 4
## Martin 0 1
## Mary 1 0
## Max 0 1
## Mazie 1 0
## Michael 0 1
## Mike 0 5
## Mitch 0 1
## Orrin 0 1
## Pat 0 2
## Patrick 0 1
## Patty 1 0
## Rand 0 1
## Richard 0 3
## Rob 0 1
## Roger 0 1
## Ron 0 2
## Roy 0 1
## Saxby 0 1
## Sheldon 0 1
## Sherrod 0 1
## Susan 1 0
## Tammy 1 0
## Ted 0 1
## Thad 0 1
## Tim 0 3
## Tom 0 4
# TRUE for senators with more than 10 years of service, FALSE for 10 or fewer.
data2$Served10years <- data2$YearsServed > 10

# Three-way count: Served10years in rows, Gender in columns, one layer per Affiliation.
table(data2$Served10years, data2$Gender, data2$Affiliation)
## , , = D
##
##
## F M
## FALSE 8 22
## TRUE 7 13
##
## , , = DFL
##
##
## F M
## FALSE 1 1
## TRUE 0 0
##
## , , = I
##
##
## F M
## FALSE 0 2
## TRUE 0 0
##
## , , = R
##
##
## F M
## FALSE 2 30
## TRUE 2 12
# Mean DOB for each affiliation/gender pair, returned in long form
# (one row per combination that actually occurs in the data).
aggregate(
  x = data2$DOB,
  by = list(data2$Affiliation, data2$Gender),
  FUN = mean
)
## Group.1 Group.2 x
## 1 D F 1950.267
## 2 DFL F 1960.000
## 3 R F 1957.000
## 4 D M 1951.571
## 5 DFL M 1951.000
## 6 I M 1942.500
## 7 R M 1951.071
# Same means laid out as an affiliation-by-gender table; pairs with no
# senators appear as NA instead of being dropped.
tapply(
  X = data2$DOB,
  INDEX = list(data2$Affiliation, data2$Gender),
  FUN = mean
)
## F M
## D 1950.267 1951.571
## DFL 1960.000 1951.000
## I NA 1942.500
## R 1957.000 1951.071
#sort(data2$YearsServed) I didn't get the question in the problem
#sort(data2$AssumedOffice)
# Year each senator assumed office plotted against age, drawn as points.
# The original call used type='pl', which is not a valid plot type: R warned
# that it would be truncated to its first character and drew points anyway.
# Passing 'p' directly keeps the same figure without the warning.
# (Author's open question kept from the original: "matplots gives me ones ?")
plot(data2$Age, data2$AssumedOffice, type = 'p')
#plot(data2$Age[order(data2$Age)],data2$YearsServed[order(data2$YearsServed)]) # this should plot the age on the x-axis in order with respect to the years serves, but I am not sure because
#plot(data2$Age,data2$YearsServed)
# Age at which each senator took office: year assumed minus DOB.
data2$Age_at_Assumedoffice <- data2$AssumedOffice - data2$DOB

# Year assumed against age at that time; the author found this view more informative.
# Colour lookup: FALSE + 1 = 1 -> 'green' (10 years or fewer), TRUE + 1 = 2 -> 'purple' (more than 10).
plot(
  data2$Age_at_Assumedoffice,
  data2$AssumedOffice,
  col = c('green', 'purple')[data2$Served10years + 1]
)
legend(
  21, 2000,
  legend = c('> 10y of Service', '< 10y of Service'),
  col = c('purple', 'green'),
  pch = 1,
  cex = 0.8
)
# Return the first `x` upper-case letters of the alphabet.
#
# x: number of letters wanted, counted from "A".
# Returns a character vector of length x. seq_len() is used instead of 1:x so
# that x = 0 yields an empty vector rather than "A" (1:0 counts down to c(1, 0)).
DoLetters <- function(x) {
  LETTERS[seq_len(x)]
}
#DoLetter2 = function(x,y)
#{
# if(0<x<26) and (0<y<26){return(LETTERS[x:y])
#
#}else{
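# print(paste('Insert two numbers between 0 and 26'))} # Why this does not work: R cannot chain comparisons (0<x<26 is a syntax error) and has no `and` keyword; the test has to be written as if (0 < x && x < 26 && 0 < y && y < 26).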
#}
# Return the upper-case letters at positions x through y (inclusive).
#
# x: position of the first letter wanted.
# y: position of the last letter wanted.
# No range checking is done here; positions outside 1..26 come back as NA.
DoLetters <- function(x, y) {
  positions <- x:y
  LETTERS[positions]
}
# Return the upper-case letters at positions x through y, with range checks.
#
# x: position of the first letter wanted; must lie in 1..26.
# y: position of the last letter wanted; must lie in 1..26.
# Returns the requested letters. When either position is out of range the
# function does not stop: it raises a warning naming the offending argument
# and returns NULL invisibly, so a script calling it keeps running.
# Both ends of both arguments are checked (the earlier version only caught
# x < 0 and y > 26, so e.g. x = 30 silently produced NA values).
DoLetters <- function(x, y) {
  if (x < 1) {
    warning('the first value is too small!')
  } else if (x > 26) {
    warning('the first value is too large!')
  } else if (y < 1) {
    warning('the second value is too small!')
  } else if (y > 26) {
    warning('the second value is too large!')
  } else {
    return(LETTERS[x:y])
  }
  invisible(NULL)
}
DoLetters(1,18)
## [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R"
DoLetters(-2,15)
## Warning in DoLetters(-2, 15): the first value is too small!
DoLetters(3,29)
## Warning in DoLetters(3, 29): the second value is too large!
# Draw a sensible frequency plot for either numeric or factor data.
#
# x:     the data to plot; a numeric vector or a factor.
# xlab1: x-axis label used when x is numeric (histogram).
# xlab2: x-axis label used when x is a factor (bar plot of level counts).
# ylab:  y-axis label used for both plot types.
#
# Returns whatever hist() or barplot() returns for the plot drawn. For input
# that cannot be plotted it raises a warning and returns NULL invisibly:
# 'data is empty' for zero-length input (previously misreported as
# 'data is not numeric'), 'data is not numeric' for any other type.
goodhist <- function(x,
                     xlab1 = 'this was extracted from numerical value',
                     xlab2 = 'this was extracted from a factor',
                     ylab = 'frequency!') {
  # Check emptiness first so an empty factor never reaches barplot().
  if (length(x) == 0) {
    warning('data is empty')
    return(invisible(NULL))
  }

  if (is.numeric(x)) {
    hist(x, xlab = xlab1, ylab = ylab)
  } else if (is.factor(x)) {
    barplot(table(x), xlab = xlab2, ylab = ylab)
  } else {
    warning('data is not numeric')
    invisible(NULL)
  }
}
# Demo inputs for goodhist(): 1000 uniform draws rescaled to the range 3 to 4.5,
# and a factor built from 20 draws (with replacement) of the integers 1 to 10.
x <- (runif(1000) + 2) * 3 / 2
y <- as.factor(sample(1:10, 20, replace = TRUE))

# Show both plots side by side, then put the graphics layout back as it was
# so later plots are not squeezed into the 1 x 2 grid.
old_par <- par(mfrow = c(1, 2))
goodhist(x)
goodhist(y)
par(old_par)
#goodhist = function(x,xlim = max(x)/2,xlab = 'this was extracted from a factor',ylab = 'frequency!'){
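# I wanted to use this one, but goodhist(x) gives a strange result: hist(x) alone works, yet setting xlim does not. NOTE(review): xlim presumably has to be a two-value range such as c(0, max(x)), whereas max(x)/2 is a single number; confirm that this is the cause.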
# Fix the random stream so the draws below are reproducible.
set.seed(100)

# Three cumulative series: each one adds a fresh uniform draw to the previous.
a <- runif(100)
b <- a + runif(100)
c <- b + runif(100)

# 100 x 3 matrix with columns named a, b, c.
mat <- cbind(a, b, c)

# Placeholder 1 x 1 NA matrix; it is overwritten further down the script.
mat2 <- matrix()

# Abandoned first attempt, kept for reference. It cannot run as written:
# mat[i+1,3] reads past the last row when i reaches nrow(mat), and the
# closing braces do not balance.
#for (i in 1:nrow(mat)){
# if (mat[i+1,3] - mat[i,1] > 0.2){
# print(paste(mat[i,1:3]))
#}else{warning('lol')}
# }}

# Row by row, print "TRUE" when the third column exceeds the first by less
# than 0.2 and "FALSE" otherwise; the printed flags could be saved and used
# to filter the matrix.
for (i in seq_len(nrow(mat))) {
  print(as.character(mat[i, 3] - mat[i, 1] < 0.2))
}
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "TRUE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "TRUE"
## [1] "TRUE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
## [1] "FALSE"
# this is another way of doing it: build the whole TRUE/FALSE vector in one
# vectorised comparison and use it to pick out the matching rows.
vector_of_truth <- (mat[, 3] - mat[, 1]) < 0.2

# Select the flagged rows explicitly, then rebuild a plain 3-column matrix
# (matrix() discards the a/b/c column names, so the result prints with
# [,1] [,2] [,3] headers).
mat2 <- matrix(mat[vector_of_truth, ], ncol = 3)
mat2
## [,1] [,2] [,3]
## [1,] 0.5465586 0.5581041 0.6305252
## [2,] 0.8842270 0.9275829 0.9685956
## [3,] 0.2077139 0.2273132 0.2884547