Basic R Operations

For each of the following questions, provide R commands used to solve the problem, and example output. Include both the questions and answers in your solution. You can create an R Markdown file to turn in; you should include the relevant code you used in the markdown, but EVERY QUESTION SHOULD BE ANSWERED IN WORDS with the code and output just supporting that response. Unless otherwise specified, write your text as text-based markdown, and not as code comments.

##1. Reading and commenting code

The ‘#’ symbol is used to write comments, explanations and notes. For example:

x <- 3 + 6  # Add two numbers together

You can use it to add sections as well:

##########################################
## Find the maximum of a set of ten numbers
print(max(runif(10)))

Read the following code, and write comments on each line explaining what is going on.

x <- c(1,2,3,4,5,6,7) # this creates a vector of valyes from 1 to 7
y <- mean(x)# this calls a in-built function that calculates the mean of the numbers in the vector
x2 <- x - y  # this subtracts the mean from each number in the vector
a1 <- runif(100)# this creates a vector of 100 random numbers from 0 to 1
a2 <- rnorm(100)# this creates a vector of 100 normally distributed numbers from 0 to 1
a3 <- a1 * a2   # this creates a new variable and its values are the result of multiplication between a1 and a1
a.tab <- cbind(a1,a2,a3)  # creates a matrix combining those 3 vectors in rows and columns
pairs(a.tab)# this "cool" function plots each possible combination between columns like a1 with a2, a1 with a3, and so on. On the diagonal are column numbers.

cor(a.tab)# gives Pearson correlation values r of all combinations between columns 
##             a1          a2          a3
## a1  1.00000000 -0.01490316 -0.06692689
## a2 -0.01490316  1.00000000  0.88422428
## a3 -0.06692689  0.88422428  1.00000000

#2. Assignment of values

Determine what happens to both a and b when you do the following. Write an answer in words.

a <- 100
b <- a
a <- 200

Answer: to “a” is assigned the value 100, then b gets the same value from being assigned as “a”, lastly to ‘a’ we assign a new value but this does not modify b as well because they are two dinstinct variables

 a <- c(1,2,3,4,5)
 b <- a
 b[3] <- 10

Answer: Here it happens the same and when we modify the third value of the b vector it only changes in b and not in a.

#3. Computing Means of data The arithmetic mean of a set of numbers is the sum of those numbers divided by the number of numbers, and for a set of numbers \(x\), it can be written:

  sum(x)/length(x)

There are other ways of finding the average, including the geometric mean (the Nth root of the product of N things) and the harmonic mean (the reciprocal of the mean of the reciprocals of the numbers, where reciprocal means 1/x.). Again, for X, a way of calculating this is:

##Geometric mean (assuming all values are greater than 0):

exp(mean(log(x)))
exp(sum(log(x)) / length(x))

##Harmonic mean:

1 / mean(1/x)

For each of the following definitions of x1..x5 below create a histogram, and then use the abline(v=number) function to plot the arithmetic, geometric, and harmonic means as vertical lines on the plot, and text() to label it.

x1 <- runif(1000)
x2  <- runif(1000) * 2
x3 <- runif(1000) + 2
x4 <- 5/(runif(1000)+.04)
x5 <- exp(rnorm(1000))

That is, if you have a data set x, and you computed the means and saved it as x.mean, x.hmean, and x.gmean, the following will make a histogram and plot the vertical lines and text labels:

x1.mean = mean(x1)
x1.gmean = exp(mean(log(x1)))
x1.hmean = 1 / mean(1/x1)
x2.mean = mean(x2)
x2.gmean = exp(mean(log(x2)))
x2.hmean = 1 / mean(1/x2)
x3.mean = mean(x3)
x3.gmean = exp(mean(log(x3)))
x3.hmean = 1 / mean(1/x3)
x4.mean = mean(x4)
x4.gmean = exp(mean(log(x4)))
x4.hmean = 1 / mean(1/x4)
x5.mean = mean(x5)
x5.gmean = exp(mean(log(x5)))
x5.hmean = 1 / mean(1/x5)



x1_means = c(x1.mean,x1.gmean,x1.hmean)
x2_means = c(x2.mean,x2.gmean,x2.hmean)
x3_means = c(x3.mean,x3.gmean,x3.hmean)
x4_means = c(x4.mean,x4.gmean,x4.hmean)
x5_means = c(x5.mean,x5.gmean,x5.hmean)







par(mfrow=c(2,3))


hist(x1_means,yaxt = "n",breaks =14,xlim=c(min(x1_means),max(x1_means)*105/100),xlab='values',ylab='instances',col = 'lightblue')
abline(v=x1.mean,lwd=3, col="blue")
abline(v=x1.gmean,lwd=3, col="green")          #how do I move the text a little to the right?
abline(v=x1.hmean,lwd=3, col="red")


hist(x2_means,yaxt = "n",breaks =14,xlim=c(min(x2_means),max(x2_means)*105/100),xlab='values',ylab='instances',col = 'lightblue')
abline(v=x2.mean,lwd=3, col="blue")
abline(v=x2.gmean,lwd=3, col="green")          #how do I move the text a little to the right?
abline(v=x2.hmean,lwd=3, col="red")

hist(x3_means,yaxt = "n",breaks =14,xlim=c(2,3),xlab='values',ylab='instances',col = 'lightblue')
abline(v=x3.mean,lwd=3, col="blue")
abline(v=x3.gmean,lwd=3, col="green")          #how do I move the text a little to the right?
abline(v=x3.hmean,lwd=3, col="red")


legend(2, 1, legend=c("Arithmetic Mean", "Geometric Mean","Harmonic Mean"),
       col=c("blue", "green", "red"), lty=1)



hist(x4_means,yaxt = "n",breaks =14,xlim=c(min(x4_means),max(x4_means)*105/100),xlab='values',ylab='instances',col = 'lightblue')
abline(v=x4.mean,lwd=3, col="blue")
abline(v=x4.gmean,lwd=3, col="green")          #how do I move the text a little to the right?
abline(v=x4.hmean,lwd=3, col="red")

hist(x5_means,yaxt = "n",breaks =14,xlim=c(min(x5_means),max(x5_means)*105/100),xlab='values',ylab='instances',col = 'lightblue')
abline(v=x5.mean,lwd=3, col="blue")
abline(v=x5.gmean,lwd=3, col="green")          #how do I move the text a little to the right?
abline(v=x5.hmean,lwd=3, col="red")


#text(x1.mean,.5,srt = 90,"A mean")
#text(x1.gmean,.5,srt = 90,"G mean") # I realize than when graphing multiple histograms is better to just insert a legend
#text(x1.hmean,.5,srt = 90,"H mean")


#plot(x5,ylim=c(0,8),col='aquamarine')
#abline(v=x5.mean,lwd=5,h=T, col="black")
#abline(v=x5.gmean,lwd=2,h=T, col="green") # I tried to plot horizontal lines instead on plots instead of using histograms but I need more time to figure out why I can't do that!
#abline(v=x5.hmean,lwd=1,h=T, col="red")


Try to adjust the locations of the text, etc. to make it look good.
Add at least three customizations to the hist function (i.e., change the color,
the number of breaks, the title, etc.) to make them look nice.
For each of the five data sets, describe how the different means are af-
fected by the transforms in the distribution. Suggest which means would be
appropriate for describing the middle tendency of the data


##Answers: In all datasets the three means have similar trends where the geometric and harmonic means are markedly lower than the arithmetic mean. This inequality between means has already been matematically proved and the arithmetic one is always greater than or equal than the two. Concerning which descriptors of middle tendency are used in descriptive statistics I would say the arithmetic mean, the mode, and the median (if I understood the question correctly). The x3_means dataset appears to be less spread than the others but I don't understand whether this is only related to case or not because we could see the same in x1_means also because the range is similar (2 to 3 and 0 to 1).


#4. Normalizing data.  

Sometimes you want to transform a set of numbers so they
have a mean of 0 and a standard deviation of 1.  This is done by subtracting the mean
and dividing be the standard deviation.  Normalize and plot the following befor and after,
and compute the mean and standard deviation of each normalized data set to verify
that you have normalized them.
For example, suppose you have a vector x containing unknown values:


```r
x <- (1:1000)[order(runif(1000))]
par(mfrow=c(1,2))
plot(x, main="before",sub=paste("mean=", mean(x),
       "sd=",round(sd(x),4)),cex=.5)
x.norm <- (x-mean(x))/sd(x)
plot(x.norm, main="after",
     sub=paste("mean=",mean(x.norm),"sd=",sd(x.norm)),cex=.5)

mean(x.norm) #[1] 0
## [1] 1.734723e-21
sd(x.norm)   #[1]
## [1] 1

Do the same for x1 through x5:

x1 <- 1:20
x1_norm =(x1-mean(x1))/sd(x1)
x2 <- runif(100) + rnorm(100)*3
x2_norm = (x2-mean(x2))/sd(x2)



x3 <- seq(1,100,3)
x3_norm = (x3-mean(x3))/sd(x3)

x4 <- c(runif(10),300)
x4_norm = (x4-mean(x4))/sd(x4)

x5 <- 1/(runif(50)*3)
x5_norm = (x5-mean(x5))/sd(x5)

par(mfrow=c(2,5))
#par(mfrow=c(2,2))
plot(x1,main='before_x1',sub=paste('mean=',round(mean(x1),1),'sd=',round(sd(x1),2)),cex=.5)
plot(x1_norm,main='after_x1',sub=paste('mean=',round(mean(x1_norm),1),'sd=',round(sd(x1_norm),2)),cex=.5)

plot(x2,main='before_x2',sub=paste('mean=',round(mean(x2),1),'sd=',round(sd(x2),2)),cex=.5)
plot(x2_norm,main='after_x2',sub=paste('mean=',round(mean(x2_norm),1),'sd=',round(sd(x2_norm),2)),cex=.5)

#par(mfrow=c(1,2))
plot(x3,main='before_x3',sub=paste('mean=',round(mean(x3),1),'sd=',round(sd(x3),2)),cex=.5)
plot(x3_norm,main='after_x3',sub=paste('mean=',round(mean(x3_norm),1),'sd=',round(sd(x3_norm),2)),cex=.5)


#par(mfrow=c(2,2))
plot(x4,main='before_x4',sub=paste('mean=',round(mean(x4),1),'sd=',round(sd(x4),2)),cex=.5)
plot(x4_norm,main='after_x4',sub=paste('mean=',round(mean(x4_norm),1),'sd=',round(sd(x4_norm),2)),cex=.5)


plot(x5,main='before_x5',sub=paste('mean=',round(mean(x5),1),'sd=',round(sd(x5),2)),cex=.5)
plot(x5_norm,main='after_x5',sub=paste('mean=',round(mean(x5_norm),1),'sd=',round(sd(x5_norm),2)),cex=.5)