library(DATA606)
## 
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics 
## This package is designed to support this course. The text book used 
## is OpenIntro Statistics, 3rd Edition. You can read this by typing 
## vignette('os3') or visit www.OpenIntro.org. 
##  
## The getLabs() function will return a list of the labs available. 
##  
## The demo(package='DATA606') will list the demos that are available.
## 
## Attaching package: 'DATA606'
## The following object is masked from 'package:utils':
## 
##     demo

Part I

Please put the answers for Part I next to the question number (2pts each):

    1. daysDrive is quantitative and discrete.
    1. mean = 3.3, median = 3.5
    1. Both studies (a) and (b) can be conducted in order to establish that the treatment does indeed cause improvement with regards to fever in Ebola patients.
    1. There is an association between natural hair color and eye color.
    1. 17.8 and 69.0
    1. median and interquartile range; mean and standard deviation.

7a. Describe the two distributions (2pts).

Distribution A is unimodal and slightly skewed while Distribution B is unimodal, symmetrical and nearly normal.

7b. Explain why the means of these two distributions are similar but the standard deviations are not (2 pts).

The means of the two distributions are similar because distribution B is a sampling distribution drawn from population A. The standard deviations are different because distribution B has a wider spread with a smaller population compared to distribution A.

7c. What is the statistical principle that describes this phenomenon (2 pts)?

This phenomenon is described as the Central Limit Theorem.

Part II

Consider the four datasets, each with two columns (x and y), provided below.

# Print numbers with two significant digits for the rest of the session.
options(digits = 2)

# Four small (x, y) datasets with 11 observations each.
# data1, data2 and data3 share the same x values; only their y values differ.
data1 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)
data2 <- data.frame(
  x = data1$x,
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
)
data3 <- data.frame(
  x = data1$x,
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)
# data4 has x fixed at 8 for every observation except one at 19.
data4 <- data.frame(
  x = c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
)

For each column, calculate (to two decimal places):

a. The mean (for x and y separately; 1 pt).

# Mean of x and y for each dataset, rounded to two decimal places.
# Each result is a one-column data frame (column "mean") with rows
# meanx / meany. The column is named explicitly so it does not get an
# auto-generated name built from the deparsed expression.
# format(..., nsmall = 2) prints at least two digits after the decimal point
# even when the session's `digits` option is set lower.
mean1 <- data.frame(
  mean = round(c(meanx = mean(data1$x), meany = mean(data1$y)), 2)
)
format(mean1, nsmall = 2)
mean2 <- data.frame(
  mean = round(c(meanx = mean(data2$x), meany = mean(data2$y)), 2)
)
format(mean2, nsmall = 2)
mean3 <- data.frame(
  mean = round(c(meanx = mean(data3$x), meany = mean(data3$y)), 2)
)
format(mean3, nsmall = 2)
mean4 <- data.frame(
  mean = round(c(meanx = mean(data4$x), meany = mean(data4$y)), 2)
)
format(mean4, nsmall = 2)

b. The median (for x and y separately; 1 pt).

# Median of x and y for each dataset, rounded to two decimal places.
# Each result is a one-column data frame (column "median") with rows
# medianx / mediany. The column is named explicitly so it does not get an
# auto-generated name built from the deparsed expression.
# format(..., nsmall = 2) prints at least two digits after the decimal point
# even when the session's `digits` option is set lower.
median1 <- data.frame(
  median = round(c(medianx = median(data1$x), mediany = median(data1$y)), 2)
)
format(median1, nsmall = 2)
median2 <- data.frame(
  median = round(c(medianx = median(data2$x), mediany = median(data2$y)), 2)
)
format(median2, nsmall = 2)
median3 <- data.frame(
  median = round(c(medianx = median(data3$x), mediany = median(data3$y)), 2)
)
format(median3, nsmall = 2)
# Fixed: medianx for data4 previously stored mean(data4$x) instead of the
# median.
median4 <- data.frame(
  median = round(c(medianx = median(data4$x), mediany = median(data4$y)), 2)
)
format(median4, nsmall = 2)

c. The standard deviation (for x and y separately; 1 pt).

# Sample standard deviation of x and y for each dataset, rounded to two
# decimal places.
# Each result is a one-column data frame (column "sd") with rows sdx / sdy.
# The column is named explicitly so it does not get an auto-generated name
# built from the deparsed expression.
# format(..., nsmall = 2) prints at least two digits after the decimal point
# even when the session's `digits` option is set lower.
sd1 <- data.frame(
  sd = round(c(sdx = sd(data1$x), sdy = sd(data1$y)), 2)
)
format(sd1, nsmall = 2)
sd2 <- data.frame(
  sd = round(c(sdx = sd(data2$x), sdy = sd(data2$y)), 2)
)
format(sd2, nsmall = 2)
sd3 <- data.frame(
  sd = round(c(sdx = sd(data3$x), sdy = sd(data3$y)), 2)
)
format(sd3, nsmall = 2)
sd4 <- data.frame(
  sd = round(c(sdx = sd(data4$x), sdy = sd(data4$y)), 2)
)
format(sd4, nsmall = 2)

For each x and y pair, calculate (also to two decimal places; 1 pt):

d. The correlation (1 pt).

# Pearson correlation matrix of the x and y columns, one dataset at a time.
# The off-diagonal entry is the x-y correlation.
cor(data1, method = "pearson")
##      x    y
## x 1.00 0.82
## y 0.82 1.00
cor(data2, method = "pearson")
##      x    y
## x 1.00 0.82
## y 0.82 1.00
cor(data3, method = "pearson")
##      x    y
## x 1.00 0.82
## y 0.82 1.00
cor(data4, method = "pearson")
##      x    y
## x 1.00 0.82
## y 0.82 1.00

e. Linear regression equation (2 pts).

# Simple linear regression of y on x for each dataset.
# The formula names the columns and the data frame is passed via `data =`,
# so the coefficient is labelled "x" and the fitted models can be used with
# predict(..., newdata = ...).
# All four fits print the same coefficients: y-hat = 3.0 + 0.5 * x.
Linear1 <- lm(y ~ x, data = data1)
Linear1
## 
## Call:
## lm(formula = y ~ x, data = data1)
## 
## Coefficients:
## (Intercept)            x  
##         3.0          0.5
Linear2 <- lm(y ~ x, data = data2)
Linear2
## 
## Call:
## lm(formula = y ~ x, data = data2)
## 
## Coefficients:
## (Intercept)            x  
##         3.0          0.5
Linear3 <- lm(y ~ x, data = data3)
Linear3
## 
## Call:
## lm(formula = y ~ x, data = data3)
## 
## Coefficients:
## (Intercept)            x  
##         3.0          0.5
Linear4 <- lm(y ~ x, data = data4)
Linear4
## 
## Call:
## lm(formula = y ~ x, data = data4)
## 
## Coefficients:
## (Intercept)            x  
##         3.0          0.5

f. R-Squared (2 pts).

# Coefficient of determination (R-squared) of each fitted model, pulled
# from the model summary by exact element name.
summary(Linear1)[["r.squared"]]
## [1] 0.67
summary(Linear2)[["r.squared"]]
## [1] 0.67
summary(Linear3)[["r.squared"]]
## [1] 0.67
summary(Linear4)[["r.squared"]]
## [1] 0.67

For each pair, is it appropriate to estimate a linear regression model? Why or why not? Be specific as to why for each pair and include appropriate plots! (4 pts)

# Draw four diagnostic panels for one dataset and its fitted line:
#   1. scatter plot of the data with the fitted regression line,
#   2. residuals against x with a zero reference line,
#   3. histogram of the residuals,
#   4. normal Q-Q plot of the residuals.
#
# Arguments:
#   data  - data frame with columns x and y
#   fit   - the lm object fitted to `data`
#   label - text used as a prefix in the panel titles
# Returns `fit` invisibly; called for its plotting side effect.
plot_diagnostics <- function(data, fit, label) {
  # 2 x 2 layout; restore the previous layout on exit so that later plots
  # are not drawn into a leftover grid cell.
  old_par <- par(mfrow = c(2, 2))
  on.exit(par(old_par), add = TRUE)

  res <- residuals(fit)

  plot(data, main = paste(label, "- data and fitted line"))
  abline(fit)

  # Residuals against the predictor (not against row index), so that any
  # pattern left over after the straight-line fit shows up along x.
  plot(data$x, res,
       xlab = "x", ylab = "Residuals",
       main = paste(label, "- residuals vs x"))
  abline(h = 0, lty = 2)

  hist(res, xlab = "Residuals", main = paste(label, "- residual histogram"))

  qqnorm(res, main = paste(label, "- normal Q-Q plot"))
  qqline(res)

  invisible(fit)
}

#Data 1
plot_diagnostics(data1, Linear1, "Data 1")

#Data 2
plot_diagnostics(data2, Linear2, "Data 2")

#Data 3
# Note: one y value (12.74 at x = 13) lies well above the other observations.
plot_diagnostics(data3, Linear3, "Data 3")

#Data 4
# Note: every x value is 8 except a single observation at x = 19.
plot_diagnostics(data4, Linear4, "Data 4")

Explain why it is important to include appropriate visualizations when analyzing data. Include any visualization(s) you create. (2 pts)

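It is quite difficult to see relationships and correlations when simply looking at raw data or summary statistics. For example, the four datasets above produce the same correlation, regression coefficients, and R-squared to two digits, so the numbers alone cannot distinguish them; differences between the datasets only become apparent when they are plotted. Visualizations help reveal the shape of the data, expose outliers and non-linear patterns, and strengthen conclusions. One can also quickly show trends using visualizations.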

# Scatter plot of y against x for the second dataset.
plot(data2$x, data2$y, xlab = "x", ylab = "y")