library(DATA606)
##
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics
## This package is designed to support this course. The text book used
## is OpenIntro Statistics, 3rd Edition. You can read this by typing
## vignette('os3') or visit www.OpenIntro.org.
##
## The getLabs() function will return a list of the labs available.
##
## The demo(package='DATA606') will list the demos that are available.
##
## Attaching package: 'DATA606'
## The following object is masked from 'package:utils':
##
## demo
Please put the answers for Part I next to the question number (2pts each):
7a. Describe the two distributions (2pts).
Distribution A is unimodal and slightly skewed while Distribution B is unimodal, symmetrical and nearly normal.
7b. Explain why the means of these two distributions are similar but the standard deviations are not (2 pts).
The means of the two distributions are similar because distribution B is a sampling distribution of the population A. The standard deviations are different because distribution B has wider spread with a smaller population compared to distribution A.
7c. What is the statistical principle that describes this phenomenon (2 pts)?
This phenomenon is described as the Central Limit Theorem.
Consider the four datasets, each with two columns (x and y), provided below.
# `digits` sets how many significant digits are used when printing numbers
# (significant digits, not decimal places).
options(digits = 2)

# Four small datasets, each with 11 paired observations in columns x and y.
# data1, data2 and data3 share the same x values; data4 has its own.
data1 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)

data2 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
)

data3 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)

data4 <- data.frame(
  x = c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
)
For each column, calculate (to two decimal places):
# Mean of the x and y columns of each dataset. Each result is a one-column
# data frame whose two rows are labelled meanx and meany; the bare object
# name on the following line prints it.
mean1 <- data.frame(
  c(meanx = mean(data1$x), meany = mean(data1$y))
)
mean1

mean2 <- data.frame(
  c(meanx = mean(data2$x), meany = mean(data2$y))
)
mean2

mean3 <- data.frame(
  c(meanx = mean(data3$x), meany = mean(data3$y))
)
mean3

mean4 <- data.frame(
  c(meanx = mean(data4$x), meany = mean(data4$y))
)
mean4
# Median of the x and y columns of each dataset. Each result is a one-column
# data frame whose two rows are labelled medianx and mediany; the bare object
# name on the following line prints it.
median1 <- data.frame(
  c(medianx = median(data1$x), mediany = median(data1$y))
)
median1

median2 <- data.frame(
  c(medianx = median(data2$x), mediany = median(data2$y))
)
median2

median3 <- data.frame(
  c(medianx = median(data3$x), mediany = median(data3$y))
)
median3

# medianx must use median(), like the other three datasets. For data4$x
# (ten 8s and one 19) the median is 8 while the mean is 9, so calling mean()
# here would report the wrong statistic.
median4 <- data.frame(
  c(medianx = median(data4$x), mediany = median(data4$y))
)
median4
# Sample standard deviation of the x and y columns of each dataset. Each
# result is a one-column data frame whose two rows are labelled sdx and sdy;
# the bare object name on the following line prints it.
sd1 <- data.frame(
  c(sdx = sd(data1$x), sdy = sd(data1$y))
)
sd1

sd2 <- data.frame(
  c(sdx = sd(data2$x), sdy = sd(data2$y))
)
sd2

sd3 <- data.frame(
  c(sdx = sd(data3$x), sdy = sd(data3$y))
)
sd3

sd4 <- data.frame(
  c(sdx = sd(data4$x), sdy = sd(data4$y))
)
sd4
# Correlation matrix between the x and y columns of each dataset (cor() uses
# the Pearson method unless told otherwise). The `##` lines are the captured
# output of the call directly above them.
cor(data1)
## x y
## x 1.00 0.82
## y 0.82 1.00
cor(data2)
## x y
## x 1.00 0.82
## y 0.82 1.00
cor(data3)
## x y
## x 1.00 0.82
## y 0.82 1.00
cor(data4)
## x y
## x 1.00 0.82
## y 0.82 1.00
# Fit a simple linear regression of y on x for each dataset and print the
# fitted model. The `##` lines are the captured output of the print above.
# NOTE(review): the formula refers to the columns as data1$y ~ data1$x, which
# is why the slope is labelled `data1$x` in the output; lm(y ~ x, data = data1)
# is the more conventional form -- consider switching if these models are
# ever reused for prediction.
Linear1<- lm(data1$y ~ data1$x)
Linear1
##
## Call:
## lm(formula = data1$y ~ data1$x)
##
## Coefficients:
## (Intercept) data1$x
## 3.0 0.5
Linear2<- lm(data2$y ~ data2$x)
Linear2
##
## Call:
## lm(formula = data2$y ~ data2$x)
##
## Coefficients:
## (Intercept) data2$x
## 3.0 0.5
Linear3<- lm(data3$y ~ data3$x)
Linear3
##
## Call:
## lm(formula = data3$y ~ data3$x)
##
## Coefficients:
## (Intercept) data3$x
## 3.0 0.5
Linear4<- lm(data4$y ~ data4$x)
Linear4
##
## Call:
## lm(formula = data4$y ~ data4$x)
##
## Coefficients:
## (Intercept) data4$x
## 3.0 0.5
# R-squared of each fitted model, read from the model summary. The `##` line
# after each call is its captured output.
summary(Linear1)$r.squared
## [1] 0.67
summary(Linear2)$r.squared
## [1] 0.67
summary(Linear3)$r.squared
## [1] 0.67
summary(Linear4)$r.squared
## [1] 0.67
# For each dataset, draw four panels on a 2x2 grid:
#   1. scatterplot of the data frame (x against y),
#   2. the model residuals in observation order,
#   3. a histogram of the residuals,
#   4. a normal Q-Q plot of the residuals with its reference line.
# par(mfrow = ...) is set again before each group so every dataset starts on
# a fresh 2x2 page.

# Data 1
par(mfrow=c(2,2))
plot(data1)
plot(Linear1$residuals)
hist(Linear1$residuals)
qqnorm(Linear1$residuals)
qqline(Linear1$residuals)
# Data 2
par(mfrow=c(2,2))
plot(data2)
plot(Linear2$residuals)
hist(Linear2$residuals)
qqnorm(Linear2$residuals)
qqline(Linear2$residuals)
# Data 3
par(mfrow=c(2,2))
plot(data3)
plot(Linear3$residuals)
hist(Linear3$residuals)
qqnorm(Linear3$residuals)
qqline(Linear3$residuals)
# Data 4
par(mfrow=c(2,2))
plot(data4)
plot(Linear4$residuals)
hist(Linear4$residuals)
qqnorm(Linear4$residuals)
qqline(Linear4$residuals)
It is quite difficult to see relationships and correlations when simply looking at the data. While analyzing data, visualizations aid in estimating data and strengthening conclusions. One can quickly show trends using visualizations.
# Scatterplot of dataset 2 (x against y), shown as an example of a pattern
# that is easy to see in a plot.
plot(data2)