Please put the answers for Part I next to the question number (2pts each):
7a. Describe the two distributions (2pts).
Graph A: Unimodal distribution with a right skew. Small spread in the distribution. Graph B: Normal distribution with a large spread.
7b. Explain why the means of these two distributions are similar but the standard deviations are not (2 pts).
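Graph A is the distribution of an observed variable, while Graph B is a sampling distribution of means, each computed from a sample of 30 observations. Both are centered on the same population mean, but the standard deviation of a sampling distribution of means is the standard error, $\sigma/\sqrt{n}$, which differs from the standard deviation of the individual observations.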
7c. What is the statistical principle that describes this phenomenon (2 pts)?
The statistical principle at work is the Central Limit Theorem.
Consider the four datasets, each with two columns (x and y), provided below.
# Print numbers with two significant digits for the rest of the session.
options(digits = 2)

# Four small data sets with 11 (x, y) pairs each.
# NOTE(review): the values appear to match Anscombe's quartet (R's built-in
# `anscombe` data) -- confirm before citing it as such.
data1 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)

# Sets 2 and 3 share their x values with set 1; only y differs.
data2 <- data.frame(
  x = data1[["x"]],
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
)
data3 <- data.frame(
  x = data1[["x"]],
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)

# Set 4: every x is 8 except the eighth observation, which is 19.
data4 <- data.frame(
  x = replace(rep(8, 11), 8, 19),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
)
For each column, calculate (to two decimal places):
# Mean of the x and y columns of each data set, rounded to two decimals.
d1.x <- round(mean(data1[["x"]]), digits = 2)
d1.y <- round(mean(data1[["y"]]), digits = 2)
d2.x <- round(mean(data2[["x"]]), digits = 2)
d2.y <- round(mean(data2[["y"]]), digits = 2)
d3.x <- round(mean(data3[["x"]]), digits = 2)
d3.y <- round(mean(data3[["y"]]), digits = 2)
d4.x <- round(mean(data4[["x"]]), digits = 2)
d4.y <- round(mean(data4[["y"]]), digits = 2)

# Collect the eight means into one table, one row per data set, and show it.
meandf <- data.frame(
  set = c(1, 2, 3, 4),
  mean.x = c(d1.x, d2.x, d3.x, d4.x),
  mean.y = c(d1.y, d2.y, d3.y, d4.y)
)
meandf
## set mean.x mean.y
## 1 1 9 7.5
## 2 2 9 7.5
## 3 3 9 7.5
## 4 4 9 7.5
# Median of the x and y columns of each data set, rounded to two decimals.
# The d*.x / d*.y names are scratch variables and are overwritten here.
d1.x <- round(median(data1[["x"]]), digits = 2)
d1.y <- round(median(data1[["y"]]), digits = 2)
d2.x <- round(median(data2[["x"]]), digits = 2)
d2.y <- round(median(data2[["y"]]), digits = 2)
d3.x <- round(median(data3[["x"]]), digits = 2)
d3.y <- round(median(data3[["y"]]), digits = 2)
d4.x <- round(median(data4[["x"]]), digits = 2)
d4.y <- round(median(data4[["y"]]), digits = 2)

# Collect the eight medians into one table, one row per data set, and show it.
mediandf <- data.frame(
  set = c(1, 2, 3, 4),
  median.x = c(d1.x, d2.x, d3.x, d4.x),
  median.y = c(d1.y, d2.y, d3.y, d4.y)
)
mediandf
## set median.x median.y
## 1 1 9 7.6
## 2 2 9 8.1
## 3 3 9 7.1
## 4 4 8 7.0
# Sample standard deviation of the x and y columns of each data set,
# rounded to two decimals. The d*.x / d*.y scratch names are overwritten here.
d1.x <- round(sd(data1[["x"]]), digits = 2)
d1.y <- round(sd(data1[["y"]]), digits = 2)
d2.x <- round(sd(data2[["x"]]), digits = 2)
d2.y <- round(sd(data2[["y"]]), digits = 2)
d3.x <- round(sd(data3[["x"]]), digits = 2)
d3.y <- round(sd(data3[["y"]]), digits = 2)
d4.x <- round(sd(data4[["x"]]), digits = 2)
d4.y <- round(sd(data4[["y"]]), digits = 2)

# Collect the eight standard deviations into one table and show it.
sd.df <- data.frame(
  set = c(1, 2, 3, 4),
  sd.x = c(d1.x, d2.x, d3.x, d4.x),
  sd.y = c(d1.y, d2.y, d3.y, d4.y)
)
sd.df
## set sd.x sd.y
## 1 1 3.3 2
## 2 2 3.3 2
## 3 3 3.3 2
## 4 4 3.3 2
# Correlation between x and y within each data set (cor() with its default
# method). The cor.N scalars are kept at full precision.
cor.1 <- cor(data1$x, data1$y)
cor.2 <- cor(data2$x, data2$y)
cor.3 <- cor(data3$x, data3$y)
cor.4 <- cor(data4$x, data4$y)

# Round explicitly to two decimals in the report table, as the mean, median
# and sd tables do, so the stored values do not depend on options(digits = 2)
# to look rounded when printed.
cor.df <- data.frame(
  set = c(1, 2, 3, 4),
  correlation = round(c(cor.1, cor.2, cor.3, cor.4), 2)
)
cor.df
## set correlation
## 1 1 0.82
## 2 2 0.82
## 3 3 0.82
## 4 4 0.82
# Least-squares fit of y on x for data set 1, followed by its summary table.
LR1 <- lm(formula = y ~ x, data = data1)
summary(LR1)
##
## Call:
## lm(formula = y ~ x, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9213 -0.4558 -0.0414 0.7094 1.8388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.000 1.125 2.67 0.0257 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.667, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00217
data 1 equation: y_hat = 3.00 + 0.5x
# Least-squares fit of y on x for data set 2, followed by its summary table.
LR2 <- lm(formula = y ~ x, data = data2)
summary(LR2)
##
## Call:
## lm(formula = y ~ x, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.901 -0.761 0.129 0.949 1.269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.67 0.0258 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00218
data 2 equation: y_hat = 3.00 + 0.5x
# Least-squares fit of y on x for data set 3, followed by its summary table.
LR3 <- lm(formula = y ~ x, data = data3)
summary(LR3)
##
## Call:
## lm(formula = y ~ x, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.159 -0.615 -0.230 0.154 3.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.002 1.124 2.67 0.0256 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00218
data 3 equation: y_hat = 3.00 + 0.5x
# Least-squares fit of y on x for data set 4, followed by its summary table.
LR4 <- lm(formula = y ~ x, data = data4)
summary(LR4)
##
## Call:
## lm(formula = y ~ x, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.002 1.124 2.67 0.0256 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.667, Adjusted R-squared: 0.63
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00216
data 4 equation: y_hat = 3.00 + 0.5x
Data 1: R-Squared = 0.667 Data 2: R-Squared = 0.666 Data 3: R-Squared = 0.666 Data 4: R-Squared = 0.667
Data 1:
# Regression diagnostics for data set 1, laid out as a 2 x 2 grid of plots.
par( mfrow = c( 2, 2 ) )
# Linearity: scatterplot of y against x.
plot(x = data1$x, y = data1$y)
# Nearly normal residuals: histogram of the residuals of the LR1 fit.
hist(LR1$residuals)
# Normal Q-Q plot of the same residuals, with a reference line added.
qqnorm(LR1$residuals)
qqline(LR1$residuals)
# Constant variability: residuals against x, with a horizontal reference
# line drawn at zero.
plot(LR1$residuals ~ data1$x)
abline(h = 0, lty = 4)
No, the data does not appear to be linear.
Data 2:
# Regression diagnostics for data set 2, laid out as a 2 x 2 grid of plots.
par( mfrow = c( 2, 2 ) )
# Linearity: scatterplot of y against x.
plot(x = data2$x, y = data2$y)
# Nearly normal residuals: histogram of the residuals of the LR2 fit.
hist(LR2$residuals)
# Normal Q-Q plot of the same residuals, with a reference line added.
qqnorm(LR2$residuals)
qqline(LR2$residuals)
# Constant variability: residuals against x, with a horizontal reference
# line drawn at zero.
plot(LR2$residuals ~ data2$x)
abline(h = 0, lty = 4)
No, the data is not linear.
Data 3:
# Regression diagnostics for data set 3, laid out as a 2 x 2 grid of plots.
par( mfrow = c( 2, 2 ) )
# Linearity: scatterplot of y against x.
plot(x = data3$x, y = data3$y)
# Nearly normal residuals: histogram of the residuals of the LR3 fit.
hist(LR3$residuals)
# Normal Q-Q plot of the same residuals, with a reference line added.
qqnorm(LR3$residuals)
qqline(LR3$residuals)
# Constant variability: residuals against x, with a horizontal reference
# line drawn at zero.
plot(LR3$residuals ~ data3$x)
abline(h = 0, lty = 4)
Yes, the data almost fits a linear model.
Data 4:
# Regression diagnostics for data set 4, laid out as a 2 x 2 grid of plots.
par( mfrow = c( 2, 2 ) )
# Linearity: scatterplot of y against x.
plot(x = data4$x, y = data4$y)
# Nearly normal residuals: histogram of the residuals of the LR4 fit.
hist(LR4$residuals)
# Normal Q-Q plot of the same residuals, with a reference line added.
qqnorm(LR4$residuals)
qqline(LR4$residuals)
# Constant variability: residuals against x, with a horizontal reference
# line drawn at zero.
plot(LR4$residuals ~ data4$x)
abline(h = 0, lty = 4)
No, the data is very far from linear.
Visualizations help to illustrate the linearity of data, the relationship between variables, and give a better picture of the data. Just by looking at the data frames or their summary statistics, you would not be able to see the patterns in the data, since the means, standard deviations, correlations, and fitted regression lines were nearly identical across all four data sets.