library (kableExtra)
library(knitr)

Please put the answers for Part I next to the question number (2 pts each):
# Part I helper: first and third quartiles, the interquartile range, and the
# values 1.5 * IQR above the third quartile and below the first quartile.
q1 <- 37
q3 <- 49.8
iqr <- q3 - q1

# Upper 1.5 * IQR bound.
q3 + 1.5 * iqr
## [1] 69

# Lower 1.5 * IQR bound.
q1 - 1.5 * iqr
## [1] 17.8
7b. Explain why the means of these two distributions are similar but the standard deviations are not (2 pts).
7c. What is the statistical principle that describes this phenomenon (2 pts)?
Consider the four data sets, each with two columns (x and y), provided below.
# Print numbers with two significant digits from here on. This is a global
# setting and is not restored afterwards.
options(digits = 2)

# The four data sets, each with an x and a y column of 11 observations.
# data1, data2 and data3 use the same x values, so they are written once
# and reused.
data1 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)
data2 <- data.frame(
  x = data1$x,
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
)
data3 <- data.frame(
  x = data1$x,
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)
# data4 has its own x: every value is 8 except the eighth, which is 19.
data4 <- data.frame(
  x = c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
)

# For each column, calculate (to two decimal places):
# Table of column means: one row per data set, one column each for x and y.
# my_rownames and my_colnames are reused by the later summary tables.
my_rownames <- c("df1 ", "df2 ", "df3 ", "df4 ")
my_colnames <- c("x", "y")

# c() on data frames concatenates their columns into a single list, in the
# order data1$x, data1$y, data2$x, data2$y, ... (eight columns in total).
# vapply() returns a plain numeric vector, so the matrix built from it is
# numeric rather than a list-matrix.
my_means <- vapply(c(data1, data2, data3, data4), mean, numeric(1))
my_means <- round(my_means, 2)

# byrow = TRUE puts each consecutive (x, y) pair on its own row.
matrix_means <- matrix(my_means, nrow = 4, byrow = TRUE)
row.names(matrix_means) <- my_rownames
colnames(matrix_means) <- my_colnames
kable(matrix_means)
## | | x | y |
## |---|---|---|
## | df1 | 9 | 7.5 |
## | df2 | 9 | 7.5 |
## | df3 | 9 | 7.5 |
## | df4 | 9 | 7.5 |
# Table of column medians: one row per data set, one column each for x and y.
# c() on data frames concatenates their columns into a single list, in the
# order data1$x, data1$y, data2$x, data2$y, ...; vapply() keeps the result a
# plain numeric vector so the matrix is numeric rather than a list-matrix.
my_medians <- vapply(c(data1, data2, data3, data4), median, numeric(1))

# byrow = TRUE puts each consecutive (x, y) pair on its own row.
matrix_medians <- matrix(my_medians, nrow = 4, byrow = TRUE)
row.names(matrix_medians) <- my_rownames
colnames(matrix_medians) <- my_colnames
kable(matrix_medians)
## | | x | y |
## |---|---|---|
## | df1 | 9 | 7.58 |
## | df2 | 9 | 8.14 |
## | df3 | 9 | 7.11 |
## | df4 | 8 | 7.04 |
# Table of column standard deviations: one row per data set, one column each
# for x and y, rounded to two decimals.
# c() on data frames concatenates their columns into a single list, in the
# order data1$x, data1$y, data2$x, data2$y, ...; vapply() keeps the result a
# plain numeric vector so the matrix is numeric rather than a list-matrix.
my_sd <- vapply(c(data1, data2, data3, data4), sd, numeric(1))
my_sd <- round(my_sd, 2)

# byrow = TRUE puts each consecutive (x, y) pair on its own row.
matrix_my_sd <- matrix(my_sd, nrow = 4, byrow = TRUE)
row.names(matrix_my_sd) <- my_rownames
colnames(matrix_my_sd) <- my_colnames
kable(matrix_my_sd)
## | | x | y |
## |---|---|---|
## | df1 | 3.32 | 2.03 |
## | df2 | 3.32 | 2.03 |
## | df3 | 3.32 | 2.03 |
## | df4 | 3.32 | 2.03 |
# Correlation matrix of the numeric columns of each data set.
# Iterate over a list of the four data frames (not over their eight
# concatenated columns) and correlate the data frame that is passed in;
# the previous version ignored its argument and always used data1.
my_correlations <- lapply(
  list(df1 = data1, df2 = data2, df3 = data3, df4 = data4),
  function(d) cor(d[vapply(d, is.numeric, logical(1))])
)
my_correlations
## $df1
## x y
## x 1.00 0.82
## y 0.82 1.00
##
## $df2
## x y
## x 1.00 0.82
## y 0.82 1.00
##
## $df3
## x y
## x 1.00 0.82
## y 0.82 1.00
##
## $df4
## x y
## x 1.00 0.82
## y 0.82 1.00
# Simple linear regression of y on x for each data set.
lm_1 <- lm(y ~ x, data1)
lm_2 <- lm(y ~ x, data2)
lm_3 <- lm(y ~ x, data3)
lm_4 <- lm(y ~ x, data4)

# Build the display string " <label> is y=  <slope> (x) + <intercept>" for a
# fitted model, with both coefficients rounded to two decimals.
#
# fit   : a fitted lm object with an intercept and an x term.
# label : name shown at the start of the string, e.g. "Lm1".
format_equation <- function(fit, label) {
  est <- round(coef(fit), 2)
  paste(
    paste0(" ", label, " is y= "),
    est[["x"]], "(x) +", est[["(Intercept)"]]
  )
}

lin_reg_equation <- format_equation(lm_1, "Lm1")
lin_reg2_equation <- format_equation(lm_2, "Lm2")
lin_reg3_equation <- format_equation(lm_3, "Lm3")
lin_reg4_equation <- format_equation(lm_4, "Lm4")

# cbind() takes the column headers from the variable names.
kable(cbind(
  lin_reg_equation, lin_reg2_equation, lin_reg3_equation, lin_reg4_equation
))
## | lin_reg_equation | lin_reg2_equation | lin_reg3_equation | lin_reg4_equation |
## |---|---|---|---|
## | Lm1 is y= 0.5 (x) + 3 | Lm2 is y= 0.5 (x) + 3 | Lm3 is y= 0.5 (x) + 3 | Lm4 is y= 0.5 (x) + 3 |
# R-squared of each fitted model, shown as a one-column table with one row
# per data set.
sum1 <- summary(lm_1)
sum2 <- summary(lm_2)
sum3 <- summary(lm_3)
sum4 <- summary(lm_4)

# rbind() of four scalars gives a 4 x 1 matrix.
my_rsquared <- rbind(
  sum1$r.squared,
  sum2$r.squared,
  sum3$r.squared,
  sum4$r.squared
)
rownames(my_rsquared) <- my_rownames
kable(my_rsquared)
## | df1 | 0.67 |
## | df2 | 0.67 |
## | df3 | 0.67 |
## | df4 | 0.67 |
# Diagnostics for each of the four fits. Every section lays out a 2 x 3 grid
# of plot panels, draws plot(<fit>), prints the model summary, and then adds
# a scatter plot of the data with the fitted line and a histogram of the
# residuals. The calls are written out per model rather than through a
# helper so that the default axis labels keep naming the actual objects
# (e.g. data1$x, lm_1$residuals).

# ---- data1 / lm_1 ----
layout(matrix(c(1, 2, 3, 4, 5, 6), 2, 3))
plot(lm_1)
summary(lm_1)
##
## Call:
## lm(formula = y ~ x, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9213 -0.4558 -0.0414 0.7094 1.8388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.000 1.125 2.67 0.0257 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.667, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00217
plot(data1$x, data1$y, main = "data1 with abline")
abline(lm_1)
hist(lm_1$residuals, main = "Hist Residuals")

# ---- data2 / lm_2 ----
layout(matrix(c(1, 2, 3, 4, 5, 6), 2, 3))
plot(lm_2)
summary(lm_2)
##
## Call:
## lm(formula = y ~ x, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.901 -0.761 0.129 0.949 1.269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.67 0.0258 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00218
plot(data2$x, data2$y, main = "data2 with abline")
abline(lm_2)
hist(lm_2$residuals, main = "Hist Residuals")

# ---- data3 / lm_3 ----
layout(matrix(c(1, 2, 3, 4, 5, 6), 2, 3))
plot(lm_3)
summary(lm_3)
##
## Call:
## lm(formula = y ~ x, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.159 -0.615 -0.230 0.154 3.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.002 1.124 2.67 0.0256 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.629
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00218
plot(data3$x, data3$y, main = "data3 with abline")
abline(lm_3)
hist(lm_3$residuals, main = "Hist Residuals")

# ---- data4 / lm_4 ----
layout(matrix(c(1, 2, 3, 4, 5, 6), 2, 3))
plot(lm_4)
## Warning: not plotting observations with leverage one:
## 8
## Warning: not plotting observations with leverage one:
## 8
summary(lm_4)
##
## Call:
## lm(formula = y ~ x, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.002 1.124 2.67 0.0256 *
## x 0.500 0.118 4.24 0.0022 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared: 0.667, Adjusted R-squared: 0.63
## F-statistic: 18 on 1 and 9 DF, p-value: 0.00216
plot(data4$x, data4$y, main = "data4 with abline")
abline(lm_4)
hist(lm_4$residuals, main = "Hist Residuals")