The objective of this problem set is to orient you to a number of activities in R and to conduct a thoughtful exercise in appreciating the importance of data visualization. For each question, enter your code or text response in the code chunk that completes/answers the activity or question requested. To submit this homework you will create the document in RStudio, using the knitr package (button included in RStudio), and then publish the document to your RPubs account. Once it is uploaded, you will submit the link to that document on Canvas. Please make sure that this link is hyperlinked and that I can see the visualization and the code required to create it. Each question is worth 5 points.
# Load the anscombe data that is part of library(datasets) in R, and assign
# that data to a new object called data.
library(datasets)

# Keep a working copy of the built-in data set under the name `data`,
# then preview its first six rows.
data <- datasets::anscombe
head(data)
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
# ... dplyr package!)
# (Only the tail of the question text survives in this export.)
library(dplyr)

# Mean of every column. vapply() pins the result to one double per column,
# so the return type cannot silently change the way sapply()'s can.
vapply(data, mean, numeric(1))
## x1 x2 x3 x4 y1 y2 y3 y4
## 9.000000 9.000000 9.000000 9.000000 7.500909 7.500909 7.500000 7.500909

# Variance of every column.
vapply(data, var, numeric(1))
## x1 x2 x3 x4 y1 y2 y3 y4
## 11.000000 11.000000 11.000000 11.000000 4.127269 4.127629 4.122620 4.123249

# Correlation of each x column (columns 1-4) with each y column (columns 5-8).
cor(data[, 1:4], data[, 5:8])
## y1 y2 y3 y4
## x1 0.8164205 0.8162365 0.8162867 -0.3140467
## x2 0.8164205 0.8162365 0.8162867 -0.3140467
## x3 0.8164205 0.8162365 0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610 0.8165214
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the four x/y pairs into one long table (x, y, dataset), replacing the
# wide copy held in `data`; `dataset` records which pair each row came from.
data <- with(
  data,
  data.frame(
    x = c(x1, x2, x3, x4),
    y = c(y1, y2, y3, y4),
    dataset = factor(rep(1:4, each = 11))
  )
)

# One scatter plot per dataset, laid out in two rows of facets.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~ dataset, nrow = 2)
# Scatter plot of each x/y pair of the anscombe data, blue points, one
# titled plot per pair.
p1 <- ggplot(anscombe, aes(x = x1, y = y1)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 1")

p2 <- ggplot(anscombe, aes(x = x2, y = y2)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 2")

p3 <- ggplot(anscombe, aes(x = x3, y = y3)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 3")

p4 <- ggplot(anscombe, aes(x = x4, y = y4)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 4")

# Show the four plots together on a 2 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4), nrow = 2, ncol = 2)
# ... lm() function.
# (Only the tail of the question text survives in this export.)

# Fit one simple linear regression of y on x for each of the four pairs.
# The formulas reference the anscombe columns directly, so model terms are
# labelled "anscombe$x1", "anscombe$x2", ... in printed summaries.
lm1 <- lm(anscombe$y1 ~ anscombe$x1)
lm2 <- lm(anscombe$y2 ~ anscombe$x2)
lm3 <- lm(anscombe$y3 ~ anscombe$x3)
lm4 <- lm(anscombe$y4 ~ anscombe$x4)
# Rebuild the four pair plots, this time overlaying the least-squares line
# (red, no confidence band) on the blue points.
p1 <- ggplot(anscombe, aes(x = x1, y = y1)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 1") +
  geom_smooth(method = "lm", colour = "red", se = FALSE)

p2 <- ggplot(anscombe, aes(x = x2, y = y2)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 2") +
  geom_smooth(method = "lm", colour = "red", se = FALSE)

p3 <- ggplot(anscombe, aes(x = x3, y = y3)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 3") +
  geom_smooth(method = "lm", colour = "red", se = FALSE)

p4 <- ggplot(anscombe, aes(x = x4, y = y4)) +
  geom_point(colour = "blue") +
  labs(title = "Pair 4") +
  geom_smooth(method = "lm", colour = "red", se = FALSE)

# Show the four plots together on a 2 x 2 grid.
grid.arrange(grobs = list(p1, p2, p3, p4), nrow = 2, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
# ANOVA table for the pair-1 fit (lm1).
anova(lm1)
## Analysis of Variance Table
##
## Response: anscombe$y1
##             Df Sum Sq Mean Sq F value  Pr(>F)
## anscombe$x1  1 27.510 27.5100   17.99 0.00217 **
## Residuals    9 13.763  1.5292
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# ANOVA table for the pair-2 fit (lm2).
anova(lm2)
## Analysis of Variance Table
##
## Response: anscombe$y2
##             Df Sum Sq Mean Sq F value   Pr(>F)
## anscombe$x2  1 27.500 27.5000  17.966 0.002179 **
## Residuals    9 13.776  1.5307
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# ANOVA table for the pair-3 fit (lm3).
anova(lm3)
## Analysis of Variance Table
##
## Response: anscombe$y3
##             Df Sum Sq Mean Sq F value   Pr(>F)
## anscombe$x3  1 27.470 27.4700  17.972 0.002176 **
## Residuals    9 13.756  1.5285
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# ANOVA table for the pair-4 fit (lm4).
anova(lm4)
## Analysis of Variance Table
##
## Response: anscombe$y4
##             Df Sum Sq Mean Sq F value   Pr(>F)
## anscombe$x4  1 27.490 27.4900  18.003 0.002165 **
## Residuals    9 13.742  1.5269
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Judged by their statistical summaries alone, the four data sets look identical, yet the scatter plots we drew first revealed clearly different patterns. This is the value and importance of data visualization.