The objective of this problem set is to orient you to a number of
activities in R and to conduct a thoughtful exercise in
appreciating the importance of data visualization. For each question
enter your code or text response in the code chunk that
completes/answers the activity or question requested. To submit this
homework you will create the document in RStudio, using the knitr
package (button included in RStudio) and then submit the document to
your RPubs account. Once uploaded you
will submit the link to that document on Canvas. Please make sure that
this link is hyperlinked and that I can see the visualization and the
code required to create it. Each question is worth 5 points.
anscombe data that is part of the
library(datasets) in R. And assign that data
to a new object called data.
library(datasets)
# Load the built-in Anscombe quartet into the workspace and keep a working
# copy named `data`; the summary and regression steps below refer to it.
data(list = "anscombe")
# View(anscombe)  # uncomment for an interactive look at the raw table
data <- anscombe
dplyr package!)
library(dplyr)
# Per-column five-number summary plus the mean for all eight variables.
print(summary(data))
## x1 x2 x3 x4 y1
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8 Min. : 4.260
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8 1st Qu.: 6.315
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8 Median : 7.580
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9 Mean : 7.501
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8 3rd Qu.: 8.570
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19 Max. :10.840
## y2 y3 y4
## Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median :8.140 Median : 7.11 Median : 7.040
## Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :9.260 Max. :12.74 Max. :12.500
# Mean of every column, returned as one named numeric vector.
colMeans(x = data)
## x1 x2 x3 x4 y1 y2 y3 y4
## 9.000000 9.000000 9.000000 9.000000 7.500909 7.500909 7.500000 7.500909
# Correlation of every x column (rows) with every y column (columns);
# the diagonal holds the matched pairs x1/y1 ... x4/y4.
cor(
  x = data[, paste0("x", 1:4)],
  y = data[, paste0("y", 1:4)]
)
## y1 y2 y3 y4
## x1 0.8164205 0.8162365 0.8162867 -0.3140467
## x2 0.8164205 0.8162365 0.8162867 -0.3140467
## x3 0.8164205 0.8162365 0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610 0.8165214
library(ggplot2)
# Panel 1: scatter of y1 against x1 as green points.
# Axis labels are given exactly once; a second, contradictory x4/y4 pair
# used to be passed to labs() as well.
plot1 <- ggplot(anscombe) +
  geom_point(aes(x1, y1), color = "green", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = " x1 ", x = "x1", y = "y1") +
  theme_bw()
plot1
# Panel 2: scatter of y2 against x2 as green points.
# labs() received x/y twice and the leading x4/y4 pair took effect, so the
# axes were mislabelled; only the x2/y2 labels are kept.
plot2 <- ggplot(anscombe) +
  geom_point(aes(x2, y2), color = "green", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = " x2 ", x = "x2", y = "y2") +
  theme_bw()
plot2
# Panel 3: scatter of y3 against x3 as green points.
# labs() received x/y twice and the leading x4/y4 pair took effect, so the
# axes were mislabelled; only the x3/y3 labels are kept.
plot3 <- ggplot(anscombe) +
  geom_point(aes(x3, y3), color = "green", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = "x3 ", x = "x3", y = "y3") +
  theme_bw()
plot3
# Panel 4: y4 against x4, drawn as green points with both axes including 0.
plot4 <- ggplot(data = anscombe, mapping = aes(x = x4, y = y4)) +
  geom_point(colour = "green", size = 1.5) +
  expand_limits(x = 0, y = 0) +
  scale_x_continuous(breaks = seq(from = 0, to = 20, by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = 12, by = 2)) +
  labs(title = " x4 ", x = "x4", y = "y4") +
  theme_bw()
plot4
# Panel 1 (blue variant): scatter of y1 against x1.
# Axis labels are given exactly once; a second, contradictory x4/y4 pair
# used to be passed to labs() as well.
plot1 <- ggplot(anscombe) +
  geom_point(aes(x1, y1), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = "x1 ", x = "x1", y = "y1") +
  theme_bw()
plot1
# Panel 2 (blue variant): scatter of y2 against x2.
# labs() received x/y twice and the leading x4/y4 pair took effect, so the
# axes were mislabelled; only the x2/y2 labels are kept.
plot2 <- ggplot(anscombe) +
  geom_point(aes(x2, y2), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = " x2 ", x = "x2", y = "y2") +
  theme_bw()
plot2
# Panel 3 (blue variant): scatter of y3 against x3.
# labs() received x/y twice and the leading x4/y4 pair took effect, so the
# axes were mislabelled; only the x3/y3 labels are kept.
plot3 <- ggplot(anscombe) +
  geom_point(aes(x3, y3), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = "x3 ", x = "x3", y = "y3") +
  theme_bw()
plot3
# Panel 4 (blue variant): y4 against x4 with both axes including 0.
plot4 <- ggplot(data = anscombe, mapping = aes(x = x4, y = y4)) +
  geom_point(colour = "blue", size = 1.5) +
  expand_limits(x = 0, y = 0) +
  scale_x_continuous(breaks = seq(from = 0, to = 20, by = 2)) +
  scale_y_continuous(breaks = seq(from = 0, to = 12, by = 2)) +
  labs(title = " x4 ", x = "x4", y = "y4") +
  theme_bw()
plot4
# Lay the four scatter panels out on one page under a shared heading.
# grid.arrange() lives in gridExtra, which is not attached anywhere in the
# visible document, so the call is namespace-qualified.
gridExtra::grid.arrange(
  plot1, plot2, plot3, plot4,
  top = "Four Panel Scatter "
)
# (tail of the question text, kept as a comment) ... lm() function.
# Simple linear regression of y1 on x1. The prose fragment above had been
# fused into the code, leaving a bare lm() call (which errors) and an
# assignment to `function.lm1`, so `lm1` was never created.
lm1 <- lm(y1 ~ x1, data = data)
lm1
##
## Call:
## lm(formula = y1 ~ x1, data = data)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
# Simple linear regression of y2 on x2; print the call and coefficients.
lm2 <- lm(y2 ~ x2, data = data)
print(lm2)
##
## Call:
## lm(formula = y2 ~ x2, data = data)
##
## Coefficients:
## (Intercept) x2
## 3.001 0.500
# Simple linear regression of y3 on x3; print the call and coefficients.
lm3 <- lm(y3 ~ x3, data = data)
print(lm3)
##
## Call:
## lm(formula = y3 ~ x3, data = data)
##
## Coefficients:
## (Intercept) x3
## 3.0025 0.4997
# Simple linear regression of y4 on x4; print the call and coefficients.
lm4 <- lm(y4 ~ x4, data = data)
print(lm4)
##
## Call:
## lm(formula = y4 ~ x4, data = data)
##
## Coefficients:
## (Intercept) x4
## 3.0017 0.4999
# Panel 1 with its fitted line: y1 against x1, overlaid with the
# least-squares line stored in lm1.
# Axis labels are given exactly once; a second, contradictory x4/y4 pair
# used to be passed to labs() as well.
LMP1 <- ggplot(anscombe) +
  geom_point(aes(x1, y1), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  geom_abline(
    intercept = coef(lm1)[["(Intercept)"]],
    slope = coef(lm1)[["x1"]]
  ) +
  labs(title = "x1 ", x = "x1", y = "y1") +
  theme_bw()
LMP1
# Panel 2 with its fitted line: y2 against x2, overlaid with the
# least-squares line from lm2. The line must come from this panel's own
# model, not lm1, and the axes are labelled x2/y2 rather than x4/y4.
LMP2 <- ggplot(anscombe) +
  geom_point(aes(x2, y2), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  geom_abline(
    intercept = coef(lm2)[["(Intercept)"]],
    slope = coef(lm2)[["x2"]]
  ) +
  labs(title = " x2 ", x = "x2", y = "y2") +
  theme_bw()
LMP2
# Panel 3 with its fitted line: y3 against x3, overlaid with the
# least-squares line from lm3. The line must come from this panel's own
# model, not lm1, and the axes are labelled x3/y3 rather than x4/y4.
LMP3 <- ggplot(anscombe) +
  geom_point(aes(x3, y3), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  geom_abline(
    intercept = coef(lm3)[["(Intercept)"]],
    slope = coef(lm3)[["x3"]]
  ) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = "x3 ", x = "x3", y = "y3") +
  theme_bw()
LMP3
# Panel 4 with its fitted line: y4 against x4, overlaid with the
# least-squares line from lm4. The line must come from this panel's own
# model, not lm1.
LMP4 <- ggplot(anscombe) +
  geom_point(aes(x4, y4), color = "blue", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 20, 2)) +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  geom_abline(
    intercept = coef(lm4)[["(Intercept)"]],
    slope = coef(lm4)[["x4"]]
  ) +
  # Make both axes include 0 so all four panels share a comparable frame.
  expand_limits(x = 0, y = 0) +
  labs(title = " x4 ", x = "x4", y = "y4") +
  theme_bw()
LMP4
# Lay the four regression panels out on one page under a shared heading.
# grid.arrange() lives in gridExtra, which is not attached anywhere in the
# visible document, so the call is namespace-qualified.
gridExtra::grid.arrange(
  LMP1, LMP2, LMP3, LMP4,
  top = "Four Panel Scatter Plot Matrix "
)
# Full fit report for the y1 ~ x1 model: residuals, coefficient table, R-squared, F test.
print(summary(lm1))
##
## Call:
## lm(formula = y1 ~ x1, data = data)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.92127 -0.45577 -0.04136  0.70941  1.83882
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   3.0001     1.1247   2.667  0.02573 *
## x1            0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217
# Full fit report for the y2 ~ x2 model: residuals, coefficient table, R-squared, F test.
print(summary(lm2))
##
## Call:
## lm(formula = y2 ~ x2, data = data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -1.9009 -0.7609  0.1291  0.9491  1.2691
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    3.001      1.125   2.667  0.02576 *
## x2             0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179
# Full fit report for the y3 ~ x3 model: residuals, coefficient table, R-squared, F test.
print(summary(lm3))
##
## Call:
## lm(formula = y3 ~ x3, data = data)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -1.1586 -0.6146 -0.2303  0.1540  3.2411
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   3.0025     1.1245   2.670  0.02562 *
## x3            0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176
# Full fit report for the y4 ~ x4 model: residuals, coefficient table, R-squared, F test.
print(summary(lm4))
##
## Call:
## lm(formula = y4 ~ x4, data = data)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -1.751 -0.831  0.000  0.809  1.839
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   3.0017     1.1239   2.671  0.02559 *
## x4            0.4999     0.1178   4.243  0.00216 **
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165
# The fits for all four models are almost the same (even the p-values).
Visualizing Anscombe’s data reveals distinctly different patterns across the four datasets. Despite this, summary statistics such as the mean and standard deviation are nearly identical across them. Moreover, fitting linear regression models to these datasets shows that all four models have nearly identical fits. Nevertheless, it is evident from the plots that only one dataset is reasonably suited to a linear model. Consequently, this example underscores the importance of data visualization and exposes the potential for erroneous conclusions when models are built directly from data without first plotting it.