ANLY 512 Problem Set 5

Questions

Anscombe’s quartet is a set of 4 \(x,y\) data sets that were published by Francis Anscombe in his 1973 paper “Graphs in Statistical Analysis”. For this first question, load the anscombe data that is part of library(datasets) in R, and assign that data to a new object called data.

# Bind the built-in Anscombe data frame to `data` (the object name the
# exercise asks for) and preview its first six rows.
data <- anscombe
head(data, n = 6L)

##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04

Summarise the data by calculating the mean and variance for each column, and the correlation between each pair (e.g. x1 and y1, x2 and y2, etc.). (Hint: use the dplyr package!)

library(dplyr)
# Mean of every column, returned as a named numeric vector.
# vapply() is used instead of sapply() so the result type is fixed: each
# column must reduce to exactly one numeric value or the call errors.
vapply(data, mean, numeric(1))

##       x1       x2       x3       x4       y1       y2       y3       y4 
## 9.000000 9.000000 9.000000 9.000000 7.500909 7.500909 7.500000 7.500909

# Sample variance of every column, returned as a named numeric vector.
# vapply() is used instead of sapply() so the result type is fixed: each
# column must reduce to exactly one numeric value or the call errors.
vapply(data, var, numeric(1))

##        x1        x2        x3        x4        y1        y2        y3        y4 
## 11.000000 11.000000 11.000000 11.000000  4.127269  4.127629  4.122620  4.123249

# Correlate the x columns (1-4, rows of the result) with the y columns
# (5-8, columns of the result). The diagonal holds the matched pairs
# x1-y1, x2-y2, x3-y3 and x4-y4; off-diagonal cells are cross-pair values.
cor(x = data[, 1:4], y = data[, 5:8])

##            y1         y2         y3         y4
## x1  0.8164205  0.8162365  0.8162867 -0.3140467
## x2  0.8164205  0.8162365  0.8162867 -0.3140467
## x3  0.8164205  0.8162365  0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610  0.8165214

Using ggplot, create scatter plots for each \(x, y\) pair of data (maybe use ‘facet_grid’ or ‘facet_wrap’).

library(ggplot2)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.1.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

# One default scatter plot per (x, y) pair of the quartet.
pair1 <- ggplot(anscombe, aes(x1, y1)) +
  geom_point() +
  labs(title = "Pair 1")

pair2 <- ggplot(anscombe, aes(x2, y2)) +
  geom_point() +
  labs(title = "Pair 2")

pair3 <- ggplot(anscombe, aes(x3, y3)) +
  geom_point() +
  labs(title = "Pair 3")

pair4 <- ggplot(anscombe, aes(x4, y4)) +
  geom_point() +
  labs(title = "Pair 4")

# Draw the four panels together on a 2 x 2 grid.
grid.arrange(pair1, pair2, pair3, pair4, nrow = 2, ncol = 2)

Now change the symbols on the scatter plots to solid blue circles.

# Rebuild the four panels with solid (shape 19) blue points of size 3,
# overwriting the earlier pair1..pair4 objects.
pair1 <- ggplot(anscombe, aes(x1, y1)) +
  geom_point(shape = 19, color = "blue", size = 3) +
  labs(title = "Pair 1")

pair2 <- ggplot(anscombe, aes(x2, y2)) +
  geom_point(shape = 19, color = "blue", size = 3) +
  labs(title = "Pair 2")

pair3 <- ggplot(anscombe, aes(x3, y3)) +
  geom_point(shape = 19, color = "blue", size = 3) +
  labs(title = "Pair 3")

pair4 <- ggplot(anscombe, aes(x4, y4)) +
  geom_point(shape = 19, color = "blue", size = 3) +
  labs(title = "Pair 4")

# Draw the four panels together on a 2 x 2 grid.
grid.arrange(pair1, pair2, pair3, pair4, nrow = 2, ncol = 2)

Now fit a linear model to each data set using the lm() function.

# Simple linear regression of each y column on its matching x column.
# NOTE(review): `lm(y1 ~ x1, data = data)` is the more conventional spelling,
# but it would relabel the terms printed by anova() further down, so the
# `data$` form is kept to stay in sync with the recorded output.
lm1 <- lm(data$y1 ~ data$x1)
lm2 <- lm(data$y2 ~ data$x2)
lm3 <- lm(data$y3 ~ data$x3)
lm4 <- lm(data$y4 ~ data$x4)

Now combine the last two tasks. Create a four panel scatter plot matrix that has both the data points and the regression lines. (hint: the model objects will carry over chunks!)

# Blue points plus a red least-squares line (no confidence band) per pair.
# geom_smooth(method = "lm") fits y ~ x itself for each panel; the
# lm1..lm4 objects are not consulted here.
pair1 <- ggplot(anscombe, aes(x1, y1)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Pair 1")

pair2 <- ggplot(anscombe, aes(x2, y2)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Pair 2")

pair3 <- ggplot(anscombe, aes(x3, y3)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Pair 3")

pair4 <- ggplot(anscombe, aes(x4, y4)) +
  geom_point(color = "blue", size = 2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "Pair 4")

# Draw the four panels together on a 2 x 2 grid.
grid.arrange(pair1, pair2, pair3, pair4, nrow = 2, ncol = 2)

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Now compare the model fits for each model object.

# ANOVA table (Df, sums of squares, F statistic, p-value) for the pair-1 fit.
stats::anova(lm1)

Analysis of Variance Table

Response: data$y1
          Df Sum Sq Mean Sq F value  Pr(>F)
data$x1    1 27.510 27.5100   17.99 0.00217 **
Residuals  9 13.763  1.5292
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# ANOVA table (Df, sums of squares, F statistic, p-value) for the pair-2 fit.
stats::anova(lm2)

Analysis of Variance Table

Response: data$y2
          Df Sum Sq Mean Sq F value   Pr(>F)
data$x2    1 27.500 27.5000  17.966 0.002179 **
Residuals  9 13.776  1.5307
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# ANOVA table (Df, sums of squares, F statistic, p-value) for the pair-3 fit.
stats::anova(lm3)

Analysis of Variance Table

Response: data$y3
          Df Sum Sq Mean Sq F value   Pr(>F)
data$x3    1 27.470 27.4700  17.972 0.002176 **
Residuals  9 13.756  1.5285
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# ANOVA table (Df, sums of squares, F statistic, p-value) for the pair-4 fit.
stats::anova(lm4)

Analysis of Variance Table

Response: data$y4
          Df Sum Sq Mean Sq F value   Pr(>F)
data$x4    1 27.490 27.4900  18.003 0.002165 **
Residuals  9 13.742  1.5269
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

In text, summarize the lesson of Anscombe’s Quartet and what it says about the value of data visualization.

#Anscombe's Quartet is a set of four datasets with almost identical statistical properties, yet their visual patterns are quite distinct. We can conclude that even though the mean, variance, and correlation of the datasets look identical, the data look quite different when plotted, meaning that the mean, variance, and correlation can be insufficient to fully understand a dataset. The datasets have quite interesting patterns. The lesson of Anscombe's Quartet highlights
# the importance of data visualization: it can be used to understand the data, gain insights from it, and make data-driven decisions.

ANLY 512 Problem Set 5

Anscombe’s quartet

Mithil Kashyap Vyas

2023-03-14

Objectives

Questions