Objectives

The objectives of this problem set are to orient you to a number of activities in R and to conduct a thoughtful exercise in appreciating the importance of data visualization. For each question, enter the code or text response that completes the activity or answers the question in the corresponding code chunk. To submit this homework, create the document in RStudio using the knitr package (the Knit button in RStudio) and publish it to your RPubs account. Once uploaded, submit the link to that document on Canvas. Please make sure the link is hyperlinked and that I can see both the visualizations and the code required to create them. Each question is worth 5 points.

Questions

  1. Anscombe's quartet is a set of four \(x, y\) data sets published by Francis Anscombe in his 1973 paper "Graphs in Statistical Analysis." For this first question, load the anscombe data that is part of library(datasets) in R and assign it to a new object called data.
library(datasets)
data(anscombe)
data <- anscombe
data
##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89
  2. Summarise the data by calculating the mean and variance for each column, and the correlation between each pair (e.g., x1 and y1, x2 and y2, etc.). (Hint: use the dplyr package!)
library(dplyr)
# Mean of each column
means <- data %>% summarise(across(everything(), mean))
means
##   x1 x2 x3 x4       y1       y2  y3       y4
## 1  9  9  9  9 7.500909 7.500909 7.5 7.500909
# Variance of each column
variances <- data %>% summarise(across(everything(), var))
variances
##   x1 x2 x3 x4       y1       y2      y3       y4
## 1 11 11 11 11 4.127269 4.127629 4.12262 4.123249
# Correlation between each x column and each y column
cor(data[, 1:4], data[, 5:8])
##            y1         y2         y3         y4
## x1  0.8164205  0.8162365  0.8162867 -0.3140467
## x2  0.8164205  0.8162365  0.8162867 -0.3140467
## x3  0.8164205  0.8162365  0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610  0.8165214
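The matrix above reports every x and y combination, but only its diagonal holds the four pairwise correlations the question asks for. A minimal way to pull out just those pairs, using the same data object:

# The requested pairs (x1/y1, x2/y2, x3/y3, x4/y4) sit on the diagonal
diag(cor(data[, 1:4], data[, 5:8]))
# expected values (from the matrix above): 0.8164205 0.8162365 0.8162867 0.8165214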
  3. Using ggplot, create scatter plots for each \(x, y\) pair of data (maybe use facet_grid or facet_wrap).
library(ggplot2)

pair1 <- ggplot(data = anscombe, aes(x = x1, y = y1)) + 
  geom_point() + labs(title = "Pair 1")
pair1

pair2 <- ggplot(data=anscombe, aes(x=x2, y=y2)) + 
  geom_point() + labs(title="Pair 2")
pair2

pair3 <- ggplot(data=anscombe, aes(x=x3, y=y3)) + 
  geom_point() + labs(title="Pair 3")
pair3

pair4 <- ggplot(data=anscombe, aes(x=x4, y=y4)) + 
  geom_point() + labs(title="Pair 4")
pair4
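
The question's hint suggests facet_grid or facet_wrap. One possible sketch of that approach, assuming the tidyr package is installed for reshaping (the obs and pair columns are created here only to support the reshape):

library(dplyr)
library(tidyr)
library(ggplot2)

# Reshape anscombe to long format: one row per observation per pair
anscombe_long <- anscombe %>%
  mutate(obs = row_number()) %>%
  pivot_longer(-obs, names_to = c(".value", "pair"), names_pattern = "(.)(.)")

# All four scatter plots as facets of a single figure
ggplot(anscombe_long, aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~ pair, labeller = label_both)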

  4. Now change the symbols on the scatter plots to solid blue circles.
pair1 <- ggplot(data=anscombe, aes(x=x1, y=y1)) + 
  geom_point(color="blue") + labs(title="Pair 1")
pair1

pair2 <- ggplot(data=anscombe, aes(x=x2, y=y2)) + 
  geom_point(color="blue") + labs(title="Pair 2")
pair2

pair3 <- ggplot(data=anscombe, aes(x=x3, y=y3)) + 
  geom_point(color="blue") + labs(title="Pair 3")
pair3

pair4 <- ggplot(data=anscombe, aes(x=x4, y=y4)) + 
  geom_point(color="blue") + labs(title="Pair 4")
pair4

  5. Now fit a linear model to each data set using the lm() function.
fit1 <- lm(y1 ~ x1, data = data)
fit2 <- lm(y2 ~ x2, data = data)
fit3 <- lm(y3 ~ x3, data = data)
fit4 <- lm(y4 ~ x4, data = data)

fit1
## 
## Call:
## lm(formula = y1 ~ x1, data = data)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
fit2
## 
## Call:
## lm(formula = y2 ~ x2, data = data)
## 
## Coefficients:
## (Intercept)           x2  
##       3.001        0.500
fit3
## 
## Call:
## lm(formula = y3 ~ x3, data = data)
## 
## Coefficients:
## (Intercept)           x3  
##      3.0025       0.4997
fit4
## 
## Call:
## lm(formula = y4 ~ x4, data = data)
## 
## Coefficients:
## (Intercept)           x4  
##      3.0017       0.4999
  6. Now combine the last two tasks. Create a four-panel scatter-plot matrix that shows both the data points and the regression lines. (Hint: the model objects will carry over between chunks!)
#pair 1
ggplot(data=anscombe, aes(x = x1, y = y1)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Pair 1")
## `geom_smooth()` using formula = 'y ~ x'

#pair 2
ggplot(data=anscombe, aes(x = x2, y = y2)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Pair 2")
## `geom_smooth()` using formula = 'y ~ x'

#pair 3
ggplot(data=anscombe, aes(x = x3, y = y3)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Pair 3")
## `geom_smooth()` using formula = 'y ~ x'

#pair 4
ggplot(data=anscombe, aes(x = x4, y = y4)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Pair 4")
## `geom_smooth()` using formula = 'y ~ x'
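
The four plots above are rendered one at a time; to present them as a single four-panel figure, one option is to save the plot objects and arrange them on a grid. A minimal sketch, assuming the gridExtra package is installed (p1 through p4 are just the plots above, assigned to names):

library(ggplot2)
library(gridExtra)

p1 <- ggplot(anscombe, aes(x = x1, y = y1)) + geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) + labs(title = "Pair 1")
p2 <- ggplot(anscombe, aes(x = x2, y = y2)) + geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) + labs(title = "Pair 2")
p3 <- ggplot(anscombe, aes(x = x3, y = y3)) + geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) + labs(title = "Pair 3")
p4 <- ggplot(anscombe, aes(x = x4, y = y4)) + geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE) + labs(title = "Pair 4")

# Arrange the four panels in a 2 x 2 grid
grid.arrange(p1, p2, p3, p4, nrow = 2)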

  7. Now compare the model fits for each model object.
#model 1
summary(fit1)

## 
## Call:
## lm(formula = y1 ~ x1, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0001     1.1247   2.667  0.02573 * 
## x1            0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217

#model 2
summary(fit2)

## 
## Call:
## lm(formula = y2 ~ x2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9009 -0.7609  0.1291  0.9491  1.2691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.001      1.125   2.667  0.02576 * 
## x2             0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179

#model 3
summary(fit3)

## 
## Call:
## lm(formula = y3 ~ x3, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1586 -0.6146 -0.2303  0.1540  3.2411 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0025     1.1245   2.670  0.02562 * 
## x3            0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176

#model 4
summary(fit4)

## 
## Call:
## lm(formula = y4 ~ x4, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0017     1.1239   2.671  0.02559 * 
## x4            0.4999     0.1178   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165
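
Reading four long summaries side by side is tedious; the headline numbers can also be collected into a single small table for direct comparison, using only base R and the model objects fitted above:

# Pull the intercept, slope, R-squared, and residual standard error from each fit
fits <- list(fit1 = fit1, fit2 = fit2, fit3 = fit3, fit4 = fit4)
data.frame(
  intercept = sapply(fits, function(m) coef(m)[1]),
  slope     = sapply(fits, function(m) coef(m)[2]),
  r.squared = sapply(fits, function(m) summary(m)$r.squared),
  sigma     = sapply(fits, function(m) summary(m)$sigma)
)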

  8. In text, summarize the lesson of Anscombe's quartet and what it says about the value of data visualization.
# Answer to Q8:
# From my perspective, the descriptive statistics of the four data sets look nearly identical; I cannot tell them apart from the summary numbers alone. Once the data are plotted, however, the four sets are clearly very different. This shows that visualizing data is an essential step in understanding and analyzing it, and it is also a clearer, more intuitive way to communicate insights to people who are unfamiliar with the data.