ANLY 512 Problem Set 5

Questions

Anscombe’s quartet is a set of four \(x, y\) data sets published by Francis Anscombe in his 1973 paper “Graphs in Statistical Analysis”. For this first question, load the anscombe data that is part of library(datasets) in R, and assign that data to a new object called data.

# Make the bundled Anscombe data available, keep a working copy under the
# name `data`, and show the full table.
library("datasets")
data("anscombe")
data <- anscombe
print(data)

##    x1 x2 x3 x4    y1   y2    y3    y4
## 1  10 10 10  8  8.04 9.14  7.46  6.58
## 2   8  8  8  8  6.95 8.14  6.77  5.76
## 3  13 13 13  8  7.58 8.74 12.74  7.71
## 4   9  9  9  8  8.81 8.77  7.11  8.84
## 5  11 11 11  8  8.33 9.26  7.81  8.47
## 6  14 14 14  8  9.96 8.10  8.84  7.04
## 7   6  6  6  8  7.24 6.13  6.08  5.25
## 8   4  4  4 19  4.26 3.10  5.39 12.50
## 9  12 12 12  8 10.84 9.13  8.15  5.56
## 10  7  7  7  8  4.82 7.26  6.42  7.91
## 11  5  5  5  8  5.68 4.74  5.73  6.89

# Compact overview of the column types and leading values of `data`.
utils::str(object = data)

## 'data.frame':    11 obs. of  8 variables:
##  $ x1: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x2: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x3: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x4: num  8 8 8 8 8 8 8 19 8 8 ...
##  $ y1: num  8.04 6.95 7.58 8.81 8.33 ...
##  $ y2: num  9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
##  $ y3: num  7.46 6.77 12.74 7.11 7.81 ...
##  $ y4: num  6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...

Summarise the data by calculating the mean and variance of each column and the correlation between each pair (e.g. x1 and y1, x2 and y2, etc.). (Hint: use the dplyr package!)

# Attach dplyr, then print the per-column six-number summary of `data`.
library("dplyr")
print(summary(object = data))

##        x1             x2             x3             x4           y1        
##  Min.   : 4.0   Min.   : 4.0   Min.   : 4.0   Min.   : 8   Min.   : 4.260  
##  1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 8   1st Qu.: 6.315  
##  Median : 9.0   Median : 9.0   Median : 9.0   Median : 8   Median : 7.580  
##  Mean   : 9.0   Mean   : 9.0   Mean   : 9.0   Mean   : 9   Mean   : 7.501  
##  3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.: 8   3rd Qu.: 8.570  
##  Max.   :14.0   Max.   :14.0   Max.   :14.0   Max.   :19   Max.   :10.840  
##        y2              y3              y4        
##  Min.   :3.100   Min.   : 5.39   Min.   : 5.250  
##  1st Qu.:6.695   1st Qu.: 6.25   1st Qu.: 6.170  
##  Median :8.140   Median : 7.11   Median : 7.040  
##  Mean   :7.501   Mean   : 7.50   Mean   : 7.501  
##  3rd Qu.:8.950   3rd Qu.: 7.98   3rd Qu.: 8.190  
##  Max.   :9.260   Max.   :12.74   Max.   :12.500

# Mean of every column of `data`, returned as a named numeric vector.
colMeans(x = data)

##       x1       x2       x3       x4       y1       y2       y3       y4 
## 9.000000 9.000000 9.000000 9.000000 7.500909 7.500909 7.500000 7.500909

# Sample variance of every column of `data`, as a named numeric vector.
# vapply() walks the data frame column by column (no coercion of the whole
# frame to a matrix, as apply() would do) and checks that each result is a
# single number.
vapply(data, var, numeric(1))

##        x1        x2        x3        x4        y1        y2        y3        y4 
## 11.000000 11.000000 11.000000 11.000000  4.127269  4.127629  4.122620  4.123249

# Correlation of every x column with every y column. The diagonal of the
# resulting 4 x 4 matrix holds the matched pairs (x1 with y1, x2 with y2, ...).
cor(x = data[paste0("x", 1:4)], y = data[paste0("y", 1:4)])

##            y1         y2         y3         y4
## x1  0.8164205  0.8162365  0.8162867 -0.3140467
## x2  0.8164205  0.8162365  0.8162867 -0.3140467
## x3  0.8164205  0.8162365  0.8162867 -0.3140467
## x4 -0.5290927 -0.7184365 -0.3446610  0.8165214

Using ggplot, create scatter plots for each \(x, y\) pair of data (maybe use ‘facet_grid’ or ‘facet_wrap’).

# Scatter plots of the four Anscombe (x, y) pairs, drawn with ggplot2 using
# one facet per pair.
library(ggplot2)

# ggplot2 facets on a grouping column, so stack the wide x1..x4 / y1..y4
# columns into one long data frame with a `pair` label on every row.
anscombe_long <- do.call(rbind, lapply(1:4, function(i) {
  data.frame(
    pair = paste0("Pair x", i, " and y", i),
    x = data[[paste0("x", i)]],
    y = data[[paste0("y", i)]]
  )
}))

# shape = 1 is the open circle, matching the default base-graphics symbol.
ggplot(anscombe_long, aes(x = x, y = y)) +
  geom_point(shape = 1) +
  facet_wrap(~pair, nrow = 2) +
  labs(title = "Scatterplots for each x, y pair", x = "x", y = "y")

Now change the symbols on the scatter plots to solid blue circles.

# Same four-facet ggplot2 scatter plot, now with solid blue circles.
library(ggplot2)

# Stack the wide x1..x4 / y1..y4 columns into one long data frame so that
# facet_wrap() can split the panels on the `pair` label.
anscombe_long <- do.call(rbind, lapply(1:4, function(i) {
  data.frame(
    pair = paste0("Pair x", i, " and y", i),
    x = data[[paste0("x", i)]],
    y = data[[paste0("y", i)]]
  )
}))

# shape = 19 is the solid circle; colour sets its fill and outline to blue.
ggplot(anscombe_long, aes(x = x, y = y)) +
  geom_point(shape = 19, colour = "blue") +
  facet_wrap(~pair, nrow = 2) +
  labs(title = "Scatterplots for each x, y pair", x = "x", y = "y")

Now fit a linear model to each data set using the lm() function.

# Linear model for dataset 1. attach() puts the columns of `data` on the
# search path so the formula can refer to x1 and y1 by bare name.
# NOTE(review): the formula regresses x1 on y1; the scatter plots put x on the
# horizontal axis, so y1 ~ x1 would be the matching orientation -- confirm.
attach(data)
Lm1 <- lm(formula = x1 ~ y1)
print(summary(Lm1))

## 
## Call:
## lm(formula = x1 ~ y1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6522 -1.5117 -0.2657  1.2341  3.8946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.9975     2.4344  -0.410  0.69156   
## y1            1.3328     0.3142   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217

# Linear model for dataset 2; x2 and y2 are looked up by bare name, so this
# relies on the columns of `data` being on the search path (attach(data)).
# NOTE(review): regresses x2 on y2, not y2 on x2 -- confirm intended direction.
Lm2 <- lm(formula = x2 ~ y2)
print(summary(Lm2))

## 
## Call:
## lm(formula = x2 ~ y2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8516 -1.4315 -0.3440  0.8467  4.2017 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -0.9948     2.4354  -0.408  0.69246   
## y2            1.3325     0.3144   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.02 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179

# Linear model for dataset 3; x3 and y3 are looked up by bare name, so this
# relies on the columns of `data` being on the search path (attach(data)).
# NOTE(review): regresses x3 on y3, not y3 on x3 -- confirm intended direction.
Lm3 <- lm(formula = x3 ~ y3)
print(summary(Lm3))

## 
## Call:
## lm(formula = x3 ~ y3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9869 -1.3733 -0.0266  1.3200  3.2133 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -1.0003     2.4362  -0.411  0.69097   
## y3            1.3334     0.3145   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.019 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176

# Linear model for dataset 4; x4 and y4 are looked up by bare name, so this
# relies on the columns of `data` being on the search path (attach(data)).
# NOTE(review): regresses x4 on y4, not y4 on x4 -- confirm intended direction.
Lm4 <- lm(formula = x4 ~ y4)
print(summary(Lm4))

## 
## Call:
## lm(formula = x4 ~ y4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7859 -1.4122 -0.1853  1.4551  3.3329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -1.0036     2.4349  -0.412  0.68985   
## y4            1.3337     0.3143   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.018 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165

Now combine the last two tasks. Create a four panel scatter plot matrix that has both the data points and the regression lines. (hint: the model objects will carry over chunks!)

# Four-panel figure: each Anscombe pair as a scatter plot with its
# least-squares line overlaid.
#
# Columns are referenced through `data` explicitly, so no attach() is needed
# here (attaching the same data frame a second time only produces a
# "masked from data" message).
#
# Each panel has x on the horizontal axis and y on the vertical axis, so the
# overlaid line must come from regressing y on x. abline() reads the model's
# intercept and slope in that orientation; a line from lm(x ~ y) would be
# drawn with the wrong intercept and slope for these axes.
par(mfrow = c(2, 2))
plot(data$x1, data$y1, main = "Scatterplot for x1 vs y1",
     xlab = "x1", ylab = "y1", pch = 20)
abline(lm(y1 ~ x1, data = data))
plot(data$x2, data$y2, main = "Scatterplot for x2 vs y2",
     xlab = "x2", ylab = "y2", pch = 20)
abline(lm(y2 ~ x2, data = data))
plot(data$x3, data$y3, main = "Scatterplot for x3 vs y3 ",
     xlab = "x3", ylab = "y3", pch = 20)
abline(lm(y3 ~ x3, data = data))
plot(data$x4, data$y4, main = "Scatterplot for x4 vs y4",
     xlab = "x4", ylab = "y4", pch = 20)
abline(lm(y4 ~ x4, data = data))

Now compare the model fits for each model object.

The linear regression model is a good fit for the data in dataset 1, but it is a poor fit for dataset 2, where the relationship appears to be nonlinear. In figure 3, only one data point lies on the fitted line while another point lies far from the regression line, so a linear model is inappropriate. Figure 4 shows the points clustered at the same x value, plus a single outlier. If the data are accurate and reliable, we could use a linear regression model.

In text, summarize the lesson of Anscombe’s Quartet and what it says about the value of data visualization.

All four datasets in Anscombe’s Quartet share nearly identical summary statistics but show very different visual patterns. Thus, one should not rely only on summary statistics to identify the true patterns in the data. Anscombe’s Quartet teaches us the value of data visualization: plotting the data is an essential part of exploring it during data analysis.

ANLY 512 Problem Set 5

Anscombe’s quartet

Samkit Anjal Dhanki

2023-03-13

Objectives

Questions