ANLY 512-50 - Problem Set 2

Questions

Anscombe's quartet is a set of four $x, y$ data sets published by Francis Anscombe in his 1973 paper "Graphs in Statistical Analysis". For this first question, load the anscombe data that is part of library(datasets) in R, and assign that data to a new object called data.

# Attach the built-in datasets package and keep a working copy of
# Anscombe's quartet under the name used throughout this document.
library(datasets)
dataanscombe <- datasets::anscombe

Summarise the data by calculating the mean and variance for each column and the correlation between each pair (e.g. x1 and y1, x2 and y2, etc.). (Hint: use the fBasics package!)

library(fBasics)

## Loading required package: timeDate

## Loading required package: timeSeries

##

## Rmetrics Package fBasics

## Analysing Markets and calculating Basic Statistics

## Copyright (C) 2005-2014 Rmetrics Association Zurich

## Educational Software for Financial Engineering and Computational Science

## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.

## https://www.rmetrics.org --- Mail to: info@rmetrics.org

# Mean of every column (x1..x4, y1..y4) of the quartet.
colMeans(dataanscombe)

##       x1       x2       x3       x4       y1       y2       y3       y4 
## 9.000000 9.000000 9.000000 9.000000 7.500909 7.500909 7.500000 7.500909

# Variance of every column. colVars() is not base R; presumably it comes from
# the fBasics/timeSeries packages attached above (confirm).
colVars(dataanscombe)

##        x1        x2        x3        x4        y1        y2        y3 
## 11.000000 11.000000 11.000000 11.000000  4.127269  4.127629  4.122620 
##        y4 
##  4.123249

# Pearson correlation test for the first pair, selected by column name
# instead of by column position.
pearsonTest(dataanscombe$x1, dataanscombe$y1)

## 
## Title:
##  Pearson's Correlation Test
## 
## Test Results:
##   PARAMETER:
##     Degrees of Freedom: 9
##   SAMPLE ESTIMATES:
##     Correlation: 0.8164
##   STATISTIC:
##     t: 4.2415
##   P VALUE:
##     Alternative Two-Sided: 0.00217 
##     Alternative      Less: 0.9989 
##     Alternative   Greater: 0.001085 
##   CONFIDENCE INTERVAL:
##     Two-Sided: 0.4244, 0.9507
##          Less: -1, 0.9388
##       Greater: 0.5113, 1
## 
## Description:
##  Mon Jul 31 22:46:10 2017

# Pearson correlation test for the second pair, selected by column name
# instead of by column position.
pearsonTest(dataanscombe$x2, dataanscombe$y2)

## 
## Title:
##  Pearson's Correlation Test
## 
## Test Results:
##   PARAMETER:
##     Degrees of Freedom: 9
##   SAMPLE ESTIMATES:
##     Correlation: 0.8162
##   STATISTIC:
##     t: 4.2386
##   P VALUE:
##     Alternative Two-Sided: 0.002179 
##     Alternative      Less: 0.9989 
##     Alternative   Greater: 0.001089 
##   CONFIDENCE INTERVAL:
##     Two-Sided: 0.4239, 0.9506
##          Less: -1, 0.9387
##       Greater: 0.5109, 1
## 
## Description:
##  Mon Jul 31 22:46:10 2017

# Pearson correlation test for the third pair, selected by column name
# instead of by column position.
pearsonTest(dataanscombe$x3, dataanscombe$y3)

## 
## Title:
##  Pearson's Correlation Test
## 
## Test Results:
##   PARAMETER:
##     Degrees of Freedom: 9
##   SAMPLE ESTIMATES:
##     Correlation: 0.8163
##   STATISTIC:
##     t: 4.2394
##   P VALUE:
##     Alternative Two-Sided: 0.002176 
##     Alternative      Less: 0.9989 
##     Alternative   Greater: 0.001088 
##   CONFIDENCE INTERVAL:
##     Two-Sided: 0.4241, 0.9507
##          Less: -1, 0.9387
##       Greater: 0.511, 1
## 
## Description:
##  Mon Jul 31 22:46:10 2017

# Pearson correlation test for the fourth pair, selected by column name
# instead of by column position.
pearsonTest(dataanscombe$x4, dataanscombe$y4)

## 
## Title:
##  Pearson's Correlation Test
## 
## Test Results:
##   PARAMETER:
##     Degrees of Freedom: 9
##   SAMPLE ESTIMATES:
##     Correlation: 0.8165
##   STATISTIC:
##     t: 4.243
##   P VALUE:
##     Alternative Two-Sided: 0.002165 
##     Alternative      Less: 0.9989 
##     Alternative   Greater: 0.001082 
##   CONFIDENCE INTERVAL:
##     Two-Sided: 0.4246, 0.9507
##          Less: -1, 0.9388
##       Greater: 0.5115, 1
## 
## Description:
##  Mon Jul 31 22:46:10 2017

# Pearson correlation of each x/y pair, with the columns picked by name.
x1y1 <- cor(dataanscombe[["x1"]], dataanscombe[["y1"]])
x2y2 <- cor(dataanscombe[["x2"]], dataanscombe[["y2"]])
x3y3 <- cor(dataanscombe[["x3"]], dataanscombe[["y3"]])
x4y4 <- cor(dataanscombe[["x4"]], dataanscombe[["y4"]])

# Stack the four scalars into a 4 x 1 matrix whose row names identify the pair.
correl1 <- rbind(
  x1y1 = x1y1,
  x2y2 = x2y2,
  x3y3 = x3y3,
  x4y4 = x4y4
)
correl1

##           [,1]
## x1y1 0.8164205
## x2y2 0.8162365
## x3y3 0.8162867
## x4y4 0.8165214

Create scatter plots for each $x, y$ pair of data.

# Draw one base-graphics scatter plot per x/y pair. The axis titles are set
# explicitly to the text plot() would otherwise derive from its arguments.
invisible(Map(
  function(x_col, y_col) {
    plot(
      dataanscombe[[x_col]], dataanscombe[[y_col]],
      xlab = paste0("dataanscombe$", x_col),
      ylab = paste0("dataanscombe$", y_col)
    )
  },
  paste0("x", 1:4),
  paste0("y", 1:4)
))

Now change the symbols on the scatter plots to solid circles and plot them together as a four-panel graphic.

library(ggplot2)
library(gridExtra)

# Build one scatter panel of the quartet.
#
# x_name, y_name: names (strings) of the columns of `data` to plot.
# title:          panel title, centred above the plot.
# data:           data frame holding the columns; defaults to dataanscombe.
# Returns a ggplot object; nothing is drawn until it is printed or arranged.
anscombe_panel <- function(x_name, y_name, title, data = dataanscombe) {
  ggplot(data, aes(.data[[x_name]], .data[[y_name]])) +
    # shape 19 is the solid circle the task asks for (also geom_point's
    # default), spelled out so the intent is visible.
    geom_point(shape = 19, color = "blue", size = 2) +
    # Axis titles are set explicitly so they read "x1"/"y1" however the
    # mapping expression would be deparsed.
    labs(title = title, x = x_name, y = y_name) +
    theme(
      plot.title = element_text(hjust = 0.5),
      panel.border = element_rect(fill = NA)
    ) +
    scale_x_continuous(breaks = seq(0, 20, 2)) +
    scale_y_continuous(breaks = seq(0, 12, 2))
}

plot1 <- anscombe_panel("x1", "y1", "plot1")
plot2 <- anscombe_panel("x2", "y2", "plot2")
plot3 <- anscombe_panel("x3", "y3", "plot3")
plot4 <- anscombe_panel("x4", "y4", "plot4")

# Lay the four panels out on one page.
grid.arrange(plot1, plot2, plot3, plot4)

Now fit a linear model to each data set using the lm() function.

# Fit one simple linear regression per data set: y_k regressed on x_k.
# NOTE(review): each formula spells out dataanscombe$... instead of passing
# data = dataanscombe; the printed Call and coefficient labels echo that
# spelling, so switching would change the recorded output.
lmx1y1 <- lm(dataanscombe$y1 ~ dataanscombe$x1)
lmx2y2 <- lm(dataanscombe$y2 ~ dataanscombe$x2)
lmx3y3 <- lm(dataanscombe$y3 ~ dataanscombe$x3)
lmx4y4 <- lm(dataanscombe$y4 ~ dataanscombe$x4)
# Residual quantiles, coefficient table and R-squared for the first fit.
summary(lmx1y1)

## 
## Call:
## lm(formula = dataanscombe$y1 ~ dataanscombe$x1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       3.0001     1.1247   2.667  0.02573 * 
## dataanscombe$x1   0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217

# Regression summary for the second data set's fit (y2 on x2).
summary(lmx2y2)

## 
## Call:
## lm(formula = dataanscombe$y2 ~ dataanscombe$x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9009 -0.7609  0.1291  0.9491  1.2691 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        3.001      1.125   2.667  0.02576 * 
## dataanscombe$x2    0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179

# Regression summary for the third data set's fit (y3 on x3).
summary(lmx3y3)

## 
## Call:
## lm(formula = dataanscombe$y3 ~ dataanscombe$x3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1586 -0.6146 -0.2303  0.1540  3.2411 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       3.0025     1.1245   2.670  0.02562 * 
## dataanscombe$x3   0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176

# Regression summary for the fourth data set's fit (y4 on x4).
summary(lmx4y4)

## 
## Call:
## lm(formula = dataanscombe$y4 ~ dataanscombe$x4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       3.0017     1.1239   2.671  0.02559 * 
## dataanscombe$x4   0.4999     0.1178   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165

Now combine the last two tasks. Create a four-panel scatter plot matrix that has both the data points and the regression lines. (Hint: the model objects will carry over between chunks!)

# Build one scatter panel of the quartet with its least-squares line.
#
# x_name, y_name: names (strings) of the columns of `data` to plot.
# title:          panel title, centred above the plot.
# data:           data frame holding the columns; defaults to dataanscombe.
# Returns a ggplot object; nothing is drawn until it is printed or arranged.
anscombe_fit_panel <- function(x_name, y_name, title, data = dataanscombe) {
  # The mapping is declared once on the plot so both layers share it.
  ggplot(data, aes(.data[[x_name]], .data[[y_name]])) +
    geom_point(color = "red", size = 2) +
    # Axis titles are set explicitly so they read "x1"/"y1" however the
    # mapping expression would be deparsed.
    labs(title = title, x = x_name, y = y_name) +
    theme(
      plot.title = element_text(hjust = 0.5),
      panel.border = element_rect(fill = NA)
    ) +
    scale_x_continuous(breaks = seq(0, 20, 2)) +
    scale_y_continuous(breaks = seq(0, 12, 2)) +
    # formula = y ~ x is the default for method = "lm"; stating it keeps
    # geom_smooth() from emitting its "using formula" message.
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE)
}

plot1 <- anscombe_fit_panel("x1", "y1", "plot1")
plot2 <- anscombe_fit_panel("x2", "y2", "plot2")
plot3 <- anscombe_fit_panel("x3", "y3", "plot3")
plot4 <- anscombe_fit_panel("x4", "y4", "plot4")

# Lay the four panels out on one page.
grid.arrange(plot1, plot2, plot3, plot4)

Now compare the model fits for each model object.

# fit.models() groups the four lm fits into one object; summary() of that
# object prints their residuals, coefficients and R-squared model by model.
library(fit.models)
modelfit <- fit.models(lmx1y1,lmx2y2,lmx3y3,lmx4y4)
summary(modelfit)

Calls: lmx1y1: lm(formula = dataanscombe$y1 ~ dataanscombe$x1) lmx2y2: lm(formula = dataanscombe$y2 ~ dataanscombe$x2) lmx3y3: lm(formula = dataanscombe$y3 ~ dataanscombe$x3) lmx4y4: lm(formula = dataanscombe$y4 ~ dataanscombe$x4)

Residual Statistics: Min 1Q Median 3Q Max lmx1y1: -1.921 -0.4558 -4.136e-02 0.7094 1.839 lmx2y2: -1.901 -0.7609 1.291e-01 0.9491 1.269 lmx3y3: -1.159 -0.6146 -2.303e-01 0.1540 3.241 lmx4y4: -1.751 -0.8310 1.110e-16 0.8090 1.839

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept): lmx1y1: 3.0001 1.1247 2.667 0.02573 * lmx2y2: 3.0009 1.1253 2.667 0.02576 * lmx3y3: 3.0025 1.1245 2.670 0.02562 * lmx4y4: 3.0017 1.1239 2.671 0.02559 *

dataanscombe$x1: lmx1y1: 0.5001 0.1179 4.241 0.00217 ** lmx2y2:
lmx3y3:
lmx4y4:

dataanscombe$x2: lmx1y1:
lmx2y2: 0.5000 0.1180 4.239 0.00218 ** lmx3y3:
lmx4y4:

dataanscombe$x3: lmx1y1:
lmx2y2:
lmx3y3: 0.4997 0.1179 4.239 0.00218 ** lmx4y4:

dataanscombe$x4: lmx1y1:
lmx2y2:
lmx3y3:
lmx4y4: 0.4999 0.1178 4.243 0.00216 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual Scale Estimates: lmx1y1: 1.237 on 9 degrees of freedom lmx2y2: 1.237 on 9 degrees of freedom lmx3y3: 1.236 on 9 degrees of freedom lmx4y4: 1.236 on 9 degrees of freedom

Multiple R-squared: lmx1y1: 0.6665 lmx2y2: 0.6662 lmx3y3: 0.6663 lmx4y4: 0.6667

In text, summarize the lesson of Anscombe’s Quartet and what it says about the value of data visualization.

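Dataset 1: a set of points following a roughly linear relationship. Dataset 2: not a linear relationship, but a curved (polynomial-like) one. Dataset 3: a tighter linear relationship with one outlier. Dataset 4: the same x value for every point except one outlier. The lesson: although all four data sets have nearly identical means, variances, correlations and fitted regression lines, their scatter plots look completely different, so summary statistics alone can mislead and the data should always be visualized.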

ANLY 512-50 - Problem Set 2

Anscombe’s quartet

Mohnish K Singh

2017-07-31

Objectives

Questions