Part I

Please put the answers for Part I next to the question number (2 pts each):

  1. B
  2. A
  3. A
  4. C
  5. B
  6. D

7a. Describe the two distributions (2 pts).

Graph A: a unimodal distribution with a right skew and a large spread. Graph B: an approximately normal distribution with a much smaller spread.

7b. Explain why the means of these two distributions are similar but the standard deviations are not (2 pts).

Graph A shows the distribution of a single observed variable, while Graph B is the sampling distribution of sample means, each mean computed from 30 observations. Both distributions are centered near the same value, so their means are similar, but the standard deviation of a sampling distribution of means is the standard error, sd/sqrt(n), which is much smaller than the standard deviation of the original variable.

7c. What is the statistical principle that describes this phenomenon (2 pts)?

This phenomenon is described by the Central Limit Theorem.
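
As an illustration (a minimal simulation sketch, not part of the graded answer; the seed and the exponential population are arbitrary choices, the latter picked for its right skew): repeated samples of size 30 from a skewed variable give sample means whose distribution is approximately normal, centered at the same mean, with standard deviation close to sd(x)/sqrt(30).

set.seed(1)                        # arbitrary seed for reproducibility
pop <- rexp(10000, rate = 1)       # right-skewed variable, like Graph A
means <- replicate(1000, mean(sample(pop, 30)))  # sampling distribution, like Graph B
mean(pop); mean(means)             # the two means are similar
sd(pop); sd(means)                 # sd(means) is much smaller...
sd(pop) / sqrt(30)                 # ...close to the standard error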

Part II

Consider the four datasets, each with two columns (x and y), provided below.

options(digits=2)
data1 <- data.frame(x=c(10,8,13,9,11,14,6,4,12,7,5),
                    y=c(8.04,6.95,7.58,8.81,8.33,9.96,7.24,4.26,10.84,4.82,5.68))
data2 <- data.frame(x=c(10,8,13,9,11,14,6,4,12,7,5),
                    y=c(9.14,8.14,8.74,8.77,9.26,8.1,6.13,3.1,9.13,7.26,4.74))
data3 <- data.frame(x=c(10,8,13,9,11,14,6,4,12,7,5),
                    y=c(7.46,6.77,12.74,7.11,7.81,8.84,6.08,5.39,8.15,6.42,5.73))
data4 <- data.frame(x=c(8,8,8,8,8,8,8,19,8,8,8),
                    y=c(6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.5,5.56,7.91,6.89))

For each column, calculate (to two decimal places):

a. The mean (for x and y separately; 1 pt).

d1.x <- round(mean(data1$x), 2)
d1.y <- round(mean(data1$y), 2)
d2.x <- round(mean(data2$x), 2)
d2.y <- round(mean(data2$y), 2)
d3.x <- round(mean(data3$x), 2)
d3.y <- round(mean(data3$y), 2)
d4.x <- round(mean(data4$x), 2)
d4.y <- round(mean(data4$y), 2)

meandf <- data.frame(set = c(1, 2, 3, 4),
                     mean.x = c(d1.x, d2.x, d3.x, d4.x),
                     mean.y = c(d1.y, d2.y, d3.y, d4.y))
meandf
##   set mean.x mean.y
## 1   1      9    7.5
## 2   2      9    7.5
## 3   3      9    7.5
## 4   4      9    7.5
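
The same table can also be built more compactly by looping over the four data frames; a sketch equivalent to the block above (the same pattern works for the median and standard deviation tables below):

datasets <- list(data1, data2, data3, data4)
meandf <- data.frame(
  set    = 1:4,
  mean.x = sapply(datasets, function(d) round(mean(d$x), 2)),
  mean.y = sapply(datasets, function(d) round(mean(d$y), 2))
)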

b. The median (for x and y separately; 1 pt).

d1.x <- round(median(data1$x), 2)
d1.y <- round(median(data1$y), 2)
d2.x <- round(median(data2$x), 2)
d2.y <- round(median(data2$y), 2)
d3.x <- round(median(data3$x), 2)
d3.y <- round(median(data3$y), 2)
d4.x <- round(median(data4$x), 2)
d4.y <- round(median(data4$y), 2)

mediandf <- data.frame(set = c(1, 2, 3, 4),
                       median.x = c(d1.x, d2.x, d3.x, d4.x),
                       median.y = c(d1.y, d2.y, d3.y, d4.y))
mediandf
##   set median.x median.y
## 1   1        9      7.6
## 2   2        9      8.1
## 3   3        9      7.1
## 4   4        8      7.0

c. The standard deviation (for x and y separately; 1 pt).

d1.x <- round(sd(data1$x), 2)
d1.y <- round(sd(data1$y), 2)
d2.x <- round(sd(data2$x), 2)
d2.y <- round(sd(data2$y), 2)
d3.x <- round(sd(data3$x), 2)
d3.y <- round(sd(data3$y), 2)
d4.x <- round(sd(data4$x), 2)
d4.y <- round(sd(data4$y), 2)

sd.df <- data.frame(set = c(1, 2, 3, 4),
                    sd.x = c(d1.x, d2.x, d3.x, d4.x),
                    sd.y = c(d1.y, d2.y, d3.y, d4.y))
sd.df
##   set sd.x sd.y
## 1   1  3.3    2
## 2   2  3.3    2
## 3   3  3.3    2
## 4   4  3.3    2

For each x and y pair, calculate (also to two decimal places):

d. The correlation (1 pt).

cor.1 <- round(cor(data1$x, data1$y), 2)
cor.2 <- round(cor(data2$x, data2$y), 2)
cor.3 <- round(cor(data3$x, data3$y), 2)
cor.4 <- round(cor(data4$x, data4$y), 2)
cor.df <- data.frame(set=c(1,2,3,4), correlation = c(cor.1, cor.2, cor.3, cor.4))
cor.df
##   set correlation
## 1   1        0.82
## 2   2        0.82
## 3   3        0.82
## 4   4        0.82

e. Linear regression equation (2 pts).

LR1 <- lm(y ~ x, data = data1)
summary(LR1)
## 
## Call:
## lm(formula = y ~ x, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9213 -0.4558 -0.0414  0.7094  1.8388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.000      1.125    2.67   0.0257 * 
## x              0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00217

Data 1 equation: y_hat = 3.00 + 0.50x

LR2 <- lm(y ~ x, data = data2)
summary(LR2)
## 
## Call:
## lm(formula = y ~ x, data = data2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.901 -0.761  0.129  0.949  1.269 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.001      1.125    2.67   0.0258 * 
## x              0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00218

Data 2 equation: y_hat = 3.00 + 0.50x

LR3 <- lm(y ~ x, data = data3)
summary(LR3)
## 
## Call:
## lm(formula = y ~ x, data = data3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.159 -0.615 -0.230  0.154  3.241 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.002      1.124    2.67   0.0256 * 
## x              0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00218

Data 3 equation: y_hat = 3.00 + 0.50x

LR4 <- lm(y ~ x, data = data4)
summary(LR4)
## 
## Call:
## lm(formula = y ~ x, data = data4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.002      1.124    2.67   0.0256 * 
## x              0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.63 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00216

Data 4 equation: y_hat = 3.00 + 0.50x
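
The intercepts and slopes can also be pulled out programmatically instead of being read off each summary; a short sketch using coef() on the four fits above:

# intercept and slope for each fit; all four round to y_hat = 3.00 + 0.50x
models <- list(LR1, LR2, LR3, LR4)
t(sapply(models, function(m) round(coef(m), 2)))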

f. R-Squared (2 pts).

Data 1: R-squared = 0.667
Data 2: R-squared = 0.666
Data 3: R-squared = 0.666
Data 4: R-squared = 0.667
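
These values can also be extracted directly from the fitted models rather than copied from each summary; a sketch using summary()$r.squared:

# R-squared for each fit, to three decimal places
sapply(list(LR1, LR2, LR3, LR4), function(m) round(summary(m)$r.squared, 3))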

For each pair, is it appropriate to estimate a linear regression model? Why or why not? Be specific as to why for each pair and include appropriate plots! (4 pts)

Data 1:

par(mfrow = c(2, 2))

# scatterplot of y vs. x: check for a linear trend
plot(x = data1$x, y = data1$y)

# histogram of residuals: check that residuals are nearly normal
hist(LR1$residuals)

# normal Q-Q plot of residuals: another normality check
qqnorm(LR1$residuals)
qqline(LR1$residuals)

# residuals vs. x: check for constant variability around zero
plot(LR1$residuals ~ data1$x)
abline(h = 0, lty = 4)

Yes. The scatterplot shows a roughly linear trend, the histogram and Q-Q plot show nearly normal residuals, and the residual plot shows no obvious pattern, so a linear regression model is appropriate.

Data 2:

par(mfrow = c(2, 2))

# scatterplot of y vs. x: check for a linear trend
plot(x = data2$x, y = data2$y)

# histogram of residuals: check that residuals are nearly normal
hist(LR2$residuals)

# normal Q-Q plot of residuals: another normality check
qqnorm(LR2$residuals)
qqline(LR2$residuals)

# residuals vs. x: check for constant variability around zero
plot(LR2$residuals ~ data2$x)
abline(h = 0, lty = 4)

No. The scatterplot shows a clearly curved (roughly quadratic) relationship, and the residuals form a strong arch-shaped pattern against x, so a straight-line model is not appropriate.

Data 3:

par(mfrow = c(2, 2))

# scatterplot of y vs. x: check for a linear trend
plot(x = data3$x, y = data3$y)

# histogram of residuals: check that residuals are nearly normal
hist(LR3$residuals)

# normal Q-Q plot of residuals: another normality check
qqnorm(LR3$residuals)
qqline(LR3$residuals)

# residuals vs. x: check for constant variability around zero
plot(LR3$residuals ~ data3$x)
abline(h = 0, lty = 4)

Mostly. The points fall almost exactly on a straight line except for one large outlier, which pulls the fitted line away from the rest of the data; a linear model is reasonable only after that outlier is examined.

Data 4:

par(mfrow = c(2, 2))

# scatterplot of y vs. x: check for a linear trend
plot(x = data4$x, y = data4$y)

# histogram of residuals: check that residuals are nearly normal
hist(LR4$residuals)

# normal Q-Q plot of residuals: another normality check
qqnorm(LR4$residuals)
qqline(LR4$residuals)

# residuals vs. x: check for constant variability around zero
plot(LR4$residuals ~ data4$x)
abline(h = 0, lty = 4)

No. Every x value is 8 except a single high-leverage point at x = 19, so the fitted line is determined entirely by that one point; a linear regression model is not appropriate.

Explain why it is important to include appropriate visualizations when analyzing data. Include any visualization(s) you create. (2 pts)

Visualizations reveal what summary statistics hide: the shape of a relationship, curvature, outliers, and leverage points. These four datasets (Anscombe's quartet) have nearly identical means, standard deviations, correlations, and regression equations, yet the plots show four completely different patterns. Looking only at the data frames or the summary tables, you could not tell them apart.
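
One figure makes the point. A sketch (reusing the data frames and fits from Part II) that draws all four scatterplots with their nearly identical fitted lines:

# four very different scatter patterns, one (essentially) shared regression line
par(mfrow = c(2, 2))
datasets <- list(data1, data2, data3, data4)
models   <- list(LR1, LR2, LR3, LR4)
for (i in 1:4) {
  plot(datasets[[i]]$x, datasets[[i]]$y,
       xlab = "x", ylab = "y", main = paste("Data", i))
  abline(models[[i]])
}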