Part II

Consider the four datasets, each with two columns (x and y), provided below.

# Print numbers with two significant digits for the rest of the document.
# Note: `digits` controls significant digits, not decimal places.
options(digits = 2)

# Four datasets, each holding eleven paired (x, y) observations.
data1 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)
data2 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
)
data3 <- data.frame(
  x = c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5),
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)
data4 <- data.frame(
  x = c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
)

For each column, calculate (to two decimal places):

a. The mean (for x and y separately; 1 pt).

# Mean of x and y for each dataset.
# The full-precision values are stored in the *_mean variables. They are
# displayed with sprintf("%.2f", ...) because options(digits = 2) limits
# printing to two significant digits, not the two decimal places requested.
data1_x_mean <- mean(data1$x)
data1_y_mean <- mean(data1$y)
sprintf("%.2f", data1_x_mean)

## [1] "9.00"

sprintf("%.2f", data1_y_mean)

## [1] "7.50"

data2_x_mean <- mean(data2$x)
data2_y_mean <- mean(data2$y)
sprintf("%.2f", data2_x_mean)

## [1] "9.00"

sprintf("%.2f", data2_y_mean)

## [1] "7.50"

data3_x_mean <- mean(data3$x)
data3_y_mean <- mean(data3$y)
sprintf("%.2f", data3_x_mean)

## [1] "9.00"

sprintf("%.2f", data3_y_mean)

## [1] "7.50"

data4_x_mean <- mean(data4$x)
data4_y_mean <- mean(data4$y)
sprintf("%.2f", data4_x_mean)

## [1] "9.00"

sprintf("%.2f", data4_y_mean)

## [1] "7.50"

b. The median (for x and y separately; 1 pt).

# Median of x and y for each dataset.
# The full-precision values are stored in the *_median variables. They are
# displayed with sprintf("%.2f", ...) because options(digits = 2) limits
# printing to two significant digits (7.58 would print as 7.6), not the two
# decimal places requested.
data1_x_median <- median(data1$x)
data1_y_median <- median(data1$y)
sprintf("%.2f", data1_x_median)

## [1] "9.00"

sprintf("%.2f", data1_y_median)

## [1] "7.58"

data2_x_median <- median(data2$x)
data2_y_median <- median(data2$y)
sprintf("%.2f", data2_x_median)

## [1] "9.00"

sprintf("%.2f", data2_y_median)

## [1] "8.14"

data3_x_median <- median(data3$x)
data3_y_median <- median(data3$y)
sprintf("%.2f", data3_x_median)

## [1] "9.00"

sprintf("%.2f", data3_y_median)

## [1] "7.11"

data4_x_median <- median(data4$x)
data4_y_median <- median(data4$y)
sprintf("%.2f", data4_x_median)

## [1] "8.00"

sprintf("%.2f", data4_y_median)

## [1] "7.04"

c. The standard deviation (for x and y separately; 1 pt).

# Sample standard deviation of x and y for each dataset.
# The full-precision values are stored in the *_sd variables. They are
# displayed with sprintf("%.2f", ...) because options(digits = 2) limits
# printing to two significant digits (2.03 would print as 2), not the two
# decimal places requested.
data1_x_sd <- sd(data1$x)
data1_y_sd <- sd(data1$y)
sprintf("%.2f", data1_x_sd)

## [1] "3.32"

sprintf("%.2f", data1_y_sd)

## [1] "2.03"

data2_x_sd <- sd(data2$x)
data2_y_sd <- sd(data2$y)
sprintf("%.2f", data2_x_sd)

## [1] "3.32"

sprintf("%.2f", data2_y_sd)

## [1] "2.03"

data3_x_sd <- sd(data3$x)
data3_y_sd <- sd(data3$y)
sprintf("%.2f", data3_x_sd)

## [1] "3.32"

sprintf("%.2f", data3_y_sd)

## [1] "2.03"

data4_x_sd <- sd(data4$x)
data4_y_sd <- sd(data4$y)
sprintf("%.2f", data4_x_sd)

## [1] "3.32"

sprintf("%.2f", data4_y_sd)

## [1] "2.03"

For each x and y pair, calculate (also to two decimal places; 1 pt):

d. The correlation (1 pt).

# Correlation between x and y within each dataset (cor() defaults to Pearson).
data1_cor <- with(data1, cor(x, y))
data1_cor

## [1] 0.82

data2_cor <- with(data2, cor(x, y))
data2_cor

## [1] 0.82

data3_cor <- with(data3, cor(x, y))
data3_cor

## [1] 0.82

data4_cor <- with(data4, cor(x, y))
data4_cor

## [1] 0.82

e. Linear regression equation (2 pts).

# Simple linear regression of y on x, fitted separately for each dataset.
# Printing the fitted object shows the intercept and slope; in all four cases
# the printed equation is y = 3.0 + 0.5 * x.
data1_lm <- lm(data1$y ~ data1$x)
data1_lm

## 
## Call:
## lm(formula = data1$y ~ data1$x)
## 
## Coefficients:
## (Intercept)      data1$x  
##         3.0          0.5

data2_lm <- lm(data2$y ~ data2$x)
data2_lm

## 
## Call:
## lm(formula = data2$y ~ data2$x)
## 
## Coefficients:
## (Intercept)      data2$x  
##         3.0          0.5

data3_lm <- lm(data3$y ~ data3$x)
data3_lm

## 
## Call:
## lm(formula = data3$y ~ data3$x)
## 
## Coefficients:
## (Intercept)      data3$x  
##         3.0          0.5

data4_lm <- lm(data4$y ~ data4$x)
data4_lm

## 
## Call:
## lm(formula = data4$y ~ data4$x)
## 
## Coefficients:
## (Intercept)      data4$x  
##         3.0          0.5

f. R-Squared (2 pts).

# Model summaries for the four fits. R-squared appears in each printout as
# "Multiple R-squared"; it is also extracted explicitly after each summary and
# shown to two decimal places, since that is the quantity being asked for.
summary(data1_lm)

## 
## Call:
## lm(formula = data1$y ~ data1$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9213 -0.4558 -0.0414  0.7094  1.8388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.000      1.125    2.67   0.0257 * 
## data1$x        0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00217

sprintf("%.2f", summary(data1_lm)$r.squared)

## [1] "0.67"

summary(data2_lm)

## 
## Call:
## lm(formula = data2$y ~ data2$x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.901 -0.761  0.129  0.949  1.269 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.001      1.125    2.67   0.0258 * 
## data2$x        0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00218

sprintf("%.2f", summary(data2_lm)$r.squared)

## [1] "0.67"

summary(data3_lm)

## 
## Call:
## lm(formula = data3$y ~ data3$x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.159 -0.615 -0.230  0.154  3.241 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.002      1.124    2.67   0.0256 * 
## data3$x        0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.629 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00218

sprintf("%.2f", summary(data3_lm)$r.squared)

## [1] "0.67"

summary(data4_lm)

## 
## Call:
## lm(formula = data4$y ~ data4$x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.002      1.124    2.67   0.0256 * 
## data4$x        0.500      0.118    4.24   0.0022 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.2 on 9 degrees of freedom
## Multiple R-squared:  0.667,  Adjusted R-squared:  0.63 
## F-statistic:   18 on 1 and 9 DF,  p-value: 0.00216

sprintf("%.2f", summary(data4_lm)$r.squared)

## [1] "0.67"

For each pair, is it appropriate to estimate a linear regression model? Why or why not? Be specific as to why for each pair and include appropriate plots! (4 pts)

Data 1 Plots

# Linearity: scatterplot of y against x with the fitted line overlaid
plot(data1$x, data1$y)
abline(data1_lm)

# Residuals against x, with a dotted reference line at zero
plot(data1$x, data1_lm$residuals)
abline(h = 0, lty = "dotted")

# Nearly normal residuals: histogram of the residuals
hist(data1_lm$residuals)

# Nearly normal residuals: normal Q-Q plot with its reference line
qqnorm(data1_lm$residuals)
qqline(data1_lm$residuals)

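Data 1 appears linear: the points follow a roughly straight-line trend and the residuals scatter around zero with no obvious pattern, so a linear regression model is appropriate.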

Data 2 Plots

# Linearity: scatterplot of y against x with the fitted line overlaid
plot(data2$x, data2$y)
abline(data2_lm)

# Residuals against x, with a dotted reference line at zero
plot(data2$x, data2_lm$residuals)
abline(h = 0, lty = "dotted")

# Nearly normal residuals: histogram of the residuals
hist(data2_lm$residuals)

# Nearly normal residuals: normal Q-Q plot with its reference line
qqnorm(data2_lm$residuals)
qqline(data2_lm$residuals)

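According to several websites, Data 2 follows a parabola, which is a curve on which every point is equidistant from a fixed point (the focus) and a fixed straight line (the directrix). Because the relationship between x and y is curved rather than linear, a linear regression model is not appropriate.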

Data 3 Plots

# Linearity: scatterplot of y against x with the fitted line overlaid
plot(data3$x, data3$y)
abline(data3_lm)

# Residuals against x, with a dotted reference line at zero
plot(data3$x, data3_lm$residuals)
abline(h = 0, lty = "dotted")

# Nearly normal residuals: histogram of the residuals
hist(data3_lm$residuals)

# Nearly normal residuals: normal Q-Q plot with its reference line
qqnorm(data3_lm$residuals)
qqline(data3_lm$residuals)

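Data 3 seems to be skewed by one outlier (x = 13, y = 12.74). The remaining points lie close to a straight line, but the outlier distorts the fit, so the linear model is questionable unless that point is addressed.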

Data 4 Plots

# Linearity: scatterplot of y against x with the fitted line overlaid
plot(data4$x, data4$y)
abline(data4_lm)

# Residuals against x, with a dotted reference line at zero
plot(data4$x, data4_lm$residuals)
abline(h = 0, lty = "dotted")

# Nearly normal residuals: histogram of the residuals
hist(data4_lm$residuals)

# Nearly normal residuals: normal Q-Q plot with its reference line
qqnorm(data4_lm$residuals)
qqline(data4_lm$residuals)

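Data 4 has a single extreme outlier (x = 19); every other observation has x = 8. The fitted line is therefore determined entirely by that one point, so a linear regression model is not appropriate.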

Explain why it is important to include appropriate visualizations when analyzing data. Include any visualization(s) you create. (2 pts)

Based on the examples provided in the exam, we see four datasets whose means, medians, and standard deviations are close (and whose correlations and fitted regression lines are nearly identical). If we didn’t investigate further and create plots, we could present a questionable analysis. Visualizations are needed to reveal how the data are actually structured.

DATA 606 Fall 2017 - Final Exam

Brian K. Liles

Part I