#Weekly Lab 2
### Objectives
#The objectives of this problem set are to orient you to a number of activities in `R` and to conduct a thoughtful exercise in appreciating the importance of data visualization. For each question, create a code chunk or text response that completes/answers the activity or question requested. Finally, upon completion, name your final output `.html` file as: `YourName_ANLY512-Section-Year-Semester.html`, upload it to the "Problem Set 2" assignment on your R Pubs account, and submit the link to Moodle. Points will be deducted for uploading the improper format.
# Load the anscombe data that is part of library(datasets) in R, and assign
# that data to a new object called `data`.
library(datasets)
data <- anscombe
# Print the full table so the four x/y pairs can be inspected.
data
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
## 7 6 6 6 8 7.24 6.13 6.08 5.25
## 8 4 4 4 19 4.26 3.10 5.39 12.50
## 9 12 12 12 8 10.84 9.13 8.15 5.56
## 10 7 7 7 8 4.82 7.26 6.42 7.91
## 11 5 5 5 8 5.68 4.74 5.73 6.89
# ... fBasics() package!)
# install.packages("fBasics")
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
# Split the anscombe columns into four x/y frames, one per pair (x1/y1 ...
# x4/y4), and tag every row with a label naming the pair it came from.
dataA <- data %>%
  select(x = x1, y = y1) %>%
  mutate(group = "DataA")
dataB <- data %>%
  select(x = x2, y = y2) %>%
  mutate(group = "DataB")
dataC <- data %>%
  select(x = x3, y = y3) %>%
  mutate(group = "DataC")
dataD <- data %>%
  select(x = x4, y = y4) %>%
  mutate(group = "DataD")

# Stack the four labelled frames into one long table (columns x, y, group).
data_all <- rbind(dataA, dataB, dataC, dataD)
library("fBasics")
## Loading required package: timeDate
## Loading required package: timeSeries
correlationTest(dataA$x, dataA$y)
##
## Title:
## Pearson's Correlation Test
##
## Test Results:
## PARAMETER:
## Degrees of Freedom: 9
## SAMPLE ESTIMATES:
## Correlation: 0.8164
## STATISTIC:
## t: 4.2415
## P VALUE:
## Alternative Two-Sided: 0.00217
## Alternative Less: 0.9989
## Alternative Greater: 0.001085
## CONFIDENCE INTERVAL:
## Two-Sided: 0.4244, 0.9507
## Less: -1, 0.9388
## Greater: 0.5113, 1
##
## Description:
## Sun Feb 3 09:40:34 2019
correlationTest(dataB$x, dataB$y)
##
## Title:
## Pearson's Correlation Test
##
## Test Results:
## PARAMETER:
## Degrees of Freedom: 9
## SAMPLE ESTIMATES:
## Correlation: 0.8162
## STATISTIC:
## t: 4.2386
## P VALUE:
## Alternative Two-Sided: 0.002179
## Alternative Less: 0.9989
## Alternative Greater: 0.001089
## CONFIDENCE INTERVAL:
## Two-Sided: 0.4239, 0.9506
## Less: -1, 0.9387
## Greater: 0.5109, 1
##
## Description:
## Sun Feb 3 09:40:34 2019
correlationTest(dataC$x, dataC$y)
##
## Title:
## Pearson's Correlation Test
##
## Test Results:
## PARAMETER:
## Degrees of Freedom: 9
## SAMPLE ESTIMATES:
## Correlation: 0.8163
## STATISTIC:
## t: 4.2394
## P VALUE:
## Alternative Two-Sided: 0.002176
## Alternative Less: 0.9989
## Alternative Greater: 0.001088
## CONFIDENCE INTERVAL:
## Two-Sided: 0.4241, 0.9507
## Less: -1, 0.9387
## Greater: 0.511, 1
##
## Description:
## Sun Feb 3 09:40:34 2019
correlationTest(dataD$x, dataD$y)
##
## Title:
## Pearson's Correlation Test
##
## Test Results:
## PARAMETER:
## Degrees of Freedom: 9
## SAMPLE ESTIMATES:
## Correlation: 0.8165
## STATISTIC:
## t: 4.243
## P VALUE:
## Alternative Two-Sided: 0.002165
## Alternative Less: 0.9989
## Alternative Greater: 0.001082
## CONFIDENCE INTERVAL:
## Two-Sided: 0.4246, 0.9507
## Less: -1, 0.9388
## Greater: 0.5115, 1
##
## Description:
## Sun Feb 3 09:40:34 2019
# Per-group summary: mean and sample variance of x and of y, plus the
# correlation between x and y within each group.
stats_summ <- data_all %>%
  group_by(group) %>%
  summarise(
    `Mean X` = mean(x),
    `Sample Variance X` = var(x),
    `Mean Y` = mean(y),
    `Sample Variance Y` = var(y),
    `Correlation Between X and Y` = cor(x, y)
  )
stats_summ
## # A tibble: 4 x 6
## group `Mean X` `Sample Varianc… `Mean Y` `Sample Varianc…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 DataA 9 11 7.50 4.13
## 2 DataB 9 11 7.50 4.13
## 3 DataC 9 11 7.5 4.12
## 4 DataD 9 11 7.50 4.12
## # ... with 1 more variable: `Correlation Between X and Y` <dbl>
# Draw each x/y pair as its own scatter plot.
plot(dataA$x, dataA$y, main = "Scatter Plot # 1 -> y1, x1")
plot(dataB$x, dataB$y, main = "Scatter Plot # 2 -> y2, x2")
plot(dataC$x, dataC$y, main = "Scatter Plot # 3 -> y3, x3")
plot(dataD$x, dataD$y, main = "Scatter Plot # 4 -> y4, x4")

# Redraw all four pairs in a 2 x 2 grid using solid points (pch = 20).
# par() returns the settings it replaces; keep them so the layout can be put
# back afterwards instead of leaking into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
plot(dataA$x, dataA$y, main = "Scatter Plot # 1 -> y1, x1", pch = 20)
plot(dataB$x, dataB$y, main = "Scatter Plot # 2 -> y2, x2", pch = 20)
plot(dataC$x, dataC$y, main = "Scatter Plot # 3 -> y3, x3", pch = 20)
plot(dataD$x, dataD$y, main = "Scatter Plot # 4 -> y4, x4", pch = 20)
par(old_par)
# Fit a simple linear regression of y on x for each pair using the lm()
# function. The formulas reference the columns directly (dataA$y ~ dataA$x),
# so the coefficient for the slope is labelled "dataA$x" etc. in the output.
model1 <- lm(dataA$y ~ dataA$x)
model2 <- lm(dataB$y ~ dataB$x)
model3 <- lm(dataC$y ~ dataC$x)
model4 <- lm(dataD$y ~ dataD$x)
summary(model1)
##
## Call:
## lm(formula = dataA$y ~ dataA$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## dataA$x 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217
summary(model2)
##
## Call:
## lm(formula = dataB$y ~ dataB$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## dataB$x 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179
summary(model3)
##
## Call:
## lm(formula = dataC$y ~ dataC$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## dataC$x 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176
summary(model4)
##
## Call:
## lm(formula = dataD$y ~ dataD$x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## dataD$x 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165
# Fit y ~ x separately within each group, collect the coefficients rounded to
# two decimals, and reshape them to one row per group with one column per
# coefficient name ("(Intercept)" and "x").
#
# stringsAsFactors = FALSE keeps `var` and `group` as character vectors.
# Without it, each per-group frame can carry a `group` factor with a different
# single level, and the row-binding step warns about unequal factor levels
# before coercing to character anyway.
linear_model <- data_all %>%
  group_by(group) %>%
  do(mod = lm(y ~ x, data = .)) %>%
  do(data.frame(
    var = names(coef(.$mod)),
    coef = round(coef(.$mod), 2),
    group = .$group,
    stringsAsFactors = FALSE
  )) %>%
  dcast(., group ~ var, value.var = "coef")
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Format each group's fitted line as text (e.g. "y=3+0.5x") from the rounded
# intercept and slope in `linear_model`.
# tibble() is used instead of the deprecated data_frame(); it builds the same
# one-column table and keeps the non-syntactic column name as written.
reg_summ <- tibble(
  "Linear Regression" = paste0(
    "y=", linear_model[["(Intercept)"]], "+", linear_model[["x"]], "x"
  )
)

# Append the regression text as an extra column next to the summary statistics.
stats_and_linear_model_summ <- cbind(stats_summ, reg_summ)
stats_and_linear_model_summ
## group Mean X Sample Variance X Mean Y Sample Variance Y
## 1 DataA 9 11 7.500909 4.127269
## 2 DataB 9 11 7.500909 4.127629
## 3 DataC 9 11 7.500000 4.122620
## 4 DataD 9 11 7.500909 4.123249
## Correlation Between X and Y Linear Regression
## 1 0.8164205 y=3+0.5x
## 2 0.8162365 y=3+0.5x
## 3 0.8162867 y=3+0.5x
## 4 0.8165214 y=3+0.5x
# One panel per group: red filled points with a blue fitted lm line (no
# confidence band), each panel using its own axis scales.
ggplot(data = data_all, mapping = aes(x, y)) +
  geom_point(shape = 21, color = "red", fill = "red", size = 2) +
  ggtitle("Anscombe's Datasets") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~group, scales = "free")
7. Now compare the model fits for each model object.
anova(model1)
## Analysis of Variance Table
##
## Response: dataA$y
## Df Sum Sq Mean Sq F value Pr(>F)
## dataA$x 1 27.510 27.5100 17.99 0.00217 **
## Residuals 9 13.763 1.5292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2)
## Analysis of Variance Table
##
## Response: dataB$y
## Df Sum Sq Mean Sq F value Pr(>F)
## dataB$x 1 27.500 27.5000 17.966 0.002179 **
## Residuals 9 13.776 1.5307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model3)
## Analysis of Variance Table
##
## Response: dataC$y
## Df Sum Sq Mean Sq F value Pr(>F)
## dataC$x 1 27.470 27.4700 17.972 0.002176 **
## Residuals 9 13.756 1.5285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model4)
## Analysis of Variance Table
##
## Response: dataD$y
## Df Sum Sq Mean Sq F value Pr(>F)
## dataD$x 1 27.490 27.4900 18.003 0.002165 **
## Residuals 9 13.742 1.5269
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anscombe’s Quartet is a dataset/study which proves the importance of data visualization. We get to see the relationships among X and Y pairs and conclude some key takeaways:
The fitted model is essentially identical across the four pairs of x and y values. The mean and variance analysis shows that, for all four pairs: Mean of X = 9, Variance of X = 11, Mean of Y = 7.50, Variance of Y = 4.12 to 4.13, and Correlation between X and Y = 0.816.
From the linear model we know that the intercept is 3 and the slope is 0.5 for all 4 pairs. Hence, the linear equation is: y = 3 + 0.50x
Furthermore, looking at the scatter plots for the four data pairs, we notice that there is a weak linear relationship for DataA, no linear relationship for DataB, a strong linear relationship for DataC (which has an outlier), and no progression in the x values for DataD, where only 2 of the data points coincide with the line.