## Loading required package: ggplot2
## Loading required package: grid
## Loading required package: gridExtra
1 - Data Setup
I initially set the data up as separate data frames for flexibility. I additionally created an aggregate data frame with an extra column, ‘grouping’, to differentiate the source data sets. This aggregated data frame was key to using ggplot2’s faceting capability.
# Data setup - the data is entered manually so that nothing breaks when the code
# is rerun or executed on rpubs; a larger data set would instead be loaded into
# a data frame from a csv file.
df1 <- data.frame(
  x = c(10.00, 8.00, 13.00, 9.00, 11.00, 14.00, 6.00, 4.00, 12.00, 7.00, 5.00),
  y = c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
)
df2 <- data.frame(
  x = c(10.00, 8.00, 13.00, 9.00, 11.00, 14.00, 6.00, 4.00, 12.00, 7.00, 5.00),
  y = c(9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74)
)
df3 <- data.frame(
  x = c(10.00, 8.00, 13.00, 9.00, 11.00, 14.00, 6.00, 4.00, 12.00, 7.00, 5.00),
  y = c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
)
df4 <- data.frame(
  x = c(8.00, 8.00, 8.00, 8.00, 8.00, 8.00, 8.00, 19.00, 8.00, 8.00, 8.00),
  y = c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89)
)

# Stack the four data sets and tag every row with the number (1-4) of the data
# set it came from. The tag is built in a single assignment from the actual row
# counts, so it neither depends on hard-coded row ranges nor on a partially
# filled column being silently recycled to the full length of the data frame.
agg_df2 <- rbind(df1, df2, df3, df4)
agg_df2$grouping <- rep(
  c(1, 2, 3, 4),
  times = c(nrow(df1), nrow(df2), nrow(df3), nrow(df4))
)
2 - High-level group data analysis
The overall scatter plots of the different groups of data are as follows. For this I used individual ggplots and grid.arrange (from gridExtra) to lay out each plot.
# One scatter plot per data set, each titled with its group number. The first
# three get their own colour and point shape; the fourth uses the defaults.
p1 <- ggplot(data = df1, mapping = aes(x = x, y = y)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "1")
p2 <- ggplot(data = df2, mapping = aes(x = x, y = y)) +
  geom_point(color = "green4", size = 3, pch = 17) +
  labs(title = "2")
p3 <- ggplot(data = df3, mapping = aes(x = x, y = y)) +
  geom_point(color = "red", size = 3, pch = 23) +
  labs(title = "3")
p4 <- ggplot(data = df4, mapping = aes(x = x, y = y)) +
  geom_point() +
  labs(title = "4")

# Lay the four plots out in a two-column grid.
grid.arrange(p1, p2, p3, p4, ncol = 2)
Scatter plot using ggplot2 facet_wrap by ‘grouping’
# Scatter plot of the aggregated data, one panel per value of `grouping`.
ggplot(data = agg_df2, mapping = aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~ grouping)
Individual data set analysis
Analysis of the first group:
# Summary statistics for both columns of the first data set.
summary(df1)
## x y
## Min. : 4.0 Min. : 4.260
## 1st Qu.: 6.5 1st Qu.: 6.315
## Median : 9.0 Median : 7.580
## Mean : 9.0 Mean : 7.501
## 3rd Qu.:11.5 3rd Qu.: 8.570
## Max. :14.0 Max. :10.840
# Linear regression of y on x. The formula is spelled out explicitly: when it
# is omitted, lm() derives one from the data frame and ends up regressing x on
# y, which does not match the y ~ x line drawn by geom_smooth(method = "lm").
lm(y ~ x, data = df1)
##
## Call:
## lm(formula = y ~ x, data = df1)
##
## Coefficients:
## (Intercept) x
## 3.0001 0.5001
# Box plot of the first data set, with enlarged outlier points.
b1 <- ggplot(df1, aes(x = x, y = y)) +
  geom_boxplot(outlier.color = NULL, outlier.size = 4) +
  labs(title = "1")
# Using grid.arrange() was key to laying out different plot types next to each
# other. The overall title is passed as `top`, the argument current gridExtra
# releases expect (the older `main` argument is no longer accepted).
grid.arrange(p1 + geom_smooth(method = "lm"), b1, ncol = 2, top = "First Group")
Analysis of the second group:
# Summary statistics for both columns of the second data set.
summary(df2)
## x y
## Min. : 4.0 Min. :3.100
## 1st Qu.: 6.5 1st Qu.:6.695
## Median : 9.0 Median :8.140
## Mean : 9.0 Mean :7.501
## 3rd Qu.:11.5 3rd Qu.:8.950
## Max. :14.0 Max. :9.260
# Linear regression of y on x. The formula is spelled out explicitly: when it
# is omitted, lm() derives one from the data frame and ends up regressing x on
# y, which does not match the y ~ x line drawn by geom_smooth(method = "lm").
lm(y ~ x, data = df2)
##
## Call:
## lm(formula = y ~ x, data = df2)
##
## Coefficients:
## (Intercept) x
## 3.001 0.500
# Box plot of the second data set, with enlarged outlier points.
b2 <- ggplot(df2, aes(x = x, y = y)) +
  geom_boxplot(outlier.color = NULL, outlier.size = 4) +
  labs(title = "2")
# Scatter plot with fitted line next to the box plot. The overall title is
# passed as `top`, the argument current gridExtra releases expect (the older
# `main` argument is no longer accepted).
grid.arrange(p2 + geom_smooth(method = "lm"), b2, ncol = 2, top = "Second Group")
In addition to this I attempted to determine the normality of the data, as it is visually mound-shaped. I believe that this was ultimately inconclusive (i.e. could not reject the null hypothesis that the data was normal), but that this had more to do with the nature of the test conducted:
http://www.r-bloggers.com/normality-tests-don’t-do-what-you-think-they-do/
# Special note: a Shapiro-Wilk test is not a definitive check of normality
# http://en.wikipedia.org/wiki/Shapiro–Wilk_test
# Test the x column of the second data set.
shapiro.test(df2$x)
##
## Shapiro-Wilk normality test
##
## data: df2$x
## W = 0.9684, p-value = 0.8698
# Test the y column of the second data set.
shapiro.test(df2$y)
##
## Shapiro-Wilk normality test
##
## data: df2$y
## W = 0.8284, p-value = 0.02222
# A QQ plot is the recommended companion to a Shapiro-Wilk test: draw one per
# column, each with a reference line in palette colour 2.
qqnorm(df2$y)
qqline(df2$y, col = 2)
qqnorm(df2$x)
qqline(df2$x, col = 2)
Analysis of the third group:
# Summary statistics for both columns of the third data set.
summary(df3)
## x y
## Min. : 4.0 Min. : 5.39
## 1st Qu.: 6.5 1st Qu.: 6.25
## Median : 9.0 Median : 7.11
## Mean : 9.0 Mean : 7.50
## 3rd Qu.:11.5 3rd Qu.: 7.98
## Max. :14.0 Max. :12.74
# Linear regression of y on x. The formula is spelled out explicitly: when it
# is omitted, lm() derives one from the data frame and ends up regressing x on
# y, which does not match the y ~ x line drawn by geom_smooth(method = "lm").
lm(y ~ x, data = df3)
##
## Call:
## lm(formula = y ~ x, data = df3)
##
## Coefficients:
## (Intercept) x
## 3.0025 0.4997
# Box plot of the third data set, with enlarged outlier points.
b3 <- ggplot(df3, aes(x = x, y = y)) +
  geom_boxplot(outlier.color = NULL, outlier.size = 4) +
  labs(title = "3")
# Scatter plot with fitted line next to the box plot. The overall title is
# passed as `top`, the argument current gridExtra releases expect (the older
# `main` argument is no longer accepted).
grid.arrange(p3 + geom_smooth(method = "lm"), b3, ncol = 2, top = "Third Group")
Analysis of the fourth group:
# Summary statistics for both columns of the fourth data set.
summary(df4)
## x y
## Min. : 8 Min. : 5.250
## 1st Qu.: 8 1st Qu.: 6.170
## Median : 8 Median : 7.040
## Mean : 9 Mean : 7.501
## 3rd Qu.: 8 3rd Qu.: 8.190
## Max. :19 Max. :12.500
# Cross-tabulate x against y to show how the points split between the two
# distinct x values.
table(df4)
## y
## x 5.25 5.56 5.76 6.58 6.89 7.04 7.71 7.91 8.47 8.84 12.5
## 8 1 1 1 1 1 1 1 1 1 1 0
## 19 0 0 0 0 0 0 0 0 0 0 1
# Box plot of the fourth data set, with enlarged outlier points.
ggplot(data = df4, mapping = aes(x = x, y = y)) +
  geom_boxplot(outlier.color = NULL, outlier.size = 4) +
  labs(title = "4")
# Flipped scatter plot: y on the horizontal axis, x on the vertical axis.
ggplot(data = df4, mapping = aes(x = y, y = x)) +
  geom_point()
In addition to this I attempted to determine the uniformity of the data as it is visually indicative of that characteristic
# Histogram of the x values of the fourth data set, in bins of width 0.5.
ggplot(df4, aes(x=x)) + geom_histogram(binwidth=.5)
# Chi-squared goodness-of-fit test on the x column.
# NOTE(review): the raw x vector is passed directly, so the 11 values appear to
# be treated as observed counts for 11 equally likely categories (hence df = 10
# in the output below) rather than as a sample whose distribution is tested.
# Confirm this is the intended test; tabulating x first may be what was meant.
chisq.test(df4$x)
##
## Chi-squared test for given probabilities
##
## data: df4$x
## X-squared = 12.2222, df = 10, p-value = 0.2705
# One-sample Kolmogorov-Smirnov test of x against a uniform distribution.
# NOTE(review): no min/max is passed along with "punif", so the comparison is
# presumably against the default range; every x value lies above 1, which would
# explain D = 1 in the output below. Confirm, and consider passing the observed
# range. The warning is raised because x contains repeated values (ties).
ks.test(df4$x, "punif")
## Warning in ks.test(df4$x, "punif"): ties should not be present for the
## Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: df4$x
## D = 1, p-value = 5.579e-10
## alternative hypothesis: two-sided
This was not definitive for “best fit”, and I believe that one of the causes is the outlier at x=19.