Perform exploratory data analysis that compares the data sets below. Use ggplot2 with R and draw conclusions from the analysis.
Load the required libraries
library(ggplot2)
library(plyr)
Load the csv file of the data sets into a dataframe
# Read the comma-separated file (with a header row) into a data frame;
# character columns are converted to factors. Then print its structure.
dataframe <- read.table(
  "IS_607_Proj_2.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
str(dataframe)
## 'data.frame': 44 obs. of 3 variables:
## $ dataset: Factor w/ 4 levels "set1","set2",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ x : num 10 10 10 8 8 8 8 8 13 13 ...
## $ y : num 8.04 9.14 7.46 6.58 6.95 8.14 6.77 5.76 7.58 8.74 ...
# Per-dataset summary statistics (min, quartiles, mean, max) for x and y.
aggregate(
  cbind(x, y) ~ dataset,
  data = dataframe,
  FUN = summary
)
## dataset x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max. y.Min.
## 1 set1 4.0 6.5 9.0 9.0 11.5 14.0 4.260
## 2 set2 4.0 6.5 9.0 9.0 11.5 14.0 3.100
## 3 set3 4.0 6.5 9.0 9.0 11.5 14.0 5.390
## 4 set4 8.0 8.0 8.0 9.0 8.0 19.0 5.250
## y.1st Qu. y.Median y.Mean y.3rd Qu. y.Max.
## 1 6.315 7.580 7.501 8.570 10.840
## 2 6.695 8.140 7.501 8.950 9.260
## 3 6.250 7.110 7.500 7.980 12.740
## 4 6.170 7.040 7.501 8.190 12.500
Review the distribution of y values across all four sets
# Overlapped density of y for the four sets, filled by data set.
# alpha is a fixed setting, so it is passed to the geom rather than mapped
# inside aes() (mapping a constant there adds a spurious "alpha" legend and
# routes the value through the alpha scale). The axis label is supplied via
# labs(x = ...); labs() has no `xlab` argument.
ggplot(dataframe, aes(x = y)) +
  geom_density(aes(fill = dataset), alpha = 0.5) +
  labs(title = "Distribution of Y values - Overlapped view", x = "y")
# Density of y, one panel per data set (facet rows) for easier comparison.
# The axis label is supplied via labs(x = ...); labs() has no `xlab` argument.
ggplot(dataframe, aes(x = y)) +
  geom_density(aes(fill = dataset)) +
  facet_grid(dataset ~ .) +
  labs(title = "Distribution of Y values", x = "y")
Observation: the y values of set1 and set2 appear to have similar distributions, and the y distributions of set3 and set4 also appear similar to each other.
# Density of x, one panel per data set (facet rows).
# The axis label is supplied via labs(x = ...); labs() has no `xlab` argument.
ggplot(dataframe, aes(x = x)) +
  geom_density(aes(fill = dataset)) +
  facet_grid(dataset ~ .) +
  labs(title = "Distribution of X values", x = "x")
Observation: the distributions of x values in set1, set2, and set3 appear similar. However, set4 is different here.
Mutate the data frame to add the fields Q1, Q3, IQR, and the upper and lower limits for the y data points of all 4 sets
# For each data set, attach the first and third quartiles of y, the
# inter-quartile range, and the 1.5 * IQR fences to every row.
# Later columns refer to earlier ones, so the order of the arguments matters.
df.quartile <- ddply(
  dataframe,
  .(dataset),
  mutate,
  Q1 = quantile(y, 0.25),
  Q3 = quantile(y, 0.75),
  IQR = Q3 - Q1,
  upper.limit = Q3 + 1.5 * IQR,
  lower.limit = Q1 - 1.5 * IQR
)
str(df.quartile)
## 'data.frame': 44 obs. of 8 variables:
## $ dataset : Factor w/ 4 levels "set1","set2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ x : num 10 8 13 9 11 14 6 4 12 7 ...
## $ y : num 8.04 6.95 7.58 8.81 8.33 ...
## $ Q1 : num 6.31 6.31 6.31 6.31 6.31 ...
## $ Q3 : num 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 ...
## $ IQR : num 2.26 2.26 2.26 2.26 2.26 ...
## $ upper.limit: num 12 12 12 12 12 ...
## $ lower.limit: num 2.93 2.93 2.93 2.93 2.93 ...
# Box plot of y per data set, with the rows falling outside the 1.5 * IQR
# fences over-plotted as large translucent points. Both layers share the
# same mapping, so it is declared once on the plot.
ggplot(
  df.quartile,
  aes(x = factor(dataset), y = y, color = factor(dataset))
) +
  geom_boxplot() +
  geom_point(
    data = df.quartile[
      with(df.quartile, y > upper.limit | y < lower.limit),
    ],
    size = 8,
    alpha = .4
  ) +
  labs(
    title = "Distribution of Y with 'outlier' points for each set",
    x = "Data Sets",
    y = "Y"
  )
Observation: set2, set3 and set4 have outliers.
# Scatter plot of y against x with connecting lines, one panel per data set.
# The horizontal axis shows the x variable (the data sets are the facets),
# so it is labelled "X" rather than "Data Sets".
ggplot(dataframe, aes(x = x, y = y, shape = dataset, color = dataset)) +
  geom_point(size = 2) +
  facet_grid(. ~ dataset) +
  geom_line() +
  labs(title = "Dataset scatter plot - facet grid", x = "X", y = "Y")
Draw a geom smooth - with lm
# Linear regression of y on x per data set, outliers included.
# level = 0.99 is set explicitly so the shaded band matches the 99% interval
# named in the title (geom_smooth defaults to 0.95). The horizontal axis
# shows the x variable, so it is labelled "X" rather than "Data Sets".
ggplot(dataframe, aes(x = x, y = y, color = dataset)) +
  facet_grid(. ~ dataset) +
  geom_line() +
  geom_smooth(method = lm, level = 0.99) +
  labs(
    title = "Datasets - Regression of Y on X - 99% confidence interval - with outliers",
    x = "X",
    y = "Y"
  )
Observation: All 4 sets appear to show somewhat similar trends, but the outliers are still present here.
# Keep only the rows whose y lies within the per-set 1.5 * IQR fences
# (both bounds inclusive), then print the structure of the result.
df_cleandata <- subset(
  df.quartile,
  lower.limit <= y & y <= upper.limit
)
str(df_cleandata)
## 'data.frame': 41 obs. of 8 variables:
## $ dataset : Factor w/ 4 levels "set1","set2",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ x : num 10 8 13 9 11 14 6 4 12 7 ...
## $ y : num 8.04 6.95 7.58 8.81 8.33 ...
## $ Q1 : num 6.31 6.31 6.31 6.31 6.31 ...
## $ Q3 : num 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 8.57 ...
## $ IQR : num 2.26 2.26 2.26 2.26 2.26 ...
## $ upper.limit: num 12 12 12 12 12 ...
## $ lower.limit: num 2.93 2.93 2.93 2.93 2.93 ...
# Draw the scatter plot facet grid with no outliers.
# The horizontal axis shows the x variable (the data sets are the facets),
# so it is labelled "X" rather than "Data Sets".
ggplot(df_cleandata, aes(x = x, y = y, shape = dataset, color = dataset)) +
  geom_point(size = 2) +
  facet_grid(. ~ dataset) +
  geom_line() +
  labs(
    title = "Dataset scatter plot with no outliers - facet grid",
    x = "X",
    y = "Y"
  ) +
  theme(legend.position = "top")
# Regression of y on x for each set, outliers removed, drawn without a
# confidence band (se = FALSE). One panel per data set; purple title.
ggplot(df_cleandata, aes(x = x, y = y, color = dataset)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, se = FALSE) +
  facet_grid(. ~ dataset) +
  labs(
    title = "Datasets - Regression of Y on X - without confidence interval"
  ) +
  theme(plot.title = element_text(colour = "purple"))
# Regression of y on x for each set, outliers removed, with a 99% confidence
# band around the fitted line. One panel per data set; purple title and the
# legend placed below the plot.
ggplot(df_cleandata, aes(x = x, y = y, color = dataset)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, level = 0.99) +
  facet_grid(. ~ dataset) +
  labs(
    title = "Datasets - Regression of Y on X - 99% confidence interval"
  ) +
  theme(
    plot.title = element_text(colour = "purple"),
    legend.position = "bottom"
  )
Observation: When the outliers are removed, set1, set2 and set3 have similar trend lines (the points of set1 and set2 are scattered around the line, whereas set3 follows a strictly linear trend). set4 has a constant x with increasing y values, i.e. a vertical line (undefined slope).