Goal: compute summary statistics
#Load ggplot2
#library("ggplot2")
require("ggplot2")
## Loading required package: ggplot2
library("ggplot2")
require("RCurl")
## Loading required package: RCurl
## Loading required package: bitops
# Download the CSV as a single character string over HTTPS.
# Certificate verification is left at RCurl's default (enabled): turning it
# off with ssl.verifypeer = FALSE would let a man-in-the-middle silently
# substitute the data. If verification fails on a given machine, fix the CA
# bundle (e.g. the `cainfo` curl option) rather than disabling the check.
url <- "https://raw.githubusercontent.com/Nguyver/IS607/master/DataExploratory.csv"
myData <- getURL(url)
# Parse the downloaded text. read.csv(text = ...) creates its own text
# connection and closes it on exit, so no connection is left open (passing an
# explicit textConnection() leaves it open until garbage collection).
data.df <- read.csv(text = myData)
#dir <- "C:/MSDASem1/607"
#setwd(dir)
#data.df <- read.csv(file="DataExploratory.csv", header=TRUE)
# Copy the four x/y column pairs out of the data frame into stand-alone
# vectors so the summaries and model formulas below can refer to them by name.
# Predictors:
x1 <- data.df[["x1"]]
x2 <- data.df[["x2"]]
x3 <- data.df[["x3"]]
x4 <- data.df[["x4"]]
# Responses:
y1 <- data.df[["y1"]]
y2 <- data.df[["y2"]]
y3 <- data.df[["y3"]]
y4 <- data.df[["y4"]]
## summary statistics Dataset1
# Min / quartiles / median / mean / max of the predictor and the response.
summary(x1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 6.5 9.0 9.0 11.5 14.0
summary(y1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.260 6.315 7.580 7.501 8.570 10.840
# Fit the simple linear regression y1 ~ x1 once and reuse the fitted object
# for both the coefficients and the model summary (no second, identical fit).
fit1 <- lm(y1 ~ x1)
coef(fit1)
## (Intercept) x1
## 3.0000909 0.5000909
# Full model summary (standard errors, R-squared, ...) kept in m1;
# uncomment the next line to print it.
m1 <- summary(fit1)
#m1
## summary statistics Dataset2
# Min / quartiles / median / mean / max of the predictor and the response.
summary(x2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 6.5 9.0 9.0 11.5 14.0
summary(y2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.100 6.695 8.140 7.501 8.950 9.260
# Fit the simple linear regression y2 ~ x2 once and reuse the fitted object
# for both the coefficients and the model summary (no second, identical fit).
fit2 <- lm(y2 ~ x2)
coef(fit2)
## (Intercept) x2
## 3.000909 0.500000
# Full model summary (standard errors, R-squared, ...) kept in m2;
# uncomment the next line to print it.
m2 <- summary(fit2)
#m2
## summary statistics Dataset3
# Min / quartiles / median / mean / max of the predictor and the response.
summary(x3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 6.5 9.0 9.0 11.5 14.0
summary(y3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.39 6.25 7.11 7.50 7.98 12.74
# Fit the simple linear regression y3 ~ x3 once and reuse the fitted object
# for both the coefficients and the model summary (no second, identical fit).
fit3 <- lm(y3 ~ x3)
coef(fit3)
## (Intercept) x3
## 3.0024545 0.4997273
# Full model summary (standard errors, R-squared, ...) kept in m3;
# uncomment the next line to print it.
m3 <- summary(fit3)
#m3
## summary statistics Dataset4
# Min / quartiles / median / mean / max of the predictor and the response.
summary(x4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8 8 8 9 8 19
summary(y4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.25 5.67 7.04 7.41 8.19 12.50
# Fit the simple linear regression y4 ~ x4 once and reuse the fitted object
# for both the coefficients and the model summary (no second, identical fit).
fit4 <- lm(y4 ~ x4)
coef(fit4)
## (Intercept) x4
## 2.829 0.509
# Full model summary (standard errors, R-squared, ...) kept in m4;
# uncomment the next line to print it.
m4 <- summary(fit4)
#m4
In a quantitative sense, all four data sets are “equivalent”: the means of x and y and the fitted regression coefficients are nearly the same.
Scatter Plots
# Build one scatter plot with a fitted least-squares line.
#
# Arguments:
#   df           data frame holding the columns to plot
#   x_col, y_col names (character) of the x and y columns in `df`
#   point_colour colour used for the points
#   title        plot title
# Returns a ggplot object.
#
# The points are drawn in a single layer. Previously each plot had a default
# geom_point() layer that was completely hidden underneath a second, larger
# coloured layer, and the same recipe was copied four times.
make_scatter <- function(df, x_col, y_col, point_colour, title) {
  plot_df <- data.frame(x = df[[x_col]], y = df[[y_col]])
  ggplot(plot_df, aes(x = x, y = y)) +
    geom_point(colour = point_colour, size = 2) +
    geom_smooth(method = "lm") +
    # Label the axes with the original column names (e.g. "x1", "y1").
    labs(x = x_col, y = y_col) +
    ggtitle(title)
}

p1 <- make_scatter(data.df, "x1", "y1", "red", "Data Set1")
p2 <- make_scatter(data.df, "x2", "y2", "blue", "Data Set2")
p3 <- make_scatter(data.df, "x3", "y3", "green", "Data Set3")
p4 <- make_scatter(data.df, "x4", "y4", "black", "Data Set4")
# Requires the gridExtra package to be installed.
# library() stops immediately with a clear error if the package is missing;
# require() would only return FALSE and let grid.arrange() fail afterwards
# with a "could not find function" error.
library(gridExtra)
## Loading required package: grid
# Show the four scatter plots together on one page for comparison.
grid.arrange(p1, p2, p3, p4)

# Overlay data sets 1-3 as lines on shared axes.
# Inside aes(), a constant string is treated as a category label, not as a
# colour: aes(color = "red") produced a legend entry called "red" drawn in a
# default palette colour. Map a descriptive series label instead and assign
# the intended colours explicitly with scale_colour_manual().
ggplot(data = data.df) +
  labs(x = "X", y = "Y", colour = "Data set") +
  geom_line(aes(x = x1, y = y1, colour = "Data Set1")) +
  geom_line(aes(x = x2, y = y2, colour = "Data Set2")) +
  geom_line(aes(x = x3, y = y3, colour = "Data Set3")) +
  scale_colour_manual(
    values = c(
      "Data Set1" = "red",
      "Data Set2" = "blue",
      "Data Set3" = "yellow"
    )
  )

# Overlay the density of each response variable (y1..y4), semi-transparent so
# that overlapping regions stay visible.
# As with colour, a constant string inside aes(fill = ...) is only a category
# label; the actual fill colours are set with scale_fill_manual(). The x axis
# is labelled explicitly because it would otherwise default to "y1" even
# though all four responses are shown.
ggplot(data.df) +
  geom_density(aes(x = y1, fill = "Data Set1"), alpha = 0.2) +
  geom_density(aes(x = y2, fill = "Data Set2"), alpha = 0.2) +
  geom_density(aes(x = y3, fill = "Data Set3"), alpha = 0.2) +
  geom_density(aes(x = y4, fill = "Data Set4"), alpha = 0.2) +
  scale_fill_manual(
    values = c(
      "Data Set1" = "yellow",
      "Data Set2" = "red",
      "Data Set3" = "lightgreen",
      "Data Set4" = "purple"
    )
  ) +
  labs(x = "Y", fill = "Data set")

After viewing the plots, we came to the following conclusions:
Data set 1 is linear with some scatter.
Data set 2 is quadratic.
Data set 3 is linear and has an outlier.
Data set 4 does not show a meaningful slope: nearly all X values are identical, so Y cannot be assessed as a function of X. It might be the result of a poor experimental design.