Goal: compute summary statistics

#Load ggplot2
#library("ggplot2")
require("ggplot2")
## Loading required package: ggplot2
library("ggplot2")


# Folder holding the input file; the working directory is switched to it so
# the CSV below can be referenced by its bare file name.
dir <- "C:/MSDASem1/607"
setwd(dir)
# Read the datasets [(x1, y1), (x2, y2), (x3, y3), (x4, y4)] into the
# data.df data frame; the first row of the file supplies the column names.
data.df <- read.csv("DataExploratory.csv", header = TRUE)

# Pull each column of data.df out into its own vector, grouped as the
# (x, y) pair of each dataset.
x1 <- data.df$x1
y1 <- data.df$y1

x2 <- data.df$x2
y2 <- data.df$y2

x3 <- data.df$x3
y3 <- data.df$y3

x4 <- data.df$x4
y4 <- data.df$y4
# ---- Dataset 1: summary() of each variable and a linear fit ----
summary(x1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0     6.5     9.0     9.0    11.5    14.0
summary(y1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.260   6.315   7.580   7.501   8.570  10.840
# Regress y1 on x1 once, then print the coefficients of that fit.
fit1 <- lm(y1 ~ x1)
coef(fit1)
## (Intercept)          x1 
##   3.0000909   0.5000909
# Keep the full model summary for later inspection (not printed here).
m1 <- summary(fit1)
# m1
# ---- Dataset 2: summary() of each variable and a linear fit ----
summary(x2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0     6.5     9.0     9.0    11.5    14.0
summary(y2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.100   6.695   8.140   7.501   8.950   9.260
# Regress y2 on x2 once, then print the coefficients of that fit.
fit2 <- lm(y2 ~ x2)
coef(fit2)
## (Intercept)          x2 
##    3.000909    0.500000
# Keep the full model summary for later inspection (not printed here).
m2 <- summary(fit2)
# m2
# ---- Dataset 3: summary() of each variable and a linear fit ----
summary(x3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0     6.5     9.0     9.0    11.5    14.0
summary(y3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.39    6.25    7.11    7.50    7.98   12.74
# Regress y3 on x3 once, then print the coefficients of that fit.
fit3 <- lm(y3 ~ x3)
coef(fit3)
## (Intercept)          x3 
##   3.0024545   0.4997273
# Keep the full model summary for later inspection (not printed here).
m3 <- summary(fit3)
# m3
# ---- Dataset 4: summary() of each variable and a linear fit ----
summary(x4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       8       8       8       9       8      19
summary(y4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.25    5.67    7.04    7.41    8.19   12.50
# Regress y4 on x4 once, then print the coefficients of that fit.
fit4 <- lm(y4 ~ x4)
coef(fit4)
## (Intercept)          x4 
##       2.829       0.509
# Keep the full model summary for later inspection (not printed here).
m4 <- summary(fit4)
# m4

In a quantitative sense, all four of the data sets are “equivalent”.

Scatter Plots

# Scatter plot of data set 1: default points, the same points redrawn larger
# in red, and a fitted linear-model smoother.
p1 <- ggplot(data.df, aes(x = x1, y = y1)) +
  geom_point() +
  geom_point(data = data.df, aes(y = y1), colour = "red", size = 2) +
  geom_smooth(method = "lm") +
  ggtitle("Data Set1")


# Scatter plot of data set 2: default points, the same points redrawn larger
# in blue, and a fitted linear-model smoother.
p2 <- ggplot(data.df, aes(x = x2, y = y2)) +
  geom_point() +
  geom_point(data = data.df, aes(y = y2), colour = "blue", size = 2) +
  geom_smooth(method = "lm") +
  ggtitle("Data Set2")

# Scatter plot of data set 3: default points, the same points redrawn larger
# in green, and a fitted linear-model smoother.
p3 <- ggplot(data.df, aes(x = x3, y = y3)) +
  geom_point() +
  geom_point(data = data.df, aes(y = y3), colour = "green", size = 2) +
  geom_smooth(method = "lm") +
  ggtitle("Data Set3")

# Scatter plot of data set 4: default points, the same points redrawn larger
# in black, and a fitted linear-model smoother.
p4 <- ggplot(data.df, aes(x = x4, y = y4)) +
  geom_point() +
  geom_point(data = data.df, aes(y = y4), colour = "black", size = 2) +
  geom_smooth(method = "lm") +
  ggtitle("Data Set4")
# Requires the gridExtra package to be installed.
# library() is used instead of require() so that a missing package stops the
# script right here with a clear error, rather than returning FALSE and
# failing later at the grid.arrange() call.
library(gridExtra)
# Lay the four scatter plots out together on a single page.
grid.arrange(p1, p2, p3, p4)

# Line plot of the first three data sets on shared axes.
# Inside aes() the colour aesthetic is mapped to a data-set label, and
# scale_color_manual() assigns the actual colour to each label. Writing the
# colour name itself inside aes() (e.g. color = "red") would be treated as a
# category label and drawn with the default palette, not in that colour.
ggplot(data = data.df) +
  labs(x = "X", y = "Y") +
  geom_line(aes(x = x1, y = y1, color = "Data Set1")) +
  geom_line(aes(x = x2, y = y2, color = "Data Set2")) +
  geom_line(aes(x = x3, y = y3, color = "Data Set3")) +
  scale_color_manual(
    name = "Data set",
    values = c(
      "Data Set1" = "red",
      "Data Set2" = "blue",
      "Data Set3" = "yellow"
    )
  )

# Overlaid density curves of y1..y4, one translucent fill per data set.
# Inside aes() the fill aesthetic is mapped to a data-set label, and
# scale_fill_manual() assigns the actual colour to each label. Writing the
# colour name itself inside aes() (e.g. fill = "yellow") would be treated as
# a category label and drawn with the default palette, not in that colour.
ggplot(data.df) +
  geom_density(aes(x = y1, fill = "Data Set1"), alpha = 0.2) +
  geom_density(aes(x = y2, fill = "Data Set2"), alpha = 0.2) +
  geom_density(aes(x = y3, fill = "Data Set3"), alpha = 0.2) +
  geom_density(aes(x = y4, fill = "Data Set4"), alpha = 0.2) +
  scale_fill_manual(
    name = "Data set",
    values = c(
      "Data Set1" = "yellow",
      "Data Set2" = "red",
      "Data Set3" = "lightgreen",
      "Data Set4" = "purple"
    )
  )

After viewing the plots, we concluded that:

data set 1 is linear with some scatter.

data set 2 is quadratic.

data set 3 has an outlier.

data set 4 does not seem to be a suitable data set for a linear fit.