This project analyses four datasets by computing and comparing their summary statistics (mean, standard deviation, correlation and linear regression).

1. Install and load necessary packages

# One-time setup: uncomment to install the plotting packages.
#install.packages("ggplot2")
#install.packages("gridExtra")

# Attach the packages with library() rather than require(): library() stops
# with an error when a package is missing, whereas require() only warns and
# returns FALSE, letting the script fail later at the first ggplot() call.
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.1.3
## Loading required package: grid

2. Load input file

# Read Input.csv from the current working directory into a data frame.
# Later sections read the columns X1..X4 and Y1..Y4 from it.
prj2.wd <- getwd()
# file.path() joins the components with "/", same as the manual paste().
prj2.inputcsv.loc <- file.path(prj2.wd, "Input.csv")
# Spell out TRUE: the shorthand `T` is an ordinary variable that can be
# reassigned, which would silently change how the header row is treated.
prj2.input.df <- read.table(file = prj2.inputcsv.loc, header = TRUE, sep = ",")

3. Mean of X for all datasets

# Mean of the X column of each dataset, kept as separate scalars.
prj2.input.df.X1.mean <- mean(prj2.input.df[["X1"]])
prj2.input.df.X2.mean <- mean(prj2.input.df[["X2"]])
prj2.input.df.X3.mean <- mean(prj2.input.df[["X3"]])
prj2.input.df.X4.mean <- mean(prj2.input.df[["X4"]])

# Parallel vectors: dataset labels and the matching means, in the same order.
prj2.input.df.dsNames <- c("X1", "X2", "X3", "X4")
prj2.input.df.mean <- c(
  prj2.input.df.X1.mean,
  prj2.input.df.X2.mean,
  prj2.input.df.X3.mean,
  prj2.input.df.X4.mean
)


# Format the four means into one message; do.call() spreads the vector over
# the four %f placeholders.
do.call(
  sprintf,
  c(
    list("Mean for X1 = %f, X2 = %f, X3 = %f, X4 = %f"),
    as.list(prj2.input.df.mean)
  )
)
## [1] "Mean for X1 = 9.000000, X2 = 9.000000, X3 = 9.000000, X4 = 9.000000"
## [1] "Mean value for all Xs for all datasets are the same"
# Two-column data frame (label, mean); its columns take the names of the
# vectors passed in, which is what aes() refers to below.
prj2.input.df.meandf.X <- data.frame(prj2.input.df.dsNames, prj2.input.df.mean)

# Bar chart with one bar per dataset: bar height is the mean, fill is keyed
# by the dataset label.
ggplot(
  data = prj2.input.df.meandf.X,
  mapping = aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.mean,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", color = "red") +
  labs(
    title = "Mean Comparison of X values",
    x = "DatasetNames",
    y = "MeanValue"
  )

4. Mean of Y for all datasets

# Mean of the Y column of each dataset, kept as separate scalars.
prj2.input.df.Y1.mean <- mean(prj2.input.df[["Y1"]])
prj2.input.df.Y2.mean <- mean(prj2.input.df[["Y2"]])
prj2.input.df.Y3.mean <- mean(prj2.input.df[["Y3"]])
prj2.input.df.Y4.mean <- mean(prj2.input.df[["Y4"]])

# Parallel vectors of labels and means. Both names are reused by the other
# summary sections, so each section overwrites the previous contents.
prj2.input.df.dsNames <- c("Y1", "Y2", "Y3", "Y4")
prj2.input.df.mean <- c(
  prj2.input.df.Y1.mean,
  prj2.input.df.Y2.mean,
  prj2.input.df.Y3.mean,
  prj2.input.df.Y4.mean
)

# Format the four means into one message; do.call() spreads the vector over
# the four %f placeholders.
do.call(
  sprintf,
  c(
    list("Mean for Y1 = %f, Y2 = %f, Y3 = %f, Y4 = %f"),
    as.list(prj2.input.df.mean)
  )
)
## [1] "Mean for Y1 = 7.500909, Y2 = 7.500909, Y3 = 7.500000, Y4 = 7.500909"
## [1] "Mean value for all Ys for all datasets are almost the same"
# Two-column data frame (label, mean) used as the plotting data.
prj2.input.df.meandf.Y <- data.frame(prj2.input.df.dsNames, prj2.input.df.mean)

# Bar chart with one bar per dataset: bar height is the mean, fill is keyed
# by the dataset label.
ggplot(
  data = prj2.input.df.meandf.Y,
  mapping = aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.mean,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", color = "red") +
  labs(
    title = "Mean Comparison of Y Values",
    x = "DatasetNames",
    y = "MeanValue"
  )

5. Standard Deviation of X for all datasets

# Sample standard deviation of the X column of each dataset.
prj2.input.df.X1.sd <- sd(prj2.input.df[["X1"]])
prj2.input.df.X2.sd <- sd(prj2.input.df[["X2"]])
prj2.input.df.X3.sd <- sd(prj2.input.df[["X3"]])
prj2.input.df.X4.sd <- sd(prj2.input.df[["X4"]])

# Parallel vectors: dataset labels and the matching standard deviations.
prj2.input.df.dsNames <- c("X1", "X2", "X3", "X4")
prj2.input.df.sd <- c(
  prj2.input.df.X1.sd,
  prj2.input.df.X2.sd,
  prj2.input.df.X3.sd,
  prj2.input.df.X4.sd
)


# Format the four values into one message; do.call() spreads the vector over
# the four %f placeholders.
do.call(
  sprintf,
  c(
    list("Standard deviation for X1 = %f, X2 = %f, X3 = %f, X4 = %f"),
    as.list(prj2.input.df.sd)
  )
)
## [1] "Standard deviation for X1 = 3.316625, X2 = 3.316625, X3 = 3.316625, X4 = 3.316625"
## [1] "Standard deviation value for all Xs for all datasets are almost the same"
# Two-column data frame (label, sd) used as the plotting data.
prj2.input.df.sddf.X <- data.frame(prj2.input.df.dsNames, prj2.input.df.sd)

# Bar chart with one bar per dataset: bar height is the standard deviation.
ggplot(
  data = prj2.input.df.sddf.X,
  mapping = aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.sd,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", color = "red") +
  labs(
    title = "Standard Deviation Comparison of X values",
    x = "DatasetNames",
    y = "SDValue"
  )

6. Standard Deviation of Y for all datasets

# Sample standard deviation of the Y column of each dataset.
prj2.input.df.Y1.sd <- sd(prj2.input.df[["Y1"]])
prj2.input.df.Y2.sd <- sd(prj2.input.df[["Y2"]])
prj2.input.df.Y3.sd <- sd(prj2.input.df[["Y3"]])
prj2.input.df.Y4.sd <- sd(prj2.input.df[["Y4"]])

# Parallel vectors of labels and standard deviations. Both names are reused
# by the other summary sections, so each section overwrites the previous
# contents.
prj2.input.df.dsNames <- c("Y1", "Y2", "Y3", "Y4")
prj2.input.df.sd <- c(
  prj2.input.df.Y1.sd,
  prj2.input.df.Y2.sd,
  prj2.input.df.Y3.sd,
  prj2.input.df.Y4.sd
)

# Format the four values into one message; do.call() spreads the vector over
# the four %f placeholders.
do.call(
  sprintf,
  c(
    list("Standard deviation for Y1 = %f, Y2 = %f, Y3 = %f, Y4 = %f"),
    as.list(prj2.input.df.sd)
  )
)
## [1] "Standard deviation for Y1 = 2.031568, Y2 = 2.031657, Y3 = 2.030424, Y4 = 2.030579"
## [1] "Standard deviation value for all Ys for all datasets are almost the same"
# Two-column data frame (label, sd) used as the plotting data.
prj2.input.df.sddf.Y <- data.frame(prj2.input.df.dsNames, prj2.input.df.sd)

# Bar chart with one bar per dataset: bar height is the standard deviation.
ggplot(
  data = prj2.input.df.sddf.Y,
  mapping = aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.sd,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", color = "red") +
  labs(
    title = "Standard Deviation Comparison of Y values",
    x = "DatasetNames",
    y = "SDValue"
  )

7. Correlation between X and Y for each dataset

# Correlation between the X and Y columns of each dataset, computed with
# cor() and its default method.
prj2.input.df.cor1 <- cor(prj2.input.df[["X1"]], prj2.input.df[["Y1"]])
prj2.input.df.cor2 <- cor(prj2.input.df[["X2"]], prj2.input.df[["Y2"]])
prj2.input.df.cor3 <- cor(prj2.input.df[["X3"]], prj2.input.df[["Y3"]])
prj2.input.df.cor4 <- cor(prj2.input.df[["X4"]], prj2.input.df[["Y4"]])

# Parallel vectors: dataset labels and the matching correlations.
prj2.input.df.dsNames <- c("DS1", "DS2", "DS3", "DS4")
prj2.input.df.cor <- c(
  prj2.input.df.cor1,
  prj2.input.df.cor2,
  prj2.input.df.cor3,
  prj2.input.df.cor4
)

# Format the four values into one message; do.call() spreads the vector over
# the four %f placeholders.
do.call(
  sprintf,
  c(
    list("Correlation for dataset1 = %f, dataset2 = %f, dataset3 = %f, dataset4 = %f"),
    as.list(prj2.input.df.cor)
  )
)
## [1] "Correlation for dataset1 = 0.816421, dataset2 = 0.816237, dataset3 = 0.816287, dataset4 = 0.816521"
## [1] "Correlation for all datasets for all datasets are almost the same"
# Two-column data frame (label, correlation) used as the plotting data.
prj2.input.df.cordf <- data.frame(prj2.input.df.dsNames, prj2.input.df.cor)

# Bar chart with one bar per dataset: bar height is the correlation.
ggplot(
  data = prj2.input.df.cordf,
  mapping = aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.cor,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", color = "red") +
  labs(
    title = "Correlation of (X,Y) Comparison",
    x = "DatasetNames",
    y = "Cor-Val"
  )

8.1. All data in one plot using line graph

# Overlay the four (X, Y) series as line layers in a single panel, one colour
# per dataset. Each layer carries its own x/y mapping because the datasets
# live in separate column pairs; the shared axes are labelled "cx" and "cy".
ggplot(data = prj2.input.df) +
  geom_line(mapping = aes(x = X1, y = Y1), colour = "blue") +
  geom_line(mapping = aes(x = X2, y = Y2), colour = "green") +
  geom_line(mapping = aes(x = X3, y = Y3), colour = "red") +
  geom_line(mapping = aes(x = X4, y = Y4), colour = "black") +
  labs(x = "cx", y = "cy")

8.2. All data in one plot using violin graph

# Overlay four violin layers in a single panel, one fill colour per dataset.
# Each layer carries its own x/y mapping because the datasets live in
# separate column pairs; the shared axes are labelled "cx" and "cy".
ggplot(data = prj2.input.df) +
  geom_violin(mapping = aes(x = X1, y = Y1), fill = "blue") +
  geom_violin(mapping = aes(x = X2, y = Y2), fill = "green") +
  geom_violin(mapping = aes(x = X3, y = Y3), fill = "red") +
  geom_violin(mapping = aes(x = X4, y = Y4), fill = "black") +
  labs(x = "cx", y = "cy")

9. (X,Y) representation of each dataset shown side by side

# One scatter plot per dataset. geom_smooth(method = lm) adds a fitted
# straight line first, so the points are drawn on top of it.
plot1 <- ggplot(data = prj2.input.df, mapping = aes(x = X1, y = Y1)) +
  geom_smooth(method = lm) +
  geom_point(color = "blue")
plot2 <- ggplot(data = prj2.input.df, mapping = aes(x = X2, y = Y2)) +
  geom_smooth(method = lm) +
  geom_point(color = "green")
plot3 <- ggplot(data = prj2.input.df, mapping = aes(x = X3, y = Y3)) +
  geom_smooth(method = lm) +
  geom_point(color = "red")
plot4 <- ggplot(data = prj2.input.df, mapping = aes(x = X4, y = Y4)) +
  geom_smooth(method = lm) +
  geom_point(color = "black")

# Arrange the four plots in a grid with two columns.
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)

10.1. Linear regression for Dataset1 (X1,Y1) using base R

# Fit Y1 as a linear function of X1 with lm() and print the model summary.
# The formula addresses the columns through `prj2.input.df$...`, which is why
# the slope row in the recorded output is labelled `prj2.input.df$X1`.
prj2.X1.lm <- lm(prj2.input.df$Y1 ~ prj2.input.df$X1)
summary(prj2.X1.lm)
## 
## Call:
## lm(formula = prj2.input.df$Y1 ~ prj2.input.df$X1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        3.0001     1.1247   2.667  0.02573 * 
## prj2.input.df$X1   0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217
# Optional inspection helpers, left disabled.
#prj2.input.df$Y1
#fitted(prj2.X1.lm)
#residuals(prj2.X1.lm)
#plot(prj2.input.df$X1,prj2.input.df$Y1)
#abline(prj2.X1.lm)
#confint(prj2.X1.lm)
# Draw the model's diagnostic plots on a 2x2 grid. par() returns the previous
# settings, so keep them and restore afterwards; otherwise every later base
# plot in the session would stay squeezed into the 2x2 layout.
prj2.X1.par.old <- par(mfrow = c(2, 2))
plot(prj2.X1.lm)
par(prj2.X1.par.old)

10.2. Linear regression for Dataset2 (X2,Y2) using base R

# Fit Y2 as a linear function of X2 with lm() and print the model summary.
# The formula addresses the columns through `prj2.input.df$...`, which is why
# the slope row in the recorded output is labelled `prj2.input.df$X2`.
prj2.X2.lm <- lm(prj2.input.df$Y2 ~ prj2.input.df$X2)
summary(prj2.X2.lm)
## 
## Call:
## lm(formula = prj2.input.df$Y2 ~ prj2.input.df$X2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9009 -0.7609  0.1291  0.9491  1.2691 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         3.001      1.125   2.667  0.02576 * 
## prj2.input.df$X2    0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179
# Optional inspection helpers, left disabled.
#prj2.input.df$Y2
#fitted(prj2.X2.lm)
#residuals(prj2.X2.lm)
#plot(prj2.input.df$X2,prj2.input.df$Y2)
#abline(prj2.X2.lm)
#confint(prj2.X2.lm)
# Draw the model's diagnostic plots on a 2x2 grid. par() returns the previous
# settings, so keep them and restore afterwards; otherwise every later base
# plot in the session would stay squeezed into the 2x2 layout.
prj2.X2.par.old <- par(mfrow = c(2, 2))
plot(prj2.X2.lm)
par(prj2.X2.par.old)

10.3. Linear regression for Dataset3 (X3,Y3) using base R

# Fit Y3 as a linear function of X3 with lm() and print the model summary.
# The formula addresses the columns through `prj2.input.df$...`, which is why
# the slope row in the recorded output is labelled `prj2.input.df$X3`.
prj2.X3.lm <- lm(prj2.input.df$Y3 ~ prj2.input.df$X3)
summary(prj2.X3.lm)
## 
## Call:
## lm(formula = prj2.input.df$Y3 ~ prj2.input.df$X3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1586 -0.6146 -0.2303  0.1540  3.2411 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        3.0025     1.1245   2.670  0.02562 * 
## prj2.input.df$X3   0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176
# Optional inspection helpers, left disabled.
#prj2.input.df$Y3
#fitted(prj2.X3.lm)
#residuals(prj2.X3.lm)
#plot(prj2.input.df$X3,prj2.input.df$Y3)
#abline(prj2.X3.lm)
#confint(prj2.X3.lm)
# Draw the model's diagnostic plots on a 2x2 grid. par() returns the previous
# settings, so keep them and restore afterwards; otherwise every later base
# plot in the session would stay squeezed into the 2x2 layout.
prj2.X3.par.old <- par(mfrow = c(2, 2))
plot(prj2.X3.lm)
par(prj2.X3.par.old)

10.4. Linear regression for Dataset4 (X4,Y4) using base R

# Fit Y4 as a linear function of X4 with lm() and print the model summary.
# The formula addresses the columns through `prj2.input.df$...`, which is why
# the slope row in the recorded output is labelled `prj2.input.df$X4`.
prj2.X4.lm <- lm(prj2.input.df$Y4 ~ prj2.input.df$X4)
summary(prj2.X4.lm)
## 
## Call:
## lm(formula = prj2.input.df$Y4 ~ prj2.input.df$X4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.751 -0.831  0.000  0.809  1.839 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        3.0017     1.1239   2.671  0.02559 * 
## prj2.input.df$X4   0.4999     0.1178   4.243  0.00216 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6667, Adjusted R-squared:  0.6297 
## F-statistic:    18 on 1 and 9 DF,  p-value: 0.002165
# Optional inspection helpers, left disabled.
#prj2.input.df$Y4
#fitted(prj2.X4.lm)
#residuals(prj2.X4.lm)
#plot(prj2.input.df$X4,prj2.input.df$Y4)
#abline(prj2.X4.lm)
#confint(prj2.X4.lm)
# Draw the model's diagnostic plots on a 2x2 grid. par() returns the previous
# settings, so keep them and restore afterwards; otherwise every later base
# plot in the session would stay squeezed into the 2x2 layout.
prj2.X4.par.old <- par(mfrow = c(2, 2))
plot(prj2.X4.lm)
## Warning: not plotting observations with leverage one:
##   8
## Warning: not plotting observations with leverage one:
##   8
par(prj2.X4.par.old)

Conclusion

-> Even though the four datasets look different when plotted, their summary statistics (mean, standard deviation and X-Y correlation) are almost identical. -> Dataset 4 contains an outlier in X: the value X4 = 19. -> In the summary of each linear regression, the intercept is approximately 3 and the slope is approximately 0.5, so all four datasets lead to the same predictive model, Y(n) = 0.5 X(n) + 3.