# One-time setup (uncomment to install the plotting dependencies):
# install.packages("ggplot2")
# install.packages("gridExtra")

# library() stops with an error if a package is missing; require() would only
# return FALSE with a warning and let the script fail later, at the first
# ggplot() or grid.arrange() call.
library(ggplot2)
library(gridExtra)

# Read Input.csv from the current working directory. The file must have a
# header row; the rest of the script reads columns X1..X4 and Y1..Y4 from it.
prj2.wd <- getwd()
prj2.inputcsv.loc <- file.path(prj2.wd, "Input.csv")
prj2.input.df <- read.table(file = prj2.inputcsv.loc, header = TRUE, sep = ",")
# Mean of each X column, kept as individual scalars and also collected into
# one vector (ordered X1..X4) that feeds the bar chart below.
prj2.input.df.X1.mean <- mean(prj2.input.df[["X1"]])
prj2.input.df.X2.mean <- mean(prj2.input.df[["X2"]])
prj2.input.df.X3.mean <- mean(prj2.input.df[["X3"]])
prj2.input.df.X4.mean <- mean(prj2.input.df[["X4"]])
prj2.input.df.mean <- c(
  prj2.input.df.X1.mean,
  prj2.input.df.X2.mean,
  prj2.input.df.X3.mean,
  prj2.input.df.X4.mean
)
prj2.input.df.dsNames <- c("X1", "X2", "X3", "X4")

# Format the four means into a single report string (X1..X4 order).
do.call(
  sprintf,
  c(
    list("Mean for X1 = %f, X2 = %f, X3 = %f, X4 = %f"),
    as.list(prj2.input.df.mean)
  )
)
## [1] "Mean for X1 = 9.000000, X2 = 9.000000, X3 = 9.000000, X4 = 9.000000"
# Observation: the mean of X is the same in all four datasets.

# Bar chart comparing the X means. The data frame's columns take the names of
# the two vectors, which is what the aes() mapping refers to.
prj2.input.df.meandf.X <- data.frame(
  prj2.input.df.dsNames,
  prj2.input.df.mean
)
ggplot(
  prj2.input.df.meandf.X,
  aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.mean,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", colour = "red") +
  labs(
    title = "Mean Comparison of X values",
    x = "DatasetNames",
    y = "MeanValue"
  )
# Mean of each Y column, kept as individual scalars and also collected into
# one vector (ordered Y1..Y4). Note that prj2.input.df.mean and
# prj2.input.df.dsNames are (re)assigned here.
prj2.input.df.Y1.mean <- mean(prj2.input.df[["Y1"]])
prj2.input.df.Y2.mean <- mean(prj2.input.df[["Y2"]])
prj2.input.df.Y3.mean <- mean(prj2.input.df[["Y3"]])
prj2.input.df.Y4.mean <- mean(prj2.input.df[["Y4"]])
prj2.input.df.mean <- c(
  prj2.input.df.Y1.mean,
  prj2.input.df.Y2.mean,
  prj2.input.df.Y3.mean,
  prj2.input.df.Y4.mean
)
prj2.input.df.dsNames <- c("Y1", "Y2", "Y3", "Y4")

# Format the four means into a single report string (Y1..Y4 order).
do.call(
  sprintf,
  c(
    list("Mean for Y1 = %f, Y2 = %f, Y3 = %f, Y4 = %f"),
    as.list(prj2.input.df.mean)
  )
)
## [1] "Mean for Y1 = 7.500909, Y2 = 7.500909, Y3 = 7.500000, Y4 = 7.500909"
# Observation: the mean of Y is almost the same in all four datasets.

# Bar chart comparing the Y means. The data frame's columns take the names of
# the two vectors, which is what the aes() mapping refers to.
prj2.input.df.meandf.Y <- data.frame(
  prj2.input.df.dsNames,
  prj2.input.df.mean
)
ggplot(
  prj2.input.df.meandf.Y,
  aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.mean,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", colour = "red") +
  labs(
    title = "Mean Comparison of Y Values",
    x = "DatasetNames",
    y = "MeanValue"
  )
# Sample standard deviation of each X column, kept as individual scalars and
# also collected into one vector (ordered X1..X4) for the bar chart below.
prj2.input.df.X1.sd <- sd(prj2.input.df[["X1"]])
prj2.input.df.X2.sd <- sd(prj2.input.df[["X2"]])
prj2.input.df.X3.sd <- sd(prj2.input.df[["X3"]])
prj2.input.df.X4.sd <- sd(prj2.input.df[["X4"]])
prj2.input.df.sd <- c(
  prj2.input.df.X1.sd,
  prj2.input.df.X2.sd,
  prj2.input.df.X3.sd,
  prj2.input.df.X4.sd
)
prj2.input.df.dsNames <- c("X1", "X2", "X3", "X4")

# Format the four standard deviations into a single report string.
do.call(
  sprintf,
  c(
    list("Standard deviation for X1 = %f, X2 = %f, X3 = %f, X4 = %f"),
    as.list(prj2.input.df.sd)
  )
)
## [1] "Standard deviation for X1 = 3.316625, X2 = 3.316625, X3 = 3.316625, X4 = 3.316625"
# Observation: the standard deviation of X is the same in all four datasets.

# Bar chart comparing the X standard deviations. The data frame's columns take
# the names of the two vectors, which is what the aes() mapping refers to.
prj2.input.df.sddf.X <- data.frame(
  prj2.input.df.dsNames,
  prj2.input.df.sd
)
ggplot(
  prj2.input.df.sddf.X,
  aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.sd,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", colour = "red") +
  labs(
    title = "Standard Deviation Comparison of X values",
    x = "DatasetNames",
    y = "SDValue"
  )
# Sample standard deviation of each Y column, kept as individual scalars and
# also collected into one vector (ordered Y1..Y4). Note that prj2.input.df.sd
# and prj2.input.df.dsNames are (re)assigned here.
prj2.input.df.Y1.sd <- sd(prj2.input.df[["Y1"]])
prj2.input.df.Y2.sd <- sd(prj2.input.df[["Y2"]])
prj2.input.df.Y3.sd <- sd(prj2.input.df[["Y3"]])
prj2.input.df.Y4.sd <- sd(prj2.input.df[["Y4"]])
prj2.input.df.sd <- c(
  prj2.input.df.Y1.sd,
  prj2.input.df.Y2.sd,
  prj2.input.df.Y3.sd,
  prj2.input.df.Y4.sd
)
prj2.input.df.dsNames <- c("Y1", "Y2", "Y3", "Y4")

# Format the four standard deviations into a single report string.
do.call(
  sprintf,
  c(
    list("Standard deviation for Y1 = %f, Y2 = %f, Y3 = %f, Y4 = %f"),
    as.list(prj2.input.df.sd)
  )
)
## [1] "Standard deviation for Y1 = 2.031568, Y2 = 2.031657, Y3 = 2.030424, Y4 = 2.030579"
# Observation: the standard deviation of Y is almost the same in all four
# datasets.

# Bar chart comparing the Y standard deviations. The data frame's columns take
# the names of the two vectors, which is what the aes() mapping refers to.
prj2.input.df.sddf.Y <- data.frame(
  prj2.input.df.dsNames,
  prj2.input.df.sd
)
ggplot(
  prj2.input.df.sddf.Y,
  aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.sd,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", colour = "red") +
  labs(
    title = "Standard Deviation Comparison of Y values",
    x = "DatasetNames",
    y = "SDValue"
  )
# Correlation between X and Y within each dataset (cor() defaults), kept as
# individual scalars and also collected into one vector (dataset 1..4 order).
prj2.input.df.cor1 <- with(prj2.input.df, cor(X1, Y1))
prj2.input.df.cor2 <- with(prj2.input.df, cor(X2, Y2))
prj2.input.df.cor3 <- with(prj2.input.df, cor(X3, Y3))
prj2.input.df.cor4 <- with(prj2.input.df, cor(X4, Y4))
prj2.input.df.cor <- c(
  prj2.input.df.cor1,
  prj2.input.df.cor2,
  prj2.input.df.cor3,
  prj2.input.df.cor4
)
prj2.input.df.dsNames <- c("DS1", "DS2", "DS3", "DS4")

# Format the four correlations into a single report string.
do.call(
  sprintf,
  c(
    list("Correlation for dataset1 = %f, dataset2 = %f, dataset3 = %f, dataset4 = %f"),
    as.list(prj2.input.df.cor)
  )
)
## [1] "Correlation for dataset1 = 0.816421, dataset2 = 0.816237, dataset3 = 0.816287, dataset4 = 0.816521"
# Observation: the X-Y correlation is almost the same in all four datasets.

# Bar chart comparing the correlations. The data frame's columns take the
# names of the two vectors, which is what the aes() mapping refers to.
prj2.input.df.cordf <- data.frame(
  prj2.input.df.dsNames,
  prj2.input.df.cor
)
ggplot(
  prj2.input.df.cordf,
  aes(
    x = prj2.input.df.dsNames,
    y = prj2.input.df.cor,
    fill = as.factor(prj2.input.df.dsNames)
  )
) +
  geom_bar(stat = "identity", colour = "red") +
  labs(
    title = "Correlation of (X,Y) Comparison",
    x = "DatasetNames",
    y = "Cor-Val"
  )
# Overlay the four (X, Y) series as line layers on one set of axes, with a
# fixed colour per dataset: 1 = blue, 2 = green, 3 = red, 4 = black.
ggplot(data = prj2.input.df) +
  geom_line(mapping = aes(x = X1, y = Y1), colour = "blue") +
  geom_line(mapping = aes(x = X2, y = Y2), colour = "green") +
  geom_line(mapping = aes(x = X3, y = Y3), colour = "red") +
  geom_line(mapping = aes(x = X4, y = Y4), colour = "black") +
  labs(x = "cx", y = "cy")
# Overlay one violin layer per dataset on shared axes, with a fixed fill per
# dataset: 1 = blue, 2 = green, 3 = red, 4 = black.
ggplot(data = prj2.input.df) +
  geom_violin(mapping = aes(x = X1, y = Y1), fill = "blue") +
  geom_violin(mapping = aes(x = X2, y = Y2), fill = "green") +
  geom_violin(mapping = aes(x = X3, y = Y3), fill = "red") +
  geom_violin(mapping = aes(x = X4, y = Y4), fill = "black") +
  labs(x = "cx", y = "cy")
# One scatter plot per dataset with a fitted lm smoother drawn underneath the
# points, then all four panels arranged in a two-column grid.
plot1 <- ggplot(prj2.input.df, aes(x = X1, y = Y1)) +
  geom_smooth(method = lm) +
  geom_point(colour = "blue")
plot2 <- ggplot(prj2.input.df, aes(x = X2, y = Y2)) +
  geom_smooth(method = lm) +
  geom_point(colour = "green")
plot3 <- ggplot(prj2.input.df, aes(x = X3, y = Y3)) +
  geom_smooth(method = lm) +
  geom_point(colour = "red")
plot4 <- ggplot(prj2.input.df, aes(x = X4, y = Y4)) +
  geom_smooth(method = lm) +
  geom_point(colour = "black")
grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)
# Simple linear regression of Y1 on X1 (dataset 1).
# The formula uses bare column names with `data =` instead of
# `prj2.input.df$Y1 ~ prj2.input.df$X1`. The fitted numbers are identical, but
# the slope is labelled `X1` and predict(prj2.X1.lm, newdata = ...) can use
# the columns of `newdata`; with `$` inside the formula it cannot.
prj2.X1.lm <- lm(Y1 ~ X1, data = prj2.input.df)
summary(prj2.X1.lm)
##
## Call:
## lm(formula = Y1 ~ X1, data = prj2.input.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92127 -0.45577 -0.04136 0.70941 1.83882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0001 1.1247 2.667 0.02573 *
## X1 0.5001 0.1179 4.241 0.00217 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295
## F-statistic: 17.99 on 1 and 9 DF, p-value: 0.00217

# Exploratory calls kept for reference (disabled):
#prj2.input.df$Y1
#fitted(prj2.X1.lm)
#residuals(prj2.X1.lm)
#plot(prj2.input.df$X1,prj2.input.df$Y1)
#abline(prj2.X1.lm)
#confint(prj2.X1.lm)

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics parameters so later base plots are not left in the 2x2 layout.
prj2.old.par <- par(mfrow = c(2, 2))
plot(prj2.X1.lm)
par(prj2.old.par)
# Simple linear regression of Y2 on X2 (dataset 2).
# The formula uses bare column names with `data =` instead of
# `prj2.input.df$Y2 ~ prj2.input.df$X2`. The fitted numbers are identical, but
# the slope is labelled `X2` and predict(prj2.X2.lm, newdata = ...) can use
# the columns of `newdata`; with `$` inside the formula it cannot.
prj2.X2.lm <- lm(Y2 ~ X2, data = prj2.input.df)
summary(prj2.X2.lm)
##
## Call:
## lm(formula = Y2 ~ X2, data = prj2.input.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9009 -0.7609 0.1291 0.9491 1.2691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.001 1.125 2.667 0.02576 *
## X2 0.500 0.118 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared: 0.6662, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002179

# Exploratory calls kept for reference (disabled):
#prj2.input.df$Y2
#fitted(prj2.X2.lm)
#residuals(prj2.X2.lm)
#plot(prj2.input.df$X2,prj2.input.df$Y2)
#abline(prj2.X2.lm)
#confint(prj2.X2.lm)

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics parameters so later base plots are not left in the 2x2 layout.
prj2.old.par <- par(mfrow = c(2, 2))
plot(prj2.X2.lm)
par(prj2.old.par)
# Simple linear regression of Y3 on X3 (dataset 3).
# The formula uses bare column names with `data =` instead of
# `prj2.input.df$Y3 ~ prj2.input.df$X3`. The fitted numbers are identical, but
# the slope is labelled `X3` and predict(prj2.X3.lm, newdata = ...) can use
# the columns of `newdata`; with `$` inside the formula it cannot.
prj2.X3.lm <- lm(Y3 ~ X3, data = prj2.input.df)
summary(prj2.X3.lm)
##
## Call:
## lm(formula = Y3 ~ X3, data = prj2.input.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1586 -0.6146 -0.2303 0.1540 3.2411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0025 1.1245 2.670 0.02562 *
## X3 0.4997 0.1179 4.239 0.00218 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6663, Adjusted R-squared: 0.6292
## F-statistic: 17.97 on 1 and 9 DF, p-value: 0.002176

# Exploratory calls kept for reference (disabled):
#prj2.input.df$Y3
#fitted(prj2.X3.lm)
#residuals(prj2.X3.lm)
#plot(prj2.input.df$X3,prj2.input.df$Y3)
#abline(prj2.X3.lm)
#confint(prj2.X3.lm)

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics parameters so later base plots are not left in the 2x2 layout.
prj2.old.par <- par(mfrow = c(2, 2))
plot(prj2.X3.lm)
par(prj2.old.par)
# Simple linear regression of Y4 on X4 (dataset 4).
# The formula uses bare column names with `data =` instead of
# `prj2.input.df$Y4 ~ prj2.input.df$X4`. The fitted numbers are identical, but
# the slope is labelled `X4` and predict(prj2.X4.lm, newdata = ...) can use
# the columns of `newdata`; with `$` inside the formula it cannot.
prj2.X4.lm <- lm(Y4 ~ X4, data = prj2.input.df)
summary(prj2.X4.lm)
##
## Call:
## lm(formula = Y4 ~ X4, data = prj2.input.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.751 -0.831 0.000 0.809 1.839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0017 1.1239 2.671 0.02559 *
## X4 0.4999 0.1178 4.243 0.00216 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared: 0.6667, Adjusted R-squared: 0.6297
## F-statistic: 18 on 1 and 9 DF, p-value: 0.002165

# Exploratory calls kept for reference (disabled):
#prj2.input.df$Y4
#fitted(prj2.X4.lm)
#residuals(prj2.X4.lm)
#plot(prj2.input.df$X4,prj2.input.df$Y4)
#abline(prj2.X4.lm)
#confint(prj2.X4.lm)

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics parameters so later base plots are not left in the 2x2 layout.
prj2.old.par <- par(mfrow = c(2, 2))
plot(prj2.X4.lm)
par(prj2.old.par)
## Warning: not plotting observations with leverage one:
## 8
## Warning: not plotting observations with leverage one:
## 8
# Observations:
# - Even though the four datasets contain visibly different data, their
#   summary statistics (mean, standard deviation, correlation) are very nearly
#   the same.
# - The outlier is the value 19 in X4.
# - In the summary of each linear regression the intercept is 3 (rounded) and
#   the slope is 0.5, so every dataset leads to the same predictive model:
#   Y(n) = 0.5 * X(n) + 3.