The Individual Household Electric Power Consumption data set, available from the UCI Machine Learning Repository, is an interesting dataset whose very large number of instances makes analysis both more precise and more challenging. It contains 2,075,259 instances and 9 attributes, recording the power consumption of a single household over a period of about 4 years at a one-minute sampling rate.
Our objective is to analyse the household's power consumption over this period, i.e. from December 2006 to November 2010. We can identify peak usage as well as periods when no electricity was consumed at all, plot various graphs to explore the data, and build predictive models on it. Both regression and clustering are applicable to this dataset.
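As a minimal exploratory sketch (assuming the raw UCI file household_power_consumption.txt, which is semicolon-separated and marks missing values with "?"), the code below locates the minute of peak usage and counts the minutes in which none of the sub-meters recorded any consumption; the file name and the decision to drop incomplete rows are assumptions made here for illustration.
# Read the raw UCI file: semicolon-separated, "?" marks missing values
power <- read.csv("household_power_consumption.txt", sep = ";",
                  na.strings = "?", stringsAsFactors = FALSE)
power <- na.omit(power)   # drop minutes with missing readings (simplifying assumption)
# Minute with the highest global active power (peak usage)
power[which.max(power$Global_active_power), c("Date", "Time", "Global_active_power")]
# Minutes in which all three sub-meters recorded zero consumption
sum(power$Sub_metering_1 == 0 & power$Sub_metering_2 == 0 & power$Sub_metering_3 == 0)
The nine attributes of the dataset are: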
1.date: date in the format dd/mm/yyyy
2.time: time in the format hh:mm:ss
3.global_active_power: household global minute-averaged active power (in kilowatt). Global active power is the real power consumption, i.e. the power actually drawn by the household's electrical appliances, including those not covered by the sub-meters; it is also known as wattful power.
4.global_reactive_power: household global minute-averaged reactive power (in kilowatt). Global reactive power is power that flows back and forth between source and load without being consumed; it is the imaginary component of the power and is also known as wattless power.
5.voltage: minute-averaged voltage (in volt)
6.global_intensity: household global minute-averaged current intensity (in ampere). Intensity is the magnitude (strength) of the current drawn by the household.
7.sub_metering_1: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
8.sub_metering_2: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
9.sub_metering_3: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.
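Note that the three sub-meters do not cover the whole household: according to the dataset description, the active energy (in watt-hour) consumed each minute by appliances not captured by any sub-meter equals global_active_power*1000/60 minus the sum of the three sub-metering values. Continuing from the power data frame read above, a quick check of this quantity might look like:
# Energy (Wh per minute) used by appliances not covered by any sub-meter;
# active power is in kW averaged over one minute, so multiply by 1000/60 to get Wh
unmetered <- power$Global_active_power * 1000 / 60 -
  (power$Sub_metering_1 + power$Sub_metering_2 + power$Sub_metering_3)
summary(unmetered)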
Two modelling approaches can be applied to this dataset:
1. Regression: We can use the fitted models to predict what the power consumption will be in the next hour. Since electricity consumption is a numeric quantity, a regression model is a natural choice.
2. Clustering: We can also group the data into smaller segments, e.g. three months of electricity consumption can be clustered together, which helps in building an effective predictive model (see the sketch after this list).
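Clustering is not pursued in the modelling that follows, so purely as an illustration, here is a minimal k-means sketch on daily totals computed from the power data frame read above; the use of daily aggregation and of three clusters are assumptions, not choices taken from this report.
# Aggregate the minute-level readings to daily totals and cluster the days
power$Date <- as.Date(power$Date, format = "%d/%m/%Y")
daily <- aggregate(Global_active_power ~ Date, data = power, FUN = sum)
set.seed(1)
km <- kmeans(scale(daily$Global_active_power), centers = 3)   # 3 clusters is an assumption
table(km$cluster)   # number of days in each consumption group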
# Read the working sample of the dataset used in this report
# (columns renamed: GAP, GRP, V, GI, SB1, SB2, SB3)
elec = read.csv("C:\\Users\\Damodaran\\Desktop\\Project\\el7.csv", header = T)
# Coerce each attribute to numeric for modelling
e1 <- as.numeric(elec$GAP)   # global active power
e2 <- as.numeric(elec$GRP)   # global reactive power
e3 <- as.numeric(elec$V)     # voltage
e4 <- as.numeric(elec$GI)    # global intensity
e5 <- as.numeric(elec$SB1)   # sub-metering 1 (kitchen)
e6 <- as.numeric(elec$SB2)   # sub-metering 2 (laundry room)
e7 <- as.numeric(elec$SB3)   # sub-metering 3 (water-heater and air-conditioner)
elec1 <- lm(e1 ~ e2, data = elec)        # active power ~ reactive power
elec2 <- lm(e1 ~ e3, data = elec)        # active power ~ voltage
elec3 <- lm(e1 ~ e2 + e4, data = elec)   # active power ~ reactive power + intensity
predict(elec3)   # fitted values for the observed data (predict.lm takes 'newdata', not 'data')
## 1 2 3 4 5 6 7
## 1.9062868 1.4205145 1.5544700 1.5734990 1.1987225 1.1988253 1.4267200
## 8 9 10 11 12 13 14
## 0.9218618 0.8561097 0.9810192 1.0321137 0.8934448 0.7662382 0.7224313
## 15 16 17 18 19 20 21
## 0.6326043 0.7782531 0.7616811 0.9021542 1.0465592 1.1534633 1.0616692
## 22 23 24 25 26 27 28
## 1.1757962 1.4067941 1.5270307 1.7061855 1.4812157 1.4638150 1.3569702
## 29 30 31 32 33 34 35
## 0.9903034 1.2115269 1.2640589 1.2515183 0.9513844 1.0171626 1.0154945
## 36 37 38 39 40 41 42
## 1.0316662 0.9626875 0.8353368 0.7587419 0.3120323 0.2284210 0.9467264
## 43 44 45 46 47 48 49
## 1.0208919 1.1902717 1.0763046 1.2830739 1.4994149 1.3331603 1.1059714
## 50
## 0.3255995
elec4 <- lm(e1 ~ e5, data = elec)                            # active power ~ sub-metering 1
elec5 <- lm(e1 ~ e2 + e3 + e4 + e5 + e6 + e7, data = elec)   # active power ~ all predictors
predict(elec5)   # fitted values for the observed data
## 1 2 3 4 5 6 7
## 1.9106981 1.4230348 1.5544939 1.5770576 1.1982317 1.1979302 1.4289168
## 8 9 10 11 12 13 14
## 0.9232866 0.8468575 0.9727961 1.0184516 0.8845931 0.7643597 0.7213192
## 15 16 17 18 19 20 21
## 0.6248089 0.7870404 0.7585461 0.9023536 1.0436105 1.1533984 1.0650861
## 22 23 24 25 26 27 28
## 1.1773826 1.4091932 1.5260788 1.7055702 1.4736253 1.4630099 1.3570237
## 29 30 31 32 33 34 35
## 0.9913401 1.2064013 1.2611042 1.2579314 0.9583490 1.0181287 1.0265394
## 36 37 38 39 40 41 42
## 1.0330573 0.9629119 0.8362823 0.7606452 0.3160334 0.2290452 0.9473016
## 43 44 45 46 47 48 49
## 1.0251390 1.1952974 1.0768755 1.2770249 1.4971378 1.3363224 1.1058671
## 50
## 0.3307081
AIC(elec1, elec2, elec3, elec4, elec5)   # compare the five models by AIC
## df AIC
## elec1 3 39.36251
## elec2 3 33.86504
## elec3 4 -324.44541
## elec4 3 13.59473
## elec5 8 -333.55132
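Lower AIC indicates a better balance of fit and model complexity, so elec5 (the full model) is preferred here. As a small convenience sketch using the model names above, the best model can also be selected programmatically:
aic_tab <- AIC(elec1, elec2, elec3, elec4, elec5)
rownames(aic_tab)[which.min(aic_tab$AIC)]   # name of the lowest-AIC model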
summary(elec5)   # detailed summary of the lowest-AIC model
##
## Call:
## lm(formula = e1 ~ e2 + e3 + e4 + e5 + e6 + e7, data = elec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0227189 -0.0039755 -0.0001129 0.0048365 0.0142636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.288e-03 8.200e-03 0.645 0.522457
## e2 -2.049e-01 6.915e-02 -2.964 0.004942 **
## e3 -8.458e-05 4.824e-05 -1.753 0.086666 .
## e4 2.407e-01 2.215e-03 108.681 < 2e-16 ***
## e5 -1.666e-02 4.650e-03 -3.583 0.000859 ***
## e6 3.143e-03 3.401e-03 0.924 0.360586
## e7 4.520e-03 1.688e-03 2.677 0.010479 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007915 on 43 degrees of freedom
## Multiple R-squared: 0.9995, Adjusted R-squared: 0.9995
## F-statistic: 1.537e+04 on 6 and 43 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots (residuals vs fitted, normal Q-Q, scale-location, leverage)
plot(elec1)
plot(elec2)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
plot(elec3)
plot(elec4)
plot(elec5)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
# Simulate m = 50 samples of n = 50 log-normal observations each
n <- 50
m <- 50
set.seed(1)
mu <- 1.105833218    # meanlog
sig <- 0.332370008   # sdlog
x <- matrix(data = rlnorm(n*m, mu, sig), nrow = m)
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 3.2.2
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.2.2
## Fit a log-normal distribution to each of the 50 simulated samples
f <- apply(x, 2, fitdist, "lnorm")
## Plotting the results
for(i in 1:n) {
  par(mar = rep(2, 4))
  plot(f[[i]])
}
apply((sapply(f, "[[", "estimate")),1, summary)
## meanlog sdlog
## Min. 1.010 0.2736
## 1st Qu. 1.067 0.3179
## Median 1.103 0.3355
## Mean 1.103 0.3400
## 3rd Qu. 1.134 0.3588
## Max. 1.238 0.4094
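The recovered parameters are close to the values used to simulate the data (meanlog 1.1058, sdlog 0.3324). A small sketch, using the objects defined above, to place the true values next to the average estimates:
est <- sapply(f, "[[", "estimate")   # 2 x 50 matrix of fitted meanlog/sdlog values
rbind(true = c(meanlog = mu, sdlog = sig), mean_estimate = rowMeans(est))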
# Simulate a second set of m2 = 50 samples of n2 = 50 log-normal observations
n2 <- 50
m2 <- 50
set.seed(1)
mu2 <- 4.707599183    # meanlog
sig2 <- 1.361056881   # sdlog
x <- matrix(data = rlnorm(n2*m2, mu2, sig2), nrow = m2)
library(fitdistrplus)
## Fit a log-normal distribution to each of the 50 simulated samples
f <- apply(x, 2, fitdist, "lnorm")
# Simulate a third set of m1 = 50 samples of n1 = 50 log-normal observations
n1 <- 50
m1 <- 50
set.seed(1)
mu1 <- 1.182446393    # meanlog
sig1 <- 0.37163882    # sdlog
x <- matrix(data = rlnorm(n1*m1, mu1, sig1), nrow = m1)
library(fitdistrplus)
## Fit a log-normal distribution to each of the 50 simulated samples
f <- apply(x, 2, fitdist, "lnorm")
## Plotting the results
for(i in 1:n1) {
  par(mar = rep(2, 4))
  plot(f[[i]])
}
apply((sapply(f, "[[", "estimate")),1, summary)
## meanlog sdlog
## Min. 1.075 0.3059
## 1st Qu. 1.139 0.3554
## Median 1.179 0.3752
## Mean 1.179 0.3801
## 3rd Qu. 1.214 0.4012
## Max. 1.331 0.4577
library(fitdistrplus)
# descdist() prints summary statistics and draws a Cullen and Frey graph for each attribute
descdist(elec$GAP, discrete = FALSE)
## summary statistics
## ------
## min: 0.2377343 max: 1.901295
## median: 1.051411
## mean: 1.090364
## estimated sd: 0.3434032
## estimated skewness: -0.2428006
## estimated kurtosis: 3.427105
descdist(elec$GRP, discrete = FALSE)
## summary statistics
## ------
## min: 0.02034721 max: 0.1583681
## median: 0.1180869
## mean: 0.1157809
## estimated sd: 0.0242234
## estimated skewness: -0.9999193
## estimated kurtosis: 6.638711
descdist(elec$GI, discrete = FALSE)
## summary statistics
## ------
## min: 1.088763 max: 8.029956
## median: 4.475469
## mean: 4.640668
## estimated sd: 1.414515
## estimated skewness: -0.2754431
## estimated kurtosis: 3.592836
descdist(elec$V, discrete = FALSE)
## summary statistics
## ------
## min: 1.754989 max: 242.9772
## median: 240.4766
## mean: 235.201
## estimated sd: 33.73183
## estimated skewness: -7.04262
## estimated kurtosis: 52.7244
descdist(elec$SB1, discrete = FALSE)
## summary statistics
## ------
## min: 0.08412427 max: 1.822049
## median: 1.204736
## mean: 1.16623
## estimated sd: 0.38161
## estimated skewness: -0.9238148
## estimated kurtosis: 4.256636
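The Cullen and Frey graph produced by descdist() only suggests candidate families; as a hedged follow-up sketch (the choice of candidates here is an assumption), competing distributions can be fitted and compared directly with fitdist() and gofstat():
fit_ln <- fitdist(elec$GAP, "lnorm")   # log-normal candidate
fit_n <- fitdist(elec$GAP, "norm")     # normal candidate
gofstat(list(fit_ln, fit_n), fitnames = c("lognormal", "normal"))   # AIC, BIC and GoF statistics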
# Draw 50 log-normal values with the fitted meanlog and sdlog, then compare the
# theoretical density (black line) with the kernel density of the sample (red line)
Trans1 = rlnorm(50, 0.043689632, 0.478378172)
grid = seq(0, 10, .1)
plot(grid, dlnorm(grid, 0.043689632, 0.478378172), type = "l", xlab = "Trans1", ylab = "f(Trans1)")
lines(density(Trans1), col = "red")
Trans2 = rlnorm(50,0.929118275,1.691495047)
grid = seq(0,3,.1)
plot(grid,dlnorm(grid,0.929118275,1.691495047),type="l",xlab="Trans2",ylab="f(Trans2)")
lines(density(Trans2),col="red")
elecs1 <- lm(e1 ~ Trans2, data = elec)         # GAP regressed on the simulated Trans2
df2 = data.frame(elec$GAP, Trans2)
elecs2 <- lm(elec.GAP ~ Trans2, data = df2)    # same relationship, refitted from a data frame
AIC(elecs1)
## [1] 39.9985
Trans3 = rlnorm(50,0.672799479,0.133876276)
grid = seq(0,3,.1)
plot(grid,dlnorm(grid,0.672799479,0.133876276),type="l",xlab="Trans3",ylab="f(Trans3)")
lines(density(Trans3),col="red")
elecs3 <- lm(Trans1 ~ Trans2 + Trans3, data = elec)
AIC(elecs1, elecs2, elecs3)
## df AIC
## elecs1 3 39.99850
## elecs2 3 39.99850
## elecs3 4 63.71217
# Method-of-moments estimates of the beta shape parameters from a mean and variance;
# note that a valid beta distribution requires a mean in (0, 1), so a mean above 1
# yields a negative alpha
BetaPara <- function (mu, var)
{
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(list(alpha = alpha, beta = beta))
}
BetaPara(2.380148282, 0.244274368)
## $alpha
## [1] -34.38795
##
## $beta
## [1] 19.94013
# Plot the beta density (dbeta), distribution function (pbeta) and quantile
# function (qbeta) for shape parameters a and b
Beta_Trans <- function(a, b, asp = if(isLim) 1, ylim = if(isLim) c(0, 1.1)) {
  if(isLim <- a == 0 || b == 0 || a == Inf || b == Inf) {
    eps <- 1e-10
    x <- c(0, eps, (1:7)/16, 1/2 + c(-eps, 0, eps), (9:15)/16, 1 - eps, 1)
  } else {
    x <- seq(0, 1, length = 1025)
  }
  fx <- cbind(dbeta(x, a, b), pbeta(x, a, b), qbeta(x, a, b))
  f <- fx; f[fx == Inf] <- 1e100
  matplot(x, f, ylab = "", type = "l", ylim = ylim, asp = asp,
          main = sprintf("Beta Transformation of Voltage (x, a=%g, b=%g)", a, b))
  abline(0, 1, col = "gray", lty = 3)
  abline(h = 0:1, col = "gray", lty = 3)
  legend("top", paste0(c("d", "p", "q"), "beta(x, a,b)"),
         col = 1:3, lty = 1:3, bty = "n")
  invisible(cbind(x, fx))
}
Beta_Trans(34.38795, 19.94013)