Univariate analysis will be done for both categorical and numerical variables.
# use package "here"
library(here)
## here() starts at /Users/kelvinosuagwu/Desktop/parent/datavisualisation_&Analysis_CW
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Point the working directory at the project root reported by here().
# NOTE(review): setwd() inside a script is discouraged; it is kept so that
# relative paths used elsewhere keep resolving the same way.
setwd(here::here())
# Load the data file; text columns are read as factors.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
energy <- read.csv("data/energy.csv", header = TRUE, stringsAsFactors = TRUE)
# let us check the structure of the dataset to view the datatype
str(energy)
## 'data.frame': 795 obs. of 10 variables:
## $ Instance : int 1 2 3 4 5 6 7 8 9 10 ...
## $ AproxArea : num 556 463 463 540 514 ...
## $ WallArea : num 279 283 296 294 297 ...
## $ RoofArea : num 104 117 108 104 105 ...
## $ GlassArea : num 0 0 0 0 30 ...
## $ Height : Factor w/ 2 levels "high","low": 1 1 1 1 1 1 1 1 1 1 ...
## $ Condition : Factor w/ 5 levels "A","B","C","D",..: 1 3 2 2 1 3 3 3 3 3 ...
## $ Orientation: Factor w/ 4 levels "E","N","S","W": 1 2 3 4 1 1 1 1 1 2 ...
## $ HeatingLoad: num 15.6 15.6 15.6 15.6 24.6 ...
## $ CoolingLoad: num 21.3 21.3 21.3 21.3 26.3 ...
# view dataset if NA is present
summary(energy) #heating load variable has 4NA'S
## Instance AproxArea WallArea RoofArea
## Min. : 1.0 Min. :463.1 Min. :227.8 Min. :103.6
## 1st Qu.:199.5 1st Qu.:602.0 1st Qu.:285.2 1st Qu.:138.2
## Median :398.0 Median :673.8 Median :314.6 Median :207.3
## Mean :398.0 Mean :674.1 Mean :319.1 Mean :179.0
## 3rd Qu.:596.5 3rd Qu.:746.7 3rd Qu.:344.2 3rd Qu.:220.5
## Max. :795.0 Max. :889.4 Max. :550.8 Max. :238.1
##
## GlassArea Height Condition Orientation HeatingLoad
## Min. : 0.00 high:384 A:117 E:202 Min. : 6.01
## 1st Qu.: 32.49 low :411 B:135 N:197 1st Qu.:12.93
## Median : 75.71 C:517 S:195 Median :17.37
## Mean : 75.00 D: 4 W:201 Mean :21.98
## 3rd Qu.:112.90 E: 22 3rd Qu.:31.20
## Max. :174.93 Max. :43.10
## NA's :4
## CoolingLoad
## Min. :10.90
## 1st Qu.:15.49
## Median :21.33
## Mean :24.28
## 3rd Qu.:32.92
## Max. :48.03
##
The dataset has 795 observations. The HeatingLoad variable has 4 missing values (NAs). The minimum of GlassArea is 0, which suggests that glass was not used in some buildings. The Instance variable is only a row identifier and has no meaning for the analysis.
# Drop the Instance column using subset().
energy <- subset(energy, select = -c(Instance))
# Replace the missing (NA) HeatingLoad values with the mean of the observed
# values.
energy$HeatingLoad[is.na(energy$HeatingLoad)] <-
  mean(energy$HeatingLoad, na.rm = TRUE)
# Round to 2 decimal places to be on par with the other variables.
# round() alone is used: wrapping it in format() returns character strings,
# which would stop HeatingLoad being usable as a number (cov(), cor(), lm(),
# continuous plot axes).
energy$HeatingLoad <- round(energy$HeatingLoad, digits = 2)
Check summary
Central tendency and Spread for APROX AREA variable: This is going to be considered as a continuous scale
Standard deviation
sd(energy$AproxArea)
## [1] 96.66033
Median value using a vertical red line
# Dot plot of AproxArea. The red vertical line marks the median so we can
# check whether the median is a typical value of this dataset.
aproxArea.plot1 <- ggplot(energy, aes(x = AproxArea)) +
  geom_dotplot(col = "black", fill = "gold", binwidth = 7) +
  labs(x = "Approximate area of home", y = "proportion") +
  # The median is computed from the data instead of being typed in (673.8),
  # so the line stays correct if the data change.
  geom_vline(xintercept = median(energy$AproxArea), color = "red", size = 0.5) +
  # theme_minimal() is a complete theme; a preceding theme_classic() is
  # fully overridden by it and has been dropped.
  theme_minimal()
# Box plot of AproxArea.
aproxArea.plot2 <- ggplot(energy, aes(y = AproxArea)) +
  geom_boxplot(col = "blue", fill = "lightblue") +
  labs(title = "The Approx area of the building", x = "", y = "AproxArea") +
  theme_minimal()
# Density histogram with a density curve; the thick line is the median.
aproxArea.plot3 <- ggplot(energy, aes(x = AproxArea)) +
  geom_histogram(aes(y = ..density..), col = "red", fill = "grey", binwidth = 20) +
  geom_vline(xintercept = median(energy$AproxArea), lwd = 2) +
  labs(x = "Distribution of Approx area ", y = "Density") +
  geom_density(col = "blue") +
  theme_minimal()
library(ggpubr)
# Arrange the three plots on a 2 x 2 grid.
ggarrange(aproxArea.plot1, aproxArea.plot2, aproxArea.plot3, ncol = 2, nrow = 2)
# Density histogram of WallArea with a density curve; the thick vertical line
# is the median.
wallArea.plot1 <- ggplot(energy, aes(x = WallArea)) +
  geom_histogram(aes(y = ..density..), col = "red", fill = "grey", binwidth = 20) +
  geom_vline(xintercept = median(energy$WallArea), lwd = 2) +
  # The axis label previously described the approximate area, but this plot
  # shows WallArea.
  labs(x = "Distribution of Wall area", y = "Density") +
  geom_density(col = "blue") +
  # theme_minimal() is a complete theme, so theme_classic() before it had no
  # effect and has been dropped.
  theme_minimal()
# let us use dot plot to describe the data
#using line to check if the median is a typical value of this dataset
library(cowplot) #cowplot package for grid
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
# Dot plot of WallArea. The red vertical line marks the mean so we can check
# whether the mean is a typical value of this dataset.
wallA.plotMean <- ggplot(energy, aes(x = WallArea)) +
  geom_dotplot(col = "black", fill = "gold", binwidth = 6.0) +
  labs(x = "Distribution of wall Area", y = "Proportion") +
  # The mean is computed from the data instead of being typed in (319.1).
  geom_vline(xintercept = mean(energy$WallArea), color = "red", size = 0.7) +
  # theme_minimal() replaces any earlier complete theme, so theme_classic()
  # was redundant here.
  theme_minimal()
# Box plot of WallArea. The x axis carries no variable, so its label is left
# empty (it previously read "Approx area", which is a different column).
wallArea.plot2 <- ggplot(energy, aes(y = WallArea)) +
  geom_boxplot(col = "blue", fill = "lightblue") +
  labs(title = "The Wall area in (sqft) for Home energy usage in 2016",
       x = "", y = "Wall Area in sqft") +
  theme_classic()
# Arrange the three WallArea plots on a 2 x 2 grid.
ggarrange(wallA.plotMean, wallArea.plot1, wallArea.plot2, ncol = 2, nrow = 2)
Central tendency and Spread for ROOF AREA variable: This is going to be considered as a continuous scale
# Box plot of RoofArea.
roofArea.plot1 <- ggplot(data = energy, mapping = aes(y = RoofArea)) +
  geom_boxplot(fill = "lightblue", col = "blue") +
  theme_classic() +
  labs(title = "The Roof area in (sqft) ", x = "Roof Area", y = "Average ")

# Density histogram of RoofArea; the vertical line marks the mean.
roofArea.plotMean <- ggplot(data = energy, mapping = aes(x = RoofArea)) +
  geom_histogram(mapping = aes(y = ..density..),
                 binwidth = 20, fill = "grey", col = "red") +
  geom_vline(xintercept = mean(energy$RoofArea), lwd = 1) +
  theme_classic() +
  labs(x = "Distribution of Roof area ", y = "Density")

# Same histogram; the vertical line marks the median.
roofArea.plotMedian <- ggplot(data = energy, mapping = aes(x = RoofArea)) +
  geom_histogram(mapping = aes(y = ..density..),
                 binwidth = 20, fill = "grey", col = "red") +
  geom_vline(xintercept = median(energy$RoofArea), lwd = 1) +
  theme_classic() +
  labs(x = "Distribution of Roof area ", y = "Density")

# Show the three plots side by side in one row.
ggarrange(roofArea.plot1, roofArea.plotMean, roofArea.plotMedian,
          nrow = 1, ncol = 3)
Central tendency and Spread for GLASS AREA variable: This is going to be considered as a continuous variable
# Density histogram of GlassArea; the vertical line marks the mean.
glassArea.plotMean <- ggplot(data = energy, mapping = aes(x = GlassArea)) +
  geom_histogram(mapping = aes(y = ..density..),
                 binwidth = 20, fill = "grey", col = "red") +
  geom_vline(xintercept = mean(energy$GlassArea), lwd = 1) +
  theme_classic() +
  labs(x = "Distribution of Glass area ", y = "Density")

# Same histogram; the vertical line marks the median.
glassArea.plotMedian <- ggplot(data = energy, mapping = aes(x = GlassArea)) +
  geom_histogram(mapping = aes(y = ..density..),
                 binwidth = 20, fill = "grey", col = "red") +
  geom_vline(xintercept = median(energy$GlassArea), lwd = 1) +
  theme_classic() +
  labs(x = "Distribution of Glass area ", y = "Density")

# Show both plots side by side.
ggarrange(glassArea.plotMean, glassArea.plotMedian, nrow = 1, ncol = 2)
Description
The second bar plot shows the mean at the vertical line. While the third bar plot shows the median at the vertical line. The mean value is close to the median value.
NA: No missing values
Outliers: No outliers
Distribution: the distribution is right-skewed
Typical Values: The mean is representative of the dataset.
Spread: The data is compact
Central tendency and Spread for Height variable: This is going to be considered as a discrete /categorical variable
# Bar chart of Height where each bar is the share of all observations.
height.plot2 <- ggplot(
  data = energy,
  mapping = aes(x = Height, y = ..count.. / sum(..count..))
)
height.plot2 <- height.plot2 +
  geom_bar() +
  # Print the y axis as percentages.
  scale_y_continuous(labels = scales::percent) +
  labs(title = "The Height of building for Home energy usage in 2016",
       x = "Height of building",
       y = "Percent")
ggarrange(height.plot2, nrow = 1, ncol = 1)
# Pie chart of the Condition variable, labelled "count - percent%".
condition.Table <- table(energy$Condition)
tableCount <- condition.Table
# Share of each category in percent, to one decimal place.
pie.percent <- round(100 * condition.Table / sum(condition.Table), 1)
# Build the slice labels in one step: "<count> - <percent>%".
countWithPercent <- paste0(tableCount, " - ", pie.percent, "%")
# One colour per category.
pie.Colors <- c("red", "magenta", "lightblue", "yellow", "green")
# Draw the pie chart and add a legend mapping colours to categories.
pie(condition.Table,
    labels = countWithPercent,
    col = pie.Colors,
    main = "Pie chart for count - percentages of Home Condition Category")
legend("topright", c("A", "B", "C", "D", "E"), fill = pie.Colors)
Central tendency and Spread for Orientation variable: This is going to be considered as a discrete /categorical variable
# Pie chart of the Orientation variable, labelled "count - percent%".
orientation.Table <- table(energy$Orientation)
tableCount <- orientation.Table
tableCount
##
## E N S W
## 202 197 195 201
# Share of each orientation in percent, to one decimal place.
pie.percent <- round(100 * orientation.Table / sum(orientation.Table), 1)
pie.percent
##
## E N S W
## 25.4 24.8 24.5 25.3
# Build the slice labels in one step: "<count> - <percent>%".
countWithPercent <- paste0(tableCount, " - ", pie.percent, "%")
# One colour per orientation.
pie.Colors <- c("blue", "#EDEAE0", "#9966CC", "#FFBF00")
# Draw the pie chart and add a legend mapping colours to orientations.
pie(orientation.Table,
    labels = countWithPercent,
    col = pie.Colors,
    main = "Pie chart for count - percentages of house orientation category")
legend("topright", c("EAST", "NORTH", "SOUTH", "WEST"), fill = pie.Colors)
Central tendency and Spread for Heating Load dependent variable: This is going to be considered as a continuous variable
# Dot plot of HeatingLoad. The red line marks the median so we can check
# whether the median is representative of this dataset.
heatLoadA.plotMedian <- ggplot(energy, aes(x = HeatingLoad)) +
  geom_dotplot(col = "black", fill = "gold", binwidth = 1) +
  labs(x = "Distribution of HeatingLoad data", y = "") +
  theme_classic() +
  # The median is computed from the data; the previously typed-in 17.50 did
  # not match the median reported by summary().
  geom_vline(xintercept = median(energy$HeatingLoad), color = "red", size = 0.7)
# Same dot plot with the red line at the mean.
heatLoadB.plotMean <- ggplot(energy, aes(x = HeatingLoad)) +
  geom_dotplot(col = "black", fill = "gold", binwidth = 1) +
  labs(x = "Distribution of HeatingLoad data", y = "") +
  theme_classic() +
  # The mean is computed from the data instead of being typed in (21.98).
  geom_vline(xintercept = mean(energy$HeatingLoad), color = "red", size = 0.7)
# Show both dot plots together, labelled A and B.
plot_grid(heatLoadA.plotMedian, heatLoadB.plotMean, labels = "AUTO")
# Use a density histogram to visualize the distribution.
Heating.plot <- ggplot(energy, aes(x = HeatingLoad)) +
  geom_histogram(aes(y = ..density..), col = "red", fill = "grey", binwidth = 5) +
  labs(x = "Distribution of Heating Load ", y = "Density") +
  # theme_minimal() is a complete theme, so theme_classic() before it had no
  # effect and has been dropped.
  theme_minimal()
Heating.plot
Central tendency and Spread for Cooling Load dependent variable: This is going to be considered as a continuous variable
# Box plot of CoolingLoad.
cl.plotA <- ggplot(energy, aes(y = CoolingLoad)) +
  geom_boxplot(col = "black", fill = "lightblue") +
  labs(title = "The Cooling Load for Homes in 2016",
       x = "cooling ",
       y = "Average cooling Load in British Thermal Unit") +
  theme_classic()
# Dot plot of CoolingLoad. The red line marks the mean so we can check whether
# the mean is representative of this dataset.
cLoad.plotMeanB <- ggplot(energy, aes(x = CoolingLoad)) +
  geom_dotplot(col = "black", fill = "gold", binwidth = 0.5) +
  labs(x = "Distribution of Cooling Load data", y = "") +
  theme_classic() +
  # The mean is computed from the data instead of being typed in (24.28).
  geom_vline(xintercept = mean(energy$CoolingLoad), color = "red", size = 0.7)
# Use a density histogram to visualize the distribution.
Cooling.plotC <- ggplot(energy, aes(x = CoolingLoad)) +
  geom_histogram(aes(y = ..density..), col = "red", fill = "grey", binwidth = 6) +
  labs(x = "Distribution of Cooling Load ", y = "Density") +
  # theme_minimal() is a complete theme, so theme_classic() before it had no
  # effect and has been dropped.
  theme_minimal()
# Arrange the three plots on a 2 x 2 grid.
ggarrange(cl.plotA, cLoad.plotMeanB, Cooling.plotC, ncol = 2, nrow = 2)
Create a dataset called noDCondition which contains only the instances of the energy dataset where the condition value is “A”, “B”, “C” or “E”, i.e. instances with Condition value “D” are excluded.
# Keep every row whose Condition is not "D", then drop the now-unused "D"
# factor level. `!=` states the intent directly; `!x == 'D'` only worked
# because `!` binds more loosely than `==`.
noDCondition <- droplevels(energy[energy$Condition != "D", ])
# Pie chart of Condition for the reduced dataset.
noDCondition.Table <- table(noDCondition$Condition)
tableCount <- noDCondition.Table
tableCount
##
## A B C E
## 117 135 517 22
# Share of each category in percent, to one decimal place.
pie.percent <- round(100 * noDCondition.Table / sum(noDCondition.Table), 1)
pie.percent
##
## A B C E
## 14.8 17.1 65.4 2.8
# Slice labels of the form "<count> - <percent>%".
countWithPercent <- paste0(tableCount, " - ", pie.percent, "%")
# One colour per remaining category.
pie.Colors <- c("blue", "#EDEAE0", "#9966CC", "#FFBF00")
# Draw the pie chart. The title names category D, which is the one removed
# above (it previously said B).
pie(noDCondition.Table,
    labels = countWithPercent,
    main = "Chart for count - percentages of house condition without D category ",
    col = pie.Colors)
legend("topright", c("A", "B", "C", "E"), fill = pie.Colors)
# Naming: hl = heating load, con = condition.
# Keep only the rows with Orientation "W" and Height "high", and drop factor
# levels that no longer occur.
hlCon.data <- droplevels(
  energy[energy$Height == "high" & energy$Orientation == "W", ]
)
# library(cowplot)
# Point plot of HeatingLoad for each Condition, coloured by Condition.
pl <- ggplot(
  data = hlCon.data,
  mapping = aes(x = Condition, y = HeatingLoad,
                color = Condition, group = Condition)
)
pl <- pl +
  geom_point(size = 2) +
  labs(title = "Heating load distribution by condition")
# Box plot of HeatingLoad for each Condition.
pl.box <- ggplot(hlCon.data, mapping = aes(y = HeatingLoad, x = Condition))
pl.box <- pl.box +
  geom_boxplot() +
  theme_classic() +
  theme_minimal() +
  labs(title = "Boxplot of HeatingLoad by Condition",
       x = "Condition",
       y = "HeatingLoad")
# Show both plots together, labelled A and B.
plot_grid(pl, pl.box, labels = "AUTO")
Comments
The above plots represents the distribution of HeatingLoad for different conditions
From the boxplot on the right, HeatingLoad has low values for Condition “A”, with a median of 29.5. HeatingLoad has high values for Condition “B”, whose median is almost 29.7. The median of HeatingLoad for Condition “C” is 34.56. There is an outlier among the Condition “B” buildings. The distribution for Condition “C” is skewed to the left (negative).
From the dot plot on the left, the “C” category appears to be most compact and frequent in values. It has an increase but not widely spread. “A” category appears to be less widely spread than the B category in values.
“B” category appears to be widely spread than the others in values and also has the lowest and highest heating Load value.
Use the energy dataset to produce a plot where it is possible to compare the Heating Load and the Cooling Load values.
# Keep rows where Orientation is "N", Condition is "B" and AproxArea >= 650,
# then drop factor levels that no longer occur.
hlCon.data <- droplevels(
  energy[with(energy, Orientation == "N" & Condition == "B" & AproxArea >= 650), ]
)
compare Aprox area vs heating load / Cooling load
# Line plot of HeatingLoad and CoolingLoad against AproxArea for the filtered
# rows in hlCon.data (already restricted by Orientation, Condition and area).
plot <- ggplot(data = hlCon.data, mapping = aes(AproxArea))
# One line per load; the constant colour labels create the legend entries.
plot <- plot +
  geom_line(mapping = aes(y = HeatingLoad, colour = "HeatingLoad")) +
  geom_line(mapping = aes(y = CoolingLoad, colour = "CoolingLoad"))
# Legend title, axis labels and plot title.
plot <- plot +
  scale_colour_hue("channels") +
  labs(title = "Energy consumed in an Aproximate area",
       x = "AproxArea",
       y = "Heating Load / Cooling Load")
print(plot)
The above plot represents the relation between columns AproxArea vs HeatingLoad/ Cooling Load . From the distribution of the line in above plot, The heating load and cooling load over approx area is similar in trend and pattern . The both increase and decrease almost same.However the cooling load appear to be higher in value when compared to the approx area.The highest value at approx area is 556.
Bivariate Data and Linear Regression
# Sample covariance between CoolingLoad and HeatingLoad, to see whether the
# two variables vary together. cov() uses the Pearson method by default.
cov(energy$CoolingLoad,energy$HeatingLoad) # HeatingLoad is treated as the response and CoolingLoad as the predictor
## [1] 93.02044
# Correlation between CoolingLoad and HeatingLoad.
# cor() uses the Pearson method by default.
cor(energy$CoolingLoad,energy$HeatingLoad)
## [1] 0.9752675
# Two-column data frame holding only the heating and cooling loads.
energy.data <- energy[, c("HeatingLoad", "CoolingLoad")]
# corrplot draws the correlation matrix, to see whether the variables are
# linearly related.
library(corrplot)
## corrplot 0.84 loaded
# Plot the correlation matrix and keep what corrplot() returns.
cor.matrix <- corrplot(cor(energy.data))
print(cor.matrix)
## HeatingLoad CoolingLoad
## HeatingLoad 1.0000000 0.9752675
## CoolingLoad 0.9752675 1.0000000
# Scatter plot of HeatingLoad against CoolingLoad to judge visually whether
# the relationship is linear.
ggplot(data = energy.data, mapping = aes(y = HeatingLoad, x = CoolingLoad)) +
  geom_point() +
  labs(
    x = "Cooling load (KBTU)",
    y = "Heating Load (KBTU)",
    title = "Cooling Load vs Heating Load"
  )
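Covariance: From the results, the covariance is 93.02044. It is positive, so the two variables covary (they tend to increase together). Covariance depends on the units of the variables and is not a percentage.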
Correlation: correlation coefficient is 0.9752675 very close to 1
Correlation Matrix: Shows strong positive correlation
Scatter plot: The variables have a strong linear correlation and the graph looks reasonably linear
A linear model is appropriate. So we shall proceed to a linear regression.
# Simple linear regression with HeatingLoad as the response and CoolingLoad
# as the predictor, fitted on energy.data.
regressor <- lm(formula=HeatingLoad~CoolingLoad,data=energy.data)
# Model summary: residual quantiles, coefficient table, R-squared and F-test.
summary(regressor)
##
## Call:
## lm(formula = HeatingLoad ~ CoolingLoad, data = energy.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.2435 -0.9171 -0.1044 1.3788 6.2989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.045746 0.216277 -14.08 <2e-16 ***
## CoolingLoad 1.030965 0.008297 124.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.221 on 793 degrees of freedom
## Multiple R-squared: 0.9511, Adjusted R-squared: 0.9511
## F-statistic: 1.544e+04 on 1 and 793 DF, p-value: < 2.2e-16
Observation
From the summary statistics, The p-VALUE shows the cooling load is statistically significant and has a strong influence on the Heating load. The heating load increases as the cooling load increases.
# Visualise the fitted model y_hat = B0 + B1 * x with ggplot2, built in layers:
# 1. ggplot() with the data and the shared x/y mapping
# 2. the observed points
# 3. the regression line, from the model's predictions on energy.data
# Columns are referenced through `data =` and bare names in aes(); using
# energy.data$col inside aes() bypasses the plot data.
p <- ggplot(energy.data, aes(x = CoolingLoad, y = HeatingLoad)) +
  geom_point(color = "red") +
  # The colour name is written without the stray leading space.
  geom_line(aes(y = predict(regressor, newdata = energy.data)),
            color = "blue") +
  ggtitle('Cooling vs Heating load(Energy observation points)') +
  xlab('Cooling load') +
  ylab('Heating load')
p
Predict the Heating Load based on CoolingLoad of 32 kBTU.
# One-row data frame holding the cooling load to predict for.
newdata <- data.frame(CoolingLoad = 32)
# Predicted heating load (hl) from the fitted model for that cooling load.
hl.predict <- predict(object = regressor, newdata = newdata)
print(hl.predict)
## 1
## 29.94512
The predicted heating load is 29.94512 ~ 29.95KBTU
# Simple linear regression with CoolingLoad as the response and HeatingLoad
# as the predictor, fitted on energy.data.
regressorNew <- lm(formula=CoolingLoad~HeatingLoad, data=energy.data)
# Model summary: residual quantiles, coefficient table, R-squared and F-test.
summary(regressorNew)
##
## Call:
## lm(formula = CoolingLoad ~ HeatingLoad, data = energy.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1246 -1.1918 -0.1625 0.6572 8.9658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.995919 0.179418 22.27 <2e-16 ***
## HeatingLoad 0.922579 0.007425 124.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.101 on 793 degrees of freedom
## Multiple R-squared: 0.9511, Adjusted R-squared: 0.9511
## F-statistic: 1.544e+04 on 1 and 793 DF, p-value: < 2.2e-16
From the summary statistics, the p-value shows that the heating load is a statistically significant predictor of the cooling load. The cooling load increases as the heating load increases. Also, the coefficient of determination is 0.9511; because this is close to 1, a linear model fits the data well.
# Visualise the fitted model y_hat = B0 + B1 * x with ggplot2, built in layers:
# 1. ggplot() with the data and the shared x/y mapping
# 2. the observed points
# 3. the regression line, from the model's predictions on energy.data
# Columns are referenced through `data =` and bare names in aes(); this also
# removes the stray double comma that was inside the geom_line() aes() call.
p <- ggplot(energy.data, aes(x = HeatingLoad, y = CoolingLoad)) +
  geom_point(color = "red") +
  # The colour name is written without the stray leading space.
  geom_line(aes(y = predict(regressorNew, newdata = energy.data)),
            color = "blue") +
  ggtitle('Heating vs Cooling load(Energy observation points)') +
  xlab('Heating load') +
  ylab('Cooling load')
p
# One-row data frame holding the heating load to predict for.
data.new <- data.frame(HeatingLoad = 41)
# Predicted cooling load (cl) from regressorNew for that heating load.
cl.predict <- predict(object = regressorNew, newdata = data.new)
print(cl.predict)
## 1
## 41.82167
Results:
Based on the regression model and the data, we estimate a cooling load of approximately 41.82 KBTU for a heating load of 41 KBTU.
Hypothesis testing and Confidence Interval of 99%
# Mean cooling load (cl) of the energy data, rounded to a whole number.
cl.mean <- round(mean(energy$CoolingLoad))
# Dataset identifiers 1 to 18.
datasetNumber <- 1:18
# Average cooling load per dataset; the last entry is this dataset's mean.
coolingLoadAvg <- c(
  23, 24, 23, 25, 24, 23, 26, 24, 23,
  25, 24, 23, 26, 22, 25, 25, 22, cl.mean
)
coolLoads <- data.frame(datasetNumber, coolingLoadAvg)
# The sample has only 18 values (not more than 30), so test for normality.
shapiro.test(coolLoads$coolingLoadAvg)
##
## Shapiro-Wilk normality test
##
## data: coolLoads$coolingLoadAvg
## W = 0.92863, p-value = 0.1837
# Q-Q plot of the cooling load averages with a reference line.
p <- ggplot(coolLoads, aes(sample = coolingLoadAvg)) +
  stat_qq() +
  stat_qq_line()
print(p)
# Dot plot of the same values, with the x axis limited to 20-27.
p <- ggplot(coolLoads, aes(x = coolingLoadAvg)) +
  geom_dotplot(binwidth = 0.4) +
  labs(x = "cooling load average", y = "proportions") +
  xlim(20, 27)
print(p)
The data is quite reasonably evenly distributed over the range.
H0: Null Hypothesis states that mean cooling load is equal to 23.5 Ha : Alternate Hypothesis states that mean cooling load is not equal to 23.5
# One-sample, two-sided t-test of H0: mean cooling load = 23.5, reported with
# a 99% confidence interval. `paired` is omitted because it has no meaning
# when only one sample is supplied.
t.test(x = coolLoads$coolingLoadAvg, alternative = "two.sided", mu = 23.5, conf.level = 0.99)
##
## One Sample t-test
##
## data: coolLoads$coolingLoadAvg
## t = 1.5567, df = 17, p-value = 0.138
## alternative hypothesis: true mean is not equal to 23.5
## 99 percent confidence interval:
## 23.11696 24.77193
## sample estimates:
## mean of x
## 23.94444
Exploring the relationship between RoofArea (Response) and Height( Predictor)
# Explore the relationship with a ggplot2 box plot:
# distribution of RoofArea for each Height level.
# Two semi-transparent fill colours, one per Height level.
colors <- c(rgb(0.1, 0.1, 0.7, 0.5), rgb(0.8, 0.1, 0.3, 0.6))
ggplot(data = energy, mapping = aes(x = Height, y = RoofArea, fill = Height)) +
  geom_boxplot() +
  scale_fill_manual(values = colors) +
  theme_classic() +
  ggtitle('Roof area by Height') +
  xlab('') +
  ylab('Roof Area')
No outliers.
It is assumed that the value of the height influences the Roof area. Let test this using T-test
Independent two sample parametric test.
$H_0$: the mean roof area of high buildings is equal to the mean roof area of low buildings ($\mu_{high} = \mu_{low}$). $H_a$: the mean roof area of high buildings is not equal to the mean roof area of low buildings ($\mu_{high} \neq \mu_{low}$).
# Independent two-sample, two-sided t-test of RoofArea between the two Height
# groups, assuming equal variances, at a 95% confidence level.
# `paired` is not passed: the groups are independent, and newer R releases
# reject `paired` in the formula method of t.test().
t.test(energy$RoofArea ~ energy$Height, alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: energy$RoofArea by energy$Height
## t = -95.38, df = 793, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -88.98369 -85.39492
## sample estimates:
## mean in group high mean in group low
## 133.9513 221.1406
Exploring the relationship between GlassArea (Response) and Height( Predictor)
# Explore the relationship with a ggplot2 box plot:
# distribution of GlassArea for each Height level.
# Two semi-transparent fill colours, one per Height level.
colors <- c(rgb(0.4, 0.1, 0.5, 0.5), rgb(0.8, 0.1, 0.3, 0.6))
ggplot(data = energy, mapping = aes(x = Height, y = GlassArea, fill = Height)) +
  geom_boxplot() +
  scale_fill_manual(values = colors) +
  theme_classic() +
  ggtitle('Glass area by Height') +
  xlab('') +
  ylab('Glass Area')
It is assumed that the value of the height influences the Glass area. Let test this using T-test
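H0: the mean glass area of high buildings is equal to the mean glass area of low buildings.
H1 (two-sided test): the mean glass area of high buildings is not equal to the mean glass area of low buildings.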
Independent two sample parametric test. Since sample size (energy) > 30, Neglect check for normality in distribution of data
# Independent two-sample, two-sided t-test of GlassArea between the two Height
# groups, assuming equal variances, at a 95% confidence level.
# `paired` is not passed: the groups are independent, and newer R releases
# reject `paired` in the formula method of t.test().
t.test(energy$GlassArea ~ energy$Height, alternative = "two.sided", mu = 0, var.equal = TRUE, conf.level = 0.95)
##
## Two Sample t-test
##
## data: energy$GlassArea by energy$Height
## t = 1.6481, df = 793, p-value = 0.09973
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.986432 11.312884
## sample estimates:
## mean in group high mean in group low
## 77.67133 72.50810
# Keep only the Orientation and CoolingLoad columns.
energy.data <- energy[, c("Orientation", "CoolingLoad")]
# Look at the first 10 rows.
head(energy.data, n = 10)
# Split the rows by orientation category.
orientation.N <- energy.data[energy.data[["Orientation"]] == "N", ]
orientation.E <- energy.data[energy.data[["Orientation"]] == "E", ]
orientation.W <- energy.data[energy.data[["Orientation"]] == "W", ]
orientation.S <- energy.data[energy.data[["Orientation"]] == "S", ]
# One single-column data frame of cooling loads per orientation.
North <- data.frame(N = orientation.N[["CoolingLoad"]])
East <- data.frame(E = orientation.E[["CoolingLoad"]])
West <- data.frame(W = orientation.W[["CoolingLoad"]])
South <- data.frame(S = orientation.S[["CoolingLoad"]])
Check the distribution of each variable
# Build a dot plot of cooling loads with a red vertical line at their mean.
#   loads:      numeric vector of cooling loads for one orientation
#   axis.label: text shown on the x axis
orientation.dotplot <- function(loads, axis.label) {
  ggplot(data.frame(load = loads), aes(x = load)) +
    geom_dotplot(binwidth = 0.5) +
    labs(x = axis.label) +
    geom_vline(xintercept = mean(loads), color = "red", size = 0.7)
}
# One dot plot per orientation.
N <- orientation.dotplot(North$N, "North category")
S <- orientation.dotplot(South$S, "South category")
E <- orientation.dotplot(East$E, "East category")
W <- orientation.dotplot(West$W, "West category")
# Show the four dot plots on a grid with two rows to compare the distribution
# of values for each orientation.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(N, W, E, S, nrow = 2)
The mean is on the red vertical line. It is almost the same for all plots, but let us test with Anova
Defining the hypotheses
H0: There is no difference in the mean cooling load values for the different categories of orientation, i.e. µNorth = µEast = µWest = µSouth
H1: at least two means are different or there is a difference.
Applying the one-way ANOVA test
# One-way ANOVA of CoolingLoad across the Orientation categories.
# NOTE(review): the object name `anova` masks stats::anova(), and the formula
# uses energy.data$ although `data =` is also given; both are left unchanged
# because the name and the printed term label are relied on as they are.
anova <- aov( energy.data$CoolingLoad ~ energy.data$Orientation, data = energy.data)
# Print the ANOVA table (Df, Sum Sq, Mean Sq, F value, Pr(>F)).
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## energy.data$Orientation 3 18 6.08 0.067 0.977
## Residuals 791 71622 90.55
The test is only valid if the residuals are distributed normally. check the normality of the residuals using a Q-Q plot
# Put the ANOVA residuals in a data frame so ggplot2 can plot them.
anovaFrame <- data.frame(residuals = anova$residuals)
# Q-Q plot of the residuals with a dashed red reference line.
pl <- ggplot(data = anovaFrame, mapping = aes(sample = residuals))
pl <- pl + stat_qq(size = 2)
pl <- pl + stat_qq_line(linetype = "dashed", color = 'red', alpha = 0.7)
pl <- pl + theme_classic() + theme_minimal()
print(pl)
# Shapiro-Wilk normality test on the same residuals.
shapiro.test(anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: anova$residuals
## W = 0.90131, p-value < 2.2e-16
The p-value < 0.05 so it is reasonable to assume the distribution is not normal. Therefore the one-way ANOVA test is not valid.
Comments