# Install required packages if not already installed
packages <- c("tidyverse", "car", "ggpubr", "gtsummary", "effects", "broom", "shiny",
"bslib", "readr", "car")
install_if_missing <- function(p) {
if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}
lapply(packages, install_if_missing)
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
# Load libraries
# PART 1
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
## Loading required package: ggplot2
library(gtsummary)
library(effects)
## Warning: package 'effects' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
# PART 2
library(shiny)
## Warning: package 'shiny' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(car)
Interpretation:
We load the core packages needed for data analysis, visualization, ANOVA, regression, and diagnostics.
df <- read.csv("C:/Users/mohaa/Downloads/Omani_household_energy.csv")
df$Governorate <- as.factor(df$Governorate)
df$HouseType <- as.factor(df$HouseType)
Interpretation:
The dataset is loaded and categorical variables
(Governorate and HouseType) are converted to
factors for analysis.
df %>%
group_by(Governorate, HouseType) %>%
summarise(across(where(is.numeric),
list(mean = mean, sd = sd),
.names = "{.col}_{.fn}"),
.groups = "drop")
## # A tibble: 22 × 10
## Governorate HouseType Monthly_Income_OMR_mean Monthly_Income_OMR_sd
## <fct> <fct> <dbl> <dbl>
## 1 Al Batinah North Apartment 1522. 398.
## 2 Al Batinah North Villa 1312. 201.
## 3 Al Batinah South Apartment 993. 445.
## 4 Al Batinah South Villa 1452. 374.
## 5 Al Buraimi Apartment 1367. 439.
## 6 Al Buraimi Villa 1298. 292.
## 7 Al Dakhiliyah Apartment 1218. 395.
## 8 Al Dakhiliyah Villa 1225. 513.
## 9 Al Dhahirah Apartment 1394. 448.
## 10 Al Dhahirah Villa 1113. 441.
## # ℹ 12 more rows
## # ℹ 6 more variables: Family_Size_mean <dbl>, Family_Size_sd <dbl>,
## # Electricity_Usage_kWh_mean <dbl>, Electricity_Usage_kWh_sd <dbl>,
## # Water_Consumption_L_day_mean <dbl>, Water_Consumption_L_day_sd <dbl>
Interpretation:
We computed grouped summary statistics of income, family size, electricity usage, and water consumption across governorates and house types.
On average, villas showed higher electricity usage and water consumption compared to apartments.
Households in Muscat had relatively higher income and energy usage compared to other governorates.
ggboxplot(df, x = "Governorate", y = "Electricity_Usage_kWh", color = "HouseType",
palette = "jco", add = "jitter",
title = "Electricity Usage by Governorate and House Type")
ggscatter(df, x = "Monthly_Income_OMR", y = "Electricity_Usage_kWh", color = "Governorate",
add = "reg.line", conf.int = TRUE) +
labs(title = "Electricity Usage vs Income by Governorate")
Interpretation:
The boxplot indicates substantial variation in electricity usage between different governorates, with Muscat having wider usage ranges. Villas generally exhibit higher electricity usage compared to apartments.
The scatter plot shows a positive relationship between household income and electricity usage
by(df$`Electricity_Usage_kWh`, df$Governorate, shapiro.test)
## df$Governorate: Al Batinah North
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95535, p-value = 0.4013
##
## ------------------------------------------------------------
## df$Governorate: Al Batinah South
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.84742, p-value = 0.01597
##
## ------------------------------------------------------------
## df$Governorate: Al Buraimi
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.98832, p-value = 0.9932
##
## ------------------------------------------------------------
## df$Governorate: Al Dakhiliyah
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9395, p-value = 0.3123
##
## ------------------------------------------------------------
## df$Governorate: Al Dhahirah
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96486, p-value = 0.7237
##
## ------------------------------------------------------------
## df$Governorate: Al Sharqiyah North
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.93424, p-value = 0.2072
##
## ------------------------------------------------------------
## df$Governorate: Al Sharqiyah South
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.97209, p-value = 0.9179
##
## ------------------------------------------------------------
## df$Governorate: Al Wusta
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.92153, p-value = 0.2033
##
## ------------------------------------------------------------
## df$Governorate: Dhofar
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95865, p-value = 0.5172
##
## ------------------------------------------------------------
## df$Governorate: Musandam
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95017, p-value = 0.5633
##
## ------------------------------------------------------------
## df$Governorate: Muscat
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96514, p-value = 0.5026
leveneTest(Electricity_Usage_kWh ~ Governorate, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 10 0.4634 0.9118
## 189
Interpretation:
Shapiro-Wilk tests per governorate show that electricity usage is approximately normally distributed within most groups (p > 0.05).
Levene’s Test p-value > 0.05 suggests that variances are equal across governorates, so the assumption of homogeneity of variances is met.
anova1 <- aov(Electricity_Usage_kWh ~ Governorate, data = df)
base::summary(anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Governorate 10 71224 7122 0.283 0.984
## Residuals 189 4760528 25188
Interpretation: The one-way ANOVA shows that there is no statistically significant difference in electricity usage between governorates (p = 0.984). This means that electricity consumption patterns are similar across all governorates in the dataset.
anova2 <- aov(Electricity_Usage_kWh ~ Governorate * HouseType, data = df)
Anova(anova2, type = 3)
## Anova Table (Type III tests)
##
## Response: Electricity_Usage_kWh
## Sum Sq Df F value Pr(>F)
## (Intercept) 4298125 1 170.2698 <2e-16 ***
## Governorate 59225 10 0.2346 0.9925
## HouseType 44040 1 1.7446 0.1882
## Governorate:HouseType 149403 10 0.5919 0.8193
## Residuals 4493260 178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation: The two-way ANOVA shows that there are no statistically significant main effects of governorate (p = 0.992) or house type (p = 0.188) on electricity usage.
Furthermore, the interaction between governorate and house type is not statistically significant (p = 0.819), indicating that the relationship between house type and electricity usage does not depend on the governorate.
plot(allEffects(anova2), main = "Interaction: Governorate * HouseType")
Interpretation: This plot visualizes how the effect of house type varies by governorate. Parallel lines imply no interaction; crossing lines suggest interaction exists.
model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR +
Family_Size + Water_Consumption_L_day, data = df)
summary(model)
##
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType +
## Monthly_Income_OMR + Family_Size + Water_Consumption_L_day,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -322.02 -119.28 -6.69 131.54 334.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 608.78820 70.85832 8.592 3.52e-15 ***
## GovernorateAl Batinah South -44.72978 53.21669 -0.841 0.4017
## GovernorateAl Buraimi -7.10293 47.75428 -0.149 0.8819
## GovernorateAl Dakhiliyah 6.98910 51.61969 0.135 0.8924
## GovernorateAl Dhahirah -5.17447 51.67438 -0.100 0.9203
## GovernorateAl Sharqiyah North -28.89216 50.09155 -0.577 0.5648
## GovernorateAl Sharqiyah South 3.52391 55.82040 0.063 0.9497
## GovernorateAl Wusta -55.88265 52.92171 -1.056 0.2924
## GovernorateDhofar -26.72079 49.35620 -0.541 0.5889
## GovernorateMusandam -1.67871 54.35566 -0.031 0.9754
## GovernorateMuscat -7.39094 46.37661 -0.159 0.8736
## HouseTypeVilla -51.13798 22.81714 -2.241 0.0262 *
## Monthly_Income_OMR -0.03003 0.02814 -1.067 0.2873
## Family_Size -3.12693 4.43556 -0.705 0.4817
## Water_Consumption_L_day 0.01297 0.05170 0.251 0.8022
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157.6 on 185 degrees of freedom
## Multiple R-squared: 0.04884, Adjusted R-squared: -0.02314
## F-statistic: 0.6785 on 14 and 185 DF, p-value: 0.7936
Interpretation: The model estimates how each variable affects electricity usage. Coefficients show the direction and strength of influence; p-values indicate significance.
par(mfrow = c(2, 2))
plot(model)
Interpretation: These plots help assess linear regression assumptions: linearity, normality of residuals, constant variance, and detection of influential outliers.
Residuals are roughly centered around zero (no major bias).
No strong funnel pattern is observed (homoscedasticity is reasonable).
Q-Q plot suggests residuals are approximately normally distributed.
No major outliers or influential points based on leverage/Cook’s distance.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Governorate 1.221129 10 1.010039
## HouseType 1.047865 1 1.023653
## Monthly_Income_OMR 1.064766 1 1.031875
## Family_Size 1.078470 1 1.038494
## Water_Consumption_L_day 1.053703 1 1.026500
Interpretation: Variance Inflation Factor (VIF) values for all predictors are below 2, indicating no serious multicollinearity issues among the predictors.
tbl_regression(model, exponentiate = FALSE) %>% bold_labels()
| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| Governorate | |||
| Al Batinah North | — | — | |
| Al Batinah South | -45 | -150, 60 | 0.4 |
| Al Buraimi | -7.1 | -101, 87 | 0.9 |
| Al Dakhiliyah | 7.0 | -95, 109 | 0.9 |
| Al Dhahirah | -5.2 | -107, 97 | >0.9 |
| Al Sharqiyah North | -29 | -128, 70 | 0.6 |
| Al Sharqiyah South | 3.5 | -107, 114 | >0.9 |
| Al Wusta | -56 | -160, 49 | 0.3 |
| Dhofar | -27 | -124, 71 | 0.6 |
| Musandam | -1.7 | -109, 106 | >0.9 |
| Muscat | -7.4 | -99, 84 | 0.9 |
| HouseType | |||
| Apartment | — | — | |
| Villa | -51 | -96, -6.1 | 0.026 |
| Monthly_Income_OMR | -0.03 | -0.09, 0.03 | 0.3 |
| Family_Size | -3.1 | -12, 5.6 | 0.5 |
| Water_Consumption_L_day | 0.01 | -0.09, 0.11 | 0.8 |
| Abbreviation: CI = Confidence Interval | |||
Interpretation: Presents a clean table summarizing regression coefficients, confidence intervals, and significance for reporting.
tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
## term sumsq df statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4298125. 1 170. 9.68e-28
## 2 Governorate 59225. 10 0.235 9.92e- 1
## 3 HouseType 44040. 1 1.74 1.88e- 1
## 4 Governorate:HouseType 149403. 10 0.592 8.19e- 1
## 5 Residuals 4493260. 178 NA NA
Interpretation: The ANOVA summary table confirms the significance of main and interaction effects between governorate and house type on electricity usage.
# Load the data
data <- read_csv("C:/Users/mohaa/OneDrive/Desktop/S3336_PRT2_PRJCT/omani_household_energy_survey_200.csv")
## Rows: 200 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Governorate, HouseType
## dbl (4): Monthly_Income(OMR), Family_Size, Electricity_Usage_kWh, Water_Cons...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(data) <- make.names(colnames(data)) # Make variable names safe for formula
# Get numeric variables
numeric_vars <- names(data)[sapply(data, is.numeric)]
# UI
ui <- fluidPage(
titlePanel("Omani Household Energy Survey - Linear Model App"),
sidebarLayout(
sidebarPanel(
selectInput("response", "Response Variable:", choices = numeric_vars),
checkboxGroupInput("predictors", "Predictor(s):", choices = numeric_vars)
),
mainPanel(
verbatimTextOutput("summary"),
verbatimTextOutput("vif"),
plotOutput("plots")
)
)
)
# Server
server <- function(input, output) {
model <- reactive({
req(input$response, input$predictors)
predictors <- setdiff(input$predictors, input$response)
if (length(predictors) == 0) return(NULL)
formula <- as.formula(paste(input$response, "~", paste(predictors, collapse = "+")))
lm(formula, data = data)
})
output$summary <- renderPrint({
if (!is.null(model())) summary(model())
})
output$vif <- renderPrint({
if (!is.null(model())) vif(model())
})
output$plots <- renderPlot({
if (!is.null(model())) {
par(mfrow = c(2, 2))
plot(model())
}
})
}
shinyApp(ui = ui, server = server)