# Install required packages if not already installed
packages <- c("tidyverse", "car", "ggpubr", "gtsummary", "effects", "broom")
install_if_missing <- function(p) {
if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}
lapply(packages, install_if_missing)
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
# Part 2 Packages
# Load packages
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
## Loading required package: ggplot2
library(gtsummary)
library(effects)
## Warning: package 'effects' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
Interpretation: We load the core packages needed for data analysis, visualization, ANOVA, regression, and diagnostics.
df <- read.csv("C:/Users/mohaa/Downloads/Omani_household_energy.csv")
df$Governorate <- as.factor(df$Governorate)
df$HouseType <- as.factor(df$HouseType)
Interpretation: The dataset is loaded and
categorical variables (Governorate and
HouseType) are converted to factors for analysis.
df %>%
group_by(Governorate, HouseType) %>%
summarise(across(where(is.numeric),
list(mean = mean, sd = sd),
.names = "{.col}_{.fn}"),
.groups = "drop")
## # A tibble: 22 × 10
## Governorate HouseType Monthly_Income_OMR_mean Monthly_Income_OMR_sd
## <fct> <fct> <dbl> <dbl>
## 1 Al Batinah North Apartment 1522. 398.
## 2 Al Batinah North Villa 1312. 201.
## 3 Al Batinah South Apartment 993. 445.
## 4 Al Batinah South Villa 1452. 374.
## 5 Al Buraimi Apartment 1367. 439.
## 6 Al Buraimi Villa 1298. 292.
## 7 Al Dakhiliyah Apartment 1218. 395.
## 8 Al Dakhiliyah Villa 1225. 513.
## 9 Al Dhahirah Apartment 1394. 448.
## 10 Al Dhahirah Villa 1113. 441.
## # ℹ 12 more rows
## # ℹ 6 more variables: Family_Size_mean <dbl>, Family_Size_sd <dbl>,
## # Electricity_Usage_kWh_mean <dbl>, Electricity_Usage_kWh_sd <dbl>,
## # Water_Consumption_L_day_mean <dbl>, Water_Consumption_L_day_sd <dbl>
Interpretation: We compute the mean and standard deviation for each numeric variable grouped by governorate and house type to understand data distribution.
ggboxplot(df, x = "Governorate", y = "Electricity_Usage_kWh", color = "HouseType",
palette = "jco", add = "jitter",
title = "Electricity Usage by Governorate and House Type")
ggscatter(df, x = "Monthly_Income_OMR", y = "Electricity_Usage_kWh", color = "Governorate",
add = "reg.line", conf.int = TRUE) +
labs(title = "Electricity Usage vs Income by Governorate")
Interpretation: The boxplot shows variation in electricity usage by region and house type. The scatter plot explores the relationship between income and electricity use across governorates.
by(df$`Electricity_Usage_kWh`, df$Governorate, shapiro.test)
## df$Governorate: Al Batinah North
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95535, p-value = 0.4013
##
## ------------------------------------------------------------
## df$Governorate: Al Batinah South
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.84742, p-value = 0.01597
##
## ------------------------------------------------------------
## df$Governorate: Al Buraimi
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.98832, p-value = 0.9932
##
## ------------------------------------------------------------
## df$Governorate: Al Dakhiliyah
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9395, p-value = 0.3123
##
## ------------------------------------------------------------
## df$Governorate: Al Dhahirah
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96486, p-value = 0.7237
##
## ------------------------------------------------------------
## df$Governorate: Al Sharqiyah North
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.93424, p-value = 0.2072
##
## ------------------------------------------------------------
## df$Governorate: Al Sharqiyah South
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.97209, p-value = 0.9179
##
## ------------------------------------------------------------
## df$Governorate: Al Wusta
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.92153, p-value = 0.2033
##
## ------------------------------------------------------------
## df$Governorate: Dhofar
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95865, p-value = 0.5172
##
## ------------------------------------------------------------
## df$Governorate: Musandam
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95017, p-value = 0.5633
##
## ------------------------------------------------------------
## df$Governorate: Muscat
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96514, p-value = 0.5026
leveneTest(Electricity_Usage_kWh ~ Governorate, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 10 0.4634 0.9118
## 189
Interpretation:
Shapiro-Wilk test checks if electricity usage is normally distributed within each governorate. Levene’s test assesses whether variances are equal across governorates.
anova1 <- aov(Electricity_Usage_kWh ~ Governorate, data = df)
base::summary(anova1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Governorate 10 71224 7122 0.283 0.984
## Residuals 189 4760528 25188
Interpretation: One-way ANOVA tests if the average electricity usage differs significantly between governorates. A low p-value (< 0.05) would indicate significant differences.
anova2 <- aov(Electricity_Usage_kWh ~ Governorate * HouseType, data = df)
Anova(anova2, type = 3)
## Anova Table (Type III tests)
##
## Response: Electricity_Usage_kWh
## Sum Sq Df F value Pr(>F)
## (Intercept) 4298125 1 170.2698 <2e-16 ***
## Governorate 59225 10 0.2346 0.9925
## HouseType 44040 1 1.7446 0.1882
## Governorate:HouseType 149403 10 0.5919 0.8193
## Residuals 4493260 178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation: This model tests main and interaction effects of governorate and house type on electricity usage. Interaction checks whether house type influences usage differently across regions.
plot(allEffects(anova2), main = "Interaction: Governorate * HouseType")
Interpretation: This plot visualizes how the effect of
house type varies by governorate. Parallel lines imply no interaction;
crossing lines suggest interaction exists.
model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR +
Family_Size + Water_Consumption_L_day, data = df)
summary(model)
##
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType +
## Monthly_Income_OMR + Family_Size + Water_Consumption_L_day,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -322.02 -119.28 -6.69 131.54 334.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 608.78820 70.85832 8.592 3.52e-15 ***
## GovernorateAl Batinah South -44.72978 53.21669 -0.841 0.4017
## GovernorateAl Buraimi -7.10293 47.75428 -0.149 0.8819
## GovernorateAl Dakhiliyah 6.98910 51.61969 0.135 0.8924
## GovernorateAl Dhahirah -5.17447 51.67438 -0.100 0.9203
## GovernorateAl Sharqiyah North -28.89216 50.09155 -0.577 0.5648
## GovernorateAl Sharqiyah South 3.52391 55.82040 0.063 0.9497
## GovernorateAl Wusta -55.88265 52.92171 -1.056 0.2924
## GovernorateDhofar -26.72079 49.35620 -0.541 0.5889
## GovernorateMusandam -1.67871 54.35566 -0.031 0.9754
## GovernorateMuscat -7.39094 46.37661 -0.159 0.8736
## HouseTypeVilla -51.13798 22.81714 -2.241 0.0262 *
## Monthly_Income_OMR -0.03003 0.02814 -1.067 0.2873
## Family_Size -3.12693 4.43556 -0.705 0.4817
## Water_Consumption_L_day 0.01297 0.05170 0.251 0.8022
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157.6 on 185 degrees of freedom
## Multiple R-squared: 0.04884, Adjusted R-squared: -0.02314
## F-statistic: 0.6785 on 14 and 185 DF, p-value: 0.7936
Interpretation: The model estimates how each variable affects electricity usage. Coefficients show the direction and strength of influence; p-values indicate significance.
par(mfrow = c(2, 2))
plot(model)
Interpretation: These plots help assess linear
regression assumptions: linearity, normality of residuals, constant
variance, and detection of influential outliers.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Governorate 1.221129 10 1.010039
## HouseType 1.047865 1 1.023653
## Monthly_Income_OMR 1.064766 1 1.031875
## Family_Size 1.078470 1 1.038494
## Water_Consumption_L_day 1.053703 1 1.026500
Interpretation: Variance Inflation Factor (VIF) checks if predictors are highly correlated. VIF > 5 indicates multicollinearity problems.
tbl_regression(model, exponentiate = FALSE) %>% bold_labels()
| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| Governorate | |||
| Al Batinah North | — | — | |
| Al Batinah South | -45 | -150, 60 | 0.4 |
| Al Buraimi | -7.1 | -101, 87 | 0.9 |
| Al Dakhiliyah | 7.0 | -95, 109 | 0.9 |
| Al Dhahirah | -5.2 | -107, 97 | >0.9 |
| Al Sharqiyah North | -29 | -128, 70 | 0.6 |
| Al Sharqiyah South | 3.5 | -107, 114 | >0.9 |
| Al Wusta | -56 | -160, 49 | 0.3 |
| Dhofar | -27 | -124, 71 | 0.6 |
| Musandam | -1.7 | -109, 106 | >0.9 |
| Muscat | -7.4 | -99, 84 | 0.9 |
| HouseType | |||
| Apartment | — | — | |
| Villa | -51 | -96, -6.1 | 0.026 |
| Monthly_Income_OMR | -0.03 | -0.09, 0.03 | 0.3 |
| Family_Size | -3.1 | -12, 5.6 | 0.5 |
| Water_Consumption_L_day | 0.01 | -0.09, 0.11 | 0.8 |
| Abbreviation: CI = Confidence Interval | |||
Interpretation: Presents a clean table summarizing regression coefficients, confidence intervals, and significance for reporting.
tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
## term sumsq df statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4298125. 1 170. 9.68e-28
## 2 Governorate 59225. 10 0.235 9.92e- 1
## 3 HouseType 44040. 1 1.74 1.88e- 1
## 4 Governorate:HouseType 149403. 10 0.592 8.19e- 1
## 5 Residuals 4493260. 178 NA NA
Interpretation: Shows ANOVA results in a compact, report-ready format including sum of squares, degrees of freedom, F-statistics, and p-values.
# Fit a linear model to predict electricity usage
model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR + Family_Size, data = df)
summary(model)
##
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType +
## Monthly_Income_OMR + Family_Size, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -318.29 -120.13 -7.25 129.13 330.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 619.24366 57.16454 10.833 <2e-16 ***
## GovernorateAl Batinah South -44.42352 53.06850 -0.837 0.4036
## GovernorateAl Buraimi -7.65418 47.58339 -0.161 0.8724
## GovernorateAl Dakhiliyah 6.30241 51.41707 0.123 0.9026
## GovernorateAl Dhahirah -4.86143 51.52903 -0.094 0.9249
## GovernorateAl Sharqiyah North -29.70834 49.85973 -0.596 0.5520
## GovernorateAl Sharqiyah South 2.39247 55.49762 0.043 0.9657
## GovernorateAl Wusta -55.21353 52.72116 -1.047 0.2963
## GovernorateDhofar -27.57641 49.11406 -0.561 0.5751
## GovernorateMusandam -3.12657 53.91215 -0.058 0.9538
## GovernorateMuscat -7.09593 46.24477 -0.153 0.8782
## HouseTypeVilla -50.91373 22.74212 -2.239 0.0264 *
## Monthly_Income_OMR -0.02975 0.02805 -1.061 0.2902
## Family_Size -3.17915 4.41949 -0.719 0.4728
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 157.2 on 186 degrees of freedom
## Multiple R-squared: 0.04852, Adjusted R-squared: -0.01799
## F-statistic: 0.7295 on 13 and 186 DF, p-value: 0.7326
Interpretation: This model checks the combined influence of governorate, house type, income, and family size on electricity usage.
par(mfrow = c(2, 2))
plot(model)
Interpretation: These plots help assess assumptions such as linearity, homoscedasticity, and outliers. Patterns or funnel shapes could indicate violations.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Governorate 1.163453 10 1.007598
## HouseType 1.046257 1 1.022867
## Monthly_Income_OMR 1.063130 1 1.031082
## Family_Size 1.076095 1 1.037350
Interpretation: VIF values above 5 suggest multicollinearity. All VIFs should ideally be below 2 in this context.
tbl_regression(model) %>%
bold_labels()
| Characteristic | Beta | 95% CI | p-value |
|---|---|---|---|
| Governorate | |||
| Al Batinah North | — | — | |
| Al Batinah South | -44 | -149, 60 | 0.4 |
| Al Buraimi | -7.7 | -102, 86 | 0.9 |
| Al Dakhiliyah | 6.3 | -95, 108 | >0.9 |
| Al Dhahirah | -4.9 | -107, 97 | >0.9 |
| Al Sharqiyah North | -30 | -128, 69 | 0.6 |
| Al Sharqiyah South | 2.4 | -107, 112 | >0.9 |
| Al Wusta | -55 | -159, 49 | 0.3 |
| Dhofar | -28 | -124, 69 | 0.6 |
| Musandam | -3.1 | -109, 103 | >0.9 |
| Muscat | -7.1 | -98, 84 | 0.9 |
| HouseType | |||
| Apartment | — | — | |
| Villa | -51 | -96, -6.0 | 0.026 |
| Monthly_Income_OMR | -0.03 | -0.09, 0.03 | 0.3 |
| Family_Size | -3.2 | -12, 5.5 | 0.5 |
| Abbreviation: CI = Confidence Interval | |||
tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
## term sumsq df statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 4298125. 1 170. 9.68e-28
## 2 Governorate 59225. 10 0.235 9.92e- 1
## 3 HouseType 44040. 1 1.74 1.88e- 1
## 4 Governorate:HouseType 149403. 10 0.592 8.19e- 1
## 5 Residuals 4493260. 178 NA NA
Interpretation: These tables are clean and ready for publication or presentation. They summarize the main and interaction effects and regression coefficients.
# Load necessary libraries
library(shiny)
## Warning: package 'shiny' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(car)
colnames(df) <- make.names(colnames(data)) # Make variable names safe for formula
# Get numeric variables
numeric_vars <- names(df)[sapply(data, is.numeric)]
# UI
ui <- fluidPage(
titlePanel("Omani Household Energy Survey - Linear Model App"),
sidebarLayout(
sidebarPanel(
selectInput("response", "Response Variable:", choices = numeric_vars),
checkboxGroupInput("predictors", "Predictor(s):", choices = numeric_vars)
),
mainPanel(
verbatimTextOutput("summary"),
verbatimTextOutput("vif"),
plotOutput("plots")
)
)
)
# Server
server <- function(input, output) {
model <- reactive({
req(input$response, input$predictors)
predictors <- setdiff(input$predictors, input$response)
if (length(predictors) == 0) return(NULL)
formula <- as.formula(paste(input$response, "~", paste(predictors, collapse = "+")))
lm(formula, data = df)
})
output$summary <- renderPrint({
if (!is.null(model())) summary(model())
})
output$vif <- renderPrint({
if (!is.null(model())) vif(model())
})
output$plots <- renderPlot({
if (!is.null(model())) {
par(mfrow = c(2, 2))
plot(model())
}
})
}
shinyApp(ui = ui, server = server)