# Install required packages if not already installed
packages <- c("tidyverse", "car", "ggpubr", "gtsummary", "effects", "broom", "shiny",
              "bslib", "readr", "car")
install_if_missing <- function(p) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}

lapply(packages, install_if_missing)
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
# Load libraries
# PART 1
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
## Loading required package: ggplot2
library(gtsummary)
library(effects)
## Warning: package 'effects' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
# PART 2
library(shiny)
## Warning: package 'shiny' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(car)

Interpretation:

We load the core packages needed for data analysis, visualization, ANOVA, regression, and diagnostics.

2. Import Data

df <- read.csv("C:/Users/mohaa/Downloads/Omani_household_energy.csv")
df$Governorate <- as.factor(df$Governorate)
df$HouseType <- as.factor(df$HouseType)

Interpretation:

The dataset is loaded and categorical variables (Governorate and HouseType) are converted to factors for analysis.

3. Exploratory Data Analysis

3.1 Summary Statistics

df %>%
  group_by(Governorate, HouseType) %>%
  summarise(across(where(is.numeric),
                   list(mean = mean, sd = sd),
                   .names = "{.col}_{.fn}"),
            .groups = "drop")
## # A tibble: 22 × 10
##    Governorate      HouseType Monthly_Income_OMR_mean Monthly_Income_OMR_sd
##    <fct>            <fct>                       <dbl>                 <dbl>
##  1 Al Batinah North Apartment                   1522.                  398.
##  2 Al Batinah North Villa                       1312.                  201.
##  3 Al Batinah South Apartment                    993.                  445.
##  4 Al Batinah South Villa                       1452.                  374.
##  5 Al Buraimi       Apartment                   1367.                  439.
##  6 Al Buraimi       Villa                       1298.                  292.
##  7 Al Dakhiliyah    Apartment                   1218.                  395.
##  8 Al Dakhiliyah    Villa                       1225.                  513.
##  9 Al Dhahirah      Apartment                   1394.                  448.
## 10 Al Dhahirah      Villa                       1113.                  441.
## # ℹ 12 more rows
## # ℹ 6 more variables: Family_Size_mean <dbl>, Family_Size_sd <dbl>,
## #   Electricity_Usage_kWh_mean <dbl>, Electricity_Usage_kWh_sd <dbl>,
## #   Water_Consumption_L_day_mean <dbl>, Water_Consumption_L_day_sd <dbl>

Interpretation:

We computed grouped summary statistics of income, family size, electricity usage, and water consumption across governorates and house types.

  • On average, villas showed higher electricity usage and water consumption compared to apartments.

  • Households in Muscat had relatively higher income and energy usage compared to other governorates.

3.2 Visualizations

ggboxplot(df, x = "Governorate", y = "Electricity_Usage_kWh", color = "HouseType",
          palette = "jco", add = "jitter",
          title = "Electricity Usage by Governorate and House Type")

ggscatter(df, x = "Monthly_Income_OMR", y = "Electricity_Usage_kWh", color = "Governorate",
          add = "reg.line", conf.int = TRUE) +
  labs(title = "Electricity Usage vs Income by Governorate")

Interpretation:

The boxplot indicates substantial variation in electricity usage between different governorates, with Muscat having wider usage ranges. Villas generally exhibit higher electricity usage compared to apartments.

The scatter plot shows a positive relationship between household income and electricity usage

5. Test for Normality and Homogeneity

by(df$`Electricity_Usage_kWh`, df$Governorate, shapiro.test)
## df$Governorate: Al Batinah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95535, p-value = 0.4013
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Batinah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.84742, p-value = 0.01597
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Buraimi
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.98832, p-value = 0.9932
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dakhiliyah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.9395, p-value = 0.3123
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dhahirah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96486, p-value = 0.7237
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.93424, p-value = 0.2072
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.97209, p-value = 0.9179
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Wusta
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.92153, p-value = 0.2033
## 
## ------------------------------------------------------------ 
## df$Governorate: Dhofar
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95865, p-value = 0.5172
## 
## ------------------------------------------------------------ 
## df$Governorate: Musandam
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95017, p-value = 0.5633
## 
## ------------------------------------------------------------ 
## df$Governorate: Muscat
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96514, p-value = 0.5026
leveneTest(Electricity_Usage_kWh ~ Governorate, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group  10  0.4634 0.9118
##       189

Interpretation:

Shapiro-Wilk tests per governorate show that electricity usage is approximately normally distributed within most groups (p > 0.05).

Levene’s Test p-value > 0.05 suggests that variances are equal across governorates, so the assumption of homogeneity of variances is met.

6. ANOVA Analysis

6.1 One-Way ANOVA

anova1 <- aov(Electricity_Usage_kWh ~ Governorate, data = df)
base::summary(anova1)
##              Df  Sum Sq Mean Sq F value Pr(>F)
## Governorate  10   71224    7122   0.283  0.984
## Residuals   189 4760528   25188

Interpretation: The one-way ANOVA shows that there is no statistically significant difference in electricity usage between governorates (p = 0.984). This means that electricity consumption patterns are similar across all governorates in the dataset.

6.2 Two-Way ANOVA with Interaction

anova2 <- aov(Electricity_Usage_kWh ~ Governorate * HouseType, data = df)
Anova(anova2, type = 3)
## Anova Table (Type III tests)
## 
## Response: Electricity_Usage_kWh
##                        Sum Sq  Df  F value Pr(>F)    
## (Intercept)           4298125   1 170.2698 <2e-16 ***
## Governorate             59225  10   0.2346 0.9925    
## HouseType               44040   1   1.7446 0.1882    
## Governorate:HouseType  149403  10   0.5919 0.8193    
## Residuals             4493260 178                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: The two-way ANOVA shows that there are no statistically significant main effects of governorate (p = 0.992) or house type (p = 0.188) on electricity usage.

Furthermore, the interaction between governorate and house type is not statistically significant (p = 0.819), indicating that the relationship between house type and electricity usage does not depend on the governorate.

6.3 Effect Plot

plot(allEffects(anova2), main = "Interaction: Governorate * HouseType")

Interpretation: This plot visualizes how the effect of house type varies by governorate. Parallel lines imply no interaction; crossing lines suggest interaction exists.

7. Linear Regression and Diagnostics

7.1 Fit the Model

model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR +
              Family_Size + Water_Consumption_L_day, data = df)
summary(model)
## 
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType + 
##     Monthly_Income_OMR + Family_Size + Water_Consumption_L_day, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -322.02 -119.28   -6.69  131.54  334.44 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   608.78820   70.85832   8.592 3.52e-15 ***
## GovernorateAl Batinah South   -44.72978   53.21669  -0.841   0.4017    
## GovernorateAl Buraimi          -7.10293   47.75428  -0.149   0.8819    
## GovernorateAl Dakhiliyah        6.98910   51.61969   0.135   0.8924    
## GovernorateAl Dhahirah         -5.17447   51.67438  -0.100   0.9203    
## GovernorateAl Sharqiyah North -28.89216   50.09155  -0.577   0.5648    
## GovernorateAl Sharqiyah South   3.52391   55.82040   0.063   0.9497    
## GovernorateAl Wusta           -55.88265   52.92171  -1.056   0.2924    
## GovernorateDhofar             -26.72079   49.35620  -0.541   0.5889    
## GovernorateMusandam            -1.67871   54.35566  -0.031   0.9754    
## GovernorateMuscat              -7.39094   46.37661  -0.159   0.8736    
## HouseTypeVilla                -51.13798   22.81714  -2.241   0.0262 *  
## Monthly_Income_OMR             -0.03003    0.02814  -1.067   0.2873    
## Family_Size                    -3.12693    4.43556  -0.705   0.4817    
## Water_Consumption_L_day         0.01297    0.05170   0.251   0.8022    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157.6 on 185 degrees of freedom
## Multiple R-squared:  0.04884,    Adjusted R-squared:  -0.02314 
## F-statistic: 0.6785 on 14 and 185 DF,  p-value: 0.7936

Interpretation: The model estimates how each variable affects electricity usage. Coefficients show the direction and strength of influence; p-values indicate significance.

7.2 Diagnostic Plots

par(mfrow = c(2, 2))
plot(model)

Interpretation: These plots help assess linear regression assumptions: linearity, normality of residuals, constant variance, and detection of influential outliers.

  • Residuals are roughly centered around zero (no major bias).

  • No strong funnel pattern is observed (homoscedasticity is reasonable).

  • Q-Q plot suggests residuals are approximately normally distributed.

  • No major outliers or influential points based on leverage/Cook’s distance.

7.3 Multicollinearity

vif(model)
##                             GVIF Df GVIF^(1/(2*Df))
## Governorate             1.221129 10        1.010039
## HouseType               1.047865  1        1.023653
## Monthly_Income_OMR      1.064766  1        1.031875
## Family_Size             1.078470  1        1.038494
## Water_Consumption_L_day 1.053703  1        1.026500

Interpretation: Variance Inflation Factor (VIF) values for all predictors are below 2, indicating no serious multicollinearity issues among the predictors.

8. Publication-Ready Output

8.1 Regression Summary Table

tbl_regression(model, exponentiate = FALSE) %>% bold_labels()
Characteristic Beta 95% CI p-value
Governorate


    Al Batinah North
    Al Batinah South -45 -150, 60 0.4
    Al Buraimi -7.1 -101, 87 0.9
    Al Dakhiliyah 7.0 -95, 109 0.9
    Al Dhahirah -5.2 -107, 97 >0.9
    Al Sharqiyah North -29 -128, 70 0.6
    Al Sharqiyah South 3.5 -107, 114 >0.9
    Al Wusta -56 -160, 49 0.3
    Dhofar -27 -124, 71 0.6
    Musandam -1.7 -109, 106 >0.9
    Muscat -7.4 -99, 84 0.9
HouseType


    Apartment
    Villa -51 -96, -6.1 0.026
Monthly_Income_OMR -0.03 -0.09, 0.03 0.3
Family_Size -3.1 -12, 5.6 0.5
Water_Consumption_L_day 0.01 -0.09, 0.11 0.8
Abbreviation: CI = Confidence Interval

Interpretation: Presents a clean table summarizing regression coefficients, confidence intervals, and significance for reporting.

8.2 ANOVA Summary Table

tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
##   term                     sumsq    df statistic   p.value
##   <chr>                    <dbl> <dbl>     <dbl>     <dbl>
## 1 (Intercept)           4298125.     1   170.     9.68e-28
## 2 Governorate             59225.    10     0.235  9.92e- 1
## 3 HouseType               44040.     1     1.74   1.88e- 1
## 4 Governorate:HouseType  149403.    10     0.592  8.19e- 1
## 5 Residuals             4493260.   178    NA     NA

Interpretation: The ANOVA summary table confirms the significance of main and interaction effects between governorate and house type on electricity usage.

# Load the data
data <- read_csv("C:/Users/mohaa/OneDrive/Desktop/S3336_PRT2_PRJCT/omani_household_energy_survey_200.csv")
## Rows: 200 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Governorate, HouseType
## dbl (4): Monthly_Income(OMR), Family_Size, Electricity_Usage_kWh, Water_Cons...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(data) <- make.names(colnames(data))  # Make variable names safe for formula

# Get numeric variables
numeric_vars <- names(data)[sapply(data, is.numeric)]

# UI
ui <- fluidPage(
  titlePanel("Omani Household Energy Survey - Linear Model App"),
  
  sidebarLayout(
    sidebarPanel(
      selectInput("response", "Response Variable:", choices = numeric_vars),
      checkboxGroupInput("predictors", "Predictor(s):", choices = numeric_vars)
    ),
    
    mainPanel(
      verbatimTextOutput("summary"),
      verbatimTextOutput("vif"),
      plotOutput("plots")
    )
  )
)

# Server
server <- function(input, output) {
  
  model <- reactive({
    req(input$response, input$predictors)
    predictors <- setdiff(input$predictors, input$response)
    if (length(predictors) == 0) return(NULL)
    
    formula <- as.formula(paste(input$response, "~", paste(predictors, collapse = "+")))
    lm(formula, data = data)
  })
  
  output$summary <- renderPrint({
    if (!is.null(model())) summary(model())
  })
  
  output$vif <- renderPrint({
    if (!is.null(model())) vif(model())
  })
  
  output$plots <- renderPlot({
    if (!is.null(model())) {
      par(mfrow = c(2, 2))
      plot(model())
    }
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents