# Install required packages if not already installed
packages <- c("tidyverse", "car", "ggpubr", "gtsummary", "effects", "broom")
install_if_missing <- function(p) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}
lapply(packages, install_if_missing)
## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
# Part 2 Packages



# Load packages
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.3
## Loading required package: ggplot2
library(gtsummary)
library(effects)
## Warning: package 'effects' was built under R version 4.3.3
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(broom)
## Warning: package 'broom' was built under R version 4.3.3

Interpretation: We load the core packages needed for data analysis, visualization, ANOVA, regression, and diagnostics.

2. Import Data

df <- read.csv("C:/Users/mohaa/Downloads/Omani_household_energy.csv")
df$Governorate <- as.factor(df$Governorate)
df$HouseType <- as.factor(df$HouseType)

Interpretation: The dataset is loaded and categorical variables (Governorate and HouseType) are converted to factors for analysis.

3. Exploratory Data Analysis

3.1 Summary Statistics

df %>%
  group_by(Governorate, HouseType) %>%
  summarise(across(where(is.numeric),
                   list(mean = mean, sd = sd),
                   .names = "{.col}_{.fn}"),
            .groups = "drop")
## # A tibble: 22 × 10
##    Governorate      HouseType Monthly_Income_OMR_mean Monthly_Income_OMR_sd
##    <fct>            <fct>                       <dbl>                 <dbl>
##  1 Al Batinah North Apartment                   1522.                  398.
##  2 Al Batinah North Villa                       1312.                  201.
##  3 Al Batinah South Apartment                    993.                  445.
##  4 Al Batinah South Villa                       1452.                  374.
##  5 Al Buraimi       Apartment                   1367.                  439.
##  6 Al Buraimi       Villa                       1298.                  292.
##  7 Al Dakhiliyah    Apartment                   1218.                  395.
##  8 Al Dakhiliyah    Villa                       1225.                  513.
##  9 Al Dhahirah      Apartment                   1394.                  448.
## 10 Al Dhahirah      Villa                       1113.                  441.
## # ℹ 12 more rows
## # ℹ 6 more variables: Family_Size_mean <dbl>, Family_Size_sd <dbl>,
## #   Electricity_Usage_kWh_mean <dbl>, Electricity_Usage_kWh_sd <dbl>,
## #   Water_Consumption_L_day_mean <dbl>, Water_Consumption_L_day_sd <dbl>

Interpretation: We compute the mean and standard deviation for each numeric variable grouped by governorate and house type to understand data distribution.

3.2 Visualizations

ggboxplot(df, x = "Governorate", y = "Electricity_Usage_kWh", color = "HouseType",
          palette = "jco", add = "jitter",
          title = "Electricity Usage by Governorate and House Type")

ggscatter(df, x = "Monthly_Income_OMR", y = "Electricity_Usage_kWh", color = "Governorate",
          add = "reg.line", conf.int = TRUE) +
  labs(title = "Electricity Usage vs Income by Governorate")

Interpretation: The boxplot shows variation in electricity usage by region and house type. The scatter plot explores the relationship between income and electricity use across governorates.

5. Test for Normality and Homogeneity

by(df$`Electricity_Usage_kWh`, df$Governorate, shapiro.test)
## df$Governorate: Al Batinah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95535, p-value = 0.4013
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Batinah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.84742, p-value = 0.01597
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Buraimi
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.98832, p-value = 0.9932
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dakhiliyah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.9395, p-value = 0.3123
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dhahirah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96486, p-value = 0.7237
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.93424, p-value = 0.2072
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.97209, p-value = 0.9179
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Wusta
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.92153, p-value = 0.2033
## 
## ------------------------------------------------------------ 
## df$Governorate: Dhofar
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95865, p-value = 0.5172
## 
## ------------------------------------------------------------ 
## df$Governorate: Musandam
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95017, p-value = 0.5633
## 
## ------------------------------------------------------------ 
## df$Governorate: Muscat
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96514, p-value = 0.5026
leveneTest(Electricity_Usage_kWh ~ Governorate, data = df)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group  10  0.4634 0.9118
##       189

Interpretation:

Shapiro-Wilk test checks if electricity usage is normally distributed within each governorate. Levene’s test assesses whether variances are equal across governorates.

6. ANOVA Analysis

6.1 One-Way ANOVA

anova1 <- aov(Electricity_Usage_kWh ~ Governorate, data = df)
base::summary(anova1)
##              Df  Sum Sq Mean Sq F value Pr(>F)
## Governorate  10   71224    7122   0.283  0.984
## Residuals   189 4760528   25188

Interpretation: One-way ANOVA tests if the average electricity usage differs significantly between governorates. A low p-value (< 0.05) would indicate significant differences.

6.2 Two-Way ANOVA with Interaction

anova2 <- aov(Electricity_Usage_kWh ~ Governorate * HouseType, data = df)
Anova(anova2, type = 3)
## Anova Table (Type III tests)
## 
## Response: Electricity_Usage_kWh
##                        Sum Sq  Df  F value Pr(>F)    
## (Intercept)           4298125   1 170.2698 <2e-16 ***
## Governorate             59225  10   0.2346 0.9925    
## HouseType               44040   1   1.7446 0.1882    
## Governorate:HouseType  149403  10   0.5919 0.8193    
## Residuals             4493260 178                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: This model tests main and interaction effects of governorate and house type on electricity usage. Interaction checks whether house type influences usage differently across regions.

6.3 Effect Plot

plot(allEffects(anova2), main = "Interaction: Governorate * HouseType")

Interpretation: This plot visualizes how the effect of house type varies by governorate. Parallel lines imply no interaction; crossing lines suggest interaction exists.

7. Linear Regression and Diagnostics

7.1 Fit the Model

model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR +
              Family_Size + Water_Consumption_L_day, data = df)
summary(model)
## 
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType + 
##     Monthly_Income_OMR + Family_Size + Water_Consumption_L_day, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -322.02 -119.28   -6.69  131.54  334.44 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   608.78820   70.85832   8.592 3.52e-15 ***
## GovernorateAl Batinah South   -44.72978   53.21669  -0.841   0.4017    
## GovernorateAl Buraimi          -7.10293   47.75428  -0.149   0.8819    
## GovernorateAl Dakhiliyah        6.98910   51.61969   0.135   0.8924    
## GovernorateAl Dhahirah         -5.17447   51.67438  -0.100   0.9203    
## GovernorateAl Sharqiyah North -28.89216   50.09155  -0.577   0.5648    
## GovernorateAl Sharqiyah South   3.52391   55.82040   0.063   0.9497    
## GovernorateAl Wusta           -55.88265   52.92171  -1.056   0.2924    
## GovernorateDhofar             -26.72079   49.35620  -0.541   0.5889    
## GovernorateMusandam            -1.67871   54.35566  -0.031   0.9754    
## GovernorateMuscat              -7.39094   46.37661  -0.159   0.8736    
## HouseTypeVilla                -51.13798   22.81714  -2.241   0.0262 *  
## Monthly_Income_OMR             -0.03003    0.02814  -1.067   0.2873    
## Family_Size                    -3.12693    4.43556  -0.705   0.4817    
## Water_Consumption_L_day         0.01297    0.05170   0.251   0.8022    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157.6 on 185 degrees of freedom
## Multiple R-squared:  0.04884,    Adjusted R-squared:  -0.02314 
## F-statistic: 0.6785 on 14 and 185 DF,  p-value: 0.7936

Interpretation: The model estimates how each variable affects electricity usage. Coefficients show the direction and strength of influence; p-values indicate significance.

7.2 Diagnostic Plots

par(mfrow = c(2, 2))
plot(model)

Interpretation: These plots help assess linear regression assumptions: linearity, normality of residuals, constant variance, and detection of influential outliers.

7.3 Multicollinearity

vif(model)
##                             GVIF Df GVIF^(1/(2*Df))
## Governorate             1.221129 10        1.010039
## HouseType               1.047865  1        1.023653
## Monthly_Income_OMR      1.064766  1        1.031875
## Family_Size             1.078470  1        1.038494
## Water_Consumption_L_day 1.053703  1        1.026500

Interpretation: Variance Inflation Factor (VIF) checks if predictors are highly correlated. VIF > 5 indicates multicollinearity problems.

8. Publication-Ready Output

8.1 Regression Summary Table

tbl_regression(model, exponentiate = FALSE) %>% bold_labels()
Characteristic Beta 95% CI p-value
Governorate


    Al Batinah North
    Al Batinah South -45 -150, 60 0.4
    Al Buraimi -7.1 -101, 87 0.9
    Al Dakhiliyah 7.0 -95, 109 0.9
    Al Dhahirah -5.2 -107, 97 >0.9
    Al Sharqiyah North -29 -128, 70 0.6
    Al Sharqiyah South 3.5 -107, 114 >0.9
    Al Wusta -56 -160, 49 0.3
    Dhofar -27 -124, 71 0.6
    Musandam -1.7 -109, 106 >0.9
    Muscat -7.4 -99, 84 0.9
HouseType


    Apartment
    Villa -51 -96, -6.1 0.026
Monthly_Income_OMR -0.03 -0.09, 0.03 0.3
Family_Size -3.1 -12, 5.6 0.5
Water_Consumption_L_day 0.01 -0.09, 0.11 0.8
Abbreviation: CI = Confidence Interval

Interpretation: Presents a clean table summarizing regression coefficients, confidence intervals, and significance for reporting.

8.2 ANOVA Summary Table

tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
##   term                     sumsq    df statistic   p.value
##   <chr>                    <dbl> <dbl>     <dbl>     <dbl>
## 1 (Intercept)           4298125.     1   170.     9.68e-28
## 2 Governorate             59225.    10     0.235  9.92e- 1
## 3 HouseType               44040.     1     1.74   1.88e- 1
## 4 Governorate:HouseType  149403.    10     0.592  8.19e- 1
## 5 Residuals             4493260.   178    NA     NA

Interpretation: Shows ANOVA results in a compact, report-ready format including sum of squares, degrees of freedom, F-statistics, and p-values.

7. Linear Regression and Diagnostics

# Fit a linear model to predict electricity usage
model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR + Family_Size, data = df)
summary(model)
## 
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType + 
##     Monthly_Income_OMR + Family_Size, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -318.29 -120.13   -7.25  129.13  330.50 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   619.24366   57.16454  10.833   <2e-16 ***
## GovernorateAl Batinah South   -44.42352   53.06850  -0.837   0.4036    
## GovernorateAl Buraimi          -7.65418   47.58339  -0.161   0.8724    
## GovernorateAl Dakhiliyah        6.30241   51.41707   0.123   0.9026    
## GovernorateAl Dhahirah         -4.86143   51.52903  -0.094   0.9249    
## GovernorateAl Sharqiyah North -29.70834   49.85973  -0.596   0.5520    
## GovernorateAl Sharqiyah South   2.39247   55.49762   0.043   0.9657    
## GovernorateAl Wusta           -55.21353   52.72116  -1.047   0.2963    
## GovernorateDhofar             -27.57641   49.11406  -0.561   0.5751    
## GovernorateMusandam            -3.12657   53.91215  -0.058   0.9538    
## GovernorateMuscat              -7.09593   46.24477  -0.153   0.8782    
## HouseTypeVilla                -50.91373   22.74212  -2.239   0.0264 *  
## Monthly_Income_OMR             -0.02975    0.02805  -1.061   0.2902    
## Family_Size                    -3.17915    4.41949  -0.719   0.4728    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157.2 on 186 degrees of freedom
## Multiple R-squared:  0.04852,    Adjusted R-squared:  -0.01799 
## F-statistic: 0.7295 on 13 and 186 DF,  p-value: 0.7326

Interpretation: This model checks the combined influence of governorate, house type, income, and family size on electricity usage.

7.2 Diagnostic Plots

par(mfrow = c(2, 2))
plot(model)

Interpretation: These plots help assess assumptions such as linearity, homoscedasticity, and outliers. Patterns or funnel shapes could indicate violations.

7.3 Multicollinearity Check

vif(model)
##                        GVIF Df GVIF^(1/(2*Df))
## Governorate        1.163453 10        1.007598
## HouseType          1.046257  1        1.022867
## Monthly_Income_OMR 1.063130  1        1.031082
## Family_Size        1.076095  1        1.037350

Interpretation: VIF values above 5 suggest multicollinearity. All VIFs should ideally be below 2 in this context.

8. Publication-Ready Tables

8.1 Regression Table

tbl_regression(model) %>%
  bold_labels()
Characteristic Beta 95% CI p-value
Governorate


    Al Batinah North
    Al Batinah South -44 -149, 60 0.4
    Al Buraimi -7.7 -102, 86 0.9
    Al Dakhiliyah 6.3 -95, 108 >0.9
    Al Dhahirah -4.9 -107, 97 >0.9
    Al Sharqiyah North -30 -128, 69 0.6
    Al Sharqiyah South 2.4 -107, 112 >0.9
    Al Wusta -55 -159, 49 0.3
    Dhofar -28 -124, 69 0.6
    Musandam -3.1 -109, 103 >0.9
    Muscat -7.1 -98, 84 0.9
HouseType


    Apartment
    Villa -51 -96, -6.0 0.026
Monthly_Income_OMR -0.03 -0.09, 0.03 0.3
Family_Size -3.2 -12, 5.5 0.5
Abbreviation: CI = Confidence Interval

8.2 ANOVA Summary Table

tidy(Anova(anova2, type = 3))
## # A tibble: 5 × 5
##   term                     sumsq    df statistic   p.value
##   <chr>                    <dbl> <dbl>     <dbl>     <dbl>
## 1 (Intercept)           4298125.     1   170.     9.68e-28
## 2 Governorate             59225.    10     0.235  9.92e- 1
## 3 HouseType               44040.     1     1.74   1.88e- 1
## 4 Governorate:HouseType  149403.    10     0.592  8.19e- 1
## 5 Residuals             4493260.   178    NA     NA

Interpretation: These tables are clean and ready for publication or presentation. They summarize the main and interaction effects and regression coefficients.

# Load necessary libraries
library(shiny)
## Warning: package 'shiny' was built under R version 4.3.3
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(car)


colnames(df) <- make.names(colnames(data))  # Make variable names safe for formula

# Get numeric variables
numeric_vars <- names(df)[sapply(data, is.numeric)]

# UI
ui <- fluidPage(
  titlePanel("Omani Household Energy Survey - Linear Model App"),
  
  sidebarLayout(
    sidebarPanel(
      selectInput("response", "Response Variable:", choices = numeric_vars),
      checkboxGroupInput("predictors", "Predictor(s):", choices = numeric_vars)
    ),
    
    mainPanel(
      verbatimTextOutput("summary"),
      verbatimTextOutput("vif"),
      plotOutput("plots")
    )
  )
)

# Server
server <- function(input, output) {
  
  model <- reactive({
    req(input$response, input$predictors)
    predictors <- setdiff(input$predictors, input$response)
    if (length(predictors) == 0) return(NULL)
    
    formula <- as.formula(paste(input$response, "~", paste(predictors, collapse = "+")))
    lm(formula, data = df)
  })
  
  output$summary <- renderPrint({
    if (!is.null(model())) summary(model())
  })
  
  output$vif <- renderPrint({
    if (!is.null(model())) vif(model())
  })
  
  output$plots <- renderPlot({
    if (!is.null(model())) {
      par(mfrow = c(2, 2))
      plot(model())
    }
  })
}

shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents