# Install required packages if not already installed
packages <- c("tidyverse", "car", "ggpubr", "gtsummary", "effects", "broom", "shiny",
              "bslib", "readr", "car")
install_if_missing <- function(p) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}

lapply(packages, install_if_missing)

## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL

# Load libraries
# PART 1
library(car)

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.3.3

## Loading required package: ggplot2

library(gtsummary)
library(effects)

## Warning: package 'effects' was built under R version 4.3.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

library(broom)

## Warning: package 'broom' was built under R version 4.3.3

# PART 2
library(shiny)

## Warning: package 'shiny' was built under R version 4.3.3

library(readr)

## Warning: package 'readr' was built under R version 4.3.3

library(car)

Interpretation:

We load the core packages needed for data analysis, visualization, ANOVA, regression, and diagnostics.

2. Import Data

df <- read.csv("C:/Users/mohaa/Downloads/Omani_household_energy.csv")
df$Governorate <- as.factor(df$Governorate)
df$HouseType <- as.factor(df$HouseType)

Interpretation:

The dataset is loaded and categorical variables (Governorate and HouseType) are converted to factors for analysis.

3. Exploratory Data Analysis

3.1 Summary Statistics

df %>%
  group_by(Governorate, HouseType) %>%
  summarise(across(where(is.numeric),
                   list(mean = mean, sd = sd),
                   .names = "{.col}_{.fn}"),
            .groups = "drop")

## # A tibble: 22 × 10
##    Governorate      HouseType Monthly_Income_OMR_mean Monthly_Income_OMR_sd
##    <fct>            <fct>                       <dbl>                 <dbl>
##  1 Al Batinah North Apartment                   1522.                  398.
##  2 Al Batinah North Villa                       1312.                  201.
##  3 Al Batinah South Apartment                    993.                  445.
##  4 Al Batinah South Villa                       1452.                  374.
##  5 Al Buraimi       Apartment                   1367.                  439.
##  6 Al Buraimi       Villa                       1298.                  292.
##  7 Al Dakhiliyah    Apartment                   1218.                  395.
##  8 Al Dakhiliyah    Villa                       1225.                  513.
##  9 Al Dhahirah      Apartment                   1394.                  448.
## 10 Al Dhahirah      Villa                       1113.                  441.
## # ℹ 12 more rows
## # ℹ 6 more variables: Family_Size_mean <dbl>, Family_Size_sd <dbl>,
## #   Electricity_Usage_kWh_mean <dbl>, Electricity_Usage_kWh_sd <dbl>,
## #   Water_Consumption_L_day_mean <dbl>, Water_Consumption_L_day_sd <dbl>

Interpretation:

We computed grouped summary statistics of income, family size, electricity usage, and water consumption across governorates and house types.

On average, villas showed higher electricity usage and water consumption compared to apartments.
Households in Muscat had relatively higher income and energy usage compared to other governorates.

3.2 Visualizations

ggboxplot(df, x = "Governorate", y = "Electricity_Usage_kWh", color = "HouseType",
          palette = "jco", add = "jitter",
          title = "Electricity Usage by Governorate and House Type")

ggscatter(df, x = "Monthly_Income_OMR", y = "Electricity_Usage_kWh", color = "Governorate",
          add = "reg.line", conf.int = TRUE) +
  labs(title = "Electricity Usage vs Income by Governorate")

Interpretation:

The boxplot indicates substantial variation in electricity usage between different governorates, with Muscat having wider usage ranges. Villas generally exhibit higher electricity usage compared to apartments.

The scatter plot shows a positive relationship between household income and electricity usage

5. Test for Normality and Homogeneity

by(df$`Electricity_Usage_kWh`, df$Governorate, shapiro.test)

## df$Governorate: Al Batinah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95535, p-value = 0.4013
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Batinah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.84742, p-value = 0.01597
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Buraimi
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.98832, p-value = 0.9932
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dakhiliyah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.9395, p-value = 0.3123
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Dhahirah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96486, p-value = 0.7237
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah North
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.93424, p-value = 0.2072
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Sharqiyah South
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.97209, p-value = 0.9179
## 
## ------------------------------------------------------------ 
## df$Governorate: Al Wusta
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.92153, p-value = 0.2033
## 
## ------------------------------------------------------------ 
## df$Governorate: Dhofar
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95865, p-value = 0.5172
## 
## ------------------------------------------------------------ 
## df$Governorate: Musandam
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95017, p-value = 0.5633
## 
## ------------------------------------------------------------ 
## df$Governorate: Muscat
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96514, p-value = 0.5026

leveneTest(Electricity_Usage_kWh ~ Governorate, data = df)

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group  10  0.4634 0.9118
##       189

Interpretation:

Shapiro-Wilk tests per governorate show that electricity usage is approximately normally distributed within most groups (p > 0.05).

Levene’s Test p-value > 0.05 suggests that variances are equal across governorates, so the assumption of homogeneity of variances is met.

6. ANOVA Analysis

6.1 One-Way ANOVA

anova1 <- aov(Electricity_Usage_kWh ~ Governorate, data = df)
base::summary(anova1)

##              Df  Sum Sq Mean Sq F value Pr(>F)
## Governorate  10   71224    7122   0.283  0.984
## Residuals   189 4760528   25188

Interpretation: The one-way ANOVA shows that there is no statistically significant difference in electricity usage between governorates (p = 0.984). This means that electricity consumption patterns are similar across all governorates in the dataset.

6.2 Two-Way ANOVA with Interaction

anova2 <- aov(Electricity_Usage_kWh ~ Governorate * HouseType, data = df)
Anova(anova2, type = 3)

## Anova Table (Type III tests)
## 
## Response: Electricity_Usage_kWh
##                        Sum Sq  Df  F value Pr(>F)    
## (Intercept)           4298125   1 170.2698 <2e-16 ***
## Governorate             59225  10   0.2346 0.9925    
## HouseType               44040   1   1.7446 0.1882    
## Governorate:HouseType  149403  10   0.5919 0.8193    
## Residuals             4493260 178                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: The two-way ANOVA shows that there are no statistically significant main effects of governorate (p = 0.992) or house type (p = 0.188) on electricity usage.

Furthermore, the interaction between governorate and house type is not statistically significant (p = 0.819), indicating that the relationship between house type and electricity usage does not depend on the governorate.

6.3 Effect Plot

plot(allEffects(anova2), main = "Interaction: Governorate * HouseType")

Interpretation: This plot visualizes how the effect of house type varies by governorate. Parallel lines imply no interaction; crossing lines suggest interaction exists.

7. Linear Regression and Diagnostics

7.1 Fit the Model

model <- lm(Electricity_Usage_kWh ~ Governorate + HouseType + Monthly_Income_OMR +
              Family_Size + Water_Consumption_L_day, data = df)
summary(model)

## 
## Call:
## lm(formula = Electricity_Usage_kWh ~ Governorate + HouseType + 
##     Monthly_Income_OMR + Family_Size + Water_Consumption_L_day, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -322.02 -119.28   -6.69  131.54  334.44 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   608.78820   70.85832   8.592 3.52e-15 ***
## GovernorateAl Batinah South   -44.72978   53.21669  -0.841   0.4017    
## GovernorateAl Buraimi          -7.10293   47.75428  -0.149   0.8819    
## GovernorateAl Dakhiliyah        6.98910   51.61969   0.135   0.8924    
## GovernorateAl Dhahirah         -5.17447   51.67438  -0.100   0.9203    
## GovernorateAl Sharqiyah North -28.89216   50.09155  -0.577   0.5648    
## GovernorateAl Sharqiyah South   3.52391   55.82040   0.063   0.9497    
## GovernorateAl Wusta           -55.88265   52.92171  -1.056   0.2924    
## GovernorateDhofar             -26.72079   49.35620  -0.541   0.5889    
## GovernorateMusandam            -1.67871   54.35566  -0.031   0.9754    
## GovernorateMuscat              -7.39094   46.37661  -0.159   0.8736    
## HouseTypeVilla                -51.13798   22.81714  -2.241   0.0262 *  
## Monthly_Income_OMR             -0.03003    0.02814  -1.067   0.2873    
## Family_Size                    -3.12693    4.43556  -0.705   0.4817    
## Water_Consumption_L_day         0.01297    0.05170   0.251   0.8022    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157.6 on 185 degrees of freedom
## Multiple R-squared:  0.04884,    Adjusted R-squared:  -0.02314 
## F-statistic: 0.6785 on 14 and 185 DF,  p-value: 0.7936

Interpretation: The model estimates how each variable affects electricity usage. Coefficients show the direction and strength of influence; p-values indicate significance.

7.2 Diagnostic Plots

par(mfrow = c(2, 2))
plot(model)

Interpretation: These plots help assess linear regression assumptions: linearity, normality of residuals, constant variance, and detection of influential outliers.

Residuals are roughly centered around zero (no major bias).
No strong funnel pattern is observed (homoscedasticity is reasonable).
Q-Q plot suggests residuals are approximately normally distributed.
No major outliers or influential points based on leverage/Cook’s distance.

7.3 Multicollinearity

vif(model)

##                             GVIF Df GVIF^(1/(2*Df))
## Governorate             1.221129 10        1.010039
## HouseType               1.047865  1        1.023653
## Monthly_Income_OMR      1.064766  1        1.031875
## Family_Size             1.078470  1        1.038494
## Water_Consumption_L_day 1.053703  1        1.026500

Interpretation: Variance Inflation Factor (VIF) values for all predictors are below 2, indicating no serious multicollinearity issues among the predictors.

8. Publication-Ready Output

8.1 Regression Summary Table

tbl_regression(model, exponentiate = FALSE) %>% bold_labels()

Characteristic	Beta	95% CI	p-value
Governorate
Al Batinah North	—	—
Al Batinah South	-45	-150, 60	0.4
Al Buraimi	-7.1	-101, 87	0.9
Al Dakhiliyah	7.0	-95, 109	0.9
Al Dhahirah	-5.2	-107, 97	>0.9
Al Sharqiyah North	-29	-128, 70	0.6
Al Sharqiyah South	3.5	-107, 114	>0.9
Al Wusta	-56	-160, 49	0.3
Dhofar	-27	-124, 71	0.6
Musandam	-1.7	-109, 106	>0.9
Muscat	-7.4	-99, 84	0.9
HouseType
Apartment	—	—
Villa	-51	-96, -6.1	0.026
Monthly_Income_OMR	-0.03	-0.09, 0.03	0.3
Family_Size	-3.1	-12, 5.6	0.5
Water_Consumption_L_day	0.01	-0.09, 0.11	0.8
Abbreviation: CI = Confidence Interval

Interpretation: Presents a clean table summarizing regression coefficients, confidence intervals, and significance for reporting.

8.2 ANOVA Summary Table

tidy(Anova(anova2, type = 3))

## # A tibble: 5 × 5
##   term                     sumsq    df statistic   p.value
##   <chr>                    <dbl> <dbl>     <dbl>     <dbl>
## 1 (Intercept)           4298125.     1   170.     9.68e-28
## 2 Governorate             59225.    10     0.235  9.92e- 1
## 3 HouseType               44040.     1     1.74   1.88e- 1
## 4 Governorate:HouseType  149403.    10     0.592  8.19e- 1
## 5 Residuals             4493260.   178    NA     NA

Interpretation: The ANOVA summary table confirms the significance of main and interaction effects between governorate and house type on electricity usage.

# Load the data
data <- read_csv("C:/Users/mohaa/OneDrive/Desktop/S3336_PRT2_PRJCT/omani_household_energy_survey_200.csv")

## Rows: 200 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Governorate, HouseType
## dbl (4): Monthly_Income(OMR), Family_Size, Electricity_Usage_kWh, Water_Cons...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

colnames(data) <- make.names(colnames(data))  # Make variable names safe for formula

# Get numeric variables
numeric_vars <- names(data)[sapply(data, is.numeric)]

# UI
ui <- fluidPage(
  titlePanel("Omani Household Energy Survey - Linear Model App"),
  
  sidebarLayout(
    sidebarPanel(
      selectInput("response", "Response Variable:", choices = numeric_vars),
      checkboxGroupInput("predictors", "Predictor(s):", choices = numeric_vars)
    ),
    
    mainPanel(
      verbatimTextOutput("summary"),
      verbatimTextOutput("vif"),
      plotOutput("plots")
    )
  )
)

# Server
server <- function(input, output) {
  
  model <- reactive({
    req(input$response, input$predictors)
    predictors <- setdiff(input$predictors, input$response)
    if (length(predictors) == 0) return(NULL)
    
    formula <- as.formula(paste(input$response, "~", paste(predictors, collapse = "+")))
    lm(formula, data = data)
  })
  
  output$summary <- renderPrint({
    if (!is.null(model())) summary(model())
  })
  
  output$vif <- renderPrint({
    if (!is.null(model())) vif(model())
  })
  
  output$plots <- renderPlot({
    if (!is.null(model())) {
      par(mfrow = c(2, 2))
      plot(model())
    }
  })
}

shinyApp(ui = ui, server = server)

Shiny applications not supported in static R Markdown documents

S3336 PROJECT

Mohammed AlRashdi

2025-05-7

2. Import Data

3. Exploratory Data Analysis

3.1 Summary Statistics

3.2 Visualizations

5. Test for Normality and Homogeneity

6. ANOVA Analysis

6.1 One-Way ANOVA

6.2 Two-Way ANOVA with Interaction

6.3 Effect Plot

7. Linear Regression and Diagnostics

7.1 Fit the Model

7.2 Diagnostic Plots

7.3 Multicollinearity

8. Publication-Ready Output

8.1 Regression Summary Table

8.2 ANOVA Summary Table