library(readxl)
Mydata <- read_excel("RData.xlsx") # Make sure the file is in your working directory
head(Mydata)
#Interpretation:
#Loads your Excel data (RData.xlsx) into a dataframe named data.
library(dplyr)
library(tidyverse)
Mydata %>%
group_by(Gender, Wilayat) %>%
summarise(across(where(is.numeric), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"), .groups = "drop")
#Interpretation:
#Groups the data by Gender and Wilayat.
#Calculates mean and standard deviation for all numeric columns in each group.
###Boxplot of Monthly Income
library(ggpubr)
ggboxplot(Mydata, x = "Wilayat", y = "Monthly_Income", color = "Gender",
palette = "jco", add = "jitter",
title = "Monthly Income by Wilayat and Gender")

#Interpretation:
#Draws a boxplot of Monthly_Income across Wilayat, split by Gender.
#Enlarges outliers and adds jittered points.
### Scatterplot of Age vs Income
library(ggpubr)
ggscatter(Mydata, x = "Age", y = "Monthly_Income", color = "Wilayat",
add = "reg.line", conf.int = TRUE) +
labs(title = "Income vs Age by Wilayat")

#Interpretation:
#Creates a scatterplot of Age vs Monthly_Income, colored by Wilayat.
#Adds a regression line with confidence interval to show trend.
## Assumption Tests
### Shapiro-Wilk Test for Normality
by(Mydata$Monthly_Income, Mydata$Wilayat, shapiro.test)
## Mydata$Wilayat: Buraimi
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.92421, p-value = 0.02129
##
## ---------------------------------------------------------
## Mydata$Wilayat: Muscat
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96657, p-value = 0.4922
##
## ---------------------------------------------------------
## Mydata$Wilayat: Nizwa
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.96511, p-value = 0.3072
##
## ---------------------------------------------------------
## Mydata$Wilayat: Salalah
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.98409, p-value = 0.9673
##
## ---------------------------------------------------------
## Mydata$Wilayat: Sohar
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.9548, p-value = 0.08312
##
## ---------------------------------------------------------
## Mydata$Wilayat: Sur
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95734, p-value = 0.1778
#Interpretation:
#This line performs the Shapiro-Wilk normality test on Monthly_Income separately for each Wilayat.
#It uses the by() function to apply shapiro.test() to subsets of the data.
#Each subset is grouped by the levels of the Wilayat categorical variable.
### Levene’s Test for Homogeneity
library(car)
leveneTest(Monthly_Income ~ Wilayat, data = Mydata)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 1.0826 0.3713
## 194
#Interpretation:
#This code performs Levene’s Test to assess the homogeneity of variances in Monthly_Income across different Wilayat groups.
#Look at the p-value:
#p > 0.05 → Fail to reject H₀ → Variances are equal → Assumption met.
#p ≤ 0.05 → Reject H₀ → Variances are unequal → Assumption violated.
#One-way ANOVA
anova_1 <- aov(Monthly_Income ~ Wilayat, data = Mydata)
base::summary(anova_1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Wilayat 5 348342 69668 1.578 0.168
## Residuals 194 8566898 44159
#Interpretation:
#Check the p-value under Pr(>F):
#If p ≤ 0.05 → Reject H₀ → Significant difference in means between wilayats.
#If p > 0.05 → Fail to reject H₀ → No significant difference in income across wilayats.
#Two-way ANOVA
anova_2 <- aov(Monthly_Income ~ Gender * Wilayat, data = Mydata)
Anova(anova_2, type = 3)
## Anova Table (Type III tests)
##
## Response: Monthly_Income
## Sum Sq Df F value Pr(>F)
## (Intercept) 6356703 1 141.3584 <2e-16 ***
## Gender 83022 1 1.8462 0.1759
## Wilayat 202440 5 0.9004 0.4820
## Gender:Wilayat 103198 5 0.4590 0.8064
## Residuals 8454115 188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Interpretation:
#for the term "Gender" , If p < 0.05 → significant difference between genders
#for the term "Wilayat" , If p < 0.05 → income differs across wilayats
#for the term "Gender:Wilayat" , If p < 0.05 → interaction is significant (effect of gender differs across wilayats)
#Effect Plot
library(effects)
plot(allEffects(anova_2), main = "Interaction Effects: Gender * Wilayat")

#Interpretation:
#This code generates an effect plot from your two-way ANOVA model (anova_2), which includes both main effects and interaction between Gender and Wilayat.
#It uses the allEffects() function from the effects package to calculate and plot the estimated marginal means (adjusted means) of Monthly_Income for each group and interaction level.
#The plot shows:
#The y-axis represents the predicted (mean) Monthly_Income.
#The x-axis typically shows Wilayat, and separate lines or panels show the effect of Gender.
#If the lines for Male and Female are parallel, there’s no interaction.
#If the lines cross or diverge, it indicates a potential interaction effect between Gender and Wilayat.
#Linear Regression
model <- lm(Monthly_Income ~ Gender + Wilayat + Age + Working_Hours + Education_Years, data = Mydata)
base::summary(model)
##
## Call:
## lm(formula = Monthly_Income ~ Gender + Wilayat + Age + Working_Hours +
## Education_Years, data = Mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -410.04 -152.29 -9.61 146.75 712.68
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 615.5357 148.3911 4.148 5.05e-05 ***
## GenderMale 21.4365 31.3181 0.684 0.4945
## WilayatMuscat -34.7767 55.0471 -0.632 0.5283
## WilayatNizwa 50.6218 50.5520 1.001 0.3179
## WilayatSalalah 98.5612 57.9828 1.700 0.0908 .
## WilayatSohar -24.2928 48.8176 -0.498 0.6193
## WilayatSur -13.8282 50.7264 -0.273 0.7855
## Age -1.4981 1.2135 -1.234 0.2185
## Working_Hours 0.5694 1.9312 0.295 0.7684
## Education_Years 8.0238 5.7290 1.401 0.1630
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 210 on 190 degrees of freedom
## Multiple R-squared: 0.05984, Adjusted R-squared: 0.01531
## F-statistic: 1.344 on 9 and 190 DF, p-value: 0.2168
#Interpretation:
#Coefficients: Show how much income changes for each unit change in predictors.
#For example: If Age has a coefficient of +5, then income increases 5 units per year of age.
#Intercept: Expected income when all predictors are at baseline (e.g., Female, Muscat, Age = 0, etc.).
#R-squared: Shows how much of the variability in income is explained by the model.
#Closer to 1 = better fit.
#F-statistic: Overall test of whether the regression model is significant.
## Diagnostic Plots
par(mfrow = c(2, 2))
plot(model)

#Interpretation:
#Residuals vs Fitted
#Checks linearity and homoscedasticity (equal variance).
#Random scatter → OK
#Curved patterns or fan shapes → Violations present
#Normal Q-Q Plot
#Checks if residuals are normally distributed.
#Points should lie along the straight line
#Major deviations → Normality assumption may be violated
#Scale-Location Plot (Spread vs Fitted)
#Checks for homoscedasticity (constant variance).
#A horizontal band = good
#Spread increasing/decreasing = heteroscedasticity
#Residuals vs Leverage
#Detects influential observations using leverage and Cook’s distance.
#Watch for points far from others or with large Cook’s distance
#These can disproportionately affect the model
## Multicollinearity Check (VIF)
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Gender 1.057881 1 1.028534
## Wilayat 1.111337 5 1.010612
## Age 1.060179 1 1.029650
## Working_Hours 1.033705 1 1.016713
## Education_Years 1.047832 1 1.023637
#Interpretation:
#vif(model) computes the Variance Inflation Factor (VIF) for each predictor in your regression model.
#It helps you detect multicollinearity — a situation where predictor variables
#are highly correlated with each other.
# VIF Value | Interpretation
#≈ 1 | No multicollinearity
#1 – 5 | Moderate multicollinearity (acceptable)
#> 5 | High multicollinearity (concerning)
#> 10 | Very high (problematic – needs attention)
### Regression Summary Table
library(gtsummary)
tbl_regression(model, exponentiate = FALSE) %>%
bold_labels()
| Characteristic |
Beta |
95% CI |
p-value |
| Gender |
|
|
|
| Female |
— |
— |
|
| Male |
21 |
-40, 83 |
0.5 |
| Wilayat |
|
|
|
| Buraimi |
— |
— |
|
| Muscat |
-35 |
-143, 74 |
0.5 |
| Nizwa |
51 |
-49, 150 |
0.3 |
| Salalah |
99 |
-16, 213 |
0.091 |
| Sohar |
-24 |
-121, 72 |
0.6 |
| Sur |
-14 |
-114, 86 |
0.8 |
| Age |
-1.5 |
-3.9, 0.90 |
0.2 |
| Working_Hours |
0.57 |
-3.2, 4.4 |
0.8 |
| Education_Years |
8.0 |
-3.3, 19 |
0.2 |
| Abbreviation: CI = Confidence Interval |
#Interpretation:
#Generates a formatted summary table of a regression model.
#It displays the raw regression coefficients (not exponentiated).
#And it bolds the labels of the variables to improve the table’s readability and presentation.
#ANOVA Summary Table
library(broom)
tidy(Anova(anova_2, type = 3))
#Interpretation:
#This code runs a Type III ANOVA on the model anova_2 to assess the significance of each term (including interactions), and tidies the results into a clean data frame format for easier analysis and reporting.
library(shiny)
library(car)
library(ggplot2)
library(readxl)
# Load the dataset
data <- read_excel("RData.xlsx", sheet = "Sheet1")
# UI
ui <- fluidPage(
titlePanel("STAT 3336 - Linear Regression App with car Package"),
sidebarLayout(
sidebarPanel(
selectInput("response", "Select Response Variable:",
choices = names(data)[sapply(data, is.numeric)],
selected = "Monthly_Income"),
checkboxGroupInput("predictors", "Select Predictor(s):",
choices = names(data)[sapply(data, is.numeric) & names(data) != "Monthly_Income"],
selected = c("Age", "Working_Hours", "Education_Years"))
),
mainPanel(
h4("Model Summary"),
verbatimTextOutput("modelSummary"),
h4("VIF (Variance Inflation Factor)"),
verbatimTextOutput("vifOutput"),
h4("Diagnostic Plots"),
plotOutput("diagnosticPlots")
)
)
)
# Server
server <- function(input, output) {
model <- reactive({
req(input$predictors)
formula <- as.formula(paste(input$response, "~", paste(input$predictors, collapse = "+")))
lm(formula, data = data)
})
output$modelSummary <- renderPrint({
summary(model())
})
output$vifOutput <- renderPrint({
if (length(input$predictors) >= 2) {
vif(model())
} else {
"VIF not available: Select at least 2 predictors."
}
})
output$diagnosticPlots <- renderPlot({
par(mfrow = c(2, 2))
plot(model())
})
}
# Launch App
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents