STAT3336 Project

library(readxl)
Mydata <- read_excel("RData.xlsx")  # Make sure the file is in your working directory
head(Mydata)


#Interpretation:
#Loads your Excel data (RData.xlsx) into a dataframe named data.

library(dplyr)
library(tidyverse)
Mydata %>%
  group_by(Gender, Wilayat) %>%
  summarise(across(where(is.numeric), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"), .groups = "drop")

#Interpretation:
#Groups the data by Gender and Wilayat.
#Calculates mean and standard deviation for all numeric columns in each group.

###Boxplot of Monthly Income
library(ggpubr)
ggboxplot(Mydata, x = "Wilayat", y = "Monthly_Income", color = "Gender",
          palette = "jco", add = "jitter",
          title = "Monthly Income by Wilayat and Gender")

#Interpretation:
#Draws a boxplot of Monthly_Income across Wilayat, split by Gender.
#Enlarges outliers and adds jittered points.

### Scatterplot of Age vs Income
library(ggpubr)
ggscatter(Mydata, x = "Age", y = "Monthly_Income", color = "Wilayat",
          add = "reg.line", conf.int = TRUE) +
  labs(title = "Income vs Age by Wilayat")

#Interpretation:
#Creates a scatterplot of Age vs Monthly_Income, colored by Wilayat.
#Adds a regression line with confidence interval to show trend.


## Assumption Tests
### Shapiro-Wilk Test for Normality
by(Mydata$Monthly_Income, Mydata$Wilayat, shapiro.test)

## Mydata$Wilayat: Buraimi
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.92421, p-value = 0.02129
## 
## --------------------------------------------------------- 
## Mydata$Wilayat: Muscat
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96657, p-value = 0.4922
## 
## --------------------------------------------------------- 
## Mydata$Wilayat: Nizwa
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.96511, p-value = 0.3072
## 
## --------------------------------------------------------- 
## Mydata$Wilayat: Salalah
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.98409, p-value = 0.9673
## 
## --------------------------------------------------------- 
## Mydata$Wilayat: Sohar
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.9548, p-value = 0.08312
## 
## --------------------------------------------------------- 
## Mydata$Wilayat: Sur
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.95734, p-value = 0.1778

#Interpretation:
#This line performs the Shapiro-Wilk normality test on Monthly_Income separately for each Wilayat.
#It uses the by() function to apply shapiro.test() to subsets of the data.
#Each subset is grouped by the levels of the Wilayat categorical variable.

### Levene’s Test for Homogeneity
library(car)
leveneTest(Monthly_Income ~ Wilayat, data = Mydata)

## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   5  1.0826 0.3713
##       194

#Interpretation:
#This code performs Levene’s Test to assess the homogeneity of variances in Monthly_Income across different Wilayat groups.
#Look at the p-value:
#p > 0.05 → Fail to reject H₀ → Variances are equal → Assumption met.
#p ≤ 0.05 → Reject H₀ → Variances are unequal → Assumption violated.

#One-way ANOVA
anova_1 <- aov(Monthly_Income ~ Wilayat, data = Mydata)
base::summary(anova_1)

##              Df  Sum Sq Mean Sq F value Pr(>F)
## Wilayat       5  348342   69668   1.578  0.168
## Residuals   194 8566898   44159

#Interpretation:
#Check the p-value under Pr(>F):
#If p ≤ 0.05 → Reject H₀ → Significant difference in means between wilayats.
#If p > 0.05 → Fail to reject H₀ → No significant difference in income across wilayats.

#Two-way ANOVA
anova_2 <- aov(Monthly_Income ~ Gender * Wilayat, data = Mydata)
Anova(anova_2, type = 3)

## Anova Table (Type III tests)
## 
## Response: Monthly_Income
##                 Sum Sq  Df  F value Pr(>F)    
## (Intercept)    6356703   1 141.3584 <2e-16 ***
## Gender           83022   1   1.8462 0.1759    
## Wilayat         202440   5   0.9004 0.4820    
## Gender:Wilayat  103198   5   0.4590 0.8064    
## Residuals      8454115 188                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Interpretation:
#for the term "Gender" , If p < 0.05 → significant difference between genders
#for the term "Wilayat" , If p < 0.05 → income differs across wilayats
#for the term "Gender:Wilayat" , If p < 0.05 → interaction is significant (effect of gender differs across wilayats)

#Effect Plot
library(effects)
plot(allEffects(anova_2), main = "Interaction Effects: Gender * Wilayat")

#Interpretation:
#This code generates an effect plot from your two-way ANOVA model (anova_2), which includes both main effects and interaction between Gender and Wilayat.
#It uses the allEffects() function from the effects package to calculate and plot the estimated marginal means (adjusted means) of Monthly_Income for each group and interaction level.
#The plot shows:
#The y-axis represents the predicted (mean) Monthly_Income.
#The x-axis typically shows Wilayat, and separate lines or panels show the effect of Gender.
#If the lines for Male and Female are parallel, there’s no interaction.
#If the lines cross or diverge, it indicates a potential interaction effect between Gender and Wilayat.

#Linear Regression
model <- lm(Monthly_Income ~ Gender + Wilayat + Age + Working_Hours + Education_Years, data = Mydata)
base::summary(model)

## 
## Call:
## lm(formula = Monthly_Income ~ Gender + Wilayat + Age + Working_Hours + 
##     Education_Years, data = Mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -410.04 -152.29   -9.61  146.75  712.68 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     615.5357   148.3911   4.148 5.05e-05 ***
## GenderMale       21.4365    31.3181   0.684   0.4945    
## WilayatMuscat   -34.7767    55.0471  -0.632   0.5283    
## WilayatNizwa     50.6218    50.5520   1.001   0.3179    
## WilayatSalalah   98.5612    57.9828   1.700   0.0908 .  
## WilayatSohar    -24.2928    48.8176  -0.498   0.6193    
## WilayatSur      -13.8282    50.7264  -0.273   0.7855    
## Age              -1.4981     1.2135  -1.234   0.2185    
## Working_Hours     0.5694     1.9312   0.295   0.7684    
## Education_Years   8.0238     5.7290   1.401   0.1630    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 210 on 190 degrees of freedom
## Multiple R-squared:  0.05984,    Adjusted R-squared:  0.01531 
## F-statistic: 1.344 on 9 and 190 DF,  p-value: 0.2168

#Interpretation:
#Coefficients: Show how much income changes for each unit change in predictors.
#For example: If Age has a coefficient of +5, then income increases 5 units per year of age.
#Intercept: Expected income when all predictors are at baseline (e.g., Female, Muscat, Age = 0, etc.).
#R-squared: Shows how much of the variability in income is explained by the model.
#Closer to 1 = better fit.
#F-statistic: Overall test of whether the regression model is significant.

## Diagnostic Plots
par(mfrow = c(2, 2))
plot(model)

#Interpretation:
#Residuals vs Fitted
#Checks linearity and homoscedasticity (equal variance).
#Random scatter → OK
#Curved patterns or fan shapes → Violations present
#Normal Q-Q Plot
#Checks if residuals are normally distributed.
#Points should lie along the straight line
#Major deviations → Normality assumption may be violated
#Scale-Location Plot (Spread vs Fitted)
#Checks for homoscedasticity (constant variance).
#A horizontal band = good
#Spread increasing/decreasing = heteroscedasticity
#Residuals vs Leverage
#Detects influential observations using leverage and Cook’s distance.
#Watch for points far from others or with large Cook’s distance
#These can disproportionately affect the model

## Multicollinearity Check (VIF)
vif(model)

##                     GVIF Df GVIF^(1/(2*Df))
## Gender          1.057881  1        1.028534
## Wilayat         1.111337  5        1.010612
## Age             1.060179  1        1.029650
## Working_Hours   1.033705  1        1.016713
## Education_Years 1.047832  1        1.023637

#Interpretation:
#vif(model) computes the Variance Inflation Factor (VIF) for each predictor in your regression model.
#It helps you detect multicollinearity — a situation where predictor variables 
#are highly correlated with each other.

# VIF Value | Interpretation
#≈ 1 | No multicollinearity
#1 – 5 | Moderate multicollinearity (acceptable)
#> 5 | High multicollinearity (concerning)
#> 10 | Very high (problematic – needs attention)

### Regression Summary Table
library(gtsummary)
tbl_regression(model, exponentiate = FALSE) %>%
  bold_labels()

Characteristic	Beta	95% CI	p-value
Gender
Female	—	—
Male	21	-40, 83	0.5
Wilayat
Buraimi	—	—
Muscat	-35	-143, 74	0.5
Nizwa	51	-49, 150	0.3
Salalah	99	-16, 213	0.091
Sohar	-24	-121, 72	0.6
Sur	-14	-114, 86	0.8
Age	-1.5	-3.9, 0.90	0.2
Working_Hours	0.57	-3.2, 4.4	0.8
Education_Years	8.0	-3.3, 19	0.2
Abbreviation: CI = Confidence Interval

#Interpretation:
#Generates a formatted summary table of a regression model.
#It displays the raw regression coefficients (not exponentiated).
#And it bolds the labels of the variables to improve the table’s readability and presentation.


#ANOVA Summary Table
library(broom)
tidy(Anova(anova_2, type = 3))

#Interpretation:
#This code runs a Type III ANOVA on the model anova_2 to assess the significance of each term (including interactions), and tidies the results into a clean data frame format for easier analysis and reporting.

library(shiny)
library(car)
library(ggplot2)
library(readxl)

# Load the dataset
data <- read_excel("RData.xlsx", sheet = "Sheet1")

# UI
ui <- fluidPage(
  titlePanel("STAT 3336 - Linear Regression App with car Package"),
  sidebarLayout(
    sidebarPanel(
      selectInput("response", "Select Response Variable:", 
                  choices = names(data)[sapply(data, is.numeric)], 
                  selected = "Monthly_Income"),
      checkboxGroupInput("predictors", "Select Predictor(s):", 
                         choices = names(data)[sapply(data, is.numeric) & names(data) != "Monthly_Income"],
                         selected = c("Age", "Working_Hours", "Education_Years"))
    ),
    mainPanel(
      h4("Model Summary"),
      verbatimTextOutput("modelSummary"),
      h4("VIF (Variance Inflation Factor)"),
      verbatimTextOutput("vifOutput"),
      h4("Diagnostic Plots"),
      plotOutput("diagnosticPlots")
    )
  )
)

# Server
server <- function(input, output) {
  
  model <- reactive({
    req(input$predictors)
    formula <- as.formula(paste(input$response, "~", paste(input$predictors, collapse = "+")))
    lm(formula, data = data)
  })
  
  output$modelSummary <- renderPrint({
    summary(model())
  })
  
  output$vifOutput <- renderPrint({
    if (length(input$predictors) >= 2) {
      vif(model())
    } else {
      "VIF not available: Select at least 2 predictors."
    }
  })
  
  output$diagnosticPlots <- renderPlot({
    par(mfrow = c(2, 2))
    plot(model())
  })
}

# Launch App
shinyApp(ui = ui, server = server)

Shiny applications not supported in static R Markdown documents

STAT3336 Project

Al-Waleed Al Masqari

2025-05-08