library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Bring the built-in iris data frame into the workspace
data("iris")
# Show the first six rows as a quick sanity check of the columns
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Multiple linear regression: sepal length explained by the two petal
# measurements. The coefficient table and fit statistics are printed below.
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
summary(object = model)

## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length + Petal.Width, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.18534 -0.29838 -0.02763  0.28925  1.02320 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.19058    0.09705  43.181  < 2e-16 ***
## Petal.Length  0.54178    0.06928   7.820 9.41e-13 ***
## Petal.Width  -0.31955    0.16045  -1.992   0.0483 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4031 on 147 degrees of freedom
## Multiple R-squared:  0.7663, Adjusted R-squared:  0.7631 
## F-statistic:   241 on 2 and 147 DF,  p-value: < 2.2e-16

# Total number of missing cells across the whole data frame (0 = complete data)
sum(is.na(iris))

## [1] 0

# Per-column overview: min/quartiles/mean/max for the numeric measurements
# and the number of observations in each Species level.
summary(iris) # summary statistics for all variables

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

# Compact structure display, used here to confirm that the four measurements
# are numeric and that Species is a factor before modelling.
str(iris) # shows variable names, data types, and number of observations

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Count the number of missing values in each variable
colSums(is.na(iris))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

# Remove rows containing missing values. `tree` is the cleaned copy of iris
# that the boxplot further down is drawn from.
tree <- na.omit(iris)

# Check the dimensions of the cleaned dataset. This must inspect `tree`, not
# `iris`; otherwise the effect of na.omit() is never actually verified.
dim(tree)

## [1] 150   5

# Fit the two-predictor model for sepal length
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
# No newdata is supplied, so predict() returns one prediction per row of the
# data the model was fitted on.
predicted_sepal_length <- predict(object = model)
# Preview the first six predictions
head(predicted_sepal_length, n = 6L)

##        1        2        3        4        5        6 
## 4.885160 4.885160 4.830983 4.939338 4.885160 4.983783

# Side-by-side boxplots of the four numeric measurements in the cleaned data,
# drawn in a single panel to spot outliers visually.
boxplot(
  tree[c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")],
  names = c("Sepal Length", "Sepal Width", "Petal Length", "Petal Width"),
  main = "Outlier Detection for Iris Numerical Variables",
  ylab = "Measurement (cm)",
  col = c("lightblue", "lightgreen", "lightyellow", "lightsalmon"),
  outline = TRUE # draw the points beyond the whiskers (potential outliers)
)

### MODEL DIAGNOSTICS
### plot(model) draws four diagnostic plots:
#   1. Residuals vs Fitted   - checks the linearity assumption.
#   2. Normal Q-Q            - checks normality of residuals.
#   3. Scale-Location        - checks constant variance (homoscedasticity).
#   4. Residuals vs Leverage - identifies influential observations.

# Arrange the four plots in a 2 x 2 layout. par() returns the previous
# settings, which are kept so they can be restored afterwards.
old_par <- par(mfrow = c(2, 2))

plot(model)

# Restore the original layout; without this every later plot in the script
# would be squeezed into a cell of the leftover 2 x 2 grid.
par(old_par)

### SCATTER PLOTS FOR VISUALIZATION
## Sepal.Length against Petal.Length, with the simple-regression line overlaid
with(
  iris,
  plot(
    Petal.Length, Sepal.Length,
    main = "Sepal.Length vs Petal.Length",
    xlab = "Petal Length (cm)",
    ylab = "Sepal Length (cm)"
  )
)
# One-predictor fit used only to draw the trend line on this plot
petal_length_fit <- lm(Sepal.Length ~ Petal.Length, data = iris)
abline(reg = petal_length_fit, col = "tomato", lwd = 2)

## Sepal.Length against Petal.Width, with the simple-regression line overlaid
with(
  iris,
  plot(
    Petal.Width, Sepal.Length,
    main = "Sepal.Length vs Petal.Width",
    xlab = "Petal Width (cm)",
    ylab = "Sepal Length (cm)"
  )
)
# One-predictor fit used only to draw the trend line on this plot
petal_width_fit <- lm(Sepal.Length ~ Petal.Width, data = iris)
abline(reg = petal_width_fit, col = "darkblue", lwd = 2)

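## MULTIPLE LINEAR REGRESSION SUMMARY
# The multiple linear regression analysis indicates that Petal.Length and
# Petal.Width are both statistically significant predictors of Sepal.Length,
# but with opposite signs. Petal.Length is the strongest predictor: holding
# petal width constant, a one-cm increase in petal length is associated with
# an increase of approximately 0.54 cm in sepal length (p < 0.001).
# Petal.Width has a smaller, negative, and only marginally significant partial
# effect (about -0.32 cm per cm, p = 0.048), even though its simple
# correlation with Sepal.Length is positive (r = 0.82). This sign change
# reflects the very strong correlation between the two petal measurements
# (r = 0.96), so the individual coefficients should be interpreted with care.
# The model explains 76.6% of the variability in Sepal.Length (R² = 0.766),
# demonstrating a good fit, and can be used to predict the sepal length of
# iris flowers based on their petal measurements.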

# Fit the two-predictor model and print its coefficient table so the numbers
# quoted in the interpretation can be checked against the output below.
model <- lm(
  formula = Sepal.Length ~ Petal.Length + Petal.Width,
  data = iris
)
summary(object = model)

## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length + Petal.Width, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.18534 -0.29838 -0.02763  0.28925  1.02320 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.19058    0.09705  43.181  < 2e-16 ***
## Petal.Length  0.54178    0.06928   7.820 9.41e-13 ***
## Petal.Width  -0.31955    0.16045  -1.992   0.0483 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4031 on 147 degrees of freedom
## Multiple R-squared:  0.7663, Adjusted R-squared:  0.7631 
## F-statistic:   241 on 2 and 147 DF,  p-value: < 2.2e-16

## CORRELATION MATRIX
# Examining the correlation among selected variables reveals the strength
# and direction of linear relationships between Sepal.Length, Petal.Length,
# and Petal.Width. Values close to 1 indicate a strong positive correlation.

# Pairwise Pearson correlations for the response and the two model predictors
cor(subset(iris, select = c(Sepal.Length, Petal.Length, Petal.Width)))

##              Sepal.Length Petal.Length Petal.Width
## Sepal.Length    1.0000000    0.8717538   0.8179411
## Petal.Length    0.8717538    1.0000000   0.9628654
## Petal.Width     0.8179411    0.9628654   1.0000000

# Pairwise Pearson correlations across all four numeric measurements
cor(
  subset(
    iris,
    select = c(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
  )
)

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

Question 2: The main variable selection methods in R programming

1. Stepwise Selection (Sequential Search)

Definition: An automated algorithm that adds or removes variables one by one based on statistical tests or criteria like AIC or BIC.

Role: To quickly simplify an overwhelming list of predictors down to a manageable baseline model.

When to use: Use this when you have a moderate number of predictors (under 30) and need a fast, traditional statistical model where every step is easy to explain.

Example:

# Step 1: start from the model that uses every other mtcars column as a
# predictor of mpg.
full_model <- lm(formula = mpg ~ ., data = mtcars)

# Step 2: let step() add and drop terms in both directions, scoring each
# candidate by AIC; trace = 0 suppresses the per-step log.
step_model <- step(object = full_model, direction = "both", trace = 0)

# Step 3: inspect the variables retained in the final model.
summary(object = step_model)

## 
## Call:
## lm(formula = mpg ~ wt + qsec + am, data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4811 -1.5555 -0.7257  1.4110  4.6610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.6178     6.9596   1.382 0.177915    
## wt           -3.9165     0.7112  -5.507 6.95e-06 ***
## qsec          1.2259     0.2887   4.247 0.000216 ***
## am            2.9358     1.4109   2.081 0.046716 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.459 on 28 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8336 
## F-statistic: 52.75 on 3 and 28 DF,  p-value: 1.21e-11

2. Best Subset Selection (Exhaustive Search)

Definition: A brute-force search that evaluates every possible combination of variables to find the mathematically ideal model.

Role: To guarantee that you are not missing out on the single best combination of a specific model size.

When to use: Use this only when your dataset has a very small number of predictors (fewer than 20) and you want absolute certainty that you found the best subset.

3. LASSO Regression

Definition: A regression technique that adds a mathematical penalty to the size of coefficients, forcing the least important coefficients to become exactly zero.

Role: To perform automated variable selection and prevent overfitting simultaneously.

When to use: Use this when you have more variables than observations (p > n), or when you suspect that only a few variables out of a massive list are actually important.

4. Random Forest Importance

Definition: Trains an ensemble of decision trees; measures each variable’s importance by the mean decrease in impurity (Gini) or accuracy when the variable is permuted.

Role: Model-agnostic screening. Detects non-linear effects and interactions that regression-based methods miss. Good first step before modelling.

R PROGRAMMING ASSIGNMENT

20251MBI032 SSERUMANYA CHARLES JUNIOR

2026-06-04

Question 2: The main variable selection methods in R programming

1. Stepwise Selection (Sequential Search)

2. Best Subset Selection (Exhaustive Search)

3. LASSO Regression

4. Random Forest Importance