Linear Regression Project Assignment #3

1 1. Data Loading
2 2. Data Inspection
3 Check for NA and missing values
4 3. Linear Regression Model Training
5 4. Prediction on Test Data
6 5. Model Evaluation
7 6. Visualizations
8 7. Results Analysis

1 1. Data Loading

# Load the training and test sets.
#
# Both CSV files are read inside a single tryCatch(). If either read fails,
# BOTH datasets are replaced by simulated data (y = x + Gaussian noise), so
# the two frames always come from the same source. The datasets are returned
# from tryCatch() as a list and assigned with `<-`, instead of being pushed
# into an enclosing environment with `<<-` from inside the handler.
datasets <- tryCatch(
  {
    loaded <- list(train = read.csv("train.csv"))
    cat("Training dataset successfully loaded from 'train.csv'.\n")

    loaded$test <- read.csv("test.csv")
    cat("Test dataset successfully loaded from 'test.csv'.\n")

    loaded
  },
  error = function(e) {
    cat("Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.\n")
    # conditionMessage() is the accessor for a condition's message text.
    cat("Error Message: ", conditionMessage(e), "\n")

    # Fixed seed so the simulated data (and every number derived from it)
    # is reproducible. The order of the random draws below is significant.
    set.seed(123)

    # Simulate training data
    x_train_sim <- runif(700, min = 0, max = 100)
    y_train_sim <- x_train_sim + rnorm(700, mean = 0, sd = 3)

    # Simulate test data
    x_test_sim <- runif(300, min = 0, max = 100)
    y_test_sim <- x_test_sim + rnorm(300, mean = 0, sd = 3)

    list(
      train = data.frame(x = x_train_sim, y = y_train_sim),
      test = data.frame(x = x_test_sim, y = y_test_sim)
    )
  }
)

train_df <- datasets$train
test_df <- datasets$test

## Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.
## Error Message:  cannot open the connection

# Both datasets must expose a predictor column 'x' and a response column 'y';
# abort early with a descriptive error if either one is absent.
if (length(setdiff(c("x", "y"), names(train_df))) > 0) {
  stop("Error: 'x' or 'y' columns not found in the training dataset. Please check CSV headers.")
}
if (length(setdiff(c("x", "y"), names(test_df))) > 0) {
  stop("Error: 'x' or 'y' columns not found in the test dataset. Please check CSV headers.")
}

2 2. Data Inspection

# Summary statistics
# Print a header line, then the per-column summary of the training data.
# summary() is called at top level, so its result is auto-printed.
cat("Training Data Summary:\n")

## Training Data Summary:

summary(train_df)

##        x                  y          
##  Min.   : 0.04653   Min.   : -5.511  
##  1st Qu.:25.36403   1st Qu.: 25.941  
##  Median :48.30350   Median : 48.991  
##  Mean   :49.96801   Mean   : 50.029  
##  3rd Qu.:75.31595   3rd Qu.: 75.542  
##  Max.   :99.94045   Max.   :103.221

# Same report for the test data; the leading "\n" in the header separates it
# from the training summary printed before it.
cat("\nTest Data Summary:\n")

## 
## Test Data Summary:

summary(test_df)

##        x                 y          
##  Min.   : 0.4991   Min.   : -4.161  
##  1st Qu.:23.5314   1st Qu.: 23.591  
##  Median :48.1335   Median : 48.814  
##  Mean   :49.8442   Mean   : 49.874  
##  3rd Qu.:73.3810   3rd Qu.: 73.666  
##  Max.   :99.9274   Max.   :103.306

# First few rows of each dataset
# head() returns the first six rows by default, matching the header text.
cat("\nFirst 6 Rows of Training Data:\n")

## 
## First 6 Rows of Training Data:

print(head(train_df))

##          x         y
## 1 28.75775 31.802582
## 2 78.83051 72.852268
## 3 40.89769 39.615854
## 4 88.30174 88.651652
## 5 94.04673 91.367106
## 6  4.55565  5.557359

# Preview of the test data: header line followed by its first six rows.
cat("\nFirst 6 Rows of Test Data:\n")

## 
## First 6 Rows of Test Data:

print(head(test_df))

##          x        y
## 1 62.98418 64.84373
## 2 35.34138 33.06885
## 3 42.47147 45.02604
## 4 96.37688 94.13309
## 5 68.09985 69.99057
## 6 71.84639 75.13637

3 Check for NA and missing values

# Remove rows containing missing values from BOTH datasets before modelling.
# With R's default NA handling, lm() drops incomplete training rows on its own,
# but predict() yields NA for incomplete test rows, and a single NA residual
# turns mean()-based metrics such as RMSE and MAE into NA.
numberofNA <- sum(is.na(train_df))
if (numberofNA > 0) {
  cat("Number of Missing Values Found:", numberofNA)
  cat("\nRemoving missing values...\n")
  train_df <- train_df[complete.cases(train_df), ]
}

numberofNA_test <- sum(is.na(test_df))
if (numberofNA_test > 0) {
  cat("Number of Missing Values Found in Test Data:", numberofNA_test)
  cat("\nRemoving missing values...\n")
  test_df <- test_df[complete.cases(test_df), ]
}

4 3. Linear Regression Model Training

# Fit a simple linear regression of y on x using the training data.
linear_model <- lm(y ~ x, data = train_df)

# Base-graphics scatter plot of the training points with the fitted line
# drawn on top of it.
with(
  train_df,
  plot(
    x, y,
    pch = 16, col = "blue",
    xlab = "x", ylab = "y",
    main = "Linear Regression"
  )
)
abline(linear_model, lwd = 2, col = "red")

# Coefficient table, residual quantiles and goodness-of-fit statistics.
summary(linear_model)

## 
## Call:
## lm(formula = y ~ x, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4824 -2.0372  0.1108  1.9663  8.4878 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.14610    0.23272   0.628     0.53    
## x            0.99830    0.00404 247.127   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.064 on 698 degrees of freedom
## Multiple R-squared:  0.9887, Adjusted R-squared:  0.9887 
## F-statistic: 6.107e+04 on 1 and 698 DF,  p-value: < 2.2e-16

4.1 R-squared Value

# Pull the coefficient of determination out of the model summary and report
# it rounded to four decimal places.
r_squared <- summary(linear_model)[["r.squared"]]
cat(
  paste("R-squared:", round(r_squared, 4), "\n")
)

## R-squared: 0.9887

5 4. Prediction on Test Data

# Score the test set with the fitted model and store the predictions next to
# the observed values in a new 'predicted_y' column.
test_df[["predicted_y"]] <- predict(linear_model, newdata = test_df)

6 5. Model Evaluation

# Prediction errors on the test set (actual minus predicted).
residuals <- with(test_df, y - predicted_y)

# Root mean squared error and mean absolute error of those residuals.
RMSE <- sqrt(mean(residuals^2))
MAE <- mean(abs(residuals))

# Collect both metrics, rounded to four decimals, and render them as a table.
evaluation_metrics <- data.frame(
  Metric = c("RMSE", "MAE"),
  Value = round(c(RMSE, MAE), 4)
)
knitr::kable(evaluation_metrics, caption = "Model Evaluation Metrics")

Model Evaluation Metrics
Metric	Value
RMSE	2.9203
MAE	2.3737

7 6. Visualizations

7.1 Training Data & Regression Line

# Training observations (blue points) with the least-squares line (red)
# fitted by geom_smooth(); the confidence band is switched off.
ggplot(data = train_df, mapping = aes(x = x, y = y)) +
  geom_point(colour = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  ggtitle("Training Dataset with Linear Regression Line") +
  xlab("X") +
  ylab("Y") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

7.2 Test Data: Actual vs Predicted

# Test set: observed responses (blue dots) and model predictions (red crosses)
# plotted against the same x values.
ggplot(data = test_df, mapping = aes(x = x)) +
  geom_point(
    mapping = aes(y = y),
    colour = "blue", size = 2, alpha = 0.6
  ) +
  geom_point(
    mapping = aes(y = predicted_y),
    colour = "red", shape = 4, size = 2, alpha = 0.6
  ) +
  ggtitle("Test Data: Actual vs Predicted Values") +
  xlab("X") +
  ylab("Y") +
  theme_minimal()

7.3 Residuals Plot

# Residuals (actual minus predicted) against the predicted values, with a
# dashed reference line at zero.
# The residual is computed from the columns of test_df inside aes() rather
# than read from a separate global `residuals` vector: that keeps each point
# tied to its own row and avoids resolving `residuals` to stats::residuals()
# when the vector has not been created.
ggplot(test_df, aes(x = predicted_y, y = y - predicted_y)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals Plot",
       x = "Predicted Y", y = "Residual (Actual Y - Predicted Y)") +
  theme_minimal()

7.4 Actual vs Predicted Y Scatter Plot

# Predicted against actual responses on the test set; points on the dashed
# y = x reference line are perfect predictions.
ggplot(data = test_df, mapping = aes(x = y, y = predicted_y)) +
  geom_point(alpha = 0.6, colour = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, colour = "black", linetype = "dashed") +
  ggtitle("Actual vs Predicted Y") +
  xlab("Actual Y") +
  ylab("Predicted Y") +
  theme_minimal()

8 7. Results Analysis

The linear regression model demonstrates excellent performance on the simulated dataset. The following insights are drawn from the output and visualizations:

Model Fit Quality (R-squared): The model yields an R² value of approximately 0.9887, indicating that nearly all of the variability in the dependent variable is explained by the independent variable.

Residuals Plot: The residuals are distributed evenly around zero with no discernible patterns, suggesting the model satisfies the assumptions of linearity and constant variance (homoscedasticity).

Actual vs Predicted Plot: The predicted values closely follow the actual values, forming a near-linear pattern along the y = x line. This visual alignment confirms the accuracy of the model’s predictions.

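Evaluation Metrics: The model achieves low RMSE (≈ 2.92) and MAE (≈ 2.37) values on the test set. Relative to the 0–100 range of y, this means the average prediction errors are small, and the RMSE is close to the standard deviation of the simulated noise (3).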

This high accuracy is expected, given that the data was synthetically generated to follow a linear trend with only small random noise added.

Interpretation & Generalization: Since the dataset was generated as y = x + ε, where ε is Gaussian random noise with mean 0 and standard deviation 3, the model’s strong performance (slope ≈ 1, intercept ≈ 0) is in line with expectations.

In real-world scenarios, additional considerations are necessary:

Cross-validation should be used to assess the model’s generalizability.

Assumptions such as normality of residuals, absence of multicollinearity (when more than one predictor is used), and no influential outliers should be tested.

For more complex datasets, feature engineering or regularization may be required.