Linear Regression Project Assignment #3

1 1. Data Loading
2 2. Data Inspection
3 Check for NA and missing values
4 3. Linear Regression Model Training
5 4. Prediction on Test Data
6 5. Model Evaluation
7 6. Visualizations
8 7. Results Analysis

1 1. Data Loading

# Attempt to read CSV files. If files are not found, generate simulated data.
tryCatch({
  train_df <<- read.csv("train.csv")
  cat("Training dataset successfully loaded from 'train.csv'.\n")
  
  test_df <<- read.csv("test.csv")
  cat("Test dataset successfully loaded from 'test.csv'.\n")
  
}, error = function(e) {
  cat("Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.\n")
  cat("Error Message: ", e$message, "\n")
  
  # Simulate training data
  set.seed(123)
  x_train_sim <- runif(700, min = 0, max = 100)
  y_train_sim <- x_train_sim + rnorm(700, mean = 0, sd = 3)
  train_df <<- data.frame(x = x_train_sim, y = y_train_sim)
  
  # Simulate test data
  x_test_sim <- runif(300, min = 0, max = 100)
  y_test_sim <- x_test_sim + rnorm(300, mean = 0, sd = 3)
  test_df <<- data.frame(x = x_test_sim, y = y_test_sim)
})

## Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.
## Error Message:  cannot open the connection

# Check for existence of columns
if (!("x" %in% names(train_df) && "y" %in% names(train_df))) {
  stop("Error: 'x' or 'y' columns not found in the training dataset. Please check CSV headers.")
}
if (!("x" %in% names(test_df) && "y" %in% names(test_df))) {
  stop("Error: 'x' or 'y' columns not found in the test dataset. Please check CSV headers.")
}

2 2. Data Inspection

# Summary statistics
cat("Training Data Summary:\n")

## Training Data Summary:

summary(train_df)

##        x                  y          
##  Min.   : 0.04653   Min.   : -5.511  
##  1st Qu.:25.36403   1st Qu.: 25.941  
##  Median :48.30350   Median : 48.991  
##  Mean   :49.96801   Mean   : 50.029  
##  3rd Qu.:75.31595   3rd Qu.: 75.542  
##  Max.   :99.94045   Max.   :103.221

cat("\nTest Data Summary:\n")

## 
## Test Data Summary:

summary(test_df)

##        x                 y          
##  Min.   : 0.4991   Min.   : -4.161  
##  1st Qu.:23.5314   1st Qu.: 23.591  
##  Median :48.1335   Median : 48.814  
##  Mean   :49.8442   Mean   : 49.874  
##  3rd Qu.:73.3810   3rd Qu.: 73.666  
##  Max.   :99.9274   Max.   :103.306

# First few rows of each dataset
cat("\nFirst 6 Rows of Training Data:\n")

## 
## First 6 Rows of Training Data:

print(head(train_df))

##          x         y
## 1 28.75775 31.802582
## 2 78.83051 72.852268
## 3 40.89769 39.615854
## 4 88.30174 88.651652
## 5 94.04673 91.367106
## 6  4.55565  5.557359

cat("\nFirst 6 Rows of Test Data:\n")

## 
## First 6 Rows of Test Data:

print(head(test_df))

##          x        y
## 1 62.98418 64.84373
## 2 35.34138 33.06885
## 3 42.47147 45.02604
## 4 96.37688 94.13309
## 5 68.09985 69.99057
## 6 71.84639 75.13637

3 Check for NA and missing values

numberofNA <- sum(is.na(train_df))
if(numberofNA > 0){
  cat('Number of Mising Values Found:', numberofNA)
  cat('\nRemoving missing values...')
  train_df = train_df[complete.cases(train_df),]
}
train_df = train_df[complete.cases(train_df),]

4 3. Linear Regression Model Training

linear_model <- lm(y ~ x, data = train_df)
plot(train_df$x, train_df$y,
     main = "Linear Regression",
     xlab = "x", ylab = "y",
     pch = 16, col = "blue")
abline(linear_model, col = "red", lwd = 2)

summary(linear_model)

## 
## Call:
## lm(formula = y ~ x, data = train_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4824 -2.0372  0.1108  1.9663  8.4878 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.14610    0.23272   0.628     0.53    
## x            0.99830    0.00404 247.127   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.064 on 698 degrees of freedom
## Multiple R-squared:  0.9887, Adjusted R-squared:  0.9887 
## F-statistic: 6.107e+04 on 1 and 698 DF,  p-value: < 2.2e-16

4.1 R-squared Value

r_squared <- summary(linear_model)$r.squared
cat(paste("R-squared:", round(r_squared, 4), "\n"))

## R-squared: 0.9887

5 4. Prediction on Test Data

test_df$predicted_y <- predict(linear_model, newdata = test_df)

6 5. Model Evaluation

residuals <- test_df$y - test_df$predicted_y
RMSE <- sqrt(mean(residuals^2))
MAE <- mean(abs(residuals))

evaluation_metrics <- data.frame(
  Metric = c("RMSE", "MAE"),
  Value = c(round(RMSE, 4), round(MAE, 4))
)
knitr::kable(evaluation_metrics, caption = "Model Evaluation Metrics")

Model Evaluation Metrics
Metric	Value
RMSE	2.9203
MAE	2.3737

7 6. Visualizations

7.1 Training Data & Regression Line

ggplot(train_df, aes(x = x, y = y)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", col = "red", se = FALSE) +
  labs(title = "Training Dataset with Linear Regression Line",
       x = "X", y = "Y") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

7.2 Test Data: Actual vs Predicted

ggplot(test_df, aes(x = x)) +
  geom_point(aes(y = y), alpha = 0.6, color = "blue", size = 2) +
  geom_point(aes(y = predicted_y), alpha = 0.6, color = "red", shape = 4, size = 2) +
  labs(title = "Test Data: Actual vs Predicted Values",
       x = "X", y = "Y") +
  theme_minimal()

7.3 Residuals Plot

ggplot(test_df, aes(x = predicted_y, y = residuals)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals Plot",
       x = "Predicted Y", y = "Residual (Actual Y - Predicted Y)") +
  theme_minimal()

7.4 Actual vs Predicted Y Scatter Plot

ggplot(test_df, aes(x = y, y = predicted_y)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "Actual vs Predicted Y",
       x = "Actual Y", y = "Predicted Y") +
  theme_minimal()

8 7. Results Analysis

The model performs well, with an R-squared value close to 1 and low RMSE/MAE values. This is expected since the data is generated with a linear trend and small noise. The visualizations support this by showing close alignment between actual and predicted values.

This analysis confirms that a simple linear regression is effective for this dataset.