# Attempt to read CSV files. If files are not found, generate simulated data.
tryCatch({
  train_df <<- read.csv("train.csv")
  cat("Training dataset successfully loaded from 'train.csv'.\n")
  test_df <<- read.csv("test.csv")
  cat("Test dataset successfully loaded from 'test.csv'.\n")
}, error = function(e) {
  cat("Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.\n")
  cat("Error Message: ", e$message, "\n")
  # Simulate training data: y is linear in x with small Gaussian noise
  set.seed(123)
  x_train_sim <- runif(700, min = 0, max = 100)
  y_train_sim <- x_train_sim + rnorm(700, mean = 0, sd = 3)
  train_df <<- data.frame(x = x_train_sim, y = y_train_sim)
  # Simulate test data with the same generating process
  x_test_sim <- runif(300, min = 0, max = 100)
  y_test_sim <- x_test_sim + rnorm(300, mean = 0, sd = 3)
  test_df <<- data.frame(x = x_test_sim, y = y_test_sim)
})
## Error: CSV files could not be read (train.csv or test.csv). Generating simulated datasets.
## Error Message: cannot open the connection
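As a lighter-weight alternative (a sketch, not part of the pipeline above), one could check for the files up front with `file.exists()` and fall back to simulation only when they are missing, reserving `tryCatch()` for genuinely unexpected read errors:

```r
# Sketch: explicit existence check before reading (assumes the same
# train.csv / test.csv file names used above)
if (file.exists("train.csv") && file.exists("test.csv")) {
  train_df <- read.csv("train.csv")
  test_df  <- read.csv("test.csv")
} else {
  message("CSV files not found; falling back to simulated data.")
}
```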
# Check for existence of columns
if (!("x" %in% names(train_df) && "y" %in% names(train_df))) {
stop("Error: 'x' or 'y' columns not found in the training dataset. Please check CSV headers.")
}
if (!("x" %in% names(test_df) && "y" %in% names(test_df))) {
stop("Error: 'x' or 'y' columns not found in the test dataset. Please check CSV headers.")
}
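A more general variant of this check (a sketch; `required_cols` and `missing_cols` are introduced here purely for illustration) reports exactly which columns are absent via `setdiff()`:

```r
# Sketch: report all missing required columns at once
required_cols <- c("x", "y")
missing_cols <- setdiff(required_cols, names(train_df))
if (length(missing_cols) > 0) {
  stop("Missing columns in training data: ", paste(missing_cols, collapse = ", "))
}
```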
# Summary statistics
cat("Training Data Summary:\n")
## Training Data Summary:
summary(train_df)
## x y
## Min. : 0.04653 Min. : -5.511
## 1st Qu.:25.36403 1st Qu.: 25.941
## Median :48.30350 Median : 48.991
## Mean :49.96801 Mean : 50.029
## 3rd Qu.:75.31595 3rd Qu.: 75.542
## Max. :99.94045 Max. :103.221
cat("\nTest Data Summary:\n")
##
## Test Data Summary:
summary(test_df)
## x y
## Min. : 0.4991 Min. : -4.161
## 1st Qu.:23.5314 1st Qu.: 23.591
## Median :48.1335 Median : 48.814
## Mean :49.8442 Mean : 49.874
## 3rd Qu.:73.3810 3rd Qu.: 73.666
## Max. :99.9274 Max. :103.306
# First few rows of each dataset
cat("\nFirst 6 Rows of Training Data:\n")
##
## First 6 Rows of Training Data:
print(head(train_df))
## x y
## 1 28.75775 31.802582
## 2 78.83051 72.852268
## 3 40.89769 39.615854
## 4 88.30174 88.651652
## 5 94.04673 91.367106
## 6 4.55565 5.557359
cat("\nFirst 6 Rows of Test Data:\n")
##
## First 6 Rows of Test Data:
print(head(test_df))
## x y
## 1 62.98418 64.84373
## 2 35.34138 33.06885
## 3 42.47147 45.02604
## 4 96.37688 94.13309
## 5 68.09985 69.99057
## 6 71.84639 75.13637
numberofNA <- sum(is.na(train_df))
if (numberofNA > 0) {
  cat("Number of Missing Values Found:", numberofNA)
  cat("\nRemoving missing values...\n")
  # Keep only rows with no missing values
  train_df <- train_df[complete.cases(train_df), ]
}
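For reference, base R's `na.omit()` performs the same row-wise removal in a single call; this is an equivalent alternative, not what the chunk above uses:

```r
# Equivalent one-liner: drop any row containing an NA
train_df <- na.omit(train_df)
```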
linear_model <- lm(y ~ x, data = train_df)
plot(train_df$x, train_df$y,
     main = "Linear Regression",
     xlab = "x", ylab = "y",
     pch = 16, col = "blue")
abline(linear_model, col = "red", lwd = 2)
summary(linear_model)
##
## Call:
## lm(formula = y ~ x, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4824 -2.0372 0.1108 1.9663 8.4878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.14610 0.23272 0.628 0.53
## x 0.99830 0.00404 247.127 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.064 on 698 degrees of freedom
## Multiple R-squared: 0.9887, Adjusted R-squared: 0.9887
## F-statistic: 6.107e+04 on 1 and 698 DF, p-value: < 2.2e-16
r_squared <- summary(linear_model)$r.squared
cat(paste("R-squared:", round(r_squared, 4), "\n"))
## R-squared: 0.9887
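As a sanity check (a sketch assuming the `linear_model` and `train_df` objects above), R-squared can be recomputed from its definition, one minus the ratio of the residual sum of squares to the total sum of squares:

```r
# Recompute R-squared from first principles on the training data
ss_res <- sum(residuals(linear_model)^2)
ss_tot <- sum((train_df$y - mean(train_df$y))^2)
cat("Manual R-squared:", round(1 - ss_res / ss_tot, 4), "\n")  # should match 0.9887
```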
test_df$predicted_y <- predict(linear_model, newdata = test_df)
# Store residuals as a column of test_df so later plots can reference them directly
test_df$residuals <- test_df$y - test_df$predicted_y
RMSE <- sqrt(mean(test_df$residuals^2))
MAE <- mean(abs(test_df$residuals))
evaluation_metrics <- data.frame(
  Metric = c("RMSE", "MAE"),
  Value = c(round(RMSE, 4), round(MAE, 4))
)
knitr::kable(evaluation_metrics, caption = "Model Evaluation Metrics")
| Metric | Value |
|---|---|
| RMSE | 2.9203 |
| MAE | 2.3737 |
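For reference, the two metrics reported above are defined over the $n$ test observations as:

$$
\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2},
\qquad
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i - \hat{y}_i\right|
$$

where $y_i$ is the actual value and $\hat{y}_i$ the prediction. RMSE weights large errors more heavily than MAE; here the test RMSE of 2.92 sits close to the simulation noise level (sd = 3), which is roughly the best a correctly specified model can achieve.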
library(ggplot2)
ggplot(train_df, aes(x = x, y = y)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", col = "red", se = FALSE) +
  labs(title = "Training Dataset with Linear Regression Line",
       x = "X", y = "Y") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(test_df, aes(x = x)) +
  geom_point(aes(y = y), alpha = 0.6, color = "blue", size = 2) +
  geom_point(aes(y = predicted_y), alpha = 0.6, color = "red", shape = 4, size = 2) +
  labs(title = "Test Data: Actual vs Predicted Values",
       x = "X", y = "Y") +
  theme_minimal()
ggplot(test_df, aes(x = predicted_y, y = residuals)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals Plot",
       x = "Predicted Y", y = "Residual (Actual Y - Predicted Y)") +
  theme_minimal()
ggplot(test_df, aes(x = y, y = predicted_y)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "Actual vs Predicted Y",
       x = "Actual Y", y = "Predicted Y") +
  theme_minimal()
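Beyond the scatter plots, a quick normality check of the test-set residuals (a sketch using base R graphics, assuming the `test_df$residuals` column computed earlier) would round out these diagnostics:

```r
# Normal Q-Q plot: points near the reference line suggest approximately normal residuals
qqnorm(test_df$residuals, main = "Q-Q Plot of Test Residuals")
qqline(test_df$residuals, col = "red", lwd = 2)
```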
The model performs well: the training R-squared is 0.9887 and the test-set errors are low (RMSE = 2.92, MAE = 2.37). This is expected, since the data were generated from a linear trend with small Gaussian noise (sd = 3), and the test RMSE is close to that noise level. The visualizations support this: the fitted line tracks the training points closely, the residuals scatter evenly around zero, and the actual-versus-predicted points fall along the 45-degree reference line.
This analysis confirms that simple linear regression is an effective model for this dataset.