# Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(broom)

# Read the merged dataset from CSV into a data frame used by the model below.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# before running on another machine.
data <- read.csv(file = "C:/Users/aiden/OneDrive/mergedfile.csv")

Create Model

Refer to the simple linear regression model you built last week. Add 1-3 more variables to your regression model.

# Create enhanced linear model with additional variables and an interaction term.
# Response: Adj.Close. Predictors: Revenuegrowth, Marketcap, Volume, plus the
# Revenuegrowth:Marketcap interaction (the effect of revenue growth is allowed
# to vary with market cap). Rows with missing values in the model variables are
# dropped from the fit (see the "deleted due to missingness" line in the output).
enhanced_model <- lm(Adj.Close ~ Revenuegrowth + Marketcap + Volume + Revenuegrowth:Marketcap, data = data)

# Summary of the enhanced model: residual quantiles, coefficient table,
# residual standard error, R-squared and the overall F-test.
summary(enhanced_model)
## 
## Call:
## lm(formula = Adj.Close ~ Revenuegrowth + Marketcap + Volume + 
##     Revenuegrowth:Marketcap, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -195.3  -76.2  -51.9   -3.2 4006.0 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.052e+02  4.500e-01  233.71   <2e-16 ***
## Revenuegrowth            6.160e+01  3.799e+00   16.21   <2e-16 ***
## Marketcap                7.003e-11  1.993e-12   35.13   <2e-16 ***
## Volume                  -6.520e-07  1.355e-08  -48.13   <2e-16 ***
## Revenuegrowth:Marketcap -5.833e-10  1.823e-11  -32.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 232.2 on 339530 degrees of freedom
##   (12562 observations deleted due to missingness)
## Multiple R-squared:  0.008589,   Adjusted R-squared:  0.008578 
## F-statistic: 735.4 on 4 and 339530 DF,  p-value: < 2.2e-16

Evaluate

Evaluate this model.

# Draw the five standard lm diagnostic panels one at a time. The position of
# each title in this vector is the `which` code passed to plot():
#   1 = Residuals vs Fitted, 2 = Normal Q-Q, 3 = Scale-Location,
#   4 = Cook's Distance,     5 = Residuals vs Leverage
diagnostic_titles <- c(
  "Residuals vs Fitted",
  "Normal Q-Q",
  "Scale-Location",
  "Cook's Distance",
  "Residuals vs Leverage"
)

for (panel in seq_along(diagnostic_titles)) {
  plot(enhanced_model, which = panel, main = diagnostic_titles[[panel]])
}

Summary

  1. Residuals vs Fitted Plot: Checks for non-linearity and variance consistency.

    • Interpretation: This plot reveals a clear funnel shape, indicating heteroscedasticity. Residuals do not have constant variance across the range of fitted values, which violates the homoscedasticity assumption.

    • Implication: The model’s residuals have varying spread, suggesting that predictions may be more reliable for some fitted values than others. This issue can affect the accuracy of confidence intervals and p-values.

    • Recommendation: Consider transforming the response variable (e.g., log or square root transformation) or using weighted regression to address heteroscedasticity.

  2. Normal Q-Q Plot: Assesses normality of residuals.

    • Interpretation: The Q-Q plot shows that residuals deviate significantly from the theoretical normal line, especially at the tails. This indicates that the residuals are not normally distributed, which violates the normality assumption.

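    • Implication: Non-normality of residuals can impact hypothesis testing, making confidence intervals and p-values less reliable. With roughly 340,000 observations the coefficient tests are fairly robust to non-normality on their own, but the strongly right-skewed residuals (median about −52, maximum about 4,006) mean that prediction intervals, which do rely on normal errors, should not be trusted.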

    • Recommendation: Consider robust regression techniques or transforming the response variable to improve the normality of residuals.

  3. Scale-Location Plot: Examines homoscedasticity (equal variance).

    • Interpretation: This plot shows an increase in the spread of residuals as fitted values increase, further confirming heteroscedasticity.

    • Implication: Heteroscedasticity means that the residual variance changes with the level of the fitted values, which can lead to biased standard errors and unreliable significance tests.

    • Recommendation: Similar to the Residuals vs Fitted plot, a transformation of the response variable or weighted regression could help stabilize the variance.

  4. Cook’s Distance Plot: Identifies influential data points.

    • Interpretation: A few observations have high Cook’s distance, indicating they have a significant influence on the model’s parameters. These points could be outliers or high-leverage points that disproportionately affect the regression results.

    • Implication: Influential points can distort model estimates, making it crucial to assess their validity or consider alternative methods to reduce their impact.

    • Recommendation: Investigate these influential points to determine if they are legitimate data points or errors. If they are valid data, consider using robust regression methods that are less sensitive to outliers.

  5. Residuals vs Leverage Plot: Detects points that disproportionately influence the regression fit.

    • Interpretation: The plot shows several high-leverage points, some of which are outside the Cook’s distance boundary. These high-leverage points can have a significant impact on the model’s fit and can indicate potential outliers.

    • Implication: High-leverage points with substantial residuals are problematic as they can skew the model’s parameters.

    • Recommendation: Consider examining these high-leverage observations more closely. Depending on their validity, either adjust the model to mitigate their effect or apply robust regression techniques.

    # Set up output directory
    output_dir <- "C:/Users/aiden/OneDrive/"  # Replace with your desired directory

    # Save each of the five lm diagnostic panels as its own JPEG in output_dir.
    # Names are the output file stems; values are the plot titles. The position
    # of each entry is the `which` code passed to plot():
    #   1 = Residuals vs Fitted, 2 = Normal Q-Q, 3 = Scale-Location,
    #   4 = Cook's Distance,     5 = Residuals vs Leverage
    diagnostic_plots <- c(
      residuals_vs_fitted = "Residuals vs Fitted",
      normal_qq = "Normal Q-Q",
      scale_location = "Scale-Location",
      cooks_distance = "Cook's Distance",
      residuals_vs_leverage = "Residuals vs Leverage"
    )

    for (panel in seq_along(diagnostic_plots)) {
      # file.path() inserts the separator itself, so the result does not depend
      # on whether output_dir ends in a slash.
      plot_file <- file.path(
        output_dir,
        paste0(names(diagnostic_plots)[panel], ".jpg")
      )

      jpeg(filename = plot_file)
      # Close the device even if plot() fails; otherwise the JPEG device stays
      # open and later plots are silently written into this file.
      tryCatch(
        plot(enhanced_model, which = panel, main = diagnostic_plots[[panel]]),
        finally = dev.off()
      )
    }