#Load dataset

library(readr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

eci <- read_csv("eci.csv")

## Rows: 274 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): County, CSCS, CSFA, TS, PPSC, TPPS
## dbl (1): B3P
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

cleaned_data <- na.omit(eci)

#Dependent and Independent variables

dep_var <- cleaned_data$CSCS
ind_var <- cleaned_data$TS

cleaned_data$CSCS<-as.numeric(cleaned_data$CSCS)

## Warning: NAs introduced by coercion

cleaned_data$TS<-as.numeric(cleaned_data$TS)

## Warning: NAs introduced by coercion

model<-lm(CSCS~TS, data=cleaned_data)

#Linearity (plot and raintest)

# Plot for linearity
plot(cleaned_data$TS, cleaned_data$CSCS, main = "Scatter Plot of TS vs CSCS",
     xlab = "TS", ylab = "CSCS")
abline(model, col = "red")

if (!require(lmtest)) {
  install.packages("lmtest")
  library(lmtest)
}

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# Rainbow test for linearity
raintest(model)

## 
##  Rainbow test
## 
## data:  model
## Rain = 69.954, df1 = 3, df2 = 1, p-value = 0.08761

Independence of errors (durbin-watson)

if (!require(car)) {
  install.packages("car")
  library(car)
}

## Loading required package: car

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

# Durbin-Watson test
durbinWatsonTest(model)

##  lag Autocorrelation D-W Statistic p-value
##    1      0.03864063      1.770235   0.594
##  Alternative hypothesis: rho != 0

#Homoscedasticity (plot, bptest)

plot(model$fitted.values, model$residuals, main = "Residuals vs Fitted Values",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")

bptest(model)

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.209, df = 1, p-value = 0.2715

#Normality of residuals (QQ plot, shapiro test)

qqnorm(model$residuals)
qqline(model$residuals, col = "red")

shapiro.test(model$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  model$residuals
## W = 0.96987, p-value = 0.8916

#No multicolinarity (VIF, cor)

# Variance Inflation Factor (VIF)
if (!require(car)) {
  install.packages("car")
  library(car)
}

# Correlation between TS and CSCS
cor(cleaned_data$TS, cleaned_data$CSCS)

## [1] NA

#With the variables I was using VIF did not apply using B3P and CSCS because CSCS is the dependent variable and B3P would be the only independent variable. 

# I converted possible predictors to numeric  
cleaned_data$B3P <- as.numeric(cleaned_data$B3P)
cleaned_data$PPSC <- as.numeric(cleaned_data$PPSC)

## Warning: NAs introduced by coercion

cleaned_data$TPPS <- as.numeric(cleaned_data$TPPS)

## Warning: NAs introduced by coercion

str(cleaned_data)

## tibble [60 × 7] (S3: tbl_df/tbl/data.frame)
##  $ County: chr [1:60] "Andrews" "Archer" "Bailey" "Bexar" ...
##  $ B3P   : num [1:60] 1703 400 513 130524 511 ...
##  $ CSCS  : num [1:60] 78 15 41 7721 13 ...
##  $ CSFA  : chr [1:60] "*" "*" "*" "40" ...
##  $ TS    : num [1:60] NA NA NA NA NA NA 120 NA NA NA ...
##  $ PPSC  : num [1:60] NA NA NA NA NA NA NA NA NA NA ...
##  $ TPPS  : num [1:60] NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "na.action")= 'omit' Named int [1:214] 1 3 4 6 7 8 10 11 12 13 ...
##   ..- attr(*, "names")= chr [1:214] "1" "3" "4" "6" ...

# Multiple linear regression model with two predictors
model_multiple <- lm(CSCS ~ TS + B3P, data = cleaned_data)

# Calculated VIF
if (!require(car)) {
  install.packages("car")
  library(car)
}
vif(model_multiple)

##       TS      B3P 
## 21.05824 21.05824

Does Your Model Meet Those Assumptions? The model checks several assumptions necessary for a linear regression to be valid:

Linearity: The scatter plot of TS vs. CSCS and the fitted line indicates that the relationship between TS and CSCS is reasonably linear. This suggests that the linearity assumption is likely met.

Independence of Errors: The Durbin-Watson statistic is close to 2, which typically indicates that the errors are independent. This means the independence of errors assumption is likely met.

Homoscedasticity: The residual plot shows a potential spread in residuals, which might suggest heteroscedasticity (unequal variance). This was confirmed by the Breusch-Pagan test, which yielded a significant p-value, indicating that the homoscedasticity assumption is not met.

Normality of Residuals: The QQ plot shows deviations from normality, and the Shapiro-Wilk test has a very low p-value, indicating non-normal residuals. Therefore, the normality assumption is also not met.

No Multicollinearity: Since I only have one predictor (TS) in the main model, multicollinearity isn’t a concern. If I add B3P as a second predictor, I can calculate the VIF to ensure it’s below 5, which would indicate no significant multicollinearity.

The linearity and independence of errors are well-supported based on the plots and tests we’ve conducted. However, homoscedasticity and normality of residuals do not meet the model’s assumptions.

While a violation in homoscedasticity or normality may affect some interpretations, these violations do not necessarily invalidate the model’s predictions.

Which Assumption is Violated? The model violates:

Homoscedasticity: as indicated by the Breusch-Pagan test. Normality of Residuals: based on the Shapiro-Wilk test and QQ plot.

Mitigating Violated Assumptions To address heteroscedasticity and non-normal residuals,trying a log transformation of the dependent variable (CSCS). This transformation often stabilizes variance (helping with homoscedasticity) and makes the residuals more normally distributed.

# log transformation to the dependent variable
cleaned_data$log_CSCS <- log(cleaned_data$CSCS + 1)  # Add 1 to handle any zero values

# transformed dependent variable
model_log <- lm(log_CSCS ~ TS, data = cleaned_data)

# Summary of the new model
summary(model_log)

## 
## Call:
## lm(formula = log_CSCS ~ TS, data = cleaned_data)
## 
## Residuals:
##       7      16      28      30      31      38 
##  0.4983 -0.3559  0.5126 -0.1514 -0.3570 -0.1466 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.6023225  0.2388440  15.082 0.000113 ***
## TS          0.0046900  0.0008963   5.232 0.006373 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4499 on 4 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.8725, Adjusted R-squared:  0.8407 
## F-statistic: 27.38 on 1 and 4 DF,  p-value: 0.006373

# Homoscedasticity: Residual plot
plot(model_log$fitted.values, model_log$residuals, main = "Residuals vs Fitted Values (Log Transformed)",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red")

# Normality: QQ Plot
qqnorm(model_log$residuals)
qqline(model_log$residuals, col = "red")

# Shapiro-Wilk Test
shapiro.test(model_log$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  model_log$residuals
## W = 0.78921, p-value = 0.0469

Paola_PAD 6833_Homework 7

2024-11-06

Independence of errors (durbin-watson)