Load Data

Check the data frame for NA values, etc.

# Confirm the data set is complete: total the per-column NA counts
sum(colSums(is.na(pcancer)))
## [1] 0
  1. Explore data
# Raw scale - PSA is right-skewed (long upper tail); a log transform only
# helps with this kind of skew, which is why it is applied below
ggplot(pcancer, aes(x = PSA)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of (PSA)",
       x = "PSA", y = "Frequency")

# Log scale - log(PSA) compresses the upper tail and reduces the skewness
ggplot(pcancer, aes(x = log(PSA))) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of log(PSA)",
       # The x axis shows the transformed values, so it is labelled log(PSA)
       x = "log(PSA)", y = "Frequency")

2. Initial model fit, comparing the model with and without the log transform of PSA:

# Baseline for comparison: multiple linear regression with untransformed PSA
# as the response and all seven predictors
m <- lm(
  PSA ~ Cancer_Vol + Weight + Age + Hyperplasia + Seminal + Capsular + Score,
  data = pcancer
)
# Coefficient table and overall fit statistics for the baseline model
summary(m)
## 
## Call:
## lm(formula = PSA ~ Cancer_Vol + Weight + Age + Hyperplasia + 
##     Seminal + Capsular + Score, data = pcancer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.330  -8.130  -0.014   6.324 167.436 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.24264   40.53932  -0.376 0.707814    
## Cancer_Vol    2.03225    0.59359   3.424 0.000936 ***
## Weight        0.01132    0.07395   0.153 0.878708    
## Age          -0.53721    0.47588  -1.129 0.261977    
## Hyperplasia   1.29831    1.20168   1.080 0.282878    
## Seminal      19.60957   10.89184   1.800 0.075187 .  
## Capsular      1.09877    1.33377   0.824 0.412253    
## Score         7.05922    5.19452   1.359 0.177589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.17 on 89 degrees of freedom
## Multiple R-squared:  0.4585, Adjusted R-squared:  0.4159 
## F-statistic: 10.77 on 7 and 89 DF,  p-value: 9.266e-10
# Keep the untransformed-PSA fit under its own name before `m` is reused
psa_m <- m
# Refit the same seven-predictor model with log(PSA) as the response;
# after this line `m` holds the log-scale model
m <- lm(
  log(PSA) ~ Cancer_Vol + Weight + Age + Hyperplasia + Seminal + Capsular +
    Score,
  data = pcancer
)
# Coefficient table and overall fit statistics for the log(PSA) model
summary(m)
## 
## Call:
## lm(formula = log(PSA) ~ Cancer_Vol + Weight + Age + Hyperplasia + 
##     Seminal + Capsular + Score, data = pcancer)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88309 -0.46629  0.08045  0.47380  1.53219 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.685796   0.998754  -0.687  0.49409    
## Cancer_Vol   0.069454   0.014624   4.749 7.77e-06 ***
## Weight       0.001380   0.001822   0.757  0.45079    
## Age         -0.002799   0.011724  -0.239  0.81186    
## Hyperplasia  0.087470   0.029605   2.955  0.00401 ** 
## Seminal      0.782623   0.268339   2.917  0.00448 ** 
## Capsular    -0.026521   0.032860  -0.807  0.42177    
## Score        0.358153   0.127976   2.799  0.00629 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7679 on 89 degrees of freedom
## Multiple R-squared:  0.5893, Adjusted R-squared:  0.557 
## F-statistic: 18.24 on 7 and 89 DF,  p-value: 7.694e-15
# Compare the two fits with AIC (lower is better).
# AIC is only comparable between models of the SAME response. `m` is fitted to
# log(PSA), so its likelihood has to be expressed on the PSA scale first. By
# change of variables, logLik_PSA = logLik_log - sum(log(PSA)), hence
# AIC_PSA = AIC_log + 2 * sum(log(PSA)).
aic_tbl <- AIC(psa_m, m)
# model.response() returns the log(PSA) values actually used to fit `m`
log_psa_used <- model.response(model.frame(m))
aic_tbl["m", "AIC"] <- aic_tbl["m", "AIC"] + 2 * sum(log_psa_used)
aic_tbl
  3. Predictor Visualization
# Scatter plots of log(PSA) against the continuous predictors, each with a
# least-squares line (no confidence band)
ggplot(data = pcancer, mapping = aes(x = Cancer_Vol, y = log(PSA))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Cancer Volume Vs. PSA Levels Scatter Plot")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data = pcancer, mapping = aes(x = Hyperplasia, y = log(PSA))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Hyperplasia Vs. PSA Levels Scatter Plot")
## `geom_smooth()` using formula = 'y ~ x'

# Box plots ----
# Score and Seminal (binary) are wrapped in factor() so that each distinct
# value gets its own box of log(PSA)
ggplot(pcancer, aes(x = factor(Score), y = log(PSA))) +
  geom_boxplot() +
  labs(
    title = "Score by PSA Levels",
    x = "Score",
    y = "log(PSA)"
  )

# The title names Seminal, the variable actually plotted on the x axis
ggplot(pcancer, aes(x = factor(Seminal), y = log(PSA))) +
  geom_boxplot() +
  labs(
    title = "Seminal by PSA Levels",
    x = "Seminal",
    y = "log(PSA)"
  )

  1. Residuals = Observed log(PSA) - predicted log(PSA)

  2. Residual vs fitted

  3. QQplot

# Observed vs predicted log(PSA); the reference line y = x marks perfect
# prediction
ggplot(data = pcancer, mapping = aes(x = fitted(m), y = log(PSA))) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1) +
  ggtitle("Actual vs Predicted")

# Residuals vs fitted values: checks linearity and constant variance
plot(m, which = 1)

# Normal Q-Q plot: checks normality of the residuals
plot(m, which = 2)

5. Correlation Analysis

# 95% confidence intervals for the coefficients of the log(PSA) model
confint(m)
##                    2.5 %      97.5 %
## (Intercept) -2.670297533 1.298706374
## Cancer_Vol   0.040396030 0.098511273
## Weight      -0.002240112 0.005000027
## Age         -0.026094332 0.020496499
## Hyperplasia  0.028644101 0.146294972
## Seminal      0.249440301 1.315806120
## Capsular    -0.091812239 0.038770721
## Score        0.103868056 0.612437985
# Pairwise correlation of selected predictors with log(PSA)
with(pcancer, cor(Cancer_Vol, log(PSA)))
## [1] 0.6570739
with(pcancer, cor(Score, log(PSA)))
## [1] 0.5390167
with(pcancer, cor(Seminal, log(PSA)))
## [1] 0.5663641
with(pcancer, cor(Hyperplasia, log(PSA)))
## [1] 0.1574016
# Correlation matrix of untransformed PSA and the same predictors
cor(pcancer[c("PSA", "Cancer_Vol", "Hyperplasia", "Seminal", "Score")])
##                     PSA Cancer_Vol Hyperplasia    Seminal      Score
## PSA          1.00000000  0.6241506 -0.01648649  0.5286188 0.42957975
## Cancer_Vol   0.62415059  1.0000000 -0.13320943  0.5817417 0.48143840
## Hyperplasia -0.01648649 -0.1332094  1.00000000 -0.1195532 0.02682555
## Seminal      0.52861878  0.5817417 -0.11955319  1.0000000 0.42857348
## Score        0.42957975  0.4814384  0.02682555  0.4285735 1.00000000
  6. Model comparison / Diagnostics
vif(m)
##  Cancer_Vol      Weight         Age Hyperplasia     Seminal    Capsular 
##    2.162606    1.128941    1.240489    1.311178    2.009331    2.516346 
##       Score 
##    1.458701
# Variance inflation factor (VIF): are the predictors too correlated with
# each other?
# VIF = 1: the predictor is uncorrelated with the others (1 is the minimum)
# 1 < VIF <= 5: low to moderate correlation
# VIF > 5: potential problem; VIF > 10: serious multicollinearity