Load Data
Check the data frame for NAs, etc.
# Data-quality gate: count every missing cell in pcancer (0 = complete data).
sum(is.na(pcancer))
## [1] 0
# Raw-scale histogram of the response, drawn before any transformation.
# NOTE(review): the original note called this "skewed left". The log
# transform applied in the next plot compresses a long RIGHT tail, so the
# skew here is presumably right (positive) skew -- confirm on the plot.
ggplot(data = pcancer, mapping = aes(x = PSA)) +
  labs(title = "Distribution of (PSA)", x = "PSA", y = "Frequency") +
  geom_histogram(bins = 30)
# Log-scale histogram of the response: PSA is log-transformed inside aes()
# to reduce skewness, using the same 30 bins as the raw-scale histogram.
# The x-axis label is "log(PSA)" because that is the quantity plotted; it
# previously read "PSA", which mislabeled the transformed axis.
ggplot(pcancer, aes(x = log(PSA))) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of log(PSA)",
    x = "log(PSA)",
    y = "Frequency"
  )
2. Initial model fit, and a comparison of the fits with and without the
log transformation:
# Baseline multiple linear regression: untransformed PSA as the response,
# regressed on all seven predictors. Fitted so it can be compared with the
# log-response model that follows.
m <- lm(
  PSA ~ Cancer_Vol + Weight + Age + Hyperplasia + Seminal + Capsular + Score,
  data = pcancer
)
# Coefficient table, residual quantiles, R-squared and overall F-test
summary(m)
##
## Call:
## lm(formula = PSA ~ Cancer_Vol + Weight + Age + Hyperplasia +
## Seminal + Capsular + Score, data = pcancer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.330 -8.130 -0.014 6.324 167.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.24264 40.53932 -0.376 0.707814
## Cancer_Vol 2.03225 0.59359 3.424 0.000936 ***
## Weight 0.01132 0.07395 0.153 0.878708
## Age -0.53721 0.47588 -1.129 0.261977
## Hyperplasia 1.29831 1.20168 1.080 0.282878
## Seminal 19.60957 10.89184 1.800 0.075187 .
## Capsular 1.09877 1.33377 0.824 0.412253
## Score 7.05922 5.19452 1.359 0.177589
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.17 on 89 degrees of freedom
## Multiple R-squared: 0.4585, Adjusted R-squared: 0.4159
## F-statistic: 10.77 on 7 and 89 DF, p-value: 9.266e-10
# `m` is reassigned by the next fit, so keep the raw-PSA model under its own
# name for the later comparison.
psa_m <- m
# Main model: same seven predictors, but the response is log(PSA).
# The transformation is applied inside the formula, so pcancer itself is
# left unchanged. This overwrites the previous value of `m`.
m <- lm(
  log(PSA) ~ Cancer_Vol + Weight + Age + Hyperplasia +
    Seminal + Capsular + Score,
  data = pcancer
)
# Coefficient table, residual quantiles, R-squared and overall F-test
summary(m)
##
## Call:
## lm(formula = log(PSA) ~ Cancer_Vol + Weight + Age + Hyperplasia +
## Seminal + Capsular + Score, data = pcancer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.88309 -0.46629 0.08045 0.47380 1.53219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.685796 0.998754 -0.687 0.49409
## Cancer_Vol 0.069454 0.014624 4.749 7.77e-06 ***
## Weight 0.001380 0.001822 0.757 0.45079
## Age -0.002799 0.011724 -0.239 0.81186
## Hyperplasia 0.087470 0.029605 2.955 0.00401 **
## Seminal 0.782623 0.268339 2.917 0.00448 **
## Capsular -0.026521 0.032860 -0.807 0.42177
## Score 0.358153 0.127976 2.799 0.00629 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7679 on 89 degrees of freedom
## Multiple R-squared: 0.5893, Adjusted R-squared: 0.557
## F-statistic: 18.24 on 7 and 89 DF, p-value: 7.694e-15
# Compare the two fits with AIC (lower = better) on a common response scale.
# psa_m models PSA while m models log(PSA), so their raw AIC values come from
# likelihoods of different variables and cannot be compared directly.
# Changing variables from log(PSA) back to PSA adds the Jacobian term
# 2 * sum(log(PSA)) to m's AIC. The response stored in m's model frame is
# already log(PSA) and covers exactly the rows used in the fit.
aic_cmp <- AIC(psa_m, m)
log_jacobian <- 2 * sum(model.response(model.frame(m)))
# Row 2 is `m`, the second model passed to AIC() above.
aic_cmp[2L, "AIC"] <- aic_cmp[2L, "AIC"] + log_jacobian
aic_cmp
# Scatter plots of log(PSA) against two of the predictors, each overlaid
# with a straight-line lm fit (se = FALSE: no confidence band).
ggplot(data = pcancer, mapping = aes(x = Cancer_Vol, y = log(PSA))) +
  labs(title = "Cancer Volume Vs. PSA Levels Scatter Plot") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data = pcancer, mapping = aes(x = Hyperplasia, y = log(PSA))) +
  labs(title = "Hyperplasia Vs. PSA Levels Scatter Plot") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# ---- Box plots ---------------------------------------------------------------
# Distribution of log(PSA) within each level of two predictors. Both are
# wrapped in factor() so ggplot draws one box per distinct value instead of
# treating the x variable as continuous.

# Score is treated as a categorical group for the box plot.
ggplot(pcancer, aes(x = factor(Score), y = log(PSA))) +
  geom_boxplot() +
  labs(title = "Score by PSA Levels", x = "Score", y = "log(PSA)")

# Seminal is described as binary, so this compares its two groups.
# The title now names Seminal; it previously repeated the Score plot's title.
ggplot(pcancer, aes(x = factor(Seminal), y = log(PSA))) +
  geom_boxplot() +
  labs(title = "Seminal by PSA Levels", x = "Seminal", y = "log(PSA)")
Residuals = observed log(PSA) - predicted log(PSA)
Residuals vs. fitted values
Q-Q plot
# Actual vs. predicted: observed log(PSA) against the fitted values of m.
# The reference line has slope 1 and intercept 0, so points lying on it are
# predicted exactly.
# fitted(m) is evaluated outside pcancer, so it must have one value per row
# of pcancer; this holds here because the NA check earlier returned 0.
ggplot(data = pcancer, mapping = aes(x = fitted(m), y = log(PSA))) +
  labs(title = "Actual vs Predicted") +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)
# Residuals vs. fitted values (lm diagnostic plot 1): used to assess the
# linearity and constant-variance (homoscedasticity) assumptions.
plot(x = m, which = 1)
# Normal Q-Q plot of the residuals (lm diagnostic plot 2): used to assess
# the normality of the residuals.
plot(x = m, which = 2)
5. Correlation Analysis
# 95% confidence intervals for every coefficient of the log(PSA) model.
# An interval that excludes 0 matches a coefficient significant at the 5%
# level in summary(m).
confint(m, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -2.670297533 1.298706374
## Cancer_Vol 0.040396030 0.098511273
## Weight -0.002240112 0.005000027
## Age -0.026094332 0.020496499
## Hyperplasia 0.028644101 0.146294972
## Seminal 0.249440301 1.315806120
## Capsular -0.091812239 0.038770721
## Score 0.103868056 0.612437985
# Pairwise correlations between log(PSA) and four of the predictors.
with(pcancer, cor(Cancer_Vol, log(PSA)))
## [1] 0.6570739
with(pcancer, cor(Score, log(PSA)))
## [1] 0.5390167
with(pcancer, cor(Seminal, log(PSA)))
## [1] 0.5663641
with(pcancer, cor(Hyperplasia, log(PSA)))
## [1] 0.1574016
# Correlation matrix of the response and the same four predictors.
# NOTE(review): this matrix uses raw PSA, not log(PSA), so its PSA row does
# not match the pairwise values above (e.g. Hyperplasia: -0.016 vs 0.157).
cor_vars <- c("PSA", "Cancer_Vol", "Hyperplasia", "Seminal", "Score")
cor(pcancer[, cor_vars])
## PSA Cancer_Vol Hyperplasia Seminal Score
## PSA 1.00000000 0.6241506 -0.01648649 0.5286188 0.42957975
## Cancer_Vol 0.62415059 1.0000000 -0.13320943 0.5817417 0.48143840
## Hyperplasia -0.01648649 -0.1332094 1.00000000 -0.1195532 0.02682555
## Seminal 0.52861878 0.5817417 -0.11955319 1.0000000 0.42857348
## Score 0.42957975 0.4814384 0.02682555 0.4285735 1.00000000
# Variance inflation factors for the log(PSA) model: are the predictors too
# correlated with each other?
# NOTE(review): vif() is not base R; presumably car::vif -- confirm which
# package is attached earlier in the file.
vif(m)
## Cancer_Vol Weight Age Hyperplasia Seminal Capsular
## 2.162606 1.128941 1.240489 1.311178 2.009331 2.516346
## Score
## 1.458701
# Rule of thumb for reading VIF values (the minimum possible value is 1):
#   = 1     the predictor is uncorrelated with the other predictors
#   1 - 5   low to moderate correlation, generally acceptable
#   > 5     potential problem; > 10 indicates a major problem
# Every value above lies between about 1.1 and 2.6.