library(tidyverse)
library(openintro)
library(GGally)
library(broom)
library(ggplot2)
library(ggfortify)
library(readr)
# Download the breast cancer CSV. `id` is read as text, `diagnosis` as a
# factor with levels "B" and "M", and the column named `...33` is skipped.
breast_20cancer <- read_csv(
  "https://raw.githubusercontent.com/lwaldron/bios1/main/vignettes/breast%20cancer.csv",
  col_types = cols(
    id = col_character(),
    diagnosis = col_factor(levels = c("B", "M")),
    ...33 = col_skip()
  )
)

# Replace the one-letter diagnosis codes with readable labels. `.ptype` makes
# the result a factor whose levels are "Benign" then "Malignant".
bcancer <- mutate(
  breast_20cancer,
  diagnosis = case_match(
    diagnosis,
    "B" ~ "Benign",
    "M" ~ "Malignant",
    .ptype = factor(levels = c("Benign", "Malignant"))
  )
)
# First look at the recoded data: dimensions, leading rows, and a summary.
bcancer %>% dim()
bcancer %>% head()
bcancer %>% summary()
- Check the dimensions of the dataset using the dim() function.
- Preview the first few rows of the dataset using the head() function.
- Use summary() to summarize the dataset.
# Convert absolute pairwise correlations between the measurement columns
# (everything except `id` and `diagnosis`) into distances: the more strongly
# two variables are correlated, the closer their distance is to 0.
distancematrix <- 1 - abs(cor(select(bcancer, -c(id, diagnosis))))
distancematrix <- data.frame(distancematrix)

# Hierarchical clustering of the variables on that distance.
plot(hclust(as.dist(distancematrix)))

# Heatmap() works on a matrix. Converting explicitly avoids the
# "input is a data frame-like object, convert it to a matrix" warning that
# passing the data frame directly used to trigger.
ComplexHeatmap::Heatmap(as.matrix(distancematrix))
- Identify collinear variables.
# Drop the identifier, then reshape to long format: one row per observation
# and attribute, with the attribute name in `attributes` and its value in
# `values`. `diagnosis` is kept as an identifying column.
bcancer2 <- select(bcancer, -id)
bcancer2long <- pivot_longer(
  bcancer2,
  cols = !diagnosis,
  names_to = "attributes",
  values_to = "values"
)

# One boxplot panel per attribute, comparing the two diagnosis groups.
# scales = "free" gives every panel its own axis ranges.
ggplot(bcancer2long, aes(x = diagnosis, y = values)) +
  geom_boxplot() +
  facet_wrap(vars(attributes), scales = "free")
- Do any variables clearly have an association with breast cancer diagnosis?
# Logistic regression of diagnosis on mean area. With a factor response,
# glm() treats the first level as "failure", so given the level order
# Benign, Malignant this models the probability of a malignant diagnosis.
cancmod <- glm(
  formula = diagnosis ~ area_mean,
  family = "binomial",
  data = bcancer
)
summary(cancmod)
##
## Call:
## glm(formula = diagnosis ~ area_mean, family = "binomial", data = bcancer)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.72939 -0.46438 -0.20769 0.09969 2.73582
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.97313 0.68300 -11.67 <2e-16 ***
## area_mean 0.01177 0.00109 10.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 750.51 on 567 degrees of freedom
## Residual deviance: 325.65 on 566 degrees of freedom
## AIC: 329.65
##
## Number of Fisher Scoring iterations: 7
Interpret the coefficients of the logistic regression model.
# Diagnosis against mean area. width = 0 leaves the x positions untouched, so
# points are only spread vertically within each diagnosis band to reduce
# overplotting.
# geom_jitter() draws the points itself, so no separate geom_point() layer is
# added: an extra layer would plot every observation a second time, unjittered.
meanplot <- ggplot(bcancer, aes(x = area_mean, y = diagnosis)) +
  geom_jitter(width = 0)
meanplot
# Attach the model's predicted probability to every row of the data.
# predict() with newdata returns one value per row of `bcancer`, whereas
# fitted() only returns values for rows glm() kept after dropping missing
# values, which would make mutate() fail on a length mismatch.
bcancer_pred <- bcancer %>%
  mutate(modpredict = predict(cancmod, newdata = bcancer, type = "response"))

# Overlay the fitted curve. The two diagnosis levels sit at positions 1 and 2
# on the discrete y axis, so adding 1 maps a probability of 0 onto the first
# level and a probability of 1 onto the second.
meanplot +
  geom_line(
    data = bcancer_pred,
    aes(x = area_mean, y = 1 + modpredict),
    color = "green",
    linewidth = 1.5
  )
What do you notice about the data points where the predicted probabilities are close to 0, 0.5, and 1?
# Predicted probabilities from the fitted model on an evenly spaced grid of
# mean areas (300 to 1100 in steps of 200).
new_means <- tibble(area_mean = seq(from = 300, to = 1100, by = 200))
mean_mean_pred <- mutate(
  new_means,
  predProbability = predict(cancmod, newdata = new_means, type = "response")
)
mean_mean_pred
## # A tibble: 5 × 2
## area_mean predProbability
## <dbl> <dbl>
## 1 300 0.0116
## 2 500 0.110
## 3 700 0.565
## 4 900 0.932
## 5 1100 0.993