```{r setup, include=FALSE}
# Setup chunk: include=FALSE keeps this chunk's code and output out of the
# rendered report, so only configuration and package loading belong here.

# Echo the source code of every subsequent chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Install required packages if not already installed.
# requireNamespace() checks one package at a time and avoids scanning the
# entire library the way installed.packages() does.
packages <- c("tidyverse", "ggplot2", "corrplot", "GGally", "readr")
for (pkg in packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library(tidyverse)
library(ggplot2)
library(corrplot)
library(GGally)
library(readr)
```

```{r}
# Analysis code lives in its own chunk (not the include=FALSE setup chunk)
# so that the summaries and plots below actually appear in the report.

# Load the cleaned data set and show its column names and types.
data <- read_csv("cancer_data_cleaned.csv")
glimpse(data)

# Focused summary of the target variable
summary(data$TARGET_deathRate)

hist(
  data$TARGET_deathRate,
  breaks = 30,
  main = "Distribution of Cancer Death Rate",
  xlab = "Cancer Death Rate",
  col = "skyblue",
  border = "black"
)

boxplot(
  data$TARGET_deathRate,
  main = "Boxplot of Cancer Death Rate",
  col = "lightgreen"
)

# Summary statistics for every column
summary(data)

# Histograms of all numeric variables
# Reshape to long form (one row per variable/value pair) so a single
# facet_wrap() call draws one panel per variable; scales = "free" gives each
# panel its own axes.
data %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = Value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal()
# Boxplots of all numeric variables
# Reshape to long form so every numeric column becomes one box along the
# x axis; the x labels are rotated 90 degrees to keep them readable.
data %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "lightgray") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Boxplots of Numeric Variables")

# Correlation matrix for numeric variables
# use = "complete.obs" computes correlations only from rows that have no
# missing value in any numeric column.
numeric_data <- data %>%
  select(where(is.numeric))
cor_matrix <- cor(numeric_data, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

# Pairwise plots among selected predictors and target
selected_vars <- data %>%
  select(
    TARGET_deathRate,
    povertyRate,
    percentBlack,
    percentHispanic,
    PctUninsured
  )

ggpairs(selected_vars)