Notes
Due date: Wednesday, April 14, 2025 at 11:59pm
No help will be given on any question in this project. You may discuss it with your peers, but you cannot copy their work. If you find an error in a question, please email me.
You need to knit your .Rmd file into an .html or Word file and upload that file to Blackboard.
id: patient ID
diagnosis_result: B = benign tumor, M = malignant tumor
radius: Mean distance from center to points on the perimeter (i.e., tumor size).
texture: Standard deviation of gray-scale values — captures variability in texture.
perimeter: Perimeter length of the tumor contour.
area: Area of the tumor in pixel units.
smoothness: How smooth the tumor edge is — measured by local variation in radius.
compactness: A measure related to the shape compactness: (perimeter² / area - 1).
symmetry: Symmetry of the tumor shape.
fractal_dimension: Describes complexity of the contour (higher values = more irregular edges, which implies more chance to be malignant).
# Load the prostate-cancer measurements and recode empty-string cells as NA.
prost <- read.csv("Prostate_Cancer.csv")
blank_cells <- prost == ""
prost[blank_cells] <- NA

# List every column that holds at least one missing value.
missing_per_column <- colSums(is.na(prost))
na_columns <- names(prost)[missing_per_column > 0]
print(na_columns)
## character(0)

# Analysis copy of the data with incomplete rows dropped.
P <- na.omit(prost)
# Mean and standard deviation of the three size-related measurements.
mean_radius <- mean(P$radius)
sd_radius <- sd(P$radius)
mean_perimeter <- mean(P$perimeter)
sd_perimeter <- sd(P$perimeter)
mean_area <- mean(P$area)
sd_area <- sd(P$area)

# One row per feature, collected into a single table for display.
feature_means <- c(mean_radius, mean_perimeter, mean_area)
feature_sds <- c(sd_radius, sd_perimeter, sd_area)
summary_stats <- data.frame(
  Feature = c("radius", "perimeter", "area"),
  Mean = feature_means,
  SD = feature_sds
)
print(summary_stats)
## Feature Mean SD
## 1 radius 16.85 4.879094
## 2 perimeter 96.78 23.676089
## 3 area 702.88 319.710895
# Tally patients by diagnosis code (B = benign, M = malignant) and show it.
diagnosis_counts <- table(prost$diagnosis_result)
diagnosis_counts
##
## B M
## 38 62

# Bar chart of the same counts; the labels follow the table's B, M order.
bar_labels <- c("Benign", "Malignant")
bar_colours <- c("blue", "red")
barplot(
  diagnosis_counts,
  names.arg = bar_labels,
  col = bar_colours,
  main = "Diagnosis Result Count",
  xlab = "Diagnosis Type",
  ylab = "Number of Patients"
)
Response:
# Average radius among benign (B) cases.
benign_rows <- P$diagnosis_result == "B"
mean_benign_radius <- mean(P$radius[benign_rows])
benign_message <- paste("Mean radius (Benign):", round(mean_benign_radius, 2))
print(benign_message)
## [1] "Mean radius (Benign): 17.95"

# Average radius among malignant (M) cases.
malignant_rows <- P$diagnosis_result == "M"
mean_malignant_radius <- mean(P$radius[malignant_rows])
malignant_message <- paste(
  "Mean radius (Malignant):",
  round(mean_malignant_radius, 2)
)
print(malignant_message)
## [1] "Mean radius (Malignant): 16.18"
Response:
# Split the radius measurements by diagnosis, then run an F test comparing
# the two group variances. The vector names are kept as-is because var.test()
# echoes them in the "data:" line of its printed result.
benign_radius <- with(P, radius[diagnosis_result == "B"])
malignant_radius <- with(P, radius[diagnosis_result == "M"])
var.test(x = benign_radius, y = malignant_radius)
##
## F test to compare two variances
##
## data: benign_radius and malignant_radius
## F = 1.1706, num df = 37, denom df = 61, p-value = 0.5762
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6659895 2.1462614
## sample estimates:
## ratio of variances
## 1.170553
Response:
# Bin fractal_dimension into three likelihood bands:
#   [0, 0.06] -> "less", (0.06, 0.0635] -> "medium", (0.0635, 0.10] -> "highly"
lik_breaks <- c(0, 0.06, 0.0635, 0.10)
lik_labels <- c("less", "medium", "highly")
P$Lik <- cut(
  P$fractal_dimension,
  breaks = lik_breaks,
  labels = lik_labels,
  right = TRUE,          # intervals are closed on the right, i.e. (a, b]
  include.lowest = TRUE  # the first interval also includes its lower bound
)

# Number of patients falling in each band.
table(P$Lik)
##
## less medium highly
## 38 13 49
Response:
# Distribution of radius within each fractal-dimension band (Lik).
radius_fill <- c("lavender", "orange", "green")
boxplot(
  radius ~ Lik,
  data = P,
  col = radius_fill,
  main = "Boxplot of Prostate Radius by Fractal Dimension Likelihood",
  xlab = "Fractal Dimension Likelihood (Lik)",
  ylab = "Radius"
)
#I can see differences in the spread of the data, including some
different outliers. Response:
# Distribution of fractal_dimension within each of its own bands (Lik).
fd_fill <- c("skyblue", "gold", "tomato")
boxplot(
  fractal_dimension ~ Lik,
  data = P,
  col = fd_fill,
  main = "Boxplot of Fractal Dimension by Lik Category",
  xlab = "Lik (Fractal Dimension Categories)",
  ylab = "Fractal Dimension"
)
#So I see there isn't much spread within each group and not a lot of band
width. There could be patients who are outliers affecting the data, and
overall the data looks stacked and not super organized.
Response:
# F test comparing the variance of fractal_dimension in the "highly" band
# against the "less" band. The vector names appear in the printed result.
highly_fd <- with(P, fractal_dimension[Lik == "highly"])
less_fd <- with(P, fractal_dimension[Lik == "less"])
var.test(x = highly_fd, y = less_fd)
##
## F test to compare two variances
##
## data: highly_fd and less_fd
## F = 9.3849, num df = 48, denom df = 37, p-value = 2.091e-10
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 5.012428 17.143212
## sample estimates:
## ratio of variances
## 9.38493
Response:
# F test comparing the variance of fractal_dimension in the "highly" band
# against the "medium" band. The vector names appear in the printed result.
highly_fd <- with(P, fractal_dimension[Lik == "highly"])
medium_fd <- with(P, fractal_dimension[Lik == "medium"])
var.test(x = highly_fd, y = medium_fd)
##
## F test to compare two variances
##
## data: highly_fd and medium_fd
## F = 50.106, num df = 48, denom df = 12, p-value = 1.29e-08
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 17.41455 111.64115
## sample estimates:
## ratio of variances
## 50.10642
# One-sided two-sample t test assuming equal variances:
# alternative is that the mean of highly_fd exceeds the mean of medium_fd.
t.test(
  x = highly_fd,
  y = medium_fd,
  var.equal = TRUE,
  alternative = "greater"
)
##
## Two Sample t-test
##
## data: highly_fd and medium_fd
## t = 4.788, df = 60, p-value = 5.708e-06
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.005889318 Inf
## sample estimates:
## mean of x mean of y
## 0.07112245 0.06207692
# One-sided Welch t test (variances not assumed equal):
# alternative is that the mean of highly_fd exceeds the mean of medium_fd.
t.test(
  x = highly_fd,
  y = medium_fd,
  var.equal = FALSE,
  alternative = "greater"
)
##
## Welch Two Sample t-test
##
## data: highly_fd and medium_fd
## t = 9.0418, df = 54.265, p-value = 1.021e-12
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.007371409 Inf
## sample estimates:
## mean of x mean of y
## 0.07112245 0.06207692
Response:
# Visual normality check of fractal_dimension within each Lik band:
# one row of three histograms, then one row of three normal Q-Q plots.
# The previous graphics settings are saved up front and restored at the end
# so the 1 x 3 panel layout does not leak into any later plot.
band_titles <- c(less = "Less", medium = "Medium", highly = "Highly")
old_par <- par(mfrow = c(1, 3))

# Plot histograms
for (band in names(band_titles)) {
  hist(
    P$fractal_dimension[P$Lik == band],
    main = band_titles[[band]],
    xlab = "Fractal Dimension"
  )
}

# QQ plots
par(mfrow = c(1, 3))  # start a fresh 1 x 3 page for the Q-Q row
for (band in names(band_titles)) {
  band_values <- P$fractal_dimension[P$Lik == band]
  qqnorm(band_values)
  qqline(band_values)
}

# Put the graphics parameters back the way they were.
par(old_par)
Response:
# One-way ANOVA of fractal_dimension across the Lik bands. The aov() call is
# kept in this exact form because TukeyHSD() echoes it in its "Fit:" line.
anova_result <- aov(fractal_dimension ~ Lik, data = P)
anova_table <- summary(anova_result)
anova_table
## Df Sum Sq Mean Sq F value Pr(>F)
## Lik 2 0.004197 0.0020987 85.53 <2e-16 ***
## Residuals 97 0.002380 0.0000245
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between Lik bands, from the fitted ANOVA.
tukey_result <- TukeyHSD(x = anova_result)
print(x = tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = fractal_dimension ~ Lik, data = P)
##
## $Lik
## diff lwr upr p adj
## medium-less 0.004787449 0.0009991708 0.008575728 0.0093080
## highly-less 0.013832975 0.0112844433 0.016381507 0.0000000
## highly-medium 0.009045526 0.0053672320 0.012723820 0.0000002
Response: