# Report the current working directory.
getwd()
## [1] "/cloud/project/analysis"
# Show the contents of the project root.
project_dir <- "/cloud/project"
list.files(project_dir)
## [1] "analysis" "data" "project.Rproj"
# Show the contents of the data folder.
data_dir <- file.path(project_dir, "data")
list.files(data_dir)
## [1] "breast_cancer_dataset.csv"
# Read the dataset, keeping text columns as character vectors, then preview
# the first rows.
breast <- read.csv(
  file.path(data_dir, "breast_cancer_dataset.csv"),
  stringsAsFactors = FALSE
)
head(breast)
## PatientID Age TumorSizeMM TumorStage HormoneReceptorStatus HER2Status
## 1 P1000 71 119.5 Stage I Positive Negative
## 2 P1001 34 85.1 Stage II Positive Negative
## 3 P1002 80 49.2 Stage II Positive Negative
## 4 P1003 40 89.8 Stage III Positive Positive
## 5 P1004 43 110.3 Stage II Positive Negative
## 6 P1005 22 115.3 Stage II Positive Negative
## LymphNodesInvolved FamilyHistory BMI TreatmentType TumorGrade
## 1 3 Yes 22.0 Radiation 2
## 2 2 No 35.4 Radiation 2
## 3 3 Yes 26.5 Surgery 1
## 4 4 No 27.6 Hormone Therapy 2
## 5 2 No 31.1 Radiation 1
## 6 4 Yes 31.0 Hormone Therapy 2
## DiagnosticTimeDays SurvivalMonths Recurrence IsMalignant
## 1 131 47.2 1 0
## 2 114 18.4 0 1
## 3 62 105.9 1 1
## 4 106 95.3 0 1
## 5 64 25.2 0 1
## 6 102 64.4 1 1
# Does average tumor size (TumorSizeMM) differ significantly across the four
# tumor stages? One-way ANOVA of TumorSizeMM grouped by TumorStage.
anova_result <- aov(TumorSizeMM ~ TumorStage, data = breast)
print(anova_result)
## Call:
## aov(formula = TumorSizeMM ~ TumorStage, data = breast)
##
## Terms:
## TumorStage Residuals
## Sum of Squares 802.7 510936.7
## Deg. of Freedom 3 496
##
## Residual standard error: 32.09539
## Estimated effects may be unbalanced
# print() on an aov fit shows only sums of squares and degrees of freedom.
# summary() adds the F statistic and p-value needed to answer whether the
# group means differ significantly.
summary(anova_result)
# Is there a significant difference in SurvivalMonths between the three tumor
# grades? One-way ANOVA of SurvivalMonths grouped by TumorGrade.
# TumorGrade is stored as a number, so it is wrapped in factor() to make aov()
# compare group means instead of fitting a single linear slope.
anova_result2 <- aov(SurvivalMonths ~ factor(TumorGrade), data = breast)
print(anova_result2)
# summary() reports the F statistic and p-value for the grade effect.
summary(anova_result2)
# NOTE: the output previously recorded here came from a model that grouped by
# TumorStage rather than TumorGrade, so it did not answer this question.
# Re-run this section to regenerate the output.
# Do mean BMI levels differ across treatment types?
# One-way ANOVA of BMI grouped by TreatmentType.
anova_result_3 <- aov(BMI ~ TreatmentType, data = breast)
print(anova_result_3)
## Call:
## aov(formula = BMI ~ TreatmentType, data = breast)
##
## Terms:
## TreatmentType Residuals
## Sum of Squares 58.044 19494.140
## Deg. of Freedom 3 496
##
## Residual standard error: 6.269187
## Estimated effects may be unbalanced
# print() on an aov fit shows only sums of squares and degrees of freedom.
# summary() adds the F statistic and p-value needed to judge whether the
# treatment-type means differ.
summary(anova_result_3)
# Is hormone receptor status related to cancer recurrence?
# Cross-tabulate receptor status (rows) against recurrence (columns).
tbl2 <- table(
  breast[["HormoneReceptorStatus"]],
  breast[["Recurrence"]]
)
# Chi-squared test of independence on the counts.
chi_result2 <- chisq.test(tbl2)
# Show the test statistic, degrees of freedom and p-value.
print(chi_result2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tbl2
## X-squared = 0.14783, df = 1, p-value = 0.7006
# Are certain treatment types more commonly associated with malignant tumors?
# Cross-tabulate treatment type (rows) against malignancy flag (columns).
tbl3 <- table(
  breast[["TreatmentType"]],
  breast[["IsMalignant"]]
)
# Chi-squared test of independence on the counts.
chi_result_3 <- chisq.test(tbl3)
# Show the test statistic, degrees of freedom and p-value.
print(chi_result_3)
##
## Pearson's Chi-squared test
##
## data: tbl3
## X-squared = 3.8434, df = 3, p-value = 0.2789
# Is there a significant association between TumorStage and malignancy status?
# Cross-tabulate tumor stage (rows) against malignancy flag (columns).
tbl <- table(
  breast[["TumorStage"]],
  breast[["IsMalignant"]]
)
# Chi-squared test of independence on the counts.
chi_result <- chisq.test(tbl)
# Show the test statistic, degrees of freedom and p-value.
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 2.9869, df = 3, p-value = 0.3936
# Is there a correlation between TumorSizeMM and the number of lymph nodes
# involved? Pearson correlation over rows where both values are present.
# The column is spelled out in full: `breast$TumorSize` only worked through
# `$` partial matching, which would silently return NULL (or bind to another
# column) if a second column starting with "TumorSize" were ever added.
correlation <- cor(
  breast$TumorSizeMM,
  breast$LymphNodesInvolved,
  use = "complete.obs"
)
print(correlation)
## [1] 0.002324283
# Does diagnostic delay correlate with tumor size at diagnosis?
# Pearson correlation over rows where both values are present.
corr_result <- with(
  breast,
  cor(DiagnosticTimeDays, TumorSizeMM, use = "complete.obs")
)
print(corr_result)
## [1] -0.006568929
# Is patient age correlated with BMI in the dataset?
# Pearson correlation over rows where both values are present.
corr_3 <- cor(
  breast[["Age"]],
  breast[["BMI"]],
  use = "complete.obs"
)
print(corr_3)
## [1] 0.01953865
# To what extent do TumorSizeMM, TumorGrade, Age, and TreatmentType predict
# patient survival time in months (SurvivalMonths)?
# Fit an ordinary least-squares model with all four predictors.
linear_model <- lm(
  formula = SurvivalMonths ~ TumorSizeMM + TumorGrade + TreatmentType + Age,
  data = breast
)
# Coefficient table, residual summary and overall fit statistics.
summary(linear_model)
##
## Call:
## lm(formula = SurvivalMonths ~ TumorSizeMM + TumorGrade + TreatmentType +
## Age, data = breast)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.744 -28.199 0.661 27.720 63.875
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.779300 6.971157 9.292 <2e-16 ***
## TumorSizeMM -0.113374 0.046007 -2.464 0.0141 *
## TumorGrade 2.363112 2.058053 1.148 0.2514
## TreatmentTypeHormone Therapy 3.481636 3.978881 0.875 0.3820
## TreatmentTypeRadiation 0.433035 4.212103 0.103 0.9182
## TreatmentTypeSurgery -3.812537 4.106193 -0.928 0.3536
## Age 0.004752 0.076497 0.062 0.9505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.74 on 493 degrees of freedom
## Multiple R-squared: 0.02029, Adjusted R-squared: 0.008371
## F-statistic: 1.702 on 6 and 493 DF, p-value: 0.1185
# Is there a linear relationship between DiagnosticTimeDays and SurvivalMonths?
# step1: fit a simple linear regression of survival on diagnostic delay
linear_model_2 <- lm(
  SurvivalMonths ~ DiagnosticTimeDays,
  data = breast
)
# step2: print the fitted coefficients
print(linear_model_2)
##
## Call:
## lm(formula = SurvivalMonths ~ DiagnosticTimeDays, data = breast)
##
## Coefficients:
## (Intercept) DiagnosticTimeDays
## 63.90777 -0.01796
# step3: print the model summary. print() above shows only the coefficient
# estimates; summary() adds standard errors, t-tests, R-squared and the
# F-test needed to judge whether the relationship is significant.
summary(linear_model_2)
# Does the number of lymph nodes involved significantly reduce expected
# survival duration?
# Fit a simple linear regression of survival on lymph node count.
linear_model_3 <- lm(
  formula = SurvivalMonths ~ LymphNodesInvolved,
  data = breast
)
# Coefficient table, residual summary and overall fit statistics.
summary(linear_model_3)
##
## Call:
## lm(formula = SurvivalMonths ~ LymphNodesInvolved, data = breast)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.786 -28.239 1.069 28.415 57.724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.8762 2.6075 23.730 <2e-16 ***
## LymphNodesInvolved 0.2032 1.0269 0.198 0.843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.91 on 498 degrees of freedom
## Multiple R-squared: 7.865e-05, Adjusted R-squared: -0.001929
## F-statistic: 0.03917 on 1 and 498 DF, p-value: 0.8432
# Predict whether a tumor is malignant (IsMalignant) using features such as
# Age, TumorSizeMM, HormoneReceptorStatus, and LymphNodesInvolved.
# step1: store the categorical predictor and the outcome as factors
factor_cols <- c("HormoneReceptorStatus", "IsMalignant")
breast[factor_cols] <- lapply(breast[factor_cols], as.factor)
# step2: make sure the continuous predictors are stored as numeric
numeric_cols <- c("TumorSizeMM", "LymphNodesInvolved")
breast[numeric_cols] <- lapply(breast[numeric_cols], as.numeric)
# step3: fit a logistic regression (binomial family) and summarise it
logistical_model_1 <- glm(
  formula = IsMalignant ~ Age + TumorSizeMM + HormoneReceptorStatus +
    LymphNodesInvolved,
  family = "binomial",
  data = breast
)
summary(logistical_model_1)
##
## Call:
## glm(formula = IsMalignant ~ Age + TumorSizeMM + HormoneReceptorStatus +
## LymphNodesInvolved, family = "binomial", data = breast)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.712766 0.467406 3.664 0.000248 ***
## Age -0.003546 0.005793 -0.612 0.540526
## TumorSizeMM -0.005132 0.003506 -1.464 0.143257
## HormoneReceptorStatusPositive 0.063189 0.248260 0.255 0.799088
## LymphNodesInvolved 0.054540 0.079439 0.687 0.492359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 508.61 on 499 degrees of freedom
## Residual deviance: 505.66 on 495 degrees of freedom
## AIC: 515.66
##
## Number of Fisher Scoring iterations: 4
# Do TumorSizeMM, TumorGrade, and TumorStage significantly increase the
# probability of cancer recurrence (Recurrence)?
# step1: store the outcome and the stage predictor as factors
breast[["Recurrence"]] <- as.factor(breast[["Recurrence"]])
breast[["TumorStage"]] <- as.factor(breast[["TumorStage"]])
# step2: fit a logistic regression (binomial family)
logistical_model_2 <- glm(
  formula = Recurrence ~ TumorSizeMM + TumorGrade + TumorStage,
  family = "binomial",
  data = breast
)
# step3: coefficient table and deviance statistics
summary(logistical_model_2)
##
## Call:
## glm(formula = Recurrence ~ TumorSizeMM + TumorGrade + TumorStage,
## family = "binomial", data = breast)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.182264 0.384237 -3.077 0.00209 **
## TumorSizeMM 0.003001 0.003226 0.930 0.35213
## TumorGrade -0.071120 0.144347 -0.493 0.62222
## TumorStageStage II 0.120864 0.246049 0.491 0.62327
## TumorStageStage III 0.285947 0.283590 1.008 0.31330
## TumorStageStage IV -0.449227 0.408795 -1.099 0.27181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 570.95 on 499 degrees of freedom
## Residual deviance: 566.51 on 494 degrees of freedom
## AIC: 578.51
##
## Number of Fisher Scoring iterations: 4
# Can a logistic regression model accurately classify malignant vs
# non-malignant tumors using BMI, Age, TumorStage, and HER2Status?
# step1: store HER2 status as a factor
breast[["HER2Status"]] <- as.factor(breast[["HER2Status"]])
# step2: fit a logistic regression (binomial family)
logistical_model_3 <- glm(
  formula = IsMalignant ~ BMI + Age + TumorStage + HER2Status,
  family = "binomial",
  data = breast
)
# step3: coefficient table and deviance statistics
summary(logistical_model_3)
##
## Call:
## glm(formula = IsMalignant ~ BMI + Age + TumorStage + HER2Status,
## family = "binomial", data = breast)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.178526 0.632724 1.863 0.0625 .
## BMI 0.017888 0.017848 1.002 0.3162
## Age -0.002821 0.005775 -0.489 0.6252
## TumorStageStage II -0.185081 0.272090 -0.680 0.4964
## TumorStageStage III -0.097347 0.326640 -0.298 0.7657
## TumorStageStage IV -0.593963 0.367731 -1.615 0.1063
## HER2StatusPositive -0.175377 0.262470 -0.668 0.5040
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 508.61 on 499 degrees of freedom
## Residual deviance: 504.10 on 493 degrees of freedom
## AIC: 518.1
##
## Number of Fisher Scoring iterations: 4
# Is the mean BMI significantly different between malignant and non-malignant
# cases? Two-sample t-test of BMI split by IsMalignant.
t_result <- t.test(BMI ~ IsMalignant, data = breast)
print(t_result)
##
## Welch Two Sample t-test
##
## data: BMI by IsMalignant
## t = -1.1178, df = 161.05, p-value = 0.2653
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.1194447 0.5873276
## sample estimates:
## mean in group 0 mean in group 1
## 28.48835 29.25441
# Do patients who experience recurrence have significantly larger tumors
# compared to those who do not?
# Two-sample t-test of TumorSizeMM split by Recurrence.
t_result2 <- t.test(TumorSizeMM ~ Recurrence, data = breast)
print(t_result2)
##
## Welch Two Sample t-test
##
## data: TumorSizeMM by Recurrence
## t = -0.90901, df = 218.51, p-value = 0.3643
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -9.540472 3.517755
## sample estimates:
## mean in group 0 mean in group 1
## 63.10027 66.11163
# Is there a significant difference in age between patients with positive
# vs negative HER2 status?
# Two-sample t-test of Age split by HER2Status.
t_result3 <- t.test(Age ~ HER2Status, data = breast)
print(t_result3)
##
## Welch Two Sample t-test
##
## data: Age by HER2Status
## t = -0.64252, df = 172.45, p-value = 0.5214
## alternative hypothesis: true difference in means between group Negative and group Positive is not equal to 0
## 95 percent confidence interval:
## -5.481141 2.789008
## sample estimates:
## mean in group Negative mean in group Positive
## 52.00256 53.34862