# Report the directory the script is being run from.
getwd()
## [1] "/cloud/project/analysis"
# Inspect the contents of the project root.
list.files(path = "/cloud/project")
## [1] "analysis"      "data"          "project.Rproj"
# Inspect the contents of the data folder.
list.files(path = "/cloud/project/data")
## [1] "breast_cancer_dataset.csv"
# Read the data set; text columns are kept as character vectors and are only
# converted to factors later, where a model needs them.
breast <- read.csv(
  file = "/cloud/project/data/breast_cancer_dataset.csv",
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(breast, n = 6L)
##   PatientID Age TumorSizeMM TumorStage HormoneReceptorStatus HER2Status
## 1     P1000  71       119.5    Stage I              Positive   Negative
## 2     P1001  34        85.1   Stage II              Positive   Negative
## 3     P1002  80        49.2   Stage II              Positive   Negative
## 4     P1003  40        89.8  Stage III              Positive   Positive
## 5     P1004  43       110.3   Stage II              Positive   Negative
## 6     P1005  22       115.3   Stage II              Positive   Negative
##   LymphNodesInvolved FamilyHistory  BMI   TreatmentType TumorGrade
## 1                  3           Yes 22.0       Radiation          2
## 2                  2            No 35.4       Radiation          2
## 3                  3           Yes 26.5         Surgery          1
## 4                  4            No 27.6 Hormone Therapy          2
## 5                  2            No 31.1       Radiation          1
## 6                  4           Yes 31.0 Hormone Therapy          2
##   DiagnosticTimeDays SurvivalMonths Recurrence IsMalignant
## 1                131           47.2          1           0
## 2                114           18.4          0           1
## 3                 62          105.9          1           1
## 4                106           95.3          0           1
## 5                 64           25.2          0           1
## 6                102           64.4          1           1
# Does average tumor size (TumorSizeMM) differ significantly across the four
# tumor stages?

# One-way ANOVA of tumor size grouped by tumor stage.
anova_result <- aov(TumorSizeMM ~ TumorStage, data = breast)

# print() on an aov object lists only sums of squares and degrees of freedom;
# summary() adds the F statistic and p-value needed to judge significance.
print(summary(anova_result))

# Original print call, kept last so the output recorded below still follows
# the call that produced it.
print(anova_result)
## Call:
##    aov(formula = TumorSizeMM ~ TumorStage, data = breast)
## 
## Terms:
##                 TumorStage Residuals
## Sum of Squares       802.7  510936.7
## Deg. of Freedom          3       496
## 
## Residual standard error: 32.09539
## Estimated effects may be unbalanced
# Is there a significant difference in SurvivalMonths between the three tumor
# grades?

# Group by TumorGrade, as the question asks (this model previously grouped by
# TumorStage, which answers a different question). TumorGrade is stored as a
# number, so it is wrapped in factor() to compare group means instead of
# fitting a linear trend.
anova_result2 <- aov(SurvivalMonths ~ factor(TumorGrade), data = breast)
print(anova_result2)

# summary() reports the F statistic and p-value for the grade effect.
print(summary(anova_result2))
## NOTE: the output previously recorded here came from the TumorStage model;
## re-run this section to capture the TumorGrade results.
# Do mean BMI levels differ across treatment types?

# One-way ANOVA of BMI grouped by treatment type.
anova_result_3 <- aov(BMI ~ TreatmentType, data = breast)

# print() on an aov object lists only sums of squares and degrees of freedom;
# summary() adds the F statistic and p-value needed to judge significance.
print(summary(anova_result_3))

# Original print call, kept last so the output recorded below still follows
# the call that produced it.
print(anova_result_3)
## Call:
##    aov(formula = BMI ~ TreatmentType, data = breast)
## 
## Terms:
##                 TreatmentType Residuals
## Sum of Squares         58.044 19494.140
## Deg. of Freedom             3       496
## 
## Residual standard error: 6.269187
## Estimated effects may be unbalanced
# Is hormone receptor status related to cancer recurrence?

# step 1: cross-tabulate hormone receptor status against recurrence
tbl2 <- table(
  breast[["HormoneReceptorStatus"]],
  breast[["Recurrence"]]
)

# step 2: run the chi-squared test of independence on that table
chi_result2 <- chisq.test(tbl2)

# step 3: display the test result
print(chi_result2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tbl2
## X-squared = 0.14783, df = 1, p-value = 0.7006
# Are certain treatment types more commonly associated with malignant tumors?

# step 1: cross-tabulate treatment type against malignancy status
tbl3 <- table(
  breast[["TreatmentType"]],
  breast[["IsMalignant"]]
)

# step 2: run the chi-squared test of independence on that table
chi_result_3 <- chisq.test(tbl3)

# step 3: display the test result
print(chi_result_3)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl3
## X-squared = 3.8434, df = 3, p-value = 0.2789
# Is there a significant association between TumorStage and malignancy status?

# step 1: cross-tabulate tumor stage against malignancy status
tbl <- table(
  breast[["TumorStage"]],
  breast[["IsMalignant"]]
)

# step 2: run the chi-squared test of independence on that table
chi_result <- chisq.test(tbl)

# step 3: display the test result
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 2.9869, df = 3, p-value = 0.3936
# Is there a correlation between TumorSizeMM and the number of lymph nodes
# involved?

# Use the exact column name. `breast$TumorSize` only resolved to TumorSizeMM
# through `$` partial matching, which stops working as soon as another column
# shares that prefix.
correlation <- cor(
  breast$TumorSizeMM,
  breast$LymphNodesInvolved,
  use = "complete.obs"
)
print(correlation)
## [1] 0.002324283
# Does diagnostic delay correlate with tumor size at diagnosis?

# Correlation coefficient computed on the rows where both values are present.
corr_result <- cor(
  breast[["DiagnosticTimeDays"]],
  breast[["TumorSizeMM"]],
  use = "complete.obs"
)
print(corr_result)
## [1] -0.006568929
# Is patient age correlated with BMI in the dataset?

# Correlation coefficient computed on the rows where both values are present.
corr_3 <- cor(
  breast[["Age"]],
  breast[["BMI"]],
  use = "complete.obs"
)
print(corr_3)
## [1] 0.01953865
# To what extent do TumorSizeMM, TumorGrade, Age, and TreatmentType predict
# patient survival time in months (SurvivalMonths)?

# step 1: fit a multiple linear regression of survival on the four predictors
linear_model <- lm(
  formula = SurvivalMonths ~ TumorSizeMM + TumorGrade + TreatmentType + Age,
  data = breast
)

# step 2: show coefficients, standard errors, and overall fit statistics
summary(linear_model)
## 
## Call:
## lm(formula = SurvivalMonths ~ TumorSizeMM + TumorGrade + TreatmentType + 
##     Age, data = breast)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.744 -28.199   0.661  27.720  63.875 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  64.779300   6.971157   9.292   <2e-16 ***
## TumorSizeMM                  -0.113374   0.046007  -2.464   0.0141 *  
## TumorGrade                    2.363112   2.058053   1.148   0.2514    
## TreatmentTypeHormone Therapy  3.481636   3.978881   0.875   0.3820    
## TreatmentTypeRadiation        0.433035   4.212103   0.103   0.9182    
## TreatmentTypeSurgery         -3.812537   4.106193  -0.928   0.3536    
## Age                           0.004752   0.076497   0.062   0.9505    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.74 on 493 degrees of freedom
## Multiple R-squared:  0.02029,    Adjusted R-squared:  0.008371 
## F-statistic: 1.702 on 6 and 493 DF,  p-value: 0.1185
# Is there a linear relationship between DiagnosticTimeDays and SurvivalMonths?

# step 1: fit a simple linear regression of survival on diagnostic delay
linear_model_2 <- lm(
  SurvivalMonths ~ DiagnosticTimeDays,
  data = breast
)

# step 2: print the summary of results. print() on an lm object shows only the
# coefficients; summary() adds the standard error, t test, and R-squared needed
# to judge whether the slope differs from zero.
print(summary(linear_model_2))

# Original print call, kept last so the output recorded below still follows
# the call that produced it.
print(linear_model_2)
## 
## Call:
## lm(formula = SurvivalMonths ~ DiagnosticTimeDays, data = breast)
## 
## Coefficients:
##        (Intercept)  DiagnosticTimeDays  
##           63.90777            -0.01796
# Does the number of lymph nodes involved significantly reduce expected
# survival duration?

# step 1: fit a simple linear regression of survival on lymph node count
linear_model_3 <- lm(
  formula = SurvivalMonths ~ LymphNodesInvolved,
  data = breast
)

# step 2: show the coefficient table and fit statistics
summary(linear_model_3)
## 
## Call:
## lm(formula = SurvivalMonths ~ LymphNodesInvolved, data = breast)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.786 -28.239   1.069  28.415  57.724 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         61.8762     2.6075  23.730   <2e-16 ***
## LymphNodesInvolved   0.2032     1.0269   0.198    0.843    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.91 on 498 degrees of freedom
## Multiple R-squared:  7.865e-05,  Adjusted R-squared:  -0.001929 
## F-statistic: 0.03917 on 1 and 498 DF,  p-value: 0.8432
# Predict whether a tumor is malignant (IsMalignant) using features such as
# Age, TumorSizeMM, HormoneReceptorStatus, and LymphNodesInvolved?

# step 1: store the categorical columns as factors
breast[c("HormoneReceptorStatus", "IsMalignant")] <- lapply(
  breast[c("HormoneReceptorStatus", "IsMalignant")],
  as.factor
)

# step 2: make sure the measurement columns are numeric
breast[c("TumorSizeMM", "LymphNodesInvolved")] <- lapply(
  breast[c("TumorSizeMM", "LymphNodesInvolved")],
  as.numeric
)

# step 3: fit a logistic regression of malignancy on the four predictors
logistical_model_1 <- glm(
  formula = IsMalignant ~ Age + TumorSizeMM + HormoneReceptorStatus +
    LymphNodesInvolved,
  family = "binomial",
  data = breast
)
summary(logistical_model_1)
## 
## Call:
## glm(formula = IsMalignant ~ Age + TumorSizeMM + HormoneReceptorStatus + 
##     LymphNodesInvolved, family = "binomial", data = breast)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    1.712766   0.467406   3.664 0.000248 ***
## Age                           -0.003546   0.005793  -0.612 0.540526    
## TumorSizeMM                   -0.005132   0.003506  -1.464 0.143257    
## HormoneReceptorStatusPositive  0.063189   0.248260   0.255 0.799088    
## LymphNodesInvolved             0.054540   0.079439   0.687 0.492359    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.61  on 499  degrees of freedom
## Residual deviance: 505.66  on 495  degrees of freedom
## AIC: 515.66
## 
## Number of Fisher Scoring iterations: 4
# Do TumorSizeMM, TumorGrade, and TumorStage significantly increase the
# probability of cancer recurrence (Recurrence)?

# step 1: store the outcome and the stage column as factors
breast[c("Recurrence", "TumorStage")] <- lapply(
  breast[c("Recurrence", "TumorStage")],
  as.factor
)

# step 2: fit a logistic regression of recurrence on the three predictors
logistical_model_2 <- glm(
  formula = Recurrence ~ TumorSizeMM + TumorGrade + TumorStage,
  family = "binomial",
  data = breast
)

# step 3: show the coefficient table and deviance statistics
summary(logistical_model_2)
## 
## Call:
## glm(formula = Recurrence ~ TumorSizeMM + TumorGrade + TumorStage, 
##     family = "binomial", data = breast)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)         -1.182264   0.384237  -3.077  0.00209 **
## TumorSizeMM          0.003001   0.003226   0.930  0.35213   
## TumorGrade          -0.071120   0.144347  -0.493  0.62222   
## TumorStageStage II   0.120864   0.246049   0.491  0.62327   
## TumorStageStage III  0.285947   0.283590   1.008  0.31330   
## TumorStageStage IV  -0.449227   0.408795  -1.099  0.27181   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 570.95  on 499  degrees of freedom
## Residual deviance: 566.51  on 494  degrees of freedom
## AIC: 578.51
## 
## Number of Fisher Scoring iterations: 4
# Can a logistic regression model accurately classify malignant vs
# non-malignant tumors using BMI, Age, TumorStage, and HER2Status?

# step 1: store HER2 status as a factor
breast$HER2Status <- as.factor(breast$HER2Status)

# step 2: fit the logistic regression model
logistical_model_3 <- glm(
  IsMalignant ~ BMI + Age + TumorStage + HER2Status,
  data = breast,
  family = "binomial"
)

# step 3: measure how well the model classifies the rows it was fitted on.
# The coefficient summary alone does not answer the accuracy question.
# fitted() gives the predicted probability for each row used in the fit, and
# logistical_model_3$y holds the matching 0/1 response. Cases are classified
# as positive when the predicted probability exceeds 0.5.
predicted_class_3 <- as.integer(fitted(logistical_model_3) > 0.5)
observed_class_3 <- logistical_model_3$y
confusion_matrix_3 <- table(
  observed = observed_class_3,
  predicted = predicted_class_3
)
print(confusion_matrix_3)

# In-sample accuracy is optimistic; compare it with the share of the larger
# class, which is what always predicting that class would achieve.
accuracy_3 <- mean(predicted_class_3 == observed_class_3)
baseline_3 <- max(mean(observed_class_3), 1 - mean(observed_class_3))
print(c(accuracy = accuracy_3, majority_baseline = baseline_3))

# step 4: print summary of results
summary(logistical_model_3)
## 
## Call:
## glm(formula = IsMalignant ~ BMI + Age + TumorStage + HER2Status, 
##     family = "binomial", data = breast)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)  
## (Intercept)          1.178526   0.632724   1.863   0.0625 .
## BMI                  0.017888   0.017848   1.002   0.3162  
## Age                 -0.002821   0.005775  -0.489   0.6252  
## TumorStageStage II  -0.185081   0.272090  -0.680   0.4964  
## TumorStageStage III -0.097347   0.326640  -0.298   0.7657  
## TumorStageStage IV  -0.593963   0.367731  -1.615   0.1063  
## HER2StatusPositive  -0.175377   0.262470  -0.668   0.5040  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.61  on 499  degrees of freedom
## Residual deviance: 504.10  on 493  degrees of freedom
## AIC: 518.1
## 
## Number of Fisher Scoring iterations: 4
# Is the mean BMI significantly different between malignant and non-malignant
# cases?

# Two-sample t-test of BMI split by malignancy status.
t_result <- t.test(
  formula = BMI ~ IsMalignant,
  data = breast
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  BMI by IsMalignant
## t = -1.1178, df = 161.05, p-value = 0.2653
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -2.1194447  0.5873276
## sample estimates:
## mean in group 0 mean in group 1 
##        28.48835        29.25441
# Do patients who experience recurrence have significantly larger tumors
# compared to those who do not?

# Two-sample t-test of tumor size split by recurrence status.
t_result2 <- t.test(
  formula = TumorSizeMM ~ Recurrence,
  data = breast
)
print(t_result2)
## 
##  Welch Two Sample t-test
## 
## data:  TumorSizeMM by Recurrence
## t = -0.90901, df = 218.51, p-value = 0.3643
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -9.540472  3.517755
## sample estimates:
## mean in group 0 mean in group 1 
##        63.10027        66.11163
# Is there a significant difference in age between patients with positive
# vs negative HER2 status?

# Two-sample t-test of age split by HER2 status.
t_result3 <- t.test(
  formula = Age ~ HER2Status,
  data = breast
)
print(t_result3)
## 
##  Welch Two Sample t-test
## 
## data:  Age by HER2Status
## t = -0.64252, df = 172.45, p-value = 0.5214
## alternative hypothesis: true difference in means between group Negative and group Positive is not equal to 0
## 95 percent confidence interval:
##  -5.481141  2.789008
## sample estimates:
## mean in group Negative mean in group Positive 
##               52.00256               53.34862