Installed packages include dplyr, magrittr, corrplot, ggplot2, factoextra, MASS, Hmisc, plotly, dbscan, cluster
Question 1
1.1. Partitioning the data into training(90%) and test(10%) subsets.
# Read the comma-separated wdbc.data file (no header row) and attach column
# names: ID, diagnosis, then ten measurements in three variants (unsuffixed,
# "_SE" and "_worst").
measurement_names <- c("radius", "texture", "perimeter", "area", "smoothness",
                       "compactness", "concavity", "concave_points",
                       "symmetry", "fractal_dimension")
bcw <- read.table('C:\\Users\\katie\\OneDrive\\Desktop\\a2\\wdbc.data',
                  header = FALSE, sep = ",")
names(bcw) <- c("ID_number", "diagnosis", measurement_names,
                paste0(measurement_names, "_SE"),
                paste0(measurement_names, "_worst"))
# Count missing values across the whole table
sum(is.na(bcw))
## [1] 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
# Reproducible 90/10 split: draw 90% of the rows at random for training and
# keep every row whose ID_number was not drawn as the test subset.
set.seed(2)
train_bcw <- dplyr::sample_frac(bcw, 0.9)
test_bcw <- dplyr::anti_join(x = bcw, y = train_bcw, by = "ID_number")
print(c(nrow(train_bcw), nrow(test_bcw)))
## [1] 512 57
#inspecting the column types of the training subset before applying a logistic regression to it (using ALL numeric features as directed)
str(train_bcw)
## 'data.frame': 512 obs. of 32 variables:
## $ ID_number : int 905686 903483 8910988 898690 87880 901549 891936 91504 911320501 911916 ...
## $ diagnosis : chr "B" "B" "M" "B" ...
## $ radius : num 11.89 8.73 21.75 11.47 13.81 ...
## $ texture : num 21.2 16.8 21 16 23.8 ...
## $ perimeter : num 76.4 55.3 147.3 73 91.6 ...
## $ area : num 434 234 1491 403 598 ...
## $ smoothness : num 0.0977 0.1039 0.094 0.0908 0.1323 ...
## $ compactness : num 0.0812 0.0743 0.1961 0.0589 0.1768 ...
## $ concavity : num 0.0255 0 0.2195 0.0259 0.1558 ...
## $ concave_points : num 0.0218 0 0.1088 0.0232 0.0918 ...
## $ symmetry : num 0.202 0.199 0.172 0.163 0.225 ...
## $ fractal_dimension : num 0.0629 0.071 0.0619 0.0637 0.0742 ...
## $ radius_SE : num 0.275 0.517 1.167 0.171 0.565 ...
## $ texture_SE : num 1.203 2.079 1.352 0.761 1.93 ...
## $ perimeter_SE : num 1.93 3.17 8.87 1.09 3.91 ...
## $ area_SE : num 19.5 28.9 156.8 12.2 52.7 ...
## $ smoothness_SE : num 0.00989 0.01582 0.00569 0.00919 0.00882 ...
## $ compactness_SE : num 0.03053 0.01966 0.0496 0.00855 0.03108 ...
## $ concavity_SE : num 0.0163 0 0.0633 0.0094 0.0311 ...
## $ concave_points_SE : num 0.00928 0 0.01561 0.00632 0.01291 ...
## $ symmetry_SE : num 0.0226 0.0186 0.0192 0.0175 0.02 ...
## $ fractal_dimension_SE : num 0.00227 0.00674 0.00461 0.00301 0.00451 ...
## $ radius_worst : num 13.1 10.2 28.2 12.5 19.2 ...
## $ texture_worst : num 27.2 22.8 28.2 20.8 41.9 ...
## $ perimeter_worst : num 85.1 64 195.9 79.7 128.5 ...
## $ area_worst : num 523 317 2384 476 1153 ...
## $ smoothness_worst : num 0.143 0.146 0.127 0.153 0.223 ...
## $ compactness_worst : num 0.219 0.131 0.472 0.112 0.521 ...
## $ concavity_worst : num 0.1164 0 0.5807 0.0982 0.4646 ...
## $ concave_points_worst : num 0.0826 0 0.1841 0.0655 0.2013 ...
## $ symmetry_worst : num 0.307 0.244 0.283 0.285 0.443 ...
## $ fractal_dimension_worst: num 0.0735 0.0887 0.0886 0.0876 0.1086 ...
# Recode the outcome: "M" becomes 1 and "B" becomes 0, then store it as numeric
diagnosis_train <- train_bcw$diagnosis
diagnosis_train <- replace(diagnosis_train, diagnosis_train == 'M', 1)
diagnosis_train <- replace(diagnosis_train, diagnosis_train == 'B', 0)
train_bcw$diagnosis <- as.numeric(as.character(diagnosis_train))
# Confirm the new column type
str(train_bcw)
## 'data.frame': 512 obs. of 32 variables:
## $ ID_number : int 905686 903483 8910988 898690 87880 901549 891936 91504 911320501 911916 ...
## $ diagnosis : num 0 0 1 0 1 0 0 1 0 1 ...
## $ radius : num 11.89 8.73 21.75 11.47 13.81 ...
## $ texture : num 21.2 16.8 21 16 23.8 ...
## $ perimeter : num 76.4 55.3 147.3 73 91.6 ...
## $ area : num 434 234 1491 403 598 ...
## $ smoothness : num 0.0977 0.1039 0.094 0.0908 0.1323 ...
## $ compactness : num 0.0812 0.0743 0.1961 0.0589 0.1768 ...
## $ concavity : num 0.0255 0 0.2195 0.0259 0.1558 ...
## $ concave_points : num 0.0218 0 0.1088 0.0232 0.0918 ...
## $ symmetry : num 0.202 0.199 0.172 0.163 0.225 ...
## $ fractal_dimension : num 0.0629 0.071 0.0619 0.0637 0.0742 ...
## $ radius_SE : num 0.275 0.517 1.167 0.171 0.565 ...
## $ texture_SE : num 1.203 2.079 1.352 0.761 1.93 ...
## $ perimeter_SE : num 1.93 3.17 8.87 1.09 3.91 ...
## $ area_SE : num 19.5 28.9 156.8 12.2 52.7 ...
## $ smoothness_SE : num 0.00989 0.01582 0.00569 0.00919 0.00882 ...
## $ compactness_SE : num 0.03053 0.01966 0.0496 0.00855 0.03108 ...
## $ concavity_SE : num 0.0163 0 0.0633 0.0094 0.0311 ...
## $ concave_points_SE : num 0.00928 0 0.01561 0.00632 0.01291 ...
## $ symmetry_SE : num 0.0226 0.0186 0.0192 0.0175 0.02 ...
## $ fractal_dimension_SE : num 0.00227 0.00674 0.00461 0.00301 0.00451 ...
## $ radius_worst : num 13.1 10.2 28.2 12.5 19.2 ...
## $ texture_worst : num 27.2 22.8 28.2 20.8 41.9 ...
## $ perimeter_worst : num 85.1 64 195.9 79.7 128.5 ...
## $ area_worst : num 523 317 2384 476 1153 ...
## $ smoothness_worst : num 0.143 0.146 0.127 0.153 0.223 ...
## $ compactness_worst : num 0.219 0.131 0.472 0.112 0.521 ...
## $ concavity_worst : num 0.1164 0 0.5807 0.0982 0.4646 ...
## $ concave_points_worst : num 0.0826 0 0.1841 0.0655 0.2013 ...
## $ symmetry_worst : num 0.307 0.244 0.283 0.285 0.443 ...
## $ fractal_dimension_worst: num 0.0735 0.0887 0.0886 0.0876 0.1086 ...
#all numeric except the diagnosis (excluding ID number)
#logistic regression of diagnosis on every remaining column; [ ,-1] drops the first column (ID_number) so it is not used as a predictor
#NOTE(review): the recorded summary output (residual deviance ~6e-06, 25 Fisher scoring iterations, very large standard errors) suggests (quasi-)complete separation - confirm before relying on the Wald p-values
fit = glm(diagnosis ~ ., data = train_bcw[ ,-1], family = binomial)
summary(fit)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial, data = train_bcw[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.960e-04 -2.000e-08 -2.000e-08 2.000e-08 7.859e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.026e+03 8.242e+04 -0.025 0.980
## radius -2.068e+03 9.280e+04 -0.022 0.982
## texture 6.749e-01 2.130e+03 0.000 1.000
## perimeter 1.311e+02 1.407e+04 0.009 0.993
## area 1.194e+01 3.380e+02 0.035 0.972
## smoothness 3.147e+04 9.660e+05 0.033 0.974
## compactness -1.730e+04 4.423e+05 -0.039 0.969
## concavity 9.102e+03 2.434e+05 0.037 0.970
## concave_points 2.630e+03 4.394e+05 0.006 0.995
## symmetry -9.598e+03 3.159e+05 -0.030 0.976
## fractal_dimension 1.542e+04 1.087e+06 0.014 0.989
## radius_SE 1.256e+03 1.166e+05 0.011 0.991
## texture_SE -1.951e+02 7.621e+03 -0.026 0.980
## perimeter_SE -2.841e+02 1.121e+04 -0.025 0.980
## area_SE 2.951e+01 1.264e+03 0.023 0.981
## smoothness_SE 1.630e+04 2.213e+06 0.007 0.994
## compactness_SE 4.151e+04 1.109e+06 0.037 0.970
## concavity_SE -2.469e+04 5.967e+05 -0.041 0.967
## concave_points_SE 1.007e+05 2.497e+06 0.040 0.968
## symmetry_SE -2.984e+04 7.257e+05 -0.041 0.967
## fractal_dimension_SE -4.284e+05 1.189e+07 -0.036 0.971
## radius_worst 6.081e+02 2.796e+04 0.022 0.983
## texture_worst 4.710e+01 1.526e+03 0.031 0.975
## perimeter_worst -7.280e+00 3.441e+03 -0.002 0.998
## area_worst -3.705e+00 1.635e+02 -0.023 0.982
## smoothness_worst -1.245e+04 6.421e+05 -0.019 0.985
## compactness_worst -3.407e+03 1.751e+05 -0.019 0.984
## concavity_worst 2.341e+03 9.424e+04 0.025 0.980
## concave_points_worst 1.854e+03 2.146e+05 0.009 0.993
## symmetry_worst 7.230e+03 1.926e+05 0.038 0.970
## fractal_dimension_worst 3.280e+04 1.114e+06 0.029 0.977
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.6638e+02 on 511 degrees of freedom
## Residual deviance: 6.4283e-06 on 481 degrees of freedom
## AIC: 62
##
## Number of Fisher Scoring iterations: 25
#95% confidence intervals for the coefficients of the logistic regression 'fit'
confint(fit, level = 0.95)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -4.480561e+03 427.734834
## radius -5.156934e+03 1021.453768
## texture -6.650505e+01 67.235403
## perimeter -3.489957e+02 591.421774
## area 5.095120e-01 23.504736
## smoothness 2.331683e+03 60851.274361
## compactness -3.457030e+04 -510.925433
## concavity 2.089776e+03 16403.786641
## concave_points -1.259270e+04 17361.567288
## symmetry -2.077623e+04 1714.260362
## fractal_dimension -2.285017e+04 52833.706405
## radius_SE -2.947685e+03 5566.869868
## texture_SE -4.463987e+02 61.492711
## perimeter_SE -6.738767e+02 105.605480
## area_SE -1.444827e+01 73.463044
## smoothness_SE -5.736061e+04 89965.811099
## compactness_SE 1.790463e+03 80754.523795
## concavity_SE -4.717151e+04 -2503.746922
## concave_points_SE 2.403395e+04 175489.919969
## symmetry_SE -5.363709e+04 -6288.687522
## fractal_dimension_SE -8.620285e+05 5277.469689
## radius_worst -3.605307e+02 1545.461283
## texture_worst -3.695227e+00 97.900122
## perimeter_worst -1.236746e+02 110.387392
## area_worst -8.860140e+00 1.402701
## smoothness_worst -3.320314e+04 7904.602149
## compactness_worst -9.065329e+03 2144.224575
## concavity_worst -6.308259e+02 5285.506094
## concave_points_worst -4.913297e+03 8559.412724
## symmetry_worst 1.154810e+03 13249.341804
## fractal_dimension_worst -3.735476e+03 68962.734768
1.2. From the fitted generalised linear model above, the diagnosis of a patient’s tumour as malignant (the positive class, as opposed to benign, the negative class) is not significantly associated with any of the predictor features included in the data, including texture and concavity (p-value [Pr(>|z|)] > 0.05). Indeed, the coefficient estimates for both texture and concavity exhibited relatively large variation (texture: SE ± 2,130; concavity: SE ± 243,400), which is also reflected in the relatively wide calculated confidence intervals (texture: 95% CI = -66.51 to 67.24; concavity: 95% CI = 2,089.78 to 16,403.79). Note, however, that the near-zero residual deviance (6.4e-06), the 25 Fisher scoring iterations and the very large standard errors indicate that the model separates the training classes almost perfectly, so the Wald p-values are unreliable and should be interpreted with caution. Therefore, based on this model alone, a tumour’s pathological texture or level of concavity cannot be shown to be indicative of tumour malignancy.
1.3 Using descriptive methods to ascertain inter-feature relationships in the original data set. This will be determined by first calculating the level of covariance between variables and then calculating the resulting correlation coefficients.
# Back on the full data set (bcw): recode the diagnosis response so that
# "M" becomes 1 and "B" becomes 0, then store it as numeric
diagnosis_all <- bcw$diagnosis
diagnosis_all <- replace(diagnosis_all, diagnosis_all == 'M', 1)
diagnosis_all <- replace(diagnosis_all, diagnosis_all == 'B', 0)
bcw$diagnosis <- as.numeric(as.character(diagnosis_all))
# Confirm the new column type
str(bcw)
## 'data.frame': 569 obs. of 32 variables:
## $ ID_number : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : num 1 1 1 1 1 1 1 1 1 1 ...
## $ radius : num 18 20.6 19.7 11.4 20.3 ...
## $ texture : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter : num 122.8 132.9 130 77.6 135.1 ...
## $ area : num 1001 1326 1203 386 1297 ...
## $ smoothness : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave_points : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_SE : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_SE : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_SE : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_SE : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_SE : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_SE : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_SE : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave_points_SE : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_SE : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_SE : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave_points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of all 30 numeric features (columns 3 to 32, i.e.
# everything except ID_number and diagnosis). The previous 3:30 range silently
# dropped symmetry_worst and fractal_dimension_worst, unlike the PCA input.
bcw_cormat <- cor(bcw[, 3:32])
# Preview the first six rows, rounded to two decimals
head(round(bcw_cormat, 2))
## radius texture perimeter area smoothness compactness concavity
## radius 1.00 0.32 1.00 0.99 0.17 0.51 0.68
## texture 0.32 1.00 0.33 0.32 -0.02 0.24 0.30
## perimeter 1.00 0.33 1.00 0.99 0.21 0.56 0.72
## area 0.99 0.32 0.99 1.00 0.18 0.50 0.69
## smoothness 0.17 -0.02 0.21 0.18 1.00 0.66 0.52
## compactness 0.51 0.24 0.56 0.50 0.66 1.00 0.88
## concave_points symmetry fractal_dimension radius_SE texture_SE
## radius 0.82 0.15 -0.31 0.68 -0.10
## texture 0.29 0.07 -0.08 0.28 0.39
## perimeter 0.85 0.18 -0.26 0.69 -0.09
## area 0.82 0.15 -0.28 0.73 -0.07
## smoothness 0.55 0.56 0.58 0.30 0.07
## compactness 0.83 0.60 0.57 0.50 0.05
## perimeter_SE area_SE smoothness_SE compactness_SE concavity_SE
## radius 0.67 0.74 -0.22 0.21 0.19
## texture 0.28 0.26 0.01 0.19 0.14
## perimeter 0.69 0.74 -0.20 0.25 0.23
## area 0.73 0.80 -0.17 0.21 0.21
## smoothness 0.30 0.25 0.33 0.32 0.25
## compactness 0.55 0.46 0.14 0.74 0.57
## concave_points_SE symmetry_SE fractal_dimension_SE radius_worst
## radius 0.38 -0.10 -0.04 0.97
## texture 0.16 0.01 0.05 0.35
## perimeter 0.41 -0.08 -0.01 0.97
## area 0.37 -0.07 -0.02 0.96
## smoothness 0.38 0.20 0.28 0.21
## compactness 0.64 0.23 0.51 0.54
## texture_worst perimeter_worst area_worst smoothness_worst
## radius 0.30 0.97 0.94 0.12
## texture 0.91 0.36 0.34 0.08
## perimeter 0.30 0.97 0.94 0.15
## area 0.29 0.96 0.96 0.12
## smoothness 0.04 0.24 0.21 0.81
## compactness 0.25 0.59 0.51 0.57
## compactness_worst concavity_worst concave_points_worst
## radius 0.41 0.53 0.74
## texture 0.28 0.30 0.30
## perimeter 0.46 0.56 0.77
## area 0.39 0.51 0.72
## smoothness 0.47 0.43 0.50
## compactness 0.87 0.82 0.82
# Lower-triangle correlation plot. insig = "blank" only takes effect when a
# matrix of p-values is supplied through p.mat, so the p-values are computed
# first with cor.mtest() for exactly the columns present in the correlation
# matrix; correlations with p > 0.05 are then genuinely left blank.
bcw_cor_pvalues <- corrplot::cor.mtest(bcw[, colnames(bcw_cormat)],
                                       conf.level = 0.95)$p
corrplot(bcw_cormat, method = "circle", type = "lower",
         p.mat = bcw_cor_pvalues, sig.level = 0.05, insig = "blank",
         tl.pos = "lt", tl.cex = 0.5)
Overall, many of the variables present in this dataset exhibit a substantial level of correlation with each other. Because the dataset also includes the standard error (SE) of each variable, the pairing of each variable with its own SE value in a way showcases the covariation that underpins this correlation. Moreover, an interesting relationship to review is the association between each feature and the corresponding ‘worst’ value for each variable (i.e. the average of the top three). As such, this exploratory analysis indicates which feature variable(s) tend to be associated with certain types of tumours. For example, it would be intuitive to assume that tumours which exhibit excessive growth (i.e. the ‘worst’ in terms of space-based variables [perimeter, area and radius]) would in turn be presented here with a high level of correlation {refer to appendix 1.3 for exact values}. Indeed, data obtained for the ‘worst’ tumour perimeter and area display strong positive correlations with the values obtained for radius (corr. = 0.97 and 0.94 respectively). Furthermore, concavity and concave_points are also highly correlated. To a lesser extent, there was some positive correlation between tumour smoothness, compactness and concavity (corr. ranging between 0.52 and 0.88). Other feature variables such as texture and smoothness demonstrate some low-level/minor association (positive except for fractal_dimension) across the board.
1.4 Ambitiously, the data was obtained in the hope of identifying/classifying the risk factors associated with malignant tumours for the purpose of early detection. Given the association between many of the feature variables present in this dataset (as discussed above), it would be wise to implement a Principal Component Analysis as opposed to other unsupervised learning techniques such as clustering. As such, the application of dimensionality reduction will eliminate unnecessary noise, reduce the influence of covariation and thus streamline any subsequent clustering process.
# PCA on the 30 numeric features (columns 3 to 32; ID_number and diagnosis are
# excluded), with each variable standardised so that all are compared on an
# equal footing. The argument is spelled in full ("scale.") instead of relying
# on partial matching of "scale".
bcw_prcomp <- prcomp(bcw[, 3:32], scale. = TRUE)
# for a full table of eigenvectors, please refer to appendix 1.4a
summary(bcw_prcomp)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
# Proportion of Variance Explained [refer to appendix 1.4b]: each component's
# variance (sdev squared) divided by the total variance
pc_variance <- bcw_prcomp$sdev^2
PVE <- pc_variance / sum(pc_variance)
round(PVE, digits = 2)
## [1] 0.44 0.19 0.09 0.07 0.05 0.04 0.02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.00
## [16] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
#majority of the variable information i.e. maximum variation lies within PC1(44%), PC2(19%) and PC3(9%)[73% total; PC1 and PC2 alone account for 63%]
#visualising with elbow plot
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_eig(bcw_prcomp)
As seen above, almost two thirds of the variation observed in the original dataset can be accounted for within the first two components calculated in the resulting PCA. As such, PC1, PC2 and PC3 will be used to then visualize the distribution of tumour diagnoses as separate classes(Malignant Vs Benign).
#preparing the data for visualisation on the first three principal components (PC1, PC2, PC3)
library(ggplot2)
bcw_predictors <- scale(bcw[ ,3:32])#scaled copy of the 30 numeric feature columns (the same columns used for PCA)
bcw_class <- bcw[ ,2] # Class labels (tumour diagnosis, already recoded to 1 = M, 0 = B)
bcw_class <- ifelse(bcw_class == 1,0,1) # Flips the coding: diagnosis 1 (M) becomes 0, anything else (B) becomes 1
#NOTE(review): the previous comment here referred to classes "2"-"5", which do not occur in diagnosis - confirm the flipped coding is intended
bcwPCA_3D = as.data.frame(cbind(bcw$diagnosis, bcw_prcomp$x[ , 1:3]))#diagnosis (unnamed, so it becomes column V1) alongside the PC1-PC3 scores
#3D plot using plotly
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# 3D scatter plot of the first three principal components, coloured by
# diagnosis (column V1). The trace type and mode are stated explicitly so that
# plotly does not have to guess them (and no longer emits the "No trace type
# specified" / "No scatter3d mode specifed" messages).
plot_ly(x = bcwPCA_3D$PC1, y = bcwPCA_3D$PC2, z = bcwPCA_3D$PC3,
        color = bcwPCA_3D$V1, size = 1.5,
        type = "scatter3d", mode = "markers")
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
#M tumours are yellow
#1.5.a correlation matrix of the principal-component scores, to justify that no correlation remains
bcw_corPCmat <- cor(bcw_prcomp$x)
# insig = 'p-value' was dropped: corrplot ignores insig unless a p.mat matrix
# of p-values is supplied, so it had no effect on this plot
corrplot(bcw_corPCmat, method = "circle", type = "lower",
         tl.pos = "lt", tl.cex = 0.5)
#hence, no correlations remain between the components post-PCA (principal components are uncorrelated by construction)
In summary, a PCA algorithm was applied to the original data set not only to reduce the influence of inter-variable relationships/correlation but also to reduce the data set dimensions to the components carrying most of the variation. In doing so, the principal components selected (PC1, PC2 and PC3) explain almost three quarters (approximately 73%) of the variation present in the original data, with PC1 and PC2 alone accounting for almost two thirds. When visualizing the subset of data obtained from the principal component analysis (using scaled data), two distinct clusters can be seen when selecting for each class of tumour diagnosis (M or B). There does seem to be some minor overlap upon further inspection, however.
1.5 Partitioning the data for PC1, PC2 and PC3 for further analysis
# Combine the original diagnosis with the scores of all 30 principal components
# (all components are kept for now because they are also required for Q6)
pc_scores <- cbind.data.frame(bcw$diagnosis, bcw_prcomp$x)
bcw_PC1.2.3 <- setNames(pc_scores, c("diagnosis", paste0("PC", 1:30)))
# 1.5.b reproducible 90/10 split of the component data: 90% of the rows are
# drawn at random for training, the unmatched rows form the test subset
set.seed(3)
train_bcw_PCA <- dplyr::sample_frac(bcw_PC1.2.3, 0.9)
# by = NULL joins on every shared column, i.e. whole rows are compared
test_bcw_PCA <- dplyr::anti_join(x = bcw_PC1.2.3, y = train_bcw_PCA, by = NULL)
## Joining, by = c("diagnosis", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7",
## "PC8", "PC9", "PC10", "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17",
## "PC18", "PC19", "PC20", "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27",
## "PC28", "PC29", "PC30")
# Row counts of the PCA-based training and test subsets (previously this
# reported the sizes of the raw-data split, train_bcw/test_bcw, by mistake)
(c(nrow(train_bcw_PCA), nrow(test_bcw_PCA)))
## [1] 512 57
# Logistic regression of diagnosis on the first three principal components,
# fitted to the PCA training subset
fit_PCAlog <- glm(formula = diagnosis ~ PC1 + PC2 + PC3, family = binomial,
                  data = train_bcw_PCA)
print(summary(fit_PCAlog))
##
## Call:
## glm(formula = diagnosis ~ PC1 + PC2 + PC3, family = binomial,
## data = train_bcw_PCA)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8173 -0.0995 -0.0148 0.0041 3.4361
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5666 0.2532 -2.237 0.0253 *
## PC1 -2.3288 0.3210 -7.254 4.04e-13 ***
## PC2 1.3778 0.2395 5.753 8.77e-09 ***
## PC3 -0.4627 0.1439 -3.215 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674.30 on 511 degrees of freedom
## Residual deviance: 111.62 on 508 degrees of freedom
## AIC: 119.62
##
## Number of Fisher Scoring iterations: 9
#1.5.c Implementing QDA instead, for comparison
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:plotly':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
# Quadratic discriminant analysis on the same three components. The
# family = binomial argument was removed: it belongs to glm(), is not a qda()
# argument, and was silently ignored.
fit_PCAqda <- qda(diagnosis ~ PC1 + PC2 + PC3, data = train_bcw_PCA)
# Prior probabilities of each class used by the model
(fit_PCAqda$prior)
## 0 1
## 0.6308594 0.3691406
#Thus, prior prob. of M tumour is 37% Vs 63% for B
1.6 Assessing the accuracy of each model in predicting tumour diagnosis for the 57 ‘test’ patients
# Predicted probability of an M diagnosis from the logistic model for each of
# the 57 'test' patients
pred_PCAlog <- predict(fit_PCAlog, newdata = test_bcw_PCA, type = "response")
print(round(pred_PCAlog, 3))
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 1.000 1.000 0.999 1.000 0.001 0.000 0.000 0.993 0.010 1.000 0.031 0.996 0.520
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 0.001 1.000 0.563 0.000 0.006 0.000 0.000 0.000 0.998 0.002 0.974 1.000 1.000
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 0.829 0.589 0.902 1.000 1.000 0.999 0.984 1.000 0.000 0.001 0.000 0.004 0.000
## 40 41 42 43 44 45 46 47 48 49 50 51 52
## 0.043 0.000 0.000 0.648 0.008 0.013 0.006 0.115 0.819 1.000 0.344 0.270 0.065
## 53 54 55 56 57
## 0.000 0.001 0.007 0.128 1.000
# Confusion matrix to assess accuracy: rows = observed diagnosis,
# columns = whether the predicted probability exceeds 0.5
table_mat <- table(test_bcw_PCA$diagnosis, pred_PCAlog > 0.5)
print(table_mat)
##
## FALSE TRUE
## 0 31 3
## 1 0 23
# Accuracy of the glm (using PCA-derived variables) in predicting tumour
# diagnosis: the share of test rows whose predicted class (probability > 0.5)
# equals the observed diagnosis. It is computed directly from the predictions
# rather than from sum(diag(table_mat)), which silently gives a wrong answer
# when the confusion table is not 2 x 2 (e.g. if every prediction falls on the
# same side of the threshold).
(accuracy_test <- mean((pred_PCAlog > 0.5) == (test_bcw_PCA$diagnosis == 1)))
## [1] 0.9473684
#Thus, the glm model is 95% accurate (0.947)
#using qda model to predict an M diagnosis for each of the 57 'test' patients
#(predict() for qda objects has no 'type' argument, so the ignored type = 'response' was removed)
pred_PCAqda <- predict(fit_PCAqda, test_bcw_PCA)
#posterior class probabilities for the first six test rows
(head(pred_PCAqda$posterior))
## 0 1
## 1 9.450608e-11 1.000000000
## 2 8.093389e-05 0.999919066
## 3 1.964840e-03 0.998035160
## 4 2.701703e-07 0.999999730
## 5 9.928724e-01 0.007127649
## 6 8.663413e-01 0.133658748
#predicted class label for each test row (levels 0 and 1, following the 0 = B / 1 = M recoding of diagnosis)
(pred_PCAqda$class)
## [1] 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1
## Levels: 0 1
#confusion matrix to assess for accuracy; rows = observed diagnosis, columns = predicted class
#(same orientation as the logistic-regression confusion matrix table_mat, which also puts the observed diagnosis in rows)
(table_mat_qda <- table(test_bcw_PCA$diagnosis, pred_PCAqda$class))
##
## 0 1
## 0 33 1
## 1 1 22
# Accuracy of the qda (using PCA-derived variables) in predicting tumour
# diagnosis: the share of test rows whose predicted class equals the observed
# diagnosis. Both sides are compared as text ("0"/"1") because the predicted
# class is a factor while diagnosis is numeric. Computing this directly avoids
# sum(diag(...)), which is only correct for a square confusion table.
(accuracy_test_qda <- mean(
  as.character(pred_PCAqda$class) == as.character(test_bcw_PCA$diagnosis)
))
## [1] 0.9649123
#Thus this qda model was 96% accurate in predicting the tumour diagnosis of the 57 test patients
#compared to the accuracy of the logistic regression (labelled 'fit') performed for the original data as in section 1.1. above.
#NOTE(review): the two test subsets come from different random splits (set.seed(2) vs set.seed(3)), so the accuracies are not measured on the same patients
#recoding the test diagnosis to numeric 0/1 (M = 1, B = 0), in the same way as was done for the training subset
test_bcw$diagnosis[test_bcw$diagnosis == 'M'] <- 1
test_bcw$diagnosis[test_bcw$diagnosis == 'B'] <- 0
test_bcw$diagnosis <- as.numeric(as.character(test_bcw$diagnosis))
#predicted probability of an M diagnosis from the full-feature model for each test row
pred_PCAlogall <- predict(fit, test_bcw, type = 'response')
#confusion matrix: rows = observed diagnosis, columns = whether the predicted probability exceeds 0.5
(table_mat_logall <- table(test_bcw$diagnosis, pred_PCAlogall > 0.5))
##
## FALSE TRUE
## 0 26 1
## 1 0 30
# Accuracy of the full-feature logistic regression on its test subset: the
# share of rows whose predicted class (probability > 0.5) equals the observed
# diagnosis. The comparison with 1 works whether diagnosis is stored as numeric
# or as the text "0"/"1". Computing this directly avoids sum(diag(...)), which
# is only correct for a square confusion table.
(accuracy_test_logall <- mean((pred_PCAlogall > 0.5) == (test_bcw$diagnosis == 1)))
## [1] 0.9824561
#Thus this log regression model performed on the original data set is 98% accurate in predicting M tumour diagnosis
While the vast array of features recorded for each patient provides valuable information about tumour pathology, it does so at the cost of efficiency for any subsequent discriminating/classifying machine learning algorithm. Hence the benefit of first discerning the principal components of the dataset, which reduces its dimensionality and thus eliminates the influence of any ‘irrelevant’ data. Moreover, the scaling process embedded within PCA makes it possible to apply discriminant analyses to the principal dataset, owing to the relatively comparable distributions (mean = 0, stdev. = 1). However, the type of distribution of the resulting principal components does determine the suitability of the subsequent discriminant analysis. In the analysis performed above, a quadratic discriminant analysis (QDA) was performed to mitigate this issue. The QDA performed was shown to be 96% accurate in diagnosing tumour malignancy.
Question 2
2.a.
There are several methods for hierarchical clustering, which are implemented in either a top-down (divisive) or bottom-up (agglomerative) approach. The ‘hierarchy’ which underpins hierarchical clustering is due to the systematic way in which data points are grouped together - using distance/dissimilarity - in a pair-wise, step-by-step fashion. This process continues until all data points have been assigned a cluster (k). Agglomerative hierarchical clustering (bottom-up) begins by assigning all data points as separate independent homogeneous clusters (termed the trivial clustering solution, k = N). The subsequent steps of the algorithm then aim to group ‘similar’ data points or ‘clusters’ together based on the relative level of distance or similarity (or dissimilarity). Conversely, the divisive (top-down) clustering technique begins with one large cluster encompassing all data points and then separates this one larger cluster into smaller clusters, until all data points have once again been assigned separate independent ‘clusters’ (again k = N as part of the trivial clustering solution). Thus both types of hierarchical clustering approaches end up with a final number of partitions (P) that is 1 less than the number of original observations ($P = N - 1$), or rather, at any given level the number of partitions is one less than the number of clusters present ($P = k - 1$).
The method for determining distance does vary between implementations of these techniques, though typically Euclidean distance is used. Moreover, the criterion set for deciding which distance to measure is referred to as linkage. Two common types of linkage used in hierarchical clustering are complete and single linkage. Single linkage refers to grouping similar clusters together based on the shortest distance between two separate data points belonging to each independent cluster. By contrast, complete linkage refers to a similar process, with the exception that the grouping criterion is determined by the maximum distance obtained between the two independent cluster members. The difference between the implementation of these two clustering techniques often influences the overall variance of the final clusters. Single linkage often produces very ‘tightly’ knitted clusters while complete linkage produces ‘loosely’-fitted clusters by comparison. Single linkage clustering is prone to increased variation depending on the randomization of training data sets, since artificial chains can develop, thereby influencing the boundary of each cluster. Conversely, while complete linkage is less susceptible to minor variations in the data set, the presence of outliers is more likely to warp the process.
Apart from complete and single linkage, other forms of hierarchical clustering techniques include average linkage and Ward’s algorithms. Such techniques form a compromise between single and complete linkage for an optimal balance between accuracy and computational complexity. Specifically, average linkage uses the average distance obtained between all members of each cluster to determine groupings while Ward’s algorithm utilizes the minimal increase sum-of-squares for calculating distance.
2.b. Complete linkage Implementation of complete linkage to derive a dendrogram plot by applying an agglomerative hierarchical clustering approach using an example dissimilarity matrix. Taking note that this matrix is the exact same for the upper and lower panels, with the diagonal being 0 as these points are being compared to themselves.
When creating a dendrogram using complete linkage, the maximum distance between data points/ clusters is taken as the criteria to discern the most similar grouping (i.e. lowest dissimilarity overall). The most similar clusters are then merged at the next level and a new dissimilarity matrix is then formed once again taking the maximum distance between cluster members. Given the following dissimilarity matrix for 5 separate data points, continuing with this process would result in the following order of clustering:
For example, the first grouping would be between data points 2 and 3 as this pairing exhibits the smallest relative level of dissimilarity (0.1). Data points 2 and 3 have now become a cluster labelled as 23. This pairing is the first to be seen on the dendrogram. The distance that was used to group the two data points (or clusters further into the process) is represented as the height of the dendrogram (refer to Fig 2.1).
2 and 3 = 23 - dissimilarity of 0.1 To follow, cluster 23 is then compared to the remaining data points 1, 4 and 5. For each comparison, the maximum distance is taken with respect to members 2 and 3 in cluster 23. A new dissimilarity matrix is then compiled. At this level, the most similar (i.e. lowest dissimilarity) data points/clusters are in fact data points 4 and 5.
4 and 5 = 45 - dissimilarity of 0.3 Further on, Clusters 23 and 45 are now compared back to data point 1 as well as each other. The maximum distance was taken with respect to each cluster member for each comparison. As such, a new dissimilarity matrix was formed. At this level, the most similar (i.e. lowest dissimilarity) data points/clusters are in fact cluster 23 and data point 1.
23 and 1 = dissimilarity of 0.45 Finally, cluster 231 and cluster 45 are now compared to each other at the final level. Here the final height of the dendrogram is equivalent to 0.8 as this is the maximum dissimilarity present between clusters 231 and 45.
231 and 45 = dissimilarity of 0.8 overall
2.c. Single Linkage
Similar to the complete linkage example above, the first step is to determine the least distance between data points i.e. which data points display the least dissimilarity. This process then repeats until all data points/clusters have been grouped (agglomerative hierarchical clustering approach).
Hence, the first grouping would be between data points 2 and 3 as they are the most similar(i.e. lowest dissimilarity).
2 and 3 = 23 - dissimilarity of 0.1
The next step then compares the dissimilarity for cluster 23 with the other data points, thus creating a new dissimilarity matrix (Fig. 2.6.). In contrast to complete linkage however, the minimum dissimilarity between the members of the newly created cluster and the other data points is the criteria used to determine the next cluster grouping.
When comparing members 2 and 3 of cluster 23: overall, it is data point 1 that exhibits the least dissimilarity (0.2) with data point 2 of cluster 23. Hence, cluster 23 and data point 1 constitute the next merger on the dendrogram.
23 and 1 = 231 - dissimilarity of 0.2 Continuing with this process, the order of clustering would be as follows:
4 and 5 = dissimilarity of 0.3 231 and 45 = dissimilarity of 0.35
Question 3
3.a
Initially, a classical k-means algorithm comes to mind in this context, as the overall aim is to determine which genes are associated with acute myeloblastic leukemia (AML) as opposed to acute lymphoblastic leukemia (ALL). ALL is more common than AML in regards to leukemia. AML is also more likely to spread to other parts of the body. The ability to distinguish between genes associated with each sub-type not only aids in early intervention for treatment but also leads the way for researchers to further investigate alternative forms of treatment.
Indeed, it is essential to choose a suitable predictive model that is not so easily influenced by co-variation, due to the complexities of mammalian genomics. In the context of physiology, the ‘outliers’ are often predictors of anomalies rather than ‘mistakes’ in data collection. Hence, an outlier is often an indication of the disease. Thus it is important to choose a clustering technique that takes heed of the presence of outliers. As such, complete linkage as opposed to single linkage agglomerative hierarchical clustering would be applicable in this case. However, to ascertain whether an outlier is a mistake rather than an indication of disease, a density-based outlier detection method can be applied. Additionally, a review of the structure of the Golub et al., Science, 1999 leukemia data set indicates a substantial number of feature variables (n = 1867). Hence, it is quite clear that a principal component analysis would need to be performed in order to reduce the number of dimensions for computational manageability. Moreover, the PCA will also reduce the level of irrelevant noise that may influence the data set. A subsequent hierarchical clustering technique can then be applied post-PCA in order to determine an adequate model for leukemia sub-type classification.
Based on the discussion above, either a PCA in combination with a subsequent agglomerative hierarchical clustering approach using complete linkage can be performed, or a Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) can be performed separately as an alternative. The data set given clearly contains 2 pre-known groups, owing to the presence of a ‘response’ variable, namely leukemia sub-type. However, the nature of genetic-based diseases such as leukemia is such that several gene combinations can equate to a similar pathology. Thus in this context there could be several clusters within the class of each leukemia sub-type (i.e. k > 2). Hence, once again HDBSCAN would be more applicable in this situation as opposed to k-means, which requires k to be defined.
3.b. In order to compare the level of gene expression here, all numeric variables i.e. gene expression for all 72 patients were scaled to a mean of 0 and standard deviation of 1. Whether the data should be scaled comes back to the context of physiology. While a certain level of expression may in turn trigger the desired physiological response, some diseases in the human body only require a threshold of expression to trigger a response. This in contrast to the intuitive covariant manner of expression for numeric data, in that the more RNA present the more severe the resulting disease. The heatmap included below highlights some of these genes which seem to ‘switch on’ synchronously. Additionally, we should not discount the fact that the absence of gene expression may also be indicative of disease. In this case a density-based unsupervised learning technique such as HDBSCAN would be more applicable in detecting this type of genomic /physiological profile.
Moving forward, we assume disease pathology for leukemia is associated with fluctuating mRNA concentrations associated with certain genes. Thus, the data was scaled prior to performing the HDBSCAN algorithm.
# Load the saved leukemia workspace file and print the structure of the
# `leukemia_dat` object it provides
load(file = "C:\\Users\\katie\\Downloads\\leukemia_dat.Rdata")
utils::str(object = leukemia_dat)
## 'data.frame': 72 obs. of 1869 variables:
## $ patient_id: chr "Subject1" "Subject2" "Subject3" "Subject4" ...
## $ type : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gene_1 : num 199 10 33 158 10 67 131 10 10 328 ...
## $ Gene_2 : num 252 101 206 49 70 87 126 70 24 177 ...
## $ Gene_3 : num 206 74 10 31 252 193 10 10 506 183 ...
## $ Gene_4 : num 10 19 19 363 155 325 10 361 284 10 ...
## $ Gene_5 : num 75 182 208 142 32 10 109 10 292 233 ...
## $ Gene_6 : num 10 37 183 45 10 65 43 338 29 10 ...
## $ Gene_7 : num 165 18 238 247 44 39 100 265 106 10 ...
## $ Gene_8 : num 10 10 104 10 10 10 10 10 10 10 ...
## $ Gene_9 : num 215 116 476 155 122 176 58 257 166 155 ...
## $ Gene_10 : num 14538 615 5669 4850 1284 ...
## $ Gene_11 : num 9738 115 3272 2293 2731 ...
## $ Gene_12 : num 8529 1518 3668 2569 316 ...
## $ Gene_13 : num 70 153 66 10 78 10 10 10 10 10 ...
## $ Gene_14 : num 6750 2215 3325 3058 1130 ...
## $ Gene_15 : num 240 86 252 10 28 71 10 10 235 10 ...
## $ Gene_16 : num 10 185 140 10 10 382 10 10 131 10 ...
## $ Gene_17 : num 72 21 10 61 16 85 10 25 10 65 ...
## $ Gene_18 : num 10 10 142 10 237 10 87 10 148 10 ...
## $ Gene_19 : num 378 249 362 266 554 110 312 238 896 229 ...
## $ Gene_20 : num 10 10 10 10 16 10 134 13 10 10 ...
## $ Gene_21 : num 10 10 94 10 10 10 33 10 34 10 ...
## $ Gene_22 : num 87 53 128 112 10 144 65 203 93 24 ...
## $ Gene_23 : num 11 20 12 34 10 10 10 11 10 19 ...
## $ Gene_24 : num 152 104 10 10 10 88 10 10 35 10 ...
## $ Gene_25 : num 146 224 15 10 169 85 10 101 75 10 ...
## $ Gene_26 : num 117 10 10 10 65 40 10 10 10 30 ...
## $ Gene_27 : num 207 10 109 520 10 401 10 188 180 136 ...
## $ Gene_28 : num 10 348 150 135 280 10 176 10 327 89 ...
## $ Gene_29 : num 169 71 92 88 196 10 146 53 132 39 ...
## $ Gene_30 : num 457 10 376 325 221 280 361 740 296 10 ...
## $ Gene_31 : num 10 10 10 193 10 104 248 415 10 62 ...
## $ Gene_32 : num 484 485 10 597 155 501 774 982 10 59 ...
## $ Gene_33 : num 99 21 10 33 110 76 70 76 98 10 ...
## $ Gene_34 : num 65 92 163 139 107 76 68 10 98 62 ...
## $ Gene_35 : num 197 119 293 118 200 44 181 206 93 55 ...
## $ Gene_36 : num 124 65 10 10 24 10 10 10 10 10 ...
## $ Gene_37 : num 36 58 63 38 120 92 16 169 43 10 ...
## $ Gene_38 : num 10 10 10 307 212 314 10 332 221 10 ...
## $ Gene_39 : num 10 2072 1658 2209 5846 ...
## $ Gene_40 : num 40 87 77 136 223 10 42 10 82 66 ...
## $ Gene_41 : num 42 82 10 101 80 122 35 10 10 10 ...
## $ Gene_42 : num 10 334 10 366 408 200 381 10 10 36 ...
## $ Gene_43 : num 397 515 343 36 29 53 10 19 13 652 ...
## $ Gene_44 : num 10 95 10 10 177 10 10 10 10 68 ...
## $ Gene_45 : num 10 23 10 10 71 76 10 10 10 10 ...
## $ Gene_46 : num 10 399 102 10 504 23 10 10 305 51 ...
## $ Gene_47 : num 10 103 10 10 10 10 67 10 95 10 ...
## $ Gene_48 : num 163 34 55 66 56 71 117 46 165 76 ...
## $ Gene_49 : num 37 13 63 10 10 10 10 10 10 10 ...
## $ Gene_50 : num 10 10 10 10 257 10 336 10 173 256 ...
## $ Gene_51 : num 98 138 10 134 51 10 53 10 10 10 ...
## $ Gene_52 : num 4707 3367 10 101 1276 ...
## $ Gene_53 : num 100 372 10 234 266 156 137 194 49 106 ...
## $ Gene_54 : num 60 10 39 82 37 62 10 47 10 15 ...
## $ Gene_55 : num 10 121 10 18 284 10 10 10 10 73 ...
## $ Gene_56 : num 10 10 10 103 10 160 67 10 10 30 ...
## $ Gene_57 : num 122 32 42 88 163 10 80 10 17 39 ...
## $ Gene_58 : num 21 82 10 114 128 58 84 107 10 10 ...
## $ Gene_59 : num 10 10 10 10 157 10 47 10 10 10 ...
## $ Gene_60 : num 10 10 10 10 10 10 10 10 10 36 ...
## $ Gene_61 : num 53 10 114 10 10 10 43 50 74 163 ...
## $ Gene_62 : num 10 14 10 10 39 10 10 10 99 79 ...
## $ Gene_63 : num 92 10 23 224 69 10 835 73 10 10 ...
## $ Gene_64 : num 88 10 145 136 10 10 70 245 72 88 ...
## $ Gene_65 : num 134 56 207 19 10 10 18 179 92 31 ...
## $ Gene_66 : num 10 76 10 10 42 10 10 10 10 26 ...
## $ Gene_67 : num 10 10 10 10 26 19 10 137 295 45 ...
## $ Gene_68 : num 25 10 58 310 61 10 245 149 10 66 ...
## $ Gene_69 : num 122 227 272 46 449 51 124 52 539 186 ...
## $ Gene_70 : num 50 10 107 10 17 48 26 160 69 27 ...
## $ Gene_71 : num 73 85 10 10 10 10 10 85 10 10 ...
## $ Gene_72 : num 10 377 153 224 1714 ...
## $ Gene_73 : num 519 15 144 458 29 279 250 52 83 83 ...
## $ Gene_74 : num 10 10 10 10 226 158 10 10 73 10 ...
## $ Gene_75 : num 10 10 10 10 78 192 10 10 24 10 ...
## $ Gene_76 : num 10 10 79 10 10 10 10 71 43 86 ...
## $ Gene_77 : num 481 10 443 516 1108 ...
## $ Gene_78 : num 101 479 145 169 340 227 62 110 49 10 ...
## $ Gene_79 : num 10 10 46 57 10 62 57 100 17 76 ...
## $ Gene_80 : num 295 437 17 46 577 10 149 10 371 22 ...
## $ Gene_81 : num 166 10 37 151 121 19 208 328 100 64 ...
## $ Gene_82 : num 179 10 66 28 61 24 10 10 10 10 ...
## $ Gene_83 : num 10 10 10 10 513 10 10 10 10 10 ...
## $ Gene_84 : num 151 148 115 218 238 199 270 194 283 108 ...
## $ Gene_85 : num 10 316 10 355 46 35 10 232 116 258 ...
## $ Gene_86 : num 10 337 10 10 130 234 544 287 10 21 ...
## $ Gene_87 : num 50 153 193 105 47 10 36 10 28 10 ...
## $ Gene_88 : num 16 385 10 10 230 83 10 26 161 51 ...
## $ Gene_89 : num 13 10 83 102 99 17 10 65 33 21 ...
## $ Gene_90 : num 338 413 10 387 504 288 146 74 253 201 ...
## $ Gene_91 : num 10 337 10 262 666 151 55 44 247 89 ...
## $ Gene_92 : num 250 10 10 10 10 33 55 10 10 10 ...
## $ Gene_93 : num 380 642 351 406 248 560 680 506 805 79 ...
## $ Gene_94 : num 10 27 114 28 19 10 25 10 39 78 ...
## $ Gene_95 : num 10 10 150 10 110 88 42 10 552 70 ...
## $ Gene_96 : num 125 548 185 10 163 83 10 247 267 112 ...
## $ Gene_97 : num 136 10 112 10 245 10 10 137 109 124 ...
## [list output truncated]
(sum(is.na(leukemia_dat))) # count of NA cells; 0 means no missing values
## [1] 0
#72 obs. of 1869 variables [1] chr, [2] Factor w/ 2 levels "ALL","AML", [3:] num
leudat <- leukemia_dat # separate object for data manipulation
#refer to appendix 3.1a for full str output
# Checking for correlation between variables.
# Columns 1-2 hold the patient id and leukemia type; every remaining column is treated as a
# gene-expression variable. The last column index is taken from ncol() rather than the
# hard-coded 1869 so the code keeps working if the number of genes changes.
cor_leumat <- cor(leudat[, 3:ncol(leudat)])
heatmap(cor_leumat, keep.dendro = FALSE) # to glimpse level of correlation between gene expression
# Searching for strong positive or negative relationships between genes.
# as.table() flattens the full square matrix into one row per (Var1, Var2) cell, so this
# long table contains each gene's self-correlation (the diagonal) as well as both orderings
# of every gene pair.
leu_mat.df <- as.data.frame(as.table(cor_leumat))
# cutoff for a strong relationship: |correlation| > 0.75 (+ve or -ve);
# NOTE: the diagonal self-correlations of 1 also satisfy this filter
leu_mat_strong <- subset(leu_mat.df, abs(Freq) > 0.75)
summary(leu_mat.df)
## Var1 Var2 Freq
## Gene_1 : 1867 Gene_1 : 1867 Min. :-0.55500
## Gene_2 : 1867 Gene_2 : 1867 1st Qu.:-0.08132
## Gene_3 : 1867 Gene_3 : 1867 Median : 0.01000
## Gene_4 : 1867 Gene_4 : 1867 Mean : 0.03250
## Gene_5 : 1867 Gene_5 : 1867 3rd Qu.: 0.12450
## Gene_6 : 1867 Gene_6 : 1867 Max. : 1.00000
## (Other):3474487 (Other):3474487
# Count gene pairs directly from the upper triangle of the correlation matrix: it holds
# every unordered gene pair exactly once and excludes the diagonal (self-correlations).
# Halving nrow(leu_mat_strong) over-counted because that table also contains the 1867
# diagonal cells, which is why the earlier result was a half-integer (1956.5).
pair_cor <- cor_leumat[upper.tri(cor_leumat)]
n_pairs <- length(pair_cor) # number of distinct gene pairs (1867 * 1866 / 2 = 1741911)
# number of pairs exhibiting a strong relationship (|correlation| > 0.75)
(n_strong <- sum(abs(pair_cor) > 0.75, na.rm = TRUE))
## [1] 1023
# as a percentage of all distinct gene pairs (computed from the counts, not typed-in literals)
(round(n_strong / n_pairs * 100, digits = 2))
## [1] 0.06
# ~0.06% of gene pairs exhibit a strong correlation
# number of pairs that display low to no correlation
leu_mat_weak <- subset(leu_mat.df, abs(Freq) < 0.3) # cutoff for a weak relationship: |correlation| < 0.3
(n_weak <- sum(abs(pair_cor) < 0.3, na.rm = TRUE))
## [1] 1628291
(round(n_weak / n_pairs * 100, digits = 2)) # ~93.5% of gene pairs show low to almost no correlation
## [1] 93.48
Overall, strong correlation between genes is rare (as seen on the heatmap above): fewer than 0.1% of gene pairings exhibit a strong relationship (negative or positive, |corr.| > 0.75). By contrast, the large majority of gene pairings - roughly 94% (1,628,291 of approximately 1.74 million distinct pairs) - display little to no relationship (|corr.| < 0.3, positive or negative). A subsequent PCA will aim to home in on the strong relationships present in order to select for the most relevant data prior to any type of unsupervised learning. Moreover, the relatively high level of dimensionality associated with this data set can interfere with distance calculations for a complex algorithm such as HDBSCAN. Hence PCA will be performed to reduce computational load and maximize separation of the resulting clusters. Given the low level and widespread contribution of each PC, all components computed will be selected for further analysis (n = 72).
# Performing PCA on the gene-expression columns (column 3 onwards).
# `scale. = TRUE` standardises every gene to unit variance for comparability; the argument
# is spelled out in full (`scale.`) instead of relying on partial matching of `scale`.
leudat_prcomp <- prcomp(leudat[, 3:ncol(leudat)], scale. = TRUE)
# Proportion of Variance Explained by each component [refer to appendix 1.4b]
PVE <- leudat_prcomp$sdev^2 / sum(leudat_prcomp$sdev^2)
round(PVE, digits = 2)
## [1] 0.08 0.06 0.05 0.04 0.04 0.03 0.03 0.03 0.02 0.02 0.02 0.02 0.02 0.02 0.02
## [16] 0.02 0.02 0.02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
## [31] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
## [46] 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
## [61] 0.01 0.01 0.01 0.01 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# The variation is spread thinly across components: PC1 explains only ~8% and PC2 ~6%
# (~14% in total), and no later component explains more than ~5%
#visualising PVE with elbow plot
library(factoextra)
fviz_eig(leudat_prcomp)
cor_leumatPC <- cor(leudat_prcomp$x)
heatmap(cor_leumatPC, keep.dendro = FALSE) # checking for correlation post-PCA
Yet the overall PCA results above indicate the complex dynamics of genetics, as only 14% of the variance can be explained by the first and second components combined. Furthermore, each of the remaining components contributes roughly 5% or less to the overall variation.
3.1b Implementing HDBSCAN Note: The premise of the HDBSCAN is to apply this hierarchical clustering in an unsupervised way to the data set. Thus, it is not appropriate to partition the data into training and test subsets despite the presence of a ‘response’ variable i.e. leukemia sub-type. Instead the silhouette method will be applied to assess the validity/fit of the clustering model.
# Package dbscan (assumed to be installed already)
library(dbscan)
# Fitting the HDBSCAN clustering model.
# Start with the k-nearest-neighbour distances (k = 2) of the PCA scores; the 'knee' of the
# k-distance plot is used to judge a suitable distance scale (eps) and minPts
print(kNNdist(x = leudat_prcomp$x, k = 2))
## [1] 50.27055 47.86017 56.89287 48.07821 43.40409 45.05373 48.09248 55.70000
## [9] 56.04327 40.62272 43.80987 36.92825 44.74152 52.24813 43.11637 44.19953
## [17] 73.97723 46.00694 40.92713 70.95462 52.70326 44.43403 45.05373 48.40471
## [25] 43.22310 42.18289 54.38586 45.34198 46.06739 66.72755 56.90584 55.28167
## [33] 50.91842 42.64296 59.34705 43.88929 49.13905 63.53363 54.98701 53.25665
## [41] 51.45596 48.09693 52.07510 55.30317 41.53523 53.07829 41.67029 43.92344
## [49] 37.73820 36.31477 38.01753 42.18289 50.90417 49.85013 42.71671 40.98105
## [57] 48.29302 43.54641 50.21345 58.79725 45.63163 46.79705 56.71165 39.56011
## [65] 55.94797 47.38594 47.07966 44.98216 42.64296 55.11247 74.85271 54.13765
kNNdistplot(x = leudat_prcomp$x, k = 2)
# The 'knee' occurs around a distance of 58, hence eps = 58
# Run HDBSCAN on the PCA scores (derived from the scaled data) with minPts = 2
HDbscan_cl <- hdbscan(x = leudat_prcomp$x, minPts = 2)
# Cluster label assigned to each of the 72 patients
HDbscan_cl$cluster
## [1] 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 0 3 3 3 3 3 3 1 3 3 0 0 3 3 3 0 3 3 0
## [39] 0 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 0 3 3 3 3 2 0 2
# Cross-tabulate cluster label (rows) against the known leukemia sub-type (columns)
table(HDbscan_cl$cluster, leukemia_dat$type)
##
## ALL AML
## 0 4 7
## 1 2 0
## 2 0 2
## 3 41 16
# 3 clusters identified (labels 1-3); label 0 is how dbscan::hdbscan() marks noise points
# (11 patients here), so it is not a cluster in its own right
# Clusters 1 and 2 are tiny (2 patients each) but pure: cluster 1 contains solely ALL and
# cluster 2 solely AML
# Cluster 3 holds the majority of the data (57 of 72 patients) and mixes both sub-types
# Checking for local outliers with the Local Outlier Factor (LOF).
# `k` is the neighbourhood size handed to lof(), i.e. how many nearest neighbours define
# the local density - it is not a number of clusters.
# NOTE(review): recent dbscan releases appear to name this argument `minPts` and deprecate
# `k` - confirm against the installed package version.
k <- 2 # LOF neighbourhood-size parameter
LOF_Outlier <- lof(x = leudat_prcomp$x, k = k) # LOF (outlier score) computation
top_n <- 10 # No. of top outliers to be displayed
rank_LOF_Outlier <- order(x = LOF_Outlier, decreasing = TRUE) # patient indices sorted by descending score
LOF_Result <- data.frame(ID = rank_LOF_Outlier, score = LOF_Outlier[rank_LOF_Outlier])
(head(LOF_Result, top_n))
# largest outlier present has an LOF score of 1.6
# Thus, somewhat of an outlier but still relatively local
# GLOSH outlier scores for the PCA scores, listed from the highest score downwards
MinPts <- 2 # GLOSH parameter: same as MinPts in hdbscan(), analogous to k in lof()
GLOSH_Outlier <- glosh(x = as.matrix(leudat_prcomp$x), MinPts)
rank_GLOSH_Outlier <- order(GLOSH_Outlier, decreasing = TRUE)
GLOSH_Result <- data.frame(
  ID = rank_GLOSH_Outlier,
  score = GLOSH_Outlier[rank_GLOSH_Outlier]
)
print(head(GLOSH_Result, n = top_n))
Overall, the expected distinction between 2 clusters of patients, each presenting a particular leukemia sub-type, is not supported by the results above. When applying the HDBSCAN algorithm to the principal components (post-PCA), the two largest groups formed actually contain a mixture of leukemia sub-types (cluster ‘3’ [ALL n = 41, AML n = 16] and label ‘0’ [ALL n = 4, AML n = 7], the label hdbscan() assigns to noise points). However, there were two other clusters identified, containing only two data points each. Thus some further analysis was performed to identify the degree to which these four data points are distanced from neighbouring clusters. Indeed, both a LOF and a GLOSH analysis suggested that these data points are most likely on the ‘edge’ of neighbouring clusters (max. LOF score = 1.6 and GLOSH score = 0.55 for data point 71). As such, the MinPts parameter was adjusted to a higher level (MinPts = 3) and the HDBSCAN algorithm was repeated.
# Fine-tuning: raise MinPts from 2 to 3, then redo the GLOSH ranking and the HDBSCAN fit
MinPts <- 3 # GLOSH parameter: same as MinPts in hdbscan(), analogous to k in lof()
GLOSH_Outlier <- glosh(x = as.matrix(leudat_prcomp$x), MinPts)
rank_GLOSH_Outlier <- order(GLOSH_Outlier, decreasing = TRUE)
GLOSH_Result <- data.frame(
  ID = rank_GLOSH_Outlier,
  score = GLOSH_Outlier[rank_GLOSH_Outlier]
)
print(head(GLOSH_Result, n = top_n))
# HDBSCAN on the PCA scores (from scaled data) with minPts = 3; a copy of the score
# matrix is also kept under its own name
leudat_prcompX <- leudat_prcomp$x
HDbscan_cl <- hdbscan(x = leudat_prcomp$x, minPts = 3, gen_hdbscan_tree = TRUE)
# Label assigned to each patient
print(HDbscan_cl$cluster)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Labels cross-tabulated against leukemia sub-type
print(table(HDbscan_cl$cluster, leukemia_dat$type))
##
## ALL AML
## 0 47 25
All data points were assigned the same label [0], which hdbscan() uses to mark noise points; that is, no cluster was found at MinPts = 3. Hence the hypothesis that presentation of leukemia sub-types can be explained via expression of specific genes cannot be accepted. To further evaluate the validity of this model, the core distances for each data point were plotted in the graph below. Moreover, the hierarchy of partitioning was also plotted as a dendrogram to showcase the dissimilarity between gene expression levels for each patient. Finally, the Silhouette Width Criterion (SWC) was also plotted against varying levels of k to confirm the parameter choice for the number of clusters. Indeed, the maximum SWC score obtained was for the lowest level of k (0.1 for k = 2). Thus creating two or more clusters in this data set does not create a viable/stable solution.
# Plot the `coredist` values stored in the HDBSCAN result, one point per patient
# (plotted against patient index)
plot(HDbscan_cl$coredist)
# Print the hierarchy object stored in the `hc` element of the HDBSCAN result
HDbscan_cl$hc
##
## Call:
## hdbscan(x = leudat_prcomp$x, minPts = 3, gen_hdbscan_tree = TRUE)
##
## Cluster method : robust single
## Distance : mutual reachability
## Number of objects: 72
# Draw that hierarchy as a dendrogram
plot(HDbscan_cl$hc, main="HDBSCAN* Hierarchy")
# Disabled attempt at plotting the simplified tree of the whole HDBSCAN object:
#plot(HDbscan_cl, scale = 3, gradient = c("purple", "orange", "red"), show_flat = T)
# TODO: the simplified-tree plot above did not work (the error mentioned an object created
# without names); cause not yet investigated
library(cluster)
# Silhouette Width Criterion (SWC): mean silhouette width of a clustering.
#
# clusterLabels: integer-valued cluster label for each observation.
# dataPoints:    numeric matrix / data frame of observations (one row per label);
#                pairwise distances are computed with dist() using its defaults.
#
# Returns the average silhouette width over all observations, or NA_real_ when the
# silhouette is undefined. cluster::silhouette() returns NA instead of a matrix when
# there is only one cluster or every observation is its own cluster; indexing that NA
# with [, 3] previously raised "incorrect number of dimensions".
SWC <- function(clusterLabels, dataPoints) {
  # Namespace-qualified call instead of require(), which only returns FALSE (rather than
  # failing) when the package is missing.
  sil <- cluster::silhouette(x = clusterLabels, dist = stats::dist(dataPoints))
  if (!is.matrix(sil)) {
    return(NA_real_)
  }
  mean(sil[, "sil_width"])
}
# Mean silhouette width of k-means solutions for k = 2..10 on the PCA scores.
# Slot 1 of `Silhouette` is never filled (there is no k = 1 run) and stays 0.
set.seed(5) # fixed seed so the random k-means starts are reproducible
Silhouette <- numeric(10)
for (k in seq(2, 10)) {
  km.out <- kmeans(x = leudat_prcompX, centers = k, nstart = 10)
  Silhouette[k] <- SWC(clusterLabels = km.out$cluster, dataPoints = leudat_prcompX)
}
plot(
  x = 2:10, y = Silhouette[-1],
  xlab = "k", ylab = "Silhouette Width Criterion (SWC)", type = "b"
)
Appendix 1.3
# Show the correlation matrix rounded to 2 d.p. in lower-triangle form
print(as.dist(round(bcw_cormat, digits = 2)))
## radius texture perimeter area smoothness compactness
## texture 0.32
## perimeter 1.00 0.33
## area 0.99 0.32 0.99
## smoothness 0.17 -0.02 0.21 0.18
## compactness 0.51 0.24 0.56 0.50 0.66
## concavity 0.68 0.30 0.72 0.69 0.52 0.88
## concave_points 0.82 0.29 0.85 0.82 0.55 0.83
## symmetry 0.15 0.07 0.18 0.15 0.56 0.60
## fractal_dimension -0.31 -0.08 -0.26 -0.28 0.58 0.57
## radius_SE 0.68 0.28 0.69 0.73 0.30 0.50
## texture_SE -0.10 0.39 -0.09 -0.07 0.07 0.05
## perimeter_SE 0.67 0.28 0.69 0.73 0.30 0.55
## area_SE 0.74 0.26 0.74 0.80 0.25 0.46
## smoothness_SE -0.22 0.01 -0.20 -0.17 0.33 0.14
## compactness_SE 0.21 0.19 0.25 0.21 0.32 0.74
## concavity_SE 0.19 0.14 0.23 0.21 0.25 0.57
## concave_points_SE 0.38 0.16 0.41 0.37 0.38 0.64
## symmetry_SE -0.10 0.01 -0.08 -0.07 0.20 0.23
## fractal_dimension_SE -0.04 0.05 -0.01 -0.02 0.28 0.51
## radius_worst 0.97 0.35 0.97 0.96 0.21 0.54
## texture_worst 0.30 0.91 0.30 0.29 0.04 0.25
## perimeter_worst 0.97 0.36 0.97 0.96 0.24 0.59
## area_worst 0.94 0.34 0.94 0.96 0.21 0.51
## smoothness_worst 0.12 0.08 0.15 0.12 0.81 0.57
## compactness_worst 0.41 0.28 0.46 0.39 0.47 0.87
## concavity_worst 0.53 0.30 0.56 0.51 0.43 0.82
## concave_points_worst 0.74 0.30 0.77 0.72 0.50 0.82
## concavity concave_points symmetry fractal_dimension
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points 0.92
## symmetry 0.50 0.46
## fractal_dimension 0.34 0.17 0.48
## radius_SE 0.63 0.70 0.30 0.00
## texture_SE 0.08 0.02 0.13 0.16
## perimeter_SE 0.66 0.71 0.31 0.04
## area_SE 0.62 0.69 0.22 -0.09
## smoothness_SE 0.10 0.03 0.19 0.40
## compactness_SE 0.67 0.49 0.42 0.56
## concavity_SE 0.69 0.44 0.34 0.45
## concave_points_SE 0.68 0.62 0.39 0.34
## symmetry_SE 0.18 0.10 0.45 0.35
## fractal_dimension_SE 0.45 0.26 0.33 0.69
## radius_worst 0.69 0.83 0.19 -0.25
## texture_worst 0.30 0.29 0.09 -0.05
## perimeter_worst 0.73 0.86 0.22 -0.21
## area_worst 0.68 0.81 0.18 -0.23
## smoothness_worst 0.45 0.45 0.43 0.50
## compactness_worst 0.75 0.67 0.47 0.46
## concavity_worst 0.88 0.75 0.43 0.35
## concave_points_worst 0.86 0.91 0.43 0.18
## radius_SE texture_SE perimeter_SE area_SE smoothness_SE
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points
## symmetry
## fractal_dimension
## radius_SE
## texture_SE 0.21
## perimeter_SE 0.97 0.22
## area_SE 0.95 0.11 0.94
## smoothness_SE 0.16 0.40 0.15 0.08
## compactness_SE 0.36 0.23 0.42 0.28 0.34
## concavity_SE 0.33 0.19 0.36 0.27 0.27
## concave_points_SE 0.51 0.23 0.56 0.42 0.33
## symmetry_SE 0.24 0.41 0.27 0.13 0.41
## fractal_dimension_SE 0.23 0.28 0.24 0.13 0.43
## radius_worst 0.72 -0.11 0.70 0.76 -0.23
## texture_worst 0.19 0.41 0.20 0.20 -0.07
## perimeter_worst 0.72 -0.10 0.72 0.76 -0.22
## area_worst 0.75 -0.08 0.73 0.81 -0.18
## smoothness_worst 0.14 -0.07 0.13 0.13 0.31
## compactness_worst 0.29 -0.09 0.34 0.28 -0.06
## concavity_worst 0.38 -0.07 0.42 0.39 -0.06
## concave_points_worst 0.53 -0.12 0.55 0.54 -0.10
## compactness_SE concavity_SE concave_points_SE symmetry_SE
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points
## symmetry
## fractal_dimension
## radius_SE
## texture_SE
## perimeter_SE
## area_SE
## smoothness_SE
## compactness_SE
## concavity_SE 0.80
## concave_points_SE 0.74 0.77
## symmetry_SE 0.39 0.31 0.31
## fractal_dimension_SE 0.80 0.73 0.61 0.37
## radius_worst 0.20 0.19 0.36 -0.13
## texture_worst 0.14 0.10 0.09 -0.08
## perimeter_worst 0.26 0.23 0.39 -0.10
## area_worst 0.20 0.19 0.34 -0.11
## smoothness_worst 0.23 0.17 0.22 -0.01
## compactness_worst 0.68 0.48 0.45 0.06
## concavity_worst 0.64 0.66 0.55 0.04
## concave_points_worst 0.48 0.44 0.60 -0.03
## fractal_dimension_SE radius_worst texture_worst
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points
## symmetry
## fractal_dimension
## radius_SE
## texture_SE
## perimeter_SE
## area_SE
## smoothness_SE
## compactness_SE
## concavity_SE
## concave_points_SE
## symmetry_SE
## fractal_dimension_SE
## radius_worst -0.04
## texture_worst 0.00 0.36
## perimeter_worst 0.00 0.99 0.37
## area_worst -0.02 0.98 0.35
## smoothness_worst 0.17 0.22 0.23
## compactness_worst 0.39 0.48 0.36
## concavity_worst 0.38 0.57 0.37
## concave_points_worst 0.22 0.79 0.36
## perimeter_worst area_worst smoothness_worst
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points
## symmetry
## fractal_dimension
## radius_SE
## texture_SE
## perimeter_SE
## area_SE
## smoothness_SE
## compactness_SE
## concavity_SE
## concave_points_SE
## symmetry_SE
## fractal_dimension_SE
## radius_worst
## texture_worst
## perimeter_worst
## area_worst 0.98
## smoothness_worst 0.24 0.21
## compactness_worst 0.53 0.44 0.57
## concavity_worst 0.62 0.54 0.52
## concave_points_worst 0.82 0.75 0.55
## compactness_worst concavity_worst
## texture
## perimeter
## area
## smoothness
## compactness
## concavity
## concave_points
## symmetry
## fractal_dimension
## radius_SE
## texture_SE
## perimeter_SE
## area_SE
## smoothness_SE
## compactness_SE
## concavity_SE
## concave_points_SE
## symmetry_SE
## fractal_dimension_SE
## radius_worst
## texture_worst
## perimeter_worst
## area_worst
## smoothness_worst
## compactness_worst
## concavity_worst 0.89
## concave_points_worst 0.80 0.86
1.4a
# Show the PCA loadings: one row per feature, one column per component.
print(bcw_prcomp$rotation)
## PC1 PC2 PC3 PC4
## radius -0.21890244 0.233857132 -0.008531243 0.041408962
## texture -0.10372458 0.059706088 0.064549903 -0.603050001
## perimeter -0.22753729 0.215181361 -0.009314220 0.041983099
## area -0.22099499 0.231076711 0.028699526 0.053433795
## smoothness -0.14258969 -0.186113023 -0.104291904 0.159382765
## compactness -0.23928535 -0.151891610 -0.074091571 0.031794581
## concavity -0.25840048 -0.060165363 0.002733838 0.019122753
## concave_points -0.26085376 0.034767500 -0.025563541 0.065335944
## symmetry -0.13816696 -0.190348770 -0.040239936 0.067124984
## fractal_dimension -0.06436335 -0.366575471 -0.022574090 0.048586765
## radius_SE -0.20597878 0.105552152 0.268481387 0.097941242
## texture_SE -0.01742803 -0.089979682 0.374633665 -0.359855528
## perimeter_SE -0.21132592 0.089457234 0.266645367 0.088992415
## area_SE -0.20286964 0.152292628 0.216006528 0.108205039
## smoothness_SE -0.01453145 -0.204430453 0.308838979 0.044664180
## compactness_SE -0.17039345 -0.232715896 0.154779718 -0.027469363
## concavity_SE -0.15358979 -0.197207283 0.176463743 0.001316880
## concave_points_SE -0.18341740 -0.130321560 0.224657567 0.074067335
## symmetry_SE -0.04249842 -0.183848000 0.288584292 0.044073351
## fractal_dimension_SE -0.10256832 -0.280092027 0.211503764 0.015304750
## radius_worst -0.22799663 0.219866379 -0.047506990 0.015417240
## texture_worst -0.10446933 0.045467298 -0.042297823 -0.632807885
## perimeter_worst -0.23663968 0.199878428 -0.048546508 0.013802794
## area_worst -0.22487053 0.219351858 -0.011902318 0.025894749
## smoothness_worst -0.12795256 -0.172304352 -0.259797613 0.017652216
## compactness_worst -0.21009588 -0.143593173 -0.236075625 -0.091328415
## concavity_worst -0.22876753 -0.097964114 -0.173057335 -0.073951180
## concave_points_worst -0.25088597 0.008257235 -0.170344076 0.006006996
## symmetry_worst -0.12290456 -0.141883349 -0.271312642 -0.036250695
## fractal_dimension_worst -0.13178394 -0.275339469 -0.232791313 -0.077053470
## PC5 PC6 PC7 PC8
## radius -0.037786354 0.0187407904 -0.1240883403 0.007452296
## texture 0.049468850 -0.0321788366 0.0113995382 -0.130674825
## perimeter -0.037374663 0.0173084449 -0.1144770573 0.018687258
## area -0.010331251 -0.0018877480 -0.0516534275 -0.034673604
## smoothness 0.365088528 -0.2863744966 -0.1406689928 0.288974575
## compactness -0.011703971 -0.0141309489 0.0309184960 0.151396350
## concavity -0.086375412 -0.0093441809 -0.1075204434 0.072827285
## concave_points 0.043861025 -0.0520499505 -0.1504822142 0.152322414
## symmetry 0.305941428 0.3564584607 -0.0938911345 0.231530989
## fractal_dimension 0.044424360 -0.1194306679 0.2957600240 0.177121441
## radius_SE 0.154456496 -0.0256032561 0.3124900373 -0.022539967
## texture_SE 0.191650506 -0.0287473145 -0.0907553556 0.475413139
## perimeter_SE 0.120990220 0.0018107150 0.3146403902 0.011896690
## area_SE 0.127574432 -0.0428639079 0.3466790028 -0.085805135
## smoothness_SE 0.232065676 -0.3429173935 -0.2440240556 -0.573410232
## compactness_SE -0.279968156 0.0691975186 0.0234635340 -0.117460157
## concavity_SE -0.353982091 0.0563432386 -0.2088237897 -0.060566501
## concave_points_SE -0.195548089 -0.0312244482 -0.3696459369 0.108319309
## symmetry_SE 0.252868765 0.4902456426 -0.0803822539 -0.220149279
## fractal_dimension_SE -0.263297438 -0.0531952674 0.1913949726 -0.011168188
## radius_worst 0.004406592 -0.0002906849 -0.0097099360 -0.042619416
## texture_worst 0.092883400 -0.0500080613 0.0098707439 -0.036251636
## perimeter_worst -0.007454151 0.0085009872 -0.0004457267 -0.030558534
## area_worst 0.027390903 -0.0251643821 0.0678316595 -0.079394246
## smoothness_worst 0.324435445 -0.3692553703 -0.1088308865 -0.205852191
## compactness_worst -0.121804107 0.0477057929 0.1404729381 -0.084019659
## concavity_worst -0.188518727 0.0283792555 -0.0604880561 -0.072467871
## concave_points_worst -0.043332069 -0.0308734498 -0.1679666187 0.036170795
## symmetry_worst 0.244558663 0.4989267845 -0.0184906298 -0.228225053
## fractal_dimension_worst -0.094423351 -0.0802235245 0.3746576261 -0.048360667
## PC9 PC10 PC11 PC12
## radius -0.223109764 0.095486443 -0.04147149 0.051067457
## texture 0.112699390 0.240934066 0.30224340 0.254896423
## perimeter -0.223739213 0.086385615 -0.01678264 0.038926106
## area -0.195586014 0.074956489 -0.11016964 0.065437508
## smoothness 0.006424722 -0.069292681 0.13702184 0.316727211
## compactness -0.167841425 0.012936200 0.30800963 -0.104017044
## concavity 0.040591006 -0.135602298 -0.12419024 0.065653480
## concave_points -0.111971106 0.008054528 0.07244603 0.042589267
## symmetry 0.256040084 0.572069479 -0.16305408 -0.288865504
## fractal_dimension -0.123740789 0.081103207 0.03804827 0.236358988
## radius_SE 0.249985002 -0.049547594 0.02535702 -0.016687915
## texture_SE -0.246645397 -0.289142742 -0.34494446 -0.306160423
## perimeter_SE 0.227154024 -0.114508236 0.16731877 -0.101446828
## area_SE 0.229160015 -0.091927889 -0.05161946 -0.017679218
## smoothness_SE -0.141924890 0.160884609 -0.08420621 -0.294710053
## compactness_SE -0.145322810 0.043504866 0.20688568 -0.263456509
## concavity_SE 0.358107079 -0.141276243 -0.34951794 0.251146975
## concave_points_SE 0.272519886 0.086240847 0.34237591 -0.006458751
## symmetry_SE -0.304077200 -0.316529830 0.18784404 0.320571348
## fractal_dimension_SE -0.213722716 0.367541918 -0.25062479 0.276165974
## radius_worst -0.112141463 0.077361643 -0.10506733 0.039679665
## texture_worst 0.103341204 0.029550941 -0.01315727 0.079797450
## perimeter_worst -0.109614364 0.050508334 -0.05107628 -0.008987738
## area_worst -0.080732461 0.069921152 -0.18459894 0.048088657
## smoothness_worst 0.112315904 -0.128304659 -0.14389035 0.056514866
## compactness_worst -0.100677822 -0.172133632 0.19742047 -0.371662503
## concavity_worst 0.161908621 -0.311638520 -0.18501676 -0.087034532
## concave_points_worst 0.060488462 -0.076648291 0.11777205 -0.068125354
## symmetry_worst 0.064637806 -0.029563075 -0.15756025 0.044033503
## fractal_dimension_worst -0.134174175 0.012609579 -0.11828355 -0.034731693
## PC13 PC14 PC15 PC16
## radius 0.01196721 0.059506135 -0.051118775 -0.15058388
## texture 0.20346133 -0.021560100 -0.107922421 -0.15784196
## perimeter 0.04410950 0.048513812 -0.039902936 -0.11445396
## area 0.06737574 0.010830829 0.013966907 -0.13244803
## smoothness 0.04557360 0.445064860 -0.118143364 -0.20461325
## compactness 0.22928130 0.008101057 0.230899962 0.17017837
## concavity 0.38709081 -0.189358699 -0.128283732 0.26947021
## concave_points 0.13213810 -0.244794768 -0.217099194 0.38046410
## symmetry 0.18993367 0.030738856 -0.073961707 -0.16466159
## fractal_dimension 0.10623908 -0.377078865 0.517975705 -0.04079279
## radius_SE -0.06819523 0.010347413 -0.110050711 0.05890572
## texture_SE -0.16822238 -0.010849347 0.032752721 -0.03450040
## perimeter_SE -0.03784399 -0.045523718 -0.008268089 0.02651665
## area_SE 0.05606493 0.083570718 -0.046024366 0.04115323
## smoothness_SE 0.15044143 -0.201152530 0.018559465 -0.05803906
## compactness_SE 0.01004017 0.491755932 0.168209315 0.18983090
## concavity_SE 0.15878319 0.134586924 0.250471408 -0.12542065
## concave_points_SE -0.49402674 -0.199666719 0.062079344 -0.19881035
## symmetry_SE 0.01033274 -0.046864383 -0.113383199 -0.15771150
## fractal_dimension_SE -0.24045832 0.145652466 -0.353232211 0.26855388
## radius_worst -0.13789053 0.023101281 0.166567074 -0.08156057
## texture_worst -0.08014543 0.053430792 0.101115399 0.18555785
## perimeter_worst -0.09696571 0.012219382 0.182755198 -0.05485705
## area_worst -0.10116061 -0.006685465 0.314993600 -0.09065339
## smoothness_worst -0.20513034 0.162235443 0.046125866 0.14555166
## compactness_worst 0.01227931 0.166470250 -0.049956014 -0.15373486
## concavity_worst 0.21798433 -0.066798931 -0.204835886 -0.21502195
## concave_points_worst -0.25438749 -0.276418891 -0.169499607 0.17814174
## symmetry_worst -0.25653491 0.005355574 0.139888394 0.25789401
## fractal_dimension_worst -0.17281424 -0.212104110 -0.256173195 -0.40555649
## PC17 PC18 PC19 PC20
## radius 0.202924255 0.1467123385 0.22538466 -0.049698664
## texture -0.038706119 -0.0411029851 0.02978864 -0.244134993
## perimeter 0.194821310 0.1583174548 0.23959528 -0.017665012
## area 0.255705763 0.2661681046 -0.02732219 -0.090143762
## smoothness 0.167929914 -0.3522268017 -0.16456584 0.017100960
## compactness -0.020307708 0.0077941384 0.28422236 0.488686329
## concavity -0.001598353 -0.0269681105 0.00226636 -0.033387086
## concave_points 0.034509509 -0.0828277367 -0.15497236 -0.235407606
## symmetry -0.191737848 0.1733977905 -0.05881116 0.026069156
## fractal_dimension 0.050225246 0.0878673570 -0.05815705 -0.175637222
## radius_SE -0.139396866 -0.2362165319 0.17588331 -0.090800503
## texture_SE 0.043963016 -0.0098586620 0.03600985 -0.071659988
## perimeter_SE -0.024635639 -0.0259288003 0.36570154 -0.177250625
## area_SE 0.334418173 0.3049069032 -0.41657231 0.274201148
## smoothness_SE 0.139595006 -0.2312599432 -0.01326009 0.090061477
## compactness_SE -0.008246477 0.1004742346 -0.24244818 -0.461098220
## concavity_SE 0.084616716 -0.0001954852 0.12638102 0.066946174
## concave_points_SE 0.108132263 0.0460549116 -0.01216430 0.068868294
## symmetry_SE -0.274059129 0.1870147640 -0.08903929 0.107385289
## fractal_dimension_SE -0.122733398 -0.0598230982 0.08660084 0.222345297
## radius_worst -0.240049982 -0.2161013526 0.01366130 -0.005626909
## texture_worst 0.069365185 0.0583984505 -0.07586693 0.300599798
## perimeter_worst -0.234164147 -0.1885435919 0.09081325 0.011003858
## area_worst -0.273399584 -0.1420648558 -0.41004720 0.060047387
## smoothness_worst -0.278030197 0.5015516751 0.23451384 -0.129723903
## compactness_worst -0.004037123 -0.0735745143 0.02020070 0.229280589
## concavity_worst -0.191313419 -0.1039079796 -0.04578612 -0.046482792
## concave_points_worst -0.075485316 0.0758138963 -0.26022962 0.033022340
## symmetry_worst 0.430658116 -0.2787138431 0.11725053 -0.116759236
## fractal_dimension_worst 0.159394300 0.0235647497 -0.01149448 -0.104991974
## PC21 PC22 PC23 PC24
## radius -0.0685700057 -0.07292890 -0.0985526942 -0.18257944
## texture 0.4483694667 -0.09480063 -0.0005549975 0.09878679
## perimeter -0.0697690429 -0.07516048 -0.0402447050 -0.11664888
## area -0.0184432785 -0.09756578 0.0077772734 0.06984834
## smoothness -0.1194917473 -0.06382295 -0.0206657211 0.06869742
## compactness 0.1926213963 0.09807756 0.0523603957 -0.10413552
## concavity 0.0055717533 0.18521200 0.3248703785 0.04474106
## concave_points -0.0094238187 0.31185243 -0.0514087968 0.08402770
## symmetry -0.0869384844 0.01840673 -0.0512005770 0.01933947
## fractal_dimension -0.0762718362 -0.28786888 -0.0846898562 -0.13326055
## radius_SE 0.0863867747 0.15027468 -0.2641253170 -0.55870157
## texture_SE 0.2170719674 -0.04845693 -0.0008738805 0.02426730
## perimeter_SE -0.3049501584 -0.15935280 0.0900742110 0.51675039
## area_SE 0.1925877857 -0.06423262 0.0982150746 -0.02246072
## smoothness_SE -0.0720987261 -0.05054490 -0.0598177179 0.01563119
## compactness_SE -0.1403865724 0.04528769 0.0091038710 -0.12177779
## concavity_SE 0.0630479298 0.20521269 -0.3875423290 0.18820504
## concave_points_SE 0.0343753236 0.07254538 0.3517550738 -0.10966898
## symmetry_SE -0.0976995265 0.08465443 -0.0423628949 0.00322620
## fractal_dimension_SE 0.0628432814 -0.24470508 0.0857810992 0.07519442
## radius_worst 0.0072938995 0.09629821 -0.0556767923 -0.15683037
## texture_worst -0.5944401434 0.11111202 -0.0089228997 -0.11848460
## perimeter_worst -0.0920235990 -0.01722163 0.0633448296 0.23711317
## area_worst 0.1467901315 0.09695982 0.1908896250 0.14406303
## smoothness_worst 0.1648492374 0.06825409 0.0936901494 -0.01099014
## compactness_worst 0.1813748671 -0.02967641 -0.1479209247 0.18674995
## concavity_worst -0.1321005945 -0.46042619 0.2864331353 -0.28885257
## concave_points_worst 0.0008860815 -0.29984056 -0.5675277966 0.10734024
## symmetry_worst 0.1627085487 -0.09714484 0.1213434508 -0.01438181
## fractal_dimension_worst -0.0923439434 0.46947115 0.0076253382 0.03782545
## PC25 PC26 PC27 PC28
## radius -0.01922650 -0.129476396 -0.131526670 2.111940e-01
## texture 0.08474593 -0.024556664 -0.017357309 -6.581146e-05
## perimeter 0.02701541 -0.125255946 -0.115415423 8.433827e-02
## area -0.21004078 0.362727403 0.466612477 -2.725083e-01
## smoothness 0.02895489 -0.037003686 0.069689923 1.479269e-03
## compactness 0.39662323 0.262808474 0.097748705 -5.462767e-03
## concavity -0.09697732 -0.548876170 0.364808397 4.553864e-02
## concave_points -0.18645160 0.387643377 -0.454699351 -8.883097e-03
## symmetry -0.02458369 -0.016044038 -0.015164835 1.433026e-03
## fractal_dimension -0.20722186 -0.097404839 -0.101244946 -6.311687e-03
## radius_SE -0.17493043 0.049977080 0.212982901 -1.922239e-01
## texture_SE 0.05698648 -0.011237242 -0.010092889 -5.622611e-03
## perimeter_SE 0.07292764 0.103653282 0.041691553 2.631919e-01
## area_SE 0.13185041 -0.155304589 -0.313358657 -4.206811e-02
## smoothness_SE 0.03121070 -0.007717557 -0.009052154 9.792963e-03
## compactness_SE 0.17316455 -0.049727632 0.046536088 -1.539555e-02
## concavity_SE 0.01593998 0.091454968 -0.084224797 5.820978e-03
## concave_points_SE -0.12954655 -0.017941919 -0.011165509 -2.900930e-02
## symmetry_SE -0.01951493 -0.017267849 -0.019975983 -7.636526e-03
## fractal_dimension_SE -0.08417120 0.035488974 -0.012036564 1.975646e-02
## radius_worst 0.07070972 -0.197054744 -0.178666740 4.126396e-01
## texture_worst -0.11818972 0.036469433 0.021410694 -3.902509e-04
## perimeter_worst 0.11803403 -0.244103670 -0.241031046 -7.286809e-01
## area_worst -0.03828995 0.231359525 0.237162466 2.389603e-01
## smoothness_worst -0.04796476 0.012602464 -0.040853568 -1.535248e-03
## compactness_worst -0.62438494 -0.100463424 -0.070505414 4.869182e-02
## concavity_worst 0.11577034 0.266853781 -0.142905801 -1.764090e-02
## concave_points_worst 0.26319634 -0.133574507 0.230901389 2.247567e-02
## symmetry_worst 0.04529962 0.028184296 0.022790444 4.920481e-03
## fractal_dimension_worst 0.28013348 0.004520482 0.059985998 -2.356214e-02
## PC29 PC30
## radius 2.114605e-01 0.7024140910
## texture -1.053393e-02 0.0002736610
## perimeter 3.838261e-01 -0.6898969685
## area -4.227949e-01 -0.0329473482
## smoothness -3.434667e-03 -0.0048474577
## compactness -4.101677e-02 0.0446741863
## concavity -1.001479e-02 0.0251386661
## concave_points -4.206949e-03 -0.0010772653
## symmetry -7.569862e-03 -0.0012803794
## fractal_dimension 7.301433e-03 -0.0047556848
## radius_SE 1.184421e-01 -0.0087110937
## texture_SE -8.776279e-03 -0.0010710392
## perimeter_SE -6.100219e-03 0.0137293906
## area_SE -8.592591e-02 0.0011053260
## smoothness_SE 1.776386e-03 -0.0016082109
## compactness_SE 3.158134e-03 0.0019156224
## concavity_SE 1.607852e-02 -0.0089265265
## concave_points_SE -2.393779e-02 -0.0021601973
## symmetry_SE -5.223292e-03 0.0003293898
## fractal_dimension_SE -8.341912e-03 0.0017989568
## radius_worst -6.357249e-01 -0.1356430561
## texture_worst 1.723549e-02 0.0010205360
## perimeter_worst 2.292180e-02 0.0797438536
## area_worst 4.449359e-01 0.0397422838
## smoothness_worst 7.385492e-03 0.0045832773
## compactness_worst 3.566904e-06 -0.0128415624
## concavity_worst -1.267572e-02 0.0004021392
## concave_points_worst 3.524045e-02 -0.0022884418
## symmetry_worst 1.340423e-02 0.0003954435
## fractal_dimension_worst 1.147766e-02 0.0018942925
1.4b
# Proportion of variance explained (PVE) by each principal component:
# the component's variance (sdev^2) divided by the total variance.
# The assignment is kept as its own statement so the creation of PVE is
# not hidden inside the round() call; the full-precision values stay in
# PVE and only the printed copy is rounded.
PVE <- bcw_prcomp$sdev^2 / sum(bcw_prcomp$sdev^2)
round(PVE, digits = 2)
## [1] 0.44 0.19 0.09 0.07 0.05 0.04 0.02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.00
## [16] 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
# Basic overview of the data.
# str() prints the structure listing itself and returns NULL invisibly,
# so it is called bare: wrapping it in parentheses would force that NULL
# to be printed as a stray extra line after the listing.
str(leukemia_dat)
## 'data.frame': 72 obs. of 1869 variables:
## $ patient_id: chr "Subject1" "Subject2" "Subject3" "Subject4" ...
## $ type : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gene_1 : num 199 10 33 158 10 67 131 10 10 328 ...
## $ Gene_2 : num 252 101 206 49 70 87 126 70 24 177 ...
## $ Gene_3 : num 206 74 10 31 252 193 10 10 506 183 ...
## $ Gene_4 : num 10 19 19 363 155 325 10 361 284 10 ...
## $ Gene_5 : num 75 182 208 142 32 10 109 10 292 233 ...
## $ Gene_6 : num 10 37 183 45 10 65 43 338 29 10 ...
## $ Gene_7 : num 165 18 238 247 44 39 100 265 106 10 ...
## $ Gene_8 : num 10 10 104 10 10 10 10 10 10 10 ...
## $ Gene_9 : num 215 116 476 155 122 176 58 257 166 155 ...
## $ Gene_10 : num 14538 615 5669 4850 1284 ...
## $ Gene_11 : num 9738 115 3272 2293 2731 ...
## $ Gene_12 : num 8529 1518 3668 2569 316 ...
## $ Gene_13 : num 70 153 66 10 78 10 10 10 10 10 ...
## $ Gene_14 : num 6750 2215 3325 3058 1130 ...
## $ Gene_15 : num 240 86 252 10 28 71 10 10 235 10 ...
## $ Gene_16 : num 10 185 140 10 10 382 10 10 131 10 ...
## $ Gene_17 : num 72 21 10 61 16 85 10 25 10 65 ...
## $ Gene_18 : num 10 10 142 10 237 10 87 10 148 10 ...
## $ Gene_19 : num 378 249 362 266 554 110 312 238 896 229 ...
## $ Gene_20 : num 10 10 10 10 16 10 134 13 10 10 ...
## $ Gene_21 : num 10 10 94 10 10 10 33 10 34 10 ...
## $ Gene_22 : num 87 53 128 112 10 144 65 203 93 24 ...
## $ Gene_23 : num 11 20 12 34 10 10 10 11 10 19 ...
## $ Gene_24 : num 152 104 10 10 10 88 10 10 35 10 ...
## $ Gene_25 : num 146 224 15 10 169 85 10 101 75 10 ...
## $ Gene_26 : num 117 10 10 10 65 40 10 10 10 30 ...
## $ Gene_27 : num 207 10 109 520 10 401 10 188 180 136 ...
## $ Gene_28 : num 10 348 150 135 280 10 176 10 327 89 ...
## $ Gene_29 : num 169 71 92 88 196 10 146 53 132 39 ...
## $ Gene_30 : num 457 10 376 325 221 280 361 740 296 10 ...
## $ Gene_31 : num 10 10 10 193 10 104 248 415 10 62 ...
## $ Gene_32 : num 484 485 10 597 155 501 774 982 10 59 ...
## $ Gene_33 : num 99 21 10 33 110 76 70 76 98 10 ...
## $ Gene_34 : num 65 92 163 139 107 76 68 10 98 62 ...
## $ Gene_35 : num 197 119 293 118 200 44 181 206 93 55 ...
## $ Gene_36 : num 124 65 10 10 24 10 10 10 10 10 ...
## $ Gene_37 : num 36 58 63 38 120 92 16 169 43 10 ...
## $ Gene_38 : num 10 10 10 307 212 314 10 332 221 10 ...
## $ Gene_39 : num 10 2072 1658 2209 5846 ...
## $ Gene_40 : num 40 87 77 136 223 10 42 10 82 66 ...
## $ Gene_41 : num 42 82 10 101 80 122 35 10 10 10 ...
## $ Gene_42 : num 10 334 10 366 408 200 381 10 10 36 ...
## $ Gene_43 : num 397 515 343 36 29 53 10 19 13 652 ...
## $ Gene_44 : num 10 95 10 10 177 10 10 10 10 68 ...
## $ Gene_45 : num 10 23 10 10 71 76 10 10 10 10 ...
## $ Gene_46 : num 10 399 102 10 504 23 10 10 305 51 ...
## $ Gene_47 : num 10 103 10 10 10 10 67 10 95 10 ...
## $ Gene_48 : num 163 34 55 66 56 71 117 46 165 76 ...
## $ Gene_49 : num 37 13 63 10 10 10 10 10 10 10 ...
## $ Gene_50 : num 10 10 10 10 257 10 336 10 173 256 ...
## $ Gene_51 : num 98 138 10 134 51 10 53 10 10 10 ...
## $ Gene_52 : num 4707 3367 10 101 1276 ...
## $ Gene_53 : num 100 372 10 234 266 156 137 194 49 106 ...
## $ Gene_54 : num 60 10 39 82 37 62 10 47 10 15 ...
## $ Gene_55 : num 10 121 10 18 284 10 10 10 10 73 ...
## $ Gene_56 : num 10 10 10 103 10 160 67 10 10 30 ...
## $ Gene_57 : num 122 32 42 88 163 10 80 10 17 39 ...
## $ Gene_58 : num 21 82 10 114 128 58 84 107 10 10 ...
## $ Gene_59 : num 10 10 10 10 157 10 47 10 10 10 ...
## $ Gene_60 : num 10 10 10 10 10 10 10 10 10 36 ...
## $ Gene_61 : num 53 10 114 10 10 10 43 50 74 163 ...
## $ Gene_62 : num 10 14 10 10 39 10 10 10 99 79 ...
## $ Gene_63 : num 92 10 23 224 69 10 835 73 10 10 ...
## $ Gene_64 : num 88 10 145 136 10 10 70 245 72 88 ...
## $ Gene_65 : num 134 56 207 19 10 10 18 179 92 31 ...
## $ Gene_66 : num 10 76 10 10 42 10 10 10 10 26 ...
## $ Gene_67 : num 10 10 10 10 26 19 10 137 295 45 ...
## $ Gene_68 : num 25 10 58 310 61 10 245 149 10 66 ...
## $ Gene_69 : num 122 227 272 46 449 51 124 52 539 186 ...
## $ Gene_70 : num 50 10 107 10 17 48 26 160 69 27 ...
## $ Gene_71 : num 73 85 10 10 10 10 10 85 10 10 ...
## $ Gene_72 : num 10 377 153 224 1714 ...
## $ Gene_73 : num 519 15 144 458 29 279 250 52 83 83 ...
## $ Gene_74 : num 10 10 10 10 226 158 10 10 73 10 ...
## $ Gene_75 : num 10 10 10 10 78 192 10 10 24 10 ...
## $ Gene_76 : num 10 10 79 10 10 10 10 71 43 86 ...
## $ Gene_77 : num 481 10 443 516 1108 ...
## $ Gene_78 : num 101 479 145 169 340 227 62 110 49 10 ...
## $ Gene_79 : num 10 10 46 57 10 62 57 100 17 76 ...
## $ Gene_80 : num 295 437 17 46 577 10 149 10 371 22 ...
## $ Gene_81 : num 166 10 37 151 121 19 208 328 100 64 ...
## $ Gene_82 : num 179 10 66 28 61 24 10 10 10 10 ...
## $ Gene_83 : num 10 10 10 10 513 10 10 10 10 10 ...
## $ Gene_84 : num 151 148 115 218 238 199 270 194 283 108 ...
## $ Gene_85 : num 10 316 10 355 46 35 10 232 116 258 ...
## $ Gene_86 : num 10 337 10 10 130 234 544 287 10 21 ...
## $ Gene_87 : num 50 153 193 105 47 10 36 10 28 10 ...
## $ Gene_88 : num 16 385 10 10 230 83 10 26 161 51 ...
## $ Gene_89 : num 13 10 83 102 99 17 10 65 33 21 ...
## $ Gene_90 : num 338 413 10 387 504 288 146 74 253 201 ...
## $ Gene_91 : num 10 337 10 262 666 151 55 44 247 89 ...
## $ Gene_92 : num 250 10 10 10 10 33 55 10 10 10 ...
## $ Gene_93 : num 380 642 351 406 248 560 680 506 805 79 ...
## $ Gene_94 : num 10 27 114 28 19 10 25 10 39 78 ...
## $ Gene_95 : num 10 10 150 10 110 88 42 10 552 70 ...
## $ Gene_96 : num 125 548 185 10 163 83 10 247 267 112 ...
## $ Gene_97 : num 136 10 112 10 245 10 10 137 109 124 ...
## [list output truncated]
## NULL