# Coursework MA321-7-SP: Initial Task - R Code to Get Started
# Version:January 2025
# NOTE: the workspace is deliberately not wiped with rm(list = ls()).
# That call only removes global objects (attached packages and options stay),
# so it never gave a truly clean state and it silently destroys the caller's
# own objects. Restart the R session before running this script instead.

# --- Load Data ---
# Read the gene-expression table. Per the str()/dimnames() output below, the
# first column "X" is an integer row index and the last column is "Class".
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")

# --- Check the Data ---

# Compact overview of the column names and types (pasted output follows).
str(InitialData)
## 'data.frame':    78 obs. of  4773 variables:
##  $ X             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ J00129        : num  -0.448 -0.48 -0.568 -0.819 -0.112 -0.391 -0.624 -0.528 -0.811 -0.839 ...
##  $ Contig29982_RC: num  -0.296 -0.512 -0.411 -0.267 -0.67 -0.31 -0.12 -0.447 -0.536 2 ...
##  $ Contig42854   : num  -0.1 -0.031 -0.398 0.023 0.421 -0.06 -0.236 -0.254 -0.211 0.147 ...
##  $ Contig42014_RC: num  -0.177 -0.075 0.116 -0.23 -0.19 -0.164 -0.175 0.017 -0.201 -0.325 ...
##  $ Contig27915_RC: num  -0.107 -0.104 -0.092 0.198 0.032 -0.173 0.253 0.654 0.287 -0.303 ...
##  $ Contig20156_RC: num  -0.11 -0.234 -0.166 -0.51 0.281 -0.034 -0.125 0.364 -0.08 -0.061 ...
##  $ Contig50634_RC: num  -0.095 -0.225 0.036 0.529 0.31 -0.091 -0.127 0.068 -0.15 0.097 ...
##  $ Contig42615_RC: num  -0.076 -0.094 0.397 0.354 0.056 0.036 -0.02 0.181 0.045 0.006 ...
##  $ Contig56678_RC: num  -0.134 0.115 -0.194 -0.261 0.116 0.346 0.047 -1.14 -0.11 0.176 ...
##  $ Contig48659_RC: num  -0.14 0.019 -0.128 0.012 0.074 0.007 -0.15 -0.111 -0.072 -0.084 ...
##  $ Contig49388_RC: num  0.006 0.15 0.139 -0.26 0.041 0.251 0.266 -0.153 0.471 0.114 ...
##  $ Contig1970_RC : num  0.111 0.038 -0.033 -0.069 0.067 0.229 0.246 -0.415 -0.096 -0.081 ...
##  $ Contig26343_RC: num  -0.236 0.092 0.039 -0.115 0.279 0.297 0.142 0.111 0.047 -0.071 ...
##  $ Contig53047_RC: num  -0.866 -1.035 -1.114 -1.021 -1.006 ...
##  $ Contig43945_RC: num  0.126 -0.062 0.011 -0.999 0.211 -0.1 -0.194 -0.053 0.096 -0.121 ...
##  $ Contig19551   : num  -0.692 -0.21 -0.462 0.273 0.242 -0.883 0.206 0.174 -0.355 0.23 ...
##  $ Contig10437_RC: num  0.132 -0.139 -0.185 0.159 0.276 -0.146 -0.301 -0.075 0.253 0.022 ...
##  $ Contig47230_RC: num  0.095 0.068 -0.168 -0.398 -0.604 0.382 -0.549 -0.635 0.856 0.515 ...
##  $ Contig20749_RC: num  0.252 0.268 -0.289 -0.734 0.08 0.403 -0.012 -0.586 0.105 0.138 ...
##  $ AL157502      : num  0.139 -0.179 -0.378 -0.427 0.372 -0.014 -0.022 -0.821 -0.294 -0.165 ...
##  $ Contig36647_RC: num  -0.097 0.181 -0.494 0.848 -0.01 0.6 -0.984 0.077 -0.15 0.58 ...
##  $ D31887        : num  0.113 0.06 -0.211 -0.338 0.076 -0.025 0.075 -0.03 -0.275 0.14 ...
##  $ AB033006      : num  -0.209 -0.198 -0.331 -0.239 -0.118 -0.317 -0.25 -0.082 -0.017 -0.32 ...
##  $ AB033007      : num  0.107 -0.04 0.114 0.081 -0.072 0.134 0.131 0.069 0.177 0.21 ...
##  $ M83822        : num  0.098 0.147 -0.121 -0.09 0.075 0.295 0.024 -0.39 -0.171 0.03 ...
##  $ AB033025      : num  0.11 0.087 -0.141 -0.61 0.236 -0.094 -0.067 -0.116 -0.175 -0.774 ...
##  $ AF114264      : num  0.096 0.051 -0.164 -0.047 0.245 -0.165 -0.072 -0.427 -0.249 -0.372 ...
##  $ Contig40673_RC: num  0.305 -0.056 -0.124 -0.02 -0.19 0.016 -0.246 0.181 1.48 -0.199 ...
##  $ Contig17345_RC: num  0.055 -0.031 -0.031 0.251 -0.06 -0.104 -0.254 0.408 -0.003 0.002 ...
##  $ AB033034      : num  -0.137 -0.05 -0.188 0.153 0.181 -0.231 -0.032 -0.024 -0.305 -0.249 ...
##  $ AB033035      : num  -0.056 -0.162 0.06 -0.249 -0.046 -0.129 0.15 0.088 -0.286 -0.343 ...
##  $ AF227899      : num  -0.001 0.11 -0.395 0.175 0.411 -0.024 -0.246 -0.385 -0.465 0.044 ...
##  $ AB033043      : num  0.108 0.105 0.079 -0.223 0.109 0.201 0.361 -0.224 0.02 0.188 ...
##  $ AB033049      : num  0.329 0.049 0.177 -0.307 0.3 0.046 -0.139 0.036 -0.154 0.069 ...
##  $ Contig55834_RC: num  0.078 0.175 -0.375 0.017 0.255 0.035 -0.044 -0.379 -0.302 0.071 ...
##  $ Contig67229_RC: num  -0.098 -0.107 0.612 -0.107 0.143 0.065 0.061 0.358 0.016 0.025 ...
##  $ Contig3396_RC : num  -0.097 -0.068 0.071 -0.113 -0.021 0.4 0.031 -0.043 -0.004 -0.199 ...
##  $ AB033050      : num  -0.019 0.104 0.023 0.582 0.128 0.216 0.091 -0.186 -0.156 0.032 ...
##  $ AB033055      : num  0.208 0.003 -0.051 0.024 0.161 -0.073 -0.167 -0.28 -0.071 -0.099 ...
##  $ AF009314      : num  -0.021 0.025 -0.279 -0.226 0.09 0.026 0.121 -0.541 -0.106 0.041 ...
##  $ AB033062      : num  0.113 -0.166 -0.153 -0.695 0.101 -0.037 0.366 0.139 -0.05 -0.202 ...
##  $ AB033066      : num  0.178 0.065 -0.077 0.176 -0.089 0.018 -0.144 -0.077 -0.157 0.008 ...
##  $ Contig46243_RC: num  -0.081 0.015 -0.262 -0.209 0.366 0.292 -0.058 -0.017 -0.082 0.113 ...
##  $ Contig26077_RC: num  0.272 0.197 -0.218 -0.038 -0.254 0.107 0.14 -0.4 0.26 0.219 ...
##  $ U45975        : num  0.737 0.268 -0.064 -0.226 -0.511 0.646 0.36 -0.432 0.25 -0.178 ...
##  $ Contig43679_RC: num  0.122 0.099 -0.646 0.169 -0.128 -0.055 -0.21 -0.353 -0.12 0.12 ...
##  $ AB033073      : num  -0.152 0.018 -0.028 -0.167 0.307 0.157 0.261 -0.297 0.129 0.282 ...
##  $ AF018081      : num  -0.063 -0.212 -0.129 -0.347 0.133 -0.128 0.05 -0.389 -0.116 0.145 ...
##  $ AB033079      : num  -0.009 0.07 -0.159 0.004 0.018 -0.26 0.069 -0.18 -0.378 -0.092 ...
##  $ X56210        : num  -0.146 -0.06 -0.045 -0.13 0.185 -0.099 0.04 -0.168 -0.146 -0.095 ...
##  $ AB033091      : num  -0.176 0.018 -0.194 0.212 0.097 -0.202 0.073 -0.481 -0.391 -0.014 ...
##  $ AB033092      : num  0.015 -0.064 -0.312 -0.08 0.138 -0.071 0.012 0.025 -0.333 -0.013 ...
##  $ NM_003004     : num  -0.47 -0.576 -0.064 -0.104 0.134 -0.09 -0.072 0.854 -0.068 -0.049 ...
##  $ Contig57877_RC: num  0.212 -0.053 -0.088 -0.356 0.007 -0.12 -0.14 -0.368 -0.277 -0.401 ...
##  $ NM_003010     : num  -0.211 -0.331 -0.355 -0.064 0.395 0.227 0.118 -0.106 -0.089 0.261 ...
##  $ NM_003012     : num  0.238 -0.26 0.141 -0.306 -0.098 -0.114 -0.026 0.263 -0.095 -0.222 ...
##  $ NM_003014     : num  0.039 -0.039 0.112 -0.273 -0.051 -0.047 0.317 -0.54 0.029 -0.16 ...
##  $ Contig43806_RC: num  -0.734 -0.661 -0.632 -0.944 -0.94 -0.58 -0.494 -0.924 -0.616 -0.7 ...
##  $ Contig29226_RC: num  0.045 -0.135 -0.041 -0.54 0.185 -0.033 -0.064 0.084 0.048 -0.04 ...
##  $ NM_003020     : num  -0.103 -0.255 -0.034 -0.548 -0.067 -0.237 -0.002 -0.351 -0.362 -0.164 ...
##  $ NM_003022     : num  0.292 0.092 -0.049 0.318 -0.051 0.259 -0.002 -0.284 -0.158 0.125 ...
##  $ Contig54847_RC: num  0.181 -0.208 -0.178 -0.692 0.129 0.198 0.436 -0.838 -0.029 0.166 ...
##  $ Contig33260_RC: num  0.056 0.297 -0.342 0.007 0.175 0.168 0.41 -0.089 -0.035 -0.006 ...
##  $ NM_002300     : num  -0.434 -0.316 -0.525 0.033 -0.178 -0.023 -0.149 0.245 -0.131 -0.457 ...
##  $ Contig14658_RC: num  0.043 0.087 -0.036 -0.115 0.208 0.058 0.331 -0.161 0.599 0.23 ...
##  $ NM_003033     : num  0.236 0.031 0.34 0.023 -0.247 0.123 -0.309 -0.077 -0.119 -0.23 ...
##  $ NM_003034     : num  0.096 -0.09 -0.047 -0.011 -0.406 -0.244 -0.218 -0.081 -0.016 -0.593 ...
##  $ NM_002306     : num  -0.271 0.066 0.092 -0.185 -0.01 0.025 0.094 0.152 0.156 0.185 ...
##  $ NM_003035     : num  -0.385 -0.08 0.053 -0.032 -0.071 -0.184 -0.534 0.581 -0.283 -0.253 ...
##  $ NM_002308     : num  -0.237 -0.269 0.203 0.312 0.088 -0.399 -0.076 0.425 -0.054 0.116 ...
##  $ NM_003038     : num  0.131 0.275 0.065 0.043 -0.171 0.192 -0.046 0.086 0.384 0.015 ...
##  $ NM_002313     : num  -0.047 -0.036 0.109 0.516 -0.197 0.063 0.026 -0.108 -0.185 0.143 ...
##  $ Contig54839_RC: num  0.13 -0.101 0.224 -0.149 0.01 -0.05 -0.104 0.083 0.156 0.028 ...
##  $ NM_002318     : num  -0.386 0.189 -0.122 -0.75 0.039 -0.236 0.343 -0.285 0.148 -0.205 ...
##  $ NM_003051     : num  0.299 -0.173 0.193 -0.02 -0.155 0.005 -0.375 -0.3 -0.363 0.064 ...
##  $ NM_003056     : num  0.116 -0.073 0.03 -0.041 -0.164 -0.049 0.113 0.167 0.035 -0.314 ...
##  $ Contig66143_RC: num  -0.294 0.55 0.642 -0.087 -0.381 -0.417 -0.137 -0.081 -0.27 -0.859 ...
##  $ Contig51809_RC: num  0.169 -0.086 0.129 -0.3 -0.054 -0.104 0.076 0.176 0.184 -0.298 ...
##  $ NM_002332     : num  0.025 -0.141 -0.113 -0.389 0.257 0.047 0.341 -0.346 0.221 0.215 ...
##  $ NM_001605     : num  -0.101 -0.138 -0.054 0.232 -0.147 -0.083 -0.082 -0.103 -0.113 -0.213 ...
##  $ NM_003064     : num  -0.065 -0.107 -0.033 0.069 -0.019 0.006 -0.052 0.305 0.527 0 ...
##  $ NM_002336     : num  -0.005 -0.162 -0.015 -0.024 -0.051 0.122 0.11 -0.039 -0.079 0.209 ...
##  $ NM_002337     : num  -0.083 -0.024 -0.17 0.023 -0.029 -0.019 0.119 0.034 0.192 -0.016 ...
##  $ NM_003066     : num  -0.131 -0.093 -0.026 0.028 -0.029 -0.016 -0.097 0.319 0.432 0.004 ...
##  $ NM_001609     : num  0.081 -0.026 -0.133 0.077 -0.191 0.102 -0.207 -0.595 -0.107 -0.131 ...
##  $ Contig50846_RC: num  0.064 -0.051 -0.083 -0.009 -0.063 -0.019 -0.019 0.306 -0.005 -0.219 ...
##  $ NM_001611     : num  -0.712 -0.435 -0.532 -0.097 -0.278 0.323 -0.371 0.188 -0.033 -0.062 ...
##  $ NM_003070     : num  0.09 0.028 -0.042 -0.261 0.151 0.078 0.188 -0.177 -0.254 0.227 ...
##  $ NM_002341     : num  -0.269 -0.731 -0.177 0.369 -0.48 -0.455 -0.133 0.219 0.114 -0.513 ...
##  $ NM_001613     : num  -0.143 0.053 -0.05 -0.492 0.074 -0.121 0.277 -0.331 -0.07 -0.145 ...
##  $ NM_003071     : num  -0.08 -0.125 0.097 -0.012 -0.003 0.381 -0.355 0.127 0.181 -0.159 ...
##  $ NM_001614     : num  -0.064 0.102 -0.031 -0.112 -0.196 -0.114 0.122 0.052 -0.141 0.13 ...
##  $ NM_002343     : num  -0.58 -1.26 -0.261 -0.356 -0.547 -0.371 -0.026 0.722 -0.657 0.314 ...
##  $ NM_001615     : num  -0.75 -0.23 -0.071 -0.999 -0.573 -0.933 -0.514 -0.696 -0.841 -0.529 ...
##  $ NM_002345     : num  -0.177 0.053 -0.251 -0.124 0.261 -0.182 0.045 -0.552 -0.2 -0.134 ...
##  $ NM_002346     : num  -0.339 -0.08 0.253 0.393 -0.099 -0.159 -0.129 0.07 -0.002 0.057 ...
##  $ NM_001618     : num  -0.292 -0.242 -0.125 0.085 0.181 -0.177 -0.141 0.09 -0.327 0.02 ...
##  $ Contig52320   : num  -0.01 0.311 -0.024 0.191 0.064 -0.096 0.12 -0.481 -0.306 -0.07 ...
##   [list output truncated]
# Output Example:
# 'data.frame': 78 obs. of 4773 variables
# $ X             : int  1 2 3 4 5 6 7 8 9 10 ...
# $ J00129        : num  -0.448 -0.48 -0.568 -0.819 ...
# $ Contig29982_RC: num  -0.296 -0.512 -0.411 -0.267 ...
# $ Contig42854   : num  -0.1 -0.031 -0.398 0.023 ...

# Dataset dimensions as c(number of rows, number of columns).
dim(InitialData)
## [1]   78 4773
# Example Output:
# [1] 78 4773

# Names of the last four columns. tail() is used instead of the hard-coded
# positions 4770:4773 so the check keeps working if the column count changes.
tail(names(InitialData), 4)
## [1] "NM_000895" "NM_000898" "AF067420"  "Class"
# Example Output:
# [1] "NM_000895" "NM_000898" "AF067420" "Class"

# --- Randomization Setup ---

# Table mapping each registration number (column "RegId") to its assigned
# variable positions (all remaining columns).
subsets <- read.csv("C:/AS/subsets.csv")

my_registration_number <- 2401616

# Find the index of the row corresponding to your registration number.
# [["RegId"]] matches the column name exactly (no partial matching as with $).
idx <- which(subsets[["RegId"]] == my_registration_number)

# Fail early: without this check a missing (or duplicated) registration number
# would silently produce an empty (or wrongly sized) variable subset below.
if (length(idx) != 1L) {
  stop(
    "Expected exactly one row in subsets.csv for registration number ",
    my_registration_number, ", found ", length(idx), ".",
    call. = FALSE
  )
}
print(idx) # Print the index to confirm that the registration number was found.
## [1] 45
# For example, [1] 1 indicates that the corresponding row is the first row in the dataset.

# Extract the subset of variables (excluding the first column "RegId") for your registration number.
# The result is a named vector of column positions (10 for the output shown below).
# NOTE: `subsets` is reused on purpose; later code expects the vector under this name.
subsets <- unlist(subsets[idx, -1])
print(subsets) # Print your subset of variables.
##  Var1  Var2  Var3  Var4  Var5  Var6  Var7  Var8  Var9 Var10 
##  3248  2790  2547   405  2761  4257   970  3875  3036  2068
# Example output:
# Var1  Var2  Var3  Var4  Var5  Var6  Var7  Var8  Var9 Var10
# 417   3124  2492  4590  107   1557  4554  3610  4657 2428


# Assume that InitialData is a preloaded dataset containing the original variables.
# Labels/targets: the "Class" column of the full dataset.
Class <- InitialData[["Class"]]

# Select only the columns whose positions are listed in `subsets`.
# drop = FALSE keeps the result a data frame even if a single column is
# selected; without it a one-column selection collapses to a bare vector.
X <- InitialData[, subsets, drop = FALSE]

# Combine the "Class" column with the selected variables to create the final dataset.
My_DataSet <- cbind(Class, X)

# The dataset 'My_DataSet' contains:
# - The "Class" column as the first column.
# - The variables associated with your registration number.
print(My_DataSet)
##    Class Contig4460_RC NM_014750 Contig46881_RC NM_003239 NM_006701
## 1      2         0.114    -0.345          0.475     0.243     0.076
## 2      2         0.087    -0.188         -0.045    -0.025     0.342
## 3      2        -0.179    -0.081         -0.009     0.123     0.181
## 4      2        -0.088    -0.144         -0.235    -0.464     0.047
## 5      2         0.106    -0.152         -0.110     0.097    -0.256
## 6      2        -0.031    -0.260         -0.145     0.387     0.063
## 7      2        -0.179    -0.375         -0.200     0.609     0.040
## 8      2        -0.069     0.253         -0.194    -0.464     0.147
## 9      2        -0.055    -0.109         -0.067     0.374     0.161
## 10     2        -0.072    -0.082         -0.122     0.247     0.013
## 11     2         0.173     0.029          0.228     0.045     0.106
## 12     2        -0.138     0.199         -0.490    -0.297    -0.310
## 13     2        -0.350     0.283         -0.151     0.183     0.209
## 14     2         0.292    -0.490         -0.916     0.202    -0.027
## 15     2         0.012     0.149          0.162    -0.189    -0.128
## 16     2        -0.192     0.012         -0.026     0.165     0.080
## 17     2        -0.144    -0.429          0.366     0.335    -0.192
## 18     2        -0.018    -0.572          0.258     0.146    -0.109
## 19     2        -0.036    -0.389         -0.014     0.283    -0.227
## 20     2         0.091     0.159          0.218    -0.163     0.138
## 21     2        -0.083    -0.492          0.175     0.278    -0.081
## 22     2        -0.279    -0.291          0.026     0.234    -0.101
## 23     2        -0.388    -0.582          0.107     0.233    -0.195
## 24     2         0.105     0.009         -0.078    -0.443    -0.178
## 25     2        -0.045    -0.120         -0.071    -0.166     0.085
## 26     2         0.014    -0.206         -0.021     0.209    -0.011
## 27     2        -0.173    -0.304         -0.035     0.269     0.034
## 28     2        -0.281    -0.422         -0.036    -0.068    -0.164
## 29     2        -0.315    -0.002          0.042     0.188    -0.050
## 30     2        -0.134    -0.285         -0.278     0.341    -0.119
## 31     2         0.018    -0.399         -0.255    -0.038    -0.007
## 32     2        -0.108    -0.094         -0.292    -0.273     0.038
## 33     2         0.068    -0.227          0.105    -0.155     0.124
## 34     2        -0.355     0.424         -0.289    -0.573     0.210
## 35     2         0.284    -0.046         -0.278    -0.119    -0.032
## 36     2         0.024    -0.193         -0.299     0.188    -0.072
## 37     2        -0.329     0.210         -0.485    -0.377    -0.348
## 38     2         0.009    -0.133         -0.304    -0.239    -0.108
## 39     2        -0.300    -0.302         -0.040     0.178    -0.124
## 40     2        -0.180     0.167         -0.058     0.067     0.035
## 41     2        -0.200    -0.296          0.678     0.134     0.098
## 42     2         0.446    -0.577         -0.295     0.341    -0.274
## 43     2         0.200     0.159         -0.054     0.094    -0.200
## 44     2         0.072     0.062         -0.566    -0.557     0.140
## 45     1        -0.218    -0.109         -0.043    -0.081     0.149
## 46     1        -0.066    -0.156         -0.260     0.116    -0.124
## 47     1        -0.200    -0.115         -0.105    -0.028     0.112
## 48     1        -0.437     0.395         -0.430    -0.611    -0.388
## 49     1         0.129     0.153         -0.091    -0.238     0.372
## 50     1         0.033     0.315         -0.150    -0.570    -0.430
## 51     1         0.070    -0.124         -0.163    -0.018    -0.030
## 52     1         0.729    -0.188          0.201    -0.397     0.171
## 53     1        -0.115     0.249         -0.285    -0.089    -0.104
## 54     1         0.101    -0.401          0.194    -0.281     0.024
## 55     1        -0.109    -0.208          0.200    -0.251     0.059
## 56     1         0.035    -0.003          0.015    -0.178    -0.064
## 57     1        -0.053     0.622         -0.339    -0.411    -0.038
## 58     1        -0.302     0.127          0.152    -0.053     0.069
## 59     1        -0.118    -0.046          0.062    -0.046     0.049
## 60     1        -0.222    -0.145         -0.077     0.286    -0.017
## 61     1        -0.015    -0.292          0.175     0.011    -0.082
## 62     1        -0.212    -0.097         -0.121    -0.039     0.006
## 63     1        -0.187    -0.055          0.070    -0.023     0.112
## 64     1         0.082     0.253         -0.009    -0.143    -0.051
## 65     1        -0.117     0.188          0.017    -0.249    -0.113
## 66     1        -0.043     0.279          0.557    -0.626    -0.186
## 67     1        -0.116     0.077         -0.306    -0.638    -0.253
## 68     1         0.038     0.350         -0.168    -0.100     0.157
## 69     1        -0.162    -0.060          0.547    -0.139    -0.161
## 70     1        -0.264    -0.122          0.033    -0.129    -0.010
## 71     1        -0.182     0.715         -0.380    -0.358    -0.008
## 72     1        -0.110    -0.017         -0.101     0.130     0.035
## 73     1        -0.001    -0.501         -0.205    -0.389    -0.264
## 74     1        -0.244     0.181         -0.246    -0.156    -0.103
## 75     1         0.028    -0.057         -0.153    -0.101     0.017
## 76     1         0.007     0.047         -0.233     0.183    -0.192
## 77     1         0.036    -0.038         -0.140    -0.217     0.125
## 78     1         0.009    -0.051         -0.310    -0.029    -0.086
##    Contig41538_RC NM_002857 NM_018374 AL137449 NM_014316
## 1          -0.516    -0.144    -0.140    0.850    -0.092
## 2          -0.495    -0.056    -0.090    0.101     0.041
## 3           0.591    -0.134    -0.101   -0.346    -0.019
## 4           0.350    -0.031     0.195   -0.088     0.355
## 5          -0.054     0.160     0.161   -0.208    -0.152
## 6          -0.416    -0.137    -0.127   -0.383    -0.036
## 7          -0.284    -0.075    -0.088   -0.077    -0.071
## 8           0.193    -0.098    -0.139   -0.346     0.391
## 9          -0.386    -0.185    -0.241   -0.170     0.155
## 10          0.051     0.107    -0.134    0.244    -0.086
## 11          0.270    -0.012    -0.087   -0.339    -0.114
## 12         -0.051    -0.242    -0.349   -0.341    -0.128
## 13         -0.280    -0.120    -0.309   -0.358     0.084
## 14         -0.421     0.237    -0.170    0.726    -0.102
## 15         -0.428    -0.025     0.296    0.210     0.000
## 16         -0.064    -0.098    -0.074   -0.334     0.072
## 17         -0.211    -0.031    -0.299   -0.218    -0.149
## 18         -0.355    -0.043    -0.016    0.239     0.006
## 19          0.082     0.194    -0.120   -0.164    -0.241
## 20          0.204    -0.109    -0.054    0.028    -0.154
## 21         -0.339    -0.149    -0.295   -0.313     0.205
## 22          0.329     0.007     0.024    0.547    -0.298
## 23         -0.375     0.023    -0.126   -0.221    -0.292
## 24          0.302    -0.159    -0.160   -0.217     0.199
## 25          0.538     0.011    -0.298   -0.341     0.125
## 26         -0.346    -0.085     0.050    0.021    -0.044
## 27         -0.282     0.013    -0.006   -0.111    -0.086
## 28         -0.157    -0.138    -0.103   -0.143    -0.153
## 29         -0.252    -0.092    -0.328   -0.010     0.201
## 30         -0.239     0.002    -0.125    0.182    -0.091
## 31          0.306     0.048     0.079   -0.185    -0.015
## 32          0.317     0.079     0.008    0.202    -0.089
## 33         -0.232    -0.031     0.023   -0.304    -0.014
## 34         -0.214     0.289    -0.201   -0.640     0.058
## 35         -0.029    -0.004    -0.167    0.074     0.073
## 36          0.223     0.162    -0.236   -0.120     0.081
## 37         -0.604    -0.320    -0.328   -0.678     0.214
## 38         -0.414     0.245     0.136   -0.324     0.103
## 39         -0.382    -0.007    -0.014    0.148    -0.141
## 40          0.094     0.124    -0.039   -0.266     0.017
## 41         -0.141     0.288    -0.100   -0.469    -0.086
## 42         -0.390    -0.099    -0.057    0.383    -0.301
## 43          0.426    -0.048     0.527   -0.423     0.051
## 44          0.099    -0.294     0.021   -0.345     0.263
## 45          0.159     0.013    -0.223   -0.395     0.169
## 46         -0.175    -0.181    -0.036   -0.239     0.228
## 47          0.141    -0.020    -0.300   -0.155    -0.029
## 48         -0.142    -0.314    -0.238   -0.079     0.294
## 49         -0.362     0.075     0.216   -0.489    -0.070
## 50          0.072    -0.138    -0.168   -0.425    -0.164
## 51         -0.155     0.029    -0.155    0.555     0.000
## 52          0.465     0.029    -0.074    0.077     0.076
## 53          0.059    -0.292    -0.073   -0.269    -0.029
## 54         -2.000    -0.117     0.049    0.543     0.188
## 55          0.058     0.015    -0.108   -0.308     0.078
## 56         -0.053     0.140     0.142   -0.250    -0.198
## 57          0.232     0.007    -0.228   -0.389     0.056
## 58         -0.515     0.094     0.377   -0.397    -0.111
## 59          0.045     0.027     0.096   -0.416    -0.025
## 60         -0.353     0.115    -0.101   -0.418    -0.072
## 61         -0.280     0.116     0.100   -0.403     0.001
## 62         -0.421     0.162     0.034   -0.359     0.025
## 63          0.055    -0.085     0.011   -0.374    -0.014
## 64          0.571    -0.156    -0.126   -0.018    -0.109
## 65         -0.013    -0.331    -0.063   -0.332    -0.011
## 66         -0.224    -0.110    -0.044    0.217     0.135
## 67          0.188    -0.162    -0.151   -0.389     0.455
## 68          0.004    -0.135    -0.163    0.098     0.217
## 69         -0.354     0.180     0.094   -0.198    -0.051
## 70         -0.448    -0.016    -0.189   -0.420     0.113
## 71         -0.337    -0.341    -0.153   -0.421     0.478
## 72         -0.389     0.044     0.009   -0.316    -0.189
## 73         -0.185    -0.151    -0.082   -0.090    -0.241
## 74         -0.223    -0.370    -0.140    0.074    -0.141
## 75          0.001     0.270     0.177   -0.104    -0.050
## 76         -0.023     0.052    -0.111    0.518     0.045
## 77         -0.382     0.170     0.101   -0.352    -0.127
## 78         -0.258     0.192    -0.213    0.064     0.045

# Task 1: Compute the variance, covariance and correlation matrix of your individual subset of 10
# genes. Explain the results and add an appropriate table to your report.

# Coursework MA321-7-SP: Analysis of Gene Expression Data
# Version: January 2025

# NOTE: the workspace is deliberately not wiped with rm(list = ls()).
# Every object used below is (re)created here, and rm() would neither reset
# attached packages nor options; restart the R session for a clean state.

# --- Setup ---
# Load the gene-expression dataset.
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")

# --- Extract Your Subset of 10 Genes ---
# Registration number used to look up the assigned variable subset
# (adjust this value to your own number).
my_registration_number <- 2401616
subsets <- read.csv("C:/AS/subsets.csv")  # Load the file containing registration numbers and subsets

# Find the row for the given registration number
# ([["RegId"]] matches the column name exactly, unlike $).
idx <- which(subsets[["RegId"]] == my_registration_number)

# Fail early: a missing or duplicated registration number would otherwise
# silently yield an empty or wrongly sized gene subset.
if (length(idx) != 1L) {
  stop(
    "Expected exactly one row in subsets.csv for registration number ",
    my_registration_number, ", found ", length(idx), ".",
    call. = FALSE
  )
}

# Column positions of the gene-expression variables for this registration
# number (every column of the matched row except the first, "RegId").
subsets <- unlist(subsets[idx, -1])

# Extract the gene expression data from InitialData.
# drop = FALSE keeps a data frame even for a single selected column.
X <- InitialData[, subsets, drop = FALSE]

# --- Compute Variance, Covariance, and Correlation Matrices ---

# Variance: sample variance of each selected gene (column of X).
# vapply() walks the data-frame columns directly instead of coercing the data
# frame to a matrix as apply() does, and guarantees one numeric value per gene.
gene_variance <- vapply(X, var, numeric(1))

# Covariance: covariance matrix of the selected genes
# (its diagonal holds the same per-gene variances).
gene_covariance <- cov(X)

# Correlation: correlation matrix of the selected genes
# (cor() defaults to the Pearson coefficient).
gene_correlation <- cor(X)

# --- Print and Present the Results ---
# Heading line, followed by the named vector of per-gene variances.
cat("Variance of Each Gene:\n", sep = "")
## Variance of Each Gene:
print(gene_variance)
##  Contig4460_RC      NM_014750 Contig46881_RC      NM_003239      NM_006701 
##     0.03520098     0.07333086     0.06594380     0.07832368     0.02487964 
## Contig41538_RC      NM_002857      NM_018374       AL137449      NM_014316 
##     0.13050131     0.02377509     0.02656569     0.09703756     0.02722110
# Blank line plus heading, followed by the full covariance matrix.
cat("\nCovariance Matrix:\n", sep = "")
## 
## Covariance Matrix:
print(gene_covariance)
##                Contig4460_RC    NM_014750 Contig46881_RC    NM_003239
## Contig4460_RC   0.0352009764 -0.008771209  -0.0009198185 -0.002706309
## NM_014750      -0.0087712095  0.073330856  -0.0158399404 -0.043118438
## Contig46881_RC -0.0009198185 -0.015839940   0.0659438015  0.011776445
## NM_003239      -0.0027063090 -0.043118438   0.0117764449  0.078323681
## NM_006701       0.0029296613  0.002466270   0.0060536513  0.004339042
## Contig41538_RC  0.0082131392  0.023399552  -0.0096507156 -0.015764756
## NM_002857       0.0034752617 -0.011877228   0.0058806064  0.009364266
## NM_018374       0.0070061958 -0.002384750   0.0082138032 -0.001835101
## AL137449        0.0205688342 -0.029822082   0.0019303187  0.021943905
## NM_014316      -0.0023990176  0.018374276  -0.0087796357 -0.020703720
##                   NM_006701 Contig41538_RC     NM_002857     NM_018374
## Contig4460_RC   0.002929661   0.0082131392  0.0034752617  0.0070061958
## NM_014750       0.002466270   0.0233995518 -0.0118772278 -0.0023847502
## Contig46881_RC  0.006053651  -0.0096507156  0.0058806064  0.0082138032
## NM_003239       0.004339042  -0.0157647556  0.0093642657 -0.0018351009
## NM_006701       0.024879636   0.0019394116  0.0049888581  0.0020596044
## Contig41538_RC  0.001939412   0.1305013100  0.0002369431 -0.0031759960
## NM_002857       0.004988858   0.0002369431  0.0237750889  0.0075559081
## NM_018374       0.002059604  -0.0031759960  0.0075559081  0.0265656943
## AL137449       -0.004519434  -0.0223461958  0.0010901269 -0.0006750729
## NM_014316       0.003486768   0.0010698645 -0.0083082118 -0.0050989760
##                     AL137449    NM_014316
## Contig4460_RC   0.0205688342 -0.002399018
## NM_014750      -0.0298220819  0.018374276
## Contig46881_RC  0.0019303187 -0.008779636
## NM_003239       0.0219439051 -0.020703720
## NM_006701      -0.0045194336  0.003486768
## Contig41538_RC -0.0223461958  0.001069864
## NM_002857       0.0010901269 -0.008308212
## NM_018374      -0.0006750729 -0.005098976
## AL137449        0.0970375604 -0.010921772
## NM_014316      -0.0109217722  0.027221100
# Blank line plus heading, followed by the full correlation matrix.
cat("\nCorrelation Matrix:\n", sep = "")
## 
## Correlation Matrix:
print(gene_correlation)
##                Contig4460_RC   NM_014750 Contig46881_RC   NM_003239   NM_006701
## Contig4460_RC     1.00000000 -0.17263894    -0.01909140 -0.05154108  0.09899609
## NM_014750        -0.17263894  1.00000000    -0.22778398 -0.56894868  0.05773979
## Contig46881_RC   -0.01909140 -0.22778398     1.00000000  0.16386291  0.14945430
## NM_003239        -0.05154108 -0.56894868     0.16386291  1.00000000  0.09829362
## NM_006701         0.09899609  0.05773979     0.14945430  0.09829362  1.00000000
## Contig41538_RC    0.12117817  0.23919755    -0.10403156 -0.15593129  0.03403616
## NM_002857         0.12012934 -0.28445307     0.14851627  0.21700319  0.20512475
## NM_018374         0.22911015 -0.05403049     0.19624415 -0.04023026  0.08011267
## AL137449          0.35193494 -0.35352867     0.02413082  0.25170824 -0.09197972
## NM_014316        -0.07750028  0.41125768    -0.20722239 -0.44838285  0.13398255
##                Contig41538_RC    NM_002857   NM_018374    AL137449   NM_014316
## Contig4460_RC     0.121178171  0.120129340  0.22911015  0.35193494 -0.07750028
## NM_014750         0.239197548 -0.284453074 -0.05403049 -0.35352867  0.41125768
## Contig46881_RC   -0.104031556  0.148516275  0.19624415  0.02413082 -0.20722239
## NM_003239        -0.155931286  0.217003186 -0.04023026  0.25170824 -0.44838285
## NM_006701         0.034036159  0.205124753  0.08011267 -0.09197972  0.13398255
## Contig41538_RC    1.000000000  0.004253784 -0.05394012 -0.19857576  0.01795017
## NM_002857         0.004253784  1.000000000  0.30065263  0.02269583 -0.32658305
## NM_018374        -0.053940122  0.300652629  1.00000000 -0.01329597 -0.18961363
## AL137449         -0.198575758  0.022695828 -0.01329597  1.00000000 -0.21250558
## NM_014316         0.017950167 -0.326583054 -0.18961363 -0.21250558  1.00000000
# --- Create Tables for the Report ---

# Create a table for variance
# Two-column report table: gene name and its variance.
# row.names = NULL stops data.frame() from copying the names of the variance
# vector into the row labels, where they would just duplicate the Gene column.
variance_table <- data.frame(
  Gene = names(gene_variance),
  Variance = gene_variance,
  row.names = NULL
)
cat("\nVariance Table:\n")
## 
## Variance Table:
print(variance_table)
##              Gene   Variance
## 1   Contig4460_RC 0.03520098
## 2       NM_014750 0.07333086
## 3  Contig46881_RC 0.06594380
## 4       NM_003239 0.07832368
## 5       NM_006701 0.02487964
## 6  Contig41538_RC 0.13050131
## 7       NM_002857 0.02377509
## 8       NM_018374 0.02656569
## 9        AL137449 0.09703756
## 10      NM_014316 0.02722110
# Create a table for covariance
# Data-frame copy of the covariance matrix, printed under its own heading.
covariance_table <- as.data.frame(gene_covariance)
cat("\nCovariance Matrix Table:\n", sep = "")
## 
## Covariance Matrix Table:
print(covariance_table)
##                Contig4460_RC    NM_014750 Contig46881_RC    NM_003239
## Contig4460_RC   0.0352009764 -0.008771209  -0.0009198185 -0.002706309
## NM_014750      -0.0087712095  0.073330856  -0.0158399404 -0.043118438
## Contig46881_RC -0.0009198185 -0.015839940   0.0659438015  0.011776445
## NM_003239      -0.0027063090 -0.043118438   0.0117764449  0.078323681
## NM_006701       0.0029296613  0.002466270   0.0060536513  0.004339042
## Contig41538_RC  0.0082131392  0.023399552  -0.0096507156 -0.015764756
## NM_002857       0.0034752617 -0.011877228   0.0058806064  0.009364266
## NM_018374       0.0070061958 -0.002384750   0.0082138032 -0.001835101
## AL137449        0.0205688342 -0.029822082   0.0019303187  0.021943905
## NM_014316      -0.0023990176  0.018374276  -0.0087796357 -0.020703720
##                   NM_006701 Contig41538_RC     NM_002857     NM_018374
## Contig4460_RC   0.002929661   0.0082131392  0.0034752617  0.0070061958
## NM_014750       0.002466270   0.0233995518 -0.0118772278 -0.0023847502
## Contig46881_RC  0.006053651  -0.0096507156  0.0058806064  0.0082138032
## NM_003239       0.004339042  -0.0157647556  0.0093642657 -0.0018351009
## NM_006701       0.024879636   0.0019394116  0.0049888581  0.0020596044
## Contig41538_RC  0.001939412   0.1305013100  0.0002369431 -0.0031759960
## NM_002857       0.004988858   0.0002369431  0.0237750889  0.0075559081
## NM_018374       0.002059604  -0.0031759960  0.0075559081  0.0265656943
## AL137449       -0.004519434  -0.0223461958  0.0010901269 -0.0006750729
## NM_014316       0.003486768   0.0010698645 -0.0083082118 -0.0050989760
##                     AL137449    NM_014316
## Contig4460_RC   0.0205688342 -0.002399018
## NM_014750      -0.0298220819  0.018374276
## Contig46881_RC  0.0019303187 -0.008779636
## NM_003239       0.0219439051 -0.020703720
## NM_006701      -0.0045194336  0.003486768
## Contig41538_RC -0.0223461958  0.001069864
## NM_002857       0.0010901269 -0.008308212
## NM_018374      -0.0006750729 -0.005098976
## AL137449        0.0970375604 -0.010921772
## NM_014316      -0.0109217722  0.027221100
# Create a table for correlation
# Data-frame copy of the correlation matrix, printed under its own heading.
correlation_table <- as.data.frame(gene_correlation)
cat("\nCorrelation Matrix Table:\n", sep = "")
## 
## Correlation Matrix Table:
print(correlation_table)
##                Contig4460_RC   NM_014750 Contig46881_RC   NM_003239   NM_006701
## Contig4460_RC     1.00000000 -0.17263894    -0.01909140 -0.05154108  0.09899609
## NM_014750        -0.17263894  1.00000000    -0.22778398 -0.56894868  0.05773979
## Contig46881_RC   -0.01909140 -0.22778398     1.00000000  0.16386291  0.14945430
## NM_003239        -0.05154108 -0.56894868     0.16386291  1.00000000  0.09829362
## NM_006701         0.09899609  0.05773979     0.14945430  0.09829362  1.00000000
## Contig41538_RC    0.12117817  0.23919755    -0.10403156 -0.15593129  0.03403616
## NM_002857         0.12012934 -0.28445307     0.14851627  0.21700319  0.20512475
## NM_018374         0.22911015 -0.05403049     0.19624415 -0.04023026  0.08011267
## AL137449          0.35193494 -0.35352867     0.02413082  0.25170824 -0.09197972
## NM_014316        -0.07750028  0.41125768    -0.20722239 -0.44838285  0.13398255
##                Contig41538_RC    NM_002857   NM_018374    AL137449   NM_014316
## Contig4460_RC     0.121178171  0.120129340  0.22911015  0.35193494 -0.07750028
## NM_014750         0.239197548 -0.284453074 -0.05403049 -0.35352867  0.41125768
## Contig46881_RC   -0.104031556  0.148516275  0.19624415  0.02413082 -0.20722239
## NM_003239        -0.155931286  0.217003186 -0.04023026  0.25170824 -0.44838285
## NM_006701         0.034036159  0.205124753  0.08011267 -0.09197972  0.13398255
## Contig41538_RC    1.000000000  0.004253784 -0.05394012 -0.19857576  0.01795017
## NM_002857         0.004253784  1.000000000  0.30065263  0.02269583 -0.32658305
## NM_018374        -0.053940122  0.300652629  1.00000000 -0.01329597 -0.18961363
## AL137449         -0.198575758  0.022695828 -0.01329597  1.00000000 -0.21250558
## NM_014316         0.017950167 -0.326583054 -0.18961363 -0.21250558  1.00000000
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(pheatmap)

# Assuming gene_variance, gene_covariance, and gene_correlation are already calculated

# --- 1. Covariance Matrix Heatmap using pheatmap ---
# Blue-white-red palette interpolated to 50 colours. Rows and columns (both
# are genes) are clustered and labelled with their names.
pheatmap(
  mat = gene_covariance,
  color = colorRampPalette(colors = c("blue", "white", "red"))(50),
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = TRUE,
  main = "Heatmap of Covariance Matrix"
)

# --- 2. Correlation Matrix Heatmap using ggplot2 ---
# Long format: one row per matrix cell. The plot below reads the columns
# Var1 and Var2 (the two gene labels) and value (their correlation).
correlation_melted <- melt(data = gene_correlation)

# Diverging fill centred on 0; x-axis gene labels are rotated to vertical
ggplot(data = correlation_melted,
       mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                       midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Correlation Matrix Heatmap", x = "Gene", y = "Gene")

# --- 3. Variance Bar Plot using ggplot2 ---
# Pair each column name of X with its entry in gene_variance
variance_df <- data.frame(
  Gene = colnames(X),
  Variance = gene_variance
)

# One bar per gene; stat = "identity" draws the stored values as bar heights
ggplot(data = variance_df, mapping = aes(x = Gene, y = Variance)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  labs(title = "Variance of Each Gene", x = "Gene", y = "Variance") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Task 2: Calculate the distance matrix for the subset of 10 genes:

# Calculate the distance matrix
# Pairwise Euclidean distances (the dist() default) between the rows of X,
# i.e. between observations described by the selected genes.
# Fix: this call used to be dist(subsets). Judging by the output it produced
# (a 10 x 10 table labelled Var1..Var10 whose entries are differences of whole
# numbers), `subsets` holds the selected gene column indices, so the result
# was the gap between index numbers rather than a distance in expression space.
# NOTE(review): assumes X is the observations-by-genes table built earlier in
# the script (X <- InitialData[, subsets]) -- confirm.
distance_matrix <- dist(X)

# Print the distance matrix
distance_matrix
# NOTE(review): the output previously pasted here came from dist(subsets) and
# no longer applies; re-run the script to regenerate it.

# Save the distance matrix
# as.matrix() expands the "dist" object into a full square matrix before writing
write.csv(as.matrix(distance_matrix), "distance_matrix.csv")

# --- Heatmap for Distance Matrix ---
# Convert the distance matrix to a full matrix form
distance_matrix_full <- as.matrix(distance_matrix)

# Melt the distance matrix into long format: one row per cell, with the
# columns Var1, Var2 and value that the plot below maps to x, y and fill
distance_data <- melt(distance_matrix_full)

# Plot the heatmap for the distance matrix.
# This figure used to be built twice from two identical copies of the same
# code; a single copy draws the same picture once.
ggplot(distance_data, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  labs(title = "Heatmap of Distance Matrix",
       x = "Observations",
       y = "Observations",
       fill = "Distance")

Task 3: Use R to produce univariate Q-Q plots and a Q-Q plot based on the generalised distance for the observations of your individual subset of 10 genes

library(MASS)
library(ggplot2)
library(reshape2)

# --- Load the expression data and the table of gene subsets ---
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")
subsets <- read.csv("C:/AS/subsets.csv")

# Locate the row of the subsets table that matches this registration number
my_registration_number <- 2401616
idx <- which(subsets$RegId == my_registration_number)
# Fail early: zero or several matching rows would select the wrong columns
stopifnot("registration number must match exactly one row" = length(idx) == 1L)
# Drop the first column and flatten the row into a vector of column selectors
subsets <- unlist(c(subsets[idx, -1]))

X <- InitialData[, subsets]

# --- Univariate normal Q-Q plots, one panel per gene ---
# par() returns the previous settings so the layout can be restored afterwards
old_par <- par(mfrow = c(2, 5))

# seq_len() gives an empty loop (not c(1, 0)) if X has no columns
for (i in seq_len(ncol(X))) {
  qqnorm(X[, i], main = paste("Q-Q Plot for Gene", colnames(X)[i]))
  qqline(X[, i], col = "red")
}

# Fix: the 2 x 5 grid was never reset, so the plot below was squeezed into a
# single cell of the grid. Restore the previous layout first.
par(old_par)

# --- Chi-squared Q-Q plot of the generalised (Mahalanobis) distances ---
# Distance of every row of X from the column means, using the sample covariance
mahalanobis_distances <- mahalanobis(X, colMeans(X), cov(X))

# Compare against chi-squared quantiles with df = number of genes; points
# close to the red y = x line are consistent with multivariate normality
qqplot(qchisq(ppoints(length(mahalanobis_distances)), df = ncol(X)), mahalanobis_distances,
       main = "Q-Q Plot of Mahalanobis Distances",
       xlab = "Theoretical Quantiles (Chi-squared)",
       ylab = "Observed Mahalanobis Distances")
abline(0, 1, col = "red")

Task 4: Perform Principal Component Analysis (PCA) on the subset of 10 genes:

library(ggplot2)
library(reshape2)

# --- Reload the inputs for the PCA task ---
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")
subsets <- read.csv(file = "C:/AS/subsets.csv")

# Row of the subsets table for this registration number, minus its first
# column, flattened into a vector of column selectors
my_registration_number <- 2401616
idx <- which(subsets$RegId == my_registration_number)
my_subset <- unlist(subsets[idx, -1])

# Expression values of the selected genes
X <- InitialData[, my_subset]

# Standardise every column (centre and scale), then run PCA on the result
X_scaled <- scale(X, center = TRUE, scale = TRUE)
pca_result <- prcomp(x = X_scaled)

# Standard deviation and proportion of variance for each component
summary(object = pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.5963 1.2123 1.1341 1.0234 1.0023 0.90244 0.75282
## Proportion of Variance 0.2548 0.1470 0.1286 0.1047 0.1005 0.08144 0.05667
## Cumulative Proportion  0.2548 0.4018 0.5304 0.6351 0.7356 0.81703 0.87371
##                            PC8    PC9    PC10
## Standard deviation     0.72381 0.6588 0.55233
## Proportion of Variance 0.05239 0.0434 0.03051
## Cumulative Proportion  0.92610 0.9695 1.00000
# Per-component table: index, eigenvalue (squared standard deviation) and the
# share of the total variance that the eigenvalue represents
pca_table <- data.frame(
  Principal_Component = seq_along(pca_result$sdev),
  Eigenvalue = pca_result$sdev^2,
  Variance_Explained = pca_result$sdev^2 / sum(pca_result$sdev^2)
)

print(x = pca_table)
##    Principal_Component Eigenvalue Variance_Explained
## 1                    1  2.5482748         0.25482748
## 2                    2  1.4695882         0.14695882
## 3                    3  1.2861137         0.12861137
## 4                    4  1.0472961         0.10472961
## 5                    5  1.0046527         0.10046527
## 6                    6  0.8144035         0.08144035
## 7                    7  0.5667407         0.05667407
## 8                    8  0.5238975         0.05238975
## 9                    9  0.4339591         0.04339591
## 10                  10  0.3050736         0.03050736
# Run PCA
# Fix: this call used to be prcomp(subsets, scale. = TRUE). At this point
# `subsets` is the table read from subsets.csv, not expression data, so the
# PCA (11 components in the old output) described that lookup table and
# overwrote the valid result that the plots below rely on.
# scale. = TRUE standardises the columns of X, so this reproduces the PCA of
# X_scaled computed above.
pca_result <- prcomp(X, scale. = TRUE)

# Display summary
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.5963 1.2123 1.1341 1.0234 1.0023 0.90244 0.75282
## Proportion of Variance 0.2548 0.1470 0.1286 0.1047 0.1005 0.08144 0.05667
## Cumulative Proportion  0.2548 0.4018 0.5304 0.6351 0.7356 0.81703 0.87371
##                            PC8    PC9    PC10
## Standard deviation     0.72381 0.6588 0.55233
## Proportion of Variance 0.05239 0.0434 0.03051
## Cumulative Proportion  0.92610 0.9695 1.00000
# --- Scree Plot ---
# Share of total variance per component, drawn as connected points
plot(
  x = pca_result$sdev^2 / sum(pca_result$sdev^2),
  type = "b",
  xlab = "Principal Components",
  ylab = "Proportion of Variance Explained",
  main = "Scree Plot of Principal Components"
)

# --- PCA Biplot ---
# Scores and variable loadings on one figure, with reduced label size
biplot(x = pca_result, cex = 0.7, main = "PCA Biplot")

# Plot PCA
# Scatter of the scores on the first two components
plot(
  x = pca_result$x[, c(1, 2)],
  main = "PCA: First vs Second Principal Component"
)

Task 5: Fit a Multivariate Analysis of Variance (MANOVA) model:

# Load necessary libraries
library(ggplot2)

# --- Prepare Data ---
# Expression values of the selected genes (columns chosen via `my_subset`)
gene_expression_data <- InitialData[, my_subset]

# Append the class label as a factor column named Class
gene_expression_data[["Class"]] <- as.factor(InitialData$Class)

# Response matrix for manova(): every column except the trailing Class column
gene_expression_matrix <- as.matrix(
  gene_expression_data[, seq_len(ncol(gene_expression_data) - 1L)]
)

# --- Fit MANOVA Model ---
# All genes jointly as responses, Class as the single explanatory factor
manova_model <- manova(gene_expression_matrix ~ gene_expression_data$Class)

# --- Summary of MANOVA Model ---
# Pillai's trace is the default statistic and the only one shown below;
# other statistics need a different `test` argument
summary(manova_model, test = "Pillai")
##                            Df  Pillai approx F num Df den Df  Pr(>F)  
## gene_expression_data$Class  1 0.22517    1.947     10     67 0.05369 .
## Residuals                  76                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# --- Per-gene follow-up ---
# One univariate ANOVA table per response column. This runs unconditionally,
# whatever the MANOVA p-value above turned out to be.
anova_results <- summary.aov(object = manova_model)
print(x = anova_results)
##  Response Contig4460_RC :
##                            Df  Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.00055 0.000551  0.0155 0.9014
## Residuals                  76 2.70992 0.035657               
## 
##  Response NM_014750 :
##                            Df Sum Sq Mean Sq F value   Pr(>F)   
## gene_expression_data$Class  1 0.6310 0.63102  9.5619 0.002778 **
## Residuals                  76 5.0155 0.06599                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Contig46881_RC :
##                            Df Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.0082 0.008217  0.1232 0.7266
## Residuals                  76 5.0695 0.066703               
## 
##  Response NM_003239 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## gene_expression_data$Class  1 0.8329 0.83289  12.178 0.0008084 ***
## Residuals                  76 5.1980 0.06840                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response NM_006701 :
##                            Df  Sum Sq   Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.00442 0.0044175  0.1757 0.6763
## Residuals                  76 1.91131 0.0251489               
## 
##  Response Contig41538_RC :
##                            Df Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.0778 0.077767  0.5928 0.4437
## Residuals                  76 9.9708 0.131195               
## 
##  Response NM_002857 :
##                            Df  Sum Sq   Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.00313 0.0031257    0.13 0.7194
## Residuals                  76 1.82756 0.0240468               
## 
##  Response NM_018374 :
##                            Df  Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.01828 0.018278  0.6852 0.4104
## Residuals                  76 2.02728 0.026675               
## 
##  Response AL137449 :
##                            Df Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.0925 0.092474  0.9524 0.3322
## Residuals                  76 7.3794 0.097098               
## 
##  Response NM_014316 :
##                            Df  Sum Sq  Mean Sq F value Pr(>F)
## gene_expression_data$Class  1 0.02282 0.022825  0.8367 0.3632
## Residuals                  76 2.07320 0.027279
# --- Visualize the Results ---
# Copy of the expression table (gene columns plus Class) used for plotting
plot_data <- data.frame(gene_expression_data)

# Names of the first ten columns, i.e. the gene columns to be stacked
gene_columns <- colnames(plot_data)[1:10]

# Wide -> long: the gene columns are stacked into a single "Expression"
# column, with the originating column name recorded in "Gene"
plot_data_long <- reshape(
  data = plot_data,
  direction = "long",
  varying = gene_columns,
  times = gene_columns,
  timevar = "Gene",
  v.names = "Expression"
)

# One boxplot panel per gene, split and filled by Class; every panel gets
# its own y-axis range
ggplot(data = plot_data_long,
       mapping = aes(x = Class, y = Expression, fill = Class)) +
  geom_boxplot() +
  facet_wrap(~Gene, scales = "free_y") +
  theme_minimal() +
  labs(title = "Gene Expression by Class (Invasive vs Noninvasive Cancer)",
       x = "Cancer Class",
       y = "Gene Expression")

Task 6: Use the first and second principal components to illustrate whether there is a difference between invasive and noninvasive cancer.

# Load necessary libraries
library(ggplot2)

# --- Prepare Data ---
# Re-select the gene columns from InitialData; this replaces the earlier
# gene_expression_data, so the table holds no Class column here
gene_expression_data <- InitialData[, my_subset]

# Standardise every gene (centre and scale)
scaled_data <- scale(gene_expression_data)

# --- Perform PCA ---
# prcomp() is asked to centre and scale as well, although the input has
# already been standardised by scale() above
pca_result <- prcomp(scaled_data, center = TRUE, scale. = TRUE)

# --- Visualize the PCA Results ---
# Scores on the first two components, paired with the class label as a factor
pca_data <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  Class = as.factor(InitialData$Class)
)

# Scatter of PC1 against PC2, coloured by Class (manual red/blue palette),
# with the legend title removed and the plot title centred
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, color = Class)) +
  geom_point(size = 4) +
  scale_color_manual(values = c("red", "blue")) +
  labs(title = "PCA of Gene Expression Data: Invasive vs Noninvasive Cancer", 
       x = "Principal Component 1", 
       y = "Principal Component 2") +
  theme_minimal() +
  theme(legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

# --- Table of Variance Explained by Each Principal Component ---
summary(object = pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.5963 1.2123 1.1341 1.0234 1.0023 0.90244 0.75282
## Proportion of Variance 0.2548 0.1470 0.1286 0.1047 0.1005 0.08144 0.05667
## Cumulative Proportion  0.2548 0.4018 0.5304 0.6351 0.7356 0.81703 0.87371
##                            PC8    PC9    PC10
## Standard deviation     0.72381 0.6588 0.55233
## Proportion of Variance 0.05239 0.0434 0.03051
## Cumulative Proportion  0.92610 0.9695 1.00000
# --- Additional Details: proportion of variance explained by each PC ---
# NOTE: despite its name, `eigenvalues` stores row 2 of the importance table
# (the proportions of variance), not the eigenvalues (sdev^2) themselves
pca_importance <- summary(pca_result)$importance
eigenvalues <- pca_importance[2, ]
eigenvalues_table <- data.frame(
  PC = seq_along(eigenvalues),
  Variance_Explained = eigenvalues
)

# Print the table of proportions
print(x = eigenvalues_table)
##      PC Variance_Explained
## PC1   1            0.25483
## PC2   2            0.14696
## PC3   3            0.12861
## PC4   4            0.10473
## PC5   5            0.10047
## PC6   6            0.08144
## PC7   7            0.05667
## PC8   8            0.05239
## PC9   9            0.04340
## PC10 10            0.03051

Task 7: Apply LDA to your individual subset of 10 genes and the class variable (invasive (label 1) and noninvasive (label 2) cancer). Calculate a confusion matrix, sensitivity, specificity and misclassification error.

# Load necessary libraries
library(MASS)       # For LDA
library(caret)      # For confusionMatrix and performance metrics
## Loading required package: lattice
library(ggplot2)    # For visualization

# --- Prepare Data ---
# Expression values of the selected genes (columns chosen via `my_subset`)
gene_expression_data <- InitialData[, my_subset]

# Append the class label as a factor column named Class
gene_expression_data[["Class"]] <- as.factor(InitialData$Class)

# --- Fit LDA Model ---
# Class is the response; every other column of the table is a predictor
lda_model <- lda(Class ~ ., data = gene_expression_data)

# --- Predict the Classes using LDA Model ---
# No new data is supplied to predict(); the predicted labels are compared
# below with the Class column of the table the model was fitted on
lda_predictions <- predict(object = lda_model)

# --- Confusion Matrix ---
# Cross-tabulate the predicted labels against the true labels
conf_matrix <- confusionMatrix(
  data = lda_predictions$class,
  reference = gene_expression_data$Class
)

# Print the confusion matrix together with caret's summary statistics
print(x = conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 20  8
##          2 14 36
##                                           
##                Accuracy : 0.7179          
##                  95% CI : (0.6047, 0.8141)
##     No Information Rate : 0.5641          
##     P-Value [Acc > NIR] : 0.003743        
##                                           
##                   Kappa : 0.4147          
##                                           
##  Mcnemar's Test P-Value : 0.286422        
##                                           
##             Sensitivity : 0.5882          
##             Specificity : 0.8182          
##          Pos Pred Value : 0.7143          
##          Neg Pred Value : 0.7200          
##              Prevalence : 0.4359          
##          Detection Rate : 0.2564          
##    Detection Prevalence : 0.3590          
##       Balanced Accuracy : 0.7032          
##                                           
##        'Positive' Class : 1               
## 
# --- Calculate Sensitivity, Specificity, and Misclassification Error ---
# Both rates are read from caret's per-class statistics; the printed
# confusion matrix reports class 1 as the 'Positive' class
sensitivity <- conf_matrix$byClass["Sensitivity"]
specificity <- conf_matrix$byClass["Specificity"]

# Misclassification error is the complement of the overall accuracy
misclassification_error <- 1 - conf_matrix$overall["Accuracy"]

# Report the three figures, one per line
cat("Sensitivity (Invasive): ", sensitivity, "\n")
## Sensitivity (Invasive):  0.5882353
cat("Specificity (Noninvasive): ", specificity, "\n")
## Specificity (Noninvasive):  0.8181818
cat("Misclassification Error: ", misclassification_error, "\n")
## Misclassification Error:  0.2820513
# Load necessary library
library(caret)  # For confusionMatrix

# --- Prepare Data ---
# Relies on objects created earlier in the script: `lda_predictions` (its
# `class` element holds the predicted labels) and `gene_expression_data`
# (its Class column holds the true labels)

# Rebuild the confusion matrix of predicted against true labels
conf_matrix <- confusionMatrix(
  data = lda_predictions$class,
  reference = gene_expression_data$Class
)

# Print the confusion matrix
print(x = conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1 20  8
##          2 14 36
##                                           
##                Accuracy : 0.7179          
##                  95% CI : (0.6047, 0.8141)
##     No Information Rate : 0.5641          
##     P-Value [Acc > NIR] : 0.003743        
##                                           
##                   Kappa : 0.4147          
##                                           
##  Mcnemar's Test P-Value : 0.286422        
##                                           
##             Sensitivity : 0.5882          
##             Specificity : 0.8182          
##          Pos Pred Value : 0.7143          
##          Neg Pred Value : 0.7200          
##              Prevalence : 0.4359          
##          Detection Rate : 0.2564          
##    Detection Prevalence : 0.3590          
##       Balanced Accuracy : 0.7032          
##                                           
##        'Positive' Class : 1               
## 
# --- Visualization of Confusion Matrix ---
# Long-format counts; the plot below uses the columns Reference, Prediction
# and Freq
conf_matrix_data <- as.data.frame(conf_matrix$table)

# One tile per (true, predicted) pair, shaded by count, with the count
# written on the tile
ggplot(data = conf_matrix_data,
       mapping = aes(x = Reference, y = Prediction)) +
  geom_tile(mapping = aes(fill = Freq), color = "white") +
  geom_text(mapping = aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "white", high = "blue") +
  theme_minimal() +
  labs(title = "Confusion Matrix for LDA Model",
       x = "True Class",
       y = "Predicted Class")