# Coursework MA321-7-SP: Initial Task - R Code to Get Started
# Version:January 2025
# NOTE: the former `rm(list = ls())` call was dropped. Wiping the caller's
# global environment is a side effect a script should not impose, and every
# object used below is created explicitly anyway.
# --- Load Data ---
# Absolute path to the gene-expression CSV; adjust it for your machine.
gene_expression_file <- "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv"
# Fail early with a readable message instead of read.csv()'s connection error.
if (!file.exists(gene_expression_file)) {
  stop("Data file not found: ", gene_expression_file, call. = FALSE)
}
InitialData <- read.csv(file = gene_expression_file)
# --- Check the Data ---
# Compact overview: type and first values of every column.
str(InitialData)
## 'data.frame': 78 obs. of 4773 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ J00129 : num -0.448 -0.48 -0.568 -0.819 -0.112 -0.391 -0.624 -0.528 -0.811 -0.839 ...
## $ Contig29982_RC: num -0.296 -0.512 -0.411 -0.267 -0.67 -0.31 -0.12 -0.447 -0.536 2 ...
## $ Contig42854 : num -0.1 -0.031 -0.398 0.023 0.421 -0.06 -0.236 -0.254 -0.211 0.147 ...
## $ Contig42014_RC: num -0.177 -0.075 0.116 -0.23 -0.19 -0.164 -0.175 0.017 -0.201 -0.325 ...
## $ Contig27915_RC: num -0.107 -0.104 -0.092 0.198 0.032 -0.173 0.253 0.654 0.287 -0.303 ...
## $ Contig20156_RC: num -0.11 -0.234 -0.166 -0.51 0.281 -0.034 -0.125 0.364 -0.08 -0.061 ...
## $ Contig50634_RC: num -0.095 -0.225 0.036 0.529 0.31 -0.091 -0.127 0.068 -0.15 0.097 ...
## $ Contig42615_RC: num -0.076 -0.094 0.397 0.354 0.056 0.036 -0.02 0.181 0.045 0.006 ...
## $ Contig56678_RC: num -0.134 0.115 -0.194 -0.261 0.116 0.346 0.047 -1.14 -0.11 0.176 ...
## $ Contig48659_RC: num -0.14 0.019 -0.128 0.012 0.074 0.007 -0.15 -0.111 -0.072 -0.084 ...
## $ Contig49388_RC: num 0.006 0.15 0.139 -0.26 0.041 0.251 0.266 -0.153 0.471 0.114 ...
## $ Contig1970_RC : num 0.111 0.038 -0.033 -0.069 0.067 0.229 0.246 -0.415 -0.096 -0.081 ...
## $ Contig26343_RC: num -0.236 0.092 0.039 -0.115 0.279 0.297 0.142 0.111 0.047 -0.071 ...
## $ Contig53047_RC: num -0.866 -1.035 -1.114 -1.021 -1.006 ...
## $ Contig43945_RC: num 0.126 -0.062 0.011 -0.999 0.211 -0.1 -0.194 -0.053 0.096 -0.121 ...
## $ Contig19551 : num -0.692 -0.21 -0.462 0.273 0.242 -0.883 0.206 0.174 -0.355 0.23 ...
## $ Contig10437_RC: num 0.132 -0.139 -0.185 0.159 0.276 -0.146 -0.301 -0.075 0.253 0.022 ...
## $ Contig47230_RC: num 0.095 0.068 -0.168 -0.398 -0.604 0.382 -0.549 -0.635 0.856 0.515 ...
## $ Contig20749_RC: num 0.252 0.268 -0.289 -0.734 0.08 0.403 -0.012 -0.586 0.105 0.138 ...
## $ AL157502 : num 0.139 -0.179 -0.378 -0.427 0.372 -0.014 -0.022 -0.821 -0.294 -0.165 ...
## $ Contig36647_RC: num -0.097 0.181 -0.494 0.848 -0.01 0.6 -0.984 0.077 -0.15 0.58 ...
## $ D31887 : num 0.113 0.06 -0.211 -0.338 0.076 -0.025 0.075 -0.03 -0.275 0.14 ...
## $ AB033006 : num -0.209 -0.198 -0.331 -0.239 -0.118 -0.317 -0.25 -0.082 -0.017 -0.32 ...
## $ AB033007 : num 0.107 -0.04 0.114 0.081 -0.072 0.134 0.131 0.069 0.177 0.21 ...
## $ M83822 : num 0.098 0.147 -0.121 -0.09 0.075 0.295 0.024 -0.39 -0.171 0.03 ...
## $ AB033025 : num 0.11 0.087 -0.141 -0.61 0.236 -0.094 -0.067 -0.116 -0.175 -0.774 ...
## $ AF114264 : num 0.096 0.051 -0.164 -0.047 0.245 -0.165 -0.072 -0.427 -0.249 -0.372 ...
## $ Contig40673_RC: num 0.305 -0.056 -0.124 -0.02 -0.19 0.016 -0.246 0.181 1.48 -0.199 ...
## $ Contig17345_RC: num 0.055 -0.031 -0.031 0.251 -0.06 -0.104 -0.254 0.408 -0.003 0.002 ...
## $ AB033034 : num -0.137 -0.05 -0.188 0.153 0.181 -0.231 -0.032 -0.024 -0.305 -0.249 ...
## $ AB033035 : num -0.056 -0.162 0.06 -0.249 -0.046 -0.129 0.15 0.088 -0.286 -0.343 ...
## $ AF227899 : num -0.001 0.11 -0.395 0.175 0.411 -0.024 -0.246 -0.385 -0.465 0.044 ...
## $ AB033043 : num 0.108 0.105 0.079 -0.223 0.109 0.201 0.361 -0.224 0.02 0.188 ...
## $ AB033049 : num 0.329 0.049 0.177 -0.307 0.3 0.046 -0.139 0.036 -0.154 0.069 ...
## $ Contig55834_RC: num 0.078 0.175 -0.375 0.017 0.255 0.035 -0.044 -0.379 -0.302 0.071 ...
## $ Contig67229_RC: num -0.098 -0.107 0.612 -0.107 0.143 0.065 0.061 0.358 0.016 0.025 ...
## $ Contig3396_RC : num -0.097 -0.068 0.071 -0.113 -0.021 0.4 0.031 -0.043 -0.004 -0.199 ...
## $ AB033050 : num -0.019 0.104 0.023 0.582 0.128 0.216 0.091 -0.186 -0.156 0.032 ...
## $ AB033055 : num 0.208 0.003 -0.051 0.024 0.161 -0.073 -0.167 -0.28 -0.071 -0.099 ...
## $ AF009314 : num -0.021 0.025 -0.279 -0.226 0.09 0.026 0.121 -0.541 -0.106 0.041 ...
## $ AB033062 : num 0.113 -0.166 -0.153 -0.695 0.101 -0.037 0.366 0.139 -0.05 -0.202 ...
## $ AB033066 : num 0.178 0.065 -0.077 0.176 -0.089 0.018 -0.144 -0.077 -0.157 0.008 ...
## $ Contig46243_RC: num -0.081 0.015 -0.262 -0.209 0.366 0.292 -0.058 -0.017 -0.082 0.113 ...
## $ Contig26077_RC: num 0.272 0.197 -0.218 -0.038 -0.254 0.107 0.14 -0.4 0.26 0.219 ...
## $ U45975 : num 0.737 0.268 -0.064 -0.226 -0.511 0.646 0.36 -0.432 0.25 -0.178 ...
## $ Contig43679_RC: num 0.122 0.099 -0.646 0.169 -0.128 -0.055 -0.21 -0.353 -0.12 0.12 ...
## $ AB033073 : num -0.152 0.018 -0.028 -0.167 0.307 0.157 0.261 -0.297 0.129 0.282 ...
## $ AF018081 : num -0.063 -0.212 -0.129 -0.347 0.133 -0.128 0.05 -0.389 -0.116 0.145 ...
## $ AB033079 : num -0.009 0.07 -0.159 0.004 0.018 -0.26 0.069 -0.18 -0.378 -0.092 ...
## $ X56210 : num -0.146 -0.06 -0.045 -0.13 0.185 -0.099 0.04 -0.168 -0.146 -0.095 ...
## $ AB033091 : num -0.176 0.018 -0.194 0.212 0.097 -0.202 0.073 -0.481 -0.391 -0.014 ...
## $ AB033092 : num 0.015 -0.064 -0.312 -0.08 0.138 -0.071 0.012 0.025 -0.333 -0.013 ...
## $ NM_003004 : num -0.47 -0.576 -0.064 -0.104 0.134 -0.09 -0.072 0.854 -0.068 -0.049 ...
## $ Contig57877_RC: num 0.212 -0.053 -0.088 -0.356 0.007 -0.12 -0.14 -0.368 -0.277 -0.401 ...
## $ NM_003010 : num -0.211 -0.331 -0.355 -0.064 0.395 0.227 0.118 -0.106 -0.089 0.261 ...
## $ NM_003012 : num 0.238 -0.26 0.141 -0.306 -0.098 -0.114 -0.026 0.263 -0.095 -0.222 ...
## $ NM_003014 : num 0.039 -0.039 0.112 -0.273 -0.051 -0.047 0.317 -0.54 0.029 -0.16 ...
## $ Contig43806_RC: num -0.734 -0.661 -0.632 -0.944 -0.94 -0.58 -0.494 -0.924 -0.616 -0.7 ...
## $ Contig29226_RC: num 0.045 -0.135 -0.041 -0.54 0.185 -0.033 -0.064 0.084 0.048 -0.04 ...
## $ NM_003020 : num -0.103 -0.255 -0.034 -0.548 -0.067 -0.237 -0.002 -0.351 -0.362 -0.164 ...
## $ NM_003022 : num 0.292 0.092 -0.049 0.318 -0.051 0.259 -0.002 -0.284 -0.158 0.125 ...
## $ Contig54847_RC: num 0.181 -0.208 -0.178 -0.692 0.129 0.198 0.436 -0.838 -0.029 0.166 ...
## $ Contig33260_RC: num 0.056 0.297 -0.342 0.007 0.175 0.168 0.41 -0.089 -0.035 -0.006 ...
## $ NM_002300 : num -0.434 -0.316 -0.525 0.033 -0.178 -0.023 -0.149 0.245 -0.131 -0.457 ...
## $ Contig14658_RC: num 0.043 0.087 -0.036 -0.115 0.208 0.058 0.331 -0.161 0.599 0.23 ...
## $ NM_003033 : num 0.236 0.031 0.34 0.023 -0.247 0.123 -0.309 -0.077 -0.119 -0.23 ...
## $ NM_003034 : num 0.096 -0.09 -0.047 -0.011 -0.406 -0.244 -0.218 -0.081 -0.016 -0.593 ...
## $ NM_002306 : num -0.271 0.066 0.092 -0.185 -0.01 0.025 0.094 0.152 0.156 0.185 ...
## $ NM_003035 : num -0.385 -0.08 0.053 -0.032 -0.071 -0.184 -0.534 0.581 -0.283 -0.253 ...
## $ NM_002308 : num -0.237 -0.269 0.203 0.312 0.088 -0.399 -0.076 0.425 -0.054 0.116 ...
## $ NM_003038 : num 0.131 0.275 0.065 0.043 -0.171 0.192 -0.046 0.086 0.384 0.015 ...
## $ NM_002313 : num -0.047 -0.036 0.109 0.516 -0.197 0.063 0.026 -0.108 -0.185 0.143 ...
## $ Contig54839_RC: num 0.13 -0.101 0.224 -0.149 0.01 -0.05 -0.104 0.083 0.156 0.028 ...
## $ NM_002318 : num -0.386 0.189 -0.122 -0.75 0.039 -0.236 0.343 -0.285 0.148 -0.205 ...
## $ NM_003051 : num 0.299 -0.173 0.193 -0.02 -0.155 0.005 -0.375 -0.3 -0.363 0.064 ...
## $ NM_003056 : num 0.116 -0.073 0.03 -0.041 -0.164 -0.049 0.113 0.167 0.035 -0.314 ...
## $ Contig66143_RC: num -0.294 0.55 0.642 -0.087 -0.381 -0.417 -0.137 -0.081 -0.27 -0.859 ...
## $ Contig51809_RC: num 0.169 -0.086 0.129 -0.3 -0.054 -0.104 0.076 0.176 0.184 -0.298 ...
## $ NM_002332 : num 0.025 -0.141 -0.113 -0.389 0.257 0.047 0.341 -0.346 0.221 0.215 ...
## $ NM_001605 : num -0.101 -0.138 -0.054 0.232 -0.147 -0.083 -0.082 -0.103 -0.113 -0.213 ...
## $ NM_003064 : num -0.065 -0.107 -0.033 0.069 -0.019 0.006 -0.052 0.305 0.527 0 ...
## $ NM_002336 : num -0.005 -0.162 -0.015 -0.024 -0.051 0.122 0.11 -0.039 -0.079 0.209 ...
## $ NM_002337 : num -0.083 -0.024 -0.17 0.023 -0.029 -0.019 0.119 0.034 0.192 -0.016 ...
## $ NM_003066 : num -0.131 -0.093 -0.026 0.028 -0.029 -0.016 -0.097 0.319 0.432 0.004 ...
## $ NM_001609 : num 0.081 -0.026 -0.133 0.077 -0.191 0.102 -0.207 -0.595 -0.107 -0.131 ...
## $ Contig50846_RC: num 0.064 -0.051 -0.083 -0.009 -0.063 -0.019 -0.019 0.306 -0.005 -0.219 ...
## $ NM_001611 : num -0.712 -0.435 -0.532 -0.097 -0.278 0.323 -0.371 0.188 -0.033 -0.062 ...
## $ NM_003070 : num 0.09 0.028 -0.042 -0.261 0.151 0.078 0.188 -0.177 -0.254 0.227 ...
## $ NM_002341 : num -0.269 -0.731 -0.177 0.369 -0.48 -0.455 -0.133 0.219 0.114 -0.513 ...
## $ NM_001613 : num -0.143 0.053 -0.05 -0.492 0.074 -0.121 0.277 -0.331 -0.07 -0.145 ...
## $ NM_003071 : num -0.08 -0.125 0.097 -0.012 -0.003 0.381 -0.355 0.127 0.181 -0.159 ...
## $ NM_001614 : num -0.064 0.102 -0.031 -0.112 -0.196 -0.114 0.122 0.052 -0.141 0.13 ...
## $ NM_002343 : num -0.58 -1.26 -0.261 -0.356 -0.547 -0.371 -0.026 0.722 -0.657 0.314 ...
## $ NM_001615 : num -0.75 -0.23 -0.071 -0.999 -0.573 -0.933 -0.514 -0.696 -0.841 -0.529 ...
## $ NM_002345 : num -0.177 0.053 -0.251 -0.124 0.261 -0.182 0.045 -0.552 -0.2 -0.134 ...
## $ NM_002346 : num -0.339 -0.08 0.253 0.393 -0.099 -0.159 -0.129 0.07 -0.002 0.057 ...
## $ NM_001618 : num -0.292 -0.242 -0.125 0.085 0.181 -0.177 -0.141 0.09 -0.327 0.02 ...
## $ Contig52320 : num -0.01 0.311 -0.024 0.191 0.064 -0.096 0.12 -0.481 -0.306 -0.07 ...
## [list output truncated]
# Output Example:
# 'data.frame': 78 obs. of 4773 variables
# $ X : int 1 2 3 4 5 6 7 8 9 10 ...
# $ J00129 : num -0.448 -0.48 -0.568 -0.819 ...
# $ Contig29982_RC: num -0.296 -0.512 -0.411 -0.267 ...
# $ Contig42854 : num -0.1 -0.031 -0.398 0.023 ...
# Number of rows and columns of the loaded data set.
dim(InitialData)
## [1] 78 4773
# Example Output:
# [1] 78 4773
# Names of the last four columns, located relative to the end of the table
# instead of through the hard-coded positions 4770:4773, so the check keeps
# working if the number of columns changes.
tail(colnames(InitialData), 4)
## [1] "NM_000895" "NM_000898" "AF067420" "Class"
# Example Output:
# [1] "NM_000895" "NM_000898" "AF067420" "Class"
# --- Randomization Setup ---
# Lookup table: a RegId column followed by the column positions (Var1..Var10)
# of the variables assigned to each registration number.
subsets <- read.csv("C:/AS/subsets.csv")
my_registration_number <- 2401616
# Find the index of the row corresponding to your registration number.
idx <- which(subsets$RegId == my_registration_number)
# Stop if the number is missing or duplicated: an empty `idx` would otherwise
# propagate silently as an empty variable selection.
if (length(idx) != 1L) {
  stop("Expected exactly one row for registration number ",
       my_registration_number, " in subsets.csv, found ", length(idx),
       call. = FALSE)
}
print(idx) # Print the index to confirm that the registration number was found.
## [1] 45
# For example, [1] 1 indicates that the corresponding row is the first row in the dataset.
# Extract the subset of variables (excluding the first column "RegId") for your registration number.
# The result is a named vector of 10 column positions associated with your registration number.
subsets <- unlist(subsets[idx, -1])
print(subsets) # Print your subset of variables.
## Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10
## 3248 2790 2547 405 2761 4257 970 3875 3036 2068
# Example output:
# Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10
# 417 3124 2492 4590 107 1557 4554 3610 4657 2428
# InitialData must already be loaded at this point.
Class <- InitialData$Class # Extract the "Class" column, which represents the labels or targets.
# Select only the columns (variables) whose positions are listed in `subsets`.
X <- InitialData[, subsets]
# Combine the "Class" column with the selected variables to create the final dataset.
My_DataSet <- cbind(Class, X)
# The dataset 'My_DataSet' contains:
# - The "Class" column as the first column.
# - The 10 variables associated with your registration number.
print(My_DataSet)
## Class Contig4460_RC NM_014750 Contig46881_RC NM_003239 NM_006701
## 1 2 0.114 -0.345 0.475 0.243 0.076
## 2 2 0.087 -0.188 -0.045 -0.025 0.342
## 3 2 -0.179 -0.081 -0.009 0.123 0.181
## 4 2 -0.088 -0.144 -0.235 -0.464 0.047
## 5 2 0.106 -0.152 -0.110 0.097 -0.256
## 6 2 -0.031 -0.260 -0.145 0.387 0.063
## 7 2 -0.179 -0.375 -0.200 0.609 0.040
## 8 2 -0.069 0.253 -0.194 -0.464 0.147
## 9 2 -0.055 -0.109 -0.067 0.374 0.161
## 10 2 -0.072 -0.082 -0.122 0.247 0.013
## 11 2 0.173 0.029 0.228 0.045 0.106
## 12 2 -0.138 0.199 -0.490 -0.297 -0.310
## 13 2 -0.350 0.283 -0.151 0.183 0.209
## 14 2 0.292 -0.490 -0.916 0.202 -0.027
## 15 2 0.012 0.149 0.162 -0.189 -0.128
## 16 2 -0.192 0.012 -0.026 0.165 0.080
## 17 2 -0.144 -0.429 0.366 0.335 -0.192
## 18 2 -0.018 -0.572 0.258 0.146 -0.109
## 19 2 -0.036 -0.389 -0.014 0.283 -0.227
## 20 2 0.091 0.159 0.218 -0.163 0.138
## 21 2 -0.083 -0.492 0.175 0.278 -0.081
## 22 2 -0.279 -0.291 0.026 0.234 -0.101
## 23 2 -0.388 -0.582 0.107 0.233 -0.195
## 24 2 0.105 0.009 -0.078 -0.443 -0.178
## 25 2 -0.045 -0.120 -0.071 -0.166 0.085
## 26 2 0.014 -0.206 -0.021 0.209 -0.011
## 27 2 -0.173 -0.304 -0.035 0.269 0.034
## 28 2 -0.281 -0.422 -0.036 -0.068 -0.164
## 29 2 -0.315 -0.002 0.042 0.188 -0.050
## 30 2 -0.134 -0.285 -0.278 0.341 -0.119
## 31 2 0.018 -0.399 -0.255 -0.038 -0.007
## 32 2 -0.108 -0.094 -0.292 -0.273 0.038
## 33 2 0.068 -0.227 0.105 -0.155 0.124
## 34 2 -0.355 0.424 -0.289 -0.573 0.210
## 35 2 0.284 -0.046 -0.278 -0.119 -0.032
## 36 2 0.024 -0.193 -0.299 0.188 -0.072
## 37 2 -0.329 0.210 -0.485 -0.377 -0.348
## 38 2 0.009 -0.133 -0.304 -0.239 -0.108
## 39 2 -0.300 -0.302 -0.040 0.178 -0.124
## 40 2 -0.180 0.167 -0.058 0.067 0.035
## 41 2 -0.200 -0.296 0.678 0.134 0.098
## 42 2 0.446 -0.577 -0.295 0.341 -0.274
## 43 2 0.200 0.159 -0.054 0.094 -0.200
## 44 2 0.072 0.062 -0.566 -0.557 0.140
## 45 1 -0.218 -0.109 -0.043 -0.081 0.149
## 46 1 -0.066 -0.156 -0.260 0.116 -0.124
## 47 1 -0.200 -0.115 -0.105 -0.028 0.112
## 48 1 -0.437 0.395 -0.430 -0.611 -0.388
## 49 1 0.129 0.153 -0.091 -0.238 0.372
## 50 1 0.033 0.315 -0.150 -0.570 -0.430
## 51 1 0.070 -0.124 -0.163 -0.018 -0.030
## 52 1 0.729 -0.188 0.201 -0.397 0.171
## 53 1 -0.115 0.249 -0.285 -0.089 -0.104
## 54 1 0.101 -0.401 0.194 -0.281 0.024
## 55 1 -0.109 -0.208 0.200 -0.251 0.059
## 56 1 0.035 -0.003 0.015 -0.178 -0.064
## 57 1 -0.053 0.622 -0.339 -0.411 -0.038
## 58 1 -0.302 0.127 0.152 -0.053 0.069
## 59 1 -0.118 -0.046 0.062 -0.046 0.049
## 60 1 -0.222 -0.145 -0.077 0.286 -0.017
## 61 1 -0.015 -0.292 0.175 0.011 -0.082
## 62 1 -0.212 -0.097 -0.121 -0.039 0.006
## 63 1 -0.187 -0.055 0.070 -0.023 0.112
## 64 1 0.082 0.253 -0.009 -0.143 -0.051
## 65 1 -0.117 0.188 0.017 -0.249 -0.113
## 66 1 -0.043 0.279 0.557 -0.626 -0.186
## 67 1 -0.116 0.077 -0.306 -0.638 -0.253
## 68 1 0.038 0.350 -0.168 -0.100 0.157
## 69 1 -0.162 -0.060 0.547 -0.139 -0.161
## 70 1 -0.264 -0.122 0.033 -0.129 -0.010
## 71 1 -0.182 0.715 -0.380 -0.358 -0.008
## 72 1 -0.110 -0.017 -0.101 0.130 0.035
## 73 1 -0.001 -0.501 -0.205 -0.389 -0.264
## 74 1 -0.244 0.181 -0.246 -0.156 -0.103
## 75 1 0.028 -0.057 -0.153 -0.101 0.017
## 76 1 0.007 0.047 -0.233 0.183 -0.192
## 77 1 0.036 -0.038 -0.140 -0.217 0.125
## 78 1 0.009 -0.051 -0.310 -0.029 -0.086
## Contig41538_RC NM_002857 NM_018374 AL137449 NM_014316
## 1 -0.516 -0.144 -0.140 0.850 -0.092
## 2 -0.495 -0.056 -0.090 0.101 0.041
## 3 0.591 -0.134 -0.101 -0.346 -0.019
## 4 0.350 -0.031 0.195 -0.088 0.355
## 5 -0.054 0.160 0.161 -0.208 -0.152
## 6 -0.416 -0.137 -0.127 -0.383 -0.036
## 7 -0.284 -0.075 -0.088 -0.077 -0.071
## 8 0.193 -0.098 -0.139 -0.346 0.391
## 9 -0.386 -0.185 -0.241 -0.170 0.155
## 10 0.051 0.107 -0.134 0.244 -0.086
## 11 0.270 -0.012 -0.087 -0.339 -0.114
## 12 -0.051 -0.242 -0.349 -0.341 -0.128
## 13 -0.280 -0.120 -0.309 -0.358 0.084
## 14 -0.421 0.237 -0.170 0.726 -0.102
## 15 -0.428 -0.025 0.296 0.210 0.000
## 16 -0.064 -0.098 -0.074 -0.334 0.072
## 17 -0.211 -0.031 -0.299 -0.218 -0.149
## 18 -0.355 -0.043 -0.016 0.239 0.006
## 19 0.082 0.194 -0.120 -0.164 -0.241
## 20 0.204 -0.109 -0.054 0.028 -0.154
## 21 -0.339 -0.149 -0.295 -0.313 0.205
## 22 0.329 0.007 0.024 0.547 -0.298
## 23 -0.375 0.023 -0.126 -0.221 -0.292
## 24 0.302 -0.159 -0.160 -0.217 0.199
## 25 0.538 0.011 -0.298 -0.341 0.125
## 26 -0.346 -0.085 0.050 0.021 -0.044
## 27 -0.282 0.013 -0.006 -0.111 -0.086
## 28 -0.157 -0.138 -0.103 -0.143 -0.153
## 29 -0.252 -0.092 -0.328 -0.010 0.201
## 30 -0.239 0.002 -0.125 0.182 -0.091
## 31 0.306 0.048 0.079 -0.185 -0.015
## 32 0.317 0.079 0.008 0.202 -0.089
## 33 -0.232 -0.031 0.023 -0.304 -0.014
## 34 -0.214 0.289 -0.201 -0.640 0.058
## 35 -0.029 -0.004 -0.167 0.074 0.073
## 36 0.223 0.162 -0.236 -0.120 0.081
## 37 -0.604 -0.320 -0.328 -0.678 0.214
## 38 -0.414 0.245 0.136 -0.324 0.103
## 39 -0.382 -0.007 -0.014 0.148 -0.141
## 40 0.094 0.124 -0.039 -0.266 0.017
## 41 -0.141 0.288 -0.100 -0.469 -0.086
## 42 -0.390 -0.099 -0.057 0.383 -0.301
## 43 0.426 -0.048 0.527 -0.423 0.051
## 44 0.099 -0.294 0.021 -0.345 0.263
## 45 0.159 0.013 -0.223 -0.395 0.169
## 46 -0.175 -0.181 -0.036 -0.239 0.228
## 47 0.141 -0.020 -0.300 -0.155 -0.029
## 48 -0.142 -0.314 -0.238 -0.079 0.294
## 49 -0.362 0.075 0.216 -0.489 -0.070
## 50 0.072 -0.138 -0.168 -0.425 -0.164
## 51 -0.155 0.029 -0.155 0.555 0.000
## 52 0.465 0.029 -0.074 0.077 0.076
## 53 0.059 -0.292 -0.073 -0.269 -0.029
## 54 -2.000 -0.117 0.049 0.543 0.188
## 55 0.058 0.015 -0.108 -0.308 0.078
## 56 -0.053 0.140 0.142 -0.250 -0.198
## 57 0.232 0.007 -0.228 -0.389 0.056
## 58 -0.515 0.094 0.377 -0.397 -0.111
## 59 0.045 0.027 0.096 -0.416 -0.025
## 60 -0.353 0.115 -0.101 -0.418 -0.072
## 61 -0.280 0.116 0.100 -0.403 0.001
## 62 -0.421 0.162 0.034 -0.359 0.025
## 63 0.055 -0.085 0.011 -0.374 -0.014
## 64 0.571 -0.156 -0.126 -0.018 -0.109
## 65 -0.013 -0.331 -0.063 -0.332 -0.011
## 66 -0.224 -0.110 -0.044 0.217 0.135
## 67 0.188 -0.162 -0.151 -0.389 0.455
## 68 0.004 -0.135 -0.163 0.098 0.217
## 69 -0.354 0.180 0.094 -0.198 -0.051
## 70 -0.448 -0.016 -0.189 -0.420 0.113
## 71 -0.337 -0.341 -0.153 -0.421 0.478
## 72 -0.389 0.044 0.009 -0.316 -0.189
## 73 -0.185 -0.151 -0.082 -0.090 -0.241
## 74 -0.223 -0.370 -0.140 0.074 -0.141
## 75 0.001 0.270 0.177 -0.104 -0.050
## 76 -0.023 0.052 -0.111 0.518 0.045
## 77 -0.382 0.170 0.101 -0.352 -0.127
## 78 -0.258 0.192 -0.213 0.064 0.045
Task 1: Compute the variance, covariance and correlation matrix of your individual subset of 10 genes.
Explain the results and add an appropriate table to your report.
# Coursework MA321-7-SP: Analysis of Gene Expression Data
# Version: January 2025
# NOTE: the former `rm(list = ls())` call was removed; every object this task
# needs is (re)created below, so clearing the caller's workspace is unnecessary.
# --- Setup ---
# Load the necessary data
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")
# --- Extract Your Subset of 10 Genes ---
# Registration number whose assigned variables are analysed.
my_registration_number <- 2401616
subsets <- read.csv("C:/AS/subsets.csv") # Load the file containing registration numbers and subsets
# Find the row for the given registration number
idx <- which(subsets$RegId == my_registration_number)
# An empty or duplicated match would silently select the wrong columns.
if (length(idx) != 1L) {
  stop("Expected exactly one row for registration number ",
       my_registration_number, " in subsets.csv, found ", length(idx),
       call. = FALSE)
}
# Column positions of the 10 gene expression variables for this registration number
subsets <- unlist(subsets[idx, -1])
# Extract the gene expression data from InitialData
X <- InitialData[, subsets] # Select the 10 variables corresponding to your subset
# --- Compute Variance, Covariance, and Correlation Matrices ---
# Variance of each selected gene. vapply() walks the data-frame columns
# directly (no matrix coercion as with apply()) and guarantees a numeric
# result; the values equal the diagonal of the covariance matrix below.
gene_variance <- vapply(X, var, numeric(1))
# Covariance: Compute the covariance matrix of the selected genes
gene_covariance <- cov(X)
# Correlation: Compute the correlation matrix of the selected genes
gene_correlation <- cor(X)
# --- Print and Present the Results ---
cat("Variance of Each Gene:\n")
## Variance of Each Gene:
print(gene_variance)
## Contig4460_RC NM_014750 Contig46881_RC NM_003239 NM_006701
## 0.03520098 0.07333086 0.06594380 0.07832368 0.02487964
## Contig41538_RC NM_002857 NM_018374 AL137449 NM_014316
## 0.13050131 0.02377509 0.02656569 0.09703756 0.02722110
cat("\nCovariance Matrix:\n")
##
## Covariance Matrix:
print(gene_covariance)
## Contig4460_RC NM_014750 Contig46881_RC NM_003239
## Contig4460_RC 0.0352009764 -0.008771209 -0.0009198185 -0.002706309
## NM_014750 -0.0087712095 0.073330856 -0.0158399404 -0.043118438
## Contig46881_RC -0.0009198185 -0.015839940 0.0659438015 0.011776445
## NM_003239 -0.0027063090 -0.043118438 0.0117764449 0.078323681
## NM_006701 0.0029296613 0.002466270 0.0060536513 0.004339042
## Contig41538_RC 0.0082131392 0.023399552 -0.0096507156 -0.015764756
## NM_002857 0.0034752617 -0.011877228 0.0058806064 0.009364266
## NM_018374 0.0070061958 -0.002384750 0.0082138032 -0.001835101
## AL137449 0.0205688342 -0.029822082 0.0019303187 0.021943905
## NM_014316 -0.0023990176 0.018374276 -0.0087796357 -0.020703720
## NM_006701 Contig41538_RC NM_002857 NM_018374
## Contig4460_RC 0.002929661 0.0082131392 0.0034752617 0.0070061958
## NM_014750 0.002466270 0.0233995518 -0.0118772278 -0.0023847502
## Contig46881_RC 0.006053651 -0.0096507156 0.0058806064 0.0082138032
## NM_003239 0.004339042 -0.0157647556 0.0093642657 -0.0018351009
## NM_006701 0.024879636 0.0019394116 0.0049888581 0.0020596044
## Contig41538_RC 0.001939412 0.1305013100 0.0002369431 -0.0031759960
## NM_002857 0.004988858 0.0002369431 0.0237750889 0.0075559081
## NM_018374 0.002059604 -0.0031759960 0.0075559081 0.0265656943
## AL137449 -0.004519434 -0.0223461958 0.0010901269 -0.0006750729
## NM_014316 0.003486768 0.0010698645 -0.0083082118 -0.0050989760
## AL137449 NM_014316
## Contig4460_RC 0.0205688342 -0.002399018
## NM_014750 -0.0298220819 0.018374276
## Contig46881_RC 0.0019303187 -0.008779636
## NM_003239 0.0219439051 -0.020703720
## NM_006701 -0.0045194336 0.003486768
## Contig41538_RC -0.0223461958 0.001069864
## NM_002857 0.0010901269 -0.008308212
## NM_018374 -0.0006750729 -0.005098976
## AL137449 0.0970375604 -0.010921772
## NM_014316 -0.0109217722 0.027221100
# Heading for the correlation matrix; the leading "\n" leaves a blank line
# after the covariance matrix printed before it.
cat("\nCorrelation Matrix:\n")
##
## Correlation Matrix:
# Pairwise correlations between the selected genes, as returned by cor(X).
print(gene_correlation)
## Contig4460_RC NM_014750 Contig46881_RC NM_003239 NM_006701
## Contig4460_RC 1.00000000 -0.17263894 -0.01909140 -0.05154108 0.09899609
## NM_014750 -0.17263894 1.00000000 -0.22778398 -0.56894868 0.05773979
## Contig46881_RC -0.01909140 -0.22778398 1.00000000 0.16386291 0.14945430
## NM_003239 -0.05154108 -0.56894868 0.16386291 1.00000000 0.09829362
## NM_006701 0.09899609 0.05773979 0.14945430 0.09829362 1.00000000
## Contig41538_RC 0.12117817 0.23919755 -0.10403156 -0.15593129 0.03403616
## NM_002857 0.12012934 -0.28445307 0.14851627 0.21700319 0.20512475
## NM_018374 0.22911015 -0.05403049 0.19624415 -0.04023026 0.08011267
## AL137449 0.35193494 -0.35352867 0.02413082 0.25170824 -0.09197972
## NM_014316 -0.07750028 0.41125768 -0.20722239 -0.44838285 0.13398255
## Contig41538_RC NM_002857 NM_018374 AL137449 NM_014316
## Contig4460_RC 0.121178171 0.120129340 0.22911015 0.35193494 -0.07750028
## NM_014750 0.239197548 -0.284453074 -0.05403049 -0.35352867 0.41125768
## Contig46881_RC -0.104031556 0.148516275 0.19624415 0.02413082 -0.20722239
## NM_003239 -0.155931286 0.217003186 -0.04023026 0.25170824 -0.44838285
## NM_006701 0.034036159 0.205124753 0.08011267 -0.09197972 0.13398255
## Contig41538_RC 1.000000000 0.004253784 -0.05394012 -0.19857576 0.01795017
## NM_002857 0.004253784 1.000000000 0.30065263 0.02269583 -0.32658305
## NM_018374 -0.053940122 0.300652629 1.00000000 -0.01329597 -0.18961363
## AL137449 -0.198575758 0.022695828 -0.01329597 1.00000000 -0.21250558
## NM_014316 0.017950167 -0.326583054 -0.18961363 -0.21250558 1.00000000
# --- Create Tables for the Report ---
# Variance table for the report: one row per gene. The gene identifiers are
# taken from the names carried by the variance vector itself.
cat("\nVariance Table:\n")
##
## Variance Table:
variance_table <- data.frame(Gene = names(gene_variance),
                             Variance = gene_variance)
print(variance_table)
## Gene Variance
## Contig4460_RC Contig4460_RC 0.03520098
## NM_014750 NM_014750 0.07333086
## Contig46881_RC Contig46881_RC 0.06594380
## NM_003239 NM_003239 0.07832368
## NM_006701 NM_006701 0.02487964
## Contig41538_RC Contig41538_RC 0.13050131
## NM_002857 NM_002857 0.02377509
## NM_018374 NM_018374 0.02656569
## AL137449 AL137449 0.09703756
## NM_014316 NM_014316 0.02722110
# Covariance matrix as a data frame (row and column names are carried over),
# printed beneath its own heading.
cat("\nCovariance Matrix Table:\n")
##
## Covariance Matrix Table:
covariance_table <- as.data.frame(gene_covariance)
print(covariance_table)
## Contig4460_RC NM_014750 Contig46881_RC NM_003239
## Contig4460_RC 0.0352009764 -0.008771209 -0.0009198185 -0.002706309
## NM_014750 -0.0087712095 0.073330856 -0.0158399404 -0.043118438
## Contig46881_RC -0.0009198185 -0.015839940 0.0659438015 0.011776445
## NM_003239 -0.0027063090 -0.043118438 0.0117764449 0.078323681
## NM_006701 0.0029296613 0.002466270 0.0060536513 0.004339042
## Contig41538_RC 0.0082131392 0.023399552 -0.0096507156 -0.015764756
## NM_002857 0.0034752617 -0.011877228 0.0058806064 0.009364266
## NM_018374 0.0070061958 -0.002384750 0.0082138032 -0.001835101
## AL137449 0.0205688342 -0.029822082 0.0019303187 0.021943905
## NM_014316 -0.0023990176 0.018374276 -0.0087796357 -0.020703720
## NM_006701 Contig41538_RC NM_002857 NM_018374
## Contig4460_RC 0.002929661 0.0082131392 0.0034752617 0.0070061958
## NM_014750 0.002466270 0.0233995518 -0.0118772278 -0.0023847502
## Contig46881_RC 0.006053651 -0.0096507156 0.0058806064 0.0082138032
## NM_003239 0.004339042 -0.0157647556 0.0093642657 -0.0018351009
## NM_006701 0.024879636 0.0019394116 0.0049888581 0.0020596044
## Contig41538_RC 0.001939412 0.1305013100 0.0002369431 -0.0031759960
## NM_002857 0.004988858 0.0002369431 0.0237750889 0.0075559081
## NM_018374 0.002059604 -0.0031759960 0.0075559081 0.0265656943
## AL137449 -0.004519434 -0.0223461958 0.0010901269 -0.0006750729
## NM_014316 0.003486768 0.0010698645 -0.0083082118 -0.0050989760
## AL137449 NM_014316
## Contig4460_RC 0.0205688342 -0.002399018
## NM_014750 -0.0298220819 0.018374276
## Contig46881_RC 0.0019303187 -0.008779636
## NM_003239 0.0219439051 -0.020703720
## NM_006701 -0.0045194336 0.003486768
## Contig41538_RC -0.0223461958 0.001069864
## NM_002857 0.0010901269 -0.008308212
## NM_018374 -0.0006750729 -0.005098976
## AL137449 0.0970375604 -0.010921772
## NM_014316 -0.0109217722 0.027221100
# Correlation matrix as a data frame (row and column names are carried over),
# printed beneath its own heading.
cat("\nCorrelation Matrix Table:\n")
##
## Correlation Matrix Table:
correlation_table <- as.data.frame(gene_correlation)
print(correlation_table)
## Contig4460_RC NM_014750 Contig46881_RC NM_003239 NM_006701
## Contig4460_RC 1.00000000 -0.17263894 -0.01909140 -0.05154108 0.09899609
## NM_014750 -0.17263894 1.00000000 -0.22778398 -0.56894868 0.05773979
## Contig46881_RC -0.01909140 -0.22778398 1.00000000 0.16386291 0.14945430
## NM_003239 -0.05154108 -0.56894868 0.16386291 1.00000000 0.09829362
## NM_006701 0.09899609 0.05773979 0.14945430 0.09829362 1.00000000
## Contig41538_RC 0.12117817 0.23919755 -0.10403156 -0.15593129 0.03403616
## NM_002857 0.12012934 -0.28445307 0.14851627 0.21700319 0.20512475
## NM_018374 0.22911015 -0.05403049 0.19624415 -0.04023026 0.08011267
## AL137449 0.35193494 -0.35352867 0.02413082 0.25170824 -0.09197972
## NM_014316 -0.07750028 0.41125768 -0.20722239 -0.44838285 0.13398255
## Contig41538_RC NM_002857 NM_018374 AL137449 NM_014316
## Contig4460_RC 0.121178171 0.120129340 0.22911015 0.35193494 -0.07750028
## NM_014750 0.239197548 -0.284453074 -0.05403049 -0.35352867 0.41125768
## Contig46881_RC -0.104031556 0.148516275 0.19624415 0.02413082 -0.20722239
## NM_003239 -0.155931286 0.217003186 -0.04023026 0.25170824 -0.44838285
## NM_006701 0.034036159 0.205124753 0.08011267 -0.09197972 0.13398255
## Contig41538_RC 1.000000000 0.004253784 -0.05394012 -0.19857576 0.01795017
## NM_002857 0.004253784 1.000000000 0.30065263 0.02269583 -0.32658305
## NM_018374 -0.053940122 0.300652629 1.00000000 -0.01329597 -0.18961363
## AL137449 -0.198575758 0.022695828 -0.01329597 1.00000000 -0.21250558
## NM_014316 0.017950167 -0.326583054 -0.18961363 -0.21250558 1.00000000
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(pheatmap)
# Assuming gene_variance, gene_covariance, and gene_correlation are already calculated
# --- 1. Covariance Matrix Heatmap using pheatmap ---
# Diverging palette: blue = negative, white = zero, red = positive covariance.
covariance_palette <- colorRampPalette(c("blue", "white", "red"))(50)
# Covariances take both signs, so the colour breaks are made symmetric about
# zero. Without explicit breaks pheatmap spreads the palette over the raw data
# range, and white would no longer correspond to a covariance of zero.
covariance_limit <- max(abs(gene_covariance))
covariance_breaks <- seq(-covariance_limit, covariance_limit,
                         length.out = length(covariance_palette) + 1)
pheatmap(gene_covariance,
         main = "Heatmap of Covariance Matrix",
         color = covariance_palette,
         breaks = covariance_breaks, # one more break than colours
         cluster_rows = TRUE, # Cluster rows (genes)
         cluster_cols = TRUE, # Cluster columns (genes)
         show_rownames = TRUE, # Show row names (genes)
         show_colnames = TRUE) # Show column names (genes)
# --- 2. Correlation Matrix Heatmap using ggplot2 ---
# Long format for ggplot: one row per gene pair (Var1, Var2) with its
# correlation stored in `value`.
correlation_melted <- melt(gene_correlation)
ggplot(data = correlation_melted,
       mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  # Diverging fill scale with white at a correlation of zero.
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  # Rotate x-axis labels so the gene names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Correlation Matrix Heatmap", x = "Gene", y = "Gene")
# --- 3. Variance Bar Plot using ggplot2 ---
# One bar per gene, bar height = variance of that gene.
variance_df <- data.frame(Gene = colnames(X), Variance = gene_variance)
ggplot(data = variance_df, mapping = aes(x = Gene, y = Variance)) +
  # geom_col() draws bars whose heights are the supplied values, i.e. the
  # same as geom_bar(stat = "identity").
  geom_col(fill = "steelblue") +
  labs(title = "Variance of Each Gene", x = "Gene", y = "Variance") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Task 2: Calculate the distance matrix for the subset of 10 genes:
# Calculate the distance matrix
# Euclidean distances (the dist() default) between the rows of X, i.e. between
# the observations as described by the 10 selected genes.
# The earlier call dist(subsets) was wrong: `subsets` holds the genes' column
# positions in the CSV, so it produced differences between index numbers
# (e.g. |3248 - 2790| = 458), not distances in expression space.
# NOTE(review): if gene-to-gene distances are required instead, use dist(t(X)).
distance_matrix <- dist(X)
# Print the distance matrix
distance_matrix
# (The output previously pasted here came from dist(subsets) and was removed;
# re-run the script to regenerate it.)
# Convert the distance object to a full symmetric matrix
distance_matrix_full <- as.matrix(distance_matrix)
# Save the distance matrix
write.csv(distance_matrix_full, "distance_matrix.csv")
# --- Heatmap for Distance Matrix ---
# Long format for ggplot: one row per pair (Var1, Var2) with its distance.
distance_data <- melt(distance_matrix_full)
# The heatmap is drawn once; the original script repeated the identical plot.
ggplot(distance_data, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  labs(title = "Heatmap of Distance Matrix", x = "Observations", y = "Observations", fill = "Distance")
Task 3: Use R to produce univariate Q-Q plots and a Q-Q plot based on the generalised distance for the observations of your individual subset of 10 genes.
library(MASS)
library(ggplot2)
library(reshape2)
# --- Load the data and select the 10 assigned genes ---
InitialData <- read.csv(file = "C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")
subsets <- read.csv("C:/AS/subsets.csv")
my_registration_number <- 2401616
idx <- which(subsets$RegId == my_registration_number)
# Column positions of the assigned genes (RegId column dropped).
subsets <- unlist(subsets[idx, -1])
X <- InitialData[, subsets]
# --- Univariate normal Q-Q plots, one panel per gene in a 2 x 5 grid ---
# par() returns the previous settings; they are restored after the loop so the
# multivariate plot below gets a full device instead of one small grid cell.
old_par <- par(mfrow = c(2, 5))
# seq_along() is safe even if X has no columns (1:ncol(X) would yield 1, 0).
for (i in seq_along(X)) {
  qqnorm(X[[i]], main = paste("Q-Q Plot for Gene", colnames(X)[i]))
  qqline(X[[i]], col = "red")
}
par(old_par)
# --- Chi-squared Q-Q plot of the generalised (Mahalanobis) distances ---
# mahalanobis() returns squared distances; they are compared with chi-squared
# quantiles whose degrees of freedom equal the number of genes.
mahalanobis_distances <- mahalanobis(X, colMeans(X), cov(X))
qqplot(qchisq(ppoints(length(mahalanobis_distances)), df = ncol(X)), mahalanobis_distances,
       main = "Q-Q Plot of Mahalanobis Distances",
       xlab = "Theoretical Quantiles (Chi-squared)",
       ylab = "Observed Mahalanobis Distances")
# Reference line y = x: points close to it agree with the chi-squared quantiles.
abline(0, 1, col = "red")
Task 4: Perform Principal Component Analysis (PCA) on the subset of 10 genes:
library(ggplot2)
library(reshape2)
# Reload the expression data and the registration lookup table.
InitialData <- read.csv("C:/AS/gene-expression-invasive-vs-noninvasive-cancer.csv")
subsets <- read.csv(file = "C:/AS/subsets.csv")
my_registration_number <- 2401616
# Row of the lookup table belonging to this registration number.
idx <- which(subsets$RegId == my_registration_number)
# Column positions of the assigned genes (first column, RegId, dropped).
# `subsets` itself stays the lookup table in this section.
my_subset <- unlist(subsets[idx, -1])
X <- InitialData[, my_subset]
# Centre each gene and scale it to unit variance before the PCA.
X_scaled <- scale(X)
pca_result <- prcomp(X_scaled)
# Standard deviation, proportion and cumulative proportion of variance per PC.
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5963 1.2123 1.1341 1.0234 1.0023 0.90244 0.75282
## Proportion of Variance 0.2548 0.1470 0.1286 0.1047 0.1005 0.08144 0.05667
## Cumulative Proportion 0.2548 0.4018 0.5304 0.6351 0.7356 0.81703 0.87371
## PC8 PC9 PC10
## Standard deviation 0.72381 0.6588 0.55233
## Proportion of Variance 0.05239 0.0434 0.03051
## Cumulative Proportion 0.92610 0.9695 1.00000
pca_table <- data.frame(Principal_Component = 1:length(pca_result$sdev),
Eigenvalue = pca_result$sdev^2,
Variance_Explained = pca_result$sdev^2 / sum(pca_result$sdev^2))
print(pca_table)
## Principal_Component Eigenvalue Variance_Explained
## 1 1 2.5482748 0.25482748
## 2 2 1.4695882 0.14695882
## 3 3 1.2861137 0.12861137
## 4 4 1.0472961 0.10472961
## 5 5 1.0046527 0.10046527
## 6 6 0.8144035 0.08144035
## 7 7 0.5667407 0.05667407
## 8 8 0.5238975 0.05238975
## 9 9 0.4339591 0.04339591
## 10 10 0.3050736 0.03050736
# Run PCA
pca_result <- prcomp(subsets, scale. = TRUE)
# Display summary
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.2854 1.2140 1.1505 1.1194 1.0902 0.97927 0.93916
## Proportion of Variance 0.1502 0.1340 0.1203 0.1139 0.1080 0.08718 0.08018
## Cumulative Proportion 0.1502 0.2842 0.4045 0.5184 0.6265 0.71365 0.79383
## PC8 PC9 PC10 PC11
## Standard deviation 0.84286 0.7883 0.71513 0.6516
## Proportion of Variance 0.06458 0.0565 0.04649 0.0386
## Cumulative Proportion 0.85841 0.9149 0.96140 1.0000
# --- Scree Plot ---
plot(pca_result$sdev^2 / sum(pca_result$sdev^2), type = "b",
main = "Scree Plot of Principal Components",
xlab = "Principal Components", ylab = "Proportion of Variance Explained")
# --- PCA Biplot ---
biplot(pca_result, main = "PCA Biplot", cex = 0.7)
# Plot PCA
plot(pca_result$x[, 1:2], main = "PCA: First vs Second Principal Component")
# Task 5: Fit a Multivariate Analysis of Variance (MANOVA) model.
# MANOVA of the selected genes against cancer class, followed by per-gene
# ANOVAs and boxplots of expression by class.
# Relies on `InitialData` and `my_subset` created earlier in the script.
library(ggplot2)

# --- Prepare Data ---
# Expression values for the selected genes
gene_expression_data <- InitialData[, my_subset]
# Remember the gene column names before Class is appended, so later steps
# select genes by name instead of assuming "the first 10 columns".
gene_columns <- colnames(gene_expression_data)
# Class label (invasive vs noninvasive) as a factor
gene_expression_data$Class <- as.factor(InitialData$Class)
# manova() needs the responses as a numeric matrix; drop = FALSE keeps the
# matrix shape even if only one gene is selected.
gene_expression_matrix <- as.matrix(gene_expression_data[, gene_columns, drop = FALSE])

# --- Fit MANOVA Model ---
# Responses: gene expression columns; explanatory variable: Class
manova_model <- manova(gene_expression_matrix ~ gene_expression_data$Class)

# --- Summary of MANOVA Model ---
# The default test statistic is Pillai's trace (as in the output below);
# other statistics can be requested via summary(manova_model, test = "Wilks").
summary(manova_model)
## Df Pillai approx F num Df den Df Pr(>F)
## gene_expression_data$Class 1 0.22517 1.947 10 67 0.05369 .
## Residuals 76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# --- Follow-up univariate ANOVAs ---
# One ANOVA per gene. NOTE: the overall MANOVA p-value recorded above (0.05369)
# is not below 0.05, so these per-gene results should be read as exploratory.
anova_results <- summary.aov(manova_model)
print(anova_results)
## Response Contig4460_RC :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.00055 0.000551 0.0155 0.9014
## Residuals 76 2.70992 0.035657
##
## Response NM_014750 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.6310 0.63102 9.5619 0.002778 **
## Residuals 76 5.0155 0.06599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Contig46881_RC :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.0082 0.008217 0.1232 0.7266
## Residuals 76 5.0695 0.066703
##
## Response NM_003239 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.8329 0.83289 12.178 0.0008084 ***
## Residuals 76 5.1980 0.06840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response NM_006701 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.00442 0.0044175 0.1757 0.6763
## Residuals 76 1.91131 0.0251489
##
## Response Contig41538_RC :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.0778 0.077767 0.5928 0.4437
## Residuals 76 9.9708 0.131195
##
## Response NM_002857 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.00313 0.0031257 0.13 0.7194
## Residuals 76 1.82756 0.0240468
##
## Response NM_018374 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.01828 0.018278 0.6852 0.4104
## Residuals 76 2.02728 0.026675
##
## Response AL137449 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.0925 0.092474 0.9524 0.3322
## Residuals 76 7.3794 0.097098
##
## Response NM_014316 :
## Df Sum Sq Mean Sq F value Pr(>F)
## gene_expression_data$Class 1 0.02282 0.022825 0.8367 0.3632
## Residuals 76 2.07320 0.027279

# --- Visualize the Results ---
# Reshape to long format: one row per (observation, gene) with the expression
# value in `Expression` and the gene name in `Gene`.
plot_data <- data.frame(gene_expression_data)
plot_data_long <- reshape(plot_data,
                          varying = gene_columns,
                          v.names = "Expression",
                          times = gene_columns,
                          timevar = "Gene",
                          direction = "long")
# One boxplot panel per gene, split by class; free y-scales because the genes
# have different expression ranges.
ggplot(plot_data_long, aes(x = Class, y = Expression, fill = Class)) +
  geom_boxplot() +
  facet_wrap(~Gene, scales = "free_y") +
  labs(title = "Gene Expression by Class (Invasive vs Noninvasive Cancer)",
       x = "Cancer Class",
       y = "Gene Expression") +
  theme_minimal()
# Task 6: Use the first and second principal components to illustrate whether there is a difference between invasive and noninvasive cancer.
# Scatter plot of the first two principal component scores coloured by cancer
# class, plus a table of the proportion of variance explained per component.
# Relies on `InitialData` and `my_subset` created earlier in the script.
library(ggplot2)

# --- Prepare Data ---
# Gene expression columns only (no Class column)
gene_expression_data <- InitialData[, my_subset]
# Standardise each gene (centre and scale to unit variance)
scaled_data <- scale(gene_expression_data)

# --- Perform PCA ---
# The data are already standardised above, so prcomp() is not asked to scale
# them a second time.
pca_result <- prcomp(scaled_data)

# --- Visualize the PCA Results ---
# Scores on PC1 and PC2 together with the class label of each observation
pca_data <- data.frame(PC1 = pca_result$x[, 1],
                       PC2 = pca_result$x[, 2],
                       Class = as.factor(InitialData$Class))
# Separation between the two colours along PC1/PC2 would indicate a class
# difference.
ggplot(pca_data, aes(x = PC1, y = PC2, color = Class)) +
  geom_point(size = 4) +
  labs(title = "PCA of Gene Expression Data: Invasive vs Noninvasive Cancer",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_minimal() +
  scale_color_manual(values = c("red", "blue")) +
  theme(legend.title = element_blank(),
        plot.title = element_text(hjust = 0.5))

# --- Table of Variance Explained by Each Principal Component ---
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5963 1.2123 1.1341 1.0234 1.0023 0.90244 0.75282
## Proportion of Variance 0.2548 0.1470 0.1286 0.1047 0.1005 0.08144 0.05667
## Cumulative Proportion 0.2548 0.4018 0.5304 0.6351 0.7356 0.81703 0.87371
## PC8 PC9 PC10
## Standard deviation 0.72381 0.6588 0.55233
## Proportion of Variance 0.05239 0.0434 0.03051
## Cumulative Proportion 0.92610 0.9695 1.00000

# --- Additional Details: proportion of variance explained by each PC ---
# NOTE: despite its name, `eigenvalues` holds proportions of variance (the
# eigenvalues divided by their sum), not the raw eigenvalues. The name is kept
# so any later code that refers to it keeps working.
eigenvalues <- summary(pca_result)$importance["Proportion of Variance", ]
eigenvalues_table <- data.frame(PC = seq_along(eigenvalues), Variance_Explained = eigenvalues)
# Print the table of proportions
print(eigenvalues_table)
## PC Variance_Explained
## PC1 1 0.25483
## PC2 2 0.14696
## PC3 3 0.12861
## PC4 4 0.10473
## PC5 5 0.10047
## PC6 6 0.08144
## PC7 7 0.05667
## PC8 8 0.05239
## PC9 9 0.04340
## PC10 10 0.03051
# Task 7: Apply LDA to your individual subset of 10 genes and the class variable (invasive (label 1) and noninvasive (label 2) cancer). Calculate a confusion matrix, sensitivity, specificity and misclassification error.
# Linear discriminant analysis of cancer class on the selected genes, with a
# confusion matrix, sensitivity, specificity, misclassification error and a
# heatmap of the confusion matrix.
# Relies on `InitialData` and `my_subset` created earlier in the script.
library(MASS) # For LDA
library(caret) # For confusionMatrix and performance metrics
## Loading required package: lattice
library(ggplot2) # For visualization

# --- Prepare Data ---
# Expression values for the selected genes plus the class label as a factor
gene_expression_data <- InitialData[, my_subset]
gene_expression_data$Class <- as.factor(InitialData$Class)

# --- Fit LDA Model ---
# Class is the response; every other column (the genes) is a predictor
lda_model <- lda(Class ~ ., data = gene_expression_data)

# --- Predict the Classes using LDA Model ---
# No new data is supplied, so these are predictions for the same observations
# the model was fitted on; the resulting error rates are resubstitution
# estimates and will tend to be optimistic.
lda_predictions <- predict(lda_model)

# --- Confusion Matrix ---
# Predicted labels vs true labels. The first factor level is treated as the
# 'positive' class (see "'Positive' Class : 1" in the output below).
conf_matrix <- confusionMatrix(data = lda_predictions$class,
                               reference = gene_expression_data$Class)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 20 8
## 2 14 36
##
## Accuracy : 0.7179
## 95% CI : (0.6047, 0.8141)
## No Information Rate : 0.5641
## P-Value [Acc > NIR] : 0.003743
##
## Kappa : 0.4147
##
## Mcnemar's Test P-Value : 0.286422
##
## Sensitivity : 0.5882
## Specificity : 0.8182
## Pos Pred Value : 0.7143
## Neg Pred Value : 0.7200
## Prevalence : 0.4359
## Detection Rate : 0.2564
## Detection Prevalence : 0.3590
## Balanced Accuracy : 0.7032
##
## 'Positive' Class : 1
##

# --- Calculate Sensitivity, Specificity, and Misclassification Error ---
# Sensitivity: proportion of class 1 (invasive) observations predicted as 1
sensitivity <- conf_matrix$byClass[["Sensitivity"]]
# Specificity: proportion of class 2 (noninvasive) observations predicted as 2
specificity <- conf_matrix$byClass[["Specificity"]]
# Misclassification error: share of all observations assigned the wrong class
misclassification_error <- 1 - conf_matrix$overall[["Accuracy"]]
cat("Sensitivity (Invasive): ", sensitivity, "\n")
## Sensitivity (Invasive): 0.5882353
cat("Specificity (Noninvasive): ", specificity, "\n")
## Specificity (Noninvasive): 0.8181818
cat("Misclassification Error: ", misclassification_error, "\n")
## Misclassification Error: 0.2820513

# --- Visualization of Confusion Matrix ---
# The confusion matrix computed above is reused here; it is not recomputed or
# printed a second time.
# as.data.frame() on the table gives one row per (Prediction, Reference) cell
# with its count in Freq.
conf_matrix_data <- as.data.frame(conf_matrix$table)
ggplot(conf_matrix_data, aes(Reference, Prediction)) +
  geom_tile(aes(fill = Freq), color = "white") +
  geom_text(aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(title = "Confusion Matrix for LDA Model", x = "True Class", y = "Predicted Class") +
  theme_minimal()