The goal of this exercise is to make you familiar with how to download data from Google Sheets and to briefly review some key concepts R functions and coding concepts.
We’ll do the following things
(TODO: MAKE YOUR OWN OUTLINE)
## Google sheets download package
# comment this out when you are done
# install.packages("googlesheets4")
library(googlesheets4)
# comp bio packages
library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
WEB ADDRESS WITH THE DATA FROM GOOGLE SHEETS
spreadsheet_sp <- "https://docs.google.com/spreadsheets/d/1spC_ZA3_cVuvU3e_Jfcj2nEIfzp-vaP7SA5f-qwQ1pg/edit?usp=sharing"
PACKAGE WILL NOT BE CHECKING USER ACCESS
# be sure to run this!
googlesheets4::gs4_deauth() # <====== MUST RUN THIS
Third, we download our data.
“Error in curl::curl_fetch_memory(url, handle = handle) : Error in the HTTP2 framing layer”
If that happens, just re-run the code.
# I include this again in case you missed is the first time : )
googlesheets4::gs4_deauth()
# download
## NOTE: if you get an error, just run the code again
refseq_column <- read_sheet(ss = spreadsheet_sp, # the url
sheet = "RefSeq_prot", # the name of the worksheet
range = "selenoprot!H1:H364",
col_names = TRUE,
na = "", # fill in empty spaces "" w/NA
trim_ws = TRUE)
## v Reading from "human_gene_table".
## v Range ''selenoprot'!H1:H364'.
## NOTE: if you get an error, just run the code again
# for reasons we won't get into I'm going to do this
protein_refseq <- refseq_column$RefSeq_prot
RESULTS
protein_refseq[1:10]
## [1] "NP_000783.2" "NP_998758.1" "NP_001034804.1" "NP_001034805.1"
## [5] "NP_001311245.1" NA NA "NP_054644.1"
## [9] "NP_001353425.1" "NP_000784.3"
COLUMN
# download
## NOTE: if you get an error, just run the code again
gene_name_column <- read_sheet(ss = spreadsheet_sp, # the url
sheet = "gene", # the name of the worksheet
range = "selenoprot!A1:A364",
col_names = TRUE,
na = "", # fill in empty spaces "" w/NA
trim_ws = TRUE)
## v Reading from "human_gene_table".
## v Range ''selenoprot'!A1:A364'.
## NOTE: if you get an error, just run the code again
# for reasons we won't get into I'm going to do this
gene <- gene_name_column$gene
GETTING THE SUMMARY OF PROTEIN_REGSEQ
is(protein_refseq)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
class(protein_refseq)
## [1] "character"
length(protein_refseq)
## [1] 363
protein_refseq[1:10]
## [1] "NP_000783.2" "NP_998758.1" "NP_001034804.1" "NP_001034805.1"
## [5] "NP_001311245.1" NA NA "NP_054644.1"
## [9] "NP_001353425.1" "NP_000784.3"
cHECKS FOR NA IN THE SEQUENCE
is.na(protein_refseq)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [73] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [253] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [289] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE
cREATES A TABLE COUNTING TRUE AND FALSE IN THE SEQUENCE
table(is.na(protein_refseq))
##
## FALSE TRUE
## 334 29
cOUNTING THE AMOUNT OF TRUE
# ...
temp <- is.na(protein_refseq)
temp
## [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [73] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [253] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [289] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE
# ....
protein_refseq[temp]
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA
temp2 <- protein_refseq[temp]
# ...
length(temp2)
## [1] 29
CREATING A TABLE OF THE GENE AND ITS REFSEQ
seleno_df <- data.frame(gene = gene,
protein_refseq = protein_refseq)
seleno_df
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 2 DIO1 NP_998758.1
## 3 DIO1 NP_001034804.1
## 4 DIO1 NP_001034805.1
## 5 DIO1 NP_001311245.1
## 6 DIO1 <NA>
## 7 DIO1 <NA>
## 8 DIO2 NP_054644.1
## 9 DIO2 NP_001353425.1
## 10 DIO2 NP_000784.3
## 11 DIO2 NP_001311391.2
## 12 DIO2 <NA>
## 13 DIO2 <NA>
## 14 DIO3 NP_001353.4
## 15 GPX1 NP_000572.2
## 16 GPX1 NP_958799.1
## 17 GPX1 NP_001316431.1
## 18 GPX1 NP_001316432.1
## 19 GPX1 NP_001316384.1
## 20 GPX2 NP_002074.2
## 21 GPX2 <NA>
## 22 GPX2 <NA>
## 23 GPX2 <NA>
## 24 GPX3 NP_002075.2
## 25 GPX3 NP_001316719.1
## 26 GPX4 NP_002076.2
## 27 GPX4 NP_001034936.1
## 28 GPX4 NP_001034937.1
## 29 GPX6 NP_874360.1
## 30 MSRB1 NP_057416.1
## 31 SELENOF NP_004252.2
## 32 SELENOF NP_976086.1
## 33 SELENOF <NA>
## 34 SELENOF <NA>
## 35 SELENOH NP_734467.1
## 36 SELENOH NP_001308264.1
## 37 SELENOI NP_277040.1
## 38 SELENOI <NA>
## 39 SELENOK NP_067060.2
## 40 SELENOM NP_536355.1
## 41 SELENON NP_996809.1
## 42 SELENON NP_065184.2
## 43 SELENOO NP_113642.1
## 44 SELENOP NP_005401.3
## 45 SELENOP NP_001078955.1
## 46 SELENOP NP_001087195.1
## 47 SELENOS NP_060915.2
## 48 SELENOS NP_982298.2
## 49 SELENOT NP_057359.2
## 50 SELENOV NP_874363.1
## 51 SELENOV NP_001337738.1
## 52 SELENOV <NA>
## 53 SELENOW NP_003000.1
## 54 SEPHS2 NP_036380.2
## 55 TXNRD1 NP_877393.1
## 56 TXNRD1 NP_877419.1
## 57 TXNRD1 NP_877420.1
## 58 TXNRD1 NP_003321.3
## 59 TXNRD1 NP_001248374.1
## 60 TXNRD1 NP_001248375.1
## 61 TXNRD1 NP_001087240.1
## 62 TXNRD2 NP_006431.2
## 63 TXNRD2 NP_001339229.1
## 64 TXNRD2 NP_001339230.1
## 65 TXNRD2 NP_001339231.1
## 66 TXNRD2 NP_001269441.1
## 67 TXNRD2 NP_001339232.1
## 68 TXNRD2 <NA>
## 69 TXNRD3 NP_443115.1
## 70 TXNRD3 NP_001166984.1
## 71 DIO1 NP_001116124.1
## 72 DIO2 NP_001311436.2
## 73 DIO2 <NA>
## 74 DIO3 NP_001116121.2
## 75 GPX1 NP_001152770.1
## 76 GPX2 NP_001108609.2
## 77 GPX3 NP_001152772.1
## 78 GPX4 NP_001112361.1
## 79 GPX4 NP_001333349.1
## 80 GPX6 NP_001152830.1
## 81 MSRB1 NP_001333615.1
## 82 SELENOF NP_001152888.1
## 83 SELENOF NP_001333897.1
## 84 SELENOF <NA>
## 85 SELENOH NP_001152973.1
## 86 SELENOH NP_001308270.1
## 87 SELENOI NP_001129035.1
## 88 SELENOK NP_001152853.1
## 89 SELENOM NP_001152871.1
## 90 SELENON NP_001334920.1
## 91 SELENON NP_001334921.1
## 92 SELENOO NP_001152883.1
## 93 SELENOP NP_001152962.1
## 94 SELENOS NP_001108227.1
## 95 SELENOS NP_001335139.1
## 96 SELENOT NP_001152886.1
## 97 SELENOV NP_001337751.1
## 98 SELENOW NP_001036202.1
## 99 SEPHS2 NP_001152892.1
## 100 TXNRD1 NP_001338694.1
## 101 TXNRD1 NP_001243204.1
## 102 TXNRD2 NP_001152971.1
## 103 TXNRD3 NP_001340270.1
## 104 DIO1 NP_001116065.1
## 105 DIO2 NP_001010992.2
## 106 DIO3 NP_001010993.2
## 107 GPX1 NP_776501.1
## 108 GPX2 NP_001156611.1
## 109 GPX3 NP_776502.1
## 110 GPX3 <NA>
## 111 GPX4 NP_777195.1
## 112 GPX4 NP_001333359.1
## 113 GPX4 NP_001333360.1
## 114 GPX6 NP_001156614.1
## 115 MSRB1 NP_001029982.2
## 116 SELENOF NP_001029931.2
## 117 SELENOH NP_001157564.1
## 118 SELENOH NP_001308256.1
## 119 SELENOI NP_001068725.2
## 120 SELENOK NP_001032566.2
## 121 SELENOK <NA>
## 122 SELENOM NP_001156643.2
## 123 SELENOM <NA>
## 124 SELENON NP_001108448.1
## 125 SELENOO NP_001156665.2
## 126 SELENOP NP_776884.2
## 127 SELENOS NP_001039579.2
## 128 SELENOS NP_001335154.1
## 129 SELENOS NP_001335155.1
## 130 SELENOT NP_001096573.2
## 131 SELENOT <NA>
## 132 SELENOV NP_001156716.1
## 133 SELENOW NP_001156697.1
## 134 SEPHS2 NP_001108204.2
## 135 TXNRD1 NP_777050.1
## 136 TXNRD2 NP_777051.1
## 137 TXNRD3 NP_001179038.2
## 138 DIO1 NP_031886.3
## 139 DIO2 NP_034180.2
## 140 DIO3 NP_742117.2
## 141 GPX1 NP_032186.2
## 142 GPX1 NP_001316456.1
## 143 GPX1 NP_001316457.1
## 144 GPX2 NP_109602.2
## 145 GPX3 NP_032187.2
## 146 GPX3 NP_001316789.1
## 147 GPX4 NP_032188.3
## 148 GPX4 NP_001032830.2
## 149 GPX4 <NA>
## 150 MSRB1 NP_038787.1
## 151 MSRB1 NP_001333597.1
## 152 SELENOF NP_444332.1
## 153 SELENOH NP_001028338.1
## 154 SELENOH NP_001032356.1
## 155 SELENOI NP_081928.2
## 156 SELENOI <NA>
## 157 SELENOK NP_064363.2
## 158 SELENOM NP_444497.1
## 159 SELENON NP_083376.2
## 160 SELENOO NP_082181.2
## 161 SELENOP NP_033181.3
## 162 SELENOP NP_001036078.1
## 163 SELENOP NP_001036079.1
## 164 SELENOS NP_077759.3
## 165 SELENOS NP_001335175.1
## 166 SELENOT NP_001035486.2
## 167 SELENOV NP_778198.2
## 168 SELENOW NP_033182.1
## 169 SEPHS2 NP_033292.2
## 170 TXNRD1 NP_001035978.1
## 171 TXNRD1 NP_056577.2
## 172 TXNRD1 NP_001035979.1
## 173 TXNRD1 NP_001035988.1
## 174 TXNRD2 NP_038739.2
## 175 TXNRD2 NP_001340072.1
## 176 TXNRD3 NP_694802.2
## 177 TXNRD3 NP_001171529.1
## 178 TXNRD3 NP_001171530.1
## 179 TXNRD3 NP_001171531.1
## 180 DIO1 NP_067685.5
## 181 DIO1 NP_001311139.1
## 182 DIO2 NP_113908.4
## 183 DIO3 NP_058906.3
## 184 GPX1 NP_110453.3
## 185 GPX2 NP_899653.2
## 186 GPX3 NP_071970.2
## 187 GPX4 NP_058861.3
## 188 GPX4 NP_001034938.1
## 189 MSRB1 NP_001037750.2
## 190 SELENOF NP_579831.2
## 191 SELENOH NP_001108411.1
## 192 SELENOH NP_001308221.1
## 193 SELENOI NP_001128226.2
## 194 SELENOK NP_997472.2
## 195 SELENOM NP_001108485.1
## 196 SELENOO NP_001078954.1
## 197 SELENOP NP_062065.2
## 198 SELENOP NP_001077380.1
## 199 SELENOS NP_775143.2
## 200 SELENOT NP_001014275.2
## 201 SELENOV NP_001159868.1
## 202 SELENOV <NA>
## 203 SELENOV <NA>
## 204 SELENOW NP_037159.4
## 205 SEPHS2 NP_001073358.2
## 206 TXNRD1 NP_113802.2
## 207 TXNRD1 NP_001338910.1
## 208 TXNRD1 NP_001338912.1
## 209 TXNRD1 NP_001338913.1
## 210 TXNRD2 NP_072106.1
## 211 TXNRD3 NP_001171641.1
## 212 TXNRD3 NP_001100079.2
## 213 DIO1 NP_001091083.1
## 214 DIO2 NP_989445.3
## 215 DIO2 NP_001311484.1
## 216 DIO3 NP_001116120.1
## 217 GPX1 NP_001264782.2
## 218 GPX2 NP_001264783.2
## 219 GPX3 NP_001156704.1
## 220 GPX4 NP_989551.2
## 221 GPX4 NP_001333378.1
## 222 GPX4 NP_001333377.1
## 223 MSRB1 NP_001129030.1
## 224 MSRB1 NP_001333675.2
## 225 SELENOF NP_001012944.2
## 226 SELENOH NP_001264794.1
## 227 SELENOI NP_001026699.2
## 228 SELENOK NP_001020612.1
## 229 SELENOM NP_001264788.1
## 230 SELENON NP_001108444.1
## 231 SELENOO NP_001108489.4
## 232 SELENOP1 NP_001026780.2
## 233 SELENOP2 NP_001335698.1
## 234 SELENOS NP_001019905.1
## 235 SELENOT NP_001006557.3
## 236 SELENOU NP_001180447.1
## 237 SELENOU NP_001180448.1
## 238 SELENOW NP_001159799.1
## 239 SELENOW NP_001338303.1
## 240 SEPHS2 NP_001353263.2
## 241 TXNRD1 NP_001025933.2
## 242 TXNRD1 NP_001338952.1
## 243 TXNRD2 NP_001116163.2
## 244 TXNRD3 NP_001116249.1
## 245 DIO1 NP_001243226.1
## 246 DIO1 NP_001311326.1
## 247 DIO2 NP_001184161.3
## 248 DIO3 NP_001107139.2
## 249 GPX1 NP_001015740.2
## 250 GPX2 NP_001243244.1
## 251 GPX2 <NA>
## 252 GPX2 <NA>
## 253 GPX3 NP_988961.2
## 254 GPX4 NP_001291701.1
## 255 GPX4 NP_001333415.1
## 256 GPX4 <NA>
## 257 MSRB1 NP_001004972.2
## 258 SELENOF NP_001165182.1
## 259 SELENOF NP_001333964.1
## 260 SELENOI NP_001004832.3
## 261 SELENOK NP_001072180.1
## 262 SELENOM NP_001165078.1
## 263 SELENON NP_001070901.1
## 264 SELENOO NP_001135537.3
## 265 SELENOP NP_001186813.1
## 266 SELENOS NP_001011476.2
## 267 SELENOT NP_988868.2
## 268 SELENOW1 NP_001291715.2
## 269 SELENOW2 NP_001341647.1
## 270 SEPHS2 NP_001072895.2
## 271 TXNRD1 NP_001243400.1
## 272 TXNRD2 NP_001120487.2
## 273 TXNRD3 NP_001340296.1
## 274 DIO1 NP_001089136.1
## 275 DIO1 NP_001340488.1
## 276 DIO2 NP_001340380.2
## 277 DIO2 NP_001340401.2
## 278 DIO3 NP_001081332.2
## 279 DIO3 NP_001087559.2
## 280 GPX1 NP_001089335.2
## 281 GPX1 NP_001088896.3
## 282 GPX2 NP_001340468.1
## 283 GPX2 NP_001340452.1
## 284 GPX3 NP_001085319.2
## 285 GPX3 NP_001086142.2
## 286 GPX4 NP_001165215.2
## 287 GPX4 <NA>
## 288 GPX4 NP_001165213.1
## 289 GPX4 <NA>
## 290 GPX4 <NA>
## 291 MSRB1 NP_001333738.1
## 292 SELENOF NP_001165183.1
## 293 SELENOF NP_001334079.1
## 294 SELENOI NP_001078947.2
## 295 SELENOI NP_001340350.1
## 296 SELENOI NP_001085348.2
## 297 SELENOK NP_001334861.1
## 298 SELENOK NP_001087951.2
## 299 SELENOM NP_001085717.2
## 300 SELENOM NP_001334804.1
## 301 SELENON NP_001334933.1
## 302 SELENON NP_001334934.1
## 303 SELENOO NP_001335003.1
## 304 SELENOP NP_001186825.1
## 305 SELENOS NP_001087125.2
## 306 SELENOS NP_001087566.2
## 307 SELENOT NP_001079587.2
## 308 SELENOT NP_001079506.2
## 309 SELENOW1 NP_001338383.1
## 310 SELENOW2 NP_001341692.1
## 311 SELENOW2 NP_001341700.1
## 312 SEPHS2 NP_001104206.3
## 313 TXNRD1 NP_001087786.2
## 314 TXNRD1 NP_001339002.1
## 315 TXNRD1 NP_001339003.1
## 316 TXNRD2 NP_001080052.1
## 317 TXNRD3 NP_001087660.2
## 318 TXNRD3 NP_001340315.1
## 319 DIO1 NP_001007284.3
## 320 DIO1 NP_001311333.1
## 321 DIO2 NP_997954.4
## 322 DIO3 NP_001242932.1
## 323 DIO3 NP_001171406.2
## 324 GPX1 NP_001007282.2
## 325 GPX1 NP_001004634.2
## 326 GPX2 NP_001316688.1
## 327 GPX3 NP_001131027.1
## 328 GPX4 NP_001333466.1
## 329 GPX4 NP_001007283.2
## 330 GPX4 NP_001025241.2
## 331 GPX4 <NA>
## 332 MSRB1 NP_840073.2
## 333 MSRB1 NP_001098598.2
## 334 SELENOE NP_001182713.2
## 335 SELENOF NP_840079.1
## 336 SELENOH NP_835234.2
## 337 SELENOI NP_001316366.1
## 338 SELENOJ NP_001180398.1
## 339 SELENOK NP_001004681.2
## 340 SELENOL NP_001177311.1
## 341 SELENOM NP_840071.2
## 342 SELENON NP_001004294.4
## 343 SELENOO1 NP_001038336.2
## 344 SELENOO2 NP_001335014.1
## 345 SELENOP NP_840082.3
## 346 SELENOP2 NP_001340840.1
## 347 SELENOS NP_001038799.2
## 348 SELENOT1 NP_840075.2
## 349 SELENOT1 NP_840077.3
## 350 SELENOT2 NP_001091957.2
## 351 SELENOU NP_001180454.1
## 352 SELENOU NP_001341481.1
## 353 SELENOU NP_001341482.1
## 354 SELENOU NP_001070181.2
## 355 SELENOW1 NP_840072.3
## 356 SELENOW2 NP_919398.1
## 357 SELENOW2 NP_919399.3
## 358 SEPHS2 NP_001004295.2
## 359 TXNRD2 NP_001340325.1
## 360 TXNRD2 NP_001340323.1
## 361 TXNRD3 NP_898895.2
## 362 SEPHS2 NP_001353084.1
## 363 SEPHS2 NP_001353085.1
CREATING A DATA FRAME
summary(seleno_df)
## gene protein_refseq
## Length:363 Length:363
## Class :character Class :character
## Mode :character Mode :character
head(seleno_df)
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 2 DIO1 NP_998758.1
## 3 DIO1 NP_001034804.1
## 4 DIO1 NP_001034805.1
## 5 DIO1 NP_001311245.1
## 6 DIO1 <NA>
REMOVING ALL OF THE NA
# omit NAs
seleno_df_noNA <- na.omit(seleno_df)
# check length- should be shorter
dim(seleno_df)
## [1] 363 2
dim(seleno_df_noNA)
## [1] 334 2
The same gene can appear multiple times because multiple isoforms are listed.
head(seleno_df_noNA)
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 2 DIO1 NP_998758.1
## 3 DIO1 NP_001034804.1
## 4 DIO1 NP_001034805.1
## 5 DIO1 NP_001311245.1
## 8 DIO2 NP_054644.1
SELECTING A SINGLE ROW
genes_unique <- unique(seleno_df_noNA$gene)
length(genes_unique)
## [1] 37
genes_unique
## [1] "DIO1" "DIO2" "DIO3" "GPX1" "GPX2" "GPX3"
## [7] "GPX4" "GPX6" "MSRB1" "SELENOF" "SELENOH" "SELENOI"
## [13] "SELENOK" "SELENOM" "SELENON" "SELENOO" "SELENOP" "SELENOS"
## [19] "SELENOT" "SELENOV" "SELENOW" "SEPHS2" "TXNRD1" "TXNRD2"
## [25] "TXNRD3" "SELENOP1" "SELENOP2" "SELENOU" "SELENOW1" "SELENOW2"
## [31] "SELENOE" "SELENOJ" "SELENOL" "SELENOO1" "SELENOO2" "SELENOT1"
## [37] "SELENOT2"
unique() just gives us the unique elements. A related function, duplicated(), gives us the location of duplicated elements in the vector. FALSE means “not duplicated yet” or “first instance so far”.
i.dups <- duplicated(seleno_df_noNA$gene)
We can remove the duplicates using a form of reverse indexing where the “!” means “not”. (You don’t need to know this for the exam)
seleno_df_noNA[!i.dups, ]
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 8 DIO2 NP_054644.1
## 14 DIO3 NP_001353.4
## 15 GPX1 NP_000572.2
## 20 GPX2 NP_002074.2
## 24 GPX3 NP_002075.2
## 26 GPX4 NP_002076.2
## 29 GPX6 NP_874360.1
## 30 MSRB1 NP_057416.1
## 31 SELENOF NP_004252.2
## 35 SELENOH NP_734467.1
## 37 SELENOI NP_277040.1
## 39 SELENOK NP_067060.2
## 40 SELENOM NP_536355.1
## 41 SELENON NP_996809.1
## 43 SELENOO NP_113642.1
## 44 SELENOP NP_005401.3
## 47 SELENOS NP_060915.2
## 49 SELENOT NP_057359.2
## 50 SELENOV NP_874363.1
## 53 SELENOW NP_003000.1
## 54 SEPHS2 NP_036380.2
## 55 TXNRD1 NP_877393.1
## 62 TXNRD2 NP_006431.2
## 69 TXNRD3 NP_443115.1
## 232 SELENOP1 NP_001026780.2
## 233 SELENOP2 NP_001335698.1
## 236 SELENOU NP_001180447.1
## 268 SELENOW1 NP_001291715.2
## 269 SELENOW2 NP_001341647.1
## 334 SELENOE NP_001182713.2
## 338 SELENOJ NP_001180398.1
## 340 SELENOL NP_001177311.1
## 343 SELENOO1 NP_001038336.2
## 344 SELENOO2 NP_001335014.1
## 348 SELENOT1 NP_840075.2
## 350 SELENOT2 NP_001091957.2
Make a dataframe of non-duplicated genes
seleno_df_noDups <- seleno_df_noNA[!i.dups, ]
dim(seleno_df_noDups)
## [1] 37 2
Let’s select 2 random sequences to work with. We’ll use WHICH FUNCTION? to select a random index number to get
First, lets make a vector that contains a unique number for each row of data
indices <- 1:nrow(seleno_df_noDups)
This would do the same thing
# with dim
indices <- 1:dim(seleno_df_noDups)[1]
# with length
indices <- 1:length(seleno_df_noDups$gene)
or hard-coded
indices <- 1:37
We can then use WHICH FUNCTION? to select 2 random numbers from this vector.
For x = we’ll use our vector of indices (1 to 37). For size we’ll use 2, since we want to pull out just 2 numbers. For replace we’ll use WHAT? since we don’t want to be ale to select the same number twice.
i.random.genes <- sample(x = indices,
size = 2,
replace = FALSE)
Hard coded this would be
i.random.genes <- sample(x = c(1:37),
size = 2,
replace = FALSE)
This gives me HOW MANY? indices values.
i.random.genes
## [1] 30 8
I can now use these index values to pull out HOW MANY? rows of data
seleno_df_noNA[i.random.genes, ]
## gene protein_refseq
## 40 SELENOM NP_536355.1
## 10 DIO2 NP_000784.3
Hard coded, this would be something like this for whichever genes happen to have been selected
seleno_df_noNA[c(37,15), ]
## gene protein_refseq
## 47 SELENOS NP_060915.2
## 19 GPX1 NP_001316384.1
I will now… DO WHAT?
rentrez::entrez_fetch(id = "NP_060915.2",
db = "protein",
rettype = "fasta")
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
rentrez::entrez_fetch(id = "NP_001316384.1",
db = "protein",
rettype = "fasta")
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
WHAT”S THIS DOING?
prot1 <- rentrez::entrez_fetch(id = "NP_060915.2",
db = "protein",
rettype = "fasta")
prot2 <- rentrez::entrez_fetch(id = "NP_001316384.1",
db = "protein",
rettype = "fasta")
I can put them into a WHAT? like this
# make the WHAT?
seleno_thingy <- vector("list", 1)
# add the first fasta
seleno_thingy[[1]] <- prot1
# See the result
seleno_thingy
## [[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
# add the first fasta
seleno_thingy[[2]] <- prot2
# see the result
seleno_thingy
## [[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
##
## [[2]]
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
# WHAT DOES THIS DO?
names(seleno_thingy) <- c("prot1", "prot2")
#Output
seleno_thingy
## $prot1
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
##
## $prot2
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
Elements of the list are accessed like this
seleno_thingy[[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
I’ll clean them with fasta_cleaner()
# first, make a copy of the list for storing the clean data
## I'm just going to copy over the old data
seleno_thingy_clean <- seleno_thingy
# HOW TO MAKE THIS MORE COMPACT?
for(i in 1:length(seleno_thingy_clean)){
clean_fasta_temp <- compbio4all::fasta_cleaner(seleno_thingy[[i]],
parse = T)
seleno_thingy_clean[[i]] <- clean_fasta_temp
}
Now the data looks like this HOW WOULD YOU DESCRIBE THIS?
seleno_thingy_clean
## $prot1
## [1] "M" "E" "R" "Q" "E" "E" "S" "L" "S" "A" "R" "P" "A" "L" "E" "T" "E" "G"
## [19] "L" "R" "F" "L" "H" "T" "T" "V" "G" "S" "L" "L" "A" "T" "Y" "G" "W" "Y"
## [37] "I" "V" "F" "S" "C" "I" "L" "L" "Y" "V" "V" "F" "Q" "K" "L" "S" "A" "R"
## [55] "L" "R" "A" "L" "R" "Q" "R" "Q" "L" "D" "R" "A" "A" "A" "A" "V" "E" "P"
## [73] "D" "V" "V" "V" "K" "R" "Q" "E" "A" "L" "A" "A" "A" "R" "L" "K" "M" "Q"
## [91] "E" "E" "L" "N" "A" "Q" "V" "E" "K" "H" "K" "E" "K" "L" "K" "Q" "L" "E"
## [109] "E" "E" "K" "R" "R" "Q" "K" "I" "E" "M" "W" "D" "S" "M" "Q" "E" "G" "K"
## [127] "S" "Y" "K" "G" "N" "A" "K" "K" "P" "Q" "E" "E" "D" "S" "P" "G" "P" "S"
## [145] "T" "S" "S" "V" "L" "K" "R" "K" "S" "D" "R" "K" "P" "L" "R" "G" "G" "G"
## [163] "Y" "N" "P" "L" "S" "G" "E" "G" "G" "G" "A" "C" "S" "W" "R" "P" "G" "R"
## [181] "R" "G" "P" "S" "S" "G" "G" "U" "G"
##
## $prot2
## [1] "M" "C" "A" "A" "R" "L" "A" "A" "A" "A" "A" "A" "A" "Q" "S" "V" "Y" "A"
## [19] "F" "S" "A" "R" "P" "L" "A" "G" "G" "E" "P" "V" "S" "L" "G" "S" "L" "R"
## [37] "G" "K" "E" "N" "A" "K" "N" "E" "E" "I" "L" "N" "S" "L" "K" "Y" "V" "R"
## [55] "P" "G" "G" "G" "F" "E" "P" "N" "F" "M" "L" "F" "E" "K" "C" "E" "V" "N"
## [73] "G" "A" "G" "A" "H" "P" "L" "F" "A" "F" "L" "R" "E" "A" "L" "P" "A" "P"
## [91] "S" "D" "D" "A" "T" "A" "L" "M" "T" "D" "P" "K" "L" "I" "T" "W" "S" "P"
## [109] "V" "C" "R" "N" "D" "V" "A" "W" "N" "F" "E" "K" "F" "L" "V" "G" "P" "D"
## [127] "G" "V" "P" "L" "R" "R" "Y" "S" "R" "R" "F" "Q" "T" "I" "D" "I" "E" "P"
## [145] "D" "I" "E" "A" "L" "L" "S" "Q" "G" "P" "S" "C" "A"
HOW WOULD YOU DESCRIBE THIS?
class(seleno_thingy_clean[[1]])
## [1] "character"
is(seleno_thingy_clean[[1]])
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
is.vector(seleno_thingy_clean[[1]])
## [1] TRUE
For old-times sake we can make a dotplot.
Now for a dotplot
WHAT AM I DOING HERE?
prot1_vector <- seleno_thingy_clean[[1]]
prot2_vector <- seleno_thingy_clean[[2]]
We can dotplot like this
seqinr::dotPlot(prot1_vector,
prot1_vector)
WHAT DID I DO DIFFERENTLY HERE?
seqinr::dotPlot(seleno_thingy_clean[[1]],
seleno_thingy_clean[[2]])
dotPlot likes things in a single vector, but pairwiseAlignment like a single string of characters, so as always we have to process the data.
WHAT AM I DOING HERE? WHAT DOES “” MEAN?
prot1_str <- paste(seleno_thingy_clean[[1]],sep = "", collapse = "")
prot2_str <- paste(seleno_thingy_clean[[2]],sep = "", collapse = "")
So now things look like this HOW WOULD YOU DESCRIBE THIS?
prot1_str
## [1] "MERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAVEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDSPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG"
Protein alignments need a amino acid transition matrix, and we need to use data() to bring those up into active memory (VERY IMPORTANT STEP!)
data(BLOSUM50)
The alignment
align_out <- Biostrings::pairwiseAlignment(pattern = prot1_str,
subject = prot2_str,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
What is this?
align_out
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MERQEESLSARPALETEGLRFLHTTVGSLLATYG...-----------------ACSWRPGRRGPSSGGUG
## subject: M---------------------------------...IDIEPDIEALLSQGPSCA----------------
## score: -160.2561
WHAT IS THIS? HOW IS IT DIFFERNT FROM THE LAST CHUNK?
compbio4all::print_pairwise_alignment(align_out)
## [1] "MERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQ 60"
## [1] "M---------------------------------------C----------AARL----- 6"
## [1] " "
## [1] "RQLDRAAAAVEPDVVVKRQEALAAA--------RLKMQEELNAQVEKHKEKLKQLEEEKR 112"
## [1] "-----AAAA-------------AAAQSVYAFSAR-------------------------- 22"
## [1] " "
## [1] "RQKIEMWDSMQEGKSYKGNAKKPQEEDSPGPSTSSVLKRKSDRKPLRGGGYNPLSGE--- 169"
## [1] "--------------------------------------------PLAGG-------EPVS 31"
## [1] " "
## [1] "------------------------GGG--------------------------------- 172"
## [1] "LGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPS 91"
## [1] " "
## [1] "------------------------------------------------------------ 172"
## [1] "DDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLS 151"
## [1] " "
## [1] "-----A 227"
## [1] "QGPSCA 211"
## [1] " "
These are two randomly chosen sequences, so the alignment should be pretty …WHAT?
The score is negative, but on its own that MEANS WHAT?
score(align_out)
## [1] -160.2561
pid gives us … WHAT?
pid(align_out)
## [1] 7.189542
Of course, pid can be calculated several ways (WHY IS THIS AN ISSUE / POSSIBLE?)
pid(align_out,type = "PID1")
## [1] 7.189542
pid(align_out,type = "PID2")
## [1] 91.66667
pid(align_out,type = "PID3")
## [1] 14.01274
pid(align_out,type = "PID4")
## [1] 12.71676