This script uses MySQL via RMariaDb package and MySQL. This script to use MySQL within RStudio from: https://programminghistorian.org/en/lessons/getting-started-with-mysql-using-r
library(RMariaDB)
library(tidyverse)
library(DT)
library(stringr)
UL_data <- read.csv('UL_nonUL_beadchip_stats.csv', sep=',',header=TRUE, na.strings=c('',' ','NA'))
colnames(UL_data) <- c("SEQUENCE",
"SYMBOL",
"GENE_COUNT",
"UL_MEAN",
"UL_MEDIAN",
"UL_MAX",
"UL_MIN",
"UL_SE",
"nonUL_MEAN",
"nonUL_MEDIAN",
"nonUL_MAX",
"nonUL_MIN",
"nonUL_SE",
"FOLDCHANGE_MEAN_UL_to_nonUL")
head(UL_data,30)
## SEQUENCE SYMBOL GENE_COUNT
## 1 AAAAAACAAAACCGCGCAGCGGAGAACCGGTGCCTGAGTCTCCCAGGGAC LOC644299 1
## 2 AAAAAACAGGAATAGCTCTAGGAGTCCTTACACAGGTCCGAGGGACCAGC LOC642856 1
## 3 AAAAAACCATGAAGGAACTGACTCCTCTTCAAGCCACGATGCTTCGTATG LOC646814 1
## 4 AAAAAACGACTGCCCATCCCGGGTCCTTTCCCTGATGGGTTGGGGCAGTT LOC651423 1
## 5 AAAAAAGAAAAGTAGTACCTAATCACCTCCTAGAATCAAAGGTTTATGCA LOC654134 1
## 6 AAAAAAGAGAAAGACAAATTCCTTTACTGTTACTTCTTTCCCTGGCTTCT LOC646029 1
## 7 AAAAAAGCAGGTGAGCGGGCGCGTCCCCGCGACCGCGCTGCCTCCCCGAG LOC645293 1
## 8 AAAAAAGCCTTTTGGCTCTGACCAGCACCATGGCAGTTGGCAAGAACAAG LOC641849 1
## 9 AAAAAAGGCGGCGGACAGTTGGCCAGAGGGATGGAGACTGGTGGGGACGC ZNF579 1
## 10 AAAAAAGTCTGAAAAGGGCAGAAGCAAAGCCTTCTCCACATGCGGCGCCC OR5AC2 1
## 11 AAAAAAGTGGTTTCAGCTGTGCCCTCTGAAACTTAATGTTTCTTACTGAC LOC653312 3
## 12 AAAAAAGTTAGTTGTAAAACAGCCACAGGCAGCTCCCTCAGGAGAAATTC LOC643039 1
## 13 AAAAAAGTTGAGGGGGCGTGTTCCCGTCGGCTGCGCCCGCGGCCCGGGGC LOC644423 1
## 14 AAAAAAGTTTATTGTAAAAGAGCCTCAGGTAGCTCCTTCAGGAGGTATTC LOC643821 1
## 15 AAAAAATACAGGAGAGGCCAATGGAGAGGGGATGCATCAACGCCAGAACA LOC643505 1
## 16 AAAAAATATTTAATTCAATGGTAAGTTTATGTAAATATCAGAGAATTTAT LOC649413 1
## 17 AAAAAATCAGCAAAGAAGAAAAGACTCCTGGCTGTGTGAAAATTAAAAAA KRAS 2
## 18 AAAAAATCCGAGAGAAGTTCAACCGTTACTTGGATGTGGTCAATCGGAAC LOC653697 1
## 19 AAAAAATCTAACACTGGCTACCCCGCTCGGGACCCCTTCCACGCTGTGAA FAM104B 1
## 20 AAAAAATCTCGCCTCCCACACCAAAGCCCCGGCCACCTCGGCCTCTTCCT PIK3R1 3
## 21 AAAAAATGCCTAAAATAAGGTTTTCTTGCATAAATACTGGAAATTGCACA SUMO1 1
## 22 AAAAAATGGGCTTTGCACACAATGGGTTTGGAGCTGACTGGGAACAATGG DNAJA2 1
## 23 AAAAAATTATAAAAACAGGAAAGAGAGAAAGAACAGCCAGAGCCCCGGCT LOC653338 1
## 24 AAAAAGAACCACCTAGATCCAAATCTCAGTCAAGATCACAGTCTAGGTCT FUSIP1 3
## 25 AAAAAGATAGGGGTGAGGTGTGGTCTATCTCCTGCTTCACCCAGGCCCTG LOC646915 1
## 26 AAAAAGCGCAAGCGGCGCACCTCCTTCACACCCCAGGCCCTTGAGATCCT POU6F2 1
## 27 AAAAAGGCACAACCTGAAAGACCTAGAACCCAGTGTCGGTCCCCAGGCCC MADD 2
## 28 AAAAAGGGAATGATAGTCTTTGAAAGAAAACAGTAGGCATCCAGCACTGG HAS2 4
## 29 AAAAAGTACAGTGTAATAGACAAGACCATGTTACTATTAGAAGTATGGGT RGPD4 2
## 30 AAAAAGTTCTCAGGTGATGCTGATGCTGGTGGTCCTTGTACCTTGATCTG ANKRD30A 3
## UL_MEAN UL_MEDIAN UL_MAX UL_MIN UL_SE nonUL_MEAN
## 1 49.41867 49.48335 55.1600 44.0250 2.589040 48.11046
## 2 48.37833 48.48750 52.3000 43.9000 2.298020 48.02972
## 3 53.38950 52.15000 64.5333 49.6000 3.614234 53.54065
## 4 486.13540 408.89640 1218.0130 300.3753 214.264828 573.50472
## 5 54.01650 53.65000 58.1000 49.7800 2.004786 52.72871
## 6 48.18745 47.95000 60.3909 41.4200 4.201875 47.20602
## 7 46.28343 47.15000 53.8333 41.0122 3.087620 46.52268
## 8 977.39389 1041.09450 1538.9740 68.3500 391.457028 1146.99125
## 9 199.38077 177.83360 383.3843 113.4368 76.996686 157.85435
## 10 54.38902 53.35000 61.6500 49.3000 3.417548 52.10648
## 11 43.60381 43.60500 47.8000 40.4000 2.074714 43.27660
## 12 52.44566 52.21250 59.4000 47.7000 3.001126 52.98417
## 13 60.75091 60.59285 72.8000 51.8000 4.905489 58.61247
## 14 45.68395 45.07500 50.5000 41.7000 2.912428 45.09150
## 15 79.40405 72.41530 134.2643 55.1000 22.531372 80.94183
## 16 42.87605 43.40770 48.4500 37.2529 2.426858 43.17794
## 17 46.96254 46.60000 51.3000 43.5875 2.213988 46.67852
## 18 51.42875 51.02000 58.0000 44.5500 3.593524 51.29573
## 19 531.81124 460.92310 1042.2790 295.2000 207.141238 460.55386
## 20 84.92938 80.74730 126.0905 49.4000 22.679588 74.47471
## 21 45.79348 45.80000 51.3500 40.9609 2.882471 45.07757
## 22 3349.68285 3033.22550 7175.8720 2498.4590 1027.838843 3143.72385
## 23 59.44009 58.97000 70.0400 53.0600 4.387645 60.01128
## 24 298.51286 299.39825 392.6860 212.9553 49.800491 369.92520
## 25 64.99616 63.16025 100.4300 56.3000 10.743989 64.14644
## 26 60.90165 60.42725 71.0000 56.4200 3.392755 62.02234
## 27 55.64184 55.17500 60.1000 50.4000 2.532447 55.52435
## 28 53.79959 53.32500 58.2000 49.6000 2.905302 52.80333
## 29 42.23757 41.70895 48.2250 38.7609 2.445446 42.43035
## 30 50.26533 50.41665 54.8000 45.9000 2.293624 50.50501
## nonUL_MEDIAN nonUL_MAX nonUL_MIN nonUL_SE FOLDCHANGE_MEAN_UL_to_nonUL
## 1 48.55000 52.6000 42.6600 2.462763 1.0271918
## 2 48.06250 54.1250 42.6600 2.749633 1.0072582
## 3 54.13750 56.7000 48.0000 2.847912 0.9971769
## 4 539.01615 795.4293 408.1436 130.028868 0.8476572
## 5 52.52500 57.4000 47.4000 3.068422 1.0244230
## 6 47.21250 51.6500 42.9000 1.969734 1.0207905
## 7 46.57500 50.7000 42.3000 2.401205 0.9948573
## 8 1041.17200 2183.2880 362.8504 471.666060 0.8521372
## 9 138.54340 277.3081 88.7000 51.106078 1.2630680
## 10 52.52500 57.7000 45.7000 3.082463 1.0438052
## 11 43.44375 47.3000 38.8900 2.370359 1.0075608
## 12 52.37500 56.4000 49.9000 2.186687 0.9898365
## 13 57.71500 69.6286 49.1000 4.605897 1.0364844
## 14 45.65000 52.6000 39.1286 3.477828 1.0131387
## 15 80.85520 110.7400 61.4000 16.351863 0.9810014
## 16 43.30000 45.6000 39.1375 1.800928 0.9930083
## 17 45.94000 51.5000 43.6750 2.284786 1.0060848
## 18 50.82500 60.9714 44.5500 3.556813 1.0025932
## 19 414.86415 720.3029 273.0973 134.919587 1.1547211
## 20 64.99285 140.8741 53.0500 24.425163 1.1403788
## 21 44.90000 48.6667 42.0088 1.777433 1.0158817
## 22 3050.23845 4090.2362 2569.7595 462.582701 1.0655143
## 23 59.60595 66.6000 54.5750 3.453990 0.9904818
## 24 366.18445 598.4918 256.4498 80.954206 0.8069547
## 25 62.36250 76.2667 56.6000 6.148024 1.0132465
## 26 60.85000 69.7765 58.1000 3.131962 0.9819307
## 27 55.52750 63.2000 51.1000 2.835085 1.0021159
## 28 52.60000 58.4000 46.2000 2.894212 1.0188672
## 29 42.53845 46.4500 36.5656 2.320795 0.9954566
## 30 49.78335 54.5000 46.3250 2.200926 0.9952544
pretty_headers <- str_to_title(colnames(UL_data))
UL_data1 <- datatable(data=UL_data, rownames=FALSE,
colnames=pretty_headers,
filter=list(position='top'),
options=list(
dom='Bfrtip',
buttons=c('colvis','csv','excel'),
language=list(sSearch='Filter:')),
extensions=c('Buttons','Responsive')
)
UL_data1
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
Connect to MySQL database.
ULsDb <- dbConnect(RMariaDB::MariaDB(),
user='newuser',
password='2020mystery$',
dbname='UL_data',
host='localhost')
dbListTables(ULsDb)
## character(0)
query <- "CREATE TABLE ul_data.ul_data (
SEQUENCE CHAR(99),
SYMBOL CHAR(25),
GENE_COUNT INT,
UL_MEAN FLOAT,
UL_MEDIAN FLOAT,
UL_MAX FLOAT,
UL_MIN FLOAT,
UL_SE FLOAT,
nonUL_MEAN FLOAT,
nonUL_MEDIAN FLOAT,
nonUL_MAX FLOAT,
nonUL_MIN FLOAT,
nonUL_SE FLOAT,
FOLDCHANGE_MEAN_UL_to_nonUL FLOAT
);"
rsInsert <- dbSendQuery(ULsDb, query)
dbClearResult(rsInsert)
#write the csv data from the UL_data file into the ul_data SQL table
dbWriteTable(ULsDb, value = UL_data, row.names = FALSE, name = "ul_data", append = TRUE )
Get those genes with Copy Number Variants (CNVs) greater than 2 and Fold change of the mean of UL gene expression to nonUL gene expression greater than 1.5 or a 50% increase in UL compared to nonUL. Note that the GENE_COUNT column is the CNV count as it is a count of the number of times the gene symbol is in the data with the attached gene sequence for each gene. Notice the sequences are not exact for each gene symbol.
query <- "select SEQUENCE,SYMBOL,GENE_COUNT,FOLDCHANGE_MEAN_UL_to_nonUL FROM ul_data.ul_data
WHERE( FOLDCHANGE_MEAN_UL_to_nonUL > 1.5 AND GENE_COUNT>2)
ORDER BY GENE_COUNT DESC,FOLDCHANGE_MEAN_UL_to_nonUL DESC LIMIT 30;"
rs = dbSendQuery(ULsDb,query)
dbRows<-dbFetch(rs)
The top 30 results of gene sequences and gene symbols with respective CNV counts and fold change values.
dbRows
## SEQUENCE SYMBOL GENE_COUNT
## 1 AGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAG CTNNB1 7
## 2 CTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCT CTNNB1 7
## 3 GTCACCCAACGACCCATGACCGTTCTCTGAGGCAAAGGAGGGCAACCTGA GABRB3 5
## 4 GATGCCCCTGACATCATCATTCTTGTGGGAGACAGCAGCCTGTATGTGGT CHKA 5
## 5 CCCAACACCTGGGTAAACAATGACCTTCCAGAGAGTGGCTCCCGTATGCC GABRB3 5
## 6 CTTTGTCCACATGGGCCGTTGACCTTAGAGTTAAGGCGGTTGCTTTTTTG ZNF365 4
## 7 GCAGCCTTTTGGGTATCAAATGGGTCAAAACCATGGGACCTGCCACCTCC NPTX1 4
## 8 AAGAGGGGTGCTGGAGGACATGGACAGCGACAAACCTGAGGCCGTGTCTA GALNT9 4
## 9 AATGTCCCCGGATCTTTCGATGCAAGTCCTGGAGGCAGGGAGATCACTGC FOXL2 4
## 10 ACCCAGCTTTTGTCCACAAGGTGACTGTAACTCAGAATGGAAAGTGGGCT PLAG1 4
## 11 GGGCATGTCGTTCAGTGCCCCAGCAGTCAGAAGCAGTTTTTGTGGAACTA PARD6G 4
## 12 GAGCCAGGCACCCTCCTCTGTCACTTGCTGCTTGGAGCCTTAACTCCAGA FSD1 4
## 13 TGTGCTCTCTGCTGACTTGCAGGCGAGGCCGACTGTGCTGATGTACGTGA PARD6G 4
## 14 TTCTTGCTGTGCTCTCTGCTGACTTGCAGGCGAGGCCGACTGTGCTGATG PARD6G 4
## 15 GCCCAGTTTTCTCTGGTGGGTCCAGTGTTTTGTTCCTAGGTGTCTGCTGC PPP2R2B 4
## 16 CAGCCTGGGTCTCAGCCCCGCGTACGGCCTTTCACGAGTCTTCAAGCCTT MGC39900 4
## 17 CCTGGGTCTCAGCCCCGCGTACGGCCTTTCACGAGTCTTCAAGCCTTCAG MGC39900 4
## 18 ACAGGACTGATGGCGGGACCGCGCTGCCCGAGAAAGGGACGGACCAATAC FOXL2 4
## 19 TTTGGCGCTTTTGTGCGCGCCCGGGTCTGTTGGTGCTCAGAGTGTGGTCA STMN1 4
## 20 GGGAAGACGACGGAACTTGCCACAGCGACAGACTTCTCCCAAACAGGAAA TRIM36 4
## 21 GCTTTGCTGCCCTTTTATTGTACCCCAGGCCTCCTAGAGGACTCCTGCAG FRAS1 4
## 22 ACCTTGCCATGGATGACCTTTGTGTAGGTAGTCCTTGCACCTCATGCAGG MGC39900 4
## 23 TTCTACTGGGCTCGTCTCTTTCCGGAGCCCCAGCGTCTCCTGCCCAAATT FOXL2 4
## 24 GGCTTGGGTCTCTGCACTCACTGTCAGCAGCTTTCACCAGCAGTTTAAGC PLAG1 4
## 25 CCGTGCAGCTGACCCCAGACGATGAGGATGCCATAAAGAGGATCGACCAG PRKCZ 4
## 26 ACCTGGAGAGTGAAGACGGGCATGACGGCAGGTGAAGGGGTTTGCTGTGA ZSCAN2 4
## 27 CCACCCAACCCGGTGGTTAACGATTTAATACATATAACCAAACCGGCAGC PDZRN3 4
## 28 CAGGAGGACGGCATCATGCTGTCTGCCGACTGCTCTGAGCTCGGGCTGTC LGR6 4
## 29 CTGTCACCTTCCTCCTCCCACTTCTGTATCCTCAGAGATGCTGCGTGGAT BTBD11 4
## 30 TCAGGACCCACCACCTGCCGGACCATCATTTTCCGAGTCCCTCCGGCCTA C3orf23 4
## FOLDCHANGE_MEAN_UL_to_nonUL
## 1 1.568983
## 2 1.524832
## 3 1.807217
## 4 1.555827
## 5 1.538612
## 6 4.261581
## 7 4.172457
## 8 2.835567
## 9 2.538581
## 10 2.509276
## 11 2.412239
## 12 2.317374
## 13 2.125194
## 14 2.050708
## 15 2.017088
## 16 2.004920
## 17 1.984822
## 18 1.914530
## 19 1.900091
## 20 1.848526
## 21 1.842715
## 22 1.818676
## 23 1.665787
## 24 1.645051
## 25 1.614649
## 26 1.590096
## 27 1.577921
## 28 1.562422
## 29 1.542656
## 30 1.533255
For re-running the script we need to delete/drop the database and tables.
dbClearResult(rsInsert)
dbListTables(ULsDb)
## [1] "ul_data"
dbClearResult(rsInsert)
query <- "DROP TABLE ul_data.ul_data;"
rsInsert <- dbSendQuery(ULsDb, query)
dbClearResult(rsInsert)
dbDisconnect(ULsDb)
This Rmarkdown script showed the UL data of gene expressions in a DT interactive table, and by attaching to MySQL to get the top 30 highest fold change and copy number variant genes in the samples this study examined.