This script uses MySQL via RMariaDb package and MySQL. This script to use MySQL within RStudio from: https://programminghistorian.org/en/lessons/getting-started-with-mysql-using-r

library(RMariaDB)
library(tidyverse)
library(DT)
library(stringr)
UL_data <- read.csv('UL_nonUL_beadchip_stats.csv', sep=',',header=TRUE, na.strings=c('',' ','NA'))
colnames(UL_data) <- c("SEQUENCE",
                       "SYMBOL",
                       "GENE_COUNT",
                       "UL_MEAN",
                       "UL_MEDIAN",
                       "UL_MAX",
                       "UL_MIN",
                       "UL_SE",
                       "nonUL_MEAN",
                       "nonUL_MEDIAN",
                       "nonUL_MAX",
                       "nonUL_MIN",
                       "nonUL_SE",
                       "FOLDCHANGE_MEAN_UL_to_nonUL")
head(UL_data,30)
##                                              SEQUENCE    SYMBOL GENE_COUNT
## 1  AAAAAACAAAACCGCGCAGCGGAGAACCGGTGCCTGAGTCTCCCAGGGAC LOC644299          1
## 2  AAAAAACAGGAATAGCTCTAGGAGTCCTTACACAGGTCCGAGGGACCAGC LOC642856          1
## 3  AAAAAACCATGAAGGAACTGACTCCTCTTCAAGCCACGATGCTTCGTATG LOC646814          1
## 4  AAAAAACGACTGCCCATCCCGGGTCCTTTCCCTGATGGGTTGGGGCAGTT LOC651423          1
## 5  AAAAAAGAAAAGTAGTACCTAATCACCTCCTAGAATCAAAGGTTTATGCA LOC654134          1
## 6  AAAAAAGAGAAAGACAAATTCCTTTACTGTTACTTCTTTCCCTGGCTTCT LOC646029          1
## 7  AAAAAAGCAGGTGAGCGGGCGCGTCCCCGCGACCGCGCTGCCTCCCCGAG LOC645293          1
## 8  AAAAAAGCCTTTTGGCTCTGACCAGCACCATGGCAGTTGGCAAGAACAAG LOC641849          1
## 9  AAAAAAGGCGGCGGACAGTTGGCCAGAGGGATGGAGACTGGTGGGGACGC    ZNF579          1
## 10 AAAAAAGTCTGAAAAGGGCAGAAGCAAAGCCTTCTCCACATGCGGCGCCC    OR5AC2          1
## 11 AAAAAAGTGGTTTCAGCTGTGCCCTCTGAAACTTAATGTTTCTTACTGAC LOC653312          3
## 12 AAAAAAGTTAGTTGTAAAACAGCCACAGGCAGCTCCCTCAGGAGAAATTC LOC643039          1
## 13 AAAAAAGTTGAGGGGGCGTGTTCCCGTCGGCTGCGCCCGCGGCCCGGGGC LOC644423          1
## 14 AAAAAAGTTTATTGTAAAAGAGCCTCAGGTAGCTCCTTCAGGAGGTATTC LOC643821          1
## 15 AAAAAATACAGGAGAGGCCAATGGAGAGGGGATGCATCAACGCCAGAACA LOC643505          1
## 16 AAAAAATATTTAATTCAATGGTAAGTTTATGTAAATATCAGAGAATTTAT LOC649413          1
## 17 AAAAAATCAGCAAAGAAGAAAAGACTCCTGGCTGTGTGAAAATTAAAAAA      KRAS          2
## 18 AAAAAATCCGAGAGAAGTTCAACCGTTACTTGGATGTGGTCAATCGGAAC LOC653697          1
## 19 AAAAAATCTAACACTGGCTACCCCGCTCGGGACCCCTTCCACGCTGTGAA   FAM104B          1
## 20 AAAAAATCTCGCCTCCCACACCAAAGCCCCGGCCACCTCGGCCTCTTCCT    PIK3R1          3
## 21 AAAAAATGCCTAAAATAAGGTTTTCTTGCATAAATACTGGAAATTGCACA     SUMO1          1
## 22 AAAAAATGGGCTTTGCACACAATGGGTTTGGAGCTGACTGGGAACAATGG    DNAJA2          1
## 23 AAAAAATTATAAAAACAGGAAAGAGAGAAAGAACAGCCAGAGCCCCGGCT LOC653338          1
## 24 AAAAAGAACCACCTAGATCCAAATCTCAGTCAAGATCACAGTCTAGGTCT    FUSIP1          3
## 25 AAAAAGATAGGGGTGAGGTGTGGTCTATCTCCTGCTTCACCCAGGCCCTG LOC646915          1
## 26 AAAAAGCGCAAGCGGCGCACCTCCTTCACACCCCAGGCCCTTGAGATCCT    POU6F2          1
## 27 AAAAAGGCACAACCTGAAAGACCTAGAACCCAGTGTCGGTCCCCAGGCCC      MADD          2
## 28 AAAAAGGGAATGATAGTCTTTGAAAGAAAACAGTAGGCATCCAGCACTGG      HAS2          4
## 29 AAAAAGTACAGTGTAATAGACAAGACCATGTTACTATTAGAAGTATGGGT     RGPD4          2
## 30 AAAAAGTTCTCAGGTGATGCTGATGCTGGTGGTCCTTGTACCTTGATCTG  ANKRD30A          3
##       UL_MEAN  UL_MEDIAN    UL_MAX    UL_MIN       UL_SE nonUL_MEAN
## 1    49.41867   49.48335   55.1600   44.0250    2.589040   48.11046
## 2    48.37833   48.48750   52.3000   43.9000    2.298020   48.02972
## 3    53.38950   52.15000   64.5333   49.6000    3.614234   53.54065
## 4   486.13540  408.89640 1218.0130  300.3753  214.264828  573.50472
## 5    54.01650   53.65000   58.1000   49.7800    2.004786   52.72871
## 6    48.18745   47.95000   60.3909   41.4200    4.201875   47.20602
## 7    46.28343   47.15000   53.8333   41.0122    3.087620   46.52268
## 8   977.39389 1041.09450 1538.9740   68.3500  391.457028 1146.99125
## 9   199.38077  177.83360  383.3843  113.4368   76.996686  157.85435
## 10   54.38902   53.35000   61.6500   49.3000    3.417548   52.10648
## 11   43.60381   43.60500   47.8000   40.4000    2.074714   43.27660
## 12   52.44566   52.21250   59.4000   47.7000    3.001126   52.98417
## 13   60.75091   60.59285   72.8000   51.8000    4.905489   58.61247
## 14   45.68395   45.07500   50.5000   41.7000    2.912428   45.09150
## 15   79.40405   72.41530  134.2643   55.1000   22.531372   80.94183
## 16   42.87605   43.40770   48.4500   37.2529    2.426858   43.17794
## 17   46.96254   46.60000   51.3000   43.5875    2.213988   46.67852
## 18   51.42875   51.02000   58.0000   44.5500    3.593524   51.29573
## 19  531.81124  460.92310 1042.2790  295.2000  207.141238  460.55386
## 20   84.92938   80.74730  126.0905   49.4000   22.679588   74.47471
## 21   45.79348   45.80000   51.3500   40.9609    2.882471   45.07757
## 22 3349.68285 3033.22550 7175.8720 2498.4590 1027.838843 3143.72385
## 23   59.44009   58.97000   70.0400   53.0600    4.387645   60.01128
## 24  298.51286  299.39825  392.6860  212.9553   49.800491  369.92520
## 25   64.99616   63.16025  100.4300   56.3000   10.743989   64.14644
## 26   60.90165   60.42725   71.0000   56.4200    3.392755   62.02234
## 27   55.64184   55.17500   60.1000   50.4000    2.532447   55.52435
## 28   53.79959   53.32500   58.2000   49.6000    2.905302   52.80333
## 29   42.23757   41.70895   48.2250   38.7609    2.445446   42.43035
## 30   50.26533   50.41665   54.8000   45.9000    2.293624   50.50501
##    nonUL_MEDIAN nonUL_MAX nonUL_MIN   nonUL_SE FOLDCHANGE_MEAN_UL_to_nonUL
## 1      48.55000   52.6000   42.6600   2.462763                   1.0271918
## 2      48.06250   54.1250   42.6600   2.749633                   1.0072582
## 3      54.13750   56.7000   48.0000   2.847912                   0.9971769
## 4     539.01615  795.4293  408.1436 130.028868                   0.8476572
## 5      52.52500   57.4000   47.4000   3.068422                   1.0244230
## 6      47.21250   51.6500   42.9000   1.969734                   1.0207905
## 7      46.57500   50.7000   42.3000   2.401205                   0.9948573
## 8    1041.17200 2183.2880  362.8504 471.666060                   0.8521372
## 9     138.54340  277.3081   88.7000  51.106078                   1.2630680
## 10     52.52500   57.7000   45.7000   3.082463                   1.0438052
## 11     43.44375   47.3000   38.8900   2.370359                   1.0075608
## 12     52.37500   56.4000   49.9000   2.186687                   0.9898365
## 13     57.71500   69.6286   49.1000   4.605897                   1.0364844
## 14     45.65000   52.6000   39.1286   3.477828                   1.0131387
## 15     80.85520  110.7400   61.4000  16.351863                   0.9810014
## 16     43.30000   45.6000   39.1375   1.800928                   0.9930083
## 17     45.94000   51.5000   43.6750   2.284786                   1.0060848
## 18     50.82500   60.9714   44.5500   3.556813                   1.0025932
## 19    414.86415  720.3029  273.0973 134.919587                   1.1547211
## 20     64.99285  140.8741   53.0500  24.425163                   1.1403788
## 21     44.90000   48.6667   42.0088   1.777433                   1.0158817
## 22   3050.23845 4090.2362 2569.7595 462.582701                   1.0655143
## 23     59.60595   66.6000   54.5750   3.453990                   0.9904818
## 24    366.18445  598.4918  256.4498  80.954206                   0.8069547
## 25     62.36250   76.2667   56.6000   6.148024                   1.0132465
## 26     60.85000   69.7765   58.1000   3.131962                   0.9819307
## 27     55.52750   63.2000   51.1000   2.835085                   1.0021159
## 28     52.60000   58.4000   46.2000   2.894212                   1.0188672
## 29     42.53845   46.4500   36.5656   2.320795                   0.9954566
## 30     49.78335   54.5000   46.3250   2.200926                   0.9952544
pretty_headers <- str_to_title(colnames(UL_data))

UL_data1 <- datatable(data=UL_data,  rownames=FALSE,
                      colnames=pretty_headers,
                      filter=list(position='top'),
                      options=list(
                        dom='Bfrtip',
                        buttons=c('colvis','csv','excel'),
                        language=list(sSearch='Filter:')),
                      extensions=c('Buttons','Responsive')
                      )
UL_data1
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Connect to MySQL database.

ULsDb <- dbConnect(RMariaDB::MariaDB(), 
                       user='newuser', 
                       password='2020mystery$', 
                       dbname='UL_data', 
                       host='localhost')
dbListTables(ULsDb)
## character(0)
query <- "CREATE TABLE ul_data.ul_data (
SEQUENCE CHAR(99),
SYMBOL CHAR(25),
GENE_COUNT INT,
UL_MEAN FLOAT,
UL_MEDIAN FLOAT,
UL_MAX FLOAT,
UL_MIN FLOAT,
UL_SE FLOAT,
nonUL_MEAN FLOAT,
nonUL_MEDIAN FLOAT,
nonUL_MAX FLOAT,
nonUL_MIN FLOAT,
nonUL_SE FLOAT,
FOLDCHANGE_MEAN_UL_to_nonUL FLOAT
);"

rsInsert <- dbSendQuery(ULsDb, query)
dbClearResult(rsInsert)
#write the csv data from the UL_data file into the ul_data SQL table
dbWriteTable(ULsDb, value = UL_data, row.names = FALSE, name = "ul_data", append = TRUE )

Get those genes with Copy Number Variants (CNVs) greater than 2 and Fold change of the mean of UL gene expression to nonUL gene expression greater than 1.5 or a 50% increase in UL compared to nonUL. Note that the GENE_COUNT column is the CNV count as it is a count of the number of times the gene symbol is in the data with the attached gene sequence for each gene. Notice the sequences are not exact for each gene symbol.

query <- "select SEQUENCE,SYMBOL,GENE_COUNT,FOLDCHANGE_MEAN_UL_to_nonUL FROM ul_data.ul_data
WHERE( FOLDCHANGE_MEAN_UL_to_nonUL > 1.5 AND GENE_COUNT>2)
ORDER BY GENE_COUNT DESC,FOLDCHANGE_MEAN_UL_to_nonUL DESC LIMIT 30;"

rs = dbSendQuery(ULsDb,query)

dbRows<-dbFetch(rs)

The top 30 results of gene sequences and gene symbols with respective CNV counts and fold change values.

dbRows
##                                              SEQUENCE   SYMBOL GENE_COUNT
## 1  AGCTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAG   CTNNB1          7
## 2  CTGCAGGGGTCCTCTGTGAACTTGCTCAGGACAAGGAAGCTGCAGAAGCT   CTNNB1          7
## 3  GTCACCCAACGACCCATGACCGTTCTCTGAGGCAAAGGAGGGCAACCTGA   GABRB3          5
## 4  GATGCCCCTGACATCATCATTCTTGTGGGAGACAGCAGCCTGTATGTGGT     CHKA          5
## 5  CCCAACACCTGGGTAAACAATGACCTTCCAGAGAGTGGCTCCCGTATGCC   GABRB3          5
## 6  CTTTGTCCACATGGGCCGTTGACCTTAGAGTTAAGGCGGTTGCTTTTTTG   ZNF365          4
## 7  GCAGCCTTTTGGGTATCAAATGGGTCAAAACCATGGGACCTGCCACCTCC    NPTX1          4
## 8  AAGAGGGGTGCTGGAGGACATGGACAGCGACAAACCTGAGGCCGTGTCTA   GALNT9          4
## 9  AATGTCCCCGGATCTTTCGATGCAAGTCCTGGAGGCAGGGAGATCACTGC    FOXL2          4
## 10 ACCCAGCTTTTGTCCACAAGGTGACTGTAACTCAGAATGGAAAGTGGGCT    PLAG1          4
## 11 GGGCATGTCGTTCAGTGCCCCAGCAGTCAGAAGCAGTTTTTGTGGAACTA   PARD6G          4
## 12 GAGCCAGGCACCCTCCTCTGTCACTTGCTGCTTGGAGCCTTAACTCCAGA     FSD1          4
## 13 TGTGCTCTCTGCTGACTTGCAGGCGAGGCCGACTGTGCTGATGTACGTGA   PARD6G          4
## 14 TTCTTGCTGTGCTCTCTGCTGACTTGCAGGCGAGGCCGACTGTGCTGATG   PARD6G          4
## 15 GCCCAGTTTTCTCTGGTGGGTCCAGTGTTTTGTTCCTAGGTGTCTGCTGC  PPP2R2B          4
## 16 CAGCCTGGGTCTCAGCCCCGCGTACGGCCTTTCACGAGTCTTCAAGCCTT MGC39900          4
## 17 CCTGGGTCTCAGCCCCGCGTACGGCCTTTCACGAGTCTTCAAGCCTTCAG MGC39900          4
## 18 ACAGGACTGATGGCGGGACCGCGCTGCCCGAGAAAGGGACGGACCAATAC    FOXL2          4
## 19 TTTGGCGCTTTTGTGCGCGCCCGGGTCTGTTGGTGCTCAGAGTGTGGTCA    STMN1          4
## 20 GGGAAGACGACGGAACTTGCCACAGCGACAGACTTCTCCCAAACAGGAAA   TRIM36          4
## 21 GCTTTGCTGCCCTTTTATTGTACCCCAGGCCTCCTAGAGGACTCCTGCAG    FRAS1          4
## 22 ACCTTGCCATGGATGACCTTTGTGTAGGTAGTCCTTGCACCTCATGCAGG MGC39900          4
## 23 TTCTACTGGGCTCGTCTCTTTCCGGAGCCCCAGCGTCTCCTGCCCAAATT    FOXL2          4
## 24 GGCTTGGGTCTCTGCACTCACTGTCAGCAGCTTTCACCAGCAGTTTAAGC    PLAG1          4
## 25 CCGTGCAGCTGACCCCAGACGATGAGGATGCCATAAAGAGGATCGACCAG    PRKCZ          4
## 26 ACCTGGAGAGTGAAGACGGGCATGACGGCAGGTGAAGGGGTTTGCTGTGA   ZSCAN2          4
## 27 CCACCCAACCCGGTGGTTAACGATTTAATACATATAACCAAACCGGCAGC   PDZRN3          4
## 28 CAGGAGGACGGCATCATGCTGTCTGCCGACTGCTCTGAGCTCGGGCTGTC     LGR6          4
## 29 CTGTCACCTTCCTCCTCCCACTTCTGTATCCTCAGAGATGCTGCGTGGAT   BTBD11          4
## 30 TCAGGACCCACCACCTGCCGGACCATCATTTTCCGAGTCCCTCCGGCCTA  C3orf23          4
##    FOLDCHANGE_MEAN_UL_to_nonUL
## 1                     1.568983
## 2                     1.524832
## 3                     1.807217
## 4                     1.555827
## 5                     1.538612
## 6                     4.261581
## 7                     4.172457
## 8                     2.835567
## 9                     2.538581
## 10                    2.509276
## 11                    2.412239
## 12                    2.317374
## 13                    2.125194
## 14                    2.050708
## 15                    2.017088
## 16                    2.004920
## 17                    1.984822
## 18                    1.914530
## 19                    1.900091
## 20                    1.848526
## 21                    1.842715
## 22                    1.818676
## 23                    1.665787
## 24                    1.645051
## 25                    1.614649
## 26                    1.590096
## 27                    1.577921
## 28                    1.562422
## 29                    1.542656
## 30                    1.533255

For re-running the script we need to delete/drop the database and tables.

dbClearResult(rsInsert)

dbListTables(ULsDb)
## [1] "ul_data"
dbClearResult(rsInsert)

query <- "DROP TABLE ul_data.ul_data;"
rsInsert <- dbSendQuery(ULsDb, query)

dbClearResult(rsInsert)

dbDisconnect(ULsDb)

This Rmarkdown script showed the UL data of gene expressions in a DT interactive table, and by attaching to MySQL to get the top 30 highest fold change and copy number variant genes in the samples this study examined.