# Load the sample annotations and the expression table from CSV files
# given by paths relative to the working directory.
conditions <- read.csv(file = "conditions_annotation.csv")
expression <- read.csv(file = "SC_expression.csv")
# Show the column names and types of the annotation table
str(object = conditions)
## 'data.frame':    92 obs. of  4 variables:
##  $ ID                    : chr  "AFIQCI" "AFIQBR" "AFIINC" "AFNAQI" ...
##  $ primary               : chr  "wildtype" "wildtype" "itc1" "itc1" ...
##  $ secondary             : chr  "wildtype 1" "wildtype 2" "itc1-1_dUTP" "itc1-1_dUTP" ...
##  $ additional_information: chr  "" "" "" "" ...
# Preview the first six rows of the expression table
head(x = expression, n = 6L)
##         X     IFFABF    SAASCC      IFFAFF      IFFAFA   INICIA   INICIF
## 1 YAL008W 70.8049308 30.970186 118.3846533 110.7039503 74.10224 89.80222
## 2 YBR255W 36.3687858 16.860290  27.7264519  34.5279281 35.44020 56.97130
## 3 YGR164W  0.3513892  0.904239   0.4560272   0.2926096  0.00000  1.10356
## 4 YGR131W  7.3791739  5.858715  12.1303227  12.6797476 11.27643 18.48464
## 5 YNL003C 82.0493864 78.480411  67.2184048  64.1790302 75.71315 75.31799
## 6 YBR135W 45.0656694 21.004719  53.8112060  42.9160688 17.72010 14.76012
##       SAABQI    FFNAAA     IFAAAR    AFIINC    IFFABN     SAARRS   FFNARN
## 1  7.3415361 112.09117  5.9736681 1.5999440 80.906634  25.248508 92.87835
## 2 15.4521855  26.85875  5.3840882 3.2055216 35.588256  18.951094 20.78055
## 3  0.7691133  18.08554  0.7119455 0.6140630  0.399868   3.747059 16.48786
## 4  2.3772593  22.25159  4.4051630 0.4675893 10.263280  12.300555 14.73175
## 5 20.1367847  43.32689 30.3244305 3.6505765 84.638736 120.082451 38.53671
## 6 13.4944425  21.76147  6.7746068 2.4280840 35.055098  17.715153 12.87809
##       SAABQR   FFNAAR    FAFASB     SAABBN      SAASCF    INICIQ     SAABFA
## 1 14.3453666 97.81174  9.706003 11.5621808  53.7957089 87.287511  7.1145370
## 2 19.8577066 26.51142 10.002635 14.0147646  23.5684783 47.303048 12.5800091
## 3  0.4648961 21.62535  1.101011  0.9009491   0.8542478  1.249514  0.7067421
## 4  4.2172721 19.09184  1.095667  1.5015819   9.5719563 22.937516  0.9423228
## 5 20.5550508 48.86063 10.529089 15.2660828 126.0782168 95.498606 12.5800091
## 6 12.4857821 22.98259 12.103108 14.0147646  24.3789186 10.977877  9.3289955
##        IFFAFQ     SAABRQ      IFFAFB    INICIA.1    QCFQIA     FFNAAC    QNCSCN
## 1 123.6629036  10.232139 100.5829827 148.0641808  7.210451 103.392926 25.230781
## 2  32.2781138  14.944556  32.7070538  37.2228387 13.255577  27.390313 24.827382
## 3   0.5589284   3.961434   0.3516888   0.3308697  2.840481   3.524759  4.327372
## 4  13.1348169   4.505896   9.6128259 546.9275773  2.348859  18.431551  4.877462
## 5  67.9097979 105.325323  67.4070106  60.5491510 17.133926  47.143649 43.933831
## 6  44.9937344  13.236070  38.8029922   8.2717419 18.372084  27.684043 39.533113
##       SAABQQ      INICIC     INICIB    IAAICI    SAABIC   FAFAFF    IFAAAA
## 1 11.3035156 119.7574980 64.4814982 13.098598 15.303059 3.420917  6.147074
## 2 14.4192283  21.0620018 44.9510444 19.570847  8.763290 3.890629  5.891871
## 3  0.7245843   0.3481323  0.9300216  2.157416  1.613143 2.488141  1.054101
## 4  2.8983373 438.1244514 22.9405330  2.465618  2.833900 1.650637  4.660237
## 5 15.5061048  80.9407508 93.6221753 27.121803 20.622071 5.895765 28.438540
## 6  9.7094301  16.5362824 12.2452845 38.525289 11.684387 4.342615  7.522954
##      SAABIB     SAARAA    QCFFAB    INICSR    AFNCCR      INICIR  SAABFA.1
## 1 14.107651  17.562015  7.386431 84.979729 1.6045494  80.5725155 4.6628026
## 2  9.637325  23.270167 14.158209 63.216628 3.4161373  38.1860263 7.5517129
## 3  1.393348   4.713701  2.151285  7.254367 0.6314678   0.7637205 0.6081916
## 4  4.412269  11.197525  2.278454 14.508734 0.4934421  26.0619629 0.7602396
## 5 19.913269 125.539570 17.941502 42.489865 4.6790730 113.1261029 9.6297010
## 6 12.946527  23.309945 18.058074  7.254367 3.9544378  10.2147620 6.4873775
##        SAARRR     IFFABI    SARICB     INICSA    QCAQFI   FAFAFQ     SAASAQ
## 1  32.3772163 86.3460797 1.4843492  40.752473 21.094260 3.799670 35.9634167
## 2  30.5309261 36.2690356 6.2972391  31.157837  1.362831 4.153840 17.6560149
## 3   0.8056539  0.4602669 0.1349408   9.891377  1.836860 2.662830  0.8228046
## 4  17.1369299  9.1132856 0.6297239  28.190424  2.133127 1.912952  5.7767738
## 5 123.0132806 76.4963670 4.5430082 126.708538 16.502111 6.383796 91.7255686
## 6  69.4037270 50.6293644 2.4739154  26.706718 10.961905 4.580155 16.7646433
##       IAAICF      INICIS    AFIQBR    FABINF   FABINB     IFFABQ    IFFAFR
## 1 10.0163057  71.3208488 0.9131177 2.9034705 2.973727 75.5213649 78.871847
## 2 29.7359076  18.4637731 1.7245724 2.6832326 3.363441 33.7479669 37.446574
## 3  1.9823938   0.1206783 0.4639525 1.5802074 2.524501  0.4115606  1.170205
## 4  0.8346921 359.8625571 0.1663575 0.4312994 0.514499  8.4369917  8.191438
## 5 34.1180413  80.3717180 2.0868620 5.2324870 5.960893 74.9040241 75.712292
## 6 30.8836093  17.6190253 1.6136674 3.8082815 4.501866 41.3618375 37.446574
##       FFNAAQ    QCAQFQ   IFFAFA.1   AFNCCA    QNCSCS    AFIQCI    QNCSCQ
## 1 126.891141 34.945444 117.398963 3.244893 21.068795 1.5778249 25.953675
## 2  30.376063  6.604097  28.296160 7.346928 23.305008 2.7106923 23.344930
## 3   4.090045  1.480739   0.301023 1.390668  2.353030 0.5432304  2.842864
## 4  19.253137  1.332665  10.686316 1.128278  4.514146 0.4012807  4.715810
## 5  45.589034 17.857714  59.301527 9.953338 41.645290 3.7835040 48.897260
## 6  31.473392 12.823201  40.638102 7.361505 54.478481 2.5196062 40.034213
##      SAABFI     IFFABC      IFFAFC    QCAQFS      INICII    IAAICQ     SAABFR
## 1 15.434716 77.3586657 106.1655599 44.996731 128.2320386 15.528619  9.8278590
## 2 13.780115 37.1147756  39.8865001 51.122792  47.1169594 34.503537  7.9692954
## 3  1.728688  0.1738397   0.3968806  1.192684   0.1847724  1.419064  0.5632011
## 4  4.667458 11.5603399   7.3422911  2.168517 443.0841912  2.108324  1.3235226
## 5 22.695207 79.6185818  71.4385076 31.118221  42.6824221 39.490535 12.4185840
## 6 18.620442 54.5856652  36.9098956 21.929130   8.4995299 28.056930 12.4185840
##       SAABFN    QNCSCI    IAAICS    SAABFS     QCAQFN    SICIBA     SAABQS
## 1 14.5538531 24.428979 11.445349 10.399637 43.5405506  46.13813 11.8666284
## 2 20.0536382 21.706792 21.212047 13.187547 59.0769248 124.16212 17.2840022
## 3  0.7482701  2.969659  1.678651  1.815810  2.4603867  13.46222  0.5159404
## 4  4.1528990  4.065604  2.619713  3.741668  0.4976063  35.45683  2.8376720
## 5 19.2305412 45.287296 25.281504 21.514593 35.1088884  63.61373 22.5723909
## 6 15.5640177 45.393356 35.149938 15.681992 17.3332859  22.75305 12.2535837
##      FAFAQQ    IFFABS     SAABIF     FFNARI      INICIN FFNAAA.1    SAABIQ
## 1 10.106824 88.473574 12.3602202 153.113996  75.6719123 95.29289 12.953973
## 2  9.793654 36.050109 21.6454588  22.203842  33.9282329 20.87077 21.543401
## 3  1.610132  0.000000  0.7838188   5.088381   0.7355714 15.83124  1.222073
## 4  1.357066  8.904808  3.0749816  20.816102  22.3429826 22.34700  2.199731
## 5 12.805140 83.733918 22.0072214  32.380603 119.5303597 39.75627 21.124404
## 6 14.326699 47.971061 10.0690575  20.816102  12.5966610 19.49635 10.684409
##     AFNAQI      SAANNN    IFFABB     FFNARS    SAABQF       IFFAFS   IFAAAA.1
## 1 1.638143  34.0660510 81.346495 127.981129 14.649997 101.58666703  6.5740958
## 2 3.547613  28.9971045 35.137835  31.181031 13.574768  30.90338738  6.4596733
## 3 0.712236   0.9045595  0.000000   2.586445  1.209633   0.08218986  0.9777927
## 4 0.491782  11.0253852  7.059656  21.936885  2.016055   9.36964405  4.6913247
## 5 4.232717 131.2123246 84.715876  41.766297 17.539675  66.16283735 26.7124654
## 6 2.482651  59.9057309 47.010893  25.002301 10.953897  43.06748667  7.0525902
##        IFFAFI     SARIAI
## 1 110.2845047 1.48926863
## 2  29.0860232 5.11936093
## 3   0.1009931 0.04653964
## 4  12.0181832 0.93079290
## 5  75.2398865 3.90933016
## 6  42.9220829 2.23390295

1. Filter annotations for a specific condition

# Keep the annotation rows whose primary condition contains "itc1"
myFilter <- conditions[grep(pattern = "itc1", x = conditions$primary), ]
print(myFilter)
##       ID primary   secondary additional_information
## 3 AFIINC    itc1 itc1-1_dUTP                       
## 4 AFNAQI    itc1 itc1-1_dUTP

2. Select expression data from filtered annotations

Select only the columns corresponding to our filtered conditions.

# Columns to keep: the gene names column (X) followed by one column per
# sample ID retained in myFilter
selectedColumns <- c("X", myFilter$ID)
expressionFiltered <- select(expression, all_of(selectedColumns))
# Inspect the structure of the reduced table
str(expressionFiltered)
## 'data.frame':    6071 obs. of  3 variables:
##  $ X     : chr  "YAL008W" "YBR255W" "YGR164W" "YGR131W" ...
##  $ AFIINC: num  1.6 3.206 0.614 0.468 3.651 ...
##  $ AFNAQI: num  1.638 3.548 0.712 0.492 4.233 ...

3. Make tidy data for analysis

Convert to long format, excluding gene names column.

# Reshape to long format: every column except X is stacked, with the
# former column name stored in `treatment` and the cell value stored in
# `expressionValue`.
tidyExpression <- pivot_longer(
  expressionFiltered,
  cols = -X,
  names_to = "treatment",
  values_to = "expressionValue"
)
# Preview the first six rows
head(tidyExpression, n = 6L)
## # A tibble: 6 × 3
##   X       treatment expressionValue
##   <chr>   <chr>               <dbl>
## 1 YAL008W AFIINC              1.60 
## 2 YAL008W AFNAQI              1.64 
## 3 YBR255W AFIINC              3.21 
## 4 YBR255W AFNAQI              3.55 
## 5 YGR164W AFIINC              0.614
## 6 YGR164W AFNAQI              0.712

4. Create summary statistics

Group by treatment and calculate summary statistics.

# Per-treatment mean, median and row count. NA values are dropped when
# computing the mean and median, but n() still counts every row. The
# result is returned ungrouped (.groups = "drop").
summaryStats <- summarise(
  group_by(tidyExpression, treatment),
  mean = mean(expressionValue, na.rm = TRUE),
  median = median(expressionValue, na.rm = TRUE),
  n = n(),
  .groups = "drop"
)
print(summaryStats)
## # A tibble: 2 × 4
##   treatment  mean median     n
##   <chr>     <dbl>  <dbl> <int>
## 1 AFIINC     165.   3.62  6071
## 2 AFNAQI     165.   3.97  6071

5. Create violin plot with enhancements

Create enhanced violin plot with log transformation.

# Violin plot of the log-transformed expression distribution for each
# treatment, with a narrow boxplot overlaid on every violin.
violinPlot <- ggplot(
  tidyExpression,
  aes(
    x = treatment,
    # log1p(x) computes log(1 + x): zero expression values map to 0
    # instead of -Inf, and the result stays accurate for values near 0
    y = log1p(expressionValue)
  )
) +
  # Main violin plot showing distribution density, one fill color per
  # treatment; trim = FALSE leaves the violin tails untrimmed
  geom_violin(
    aes(fill = treatment),
    alpha = 0.7,
    trim = FALSE
  ) +
  # Overlay boxplot to show quartiles and outliers
  geom_boxplot(
    width = 0.2,
    alpha = 0.9,
    outlier.size = 1.5,
    outlier.alpha = 0.9
  ) +
  # Labels and titles
  labs(
    title = "Distribution of Gene Expression Values by Treatment",
    subtitle = "ITC1 conditions - Log-transformed expression values",
    x = "Treatment Condition",
    y = "Log(Expression Value +1)"
  ) +
  # Apply minimal theme and customize text sizes
  theme_minimal() +
  theme(
    # Make axis titles large and bold
    axis.title.x = element_text(size = 16, face = "bold"),
    axis.title.y = element_text(size = 16, face = "bold"),
    # Make tick labels large; x labels are rotated 45 degrees
    axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 14),
    # Enhance plot titles
    plot.title = element_text(size = 18, face = "bold"),
    plot.subtitle = element_text(size = 14),
    # Remove legend (the x axis already names each treatment)
    legend.position = "none"
  ) +
  # Discrete viridis fill scale using the "plasma" option
  scale_fill_viridis_d(option = "plasma")

print(violinPlot)

Summary

I chose “itc1” as the condition instead of wildtype, used ‘all_of()’ to avoid conflicts, and added a log(expressionValue + 1) transformation to handle zero values. A boxplot overlay, outlier points, and enhanced theming were applied to the violin plot.