import pandas as pd
import numpy as np
import matplotlib
import plotly
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Read the three input tables in one pass: the TPM expression table that
# carries a "gene_id" column, the TPM expression table that carries a
# "symbol" column, and the cell-speed table (one row per sample).
pson_expr_tpm_df1, pson_expr_tpm_symbol_df, cell_speeds_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/alex/pson_expr_tpm_df.csv',
        '/home/alex/pson_expr_tpm_symbol_df.csv',
        '/home/alex/cell_speeds_df.csv',
    )
)
# Preview the first rows of the symbol-keyed table. As a bare expression this
# only displays something in an interactive or knitted session.
pson_expr_tpm_symbol_df.head(n=5)
##      symbol  mRNA_R17  mRNA_R21  ...  mRNA_R60  mRNA_R58  mRNA_R57
## 0    TSPAN6     33.56     45.10  ...      9.38     14.81      9.84
## 1      TNMD      0.00      0.00  ...      0.00      0.00      0.00
## 2      DPM1    169.46    129.88  ...     85.66    100.57     70.69
## 3     SCYL3      1.85      1.85  ...      3.49      4.09      4.40
## 4  C1orf112      5.73     11.85  ...     13.37     19.29     12.16
## 
## [5 rows x 64 columns]
# Remove the gene identifier column from the gene_id-keyed table
# (presumably leaving only the per-sample expression columns -- confirm
# against the CSV).
pson_expr_tpm_df = pson_expr_tpm_df1.drop("gene_id", axis="columns")
# Use the gene symbol as the row index, so the remaining 63 columns are all
# samples (see the recorded preview below).
pson_expr_tpm_symbol_df = pson_expr_tpm_symbol_df.set_index(keys="symbol")
pson_expr_tpm_symbol_df.head(n=5)
##           mRNA_R17  mRNA_R21  mRNA_R20  ...  mRNA_R60  mRNA_R58  mRNA_R57
## symbol                                  ...                              
## TSPAN6       33.56     45.10     39.42  ...      9.38     14.81      9.84
## TNMD          0.00      0.00      0.00  ...      0.00      0.00      0.00
## DPM1        169.46    129.88    132.06  ...     85.66    100.57     70.69
## SCYL3         1.85      1.85      1.77  ...      3.49      4.09      4.40
## C1orf112      5.73     11.85     10.16  ...     13.37     19.29     12.16
## 
## [5 rows x 63 columns]
def _log2_tpm(tpm):
    """Return log2(TPM + 1) of *tpm* after rounding TPM to one decimal.

    The +1 pseudocount maps a TPM of 0 to a log value of 0 instead of -inf.
    """
    return np.log2(tpm.round(1) + 1)


# Apply the same transform to the gene_id-derived table and to the
# symbol-indexed table.
pson_logtpm = _log2_tpm(pson_expr_tpm_df)
pson_logtpm_symbol = _log2_tpm(pson_expr_tpm_symbol_df)
pson_logtpm_symbol.head(n=5)
##           mRNA_R17  mRNA_R21  mRNA_R20  ...  mRNA_R60  mRNA_R58  mRNA_R57
## symbol                                  ...                              
## TSPAN6    5.112700  5.526695  5.336283  ...  3.378512  3.981853  3.432959
## TNMD      0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000
## DPM1      7.413628  7.032321  7.056367  ...  6.437960  6.666757  6.163901
## SCYL3     1.485427  1.485427  1.485427  ...  2.169925  2.350497  2.432959
## C1orf112  2.744161  3.678072  3.485427  ...  3.847997  4.343408  3.722466
## 
## [5 rows x 63 columns]
# Narrow the speed table in two steps: first to the
# "HyaluronicAcid Collagen" experimental condition, then to samples
# diagnosed as "Breast Cancer".
is_hyal_coll = cell_speeds_df["experimentalCondition"].eq("HyaluronicAcid Collagen")
hyal_coll_df = cell_speeds_df.loc[is_hyal_coll]
is_brca = hyal_coll_df["diagnosis"].eq("Breast Cancer")
hyal_brca_df = hyal_coll_df.loc[is_brca]
# In the recorded run exactly two samples remain (rows 42 and 56).
hyal_brca_df
##       sample summary_metric  ...      diagnosis    cellLine
## 42  mRNA_R56    speed_um_hr  ...  Breast Cancer       T-47D
## 56  mRNA_R63    speed_um_hr  ...  Breast Cancer  MDA-MB-231
## 
## [2 rows x 7 columns]
# Pick the expression columns of the two samples kept in hyal_brca_df by
# *name* rather than by hard-coded column position and row count. Positional
# selection silently returns the wrong sample if a column is added, dropped or
# reordered (e.g. after moving "symbol" into the index); a label lookup raises
# KeyError instead. Unpacking also enforces that exactly two samples remain.
# NOTE(review): the first row of hyal_brca_df is treated as the slow cell line
# and the second as the fast one, mirroring the column order of the original
# positional selection -- confirm against the speed values.
slow_sample, fast_sample = hyal_brca_df["sample"]
hyal_brca_samples = [slow_sample, fast_sample]

# All rows (genes) are kept; only the two sample columns are selected.
hyal_brca_logtpm = pson_logtpm.loc[:, hyal_brca_samples].copy()
hyal_brca_logtpm_symbol = pson_logtpm_symbol.loc[:, hyal_brca_samples].copy()

# Rename the sample columns to their role in the comparison.
hyal_brca_logtpm.columns = ["slow", "fast"]
hyal_brca_logtpm_symbol.columns = ["slow", "fast"]

# Displays only in an interactive or knitted session; the recorded run showed
# 18682 rows (genes) x 2 columns ("slow", "fast"), indexed by gene symbol.
hyal_brca_logtpm_symbol
# Scatter plot of per-gene log expression: slow cell line on the x axis,
# fast cell line on the y axis, with the identity line for reference.
# Start a fresh figure so this plot does not share axes with any other plot
# drawn in the same session.
plt.figure()

# Points for the y = x reference line; genes on it have equal expression in
# both cell lines.
x = np.linspace(0, 16, 100)
y = x

plt.scatter(hyal_brca_logtpm_symbol["slow"], hyal_brca_logtpm_symbol["fast"], c="black")
plt.plot(x, y, color="red")
# Axis labels follow the data passed to scatter(): first argument (x) is the
# slow line, second argument (y) is the fast line.
plt.xlabel("Log expression in slow cell line")
plt.ylabel("Log expression in fast cell line")

# Differential gene expression: per-gene difference of log2 expression,
# fast minus slow. Positive values mean higher expression in the fast line.
# Kept as a one-column DataFrame (column label 0).
dge = pd.DataFrame(hyal_brca_logtpm["fast"] - hyal_brca_logtpm["slow"])

# Table with both expression columns plus the difference, ordered from the
# largest to the smallest difference. Sorting on the named column works for
# any row index, whereas feeding index labels to .iloc is only correct while
# the index is the default 0..n-1 range.
DGE = pd.concat([hyal_brca_logtpm, dge], axis=1)
DGE.columns = ["slow", "fast", "dge"]
DGE = DGE.sort_values("dge", ascending=False)

# Row labels in descending-dge order.
order_dge = DGE.index
# Histogram of the dge values. A new figure is started first; otherwise the
# bars are drawn on whatever axes are current, which in a plain script run is
# the scatter plot created earlier.
plt.figure()
plt.hist(DGE["dge"])
# Recorded run (10 bins from about -12.33 to 9.70): 13654 of the 18682 genes
# fell in the bin spanning roughly -1.32 to 0.89; the counts were
# 2, 7, 46, 242, 1445, 13654, 2752, 434, 82, 18.
plt.title("Histogram of dge values")
plt.xlabel("Differential gene expression, dge")
# NOTE(review): the file selects the non-interactive 'Agg' backend, so show()
# is not expected to open a window -- use savefig() if a file is needed.
plt.show()

# Select strongly differential genes. dge is a difference of log2 values, so
# a cutoff of 4 keeps genes whose (TPM + 1) differs by more than 2**4 = 16x.
cutoff = 4
genelist = DGE["dge"]

# dge > cutoff: higher in the fast line; dge < -cutoff: higher in the slow line.
genesfast = genelist.loc[genelist.gt(cutoff)]
genesslow = genelist.loc[genelist.lt(-cutoff)]

# Counts of selected genes (values are from the recorded run).
len(genesfast)
## 289
len(genesslow)
## 207