import pandas as pd
import numpy as np
import matplotlib
import plotly
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Read the three input tables: expression values keyed by gene_id, expression
# values keyed by gene symbol, and the per-sample cell-speed table.
# NOTE(review): the paths are absolute and user-specific.
pson_expr_tpm_df1 = pd.read_csv(
    '/home/alex/pson_expr_tpm_df.csv'
)
pson_expr_tpm_symbol_df = pd.read_csv(
    '/home/alex/pson_expr_tpm_symbol_df.csv'
)
cell_speeds_df = pd.read_csv(
    '/home/alex/cell_speeds_df.csv'
)
# Preview the first five rows of the gene_id-keyed table.
pson_expr_tpm_df1.head(n=5)
##            gene_id  mRNA_R17  mRNA_R21  ...  mRNA_R60  mRNA_R58  mRNA_R57
## 0  ENSG00000000003     33.56     45.10  ...      9.38     14.81      9.84
## 1  ENSG00000000005      0.00      0.00  ...      0.00      0.00      0.00
## 2  ENSG00000000419    169.46    129.88  ...     85.66    100.57     70.69
## 3  ENSG00000000457      1.85      1.85  ...      3.49      4.09      4.40
## 4  ENSG00000000460      5.73     11.85  ...     13.37     19.29     12.16
## 
## [5 rows x 64 columns]
# Preview the first five rows of the symbol-keyed expression table.
pson_expr_tpm_symbol_df.head(n=5)
##      symbol  mRNA_R17  mRNA_R21  ...  mRNA_R60  mRNA_R58  mRNA_R57
## 0    TSPAN6     33.56     45.10  ...      9.38     14.81      9.84
## 1      TNMD      0.00      0.00  ...      0.00      0.00      0.00
## 2      DPM1    169.46    129.88  ...     85.66    100.57     70.69
## 3     SCYL3      1.85      1.85  ...      3.49      4.09      4.40
## 4  C1orf112      5.73     11.85  ...     13.37     19.29     12.16
## 
## [5 rows x 64 columns]
# Preview the first five rows of the cell-speed table.
cell_speeds_df.head(n=5)
##      sample summary_metric  ...     diagnosis  cellLine
## 0  mRNA_R17    speed_um_hr  ...  Colon Cancer     SW620
## 1  mRNA_R21    speed_um_hr  ...  Colon Cancer     SW620
## 2  mRNA_R20    speed_um_hr  ...  Colon Cancer     SW620
## 3  mRNA_R19    speed_um_hr  ...  Colon Cancer     SW620
## 4  mRNA_R18    speed_um_hr  ...  Colon Cancer     SW620
## 
## [5 rows x 7 columns]
# Sorted distinct cell-line names present in the speed table.
np.unique(cell_speeds_df.loc[:, "cellLine"])
## array(['22Rv1', 'A375', 'MDA-MB-231', 'RWPE-1', 'SW480', 'SW620', 'T-47D',
##        'T98G', 'U-87'], dtype=object)
# Sorted distinct diagnosis labels present in the speed table.
np.unique(cell_speeds_df.loc[:, "diagnosis"])
## array(['Brain Cancer', 'Breast Cancer', 'Colon Cancer', 'Not Applicable',
##        'Prostate Cancer', 'Skin Cancer'], dtype=object)
# Two-column view of cell line and diagnosis for every sample.
dt = cell_speeds_df[["cellLine", "diagnosis"]]
# np.unique flattens the frame, so the result mixes cell-line names and
# diagnosis labels in one sorted array (see the captured output below).
# NOTE(review): if unique (cellLine, diagnosis) pairs were intended,
# dt.drop_duplicates() would give them -- confirm the intent.
np.unique(dt)
## array(['22Rv1', 'A375', 'Brain Cancer', 'Breast Cancer', 'Colon Cancer',
##        'MDA-MB-231', 'Not Applicable', 'Prostate Cancer', 'RWPE-1',
##        'SW480', 'SW620', 'Skin Cancer', 'T-47D', 'T98G', 'U-87'],
##       dtype=object)
# Strip the identifier column so only sample columns remain; rows stay on the
# default positional index.
pson_expr_tpm_df = pson_expr_tpm_df1.drop("gene_id", axis=1)
# Move the gene symbol into the row index of the symbol-keyed table.
pson_expr_tpm_symbol_df = pson_expr_tpm_symbol_df.set_index(keys="symbol")
pson_expr_tpm_symbol_df.head(n=5)
##           mRNA_R17  mRNA_R21  mRNA_R20  ...  mRNA_R60  mRNA_R58  mRNA_R57
## symbol                                  ...                              
## TSPAN6       33.56     45.10     39.42  ...      9.38     14.81      9.84
## TNMD          0.00      0.00      0.00  ...      0.00      0.00      0.00
## DPM1        169.46    129.88    132.06  ...     85.66    100.57     70.69
## SCYL3         1.85      1.85      1.77  ...      3.49      4.09      4.40
## C1orf112      5.73     11.85     10.16  ...     13.37     19.29     12.16
## 
## [5 rows x 63 columns]
# Column-wise totals: one sum per sample column.
pson_expr_tpm_symbol_df.sum(axis=0)
## mRNA_R17    925249.15
## mRNA_R21    939584.77
## mRNA_R20    936327.99
## mRNA_R19    939342.91
## mRNA_R18    937213.35
##               ...    
## mRNA_R62    950446.64
## mRNA_R61    951255.04
## mRNA_R60    946777.25
## mRNA_R58    951215.83
## mRNA_R57    949333.95
## Length: 63, dtype: float64
# log2(x + 1) transform of both expression tables. Values are rounded to one
# decimal place before the pseudocount of 1 is added, so zeros map to 0.
# The intermediate module-level names are kept in case later code uses them.

# gene_id-derived table (positional row index)
df3 = pd.DataFrame(pson_expr_tpm_df)
df = df3.round(1)
dfadd = df.add(1)
pson_logtpm = np.log2(dfadd)

# symbol-indexed table
df4 = pd.DataFrame(pson_expr_tpm_symbol_df)
df1 = df4.round(1)
dfadd1 = df1.add(1)
pson_logtpm_symbol = np.log2(dfadd1)
pson_logtpm_symbol.head(n=5)
##           mRNA_R17  mRNA_R21  mRNA_R20  ...  mRNA_R60  mRNA_R58  mRNA_R57
## symbol                                  ...                              
## TSPAN6    5.112700  5.526695  5.336283  ...  3.378512  3.981853  3.432959
## TNMD      0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000
## DPM1      7.413628  7.032321  7.056367  ...  6.437960  6.666757  6.163901
## SCYL3     1.485427  1.485427  1.485427  ...  2.169925  2.350497  2.432959
## C1orf112  2.744161  3.678072  3.485427  ...  3.847997  4.343408  3.722466
## 
## [5 rows x 63 columns]
# Sorted distinct experimental conditions present in the speed table.
np.unique(cell_speeds_df.loc[:, "experimentalCondition"])
## array(['30 kPa polyacrylamide Collagen',
##        '30 kPa polyacrylamide Fibronectin',
##        '500 Pa polyacrylamide Collagen',
##        '500 Pa polyacrylamide Fibronectin', 'Glass',
##        'HyaluronicAcid Collagen', 'HyaluronicAcid Fibronectin'],
##       dtype=object)
# Keep only the samples measured under the "HyaluronicAcid Collagen" condition.
is_hyal_coll = cell_speeds_df["experimentalCondition"].eq("HyaluronicAcid Collagen")
hyal_coll_df = cell_speeds_df.loc[is_hyal_coll]
hyal_coll_df
##       sample summary_metric  ...        diagnosis    cellLine
## 1   mRNA_R21    speed_um_hr  ...     Colon Cancer       SW620
## 7   mRNA_R42    speed_um_hr  ...     Colon Cancer       SW480
## 14  mRNA_R28    speed_um_hr  ...   Not Applicable      RWPE-1
## 21  mRNA_R49    speed_um_hr  ...      Skin Cancer        A375
## 28  mRNA_R14    speed_um_hr  ...     Brain Cancer        T98G
## 35   mRNA_R7    speed_um_hr  ...  Prostate Cancer       22Rv1
## 42  mRNA_R56    speed_um_hr  ...    Breast Cancer       T-47D
## 49  mRNA_R35    speed_um_hr  ...     Brain Cancer        U-87
## 56  mRNA_R63    speed_um_hr  ...    Breast Cancer  MDA-MB-231
## 
## [9 rows x 7 columns]
# Narrow the HyaluronicAcid Collagen samples down to the breast-cancer ones.
is_breast_cancer = hyal_coll_df["diagnosis"].eq("Breast Cancer")
hyal_brca_df = hyal_coll_df.loc[is_breast_cancer]
hyal_brca_df.head(n=5)
##       sample summary_metric  ...      diagnosis    cellLine
## 42  mRNA_R56    speed_um_hr  ...  Breast Cancer       T-47D
## 56  mRNA_R63    speed_um_hr  ...  Breast Cancer  MDA-MB-231
## 
## [2 rows x 7 columns]
# Pull the two breast-cancer "HyaluronicAcid Collagen" samples out of the
# log2 matrices. Columns are chosen by sample label (taken from hyal_brca_df)
# instead of by hard-coded position. Both matrices hold 63 sample columns
# (gene_id was dropped / symbol became the index), and the former positional
# picks [43, 57] sat one past rows 42 and 56 that cell_speeds_df reports for
# these samples -- assuming the sample columns follow the cell_speeds_df row
# order, that selected neighbouring samples.
hyal_brca_samples = hyal_brca_df["sample"].tolist()
if len(hyal_brca_samples) != 2:
    raise ValueError(
        "expected exactly two breast-cancer samples, got %d"
        % len(hyal_brca_samples)
    )
# The 18682-row cap is carried over unchanged from the original selection.
# NOTE(review): presumably the number of genes in the matrices -- confirm.
hyal_brca_logtpm = pson_logtpm.iloc[0:18682][hyal_brca_samples].copy()
hyal_brca_logtpm_symbol = pson_logtpm_symbol.iloc[0:18682][hyal_brca_samples].copy()
# Column order follows hyal_brca_df (T-47D first, MDA-MB-231 second).
# NOTE(review): the slow/fast labels assume that order matches the speed
# ranking of the two cell lines -- confirm against the speed values.
hyal_brca_logtpm.columns = ["slow", "fast"]
hyal_brca_logtpm_symbol.columns = ["slow", "fast"]
hyal_brca_logtpm_symbol.head()
# NOTE: the output below was captured with the positional selection; values
# may differ once the script is re-run with label-based selection.
##               slow      fast
## symbol                      
## TSPAN6    4.224966  4.061776
## TNMD      0.000000  0.000000
## DPM1      6.960002  6.738768
## SCYL3     3.292782  2.632268
## C1orf112  3.498251  4.095924
# Per-gene difference of log2 expression between the two samples
# (fast minus slow), i.e. a log2 fold change.
dge = hyal_brca_logtpm["fast"] - hyal_brca_logtpm["slow"]
# The difference Series is unnamed, so the frame's single column is labelled 0.
dge = pd.DataFrame(dge)
DGE = pd.concat([hyal_brca_logtpm, dge], axis = 1)
DGE.columns = ["slow", "fast", "dge"]
# Disabled: dge carries the positional row index of hyal_brca_logtpm while
# hyal_brca_logtpm_symbol is indexed by symbol, so this concat would not
# align rows as written.
#DGE_symbol = pd.concat([hyal_brca_logtpm_symbol, dge], axis = 1)
#DGE_symbol
# Sort rows by descending difference. Sorting on the column itself avoids
# feeding index labels to iloc, which is only correct while the index happens
# to be a default 0..n-1 range.
DGE = DGE.sort_values("dge", ascending = False)
order_dge = DGE.index
#DGE_symbol.head()
# Draw the histogram first, then call show() with no arguments: show() does
# not take plot data, and its only parameter (block) is keyword-only.
plt.hist(DGE["dge"])
plt.show()