CC/C and DD/D differential expression (DE) overlap

CC/C DD/D differential expression gene statistics and overlap analysis

[1]:
import pandas as pd
import os, glob
import matplotlib.pyplot as plt
import numpy as np

import matplotlib as mpl
from matplotlib_venn import venn3,venn2
# Font type 42 embeds TrueType fonts in PDF/PS output (matplotlib's other option is Type 3).
mpl.rcParams['pdf.fonttype']=42
mpl.rcParams['ps.fonttype']=42
/Users/yuanzan/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED

All CC/C DE genes

[2]:
def read_allFC_data(filepath):
    """Load a differential-expression table from an Excel workbook.

    The first column of the sheet becomes the row index. Only the
    ``log2FoldChange``, ``regulated``, ``padj`` and ``pvalue`` columns are
    returned, in that order.
    """
    wanted_columns = ["log2FoldChange", "regulated", "padj", "pvalue"]
    table = pd.read_excel(filepath, index_col=0)
    return table[wanted_columns]

def read_allFC_data2(filepath):
    """Load a differential-expression table from a delimited text file.

    Uses ``pd.read_table`` (tab separator by default) with the first column as
    the row index, and returns only the ``log2FoldChange``, ``regulated``,
    ``padj`` and ``pvalue`` columns, in that order.
    """
    kept = ["log2FoldChange", "regulated", "padj", "pvalue"]
    return pd.read_table(filepath, index_col=0)[kept]

# Directory of the per-timepoint CC/C "*.all.annot.xlsx" workbooks
# (path segments: 番茄转录组数据分析 = "tomato transcriptome data analysis", 全部基因 = "all genes").
allFC_dir = "~/Documents/phd/tomato_metabolic/data/番茄转录组数据分析/CC:C/全部基因/"
# Directory whose "newdata/" sub-folder holds the CC/C tables read with read_allFC_data2.
allFC_dir2 = "~/Documents/phd/tomato_metabolic/upsetR/CC:C/"
[3]:
# The 24 h and 84 h CC/C tables are read as delimited text (pd.read_table) from "newdata/".
# NOTE(review): the file names suggest replicates C24-3 / C84-2 were removed — confirm.
C24_allFC = read_allFC_data2(allFC_dir2 + "newdata/C24_rmC24-3_new_DE.xls")
C84_allFC = read_allFC_data2(allFC_dir2 + "newdata/C84_rmC84-2_new_DE.xls")
[4]:

# Load the CC/C differential-expression table of every timepoint (hours).
# 24 h and 84 h come from the tab-delimited "newdata" files; all other
# timepoints come from the "*.all.annot.xlsx" workbooks.
C12_allFC = read_allFC_data(allFC_dir + "C12h_vs_CC12h.all.annot.xlsx")
C24_allFC = read_allFC_data2(allFC_dir2 + "newdata/C24_rmC24-3_new_DE.xls")
C36_allFC = read_allFC_data(allFC_dir + "C36h_vs_CC36h.all.annot.xlsx")
C48_allFC = read_allFC_data(allFC_dir + "C48h_vs_CC48h.all.annot.xlsx")
C60_allFC = read_allFC_data(allFC_dir + "C60h_vs_CC60h.all.annot.xlsx")
C72_allFC = read_allFC_data(allFC_dir + "C72h_vs_CC72h.all.annot.xlsx")
C84_allFC = read_allFC_data2(allFC_dir2 + "newdata/C84_rmC84-2_new_DE.xls")
C96_allFC = read_allFC_data(allFC_dir + "C96h_vs_CC96h.all.annot.xlsx")
C120_allFC = read_allFC_data(allFC_dir + "C120h_vs_CC120h.all.annot.xlsx")
C168_allFC = read_allFC_data(allFC_dir + "C168h_vs_CC168h.all.annot.xlsx")
C216_allFC = read_allFC_data(allFC_dir + "C216h_vs_CC216h.all.annot.xlsx")
[5]:
def de_gene(df, name, lfc_threshold=1, padj_threshold=0.01):
    """Return the log2 fold changes of the differentially expressed rows of *df*.

    A row is kept when ``abs(log2FoldChange) >= lfc_threshold`` and
    ``padj < padj_threshold``; rows with a missing value in either column fail
    the comparison and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``log2FoldChange`` and ``padj`` columns. It is not modified.
    name : str
        Label given to the single column of the result.
    lfc_threshold : float, default 1
        Minimum absolute log2 fold change.
    padj_threshold : float, default 0.01
        Exclusive upper bound on the adjusted p-value.

    Returns
    -------
    pandas.DataFrame
        One column called *name* holding ``log2FoldChange`` of the kept rows.
    """
    # Filter with a boolean mask instead of adding a helper column, so the
    # caller's frame is left untouched.
    is_de = (df["log2FoldChange"].abs() >= lfc_threshold) & (df["padj"] < padj_threshold)
    return df.loc[is_de, ["log2FoldChange"]].rename(columns={"log2FoldChange": name})


# One column per CC/C timepoint holding the log2 fold change of the genes that
# pass de_gene's filter; genes absent at a timepoint are filled with 0.
df = pd.concat(
    [
        de_gene(table, label)
        for table, label in (
            (C12_allFC, 'CC/C 12'),
            (C24_allFC, 'CC/C 24'),
            (C36_allFC, 'CC/C 36'),
            (C48_allFC, 'CC/C 48'),
            (C60_allFC, 'CC/C 60'),
            (C72_allFC, 'CC/C 72'),
            (C84_allFC, 'CC/C 84'),
            (C96_allFC, 'CC/C 96'),
            (C120_allFC, 'CC/C 120'),
            (C168_allFC, 'CC/C 168'),
            (C216_allFC, 'CC/C 216'),
        )
    ],
    axis=1,
).fillna(0)
[6]:
# Collapse fold changes to their direction: positive -> 1, negative -> -1, zero stays 0.
df = df.mask(df > 0, 1).mask(df < 0, -1)
[ ]:

[ ]:

[7]:
import seaborn as sns
# Clustered heatmap of the CC/C matrix `df` (genes x timepoints); columns are clustered as well.
x = sns.clustermap(
    df,
    figsize=(5, 6),
    col_cluster=True,
    # share of the figure given to the (row, column) dendrograms
    dendrogram_ratio=(.1, .2),
    cmap="vlag",

    # colour bar placed as (left, bottom, width, height)
    cbar_pos=(1, 0.2, .03, .4)
)
x.savefig("CC_C-all_DEgene_cluster.pdf")
#ax.set(xlabel="", yticklabels="")
#ax.xaxis.tick_top()
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_10_0.png

All DD/D DE genes

[8]:
# Directory of the per-timepoint DD/D "*.all.annot.xlsx" workbooks
# (path segments: 番茄转录组数据分析 = "tomato transcriptome data analysis", 全部基因 = "all genes").
DDallFC_dir = "~/Documents/phd/tomato_metabolic/data/番茄转录组数据分析/DD:D/全部基因/"

# One table per timepoint (hours); every DD/D table is read from an Excel workbook.
D12_allFC = read_allFC_data(DDallFC_dir + "D12h_vs_DD12h.all.annot.xlsx")
D24_allFC = read_allFC_data(DDallFC_dir + "D24h_vs_DD24h.all.annot.xlsx")
D36_allFC = read_allFC_data(DDallFC_dir + "D36h_vs_DD36h.all.annot.xlsx")
D48_allFC = read_allFC_data(DDallFC_dir + "D48h_vs_DD48h.all.annot.xlsx")
D60_allFC = read_allFC_data(DDallFC_dir + "D60h_vs_DD60h.all.annot.xlsx")
D72_allFC = read_allFC_data(DDallFC_dir + "D72h_vs_DD72h.all.annot.xlsx")
D84_allFC = read_allFC_data(DDallFC_dir + "D84h_vs_DD84h.all.annot.xlsx")
D96_allFC = read_allFC_data(DDallFC_dir + "D96h_vs_DD96h.all.annot.xlsx")
D120_allFC = read_allFC_data(DDallFC_dir + "D120h_vs_DD120h.all.annot.xlsx")
D168_allFC = read_allFC_data(DDallFC_dir + "D168h_vs_DD168h.all.annot.xlsx")
D216_allFC = read_allFC_data(DDallFC_dir + "D216h_vs_DD216h.all.annot.xlsx")

[9]:
def de_gene(df, name, lfc_threshold=1, padj_threshold=0.01):
    """Return the log2 fold changes of the differentially expressed rows of *df*.

    A row is kept when ``abs(log2FoldChange) >= lfc_threshold`` and
    ``padj < padj_threshold``; rows with a missing value in either column fail
    the comparison and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``log2FoldChange`` and ``padj`` columns. It is not modified.
    name : str
        Label given to the single column of the result.
    lfc_threshold : float, default 1
        Minimum absolute log2 fold change.
    padj_threshold : float, default 0.01
        Exclusive upper bound on the adjusted p-value.

    Returns
    -------
    pandas.DataFrame
        One column called *name* holding ``log2FoldChange`` of the kept rows.
    """
    # Filter with a boolean mask instead of adding a helper column, so the
    # caller's frame is left untouched.
    is_de = (df["log2FoldChange"].abs() >= lfc_threshold) & (df["padj"] < padj_threshold)
    return df.loc[is_de, ["log2FoldChange"]].rename(columns={"log2FoldChange": name})


# One column per DD/D timepoint holding the log2 fold change of the genes that
# pass de_gene's filter; genes absent at a timepoint are filled with 0.
df_DD = pd.concat(
    [
        de_gene(table, label)
        for table, label in (
            (D12_allFC, 'DD/D 12'),
            (D24_allFC, 'DD/D 24'),
            (D36_allFC, 'DD/D 36'),
            (D48_allFC, 'DD/D 48'),
            (D60_allFC, 'DD/D 60'),
            (D72_allFC, 'DD/D 72'),
            (D84_allFC, 'DD/D 84'),
            (D96_allFC, 'DD/D 96'),
            (D120_allFC, 'DD/D 120'),
            (D168_allFC, 'DD/D 168'),
            (D216_allFC, 'DD/D 216'),
        )
    ],
    axis=1,
).fillna(0)

# Collapse fold changes to their direction: positive -> 1, negative -> -1, zero stays 0.
df_DD = df_DD.mask(df_DD > 0, 1).mask(df_DD < 0, -1)
[10]:
# Clustered heatmap of the DD/D matrix `df_DD` (genes x timepoints); columns are clustered as well.
x2 = sns.clustermap(
    df_DD,
    figsize=(5, 6),
    col_cluster=True,
    # share of the figure given to the (row, column) dendrograms
    dendrogram_ratio=(.1, .2),
    cmap="vlag",

    # colour bar placed as (left, bottom, width, height)
    cbar_pos=(1, 0.2, .03, .4)
)
x2.savefig("DD_D-all_DEgene_cluster.pdf")
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_14_0.png
[11]:
def cum_DE_gene_num(df_DE, DE, sample):
    """Total the up- or down-regulated entries of each column of *df_DE*.

    Despite the ``cum`` prefix the totals are computed per column, not
    accumulated across columns. For a matrix whose entries are -1/0/1 each
    total is the number of genes regulated in the requested direction.

    Parameters
    ----------
    df_DE : pandas.DataFrame
        Numeric matrix whose column labels look like ``"<sample> <hour>"``
        (the text after the first space becomes the row label of the result).
        The frame is not modified.
    DE : str
        ``"up"`` sums the positive entries, ``"down"`` sums the absolute value
        of the negative entries.
    sample : str
        Prefix of the result column, which is named ``f"{sample}_{DE}"``.

    Returns
    -------
    pandas.DataFrame or int
        A one-column frame indexed by the second token of each input column
        label. For any other value of *DE* the integer ``0`` is returned
        (kept as-is so existing callers see the same result).
    """
    # clip() builds a new frame, so the caller's matrix is never altered.
    if DE == "up":
        one_direction = df_DE.clip(lower=0)
    elif DE == "down":
        one_direction = df_DE.clip(upper=0)
    else:
        return 0

    counts = one_direction.abs().sum().to_frame(sample + "_" + DE)
    # Keep only the token after the first space ("CC/C 12" -> "12"). The
    # resulting index is named 1, the column label produced by the split.
    counts.index = counts.index.to_series().str.split(" ", expand=True)[1]
    return counts

# Per-timepoint up/down totals for each comparison. Copies are passed so that
# `df` and `df_DD` stay intact whatever the function does to its argument.
CC_C_up = cum_DE_gene_num(df.copy(), "up", "CC/C")
CC_C_down = cum_DE_gene_num(df.copy(), "down", "CC/C")

DD_D_up = cum_DE_gene_num(df_DD.copy(), "up", "DD/D")
DD_D_down = cum_DE_gene_num(df_DD.copy(), "down", "DD/D")
[12]:
# Wide table: one row per timepoint, one column per comparison/direction.
CC_DD_all_DE_num = pd.concat(
    (CC_C_up, CC_C_down, DD_D_up, DD_D_down),
    axis=1,
)
CC_DD_all_DE_num.to_csv("CC_DD_all_DE_num.csv", sep="\t")
# Long table (HAG, variable, value); the wide table itself keeps no HAG column.
CC_DD_all_DE_num_2plot = CC_DD_all_DE_num.assign(
    HAG=CC_DD_all_DE_num.index
).melt(id_vars=["HAG"])
[13]:
# Peek at the first two rows of the long-format table.
CC_DD_all_DE_num_2plot.head(2)
[13]:
HAG variable value
0 12 CC/C_up 1838.0
1 24 CC/C_up 1770.0
[14]:
# Display the wide table of DE totals (rows: timepoints, columns: comparison_direction).
CC_DD_all_DE_num
[14]:
CC/C_up CC/C_down DD/D_up DD/D_down
1
12 1838.0 2219.0 3212.0 1634.0
24 1770.0 1607.0 3548.0 1981.0
36 2807.0 1175.0 3104.0 1165.0
48 3594.0 1699.0 3410.0 1275.0
60 2900.0 884.0 4185.0 1534.0
72 2138.0 599.0 3656.0 1327.0
84 2043.0 601.0 2401.0 1094.0
96 2191.0 670.0 1968.0 533.0
120 2278.0 585.0 1541.0 482.0
168 1952.0 777.0 886.0 289.0
216 1222.0 732.0 1644.0 1036.0
[ ]:

[15]:
import numpy as np
import matplotlib.pyplot as plt

# Stacked bar chart of DE totals per timepoint: for each comparison the "up"
# bar sits on the baseline and the "down" bar is stacked on top of it. CC/C
# bars are drawn just left of each integer position and DD/D bars just right.
# A table with the same numbers is attached below the axes.
fig = plt.figure(figsize=(6, 4), constrained_layout=True)
ax = fig.add_gridspec(top=0.8, right=0.8).subplots()

# NOTE: no plt.subplots_adjust() here. With constrained_layout=True matplotlib
# does not apply it and only emits a UserWarning.

n_timepoints = CC_DD_all_DE_num.shape[0]
x_positions = np.arange(n_timepoints)
bar_width = 0.25

# Colour order matches the column order of CC_DD_all_DE_num, which is also the
# row order of the table drawn below.
colors = ["#C26364", "#2046F6", "#FBE5E5", "#8684F7"]
cell_text = []

# (horizontal shift of the left bar edge, columns stacked bottom-to-top)
bar_groups = [
    (-bar_width, ("CC/C_up", "CC/C_down")),
    (0.0, ("DD/D_up", "DD/D_down")),
]
color_cycle = iter(colors)
for x_shift, stacked_columns in bar_groups:
    y_offset = np.zeros(n_timepoints)  # each comparison starts from the baseline
    for column in stacked_columns:
        counts = CC_DD_all_DE_num[column]
        ax.bar(x_positions + x_shift, counts, bar_width, bottom=y_offset,
               color=next(color_cycle), align="edge")
        cell_text.append(counts)
        y_offset = y_offset + counts

the_table = ax.table(cellText=cell_text,
                     rowLabels=CC_DD_all_DE_num.columns,
                     rowColours=colors,
                     colLabels=CC_DD_all_DE_num.index,
                     loc='bottom', fontsize=12)

# The table's column labels replace the x tick labels.
ax.set(xticklabels=[], xticks=[], ylabel="DE gene number")
# Half a unit of padding on both sides, whatever the number of timepoints.
ax.set_xlim(-0.5, n_timepoints - 0.5)

plt.savefig("CC_DD_DE_genes_number.pdf")

<ipython-input-15-913aa41f5b3d>:7: UserWarning: This figure was using a layout engine that is incompatible with subplots_adjust and/or tight_layout; not calling subplots_adjust.
  plt.subplots_adjust(left=0.2, bottom=0.6)
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_20_1.png
[16]:
# Integer bar positions, one per row (timepoint) of the totals table.
np.array(range(0, CC_DD_all_DE_num.shape[0]))
[16]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
[17]:
# Zero-initialised table of cumulative DE gene counts: one row per timepoint,
# one column per comparison. Built directly from the scalar 0 so the columns
# are numeric from the start instead of relying on fillna() to downcast an
# all-NaN object frame.
accum_df = pd.DataFrame(0, index=CC_DD_all_DE_num.index, columns=["CC/C", "DD/D"])
[18]:
def _cumulative_de_counts(sign_df):
    """Count, per column, the rows that are non-zero in that column or any column to its left.

    Entries are expected to be -1/0/1 (direction of regulation), so the i-th
    value is the number of genes differentially expressed at one or more of
    the first i+1 timepoints. Returned as a float array, one value per column.
    """
    # A running maximum along each row turns "non-zero here" into
    # "non-zero here or earlier"; summing down the rows then counts the genes.
    seen_so_far = sign_df.ne(0).astype(int).cummax(axis=1)
    return seen_so_far.sum(axis=0).astype(float).to_numpy()


# Fill the two columns of accum_df by position (first: CC/C, second: DD/D);
# the rows of accum_df follow the column order of df / df_DD.
accum_df[accum_df.columns[0]] = _cumulative_de_counts(df)
accum_df[accum_df.columns[1]] = _cumulative_de_counts(df_DD)
[19]:
# Grouped bar chart of the cumulative DE gene counts held in accum_df
# (one group per row, one bar per column).
fig = plt.figure(figsize=(4,3),constrained_layout=True)
ax = fig.add_gridspec(top=0.8, right=0.8).subplots()

accum_df.plot.bar(ax=ax, color=['#5B7695','#E0CC71'])
ax.set(xlabel="HAG(h)", ylabel="cumulative DE gene number")

plt.savefig("CC_DD_cumulative_DE_genes_number.pdf")
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_24_0.png
[20]:
# Every row label of df / df_DD, i.e. every gene that passed the DE filter at
# one or more timepoints of the respective comparison.
CC_all_DE_genes = set(df.index)
DD_all_DE_genes = set(df_DD.index)

# Two-set Venn diagram of the CC/C and DD/D gene sets.
venn2([CC_all_DE_genes, DD_all_DE_genes], ('CC/C', 'DD/D'), set_colors=['#5B7695','#E0CC71'])
plt.savefig("CC_DD_DE_genes_overlap_number.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_25_0.png
[21]:
# Single-column (label 0) frames of the gene identifiers. Sets have no defined
# iteration order, so the identifiers are sorted first to give the frames, and
# anything exported from them, the same row order on every run.
# NOTE(review): assumes the identifiers are mutually comparable (e.g. all strings).
CC_all_DE_genes_df = pd.DataFrame(sorted(CC_all_DE_genes))
DD_all_DE_genes_df = pd.DataFrame(sorted(DD_all_DE_genes))

[22]:
# Keep the CC/C genes that also occur in the DD/D list (genes DE in both comparisons).
CC_DD_overlap_DEgenes = CC_all_DE_genes_df[CC_all_DE_genes_df[0].isin(DD_all_DE_genes_df[0])]
[286]:
# Write the shared gene identifiers, one per line, without header or row index.
CC_DD_overlap_DEgenes.to_csv("CC_DD_overlap_DEgenes.csv", sep="\t", header=False, index=False)
[ ]:

Overlap DE gene heatmaps

[25]:
import seaborn as sns
# Clustered heatmap of the CC/C matrix restricted to the genes shared with DD/D.
x = sns.clustermap(
    df.loc[CC_DD_overlap_DEgenes[0],:],
    figsize=(5, 6),
    col_cluster=True,
    # share of the figure given to the (row, column) dendrograms
    dendrogram_ratio=(.1, .2),
    cmap="vlag",

    # colour bar placed as (left, bottom, width, height)
    cbar_pos=(1, 0.2, .03, .4)
)
x.savefig("CC_C-overlap_DEgene_cluster.pdf")
#ax.set(xlabel="", yticklabels="")
#ax.xaxis.tick_top()
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_31_0.png
[26]:
import seaborn as sns
# Clustered heatmap of the DD/D matrix restricted to the genes shared with CC/C.
x = sns.clustermap(
    df_DD.loc[CC_DD_overlap_DEgenes[0],:],
    figsize=(5, 6),
    col_cluster=True,
    # share of the figure given to the (row, column) dendrograms
    dendrogram_ratio=(.1, .2),
    cmap="vlag",

    # colour bar placed as (left, bottom, width, height)
    cbar_pos=(1, 0.2, .03, .4)
)
x.savefig("DD_D-overlap_DEgene_cluster.pdf")
../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_32_0.png
[ ]:

Log2 fold-change tables for Mfuzz

[303]:
def get_log2fc(df, name, overlap_genes=None):
    """Return the log2 fold changes of the DE rows of *df* that belong to *overlap_genes*.

    A row is kept when ``abs(log2FoldChange) >= 1``, ``padj < 0.01`` and its
    index label is contained in *overlap_genes*.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``log2FoldChange`` and ``padj`` columns. It is not modified.
    name : str
        Label given to the single column of the result.
    overlap_genes : list-like, optional
        Gene identifiers to keep. When omitted, column ``0`` of the
        module-level ``CC_DD_overlap_DEgenes`` frame is looked up at call
        time, so a recomputed overlap is picked up without redefining the
        function.

    Returns
    -------
    pandas.DataFrame
        One column called *name* holding ``log2FoldChange`` of the kept rows.
    """
    if overlap_genes is None:
        # Resolved on every call rather than frozen when the function is defined.
        overlap_genes = CC_DD_overlap_DEgenes[0]

    # Boolean masks instead of a helper column: the caller's frame stays untouched.
    is_de = (df["log2FoldChange"].abs() >= 1) & (df["padj"] < 0.01)
    in_overlap = df.index.isin(overlap_genes)
    return df.loc[is_de & in_overlap, ["log2FoldChange"]].rename(columns={"log2FoldChange": name})
[302]:
# log2 fold changes of the shared DE genes, one column per DD/D timepoint;
# timepoints at which a gene is not DE are filled with 0.
DD_overlap_log2fc = pd.concat(
    [
        get_log2fc(table, label)
        for table, label in (
            (D12_allFC, 'DD/D 12'),
            (D24_allFC, 'DD/D 24'),
            (D36_allFC, 'DD/D 36'),
            (D48_allFC, 'DD/D 48'),
            (D60_allFC, 'DD/D 60'),
            (D72_allFC, 'DD/D 72'),
            (D84_allFC, 'DD/D 84'),
            (D96_allFC, 'DD/D 96'),
            (D120_allFC, 'DD/D 120'),
            (D168_allFC, 'DD/D 168'),
            (D216_allFC, 'DD/D 216'),
        )
    ],
    axis=1,
).fillna(0)

# Same table for the CC/C timepoints.
CC_overlap_log2fc = pd.concat(
    [
        get_log2fc(table, label)
        for table, label in (
            (C12_allFC, 'CC/C 12'),
            (C24_allFC, 'CC/C 24'),
            (C36_allFC, 'CC/C 36'),
            (C48_allFC, 'CC/C 48'),
            (C60_allFC, 'CC/C 60'),
            (C72_allFC, 'CC/C 72'),
            (C84_allFC, 'CC/C 84'),
            (C96_allFC, 'CC/C 96'),
            (C120_allFC, 'CC/C 120'),
            (C168_allFC, 'CC/C 168'),
            (C216_allFC, 'CC/C 216'),
        )
    ],
    axis=1,
).fillna(0)

[305]:
CC_overlap_log2fc.to_csv("CC_overlap_log2fc.csv", sep="\t")
[306]:
DD_overlap_log2fc.to_csv("DD_overlap_log2fc.csv", sep="\t")
[ ]:

[ ]:

[ ]:

[ ]: