CC DD DE exp overlap

Contents

CC DD DE exp overlap#

Statistics and overlap analysis of the differentially expressed (DE) genes from the CC/C and DD/D comparisons.

[1]:

import pandas as pd
import os, glob
import matplotlib.pyplot as plt
import numpy as np

import matplotlib as mpl
from matplotlib_venn import venn3,venn2
# Embed fonts as Type 42 (TrueType) in both PDF and PostScript output.
mpl.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

/Users/yuanzan/anaconda3/lib/python3.8/site-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED

all CC DE genes#

[2]:

def read_allFC_data(filepath):
    """Load an Excel result table and keep only its statistics columns.

    The first column of the sheet is used as the index. The returned frame
    holds ``log2FoldChange``, ``regulated``, ``padj`` and ``pvalue``, in that
    order; a missing column raises ``KeyError``.
    """
    wanted = ["log2FoldChange", "regulated", "padj", "pvalue"]
    table = pd.read_excel(filepath, index_col=0)
    return table[wanted]

def read_allFC_data2(filepath):
    """Load a tab-separated result table and keep only its statistics columns.

    Same contract as ``read_allFC_data`` but the file is parsed with
    ``pd.read_table``: the first column becomes the index and the result
    holds ``log2FoldChange``, ``regulated``, ``padj`` and ``pvalue``.
    """
    wanted = ["log2FoldChange", "regulated", "padj", "pvalue"]
    table = pd.read_table(filepath, index_col=0)
    return table[wanted]

# Directory holding the Excel "all genes" tables of the CC/C comparison
# (path components are Chinese for "tomato transcriptome data analysis" / "all genes").
allFC_dir = "~/Documents/phd/tomato_metabolic/data/番茄转录组数据分析/CC:C/全部基因/"
# Directory whose "newdata/" sub-folder holds the tab-separated CC/C tables.
allFC_dir2 = "~/Documents/phd/tomato_metabolic/upsetR/CC:C/"

[3]:

# 24 h and 84 h tables are read from the tab-separated files under "newdata/".
C24_allFC, C84_allFC = (
    read_allFC_data2(allFC_dir2 + tsv_file)
    for tsv_file in (
        "newdata/C24_rmC24-3_new_DE.xls",
        "newdata/C84_rmC84-2_new_DE.xls",
    )
)

[4]:

# CC/C tables stored as Excel workbooks (every time point except 24 h and 84 h).
(
    C12_allFC,
    C36_allFC,
    C48_allFC,
    C60_allFC,
    C72_allFC,
    C96_allFC,
    C120_allFC,
    C168_allFC,
    C216_allFC,
) = (
    read_allFC_data(allFC_dir + workbook)
    for workbook in (
        "C12h_vs_CC12h.all.annot.xlsx",
        "C36h_vs_CC36h.all.annot.xlsx",
        "C48h_vs_CC48h.all.annot.xlsx",
        "C60h_vs_CC60h.all.annot.xlsx",
        "C72h_vs_CC72h.all.annot.xlsx",
        "C96h_vs_CC96h.all.annot.xlsx",
        "C120h_vs_CC120h.all.annot.xlsx",
        "C168h_vs_CC168h.all.annot.xlsx",
        "C216h_vs_CC216h.all.annot.xlsx",
    )
)

# 24 h and 84 h tables are read from the tab-separated files under "newdata/".
C24_allFC, C84_allFC = (
    read_allFC_data2(allFC_dir2 + tsv_file)
    for tsv_file in (
        "newdata/C24_rmC24-3_new_DE.xls",
        "newdata/C84_rmC84-2_new_DE.xls",
    )
)

[5]:

def de_gene(df, name):
    """Return the log2 fold changes of the significant DE genes in ``df``.

    A row is kept when ``|log2FoldChange| >= 1`` and ``padj < 0.01``; rows
    with a missing ``log2FoldChange`` or ``padj`` fail the comparison and are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the ``log2FoldChange`` and ``padj`` columns.
        It is left unmodified.
    name : str
        Label given to the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One column called ``name`` with the ``log2FoldChange`` of the kept
        rows, in their original order and with their original index.
    """
    # A boolean mask avoids writing a helper column into the caller's frame.
    significant = (df["log2FoldChange"].abs() >= 1) & (df["padj"] < 0.01)
    selected = df.loc[significant, ["log2FoldChange"]]
    return selected.rename(columns={"log2FoldChange": name})


# One column per CC/C time point, one row per gene that is DE at any of them;
# genes that are not DE at a given time point get 0.
df = pd.concat(
    [
        de_gene(table, label)
        for table, label in (
            (C12_allFC, 'CC/C 12'),
            (C24_allFC, 'CC/C 24'),
            (C36_allFC, 'CC/C 36'),
            (C48_allFC, 'CC/C 48'),
            (C60_allFC, 'CC/C 60'),
            (C72_allFC, 'CC/C 72'),
            (C84_allFC, 'CC/C 84'),
            (C96_allFC, 'CC/C 96'),
            (C120_allFC, 'CC/C 120'),
            (C168_allFC, 'CC/C 168'),
            (C216_allFC, 'CC/C 216'),
        )
    ],
    axis=1,
).fillna(0)

[6]:

# Reduce every fold change to its direction: 1 (up), -1 (down), 0 (unchanged).
df[df.gt(0)] = 1
df[df.lt(0)] = -1

[ ]:

[ ]:

[7]:

import seaborn as sns

# Clustered heatmap of the CC/C direction matrix, written out as a PDF.
x = sns.clustermap(
    df,
    cmap="vlag",
    col_cluster=True,
    figsize=(5, 6),
    dendrogram_ratio=(0.1, 0.2),
    cbar_pos=(1, 0.2, 0.03, 0.4),
)
x.savefig("CC_C-all_DEgene_cluster.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_10_0.png

all DD DE genes#

[8]:

# Directory holding the Excel "all genes" tables of the DD/D comparison.
DDallFC_dir = "~/Documents/phd/tomato_metabolic/data/番茄转录组数据分析/DD:D/全部基因/"

(
    D12_allFC,
    D24_allFC,
    D36_allFC,
    D48_allFC,
    D60_allFC,
    D72_allFC,
    D84_allFC,
    D96_allFC,
    D120_allFC,
    D168_allFC,
    D216_allFC,
) = (
    read_allFC_data(DDallFC_dir + workbook)
    for workbook in (
        "D12h_vs_DD12h.all.annot.xlsx",
        "D24h_vs_DD24h.all.annot.xlsx",
        "D36h_vs_DD36h.all.annot.xlsx",
        "D48h_vs_DD48h.all.annot.xlsx",
        "D60h_vs_DD60h.all.annot.xlsx",
        "D72h_vs_DD72h.all.annot.xlsx",
        "D84h_vs_DD84h.all.annot.xlsx",
        "D96h_vs_DD96h.all.annot.xlsx",
        "D120h_vs_DD120h.all.annot.xlsx",
        "D168h_vs_DD168h.all.annot.xlsx",
        "D216h_vs_DD216h.all.annot.xlsx",
    )
)

[9]:

def de_gene(df, name):
    """Return the log2 fold changes of the significant DE genes in ``df``.

    A row is kept when ``|log2FoldChange| >= 1`` and ``padj < 0.01``; rows
    with a missing ``log2FoldChange`` or ``padj`` fail the comparison and are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the ``log2FoldChange`` and ``padj`` columns.
        It is left unmodified.
    name : str
        Label given to the single column of the result.

    Returns
    -------
    pandas.DataFrame
        One column called ``name`` with the ``log2FoldChange`` of the kept
        rows, in their original order and with their original index.
    """
    # A boolean mask avoids writing a helper column into the caller's frame.
    significant = (df["log2FoldChange"].abs() >= 1) & (df["padj"] < 0.01)
    selected = df.loc[significant, ["log2FoldChange"]]
    return selected.rename(columns={"log2FoldChange": name})


# One column per DD/D time point, one row per gene that is DE at any of them;
# genes that are not DE at a given time point get 0.
df_DD = pd.concat(
    [
        de_gene(table, label)
        for table, label in (
            (D12_allFC, 'DD/D 12'),
            (D24_allFC, 'DD/D 24'),
            (D36_allFC, 'DD/D 36'),
            (D48_allFC, 'DD/D 48'),
            (D60_allFC, 'DD/D 60'),
            (D72_allFC, 'DD/D 72'),
            (D84_allFC, 'DD/D 84'),
            (D96_allFC, 'DD/D 96'),
            (D120_allFC, 'DD/D 120'),
            (D168_allFC, 'DD/D 168'),
            (D216_allFC, 'DD/D 216'),
        )
    ],
    axis=1,
).fillna(0)

# Reduce every fold change to its direction: 1 (up), -1 (down), 0 (unchanged).
df_DD[df_DD.gt(0)] = 1
df_DD[df_DD.lt(0)] = -1

[10]:

# Clustered heatmap of the DD/D direction matrix, written out as a PDF.
x2 = sns.clustermap(
    df_DD,
    cmap="vlag",
    col_cluster=True,
    figsize=(5, 6),
    dendrogram_ratio=(0.1, 0.2),
    cbar_pos=(1, 0.2, 0.03, 0.4),
)
x2.savefig("DD_D-all_DEgene_cluster.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_14_0.png

[11]:

def cum_DE_gene_num(df_DE, DE, sample):
    """Sum the up- or down-regulated entries of every column of ``df_DE``.

    For ``DE == "up"`` the negative entries are treated as 0, for
    ``DE == "down"`` the positive ones; the absolute values that remain are
    summed per column. On a matrix of 1 / -1 / 0 this is the number of up-
    or down-regulated genes per column.

    Parameters
    ----------
    df_DE : pandas.DataFrame
        Numeric matrix whose column labels contain a space, e.g.
        ``"CC/C 12"``. It is left unmodified.
    DE : str
        ``"up"`` or ``"down"``.
    sample : str
        Prefix of the result column, which is named ``sample + "_" + DE``.

    Returns
    -------
    pandas.DataFrame or int
        A one-column frame indexed by the second space-separated token of
        each input column label (the index is named ``1``), or the integer
        ``0`` when ``DE`` is neither ``"up"`` nor ``"down"``.
    """
    if DE == "up":
        # clip returns a new frame, so the caller's data is not overwritten.
        one_sided = df_DE.clip(lower=0)
    elif DE == "down":
        one_sided = df_DE.clip(upper=0)
    else:
        # Existing sentinel for an unknown direction; kept for compatibility.
        return 0

    totals = one_sided.abs().sum().to_frame(name=sample + "_" + DE)
    # "CC/C 12" -> "12"; taking column 1 of the split keeps the index name 1.
    totals.index = totals.index.to_series().str.split(" ", expand=True)[1]
    return totals

# Per-time-point up / down counts for each comparison; every call gets its
# own copy of the direction matrix.
CC_C_up, CC_C_down = (
    cum_DE_gene_num(df.copy(), direction, "CC/C") for direction in ("up", "down")
)
DD_D_up, DD_D_down = (
    cum_DE_gene_num(df_DD.copy(), direction, "DD/D") for direction in ("up", "down")
)

[12]:

# Wide table: one row per time point, one column per comparison/direction.
CC_DD_all_DE_num = pd.concat(
    [CC_C_up, CC_C_down, DD_D_up, DD_D_down],
    axis=1,
)
CC_DD_all_DE_num.to_csv("CC_DD_all_DE_num.csv", sep="\t")

# Long form with the index exposed as the "HAG" column; the wide table itself
# ends up without that column.
CC_DD_all_DE_num_2plot = CC_DD_all_DE_num.assign(
    HAG=CC_DD_all_DE_num.index
).melt(id_vars=["HAG"])

[13]:

# Preview the first two rows of the long-form table.
CC_DD_all_DE_num_2plot.head(2)

[13]:

	HAG	variable	value
0	12	CC/C_up	1838.0
1	24	CC/C_up	1770.0

[14]:

# Display the wide table of up / down counts per time point.
CC_DD_all_DE_num

[14]:

	CC/C_up	CC/C_down	DD/D_up	DD/D_down
1
12	1838.0	2219.0	3212.0	1634.0
24	1770.0	1607.0	3548.0	1981.0
36	2807.0	1175.0	3104.0	1165.0
48	3594.0	1699.0	3410.0	1275.0
60	2900.0	884.0	4185.0	1534.0
72	2138.0	599.0	3656.0	1327.0
84	2043.0	601.0	2401.0	1094.0
96	2191.0	670.0	1968.0	533.0
120	2278.0	585.0	1541.0	482.0
168	1952.0	777.0	886.0	289.0
216	1222.0	732.0	1644.0	1036.0

[ ]:

[15]:

import numpy as np
import matplotlib.pyplot as plt

# constrained_layout positions the axes itself. The former
# plt.subplots_adjust(left=0.2, bottom=0.6) call was rejected by Matplotlib
# with a UserWarning and never applied, so it has been removed.
fig = plt.figure(figsize=(6,4),constrained_layout=True)
ax = fig.add_gridspec(top=0.8, right=0.8).subplots()

n_timepoints = CC_DD_all_DE_num.shape[0]
positions = np.arange(n_timepoints)
bar_width = 0.25

colors = ["#C26364", "#2046F6", "#FBE5E5", "#8684F7"]
cell_text = []
palette = iter(colors)

# Two stacked bars per time point: CC/C sits left of the tick position and
# DD/D right of it; within each bar "down" is stacked on top of "up".
for shift, stacked_columns in (
    (-bar_width, ("CC/C_up", "CC/C_down")),
    (0, ("DD/D_up", "DD/D_down")),
):
    y_offset = np.zeros(n_timepoints)
    for column in stacked_columns:
        counts = CC_DD_all_DE_num[column]
        ax.bar(positions + shift, counts, bar_width, bottom=y_offset,
               color=next(palette), align="edge")
        # Table rows are appended in plotting order, which has to match the
        # column order of CC_DD_all_DE_num used for rowLabels below.
        cell_text.append(counts)
        y_offset = y_offset + counts

the_table = ax.table(cellText=cell_text,
                      rowLabels=CC_DD_all_DE_num.columns,
                      rowColours=colors,
                      colLabels=CC_DD_all_DE_num.index,
                      loc='bottom', fontsize=12)

ax.set(xticklabels=[], xticks=[], ylabel="DE gene number")
# Half a unit of margin on both sides, whatever the number of time points.
ax.set_xlim(-0.5, n_timepoints - 0.5)

plt.savefig("CC_DD_DE_genes_number.pdf")

<ipython-input-15-913aa41f5b3d>:7: UserWarning: This figure was using a layout engine that is incompatible with subplots_adjust and/or tight_layout; not calling subplots_adjust.
  plt.subplots_adjust(left=0.2, bottom=0.6)

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_20_1.png

[16]:

# Integer x positions used for the bars, one per row of the summary table.
np.arange(len(CC_DD_all_DE_num))

[16]:

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

[17]:

# Cumulative DE gene counts per time point, initialised to zero. Passing the
# scalar to the constructor yields integer columns directly instead of
# building an all-NaN object frame and relying on fillna to downcast it.
accum_df = pd.DataFrame(0, index=CC_DD_all_DE_num.index, columns=["CC/C", "DD/D"])

[18]:

def _cumulative_de_counts(direction_df):
    """Return, per column, how many rows are non-zero in that column or any column to its left.

    ``direction_df`` is expected to hold 1 / -1 / 0 entries. The running
    row-wise sum of absolute values is capped at 1 once it reaches 1 and the
    capped values are then summed down each column.
    """
    running = abs(direction_df).cumsum(axis=1)
    running = running.mask(running >= 1, 1)
    return running.sum(axis=0).to_numpy()


# Column 0 of accum_df is CC/C and column 1 is DD/D; rows follow the order
# of the time-point columns of df / df_DD.
for column_pos, direction_df in enumerate((df, df_DD)):
    cumulative = _cumulative_de_counts(direction_df)
    accum_df.iloc[:len(cumulative), column_pos] = cumulative

[19]:

# Grouped bar chart of the cumulative DE gene counts, saved as a PDF.
fig = plt.figure(figsize=(4, 3), constrained_layout=True)
ax = fig.add_gridspec(top=0.8, right=0.8).subplots()

accum_df.plot(kind="bar", ax=ax, color=['#5B7695', '#E0CC71'])
ax.set_xlabel("HAG(h)")
ax.set_ylabel("cumulative DE gene number")

plt.savefig("CC_DD_cumulative_DE_genes_number.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_24_0.png

[20]:

# Genes that are DE at one or more time points, per comparison.
CC_all_DE_genes = set(df.index)
DD_all_DE_genes = set(df_DD.index)

# Two-set Venn diagram of the gene sets, saved as a PDF.
venn2(
    [CC_all_DE_genes, DD_all_DE_genes],
    set_labels=('CC/C', 'DD/D'),
    set_colors=['#5B7695', '#E0CC71'],
)
plt.savefig("CC_DD_DE_genes_overlap_number.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_25_0.png

[21]:

# Single-column frames (column 0) of the gene IDs. The IDs are sorted because
# iterating a set yields an arbitrary order, which made the row order of these
# frames, and of anything derived from them, differ between runs.
CC_all_DE_genes_df = pd.DataFrame(sorted(CC_all_DE_genes))
DD_all_DE_genes_df = pd.DataFrame(sorted(DD_all_DE_genes))

[22]:

# Keep the CC/C gene IDs that are also present in the DD/D list.
CC_DD_overlap_DEgenes = CC_all_DE_genes_df.loc[
    CC_all_DE_genes_df[0].isin(DD_all_DE_genes_df[0])
]

[286]:

# Write the overlapping gene IDs, one per line, without header or row index.
CC_DD_overlap_DEgenes.to_csv(
    "CC_DD_overlap_DEgenes.csv",
    index=False,
    header=False,
    sep="\t",
)

[ ]:

overlap DE gene heatmap#

[25]:

import seaborn as sns

# Clustered heatmap of the CC/C direction matrix restricted to the genes
# shared with DD/D, written out as a PDF.
x = sns.clustermap(
    df.loc[CC_DD_overlap_DEgenes[0], :],
    cmap="vlag",
    col_cluster=True,
    figsize=(5, 6),
    dendrogram_ratio=(0.1, 0.2),
    cbar_pos=(1, 0.2, 0.03, 0.4),
)
x.savefig("CC_C-overlap_DEgene_cluster.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_31_0.png

[26]:

import seaborn as sns

# Clustered heatmap of the DD/D direction matrix restricted to the genes
# shared with CC/C, written out as a PDF.
x = sns.clustermap(
    df_DD.loc[CC_DD_overlap_DEgenes[0], :],
    cmap="vlag",
    col_cluster=True,
    figsize=(5, 6),
    dendrogram_ratio=(0.1, 0.2),
    cbar_pos=(1, 0.2, 0.03, 0.4),
)
x.savefig("DD_D-overlap_DEgene_cluster.pdf")

../../_images/notebooks_CC_DD_DE_overlap_CC_DD_DE_overlap_32_0.png

[ ]:

Export log2 fold changes of the overlap DE genes for Mfuzz#

[303]:

def get_log2fc(df, name, overlap_genes=CC_DD_overlap_DEgenes[0]):
    """Return the log2 fold changes of significant DE genes listed in ``overlap_genes``.

    A row is kept when ``|log2FoldChange| >= 1``, ``padj < 0.01`` and its
    index label is contained in ``overlap_genes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the ``log2FoldChange`` and ``padj`` columns.
        It is left unmodified.
    name : str
        Label given to the single column of the result.
    overlap_genes : list-like
        Gene IDs to retain. NOTE: the default is evaluated once, when this
        function is defined, so re-run the definition after
        ``CC_DD_overlap_DEgenes`` changes.

    Returns
    -------
    pandas.DataFrame
        One column called ``name`` with the ``log2FoldChange`` of the kept
        rows, in their original order and with their original index.
    """
    # A boolean mask avoids writing a helper column into the caller's frame.
    keep = (
        (df["log2FoldChange"].abs() >= 1)
        & (df["padj"] < 0.01)
        & df.index.isin(overlap_genes)
    )
    selected = df.loc[keep, ["log2FoldChange"]]
    return selected.rename(columns={"log2FoldChange": name})

[302]:

# log2 fold changes of the overlap genes across the DD/D time points; a gene
# that is not DE at a given time point gets 0.
DD_overlap_log2fc = pd.concat(
    [
        get_log2fc(table, label)
        for table, label in (
            (D12_allFC, 'DD/D 12'),
            (D24_allFC, 'DD/D 24'),
            (D36_allFC, 'DD/D 36'),
            (D48_allFC, 'DD/D 48'),
            (D60_allFC, 'DD/D 60'),
            (D72_allFC, 'DD/D 72'),
            (D84_allFC, 'DD/D 84'),
            (D96_allFC, 'DD/D 96'),
            (D120_allFC, 'DD/D 120'),
            (D168_allFC, 'DD/D 168'),
            (D216_allFC, 'DD/D 216'),
        )
    ],
    axis=1,
).fillna(0)



# log2 fold changes of the overlap genes across the CC/C time points; a gene
# that is not DE at a given time point gets 0.
CC_overlap_log2fc = pd.concat(
    [
        get_log2fc(table, label)
        for table, label in (
            (C12_allFC, 'CC/C 12'),
            (C24_allFC, 'CC/C 24'),
            (C36_allFC, 'CC/C 36'),
            (C48_allFC, 'CC/C 48'),
            (C60_allFC, 'CC/C 60'),
            (C72_allFC, 'CC/C 72'),
            (C84_allFC, 'CC/C 84'),
            (C96_allFC, 'CC/C 96'),
            (C120_allFC, 'CC/C 120'),
            (C168_allFC, 'CC/C 168'),
            (C216_allFC, 'CC/C 216'),
        )
    ],
    axis=1,
).fillna(0)

[305]:

# Tab-separated export of the CC/C log2 fold-change table.
CC_overlap_log2fc.to_csv(path_or_buf="CC_overlap_log2fc.csv", sep="\t")

[306]:

# Tab-separated export of the DD/D log2 fold-change table.
DD_overlap_log2fc.to_csv(path_or_buf="DD_overlap_log2fc.csv", sep="\t")

[ ]:

[ ]:

[ ]:

[ ]: