greatpy vs gprofiler

This notebook compares the results obtained with greatpy and gprofiler packages.

[1]:
%load_ext autoreload
%autoreload 2
[4]:
import greatpy as great
from gprofiler import GProfiler
import pandas as pd
from numpy import nan
import matplotlib.pyplot as plt

Compute the results

with greatpy

[5]:
test = "../data/tests/test_data/input/03_srf_hg19.bed"
regdom = "../data/human/hg19/regulatory_domain.bed"
size = "../data/human/hg19/chr_size.bed"
[6]:
enrichment = great.tl.enrichment(
    test_file = test,
    regdom_file = regdom,
    chr_size_file = size,
    annotation_file = "../data/human/ontologies.csv",
    binom = True,
    hypergeom = True,
    )
enrichment = great.tl.set_fdr(enrichment)
enrichment = great.tl.set_bonferroni(enrichment)
enrichment
[6]:
go_term binom_p_value binom_fold_enrichment hypergeom_p_value hypergeometric_fold_enrichment intersection_size recall binom_bonferroni hypergeom_bonferroni binom_fdr hypergeom_fdr
GO:0072749 cellular response to cytochalasin B 2.21968e-12 2.27251e+05 4.28032e-02 2.33627e+01 5 5.00000e+00 9.23386e-09 1.00000e+00 4.61693e-09 4.65089e-01
GO:0051623 positive regulation of norepinephrine uptake 2.21968e-12 2.27251e+05 4.28032e-02 2.33627e+01 5 5.00000e+00 9.23386e-09 1.00000e+00 4.61693e-09 4.65089e-01
GO:0098973 structural constituent of postsynaptic actin c... 2.11740e-10 9.10526e+04 1.60543e-01 5.84068e+00 5 1.25000e+00 8.80839e-07 1.00000e+00 2.93613e-07 5.43858e-01
GO:0097433 dense body 6.40085e-10 1.60618e+04 1.41783e-03 1.16814e+01 8 1.33333e+00 2.66275e-06 1.00000e+00 6.65688e-07 3.65408e-01
GO:0032796 uropod organization 2.69880e-09 5.45449e+04 1.82991e-03 2.33627e+01 5 2.50000e+00 1.12270e-05 1.00000e+00 1.93909e-06 3.65408e-01
... ... ... ... ... ... ... ... ... ... ... ...
GO:0007156 homophilic cell adhesion via plasma membrane a... 9.99960e-01 1.12747e+02 9.84651e-01 3.89379e-01 3 1.66667e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0007186 G protein-coupled receptor signaling pathway 9.99992e-01 2.46644e+02 1.00000e+00 3.07404e-01 19 1.47059e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0016021 integral component of membrane 9.99997e-01 3.94493e+02 9.99998e-01 7.05221e-01 112 2.88958e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0005887 integral component of plasma membrane 1.00000e+00 2.81852e+02 9.99970e-01 5.84068e-01 40 2.50000e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0005886 plasma membrane 1.00000e+00 3.98798e+02 1.00000e+00 2.55733e-01 159 1.06776e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00

4160 rows × 11 columns

[7]:
plot = enrichment
plot = plot.rename(columns = {"binom_p_value" : "p_value", "go_term":"name"})
plt.figure(figsize = (10,10))
great.pl.plot_enrich(plot,color = "Reds")
../_images/notebooks_04_hypergeom_vs_gprofiler_8_0.png

With gprofile

Preprocessing

[8]:
L = []
f = open(test)
cdc = f.readline()
while cdc !=  "":
    cdc = cdc.split("\t")
    cdc = ":".join(cdc)
    cdc = cdc.strip()
    L.append(cdc[3:])
    cdc = f.readline()
[9]:
back = pd.read_csv(regdom,sep = "\t",comment = "#",
                    names = ["Chr", "Chr_Start", "Chr_End","Name","tss","Strand"],dtype = {"Chr":"object", "Chr_Start":"int64", "Chr_End":"int64","Name":"object","tss":"int64","Strand":"object"})
back = list(back["Name"])

Computation

[10]:
gp_no_correct = GProfiler(return_dataframe = True)
gp_no_correct = gp_no_correct.profile(organism = 'hsapiens',
            query = L,
            all_results = True,
            background = back,user_threshold = 1,
            )

gp_no_correct = gp_no_correct.loc[(gp_no_correct["source"]  ==  "GO:MF")|(gp_no_correct["source"]  ==  "GO:CC")|(gp_no_correct["source"]  ==  "GO:BP")]
[11]:
gp_no_correct
[11]:
source native name p_value significant description term_size query_size intersection_size effective_domain_size precision recall query parents
105 GO:MF GO:0000062 fatty-acyl-CoA binding 1.00000e+00 False "Binding to a fatty-acyl-CoA, any derivative o... 21 277 1 17730 3.61011e-03 4.76190e-02 query_1 [GO:0120227, GO:1901567]
9867 GO:BP GO:0044725 chromatin reprogramming in the zygote 1.00000e+00 False "The global reprogramming of epigenetic modifi... 2 277 1 17730 3.61011e-03 5.00000e-01 query_1 [GO:0043045]
9868 GO:BP GO:0044721 protein import into peroxisome matrix, substra... 1.00000e+00 False "The process by which the cargo protein is rel... 1 277 1 17730 3.61011e-03 1.00000e+00 query_1 [GO:0016558, GO:0032984]
9869 GO:BP GO:0044706 multi-multicellular organism process 1.00000e+00 False "A multicellular organism process which involv... 207 277 3 17730 1.08303e-02 1.44928e-02 query_1 [GO:0032501]
9870 GO:BP GO:0044703 multi-organism reproductive process 1.00000e+00 False "A biological process that directly contribute... 198 277 2 17730 7.22022e-03 1.01010e-02 query_1 [GO:0022414]
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18181 GO:BP GO:2000379 positive regulation of reactive oxygen species... 1.00000e+00 False "Any process that activates or increases the f... 65 277 1 17730 3.61011e-03 1.53846e-02 query_1 [GO:0031325, GO:0072593, GO:2000377]
18182 GO:BP GO:2000377 regulation of reactive oxygen species metaboli... 1.00000e+00 False "Any process that modulates the frequency, rat... 134 277 1 17730 3.61011e-03 7.46269e-03 query_1 [GO:0031323, GO:0072593]
18183 GO:BP GO:2000370 positive regulation of clathrin-dependent endo... 1.00000e+00 False "Any process that activates or increases the f... 5 277 1 17730 3.61011e-03 2.00000e-01 query_1 [GO:0048260, GO:0072583, GO:2000369]
18184 GO:BP GO:2000369 regulation of clathrin-dependent endocytosis 1.00000e+00 False "Any process that modulates the frequency, rat... 18 277 2 17730 7.22022e-03 1.11111e-01 query_1 [GO:0048259, GO:0072583]
18185 GO:BP GO:2000406 positive regulation of T cell migration 1.00000e+00 False "Any process that activates or increases the f... 31 277 1 17730 3.61011e-03 3.22581e-02 query_1 [GO:0072678, GO:2000403, GO:2000404]

5102 rows × 14 columns

[12]:
plt.figure(figsize = (10,10))
great.pl.plot_enrich(gp_no_correct,color = "Reds")
../_images/notebooks_04_hypergeom_vs_gprofiler_16_0.png

Add the GProfiler pvalue on the BinomP_and_HypergeomP df

[13]:
val = []
for i in enrichment.index:
    try :
        val.append(float(gp_no_correct.loc[gp_no_correct["native"]  ==  i]["p_value"]))
    except:
        val.append(nan)
enrichment["gprofile_p_val"] = val

Plot the p-value without correction

[19]:
fig,ax = plt.subplots(2,2,figsize = (20,12))
great.pl.scatterplot(enrichment,"hypergeom_p_value","gprofile_p_val",minus_log10 = False,ax = ax[0,0],title = "hypergeom vs gprofiler")
great.pl.scatterplot(enrichment,"hypergeom_p_value","gprofile_p_val",minus_log10 = True,ax = ax[0,1],title = "hypergeom vs gprofiler with log scale")
great.pl.scatterplot(enrichment,"binom_p_value","gprofile_p_val",minus_log10 = False,ax = ax[1,0],title = "binom vs gprofiler")
great.pl.scatterplot(enrichment,"binom_p_value","gprofile_p_val",minus_log10 = True,ax = ax[1,1],title = "binom vs gprofiler with log scale")
plt.show()
../_images/notebooks_04_hypergeom_vs_gprofiler_20_0.png

We observe very different results. Indeed, GProfiler does not obtain any significant probability.

It is not possible to determine the p-value without multiple correction with GProfiler.

It is therefore inconsistent to compare the uncorrected values of the multiple tests obtained with greatpy and the results of Gprofiler

Comparison with a fdr correction

[15]:
gp_correct_fdr = GProfiler(return_dataframe = True)
gp_correct_fdr = gp_correct_fdr.profile(organism = 'hsapiens',
            query = L,
            all_results = True,
            background = back,
            user_threshold = 1,
            significance_threshold_method = "fdr")

gp_correct_fdr = gp_correct_fdr.loc[(gp_correct_fdr["source"]  ==  "GO:MF")|(gp_correct_fdr["source"]  ==  "GO:CC")|(gp_correct_fdr["source"]  ==  "GO:BP")]

gp_correct_bonferroni = GProfiler(return_dataframe = True)
gp_correct_bonferroni = gp_correct_bonferroni.profile(organism = 'hsapiens',
            query = L,
            all_results = True,
            background = back,
            user_threshold = 1,
            significance_threshold_method = "fdr")

gp_correct_bonferroni = gp_correct_bonferroni.loc[(gp_correct_bonferroni["source"]  ==  "GO:MF")|(gp_correct_bonferroni["source"]  ==  "GO:CC")|(gp_correct_bonferroni["source"]  ==  "GO:BP")]
[16]:
val = []
for i in enrichment.index:
    try :
        val.append(float(gp_correct_fdr.loc[gp_correct_fdr["native"]  ==  i]["p_value"]))
    except:
        val.append(nan)
enrichment["gprofile_p_val_fdr"] = val

val = []
for i in enrichment.index:
    try :
        val.append(float(gp_correct_bonferroni.loc[gp_correct_bonferroni["native"]  ==  i]["p_value"]))
    except:
        val.append(nan)
enrichment["gprofile_p_val_bonferroni"] = val

Plot of the p-value with correction and bonferroni

[17]:
enrichment
[17]:
go_term binom_p_value binom_fold_enrichment hypergeom_p_value hypergeometric_fold_enrichment intersection_size recall binom_bonferroni hypergeom_bonferroni binom_fdr hypergeom_fdr gprofile_p_val gprofile_p_val_fdr gprofile_p_val_bonferroni
GO:0072749 cellular response to cytochalasin B 2.21968e-12 2.27251e+05 4.28032e-02 2.33627e+01 5 5.00000e+00 9.23386e-09 1.00000e+00 4.61693e-09 4.65089e-01 NaN NaN NaN
GO:0051623 positive regulation of norepinephrine uptake 2.21968e-12 2.27251e+05 4.28032e-02 2.33627e+01 5 5.00000e+00 9.23386e-09 1.00000e+00 4.61693e-09 4.65089e-01 NaN NaN NaN
GO:0098973 structural constituent of postsynaptic actin c... 2.11740e-10 9.10526e+04 1.60543e-01 5.84068e+00 5 1.25000e+00 8.80839e-07 1.00000e+00 2.93613e-07 5.43858e-01 NaN NaN NaN
GO:0097433 dense body 6.40085e-10 1.60618e+04 1.41783e-03 1.16814e+01 8 1.33333e+00 2.66275e-06 1.00000e+00 6.65688e-07 3.65408e-01 NaN NaN NaN
GO:0032796 uropod organization 2.69880e-09 5.45449e+04 1.82991e-03 2.33627e+01 5 2.50000e+00 1.12270e-05 1.00000e+00 1.93909e-06 3.65408e-01 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
GO:0007156 homophilic cell adhesion via plasma membrane a... 9.99960e-01 1.12747e+02 9.84651e-01 3.89379e-01 3 1.66667e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 8.51916e-01 8.51916e-01
GO:0007186 G protein-coupled receptor signaling pathway 9.99992e-01 2.46644e+02 1.00000e+00 3.07404e-01 19 1.47059e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 9.86784e-01 9.86784e-01
GO:0016021 integral component of membrane 9.99997e-01 3.94493e+02 9.99998e-01 7.05221e-01 112 2.88958e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 7.87548e-01 7.87548e-01
GO:0005887 integral component of plasma membrane 1.00000e+00 2.81852e+02 9.99970e-01 5.84068e-01 40 2.50000e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 7.92393e-01 7.92393e-01
GO:0005886 plasma membrane 1.00000e+00 3.98798e+02 1.00000e+00 2.55733e-01 159 1.06776e-02 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00 4.11496e-01 4.11496e-01

4160 rows × 14 columns

[20]:
fig,ax = plt.subplots(2,2,figsize = (20,8))
great.pl.scatterplot(enrichment,"hypergeom_fdr","gprofile_p_val_fdr",minus_log10 = False,ax = ax[0,0])
great.pl.scatterplot(enrichment,"hypergeom_fdr","gprofile_p_val_fdr",minus_log10 = True,ax = ax[0,1])
great.pl.scatterplot(enrichment,"binom_fdr","gprofile_p_val_fdr",minus_log10 = False,ax = ax[1,0])
great.pl.scatterplot(enrichment,"binom_fdr","gprofile_p_val_fdr",minus_log10 = True,ax = ax[1,1])
plt.show()


fig,ax = plt.subplots(2,2,figsize = (20,8))
great.pl.scatterplot(enrichment,"hypergeom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = False,ax = ax[0,0])
great.pl.scatterplot(enrichment,"hypergeom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = True,ax = ax[0,1])
great.pl.scatterplot(enrichment,"binom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = False,ax = ax[1,0])
great.pl.scatterplot(enrichment,"binom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = True,ax = ax[1,1])
plt.show()
../_images/notebooks_04_hypergeom_vs_gprofiler_27_0.png
../_images/notebooks_04_hypergeom_vs_gprofiler_27_1.png