greatpy vs gprofiler
This notebook compares the results obtained with greatpy and gprofiler packages.
[1]:
%load_ext autoreload
%autoreload 2
[4]:
import greatpy as great
from gprofiler import GProfiler
import pandas as pd
from numpy import nan
import matplotlib.pyplot as plt
Compute the results
with greatpy
[5]:
test = "../data/tests/test_data/input/03_srf_hg19.bed"
regdom = "../data/human/hg19/regulatory_domain.bed"
size = "../data/human/hg19/chr_size.bed"
[6]:
enrichment = great.tl.enrichment(
test_file = test,
regdom_file = regdom,
chr_size_file = size,
annotation_file = "../data/human/ontologies.csv",
binom = True,
hypergeom = True,
)
enrichment = great.tl.set_fdr(enrichment)
enrichment = great.tl.set_bonferroni(enrichment)
enrichment
[6]:
| go_term | binom_p_value | binom_fold_enrichment | hypergeom_p_value | hypergeometric_fold_enrichment | intersection_size | recall | binom_bonferroni | hypergeom_bonferroni | binom_fdr | hypergeom_fdr | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| GO:0072749 | cellular response to cytochalasin B | 2.21968e-12 | 2.27251e+05 | 4.28032e-02 | 2.33627e+01 | 5 | 5.00000e+00 | 9.23386e-09 | 1.00000e+00 | 4.61693e-09 | 4.65089e-01 |
| GO:0051623 | positive regulation of norepinephrine uptake | 2.21968e-12 | 2.27251e+05 | 4.28032e-02 | 2.33627e+01 | 5 | 5.00000e+00 | 9.23386e-09 | 1.00000e+00 | 4.61693e-09 | 4.65089e-01 |
| GO:0098973 | structural constituent of postsynaptic actin c... | 2.11740e-10 | 9.10526e+04 | 1.60543e-01 | 5.84068e+00 | 5 | 1.25000e+00 | 8.80839e-07 | 1.00000e+00 | 2.93613e-07 | 5.43858e-01 |
| GO:0097433 | dense body | 6.40085e-10 | 1.60618e+04 | 1.41783e-03 | 1.16814e+01 | 8 | 1.33333e+00 | 2.66275e-06 | 1.00000e+00 | 6.65688e-07 | 3.65408e-01 |
| GO:0032796 | uropod organization | 2.69880e-09 | 5.45449e+04 | 1.82991e-03 | 2.33627e+01 | 5 | 2.50000e+00 | 1.12270e-05 | 1.00000e+00 | 1.93909e-06 | 3.65408e-01 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| GO:0007156 | homophilic cell adhesion via plasma membrane a... | 9.99960e-01 | 1.12747e+02 | 9.84651e-01 | 3.89379e-01 | 3 | 1.66667e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0007186 | G protein-coupled receptor signaling pathway | 9.99992e-01 | 2.46644e+02 | 1.00000e+00 | 3.07404e-01 | 19 | 1.47059e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0016021 | integral component of membrane | 9.99997e-01 | 3.94493e+02 | 9.99998e-01 | 7.05221e-01 | 112 | 2.88958e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0005887 | integral component of plasma membrane | 1.00000e+00 | 2.81852e+02 | 9.99970e-01 | 5.84068e-01 | 40 | 2.50000e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0005886 | plasma membrane | 1.00000e+00 | 3.98798e+02 | 1.00000e+00 | 2.55733e-01 | 159 | 1.06776e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
4160 rows × 11 columns
[7]:
plot = enrichment
plot = plot.rename(columns = {"binom_p_value" : "p_value", "go_term":"name"})
plt.figure(figsize = (10,10))
great.pl.plot_enrich(plot,color = "Reds")
With gprofile
Preprocessing
[8]:
L = []
f = open(test)
cdc = f.readline()
while cdc != "":
cdc = cdc.split("\t")
cdc = ":".join(cdc)
cdc = cdc.strip()
L.append(cdc[3:])
cdc = f.readline()
[9]:
back = pd.read_csv(regdom,sep = "\t",comment = "#",
names = ["Chr", "Chr_Start", "Chr_End","Name","tss","Strand"],dtype = {"Chr":"object", "Chr_Start":"int64", "Chr_End":"int64","Name":"object","tss":"int64","Strand":"object"})
back = list(back["Name"])
Computation
[10]:
gp_no_correct = GProfiler(return_dataframe = True)
gp_no_correct = gp_no_correct.profile(organism = 'hsapiens',
query = L,
all_results = True,
background = back,user_threshold = 1,
)
gp_no_correct = gp_no_correct.loc[(gp_no_correct["source"] == "GO:MF")|(gp_no_correct["source"] == "GO:CC")|(gp_no_correct["source"] == "GO:BP")]
[11]:
gp_no_correct
[11]:
| source | native | name | p_value | significant | description | term_size | query_size | intersection_size | effective_domain_size | precision | recall | query | parents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 105 | GO:MF | GO:0000062 | fatty-acyl-CoA binding | 1.00000e+00 | False | "Binding to a fatty-acyl-CoA, any derivative o... | 21 | 277 | 1 | 17730 | 3.61011e-03 | 4.76190e-02 | query_1 | [GO:0120227, GO:1901567] |
| 9867 | GO:BP | GO:0044725 | chromatin reprogramming in the zygote | 1.00000e+00 | False | "The global reprogramming of epigenetic modifi... | 2 | 277 | 1 | 17730 | 3.61011e-03 | 5.00000e-01 | query_1 | [GO:0043045] |
| 9868 | GO:BP | GO:0044721 | protein import into peroxisome matrix, substra... | 1.00000e+00 | False | "The process by which the cargo protein is rel... | 1 | 277 | 1 | 17730 | 3.61011e-03 | 1.00000e+00 | query_1 | [GO:0016558, GO:0032984] |
| 9869 | GO:BP | GO:0044706 | multi-multicellular organism process | 1.00000e+00 | False | "A multicellular organism process which involv... | 207 | 277 | 3 | 17730 | 1.08303e-02 | 1.44928e-02 | query_1 | [GO:0032501] |
| 9870 | GO:BP | GO:0044703 | multi-organism reproductive process | 1.00000e+00 | False | "A biological process that directly contribute... | 198 | 277 | 2 | 17730 | 7.22022e-03 | 1.01010e-02 | query_1 | [GO:0022414] |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18181 | GO:BP | GO:2000379 | positive regulation of reactive oxygen species... | 1.00000e+00 | False | "Any process that activates or increases the f... | 65 | 277 | 1 | 17730 | 3.61011e-03 | 1.53846e-02 | query_1 | [GO:0031325, GO:0072593, GO:2000377] |
| 18182 | GO:BP | GO:2000377 | regulation of reactive oxygen species metaboli... | 1.00000e+00 | False | "Any process that modulates the frequency, rat... | 134 | 277 | 1 | 17730 | 3.61011e-03 | 7.46269e-03 | query_1 | [GO:0031323, GO:0072593] |
| 18183 | GO:BP | GO:2000370 | positive regulation of clathrin-dependent endo... | 1.00000e+00 | False | "Any process that activates or increases the f... | 5 | 277 | 1 | 17730 | 3.61011e-03 | 2.00000e-01 | query_1 | [GO:0048260, GO:0072583, GO:2000369] |
| 18184 | GO:BP | GO:2000369 | regulation of clathrin-dependent endocytosis | 1.00000e+00 | False | "Any process that modulates the frequency, rat... | 18 | 277 | 2 | 17730 | 7.22022e-03 | 1.11111e-01 | query_1 | [GO:0048259, GO:0072583] |
| 18185 | GO:BP | GO:2000406 | positive regulation of T cell migration | 1.00000e+00 | False | "Any process that activates or increases the f... | 31 | 277 | 1 | 17730 | 3.61011e-03 | 3.22581e-02 | query_1 | [GO:0072678, GO:2000403, GO:2000404] |
5102 rows × 14 columns
[12]:
plt.figure(figsize = (10,10))
great.pl.plot_enrich(gp_no_correct,color = "Reds")
Add the GProfiler pvalue on the BinomP_and_HypergeomP df
[13]:
val = []
for i in enrichment.index:
try :
val.append(float(gp_no_correct.loc[gp_no_correct["native"] == i]["p_value"]))
except:
val.append(nan)
enrichment["gprofile_p_val"] = val
Plot the p-value without correction
[19]:
fig,ax = plt.subplots(2,2,figsize = (20,12))
great.pl.scatterplot(enrichment,"hypergeom_p_value","gprofile_p_val",minus_log10 = False,ax = ax[0,0],title = "hypergeom vs gprofiler")
great.pl.scatterplot(enrichment,"hypergeom_p_value","gprofile_p_val",minus_log10 = True,ax = ax[0,1],title = "hypergeom vs gprofiler with log scale")
great.pl.scatterplot(enrichment,"binom_p_value","gprofile_p_val",minus_log10 = False,ax = ax[1,0],title = "binom vs gprofiler")
great.pl.scatterplot(enrichment,"binom_p_value","gprofile_p_val",minus_log10 = True,ax = ax[1,1],title = "binom vs gprofiler with log scale")
plt.show()
We observe very different results. Indeed, GProfiler does not obtain any significant probability.
It is not possible to determine the p-value without multiple correction with GProfiler.
It is therefore inconsistent to compare the uncorrected values of the multiple tests obtained with greatpy and the results of Gprofiler
Comparison with a fdr correction
[15]:
gp_correct_fdr = GProfiler(return_dataframe = True)
gp_correct_fdr = gp_correct_fdr.profile(organism = 'hsapiens',
query = L,
all_results = True,
background = back,
user_threshold = 1,
significance_threshold_method = "fdr")
gp_correct_fdr = gp_correct_fdr.loc[(gp_correct_fdr["source"] == "GO:MF")|(gp_correct_fdr["source"] == "GO:CC")|(gp_correct_fdr["source"] == "GO:BP")]
gp_correct_bonferroni = GProfiler(return_dataframe = True)
gp_correct_bonferroni = gp_correct_bonferroni.profile(organism = 'hsapiens',
query = L,
all_results = True,
background = back,
user_threshold = 1,
significance_threshold_method = "fdr")
gp_correct_bonferroni = gp_correct_bonferroni.loc[(gp_correct_bonferroni["source"] == "GO:MF")|(gp_correct_bonferroni["source"] == "GO:CC")|(gp_correct_bonferroni["source"] == "GO:BP")]
[16]:
val = []
for i in enrichment.index:
try :
val.append(float(gp_correct_fdr.loc[gp_correct_fdr["native"] == i]["p_value"]))
except:
val.append(nan)
enrichment["gprofile_p_val_fdr"] = val
val = []
for i in enrichment.index:
try :
val.append(float(gp_correct_bonferroni.loc[gp_correct_bonferroni["native"] == i]["p_value"]))
except:
val.append(nan)
enrichment["gprofile_p_val_bonferroni"] = val
Plot of the p-value with correction and bonferroni
[17]:
enrichment
[17]:
| go_term | binom_p_value | binom_fold_enrichment | hypergeom_p_value | hypergeometric_fold_enrichment | intersection_size | recall | binom_bonferroni | hypergeom_bonferroni | binom_fdr | hypergeom_fdr | gprofile_p_val | gprofile_p_val_fdr | gprofile_p_val_bonferroni | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| GO:0072749 | cellular response to cytochalasin B | 2.21968e-12 | 2.27251e+05 | 4.28032e-02 | 2.33627e+01 | 5 | 5.00000e+00 | 9.23386e-09 | 1.00000e+00 | 4.61693e-09 | 4.65089e-01 | NaN | NaN | NaN |
| GO:0051623 | positive regulation of norepinephrine uptake | 2.21968e-12 | 2.27251e+05 | 4.28032e-02 | 2.33627e+01 | 5 | 5.00000e+00 | 9.23386e-09 | 1.00000e+00 | 4.61693e-09 | 4.65089e-01 | NaN | NaN | NaN |
| GO:0098973 | structural constituent of postsynaptic actin c... | 2.11740e-10 | 9.10526e+04 | 1.60543e-01 | 5.84068e+00 | 5 | 1.25000e+00 | 8.80839e-07 | 1.00000e+00 | 2.93613e-07 | 5.43858e-01 | NaN | NaN | NaN |
| GO:0097433 | dense body | 6.40085e-10 | 1.60618e+04 | 1.41783e-03 | 1.16814e+01 | 8 | 1.33333e+00 | 2.66275e-06 | 1.00000e+00 | 6.65688e-07 | 3.65408e-01 | NaN | NaN | NaN |
| GO:0032796 | uropod organization | 2.69880e-09 | 5.45449e+04 | 1.82991e-03 | 2.33627e+01 | 5 | 2.50000e+00 | 1.12270e-05 | 1.00000e+00 | 1.93909e-06 | 3.65408e-01 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| GO:0007156 | homophilic cell adhesion via plasma membrane a... | 9.99960e-01 | 1.12747e+02 | 9.84651e-01 | 3.89379e-01 | 3 | 1.66667e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 8.51916e-01 | 8.51916e-01 |
| GO:0007186 | G protein-coupled receptor signaling pathway | 9.99992e-01 | 2.46644e+02 | 1.00000e+00 | 3.07404e-01 | 19 | 1.47059e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 9.86784e-01 | 9.86784e-01 |
| GO:0016021 | integral component of membrane | 9.99997e-01 | 3.94493e+02 | 9.99998e-01 | 7.05221e-01 | 112 | 2.88958e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 7.87548e-01 | 7.87548e-01 |
| GO:0005887 | integral component of plasma membrane | 1.00000e+00 | 2.81852e+02 | 9.99970e-01 | 5.84068e-01 | 40 | 2.50000e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 7.92393e-01 | 7.92393e-01 |
| GO:0005886 | plasma membrane | 1.00000e+00 | 3.98798e+02 | 1.00000e+00 | 2.55733e-01 | 159 | 1.06776e-02 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 4.11496e-01 | 4.11496e-01 |
4160 rows × 14 columns
[20]:
fig,ax = plt.subplots(2,2,figsize = (20,8))
great.pl.scatterplot(enrichment,"hypergeom_fdr","gprofile_p_val_fdr",minus_log10 = False,ax = ax[0,0])
great.pl.scatterplot(enrichment,"hypergeom_fdr","gprofile_p_val_fdr",minus_log10 = True,ax = ax[0,1])
great.pl.scatterplot(enrichment,"binom_fdr","gprofile_p_val_fdr",minus_log10 = False,ax = ax[1,0])
great.pl.scatterplot(enrichment,"binom_fdr","gprofile_p_val_fdr",minus_log10 = True,ax = ax[1,1])
plt.show()
fig,ax = plt.subplots(2,2,figsize = (20,8))
great.pl.scatterplot(enrichment,"hypergeom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = False,ax = ax[0,0])
great.pl.scatterplot(enrichment,"hypergeom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = True,ax = ax[0,1])
great.pl.scatterplot(enrichment,"binom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = False,ax = ax[1,0])
great.pl.scatterplot(enrichment,"binom_bonferroni","gprofile_p_val_bonferroni",minus_log10 = True,ax = ax[1,1])
plt.show()