Combine bindome and greatpy

[1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel (mode 2).
%load_ext autoreload
%autoreload 2
[9]:
import os
import warnings

import bindome as bd
import greatpy as great
# Silence every warning for the rest of the session (blanket 'ignore' filter).
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# libraries used below — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

Import data with bindome

[10]:
# Point bindome at the directory that holds its annotation files.
# The shared-cluster location stays the default; set the
# BINDOME_ANNOTATIONS_DIR environment variable to run this notebook on
# another machine without editing the cell.
annpath = os.environ.get(
    'BINDOME_ANNOTATIONS_DIR',
    '/storage/groups/ml01/datasets/annotations',
)
bd.bindome.constants.ANNOTATIONS_DIRECTORY = annpath

# Fetch the REMAP2020 peak table for the query 'SRF'. In the displayed
# output, column 3 carries labels of the form "SRF:<cell lines>".
df1 = bd.bindome.datasets.REMAP2020.get_remap_peaks('SRF')
df1
[10]:
chr start end 3 4 5 6 7 8 coordinate summit.start summit.end k.summit
0 chr1 629831 629954 SRF:K-562 1 . 629938 629939 140,112,224 chr1:629831-629954 629838 630038 chr1:629838-630038
1 chr1 778597 778779 SRF:Ishikawa,GM12878,A-673-clone-Asp114,K-562,... 8 . 778671 778672 140,112,224 chr1:778597-778779 778571 778771 chr1:778571-778771
2 chr1 869817 870104 SRF:A-673-clone-Asp114 1 . 870004 870005 140,112,224 chr1:869817-870104 869904 870104 chr1:869904-870104
3 chr1 904714 904924 SRF:GM12878,A-673-clone-Asp114 2 . 904795 904796 140,112,224 chr1:904714-904924 904695 904895 chr1:904695-904895
4 chr1 905143 905666 SRF:A-673-clone-Asp114 1 . 905393 905394 140,112,224 chr1:905143-905666 905293 905493 chr1:905293-905493
... ... ... ... ... ... ... ... ... ... ... ... ... ...
130753 chrY 19567211 19567311 SRF:Hep-G2 1 . 19567231 19567232 140,112,224 chrY:19567211-19567311 19567131 19567331 chrY:19567131-19567331
130754 chrY 19744932 19745043 SRF:Hep-G2 1 . 19744983 19744984 140,112,224 chrY:19744932-19745043 19744883 19745083 chrY:19744883-19745083
130755 chrY 20575583 20575822 SRF:WA01,Hep-G2 2 . 20575707 20575708 140,112,224 chrY:20575583-20575822 20575607 20575807 chrY:20575607-20575807
130756 chrY 20575934 20576114 SRF:WA01,Hep-G2 2 . 20576046 20576047 140,112,224 chrY:20575934-20576114 20575946 20576146 chrY:20575946-20576146
130757 chrY 21021882 21022040 SRF:Hep-G2 1 . 21021946 21021947 140,112,224 chrY:21021882-21022040 21021846 21022046 chrY:21021846-21022046

130758 rows × 13 columns

[12]:
# Keep only the peaks whose label in column 3 is exactly
# "SRF:Ishikawa,MCF-7", retain the first three columns, and rename them to
# Chr / Chr_start / Chr_end.
# NOTE(review): this cell overwrites df1 and drops column 3, so running it a
# second time without re-running the load cell fails on the df1[3] lookup.
label_matches = df1[3] == "SRF:Ishikawa,MCF-7"
bed_column_names = {"chr": 'Chr', 'start': "Chr_start", 'end': "Chr_end"}
df1 = (
    df1.loc[label_matches]
    .iloc[:, 0:3]
    .rename(columns=bed_column_names)
)
df1
[12]:
Chr Chr_start Chr_end
47 chr1 1157504 1157660
178 chr1 2586482 2586614
241 chr1 3900438 3900894
371 chr1 6785102 6785311
708 chr1 10796590 10796818
... ... ... ...
130119 chrX 132023226 132023393
130187 chrX 133418786 133418958
130266 chrX 135344596 135344830
130562 chrX 152830686 152830951
130645 chrX 154000071 154000286

578 rows × 3 columns

Compute the enrichment with greatpy

[16]:
# Inputs for the greatpy enrichment call: the filtered peak table plus the
# hg38 regulatory-domain, chromosome-size and ontology annotation files,
# with both the binomial and hypergeometric options switched on.
enrichment_kwargs = {
    "test_file": df1,
    "regdom_file": "../data/human/hg38/regulatory_domain.bed",
    "chr_size_file": "../data/human/hg38/chr_size.bed",
    "annotation_file": "../data/human/ontologies.csv",
    "binom": True,
    "hypergeom": True,
}

# Run the enrichment, then pass the result through set_fdr and
# set_bonferroni in that order (the displayed table carries the
# corresponding *_fdr and *_bonferroni columns).
enrichment_raw = great.tl.enrichment(**enrichment_kwargs)
enrichment_with_fdr = great.tl.set_fdr(enrichment_raw)
enrichment = great.tl.set_bonferroni(enrichment_with_fdr)
enrichment
[16]:
go_term binom_p_value binom_fold_enrichment hypergeom_p_value hypergeometric_fold_enrichment binom_fdr hypergeom_fdr binom_bonferroni hypergeom_bonferroni
GO:0003887 DNA-directed DNA polymerase activity 5.58223e-07 6.86070e+03 6.78765e-03 2.83705e+00 2.52484e-03 4.24967e-01 2.52484e-03 1.00000e+00
GO:0005654 nucleoplasm 2.37389e-06 7.76641e+02 1.00000e+00 2.46288e-01 5.36856e-03 1.00000e+00 1.07371e-02 1.00000e+00
GO:0051271 negative regulation of cellular component move... 9.61747e-06 7.40429e+03 2.99533e-03 9.27107e+00 1.02085e-02 4.24967e-01 4.34998e-02 1.00000e+00
GO:0045095 keratin filament 1.06729e-05 5.59057e+03 2.05747e-01 1.45603e+00 1.02085e-02 5.39459e-01 4.82736e-02 1.00000e+00
GO:0044317 rod spherule 1.12851e-05 2.42732e+05 4.62268e-02 2.16325e+01 1.02085e-02 4.24967e-01 5.10424e-02 1.00000e+00
... ... ... ... ... ... ... ... ... ...
GO:0042391 regulation of membrane potential 9.99684e-01 7.22182e+01 9.98196e-01 1.62650e-01 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0045211 postsynaptic membrane 9.99729e-01 1.92339e+02 9.14369e-01 6.36250e-01 1.00000e+00 9.60393e-01 1.00000e+00 1.00000e+00
GO:0004930 G protein-coupled receptor activity 9.99972e-01 1.78522e+02 1.00000e+00 2.21872e-01 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0005886 plasma membrane 9.99999e-01 4.37347e+02 1.00000e+00 2.62943e-01 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00
GO:0007186 G protein-coupled receptor signaling pathway 1.00000e+00 2.02047e+02 1.00000e+00 3.01381e-01 1.00000e+00 1.00000e+00 1.00000e+00 1.00000e+00

4523 rows × 9 columns