Combine bindome and greatpy
[1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
[9]:
import greatpy as great
import bindome as bd
import warnings
# Suppress every warning for the rest of the session: no category, module or
# message filter is given, so deprecation and numerical warnings are hidden too.
warnings.filterwarnings('ignore')
Import data with bindome
[10]:
# Location of the annotation files. This is an absolute, machine-specific path,
# so it has to be adapted when the notebook is run elsewhere.
annpath = '/storage/groups/ml01/datasets/annotations'
# Override bindome's annotations directory before loading anything
# (presumably read by the dataset loaders below -- confirm against bindome).
bd.bindome.constants.ANNOTATIONS_DIRECTORY = annpath
# Load the ReMap 2020 peaks for 'SRF'; the result is the table displayed below.
df1 = bd.bindome.datasets.REMAP2020.get_remap_peaks('SRF')
# Bare expression: shows the table as the cell output.
df1
[10]:
| | chr | start | end | 3 | 4 | 5 | 6 | 7 | 8 | coordinate | summit.start | summit.end | k.summit |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chr1 | 629831 | 629954 | SRF:K-562 | 1 | . | 629938 | 629939 | 140,112,224 | chr1:629831-629954 | 629838 | 630038 | chr1:629838-630038 |
| 1 | chr1 | 778597 | 778779 | SRF:Ishikawa,GM12878,A-673-clone-Asp114,K-562,... | 8 | . | 778671 | 778672 | 140,112,224 | chr1:778597-778779 | 778571 | 778771 | chr1:778571-778771 |
| 2 | chr1 | 869817 | 870104 | SRF:A-673-clone-Asp114 | 1 | . | 870004 | 870005 | 140,112,224 | chr1:869817-870104 | 869904 | 870104 | chr1:869904-870104 |
| 3 | chr1 | 904714 | 904924 | SRF:GM12878,A-673-clone-Asp114 | 2 | . | 904795 | 904796 | 140,112,224 | chr1:904714-904924 | 904695 | 904895 | chr1:904695-904895 |
| 4 | chr1 | 905143 | 905666 | SRF:A-673-clone-Asp114 | 1 | . | 905393 | 905394 | 140,112,224 | chr1:905143-905666 | 905293 | 905493 | chr1:905293-905493 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 130753 | chrY | 19567211 | 19567311 | SRF:Hep-G2 | 1 | . | 19567231 | 19567232 | 140,112,224 | chrY:19567211-19567311 | 19567131 | 19567331 | chrY:19567131-19567331 |
| 130754 | chrY | 19744932 | 19745043 | SRF:Hep-G2 | 1 | . | 19744983 | 19744984 | 140,112,224 | chrY:19744932-19745043 | 19744883 | 19745083 | chrY:19744883-19745083 |
| 130755 | chrY | 20575583 | 20575822 | SRF:WA01,Hep-G2 | 2 | . | 20575707 | 20575708 | 140,112,224 | chrY:20575583-20575822 | 20575607 | 20575807 | chrY:20575607-20575807 |
| 130756 | chrY | 20575934 | 20576114 | SRF:WA01,Hep-G2 | 2 | . | 20576046 | 20576047 | 140,112,224 | chrY:20575934-20576114 | 20575946 | 20576146 | chrY:20575946-20576146 |
| 130757 | chrY | 21021882 | 21022040 | SRF:Hep-G2 | 1 | . | 21021946 | 21021947 | 140,112,224 | chrY:21021882-21022040 | 21021846 | 21022046 | chrY:21021846-21022046 |
130758 rows × 13 columns
[12]:
# Keep only the peaks whose label in column 3 is exactly "SRF:Ishikawa,MCF-7",
# retain the three coordinate columns, and give them the names used by the
# enrichment call further down.
# NOTE: this overwrites df1, so the cell cannot be re-run on its own; column 3
# no longer exists afterwards. Re-run the loading cell first.
df1 = (
    df1.loc[df1[3] == "SRF:Ishikawa,MCF-7", ["chr", "start", "end"]]
    .rename(columns={"chr": "Chr", "start": "Chr_start", "end": "Chr_end"})
)
# Bare expression: shows the filtered table as the cell output.
df1
[12]:
| | Chr | Chr_start | Chr_end |
|---|---|---|---|
| 47 | chr1 | 1157504 | 1157660 |
| 178 | chr1 | 2586482 | 2586614 |
| 241 | chr1 | 3900438 | 3900894 |
| 371 | chr1 | 6785102 | 6785311 |
| 708 | chr1 | 10796590 | 10796818 |
| ... | ... | ... | ... |
| 130119 | chrX | 132023226 | 132023393 |
| 130187 | chrX | 133418786 | 133418958 |
| 130266 | chrX | 135344596 | 135344830 |
| 130562 | chrX | 152830686 | 152830951 |
| 130645 | chrX | 154000071 | 154000286 |
578 rows × 3 columns
Compute the enrichment with greatpy
[16]:
# Run the greatpy enrichment on the filtered peaks, requesting both the
# binomial and the hypergeometric test. The three input files are given as
# paths relative to the notebook's working directory.
enrichment = great.tl.enrichment(
    test_file=df1,
    regdom_file="../data/human/hg38/regulatory_domain.bed",
    chr_size_file="../data/human/hg38/chr_size.bed",
    annotation_file="../data/human/ontologies.csv",
    binom=True,
    hypergeom=True,
)

# Apply the multiple-testing corrections. Judging by the displayed result they
# add the *_fdr and *_bonferroni columns -- confirm against the greatpy docs.
enrichment = great.tl.set_fdr(enrichment)
enrichment = great.tl.set_bonferroni(enrichment)

# Bare expression: shows the enrichment table as the cell output.
enrichment
[16]:
| | go_term | binom_p_value | binom_fold_enrichment | hypergeom_p_value | hypergeometric_fold_enrichment | binom_fdr | hypergeom_fdr | binom_bonferroni | hypergeom_bonferroni |
|---|---|---|---|---|---|---|---|---|---|
| GO:0003887 | DNA-directed DNA polymerase activity | 5.58223e-07 | 6.86070e+03 | 6.78765e-03 | 2.83705e+00 | 2.52484e-03 | 4.24967e-01 | 2.52484e-03 | 1.00000e+00 |
| GO:0005654 | nucleoplasm | 2.37389e-06 | 7.76641e+02 | 1.00000e+00 | 2.46288e-01 | 5.36856e-03 | 1.00000e+00 | 1.07371e-02 | 1.00000e+00 |
| GO:0051271 | negative regulation of cellular component move... | 9.61747e-06 | 7.40429e+03 | 2.99533e-03 | 9.27107e+00 | 1.02085e-02 | 4.24967e-01 | 4.34998e-02 | 1.00000e+00 |
| GO:0045095 | keratin filament | 1.06729e-05 | 5.59057e+03 | 2.05747e-01 | 1.45603e+00 | 1.02085e-02 | 5.39459e-01 | 4.82736e-02 | 1.00000e+00 |
| GO:0044317 | rod spherule | 1.12851e-05 | 2.42732e+05 | 4.62268e-02 | 2.16325e+01 | 1.02085e-02 | 4.24967e-01 | 5.10424e-02 | 1.00000e+00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| GO:0042391 | regulation of membrane potential | 9.99684e-01 | 7.22182e+01 | 9.98196e-01 | 1.62650e-01 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0045211 | postsynaptic membrane | 9.99729e-01 | 1.92339e+02 | 9.14369e-01 | 6.36250e-01 | 1.00000e+00 | 9.60393e-01 | 1.00000e+00 | 1.00000e+00 |
| GO:0004930 | G protein-coupled receptor activity | 9.99972e-01 | 1.78522e+02 | 1.00000e+00 | 2.21872e-01 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0005886 | plasma membrane | 9.99999e-01 | 4.37347e+02 | 1.00000e+00 | 2.62943e-01 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
| GO:0007186 | G protein-coupled receptor signaling pathway | 1.00000e+00 | 2.02047e+02 | 1.00000e+00 | 3.01381e-01 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 | 1.00000e+00 |
4523 rows × 9 columns