create regulatory domains

This example shows how to create gene regulatory domains from two input files: a TSS file and a chromosome-sizes file.

Both files must be in .bed format (documentation: http://genome.ucsc.edu/FAQ/FAQformat#format1).

Columns for tss_file should be: chr \t start \t end

association_rule can be one of: "one_closet", "two_closet", or "basal_plus_extention"

[1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: per the IPython autoreload documentation, reload all modules
# before executing each cell.
%autoreload 2
[2]:
import greatpy as great

For hg38

[3]:
# Build the hg38 regulatory domains from the TSS and chromosome-size BED files
# using the "basal_plus_extention" association rule.
regdom_hg38 = great.tl.create_regdom(
    tss_file="../data/human/hg38/tss_from_great.bed",
    chr_sizes_file="../data/human/hg38/chr_size.bed",
    association_rule="basal_plus_extention",
    out_path=None,  # presumably means "do not write the result to disk" -- TODO confirm
)
# Bare expression: the notebook renders the returned table as the cell output.
regdom_hg38
[3]:
chr chr_start chr_end name tss strand
ENSG00000186092 chr1 0 450697 OR4F5 65418 +
ENSG00000284733 chr1 66418 685673 OR4F29 451697 -
ENSG00000284662 chr1 456697 920737 OR4F16 686673 -
ENSG00000187634 chr1 691673 958290 SAMD11 925737 +
ENSG00000188976 chr1 926737 964290 NOC2L 959290 -
... ... ... ... ... ... ...
ENSG00000183795 chrY 24052969 24812479 BPY2B 24618003 +
ENSG00000187191 chrY 24619003 24828916 DAZ3 24813479 -
ENSG00000205916 chrY 24818479 25051104 DAZ4 24833916 +
ENSG00000185894 chrY 24834916 25617161 BPY2C 25052104 -
ENSG00000172288 chrY 25057104 26622161 CDY1 25622161 +

18777 rows × 6 columns

For hg19

[4]:
# Build the hg19 regulatory domains from the TSS and chromosome-size BED files
# using the "basal_plus_extention" association rule.
regdom_hg19 = great.tl.create_regdom(
    tss_file="../data/human/hg19/tss.bed",
    chr_sizes_file="../data/human/hg19/chr_size.bed",
    association_rule="basal_plus_extention",
    out_path=None,  # presumably means "do not write the result to disk" -- TODO confirm
)
# Bare expression: the notebook renders the returned table as the cell output.
regdom_hg19
[4]:
chr chr_start chr_end name tss strand
ENSG00000186092 chr1 0 362639 OR4F5 69090 +
ENSG00000235249 chr1 70090 621053 OR4F29 367639 +
ENSG00000185097 chr1 368639 856117 OR4F16 622053 -
ENSG00000187634 chr1 627053 893670 SAMD11 861117 +
ENSG00000188976 chr1 862117 899670 NOC2L 894670 -
... ... ... ... ... ... ...
ENSG00000183795 chrY 26199116 26958626 BPY2B 26764150 +
ENSG00000187191 chrY 26765150 26975080 DAZ3 26959626 -
ENSG00000205916 chrY 26964626 27197251 DAZ4 26980080 +
ENSG00000185894 chrY 26981080 27763308 BPY2C 27198251 -
ENSG00000172288 chrY 27203251 28768308 CDY1 27768308 +

18549 rows × 6 columns