rGreat vs greatpy

This notebook enable you to compare results between local rGreat online, rGREAT local and greatpy.

Note

rpy2 is required for this notebook with r-base == 3.6.1 and the following R packages : - rGREAT - GenomicRanges

python version == 3.8

[1]:

%load_ext autoreload
%autoreload 2

[1]:

import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as r
pandas2ri.activate()
import pandas as pd

import os
import matplotlib.pyplot as plt
import greatpy as great
from scipy.stats import pearsonr
import re
import time
import seaborn as sns
import numpy as np
import warnings

warnings.filterwarnings('ignore')

[10]:

import sys
sys.path.append("../greatpy/tl")

[11]:

from utils_r import online_vs_local_vs_greatpy_comparison

[12]:

online_vs_local_vs_greatpy_comparison()

../_images/notebooks_05_rgreat_online_vs_local_vs_greatpy_8_0.png

With threshold = 0.05 on fdr correction

[7]:

from statsmodels.stats.multitest import multipletests

[8]:

stat = {
        "name" : [],
        "binom_correlation" : [],
    }

for name in ["01_random.bed","06_height_snps_hg38.bed","07_height_snps_hg19.bed","10_MAX.bed"] :
    file_out = name.split(".")

     # find the assembly
    if re.match(".*hg19.*",name) !=  None :
        assembly = "hg19"
    else :
        assembly = "hg38"

    # online test
    res_online = rpy2.robjects.r['submitGreatJob'](f"../data/tests/test_data/input/{name}",species = f"{assembly}",help = False)
    res_online = rpy2.robjects.r['getEnrichmentTables'](res_online)
    # create each dataframe
            # online
    name_online = [cdc.lower() for cdc in list(res_online.rx2("GO Molecular Function").rx2("name"))+list(res_online.rx2("GO Biological Process").rx2("name"))+list(res_online.rx2("GO Cellular Component").rx2("name"))]
    online = pd.DataFrame({
        "id" : list(res_online.rx2("GO Molecular Function").rx2("ID"))+list(res_online.rx2("GO Biological Process").rx2("ID"))+list(res_online.rx2("GO Cellular Component").rx2("ID")),
        "name" : name_online,
        "binom_p_val" : list(res_online.rx2("GO Molecular Function").rx2("Binom_Raw_PValue"))+list(res_online.rx2("GO Biological Process").rx2("Binom_Raw_PValue"))+list(res_online.rx2("GO Cellular Component").rx2("Binom_Raw_PValue")),
        "hyper_p_val" : list(res_online.rx2("GO Molecular Function").rx2("Hyper_Raw_PValue"))+list(res_online.rx2("GO Biological Process").rx2("Hyper_Raw_PValue"))+list(res_online.rx2("GO Cellular Component").rx2("Hyper_Raw_PValue"))
        })

    greatpy = great.tl.enrichment(
            test_file = f"../data/tests/test_data/input/{name}",
            regdom_file = f"../data/human/{assembly}/regulatory_domain.bed",
            chr_size_file = f"../data/human/{assembly}/chr_size.bed",
            annotation_file = f"../data/human/ontologies.csv",
            binom = True,
            hypergeom = True,
            )



    online["binom_fdr"] = multipletests(online["binom_p_val"], alpha = 0.05, method = 'fdr_bh')[1]

    # réduction of df
    greatpy = greatpy[greatpy["binom_fdr"] <=  0.1]
    online = online[online["binom_fdr"] <=  0.1]

    print(f"after threshold : greatpy shape = {greatpy.shape[0]}, online shape = {online.shape[0]}")
    greatpy = greatpy[greatpy.index.isin(list(online["id"]))]
    online = online[online["id"].isin(list(greatpy.index))]

    print(f"after get same go ID : greatpy shape = {greatpy.shape[0]}, online shape = {online.shape[0]} \n")


    stat["name"].append(file_out[0])
    try :
        stat["binom_correlation"].append(pearsonr(online["binom_fdr"],greatpy["binom_fdr"])[0])
    except :
        stat["binom_correlation"].append(f"Error : df.shape = {online.shape[0]}")

pd.DataFrame(stat)

after threshold : greatpy shape = 292, online shape = 144
after get same go ID : greatpy shape = 82, online shape = 82

after threshold : greatpy shape = 494, online shape = 577
after get same go ID : greatpy shape = 217, online shape = 217

after threshold : greatpy shape = 553, online shape = 652
after get same go ID : greatpy shape = 242, online shape = 242

after threshold : greatpy shape = 3, online shape = 17
after get same go ID : greatpy shape = 1, online shape = 1

[8]:

	name	binom_correlation
0	01_random	3.74254e-01
1	06_height_snps_hg38	4.73357e-01
2	07_height_snps_hg19	3.94162e-01
3	10_MAX	Error : df.shape = 1