Source code for cooltools.api.expected

from itertools import chain, combinations, combinations_with_replacement
from functools import partial

import warnings

import numpy as np
import pandas as pd
from scipy.signal import fftconvolve
from scipy.interpolate import interp1d

from cooler.util import partition
import cooler
import bioframe
from ..lib import assign_supports, numutils
from ..lib.checks import is_compatible_viewframe, is_cooler_balanced
from ..lib.common import make_cooler_view
from ..lib.schemas import diag_expected_dtypes, block_expected_dtypes

from ..sandbox import expected_smoothing
from ..lib.common import pool_decorator

# common expected_df column names, taken from schemas
_REGION1 = list(diag_expected_dtypes)[0]
_REGION2 = list(diag_expected_dtypes)[1]
_DIST = list(diag_expected_dtypes)[2]
_NUM_VALID = list(diag_expected_dtypes)[3]

# See notes in the docstring of expected_cis() and per_region_smooth_cvd() 
DEFAULT_BALANCED_CVD_COLS = {
    "region1": _REGION1,
    "region2": _REGION2,
    "dist": "dist",
    "n_pixels": "n_valid",
    "n_contacts": "balanced.sum",
    "output_prefix": "balanced.avg",
}
# See notes in the docstring of expected_cis() and genomewide_smooth_cvd()
DEFAULT_RAW_CVD_COLS = {
    "region1": _REGION1,
    "region2": _REGION2,
    "dist": "dist",
    "n_pixels": "n_total",
    "n_contacts": "count.sum",
    "output_prefix": "count.avg",
}
SMOOTH_SUFFIX = ".smoothed"
SMOOTH_AGG_SUFFIX = ".smoothed.agg"
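
# Editorial note (illustration only, not part of cooltools): the smoothing
# helpers below compose their output column name as
# cols["output_prefix"] + suffix. With the default balanced columns this
# yields, for example:
#   DEFAULT_BALANCED_CVD_COLS["output_prefix"] + SMOOTH_SUFFIX      -> "balanced.avg.smoothed"
#   DEFAULT_BALANCED_CVD_COLS["output_prefix"] + SMOOTH_AGG_SUFFIX  -> "balanced.avg.smoothed.agg"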


def lattice_pdist_frequencies(n, points):
    """
    Distribution of pairwise 1D distances among a collection of distinct
    integers ranging from 0 to n-1.

    Parameters
    ----------
    n : int
        Size of the lattice on which the integer points reside.
    points : sequence of int
        Arbitrary integers between 0 and n-1, inclusive, in any order but
        with no duplicates.

    Returns
    -------
    h : 1D array of length n
        h[d] counts the number of integer pairs that are exactly d units
        apart

    Notes
    -----
    This is done using a convolution via FFT. Thanks to Peter de Rivaz; see
    `<http://stackoverflow.com/questions/42423823/distribution-of-pairwise-distances-between-many-integers>`_.

    """
    if len(np.unique(points)) != len(points):
        raise ValueError("Integers must be distinct.")
    x = np.zeros(n)
    x[points] = 1
    return np.round(fftconvolve(x, x[::-1], mode="full")).astype(int)[-n:]
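
# Editorial illustration (not part of cooltools): a tiny worked example of
# lattice_pdist_frequencies. For points {0, 2, 3} on a lattice of size 5 the
# pairwise distances are 1, 2 and 3, and h[0] counts the self-pairs.
def _demo_lattice_pdist_frequencies():
    h = lattice_pdist_frequencies(5, np.array([0, 2, 3]))
    assert h.tolist() == [3, 1, 1, 1, 0]
    return h
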
def count_bad_pixels_per_diag(n, bad_bins):
    """
    Efficiently count the number of bad pixels on each upper diagonal of a
    matrix assuming a sequence of bad bins forms a "grid" of invalid pixels.

    Each bad bin contributes a full row and a full column of bad pixels, so
    an upper bound on the number of bad pixels per diagonal is 2*k, where k
    is the number of bad bins. For a given diagonal, we need to subtract from
    this upper estimate the contribution from rows/columns reaching
    "out-of-bounds" and the contribution of the intersection points of bad
    rows with bad columns that get double counted.

    ::

        o : bad bin
        * : bad pixel
        x : intersection bad pixel
        $ : out of bounds bad pixel

                 $    $     $
             *--------------------------+
              *  *    *     *           |
               * *    *     *           |
                **    *     *           |
                 o****x*****x***********|$
                  *   *     *           |
                   *  *     *           |
                    * *     *           |
                     o******x***********|$
                      *     *           |
                       *    *           |
                        *   *           |
                         *  *           |
                          * *           |
                           **           |
                            o***********|$
                             *          |
                              *         |

    Parameters
    ----------
    n : int
        total number of bins
    bad_bins : 1D array of int
        sorted array of bad bin indexes

    Returns
    -------
    dcount : 1D array of length n
        dcount[d] == number of bad pixels on diagonal d

    """
    k = len(bad_bins)
    dcount = np.zeros(n, dtype=int)

    # Store all intersection pixels in a separate array
    # ~O(n log n) with fft
    ixn = lattice_pdist_frequencies(n, bad_bins)
    dcount[0] = ixn[0]

    # Keep track of out-of-bounds pixels by squeezing left and right bounds
    # ~O(n)
    pl = 0
    pr = k
    for diag in range(1, n):
        if pl < k:
            while (bad_bins[pl] - diag) < 0:
                pl += 1
                if pl == k:
                    break
        if pr > 0:
            while (bad_bins[pr - 1] + diag) >= n:
                pr -= 1
                if pr == 0:
                    break
        dcount[diag] = 2 * k - ixn[diag] - pl - (k - pr)
    return dcount
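
# Editorial illustration (not part of cooltools): cross-check
# count_bad_pixels_per_diag against a brute-force per-diagonal count on a
# small matrix where the bad bins' rows and columns are marked explicitly.
def _demo_count_bad_pixels_per_diag(n=8, bad_bins=(1, 5)):
    bad_bins = np.asarray(bad_bins)
    fast = count_bad_pixels_per_diag(n, bad_bins)
    # brute force: mark bad rows/columns and count them per upper diagonal
    bad = np.zeros((n, n), dtype=bool)
    bad[bad_bins, :] = True
    bad[:, bad_bins] = True
    slow = np.array([bad.diagonal(d).sum() for d in range(n)])
    assert (fast == slow).all()
    return fast
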
def count_all_pixels_per_diag(n):
    """
    Total number of pixels on each upper diagonal of a square matrix.

    Parameters
    ----------
    n : int
        total number of bins (dimension of square matrix)

    Returns
    -------
    dcount : 1D array of length n
        dcount[d] == total number of pixels on diagonal d

    """
    return np.arange(n, 0, -1)
def count_all_pixels_per_block(x, y):
    """
    Calculate total number of pixels in a rectangular block

    Parameters
    ----------
    x : int
        block width in pixels
    y : int
        block height in pixels

    Returns
    -------
    number_of_pixels : int
        total number of pixels in a block

    """
    return x * y
def count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y):
    """
    Calculate number of "bad" pixels per rectangular block of a contact map

    Parameters
    ----------
    x : int
        block width in pixels
    y : int
        block height in pixels
    bad_bins_x : int
        number of bad bins on x-side
    bad_bins_y : int
        number of bad bins on y-side

    Returns
    -------
    number_of_pixels : int
        number of "bad" pixels in a block

    """
    # Calculate the resulting bad pixels in a rectangular block:
    return (x * bad_bins_y) + (y * bad_bins_x) - (bad_bins_x * bad_bins_y)
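
# Editorial illustration (not part of cooltools): for a 4 x 3 block with one
# bad bin on the x-side and two on the y-side, the bad rows and columns cover
# 4*2 + 3*1 - 1*2 = 9 pixels (intersection pixels are counted only once).
def _demo_count_pixels_per_block():
    assert count_all_pixels_per_block(4, 3) == 12
    assert count_bad_pixels_per_block(4, 3, bad_bins_x=1, bad_bins_y=2) == 9
    return 9
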
def make_diag_table(bad_mask, span1, span2):
    """
    Compute the total number of elements ``n_total`` and the number of valid
    elements ``n_valid`` per diagonal for a single contact area encompassing
    ``span1`` and ``span2`` on the same genomic scaffold (cis matrix).

    Follows the same principle as the algorithm for finding contact areas for
    computing scalings.

    Parameters
    ----------
    bad_mask : 1D array of bool
        Mask of bad bins for the whole genomic scaffold containing the
        regions of interest.
    span1, span2 : pair of ints
        The bin spans (not genomic coordinates) of the two regions of
        interest.

    Returns
    -------
    diags : pandas.DataFrame
        Table indexed by 'dist' with columns ['n_total', 'n_valid'].

    """
    # alias for np.flatnonzero inside `make_diag_table`
    where = np.flatnonzero

    def _make_diag_table(n_bins, bad_locs):
        diags = pd.DataFrame(index=pd.Series(np.arange(n_bins), name=_DIST))
        diags["n_total"] = count_all_pixels_per_diag(n_bins)
        diags[_NUM_VALID] = diags["n_total"] - count_bad_pixels_per_diag(
            n_bins, bad_locs
        )
        return diags

    if span1 == span2:
        lo, hi = span1
        diags = _make_diag_table(hi - lo, where(bad_mask[lo:hi]))
    else:
        lo1, hi1 = span1
        lo2, hi2 = span2
        if lo2 <= lo1:
            lo1, lo2 = lo2, lo1
            hi1, hi2 = hi2, hi1
        diags = (
            _make_diag_table(hi2 - lo1, where(bad_mask[lo1:hi2]))
            .subtract(
                _make_diag_table(lo2 - lo1, where(bad_mask[lo1:lo2])), fill_value=0
            )
            .subtract(
                _make_diag_table(hi2 - hi1, where(bad_mask[hi1:hi2])), fill_value=0
            )
        )
        if hi1 < lo2:
            diags.add(
                _make_diag_table(lo2 - hi1, where(bad_mask[hi1:lo2])), fill_value=0
            )
        diags = diags[diags["n_total"] > 0]

    # keep diags["n_total"] for calculating count.avg
    # diags = diags.drop("n_total", axis=1)
    return diags.astype(int)
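
# Editorial illustration (not part of cooltools): make_diag_table on a toy
# 10-bin scaffold with a single bad bin, for one symmetric 6-bin span.
def _demo_make_diag_table():
    bad_mask = np.zeros(10, dtype=bool)
    bad_mask[3] = True
    diags = make_diag_table(bad_mask, (0, 6), (0, 6))
    # 6 pixels on the main diagonal of a 6 x 6 block, one of them bad
    assert diags.loc[0, "n_total"] == 6
    assert diags.loc[0, "n_valid"] == 5
    return diags
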
def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"):
    """
    For every region infer diagonals that intersect this region and calculate
    the size of these intersections in pixels, both "total" and "n_valid",
    where "n_valid" does not count "bad" pixels.

    "Bad" pixels are inferred from the balancing weight column
    `clr_weight_name`. When `clr_weight_name` is None, raw data is used, and
    no "bad" pixels are excluded.

    When `regions2` are provided, all intersecting diagonals are reported for
    each rectangular and asymmetric block defined by combinations of matching
    elements of `regions` and `regions2`. Otherwise only `regions`-based
    symmetric square blocks are considered. Only intra-chromosomal regions
    are supported.

    Parameters
    ----------
    clr : cooler.Cooler
        Input cooler
    regions : viewframe or viewframe-like dataframe
        viewframe without repeated entries or viewframe-like dataframe with
        repeated entries
    regions2 : viewframe or viewframe-like dataframe
        viewframe without repeated entries or viewframe-like dataframe with
        repeated entries
    clr_weight_name : str
        name of the weight column in the clr bin-table. The balancing weight
        is used to infer bad bins; set to `None` if masking bad bins is not
        desired for raw data.

    Returns
    -------
    diag_tables : dict
        dictionary with DataFrames of relevant diagonals for every region.
    """
    try:
        # Run regular viewframe conversion:
        regions = bioframe.make_viewframe(
            regions, check_bounds=clr.chromsizes
        ).to_numpy()
        if regions2 is not None:
            regions2 = bioframe.make_viewframe(
                regions2, check_bounds=clr.chromsizes
            ).to_numpy()
    except ValueError:
        # If there are non-unique entries in regions1/2,
        # possible only for asymmetric expected:
        regions = pd.concat(
            [
                bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
                for i, region in regions.iterrows()
            ]
        ).to_numpy()
        regions2 = pd.concat(
            [
                bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
                for i, region in regions2.iterrows()
            ]
        ).to_numpy()

    bins = clr.bins()[:]
    if clr_weight_name is None:
        # ignore bad bins
        sizes = dict(bins.groupby("chrom").size())
        bad_bin_dict = {
            chrom: np.zeros(sizes[chrom], dtype=bool) for chrom in sizes.keys()
        }
    elif is_cooler_balanced(clr, clr_weight_name):
        groups = dict(iter(bins.groupby("chrom")[clr_weight_name]))
        bad_bin_dict = {
            chrom: np.array(groups[chrom].isnull()) for chrom in groups.keys()
        }
    else:
        raise ValueError(
            f"provided cooler is not balanced, or weight {clr_weight_name} is missing"
        )

    diag_tables = {}
    for i, region in enumerate(regions):
        chrom, start1, end1, name1 = region
        if regions2 is not None:
            chrom2, start2, end2, name2 = regions2[i]
            # cis-only for now:
            if not (chrom2 == chrom):
                raise ValueError(
                    "regions/2 have to be on the same chrom to generate diag_tables"
                )
        else:
            start2, end2 = start1, end1

        # translate regions into relative bin id-s:
        lo1, hi1 = clr.extent((chrom, start1, end1))
        lo2, hi2 = clr.extent((chrom, start2, end2))
        co = clr.offset(chrom)
        lo1 -= co
        lo2 -= co
        hi1 -= co
        hi2 -= co

        bad_mask = bad_bin_dict[chrom]
        newname = name1
        if regions2 is not None:
            newname = (name1, name2)
        diag_tables[newname] = make_diag_table(bad_mask, [lo1, hi1], [lo2, hi2])

    return diag_tables
[docs]def make_block_table(clr, regions1, regions2, clr_weight_name="weight"): """ Creates a table of total and valid pixels for a set of rectangular genomic blocks defined by regions1 and regions2. For every block calculate its "area" in pixels ("n_total"), and calculate number of "valid" pixels ("n_valid"). Valid pixels exclude "bad" pixels, which are inferred from the balancing weight column `clr_weight_name`. When `clr_weight_name` is None, raw data is used, and no "bad" pixels are exclued. Parameters ---------- clr : cooler.Cooler Input cooler regions1 : viewframe-like dataframe viewframe-like dataframe, where repeated entries are allowed regions2 : viewframe-like dataframe viewframe-like dataframe, where repeated entries are allowed clr_weight_name : str name of the weight column in the cooler bins-table, used for masking bad pixels. When clr_weight_name is None, no bad pixels are masked. Returns ------- block_table : dict dictionary for blocks that are 0-indexed """ try: # Run regular viewframe conversion: regions1 = bioframe.make_viewframe( regions1, check_bounds=clr.chromsizes ).to_numpy() regions2 = bioframe.make_viewframe( regions2, check_bounds=clr.chromsizes ).to_numpy() except ValueError: # Might be non-unique entries in regions: regions1 = pd.concat( [ bioframe.make_viewframe([region], check_bounds=clr.chromsizes) for i, region in regions1.iterrows() ] ).to_numpy() regions2 = pd.concat( [ bioframe.make_viewframe([region], check_bounds=clr.chromsizes) for i, region in regions2.iterrows() ] ).to_numpy() # should we check for nestedness here, or that each region1 is < region2 ? block_table = {} for r1, r2 in zip(regions1, regions2): chrom1, start1, end1, name1 = r1 chrom2, start2, end2, name2 = r2 # translate regions into relative bin id-s: lo1, hi1 = clr.extent((chrom1, start1, end1)) lo2, hi2 = clr.extent((chrom2, start2, end2)) # width and height of a block: x = hi1 - lo1 y = hi2 - lo2 # now we need to combine it with the balancing weights if clr_weight_name is None: bad_bins_x = 0 bad_bins_y = 0 elif is_cooler_balanced(clr, clr_weight_name): # count "bad" bins filtered by balancing: bad_bins_x = clr.bins()[clr_weight_name][lo1:hi1].isnull().sum() bad_bins_y = clr.bins()[clr_weight_name][lo2:hi2].isnull().sum() else: raise ValueError( f"cooler is not balanced or weight {clr_weight_name} is missing" ) # calculate total and bad pixels per block: n_tot = count_all_pixels_per_block(x, y) n_bad = count_bad_pixels_per_block(x, y, bad_bins_x, bad_bins_y) # fill in "block_table" with number of valid pixels: block_table[name1, name2] = {_NUM_VALID: n_tot - n_bad} return block_table
def _diagsum_symm(clr, fields, transforms, clr_weight_name, regions, span):
    """
    calculates diagonal/distance summary for a collection of
    square symmetric blocks defined by the "regions".

    Return:
    dictionary of DataFrames with diagonal/distance sums for the "fields",
    and 0-based indexes of square genomic regions as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    # pre-filter cis-only pixels to speed up calculations
    pixels = pixels[pixels["chrom1"] == pixels["chrom2"]].copy()

    # annotate pixels with regions at once
    # book-ended regions still get reannotated
    pixels["r1"] = assign_supports(pixels, regions, suffix="1")
    pixels["r2"] = assign_supports(pixels, regions, suffix="2")

    # select symmetric pixels that have notnull weights
    if clr_weight_name is None:
        pixels = pixels.dropna(subset=["r1", "r2"])
    else:
        pixels = pixels.dropna(
            subset=["r1", "r2", clr_weight_name + "1", clr_weight_name + "2"]
        )
    pixels = pixels[pixels["r1"] == pixels["r2"]]

    # this could be further expanded to allow for custom groupings:
    pixels[_DIST] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    symm_blocks = pixels.groupby("r1")
    return {int(i): block.groupby(_DIST)[fields].sum() for i, block in symm_blocks}
def diagsum_symm(
    clr,
    view_df,
    transforms={},
    clr_weight_name="weight",
    ignore_diags=2,
    chunksize=10000000,
    map=map,
):
    """
    Intra-chromosomal diagonal summary statistics.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object
    view_df : viewframe
        view_df of regions for intra-chromosomal diagonal summation
    transforms : dict of str -> callable, optional
        Transformations to apply to pixels. The result will be assigned to
        a temporary column with the name given by the key. Callables take
        one argument: the current chunk of the (annotated) pixel dataframe.
    clr_weight_name : str
        name of the balancing weight vector used to count "bad" pixels per
        diagonal. Set to `None` not to mask "bad" pixels (raw data only).
    chunksize : int, optional
        Size of pixel table chunks to process
    ignore_diags : int, optional
        Number of initial diagonals to exclude from statistics
    map : callable, optional
        Map functor implementation.

    Returns
    -------
    Dataframe of diagonal statistics for all regions in the view
    """
    spans = partition(0, len(clr.pixels()), chunksize)
    fields = list(chain(["count"], transforms))
    # names of summary results
    summary_fields = [f"{field}.sum" for field in fields]

    # check viewframe
    try:
        _ = is_compatible_viewframe(
            view_df,
            clr,
            check_sorting=False,  # liberal for this low-level function
            raise_errors=True,
        )
    except Exception as e:
        raise ValueError("provided view_df is not valid") from e

    # prepare dtables: for every region a table with diagonals,
    # number of valid pixels, etc
    dtables = make_diag_tables(clr, view_df, clr_weight_name=clr_weight_name)

    # initialize columns to store summary results in dtables
    for dt in dtables.values():
        for agg_name in summary_fields:
            dt[agg_name] = 0

    # apply _diagsum_symm to chunks of pixels
    job = partial(
        _diagsum_symm, clr, fields, transforms, clr_weight_name, view_df.to_numpy()
    )
    results = map(job, spans)

    # accumulate every chunk of summary results to dtables
    for result in results:
        for i, agg in result.items():
            region = view_df["name"].iat[i]
            # for every field accumulate its aggregate/summary:
            for field, agg_name in zip(fields, summary_fields):
                dtables[region][agg_name] = dtables[region][agg_name].add(
                    agg[field], fill_value=0
                )

    # returning pd.DataFrame for API consistency
    result = []
    for i, dtable in dtables.items():
        dtable = dtable.reset_index()
        # conform with the new expected format, treat regions as 2D
        dtable.insert(0, _REGION1, i)
        dtable.insert(1, _REGION2, i)
        if ignore_diags:
            # fill out summary fields of ignored diagonals with NaN:
            dtable.loc[dtable[_DIST] < ignore_diags, summary_fields] = np.nan
        result.append(dtable)

    return pd.concat(result).reset_index(drop=True)
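
# Editorial usage sketch (not part of cooltools): per-chromosome diagonal
# sums with a balanced transform, mirroring what expected_cis() does
# internally. The cooler URI is hypothetical.
def _demo_diagsum_symm(cooler_uri="test.mcool::/resolutions/100000"):
    clr = cooler.Cooler(cooler_uri)
    view = make_cooler_view(clr)  # one region per chromosome
    transforms = {"balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]}
    return diagsum_symm(clr, view, transforms=transforms, clr_weight_name="weight")
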
def _diagsum_pairwise(clr, fields, transforms, clr_weight_name, regions, span):
    """
    calculates diagonal/distance summary for a collection of
    rectangular blocks defined by all pairwise combinations of "regions"
    for intra-chromosomal interactions.

    Return:
    dictionary of DataFrames with diagonal/distance sums for the "fields",
    and (i,j)-like indexes of rectangular genomic regions as keys.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    # pre-filter cis-only pixels to speed up calculations
    pixels = pixels[pixels["chrom1"] == pixels["chrom2"]].copy()

    # annotate pixels with regions at once
    # book-ended regions still get reannotated
    pixels["r1"] = assign_supports(pixels, regions, suffix="1")
    pixels["r2"] = assign_supports(pixels, regions, suffix="2")

    # keep only pixels assigned to regions that have notnull weights
    if clr_weight_name is None:
        pixels = pixels.dropna(subset=["r1", "r2"])
    else:
        pixels = pixels.dropna(
            subset=["r1", "r2", clr_weight_name + "1", clr_weight_name + "2"]
        )
    # pixels = pixels[ pixels["r1"] != pixels["r2"] ]

    # this could be further expanded to allow for custom groupings:
    pixels[_DIST] = pixels["bin2_id"] - pixels["bin1_id"]
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    asymm_blocks = pixels.groupby(["r1", "r2"])
    return {
        (int(i), int(j)): block.groupby(_DIST)[fields].sum()
        for (i, j), block in asymm_blocks
    }
[docs]def diagsum_pairwise( clr, view_df, transforms={}, clr_weight_name="weight", ignore_diags=2, chunksize=10_000_000, map=map, ): """ Intra-chromosomal diagonal summary statistics for asymmetric blocks of contact matrix defined as pairwise combinations of regions in "view_df. Note ---- This is a special case of asymmetric diagonal summary statistic that is efficient and covers the most important practical case of inter-chromosomal arms "expected" calculation. Parameters ---------- clr : cooler.Cooler Cooler object view_df : viewframe view_df of regions for intra-chromosomal diagonal summation, has to be sorted according to the order of chromosomes in cooler. transforms : dict of str -> callable, optional Transformations to apply to pixels. The result will be assigned to a temporary column with the name given by the key. Callables take one argument: the current chunk of the (annotated) pixel dataframe. clr_weight_name : str name of the balancing weight vector used to count "bad" pixels per diagonal. Set to `None` not to mask "bad" pixels (raw data only). chunksize : int, optional Size of pixel table chunks to process map : callable, optional Map functor implementation. Returns ------- Dataframe of diagonal statistics for all intra-chromosomal blocks defined as pairwise combinations of regions in the view """ spans = partition(0, len(clr.pixels()), chunksize) fields = list(chain(["count"], transforms)) # names of summary results summary_fields = [f"{field}.sum" for field in fields] # check viewframe try: _ = is_compatible_viewframe( view_df, clr, check_sorting=True, # required for pairwise combinations raise_errors=True, ) except Exception as e: raise ValueError("provided view_df is not valid") from e # create pairwise combinations of regions from view_df all_combinations = combinations_with_replacement(view_df.itertuples(index=False), 2) # keep only intra-chromosomal combinations cis_combinations = ((r1, r2) for r1, r2 in all_combinations if (r1[0] == r2[0])) # unzip regions1 regions2 defining the blocks for summary collection regions1, regions2 = zip(*cis_combinations) regions1 = pd.DataFrame(regions1) regions2 = pd.DataFrame(regions2) # prepare dtables: for every region a table with diagonals, number of valid pixels, etc dtables = make_diag_tables(clr, regions1, regions2, clr_weight_name=clr_weight_name) # initialize columns to store summary results in dtables for dt in dtables.values(): for agg_name in summary_fields: dt[agg_name] = 0 # apply _diagsum_pairwise to chunks of pixels job = partial( _diagsum_pairwise, clr, fields, transforms, clr_weight_name, view_df.values ) results = map(job, spans) # accumulate every chunk of summary results to dtables for result in results: for (i, j), agg in result.items(): ni = view_df["name"].iat[i] nj = view_df["name"].iat[j] # for every field accumulate its aggregate/summary: for field, agg_name in zip(fields, summary_fields): dtables[ni, nj][agg_name] = dtables[ni, nj][agg_name].add( agg[field], fill_value=0 ) # returning a pd.DataFrame for API consistency: result = [] for (i, j), dtable in dtables.items(): dtable = dtable.reset_index() dtable.insert(0, _REGION1, i) dtable.insert(1, _REGION2, j) if ignore_diags: # fill out summary fields of ignored diagonals with NaN: dtable.loc[dtable[_DIST] < ignore_diags, summary_fields] = np.nan result.append(dtable) return pd.concat(result).reset_index(drop=True)
def _blocksum_pairwise(clr, fields, transforms, clr_weight_name, regions, span):
    """
    calculates block summary for a collection of rectangular regions
    defined as pairwise combinations of all regions.

    Return:
    a dictionary of block-wide sums for all "fields": keys are (i,j)-like,
    where i and j are 0-based indexes of "regions", and a combination of
    (i,j) defines a rectangular block.

    Note:
    Input pixels are assumed to be "symmetric-upper", and "regions" to be
    sorted according to the order of chromosomes in "clr", thus i < j.
    """
    lo, hi = span
    bins = clr.bins()[:]
    pixels = clr.pixels()[lo:hi]
    pixels = cooler.annotate(pixels, bins, replace=False)

    pixels["r1"] = assign_supports(pixels, regions, suffix="1")
    pixels["r2"] = assign_supports(pixels, regions, suffix="2")

    # keep only asymmetric pixels (r1 != r2) that have notnull weights
    if clr_weight_name is None:
        pixels = pixels.dropna(subset=["r1", "r2"])
    else:
        pixels = pixels.dropna(
            subset=["r1", "r2", clr_weight_name + "1", clr_weight_name + "2"]
        )
    pixels = pixels[pixels["r1"] != pixels["r2"]]

    # apply transforms, e.g. balancing etc
    for field, t in transforms.items():
        pixels[field] = t(pixels)

    # pairwise combinations of regions define asymmetric pixel blocks
    pixel_groups = pixels.groupby(["r1", "r2"])
    return {
        (int(i), int(j)): group[fields].sum(skipna=False)
        for (i, j), group in pixel_groups
    }
[docs]def blocksum_pairwise( clr, view_df, transforms={}, clr_weight_name="weight", chunksize=1000000, map=map, ): """ Summary statistics on rectangular blocks of all (trans-)pairwise combinations of genomic regions in the view_df (aka trans-expected). Note ---- This is a special case of asymmetric block-level summary stats, that can be calculated very efficiently. Regions in view_df are assigned to pixels only once and pixels falling into a given asymmetric block i != j are summed up. Parameters ---------- clr : cooler.Cooler Cooler object view_df : viewframe view_df of regions defining blocks for summary calculations, has to be sorted according to the order of chromosomes in clr. transforms : dict of str -> callable, optional Transformations to apply to pixels. The result will be assigned to a temporary column with the name given by the key. Callables take one argument: the current chunk of the (annotated) pixel dataframe. clr_weight_name : str name of the balancing weight column in cooler bin-table used to count "bad" pixels per block. Set to `None` not ot mask "bad" pixels (raw data only). chunksize : int, optional Size of pixel table chunks to process map : callable, optional Map functor implementation. Returns ------- DataFrame with entries for each blocks: region1, region2, n_valid, count.sum """ # check viewframe try: _ = is_compatible_viewframe( view_df, clr, check_sorting=False, # required for pairwise combinations raise_errors=True, ) except Exception as e: raise ValueError("provided view_df is not valid") from e spans = partition(0, len(clr.pixels()), chunksize) fields = list(chain(["count"], transforms)) # names of summary results summary_fields = [f"{field}.sum" for field in fields] # create pairwise combinations of regions from view_df using # the standard zip(*bunch_of_tuples) unzipping procedure: regions1, regions2 = zip(*combinations(view_df.itertuples(index=False), 2)) regions1 = pd.DataFrame(regions1) regions2 = pd.DataFrame(regions2) # similar with diagonal summations, pre-generate a block_table listing # all of the rectangular blocks and "n_valid" number of pixels per each block: btables = make_block_table(clr, regions1, regions2, clr_weight_name=clr_weight_name) # initialize columns to store summary results in btables for bt in btables.values(): for agg_name in summary_fields: bt[agg_name] = 0 # apply _diagsum_pairwise to chunks of pixels job = partial( _blocksum_pairwise, clr, fields, transforms, clr_weight_name, view_df.to_numpy() ) results = map(job, spans) # accumulate every chunk of summary results to dtables for result in results: for (i, j), agg in result.items(): ni = view_df["name"].iat[i] nj = view_df["name"].iat[j] # for every field accumulate its aggregate/summary: for field, agg_name in zip(fields, summary_fields): btables[ni, nj][agg_name] += np.nan_to_num(agg[field].item()) # returning a pd.DataFrame for API consistency: return pd.DataFrame( [ {_REGION1: n1, _REGION2: n2, **btable} for (n1, n2), btable in btables.items() ], columns=list(block_expected_dtypes) + summary_fields, )
[docs]def genomewide_smooth_cvd( cvd, sigma_log10=0.1, window_sigma=5, points_per_sigma=10, cols=None, suffix=".smoothed", ): """ Smooth the contact-vs-distance curve aggregated across all regions in log-space. Parameters ---------- cvd : pandas.DataFrame A dataframe with the expected values in the cooltools.expected format. sigma_log10 : float, optional The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 window_sigma : int, optional Width of the smoothing window, expressed in sigmas, by default 5 points_per_sigma : int, optional If provided, smoothing is done only for `points_per_sigma` points per sigma and the rest of the values are interpolated (this results in a major speed-up). By default 10 cols : dict, optional If provided, use the specified column names instead of the standard ones. See DEFAULT_CVD_COLS variable for the format of this argument. suffix : string, optional If provided, use the specified string as the suffix of the output column's name Returns ------- cvd : pandas.DataFrame A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed.agg" if balanced, or "count.avg.smoothed.agg" if raw). Notes ----- Parameters in "cols" will be used: dist: Name of the column that stores distance values (by default, "dist"). n_pixels: Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw). n_contacts: Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw). output_prefix: Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw). """ cvd_smoothed = expected_smoothing._smooth_cvd_group( cvd, sigma_log10=sigma_log10, window_sigma=window_sigma, points_per_sigma=points_per_sigma, cols=cols, suffix=suffix, ) # add aggeragated smoothed columns to the result cvd = cvd.merge( cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], on=[cols["dist"]], how="left", ) return cvd
[docs]def per_region_smooth_cvd( cvd, sigma_log10=0.1, window_sigma=5, points_per_sigma=10, cols=None, suffix="", ): """ Smooth the contact-vs-distance curve for each region in log-space. Parameters ---------- cvd : pandas.DataFrame A dataframe with the expected values in the cooltools.expected format. sigma_log10 : float, optional The standard deviation of the smoothing Gaussian kernel, applied over log10(diagonal), by default 0.1 window_sigma : int, optional Width of the smoothing window, expressed in sigmas, by default 5 points_per_sigma : int, optional If provided, smoothing is done only for `points_per_sigma` points per sigma and the rest of the values are interpolated (this results in a major speed-up). By default 10 cols : dict, optional If provided, use the specified column names instead of the standard ones. See DEFAULT_CVD_COLS variable for the format of this argument. suffix : string, optional If provided, use the specified string as the suffix of the output column's name Returns ------- cvd : pandas.DataFrame A cvd table with extra column for the log-smoothed contact frequencies (by default, "balanced.avg.smoothed" if balanced, or "count.avg.smoothed" if raw). Notes ----- Parameters in "cols" will be used: region1: Name of the column that stores region1's locations (by default, "region1"). region2: Name of the column that stores region2's locations (by default, "region2"). dist: Name of the column that stores distance values (by default, "dist"). n_pixels: Name of the column that stores number of pixels (by default, "n_valid" if balanced, or "n_total" if raw). n_contacts: Name of the column that stores the sum of contacts (by default, "balanced.sum" if balanced, or "count.sum" if raw). output_prefix: Name prefix of the column that will store output value (by default, "balanced.avg" if balanced, or "count.avg" if raw). """ cvd_smoothed = ( cvd.set_index([cols["region1"], cols["region2"]]) .groupby([cols["region1"], cols["region2"]]) .apply( expected_smoothing._smooth_cvd_group, sigma_log10=sigma_log10, window_sigma=window_sigma, points_per_sigma=points_per_sigma, cols=cols, suffix=suffix, ) ) # add smoothed columns to the result cvd = cvd.merge( cvd_smoothed[[cols["output_prefix"] + suffix, cols["dist"]]], on=[cols["region1"], cols["region2"], cols["dist"]], how="left", ) return cvd
# user-friendly wrapper for diagsum_symm and diagsum_pairwise - part of new "public" API
[docs]@pool_decorator def expected_cis( clr, view_df=None, intra_only=True, smooth=True, aggregate_smoothed=True, smooth_sigma=0.1, clr_weight_name="weight", ignore_diags=2, # should default to cooler info chunksize=10_000_000, nproc=1, map_functor=map, ): """ Calculate average interaction frequencies as a function of genomic separation between pixels i.e. interaction decay with distance. Genomic separation aka "dist" is measured in the number of bins, and defined as an index of a diagonal on which pixels reside (bin1_id - bin2_id). Average values are reported in the columns with names {}.avg, and they are calculated as a ratio between a corresponding sum {}.sum and the total number of "valid" pixels on the diagonal "n_valid". When balancing weights (clr_weight_name=None) are not applied to the data, there is no masking of bad bins performed. Parameters ---------- clr : cooler.Cooler Cooler object view_df : viewframe a collection of genomic intervals where expected is calculated otherwise expected is calculated for full chromosomes. view_df has to be sorted, when inter-regions expected is requested, i.e. intra_only is False. intra_only: bool Return expected only for symmetric intra-regions defined by view_df, i.e. chromosomes, chromosomal-arms, intra-domains, etc. When False returns expected both for symmetric intra-regions and assymetric inter-regions. smooth: bool Apply smoothing to cis-expected. Will be stored in an additional column aggregate_smoothed: bool When smoothing, average over all regions, ignored without smoothing. smooth_sigma: float Control smoothing with the standard deviation of the smoothing Gaussian kernel. Ignored without smoothing. clr_weight_name : str or None Name of balancing weight column from the cooler to use. Use raw unbalanced data, when None. ignore_diags : int, optional Number of intial diagonals to exclude results chunksize : int, optional Size of pixel table chunks to process nproc : int, optional How many processes to use for calculation. Ignored if map_functor is passed. map_functor : callable, optional Map function to dispatch the matrix chunks to workers. If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool; If nproc=1 this defaults the builtin map. Returns ------- DataFrame with summary statistic of every diagonal of every symmetric or asymmetric block: Notes ----- When clr_weight_name=None, smooth=False, aggregate_smoothed=False, the minimum output DataFrame includes the following quantities (columns): dist: Distance in bins. dist_bp: Distance in basepairs. contact_freq: The "most processed" contact frequency value. For example, if balanced & smoothing then this will return the balanced.avg.smooth.agg; if aggregated+smoothed, then balanced.avg.smooth.agg; if nothing then count.avg. n_total: Number of total pixels at a given distance. n_valid: Number of valid pixels (with non-NaN values after balancing) at a given distance. count.sum: Sum up raw contact counts of all pixels at a given distance. count.avg: The average raw contact count of pixels at a given distance. count.sum / n_total. When clr_weigh_name is provided (by default, clr_weigh_name="weight"), the following quantities (columns) will be added into the DataFrame: balanced.sum: Sum up balanced contact values of valid pixels at a given distance. Returned if clr_weight_name is not None. balanced.avg: The average balanced contact values of valid pixels at a given distance. balanced.sum / n_valid. 
Returned if clr_weight_name is not None. When smooth=True, the following quantities (columns) will be added into the DataFrame: count.avg.smoothed: Log-smoothed count.avg. Returned if smooth=True and clr_weight_name=None. balanced.avg.smoothed: Log-smoothed balanced.avg. Returned if smooth=True and clr_weight_name is not None. When aggregate_smoothed=True, the following quantities (columns) will be added into the DataFrame: count.avg.smoothed.agg: Aggregate Log-smoothed count.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name=None. balanced.avg.smoothed.agg: Aggregate Log-smoothed balanced.avg of all genome regions. Returned if smooth=True and aggregate_smoothed=True and clr_weight_name is not None. By default, clr_weight_name="weight", smooth=True, aggregate_smoothed=True, the output DataFrame includes all quantities (columns). """ if view_df is None: if not intra_only: raise ValueError( "asymmetric regions has to be smaller then full chromosomes, use view_df" ) else: # Generate viewframe from clr.chromsizes: view_df = make_cooler_view(clr) else: # Make sure view_df is a proper viewframe try: _ = is_compatible_viewframe( view_df, clr, # must be sorted for asymmetric case check_sorting=(not intra_only), raise_errors=True, ) except Exception as e: raise ValueError("view_df is not a valid viewframe or incompatible") from e # define transforms - balanced and raw ('count') for now cols = DEFAULT_BALANCED_CVD_COLS if clr_weight_name is None: # no transforms transforms = {} cols = DEFAULT_RAW_CVD_COLS elif is_cooler_balanced(clr, clr_weight_name): # define balanced data transform: weight1 = clr_weight_name + "1" weight2 = clr_weight_name + "2" transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]} else: raise ValueError( "cooler is not balanced, or" f"balancing weight {clr_weight_name} is not available in the cooler." f"Pass clr_weight_name=None explicitly to calculate expected on raw counts" ) # using try-clause to close mp.Pool properly if intra_only: result = diagsum_symm( clr, view_df, transforms=transforms, clr_weight_name=clr_weight_name, ignore_diags=ignore_diags, chunksize=chunksize, map=map_functor, ) else: result = diagsum_pairwise( clr, view_df, transforms=transforms, clr_weight_name=clr_weight_name, ignore_diags=ignore_diags, chunksize=chunksize, map=map_functor, ) # calculate actual averages by dividing sum by n_valid: for key in chain(["count"], transforms): if key == "count": result[f"{key}.avg"] = result[f"{key}.sum"] / result["n_total"] else: result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] # add dist_bp column, which shows distance in basepairs result.insert(3, "dist_bp", result[_DIST] * clr.binsize) # additional smoothing and aggregating options would add columns only, not replace them if smooth: result = per_region_smooth_cvd( result, sigma_log10=smooth_sigma, cols=cols, suffix=SMOOTH_SUFFIX, ) if aggregate_smoothed: result = genomewide_smooth_cvd( result, sigma_log10=smooth_sigma, cols=cols, suffix=SMOOTH_AGG_SUFFIX, ) # add contact_frequency columns to the result, which is the copy of the "most processing" contact values result.insert(4, "contact_frequency", result.iloc[:, -1]) return result
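
# Editorial usage sketch (not part of cooltools): computing smoothed cis
# expected with the default per-chromosome view. The cooler URI is
# hypothetical; pass a custom view_df (e.g. chromosome arms) to change the
# regions.
def _demo_expected_cis(cooler_uri="test.mcool::/resolutions/100000"):
    clr = cooler.Cooler(cooler_uri)
    cvd = expected_cis(clr, view_df=None, clr_weight_name="weight", nproc=1)
    # "contact_frequency" holds the most processed value,
    # here balanced.avg.smoothed.agg
    return cvd[["region1", "region2", "dist", "dist_bp", "contact_frequency"]]
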
# user-friendly wrapper for blocksum_pairwise - part of new "public" API
[docs]@pool_decorator def expected_trans( clr, view_df=None, clr_weight_name="weight", chunksize=10_000_000, nproc=1, map_functor=map, ): """ Calculate average interaction frequencies for inter-chromosomal blocks defined as pairwise combinations of regions in view_df. An expected level of interactions between disjoint chromosomes is calculated as a simple average, as there is no notion of genomic separation for a pair of chromosomes and contact matrix for these regions looks "flat". Average values are reported in the columns with names {}.avg, and they are calculated as a ratio between a corresponding sum {}.sum and the total number of "valid" pixels on the diagonal "n_valid". Parameters ---------- clr : cooler.Cooler Cooler object view_df : viewframe a collection of genomic intervals where expected is calculated otherwise expected is calculated for full chromosomes, has to be sorted. clr_weight_name : str or None Name of balancing weight column from the cooler to use. Use raw unbalanced data, when None. chunksize : int, optional Size of pixel table chunks to process nproc : int, optional How many processes to use for calculation. Ignored if map_functor is passed. map_functor : callable, optional Map function to dispatch the matrix chunks to workers. If left unspecified, pool_decorator applies the following defaults: if nproc>1 this defaults to multiprocess.Pool; If nproc=1 this defaults the builtin map. Returns ------- DataFrame with summary statistic for every trans-blocks: region1, region2, n_valid, count.sum count.avg, etc """ if view_df is None: # Generate viewframe from clr.chromsizes: view_df = make_cooler_view(clr) else: # Make sure view_df is a proper viewframe try: _ = is_compatible_viewframe( view_df, clr, # must be sorted for pairwise regions combinations # to be in the upper right of the heatmap check_sorting=True, raise_errors=True, ) except Exception as e: raise ValueError("view_df is not a valid viewframe or incompatible") from e # define transforms - balanced and raw ('count') for now if clr_weight_name is None: # no transforms transforms = {} elif is_cooler_balanced(clr, clr_weight_name): # define balanced data transform: weight1 = clr_weight_name + "1" weight2 = clr_weight_name + "2" transforms = {"balanced": lambda p: p["count"] * p[weight1] * p[weight2]} else: raise ValueError( "cooler is not balanced, or" f"balancing weight {clr_weight_name} is not available in the cooler." ) # execution details # using try-clause to close mp.Pool properly result = blocksum_pairwise( clr, view_df, transforms=transforms, clr_weight_name=clr_weight_name, chunksize=chunksize, map=map_functor, ) # keep only trans interactions for the user-friendly function: _name_to_region = view_df.set_index("name") _r1_chroms = _name_to_region.loc[result[_REGION1]]["chrom"].values _r2_chroms = _name_to_region.loc[result[_REGION2]]["chrom"].values # trans-data only: result = result.loc[_r1_chroms != _r2_chroms].reset_index(drop=True) # calculate actual averages by dividing sum by n_valid: for key in chain(["count"], transforms): result[f"{key}.avg"] = result[f"{key}.sum"] / result[_NUM_VALID] return result
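
# Editorial usage sketch (not part of cooltools): average trans interaction
# frequencies between whole chromosomes. The cooler URI is hypothetical.
def _demo_expected_trans(cooler_uri="test.mcool::/resolutions/100000"):
    clr = cooler.Cooler(cooler_uri)
    trans_exp = expected_trans(clr, clr_weight_name="weight", nproc=1)
    return trans_exp[["region1", "region2", "n_valid", "balanced.avg"]]
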
[docs]def diagsum_from_array( A, counts=None, *, offset=0, ignore_diags=2, filter_counts=False, region_name=None ): """ Calculates Open2C-formatted expected for a dense submatrix of a whole genome contact map. Parameters ---------- A : 2D array Normalized submatrix to calculate expected (``balanced.sum``). counts : 2D array or None, optional Corresponding raw contacts to populate ``count.sum``. offset : int or (int, int) i- and j- bin offsets of A relative to the parent matrix. If a single offset is provided it is applied to both axes. ignore_diags : int, optional Number of initial diagonals to ignore. filter_counts : bool, optional Apply the validity mask from balanced matrix to the raw one. Ignored when counts is None. region_name : str or (str, str), optional A custom region name or pair of region names. If provided, region columns will be included in the output. Notes ----- For regions that cross the main diagonal of the whole-genome contact map, the lower triangle "overhang" is ignored. Examples -------- >>> A = clr.matrix()[:, :] # whole genome balanced >>> C = clr.matrix(balance=False)[:, :] # whole genome raw Using only balanced data: >>> exp = diagsum_from_array(A) Using balanced and raw counts: >>> exp1 = diagsum_from_array(A, C) Using an off-diagonal submatrix >>> exp2 = diagsum_from_array(A[:50, 50:], offset=(0, 50)) """ if isinstance(offset, (list, tuple)): offset1, offset2 = offset else: offset1, offset2 = offset, offset if isinstance(region_name, (list, tuple)): region1, region2 = region_name elif region_name is not None: region1, region2 = region_name, region_name A = np.asarray(A, dtype=float) if counts is not None: counts = np.asarray(counts) if counts.shape != A.shape: raise ValueError("`counts` must have the same shape as `A`.") # Compute validity mask for bins on each axis invalid_mask1 = np.sum(np.isnan(A), axis=1) == A.shape[0] invalid_mask2 = np.sum(np.isnan(A), axis=0) == A.shape[1] A[~np.isfinite(A)] = 0 # Prepare an indicator matrix of "diagonals" (toeplitz) where the lower # triangle diagonals wrt the parent matrix are negative. # The "outer difference" operation below produces a toeplitz matrix. lo1, hi1 = offset1, offset1 + A.shape[0] lo2, hi2 = offset2, offset2 + A.shape[1] ar1 = np.arange(lo1, hi1, dtype=np.int32) ar2 = np.arange(lo2, hi2, dtype=np.int32) diag_indicator = ar2[np.newaxis, :] - ar1[:, np.newaxis] diag_lo = max(lo2 - hi1 + 1, 0) diag_hi = hi2 - lo1 # Apply the validity mask to the indicator matrix. # Both invalid and lower triangle pixels will now have negative indicator values. D = diag_indicator.copy() D[invalid_mask1, :] = -1 D[:, invalid_mask2] = -1 # Drop invalid and lower triangle pixels and flatten. mask_per_pixel = D >= 0 A_flat = A[mask_per_pixel] D_flat = D[mask_per_pixel] # Group by diagonal and aggregate the number of valid pixels and pixel values. diagonals = np.arange(diag_lo, diag_hi, dtype=int) n_valid = np.bincount(D_flat, minlength=diag_hi - diag_lo)[diag_lo:] balanced_sum = np.bincount(D_flat, weights=A_flat, minlength=diag_hi - diag_lo)[ diag_lo: ] # Mask to ignore initial diagonals. mask_per_diag = diagonals >= ignore_diags # Populate the output dataframe. # Include region columns if region names are provided. # Include raw pixel counts for each diag if counts is provided. df = pd.DataFrame({_DIST: diagonals, _NUM_VALID: n_valid}) if region_name is not None: df.insert(0, _REGION1, region1) df.insert(1, _REGION2, region2) if counts is not None: # Either count everything or apply the same filtering as A. 
if filter_counts: C_flat = counts[mask_per_pixel] count_sum = np.bincount( D_flat, weights=C_flat, minlength=diag_hi - diag_lo )[diag_lo:] else: mask_per_pixel = diag_indicator >= 0 D_flat = diag_indicator[mask_per_pixel] C_flat = counts[mask_per_pixel] count_sum = np.bincount( D_flat, weights=C_flat, minlength=diag_hi - diag_lo )[diag_lo:] count_sum[~mask_per_diag] = np.nan df["count.sum"] = count_sum balanced_sum[~mask_per_diag] = np.nan df["balanced.sum"] = balanced_sum return df
[docs]def logbin_expected( exp, summary_name="balanced.sum", bins_per_order_magnitude=10, bin_layout="fixed", smooth=lambda x: numutils.robust_gauss_filter(x, 2), min_nvalid=200, min_count=50, ): """ Logarithmically bins expected as produced by diagsum_symm method. Parameters ---------- exp : DataFrame DataFrame produced by diagsum_symm summary_name : str, optional Name of the column of exp-DataFrame to use as a diagonal summary. Default is "balanced.sum". bins_per_order_magnitude : int, optional How many bins per order of magnitude. Default of 10 has a ratio of neighboring bins of about 1.25 bin_layout : "fixed", "longest_region", or array "fixed" means that bins are exactly the same for different datasets, and only depend on bins_per_order_magnitude "longest_region" means that the last bin will end at size of the longest region. GOOD: the last bin will have as much data as possible. BAD: bin edges will end up different for different datasets, you can't divide them by each other array: provide your own bin edges. Can be of any size, and end at any value. Bins exceeding the size of the largest region will be simply ignored. smooth : callable A smoothing function to be applied to log(P(s)) and log(x) before calculating P(s) slopes for by-region data min_nvalid : int For each region, throw out bins (log-spaced) that have less than min_nvalid valid pixels This will ensure that each entree in Pc_by_region has at least n_valid valid pixels Don't set it to zero, or it will introduce bugs. Setting it to 1 is OK, but not recommended. min_count : int If counts are found in the data, then for each region, throw out bins (log-spaced) that have more than min_counts of counts.sum (raw Hi-C counts). This will ensure that each entree in Pc_by_region has at least min_count raw Hi-C reads Returns ------- Pc : DataFrame dataframe of contact probabilities and spread across regions slope : ndarray slope of Pc(s) on a log-log plot and spread across regions bins : ndarray an array of bin edges used for calculating P(s) Notes ----- For main Pc and slope, the algorithm is the following 1. concatenate all the expected for all regions into a large dataframe. 2. create logarithmically-spaced bins of diagonals (or use provided) 3. pool together n_valid and balanced.sum for each region and for each bin 4. calculate the average diagonal for each bucket, weighted by n_valid 5. divide balanced.sum by n_valid after summing for each bucket (not before) 6. calculate the slope in log space (for each region) X values are not midpoints of bins ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In step 4, we calculate the average diag index weighted by n_valid. This seems counter-intuitive, but it actually is justified. Let's take the worst case scenario. Let there be a bin from 40MB to 44MB. Let there be a region that is exactly 41 MB long. The midpoint of the bin is at 42MB. But the only part of this region belonging to this bin is actually between 40MB and 41MB. Moreover, the "average" read in this little triangle of the heatmap is actually not coming even from 40.5 MB because the triangle is getting narrower towards 41MB. The center of mass of a triangle is 1/3 of the way up, or 40.33 MB. So an average read for this region in this bin is coming from 40.33. Consider the previous bin, say, from 36MB to 40MB. The heatmap there is a trapezoid with a long side of 5MB, the short side of 1MB, and height of 4MB. The center of mass of this trapezoid is at 36 + 14/9 = 37.55MB, and not at 38MB. 
So the last bin center is definitely mis-assigned, and the second-to-last bin center is off by some 25%. This would lead to a 25% error of the P(s) slope estimated between the third-to-last and second-to-last bin. In presence of missing bins, this all becomes more complex, but this kind of averaging should take care of everything. It follows a general principle: when averaging the y values with some weights, one needs to average the x values with the same weights. The y values here are being added together, so per-diag means are effectively averaged with the weight of n_valid. Therefore, the x values (diag) should be averaged with the same weights. Other considerations ~~~~~~~~~~~~~~~~~~~~ Steps #3 and #5 are important because the ratio of sums does not equal to the sum of ratios, and the former is more correct (the latter is more susceptible to noise). It is generally better to divide at the very end, rather than dividing things for each diagonal. Here we divide at the end twice: first we divide balanced.sum by n_valid for each region, then we effectively multiply it back up and divide it for each bin when combining different regions (see weighted average in the next function). Smoothing P(s) for the slope ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For calcuating the slope, we apply smoothing to the P(s) to ensure the slope is not too noisy. There are several caveats here: the P(s) has to be smoothed in logspace, and both P and s have to be smoothed. It is discussed in detail here https://gist.github.com/mimakaev/4becf1310ba6ee07f6b91e511c531e73 Examples -------- For example, see this gist: https://gist.github.com/mimakaev/e9117a7fcc318e7904702eba5b47d9e6 """ def _get_diag_bins(bin_layout, diagmax, bins_per_order_magnitude): """ create the logbins themselves based on layout, maxdiag, etc. """ # create diag_bins based on chosen layout: if bin_layout == "fixed": diag_bins = numutils.persistent_log_bins( 10, bins_per_order_magnitude=bins_per_order_magnitude ) elif bin_layout == "longest_region": diag_bins = numutils.logbins( 1, diagmax + 1, ratio=10 ** (1 / bins_per_order_magnitude) ) elif isinstance(bin_layout, np.ndarray): diag_bins = bin_layout else: raise ValueError("bin_layout can be fixed, longest_region or an ndarray") if diag_bins[-1] < diagmax: raise ValueError( "Genomic separation bins end is less than the size of the largest region" ) return diag_bins def _get_weighted_expected( exp_filtered, diag_bins, digit_id_name, weighted_dist_name, Pc_name, summary_name, raw_summary_name, min_nvalid=0, min_count=0, ): """ given the logbins (diag_bins) and expected with digitized distances (and pre-filtered) calculate weighted distance per logbin and weighted expected. 
""" # digitize dist: assign diagonals in expected df to diag_bins, - give them ids: exp_filtered[digit_id_name] = ( np.searchsorted(diag_bins, exp_filtered[weighted_dist_name], side="right") - 1 ) exp_filtered = exp_filtered[ exp_filtered[digit_id_name] >= 0 ] # ignore those that do not fit into diag_bins # constructing expected grouped by region byReg = exp_filtered.copy() # this averages diag_avg with the weight equal to n_valid, and sums everything else byReg[weighted_dist_name] *= byReg[_NUM_VALID] # dist * n_valid byRegExp = byReg.groupby( [_REGION1, _REGION2, digit_id_name] ).sum() # sum in each logbin byRegExp[weighted_dist_name] /= byRegExp[ _NUM_VALID ] # sum(dist*n_valid) / sum(n_valid) byRegExp = byRegExp.reset_index() byRegExp = byRegExp[byRegExp[_NUM_VALID] > min_nvalid] # filtering by n_valid byRegExp[Pc_name] = byRegExp[summary_name] / byRegExp[_NUM_VALID] byRegExp = byRegExp[byRegExp[Pc_name] > 0] # drop diag_bins with 0 counts # try to filter by the matching raw number of interactions if min_count: if raw_summary_name in byRegExp: byRegExp = byRegExp[byRegExp[raw_summary_name] > min_count] else: warnings.warn( RuntimeWarning( f"{raw_summary_name} not found in the input expected" ) ) byRegExp["dist_bin_start"] = diag_bins[byRegExp[digit_id_name].to_numpy()] byRegExp["dist_bin_end"] = diag_bins[byRegExp[digit_id_name].to_numpy() + 1] - 1 return byRegExp def _get_slopes( logbin_exp, digit_id_name, weighted_dist_name, Pc_name, ): """ calculate derivative of P(s) (our logbinned weighted average expected) """ slope_name = "slope" # now calculate P(s) derivatives aka slopes per region byRegDer = [] for (reg1, reg2), subdf in logbin_exp.groupby([_REGION1, _REGION2]): subdf = subdf.sort_values(digit_id_name) valid = np.minimum( subdf[_NUM_VALID].to_numpy()[:-1], subdf[_NUM_VALID].to_numpy()[1:] ) # geometric mean of each logbin - aka mids mids = np.sqrt( subdf[weighted_dist_name].to_numpy()[:-1] * subdf[weighted_dist_name].to_numpy()[1:] ) slope = np.diff(smooth(np.log(subdf[Pc_name].to_numpy()))) / np.diff( smooth(np.log(subdf[weighted_dist_name].to_numpy())) ) newdf = pd.DataFrame( { weighted_dist_name: mids, slope_name: slope, _NUM_VALID: valid, digit_id_name: subdf[digit_id_name].to_numpy()[:-1], } ) newdf[_REGION1] = reg1 newdf[_REGION2] = reg2 byRegDer.append(newdf) byRegDer = pd.concat(byRegDer).reset_index(drop=True) return byRegDer raw_summary_name = "count.sum" exp_summary_base, *_ = summary_name.split(".") Pc_name = f"{exp_summary_base}.avg" diag_name = _DIST diag_avg_name = f"{diag_name}.avg" # filter expected from NaNs in summary column and copy (precaution) exp = exp.dropna( subset=[ summary_name, ] ).copy() # rename "dist" column dist.avg, it'll change later exp[diag_avg_name] = exp.pop(diag_name) # generate the "logbins", i.e. 
uneven bins for the diagonal distances diag_bins = _get_diag_bins( bin_layout=bin_layout, diagmax=exp[diag_avg_name].max(), bins_per_order_magnitude=bins_per_order_magnitude, ) # assign distances to logbins and calculate weight averages for dist and counts byRegExp = _get_weighted_expected( exp, diag_bins, digit_id_name="dist_bin_id", weighted_dist_name=diag_avg_name, Pc_name=f"{exp_summary_base}.avg", summary_name=summary_name, raw_summary_name="count.sum", min_nvalid=min_nvalid, min_count=min_count, ) # now calculate P(s) derivatives aka slopes per region byRegDer = _get_slopes( byRegExp, digit_id_name="dist_bin_id", weighted_dist_name=diag_avg_name, Pc_name=f"{exp_summary_base}.avg", ) # returning logbin expected, its derivative and lobins themselves: return byRegExp, byRegDer, diag_bins[: byRegExp["dist_bin_id"].max() + 2]
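
# Editorial usage sketch (not part of cooltools): log-bin a cis expected
# table. `cvd` is assumed to be the output of expected_cis()/diagsum_symm()
# for a balanced cooler, with "balanced.sum" and "n_valid" columns.
def _demo_logbin_expected(cvd):
    lb_cvd, lb_slopes, lb_bins = logbin_expected(cvd, summary_name="balanced.sum")
    # lb_cvd: per-region log-binned P(s); lb_slopes: its log-log derivative;
    # lb_bins: the bin edges in units of diagonals
    return lb_cvd, lb_slopes, lb_bins
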
[docs]def combine_binned_expected( binned_exp, binned_exp_slope=None, Pc_name="balanced.avg", der_smooth_function_combined=lambda x: numutils.robust_gauss_filter(x, 1.3), spread_funcs="logstd", spread_funcs_slope="std", minmax_drop_bins=2, concat_original=False, ): """ Combines by-region log-binned expected and slopes into genome-wide averages, handling small chromosomes and "corners" in an optimal fashion, robust to outliers. Calculates spread of by-chromosome P(s) and slopes, also in an optimal fashion. Parameters ---------- binned_exp: dataframe binned expected as outputed by logbin_expected binned_exp_slope : dataframe or None If provided, estimates spread of slopes. Is necessary if concat_original is True Pc_name : str Name of the column with the probability of contacts. Defaults to "balanced.avg". der_smooth_function_combined : callable A smoothing function for calculating slopes on combined data spread_funcs: "minmax", "std", "logstd" or a function (see below) A way to estimate the spread of the P(s) curves between regions. * "minmax" - use the minimum/maximum of by-region P(s) * "std" - use weighted standard deviation of P(s) curves (may produce negative results) * "logstd" (recommended) weighted standard deviation in logspace (as seen on the plot) spread_funcs_slope: "minmax", "std" or a funciton Similar to spread_func, but for slopes rather than P(s) concat_original: bool (default = False) Append original dataframe, and put combined under region "combined" Returns ------- scal, slope_df Notes ----- This function does not calculate errorbars. The spread is not the deviation of the mean, and rather is representative of variability between chromosomes. Calculating errorbars/spread 1. Take all by-region P(s) 2. For "minmax", remove the last var_drop_last_bins bins for each region (by default two. They are most noisy and would inflate the spread for the last points). Min/max are most susceptible to this. 3. Groupby P(s) by region 4. Apply spread_funcs to the pd.GroupBy object. Options are: * minimum and maximum ("minmax"), * weighted standard deviation ("std"), * weighted standard deviation in logspace ("logstd", default) or two custom functions We do not remove the last bins for "std" / "logstd" because we are doing weighted standard deviation. Therefore, noisy "ends" of regions would contribute very little to this. 5. Append them to the P(s) for the same bin. As a result, by for minmax, we do not estimate spread for the last two bins. This is because there are often very few chromosomal arms there, and different arm measurements are noisy. For other methods, we do estimate the spread there, and noisy last bins are taken care of by the weighted standard deviation. However, the spread in the last bins may be noisy, and may become a 0 if only one region is contributing to the last pixel. 
""" diag_avg_name = f"{_DIST}.avg" # combine pre-logbinned expecteds scal = numutils.weighted_groupby_mean( binned_exp[ [ Pc_name, "dist_bin_id", "n_valid", diag_avg_name, "dist_bin_start", "dist_bin_end", ] ], group_by="dist_bin_id", weigh_by="n_valid", mode="mean", ) # for every diagonal calculate the spread of expected if spread_funcs == "minmax": byRegVar = binned_exp.copy() byRegVar = byRegVar.loc[ byRegVar.index.difference( byRegVar.groupby(["region1", "region2"])["n_valid"] .tail(minmax_drop_bins) .index ) ] low_err = byRegVar.groupby("dist_bin_id")[Pc_name].min() high_err = byRegVar.groupby("dist_bin_id")[Pc_name].max() elif spread_funcs == "std": var = numutils.weighted_groupby_mean( binned_exp[[Pc_name, "dist_bin_id", "n_valid"]], group_by="dist_bin_id", weigh_by="n_valid", mode="std", )[Pc_name] low_err = scal[Pc_name] - var high_err = scal[Pc_name] + var elif spread_funcs == "logstd": var = numutils.weighted_groupby_mean( binned_exp[[Pc_name, "dist_bin_id", "n_valid"]], group_by="dist_bin_id", weigh_by="n_valid", mode="logstd", )[Pc_name] low_err = scal[Pc_name] / var high_err = scal[Pc_name] * var else: low_err, high_err = spread_funcs(binned_exp, scal) scal["low_err"] = low_err scal["high_err"] = high_err # re-calculate slope of the combined expected (log,smooth,diff) f = der_smooth_function_combined slope = np.diff(f(np.log(scal[Pc_name].values))) / np.diff( f(np.log(scal[diag_avg_name].values)) ) valid = np.minimum(scal["n_valid"].values[:-1], scal["n_valid"].values[1:]) mids = np.sqrt(scal[diag_avg_name].values[:-1] * scal[diag_avg_name].values[1:]) slope_df = pd.DataFrame( { diag_avg_name: mids, "slope": slope, "n_valid": valid, "dist_bin_id": scal.index.values[:-1], } ) slope_df = slope_df.set_index("dist_bin_id") # when pre-region slopes are provided, calculate spread of slopes if binned_exp_slope is not None: if spread_funcs_slope == "minmax": byRegDer = binned_exp_slope.copy() byRegDer = byRegDer.loc[ byRegDer.index.difference( byRegDer.groupby(["region1", "region2"])["n_valid"] .tail(minmax_drop_bins) .index ) ] low_err = byRegDer.groupby("dist_bin_id")["slope"].min() high_err = byRegDer.groupby("dist_bin_id")["slope"].max() elif spread_funcs_slope == "std": var = numutils.weighted_groupby_mean( binned_exp_slope[["slope", "dist_bin_id", "n_valid"]], group_by="dist_bin_id", weigh_by="n_valid", mode="std", )["slope"] low_err = slope_df["slope"] - var high_err = slope_df["slope"] + var else: low_err, high_err = spread_funcs_slope(binned_exp_slope, scal) slope_df["low_err"] = low_err slope_df["high_err"] = high_err slope_df = slope_df.reset_index() scal = scal.reset_index() # append "combined" expected/slopes to the input DataFrames (not in-place) if concat_original: scal["region"] = "combined" slope_df["region"] = "combined" scal = pd.concat([scal, binned_exp], sort=False).reset_index(drop=True) slope_df = pd.concat([slope_df, binned_exp_slope], sort=False).reset_index( drop=True ) return scal, slope_df
[docs]def interpolate_expected( expected, binned_expected, columns=["balanced.avg"], kind="quadratic", by_region=True, extrapolate_small_s=False, ): """ Interpolates expected to match binned_expected. Basically, this function smoothes the original expected according to the logbinned expected. It could either use by-region expected (each region will have different expected) or use combined binned_expected (all regions will have the same expected after that) Such a smoothed expected should be used to calculate observed/expected for downstream analysis. Parameters ---------- expected: pd.DataFrame expected as returned by diagsum_symm binned_expected: pd.DataFrame binned expected (combined or not) columns: list[str] (optional) Columns to interpolate. Must be present in binned_expected, but not necessarily in expected. kind: str (optional) Interpolation type, according to scipy.interpolate.interp1d by_region: bool or str (optional) Whether to do interpolation by-region (default=True). False means use one expected for all regions (use entire table). If a region name is provided, expected for that region is used. """ exp_int = expected.copy() gr_exp = exp_int.groupby( ["region1", "region2"] ) # groupby original expected by region if by_region is not False and ( ("region1" not in binned_expected) or ("region2" not in binned_expected) ): warnings.warn("Region columns not found, assuming combined expected") by_region = False if by_region is True: # groupby expected gr_binned = binned_expected.groupby(["region1", "region2"]) elif by_region is not False: # extract a region that we want to use binned_expected = binned_expected[binned_expected["region1"] == by_region] if by_region is not True: # check that we have no duplicates in expected assert len(binned_expected["dist_bin_id"].drop_duplicates()) == len( binned_expected ) interp_dfs = [] for (reg1, reg2), df_orig in gr_exp: if by_region is True: # use binned expected for this region if (reg1, reg2) not in gr_binned.groups: continue subdf = gr_binned.get_group((reg1, reg2)) else: subdf = binned_expected diag_orig = df_orig[_DIST].to_numpy() diag_mid = (subdf["dist_bin_start"] + subdf["dist_bin_end"]) / 2 interp_df = pd.DataFrame( index=df_orig.index ) # df to put interpolated values in with np.errstate(invalid="ignore", divide="ignore"): for colname in columns: # interpolate each column value_column = subdf[colname] interp = interp1d( np.log(diag_mid), np.log(value_column), kind=kind, fill_value="extrapolate", ) interp_df[colname] = np.exp(interp(np.log(diag_orig))) if not extrapolate_small_s: mask = diag_orig >= subdf["dist_bin_start"].min() interp_df = interp_df.iloc[mask] interp_dfs.append(interp_df) interp_df = pd.concat(interp_dfs) for i in interp_df.columns: exp_int[i] = interp_df[i] return exp_int
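
# Editorial usage sketch (not part of cooltools): combine per-region
# log-binned expected into a genome-wide curve and use it to smooth the
# original expected. `cvd` is assumed to be a cis expected table with
# "balanced.sum", "balanced.avg" and "n_valid" columns.
def _demo_combine_and_interpolate(cvd):
    lb_cvd, lb_slopes, _ = logbin_expected(cvd)
    combined, combined_slopes = combine_binned_expected(
        lb_cvd, binned_exp_slope=lb_slopes
    )
    # use the combined curve for all regions
    return interpolate_expected(
        cvd, combined, columns=["balanced.avg"], by_region=False
    )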