"""
Collection of functions related to dot-calling
The main user-facing API function is:
.. code-block:: python
dots(
clr,
expected,
expected_value_col="balanced.avg",
clr_weight_name="weight",
view_df=None,
kernels=None,
max_loci_separation=10_000_000,
max_nans_tolerated=1,
n_lambda_bins=40,
lambda_bin_fdr=0.1,
clustering_radius=20_000,
cluster_filtering=None,
tile_size=5_000_000,
nproc=1,
)
This function implements HiCCUPS-style dot calling, but enables user-specified
modifications at multiple steps. The current implementation makes two passes
over the input data, first to create a histogram of pixel enrichment values,
and second to extract significantly enriched pixels.
- The function starts with compatibility verifications
- Recommendation or verification for `kernels` is done next.
Custom kernels must satisfy properties including: square shape,
equal sizes, odd sizes, zeros in the middle, etc. By default,
HiCCUPS-style kernels are recommended based on the binsize.
- Lambda bins are defined for multiple hypothesis
testing separately for different value ranges of the locally adjusted expected.
Currently, log-binned lambda-bins are hardcoded using a pre-defined
BASE of 2^(1/3). `n_lambda_bins` controls the total number of bins
for the `clr`, `expected` and `view` of interest.
- Genomic regions in the specified `view` (all chromosomes by default)
are split into smaller tiles of size `tile_size`.
- `scoring_and_histogramming_step()` is performed independently
on the genomic tiles. In this step, locally adjusted expected is
calculated using convolution kernels for each pixel in the tile.
All surveyed pixels are histogrammed according to their adjusted
expected and raw observed counts. Locally adjusted expected is
not stored in memory.
- Chunks of histograms are aggregated together and a modified BH-FDR
procedure is applied to the result in `determine_thresholds()`.
This returns thresholds for statistical significance
in each lambda-bin (for observed counts), along with the adjusted
p-values (q-values).
- Calculated thresholds are used to extract statistically significant
pixels in `scoring_and_extraction_step()`. Because locally adjusted
expected is not stored in memory, it is re-calculated
during this step, which makes it computationally intensive.
Locally adjusted expected values are required in order to apply
different thresholds of significance depending on the lambda-bin.
- Returned filtered pixels, or 'dots', are significantly enriched
relative to their locally adjusted expecteds and thus have potential
biological interest. Dots are further annotated with their
genomic coordinates and q-values (adjusted p-values) for
all applied kernels.
- All further steps perform optional post-processing on called dots
- enriched pixels that are within `clustering_radius` of each other
are clustered together and the brightest one is selected as the
representative position of a dot.
- cluster-representatives along with "singletons" (enriched pixels
that are not part of any cluster) can be subjected to further
empirical enrichment filtering in `cluster_filtering_hiccups()`. This
requires both that clustered dots exceed prescribed enrichment thresholds
relative to their local neighborhoods and that singletons pass an
even more stringent q-value threshold.
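A minimal end-to-end usage sketch (the file name here is hypothetical;
`cooltools.expected_cis` is one way to obtain the cis expected table for
the same cooler and view):
.. code-block:: python
import cooler
import cooltools
clr = cooler.Cooler("test.10000.cool")
expected_df = cooltools.expected_cis(clr, nproc=4)
dot_calls_df = dots(clr, expected_df, nproc=4)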
"""
from functools import partial, reduce
import logging
import warnings
import time
from scipy.ndimage import convolve
from scipy.stats import poisson
import numpy as np
import pandas as pd
from sklearn.cluster import Birch
import cooler
from ..lib.numutils import LazyToeplitz, get_kernel
from ..lib.common import assign_regions, make_cooler_view
from ..lib.checks import (
is_cooler_balanced,
is_compatible_viewframe,
is_valid_expected,
)
from ..lib.common import pool_decorator
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
logging.basicConfig(level=logging.INFO)
# this is to mitigate and parameterize the obs.raw vs count controversy:
observed_count_name = "count"
expected_count_name = "exp.raw"
adjusted_exp_name = lambda kernel_name: f"la_exp.{kernel_name}.value"
nans_inkernel_name = lambda kernel_name: f"la_exp.{kernel_name}.nnans"
bin1_id_name = "bin1_id"
bin2_id_name = "bin2_id"
bedpe_required_cols = [
"chrom1",
"start1",
"end1",
"chrom2",
"start2",
"end2",
]
# define basepairs to bins for clarity
def bp_to_bins(basepairs, binsize):
return int(basepairs / binsize)
def recommend_kernels(binsize):
"""
Return a recommended set of convolution kernels for dot-calling
based on the resolution, or binsize, of the input data.
This function currently recommends the four kernels used in the HiCCUPS method:
donut, horizontal, vertical, lowerleft. Kernels are recommended for resolutions
near 5 kb, 10 kb, and 25 kb. Dots are not typically visible at lower resolutions
(binsize >28kb) and the majority of datasets are too sparse for dot-calling
at very high resolutions (<4kb). Given this, default kernels are not
recommended for resolutions outside this range.
Parameters
----------
binsize : integer
binsize of the provided cooler
Returns
-------
kernels : {str:ndarray}
dictionary of convolution kernels as ndarrays, with their
names as keys.
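Examples
--------
A quick sketch of the recommended kernel names for a 10 kb cooler
(the kernel arrays themselves are omitted here):
>>> kernels = recommend_kernels(10_000)
>>> sorted(kernels)
['donut', 'horizontal', 'lowleft', 'vertical']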
"""
# define "donut" parameters w and p based on the resolution of the data
# w is the outside half-width, and p is the internal half-width.
if binsize > 28000:
raise ValueError(
f"Provided cooler has resolution of {binsize} bases,"
" which is too coarse for automated kernel recommendation."
" Provide custom kernels to proceed."
)
elif binsize >= 18000:
w, p = 3, 1
elif binsize >= 8000:
w, p = 5, 2
elif binsize >= 4000:
w, p = 7, 4
else:
raise ValueError(
f"Provided cooler has resolution {binsize} bases,"
" which is too fine for automated kernel recommendation. Provide custom kernels"
" to proceed."
)
logging.info(
f"Using recommended donut-based kernels with w={w}, p={p} for binsize={binsize}"
)
# standard set of 4 kernels used in Rao et al 2014
# 'upright' is a symmetrical inversion of "lowleft", not needed.
kernel_types = ["donut", "vertical", "horizontal", "lowleft"]
# generate standard kernels - consider providing custom ones
kernels = {k: get_kernel(w, p, k) for k in kernel_types}
return kernels
def is_compatible_kernels(kernels, binsize, max_nans_tolerated):
"""
TODO implement checks for kernels:
- matrices are of the same size
- they should be squared (too restrictive ? maybe pad with 0 as needed)
- dimensions are odd, to have a center pixel to refer to
- they can be turned into int 1/0 ones (too restrictive ? allow weighted kernels ?)
- the central pixel should be zero perhaps (unless weights are allowed 4sure)
- maybe introduce an upper limit to the size - to avoid crazy long calculations
- check relative to the binsize maybe ? what's the criteria ?
"""
# kernels must be a dict with kernel-names as keys
# and kernel ndarrays as values.
if not isinstance(kernels, dict):
raise ValueError(
"'kernels' must be a dictionary" "with name-keys and ndarrays-values."
)
# deduce kernel_width - overall footprint
kernel_widths = [len(k) for kn, k in kernels.items()]
# kernels must have the same width for now:
if min(kernel_widths) != max(kernel_widths):
raise ValueError(f"all 'kernels' must have the same size, now: {kernel_widths}")
# now extract their dimensions:
kernel_width = max(kernel_widths)
kernel_half_width = (kernel_width - 1) / 2 # former w parameter
if (kernel_half_width <= 0) or not kernel_half_width.is_integer():
raise ValueError(
f"Size of the convolution kernels has to be odd and > 3, currently {kernel_width}"
)
# once kernel parameters are set up, check max_nans_tolerated
# to make sure kernel footprints overlapping 1 side with the
# NaN-filled row/column are not "allowed"
if not max_nans_tolerated <= kernel_width:
raise ValueError(
f"Too many NaNs allowed max_nans_tolerated={max_nans_tolerated}"
)
# may lead to scoring the same pixel twice, - i.e. duplicates.
# return True if everything passes
return True
def annotate_pixels_with_qvalues(pixels_df, qvalues, obs_raw_name=observed_count_name):
"""
Add columns with the qvalues to a DataFrame of scored pixels
Parameters
----------
pixels_df : pandas.DataFrame
a DataFrame with pixel coordinates that must have at least 2 columns
named 'bin1_id' and 'bin2_id', where the first is the pixel's row and the
second is the pixel's column index.
qvalues : dict of DataFrames
A dictionary with keys being kernel names and values DataFrames
storing q-values for each observed count value in each lambda-
bin. Columns are Intervals defined by 'ledges' boundaries.
Rows correspond to a range of observed count values.
obs_raw_name : str
Name of the column/field that carries the number of counts per pixel,
i.e. observed raw counts.
Returns
-------
pixels_qvalue_df : pandas.DataFrame
DataFrame of pixels with additional columns la_exp.{k}.qval,
storing q-values (adjusted p-values) corresponding to the count
value of a pixel, its kernel, and a lambda-bin it belongs to.
"""
# do it "safe" - using a copy:
pixels_qvalue_df = pixels_df.copy()
# columns to return
cols = list(pixels_qvalue_df.columns)
# will do it efficiently using "melted" qvalues table:
for k, qval_df in qvalues.items():
lbins = pd.IntervalIndex(qval_df.columns)
pixels_qvalue_df["lbins"] = pd.cut(
pixels_qvalue_df[f"la_exp.{k}.value"], bins=lbins
)
pixels_qvalue_df = pixels_qvalue_df.merge(
# melted qval_df columns: [counts, la_exp.k.value, value]
qval_df.melt(ignore_index=False).reset_index(),
left_on=[obs_raw_name, "lbins"],
right_on=[obs_raw_name, f"la_exp.{k}.value"],
suffixes=("", "_"),
)
qval_col_name = f"la_exp.{k}.qval"
pixels_qvalue_df = pixels_qvalue_df.rename(columns={"value": qval_col_name})
cols.append(qval_col_name)
# return q-values annotated pixels
return pixels_qvalue_df.loc[:, cols]
def clust_2D_pixels(
pixels_df,
threshold_cluster=2,
bin1_id_name="bin1_id",
bin2_id_name="bin2_id",
clust_label_name="c_label",
clust_size_name="c_size",
):
"""
Group significant pixels by proximity using Birch clustering. We use
"n_clusters=None", which implies no AgglomerativeClustering, and thus
simply report "blobs" of pixels with radii <= "threshold_cluster", along
with the corresponding blob centroids.
Parameters
----------
pixels_df : pandas.DataFrame
a DataFrame with pixel coordinates that must have at least 2 columns
named 'bin1_id' and 'bin2_id', where the first is the pixel's row and the
second is the pixel's column index.
threshold_cluster : int
clustering radius for Birch clustering derived from ~40kb radius of
clustering and bin size.
bin1_id_name : str
Name of the 1st coordinate (row index) in 'pixel_df', by default
'bin1_id'. 'start1/end1' could be useful as well.
bin2_id_name : str
Name of the 2nd coordinate (column index) in 'pixel_df', by default
'bin2_id'. 'start2/end2' could be useful as well.
clust_label_name : str
Name of the cluster of pixels label. "c_label" by default.
clust_size_name : str
Name of the cluster of pixels size. "c_size" by default.
Returns
-------
peak_tmp : pandas.DataFrame
DataFrame with the following columns:
[c+bin1_id_name, c+bin2_id_name, clust_label_name, clust_size_name]
row/col (bin1/bin2) are coordinates of centroids,
label and sizes are unique pixel-cluster
labels and their corresponding sizes.
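Examples
--------
A toy sketch with three synthetic pixels: the two adjacent ones are expected
to form a single cluster of size 2, while the distant one stays a singleton:
>>> pixels = pd.DataFrame({"bin1_id": [10, 11, 100], "bin2_id": [20, 20, 200]})
>>> clustered = clust_2D_pixels(pixels, threshold_cluster=2)
>>> # clustered["c_size"] reports the size of the cluster each pixel belongs to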
"""
# extract pixel coordinates (bin1 is the row, bin2 is the column) as float64:
pixels = pixels_df[[bin1_id_name, bin2_id_name]].values.astype(np.float64)
# added astype(float) to avoid further issues with clustering, as it
# turned out start1/start2 genome coordinates could be int32 or int64
# and int32 is not enough for some operations, i.e., integer overflow.
pixel_idxs = pixels_df.index
# perform BIRCH clustering of pixels:
# "n_clusters=None" implies using BIRCH without AgglomerativeClustering,
# thus simply reporting "blobs" of pixels of radius "threshold_cluster"
# along with blob-centroids as well:
brc = Birch(
n_clusters=None,
threshold=threshold_cluster,
# branching_factor=50, (it's default)
compute_labels=True,
)
brc.fit(pixels)
# # following is redundant,
# # as it's done here:
# # https://github.com/scikit-learn/scikit-learn/blob/a24c8b464d094d2c468a16ea9f8bf8d42d949f84/sklearn/cluster/birch.py#L638
# clustered_labels = brc.predict(pixels)
# labels of nearest centroid, assigned to each pixel,
# BEWARE: labels might not be continuous, i.e.,
# "np.unique(clustered_labels)" isn't same as "brc.subcluster_labels_", because:
# https://github.com/scikit-learn/scikit-learn/blob/a24c8b464d094d2c468a16ea9f8bf8d42d949f84/sklearn/cluster/birch.py#L576
clustered_labels = brc.labels_
# centroid coordinates ( <= len(clustered_labels)):
clustered_centroids = brc.subcluster_centers_
# count unique labels and get their continuous indices
uniq_labels, inverse_idx, uniq_counts = np.unique(
clustered_labels, return_inverse=True, return_counts=True
)
# cluster sizes taken to match labels:
cluster_sizes = uniq_counts[inverse_idx]
# take centroids corresponding to labels (as many as needed):
centroids_per_pixel = np.take(clustered_centroids, clustered_labels, axis=0)
# clustering message:
logging.info(
f"detected {uniq_counts.size} clusters of {uniq_counts.mean():.2f}+/-{uniq_counts.std():.2f} size"
)
# create output DataFrame
centroids_n_labels_df = pd.DataFrame(
centroids_per_pixel,
index=pixel_idxs,
columns=["c" + bin1_id_name, "c" + bin2_id_name],
)
# add labels per pixel:
centroids_n_labels_df[clust_label_name] = clustered_labels.astype(np.int64)
# add cluster sizes:
centroids_n_labels_df[clust_size_name] = cluster_sizes.astype(np.int64)
return centroids_n_labels_df
##################################
# matrix tiling and tiles-generator
##################################
def tile_square_matrix(matrix_size, offset, tile_size, pad=0):
"""
Generate a stream of coordinates of tiles that cover a matrix of a given size.
The matrix has to be a square, on-diagonal one: e.g. corresponding to a chromosome
or a chromosomal arm.
Parameters
----------
matrix_size : int
Size of a square matrix
offset : int
Offset coordinates of generated tiles by 'offset'
tile_size : int
Requested size of the tiles. Tiles near
the right and bottom edges could be rectangular
and smaller than 'tile_size'
pad : int
Small padding around each tile to be included in the yielded coordinates.
Yields
------
Pairs of indices/coordinates of every tile: (start_i, end_i), (start_j, end_j)
Notes
-----
Generated tile coordinates [start_i, end_i), [start_j, end_j)
can be used to fetch heatmap tiles from cooler:
>>> clr.matrix()[start_i:end_i, start_j:end_j]
'offset' is useful when a given matrix is part of a
larger matrix (a given chromosome or arm), and thus
all coordinates need to be offset to get absolute
coordinates.
Tiles are non-overlapping (pad=0), but tiles near
the right and bottom edges could be rectangular:
* * * * * * * * *
* * * *
* * * *
* * * * * * * *
* * *
* * ... *
* * * * *
* *
* * * * * * * * *
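For example, splitting a 10x10 matrix into tiles of size 4 without padding
yields a 3x3 grid of nine tiles:
>>> tiles = list(tile_square_matrix(10, offset=0, tile_size=4, pad=0))
>>> len(tiles)
9
>>> tiles[0]
((0, 4), (0, 4))
>>> tiles[-1]
((8, 10), (8, 10))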
"""
# number of tiles along each axis
if matrix_size % tile_size:
num_tiles = matrix_size // tile_size + 1
else:
num_tiles = matrix_size // tile_size
logging.debug(
f" matrix {matrix_size}X{matrix_size} to be split into {num_tiles * num_tiles} tiles of {tile_size}X{tile_size}."
)
if pad:
logging.debug(
f" tiles are padded (width={pad}) to enable convolution near the edges"
)
# generate 'num_tiles X num_tiles' tiles
for ti in range(num_tiles):
for tj in range(num_tiles):
start_i = max(0, tile_size * ti - pad)
start_j = max(0, tile_size * tj - pad)
end_i = min(matrix_size, tile_size * (ti + 1) + pad)
end_j = min(matrix_size, tile_size * (tj + 1) + pad)
yield (start_i + offset, end_i + offset), (start_j + offset, end_j + offset)
def generate_tiles_diag_band(clr, view_df, pad_size, tile_size, band_to_cover):
"""
A generator yielding coordinates of heatmap tiles that are needed to cover
the requested band_to_cover around diagonal. Each tile is "padded" with
the pad of size 'pad_size' to allow for convolution near the boundary of
a tile.
Parameters
----------
clr : cooler
Cooler object to use to extract chromosome extents.
view_df : viewframe
Viewframe with genomic regions to process, chrom, start, end, name.
pad_size : int
Size of padding around each tile. Typically the outer size of the
kernel.
tile_size : int
Size of the heatmap tile.
band_to_cover : int
Size of the diagonal band to be covered by the generated tiles.
Typically corresponds to the max_loci_separation for called dots.
Returns
-------
tile_coords : tuple
Generator of tile coordinates, i.e. tuples of three:
(region_name, tile_span_i, tile_span_j), where 'tile_span_i/j'
each is a tuple of bin ids (bin_start, bin_end).
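Notes
-----
A tile is yielded only when its overlap with the diagonal band exceeds
2*pad_size. A toy numeric check of that condition for a hypothetical tile:
>>> pad_size, band_to_cover = 3, 100
>>> tile_span_i, tile_span_j = (0, 50), (40, 90)
>>> tile_diag_start = tile_span_j[0] - tile_span_i[1]
>>> tile_diag_end = tile_span_j[1] - tile_span_i[0]
>>> min(band_to_cover, tile_diag_end) - max(0, tile_diag_start) > 2 * pad_size
True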
"""
for chrom, start, end, region_name in view_df.itertuples(index=False):
region_start, region_end = clr.extent((chrom, start, end))
region_size = region_end - region_start
for tile_span_i, tile_span_j in tile_square_matrix(
matrix_size=region_size,
offset=region_start,
tile_size=tile_size,
pad=pad_size,
):
# check if a given tile intersects
# with the diagonal band of interest ...
tile_diag_start = tile_span_j[0] - tile_span_i[1]
tile_diag_end = tile_span_j[1] - tile_span_i[0]
# TODO allow more flexible definition of a band to cover
band_start, band_end = 0, band_to_cover
# we are using this >2*padding trick to exclude
# tiles from the lower triangle from calculations ...
if (
min(band_end, tile_diag_end) - max(band_start, tile_diag_start)
) > 2 * pad_size:
yield region_name, tile_span_i, tile_span_j
########################################################################
# this is the MAIN function to get locally adjusted expected
########################################################################
def get_adjusted_expected_tile_some_nans(
origin_ij, observed, expected, bal_weights, kernels
):
"""
Get locally adjusted expected for a collection of local-filters (kernels).
Such locally adjusted expected, 'Ek' for a given kernel,
can serve as a baseline for deciding whether a given
pixel is enriched enough to call it a feature (dot-loop,
flare, etc.) in a downstream analysis.
For every pixel of interest [i,j], locally adjusted
expected is a product of a global expected in that
pixel E_bal[i,j] and an enrichment of local environ-
ment of the pixel, described with a given kernel:
::
KERNEL[i,j](O_bal)
Ek_bal[i,j] = E_bal[i,j]* ------------------
KERNEL[i,j](E_bal)
where KERNEL[i,j](X) is a result of convolution
between the kernel and a slice of matrix X centered
around (i,j). See link below for details:
https://en.wikipedia.org/wiki/Kernel_(image_processing)
Returned values for observed and all expecteds
are rescaled back to raw-counts, for the sake of
downstream statistical analysis, which is using
Poisson test to decide if a given pixel is enriched.
(comparison between balanced values using Poisson-
test is intractable):
::
KERNEL[i,j](O_bal)
Ek_raw[i,j] = E_raw[i,j]* ------------------ ,
KERNEL[i,j](E_bal)
where E_raw[i,j] is:
::
1 1
-------------- * -------------- * E_bal[i,j]
bal_weights[i] bal_weights[j]
Parameters
----------
origin_ij : (int,int) tuple
tuple of integers that specifies the
location of an observed matrix slice.
Measured in bins, not in nucleotides.
observed : numpy.ndarray
square symmetrical dense-matrix
that contains balanced observed O_bal
expected : numpy.ndarray
square symmetrical dense-matrix
that contains expected, calculated
based on balanced observed: E_bal.
bal_weights : numpy.ndarray or (numpy.ndarray, numpy.ndarray)
1D vector used to turn raw observed
into balanced observed for a slice of
a matrix with the origin_ij on the diagonal;
and a tuple/list of a couple of 1D arrays
in case it is a slice with an arbitrary
origin_ij.
kernels : dict of (str, numpy.ndarray)
dictionary of kernels/masks to perform
convolution of the heatmap. Kernels
describe the local environment, and are
used to estimate the baseline for finding
enriched/prominent peaks.
A peak must be enriched with respect to
all local environments (all kernels)
to be considered significant.
Dictionary keys must contain names for
each kernel.
Note, scipy.ndimage.convolve first flips kernel
and only then applies it to matrix.
Returns
-------
peaks_df : pandas.DataFrame
DataFrame with the results of locally adjusted calculations
for every kernel for a given slice of input matrix.
Notes
-----
Reported columns:
bin1_id - bin1_id index (row), adjusted to tile_start_i
bin2_id - bin2_id index (column), adjusted to tile_start_j
la_exp - locally adjusted expected (for each kernel)
la_nan - number of NaNs around (each kernel's footprint)
exp.raw - global expected, rescaled to raw-counts
obs.raw(counts) - observed values in raw-counts.
Depending on the initial tiling of the interaction matrix,
concatenated `peaks_df` may require "deduplication", as some pixels
can be evaluated in several tiles (e.g. near the tile edges).
Default tiling in the `dots` function should avoid this problem.
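A toy numeric sketch of the adjustment for a single pixel (made-up values):
>>> KO, KE = 12.0, 4.0    # kernel-weighted sums of O_bal and E_bal around (i, j)
>>> E_raw_ij = 2.0        # global expected at (i, j), rescaled to raw counts
>>> E_raw_ij * (KO / KE)  # locally adjusted expected Ek_raw[i, j]
6.0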
"""
# extract origin_ij coordinate of this tile:
io, jo = origin_ij
# let's extract full matrices and ice_vector:
O_raw = observed # raw observed, no need to copy, no modifications.
E_bal = np.copy(expected)
# 'bal_weights': ndarray or a couple of those ...
if isinstance(bal_weights, np.ndarray):
v_bal_i = bal_weights
v_bal_j = bal_weights
elif isinstance(bal_weights, (tuple, list)):
v_bal_i, v_bal_j = bal_weights
else:
raise ValueError(
"'bal_weights' must be an numpy.ndarray"
"for slices of a matrix with diagonal-origin or"
"a tuple/list of a couple of numpy.ndarray-s"
"for a slice of matrix with an arbitrary origin."
)
# prepare matrix of balancing weights Ci*Cj
bal_weights_ij = np.outer(v_bal_i, v_bal_j)
# balanced observed, from raw-observed
# by element-wise multiply:
O_bal = np.multiply(O_raw, bal_weights_ij)
# O_bal is separate from O_raw memory-wise.
# fill lower triangle of O_bal and E_bal with NaNs
# in order to prevent peak calling from the lower triangle
# and also to provide fair locally adjusted expected
# estimation for pixels very close to diagonal, whose
# "donuts"(kernels) would be crossing the main diagonal.
# The trickiest thing here would be dealing with the origin_ij: io,jo.
O_bal[np.tril_indices_from(O_bal, k=(io - jo) - 1)] = np.nan
E_bal[np.tril_indices_from(E_bal, k=(io - jo) - 1)] = np.nan
# raw E_bal: element-wise division of E_bal[i,j] and
# v_bal[i]*v_bal[j]:
E_raw = np.divide(E_bal, bal_weights_ij)
# let's calculate a matrix of common NaNs
# shared between observed and expected:
# check if it's redundant ? (is NaNs from O_bal sufficient? )
N_bal = np.logical_or(np.isnan(O_bal), np.isnan(E_bal))
# fill in common nan-s with zeroes, preventing
# NaNs during convolution:
O_bal[N_bal] = 0.0
E_bal[N_bal] = 0.0
E_raw[N_bal] = 0.0 # todo: consider filtering those altogether
# think about using copyto and where functions later:
# https://stackoverflow.com/questions/6431973/how-to-copy-data-from-a-numpy-array-to-another
#
# we are going to accumulate all the results
# into a DataFrame, keeping NaNs, and other
# unfiltered results (even the lower triangle for now):
i, j = np.indices(O_raw.shape)
# pack it into DataFrame to accumulate results:
peaks_df = pd.DataFrame({"bin1_id": i.ravel() + io, "bin2_id": j.ravel() + jo, "count": O_raw.ravel()})
with np.errstate(divide="ignore", invalid="ignore"):
for kernel_name, kernel in kernels.items():
###############################
# kernel-specific calculations:
###############################
# kernel paramters such as width etc
# are taken into account implicitly ...
########################################
# a matrix filled with the kernel-weighted sums
# based on a balanced observed matrix:
KO = convolve(O_bal, kernel, mode="constant", cval=0.0, origin=0)
# a matrix filled with the kernel-weighted sums
# based on a balanced expected matrix:
KE = convolve(E_bal, kernel, mode="constant", cval=0.0, origin=0)
# get number of NaNs in a vicinity of every
# pixel (kernel's nonzero footprint)
# based on the NaN-matrix N_bal.
# N_bal is shared NaNs between O_bal and E_bal,
NN = convolve(
N_bal.astype(np.int64),
(kernel != 0).astype(np.int64), # non-zero footprint as kernel
mode="constant",
cval=1, # NaNs beyond boundaries
origin=0,
)
######################################
# using cval=0 for actual data and
# cval=1 for NaNs matrix reduces
# "boundary issue" to the "number of
# NaNs"-issue
# ####################################
# now finally, E_raw*(KO/KE), as the
# locally-adjusted expected with raw counts as values:
local_adjustment_factor = np.divide(KO, KE)
Ek_raw = np.multiply(E_raw, local_adjustment_factor)
logging.debug(
f"Convolution with kernel {kernel_name} is done for tile @ {io} {jo}."
)
# accumulation into single DataFrame:
# store locally adjusted expected for each kernel
# and number of NaNs in the footprint of each kernel
peaks_df[f"la_exp.{kernel_name}.value"] = Ek_raw.ravel()
peaks_df[f"la_exp.{kernel_name}.nnans"] = NN.ravel()
# division by KE=0 has to be treated separately:
peaks_df[f"safe_division.{kernel_name}"] = np.isfinite(local_adjustment_factor.ravel())
# do all the filter/logic/masking etc on the complete DataFrame ...
return peaks_df
##################################
# step-specific dot-calling functions
##################################
def score_tile(
tile_cij,
clr,
expected_indexed,
expected_value_col,
clr_weight_name,
kernels,
max_nans_tolerated,
band_to_cover,
):
"""
The main working function that, given a tile of a heatmap, applies kernels to
perform convolution and calculate locally-adjusted expected (la_exp) values
for every meaningful pixel; statistical testing against these la_exp values
is performed downstream.
Parameters
----------
tile_cij : tuple
Tuple of 3: region name, tile span row-wise, tile span column-wise:
(region, tile_span_i, tile_span_j), where tile_span_i = (start_i, end_i), and
tile_span_j = (start_j, end_j).
clr : cooler
Cooler object to use to extract Hi-C heatmap data.
expected_indexed : pandas.DataFrame
DataFrame with cis-expected, indexed with 'region1', 'region2', 'dist'.
expected_value_col : str
Name of a value column in expected DataFrame
clr_weight_name : str
Name of a value column with balancing weights in a cooler.bins()
DataFrame. Typically 'weight'.
kernels : dict
A dictionary with keys being kernels names and values being ndarrays
representing those kernels.
max_nans_tolerated : int
Number of NaNs tolerated in a footprint of every kernel.
band_to_cover : int
Results would be stored only for pixels connecting loci closer than
'band_to_cover'.
Returns
-------
res_df : pandas.DataFrame
results: eligible pixels of a given tile, with their raw observed counts
and the locally adjusted expected calculated for every kernel, in the
form of an annotated pixels DataFrame.
"""
# unpack tile's coordinates
region_name, tile_span_i, tile_span_j = tile_cij
tile_start_ij = (tile_span_i[0], tile_span_j[0])
# we have to do it for every tile, because
# region_name is not known a priori (maybe move outside)
# use .loc[region, region] for symmetric cis regions to conform with expected v1.0
lazy_exp = LazyToeplitz(
expected_indexed.loc[region_name, region_name][expected_value_col].to_numpy()
)
# RAW observed matrix slice:
observed = clr.matrix(balance=False)[slice(*tile_span_i), slice(*tile_span_j)]
# expected as a rectangular tile :
expected = lazy_exp[slice(*tile_span_i), slice(*tile_span_j)]
# slice of balance_weight for row-span and column-span :
bal_weight_i = clr.bins()[slice(*tile_span_i)][clr_weight_name].to_numpy()
bal_weight_j = clr.bins()[slice(*tile_span_j)][clr_weight_name].to_numpy()
# do the convolutions
result = get_adjusted_expected_tile_some_nans(
origin_ij=tile_start_ij,
observed=observed,
expected=expected,
bal_weights=(bal_weight_i, bal_weight_j),
kernels=kernels,
)
# Post-processing filters
# (0) keep only upper-triangle pixels:
upper_band = result["bin1_id"] < result["bin2_id"]
# (1) exclude pixels that connect loci further than 'band_to_cover' apart:
is_inside_band = result["bin1_id"] > (result["bin2_id"] - band_to_cover)
# (2) identify pixels that pass number of NaNs compliance test for ALL kernels:
does_comply_nans = np.all(
result[[f"la_exp.{k}.nnans" for k in kernels]] < max_nans_tolerated, axis=1
)
# (3) keep pixels without nan/infinite local adjustment factors for all kernel
finite_values_only = result[[f"safe_division.{k}" for k in kernels]].all(axis="columns")
# so, selecting inside band and nNaNs compliant results:
res_df = result[upper_band & is_inside_band & does_comply_nans & finite_values_only].reset_index(
drop=True
)
#
# so return only bare minimum: bin_ids , observed-raw counts
# and locally adjusted expected estimates for every kernel
return res_df[
["bin1_id", "bin2_id", "count"] + [f"la_exp.{k}.value" for k in kernels]
].astype(dtype={f"la_exp.{k}.value": "float64" for k in kernels})
def histogram_scored_pixels(
scored_df, kernels, ledges, obs_raw_name=observed_count_name
):
"""
An attempt to implement HiCCUPS-like lambda-binning statistical procedure.
This function aims at building up a histogram of locally adjusted
expected scores for groups of characterized pixels.
Such histograms are subsequently used to compute FDR thresholds
for different "classes" of hypothesis (classified by their
locally-adjusted expected (la_exp)).
Parameters
----------
scored_df : pd.DataFrame
A table with the scoring information for a group of pixels.
kernels : dict
A dictionary with keys being kernels names and values being ndarrays
representing those kernels.
ledges : ndarray
An ndarray with bin lambda-edges for grouping locally adjusted
expecteds, i.e., classifying statistical hypotheses into lambda-bins.
Left-most bin (-inf, 1], and right-most one (value,+inf].
obs_raw_name : str
Name of the column/field that carries the number of counts per pixel,
i.e. observed raw counts.
Returns
-------
hists : dict of pandas.DataFrame
A dictionary of pandas.DataFrame with lambda/observed 2D histogram for
every kernel-type.
Notes
-----
Returned histograms correspond to the chunks of scored pixels.
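A toy sketch of the lambda-binning + histogramming idea for a single
hypothetical "donut" kernel and synthetic scores:
>>> df = pd.DataFrame({"count": [1, 2, 2], "la_exp.donut.value": [0.5, 1.5, 1.6]})
>>> ledges = np.array([-np.inf, 1.0, 2.0, np.inf])
>>> lbins = pd.cut(df["la_exp.donut.value"], ledges)
>>> hist = df.groupby(["count", lbins], observed=False)["la_exp.donut.value"].count().unstack()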
"""
hists = {}
for k in kernels:
# we would need to generate a bunch of these histograms for all of the
# kernel types:
# needs to be lambda-binned : scored_df["la_exp."+k+".value"]
# needs to be histogrammed in every bin : scored_df["obs.raw"]
#
# lambda-bin index for kernel-type "k":
lbins = pd.cut(scored_df[f"la_exp.{k}.value"], ledges)
# group scored_df by counts and lambda-bins to construct histograms:
hists[k] = (
scored_df.groupby([obs_raw_name, lbins], dropna=False, observed=False)[
f"la_exp.{k}.value"
]
.count()
.unstack()
.fillna(0)
.astype(np.int64)
)
# return a dict of DataFrames with a bunch of histograms:
return hists
def determine_thresholds(gw_hist, fdr):
"""
given a 'gw_hist' histogram of observed counts
for each lambda-bin and for each kernel-type, and
also given a FDR, calculate q-values for each observed
count value in each lambda-bin for each kernel-type.
Parameters
----------
gw_hist : dict
dictionary {kernel_name : 2D_hist}, where '2D_hist' is a pd.DataFrame
fdr : float
False Discovery Rate level
Returns
-------
threshold_df : dict
each threshold_df[k] is a Series indexed by la_exp intervals
(IntervalIndex) and it is all we need to extract "good" pixels from
each chunk ...
qvalues : dict
A dictionary with keys being kernel names and values pandas.DataFrames
storing q-values: each column corresponds to a lambda-bin,
while rows correspond to observed pixels values.
"""
# extracting significantly enriched interactions:
# We have our *null* hypothesis: intensity of a HiC pixel is Poisson-distributed
# with a certain expected. In this case that would be *locally-adjusted expected*.
#
# Thus for dot-calling, we could estimate a *p*-value for every pixel based
# on its observed intensity and its expected intensity, e.g.:
# lambda = la_exp["la_exp."+k+".value"]; pvals = 1.0 - poisson.cdf(la_exp["count"], lambda)
# However this is technically challenging (too many pixels - genome wide) and
# might not be sensitive enough due to the wide dynamic range of interaction counts
# Instead we use the *lambda*-binning procedure from Rao et al 2014 to tackle
# both technical challenges and some issues associated with the wide dynamic range
# of the expected for the dot-calling (due to distance decay).
#
# Some extra points:
# 1. simple p-value thresholding should be replaced with the more "productive" FDR approach, which is more tractable
# 2. "unfair" to treat all pixels with the same stat-testing (multiple hypothesis) - too wide range of "expected"
# 3. (2) is addressed by splitting the pixels into groups by their locally adjusted expected - lambda-bins
# 4. upper boundary of each lambda-bin is used as expected for every pixel that belongs to the chunk:
# - for technical/efficiency reasons - test pixels in a chunk all at once
# for each lambda-bin q-values are calculated in an efficient way:
# in part, efficiency comes from collapsing identical observed values, i.e. histogramming
# also upper boundary of each lambda-bin is used as an expected for every pixel in this lambda-bin
qvalues = {}
threshold_df = {}
for k, _hist in gw_hist.items():
# Reverse cumulative histogram for kernel 'k'.
rcs_hist = _hist.iloc[::-1].cumsum(axis=0).iloc[::-1]
# 1st row of 'rcs_hist' contains total pixels-counts in each lambda-bin.
norm = rcs_hist.iloc[0, :]
# Assign a unit Poisson distribution to each lambda-bin.
# The expected value 'lambda' is the upper boundary of each lambda-bin:
# poisson.sf = 1 - poisson.cdf, but more precise
# poisson.sf(-1, lambda) == 1.0, i.e. is equivalent to the
# poisson.pmf(rcs_hist.index, lambda)[::-1].cumsum()[::-1]
# unit Poisson is a collection of 1-CDF distributions for each l-chunk
# same dimensions as rcs_hist - matching lchunks and observed values:
unit_Poisson = pd.DataFrame().reindex_like(rcs_hist)
for lbin in rcs_hist.columns:
# observed count values for which we evaluate the Poisson survival function
_occurrences = rcs_hist.index.to_numpy()
unit_Poisson[lbin] = poisson.sf(_occurrences, lbin.right)
# normalize unit-Poisson distr for the total pixel counts per lambda-bin
unit_Poisson = norm * unit_Poisson
# Determine the threshold by checking the value at which 'fdr_diff'
# first turns positive. Fill NaNs with a high value, that's out of reach.
_high_value = rcs_hist.index.max() + 1
fdr_diff = ((fdr * rcs_hist) - unit_Poisson).cummax()
# cummax ensures monotonic increase of differences
threshold_df[k] = (
fdr_diff.mask(fdr_diff < 0) # mask negative with nans
.idxmin() # index of the first positive difference
.fillna(_high_value) # set high threshold if no pixels pass
.astype(np.int64)
)
qvalues[k] = (unit_Poisson / rcs_hist).cummin()
# run cumulative min on the array of adjusted p-values
# to make sure q-values are monotonically non-increasing with observed counts
qvalues[k] = qvalues[k].mask(qvalues[k] > 1.0, 1.0)
# cast categorical index of dtype-interval to proper interval index
threshold_df[k].index = pd.IntervalIndex(threshold_df[k].index)
return threshold_df, qvalues
def clustering_step(
scored_df,
dots_clustering_radius,
assigned_regions_name="region",
obs_raw_name=observed_count_name,
):
"""
Group together adjacent significant pixels into clusters after
the lambda-binning multiple hypothesis testing by iterating over
assigned regions and calling `clust_2D_pixels`.
Parameters
----------
scored_df : pandas.DataFrame
DataFrame with enriched pixels that are ready to be
clustered and are annotated with their genomic coordinates.
dots_clustering_radius : int
Birch-clustering threshold.
assigned_regions_name : str | None
Name of the column in scored_df to use for grouping pixels
before clustering. When None, full chromosome clustering is done.
obs_raw_name : str
name of the column with raw observed pixel counts
Returns
-------
centroids : pandas.DataFrame
Pixels from 'scored_df' annotated with clustering information.
Notes
-----
'dots_clustering_radius' in the Birch clustering algorithm corresponds to
double the clustering radius used in the "greedy" clustering of HiCCUPS.
"""
# make sure provided pixels are annotated with genomic coordinates and the raw counts column is present:
if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(scored_df):
raise ValueError("Scored pixels provided for clustering are not annotated")
scored_df = scored_df.copy()
if (
assigned_regions_name not in scored_df.columns
):  # If input scores are not annotated by regions:
logging.warning(
"No regions assigned to the scored pixels before clustering, using chromosomes"
)
scored_df[assigned_regions_name] = np.where(
scored_df["chrom1"] == scored_df["chrom2"], scored_df["chrom1"], np.nan
)
# cluster within each region separately and accumulate the result:
pixel_clust_list = []
scored_pixels_by_region = scored_df.groupby(assigned_regions_name, observed=True)
for region, _df in scored_pixels_by_region:
logging.info(f"clustering enriched pixels in region: {region}")
# Using genomic coordinates for clustering, not bin_id
pixel_clust = clust_2D_pixels(
_df,
threshold_cluster=dots_clustering_radius,
bin1_id_name="start1",
bin2_id_name="start2",
)
pixel_clust_list.append(pixel_clust)
logging.info("Clustering is complete")
# concatenate clustering results ...
# indexing information persists here ...
if not pixel_clust_list:
logging.warning("No clusters found for any regions! Output will be empty")
empty_output = pd.DataFrame(
[],
columns=list(scored_df.columns)
+ [
assigned_regions_name + "1",
assigned_regions_name + "2",
"c_label",
"c_size",
"cstart1",
"cstart2",
],
)
return empty_output # Empty dataframe with the same columns as anticipated
else:
pixel_clust_df = pd.concat(
pixel_clust_list, ignore_index=False
) # Concatenate the clustering results for different regions
# now merge pixel_clust_df and scored_df DataFrame ...
# TODO make a more robust merge here
df = pd.merge(
scored_df, pixel_clust_df, how="left", left_index=True, right_index=True
)
# TODO check if next str-cast is neccessary
df[assigned_regions_name + "1"] = df[assigned_regions_name].astype(str)
df[assigned_regions_name + "2"] = df[assigned_regions_name].astype(str)
# report only centroids with highest Observed:
chrom_clust_group = df.groupby(
[assigned_regions_name + "1", assigned_regions_name + "2", "c_label"],
observed=True,
)
centroids = df.loc[
chrom_clust_group[obs_raw_name].idxmax()
] # Select the brightest pixel in the cluster
return centroids
def cluster_filtering_hiccups(
centroids,
obs_raw_name=observed_count_name,
enrichment_factor_vh=1.5,
enrichment_factor_d_and_ll=1.75,
enrichment_factor_d_or_ll=2.0,
FDR_orphan_threshold=0.02,
):
"""
Centroids of enriched pixels can be filtered to further minimize
the amount of false-positive dot-calls.
First, centroids are filtered on enrichment relative to the
locally-adjusted expected for the "donut", "lowleft", "vertical",
and "horizontal" kernels. Additionally, singleton pixels
(i.e. pixels that do not belong to a cluster) are filtered based on
combined q-values for all kernels. This empirical filtering approach
was developed in Rao et al 2014 and results in conservative dot-calls
with a low rate of false-positive calls.
Parameters
----------
centroids : pd.DataFrame
DataFrame that stores enriched and clustered pixels.
obs_raw_name : str
name of the column with raw observed pixel counts
enrichment_factor_vh : float
minimal enrichment factor for pixels relative to
both "vertical" and "horizontal" kernel.
enrichment_factor_d_and_ll : float
minimal enrichment factor for pixels relative to
both "donut" and "lowleft" kernels.
enrichment_factor_d_or_ll : float
minimal enrichment factor for pixels relative to
either the "donut" or "lowleft" kernels.
FDR_orphan_threshold : float
maximal combined q-value allowed for singleton pixels.
Returns
-------
filtered_centroids : pd.DataFrame
filtered dot-calls
"""
# make sure input DataFrame of pixels has been clustered:
if not "c_size" in centroids:
raise ValueError(f"input dataframe of pixels does not seem to be clustered")
# make sure input DataFrame of pixels has been annotated with genomic coordinates and raw counts column is present:
if not {"chrom1", "chrom2", "start1", "start2", obs_raw_name}.issubset(centroids):
raise ValueError(
"input dataframe of clustered pixels provided for filtering is not annotated"
)
# make sure input DataFrame of pixels was scored using the 4 HiCCUPS kernels (donut, lowleft, vertical, horizontal):
_hiccups_kernel_cols_set = {
"la_exp.donut.value",
"la_exp.vertical.value",
"la_exp.horizontal.value",
"la_exp.lowleft.value",
# and q-values as well
"la_exp.donut.qval",
"la_exp.vertical.qval",
"la_exp.horizontal.qval",
"la_exp.lowleft.qval",
}
# make sure input DataFrame of pixels carries la_exp values and q-values for all 4 HiCCUPS kernels:
if not _hiccups_kernel_cols_set.issubset(centroids):
raise ValueError(
"clustered pixels provided for filtering were not scored with 4 hiccups kernels"
)
# ad hoc filtering by enrichment, FDR for singletons etc.
# employed in Rao et al 2014 HiCCUPS
enrichment_fdr_comply = (
(
centroids[obs_raw_name]
> enrichment_factor_d_and_ll * centroids["la_exp.lowleft.value"]
)
& (
centroids[obs_raw_name]
> enrichment_factor_d_and_ll * centroids["la_exp.donut.value"]
)
& (
centroids[obs_raw_name]
> enrichment_factor_vh * centroids["la_exp.vertical.value"]
)
& (
centroids[obs_raw_name]
> enrichment_factor_vh * centroids["la_exp.horizontal.value"]
)
& (
(
centroids[obs_raw_name]
> enrichment_factor_d_or_ll * centroids["la_exp.lowleft.value"]
)
| (
centroids[obs_raw_name]
> enrichment_factor_d_or_ll * centroids["la_exp.donut.value"]
)
)
& (
(centroids["c_size"] > 1)
| (
(
centroids["la_exp.lowleft.qval"]
+ centroids["la_exp.donut.qval"]
+ centroids["la_exp.vertical.qval"]
+ centroids["la_exp.horizontal.qval"]
)
<= FDR_orphan_threshold
)
)
)
logging.info(
f"filtered {enrichment_fdr_comply.sum()} out of {len(centroids)} centroids to reduce the number of false-positives"
)
# use "enrichment_fdr_comply" to filter out non-satisfying pixels:
return centroids[enrichment_fdr_comply].reset_index(drop=True)
####################################################################
# large helper functions wrapping smaller step-specific ones
####################################################################
@pool_decorator
def scoring_and_histogramming_step(
clr,
expected_indexed,
expected_value_col,
clr_weight_name,
tiles,
kernels,
ledges,
max_nans_tolerated,
loci_separation_bins,
nproc,
map_functor=map,
):
"""
This implements the 1st step of the lambda-binning scoring procedure - histogramming.
In short, this pipes a scoring operation together with histogramming into a
single pipeline of per-chunk operations/transforms.
"""
logging.info(f"convolving {len(tiles)} tiles to build histograms for lambda-bins")
# to score per tile:
to_score = partial(
score_tile,
clr=clr,
expected_indexed=expected_indexed,
expected_value_col=expected_value_col,
clr_weight_name=clr_weight_name,
kernels=kernels,
max_nans_tolerated=max_nans_tolerated,
band_to_cover=loci_separation_bins,
)
# to hist per scored chunk:
to_hist = partial(histogram_scored_pixels, kernels=kernels, ledges=ledges)
# compose scoring and histogramming together :
job = lambda tile: to_hist(to_score(tile))
# standard multiprocessing implementation
if nproc > 1:
map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc)))
else:
map_kwargs = {}
# consider using
# https://github.com/mirnylab/cooler/blob/9e72ee202b0ac6f9d93fd2444d6f94c524962769/cooler/tools.py#L59
histogram_chunks = map_functor(job, tiles, **map_kwargs)
# now we need to combine/sum all of the histograms for different kernels:
def _sum_hists(hx, hy):
# perform a DataFrame summation for every value of the dictionary:
hxy = {}
for k in kernels:
hxy[k] = hx[k].add(hy[k], fill_value=0).fillna(0).astype(np.int64)
# returning the sum:
return hxy
# TODO consider more efficient implementation to accumulate histograms:
final_hist = reduce(_sum_hists, histogram_chunks)
# we have to make sure there is nothing in the last lambda-bin
# this is a temporary implementation detail, until we implement dynamic lambda-bins
for k in kernels:
last_lambda_bin = final_hist[k].iloc[:, -1]
if last_lambda_bin.sum() != 0:
raise ValueError(
f"There are la_exp.{k}.value in {last_lambda_bin.name}, please check the histogram"
)
# last non-empty lambda bin (column)
last_non_empty_lbin = final_hist[k].columns[final_hist[k].sum() > 0][-1]
# drop trailing empty lambda bins (columns)
final_hist[k] = final_hist[k].loc[:, :last_non_empty_lbin]
# make sure index (observed pixels counts) is sorted
if not final_hist[k].index.is_monotonic_increasing:
raise ValueError(f"Histogram for {k}-kernel is not sorted")
# returning filtered histogram
return final_hist
@pool_decorator
def scoring_and_extraction_step(
clr,
expected_indexed,
expected_value_col,
clr_weight_name,
tiles,
kernels,
ledges,
thresholds,
max_nans_tolerated,
loci_separation_bins,
nproc,
bin1_id_name="bin1_id",
bin2_id_name="bin2_id",
map_functor=map,
):
"""
This implements the 2nd step of the lambda-binning scoring procedure,
extracting pixels that are FDR compliant.
In short, this combines scoring with extraction into a
single pipeline of per-chunk operations/transforms.
"""
logging.info(f"convolving {len(tiles)} tiles to extract enriched pixels")
# to score per tile:
to_score = partial(
score_tile,
clr=clr,
expected_indexed=expected_indexed,
expected_value_col=expected_value_col,
clr_weight_name=clr_weight_name,
kernels=kernels,
max_nans_tolerated=max_nans_tolerated,
band_to_cover=loci_separation_bins,
)
# to hist per scored chunk:
to_extract = partial(
extract_scored_pixels,
thresholds=thresholds,
ledges=ledges,
)
# compose scoring and histogramming together
job = lambda tile: to_extract(to_score(tile))
# standard multiprocessing implementation
if nproc > 1:
logging.info(f"creating a Pool of {nproc} workers to tackle {len(tiles)} tiles")
map_kwargs = dict(chunksize=int(np.ceil(len(tiles) / nproc)))
else:
logging.info("fallback to serial implementation.")
map_kwargs = {}
# TODO: try using cooler.parallel.MultiplexDataPipe for pipelining the steps
filtered_pix_chunks = map_functor(job, tiles, **map_kwargs)
significant_pixels = pd.concat(filtered_pix_chunks, ignore_index=True)
# the same pixel should never be scored more than once with the current tiling of the interaction matrix
if significant_pixels.duplicated().any():
raise ValueError(
"Some pixels were scored more than once, matrix tiling procedure is not correct"
)
# sort the result just in case and drop its index:
return significant_pixels.sort_values(by=[bin1_id_name, bin2_id_name]).reset_index(
drop=True
)
# user-friendly high-level API function
def dots(
clr,
expected,
expected_value_col="balanced.avg",
clr_weight_name="weight",
view_df=None,
kernels=None,
max_loci_separation=10_000_000,
max_nans_tolerated=1, # test if this has desired behavior
n_lambda_bins=40, # update this eventually
lambda_bin_fdr=0.1,
clustering_radius=20_000,
cluster_filtering=None,
tile_size=5_000_000,
nproc=1,
):
"""
Call dots on a cooler {clr}, using {expected} defined in regions specified
in {view_df}.
All convolution kernels specified in {kernels} will all be applied to the {clr},
and statistical testing will be performed separately for each kernel. A convolution
kernel is a small square matrix (e.g. 7x7) of zeros and ones
that defines a "mask" to extract local expected around each pixel. Since the
enrichment is calculated relative to the central pixel, kernel width should
be an odd number >=3.
Parameters
----------
clr : cooler.Cooler
A cooler with balanced Hi-C data.
expected : DataFrame in expected format
Diagonal summary statistics for each chromosome, and name of the column
with the values of expected to use.
expected_value_col : str
Name of the column in expected that holds the values of expected
clr_weight_name : str
Name of the column in the clr.bins to use as balancing weights.
Using raw unbalanced data is not supported for dot-calling.
view_df : viewframe
Viewframe with genomic regions, at the moment the view has to match the
view used for generating expected. If None, generate from the cooler.
kernels : { str:np.ndarray } | None
A dictionary of convolution kernels to be used for calculating locally adjusted
expected. If None the default kernels from HiCCUPS are going to be recommended
based on the resolution of the cooler.
max_loci_separation : int
Maximum loci separation for dot-calling, i.e., do not call dots for
loci that are further than max_loci_separation basepairs apart. Default 10Mb.
max_nans_tolerated : int
Maximum number of NaNs tolerated in the footprint of every used kernel.
Adjust with caution, as a large max_nans_tolerated might lead to artifacts in
pixel scoring.
n_lambda_bins : int
Number of log-spaced bins, where FDR-testing will be performed independently.
TODO: generate lambda-bins on the fly based on the dynamic range of the data (i.e. maximum pixel count)
lambda_bin_fdr : float
False discovery rate (FDR) for multiple hypothesis testing BH-FDR procedure, applied per lambda bin.
clustering_radius : None | int
Cluster enriched pixels with a given radius. "Brightest" pixels in each group
will be reported as the final dot-calls. If None, no clustering is performed.
cluster_filtering : bool
whether to apply additional filtering to centroids after clustering, using cluster_filtering_hiccups()
tile_size : int
Tile size for the Hi-C heatmap tiling. Typically on order of several mega-bases, and <= max_loci_separation.
Controls tradeoff between memory consumption and speed of execution.
nproc : int
Number of processes to use for multiprocessing.
Returns
-------
dots : pandas.DataFrame
BEDPE-style dataFrame with genomic coordinates of called dots and additional annotations.
Notes
-----
'clustering_radius' in the Birch clustering algorithm corresponds to
double the clustering radius used in the "greedy" clustering of HiCCUPS
(to be tested).
TODO describe sequence of processing steps
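Examples
--------
A minimal sketch of a typical call (the cooler path is hypothetical, and
`expected_df` is assumed to be a cis expected table computed beforehand for
the same view; not executed here):
>>> import cooler
>>> clr = cooler.Cooler("test.10000.cool")  # doctest: +SKIP
>>> dot_calls_df = dots(clr, expected_df, max_loci_separation=10_000_000, nproc=4)  # doctest: +SKIP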
"""
#### Generate viewframes ####
if view_df is None:
view_df = make_cooler_view(clr)
else:
try:
_ = is_compatible_viewframe(
view_df,
clr,
check_sorting=True,
raise_errors=True,
)
except Exception as e:
raise ValueError("view_df is not a valid viewframe or incompatible") from e
# check balancing status
if clr_weight_name:
# check if cooler is balanced
try:
_ = is_cooler_balanced(clr, clr_weight_name, raise_errors=True)
except Exception as e:
raise ValueError(
f"provided cooler is not balanced or {clr_weight_name} is missing"
) from e
else:
raise ValueError("calling dots on raw data is not supported.")
# add checks to make sure cis-expected is symmetric
# make sure provided expected is compatible
try:
_ = is_valid_expected(
expected,
"cis",
view_df,
verify_cooler=clr,
expected_value_cols=[
expected_value_col,
],
raise_errors=True,
)
except Exception as e:
raise ValueError("provided expected is not compatible") from e
expected = expected.set_index(["region1", "region2", "dist"]).sort_index()
# Prepare some parameters.
binsize = clr.binsize
loci_separation_bins = bp_to_bins(max_loci_separation, binsize)
tile_size_bins = bp_to_bins(tile_size, binsize)
# verify provided kernels or recommend them (HiCCUPS)...
if kernels and is_compatible_kernels(kernels, binsize, max_nans_tolerated):
warnings.warn(
"Compatibility checks for 'kernels' are not fully implemented yet, use at your own risk"
)
else:
# recommend them (default hiccups ones for now)
kernels = recommend_kernels(binsize)
# deduce kernel_width - overall footprint
kernel_width = max(len(k) for k in kernels.values()) # 2*w+1
kernel_half_width = int((kernel_width - 1) / 2) # former w parameter
# try to guess required lambda bins using "max" value of pixel counts
# statistical: lambda-binning edges ...
if not 40 <= n_lambda_bins <= 50:
raise ValueError(f"Incompatible n_lambda_bins={n_lambda_bins}")
BASE = 2 ** (1 / 3) # very arbitrary - parameterize !
ledges = np.concatenate(
(
[-np.inf],
np.logspace(
0,
n_lambda_bins - 1,
num=n_lambda_bins,
base=BASE,
dtype=np.float64,
),
[np.inf],
)
)
# list of tile coordinate ranges
tiles = list(
generate_tiles_diag_band(
clr, view_df, kernel_half_width, tile_size_bins, loci_separation_bins
)
)
# 1. Calculate genome-wide histograms of scores.
time_start = time.perf_counter()
gw_hist = scoring_and_histogramming_step(
clr,
expected,
expected_value_col=expected_value_col,
clr_weight_name=clr_weight_name,
tiles=tiles,
kernels=kernels,
ledges=ledges,
max_nans_tolerated=max_nans_tolerated,
loci_separation_bins=loci_separation_bins,
nproc=nproc,
)
elapsed_time = time.perf_counter() - time_start
logging.info(f"Done building histograms in {elapsed_time:.3f} sec ...")
# 2. Determine the FDR thresholds.
threshold_df, qvalues = determine_thresholds(gw_hist, lambda_bin_fdr)
logging.info("Determined thresholds for every lambda-bin ...")
# 3. Filter using FDR thresholds calculated in the histogramming step
time_start = time.perf_counter()
filtered_pixels = scoring_and_extraction_step(
clr,
expected,
expected_value_col=expected_value_col,
clr_weight_name=clr_weight_name,
tiles=tiles,
kernels=kernels,
ledges=ledges,
thresholds=threshold_df,
max_nans_tolerated=max_nans_tolerated,
loci_separation_bins=loci_separation_bins,
nproc=nproc,
bin1_id_name="bin1_id",
bin2_id_name="bin2_id",
)
elapsed_time = time.perf_counter() - time_start
logging.info(f"Done extracting enriched pixels in {elapsed_time:.3f} sec ...")
# 4. Post-processing
logging.info(f"Begin post-processing of {len(filtered_pixels)} filtered pixels")
logging.info("preparing to extract needed q-values ...")
# annotate enriched pixels
filtered_pixels_qvals = annotate_pixels_with_qvalues(filtered_pixels, qvalues)
filtered_pixels_annotated = cooler.annotate(
filtered_pixels_qvals, clr.bins()[["chrom", "start", "end"]], replace=True
)
if not clustering_radius:
# TODO: make sure returned DataFrame has the same columns as "postprocessed_calls"
# columns to return before-clustering
output_cols = []
output_cols += bedpe_required_cols
output_cols += [
observed_count_name,
]
output_cols += [f"la_exp.{k}.value" for k in kernels]
output_cols += [f"la_exp.{k}.qval" for k in kernels]
return filtered_pixels_annotated[output_cols]
# 4a. clustering
# Clustering is done independently for every region, therefore regions must be assigned:
filtered_pixels_annotated = assign_regions(filtered_pixels_annotated, view_df)
centroids = clustering_step(
filtered_pixels_annotated,
clustering_radius,
).reset_index(drop=True)
# columns to return post-clustering
output_cols = []
output_cols += bedpe_required_cols
output_cols += [
"cstart1",
"cstart2",
"c_label",
"c_size",
observed_count_name,
]
output_cols += [f"la_exp.{k}.value" for k in kernels]
output_cols += [f"la_exp.{k}.qval" for k in kernels]
# 4b. filter by enrichment and qval
if (cluster_filtering is None) or cluster_filtering:
# default - engage HiCCUPS filtering
postprocessed_calls = cluster_filtering_hiccups(centroids)
elif not cluster_filtering:
postprocessed_calls = centroids
return postprocessed_calls