Source code for picasso.utils

"""
Utilities: Data preprocessing and loading functions for PICASSO.

This module provides utility functions for preprocessing copy number alteration (CNA)
data and loading example datasets. It includes specialized functions for handling
noisy scRNA-seq-inferred CNA data and converting complex copy number states into
formats suitable for phylogenetic analysis.

Functions
---------
encode_cnvs_as_ternary
    Convert integer CNA data to ternary encoding for improved phylogenetic inference.
load_data
    Load example CNA dataset for testing and demonstration purposes.

Examples
--------
Data preprocessing workflow:

>>> from picasso import Picasso, load_data, encode_cnvs_as_ternary
>>>
>>> # Load example dataset
>>> cna_data = load_data()
>>> print(f"Loaded data: {cna_data.shape}")
>>>
>>> # Optional: Convert to ternary encoding for complex copy number states
>>> ternary_data = encode_cnvs_as_ternary(cna_data)
>>> print(f"Ternary encoded: {ternary_data.shape}")
>>>
>>> # Use with PICASSO
>>> picasso = Picasso(cna_data, min_clone_size=8)
>>> picasso.fit()

Notes
-----
These utilities are specifically designed for:
- Handling noisy scRNA-seq-inferred CNA data
- Converting complex copy number states to phylogeny-compatible formats
- Providing realistic example data for algorithm development
- Supporting data preprocessing workflows

See Also
--------
Picasso : Main phylogenetic inference class
CloneTree : Analysis and visualization of phylogenetic results
"""

import numpy as np
import pandas as pd
import os
from typing import Union



[docs]
def encode_cnvs_as_ternary(data: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
    """
    Convert CNA data to ternary encoding for phylogenetic analysis.

    Transforms integer copy number alteration (CNA) data into a ternary format
    suitable for phylogenetic inference algorithms like PICASSO. Each input
    column is expanded into as many output columns as its largest absolute
    value, and every cell is written as a run of ``1`` (gain) or ``-1`` (loss)
    entries followed by zeros.

    Parameters
    ----------
    data : pd.DataFrame or np.ndarray
        Input CNA data where rows represent cells/samples and columns represent
        genomic features. A NumPy array is wrapped in a DataFrame first, so its
        columns are labelled 0, 1, 2, ... Values are cast with ``astype(int)``
        (non-integer floats are truncated by that cast) and interpreted as
        signed states: ``0`` is the baseline, positive values are gains and
        negative values are losses.

        NOTE(review): if your data stores absolute copy numbers (for example
        0 = loss, 1 = neutral), be aware that 0 is encoded as all zeros and 1
        as a single gain; re-centre the values beforehand if that is not the
        intended meaning.

    Returns
    -------
    pd.DataFrame
        Ternary-encoded DataFrame with values in {-1, 0, 1} and the same index
        as the input. The number of columns is expanded based on the maximum
        absolute value in each original column. Column names follow the pattern
        'original_column-position' (e.g., 'chr1p-1', 'chr1p-2'). An input
        column whose values are all zero contributes no output columns; if no
        column contributes anything (all-zero or empty input) an empty
        DataFrame carrying the input index is returned.

    Raises
    ------
    ValueError
        If input data cannot be converted to integer format.

    Examples
    --------
    Basic encoding of copy number states:

    >>> import pandas as pd
    >>> import numpy as np
    >>> from picasso.utils import encode_cnvs_as_ternary
    >>>
    >>> # Create sample CNA data
    >>> cna_data = pd.DataFrame({
    ...     'chr1p': [0, 1, 2, 3],
    ...     'chr2q': [0, 0, 1, 2]
    ... }, index=['Cell_A', 'Cell_B', 'Cell_C', 'Cell_D'])
    >>>
    >>> print(cna_data)
            chr1p  chr2q
    Cell_A      0      0
    Cell_B      1      0
    Cell_C      2      1
    Cell_D      3      2

    >>> # Encode to ternary format
    >>> ternary_data = encode_cnvs_as_ternary(cna_data)
    >>> print(ternary_data)
            chr1p-1  chr1p-2  chr1p-3  chr2q-1  chr2q-2
    Cell_A        0        0        0        0        0
    Cell_B        1        0        0        0        0
    Cell_C        1        1        0        1        0
    Cell_D        1        1        1        1        1

    Handling deletions (negative values):

    >>> # Data with deletions
    >>> cna_with_dels = pd.DataFrame({
    ...     'chr3p': [-2, -1, 0, 1, 2],
    ... }, index=[f'Cell_{i}' for i in range(5)])
    >>>
    >>> ternary_dels = encode_cnvs_as_ternary(cna_with_dels)
    >>> print(ternary_dels)
            chr3p-1  chr3p-2
    Cell_0       -1       -1
    Cell_1       -1        0
    Cell_2        0        0
    Cell_3        1        0
    Cell_4        1        1

    Notes
    -----
    **Encoding Rules**:
    - Positive integers n are encoded as n ones followed by zeros: [1, 1, ..., 1, 0, 0, ...]
    - Negative integers -n are encoded as n negative ones followed by zeros: [-1, ..., -1, 0, ...]
    - Zero values are encoded as all zeros: [0, 0, ...]
    - Column width is determined by the maximum absolute value in each original column

    **Use Cases**:
    - Preprocessing CNA data for PICASSO phylogenetic inference
    - Converting complex copy number states to categorical format
    - Ensuring proper handling of amplifications and deletions in mixture models

    **Performance Considerations**:
    - Output size scales with maximum copy number values
    - Memory usage increases significantly for high-amplitude CNAs
    - Consider binning extreme values before encoding for very noisy data. We recommend binning into 'amplified' and 'highly amplified' categories.

    See Also
    --------
    Picasso : Main phylogenetic inference class that accepts ternary-encoded data
    load_data : Function to load example CNA datasets
    """

    # If input is a numpy array, convert it to a DataFrame
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    data = data.astype(int)

    encoded_blocks = []
    column_names = []

    # Process each column independently; positional access keeps this working
    # even when the frame carries duplicate column labels.
    for position, col in enumerate(data.columns):
        values = data.iloc[:, position].to_numpy()
        magnitudes = np.abs(values)

        # Width of the expansion = largest magnitude seen in this column.
        width = int(magnitudes.max()) if magnitudes.size else 0
        if width == 0:
            # All-zero (or empty) column: nothing to encode.
            continue

        # Slot k (1-based) is "on" for a cell when |value| >= k; multiplying by
        # the sign turns the on-slots into +1 for gains and -1 for losses.
        slots = np.arange(1, width + 1)
        block = np.sign(values)[:, None] * (magnitudes[:, None] >= slots)

        encoded_blocks.append(block)
        column_names.extend(f"{col}-{k}" for k in range(1, width + 1))

    # np.hstack / np.column_stack reject an empty list, so return an empty
    # frame explicitly when no column produced any ternary feature.
    if not encoded_blocks:
        return pd.DataFrame(index=data.index)

    return pd.DataFrame(
        np.hstack(encoded_blocks), index=data.index, columns=column_names
    )




[docs]
def load_data() -> pd.DataFrame:
    """
    Load the example copy number alteration (CNA) dataset bundled with the package.

    Reads the tab-separated file ``sample_data/cnas.txt`` that sits next to this
    module and returns it as a DataFrame, using the first column of the file as
    the index. The dataset is intended for testing and demonstration of PICASSO.

    Returns
    -------
    pd.DataFrame
        Contents of ``sample_data/cnas.txt``. The first file column becomes the
        index (presumably cell/sample identifiers); the remaining columns are
        presumably genomic features holding inferred copy number states —
        verify against the shipped file.

    Raises
    ------
    FileNotFoundError
        If the sample data file cannot be located in the expected directory.
    pd.errors.EmptyDataError
        If the data file is empty.

    Examples
    --------
    Load and explore the example dataset:

    >>> from picasso import Picasso, load_data
    >>>
    >>> cna_data = load_data()
    >>> print(f"Dataset shape: {cna_data.shape}")
    >>> print(f"Copy number range: {cna_data.min().min()} to {cna_data.max().max()}")
    >>> print(cna_data.head())
    >>>
    >>> # Use with PICASSO
    >>> picasso = Picasso(cna_data, min_clone_size=5)
    >>> picasso.fit()

    Inspect data characteristics:

    >>> print(f"Missing values: {cna_data.isnull().sum().sum()}")
    >>> print(cna_data.describe())

    Notes
    -----
    - The file is resolved relative to this module's own location, so the
      function works regardless of the current working directory.
    - The data is described upstream as representative of noisy
      scRNA-seq-inferred CNA calls, suitable for tutorials, parameter tuning
      and benchmarking.

    See Also
    --------
    Picasso : Main phylogenetic inference class for analyzing the loaded data
    encode_cnvs_as_ternary : Preprocessing function for complex copy number states
    CloneTree : Class for visualizing and analyzing phylogenetic results
    """

    # Anchor the lookup at the directory that contains this module.
    module_path = os.path.abspath(__file__)
    sample_file = os.path.dirname(module_path) + "/sample_data/cnas.txt"

    # Tab-delimited table; column 0 supplies the row labels.
    return pd.read_csv(sample_file, sep="\t", index_col=0)



# Public API: the names exported by ``from picasso.utils import *``.
__all__ = [
    "encode_cnvs_as_ternary",
    "load_data",
]