Source code for picasso.utils

"""
Utilities: Data preprocessing and loading functions for PICASSO.

This module provides utility functions for preprocessing copy number alteration (CNA)
data and loading example datasets. It includes specialized functions for handling
noisy scRNA-seq-inferred CNA data and converting complex copy number states into
formats suitable for phylogenetic analysis.

Functions
---------
encode_cnvs_as_ternary
    Convert integer CNA data to ternary encoding for improved phylogenetic inference.
load_data
    Load example CNA dataset for testing and demonstration purposes.

Examples
--------
Data preprocessing workflow:

>>> from picasso import Picasso, load_data, encode_cnvs_as_ternary
>>>
>>> # Load example dataset
>>> cna_data = load_data()
>>> print(f"Loaded data: {cna_data.shape}")
>>>
>>> # Optional: Convert to ternary encoding for complex copy number states
>>> ternary_data = encode_cnvs_as_ternary(cna_data)
>>> print(f"Ternary encoded: {ternary_data.shape}")
>>>
>>> # Use with PICASSO
>>> picasso = Picasso(cna_data, min_clone_size=8)
>>> picasso.fit()

Notes
-----
These utilities are specifically designed for:

- Handling noisy scRNA-seq-inferred CNA data
- Converting complex copy number states to phylogeny-compatible formats
- Providing realistic example data for algorithm development
- Supporting data preprocessing workflows

See Also
--------
Picasso : Main phylogenetic inference class
CloneTree : Analysis and visualization of phylogenetic results
"""

import numpy as np
import pandas as pd
import os
from typing import Union


def encode_cnvs_as_ternary(data: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
    """
    Convert signed integer CNA data to a ternary ({-1, 0, 1}) encoding.

    Every input column is expanded into ``m`` output columns, where ``m`` is
    the largest absolute value found in that column. A value ``v`` is written
    as ``abs(v)`` leading entries equal to ``sign(v)`` followed by zeros.

    Parameters
    ----------
    data : pd.DataFrame or np.ndarray
        Input CNA data where rows represent cells/samples and columns
        represent genomic features. Values are cast with ``astype(int)`` and
        interpreted as signed copy number changes: ``0`` is the unaltered
        state, positive values are gains and negative values are losses.
        A NumPy array is first wrapped in a DataFrame, so its columns are
        labelled ``0, 1, 2, ...``.

    Returns
    -------
    pd.DataFrame
        Ternary-encoded DataFrame with values in {-1, 0, 1} and the same index
        as the input. Column names follow the pattern
        'original_column-position' (e.g., 'chr1p-1', 'chr1p-2'), with
        positions starting at 1. Input columns that contain only zeros
        contribute no output columns; if nothing is left to encode (all values
        are zero, or the input has no rows or no columns) a DataFrame with the
        input index and zero columns is returned.

    Raises
    ------
    ValueError
        If input data cannot be converted to integer format.

    Examples
    --------
    Basic encoding of copy number states:

    >>> import pandas as pd
    >>> from picasso.utils import encode_cnvs_as_ternary
    >>>
    >>> cna_data = pd.DataFrame({
    ...     'chr1p': [0, -1, 2, 3],
    ...     'chr2q': [0, 0, 1, 2]
    ... }, index=['Cell_A', 'Cell_B', 'Cell_C', 'Cell_D'])
    >>> print(cna_data)
            chr1p  chr2q
    Cell_A      0      0
    Cell_B     -1      0
    Cell_C      2      1
    Cell_D      3      2
    >>> print(encode_cnvs_as_ternary(cna_data))
            chr1p-1  chr1p-2  chr1p-3  chr2q-1  chr2q-2
    Cell_A        0        0        0        0        0
    Cell_B       -1        0        0        0        0
    Cell_C        1        1        0        1        0
    Cell_D        1        1        1        1        1

    Handling deletions (negative values):

    >>> cna_with_dels = pd.DataFrame({
    ...     'chr3p': [-2, -1, 0, 1, 2],
    ... }, index=[f'Cell_{i}' for i in range(5)])
    >>> print(encode_cnvs_as_ternary(cna_with_dels))
            chr3p-1  chr3p-2
    Cell_0       -1       -1
    Cell_1       -1        0
    Cell_2        0        0
    Cell_3        1        0
    Cell_4        1        1

    Notes
    -----
    **Encoding Rules**:

    - Positive integers n are encoded as n ones followed by zeros:
      [1, 1, ..., 1, 0, 0, ...]
    - Negative integers -n are encoded as n negative ones followed by zeros:
      [-1, -1, ..., -1, 0, 0, ...]
    - Zero values are encoded as all zeros: [0, 0, ...]
    - Column width is determined by the maximum absolute value in each
      original column

    **Performance Considerations**:

    - The number of output columns is the sum of the per-column maximum
      absolute values, so extreme values inflate the result. Consider binning
      extreme values (for example into 'amplified' and 'highly amplified')
      before encoding very noisy data.
    - ``astype(int)`` truncates non-integer floats rather than rejecting them.

    See Also
    --------
    Picasso : Main phylogenetic inference class that accepts ternary-encoded data
    load_data : Function to load example CNA datasets
    """
    # If input is a numpy array, convert it to a DataFrame
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)

    data = data.astype(int)

    # Work on the raw 2-D array and address columns by position, so that
    # duplicated column labels cannot turn a column lookup into a DataFrame.
    values = data.to_numpy()

    encoded_cols = []
    column_names = []

    for position, col in enumerate(data.columns):
        column = values[:, position]
        magnitudes = np.abs(column)
        signs = np.sign(column)

        # A column without rows has no maximum; treat it as "nothing to encode".
        max_magnitude = int(magnitudes.max()) if magnitudes.size else 0

        # Output position ``level`` is non-zero exactly for the cells whose
        # magnitude reaches that level, and carries the sign of the value.
        for level in range(1, max_magnitude + 1):
            encoded_cols.append(signs * (magnitudes >= level))
            column_names.append(f"{col}-{level}")

    # np.column_stack cannot stack an empty list, so return the empty frame
    # (original index, zero columns) explicitly.
    if not encoded_cols:
        return pd.DataFrame(index=data.index)

    return pd.DataFrame(
        np.column_stack(encoded_cols), columns=column_names, index=data.index
    )
def load_data() -> pd.DataFrame:
    """
    Read the example CNA table that ships alongside this module.

    The file ``sample_data/cnas.txt`` is looked up relative to the directory
    containing this module and parsed as tab-separated text, with the first
    column used as the row index.

    Returns
    -------
    pd.DataFrame
        The parsed table: one row per entry of the first file column (used as
        the index) and one column per remaining field of the header row.
        NOTE(review): the meaning of the stored values (which integer denotes
        loss, neutral or gain) is not visible from this function -- confirm
        against the data file before relying on a particular convention.

    Raises
    ------
    FileNotFoundError
        If ``sample_data/cnas.txt`` does not exist next to this module.
    pd.errors.EmptyDataError
        If the data file is empty.

    Examples
    --------
    >>> from picasso import Picasso, load_data
    >>>
    >>> cna_data = load_data()
    >>> print(f"Dataset shape: {cna_data.shape}")
    >>> print(f"Missing values: {cna_data.isnull().sum().sum()}")
    >>>
    >>> # Use with PICASSO
    >>> picasso = Picasso(cna_data, min_clone_size=5)
    >>> picasso.fit()

    See Also
    --------
    Picasso : Main phylogenetic inference class for analyzing the loaded data
    encode_cnvs_as_ternary : Preprocessing function for complex copy number states
    CloneTree : Class for visualizing and analyzing phylogenetic results
    """
    # Resolve the sample file relative to this module rather than the current
    # working directory, so the lookup does not depend on where Python runs.
    module_path = os.path.abspath(__file__)
    base_dir = os.path.dirname(module_path)
    sample_file = f"{base_dir}/sample_data/cnas.txt"

    return pd.read_csv(sample_file, sep="\t", index_col=0)
# Public API: the names exported by a star-import of this module.
__all__ = [
    "encode_cnvs_as_ternary",
    "load_data",
]