# Source code for picasso.CloneTree

"""
CloneTree: Phylogenetic tree analysis and visualization for PICASSO results.

This module provides the CloneTree class for integrating phylogenetic trees with
clone assignments and CNA data. It enables comprehensive analysis and visualization
of phylogenetic reconstruction results, with specific support for noisy scRNA-seq-
inferred CNA data patterns.

Classes
-------
CloneTree
    Integrates phylogenetic trees, clone assignments, and CNA profiles for
    comprehensive analysis and visualization of tumor evolution patterns.

Examples
--------
Basic usage with PICASSO results:

>>> from picasso import Picasso, CloneTree, load_data
>>>
>>> # Load example data and run PICASSO phylogenetic inference
>>> cna_data = load_data()
>>> picasso = Picasso(cna_data)
>>> picasso.fit()
>>>
>>> # Create CloneTree for analysis and visualization
>>> phylogeny = picasso.get_phylogeny()
>>> assignments = picasso.get_clone_assignments()
>>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
>>>
>>> # Generate visualizations
>>> clone_tree.plot_alterations(save_as='heatmap.pdf')
>>> clone_tree.plot_clone_sizes(save_as='sizes.pdf')

Notes
-----
The CloneTree class is designed to handle:
- Integration of phylogenetic trees with cellular data
- Aggregation of noisy CNA profiles by clone
- Visualization of clonal evolution patterns
- Export to publication-ready formats

See Also
--------
Picasso : Main phylogenetic inference algorithm
itol_utils : Functions for iTOL visualization export
utils : Data preprocessing utilities
"""

import pandas as pd
import ete3
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple, Union



[docs]
class CloneTree:
    __clone_phylogeny: ete3.Tree
    __sample_phylogeny: Optional[ete3.Tree]
    clone_profiles: pd.DataFrame
    clone_profiles_certainty: pd.DataFrame


[docs]
    def __init__(
        self,
        phylogeny: ete3.Tree,
        clone_assignments: pd.DataFrame,
        character_matrix: pd.DataFrame,
        clone_aggregation: str = "mode",
        metadata: Optional[pd.DataFrame] = None,
    ) -> None:
        """
        Bind a clone phylogeny to per-sample clone labels and a character matrix.

        The inputs are cross-checked against each other, stored on the instance,
        and the per-sample character matrix is collapsed into one profile per
        clone via :meth:`aggregate_clones`. A one-line summary is printed once
        construction succeeds.

        Parameters
        ----------
        phylogeny : ete3.Tree
            Tree whose leaf names are the clone identifiers.
        clone_assignments : pd.DataFrame
            Indexed by sample. Must contain a ``clone_id`` column whose set of
            values equals the set of leaf names of ``phylogeny``.
        character_matrix : pd.DataFrame
            Samples as rows, features as columns. Its index must hold exactly
            the same set of labels as the index of ``clone_assignments``.
        clone_aggregation : {'mode', 'mean'}, default='mode'
            Aggregation method, matched case-insensitively. ``'mean'`` passes
            validation here but is rejected by :meth:`aggregate_clones`.
        metadata : pd.DataFrame, optional
            Per-sample annotations. When given, its index must hold exactly the
            same set of labels as the index of ``character_matrix``.

        Attributes
        ----------
        clone_assignments : pd.DataFrame
            The ``clone_assignments`` argument, stored as passed.
        character_matrix : pd.DataFrame
            The ``character_matrix`` argument, stored as passed.
        metadata : Optional[pd.DataFrame]
            The ``metadata`` argument, stored as passed.
        clone_profiles : pd.DataFrame
            First element of the pair returned by :meth:`aggregate_clones`.
        clone_profiles_certainty : pd.DataFrame
            Second element of the pair returned by :meth:`aggregate_clones`.

        Raises
        ------
        AssertionError
            If any of the consistency checks described above fails, if
            ``phylogeny`` is not an ``ete3.Tree``, if ``metadata`` is neither
            ``None`` nor a DataFrame, or if ``clone_aggregation`` is not one of
            the accepted names.
        NotImplementedError
            If ``clone_aggregation`` is ``'mean'``.

        Notes
        -----
        All input validation is done with ``assert`` statements, so it is
        skipped entirely when Python runs with ``-O``.

        The tree is stored by reference, not copied; :meth:`root_tree` later
        modifies that same object.

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, character_matrix)
        >>> clone_tree.clone_profiles.shape[0] == assignments["clone_id"].nunique()
        True
        """
        # --- structural checks on the three mandatory inputs -----------------
        assert (
            "clone_id" in clone_assignments.columns
        ), 'The clone assignments must have a column named "clone_id".'
        assert isinstance(phylogeny, ete3.Tree)
        assert set(phylogeny.get_leaf_names()) == set(
            clone_assignments["clone_id"]
        ), "The leaves of the phylogeny do not match the clones in the clone assignments."
        assert set(character_matrix.index) == set(
            clone_assignments.index
        ), "The samples in the assignment matrix do not match the samples in the character matrix."

        # --- aggregation method is accepted case-insensitively ---------------
        method = clone_aggregation.lower()
        assert method in (
            "mode",
            "mean",
        ), 'The clone aggregation method must be either "mode" or "mean".'

        # --- store state; the sample-level tree is built lazily --------------
        self.__clone_phylogeny = phylogeny
        self.__sample_phylogeny = None
        self.clone_assignments = clone_assignments
        self.character_matrix = character_matrix

        # --- optional metadata must line up with the character matrix --------
        assert metadata is None or isinstance(
            metadata, pd.DataFrame
        ), "The metadata must be a pandas DataFrame."
        assert metadata is None or set(metadata.index) == set(
            character_matrix.index
        ), "The samples in the metadata do not match the samples in the character matrix."
        self.metadata = metadata

        # --- collapse samples into one profile per clone ---------------------
        profiles, certainty = self.aggregate_clones(method)
        self.clone_profiles = profiles
        self.clone_profiles_certainty = certainty

        n_clones = len(self.clone_profiles)
        n_samples = len(self.character_matrix)
        print(
            f"Initialized CloneTree with {n_clones} clones and {n_samples} samples."
        )



[docs]
    def aggregate_clones(
        self, aggregation_method: str
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Aggregate CNA profiles within each clone to create representative clone profiles.

        Combines individual cell CNA profiles within each clone into single representative
        profiles using statistical aggregation. This reduces noise and creates clean
        clone-level CNA signatures for downstream analysis and visualization.

        Parameters
        ----------
        aggregation_method : str
            Method for aggregating CNA values within clones:
            - 'mode': Use most frequent copy number state (recommended for noisy data)
            - 'mean': Use average copy number (not yet implemented)

        Returns
        -------
        tuple of (pd.DataFrame, pd.DataFrame)
            First DataFrame: Aggregated clone profiles with clones as rows and genomic
            features as columns. Values represent the aggregated copy number states.
            Second DataFrame: Certainty/confidence scores for each aggregated value,
            indicating reliability of the aggregation.

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>> profiles, certainty = clone_tree.aggregate_clones('mode')
        >>> print(f"Clone profiles shape: {profiles.shape}")
        >>> print(f"Average certainty: {certainty.mean().mean():.2f}")

        Notes
        -----
        **Modal Aggregation**:
        - Finds the most common copy number state for each feature within each clone
        - Handles ties by selecting the first modal value
        - Provides certainty scores based on frequency of the modal state
        - Robust to outlier cells within clones
        - Facilitates visualization of CNA patterns across clones

        **Design for Noisy Data**:
        - Modal aggregation reduces impact of noise and technical artifacts
        - Certainty scores help identify unreliable aggregated values
        - Particularly effective for scRNA-seq-inferred CNA data

        Raises
        ------
        NotImplementedError
            If aggregation_method is 'mean' (not yet implemented).
        ValueError
            If aggregation_method is not 'mode' or 'mean'.

        See Also
        --------
        get_modal_clone_profiles : Internal method implementing modal aggregation
        """
        if aggregation_method == "mode":
            return self.get_modal_clone_profiles()
        elif aggregation_method == "mean":
            raise NotImplementedError("Mean aggregation is not yet implemented.")



[docs]
    def get_most_ancestral_clone(self) -> str:
        """
        Identify the most ancestral clone based on CNA profile complexity.

        Determines which clone represents the most ancestral state by counting the
        number of copy number alterations (deviations from neutral state). This
        is useful for rooting phylogenetic trees and understanding evolutionary
        relationships.

        Returns
        -------
        str
            Clone identifier of the most ancestral clone (fewest alterations).

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>> ancestral = clone_tree.get_most_ancestral_clone()
        >>> print(f"Most ancestral clone: {ancestral}")
        >>>
        >>> # Use for tree rooting
        >>> clone_tree.root_tree(ancestral)

        Notes
        -----
        **Ancestral State Assumptions**:
        - Copy number state 0 is considered the ancestral/neutral state
        - Clones with more alterations are considered more derived
        - Useful for establishing evolutionary directionality

        **Algorithm**:
        1. Count non-zero states for each clone in aggregated profiles
        2. Select clone with minimum alteration count
        3. Return clone identifier

        **Use Cases**:
        - Rooting phylogenetic trees for visualization
        - Identifying putative normal/founder cell populations
        - Understanding tumor evolution trajectories

        See Also
        --------
        root_tree : Method to root the phylogeny using an outgroup clone
        clone_profiles : Aggregated CNA profiles used for ancestral inference
        """
        num_alterations = (self.clone_profiles != 0).sum(axis=1)
        ancestral_clone = num_alterations.idxmin()
        return ancestral_clone



[docs]
    def root_tree(self, outgroup: str) -> None:
        """
        Root the phylogenetic tree using a specified outgroup clone.

        Establishes evolutionary directionality by setting a designated clone as
        the outgroup, which becomes the root of the tree. This is essential for
        proper interpretation of evolutionary relationships and visualization.

        Parameters
        ----------
        outgroup : str
            Identifier of the clone to use as outgroup. Must be present in the
            phylogenetic tree leaves. Often the most ancestral clone identified
            by get_most_ancestral_clone().

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>>
        >>> # Root with most ancestral clone
        >>> ancestral = clone_tree.get_most_ancestral_clone()
        >>> clone_tree.root_tree(ancestral)
        >>>
        >>> # Or root with specific clone
        >>> clone_tree.root_tree('1-0-STOP')

        Notes
        -----
        **Effects of Rooting**:
        - Changes tree topology and evolutionary interpretation
        - Affects all subsequent tree-based analyses
        - Resets sample phylogeny (if previously generated)
        - Essential for proper tree visualization

        **Outgroup Selection Guidelines**:
        - Use most ancestral clone (fewest alterations) when possible
        - Consider biological knowledge about cell populations
        - Avoid clones with many unique alterations

        **Implementation Details**:
        - Uses ete3's set_outgroup() method
        - Invalidates cached sample phylogeny
        - Tree structure is modified in-place

        Raises
        ------
        AssertionError
            If outgroup is not found among the tree leaves.

        See Also
        --------
        get_most_ancestral_clone : Identify suitable outgroup candidates
        get_clone_phylogeny : Access the rooted phylogenetic tree
        get_sample_phylogeny : Generate sample-level tree from rooted clone tree
        """
        assert (
            outgroup in self.__clone_phylogeny.get_leaf_names()
        ), "The outgroup must be a leaf in the tree."
        self.__sample_phylogeny = None
        self.__clone_phylogeny.set_outgroup(outgroup)
        return



[docs]
    def get_clone_phylogeny(self) -> ete3.Tree:
        """
        Access the clone-level phylogenetic tree.

        Returns the phylogenetic tree where leaves represent clones (terminal cell
        populations) and internal nodes represent ancestral populations. This is
        the primary tree structure used for evolutionary analysis.

        Returns
        -------
        ete3.Tree
            Phylogenetic tree with clones as leaves. Tree may be rooted or unrooted
            depending on whether root_tree() has been called.

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>> tree = clone_tree.get_clone_phylogeny()
        >>> print(f"Tree has {len(tree.get_leaves())} clones")
        >>> print("Clone names:", tree.get_leaf_names())
        >>>
        >>> # Tree manipulation
        >>> if not tree.is_root():
        ...     print("Tree is rooted")
        >>>
        >>> # Export to Newick format
        >>> newick_str = tree.write()

        Notes
        -----
        **Tree Structure**:
        - Leaves represent terminal clones from PICASSO analysis
        - Internal nodes represent inferred ancestral states
        - Branch structure reflects evolutionary relationships
        - Node names correspond to clone identifiers

        **Tree States**:
        - May be rooted (after root_tree()) or unrooted
        - Tree topology reflects PICASSO splitting hierarchy
        - Compatible with standard phylogenetic analysis tools

        **Use Cases**:
        - Phylogenetic visualization and analysis
        - Export to external tools (iTOL, FigTree, etc.)
        - Evolutionary distance calculations
        - Tree-based clustering validation

        See Also
        --------
        get_sample_phylogeny : Get expanded tree with individual cells
        root_tree : Root the tree for proper evolutionary interpretation
        """
        return self.__clone_phylogeny



[docs]
    def get_sample_phylogeny(self) -> ete3.Tree:
        """
        Generate expanded phylogenetic tree with individual cells as leaves.

        Creates a detailed tree where each cell/sample appears as a separate leaf,
        while maintaining the clone-based evolutionary structure. Cells within the
        same clone are attached as children of their respective clone nodes.

        Returns
        -------
        ete3.Tree
            Expanded phylogenetic tree where leaves represent individual cells/samples
            rather than clones. Clone nodes become internal nodes with cells as children.

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>> sample_tree = clone_tree.get_sample_phylogeny()
        >>> print(f"Tree has {len(sample_tree.get_leaves())} cells")
        >>>
        >>> # Access cell-specific information
        >>> for leaf in sample_tree.get_leaves():
        ...     print(f"Cell {leaf.name}")
        ...     if clone_tree.metadata is not None:
        ...         print(f"  Metadata: {leaf.features}")

        Notes
        -----
        **Tree Construction**:
        - Starts with clone phylogeny as backbone
        - Adds individual cells as children of clone nodes
        - Preserves evolutionary relationships at clone level
        - Enables cell-level analysis within phylogenetic context

        **Metadata Integration**:
        - If metadata provided, adds features to cell nodes
        - Features accessible via leaf.features or leaf.get_feature()
        - Enables metadata-aware tree visualization

        **Performance Considerations**:
        - Tree generated on first call, then cached
        - Cache invalidated when tree is re-rooted
        - Large datasets may produce complex trees

        **Use Cases**:
        - Cell-level phylogenetic visualization
        - Metadata mapping onto evolutionary structure
        - Detailed iTOL annotations
        - Single-cell evolutionary analysis

        See Also
        --------
        get_clone_phylogeny : Access the underlying clone tree structure
        metadata : Cell-level metadata integrated into tree nodes
        """
        if self.__sample_phylogeny is None:
            cell_tree = self.__clone_phylogeny.copy()
            n_leaves_added = 0
            for clone in cell_tree.get_leaves():
                samples = self.clone_assignments.query(
                    f'clone_id == "{clone.name}"'
                ).index
                for sample in samples:
                    clone.add_child(name=sample)
                    n_leaves_added += 1
            print(f"Added {n_leaves_added} leaves to the tree.")
            assert set(cell_tree.get_leaf_names()) == set(
                self.character_matrix.index
            ), (
                "The samples in the tree do not match the samples in the character "
                "matrix."
            )

            self.__sample_phylogeny = cell_tree

            if self.metadata is not None:
                for sample_node in self.__sample_phylogeny.get_leaves():
                    for column in self.metadata.columns:
                        sample = sample_node.name
                        sample_node.add_feature(
                            column, self.metadata.loc[sample, column]
                        )

        return self.__sample_phylogeny



[docs]
    def infer_evolutionary_changes(self) -> None:
        """
        Infer evolutionary changes along phylogenetic tree branches.

        Reconstructs the specific copy number alterations that occurred at each
        internal node of the phylogenetic tree by analyzing transitions between
        ancestral and derived clone profiles. This method is planned for future
        implementation.

        Raises
        ------
        NotImplementedError
            This method is not yet implemented. Future versions will support
            ancestral state reconstruction and evolutionary change mapping.

        Notes
        -----
        **Planned Functionality**:
        - Ancestral state reconstruction for internal tree nodes
        - Identification of specific CNA events along branches

        **Potential Applications**:
        - Understanding CNA acquisition patterns
        - Identifying driver vs passenger alterations
        - Validating phylogenetic relationships

        See Also
        --------
        clone_profiles : Aggregated clone CNA profiles used for inference
        get_clone_phylogeny : Phylogenetic tree structure for change mapping
        """
        raise NotImplementedError



[docs]
    def plot_alterations(
        self,
        metadata: Optional[pd.DataFrame] = None,
        cmap: str = "coolwarm",
        show: bool = True,
        save_as: Optional[str] = None,
        center: Optional[float] = None,
    ) -> None:
        """
        Draw a heatmap of the character matrix with samples grouped by clone.

        Rows (samples) are sorted by ``clone_id`` and drawn with
        ``seaborn.clustermap`` with both row and column clustering disabled, so
        the function is used only for its colour sidebars. The first sidebar
        encodes the clone of each sample; further sidebars are added for each
        column of ``metadata``.

        Parameters
        ----------
        metadata : pd.DataFrame, optional
            Per-sample annotations, indexed like ``character_matrix``. Each
            column becomes one sidebar. A column whose non-missing values are
            all valid matplotlib colours is used as given; any other column is
            treated as categorical and each distinct value is assigned its own
            colour. Missing values are left missing.
        cmap : str, default='coolwarm'
            Colormap name for the main heatmap.
        show : bool, default=True
            Call ``plt.show()`` after drawing.
        save_as : str, optional
            If given, the figure is saved to this path at 300 dpi.
        center : float, optional
            Value at which to centre the colormap; ``None`` leaves seaborn's
            default behaviour.

        Notes
        -----
        Only the ``clone_id`` column of ``clone_assignments`` is joined onto
        the character matrix, so additional columns in ``clone_assignments``
        never end up in the heatmap.

        Clone colours come from the ``tab20`` palette and categorical metadata
        colours from the ``husl`` palette.

        The ``metadata`` stored on the instance is not used here; pass it
        explicitly if sidebars are wanted.

        The current figure is closed before returning.

        Examples
        --------
        >>> clone_tree.plot_alterations(save_as='cna_heatmap.pdf')
        >>> clone_tree.plot_alterations(metadata=metadata[['cell_type']],
        ...                            cmap='RdBu_r', center=0, show=False)

        See Also
        --------
        plot_clone_sizes : Histogram of clone sizes.
        seaborn.clustermap : Underlying plotting function.
        """
        # Imported here so the method brings its own dependency; matplotlib is
        # already required by this module.
        from matplotlib.colors import is_color_like

        # Attach the clone label to every sample and order rows by clone.
        df = self.character_matrix.join(self.clone_assignments[["clone_id"]])
        df = df.sort_values(by="clone_id")

        # One colour per clone, in order of first appearance after sorting.
        clones = df["clone_id"].unique()
        palette = sns.color_palette("tab20", len(clones))
        clone_cmap = {
            clone: self.rgba_to_hex(color) for clone, color in zip(clones, palette)
        }
        row_colors = pd.DataFrame(df["clone_id"].map(clone_cmap))

        if metadata is not None:
            # seaborn expects colours, not category labels, in row_colors.
            sidebar_colors = metadata.copy()
            for column in sidebar_colors.columns:
                categories = sidebar_colors[column].dropna().unique()
                if all(is_color_like(value) for value in categories):
                    # Already colours: keep the caller's choice untouched.
                    continue
                category_palette = sns.color_palette("husl", len(categories))
                lookup = {
                    category: self.rgba_to_hex(color)
                    for category, color in zip(categories, category_palette)
                }
                sidebar_colors[column] = sidebar_colors[column].map(lookup)
            row_colors = row_colors.join(sidebar_colors)

        # clustermap is used only for its sidebars; clustering is off on both
        # axes. center=None is the heatmap default, so one call covers both cases.
        sns.clustermap(
            df.drop(columns="clone_id"),
            row_colors=row_colors,
            col_cluster=False,
            row_cluster=False,
            cmap=cmap,
            figsize=(10, 10),
            center=center,
        )
        if save_as:
            plt.savefig(save_as, dpi=300)
        if show:
            plt.show()
        plt.close()



[docs]
    def plot_clone_sizes(
        self, show: bool = True, save_as: Optional[str] = None
    ) -> None:
        """
        Plot a histogram of the number of samples per clone.

        Clone sizes are obtained by counting how often each ``clone_id`` occurs
        in ``clone_assignments``. The sizes are drawn with
        ``seaborn.histplot`` with a kernel density estimate overlaid.

        Parameters
        ----------
        show : bool, default=True
            Call ``plt.show()`` after drawing.
        save_as : str, optional
            If given, the figure is saved to this path.

        Notes
        -----
        The x-axis is the clone size and the y-axis the number of clones of
        that size. A new figure is opened for the plot and closed again before
        the method returns.

        Examples
        --------
        >>> clone_tree.plot_clone_sizes()
        >>> clone_tree.plot_clone_sizes(show=False, save_as='clone_sizes.pdf')

        See Also
        --------
        plot_alterations : Heatmap of the character matrix grouped by clone.
        """
        clone_sizes = self.clone_assignments["clone_id"].value_counts()

        plt.figure()
        axes = sns.histplot(clone_sizes, kde=True)
        axes.set_xlabel("Clone Size")
        axes.set_ylabel("Number of Clones")
        axes.set_title("Number of Cells per Clone")
        plt.xticks(rotation=45)

        if save_as:
            plt.savefig(save_as)
        if show:
            plt.show()
        plt.close()



[docs]
    @staticmethod
    def calc_mode(series: pd.Series) -> Union[int, float, None]:
        """
        Calculate the statistical mode (most frequent value) of a pandas Series.

        Computes the most common value in a series, handling edge cases where no
        mode exists or multiple modes are present. Used for aggregating copy number
        states within clones.

        Parameters
        ----------
        series : pd.Series
            Input data series containing numeric values (typically copy number states).

        Returns
        -------
        int, float, or None
            The most frequent value in the series. Returns None if series is empty
            or all values are NaN. If multiple modes exist, returns the first one.

        Examples
        --------
        >>> import pandas as pd
        >>> data = pd.Series([1, 1, 2, 2, 2, 3])
        >>> CloneTree.calc_mode(data)
        2
        >>>
        >>> # Handle ties
        >>> tie_data = pd.Series([1, 1, 2, 2])
        >>> CloneTree.calc_mode(tie_data)  # Returns first mode
        1

        Notes
        -----
        - Uses pandas Series.mode() method internally
        - Handles empty series gracefully by returning None
        - For ties, returns the first modal value (arbitrary but consistent)
        - Designed for integer copy number data but works with any numeric type

        See Also
        --------
        calc_mode_freq : Calculate frequency of the modal value
        get_modal_clone_profiles : Main method using this utility
        """
        mode = series.mode()
        if len(mode) > 0:  # If there's at least one mode
            return mode[0]  # Return the first mode
        return None



[docs]
    @staticmethod
    def calc_mode_freq(series: pd.Series) -> float:
        """
        Calculate the frequency (proportion) of the modal value in a pandas Series.

        Computes what fraction of values in the series match the most frequent value.
        This provides a confidence measure for modal aggregation - higher frequencies
        indicate more reliable consensus within the data.

        Parameters
        ----------
        series : pd.Series
            Input data series containing numeric values (typically copy number states).

        Returns
        -------
        float
            Proportion of values matching the modal value, between 0.0 and 1.0.
            Returns 0.0 if series is empty or contains only NaN values.

        Examples
        --------
        >>> import pandas as pd
        >>> # High consensus
        >>> data = pd.Series([2, 2, 2, 2, 1])
        >>> CloneTree.calc_mode_freq(data)
        0.8  # 4 out of 5 values are modal
        >>>
        >>> # Perfect consensus
        >>> uniform = pd.Series([1, 1, 1, 1])
        >>> CloneTree.calc_mode_freq(uniform)
        1.0
        >>>
        >>> # Low consensus (tie)
        >>> mixed = pd.Series([1, 2, 3, 4])
        >>> CloneTree.calc_mode_freq(mixed)
        0.25  # Each value appears once

        Notes
        -----
        **Interpretation Guide**:
        - 1.0: Perfect consensus, all values identical
        - 0.8-0.9: Strong consensus with few outliers
        - 0.5-0.7: Moderate consensus, some heterogeneity
        - <0.5: Weak consensus, high heterogeneity

        **Use in Clone Analysis**:
        - Quality metric for clone coherence
        - Confidence score for aggregated profiles
        - Filter for reliable clone assignments
        - Identifies noisy or heterogeneous clones

        See Also
        --------
        calc_mode : Calculate the actual modal value
        get_modal_clone_profiles : Main method using this utility for confidence scores
        """
        mode = series.mode()
        if len(mode) > 0:
            return len(series[series == mode[0]]) / len(series)
        return 0



[docs]
    def get_modal_clone_profiles(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Compute modal (most frequent) copy number states for each clone.

        Aggregates CNA profiles within each clone by finding the most common copy
        number state for each genomic feature. Also computes confidence scores
        based on the frequency of the modal state.

        Returns
        -------
        tuple of (pd.DataFrame, pd.DataFrame)
            modal_profiles : pd.DataFrame
                Clone profiles with modal copy number states. Rows are clones,
                columns are genomic features. Values are the most frequent
                copy number state within each clone.
            modal_frequencies : pd.DataFrame
                Confidence scores for modal states. Same structure as modal_profiles
                but values represent the proportion of cells with the modal state
                (0.0 to 1.0, where 1.0 indicates all cells have the same state).

        Examples
        --------
        >>> clone_tree = CloneTree(phylogeny, assignments, cna_data)
        >>> profiles, frequencies = clone_tree.get_modal_clone_profiles()
        >>>
        >>> # Examine profile quality
        >>> avg_confidence = frequencies.mean().mean()
        >>> print(f"Average modal confidence: {avg_confidence:.2f}")
        >>>
        >>> # Find highly confident features
        >>> confident_features = frequencies.columns[frequencies.mean() > 0.8]
        >>> print(f"High confidence features: {len(confident_features)}")

        Notes
        -----
        **Modal Aggregation Process**:
        1. Group cells by clone assignment
        2. For each clone-feature combination, find most frequent copy number state
        3. Calculate frequency of modal state as confidence measure
        4. Handle ties by selecting first modal value

        **Confidence Interpretation**:
        - 1.0: All cells in clone have identical copy number state
        - 0.5-0.9: Majority consensus with some variation
        - <0.5: High heterogeneity, unreliable modal state

        **Noise Handling**:
        - Modal aggregation naturally filters outlier cells
        - Confidence scores identify unreliable aggregations
        - Particularly effective for noisy scRNA-seq-inferred CNAs

        **Applications**:
        - Generate clean clone signatures for visualization
        - Quality control for clone assignments
        - Feature selection based on clone coherence

        See Also
        --------
        calc_mode : Static method for computing modal values
        calc_mode_freq : Static method for computing modal frequencies
        aggregate_clones : Public interface using this method
        """

        # Ensure the indices are aligned
        cnvs = self.character_matrix.loc[self.clone_assignments.index]

        # Merge the two DataFrames on their indices
        merged_df = pd.concat([self.clone_assignments, cnvs], axis=1)

        clone_column = "clone_id"

        # Modal values DataFrame
        modal_df = merged_df.groupby(clone_column).agg(self.calc_mode).reset_index()

        # Frequencies of modal values DataFrame
        freq_df = merged_df.groupby(clone_column).agg(self.calc_mode_freq).reset_index()

        # Set the clone column as the index again for modal_df and freq_df
        modal_df.set_index(clone_column, inplace=True)
        freq_df.set_index(clone_column, inplace=True)

        return modal_df, freq_df



[docs]
    @staticmethod
    def rgba_to_hex(rgba: Tuple[float, ...]) -> str:
        """
        Convert RGBA values to hexadecimal color string.

        Parameters
        ----------
        rgba : tuple
            Tuple of RGBA values.

        Returns
        -------
        str
            Hexadecimal color string.

        Examples
        --------
        >>> rgba_to_hex((1.0, 0.0, 0.0, 1.0))
        '#ff0000'
        """

        # Extract the RGBA values
        if len(rgba) == 3:
            red, green, blue = rgba
        elif len(rgba) == 4:
            red, green, blue, _ = rgba

        # Ensure the values are in the range 0-1
        red = min(1.0, max(0.0, red))
        green = min(1.0, max(0.0, green))
        blue = min(1.0, max(0.0, blue))

        # Convert to hexadecimal and ensure two characters for each value
        red_hex = format(int(red * 255), "02X")
        green_hex = format(int(green * 255), "02X")
        blue_hex = format(int(blue * 255), "02X")

        # Concatenate the hexadecimal values
        hex_color = f"#{red_hex}{green_hex}{blue_hex}"

        return hex_color




# Public API of this module: CloneTree is the only name exported by a
# wildcard import (``from <module> import *``).
__all__ = ["CloneTree"]