diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d17b17..a49ab99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,15 @@ # Changelog -## Version 0.6.0 - 0.6.3 +## Version 0.6.0 - 0.6.4 - Changed related to SummarizedExperiment and implementation of `CompressedGenomicRangesList` in the genomic ranges package. - Update versions of relevant dependency packages. - Rename `reduced_dims` to `reduced_dimensions`. - Implement coercions to/from RSE/SE. - Access data stored in `raw` (if available) as `alternative_experiments`, when initializing `SingleCellExperiment` objects from anndata/h5ad files. +- Fix bug when slicing objects containing row or column pairs. +- Add getters/setters for `sizeFactors`, modifying this in the column_data of the object. +- Improve test coverage and fix bugs. ## Version 0.5.8 - 0.5.9 diff --git a/docs/tutorial.md b/docs/tutorial.md index 43081c7..b660d55 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -179,6 +179,35 @@ One can access an reduced dimension by index or name: sce.reduced_dim(0) # same as se.reduced_dim("random_embeds") ``` +## Size Factors + +In Bioconductor, size factors represent scaling factors used to normalize cell-specific biases (such as differences in sequencing depth). Following the Bioconductor design, `SingleCellExperiment` stores size factors directly inside the column data under the column name `"sizeFactors"`. + +You can set, retrieve, and delete size factors using either functional methods or the `size_factors` property: + +```{code-cell} +# Generate mock size factors (e.g. library size scaling factors) +cell_depths = counts.sum(axis=0) +if isinstance(cell_depths, np.matrix) or hasattr(cell_depths, "A"): + cell_depths = np.array(cell_depths).flatten() + +# Scale factors so that their mean is 1 +size_factors = cell_depths / np.mean(cell_depths) + +# Set size factors on the experiment +sce.set_size_factors(size_factors, in_place=True) + +# Access size factors +print("Retrieved size factors:", sce.get_size_factors()) + +# They reside directly in the column_data under "sizeFactors": +print("Column data 'sizeFactors':", sce.column_data["sizeFactors"]) + +# To delete/remove size factors, set them to None +sce.set_size_factors(None, in_place=True) +print("After deletion, is 'sizeFactors' in column_data?", "sizeFactors" in sce.column_data.column_names) +``` + ## Subset experiments You can subset experimental data by using the subset (`[]`) operator. This operation accepts different slice input types, such as a boolean vector, a `slice` object, a list of indices, or names (if available) to subset. diff --git a/pyproject.toml b/pyproject.toml index 00aa968..7d7c7bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ version_scheme = "no-guess-dev" [tool.ruff] line-length = 120 src = ["src"] -exclude = ["tests"] +# exclude = ["tests"] extend-ignore = ["F821"] [tool.ruff.pydocstyle] diff --git a/src/singlecellexperiment/SingleCellExperiment.py b/src/singlecellexperiment/SingleCellExperiment.py index 745e206..511526c 100644 --- a/src/singlecellexperiment/SingleCellExperiment.py +++ b/src/singlecellexperiment/SingleCellExperiment.py @@ -6,6 +6,7 @@ import biocframe import biocutils as ut +import numpy as np from summarizedexperiment import SummarizedExperiment from summarizedexperiment._combineutils import ( check_assays_are_equal, @@ -75,10 +76,32 @@ def _validate_alternative_experiments(alternative_experiments, shape, column_nam warn(f"Column names do not match for alternative_experiment: {alt_name}", UserWarning) -def _validate_pairs(pairs): +def _validate_size_factors(size_factors, shape): + if size_factors is not None: + if not hasattr(size_factors, "__len__"): + raise TypeError("'size_factors' must be a sequence-like object.") + if len(size_factors) != shape[1]: + raise ValueError("'size_factors' length must match the number of columns.") + + +def _validate_pairs(pairs, expected_dim, name): if pairs is not None: if not isinstance(pairs, dict): - raise TypeError("Pair is not a dictionary.") + raise TypeError(f"'{name}' is not a dictionary.") + + for k, v in pairs.items(): + if not hasattr(v, "shape"): + raise TypeError( + f"Pair '{k}' in '{name}' must be a matrix-like object. Does not contain a `shape` property." + ) + + if len(v.shape) != 2: + raise ValueError(f"Pair '{k}' in '{name}' must be 2-dimensional.") + + if v.shape[0] != expected_dim or v.shape[1] != expected_dim: + raise ValueError( + f"Pair '{k}' in '{name}' must be a square matrix of shape ({expected_dim}, {expected_dim})." + ) class SingleCellExperiment(RangedSummarizedExperiment): @@ -115,6 +138,7 @@ def __init__( alternative_experiments: Optional[Dict[str, Any]] = None, row_pairs: Optional[Any] = None, column_pairs: Optional[Any] = None, + size_factors: Optional[Union[np.ndarray, List[float], Sequence[float]]] = None, alternative_experiment_check_dim_names: bool = True, _validate: bool = True, **kwargs, @@ -201,6 +225,11 @@ def __init__( Defaults to None. + size_factors: + Cell size factors. + + Defaults to None. + _validate: Internal use only. @@ -237,6 +266,15 @@ def __init__( self._row_pairs = row_pairs if row_pairs is not None else {} self._column_pairs = column_pairs if column_pairs is not None else {} + if size_factors is not None: + _new_sf = np.array(size_factors, dtype=np.float64) + if _validate: + _validate_size_factors(_new_sf, self._shape) + + self._cols = self._cols.set_column("sizeFactors", _new_sf, in_place=True) + elif _validate and "sizeFactors" in self._cols.column_names: + _validate_size_factors(np.array(self._cols.column("sizeFactors"), dtype=np.float64), self._shape) + if _validate: _validate_reduced_dims(self._reduced_dims, self._shape) _validate_alternative_experiments( @@ -245,8 +283,8 @@ def __init__( self.get_column_names(), with_dim_names=alternative_experiment_check_dim_names, ) - _validate_pairs(self._row_pairs) - _validate_pairs(self._column_pairs) + _validate_pairs(self._row_pairs, self._shape[0], "row_pairs") + _validate_pairs(self._column_pairs, self._shape[1], "column_pairs") ######################### ######>> Copying <<###### @@ -354,6 +392,10 @@ def __repr__(self) -> str: if len(self._column_pairs) > 0: output += ", column_pairs=" + ut.print_truncated_dict(self._column_pairs) + _sf = self.get_size_factors() + if _sf is not None: + output += ", size_factors=" + ut.print_truncated_list(_sf) + if len(self._metadata) > 0: output += ", metadata=" + ut.print_truncated_dict(self._metadata) @@ -388,6 +430,10 @@ def __str__(self) -> str: output += f"alternative_experiments({len(self.alternative_experiment_names)}): {ut.print_truncated_list(self.alternative_experiment_names)}\n" output += f"row_pairs({len(self.row_pair_names)}): {ut.print_truncated_list(self.row_pair_names)}\n" output += f"column_pairs({len(self.column_pair_names)}): {ut.print_truncated_list(self.column_pair_names)}\n" + _sf = self.get_size_factors() + output += ( + f"size_factors({0 if _sf is None else len(_sf)}): {' ' if _sf is None else ut.print_truncated_list(_sf)}\n" + ) output += f"metadata({str(len(self.metadata))}): {ut.print_truncated_list(list(self.metadata.keys()), sep=' ', include_brackets=False, transform=lambda y: y)}\n" @@ -498,9 +544,10 @@ def set_reduced_dimension_names(self, names: List[str], in_place: bool = False) if len(names) != len(current_names): raise ValueError("Length of 'names' does not match the number of `reduced_dims`.") + _tmp = self._reduced_dims.copy() new_reduced_dims = OrderedDict() for idx in range(len(names)): - new_reduced_dims[names[idx]] = self._reduced_dims.pop(current_names[idx]) + new_reduced_dims[names[idx]] = _tmp.pop(current_names[idx]) output = self._define_output(in_place) output._reduced_dims = new_reduced_dims @@ -562,7 +609,7 @@ def get_reduced_dimension(self, name: Union[str, int]) -> Any: if name < 0: raise IndexError("Index cannot be negative.") - if name > len(self.reduced_dim_names): + if name >= len(self.reduced_dim_names): raise IndexError("Index greater than the number of reduced dimensions.") return self._reduced_dims[self.reduced_dim_names[name]] @@ -752,9 +799,10 @@ def set_alternative_experiment_names(self, names: List[str], in_place: bool = Fa if len(names) != len(current_names): raise ValueError("Length of 'names' does not match the number of `alternative_experiments`.") + _tmp = self._alternative_experiments.copy() new_alt_expts = OrderedDict() for idx in range(len(names)): - new_alt_expts[names[idx]] = self._alternative_experiments.pop(current_names[idx]) + new_alt_expts[names[idx]] = _tmp.pop(current_names[idx]) output = self._define_output(in_place) output._alternative_experiments = new_alt_expts @@ -807,7 +855,7 @@ def get_alternative_experiment(self, name: Union[str, int], with_dim_names: bool if name < 0: raise IndexError("Index cannot be negative.") - if name > len(self.alternative_experiment_names): + if name >= len(self.alternative_experiment_names): raise IndexError("Index greater than the number of alternative experiments.") _out = self._alternative_experiments[self.alternative_experiment_names[name]] @@ -895,7 +943,7 @@ def set_row_pairs(self, pairs: Dict[str, Any], in_place: bool = False) -> Single A modified ``SingleCellExperiment`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ - _validate_pairs(pairs) + _validate_pairs(pairs, self.shape[0], "row_pairs") output = self._define_output(in_place) output._row_pairs = pairs @@ -945,9 +993,10 @@ def set_row_pair_names(self, names: List[str], in_place: bool = False) -> Single if len(names) != len(current_names): raise ValueError("Length of 'names' does not match the number of `row_pairs`.") + _tmp = self._row_pairs.copy() new_row_pairs = OrderedDict() for idx in range(len(names)): - new_row_pairs[names[idx]] = self._row_pairs.pop(current_names[idx]) + new_row_pairs[names[idx]] = _tmp.pop(current_names[idx]) output = self._define_output(in_place) output._row_pairs = new_row_pairs @@ -993,7 +1042,7 @@ def set_column_pairs(self, pairs: Dict[str, Any], in_place: bool = False) -> Sin A modified ``SingleCellExperiment`` object, either as a copy of the original or as a reference to the (in-place-modified) original. """ - _validate_pairs(pairs) + _validate_pairs(pairs, self.shape[1], "column_pairs") output = self._define_output(in_place) output._column_pairs = pairs @@ -1043,9 +1092,10 @@ def set_column_pair_names(self, names: List[str], in_place: bool = False) -> Sin if len(names) != len(current_names): raise ValueError("Length of 'names' does not match the number of `column_pairs`.") + _tmp = self._column_pairs.copy() new_column_pairs = OrderedDict() for idx in range(len(names)): - new_column_pairs[names[idx]] = self._column_pairs.pop(current_names[idx]) + new_column_pairs[names[idx]] = _tmp.pop(current_names[idx]) output = self._define_output(in_place) output._column_pairs = new_column_pairs @@ -1065,6 +1115,404 @@ def column_pair_names(self, names: List[str]): ) self.set_column_pair_names(names, in_place=True) + ################################## + ######>> size_factors <<########## + ################################## + + def get_size_factors(self, on_absence: str = "none") -> Optional[np.ndarray]: + """Access size factors. + + Args: + on_absence: + Behavior when size factors are absent: + - "none": returns None. + - "warn": issues a UserWarning and returns None. + - "error": raises a ValueError. + + Returns: + A numpy array containing size factors, or None. + """ + sf = None + if "sizeFactors" in self._cols.column_names: + sf = np.array(self._cols.column("sizeFactors"), dtype=np.float64) + + if sf is None: + if on_absence == "error": + raise ValueError("Size factors are not set.") + elif on_absence == "warn": + warn("Size factors are not set.", UserWarning) + elif on_absence != "none": + raise ValueError(f"Invalid 'on_absence' value: '{on_absence}'. Must be 'none', 'warn', or 'error'.") + + return sf + + def set_size_factors( + self, + size_factors: Optional[Union[np.ndarray, List[float], Sequence[float]]], + in_place: bool = False, + ) -> SingleCellExperiment: + """Set new size factors. + + Args: + size_factors: + New size factors. + + in_place: + Whether to modify the ``SingleCellExperiment`` in place. + + Returns: + A modified ``SingleCellExperiment`` object, either as a copy of the original + or as a reference to the (in-place-modified) original. + """ + if size_factors is not None: + _new_sf = np.array(size_factors, dtype=np.float64) + _validate_size_factors(_new_sf, self.shape) + else: + _new_sf = None + + output = self._define_output(in_place) + if _new_sf is not None: + output._cols = output._cols.set_column("sizeFactors", _new_sf, in_place=in_place) + else: + if "sizeFactors" in output._cols.column_names: + output._cols = output._cols.remove_column("sizeFactors", in_place=in_place) + + return output + + @property + def size_factors(self) -> Optional[np.ndarray]: + """Accessor for size factors.""" + return self.get_size_factors() + + @size_factors.setter + def size_factors(self, size_factors: Optional[Union[np.ndarray, List[float], Sequence[float]]]): + """Set size factors in-place.""" + warn( + "Setting property 'size_factors' is an in-place operation, use 'set_size_factors' instead", + UserWarning, + ) + self.set_size_factors(size_factors, in_place=True) + + #################################### + ######>> row_pair / col_pair <<##### + #################################### + + def get_row_pair(self, name: Union[str, int]) -> Any: + """Access a row pair by name or index. + + Args: + name: + Name or index of the row pair. + + Returns: + The row pair matrix. + """ + if isinstance(name, int): + if name < 0: + raise IndexError("Index cannot be negative.") + + if name >= len(self.row_pair_names): + raise IndexError("Index greater than the number of row pairs.") + + return self._row_pairs[self.row_pair_names[name]] + elif isinstance(name, str): + if name not in self._row_pairs: + raise AttributeError(f"Row pair: '{name}' does not exist.") + + return self._row_pairs[name] + + raise TypeError(f"'name' must be a string or integer, provided '{type(name)}'.") + + def set_row_pair(self, name: str, pair: Any, in_place: bool = False) -> SingleCellExperiment: + """Add or replace a row pair. + + Args: + name: + Name of the row pair. + + pair: + The row pair matrix. + + in_place: + Whether to modify the object in place. + + Returns: + A modified ``SingleCellExperiment`` object. + """ + output = self._define_output(in_place) + + _tmp = output._row_pairs + if not in_place: + _tmp = _tmp.copy() + _tmp[name] = pair + + _validate_pairs(_tmp, self.shape[0], "row_pairs") + output._row_pairs = _tmp + return output + + def get_column_pair(self, name: Union[str, int]) -> Any: + """Access a column pair by name or index. + + Args: + name: + Name or index of the column pair. + + Returns: + The column pair matrix. + """ + if isinstance(name, int): + if name < 0: + raise IndexError("Index cannot be negative.") + + if name >= len(self.column_pair_names): + raise IndexError("Index greater than the number of column pairs.") + + return self._column_pairs[self.column_pair_names[name]] + elif isinstance(name, str): + if name not in self._column_pairs: + raise AttributeError(f"Column pair: '{name}' does not exist.") + + return self._column_pairs[name] + + raise TypeError(f"'name' must be a string or integer, provided '{type(name)}'.") + + def set_column_pair(self, name: str, pair: Any, in_place: bool = False) -> SingleCellExperiment: + """Add or replace a column pair. + + Args: + name: + Name of the column pair. + pair: + The column pair matrix. + in_place: + Whether to modify the object in place. + + Returns: + A modified ``SingleCellExperiment`` object. + """ + output = self._define_output(in_place) + + _tmp = output._column_pairs + if not in_place: + _tmp = _tmp.copy() + _tmp[name] = pair + + _validate_pairs(_tmp, self.shape[1], "column_pairs") + output._column_pairs = _tmp + return output + + ######################################### + ######>> alt_exps workflows <<########### + ######################################### + + def swap_alt_exp( + self, + name: Union[str, int], + saved: Optional[str] = None, + with_col_data: bool = True, + in_place: bool = False, + ) -> SingleCellExperiment: + """Swap main experiment with an alternative experiment. + + Args: + name: + Name or index of the alternative experiment to promote. + + saved: + Name to save the current main experiment as an alternative experiment. + If None, the current main experiment is not saved. + + with_col_data: + Whether to keep the column data, reduced dimensions, column pairs, + and size factors of the current main experiment. + + in_place: + Whether to modify the object in place. + + Returns: + A modified ``SingleCellExperiment`` object. + """ + alt_exp = self.get_alternative_experiment(name, with_dim_names=False) + alt_name = name if isinstance(name, str) else self.alternative_experiment_names[name] + + # Prepare new alternative experiments dict + new_alt_expts = self.alternative_experiments.copy() + new_alt_expts.pop(alt_name) + + if saved is not None: + saved_exp = self.copy() + saved_exp._alternative_experiments = {} + new_alt_expts[saved] = saved_exp + + # Build the new class constructor arguments + new_assays = alt_exp.assays + new_row_data = alt_exp.row_data + new_row_ranges = getattr(alt_exp, "row_ranges", None) + new_row_names = alt_exp.row_names + + if with_col_data: + new_col_data = self.column_data + new_col_names = self.column_names + new_reduced_dims = self._reduced_dims + new_column_pairs = self._column_pairs + else: + new_col_data = alt_exp.column_data + new_col_names = alt_exp.column_names + new_reduced_dims = getattr(alt_exp, "_reduced_dims", None) + new_column_pairs = getattr(alt_exp, "_column_pairs", None) + + output = self._define_output(in_place) + output._assays = new_assays + output._rows = new_row_data + output._row_ranges = new_row_ranges + output._row_names = new_row_names + output._cols = new_col_data + output._column_names = new_col_names + output._reduced_dims = new_reduced_dims if new_reduced_dims is not None else {} + output._column_pairs = new_column_pairs if new_column_pairs is not None else {} + output._alternative_experiments = new_alt_expts + output._shape = (new_row_data.shape[0], new_col_data.shape[0]) + + return output + + def split_alt_exps( + self, + f: Union[str, Sequence], + ref: Optional[str] = None, + in_place: bool = False, + ) -> SingleCellExperiment: + """Split the main experiment into alternative experiments based on a grouping vector. + + Args: + f: + A column name in ``row_data`` or a sequence of the same length as ``shape[0]`` + specifying the group for each feature. + + ref: + The group name that should remain in the main experiment. + If None, the first unique group name in ``f`` is used. + + in_place: + Whether to modify the object in place. + + Returns: + A modified ``SingleCellExperiment`` object. + """ + if isinstance(f, str): + if f not in self.row_data.column_names: + raise ValueError(f"Column '{f}' not found in row_data.") + + groups = list(self.row_data.column(f)) + else: + groups = list(f) + + if len(groups) != self.shape[0]: + raise ValueError("Length of 'f' must match the number of rows.") + + unique_groups = [] + for g in groups: + if g not in unique_groups: + unique_groups.append(g) + + if len(unique_groups) == 0: + raise ValueError("No groups found in 'f'.") + + if ref is None: + ref = unique_groups[0] + elif ref not in unique_groups: + raise ValueError(f"Reference group '{ref}' not found in groups.") + + group_indices = {g: [] for g in unique_groups} + for idx, g in enumerate(groups): + group_indices[g].append(idx) + + new_alt_expts = self.alternative_experiments.copy() + + for g, indices in group_indices.items(): + if g == ref: + continue + + sub_exp = self[indices, :] + sub_exp._alternative_experiments = {} + new_alt_expts[str(g)] = sub_exp + + ref_indices = group_indices[ref] + + if in_place: + ref_sliced = self[ref_indices, :] + self._assays = ref_sliced.assays + self._rows = ref_sliced.row_data + self._row_ranges = ref_sliced.row_ranges + self._row_names = ref_sliced.row_names + self._shape = ref_sliced._shape + self._alternative_experiments = new_alt_expts + + return self + else: + ref_sliced = self[ref_indices, :] + ref_sliced._alternative_experiments = new_alt_expts + + return ref_sliced + + def unsplit_alt_exps( + self, + names: Optional[Sequence[str]] = None, + in_place: bool = False, + ) -> SingleCellExperiment: + """Recombine alternative experiments back into the main experiment by row. + + Args: + names: + Names of the alternative experiments to unsplit. + If None, all alternative experiments are unsplit. + in_place: + Whether to modify the object in place. + + Returns: + A modified ``SingleCellExperiment`` object. + """ + if names is None: + names = self.alternative_experiment_names + + if len(names) == 0: + return self if in_place else self.copy() + + to_combine = [self] + for name in names: + if name not in self.alternative_experiment_names: + raise ValueError(f"Alternative experiment '{name}' not found.") + + alt = self.get_alternative_experiment(name) + if not isinstance(alt, SingleCellExperiment): + if hasattr(alt, "row_ranges"): + alt = SingleCellExperiment.from_rangedsummarizedexperiment(alt) + else: + alt = SingleCellExperiment.from_summarizedexperiment(alt) + + to_combine.append(alt) + + import biocutils as ut + + combined = ut.relaxed_combine_rows(*to_combine) + remaining_alts = {k: v for k, v in self.alternative_experiments.items() if k not in names} + + if in_place: + self._assays = combined.assays + self._rows = combined.row_data + self._row_ranges = combined.row_ranges + self._row_names = combined.row_names + self._cols = combined.column_data + self._column_names = combined.column_names + self._reduced_dims = combined._reduced_dims + self._column_pairs = combined._column_pairs + self._alternative_experiments = remaining_alts + self._shape = combined._shape + + return self + else: + combined._alternative_experiments = remaining_alts + return combined + ########################## ######>> slicers <<####### ########################## @@ -1089,7 +1537,13 @@ def get_slice( new_reduced_dims = {} for rdim, rmat in self._reduced_dims.items(): if do_slice_cols: - rmat = rmat[slicer.col_indices, :] + if hasattr(rmat, "iloc"): + rmat = rmat.iloc[slicer.col_indices, :] + else: + try: + rmat = rmat[slicer.col_indices, :] + except Exception: + rmat = rmat[slicer.col_indices] new_reduced_dims[rdim] = rmat @@ -1103,14 +1557,14 @@ def get_slice( new_row_pairs = {} for rname, rpair in self._row_pairs.items(): if do_slice_rows: - rpair = rpair[slicer.row_indices, :] + rpair = rpair[slicer.row_indices, :][:, slicer.row_indices] new_row_pairs[rname] = rpair new_col_pairs = {} for cname, cpair in self._column_pairs.items(): if do_slice_cols: - cpair = cpair[:, slicer.col_indices] + cpair = cpair[slicer.col_indices, :][:, slicer.col_indices] new_col_pairs[cname] = cpair diff --git a/src/singlecellexperiment/_combineutils.py b/src/singlecellexperiment/_combineutils.py index 06bf466..7e139e0 100644 --- a/src/singlecellexperiment/_combineutils.py +++ b/src/singlecellexperiment/_combineutils.py @@ -57,12 +57,21 @@ def relaxed_merge_numpy_generic(se, by, attr, names_attr): _all_assays = {} for k in _all_keys: + dim_size = 2 + for y in se: + if k in getattr(y, names_attr): + _mat = getattr(y, attr)[k] + if hasattr(_mat, "shape") and len(_mat.shape) > 1: + dim_size = _mat.shape[1] + + break + _all_mats = [] for x in se: _txmat = None if k not in getattr(x, names_attr): _txmat = np.ma.array( - np.zeros(shape=x.shape), + np.zeros(shape=(x.shape[1], dim_size)), mask=True, ) else: diff --git a/src/singlecellexperiment/_ioutils.py b/src/singlecellexperiment/_ioutils.py index c1a8f38..322148f 100644 --- a/src/singlecellexperiment/_ioutils.py +++ b/src/singlecellexperiment/_ioutils.py @@ -6,12 +6,11 @@ def _to_normal_dict(obj): - norm_obj = obj - if len(norm_obj.keys()) == 0: - norm_obj = None - else: - norm_obj = OrderedDict() - for okey, oval in norm_obj.items(): - norm_obj[okey] = oval + if len(obj.keys()) == 0: + return None + + norm_obj = OrderedDict() + for okey, oval in obj.items(): + norm_obj[okey] = oval return norm_obj diff --git a/tests/data/mocks.py b/tests/data/mocks.py index 4e5ceae..57ae120 100644 --- a/tests/data/mocks.py +++ b/tests/data/mocks.py @@ -1,5 +1,3 @@ -from random import random - import numpy as np import pandas as pd from biocframe import BiocFrame diff --git a/tests/test_sce.py b/tests/test_sce.py index df025c6..9881f7a 100644 --- a/tests/test_sce.py +++ b/tests/test_sce.py @@ -3,8 +3,8 @@ import genomicranges import numpy as np import pandas as pd -from biocframe import BiocFrame import pytest +from biocframe import BiocFrame from summarizedexperiment import SummarizedExperiment from singlecellexperiment import SingleCellExperiment @@ -51,18 +51,14 @@ def test_SCE_creation(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) assert tse is not None assert isinstance(tse, sce) def test_SCE_creation_with_alts(): - tse = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + tse = SummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) tse = SingleCellExperiment( assays={"counts": counts}, @@ -107,9 +103,7 @@ def test_SCE_creation_with_alts_should_fail(): } ) - tse = SummarizedExperiment( - assays={"counts": acounts}, row_data=adf_gr, column_data=acol_data - ) + tse = SummarizedExperiment(assays={"counts": acounts}, row_data=adf_gr, column_data=acol_data) with pytest.raises(Exception): tse = SingleCellExperiment( @@ -119,10 +113,9 @@ def test_SCE_creation_with_alts_should_fail(): alternative_experiments={"alt": tse}, ) + def test_SCE_creation_modifications(): - rse = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + rse = SummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) tse = SingleCellExperiment( assays={"counts": counts}, @@ -144,13 +137,14 @@ def test_SCE_creation_modifications(): tse.set_reduced_dimension("something", np.random.rand(tse.shape[1], 4), in_place=True) assert nassay_tse.get_reduced_dimension_names() == tse.get_reduced_dimension_names() + def test_SCE_different_alt_names(): rse = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=pd.DataFrame(index = ["ChIP"] * 6 ) + assays={"counts": counts}, row_data=row_data, column_data=pd.DataFrame(index=["ChIP"] * 6) ) with pytest.raises(Exception): - tse = SingleCellExperiment( + SingleCellExperiment( assays={"counts": counts}, row_data=row_data, column_data=col_data, @@ -158,30 +152,26 @@ def test_SCE_different_alt_names(): ) with pytest.raises(Exception): - tse = SingleCellExperiment( + SingleCellExperiment( assays={"counts": counts}, row_data=row_data, - column_data=pd.DataFrame(index = ["ChIP", "Input"] * 3), + column_data=pd.DataFrame(index=["ChIP", "Input"] * 3), alternative_experiments={"alt": rse}, ) with pytest.raises(Exception): - tse = SingleCellExperiment( + SingleCellExperiment( assays={"counts": counts}, row_data=row_data, - column_data=pd.DataFrame(index = ["ChIP", "Input", "Input"] * 2), + column_data=pd.DataFrame(index=["ChIP", "Input", "Input"] * 2), alternative_experiments={"alt": rse}, ) + def test_SCE_dims(): embeds = np.random.rand(counts.shape[1], 4) tse = SingleCellExperiment( - assays={"counts": counts}, - row_data=row_data, - column_data=col_data, - reduced_dimensions={ - "something": embeds - } + assays={"counts": counts}, row_data=row_data, column_data=col_data, reduced_dimensions={"something": embeds} ) assert tse is not None @@ -189,12 +179,7 @@ def test_SCE_dims(): assert tse.get_reduced_dimension_names() == ["something"] tse2 = SingleCellExperiment( - assays={"counts": counts}, - row_data=row_data, - column_data=col_data, - reduced_dims={ - "something": embeds - } + assays={"counts": counts}, row_data=row_data, column_data=col_data, reduced_dims={"something": embeds} ) assert tse2 is not None @@ -205,15 +190,73 @@ def test_SCE_dims(): assert np.allclose(tse.get_reduced_dimension("something"), tse2.get_reduced_dimension("something")) - with pytest.raises(Exception, match="Either 'reduced_dims' or 'reduced_dimensions' should be provided, but not both."): + with pytest.raises( + Exception, match="Either 'reduced_dims' or 'reduced_dimensions' should be provided, but not both." + ): SingleCellExperiment( assays={"counts": counts}, row_data=row_data, column_data=col_data, - reduced_dims={ - "something": embeds - }, - reduced_dimensions={ - "something": embeds - } + reduced_dims={"something": embeds}, + reduced_dimensions={"something": embeds}, ) + + +def test_validation_functions(): + from singlecellexperiment.SingleCellExperiment import ( + _validate_alternative_experiments, + _validate_pairs, + _validate_reduced_dims, + _validate_size_factors, + ) + + shape = (nrows, ncols) + + with pytest.raises(ValueError, match="'reduced_dims' cannot be `None`"): + _validate_reduced_dims(None, shape) + with pytest.raises(TypeError, match="'reduced_dims' is not a dictionary"): + _validate_reduced_dims("not_a_dict", shape) + with pytest.raises(TypeError, match="must be a matrix-like object"): + _validate_reduced_dims({"umap": "not_matrix"}, shape) + with pytest.raises(ValueError, match="does not contain embeddings for all cells"): + _validate_reduced_dims({"umap": np.zeros((ncols + 1, 2))}, shape) + + with pytest.raises(ValueError, match="'alternative_experiments' cannot be `None`"): + _validate_alternative_experiments(None, shape, ["1", "2", "3", "4", "5", "6"]) + with pytest.raises(TypeError, match="'alternative_experiments' is not a dictionary"): + _validate_alternative_experiments("not_a_dict", shape, ["1", "2", "3", "4", "5", "6"]) + with pytest.raises(TypeError, match="must be a 2-dimensional object"): + _validate_alternative_experiments({"alt": "not_exp"}, shape, ["1", "2", "3", "4", "5", "6"]) + + se_wrong_cells = SummarizedExperiment(assays={"counts": np.zeros((nrows, ncols + 1))}) + with pytest.raises(ValueError, match="does not contain same number of cells"): + _validate_alternative_experiments({"alt": se_wrong_cells}, shape, ["1", "2", "3", "4", "5", "6"]) + + se_wrong_names = SummarizedExperiment( + assays={"counts": np.zeros((nrows, ncols))}, + column_data=BiocFrame({}, number_of_rows=ncols), + column_names=["A", "B", "C", "D", "E", "F"], + ) + with pytest.raises(Exception, match="Column names do not match"): + _validate_alternative_experiments( + {"alt": se_wrong_names}, shape, ["1", "2", "3", "4", "5", "6"], with_dim_names=True + ) + + with pytest.warns(UserWarning, match="Column names do not match"): + _validate_alternative_experiments( + {"alt": se_wrong_names}, shape, ["1", "2", "3", "4", "5", "6"], with_dim_names=False + ) + + with pytest.raises(TypeError, match="'size_factors' must be a sequence-like object"): + _validate_size_factors(5, shape) + with pytest.raises(ValueError, match="'size_factors' length must match the number of columns"): + _validate_size_factors(np.zeros(ncols - 1), shape) + + with pytest.raises(TypeError, match="'row_pairs' is not a dictionary"): + _validate_pairs("not_a_dict", nrows, "row_pairs") + with pytest.raises(TypeError, match="must be a matrix-like object"): + _validate_pairs({"p1": "not_matrix"}, nrows, "row_pairs") + with pytest.raises(ValueError, match="must be 2-dimensional"): + _validate_pairs({"p1": np.zeros(nrows)}, nrows, "row_pairs") + with pytest.raises(ValueError, match="must be a square matrix"): + _validate_pairs({"p1": np.zeros((nrows, nrows + 1))}, nrows, "row_pairs") diff --git a/tests/test_sce_combine_cols.py b/tests/test_sce_combine_cols.py index 42ce42f..b468c00 100644 --- a/tests/test_sce_combine_cols.py +++ b/tests/test_sce_combine_cols.py @@ -1,42 +1,28 @@ -from random import random - -import anndata -import genomicranges -from biocframe import BiocFrame import biocutils import numpy as np -import pandas as pd -from mudata import MuData +import pytest -import singlecellexperiment from singlecellexperiment.SingleCellExperiment import SingleCellExperiment -import pytest - __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" def test_combine_columns(experiments): - combined = biocutils.combine_columns( - experiments.se_unnamed, experiments.se_unnamed_2 - ) + combined = biocutils.combine_columns(experiments.se_unnamed, experiments.se_unnamed_2) assert combined is not None assert isinstance(combined, SingleCellExperiment) assert len(combined.alternative_experiments) == 0 assert len(combined.column_data["A"]) == 20 - combined2 = experiments.se_unnamed.combine_columns( - experiments.se_unnamed_2 - ) + combined2 = experiments.se_unnamed.combine_columns(experiments.se_unnamed_2) assert combined2 is not None assert isinstance(combined2, SingleCellExperiment) assert len(combined2.alternative_experiments) == 0 assert len(combined2.column_data["A"]) == 20 - def test_relaxed_combine_columns(experiments): ncols = 10 nrows = 100 @@ -65,9 +51,7 @@ def test_relaxed_combine_columns(experiments): def test_combine_with_alts(experiments): - combined = biocutils.combine_columns( - experiments.se_with_alts1, experiments.se_with_alts2 - ) + combined = biocutils.combine_columns(experiments.se_with_alts1, experiments.se_with_alts2) assert combined is not None assert isinstance(combined, SingleCellExperiment) print(combined) @@ -75,3 +59,18 @@ def test_combine_with_alts(experiments): assert len(combined.reduced_dim_names) == 1 assert combined.reduced_dim_names == ["PCA"] assert len(combined.row_data["seqnames"]) == 3 + + +def test_combine_utils_errors_and_masks(): + nrows_loc = 10 + ncols_loc = 4 + counts_loc = np.random.rand(nrows_loc, ncols_loc) + + sce1 = SingleCellExperiment(assays={"counts": counts_loc}, reduced_dimensions={"PCA": np.ones((ncols_loc, 2))}) + sce2 = SingleCellExperiment(assays={"counts": counts_loc}, reduced_dimensions={}) + + combined = sce1.relaxed_combine_columns(sce2) + pca = combined.get_reduced_dimension("PCA") + + assert isinstance(pca, np.ma.MaskedArray) + assert np.all(pca.mask[ncols_loc:, :]) diff --git a/tests/test_sce_combine_rows.py b/tests/test_sce_combine_rows.py index 6053c60..1d2625c 100644 --- a/tests/test_sce_combine_rows.py +++ b/tests/test_sce_combine_rows.py @@ -1,14 +1,6 @@ -from random import random - -import anndata -import genomicranges -from biocframe import BiocFrame import biocutils import numpy as np -import pandas as pd -from mudata import MuData -import singlecellexperiment from singlecellexperiment.SingleCellExperiment import SingleCellExperiment import pytest @@ -31,6 +23,7 @@ def test_combine_rows(experiments): assert len(combined2.alternative_experiments) == 0 assert len(combined2.row_data["A"]) == 200 + def test_relaxed_combine_rows(experiments): ncols = 10 nrows = 100 @@ -59,9 +52,7 @@ def test_relaxed_combine_rows(experiments): def test_combine_with_alts(experiments): - combined = biocutils.combine_rows( - experiments.se_with_alts1, experiments.se_with_alts2 - ) + combined = biocutils.combine_rows(experiments.se_with_alts1, experiments.se_with_alts2) assert combined is not None assert isinstance(combined, SingleCellExperiment) assert len(combined.alternative_experiments) == 1 diff --git a/tests/test_sce_io.py b/tests/test_sce_io.py index 424b0cd..b42efbb 100644 --- a/tests/test_sce_io.py +++ b/tests/test_sce_io.py @@ -2,16 +2,16 @@ import anndata import genomicranges -from biocframe import BiocFrame import numpy as np import pandas as pd +from biocframe import BiocFrame +from hdf5array import Hdf5CompressedSparseMatrix from mudata import MuData from scipy import sparse +from summarizedexperiment import SummarizedExperiment import singlecellexperiment from singlecellexperiment import SingleCellExperiment -from summarizedexperiment import SummarizedExperiment -from hdf5array import Hdf5CompressedSparseMatrix __author__ = "jkanche, keviny2" __copyright__ = "jkanche, keviny2" @@ -54,9 +54,7 @@ def test_SCE_to_anndata(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) assert tse is not None assert isinstance(tse, SingleCellExperiment) @@ -69,10 +67,9 @@ def test_SCE_to_anndata(): assert adata[1] is None + def test_SCE_to_anndata_with_alts(): - se = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + se = SummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) tse = SingleCellExperiment( assays={"counts": counts}, @@ -166,9 +163,7 @@ def test_SCE_from10xH5(): assert sliced.shape == (10, 4) - tse = singlecellexperiment.read_tenx_h5( - "tests/data/tenx.sub.h5", realize_assays=True - ) + tse = singlecellexperiment.read_tenx_h5("tests/data/tenx.sub.h5", realize_assays=True) assert isinstance(tse.assay(0), sparse.spmatrix) @@ -182,8 +177,8 @@ def test_SCE_randomAnnData(): y = np.dot(z, w.T) adata = anndata.AnnData(y) - adata.obs_names = [f"obs_{i+1}" for i in range(n)] - adata.var_names = [f"var_{j+1}" for j in range(d)] + adata.obs_names = [f"obs_{i + 1}" for i in range(n)] + adata.var_names = [f"var_{j + 1}" for j in range(d)] tse = singlecellexperiment.SingleCellExperiment.from_anndata(adata) @@ -192,7 +187,7 @@ def test_SCE_randomAnnData(): # to avoid unknown mapping types; # ran into an issue with anndata.compat._overloaded_dict.OverloadedDict when loading a h5ad - adata.uns = {".internal": [f"obs_{i+1}" for i in range(n)]} + adata.uns = {".internal": [f"obs_{i + 1}" for i in range(n)]} tse = singlecellexperiment.SingleCellExperiment.from_anndata(adata) assert tse is not None @@ -209,10 +204,9 @@ def test_SCE_randomAnnData(): assert isinstance(tse.alternative_experiments["raw"], SummarizedExperiment) assert tse.alternative_experiments["raw"].shape == (d, n) + def test_SCE_to_mudata(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) assert tse is not None assert isinstance(tse, SingleCellExperiment) @@ -220,3 +214,72 @@ def test_SCE_to_mudata(): result = tse.to_mudata() assert result is not None assert isinstance(result, MuData) + + +def test_tenx_io_edge_cases(): + import os + import tempfile + + import h5py + import pytest + + from singlecellexperiment.io.tenx import read_tenx_h5, read_tenx_mtx + + with tempfile.TemporaryDirectory() as tmpdir: + with open(os.path.join(tmpdir, "matrix.mtx"), "w") as f: + f.write("%%MatrixMarket matrix coordinate real general\n% Metadata\n10 4 3\n1 1 1.0\n2 2 2.0\n3 3 3.0\n") + + with open(os.path.join(tmpdir, "barcodes.tsv"), "w") as f: + f.write("BC1\nBC2\nBC3\nBC4\n") + + with open(os.path.join(tmpdir, "genes.tsv"), "w") as f: + f.write( + "G1\tGene1\nG2\tGene2\nG3\tGene3\nG4\tGene4\nG5\tGene5\nG6\tGene6\nG7\tGene7\nG8\tGene8\nG9\tGene9\nG10\tGene10\n" + ) + + sce = read_tenx_mtx(tmpdir) + assert sce.shape == (10, 4) + assert "gene_symbols" in sce.row_data.column_names + + with open(os.path.join(tmpdir, "features.tsv"), "w") as f: + f.write( + "F1\tFeat1\nF2\tFeat2\nF3\tFeat3\nF4\tFeat4\nF5\tFeat5\nF6\tFeat6\nF7\tFeat7\nF8\tFeat8\nF9\tFeat9\nF10\tFeat10\n" + ) + + with pytest.warns(UserWarning, match="Both 'features.tsv' and 'genes.tsv' files are present"): + sce = read_tenx_mtx(tmpdir) + + assert sce.shape == (10, 4) + + with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as tmpfile: + tmpname = tmpfile.name + try: + with h5py.File(tmpname, "w") as f: + f.create_group("wrong_key") + + with pytest.raises(ValueError, match="is not a 10X V3 format"): + read_tenx_h5(tmpname) + + with h5py.File(tmpname, "w") as f: + mat_grp = f.create_group("matrix") + mat_grp.create_dataset("shape", data=[10, 4]) + mat_grp.create_dataset("data", data=[1.0, 2.0]) + mat_grp.create_dataset("indices", data=[0, 1]) + mat_grp.create_dataset("indptr", data=[0, 1, 2, 2, 2]) + + feat_grp = mat_grp.create_group("features") + feat_grp.create_dataset("id", data=[b"F1", b"F2", b"F3", b"F4", b"F5", b"F6", b"F7", b"F8", b"F9", b"F10"]) + feat_grp.create_dataset("mismatched", data=[b"M1", b"M2", b"M3", b"M4", b"M5"]) + + mat_grp.create_dataset("barcodes", data=[b"BC1", b"BC2", b"BC3", b"BC4"]) + + with pytest.warns(UserWarning, match="These columns from h5 are ignored - mismatched"): + sce = read_tenx_h5(tmpname) + + assert sce.shape == (10, 4) + assert "id" in sce.row_data.column_names + assert "mismatched" not in sce.row_data.column_names + + finally: + if os.path.exists(tmpname): + os.remove(tmpname) diff --git a/tests/test_sce_methods.py b/tests/test_sce_methods.py index 2581742..7318dc6 100644 --- a/tests/test_sce_methods.py +++ b/tests/test_sce_methods.py @@ -1,11 +1,11 @@ from random import random import genomicranges -from biocframe import BiocFrame import numpy as np import pandas as pd import pytest -from summarizedexperiment import SummarizedExperiment, RangedSummarizedExperiment +from biocframe import BiocFrame +from summarizedexperiment import RangedSummarizedExperiment, SummarizedExperiment from singlecellexperiment import SingleCellExperiment from singlecellexperiment.SingleCellExperiment import SingleCellExperiment as sce @@ -51,17 +51,13 @@ def test_SCE_props(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) assert tse is not None assert isinstance(tse, sce) assert tse.alternative_experiments == {} - alt = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + alt = SummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) tse.alternative_experiments = {"alt": alt} assert tse.alternative_experiments is not None @@ -70,12 +66,15 @@ def test_SCE_props(): assert tse.col_data is not None assert tse.column_pairs == {} - tse.column_pairs = {"random": col_data} + tse.column_pairs = {"random": np.random.rand(ncols, ncols)} assert tse.column_pairs is not None with pytest.raises(Exception): tse.row_pairs = counts + with pytest.raises(Exception): + tse.row_pairs = {"random": np.random.rand(nrows, 4)} + assert tse.row_pairs == {} assert tse.main_experiment_name is None @@ -91,10 +90,9 @@ def test_SCE_props(): assert tse.reduced_dim_names is not None assert len(tse.reduced_dim_names) == 1 + def test_SCE_to_RSE(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr) rse = tse.to_rangedsummarizedexperiment() assert isinstance(rse, RangedSummarizedExperiment) @@ -102,20 +100,18 @@ def test_SCE_to_RSE(): assert rse.shape == tse.shape assert rse.row_ranges is not None + def test_RSE_to_SCE(): - rse = RangedSummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr - ) + rse = RangedSummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr) tse = SingleCellExperiment.from_rangedsummarizedexperiment(rse) assert isinstance(tse, SingleCellExperiment) assert tse.shape == rse.shape assert tse.row_ranges is not None + def test_SCE_to_SE(): - tse = SingleCellExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr - ) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data, row_ranges=gr) se = tse.to_summarizedexperiment() assert isinstance(se, SummarizedExperiment) @@ -124,11 +120,303 @@ def test_SCE_to_SE(): assert se.row_data is not None assert "seqnames" in se.row_data.column_names + def test_SE_to_SCE(): - se = SummarizedExperiment( - assays={"counts": counts}, row_data=row_data, column_data=col_data - ) + se = SummarizedExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) tse = SingleCellExperiment.from_summarizedexperiment(se) assert isinstance(tse, SingleCellExperiment) assert tse.shape == se.shape + + +def test_size_factors(): + sf = np.random.rand(ncols) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data, size_factors=sf) + + assert np.allclose(tse.size_factors, sf) + assert np.allclose(tse.get_size_factors(), sf) + assert "sizeFactors" in tse.column_data.column_names + assert np.allclose(np.array(tse.column_data.column("sizeFactors")), sf) + + sf2 = np.random.rand(ncols) + tse2 = tse.set_size_factors(sf2, in_place=False) + assert np.allclose(tse2.size_factors, sf2) + assert "sizeFactors" in tse2.column_data.column_names + assert np.allclose(np.array(tse2.column_data.column("sizeFactors")), sf2) + + assert np.allclose(tse.size_factors, sf) # original unchanged + assert np.allclose(np.array(tse.column_data.column("sizeFactors")), sf) + + tse.set_size_factors(sf2, in_place=True) + assert np.allclose(tse.size_factors, sf2) + assert np.allclose(np.array(tse.column_data.column("sizeFactors")), sf2) + + tse_cleared = tse.set_size_factors(None, in_place=False) + assert tse_cleared.size_factors is None + assert "sizeFactors" not in tse_cleared.column_data.column_names + + tse_no_sf = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data) + assert tse_no_sf.size_factors is None + assert tse_no_sf.get_size_factors(on_absence="none") is None + assert "sizeFactors" not in tse_no_sf.column_data.column_names + + with pytest.warns(UserWarning): + tse_no_sf.get_size_factors(on_absence="warn") + + with pytest.raises(ValueError): + tse_no_sf.get_size_factors(on_absence="error") + + with pytest.raises(Exception): + tse.set_size_factors(np.random.rand(ncols - 1)) + with pytest.raises(Exception): + tse.set_size_factors(5) + + +def test_individual_pair_accessors(): + rp = np.random.rand(nrows, nrows) + cp = np.random.rand(ncols, ncols) + tse = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + row_pairs={"rp1": rp}, + column_pairs={"cp1": cp}, + ) + + assert np.allclose(tse.get_row_pair("rp1"), rp) + assert np.allclose(tse.get_row_pair(0), rp) + assert np.allclose(tse.get_column_pair("cp1"), cp) + assert np.allclose(tse.get_column_pair(0), cp) + + rp2 = np.random.rand(nrows, nrows) + tse2 = tse.set_row_pair("rp2", rp2, in_place=False) + assert np.allclose(tse2.get_row_pair("rp2"), rp2) + + cp2 = np.random.rand(ncols, ncols) + tse3 = tse.set_column_pair("cp2", cp2, in_place=False) + assert np.allclose(tse3.get_column_pair("cp2"), cp2) + + with pytest.raises(IndexError): + tse.get_row_pair(-1) + with pytest.raises(IndexError): + tse.get_row_pair(10) + with pytest.raises(AttributeError): + tse.get_row_pair("nonexistent") + with pytest.raises(TypeError): + tse.get_row_pair(3.5) + + +def test_copy_deepcopy(): + sf = np.random.rand(ncols) + tse = SingleCellExperiment(assays={"counts": counts}, row_data=row_data, column_data=col_data, size_factors=sf) + + from copy import copy, deepcopy + + tse_copy = copy(tse) + assert np.allclose(tse_copy.size_factors, sf) + + tse_deepcopy = deepcopy(tse) + assert np.allclose(tse_deepcopy.size_factors, sf) + + +def test_alt_exp_workflows(): + rse = SummarizedExperiment(assays={"counts": np.random.rand(nrows, ncols)}, row_data=row_data, column_data=col_data) + + tse = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + alternative_experiments={"alt": rse}, + size_factors=np.random.rand(ncols), + ) + + swapped = tse.swap_alt_exp("alt", saved="main") + assert isinstance(swapped, SingleCellExperiment) + assert "main" in swapped.alternative_experiments + assert "alt" not in swapped.alternative_experiments + assert swapped.shape == (nrows, ncols) + assert np.allclose(swapped.size_factors, tse.size_factors) + + f = ["groupA"] * (nrows // 2) + ["groupB"] * (nrows // 2) + split_sce = tse.split_alt_exps(f, ref="groupA") + assert "groupB" in split_sce.alternative_experiments + assert split_sce.shape == (nrows // 2, ncols) + assert split_sce.alternative_experiments["groupB"].shape == (nrows // 2, ncols) + + unsplit_sce = split_sce.unsplit_alt_exps(names=["groupB"]) + assert "groupB" not in unsplit_sce.alternative_experiments + assert unsplit_sce.shape == (nrows, ncols) + + +def test_sce_alias_and_deprecated_paths(): + with pytest.warns(DeprecationWarning, match="'reduced_dims' is deprecated"): + sce_dep = SingleCellExperiment(assays={"counts": counts}, reduced_dims={"PCA": np.zeros((ncols, 2))}) + assert "PCA" in sce_dep.reduced_dim_names + + tse = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + reduced_dimensions={"PCA": np.zeros((ncols, 2)), "UMAP": np.zeros((ncols, 3))}, + ) + + assert "PCA" in tse.get_reduced_dims() + + sce2 = tse.set_reduced_dims({"TSNE": np.zeros((ncols, 2))}, in_place=False) + assert "TSNE" in sce2.get_reduced_dims() + assert "PCA" in tse.get_reduced_dims() + + with pytest.warns(UserWarning, match="use 'set_reduced_dimensions' instead"): + tse.reduced_dims = {"TSNE": np.zeros((ncols, 2))} + + assert "TSNE" in tse.reduced_dim_names + + with pytest.warns(UserWarning, match="use 'set_reduced_dimensions' instead"): + tse.reduced_dimensions = {"PCA": np.zeros((ncols, 2))} + + assert "PCA" in tse.reduced_dim_names + + with pytest.warns(UserWarning, match="use 'set_reduced_dimension_names' instead"): + tse.reduced_dim_names = ["PCA_new"] + + assert "PCA_new" in tse.reduced_dim_names + + with pytest.warns(UserWarning, match="use 'set_reduced_dimension_names' instead"): + tse.reduced_dimension_names = ["PCA_brand_new"] + + assert "PCA_brand_new" in tse.reduced_dimension_names + + tse = tse.set_reduced_dim_names(["PCA"], in_place=False) + with pytest.raises(ValueError, match="Length of 'names' does not match"): + tse.set_reduced_dim_names(["A", "B"]) + + sce_renamed = tse.set_reduced_dim_names(["PCA_renamed"], in_place=False) + assert "PCA_renamed" in sce_renamed.reduced_dim_names + + with pytest.raises(IndexError, match="Index cannot be negative"): + tse.get_reduced_dimension(-1) + + with pytest.raises(IndexError, match="Index greater than the number of reduced dimensions"): + tse.get_reduced_dimension(10) + + assert tse.get_reduced_dimension(0).shape == (ncols, 2) + + with pytest.raises(AttributeError, match="does not exist"): + tse.get_reduced_dimension("TSNE") + + assert tse.get_reduced_dimension("PCA").shape == (ncols, 2) + assert tse.reduced_dim("PCA").shape == (ncols, 2) + assert tse.reduced_dimension("PCA").shape == (ncols, 2) + + with pytest.raises(TypeError, match="must be a string or integer"): + tse.get_reduced_dimension([1, 2]) + + with pytest.raises(ValueError, match="Length of 'names' does not match"): + tse.set_alternative_experiment_names(["alt1", "alt2"]) + + se1 = SummarizedExperiment(assays={"counts": counts}) + sce_alt = SingleCellExperiment(assays={"counts": counts}, alternative_experiments={"alt1": se1}) + with pytest.warns(UserWarning, match="use 'set_alternative_experiment_names' instead"): + sce_alt.alternative_experiment_names = ["alt1_new"] + + assert "alt1_new" in sce_alt.alternative_experiment_names + + with pytest.raises(IndexError, match="Index cannot be negative"): + sce_alt.get_alternative_experiment(-1) + + with pytest.raises(IndexError, match="Index greater than the number of alternative experiments"): + sce_alt.get_alternative_experiment(5) + + with pytest.raises(AttributeError, match="does not exist"): + sce_alt.get_alternative_experiment("nonexistent") + + with pytest.raises(TypeError, match="must be a string or integer"): + sce_alt.get_alternative_experiment([1]) + + assert isinstance(sce_alt.alternative_experiment(0), SummarizedExperiment) + + with pytest.warns(UserWarning, match="use 'set_row_pairs' instead"): + tse.row_pairs = {"rp": np.zeros((nrows, nrows))} + + with pytest.warns(UserWarning, match="use 'set_column_pairs' instead"): + tse.column_pairs = {"cp": np.zeros((ncols, ncols))} + + with pytest.warns(UserWarning, match="use 'set_row_pair_names' instead"): + tse.row_pair_names = ["rp_new"] + + with pytest.warns(UserWarning, match="use 'set_column_pair_names' instead"): + tse.column_pair_names = ["cp_new"] + + with pytest.raises(IndexError, match="Index cannot be negative"): + tse.get_row_pair(-1) + + with pytest.raises(IndexError, match="Index greater than the number of row pairs"): + tse.get_row_pair(5) + + with pytest.raises(AttributeError, match="does not exist"): + tse.get_row_pair("nonexistent") + + with pytest.raises(TypeError, match="must be a string or integer"): + tse.get_row_pair([1]) + + with pytest.raises(IndexError, match="Index cannot be negative"): + tse.get_column_pair(-1) + + with pytest.raises(IndexError, match="Index greater than the number of column pairs"): + tse.get_column_pair(5) + + with pytest.raises(AttributeError, match="does not exist"): + tse.get_column_pair("nonexistent") + + with pytest.raises(TypeError, match="must be a string or integer"): + tse.get_column_pair([1]) + + +def test_sce_workflow_corner_cases(): + se = SummarizedExperiment(assays={"counts": counts}, column_data=col_data) + tse = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + alternative_experiments={"alt": se}, + reduced_dimensions={"PCA": np.zeros((ncols, 2))}, + ) + + swapped = tse.swap_alt_exp("alt", with_col_data=False) + assert swapped.column_data.shape == se.column_data.shape + assert len(swapped.reduced_dim_names) == 0 + + with pytest.raises(ValueError, match="Column 'nonexistent' not found in row_data"): + tse.split_alt_exps("nonexistent") + with pytest.raises(ValueError, match="Length of 'f' must match the number of rows"): + tse.split_alt_exps([1, 2]) + with pytest.raises(ValueError, match="Reference group 'wrong' not found in groups"): + tse.split_alt_exps(["G1", "G2"] * int(nrows / 2), ref="wrong") + + sce_split = tse.copy() + sce_split.split_alt_exps(["G1", "G2"] * int(nrows / 2), ref="G1", in_place=True) + assert "G2" in sce_split.alternative_experiment_names + + assert isinstance(tse.unsplit_alt_exps(names=[]), SingleCellExperiment) + sce_unsplit = tse.copy() + assert sce_unsplit.unsplit_alt_exps(names=[], in_place=True) is sce_unsplit + + with pytest.raises(ValueError, match="not found"): + tse.unsplit_alt_exps(names=["wrong"]) + + se_alt = SummarizedExperiment(assays={"counts": np.random.rand(5, ncols)}, column_data=col_data) + rse_alt = RangedSummarizedExperiment(assays={"counts": np.random.rand(5, ncols)}, column_data=col_data) + + sce_multi = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + alternative_experiments={"se_alt": se_alt, "rse_alt": rse_alt}, + ) + + unsplit = sce_multi.unsplit_alt_exps() + assert unsplit.shape[0] == nrows + 5 + 5 + + sce_multi_ip = sce_multi.copy() + sce_multi_ip.unsplit_alt_exps(in_place=True) + assert sce_multi_ip.shape[0] == nrows + 5 + 5 diff --git a/tests/test_sce_slice.py b/tests/test_sce_slice.py index 76f41ef..d2536a0 100644 --- a/tests/test_sce_slice.py +++ b/tests/test_sce_slice.py @@ -1,9 +1,9 @@ from random import random import genomicranges -from biocframe import BiocFrame import numpy as np import pandas as pd +from biocframe import BiocFrame from summarizedexperiment import SummarizedExperiment from singlecellexperiment import SingleCellExperiment @@ -66,6 +66,7 @@ def test_SCE_slice(): assert tse_slice.assay("counts").shape == (10, 3) + def test_SCE_slice_with_numpy(): tse = SingleCellExperiment( assays={"counts": counts}, @@ -92,6 +93,7 @@ def test_SCE_slice_with_numpy(): assert tse_slice.assay("counts").shape == (10, 3) + def test_SCE_creation_with_alts_slice(): trse = SummarizedExperiment( assays={"counts": counts.copy()}, @@ -117,3 +119,30 @@ def test_SCE_creation_with_alts_slice(): assert tsce_slice.assay("counts").shape == (10, 3) alt_exp = tsce_slice.alternative_experiments["alt"] assert alt_exp.shape == (200, 3) + + +def test_SCE_slice_pairs_and_size_factors(): + rp = np.random.rand(nrows, nrows) + cp = np.random.rand(ncols, ncols) + sf = np.random.rand(ncols) + + tse = SingleCellExperiment( + assays={"counts": counts}, + row_data=row_data, + column_data=col_data, + row_pairs={"rp1": rp}, + column_pairs={"cp1": cp}, + size_factors=sf, + ) + + tse_slice = tse[0:10, 0:3] + + assert tse_slice.row_pairs["rp1"].shape == (10, 10) + assert tse_slice.column_pairs["cp1"].shape == (3, 3) + assert tse_slice.size_factors.shape == (3,) + assert "sizeFactors" in tse_slice.column_data.column_names + assert np.allclose(np.array(tse_slice.column_data.column("sizeFactors")), sf[0:3]) + + assert np.allclose(tse_slice.row_pairs["rp1"], rp[0:10, :][:, 0:10]) + assert np.allclose(tse_slice.column_pairs["cp1"], cp[0:3, :][:, 0:3]) + assert np.allclose(tse_slice.size_factors, sf[0:3])