Source code for squidpy.read._read

from __future__ import annotations

import json
import os
import re
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from anndata import AnnData
from scipy.sparse import csr_matrix
from spatialdata._logging import logger as logg

from squidpy._constants._pkg_constants import Key
from squidpy.read._utils import PathLike, _load_image, _read_counts

__all__ = ["visium", "vizgen", "nanostring"]



[docs]
def visium(
    path: PathLike,
    *,
    counts_file: str = "filtered_feature_bc_matrix.h5",
    library_id: str | None = None,
    load_images: bool = True,
    source_image_path: PathLike | None = None,
    **kwargs: Any,
) -> AnnData:
    """
    Read *10x Genomics* Visium formatted dataset.

    In addition to reading the regular *Visium* output, it looks for the *spatial* directory and loads the images,
    spatial coordinates and scale factors.

    .. seealso::

        - `Space Ranger output <https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Visium* files.
    counts_file
        Which file in the passed directory to use as the count file. Typically either *filtered_feature_bc_matrix.h5* or
        *raw_feature_bc_matrix.h5*.
    library_id
        Identifier for the *Visium* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.
    kwargs
        Keyword arguments for :func:`scanpy.read_10x_h5`, :func:`scanpy.read_10x_mtx` or :func:`anndata.io.read_text`.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['images']`` - *hires* and *lowres* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']`` - scale factors for the spots.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['metadata']`` - various metadata.
    """  # noqa: E501
    path = Path(path)
    adata, library_id = _read_counts(path, counts_file=counts_file, library_id=library_id, **kwargs)

    if not load_images:
        return adata

    adata.uns[Key.uns.spatial][library_id][Key.uns.image_key] = {
        res: _load_image(path / f"{Key.uns.spatial}/tissue_{res}_image.png") for res in ["hires", "lowres"]
    }
    adata.uns[Key.uns.spatial][library_id]["scalefactors"] = json.loads(
        (path / f"{Key.uns.spatial}/scalefactors_json.json").read_bytes()
    )

    # Space Ranger versions use different file formats:
    #   - v1: tissue_positions.csv (no header)
    #   - v2: tissue_positions_list.csv (with header)
    #   - v3: tissue_positions.csv (with header)
    tissue_positions_file = (
        path / "spatial/tissue_positions.csv"
        if (path / "spatial/tissue_positions.csv").exists()
        else path / "spatial/tissue_positions_list.csv"
    )

    # Detect header by checking if first cell is 'barcode' (header) or a barcode value
    with open(tissue_positions_file) as f:
        first_cell = f.readline().split(",")[0].strip()
    has_header = first_cell.lower() == "barcode"

    coords = pd.read_csv(
        tissue_positions_file,
        header=0 if has_header else None,
        index_col=0,
    )
    coords.columns = ["in_tissue", "array_row", "array_col", "pxl_col_in_fullres", "pxl_row_in_fullres"]
    # https://github.com/scverse/squidpy/issues/657
    coords.set_index(coords.index.astype(adata.obs.index.dtype), inplace=True)

    adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
    adata.obsm[Key.obsm.spatial] = adata.obs[["pxl_row_in_fullres", "pxl_col_in_fullres"]].values
    adata.obs.drop(columns=["pxl_row_in_fullres", "pxl_col_in_fullres"], inplace=True)

    if source_image_path is not None:
        source_image_path = Path(source_image_path).absolute()
        if not source_image_path.exists():
            logg.warning(f"Path to the high-resolution tissue image `{source_image_path}` does not exist")
        adata.uns["spatial"][library_id]["metadata"]["source_image_path"] = str(source_image_path)

    return adata




[docs]
def vizgen(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    transformation_file: str | None = None,
    library_id: str = "library",
    **kwargs: Any,
) -> AnnData:
    """
    Read *Vizgen* formatted dataset.

    In addition to reading the regular *Vizgen* output, it loads the metadata file and optionally loads
    the transformation matrix.

    .. seealso::

        - `Vizgen data release program <https://vizgen.com/data-release-program/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Vizgen* files.
    counts_file
        File containing the counts. Typically ends with *_cell_by_gene.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
    transformation_file
        Transformation matrix file for converting micron coordinates into pixels in images.
    library_id
        Identifier for the *Vizgen* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates in microns.
        - :attr:`anndata.AnnData.obsm` ``['blank_genes']`` - blank genes from Vizgen platform.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']['transformation_matrix']`` -
          transformation matrix for converting micron coordinates to pixels.
          Only present if ``transformation_file != None``.
    """
    path = Path(path)
    adata, library_id = _read_counts(
        path=path, counts_file=counts_file, library_id=library_id, delimiter=",", first_column_names=True, **kwargs
    )
    blank_genes = np.array(["Blank" in v for v in adata.var_names])
    adata.obsm["blank_genes"] = pd.DataFrame(
        adata[:, blank_genes].X.copy(), columns=adata.var_names[blank_genes], index=adata.obs_names
    )
    adata = adata[:, ~blank_genes].copy()

    adata.X = csr_matrix(adata.X)

    coords = pd.read_csv(path / meta_file, header=0, index_col=0)
    # https://github.com/scverse/squidpy/issues/657
    coords.set_index(coords.index.astype("str"), inplace=True)

    adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
    adata.obsm[Key.obsm.spatial] = adata.obs[["center_x", "center_y"]].values
    adata.obs.drop(columns=["center_x", "center_y"], inplace=True)

    if transformation_file is not None:
        matrix = pd.read_csv(path / f"images/{transformation_file}", sep=" ", header=None)
        # https://github.com/scverse/squidpy/issues/727
        matrix.columns = matrix.columns.astype(str)
        adata.uns[Key.uns.spatial][library_id]["scalefactors"] = {"transformation_matrix": matrix}

    return adata




[docs]
def nanostring(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    fov_file: str | None = None,
) -> AnnData:
    """
    Read *Nanostring* formatted dataset.

    In addition to reading the regular *Nanostring* output, it loads the metadata file, if present *CellComposite* and *CellLabels*
    directories containing the images and optionally the field of view file.

    .. seealso::

        - `Nanostring Spatial Molecular Imager <https://nanostring.com/products/cosmx-spatial-molecular-imager/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Nanostring* files.
    counts_file
        File containing the counts. Typically ends with *_exprMat_file.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
        Typically ends with *_metadata_file.csv*.
    fov_file
        File containing the coordinates of all the fields of view.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` -  local coordinates of the centers of cells.
        - :attr:`anndata.AnnData.obsm` ``['spatial_fov']`` - global coordinates of the centers of cells in the
          field of view.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['images']`` - *hires* and *segmentation* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['metadata']]['{x,y}_global_px']`` - coordinates of the field of view.
          Only present if ``fov_file != None``.
    """  # noqa: E501
    path, fov_key = Path(path), "fov"
    cell_id_key = "cell_ID"
    counts = pd.read_csv(path / counts_file, header=0, index_col=cell_id_key)
    counts.index = counts.index.astype(str).str.cat(counts.pop(fov_key).astype(str).values, sep="_")

    obs = pd.read_csv(path / meta_file, header=0, index_col=cell_id_key)
    obs[fov_key] = pd.Categorical(obs[fov_key].astype(str))
    obs[cell_id_key] = obs.index.astype(np.int64)
    obs.rename_axis(None, inplace=True)
    obs.index = obs.index.astype(str).str.cat(obs[fov_key].values, sep="_")

    common_index = obs.index.intersection(counts.index)

    adata = AnnData(
        csr_matrix(counts.loc[common_index, :].values),
        obs=obs.loc[common_index, :],
        uns={Key.uns.spatial: {}},
    )
    adata.var_names = counts.columns

    adata.obsm[Key.obsm.spatial] = adata.obs[["CenterX_local_px", "CenterY_local_px"]].values
    adata.obsm["spatial_fov"] = adata.obs[["CenterX_global_px", "CenterY_global_px"]].values
    adata.obs.drop(columns=["CenterX_local_px", "CenterY_local_px"], inplace=True)

    for fov in adata.obs[fov_key].cat.categories:
        adata.uns[Key.uns.spatial][fov] = {
            "images": {},
            "scalefactors": {"tissue_hires_scalef": 1, "spot_diameter_fullres": 1},
        }

    file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff")

    pat = re.compile(r".*_F(\d+)")
    for subdir in ["CellComposite", "CellLabels"]:
        if os.path.exists(path / subdir) and os.path.isdir(path / subdir):
            kind = "hires" if subdir == "CellComposite" else "segmentation"
            for fname in os.listdir(path / subdir):
                if fname.endswith(file_extensions):
                    fov = str(int(pat.findall(fname)[0]))
                    try:
                        adata.uns[Key.uns.spatial][fov]["images"][kind] = _load_image(path / subdir / fname)
                    except KeyError:
                        logg.warning(f"FOV `{str(fov)}` does not exist in {subdir} folder, skipping it.")
                        continue

    if fov_file is not None:
        fov_positions = pd.read_csv(path / fov_file, header=0, index_col=fov_key)
        for fov, row in fov_positions.iterrows():
            try:
                adata.uns[Key.uns.spatial][str(fov)]["metadata"] = row.to_dict()
            except KeyError:
                logg.warning(f"FOV `{str(fov)}` does not exist, skipping it.")
                continue

    return adata