Source code for squidpy.read._read

from __future__ import annotations

import json
import os
import re
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from anndata import AnnData
from scipy.sparse import csr_matrix
from spatialdata._logging import logger as logg

from squidpy._constants._pkg_constants import Key
from squidpy.read._utils import PathLike, _load_image, _read_counts

# Reader functions that make up the public API of this module.
__all__ = ["visium", "vizgen", "nanostring"]


def visium(
    path: PathLike,
    *,
    counts_file: str = "filtered_feature_bc_matrix.h5",
    library_id: str | None = None,
    load_images: bool = True,
    source_image_path: PathLike | None = None,
    **kwargs: Any,
) -> AnnData:
    """
    Read *10x Genomics* Visium formatted dataset.

    In addition to reading the regular *Visium* output, it looks for the *spatial* directory and loads the images,
    spatial coordinates and scale factors.

    .. seealso::

        - `Space Ranger output <https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Visium* files.
    counts_file
        Which file in the passed directory to use as the count file. Typically either *filtered_feature_bc_matrix.h5* or
        *raw_feature_bc_matrix.h5*.
    library_id
        Identifier for the *Visium* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.
    load_images
        Whether to read the content of the *spatial* directory. If `False`, the object is returned right after
        the counts have been read, i.e. without images, scale factors and spatial coordinates.
    source_image_path
        Path to the high-resolution tissue image. Its absolute path is stored under
        ``['spatial']['{library_id}']['metadata']['source_image_path']``; a warning is logged if the path does
        not exist. Ignored if ``load_images = False``.
    kwargs
        Keyword arguments for :func:`scanpy.read_10x_h5`, :func:`scanpy.read_10x_mtx` or :func:`anndata.io.read_text`.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['images']`` - *hires* and *lowres* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']`` - scale factors for the spots.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['metadata']`` - various metadata.
    """  # noqa: E501
    path = Path(path)
    adata, library_id = _read_counts(path, counts_file=counts_file, library_id=library_id, **kwargs)

    if not load_images:
        return adata

    # Every auxiliary file lives in the same sub-directory, so resolve it once.
    spatial_dir = path / f"{Key.uns.spatial}"

    adata.uns[Key.uns.spatial][library_id][Key.uns.image_key] = {
        res: _load_image(spatial_dir / f"tissue_{res}_image.png") for res in ["hires", "lowres"]
    }
    adata.uns[Key.uns.spatial][library_id]["scalefactors"] = json.loads(
        (spatial_dir / "scalefactors_json.json").read_bytes()
    )

    # The positions file exists under two names and with or without a header row,
    # so both properties are detected instead of being assumed.
    tissue_positions_file = spatial_dir / "tissue_positions.csv"
    if not tissue_positions_file.exists():
        tissue_positions_file = spatial_dir / "tissue_positions_list.csv"

    # A header is present if the first cell reads 'barcode' rather than an actual barcode value.
    # 'utf-8-sig' decodes plain UTF-8 and also strips a leading byte-order mark, which would
    # otherwise make the comparison below fail and turn the header row into a data row.
    with open(tissue_positions_file, encoding="utf-8-sig") as f:
        first_cell = f.readline().split(",")[0].strip()
    has_header = first_cell.lower() == "barcode"

    coords = pd.read_csv(
        tissue_positions_file,
        header=0 if has_header else None,
        index_col=0,
    )
    # Normalize the column names regardless of what (if anything) the header contained.
    coords.columns = ["in_tissue", "array_row", "array_col", "pxl_col_in_fullres", "pxl_row_in_fullres"]
    # https://github.com/scverse/squidpy/issues/657
    coords.set_index(coords.index.astype(adata.obs.index.dtype), inplace=True)

    adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
    adata.obsm[Key.obsm.spatial] = adata.obs[["pxl_row_in_fullres", "pxl_col_in_fullres"]].values
    adata.obs.drop(columns=["pxl_row_in_fullres", "pxl_col_in_fullres"], inplace=True)

    if source_image_path is not None:
        source_image_path = Path(source_image_path).absolute()
        if not source_image_path.exists():
            logg.warning(f"Path to the high-resolution tissue image `{source_image_path}` does not exist")
        # The path is recorded even when it is missing; the warning above is the only signal.
        adata.uns[Key.uns.spatial][library_id]["metadata"]["source_image_path"] = str(source_image_path)

    return adata
def vizgen(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    transformation_file: str | None = None,
    library_id: str = "library",
    **kwargs: Any,
) -> AnnData:
    """
    Read *Vizgen* formatted dataset.

    Besides the regular *Vizgen* counts, the cell-level metadata file is loaded and, if requested,
    the transformation matrix as well.

    .. seealso::

        - `Vizgen data release program <https://vizgen.com/data-release-program/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Vizgen* files.
    counts_file
        File containing the counts. Typically ends with *_cell_by_gene.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
    transformation_file
        Transformation matrix file for converting micron coordinates into pixels in images.
        It is looked up inside the *images* sub-directory of ``path``.
    library_id
        Identifier for the *Vizgen* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.
    kwargs
        Additional keyword arguments forwarded to the counts reader, next to ``delimiter=','`` and
        ``first_column_names=True``.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates in microns.
        - :attr:`anndata.AnnData.obsm` ``['blank_genes']`` - blank genes from Vizgen platform.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']['transformation_matrix']`` -
          transformation matrix for converting micron coordinates to pixels.
          Only present if ``transformation_file != None``.
    """
    root = Path(path)
    adata, library_id = _read_counts(
        path=root,
        counts_file=counts_file,
        library_id=library_id,
        delimiter=",",
        first_column_names=True,
        **kwargs,
    )

    # Columns whose name contains "Blank" are moved out of X into their own table.
    is_blank = np.array(["Blank" in gene for gene in adata.var_names])
    blank_counts = adata[:, is_blank].X.copy()
    blank_names = adata.var_names[is_blank]
    adata.obsm["blank_genes"] = pd.DataFrame(blank_counts, columns=blank_names, index=adata.obs_names)

    adata = adata[:, ~is_blank].copy()
    adata.X = csr_matrix(adata.X)

    meta = pd.read_csv(root / meta_file, header=0, index_col=0)
    # https://github.com/scverse/squidpy/issues/657
    meta.index = meta.index.astype("str")

    adata.obs = adata.obs.merge(meta, how="left", left_index=True, right_index=True)
    center_columns = ["center_x", "center_y"]
    adata.obsm[Key.obsm.spatial] = adata.obs[center_columns].values
    adata.obs.drop(columns=center_columns, inplace=True)

    if transformation_file is None:
        return adata

    transform = pd.read_csv(root / f"images/{transformation_file}", sep=" ", header=None)
    # https://github.com/scverse/squidpy/issues/727
    transform.columns = transform.columns.astype(str)
    adata.uns[Key.uns.spatial][library_id]["scalefactors"] = {"transformation_matrix": transform}

    return adata
def nanostring(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    fov_file: str | None = None,
) -> AnnData:
    """
    Read *Nanostring* formatted dataset.

    In addition to reading the regular *Nanostring* output, it loads the metadata file, the images from the
    *CellComposite* and *CellLabels* directories (if present) and optionally the field of view file.

    Images are picked up by file extension (*.jpg*, *.png*, *.jpeg*, *.tif*, *.tiff*) and assigned to a field
    of view based on the ``_F<number>`` part of their file name. Files without such a part, or whose field of
    view is not present in the data, are skipped with a warning.

    .. seealso::

        - `Nanostring Spatial Molecular Imager <https://nanostring.com/products/cosmx-spatial-molecular-imager/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Nanostring* files.
    counts_file
        File containing the counts. Typically ends with *_exprMat_file.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
        Typically ends with *_metadata_file.csv*.
    fov_file
        File containing the coordinates of all the fields of view.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - local coordinates of the centers of cells.
        - :attr:`anndata.AnnData.obsm` ``['spatial_fov']`` - global coordinates of the centers of cells in the
          field of view.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['images']`` - *hires* and *segmentation* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['metadata']['{x,y}_global_px']`` - coordinates of the field of view.
          Only present if ``fov_file != None``.
    """  # noqa: E501
    path, fov_key = Path(path), "fov"
    cell_id_key = "cell_ID"

    # Cell ids are combined with the field of view into "<cell_ID>_<fov>" so that
    # counts and metadata can be aligned on a single index.
    counts = pd.read_csv(path / counts_file, header=0, index_col=cell_id_key)
    counts.index = counts.index.astype(str).str.cat(counts.pop(fov_key).astype(str).values, sep="_")

    obs = pd.read_csv(path / meta_file, header=0, index_col=cell_id_key)
    obs[fov_key] = pd.Categorical(obs[fov_key].astype(str))
    obs[cell_id_key] = obs.index.astype(np.int64)
    obs.rename_axis(None, inplace=True)
    obs.index = obs.index.astype(str).str.cat(obs[fov_key].values, sep="_")

    # Only cells present in both files are kept.
    common_index = obs.index.intersection(counts.index)

    adata = AnnData(
        csr_matrix(counts.loc[common_index, :].values),
        obs=obs.loc[common_index, :],
        uns={Key.uns.spatial: {}},
    )
    adata.var_names = counts.columns

    adata.obsm[Key.obsm.spatial] = adata.obs[["CenterX_local_px", "CenterY_local_px"]].values
    adata.obsm["spatial_fov"] = adata.obs[["CenterX_global_px", "CenterY_global_px"]].values
    adata.obs.drop(columns=["CenterX_local_px", "CenterY_local_px"], inplace=True)

    for fov in adata.obs[fov_key].cat.categories:
        adata.uns[Key.uns.spatial][fov] = {
            "images": {},
            "scalefactors": {"tissue_hires_scalef": 1, "spot_diameter_fullres": 1},
        }

    file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff")
    pat = re.compile(r".*_F(\d+)")
    for subdir in ["CellComposite", "CellLabels"]:
        image_dir = path / subdir
        if not os.path.isdir(image_dir):
            continue
        kind = "hires" if subdir == "CellComposite" else "segmentation"
        for fname in os.listdir(image_dir):
            if not fname.endswith(file_extensions):
                continue
            fov_match = pat.search(fname)
            if fov_match is None:
                # An image without a `_F<number>` part cannot be assigned to any field of view.
                logg.warning(f"Unable to determine the FOV of `{fname}` in {subdir} folder, skipping it.")
                continue
            # `int` drops the zero padding, e.g. "001" -> "1", to match the keys created above.
            fov = str(int(fov_match.group(1)))
            # Look the FOV up before loading, so that images of unknown FOVs are never read and
            # a `KeyError` raised by the loader itself is not mistaken for a missing FOV.
            try:
                images = adata.uns[Key.uns.spatial][fov]["images"]
            except KeyError:
                logg.warning(f"FOV `{fov}` does not exist in {subdir} folder, skipping it.")
                continue
            images[kind] = _load_image(image_dir / fname)

    if fov_file is not None:
        fov_positions = pd.read_csv(path / fov_file, header=0, index_col=fov_key)
        for fov, row in fov_positions.iterrows():
            try:
                fov_entry = adata.uns[Key.uns.spatial][str(fov)]
            except KeyError:
                logg.warning(f"FOV `{str(fov)}` does not exist, skipping it.")
                continue
            fov_entry["metadata"] = row.to_dict()

    return adata