Source code for squidpy.read._read

from __future__ import annotations

from typing import Union  # noqa: F401
from typing import Any
from pathlib import Path
import os
import re
import json

from scanpy import logging as logg
from anndata import AnnData

from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

from squidpy.read._utils import _load_image, _read_counts
from squidpy.datasets._utils import PathLike
from squidpy._constants._pkg_constants import Key

__all__ = ["visium", "vizgen", "nanostring"]


def visium(
    path: PathLike,
    *,
    counts_file: str = "filtered_feature_bc_matrix.h5",
    library_id: str | None = None,
    load_images: bool = True,
    source_image_path: PathLike | None = None,
    **kwargs: Any,
) -> AnnData:
    """
    Read *10x Genomics* Visium formatted dataset.

    In addition to reading the regular *Visium* output, it looks for the *spatial* directory and loads the images,
    spatial coordinates and scale factors.

    .. seealso::

        - `Space Ranger output <https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/overview>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Visium* files.
    counts_file
        Which file in the passed directory to use as the count file. Typically either *filtered_feature_bc_matrix.h5* or
        *raw_feature_bc_matrix.h5*.
    library_id
        Identifier for the *Visium* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.
    load_images
        Whether to read the contents of the *spatial* directory (images, scale factors and spot coordinates).
        If `False`, the object is returned right after the counts have been read and none of the keys listed
        below are added by this function.
    source_image_path
        Path to the high-resolution tissue image. It is made absolute and stored as a string in the library
        metadata; a warning is logged if the file does not exist. Ignored if ``load_images = False``.
    kwargs
        Keyword arguments for :func:`scanpy.read_10x_h5`, :func:`anndata.read_mtx` or :func:`read_text`.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates.
        - :attr:`anndata.AnnData.obs` ``['in_tissue', 'array_row', 'array_col']`` - per-spot columns taken from the
          tissue positions file.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['images']`` - *hires* and *lowres* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']`` - scale factors for the spots.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['metadata']`` - various metadata.
    """  # noqa: E501
    path = Path(path)
    adata, library_id = _read_counts(path, count_file=counts_file, library_id=library_id, **kwargs)

    if not load_images:
        return adata

    # All per-library spatial information lives in this one dictionary.
    library_uns = adata.uns[Key.uns.spatial][library_id]

    library_uns[Key.uns.image_key] = {
        res: _load_image(path / f"{Key.uns.spatial}/tissue_{res}_image.png") for res in ["hires", "lowres"]
    }
    library_uns["scalefactors"] = json.loads((path / f"{Key.uns.spatial}/scalefactors_json.json").read_bytes())

    # Prefer `tissue_positions.csv` and fall back to the legacy `tissue_positions_list.csv`.
    tissue_positions_file = path / "spatial/tissue_positions.csv"
    if not tissue_positions_file.exists():
        tissue_positions_file = path / "spatial/tissue_positions_list.csv"

    # `tissue_positions.csv` starts with a single header line, the legacy list file has none.
    # `header=0` consumes exactly that line; `header=1` would instead use the first spot's row as the
    # header and silently drop that spot from the coordinates.
    coords = pd.read_csv(
        tissue_positions_file,
        header=0 if tissue_positions_file.name == "tissue_positions.csv" else None,
        index_col=0,
    )
    coords.columns = ["in_tissue", "array_row", "array_col", "pxl_col_in_fullres", "pxl_row_in_fullres"]

    # Left merge keeps every barcode of the count matrix, in its original order.
    adata.obs = pd.merge(adata.obs, coords, how="left", left_index=True, right_index=True)
    pixel_columns = ["pxl_row_in_fullres", "pxl_col_in_fullres"]
    adata.obsm[Key.obsm.spatial] = adata.obs[pixel_columns].values
    adata.obs.drop(columns=pixel_columns, inplace=True)

    if source_image_path is not None:
        source_image_path = Path(source_image_path).absolute()
        if not source_image_path.exists():
            logg.warning(f"Path to the high-resolution tissue image `{source_image_path}` does not exist")
        # Written to the same library entry as above instead of a hard-coded "spatial" key.
        library_uns["metadata"]["source_image_path"] = str(source_image_path)

    return adata
def vizgen(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    transformation_file: str | None = None,
    library_id: str = "library",
    **kwargs: Any,
) -> AnnData:
    """
    Read *Vizgen* formatted dataset.

    Besides the count matrix, the cell metadata file is read and, if requested, the transformation matrix
    is loaded as well.

    .. seealso::

        - `Vizgen data release program <https://vizgen.com/data-release-program/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Vizgen* files.
    counts_file
        File containing the counts. Typically ends with *_cell_by_gene.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
    transformation_file
        Transformation matrix file for converting micron coordinates into pixels in images.
        It is looked up inside the *images* sub-directory of ``path``.
    library_id
        Identifier for the *Vizgen* library. Useful when concatenating multiple :class:`anndata.AnnData` objects.
    kwargs
        Additional keyword arguments forwarded to the function reading the count file.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` - spatial spot coordinates in microns.
        - :attr:`anndata.AnnData.obsm` ``['blank_genes']`` - blank genes from Vizgen platform.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{library_id}']['scalefactors']['transformation_matrix']`` -
          transformation matrix for converting micron coordinates to pixels.
          Only present if ``transformation_file != None``.
    """
    root = Path(path)
    adata, library_id = _read_counts(
        path=root,
        count_file=counts_file,
        library_id=library_id,
        delimiter=",",
        first_column_names=True,
        **kwargs,
    )

    # Variables whose name contains "Blank" are moved out of the expression matrix into `.obsm`.
    is_blank = np.array(["Blank" in name for name in adata.var_names])
    blank_counts = adata[:, is_blank].X.copy()
    adata.obsm["blank_genes"] = pd.DataFrame(
        blank_counts,
        columns=adata.var_names[is_blank],
        index=adata.obs_names,
    )
    adata = adata[:, ~is_blank].copy()
    adata.X = csr_matrix(adata.X)

    # Cell-level metadata; the column names are replaced by this fixed set of 8 names.
    cell_meta = pd.read_csv(root / meta_file, header=0, index_col=0)
    cell_meta.columns = [
        "fov", "volume", "center_x", "center_y", "min_x", "max_x", "min_y", "max_y",
    ]  # fmt: skip
    adata.obs = adata.obs.merge(cell_meta, how="left", left_index=True, right_index=True)

    # Cell centers become the spatial coordinates and are removed from `.obs`.
    center_columns = ["center_x", "center_y"]
    adata.obsm[Key.obsm.spatial] = adata.obs[center_columns].values
    adata.obs.drop(columns=center_columns, inplace=True)

    if transformation_file is None:
        return adata

    matrix = pd.read_csv(root / f"images/{transformation_file}", sep=" ", header=None)
    adata.uns[Key.uns.spatial][library_id]["scalefactors"] = {"transformation_matrix": matrix}
    return adata
def nanostring(
    path: str | Path,
    *,
    counts_file: str,
    meta_file: str,
    fov_file: str | None = None,
) -> AnnData:
    """
    Read *Nanostring* formatted dataset.

    In addition to reading the regular *Nanostring* output, it loads the metadata file, *CellComposite* and *CellLabels*
    directories containing the images and optionally the field of view file.

    .. seealso::

        - `Nanostring Spatial Molecular Imager <https://nanostring.com/products/cosmx-spatial-molecular-imager/>`_.
        - :func:`squidpy.pl.spatial_scatter` on how to plot spatial data.

    Parameters
    ----------
    path
        Path to the root directory containing *Nanostring* files.
    counts_file
        File containing the counts. Typically ends with *_exprMat_file.csv*.
    meta_file
        File containing the spatial coordinates and additional cell-level metadata.
        Typically ends with *_metadata_file.csv*.
    fov_file
        File containing the coordinates of all the fields of view.

    Returns
    -------
    Annotated data object with the following keys:

        - :attr:`anndata.AnnData.obsm` ``['spatial']`` -  local coordinates of the centers of cells.
        - :attr:`anndata.AnnData.obsm` ``['spatial_fov']`` - global coordinates of the centers of cells in the
          field of view.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['images']`` - *hires* and *segmentation* images.
        - :attr:`anndata.AnnData.uns` ``['spatial']['{fov}']['metadata']['{x,y}_global_px']`` - coordinates of the field of view.
          Only present if ``fov_file != None``.

    Notes
    -----
    Only cells present in both ``counts_file`` and ``meta_file`` are kept. An image directory that does not exist,
    an image file whose name contains no ``_F<number>`` token, and an image or field-of-view row that refers to
    a field of view without an entry in ``uns['spatial']`` are skipped with a warning instead of raising.
    """  # noqa: E501
    path, fov_key = Path(path), "fov"
    cell_id_key = "cell_ID"

    # Rows of both tables are identified by "<cell_ID>_<fov>" (the index is built as cell_ID + "_" + fov).
    counts = pd.read_csv(path / counts_file, header=0, index_col=cell_id_key)
    counts.index = counts.index.astype(str).str.cat(counts.pop(fov_key).astype(str).values, sep="_")

    obs = pd.read_csv(path / meta_file, header=0, index_col=cell_id_key)
    obs[fov_key] = pd.Categorical(obs[fov_key].astype(str))
    obs[cell_id_key] = obs.index.astype(np.int64)
    obs.rename_axis(None, inplace=True)
    obs.index = obs.index.astype(str).str.cat(obs[fov_key].values, sep="_")

    # Keep only the cells that have both counts and metadata.
    common_index = obs.index.intersection(counts.index)

    adata = AnnData(
        csr_matrix(counts.loc[common_index, :].values),
        dtype=counts.values.dtype,
        obs=obs.loc[common_index, :],
        uns={Key.uns.spatial: {}},
    )
    adata.var_names = counts.columns

    adata.obsm[Key.obsm.spatial] = adata.obs[["CenterX_local_px", "CenterY_local_px"]].values
    adata.obsm["spatial_fov"] = adata.obs[["CenterX_global_px", "CenterY_global_px"]].values
    adata.obs.drop(columns=["CenterX_local_px", "CenterY_local_px"], inplace=True)

    # One `uns` entry per field-of-view category, with unit scale factors.
    for fov in adata.obs[fov_key].cat.categories:
        adata.uns[Key.uns.spatial][fov] = {
            "images": {},
            "scalefactors": {"tissue_hires_scalef": 1, "spot_diameter_fullres": 1},
        }

    file_extensions = (".jpg", ".png", ".jpeg", ".tif", ".tiff")
    # The field of view is encoded in the file name as `_F<number>`.
    pat = re.compile(r".*_F(\d+)")
    for subdir in ["CellComposite", "CellLabels"]:
        image_dir = path / subdir
        if not image_dir.is_dir():
            logg.warning(f"Image directory `{image_dir}` does not exist, skipping it.")
            continue

        kind = "hires" if subdir == "CellComposite" else "segmentation"
        for fname in os.listdir(image_dir):
            if not fname.endswith(file_extensions):
                continue

            fov_ids = pat.findall(fname)
            if not fov_ids:
                logg.warning(f"Unable to determine the FOV of image `{fname}` in `{subdir}`, skipping it.")
                continue

            # `int` strips leading zeros so that e.g. `_F001` maps to FOV `1`.
            fov = str(int(fov_ids[0]))
            # Check before loading so that an image of an unknown FOV is never read from disk.
            if fov not in adata.uns[Key.uns.spatial]:
                logg.warning(f"FOV `{fov}` does not exist, skipping image `{fname}` in `{subdir}`.")
                continue
            adata.uns[Key.uns.spatial][fov]["images"][kind] = _load_image(image_dir / fname)

    if fov_file is not None:
        fov_positions = pd.read_csv(path / fov_file, header=0, index_col=fov_key)
        for fov, row in fov_positions.iterrows():
            try:
                adata.uns[Key.uns.spatial][str(fov)]["metadata"] = row.to_dict()
            except KeyError:
                logg.warning(f"FOV `{str(fov)}` does not exist, skipping it.")
                continue

    return adata