Basic Spatial Transcriptomics Analysis with Harpy#

In this notebook, we will work with mouse liver spatial transcriptomics data generated using Resolve Biosciences’ Molecular Cartography technology.

import harpy as hp

1. Read in the data#

The first step includes reading in the raw data.

The example dataset for this notebook will be downloaded and cached using pooch via harpy.dataset.registry.

Convert to zarr and read the image#

import os
import tempfile
import uuid

from spatialdata import SpatialData
from spatialdata import read_zarr

from harpy.datasets.registry import get_registry

from spatialdata.transformations import Scale, Identity

from dask_image import imread

# change this path. It is the directory where the spatialdata .zarr will be saved.
OUTPUT_DIR = tempfile.gettempdir()

image_name = "raw_image"

path = None  # If None, example data will be downloaded in the default cache folder of your os. Set this to a custom path, to change this behaviour.
registry = get_registry(path=path)
path_image = registry.fetch("transcriptomics/resolve/mouse/20272_slide1_A1-1_DAPI.tiff")

# you can choose any name for your zarr file
output_path = os.path.join(OUTPUT_DIR, f"sdata_{uuid.uuid4()}.zarr")

sdata = SpatialData()
sdata.write(output_path)
sdata = read_zarr(sdata.path)

arr = imread.imread(path_image)

# we will add the image in the micron and pixel coordinate system
pixel_size = 0.138  # pixel size in micron for resolve

scale = Scale(axes=["x", "y"], scale=[pixel_size, pixel_size])

sdata = hp.im.add_image(
    sdata,
    arr=arr.rechunk(1024),
    output_image_name=image_name,
    transformations={"pixel": Identity(), "micron": scale},
    scale_factors=[2, 2, 2, 2],
    c_coords="DAPI",
)
sdata
SpatialData object, with associated Zarr store: /private/var/folders/q5/7yhs0l6d0x771g7qdbhvkvmr0000gp/T/sdata_21e9705d-6c32-4296-b47c-b06327941efa.zarr
└── Images
      └── 'raw_image': DataTree[cyx] (1, 12864, 10720), (1, 6432, 5360), (1, 3216, 2680), (1, 1608, 1340), (1, 804, 670)
with coordinate systems:
    ▸ 'micron', with elements:
        raw_image (Images)
    ▸ 'pixel', with elements:
        raw_image (Images)

2. Plot the image#

import numpy as np
import dask.array as da
from matplotlib.colors import Normalize

import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(4, 4))
channel = "DAPI"
crd = [3000, 4000, 3000, 4000]  # crop is in pixel coordinates
to_coordinate_system = "pixel"

# normalization parameters for visualization (underlying image not changed)
# for visualization we normalize by the 99th percentile
se = hp.im.get_dataarray(sdata, element_name=image_name)
_channel_idx = np.where(se.c.data == channel)[0].item()
vmax = da.percentile(se.data[_channel_idx].flatten(), q=99).compute()
norm = Normalize(vmax=vmax, clip=False)

render_images_kwargs = {"cmap": "viridis", "norm": norm}
show_kwargs = {"title": channel, "colorbar": False}
ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
)
# frameless figure
ax.axis("off")

plt.tight_layout()
plt.show()
../../_images/ce9d2745fd9b53ea14327ef3480cac3ea9e16d8b58a82c4882adc304d788c41d.png

Or plot full image in micron coordinates:

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
channel = "DAPI"
to_coordinate_system = "micron"

render_images_kwargs = {
    "cmap": "viridis",
    "norm": norm,
}
show_kwargs = {
    "title": channel,
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    channel=channel,
    crd=None,
    to_coordinate_system=to_coordinate_system,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
)

plt.tight_layout()
plt.show()
../../_images/bc77db1bf3386cdf2f73b77babc5c4a6b0ef36d5a936123101b9bb36f6b3bbfc.png

Queries can also be done in micron coordinates:

fig, ax = plt.subplots(1, 1, figsize=(4, 4))
channel = "DAPI"
crd = [400, 500, 400, 500]  # crops can also be specified in micron coordinates
to_coordinate_system = "micron"

render_images_kwargs = {"cmap": "viridis", "norm": norm}
show_kwargs = {"title": channel, "colorbar": False}
ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system="micron",
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
)

plt.tight_layout()
plt.show()
../../_images/a1bcf279c67ec445f9729ba85d93e94a9f6f80b69c213f1acf06e9853a024dad.png

3. Segment using Cellpose#

import torch
from dask.distributed import Client, LocalCluster

if torch.cuda.is_available():
    from dask_cuda import LocalCUDACluster  # pip install dask-cuda

    # One worker per GPU; each worker will be pinned to a single GPU.
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=[0],  # or [0,1],...etc
        n_workers=1,  # 2 if [0,1],...etc
        threads_per_worker=1,
        memory_limit="32GB",
        # local_directory=os.environ.get( "TMPDIR" ),#
    )
else:
    # cpu/mps fall back
    from dask.distributed import LocalCluster

    cluster = LocalCluster(
        n_workers=1
        if torch.backends.mps.is_available()
        else 8,  # If mps/cuda device available, it is better to increase chunk size to maximal value that fits on the gpu, and set n_workers to 1.
        # For this dummy example, we only have one chunk, so setting n_workers>1, has no effect.
        threads_per_worker=1,
        memory_limit="32GB",
        # local_directory=os.environ.get( "TMPDIR" ),
    )

client = Client(cluster)

print(client.dashboard_link)

sdata = hp.im.segment(
    sdata,
    image_name="raw_image",
    depth=200,
    model=hp.im.cellpose_callable,
    # parameters that will be passed to the callable cellpose_callable
    diameter=50,
    flow_threshold=0.9,
    cellprob_threshold=-4,
    output_labels_name="segmentation_mask",
    output_shapes_name="segmentation_mask_boundaries",
    crd=[
        3000,
        4000,
        3000,
        4000,
    ],  # region to segment [x_min, xmax, y_min, y_max], must be defined in pixel coordinates
    to_coordinate_system="pixel",
    overwrite=True,
)
client.close()
http://127.0.0.1:8787/status

4. Visualize resulting segmentation#

fig, axes = plt.subplots(1, 2, figsize=(8, 4))

channel = "DAPI"
image_name = "raw_image"
labels_name = "segmentation_mask"
crd = [
    3000,
    4000,
    3000,
    4000,
]  # Crop in pixels. One could also choose to do the crop in micron coordinates
to_coordinate_system = "pixel"

render_images_kwargs = {"cmap": "viridis", "norm": norm}
render_labels_kwargs = {"fill_alpha": 0.6, "outline_alpha": 0.4}
show_kwargs = {"title": channel, "colorbar": False}
_ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[0],
)
_ax.axis("off")

show_kwargs = {"title": "Segmentation mask", "colorbar": False}
_ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    labels_name=labels_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    render_labels_kwargs=render_labels_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[1],
)
_ax.axis("off")

plt.tight_layout()
plt.show()
../../_images/237e5f819db3a4f204c3723a3f902f1d0286d32a153b63d3b36d6d7cdae8db1e.png

5. Read the transcripts#

path_transcripts = registry.fetch(
    "transcriptomics/resolve/mouse/20272_slide1_A1-1_results.txt"
)
sdata = hp.io.read_resolve_transcripts(
    sdata,
    path_count_matrix=path_transcripts,
    output_points_name="transcripts",
    to_coordinate_system="pixel",
    to_micron_coordinate_system="micron",
    pixel_size=pixel_size,
    overwrite=True,
)
sdata
SpatialData object, with associated Zarr store: /private/var/folders/q5/7yhs0l6d0x771g7qdbhvkvmr0000gp/T/sdata_21e9705d-6c32-4296-b47c-b06327941efa.zarr
├── Images
│     └── 'raw_image': DataTree[cyx] (1, 12864, 10720), (1, 6432, 5360), (1, 3216, 2680), (1, 1608, 1340), (1, 804, 670)
├── Labels
│     └── 'segmentation_mask': DataArray[yx] (1000, 1000)
├── Points
│     └── 'transcripts': DataFrame with shape: (<Delayed>, 3) (2D points)
└── Shapes
      └── 'segmentation_mask_boundaries': GeoDataFrame shape: (121, 1) (2D shapes)
with coordinate systems:
    ▸ 'micron', with elements:
        raw_image (Images), segmentation_mask (Labels), transcripts (Points), segmentation_mask_boundaries (Shapes)
    ▸ 'pixel', with elements:
        raw_image (Images), segmentation_mask (Labels), transcripts (Points), segmentation_mask_boundaries (Shapes)
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

render_images_kwargs = {
    "cmap": "binary",
}
show_kwargs = {
    "title": "transcripts (in micron)",
    "colorbar": False,
}

hp.pl.plot_sdata_genes(
    sdata,
    points_name="transcripts",
    image_name="raw_image",
    genes=None,  # plot all genes
    color="cornflowerblue",
    size=0.1,
    frac=0.5,  # only plot half of them
    to_coordinate_system="micron",
    show_kwargs=show_kwargs,
    render_images_kwargs=render_images_kwargs,
    ax=axes[0],
)

render_images_kwargs = {
    "cmap": "grey",
}
show_kwargs = {
    "title": "transcripts (in micron)",
    "colorbar": False,
}

hp.pl.plot_sdata_genes(
    sdata,
    points_name="transcripts",
    image_name="raw_image",
    genes=["Axl", "Stab2"],
    palette=["#4D88FF", "#FFE082"],
    size=5.0,
    frac=None,
    to_coordinate_system="micron",
    show_kwargs=show_kwargs,
    render_images_kwargs=render_images_kwargs,
    ax=axes[1],
)
INFO     Value for parameter 'color' appears to be a color, using it as such.
INFO     Using 'datashader' backend with 'None' as reduction method to speed up plotting. Depending on the         
         reduction method, the value range of the plot might change. Set method to 'matplotlib' do disable this    
         behaviour.
INFO     Using 'datashader' backend with 'None' as reduction method to speed up plotting. Depending on the         
         reduction method, the value range of the plot might change. Set method to 'matplotlib' do disable this    
         behaviour.
<Axes: title={'center': 'transcripts (in micron)'}>
../../_images/8b8854b2243662d3bf5cbfb10a1e18f4724529a5c053b7adc89c82b29095abd6.png

Or visualize a crop.

Querying points is slow in spatialdata, therefore, we query the points outside of spatialdata (and therefore also the image).

from spatialdata import bounding_box_query

crd = [3000, 4000, 3000, 4000]

to_coordinate_system = "pixel"  # we query in 'intrinsic' pixel coordinate system

y_start = crd[2]
x_start = crd[0]

y_end = crd[3]
x_end = crd[1]

name_x = "x"
name_y = "y"

y_query = f"{y_start} <={name_y} < {y_end}"
x_query = f"{x_start} <={name_x} < {x_end}"

query = f"{y_query} and {x_query}"

sdata["transcripts"] = sdata["transcripts"].categorize(columns=["gene"])
sdata["transcripts_queried"] = sdata["transcripts"].query(query)

sdata["raw_image_queried"] = bounding_box_query(
    sdata["raw_image"],
    axes=["x", "y"],
    min_coordinate=[crd[0], crd[2]],
    max_coordinate=[crd[1], crd[3]],
    target_coordinate_system=to_coordinate_system,
)
fig, axes = plt.subplots(1, 1, figsize=(6, 6))

render_images_kwargs = {
    "cmap": "pink",
}
show_kwargs = {
    "title": channel,
    "colorbar": False,
}

hp.pl.plot_sdata_genes(
    sdata,
    points_name="transcripts_queried",
    image_name="raw_image_queried",
    genes=["Axl", "Stab2"],
    palette=["#4D88FF", "#FFE082"],
    size=5.0,
    frac=None,
    crd=None,
    to_coordinate_system=to_coordinate_system,
    show_kwargs=show_kwargs,
    render_images_kwargs=render_images_kwargs,
    ax=axes,
)

del sdata["transcripts_queried"]
del sdata["raw_image_queried"]

axes.axis("off")
(np.float64(3000.0),
 np.float64(4000.0),
 np.float64(4000.0),
 np.float64(3000.0))
../../_images/0820141e29f77c271c6dc0311cabe40d53642f1654dd40eb9dffae078d810ed6.png

6. Create the AnnData table#

# Use dask cluster for optimal processing. For this dummy example, one could also work without a dask cluster.
cluster = LocalCluster(
    n_workers=8,
    threads_per_worker=1,
    processes=True,
    memory_limit="8GB",
    # local_directory=os.environ.get( "TMPDIR" ),
)

client = Client(cluster)

print(client.dashboard_link)
http://127.0.0.1:49706/status
sdata = hp.tb.allocate(
    sdata,
    labels_name="segmentation_mask",
    points_name="transcripts",
    output_table_name="table_transcriptomics",
    chunks=None,
    to_coordinate_system="pixel",
    update_shapes_elements=False,
    overwrite=True,
)
sdata["table_transcriptomics"]

client.close()
sdata["table_transcriptomics"].to_df().head()
Adamtsl2 Adgre1 Atp6v0d2 Axl C5ar1 Ccr2 Cd207 Cd209a Cd36 Cd3e ... Sox9 Spn Spon2 Stab2 Timd4 Tmem119 Vsig4 Vwf Wnt2 Wnt9b
cells
65_segmentation_mask_a6baaa75 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
69_segmentation_mask_a6baaa75 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
71_segmentation_mask_a6baaa75 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
81_segmentation_mask_a6baaa75 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
83_segmentation_mask_a6baaa75 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 71 columns

Now visualize the expression of 2 genes:

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

channel = "DAPI"
image_name = "raw_image"
labels_name = "segmentation_mask"
table_name = "table_transcriptomics"
crd = [3000, 4000, 3000, 4000]
to_coordinate_system = "pixel"

render_images_kwargs = {"cmap": "binary", "norm": norm}
render_labels_kwargs = {"fill_alpha": 0.6, "outline_alpha": 0.4}

# subset because querying of points is slow, and we do not need them in this visualization
sdata_to_plot = sdata.subset(element_names=[image_name, labels_name, table_name])

color = "Axl"
show_kwargs = {
    "title": f"{color}",
    "colorbar": True,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[0],
)
ax.axis("off")

# color by mean intensity
color = "Stab2"
show_kwargs = {
    "title": f"{color}",
    "colorbar": True,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[1],
)
ax.axis("off")

plt.tight_layout()
plt.show()
../../_images/31c307824e8adc5f0d8167b8aec1e605e12b65fa0b4b7155b905574a479783c7.png

6. Leiden Clustering#

sdata = hp.tb.preprocess_transcriptomics(
    sdata,
    labels_name="segmentation_mask",
    table_name="table_transcriptomics",
    output_table_name="table_transcriptomics_preprocessed",
    update_shapes_elements=False,
    overwrite=True,
)
sdata = hp.tb.leiden(
    sdata,
    labels_name="segmentation_mask",
    table_name="table_transcriptomics_preprocessed",
    output_table_name="table_transcriptomics_preprocessed",
    overwrite=True,
)
import scanpy as sc

fig, ax = plt.subplots(figsize=(4, 4))

sc.pl.umap(
    sdata["table_transcriptomics_preprocessed"],
    color="leiden",
    size=400,
    ax=ax,
)
../../_images/90d7a248fb1b280dceda85aa8eaeca954a7d83cd9a7bc4d811823efd2cfb30d2.png
fig, axes = plt.subplots(1, 1, figsize=(5, 5))

channel = "DAPI"
image_name = "raw_image"
labels_name = "segmentation_mask"
table_name = "table_transcriptomics_preprocessed"
crd = [3000, 4000, 3000, 4000]
to_coordinate_system = "pixel"

render_images_kwargs = {"cmap": "binary", "norm": norm}
render_labels_kwargs = {"fill_alpha": 0.6, "outline_alpha": 0.4}

# subset because querying of points is slow, and we do not need them in this visualization
sdata_to_plot = sdata.subset(element_names=[image_name, labels_name, table_name])

color = "leiden"
show_kwargs = {
    "title": f"{color} clusters",
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes,
)
ax.axis("off")
(np.float64(3000.0),
 np.float64(4000.0),
 np.float64(4000.0),
 np.float64(3000.0))
../../_images/2204c4108ec53c09b1a061eeb1ed083b90da5b557e52c9556238ffe0f5af253e.png