Example spatial transcriptomics (Xenium)

Example spatial transcriptomics (Xenium)#

Data downloaded from 10x Genomics (accessed 25/11/2025): https://www.10xgenomics.com/datasets/xenium-prime-ffpe-human-ovarian-cancer

1. Read in the data#

import os

import harpy as hp

path_sdata = os.path.join(
    os.environ.get("TMPDIR"), "sdata.zarr"
)  # or pick a path where the SpatialData object will be backed.

# takes around 7 minutes
sdata = hp.datasets.xenium_human_ovarian_cancer(output=path_sdata)

2. Plot the images#

H&E and annotated H&E

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(20, 10))

image_name = ["he_image_global_ROI1", "he_image_annotated_global_ROI1"]

for _image_name, _ax in zip(image_name, axes, strict=True):
    show_kwargs = {
        "title": _image_name,
        "colorbar": False,
    }
    ax = hp.pl.plot_sdata(
        sdata,
        image_name=_image_name,
        channel=["r", "g", "b"],
        to_coordinate_system="global_ROI1",  # or if you want to plot in micron coordinates, specify "global_ROI1_micron"
        show_kwargs=show_kwargs,
        ax=_ax,
    )

INFO     Rasterizing image for faster rendering.                                                                   
INFO     Rasterizing image for faster rendering.

../../_images/6fd4c65d9cef3641873cc73dbbd0a7c9bbdb59627f31cce963a73a0efb993faf.png

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(20, 10))

show_kwargs = {"colorbar": False}

image_name = ["he_image_global_ROI1", "he_image_annotated_global_ROI1"]
for _image_name, _ax in zip(image_name, axes, strict=True):
    show_kwargs = {
        "title": _image_name,
        "colorbar": False,
    }
    ax = hp.pl.plot_sdata(
        sdata,
        image_name=_image_name,
        crd=[15000, 25000, 10000, 20000],
        channel=["r", "g", "b"],
        to_coordinate_system="global_ROI1",
        show_kwargs=show_kwargs,
        ax=_ax,
    )

../../_images/b7f8f98312f9f5abd69885c73fa8acb742c7d9f719e71336c160ba0c6a622261.png

Morphology focus

This is the image that is used for segmentation.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

image_name = "morphology_focus_global_ROI1"
channel = hp.im.get_dataarray(sdata, element_name=image_name).c.data.tolist()

show_kwargs = {
    "title": image_name,
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    channel=channel,
    to_coordinate_system="global_ROI1",  # or if you want to plot in micron coordinates, specify "global_ROI1_micron"
    show_kwargs=show_kwargs,
    ax=ax,
)

INFO     Rasterizing image for faster rendering.                                                                   
INFO     Your image has 4 channels. Sampling categorical colors and using multichannel strategy 'stack' to render.

../../_images/1f9703af4d451d131d215ecf41ce63d35c4fbba2c9657602b0f9098856450db9.png

2. Visualize the transcripts#

import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(8, 8))

render_images_kwargs = {
    "cmap": "binary",
}
show_kwargs = {
    "title": "transcripts",
    "colorbar": False,
}

hp.pl.plot_sdata_genes(
    sdata,
    points_name="transcripts_global_ROI1",
    image_name="morphology_focus_global_ROI1",
    channel="DAPI",
    genes=None,  # plot all genes
    color="cornflowerblue",
    size=0.1,
    frac=0.01,  # only plot 1% of them
    to_coordinate_system="global_ROI1",
    show_kwargs=show_kwargs,
    render_images_kwargs=render_images_kwargs,
    ax=ax,
)

INFO     Value for parameter 'color' appears to be a color, using it as such.                                      
INFO     Rasterizing image for faster rendering.
INFO     Using 'datashader' backend with 'None' as reduction method to speed up plotting. Depending on the         
         reduction method, the value range of the plot might change. Set method to 'matplotlib' do disable this    
         behaviour.

<Axes: title={'center': 'transcripts'}>

../../_images/089b491941b1f4b728c7edbdda8499f5f56ac43cbc43b6031a4fada4e46ffb4f.png

Now plot some genes associated with tumor cells.

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(16, 8))

render_images_kwargs = {
    "cmap": "binary",
}

for gene, ax in zip(["PLXNB1", "CP"], axes, strict=True):
    show_kwargs = {
        "title": gene,
        "colorbar": False,
    }

    hp.pl.plot_sdata_genes(
        sdata,
        points_name="transcripts_global_ROI1",
        image_name="morphology_focus_global_ROI1",
        channel="DAPI",
        genes=[
            gene
        ],  #  You could specify a list of genes here. But note that if they occur at same location, they will plotted onto each other.
        palette=["#4D88FF"],
        size=0.1,
        frac=None,
        to_coordinate_system="global_ROI1",
        show_kwargs=show_kwargs,
        render_images_kwargs=render_images_kwargs,
        ax=ax,
    )
plt.tight_layout()
plt.show()

INFO     Rasterizing image for faster rendering.
INFO     input has more than 103 categories. Uniform 'grey' color will be used for all categories.                 
INFO     Using 'datashader' backend with 'None' as reduction method to speed up plotting. Depending on the         
         reduction method, the value range of the plot might change. Set method to 'matplotlib' do disable this    
         behaviour.
INFO     Rasterizing image for faster rendering.                                                                   
INFO     input has more than 103 categories. Uniform 'grey' color will be used for all categories.                 
INFO     Using 'datashader' backend with 'None' as reduction method to speed up plotting. Depending on the         
         reduction method, the value range of the plot might change. Set method to 'matplotlib' do disable this    
         behaviour.

../../_images/1c05c307c7bdb7d3f7dedd41563b789a7632138a951332df655c04ba48dbd2d7.png

Calculate the transcript density

sdata = hp.im.transcript_density(
    sdata,
    points_name="transcripts_global_ROI1",
    image_name="morphology_focus_global_ROI1",
    output_image_name="transcript_density",
    to_coordinate_system="global_ROI1",
    scale_factors=[2, 2, 2, 2],
    overwrite=True,
)

import numpy as np

import dask.array as da

from matplotlib.colors import Normalize

scale = None  # auto pick scale by spatialdata-plot
image_name = "transcript_density"

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

channel = 0
se = hp.im.get_dataarray(sdata, element_name=image_name)
_channel_idx = np.where(se.c.data == channel)[0].item()
vmax = da.percentile(se.data[_channel_idx].flatten(), q=95).compute()
norm = Normalize(vmax=vmax, clip=False)
print(norm)

crd = None
sdata_to_plot = sdata.subset(element_names=[image_name])

render_images_kwargs = {"cmap": "grey", "scale": scale, "norm": norm}
show_kwargs = {
    "title": "transcript density",
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    channel=channel,
    crd=crd,
    to_coordinate_system="global_ROI1",
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
)

plt.tight_layout()
plt.show()

<matplotlib.colors.Normalize object at 0x7f4e992eae40>
INFO     Rasterizing image for faster rendering.

../../_images/5f3f7f546ad39b969c0271669c64a51609565e5a0c2bf974a13b11554d468583.png

3. Segmentation#

Segmentation is already provided by 10x Genomics. We will inspect these segmentation masks, and also illustate how the user could generate segmentation masks using harpy.

First visualize segmentation masks provided by 10x Genomics.

fig, axes = plt.subplots(1, 2, figsize=(20, 10))

channel = "DAPI"
image_name = "morphology_focus_global_ROI1"
labels_name = "cell_labels_global_ROI1"

crd = [15000, 20000, 10000, 15000]

to_coordinate_system = "global_ROI1"

render_images_kwargs = {"cmap": "viridis"}
render_labels_kwargs = {"fill_alpha": 0.6, "outline_alpha": 0.4}
show_kwargs = {"title": f"DAPI in {to_coordinate_system}", "colorbar": False}

hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[0],
)
# _ax.axis("off")

show_kwargs = {
    "title": f"Segmentation mask in {to_coordinate_system}",
    "colorbar": False,
}
hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    labels_name=labels_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    render_labels_kwargs=render_labels_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[1],
)
# _ax.axis("off")

plt.tight_layout()
plt.show()

../../_images/7bce48dcc41229b8199e5957d6e13b5338bc15dd0ba3e872221ff131126361dc.png

In many cases, you want to generate these segmentation masks yourself. Therefore, for illustration purposes, we show how to segment using Cellpose via harpy.im.segment, and how to generate an AnnData table using hp.tb.allocate.

Make sure to install Cellpose before running the following cell (pip install cellpose).

import os
import torch
from dask.distributed import Client, LocalCluster

if torch.cuda.is_available():
    from dask_cuda import LocalCUDACluster  # pip install dask-cuda

    # One worker per GPU; each worker will be pinned to a single GPU.
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=[0],  # or [0,1],...etc
        n_workers=1,  # 2 if [0,1],...etc
        threads_per_worker=1,
        memory_limit="32GB",
        local_directory=os.environ.get("TMPDIR"),
    )
else:
    # cpu/mps fall back
    from dask.distributed import LocalCluster

    cluster = LocalCluster(
        n_workers=1
        if torch.backends.mps.is_available()
        else 8,  # If mps/cuda device available, it is better to increase chunk size to maximal value that fits on the gpu, and set n_workers to 1.
        # For this dummy example, we only have one chunk, so setting n_workers>1, has no effect.
        threads_per_worker=1,
        memory_limit="32GB",
        local_directory=os.environ.get("TMPDIR"),
    )

client = Client(cluster)

print(client.dashboard_link)

sdata = hp.im.segment(
    sdata,
    image_name="morphology_focus_global_ROI1",  # note, currently cellpose only supports segmentation of 3 channels. So the channel at index -1 will not be used (AlphaSMA/Vimentin)
    depth=200,
    model=hp.im.cellpose_callable,
    # parameters that will be passed to the callable cellpose_callable
    diameter=50,
    flow_threshold=0.9,
    cellprob_threshold=-4,
    output_labels_name="cell_labels_harpy_global_ROI1",
    output_shapes_name=None,
    crd=[15000, 20000, 10000, 15000],
    # region to segment [x_min, xmax, y_min, y_max], must be defined in pixel coordinates, e.g. specify the pixel coordinate system
    to_coordinate_system="global_ROI1",
    overwrite=True,
)
client.close()

http://127.0.0.1:8787/status

Lets inspect the segmentation masks we generated.

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(20, 10))

channel = "DAPI"
image_name = "morphology_focus_global_ROI1"
labels_name = "cell_labels_harpy_global_ROI1"

crd = [15000, 20000, 10000, 15000]

to_coordinate_system = "global_ROI1"

render_images_kwargs = {"cmap": "viridis"}
render_labels_kwargs = {"fill_alpha": 0.6, "outline_alpha": 0.4}
show_kwargs = {"title": f"DAPI in {to_coordinate_system}", "colorbar": False}

hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[0],
)
# _ax.axis("off")

show_kwargs = {
    "title": f"Segmentation mask (Harpy/Cellpose) in {to_coordinate_system}",
    "colorbar": False,
}
hp.pl.plot_sdata(
    sdata,
    image_name=image_name,
    labels_name=labels_name,
    crd=crd,
    to_coordinate_system=to_coordinate_system,
    channel=channel,
    render_images_kwargs=render_images_kwargs,
    render_labels_kwargs=render_labels_kwargs,
    show_kwargs=show_kwargs,
    ax=axes[1],
)
# _ax.axis("off")

plt.tight_layout()
plt.show()

INFO     Rasterizing image for faster rendering.

../../_images/826ec499fe6bc1ce297cae0612a734230fedd50a80f7e2db044700e1a7d75a6c.png

4. Create the AnnData table#

Below we illustrate how an AnnData table can be created using this custom generated segmentation mask via the harpy function harpy.tb.allocate.

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(
    n_workers=16,
    threads_per_worker=1,
    processes=True,
    memory_limit="8GB",
    local_directory=os.environ.get("TMPDIR"),  # folder where dask will spill to disk
)

client = Client(cluster)

sdata = sdata = hp.tb.allocate(
    sdata,
    labels_name="cell_labels_harpy_global_ROI1",
    points_name="transcripts_global_ROI1",
    output_table_name="table_harpy_global_ROI1",
    to_coordinate_system="global_ROI1",  # Do allocation step in the intrinsic pixel coordinate system,
    append=False,
    overwrite=True,
    update_shapes_elements=False,
)

client.close()

5. Preprocess the AnnData table#

We proceed working with the AnnData table provided by 10x Genomics. But the same logic applies to a custom generated AnnData table.

labels_name = "cell_labels_global_ROI1"
table_name = "table_global_ROI1"
points_name = "transcripts_global_ROI1"

filtered = hp.qc.analyse_genes_left_out(
    sdata,
    labels_name=labels_name,
    table_name=table_name,
    points_name=points_name,
    to_coordinate_system=to_coordinate_system,
)

../../_images/cb76c5e5e13ab92fd0f389ad6664d362c2269b4f977b34b2d9b05f66df588c87.png

../../_images/ba56577af00e24348f8a5bbedb956b54591e864aa18fd40f2dd0f75a1e1a1dc7.png

We proceed with filtering of cells with low transcript count, and genes with low cell count; normalization using cell size; a log transformation; and scaling such that the data has mean zero and a variance of one.

sdata = hp.tb.preprocess_transcriptomics(
    sdata,
    labels_name=labels_name,
    table_name=table_name,
    output_table_name="table_global_ROI1_preprocessed",
    size_norm=True,
    update_shapes_elements=False,
    overwrite=True,
)

hp.pl.preprocess_transcriptomics(
    sdata,
    table_name="table_global_ROI1_preprocessed",
)

../../_images/2729c473f6e6de856c54a525a8f920ade7803d9ac78dbeafd09382f87820514c.png

../../_images/e8a44f9083410af23fb7952d7b0a1a264c84cae54ab16d12e104428e8abbc6f9.png

../../_images/b249c9efdd5f9a9b9d87f9a5395e525d1d1080ef866683f6bff2198c5195d734.png

../../_images/6792ed44574f2afb2f9059ef9dc5638276d1347367346552cd2377c484bb5529.png

Visualize the expression after preprocessing.

# show expression after preprocessing;
import matplotlib.pyplot as plt

channel = "DAPI"
image_name = "morphology_focus_global_ROI1"
labels_name = "cell_labels_global_ROI1"
table_name = "table_global_ROI1_preprocessed"

fig, axes = plt.subplots(1, 1, figsize=(10, 10))

to_coordinate_system = to_coordinate_system
scale = None  # auto pick scale by spatialdata-plot
color = "PLXNB1"  # visualize expression of specific genes
crd = None

# if points in, query is slow, so remove them for plotting
sdata_to_plot = sdata.subset(element_names=[image_name, labels_name, table_name])

render_images_kwargs = {"cmap": "grey", "norm": None, "scale": scale}
render_labels_kwargs = {"fill_alpha": 0.5, "outline_alpha": 0.4, "scale": scale}
show_kwargs = {
    "title": color,
    "colorbar": False,
}
axes = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    channel=channel,
    crd=crd,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=axes,
    to_coordinate_system=to_coordinate_system,
)

plt.tight_layout()
plt.show()

INFO     Rasterizing image for faster rendering.                                                                   
INFO     Rasterizing image for faster rendering.

../../_images/9ca65d55613aec94a5f253c4e64b5bb1a720ba99376655571b3c4cf0de22c993.png

6. Leiden clustering.#

# takes around 40 minutes
# This can consume quite some RAM, as the AnnData table is in memory.

labels_name = "cell_labels_global_ROI1"
table_name = "table_global_ROI1_preprocessed"

sdata = hp.tb.leiden(
    sdata,
    labels_name=labels_name,
    table_name=table_name,
    output_table_name=table_name,
    overwrite=True,
)

sdata.tables["table_global_ROI1_preprocessed"].obs.head()

	transcript_counts	control_probe_counts	genomic_control_counts	control_codeword_counts	unassigned_codeword_counts	deprecated_codeword_counts	total_counts	cell_area	nucleus_area	nucleus_count	...	cell_ID	group	n_genes_by_counts	log1p_n_genes_by_counts	log1p_total_counts	pct_counts_in_top_2_genes	pct_counts_in_top_5_genes	n_counts	shapeSize	leiden
cell_id
aaaaebmm-1	303	0	0	0	0	31	303.0	44.659533	31.835157	1.0	...	1	Proliferative Tumor Cells	246	5.509388	5.717028	7.260726	10.231023	303.0	989	3
aaaafhpp-1	311	0	0	0	0	28	311.0	57.664533	38.247345	1.0	...	2	Tumor Cells	255	5.545177	5.743003	2.893891	6.109325	311.0	1277	10
aaaahcem-1	301	0	0	0	0	32	301.0	39.827814	27.590470	1.0	...	3	Proliferative Tumor Cells	249	5.521461	5.710427	4.983389	8.637874	301.0	882	3
aaaakeoi-1	369	0	0	0	0	58	369.0	60.870627	38.879533	1.0	...	4	Proliferative Tumor Cells	306	5.726848	5.913503	3.252033	5.962060	369.0	1348	3
aaaalald-1	295	0	0	0	0	47	295.0	50.484689	46.510939	1.0	...	5	Tumor Cells	254	5.541264	5.690360	5.423729	8.813559	295.0	1118	10

5 rows × 23 columns

import scanpy as sc

sc.pl.umap(sdata.tables["table_global_ROI1_preprocessed"], color=["leiden"], show=True)
# this dataset is annotated, lets see how clusters correspond to cell type
sc.pl.umap(sdata.tables["table_global_ROI1_preprocessed"], color=["group"], show=True)

../../_images/8b779d034306dfc60b8b5f656cb30b61de7497b1c93a046e57e90b8a493c4556.png

../../_images/c101e48093955d9dc19d7d0ab709aa62d61fda6ceabbdb382d4ac9b5e993f49f.png

sc.pl.rank_genes_groups(
    sdata.tables["table_global_ROI1_preprocessed"], n_genes=8, sharey=False, show=False
)

[<Axes: title={'center': '0 vs. rest'}, ylabel='score'>,
 <Axes: title={'center': '1 vs. rest'}>,
 <Axes: title={'center': '2 vs. rest'}>,
 <Axes: title={'center': '3 vs. rest'}>,
 <Axes: title={'center': '4 vs. rest'}, ylabel='score'>,
 <Axes: title={'center': '5 vs. rest'}>,
 <Axes: title={'center': '6 vs. rest'}>,
 <Axes: title={'center': '7 vs. rest'}>,
 <Axes: title={'center': '8 vs. rest'}, ylabel='score'>,
 <Axes: title={'center': '9 vs. rest'}>,
 <Axes: title={'center': '10 vs. rest'}>,
 <Axes: title={'center': '11 vs. rest'}>,
 <Axes: title={'center': '12 vs. rest'}, ylabel='score'>,
 <Axes: title={'center': '13 vs. rest'}>,
 <Axes: title={'center': '14 vs. rest'}>,
 <Axes: title={'center': '15 vs. rest'}>,
 <Axes: title={'center': '16 vs. rest'}, xlabel='ranking', ylabel='score'>,
 <Axes: title={'center': '17 vs. rest'}, xlabel='ranking'>,
 <Axes: title={'center': '18 vs. rest'}, xlabel='ranking'>]

../../_images/b564776cbc5061b2162a06a32b8c908928f13471a7ca1ea701967f7fc86f1167.png

import matplotlib.pyplot as plt

channel = "DAPI"
image_name = "morphology_focus_global_ROI1"
labels_name = "cell_labels_global_ROI1"
table_name = "table_global_ROI1_preprocessed"
to_coordinate_system = "global_ROI1"


to_coordinate_system = to_coordinate_system
scale = None  # auto pick scale by spatialdata-plot
crd = None

color = "leiden"

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# if points in, query is slow, so remove them for plotting
sdata_to_plot = sdata.subset(element_names=[image_name, labels_name, table_name])

render_images_kwargs = {"cmap": "grey", "norm": None, "scale": scale}
render_labels_kwargs = {"fill_alpha": 0.5, "outline_alpha": 0.4, "scale": scale}
show_kwargs = {
    "title": color,
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    channel=channel,
    crd=crd,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
    to_coordinate_system=to_coordinate_system,
)
# _ax.axis("off")

plt.tight_layout()
plt.show()

INFO     Rasterizing image for faster rendering.                                                                   
INFO     Rasterizing image for faster rendering.

../../_images/6bd88e9f475b27fba069d6d6ff3e9ffc013c9c56b930c036200cd0de23a12da9.png

import matplotlib.pyplot as plt

channel = "DAPI"
image_name = "morphology_focus_global_ROI1"
labels_name = "cell_labels_global_ROI1"
table_name = "table_global_ROI1_preprocessed"
to_coordinate_system = "global_ROI1"


to_coordinate_system = to_coordinate_system
scale = None  # auto pick scale by spatialdata-plot
crd = None

color = "group"

fig, ax = plt.subplots(1, 1, figsize=(20, 20))

# if points in, query is slow, so remove them for plotting
sdata_to_plot = sdata.subset(element_names=[image_name, labels_name, table_name])

render_images_kwargs = {"cmap": "grey", "norm": None, "scale": scale}
render_labels_kwargs = {"fill_alpha": 0.5, "outline_alpha": 0.4, "scale": scale}
show_kwargs = {
    "title": color,
    "colorbar": False,
}
ax = hp.pl.plot_sdata(
    sdata_to_plot,
    image_name=image_name,
    labels_name=labels_name,
    table_name=table_name,
    color=color,
    channel=channel,
    crd=crd,
    render_images_kwargs=render_images_kwargs,
    show_kwargs=show_kwargs,
    ax=ax,
    to_coordinate_system=to_coordinate_system,
)
# _ax.axis("off")

plt.tight_layout()
plt.show()

INFO     Rasterizing image for faster rendering.                                                                   
INFO     Rasterizing image for faster rendering.

../../_images/511d1938f4aed05f29c938ba362c7bca2453fdb142bb2dae7c87760482d99838.png

Note the correspondence between Leiden cluster 17, and the Inflammatory Tumor Cells.

We also estimate the correlation between leiden clusters and the annotated cell type.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency

table_name = "table_global_ROI1_preprocessed"
cluster_key = "leiden"
cell_type_key = "group"

df = sdata[table_name].obs

df = df[~df[cell_type_key].isna()]

contingency = pd.crosstab(df[cell_type_key], df[cluster_key])

# chi squared test
chi2, p, dof, expected = chi2_contingency(contingency)


# cramers v
def _cramers_v(confusion_matrix):
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    return np.sqrt(phi2 / min(k - 1, r - 1))


print("Cramers v:", _cramers_v(contingency))

print("Chi-squared statistic:", chi2)
print("p-value:", p)

plt.figure(figsize=(12, 6))
sns.heatmap(contingency, cmap="viridis")
plt.title(f"Cell type × {cluster_key} association")
plt.show()

# Calculate standardized residuals
residuals = (contingency - expected) / np.sqrt(expected)

residuals_df = pd.DataFrame(
    residuals, index=contingency.index, columns=contingency.columns
)

resid_long = residuals_df.stack().reset_index()
resid_long.columns = ["cell_type", cluster_key, "std_residual"]

# Sort by strength of deviation
resid_long["abs_resid"] = resid_long["std_residual"].abs()
top_effects = resid_long.sort_values("abs_resid", ascending=False).head(20)
print(top_effects)

plt.figure(figsize=(12, 6))
sns.heatmap(residuals_df, center=0, cmap="coolwarm", annot=False)
plt.title(f"Standardized residuals: cell_type × {cluster_key}")
plt.show()

Cramers v: 0.7613470480652001
Chi-squared statistic: 3925208.4534459906
p-value: 0.0
                                cell_type leiden  std_residual   abs_resid
Stromal Associated Endothelial Cells      9    558.842983  558.842983
                      T and NK Cells     13    554.391652  554.391652
            Ciliated Epithelial Cells     18    553.300254  553.300254
        Tumor Associated Fibroblasts      1    513.570314  513.570314
            Fallopian Tube Epithelium     16    505.783565  505.783565
      Stromal Associated Fibroblasts      8    485.604699  485.604699
                          Macrophages      5    466.487412  466.487412
           Proliferative Tumor Cells      3    449.341736  449.341736
                  VEGFA+ Tumor Cells     11    438.169924  438.169924
         Malignant Cells Lining Cyst     15    425.370919  425.370919
             Inflammatory Tumor Cells     17    425.115462  425.115462
                           Pericytes     14    400.937651  400.937651
                 Smooth Muscle Cells      4    389.203349  389.203349
                      Granulosa Cells     15    385.049195  385.049195
                SOX2-OT+ Tumor Cells      6    351.807910  351.807910
  Tumor Associated Endothelial Cells     14    343.997358  343.997358
                 Smooth Muscle Cells      7    327.645762  327.645762
                         Tumor Cells      0    259.965609  259.965609
                         Tumor Cells      2    235.059284  235.059284
                         Tumor Cells     10    154.009542  154.009542

../../_images/1211f4dd4e1beaaa9e6b88dfcd4e3ccec833a073af326aec002ae7011f03b00c.png

../../_images/ec02688777ccd7b0d687b322550a2dbecbea274688f1dc03da19c5a064599d09.png