KRONOS: mapping marker IDs.#
This tutorial demonstrates how to map dataset-specific marker IDs to the pretrained marker IDs used by KRONOS.
import harpy as hp

# Fetch the CODEX example data.
sdata = hp.datasets.codex_example()

# Fetch the KRONOS marker metadata, i.e. the marker metadata that is specific
# to the KRONOS model, through the harpy dataset registry.
_marker_metadata_key = "proteomics/codex/chl_maps_dataset/marker_metadata.csv"
registry = hp.datasets.get_registry()
marker_metadata_path = registry.fetch(_marker_metadata_key)
import pandas as pd

# Marker names are read from the `c` coordinate of the image
# (presumably the channel axis, in acquisition order -- verify for your own data).
marker_names = sdata["image"]["scale4"]["image"].c.data
_n_markers = len(marker_names)

# The dataset-specific marker ID is simply the position of the marker in `marker_names`.
marker_data_specific = pd.DataFrame(
    {
        "marker_id": range(_n_markers),
        "marker_name": marker_names,
    }
)
marker_data_specific.head()  # dataset-specific marker info file.
| marker_id | marker_name | |
|---|---|---|
| 0 | 0 | BCL-2 |
| 1 | 1 | CCR6 |
| 2 | 2 | CD11B |
| 3 | 3 | CD11C |
| 4 | 4 | CD15 |
import pandas as pd

# Load the pretrained marker metadata. Its ID column is renamed to
# "marker_id_pretrained" so it cannot be confused with the dataset-specific
# "marker_id" column once both tables are merged.
marker_metadata = pd.read_csv(marker_metadata_path).rename(
    columns={"marker_id": "marker_id_pretrained"}
)
marker_metadata.head()  # pretrained marker metadata file.
| marker_name | marker_id_pretrained | marker_mean | marker_std | |
|---|---|---|---|---|
| 0 | DAPI | 4 | 0.083207 | 0.095882 |
| 1 | ARID1A | 8 | 0.005042 | 0.010528 |
| 2 | ATRX | 10 | 0.021219 | 0.048886 |
| 3 | BCL6 | 12 | 0.023329 | 0.054438 |
| 4 | CDT1 | 14 | 0.003630 | 0.019132 |
# Inner join on the marker name: only markers whose name occurs in both the
# pretrained metadata and the dataset-specific table are kept.
_merged = marker_metadata.merge(marker_data_specific, how="inner", on="marker_name")

# Order the result by the dataset-specific marker ID.
matched_markers = _merged.sort_values(by="marker_id").reset_index(drop=True)

print(f"There are {matched_markers.shape[0]} matched markers.")
display(matched_markers)
There are 31 matched markers.
| marker_name | marker_id_pretrained | marker_mean | marker_std | marker_id | |
|---|---|---|---|---|---|
| 0 | CCR6 | 166 | 0.044867 | 0.042833 | 1 |
| 1 | CD11B | 180 | 0.032169 | 0.052366 | 2 |
| 2 | CD11C | 182 | 0.019039 | 0.044336 | 3 |
| 3 | CD15 | 194 | 0.016322 | 0.040416 | 4 |
| 4 | CD16 | 196 | 0.041869 | 0.055626 | 5 |
| 5 | CD162 | 198 | 0.012217 | 0.040094 | 6 |
| 6 | CD163 | 200 | 0.014384 | 0.033087 | 7 |
| 7 | CD2 | 212 | 0.161256 | 0.110404 | 8 |
| 8 | CD20 | 214 | 0.045192 | 0.057727 | 9 |
| 9 | CD206 | 216 | 0.014008 | 0.044501 | 10 |
| 10 | CD25 | 224 | 0.046293 | 0.060704 | 11 |
| 11 | CD30 | 234 | 0.080204 | 0.056237 | 12 |
| 12 | CD31 | 236 | 0.007018 | 0.026488 | 13 |
| 13 | CD4 | 248 | 0.017620 | 0.043489 | 14 |
| 14 | CD44 | 254 | 0.025730 | 0.062900 | 15 |
| 15 | CD45RA | 258 | 0.046235 | 0.070005 | 16 |
| 16 | CD45RO | 260 | 0.029777 | 0.074940 | 17 |
| 17 | CD5 | 268 | 0.060517 | 0.094988 | 19 |
| 18 | CD56 | 272 | 0.027449 | 0.049417 | 20 |
| 19 | CD57 | 274 | 0.005938 | 0.029239 | 21 |
| 20 | CD68 | 282 | 0.013593 | 0.033644 | 22 |
| 21 | CD69 | 284 | 0.029540 | 0.045456 | 23 |
| 22 | CD7 | 286 | 0.056185 | 0.082216 | 24 |
| 23 | CD8 | 294 | 0.050695 | 0.084633 | 25 |
| 24 | EGFR | 330 | 0.067375 | 0.074557 | 29 |
| 25 | FOXP3 | 28 | 0.014452 | 0.038268 | 30 |
| 26 | MCT | 428 | 0.018428 | 0.019523 | 35 |
| 27 | PODOPLANIN | 456 | 0.044324 | 0.082199 | 40 |
| 28 | VIMENTIN | 496 | 0.045974 | 0.058084 | 46 |
| 29 | A-SMA | 130 | 0.029575 | 0.041247 | 47 |
| 30 | B-CATENIN | 142 | 0.032558 | 0.059073 | 48 |
# Dataset-specific markers whose name has no exact counterpart in the
# pretrained marker metadata.
_has_pretrained_name = marker_data_specific["marker_name"].isin(marker_metadata["marker_name"])
missing_markers = marker_data_specific.loc[~_has_pretrained_name]
print(
    f"There are {missing_markers.shape[0]} markers that could not be matched to a marker from the pretraining dataset."
)
missing_markers.head()
There are 18 markers that could not be matched to a marker from the pretraining dataset.
| marker_id | marker_name | |
|---|---|---|
| 0 | 0 | BCL-2 |
| 18 | 18 | CD46 |
| 26 | 26 | COLLAGEN 4 |
| 27 | 27 | CYTOKERITIN |
| 28 | 28 | DAPI-01 |
# Pretrained markers whose name does not occur in the dataset-specific table.
# These are the candidates to pick from when mapping the remaining
# dataset-specific markers manually.
_used_by_dataset = marker_metadata["marker_name"].isin(marker_data_specific["marker_name"])
unmatched_markers = marker_metadata.loc[~_used_by_dataset]
print(f"There are {unmatched_markers.shape[0]} markers from the pretraining dataset that where not yet matched.")
unmatched_markers.head()
There are 146 markers from the pretraining dataset that where not yet matched.
| marker_name | marker_id_pretrained | marker_mean | marker_std | |
|---|---|---|---|---|
| 0 | DAPI | 4 | 0.083207 | 0.095882 |
| 1 | ARID1A | 8 | 0.005042 | 0.010528 |
| 2 | ATRX | 10 | 0.021219 | 0.048886 |
| 3 | BCL6 | 12 | 0.023329 | 0.054438 |
| 4 | CDT1 | 14 | 0.003630 | 0.019132 |
# Keep track of both the data-specific marker name and the pretrained marker
# name. For the exact matches found so far the two are identical.
_column_order = [
    "marker_name",
    "marker_name_pretrained",
    "marker_id_pretrained",
    "marker_mean",
    "marker_std",
    "marker_id",
]
matched_markers = matched_markers.assign(marker_name_pretrained=matched_markers["marker_name"])[_column_order]
matched_markers.head()
| marker_name | marker_name_pretrained | marker_id_pretrained | marker_mean | marker_std | marker_id | |
|---|---|---|---|---|---|---|
| 0 | CCR6 | CCR6 | 166 | 0.044867 | 0.042833 | 1 |
| 1 | CD11B | CD11B | 180 | 0.032169 | 0.052366 | 2 |
| 2 | CD11C | CD11C | 182 | 0.019039 | 0.044336 | 3 |
| 3 | CD15 | CD15 | 194 | 0.016322 | 0.040416 | 4 |
| 4 | CD16 | CD16 | 196 | 0.041869 | 0.055626 | 5 |
Let's match the unmatched#
# idea taken from https://github.com/mahmoodlab/KRONOS/blob/1f57c51efd863968cfa491819aef0e37dd3e2fdb/tutorials/utils/marker_metadata.py#L60
from difflib import SequenceMatcher

import numpy as np

# For every dataset-specific marker that has no exact match, rank the still
# unused pretrained marker names by string similarity and keep the best ones
# as suggestions for the manual mapping.
_n_suggestions = 5

# Loop-invariant lookups, computed once instead of once per missing marker.
_candidate_names = unmatched_markers["marker_name"].values
_candidate_list = unmatched_markers["marker_name"].to_list()
_missing_names = missing_markers["marker_name"].values

top_5_suggestions = []
for _missing_marker in _missing_names:
    # The dataset-specific name is upper-cased before comparison; the
    # pretrained names are compared as they are.
    similarity_list = np.array(
        [SequenceMatcher(None, _missing_marker.upper(), marker_name).ratio() for marker_name in _candidate_list]
    )
    # kind="stable" is equivalent to stable=True but also works on NumPy < 2.0.
    # Reversing the ascending stable order gives a descending ranking.
    sorted_index = np.argsort(similarity_list, kind="stable")[::-1]
    top_5_suggestions.append(_candidate_names[sorted_index][:_n_suggestions])

if top_5_suggestions:
    top_5_suggestions = np.stack(top_5_suggestions)
else:
    # Nothing left to map: np.stack would raise on an empty list, so build an
    # empty table with the same number of suggestion columns instead.
    top_5_suggestions = np.empty((0, min(_n_suggestions, len(_candidate_list))), dtype=object)

top_5_suggestions_names = [f"marker name suggestion {i + 1} (pretrained)" for i in range(top_5_suggestions.shape[1])]

# Combine into DataFrame
top_5_suggestions = pd.DataFrame(top_5_suggestions, columns=top_5_suggestions_names, index=_missing_names)
top_5_suggestions.index.name = "marker name (data specific)"
display(top_5_suggestions)

# Template dictionary that can be copied and filled in by hand.
print("Following markers need to be manually mapped: ")
dict.fromkeys(top_5_suggestions.index, "")
| marker name suggestion 1 (pretrained) | marker name suggestion 2 (pretrained) | marker name suggestion 3 (pretrained) | marker name suggestion 4 (pretrained) | marker name suggestion 5 (pretrained) | |
|---|---|---|---|---|---|
| marker name (data specific) | |||||
| BCL-2 | BCL2 | BDCA-2 | BCL6 | B2M | CD28 |
| CD46 | CD61 | CD54 | CD45 | CD40 | CD36 |
| COLLAGEN 4 | COLLAGEN | LANGERIN | CTLA4 | CATHEPSIN L | LAG3 |
| CYTOKERITIN | CYTOKERATIN | CLUSTERIN | E-CADHERIN | CATHEPSIN L | LANGERIN |
| DAPI-01 | DAPI | VDAC1 | IL-1B | BDCA-2 | PD1 |
| GRANZYME B | GZMB | LYSOZYME | RELB | LANGERIN | RB |
| HLA-DR | HLA_DR | HLA_DRA | HLA_DRBPB | HLA1 | HLA_1 |
| IDO-1 | IDO1 | IL-1B | ARID1A | PD1 | IGD |
| LAG-3 | LAG3 | SIGELC-3 | LANGERIN | COLLAGEN | HLA1 |
| MMP-9 | MMP9 | MPO | LMP1 | TMPRSS2 | SIGLEC-9 |
| MUC-1 | MUC1 | MUC5AC | C1Q | LMP1 | CD61 |
| PD-1 | PD1 | PDL1 | LMP1 | IDO1 | G6PD |
| PD-L1 | PDL1 | PD1 | LMP1 | IDO1 | HLA1 |
| T-BET | TBET | PTEN | TREM2 | TCR_B | IL-1B |
| TCR-G-D | TCR_GD | TCR_B | DC-SIGN | IGD | CD3 |
| TCRB | TCR_B | RB | TCR_GD | ERB | GITR |
| TIM-3 | TIM3 | GALECTIN-3 | IGM | SIGELC-3 | TREM2 |
| VISA | VISTA | INOS | IGA2 | IGA1 | ICOS |
Following markers need to be manually mapped:
{'BCL-2': '',
'CD46': '',
'COLLAGEN 4': '',
'CYTOKERITIN': '',
'DAPI-01': '',
'GRANZYME B': '',
'HLA-DR': '',
'IDO-1': '',
'LAG-3': '',
'MMP-9': '',
'MUC-1': '',
'PD-1': '',
'PD-L1': '',
'T-BET': '',
'TCR-G-D': '',
'TCRB': '',
'TIM-3': '',
'VISA': ''}
# Manual mapping, written as (data-specific marker name, pretrained marker name)
# pairs. Markers without a suitable pretrained counterpart are left out here.
_manual_pairs = [
    ("BCL-2", "BCL2"),
    ("COLLAGEN 4", "COLLAGEN"),
    ("CYTOKERITIN", "CYTOKERATIN"),
    ("DAPI-01", "DAPI"),
    ("GRANZYME B", "GZMB"),
    ("IDO-1", "IDO1"),
    ("LAG-3", "LAG3"),
    ("MMP-9", "MMP9"),
    ("MUC-1", "MUC1"),
    ("PD-1", "PD1"),
    ("PD-L1", "PDL1"),
    ("T-BET", "TBET"),
    ("TIM-3", "TIM3"),
    ("TCR-G-D", "TCR_GD"),
    ("TCRB", "TCR_B"),
    ("HLA-DR", "HLA_DR"),
]
manual_map = dict(_manual_pairs)
manual_map
{'BCL-2': 'BCL2',
'COLLAGEN 4': 'COLLAGEN',
'CYTOKERITIN': 'CYTOKERATIN',
'DAPI-01': 'DAPI',
'GRANZYME B': 'GZMB',
'IDO-1': 'IDO1',
'LAG-3': 'LAG3',
'MMP-9': 'MMP9',
'MUC-1': 'MUC1',
'PD-1': 'PD1',
'PD-L1': 'PDL1',
'T-BET': 'TBET',
'TIM-3': 'TIM3',
'TCR-G-D': 'TCR_GD',
'TCRB': 'TCR_B',
'HLA-DR': 'HLA_DR'}
# Append the manually mapped markers to the matched_markers dataframe.
#
# All entries of `manual_map` are validated first and the new rows are
# collected in a list; `matched_markers` is only replaced once every entry
# passed, so a failing check does not leave a half-updated dataframe behind.

# Names that are already mapped; extended while iterating so that duplicates
# inside `manual_map` itself are detected as well.
_mapped_data_names = set(matched_markers["marker_name"])
_mapped_pretrained_names = set(matched_markers["marker_name_pretrained"])
# All names known to the dataset and to the pretrained metadata respectively.
_all_data_names = set(marker_data_specific["marker_name"])
_all_pretrained_names = set(marker_metadata["marker_name"])

_manual_rows = []
for _key, _value in manual_map.items():
    # some sanity checks
    if _value in _mapped_pretrained_names:
        raise ValueError(f"'{_value}' already exists, please map to a marker not already in matched_markers dataframe.")
    if _value in _mapped_data_names:
        raise ValueError(
            f"You mapped to a marker name '{_value}' that is in the data specific column. Please verify your manual mapping."
        )
    # NOTE: membership is tested against the column *values*. `x in series`
    # would test the index labels instead and never trigger here.
    if _key in _mapped_data_names:
        raise ValueError(f"Marker '{_key}' is already mapped.")
    if _key in _mapped_pretrained_names:
        raise ValueError(f"Marker '{_key}' is in the pretrained column. Please verify your manual mapping.")
    if _key not in _all_data_names:
        raise ValueError(f"Marker '{_key}' is not in data specific marker data.")
    if _value not in _all_pretrained_names:
        raise ValueError(f"marker '{_value}' not in pretrained marker metadata file.")

    # unmatched_markers holds pretrained marker names
    _match = unmatched_markers[unmatched_markers["marker_name"] == _value].copy()
    # keep track of both marker_name_pretrained and data specific marker name ('marker_name')
    _match.rename(columns={"marker_name": "marker_name_pretrained"}, inplace=True)
    _match["marker_name"] = _key
    # marker id, is the id in the data specific marker data.
    _marker_id = marker_data_specific.loc[marker_data_specific["marker_name"] == _key, "marker_id"].item()
    _match["marker_id"] = _marker_id

    _manual_rows.append(_match)
    _mapped_data_names.add(_key)
    _mapped_pretrained_names.add(_value)

# Single concatenation and sort instead of one per manual mapping.
matched_markers = pd.concat([matched_markers, *_manual_rows]).sort_values(by="marker_id").reset_index(drop=True)
matched_markers
| marker_name | marker_name_pretrained | marker_id_pretrained | marker_mean | marker_std | marker_id | |
|---|---|---|---|---|---|---|
| 0 | BCL-2 | BCL2 | 150 | 0.047104 | 0.060276 | 0 |
| 1 | CCR6 | CCR6 | 166 | 0.044867 | 0.042833 | 1 |
| 2 | CD11B | CD11B | 180 | 0.032169 | 0.052366 | 2 |
| 3 | CD11C | CD11C | 182 | 0.019039 | 0.044336 | 3 |
| 4 | CD15 | CD15 | 194 | 0.016322 | 0.040416 | 4 |
| 5 | CD16 | CD16 | 196 | 0.041869 | 0.055626 | 5 |
| 6 | CD162 | CD162 | 198 | 0.012217 | 0.040094 | 6 |
| 7 | CD163 | CD163 | 200 | 0.014384 | 0.033087 | 7 |
| 8 | CD2 | CD2 | 212 | 0.161256 | 0.110404 | 8 |
| 9 | CD20 | CD20 | 214 | 0.045192 | 0.057727 | 9 |
| 10 | CD206 | CD206 | 216 | 0.014008 | 0.044501 | 10 |
| 11 | CD25 | CD25 | 224 | 0.046293 | 0.060704 | 11 |
| 12 | CD30 | CD30 | 234 | 0.080204 | 0.056237 | 12 |
| 13 | CD31 | CD31 | 236 | 0.007018 | 0.026488 | 13 |
| 14 | CD4 | CD4 | 248 | 0.017620 | 0.043489 | 14 |
| 15 | CD44 | CD44 | 254 | 0.025730 | 0.062900 | 15 |
| 16 | CD45RA | CD45RA | 258 | 0.046235 | 0.070005 | 16 |
| 17 | CD45RO | CD45RO | 260 | 0.029777 | 0.074940 | 17 |
| 18 | CD5 | CD5 | 268 | 0.060517 | 0.094988 | 19 |
| 19 | CD56 | CD56 | 272 | 0.027449 | 0.049417 | 20 |
| 20 | CD57 | CD57 | 274 | 0.005938 | 0.029239 | 21 |
| 21 | CD68 | CD68 | 282 | 0.013593 | 0.033644 | 22 |
| 22 | CD69 | CD69 | 284 | 0.029540 | 0.045456 | 23 |
| 23 | CD7 | CD7 | 286 | 0.056185 | 0.082216 | 24 |
| 24 | CD8 | CD8 | 294 | 0.050695 | 0.084633 | 25 |
| 25 | COLLAGEN 4 | COLLAGEN | 312 | 0.023570 | 0.069731 | 26 |
| 26 | CYTOKERITIN | CYTOKERATIN | 322 | 0.017070 | 0.040264 | 27 |
| 27 | DAPI-01 | DAPI | 4 | 0.083207 | 0.095882 | 28 |
| 28 | EGFR | EGFR | 330 | 0.067375 | 0.074557 | 29 |
| 29 | FOXP3 | FOXP3 | 28 | 0.014452 | 0.038268 | 30 |
| 30 | GRANZYME B | GZMB | 360 | 0.021288 | 0.036292 | 31 |
| 31 | HLA-DR | HLA_DR | 370 | 0.056301 | 0.094876 | 32 |
| 32 | IDO-1 | IDO1 | 390 | 0.048158 | 0.067393 | 33 |
| 33 | LAG-3 | LAG3 | 410 | 0.034682 | 0.056881 | 34 |
| 34 | MCT | MCT | 428 | 0.018428 | 0.019523 | 35 |
| 35 | MMP-9 | MMP9 | 432 | 0.100762 | 0.103386 | 36 |
| 36 | MUC-1 | MUC1 | 436 | 0.008583 | 0.032340 | 37 |
| 37 | PD-1 | PD1 | 446 | 0.025946 | 0.044720 | 38 |
| 38 | PD-L1 | PDL1 | 450 | 0.036110 | 0.056350 | 39 |
| 39 | PODOPLANIN | PODOPLANIN | 456 | 0.044324 | 0.082199 | 40 |
| 40 | T-BET | TBET | 68 | 0.021789 | 0.046029 | 41 |
| 41 | TCR-G-D | TCR_GD | 482 | 0.011759 | 0.025139 | 42 |
| 42 | TCRB | TCR_B | 480 | 0.097150 | 0.102599 | 43 |
| 43 | TIM-3 | TIM3 | 486 | 0.035809 | 0.059759 | 44 |
| 44 | VIMENTIN | VIMENTIN | 496 | 0.045974 | 0.058084 | 46 |
| 45 | A-SMA | A-SMA | 130 | 0.029575 | 0.041247 | 47 |
| 46 | B-CATENIN | B-CATENIN | 142 | 0.032558 | 0.059073 | 48 |
Manually set metadata:#
Taken from Kronos tutorials, manually set metadata:
If some markers are still unmatched with the pretrained dataset and you cannot ignore these markers, you can manually assign their marker ID, mean, and standard deviation values:
Marker ID: Choose an unassigned ID from the range 1–512 in marker_metadata.csv. Ideally, select an ID close to a biologically similar marker.
Mean & Std Values: Calculate these from your dataset for the corresponding markers. Ensure marker intensities are converted to float type and intensities are in range of 0-1 before computing the mean and standard deviation.
Marker IDs are assigned as integers from 1 to 512. In the pretrained dataset, nuclear markers are assigned IDs from 1 to 127, while non-nuclear markers receive IDs from 128 to 512. This grouping helps capture high-level similarities between markers of the same type. Within each category, markers are arranged alphabetically, but only even-numbered IDs are assigned to those included in the pretrained dataset. The odd-numbered IDs are intentionally left unassigned, reserved for biologically similar markers that were not part of the pretrained dataset. This approach allows end-users to assign marker IDs from the odd-numbered values, ensuring that any newly added markers remain closely linked to the existing structure while preserving biological relevance.
# Show the dataset-specific markers that are still not matched, i.e. whose
# marker ID does not occur in matched_markers yet.
_already_matched = marker_data_specific["marker_id"].isin(matched_markers["marker_id"])
missing_markers = marker_data_specific.loc[~_already_matched]
missing_markers
| marker_id | marker_name | |
|---|---|---|
| 18 | 18 | CD46 |
| 45 | 45 | VISA |
# Markers without any pretrained counterpart get a manually chosen pretrained
# ID plus a mean / std. As described in the text above, odd IDs are reserved
# for such user-added markers and the chosen ID must not be assigned yet.
df_manually_set_rows = pd.DataFrame(
    [
        {
            "marker_name": "CD46",
            "marker_id_pretrained": 295,
            "marker_mean": 0.051,
            "marker_std": 0.085,
            "marker_id": 18,
        },
        {
            "marker_name": "VISA",
            # 150 is already assigned to BCL2 in the pretrained metadata, so an
            # unassigned odd ID is used instead. 497 sits next to VIMENTIN (496);
            # adjust if a biologically closer neighbour is known.
            "marker_id_pretrained": 497,
            "marker_mean": 0.015,
            "marker_std": 0.014,
            "marker_id": 45,
        },
    ]
)

# Sanity checks on the marker names.
for _value in df_manually_set_rows["marker_name"].values:
    if _value in matched_markers["marker_name"].values:
        raise ValueError(f"Marker '{_value}' already matched.")
    if _value in matched_markers["marker_name_pretrained"].values:
        raise ValueError(
            f"Marker '{_value}' found in the 'marker_name_pretrained' column. Please verify your manual mapping."
        )

# Sanity check on the pretrained IDs: they may neither collide with an ID that
# is already in use (pretrained metadata or matched markers) nor with each other.
_manual_ids = df_manually_set_rows["marker_id_pretrained"]
_taken_ids = set(marker_metadata["marker_id_pretrained"]) | set(matched_markers["marker_id_pretrained"])
_clashing_ids = _manual_ids[_manual_ids.isin(_taken_ids) | _manual_ids.duplicated(keep=False)]
if not _clashing_ids.empty:
    raise ValueError(
        f"Pretrained marker ID(s) {sorted(set(_clashing_ids.tolist()))} are already assigned. "
        "Please choose unassigned IDs."
    )

# Rows added here have no 'marker_name_pretrained', so that column is NaN for them.
matched_markers = (
    pd.concat([matched_markers, df_manually_set_rows], ignore_index=True)
    .sort_values(by="marker_id")
    .reset_index(drop=True)
)
matched_markers
| marker_name | marker_name_pretrained | marker_id_pretrained | marker_mean | marker_std | marker_id | |
|---|---|---|---|---|---|---|
| 0 | BCL-2 | BCL2 | 150 | 0.047104 | 0.060276 | 0 |
| 1 | CCR6 | CCR6 | 166 | 0.044867 | 0.042833 | 1 |
| 2 | CD11B | CD11B | 180 | 0.032169 | 0.052366 | 2 |
| 3 | CD11C | CD11C | 182 | 0.019039 | 0.044336 | 3 |
| 4 | CD15 | CD15 | 194 | 0.016322 | 0.040416 | 4 |
| 5 | CD16 | CD16 | 196 | 0.041869 | 0.055626 | 5 |
| 6 | CD162 | CD162 | 198 | 0.012217 | 0.040094 | 6 |
| 7 | CD163 | CD163 | 200 | 0.014384 | 0.033087 | 7 |
| 8 | CD2 | CD2 | 212 | 0.161256 | 0.110404 | 8 |
| 9 | CD20 | CD20 | 214 | 0.045192 | 0.057727 | 9 |
| 10 | CD206 | CD206 | 216 | 0.014008 | 0.044501 | 10 |
| 11 | CD25 | CD25 | 224 | 0.046293 | 0.060704 | 11 |
| 12 | CD30 | CD30 | 234 | 0.080204 | 0.056237 | 12 |
| 13 | CD31 | CD31 | 236 | 0.007018 | 0.026488 | 13 |
| 14 | CD4 | CD4 | 248 | 0.017620 | 0.043489 | 14 |
| 15 | CD44 | CD44 | 254 | 0.025730 | 0.062900 | 15 |
| 16 | CD45RA | CD45RA | 258 | 0.046235 | 0.070005 | 16 |
| 17 | CD45RO | CD45RO | 260 | 0.029777 | 0.074940 | 17 |
| 18 | CD46 | NaN | 295 | 0.051000 | 0.085000 | 18 |
| 19 | CD5 | CD5 | 268 | 0.060517 | 0.094988 | 19 |
| 20 | CD56 | CD56 | 272 | 0.027449 | 0.049417 | 20 |
| 21 | CD57 | CD57 | 274 | 0.005938 | 0.029239 | 21 |
| 22 | CD68 | CD68 | 282 | 0.013593 | 0.033644 | 22 |
| 23 | CD69 | CD69 | 284 | 0.029540 | 0.045456 | 23 |
| 24 | CD7 | CD7 | 286 | 0.056185 | 0.082216 | 24 |
| 25 | CD8 | CD8 | 294 | 0.050695 | 0.084633 | 25 |
| 26 | COLLAGEN 4 | COLLAGEN | 312 | 0.023570 | 0.069731 | 26 |
| 27 | CYTOKERITIN | CYTOKERATIN | 322 | 0.017070 | 0.040264 | 27 |
| 28 | DAPI-01 | DAPI | 4 | 0.083207 | 0.095882 | 28 |
| 29 | EGFR | EGFR | 330 | 0.067375 | 0.074557 | 29 |
| 30 | FOXP3 | FOXP3 | 28 | 0.014452 | 0.038268 | 30 |
| 31 | GRANZYME B | GZMB | 360 | 0.021288 | 0.036292 | 31 |
| 32 | HLA-DR | HLA_DR | 370 | 0.056301 | 0.094876 | 32 |
| 33 | IDO-1 | IDO1 | 390 | 0.048158 | 0.067393 | 33 |
| 34 | LAG-3 | LAG3 | 410 | 0.034682 | 0.056881 | 34 |
| 35 | MCT | MCT | 428 | 0.018428 | 0.019523 | 35 |
| 36 | MMP-9 | MMP9 | 432 | 0.100762 | 0.103386 | 36 |
| 37 | MUC-1 | MUC1 | 436 | 0.008583 | 0.032340 | 37 |
| 38 | PD-1 | PD1 | 446 | 0.025946 | 0.044720 | 38 |
| 39 | PD-L1 | PDL1 | 450 | 0.036110 | 0.056350 | 39 |
| 40 | PODOPLANIN | PODOPLANIN | 456 | 0.044324 | 0.082199 | 40 |
| 41 | T-BET | TBET | 68 | 0.021789 | 0.046029 | 41 |
| 42 | TCR-G-D | TCR_GD | 482 | 0.011759 | 0.025139 | 42 |
| 43 | TCRB | TCR_B | 480 | 0.097150 | 0.102599 | 43 |
| 44 | TIM-3 | TIM3 | 486 | 0.035809 | 0.059759 | 44 |
| 45 | VISA | NaN | 150 | 0.015000 | 0.014000 | 45 |
| 46 | VIMENTIN | VIMENTIN | 496 | 0.045974 | 0.058084 | 46 |
| 47 | A-SMA | A-SMA | 130 | 0.029575 | 0.041247 | 47 |
| 48 | B-CATENIN | B-CATENIN | 142 | 0.032558 | 0.059073 | 48 |
# Export the final mapping to csv. An existing file is never overwritten.
import os

output_path = "/data/groups/technologies/spatial.catalyst/Arne/harpy/notebooks/kronos/marker_metadata_mapped.csv"  # change the output path

if not os.path.exists(output_path):
    matched_markers.to_csv(output_path, index=False)
    print(f"File saved to: {output_path}")
else:
    print(f"File already exists: {output_path}")
    print("Aborting to prevent overwrite.")
File already exists: /data/groups/technologies/spatial.catalyst/Arne/harpy/notebooks/kronos/marker_metadata_mapped.csv
Aborting to prevent overwrite.
# Read the exported mapping back from `output_path` (set in the export cell)
# rather than from a second hard-coded copy of the path, so that changing the
# output path in one place is enough.
df = pd.read_csv(output_path)
df.head()  # feed this to kronos, do mean and std normalization inside the chunk
| marker_name | marker_name_pretrained | marker_id_pretrained | marker_mean | marker_std | marker_id | |
|---|---|---|---|---|---|---|
| 0 | BCL-2 | BCL2 | 150 | 0.047104 | 0.060276 | 0 |
| 1 | CCR6 | CCR6 | 166 | 0.044867 | 0.042833 | 1 |
| 2 | CD11B | CD11B | 180 | 0.032169 | 0.052366 | 2 |
| 3 | CD11C | CD11C | 182 | 0.019039 | 0.044336 | 3 |
| 4 | CD15 | CD15 | 194 | 0.016322 | 0.040416 | 4 |