Arc Virtual Cell Atlas: scRNA-seq¶
The Arc Virtual Cell Atlas hosts one of the biggest collections of scRNA-seq datasets.
Lamin mirrors the dataset for simplified access here: laminlabs/arc-virtual-cell-atlas.
If you use the data academically, please cite the original publications, Youngblut et al. (2025) and Zhang et al. (2025).
Connect to the source instance.
# pip install 'lamindb[jupyter,bionty,wetlab,gcp]'
!lamin connect laminlabs/arc-virtual-cell-atlas
Note
If you want to transfer artifacts or metadata into your own instance, use .using("laminlabs/arc-virtual-cell-atlas")
when accessing registries and then .save()
(Transfer data).
import lamindb as ln
import bionty as bt
import wetlab as wl
import pyarrow.compute as pc
import anndata as ad
Tahoe-100M¶
project_tahoe = ln.Project.get(name="Tahoe-100M")
project_tahoe
Project(uid='H5MwZwyA62rG', name='Tahoe-100M', is_type=False, url='https://arcinstitute.org/tools/virtualcellatlas', branch_id=1, space_id=1, created_by_id=1, created_at=2025-02-26 16:03:40 UTC)
# one collection in this project
project_tahoe.collections.df()
uid | key | description | hash | reference | reference_type | space_id | meta_artifact_id | version | is_latest | run_id | created_at | created_by_id | _aux | branch_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||
1 | BpavRL4ntRTzWEE50000 | tahoe100 | None | GCLk4ZgQxgWspjmEUk3gIg | None | None | 1 | None | 2025-02-25 | True | 3 | 2025-02-26 13:51:22.787537+00:00 | 1 | None | 1 |
Every individual dataset in the atlas is an .h5ad
file that is registered as an artifact in LaminDB.
Artifact level metadata are registered and can be explored as follows:
# get the collection: https://lamin.ai/laminlabs/arc-virtual-cell-atlas/collection/BpavRL4ntRTzWEE5
collection_tahoe = ln.Collection.get(key="tahoe100")
# 14 artifacts in this collection, each corresponding to one plate
artifacts_tahoe = collection_tahoe.artifacts.distinct()
artifacts_tahoe.df()
50 cell lines.
artifacts_tahoe.list("cell_lines__name")[:5]
['A-172', 'A-427', 'A498', 'A549', 'AN3 CA']
380 compounds.
artifacts_tahoe.list("compounds__name")[:5]
['18β-Glycyrrhetinic acid',
'4EGI-1',
'5-Azacytidine',
'5-Fluorouracil',
'8-Hydroxyquinoline']
1,138 perturbations.
artifacts_tahoe.list("compound_perturbations__name")[:5]
["[('18β-Glycyrrhetinic acid', 0.05, 'uM')]",
"[('18β-Glycyrrhetinic acid', 0.5, 'uM')]",
"[('18β-Glycyrrhetinic acid', 5.0, 'uM')]",
"[('4EGI-1', 0.05, 'uM')]",
"[('4EGI-1', 0.5, 'uM')]"]
# check the curated metadata of the first artifact
artifact1 = artifacts_tahoe[0]
artifact1.describe()
16 obs metadata features.
artifact1.features["obs"].df()
Query artifacts of interest based on metadata¶
Since all metadata are registered in the SQL database, we can explore the datasets without accessing the underlying files.
Let’s find which datasets contain A549 cells perturbed with Piroxicam.
# lookup objects give you pythonic access to the values
cell_lines = bt.CellLine.lookup("ontology_id")
drugs = wl.Compound.lookup()
artifacts_a549_piroxicam = artifacts_tahoe.filter(
cell_lines=cell_lines.cvcl_0023, compounds=drugs.piroxicam
)
artifacts_a549_piroxicam.df()
uid | key | description | suffix | kind | otype | size | hash | n_files | n_observations | _hash_type | _key_is_virtual | _overwrite_versions | space_id | storage_id | schema_id | version | is_latest | run_id | created_at | created_by_id | _aux | branch_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||||
1362 | 56uA9lPPmJ4zLUcr0000 | 2025-02-25/h5ad/plate10_filt_Vevo_Tahoe100M_WS... | None | .h5ad | dataset | AnnData | 26536400717 | j1FXsX7hs7u+eBqnWnmNHw | None | 8044908 | md5 | False | False | 1 | 2 | 3 | None | True | 1 | 2025-02-25 23:22:17.849980+00:00 | 1 | None | 1 |
1363 | omn7JStfJMzy8m6O0000 | 2025-02-25/h5ad/plate11_filt_Vevo_Tahoe100M_WS... | None | .h5ad | dataset | AnnData | 23230802756 | N2mzoYlMLEl6PdecaYyDvw | None | 7435869 | md5 | False | False | 1 | 2 | 3 | None | True | 1 | 2025-02-25 23:22:18.229629+00:00 | 1 | None | 1 |
1364 | S2h2rPLCaUhZAM9u0000 | 2025-02-25/h5ad/plate12_filt_Vevo_Tahoe100M_WS... | None | .h5ad | dataset | AnnData | 37495736876 | VjAkWVFGVpzAMi9Innusuw | None | 10487057 | md5 | False | False | 1 | 2 | 3 | None | True | 1 | 2025-02-25 23:22:18.600910+00:00 | 1 | None | 1 |
You can download an .h5ad
into your local cache:
artifact1.cache()
Or stream it:
artifact1.open()
Open the obs metadata parquet file as a PyArrow Dataset¶
Open the obs metadata file (2.29 GB) as a PyArrow Dataset.
obs_metadata = ln.Artifact.filter(
key__endswith="obs_metadata.parquet", projects=project_tahoe
).one()
obs_metadata
Artifact(uid='y1TTR9wbrmZEwpOa0000', is_latest=True, key='2025-02-25/metadata/obs_metadata.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=2293981573, hash='qEWOpGw9CmQVzaElyMWT1Q', n_observations=100648790, branch_id=1, space_id=1, storage_id=2, run_id=1, created_by_id=1, created_at=2025-02-25 19:33:42 UTC)
obs_metadata_ds = obs_metadata.open()
obs_metadata_ds.schema
Find the A549 cells that are perturbed with Piroxicam:
filter_expr = (pc.field("cell_name") == cell_lines.cvcl_0023.name) & (
pc.field("drug") == drugs.piroxicam.name
)
obs_metadata_df = obs_metadata_ds.scanner(filter=filter_expr).to_table().to_pandas()
obs_metadata_df.value_counts("plate")
plate
plate12 2818
plate10 2812
plate11 2279
Name: count, dtype: int64
obs_metadata_df.head()
plate | BARCODE_SUB_LIB_ID | sample | gene_count | tscp_count | mread_count | drugname_drugconc | drug | cell_line | sublibrary | BARCODE | pcnt_mito | S_score | G2M_score | phase | pass_filter | cell_name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
29314 | plate10 | 50_030_183-lib_1681 | smp_2408 | 644 | 863 | 1024 | [('Piroxicam', 0.05, 'uM')] | Piroxicam | CVCL_0023 | lib_1681 | 50_030_183 | 0.101970 | -0.282297 | -0.165568 | G1 | full | A549 |
29337 | plate10 | 50_035_135-lib_1681 | smp_2408 | 1130 | 1570 | 1827 | [('Piroxicam', 0.05, 'uM')] | Piroxicam | CVCL_0023 | lib_1681 | 50_035_135 | 0.077070 | -0.335042 | -0.280220 | G1 | full | A549 |
29338 | plate10 | 50_035_171-lib_1681 | smp_2408 | 1058 | 1534 | 1809 | [('Piroxicam', 0.05, 'uM')] | Piroxicam | CVCL_0023 | lib_1681 | 50_035_171 | 0.124511 | -0.402028 | -0.404579 | G1 | full | A549 |
29352 | plate10 | 50_038_157-lib_1681 | smp_2408 | 1265 | 1883 | 2240 | [('Piroxicam', 0.05, 'uM')] | Piroxicam | CVCL_0023 | lib_1681 | 50_038_157 | 0.147106 | -0.455343 | -0.311355 | G1 | full | A549 |
29355 | plate10 | 50_039_078-lib_1681 | smp_2408 | 1355 | 1914 | 2258 | [('Piroxicam', 0.05, 'uM')] | Piroxicam | CVCL_0023 | lib_1681 | 50_039_078 | 0.070010 | -0.349396 | 0.186264 | G2M | full | A549 |
Retrieve the corresponding cells from h5ad files.
# Group the matching cell IDs by plate so that each plate's .h5ad can be
# sliced down to only the A549 + Piroxicam cells found in the obs metadata.
# NOTE: the filtered obs table is `obs_metadata_df` (built from the PyArrow scan above).
plate_cells = obs_metadata_df.groupby("plate")["BARCODE_SUB_LIB_ID"].apply(list)
adatas = []
for artifact in artifacts_a549_piroxicam:
    plate = artifact.features.get_values()["plate"]
    idxs = plate_cells.get(plate)
    print(f"Loading {len(idxs)} cells from plate {plate}")
    # open the backed store and pull only the selected cells into memory
    with artifact.open() as store:
        adata = store[idxs].to_memory()  # can also subset genes here
    adatas.append(adata)
scBaseCount¶
project_scbasecount = ln.Project.get(name="scBaseCount")
project_scbasecount
Project(uid='vdK00t9DGwHP', name='scBaseCount', is_type=False, url='https://arcinstitute.org/tools/virtualcellatlas', branch_id=1, space_id=1, created_by_id=1, created_at=2025-02-26 16:04:08 UTC)
This project has 105 collections (21 organisms x 5 count features):
project_scbasecount.collections.df()
Query artifacts of interest based on metadata¶
Often you might not want to access all the h5ads in a collection, but rather filter them by metadata:
organisms = bt.Organism.lookup()
tissues = bt.Tissue.lookup()
efos = bt.ExperimentalFactor.lookup()
feature_counts = ln.ULabel.filter(type__name="STARsolo count features").lookup()
h5ads_brain = ln.Artifact.filter(
suffix=".h5ad",
projects=project_scbasecount,
organisms=organisms.human,
ulabels=feature_counts.genefull_ex50pas,
tissues=tissues.brain,
experimental_factors=efos.single_cell,
experiments__name__contains="CRISPRi", # `perturbation` column is registered in `wetlab.Experiment`
).distinct()
h5ads_brain.df()
Load the h5ad files with obs metadata¶
Load the h5ads as a single AnnData:
# only load the first 5 artifacts to save CI time
adatas = [artifact.load() for artifact in h5ads_brain[:5]]
# the obs metadata is stored in the parquet files
adata_concat = ad.concat(adatas)
adata_concat
Open the sample metadata:
sample_meta = ln.Artifact.filter(
key__endswith="sample_metadata.parquet",
projects=project_scbasecount,
organisms=organisms.human,
ulabels=feature_counts.genefull_ex50pas,
).one()
sample_meta
Artifact(uid='WCHkcyWN8L6pDI4E0000', is_latest=True, key='2025-02-25/metadata/GeneFull_Ex50pAS/Homo_sapiens/sample_metadata.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=531878, hash='4QrqW8DQVRl6bKNYiJhq3g', n_observations=16077, branch_id=1, space_id=1, storage_id=3, run_id=2, created_by_id=1, created_at=2025-02-25 20:41:32 UTC)
sample_meta_dataset = sample_meta.open()
sample_meta_dataset.schema
Fetch corresponding sample metadata:
filter_expr = pc.field("srx_accession").isin(
adata_concat.obs["SRX_accession"].astype(str)
)
df = sample_meta_dataset.scanner(filter=filter_expr).to_table().to_pandas()
Add the sample metadata to the AnnData:
# Left-join so that every cell is kept, in its original order; `validate`
# raises if an accession is duplicated in the sample metadata, which would
# otherwise multiply rows and break the obs/X alignment.
obs_merged = adata_concat.obs.merge(
    df,
    how="left",
    left_on="SRX_accession",
    right_on="srx_accession",
    validate="many_to_one",
)
# A column-on-column merge discards the left index, so restore the original
# obs names (cell barcodes) before assigning back to the AnnData.
obs_merged.index = adata_concat.obs.index
adata_concat.obs = obs_merged
adata_concat
AnnData object with n_obs × n_vars = 38206 × 36601
obs: 'gene_count', 'umi_count', 'SRX_accession', 'entrez_id', 'srx_accession', 'file_path', 'obs_count', 'lib_prep', 'tech_10x', 'cell_prep', 'organism', 'tissue', 'disease', 'perturbation', 'cell_line', 'czi_collection_id', 'czi_collection_name'
adata_concat.obs.head()