EHR¶
In this guide, we’ll take a DataFrame storing exemplary EHR data, curate it, and save it as an annotated .parquet file.
The dataframe has the columns `disease`, `phenotype`, `developmental_stage`, and `age`.
If columns or values are missing, we standardize the dataframe with default values.
Any values that are present are mapped against specific versions of pre-defined ontologies.
# Install the dependencies first if they are missing:
# pip install 'lamindb[bionty]'
# IPython shell escape: initialize a LaminDB instance whose storage lives in
# ./test-ehrschema and which has the bionty module enabled.
!lamin init --storage ./test-ehrschema --modules bionty
import lamindb as ln
import bionty as bt
import pandas as pd
# Start tracking this notebook. The string is presumably the notebook's
# identifier in LaminDB — confirm against the lamindb documentation.
ln.track("2XEr2IA4n1w4")
Define a schema¶
Let us first define the ontology versions we want to use.
# Look up the ontology sources that the schema features will be pinned to.
# Diseases use "mondo" and developmental stages use "hsapdv"; in both cases the
# source that is flagged as currently used is selected.
disease_source = bt.Source.get(
    entity="bionty.Disease",
    name="mondo",
    currently_used=True,
)
developmental_stage_source = bt.Source.get(
    entity="bionty.DevelopmentalStage",
    name="hsapdv",
    currently_used=True,
)

# For phenotypes, switch the active source from "pato" to "hp": first clear the
# `currently_used` flag on the "pato" sources, then set it on the "hp" source.
# TODO (carried over from the original note): this manual toggle is meant to be
# replaced by `add_source`.
pato_sources = bt.Source.filter(entity="bionty.Phenotype", name="pato")
pato_sources.update(currently_used=False)
phenotype_source = bt.Source.get(entity="bionty.Phenotype", name="hp")
phenotype_source.currently_used = True
phenotype_source.save()
Source(uid='48fBFLmn', entity='bionty.Phenotype', organism='human', name='hp', version='2024-04-26', in_db=False, currently_used=True, description='Human Phenotype Ontology', url='https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-04-26/hp.owl', source_website='https://hpo.jax.org', branch_id=1, space_id=1, created_by_id=1, created_at=2025-07-14 06:40:02 UTC)
Let us now create a schema by defining the features that it measures. The ontology versions are captured via their `uid`.
# Build and save the features one by one, in the order the schema lists them.
# `age` is a plain integer column.
age_feature = ln.Feature(name="age", dtype=int).save()

# The three categorical features share the same shape: they must not contain
# nulls, declare a default value, and restrict valid categories to one pinned
# ontology source via its `uid`.
disease_feature = ln.Feature(
    name="disease",
    dtype=bt.Disease,
    default_value="normal",
    nullable=False,
    cat_filters={"source__uid": disease_source.uid},
).save()
developmental_stage_feature = ln.Feature(
    name="developmental_stage",
    dtype=bt.DevelopmentalStage,
    default_value="unknown",
    nullable=False,
    cat_filters={"source__uid": developmental_stage_source.uid},
).save()
phenotype_feature = ln.Feature(
    name="phenotype",
    dtype=bt.Phenotype,
    default_value="unknown",
    nullable=False,
    cat_filters={"source__uid": phenotype_source.uid},
).save()

# Combine the saved features into the schema and save it.
schema = ln.Schema(
    name="My EHR schema",
    features=[
        age_feature,
        disease_feature,
        developmental_stage_feature,
        phenotype_feature,
    ],
).save()

# Show the features that belong to the schema as a dataframe.
schema.features.df()
Curate an example dataset¶
Create an example `DataFrame` that has all required columns, one of which is misnamed.
# Example EHR records, one row per patient. Three things are wrong on purpose
# so that the curation steps have something to fix: the third `disease` entry
# is missing, several labels are not ontology terms, and the age column is
# called `patient_age` instead of `age`.
dataset = dict(
    disease=pd.Categorical(
        ["Alzheimer disease", "diabetes mellitus", pd.NA, "Hypertension", "asthma"]
    ),
    phenotype=pd.Categorical(
        [
            "Mental deterioration",
            "Hyperglycemia",
            "Tumor growth",
            "Increased blood pressure",
            "Airway inflammation",
        ]
    ),
    developmental_stage=pd.Categorical(["Adult", "Adult", "Adult", "Adult", "Child"]),
    patient_age=[70, 55, 60, 65, 12],
)
df = pd.DataFrame(data=dataset)
df
Let’s validate it.
# Bind the dataframe to the schema. The same curator object is reused by the
# following cells.
curator = ln.curators.DataFrameCurator(df, schema)
try:
    curator.validate()
except ln.errors.ValidationError as e:
    # Expected failure: the schema requires an `age` column, but the dataframe
    # calls it `patient_age`.
    assert "column 'age' not in dataframe" in str(e)
    print(e)
Fix the name of the `patient_age` column to be `age`.
# Rename the column by assigning to `df.columns`, which modifies `df` itself;
# the existing curator is reused, so it presumably still refers to this same
# dataframe object.
df.columns = df.columns.str.replace("patient_age", "age")
try:
    curator.validate()
except ln.errors.ValidationError as e:
    # Expected failure: `disease` is declared non-nullable but still contains
    # the missing entry from the example data.
    assert "non-nullable series 'disease' contains null values" in str(e)
    print(e)
Standardize the dataframe so that the missing value gets populated with the default value.
# Fill missing values with the defaults declared on the features, then
# validate again.
curator.standardize()
try:
    curator.validate()
except ln.errors.ValidationError as e:
    # Expected failure: some category values (including the freshly inserted
    # default "normal") are not known terms in the pinned ontology sources.
    # The message is only printed, not asserted on.
    print(e)
! 2 terms not validated in feature 'disease': 'normal', 'Hypertension'
→ fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('disease')
! 2 terms not validated in feature 'developmental_stage': 'Adult', 'Child'
→ fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('developmental_stage')
! 2 terms not validated in feature 'phenotype': 'Tumor growth', 'Airway inflammation'
→ fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('phenotype')
2 terms not validated in feature 'phenotype': 'Tumor growth', 'Airway inflammation'
→ fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('phenotype')
Add the ‘normal’ term to the disease registry.
# Save a custom "normal" Disease record so that the default value of the
# `disease` feature, reported above as not validated, can be validated.
bt.Disease(name="normal", description="Healthy condition").save()
Disease(uid='7kTPatVd', name='normal', description='Healthy condition', branch_id=1, space_id=1, created_by_id=1, run_id=1, created_at=2025-07-14 06:40:10 UTC)
Curate the remaining mismatches manually.
# Lookup objects over the public ontologies; their attributes are used below to
# pick the ontology term names.
diseases = bt.Disease.public().lookup()
phenotypes = bt.Phenotype.public().lookup()
developmental_stages = bt.DevelopmentalStage.public().lookup()

# Map each free-text label that failed validation to the name of an ontology
# term.
disease_renames = {"Hypertension": diseases.hypertensive_disorder.name}
phenotype_renames = {
    "Tumor growth": phenotypes.neoplasm.name,
    "Airway inflammation": phenotypes.bronchitis.name,
}
# NOTE(review): "Adult" is mapped to the *adolescent* stage term, which looks
# inconsistent with the example ages (55-70). Presumably an adult-stage term was
# intended — confirm which attribute the lookup exposes before changing it.
developmental_stage_renames = {
    "Adult": developmental_stages.adolescent_stage.name,
    "Child": developmental_stages.child_stage.name,
}

# Rename the categories column by column, writing each result back into `df`.
df["disease"] = df["disease"].cat.rename_categories(disease_renames)
df["phenotype"] = df["phenotype"].cat.rename_categories(phenotype_renames)
df["developmental_stage"] = df["developmental_stage"].cat.rename_categories(
    developmental_stage_renames
)

# Final validation, this time without a try/except around it.
curator.validate()