From 6fabc98fba54027a96ca79f295f64aa620e02320 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 9 Sep 2024 10:50:32 +0100
Subject: [PATCH 01/22] Initial commit of biosample index

---
 .../assets/schemas/biosample_index.json       |  94 ++++
 src/gentropy/dataset/biosample_index.py       | 406 ++++++++++++++++++
 .../datasource/cell_ontology/__init__.py      |   3 +
 .../cell_ontology/biosample_index.py          |  65 +++
 4 files changed, 568 insertions(+)
 create mode 100644 src/gentropy/assets/schemas/biosample_index.json
 create mode 100644 src/gentropy/dataset/biosample_index.py
 create mode 100644 src/gentropy/datasource/cell_ontology/__init__.py
 create mode 100644 src/gentropy/datasource/cell_ontology/biosample_index.py

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
new file mode 100644
index 000000000..5ef3f02c3
--- /dev/null
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -0,0 +1,94 @@
+{
+  "type": "struct",
+  "fields": [
+    {
+      "name": "id",
+      "type": "string",
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "name",
+      "type": "string",
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "dbXRefs",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "description",
+      "type": "string",
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "parents",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "synonyms",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "ancestors",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "descendants",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "children",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "ontology",
+      "type": {
+        "type": "map",
+        "keyType": "string",
+        "valueType": "boolean",
+        "valueContainsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    }
+  ]
+}
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
new file mode 100644
index 000000000..e8c787ed6
--- /dev/null
+++ b/src/gentropy/dataset/biosample_index.py
@@ -0,0 +1,406 @@
+"""Study index dataset."""
+
+from __future__ import annotations
+
+import importlib.resources as pkg_resources
+import json
+from dataclasses import dataclass
+from enum import Enum
+from itertools import chain
+from typing import TYPE_CHECKING
+
+from pyspark.sql import functions as f
+from pyspark.sql.window import Window
+
+from gentropy.assets import data
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql import Column, DataFrame
+    from pyspark.sql.types import StructType
+
+    from gentropy.dataset.gene_index import GeneIndex
+
+
+class StudyQualityCheck(Enum):
+    """Study quality control options listing concerns on the quality of the study.
+
+    Attributes:
+        UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target.
+        UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease
+        UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported.
+        DUPLICATED_STUDY (str): Flagging if a study identifier is not unique.
+        NO_GENE_PROVIDED (str): Flagging QTL studies if the measured
+    """
+
+    UNRESOLVED_TARGET = "Target/gene identifier could not match to reference."
+    UNRESOLVED_DISEASE = "No valid disease identifier found."
+    UNKNOWN_STUDY_TYPE = "This type of study is not supported."
+    DUPLICATED_STUDY = "The identifier of this study is not unique."
+    NO_GENE_PROVIDED = "QTL study doesn't have gene assigned."
+
+
+@dataclass
+class StudyIndex(Dataset):
+    """Study index dataset.
+
+    A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
+    """
+
+    @staticmethod
+    def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column:
+        """Aggregate sample counts by ancestry in a list of struct colmns.
+
+        Args:
+            merged (Column): A column representing merged data (list of structs).
+            ancestry (Column): The `ancestry` parameter is a column that represents the ancestry of each
+                sample. (a struct)
+
+        Returns:
+            Column: the modified "merged" column after aggregating the samples by ancestry.
+        """
+        # Iterating over the list of ancestries and adding the sample size if label matches:
+        return f.transform(
+            merged,
+            lambda a: f.when(
+                a.ancestry == ancestry.ancestry,
+                f.struct(
+                    a.ancestry.alias("ancestry"),
+                    (a.sampleSize + ancestry.sampleSize).alias("sampleSize"),
+                ),
+            ).otherwise(a),
+        )
+
+    @staticmethod
+    def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column:
+        """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map.
+
+        This function assumes all possible ancestry categories have a corresponding
+        LD panel in the LD index. It is very important to have the ancestry labels
+        moved to the LD panel map.
+
+        Args:
+            gwas_ancestry_label (Column): A struct column with ancestry label like Finnish,
+                European, African etc. and the corresponding sample size.
+
+        Returns:
+            Column: Struct column with the mapped LD population label and the sample size.
+        """
+        # Loading ancestry label to LD population label:
+        json_dict = json.loads(
+            pkg_resources.read_text(
+                data, "gwas_population_2_LD_panel_map.json", encoding="utf-8"
+            )
+        )
+        map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])
+
+        return f.struct(
+            map_expr[gwas_ancestry_label.ancestry].alias("ancestry"),
+            gwas_ancestry_label.sampleSize.alias("sampleSize"),
+        )
+
+    @classmethod
+    def get_schema(cls: type[StudyIndex]) -> StructType:
+        """Provide the schema for the StudyIndex dataset.
+
+        Returns:
+            StructType: The schema of the StudyIndex dataset.
+        """
+        return parse_spark_schema("study_index.json")
+
+    @classmethod
+    def aggregate_and_map_ancestries(
+        cls: type[StudyIndex], discovery_samples: Column
+    ) -> Column:
+        """Map ancestries to populations in the LD reference and calculate relative sample size.
+
+        Args:
+            discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns
+
+        Returns:
+            Column: A list of struct with mapped LD population and their relative sample size.
+        """
+        # Map ancestry categories to population labels of the LD index:
+        mapped_ancestries = f.transform(
+            discovery_samples, cls._map_ancestries_to_ld_population
+        )
+
+        # Aggregate sample sizes belonging to the same LD population:
+        aggregated_counts = f.aggregate(
+            mapped_ancestries,
+            f.array_distinct(
+                f.transform(
+                    mapped_ancestries,
+                    lambda x: f.struct(
+                        x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
+                    ),
+                )
+            ),
+            cls._aggregate_samples_by_ancestry,
+        )
+        # Getting total sample count:
+        total_sample_count = f.aggregate(
+            aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize
+        ).alias("sampleSize")
+
+        # Calculating relative sample size for each LD population:
+        return f.transform(
+            aggregated_counts,
+            lambda ld_population: f.struct(
+                ld_population.ancestry.alias("ldPopulation"),
+                (ld_population.sampleSize / total_sample_count).alias(
+                    "relativeSampleSize"
+                ),
+            ),
+        )
+
+    def study_type_lut(self: StudyIndex) -> DataFrame:
+        """Return a lookup table of study type.
+
+        Returns:
+            DataFrame: A dataframe containing `studyId` and `studyType` columns.
+        """
+        return self.df.select("studyId", "studyType")
+
+    def is_qtl(self: StudyIndex) -> Column:
+        """Return a boolean column with true values for QTL studies.
+
+        Returns:
+            Column: True if the study is a QTL study.
+        """
+        return self.df.studyType.endswith("qtl")
+
+    def is_gwas(self: StudyIndex) -> Column:
+        """Return a boolean column with true values for GWAS studies.
+
+        Returns:
+            Column: True if the study is a GWAS study.
+        """
+        return self.df.studyType == "gwas"
+
+    def has_mapped_trait(self: StudyIndex) -> Column:
+        """Return a boolean column indicating if a study has mapped disease.
+
+        Returns:
+            Column: True if the study has mapped disease.
+        """
+        return f.size(self.df.traitFromSourceMappedIds) > 0
+
+    def is_quality_flagged(self: StudyIndex) -> Column:
+        """Return a boolean column indicating if a study is flagged due to quality issues.
+
+        Returns:
+            Column: True if the study is flagged.
+        """
+        # Testing for the presence of the qualityControls column:
+        if "qualityControls" not in self.df.columns:
+            return f.lit(False)
+        else:
+            return f.size(self.df.qualityControls) != 0
+
+    def has_summarystats(self: StudyIndex) -> Column:
+        """Return a boolean column indicating if a study has harmonized summary statistics.
+
+        Returns:
+            Column: True if the study has harmonized summary statistics.
+        """
+        return self.df.hasSumstats
+
+    def validate_unique_study_id(self: StudyIndex) -> StudyIndex:
+        """Validating the uniqueness of study identifiers and flagging duplicated studies.
+
+        Returns:
+            StudyIndex: with flagged duplicated studies.
+        """
+        validated_df = (
+            self.df.withColumn(
+                "isDuplicated",
+                f.when(
+                    f.count("studyType").over(
+                        Window.partitionBy("studyId").rowsBetween(
+                            Window.unboundedPreceding, Window.unboundedFollowing
+                        )
+                    )
+                    > 1,
+                    True,
+                ).otherwise(False),
+            )
+            .withColumn(
+                "qualityControls",
+                StudyIndex.update_quality_flag(
+                    f.col("qualityControls"),
+                    f.col("isDuplicated"),
+                    StudyQualityCheck.DUPLICATED_STUDY,
+                ),
+            )
+            .drop("isDuplicated")
+        )
+        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+
+    def _normalise_disease(
+        self: StudyIndex,
+        source_disease_column_name: str,
+        disease_column_name: str,
+        disease_map: DataFrame,
+    ) -> DataFrame:
+        """Normalising diseases in the study index.
+
+        Given a reference disease map (containing all potential EFO ids with the corresponding reference disease ids),
+        this function maps all EFO ids in the study index to the reference disease ids.
+
+        Args:
+            source_disease_column_name (str): The column name of the disease column to validate.
+            disease_column_name (str): The resulting disease column name that contains the validated ids.
+            disease_map (DataFrame): Reference dataframe with diseases
+
+        Returns:
+            DataFrame: where the newly added diseaseIds column will contain the validated EFO identifiers.
+        """
+        return (
+            self.df
+            # Only validating studies with diseases:
+            .filter(f.size(f.col(source_disease_column_name)) > 0)
+            # Explode disease column:
+            .select(
+                "studyId",
+                "studyType",
+                f.explode_outer(source_disease_column_name).alias("efo"),
+            )
+            # Join disease map:
+            .join(disease_map, on="efo", how="left")
+            .groupBy("studyId")
+            .agg(
+                f.collect_set(f.col("diseaseId")).alias(disease_column_name),
+            )
+        )
+
+    def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex:
+        """Validate diseases in the study index dataset.
+
+        Args:
+            disease_map (DataFrame): a dataframe with two columns (efo, diseaseId).
+
+        Returns:
+            StudyIndex: where gwas studies are flagged where no valid disease id could be found.
+        """
+        # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present:
+        if ("traitFromSourceMappedIds" not in self.df.columns) or (
+            "backgroundTraitFromSourceMappedIds" not in self.df.columns
+        ):
+            return self
+
+        # Disease Column names:
+        foreground_disease_column = "diseaseIds"
+        background_disease_column = "backgroundDiseaseIds"
+
+        # If diseaseId in schema, we need to drop it:
+        drop_columns = [
+            column
+            for column in self.df.columns
+            if column in [foreground_disease_column, background_disease_column]
+        ]
+
+        if len(drop_columns) > 0:
+            self.df = self.df.drop(*drop_columns)
+
+        # Normalise disease:
+        normalised_disease = self._normalise_disease(
+            "traitFromSourceMappedIds", foreground_disease_column, disease_map
+        )
+        normalised_background_disease = self._normalise_disease(
+            "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map
+        )
+
+        return StudyIndex(
+            _df=(
+                self.df.join(normalised_disease, on="studyId", how="left")
+                .join(normalised_background_disease, on="studyId", how="left")
+                # Updating disease columns:
+                .withColumn(
+                    foreground_disease_column,
+                    f.when(
+                        f.col(foreground_disease_column).isNull(), f.array()
+                    ).otherwise(f.col(foreground_disease_column)),
+                )
+                .withColumn(
+                    background_disease_column,
+                    f.when(
+                        f.col(background_disease_column).isNull(), f.array()
+                    ).otherwise(f.col(background_disease_column)),
+                )
+                # Flagging gwas studies where no valid disease is avilable:
+                .withColumn(
+                    "qualityControls",
+                    StudyIndex.update_quality_flag(
+                        f.col("qualityControls"),
+                        # Flagging all gwas studies with no normalised disease:
+                        (f.size(f.col(foreground_disease_column)) == 0)
+                        & (f.col("studyType") == "gwas"),
+                        StudyQualityCheck.UNRESOLVED_DISEASE,
+                    ),
+                )
+            ),
+            _schema=StudyIndex.get_schema(),
+        )
+
+    def validate_study_type(self: StudyIndex) -> StudyIndex:
+        """Validating study type and flag unsupported types.
+
+        Returns:
+            StudyIndex: with flagged studies with unsupported type.
+        """
+        validated_df = (
+            self.df
+            # Flagging unsupported study types:
+            .withColumn(
+                "qualityControls",
+                StudyIndex.update_quality_flag(
+                    f.col("qualityControls"),
+                    f.when(
+                        (f.col("studyType") == "gwas")
+                        | f.col("studyType").endswith("qtl"),
+                        False,
+                    ).otherwise(True),
+                    StudyQualityCheck.UNKNOWN_STUDY_TYPE,
+                ),
+            )
+        )
+        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+
+    def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex:
+        """Validating gene identifiers in the study index against the provided target index.
+
+        Args:
+            target_index (GeneIndex): gene index containing the reference gene identifiers (Ensembl gene identifiers).
+
+        Returns:
+            StudyIndex: with flagged studies if geneId could not be validated.
+        """
+        gene_set = target_index.df.select("geneId", f.lit(True).alias("isIdFound"))
+
+        # As the geneId is not a mandatory field of study index, we return if the column is not there:
+        if "geneId" not in self.df.columns:
+            return self
+
+        validated_df = (
+            self.df.join(gene_set, on="geneId", how="left")
+            .withColumn(
+                "isIdFound",
+                f.when(
+                    (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(),
+                    f.lit(False),
+                ).otherwise(f.lit(True)),
+            )
+            .withColumn(
+                "qualityControls",
+                StudyIndex.update_quality_flag(
+                    f.col("qualityControls"),
+                    ~f.col("isIdFound"),
+                    StudyQualityCheck.UNRESOLVED_TARGET,
+                ),
+            )
+            .drop("isIdFound")
+        )
+
+        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
diff --git a/src/gentropy/datasource/cell_ontology/__init__.py b/src/gentropy/datasource/cell_ontology/__init__.py
new file mode 100644
index 000000000..c9f3e2075
--- /dev/null
+++ b/src/gentropy/datasource/cell_ontology/__init__.py
@@ -0,0 +1,3 @@
+"""Cell ontology datasource classes."""
+
+from __future__ import annotations
diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/cell_ontology/biosample_index.py
new file mode 100644
index 000000000..c96bf89a1
--- /dev/null
+++ b/src/gentropy/datasource/cell_ontology/biosample_index.py
@@ -0,0 +1,65 @@
+"""Biosample index for Cell Ontology data source."""
+
+from __future__ import annotations
+
+from itertools import chain
+from typing import TYPE_CHECKING
+
+import pandas as pd
+import pyspark.sql.functions as f
+from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+import owlready2 as owl
+
+from gentropy.common.session import Session
+from gentropy.dataset.study_index import StudyIndex
+
+if TYPE_CHECKING:
+    from pyspark.sql import DataFrame
+    from pyspark.sql.column import Column
+
+class CellOntologyStudyIndex:
+    """Study index dataset from Cell Ontology.
+    
+    Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the study index dataset.
+
+    """"
+
+    # Define the schema explicitly for the DataFrame
+    raw_biosample_schema: StructType = StructType(
+        [
+            StructField("id", StringType(), True),
+            StructField("code", StringType(), True),
+            StructField("name", StringType(), True),
+            StructField("dbXRefs", ArrayType(StringType()), True),
+            StructField("description", StringType(), True),
+            StructField("parents", ArrayType(StringType()), True),
+            StructField("synonyms", ArrayType(StringType()), True),
+            StructField("ancestors", ArrayType(StringType()), True),
+            StructField("descendants", ArrayType(StringType()), True),
+            StructField("children", ArrayType(StringType()), True),
+            StructField("ontology", MapType(StringType(), BooleanType()), True)
+        ]
+    )
+    raw_biosample_path = "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl.owl" # Dummy path for now
+
+    @classmethod
+    def extract_celltypes_from_source(
+        cls: type[CellOntologyStudyIndex],
+        session: Session,
+        mqtl_quantification_methods_blacklist: list[str],
+    ) -> DataFrame:
+        """Read raw studies metadata from eQTL Catalogue.
+
+        Args:
+            session (Session): Spark session.
+            mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv
+
+        Returns:
+            DataFrame: raw studies metadata.
+        """
+        pd.DataFrame.iteritems = pd.DataFrame.items
+        return session.spark.createDataFrame(
+            pd.read_csv(cls.raw_studies_metadata_path, sep="\t"),
+            schema=cls.raw_studies_metadata_schema,
+        ).filter(~(f.col("quant_method").isin(mqtl_quantification_methods_blacklist)))

From e8a3775fe38c60646065886bfc7cceae3c490d1c Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 9 Sep 2024 11:08:48 +0100
Subject: [PATCH 02/22] Make minimal class

---
 src/gentropy/dataset/biosample_index.py | 374 +-----------------------
 1 file changed, 3 insertions(+), 371 deletions(-)

diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index e8c787ed6..2e445e843 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -20,26 +20,6 @@
     from pyspark.sql import Column, DataFrame
     from pyspark.sql.types import StructType
 
-    from gentropy.dataset.gene_index import GeneIndex
-
-
-class StudyQualityCheck(Enum):
-    """Study quality control options listing concerns on the quality of the study.
-
-    Attributes:
-        UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target.
-        UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease
-        UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported.
-        DUPLICATED_STUDY (str): Flagging if a study identifier is not unique.
-        NO_GENE_PROVIDED (str): Flagging QTL studies if the measured
-    """
-
-    UNRESOLVED_TARGET = "Target/gene identifier could not match to reference."
-    UNRESOLVED_DISEASE = "No valid disease identifier found."
-    UNKNOWN_STUDY_TYPE = "This type of study is not supported."
-    DUPLICATED_STUDY = "The identifier of this study is not unique."
-    NO_GENE_PROVIDED = "QTL study doesn't have gene assigned."
-
 
 @dataclass
 class StudyIndex(Dataset):
@@ -48,359 +28,11 @@ class StudyIndex(Dataset):
     A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
     """
 
-    @staticmethod
-    def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column:
-        """Aggregate sample counts by ancestry in a list of struct colmns.
-
-        Args:
-            merged (Column): A column representing merged data (list of structs).
-            ancestry (Column): The `ancestry` parameter is a column that represents the ancestry of each
-                sample. (a struct)
-
-        Returns:
-            Column: the modified "merged" column after aggregating the samples by ancestry.
-        """
-        # Iterating over the list of ancestries and adding the sample size if label matches:
-        return f.transform(
-            merged,
-            lambda a: f.when(
-                a.ancestry == ancestry.ancestry,
-                f.struct(
-                    a.ancestry.alias("ancestry"),
-                    (a.sampleSize + ancestry.sampleSize).alias("sampleSize"),
-                ),
-            ).otherwise(a),
-        )
-
-    @staticmethod
-    def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column:
-        """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map.
-
-        This function assumes all possible ancestry categories have a corresponding
-        LD panel in the LD index. It is very important to have the ancestry labels
-        moved to the LD panel map.
-
-        Args:
-            gwas_ancestry_label (Column): A struct column with ancestry label like Finnish,
-                European, African etc. and the corresponding sample size.
-
-        Returns:
-            Column: Struct column with the mapped LD population label and the sample size.
-        """
-        # Loading ancestry label to LD population label:
-        json_dict = json.loads(
-            pkg_resources.read_text(
-                data, "gwas_population_2_LD_panel_map.json", encoding="utf-8"
-            )
-        )
-        map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])
-
-        return f.struct(
-            map_expr[gwas_ancestry_label.ancestry].alias("ancestry"),
-            gwas_ancestry_label.sampleSize.alias("sampleSize"),
-        )
-
     @classmethod
     def get_schema(cls: type[StudyIndex]) -> StructType:
-        """Provide the schema for the StudyIndex dataset.
-
-        Returns:
-            StructType: The schema of the StudyIndex dataset.
-        """
-        return parse_spark_schema("study_index.json")
-
-    @classmethod
-    def aggregate_and_map_ancestries(
-        cls: type[StudyIndex], discovery_samples: Column
-    ) -> Column:
-        """Map ancestries to populations in the LD reference and calculate relative sample size.
-
-        Args:
-            discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns
-
-        Returns:
-            Column: A list of struct with mapped LD population and their relative sample size.
-        """
-        # Map ancestry categories to population labels of the LD index:
-        mapped_ancestries = f.transform(
-            discovery_samples, cls._map_ancestries_to_ld_population
-        )
-
-        # Aggregate sample sizes belonging to the same LD population:
-        aggregated_counts = f.aggregate(
-            mapped_ancestries,
-            f.array_distinct(
-                f.transform(
-                    mapped_ancestries,
-                    lambda x: f.struct(
-                        x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
-                    ),
-                )
-            ),
-            cls._aggregate_samples_by_ancestry,
-        )
-        # Getting total sample count:
-        total_sample_count = f.aggregate(
-            aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize
-        ).alias("sampleSize")
-
-        # Calculating relative sample size for each LD population:
-        return f.transform(
-            aggregated_counts,
-            lambda ld_population: f.struct(
-                ld_population.ancestry.alias("ldPopulation"),
-                (ld_population.sampleSize / total_sample_count).alias(
-                    "relativeSampleSize"
-                ),
-            ),
-        )
-
-    def study_type_lut(self: StudyIndex) -> DataFrame:
-        """Return a lookup table of study type.
-
-        Returns:
-            DataFrame: A dataframe containing `studyId` and `studyType` columns.
-        """
-        return self.df.select("studyId", "studyType")
-
-    def is_qtl(self: StudyIndex) -> Column:
-        """Return a boolean column with true values for QTL studies.
-
-        Returns:
-            Column: True if the study is a QTL study.
-        """
-        return self.df.studyType.endswith("qtl")
-
-    def is_gwas(self: StudyIndex) -> Column:
-        """Return a boolean column with true values for GWAS studies.
-
-        Returns:
-            Column: True if the study is a GWAS study.
-        """
-        return self.df.studyType == "gwas"
-
-    def has_mapped_trait(self: StudyIndex) -> Column:
-        """Return a boolean column indicating if a study has mapped disease.
-
-        Returns:
-            Column: True if the study has mapped disease.
-        """
-        return f.size(self.df.traitFromSourceMappedIds) > 0
-
-    def is_quality_flagged(self: StudyIndex) -> Column:
-        """Return a boolean column indicating if a study is flagged due to quality issues.
+        """Provide the schema for the BioSampleIndex dataset.
 
         Returns:
-            Column: True if the study is flagged.
+            StructType: The schema of the BioSampleIndex dataset.
         """
-        # Testing for the presence of the qualityControls column:
-        if "qualityControls" not in self.df.columns:
-            return f.lit(False)
-        else:
-            return f.size(self.df.qualityControls) != 0
-
-    def has_summarystats(self: StudyIndex) -> Column:
-        """Return a boolean column indicating if a study has harmonized summary statistics.
-
-        Returns:
-            Column: True if the study has harmonized summary statistics.
-        """
-        return self.df.hasSumstats
-
-    def validate_unique_study_id(self: StudyIndex) -> StudyIndex:
-        """Validating the uniqueness of study identifiers and flagging duplicated studies.
-
-        Returns:
-            StudyIndex: with flagged duplicated studies.
-        """
-        validated_df = (
-            self.df.withColumn(
-                "isDuplicated",
-                f.when(
-                    f.count("studyType").over(
-                        Window.partitionBy("studyId").rowsBetween(
-                            Window.unboundedPreceding, Window.unboundedFollowing
-                        )
-                    )
-                    > 1,
-                    True,
-                ).otherwise(False),
-            )
-            .withColumn(
-                "qualityControls",
-                StudyIndex.update_quality_flag(
-                    f.col("qualityControls"),
-                    f.col("isDuplicated"),
-                    StudyQualityCheck.DUPLICATED_STUDY,
-                ),
-            )
-            .drop("isDuplicated")
-        )
-        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
-
-    def _normalise_disease(
-        self: StudyIndex,
-        source_disease_column_name: str,
-        disease_column_name: str,
-        disease_map: DataFrame,
-    ) -> DataFrame:
-        """Normalising diseases in the study index.
-
-        Given a reference disease map (containing all potential EFO ids with the corresponding reference disease ids),
-        this function maps all EFO ids in the study index to the reference disease ids.
-
-        Args:
-            source_disease_column_name (str): The column name of the disease column to validate.
-            disease_column_name (str): The resulting disease column name that contains the validated ids.
-            disease_map (DataFrame): Reference dataframe with diseases
-
-        Returns:
-            DataFrame: where the newly added diseaseIds column will contain the validated EFO identifiers.
-        """
-        return (
-            self.df
-            # Only validating studies with diseases:
-            .filter(f.size(f.col(source_disease_column_name)) > 0)
-            # Explode disease column:
-            .select(
-                "studyId",
-                "studyType",
-                f.explode_outer(source_disease_column_name).alias("efo"),
-            )
-            # Join disease map:
-            .join(disease_map, on="efo", how="left")
-            .groupBy("studyId")
-            .agg(
-                f.collect_set(f.col("diseaseId")).alias(disease_column_name),
-            )
-        )
-
-    def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex:
-        """Validate diseases in the study index dataset.
-
-        Args:
-            disease_map (DataFrame): a dataframe with two columns (efo, diseaseId).
-
-        Returns:
-            StudyIndex: where gwas studies are flagged where no valid disease id could be found.
-        """
-        # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present:
-        if ("traitFromSourceMappedIds" not in self.df.columns) or (
-            "backgroundTraitFromSourceMappedIds" not in self.df.columns
-        ):
-            return self
-
-        # Disease Column names:
-        foreground_disease_column = "diseaseIds"
-        background_disease_column = "backgroundDiseaseIds"
-
-        # If diseaseId in schema, we need to drop it:
-        drop_columns = [
-            column
-            for column in self.df.columns
-            if column in [foreground_disease_column, background_disease_column]
-        ]
-
-        if len(drop_columns) > 0:
-            self.df = self.df.drop(*drop_columns)
-
-        # Normalise disease:
-        normalised_disease = self._normalise_disease(
-            "traitFromSourceMappedIds", foreground_disease_column, disease_map
-        )
-        normalised_background_disease = self._normalise_disease(
-            "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map
-        )
-
-        return StudyIndex(
-            _df=(
-                self.df.join(normalised_disease, on="studyId", how="left")
-                .join(normalised_background_disease, on="studyId", how="left")
-                # Updating disease columns:
-                .withColumn(
-                    foreground_disease_column,
-                    f.when(
-                        f.col(foreground_disease_column).isNull(), f.array()
-                    ).otherwise(f.col(foreground_disease_column)),
-                )
-                .withColumn(
-                    background_disease_column,
-                    f.when(
-                        f.col(background_disease_column).isNull(), f.array()
-                    ).otherwise(f.col(background_disease_column)),
-                )
-                # Flagging gwas studies where no valid disease is avilable:
-                .withColumn(
-                    "qualityControls",
-                    StudyIndex.update_quality_flag(
-                        f.col("qualityControls"),
-                        # Flagging all gwas studies with no normalised disease:
-                        (f.size(f.col(foreground_disease_column)) == 0)
-                        & (f.col("studyType") == "gwas"),
-                        StudyQualityCheck.UNRESOLVED_DISEASE,
-                    ),
-                )
-            ),
-            _schema=StudyIndex.get_schema(),
-        )
-
-    def validate_study_type(self: StudyIndex) -> StudyIndex:
-        """Validating study type and flag unsupported types.
-
-        Returns:
-            StudyIndex: with flagged studies with unsupported type.
-        """
-        validated_df = (
-            self.df
-            # Flagging unsupported study types:
-            .withColumn(
-                "qualityControls",
-                StudyIndex.update_quality_flag(
-                    f.col("qualityControls"),
-                    f.when(
-                        (f.col("studyType") == "gwas")
-                        | f.col("studyType").endswith("qtl"),
-                        False,
-                    ).otherwise(True),
-                    StudyQualityCheck.UNKNOWN_STUDY_TYPE,
-                ),
-            )
-        )
-        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
-
-    def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex:
-        """Validating gene identifiers in the study index against the provided target index.
-
-        Args:
-            target_index (GeneIndex): gene index containing the reference gene identifiers (Ensembl gene identifiers).
-
-        Returns:
-            StudyIndex: with flagged studies if geneId could not be validated.
-        """
-        gene_set = target_index.df.select("geneId", f.lit(True).alias("isIdFound"))
-
-        # As the geneId is not a mandatory field of study index, we return if the column is not there:
-        if "geneId" not in self.df.columns:
-            return self
-
-        validated_df = (
-            self.df.join(gene_set, on="geneId", how="left")
-            .withColumn(
-                "isIdFound",
-                f.when(
-                    (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(),
-                    f.lit(False),
-                ).otherwise(f.lit(True)),
-            )
-            .withColumn(
-                "qualityControls",
-                StudyIndex.update_quality_flag(
-                    f.col("qualityControls"),
-                    ~f.col("isIdFound"),
-                    StudyQualityCheck.UNRESOLVED_TARGET,
-                ),
-            )
-            .drop("isIdFound")
-        )
-
-        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+        return parse_spark_schema("biosample_index.json")

From c4d6d5feac2e0f21deaca89199c34a4eb736f537 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 9 Sep 2024 15:07:36 +0100
Subject: [PATCH 03/22] Tidy up first draft of adding biosample index

---
 .../assets/schemas/biosample_index.json       |   4 +-
 src/gentropy/dataset/biosample_index.py       | 103 +++++++++++++++++-
 .../cell_ontology/biosample_index.py          |  49 +++------
 src/gentropy/datasource/uberon/__init__.py    |   3 +
 .../datasource/uberon/biosample_index.py      |  48 ++++++++
 5 files changed, 166 insertions(+), 41 deletions(-)
 create mode 100644 src/gentropy/datasource/uberon/__init__.py
 create mode 100644 src/gentropy/datasource/uberon/biosample_index.py

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index 5ef3f02c3..27c4a508f 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -2,13 +2,13 @@
   "type": "struct",
   "fields": [
     {
-      "name": "id",
+      "name": "biosampleIndex",
       "type": "string",
       "nullable": true,
       "metadata": {}
     },
     {
-      "name": "name",
+      "name": "biosampleName",
       "type": "string",
       "nullable": true,
       "metadata": {}
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 2e445e843..4646a7774 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -1,4 +1,4 @@
-"""Study index dataset."""
+"""Biosample index dataset."""
 
 from __future__ import annotations
 
@@ -22,17 +22,108 @@
 
 
 @dataclass
-class StudyIndex(Dataset):
-    """Study index dataset.
+class BiosampleIndex(Dataset):
+    """Biosample index dataset.
 
-    A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
+    A Biosample index dataset captures the metadata of the biosamples (e.g. tissues, cell types, cell lines, etc) such as alternate names and relationships with other biosamples.
     """
 
     @classmethod
     def get_schema(cls: type[StudyIndex]) -> StructType:
-        """Provide the schema for the BioSampleIndex dataset.
+        """Provide the schema for the BiosampleIndex dataset.
 
         Returns:
-            StructType: The schema of the BioSampleIndex dataset.
+            StructType: The schema of the BiosampleIndex dataset.
         """
         return parse_spark_schema("biosample_index.json")
+
+
+def extract_ontology_info(
+    ontology : owlready2.namespace.Ontology,
+    prefix : str,
+    session : Session,
+    schema : StructType = BiosampleIndex.get_schema(),
+) -> BiosampleIndex:
+    """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
+
+    Args:
+        ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon.
+        prefix (str): Prefix for the desired ontology terms.
+        session (Session): Spark session.
+
+    Returns:
+        BiosampleIndex: Parsed and annotated biosample index table.
+    """
+
+    # Iterate over all classes in the ontology
+    for cls in ont.classes():
+        if cls.name.startswith(prefix):
+            # Basic class information
+            cls_id = cls.name
+            # cls_code = cls.iri
+            cls_name = cls.label[0] if cls.label else None
+
+            # Extract descriptions
+            description = None
+            if hasattr(cls, 'IAO_0000115'):
+                description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
+
+            # Extract dbXRefs
+            dbXRefs = [x for x in cls.hasDbXref] if hasattr(cls, 'hasDbXref') else []
+
+            # Parent classes
+            parents = []
+            for parent in cls.is_a:
+                if parent is owl.Thing: 
+                    continue  # Skip owlready2 Thing class, which is a top-level class
+                elif hasattr(parent, 'name'):
+                    parent_id = parent.name
+                    parents.append(parent_id)
+                elif hasattr(parent, 'property'):  # For restrictions
+                    continue  # We skip restrictions in this simplified list
+
+            # Synonyms
+            synonyms = set()
+            if hasattr(cls, 'hasExactSynonym'):
+                synonyms.update(cls.hasExactSynonym)
+            if hasattr(cls, 'hasBroadSynonym'):
+                synonyms.update(cls.hasBroadSynonym)
+            if hasattr(cls, 'hasNarrowSynonym'):
+                synonyms.update(cls.hasNarrowSynonym)
+            if hasattr(cls, 'hasRelatedSynonym'):
+                synonyms.update(cls.hasRelatedSynonym)
+
+            # Children classes
+            children = [child.name for child in cls.subclasses()]
+
+            # Ancestors and descendants with Thing class filtered out
+            ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
+            descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
+
+            # Check if the class is deprecated
+            is_deprecated = False
+            if hasattr(cls, 'deprecated') and cls.deprecated:
+                is_deprecated = True
+
+            # Compile all information into a Row
+            entry = Row(
+                id=cls_id,
+                # code=cls_code,
+                name=cls_name,  
+                dbXRefs=dbXRefs,
+                description=description,
+                parents=parents,
+                synonyms=list(synonyms),
+                ancestors=ancestors,
+                descendants=descendants,
+                children=children,
+                ontology={"is_obsolete": is_deprecated}
+            )
+            
+            # Add to data list
+            data.append(entry)
+
+
+    # Create DataFrame directly from Rows
+    df = spark2.createDataFrame(data, schema)
+    return df
diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/cell_ontology/biosample_index.py
index c96bf89a1..3ec2d7be4 100644
--- a/src/gentropy/datasource/cell_ontology/biosample_index.py
+++ b/src/gentropy/datasource/cell_ontology/biosample_index.py
@@ -12,54 +12,37 @@
 import owlready2 as owl
 
 from gentropy.common.session import Session
-from gentropy.dataset.study_index import StudyIndex
+from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info
 
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
     from pyspark.sql.column import Column
 
-class CellOntologyStudyIndex:
-    """Study index dataset from Cell Ontology.
+class CellOntologyBiosampleIndex:
+    """Biosample index dataset from Cell Ontology.
     
-    Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the study index dataset.
-
-    """"
-
-    # Define the schema explicitly for the DataFrame
-    raw_biosample_schema: StructType = StructType(
-        [
-            StructField("id", StringType(), True),
-            StructField("code", StringType(), True),
-            StructField("name", StringType(), True),
-            StructField("dbXRefs", ArrayType(StringType()), True),
-            StructField("description", StringType(), True),
-            StructField("parents", ArrayType(StringType()), True),
-            StructField("synonyms", ArrayType(StringType()), True),
-            StructField("ancestors", ArrayType(StringType()), True),
-            StructField("descendants", ArrayType(StringType()), True),
-            StructField("children", ArrayType(StringType()), True),
-            StructField("ontology", MapType(StringType(), BooleanType()), True)
-        ]
-    )
-    raw_biosample_path = "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl.owl" # Dummy path for now
+    Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the biosample index dataset.
+    """
 
     @classmethod
     def extract_celltypes_from_source(
         cls: type[CellOntologyStudyIndex],
         session: Session,
-        mqtl_quantification_methods_blacklist: list[str],
+        ontology_path: str,
     ) -> DataFrame:
-        """Read raw studies metadata from eQTL Catalogue.
+        """Ingests Cell Ontology owo file and extracts cell types.
 
         Args:
             session (Session): Spark session.
-            mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv
+            ontology_path (str): Path to the Cell ontology owo file.
 
         Returns:
-            DataFrame: raw studies metadata.
+            BiosampleIndex: Parsed and annotated Cell Ontology biosample index table.
         """
-        pd.DataFrame.iteritems = pd.DataFrame.items
-        return session.spark.createDataFrame(
-            pd.read_csv(cls.raw_studies_metadata_path, sep="\t"),
-            schema=cls.raw_studies_metadata_schema,
-        ).filter(~(f.col("quant_method").isin(mqtl_quantification_methods_blacklist)))
+        ontology_data = owl.get_ontology(ontology_path).load()
+        df = extract_ontology_info(ontology_data, "CL_", session, BiosampleIndex.get_schema())
+        
+        return BiosampleIndex(
+            _df=df,
+            _schema=BiosampleIndex.get_schema()
+            )
\ No newline at end of file
diff --git a/src/gentropy/datasource/uberon/__init__.py b/src/gentropy/datasource/uberon/__init__.py
new file mode 100644
index 000000000..11899e25b
--- /dev/null
+++ b/src/gentropy/datasource/uberon/__init__.py
@@ -0,0 +1,3 @@
+"""Uberon datasource classes."""
+
+from __future__ import annotations
diff --git a/src/gentropy/datasource/uberon/biosample_index.py b/src/gentropy/datasource/uberon/biosample_index.py
new file mode 100644
index 000000000..d07248b5e
--- /dev/null
+++ b/src/gentropy/datasource/uberon/biosample_index.py
@@ -0,0 +1,48 @@
+"""Biosample index for Uberon data source."""
+
+from __future__ import annotations
+
+from itertools import chain
+from typing import TYPE_CHECKING
+
+import pandas as pd
+import pyspark.sql.functions as f
+from pyspark.sql.types import IntegerType, StringType, StructField, StructType
+
+import owlready2 as owl
+
+from gentropy.common.session import Session
+from gentropy.dataset.biosample_index import BiosampleIndex
+
+if TYPE_CHECKING:
+    from pyspark.sql import DataFrame
+    from pyspark.sql.column import Column
+
+class UberonBiosampleIndex:
+    """Biosample index dataset from Uberon.
+    
+    Cell type data is extracted from the Uberon (UBERON) https://obophenotype.github.io/uberon/ and used to define the tissues in the biosample index dataset.
+    """
+
+    @classmethod
+    def extract_tissue_from_source(
+        cls: type[UberonStudyIndex],
+        session: Session,
+        ontology_path: str,
+    ) -> DataFrame:
+        """Ingests Uberon owo file and extracts tissues.
+
+        Args:
+            session (Session): Spark session.
+            ontology_path (str): Path to the Uberon owo file.
+
+        Returns:
+            BiosampleIndex: Parsed and annotated Uberon biosample index table.
+        """
+        ontology_data = owl.get_ontology(ontology_path).load()
+        df = extract_ontology_info(ontology_data, "UBERON_", session, BiosampleIndex.get_schema())
+        
+        return BiosampleIndex(
+            _df=df,
+            _schema=BiosampleIndex.get_schema()
+            )

From 186e77313413fb21bd45703890457d2afa19cfa1 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 10 Sep 2024 09:42:09 +0100
Subject: [PATCH 04/22] Add beginning of logic for checking if biosample from a
 study index is in the biosample index

---
 .../assets/schemas/biosample_index.json       |  2 +-
 src/gentropy/dataset/study_index.py           | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index 27c4a508f..df8f82188 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -2,7 +2,7 @@
   "type": "struct",
   "fields": [
     {
-      "name": "biosampleIndex",
+      "name": "biosampleId",
       "type": "string",
       "nullable": true,
       "metadata": {}
diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
index e8c787ed6..cc4dabae5 100644
--- a/src/gentropy/dataset/study_index.py
+++ b/src/gentropy/dataset/study_index.py
@@ -404,3 +404,37 @@ def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex:
         )
 
         return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+
+    def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> StudyIndex:
+        """Validating biosample identifiers in the study index against the provided biosample index.
+
+        Args:
+            biosample_index (BiosampleIndex): Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc.
+
+        Returns:
+            StudyIndex: with flagged studies if biosampleIndex could not be validated.
+        """
+        biosample_set = biosample_index.df.select("biosampleId", f.lit(True).alias("isIdFound"))
+
+        validated_df = (
+            self.df.join(biosample_set, on="biosampleId", how="left")
+            .withColumn(
+                "isIdFound",
+                f.when(
+                    f.col("isIdFound").isNull(),
+                    f.lit(False),
+                ).otherwise(f.lit(True)),
+            )
+            .withColumn(
+                "qualityControls",
+                StudyIndex.update_quality_flag(
+                    f.col("qualityControls"),
+                    ~f.col("isIdFound"),
+                    StudyQualityCheck.NO_GENE_PROVIDED,
+                ),
+            )
+            .drop("isIdFound")
+        )
+
+        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+

From 6f0a2e2711c367b5331d7cb52ac63c7118f8259b Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 10 Sep 2024 09:42:39 +0100
Subject: [PATCH 05/22] Make early file for merging multiple biosample indices
 into one

---
 src/gentropy/biosample_index.py | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 src/gentropy/biosample_index.py

diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
new file mode 100644
index 000000000..26beb0ea5
--- /dev/null
+++ b/src/gentropy/biosample_index.py
@@ -0,0 +1,37 @@
+"""Step to generate biosample index dataset."""
+from __future__ import annotations
+
+from gentropy.common.session import Session
+from gentropy.datasource.open_targets.target import OpenTargetsTarget
+from gentropy.dataset.biosample_index import BiosampleIndex
+from gentropy.datasource.cell_ontology.biosample_index import CellOntologyBiosampleIndex
+from gentropy.datasource.uberon.biosample_index import UberonBiosampleIndex
+
+
+class BiosampleIndexStep:
+    """Biosample index step.
+
+    This step generates a Biosample index dataset from the various ontology sources. Currently Cell Ontology and Uberon are supported.
+    """
+
+    def __init__(
+        self,
+        session: Session,
+        cell_ontology_input_path: str,
+        uberon_input_path: str,
+        biosample_index_output_path: str,
+    ) -> None:
+        """Run Biosample index generation step.
+
+        Args:
+            session (Session): Session object.
+            cell_ontology_input_path (str): Input cell ontology dataset path.
+            uberon_input_path (str): Input uberon dataset path.
+            biosample_index_output_path (str): Output gene index dataset path.
+        """
+        cell_ontology_index = CellOntologyBiosampleIndex.extract_celltypes_from_source(
+            session, cell_ontology_input_path
+        )
+        uberon_index = UberonBiosampleIndex.extract_tissue_from_source(
+            session, uberon_input_path
+        )

From 55e2baf90cfdd305d5fb649afa394240f36cbaae Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 10 Sep 2024 11:59:45 +0100
Subject: [PATCH 06/22] Finish adding basic iteration of biosample index, needs
 debugging

---
 src/gentropy/biosample_index.py         |  3 +++
 src/gentropy/dataset/biosample_index.py | 19 +++++++++++++++++++
 src/gentropy/study_validation.py        |  3 +++
 3 files changed, 25 insertions(+)

diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index 26beb0ea5..b5864a5ba 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -35,3 +35,6 @@ def __init__(
         uberon_index = UberonBiosampleIndex.extract_tissue_from_source(
             session, uberon_input_path
         )
+        biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index])
+        biosample_index.write_parquet(biosample_index_output_path)
+        
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 4646a7774..2dc547cd8 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -11,6 +11,7 @@
 
 from pyspark.sql import functions as f
 from pyspark.sql.window import Window
+from functools import reduce
 
 from gentropy.assets import data
 from gentropy.common.schemas import parse_spark_schema
@@ -37,6 +38,23 @@ def get_schema(cls: type[StudyIndex]) -> StructType:
         """
         return parse_spark_schema("biosample_index.json")
 
+    @classmethod
+    def merge(
+        cls: type[BiosampleIndex],
+         biosample_indexes: list[BiosampleIndex], 
+    ) -> BiosampleIndex:
+        """Merge a list of biosample indexes into a single biosample index.
+
+        Args:
+            biosample_indexes (BiosampleIndex): Biosample indexes to merge.
+
+        Returns:
+            BiosampleIndex: Merged biosample index.
+        """
+        df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes])
+        return BiosampleIndex(_df=df, _schema=BiosampleIndex.get_schema())
+        
+
 
 def extract_ontology_info(
     ontology : owlready2.namespace.Ontology,
@@ -127,3 +145,4 @@ def extract_ontology_info(
     # Create DataFrame directly from Rows
     df = spark2.createDataFrame(data, schema)
     return df
+
diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py
index 5bfb83fe0..3e926078d 100644
--- a/src/gentropy/study_validation.py
+++ b/src/gentropy/study_validation.py
@@ -22,6 +22,7 @@ def __init__(
         study_index_path: list[str],
         target_index_path: str,
         disease_index_path: str,
+        biosample_index_path: str,
         valid_study_index_path: str,
         invalid_study_index_path: str,
         invalid_qc_reasons: list[str] = [],
@@ -55,6 +56,7 @@ def __init__(
             .withColumn("efo", f.coalesce(f.col("efo"), f.col("diseaseId")))
         )
         study_index = StudyIndex.from_parquet(session, list(study_index_path))
+        biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path)
 
         # Running validation:
         study_index_with_qc = (
@@ -63,6 +65,7 @@ def __init__(
             .validate_study_type()  # Flagging non-supported study types.
             .validate_target(target_index)  # Flagging QTL studies with invalid targets
             .validate_disease(disease_index)  # Flagging invalid EFOs
+            .validate_biosample(biosample_index)  # Flagging invalid biosamples
         ).persist()  # we will need this for 2 types of outputs
 
         study_index_with_qc.valid_rows(

From 692732b3c2c84dac74f250355f581155267f6b78 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Fri, 13 Sep 2024 12:05:34 +0000
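Subject: [PATCH 07/22] Move ontology sources under datasource/ontologies

Tweak the biosample index draft: rename the Cell Ontology and Uberon
datasource modules into datasource/ontologies, add ontologies/utils.py,
change dbXrefs in the schema to an array of id/source structs, make
biosampleId non-nullable, and add a first dataset test.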

---
 .../assets/schemas/biosample_index.json       |  32 +++-
 src/gentropy/biosample_index.py               |   4 +-
 src/gentropy/dataset/biosample_index.py       | 148 +++++++++---------
 .../datasource/cell_ontology/__init__.py      |   3 -
 .../cell_ontology.py}                         |   0
 .../uberon.py}                                |   1 +
 src/gentropy/datasource/ontologies/utils.py   |  91 +++++++++++
 src/gentropy/datasource/uberon/__init__.py    |   3 -
 .../gentropy/dataset/test_biosample_index.py  |  31 ++++
 9 files changed, 225 insertions(+), 88 deletions(-)
 delete mode 100644 src/gentropy/datasource/cell_ontology/__init__.py
 rename src/gentropy/datasource/{cell_ontology/biosample_index.py => ontologies/cell_ontology.py} (100%)
 rename src/gentropy/datasource/{uberon/biosample_index.py => ontologies/uberon.py} (99%)
 create mode 100644 src/gentropy/datasource/ontologies/utils.py
 delete mode 100644 src/gentropy/datasource/uberon/__init__.py
 create mode 100644 tests/gentropy/dataset/test_biosample_index.py

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index df8f82188..cd91f090b 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -4,7 +4,7 @@
     {
       "name": "biosampleId",
       "type": "string",
-      "nullable": true,
+      "nullable": false,
       "metadata": {}
     },
     {
@@ -14,14 +14,30 @@
       "metadata": {}
     },
     {
-      "name": "dbXRefs",
-      "type": {
-        "type": "array",
-        "elementType": "string",
-        "containsNull": true
-      },
+      "metadata": {},
+      "name": "dbXrefs",
       "nullable": true,
-      "metadata": {}
+      "type": {
+        "containsNull": true,
+        "elementType": {
+          "fields": [
+            {
+              "metadata": {},
+              "name": "id",
+              "nullable": true,
+              "type": "string"
+            },
+            {
+              "metadata": {},
+              "name": "source",
+              "nullable": true,
+              "type": "string"
+            }
+          ],
+          "type": "struct"
+        },
+        "type": "array"
+      }
     },
     {
       "name": "description",
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index b5864a5ba..43d1511bc 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -29,10 +29,10 @@ def __init__(
             uberon_input_path (str): Input uberon dataset path.
             biosample_index_output_path (str): Output gene index dataset path.
         """
-        cell_ontology_index = CellOntologyBiosampleIndex.extract_celltypes_from_source(
+        cell_ontology_index = BiosampleIndex.extract_from_source(
             session, cell_ontology_input_path
         )
-        uberon_index = UberonBiosampleIndex.extract_tissue_from_source(
+        uberon_index = BiosampleIndex.extract_from_source(
             session, uberon_input_path
         )
         biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index])
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 2dc547cd8..9905afd91 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -17,10 +17,14 @@
 from gentropy.common.schemas import parse_spark_schema
 from gentropy.dataset.dataset import Dataset
 
+
+from pyspark.sql import Column, DataFrame, Row
+
 if TYPE_CHECKING:
-    from pyspark.sql import Column, DataFrame
     from pyspark.sql.types import StructType
 
+import owlready2 as owl
+
 
 @dataclass
 class BiosampleIndex(Dataset):
@@ -52,15 +56,13 @@ def merge(
             BiosampleIndex: Merged biosample index.
         """
         df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes])
-        return BiosampleIndex(_df=df, _schema=BiosampleIndex.get_schema())
+        return BiosampleIndex(_df=df, _schema=cls.get_schema())
         
 
-
 def extract_ontology_info(
     ontology : owlready2.namespace.Ontology,
-    prefix : str,
     session : Session,
-    schema : StructType = BiosampleIndex.get_schema(),
+    schema : StructType
 ) -> BiosampleIndex:
     """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
 
@@ -72,77 +74,79 @@ def extract_ontology_info(
     Returns:
         BiosampleIndex: Parsed and annotated biosample index table.
     """
+    data_list = []
 
     # Iterate over all classes in the ontology
-    for cls in ont.classes():
-        if cls.name.startswith(prefix):
-            # Basic class information
-            cls_id = cls.name
-            # cls_code = cls.iri
-            cls_name = cls.label[0] if cls.label else None
-
-            # Extract descriptions
-            description = None
-            if hasattr(cls, 'IAO_0000115'):
-                description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
-
-            # Extract dbXRefs
-            dbXRefs = [x for x in cls.hasDbXref] if hasattr(cls, 'hasDbXref') else []
-
-            # Parent classes
-            parents = []
-            for parent in cls.is_a:
-                if parent is owl.Thing: 
-                    continue  # Skip owlready2 Thing class, which is a top-level class
-                elif hasattr(parent, 'name'):
-                    parent_id = parent.name
-                    parents.append(parent_id)
-                elif hasattr(parent, 'property'):  # For restrictions
-                    continue  # We skip restrictions in this simplified list
-
-            # Synonyms
-            synonyms = set()
-            if hasattr(cls, 'hasExactSynonym'):
-                synonyms.update(cls.hasExactSynonym)
-            if hasattr(cls, 'hasBroadSynonym'):
-                synonyms.update(cls.hasBroadSynonym)
-            if hasattr(cls, 'hasNarrowSynonym'):
-                synonyms.update(cls.hasNarrowSynonym)
-            if hasattr(cls, 'hasRelatedSynonym'):
-                synonyms.update(cls.hasRelatedSynonym)
-
-            # Children classes
-            children = [child.name for child in cls.subclasses()]
-
-            # Ancestors and descendants with Thing class filtered out
-            ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
-            descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
-
-            # Check if the class is deprecated
-            is_deprecated = False
-            if hasattr(cls, 'deprecated') and cls.deprecated:
-                is_deprecated = True
-
-            # Compile all information into a Row
-            entry = Row(
-                id=cls_id,
-                # code=cls_code,
-                name=cls_name,  
-                dbXRefs=dbXRefs,
-                description=description,
-                parents=parents,
-                synonyms=list(synonyms),
-                ancestors=ancestors,
-                descendants=descendants,
-                children=children,
-                ontology={"is_obsolete": is_deprecated}
-            )
-            
-            # Add to data list
-            data.append(entry)
+    for cls in ontology.classes():
+        # Basic class information
+        cls_id = cls.name
+        # cls_code = cls.iri
+        cls_name = cls.label[0] if cls.label else None
+
+        # Extract descriptions
+        description = None
+        if hasattr(cls, 'IAO_0000115'):
+            description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
+
+        # Extract dbXRefs
+        dbXRefs = []
+        if hasattr(cls, 'hasDbXref'):
+            dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref]
+
+        # Parent classes
+        parents = []
+        for parent in cls.is_a:
+            if parent is owl.Thing: 
+                continue  # Skip owlready2 Thing class, which is a top-level class
+            elif hasattr(parent, 'name'):
+                parent_id = parent.name
+                parents.append(parent_id)
+            elif hasattr(parent, 'property'):  # For restrictions
+                continue  # We skip restrictions in this simplified list
+
+        # Synonyms
+        synonyms = set()
+        if hasattr(cls, 'hasExactSynonym'):
+            synonyms.update(cls.hasExactSynonym)
+        if hasattr(cls, 'hasBroadSynonym'):
+            synonyms.update(cls.hasBroadSynonym)
+        if hasattr(cls, 'hasNarrowSynonym'):
+            synonyms.update(cls.hasNarrowSynonym)
+        if hasattr(cls, 'hasRelatedSynonym'):
+            synonyms.update(cls.hasRelatedSynonym)
+
+        # Children classes
+        children = [child.name for child in cls.subclasses()]
+
+        # Ancestors and descendants with Thing class filtered out
+        ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
+        descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
+
+        # Check if the class is deprecated
+        is_deprecated = False
+        if hasattr(cls, 'deprecated') and cls.deprecated:
+            is_deprecated = True
+
+        # Compile all information into a Row
+        entry = Row(
+            id=cls_id,
+            # code=cls_code,
+            name=cls_name,  
+            dbXRefs=dbXRefs,
+            description=description,
+            parents=parents,
+            synonyms=list(synonyms),
+            ancestors=ancestors,
+            descendants=descendants,
+            children=children,
+            ontology={"is_obsolete": is_deprecated}
+        )
+        
+        # Add to data list
+        data_list.append(entry)
 
 
     # Create DataFrame directly from Rows
-    df = spark2.createDataFrame(data, schema)
+    df = session.createDataFrame(data_list, schema)
     return df
 
diff --git a/src/gentropy/datasource/cell_ontology/__init__.py b/src/gentropy/datasource/cell_ontology/__init__.py
deleted file mode 100644
index c9f3e2075..000000000
--- a/src/gentropy/datasource/cell_ontology/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""Cell ontology datasource classes."""
-
-from __future__ import annotations
diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/ontologies/cell_ontology.py
similarity index 100%
rename from src/gentropy/datasource/cell_ontology/biosample_index.py
rename to src/gentropy/datasource/ontologies/cell_ontology.py
diff --git a/src/gentropy/datasource/uberon/biosample_index.py b/src/gentropy/datasource/ontologies/uberon.py
similarity index 99%
rename from src/gentropy/datasource/uberon/biosample_index.py
rename to src/gentropy/datasource/ontologies/uberon.py
index d07248b5e..a59ce2df4 100644
--- a/src/gentropy/datasource/uberon/biosample_index.py
+++ b/src/gentropy/datasource/ontologies/uberon.py
@@ -13,6 +13,7 @@
 
 from gentropy.common.session import Session
 from gentropy.dataset.biosample_index import BiosampleIndex
+from grn
 
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py
new file mode 100644
index 000000000..6f2043c93
--- /dev/null
+++ b/src/gentropy/datasource/ontologies/utils.py
@@ -0,0 +1,91 @@
+
+def extract_ontology_info(
+    ontology : owlready2.namespace.Ontology,
+    session : Session,
+    schema : StructType
+) -> BiosampleIndex:
+    """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
+
+    Args:
+        ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon.
+        prefix (str): Prefix for the desired ontology terms.
+        session (Session): Spark session.
+
+    Returns:
+        BiosampleIndex: Parsed and annotated biosample index table.
+    """
+    data_list = []
+
+    # Iterate over all classes in the ontology
+    for cls in ontology.classes():
+        # Basic class information
+        cls_id = cls.name
+        # cls_code = cls.iri
+        cls_name = cls.label[0] if cls.label else None
+
+        # Extract descriptions
+        description = None
+        if hasattr(cls, 'IAO_0000115'):
+            description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
+
+        # Extract dbXRefs
+        dbXRefs = []
+        if hasattr(cls, 'hasDbXref'):
+            dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref]
+
+        # Parent classes
+        parents = []
+        for parent in cls.is_a:
+            if parent is owl.Thing: 
+                continue  # Skip owlready2 Thing class, which is a top-level class
+            elif hasattr(parent, 'name'):
+                parent_id = parent.name
+                parents.append(parent_id)
+            elif hasattr(parent, 'property'):  # For restrictions
+                continue  # We skip restrictions in this simplified list
+
+        # Synonyms
+        synonyms = set()
+        if hasattr(cls, 'hasExactSynonym'):
+            synonyms.update(cls.hasExactSynonym)
+        if hasattr(cls, 'hasBroadSynonym'):
+            synonyms.update(cls.hasBroadSynonym)
+        if hasattr(cls, 'hasNarrowSynonym'):
+            synonyms.update(cls.hasNarrowSynonym)
+        if hasattr(cls, 'hasRelatedSynonym'):
+            synonyms.update(cls.hasRelatedSynonym)
+
+        # Children classes
+        children = [child.name for child in cls.subclasses()]
+
+        # Ancestors and descendants with Thing class filtered out
+        ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
+        descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
+
+        # Check if the class is deprecated
+        is_deprecated = False
+        if hasattr(cls, 'deprecated') and cls.deprecated:
+            is_deprecated = True
+
+        # Compile all information into a Row
+        entry = Row(
+            id=cls_id,
+            # code=cls_code,
+            name=cls_name,  
+            dbXRefs=dbXRefs,
+            description=description,
+            parents=parents,
+            synonyms=list(synonyms),
+            ancestors=ancestors,
+            descendants=descendants,
+            children=children,
+            ontology={"is_obsolete": is_deprecated}
+        )
+        
+        # Add to data list
+        data_list.append(entry)
+
+
+    # Create DataFrame directly from Rows
+    df = session.createDataFrame(data_list, schema)
+    return df
diff --git a/src/gentropy/datasource/uberon/__init__.py b/src/gentropy/datasource/uberon/__init__.py
deleted file mode 100644
index 11899e25b..000000000
--- a/src/gentropy/datasource/uberon/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""Uberon datasource classes."""
-
-from __future__ import annotations
diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py
new file mode 100644
index 000000000..c680b4f19
--- /dev/null
+++ b/tests/gentropy/dataset/test_biosample_index.py
@@ -0,0 +1,31 @@
+"""Tests on Biosample index."""
+
+import pandas as pd
+import numpy as np
+from pyspark.sql import SparkSession
+from pyspark.sql import Row
+import pyspark.sql.functions as F
+import owlready2 as owl
+from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType
+import json
+
+from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info
+
+
+def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
+    """Test biosample index creation with mock biosample index."""
+    assert isinstance(mock_biosample_index, BiosampleIndex)
+
+
+
+cell_ontology = owl.get_ontology("/home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology.owl").load()
+spark2 = SparkSession.builder \
+    .master("local[*]") \
+    .appName("LocalOntologyIndexing") \
+    .getOrCreate()
+
+# Define the schema for the DataFrame
+schema_path = '/home/alegbe/repos/gentropy/src/gentropy/assets/schemas/biosample_index.json'
+schema = StructType.fromJson(json.load(open(schema_path)))
+
+df = extract_ontology_info(cell_ontology, spark2, schema)
\ No newline at end of file

From 30dc23fe0cbc50d5c4854f158ff9eaeec8c124e1 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Fri, 13 Sep 2024 14:53:30 +0000
Subject: [PATCH 08/22] Modified the parser to accept JSON files

---
 .../assets/schemas/biosample_index.json       |  54 +++-----
 src/gentropy/dataset/biosample_index.py       | 105 ---------------
 src/gentropy/datasource/ontologies/utils.py   | 127 +++++++++++++++++-
 .../gentropy/dataset/test_biosample_index.py  |  12 +-
 4 files changed, 150 insertions(+), 148 deletions(-)

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index cd91f090b..82ba5ae2b 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -13,32 +13,6 @@
       "nullable": true,
       "metadata": {}
     },
-    {
-      "metadata": {},
-      "name": "dbXrefs",
-      "nullable": true,
-      "type": {
-        "containsNull": true,
-        "elementType": {
-          "fields": [
-            {
-              "metadata": {},
-              "name": "id",
-              "nullable": true,
-              "type": "string"
-            },
-            {
-              "metadata": {},
-              "name": "source",
-              "nullable": true,
-              "type": "string"
-            }
-          ],
-          "type": "struct"
-        },
-        "type": "array"
-      }
-    },
     {
       "name": "description",
       "type": "string",
@@ -46,7 +20,7 @@
       "metadata": {}
     },
     {
-      "name": "parents",
+      "name": "dbXrefs",
       "type": {
         "type": "array",
         "elementType": "string",
@@ -66,7 +40,18 @@
       "metadata": {}
     },
     {
-      "name": "ancestors",
+      "name": "deprecated",
+      "type": {
+        "type": "map",
+        "keyType": "string",
+        "valueType": "boolean",
+        "valueContainsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "parents",
       "type": {
         "type": "array",
         "elementType": "string",
@@ -76,7 +61,7 @@
       "metadata": {}
     },
     {
-      "name": "descendants",
+      "name": "ancestors",
       "type": {
         "type": "array",
         "elementType": "string",
@@ -86,7 +71,7 @@
       "metadata": {}
     },
     {
-      "name": "children",
+      "name": "descendants",
       "type": {
         "type": "array",
         "elementType": "string",
@@ -96,12 +81,11 @@
       "metadata": {}
     },
     {
-      "name": "ontology",
+      "name": "children",
       "type": {
-        "type": "map",
-        "keyType": "string",
-        "valueType": "boolean",
-        "valueContainsNull": true
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
       },
       "nullable": true,
       "metadata": {}
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 9905afd91..49256fe69 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -42,111 +42,6 @@ def get_schema(cls: type[StudyIndex]) -> StructType:
         """
         return parse_spark_schema("biosample_index.json")
 
-    @classmethod
-    def merge(
-        cls: type[BiosampleIndex],
-         biosample_indexes: list[BiosampleIndex], 
-    ) -> BiosampleIndex:
-        """Merge a list of biosample indexes into a single biosample index.
-
-        Args:
-            biosample_indexes (BiosampleIndex): Biosample indexes to merge.
 
-        Returns:
-            BiosampleIndex: Merged biosample index.
-        """
-        df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes])
-        return BiosampleIndex(_df=df, _schema=cls.get_schema())
         
 
-def extract_ontology_info(
-    ontology : owlready2.namespace.Ontology,
-    session : Session,
-    schema : StructType
-) -> BiosampleIndex:
-    """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
-
-    Args:
-        ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon.
-        prefix (str): Prefix for the desired ontology terms.
-        session (Session): Spark session.
-
-    Returns:
-        BiosampleIndex: Parsed and annotated biosample index table.
-    """
-    data_list = []
-
-    # Iterate over all classes in the ontology
-    for cls in ontology.classes():
-        # Basic class information
-        cls_id = cls.name
-        # cls_code = cls.iri
-        cls_name = cls.label[0] if cls.label else None
-
-        # Extract descriptions
-        description = None
-        if hasattr(cls, 'IAO_0000115'):
-            description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
-
-        # Extract dbXRefs
-        dbXRefs = []
-        if hasattr(cls, 'hasDbXref'):
-            dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref]
-
-        # Parent classes
-        parents = []
-        for parent in cls.is_a:
-            if parent is owl.Thing: 
-                continue  # Skip owlready2 Thing class, which is a top-level class
-            elif hasattr(parent, 'name'):
-                parent_id = parent.name
-                parents.append(parent_id)
-            elif hasattr(parent, 'property'):  # For restrictions
-                continue  # We skip restrictions in this simplified list
-
-        # Synonyms
-        synonyms = set()
-        if hasattr(cls, 'hasExactSynonym'):
-            synonyms.update(cls.hasExactSynonym)
-        if hasattr(cls, 'hasBroadSynonym'):
-            synonyms.update(cls.hasBroadSynonym)
-        if hasattr(cls, 'hasNarrowSynonym'):
-            synonyms.update(cls.hasNarrowSynonym)
-        if hasattr(cls, 'hasRelatedSynonym'):
-            synonyms.update(cls.hasRelatedSynonym)
-
-        # Children classes
-        children = [child.name for child in cls.subclasses()]
-
-        # Ancestors and descendants with Thing class filtered out
-        ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
-        descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
-
-        # Check if the class is deprecated
-        is_deprecated = False
-        if hasattr(cls, 'deprecated') and cls.deprecated:
-            is_deprecated = True
-
-        # Compile all information into a Row
-        entry = Row(
-            id=cls_id,
-            # code=cls_code,
-            name=cls_name,  
-            dbXRefs=dbXRefs,
-            description=description,
-            parents=parents,
-            synonyms=list(synonyms),
-            ancestors=ancestors,
-            descendants=descendants,
-            children=children,
-            ontology={"is_obsolete": is_deprecated}
-        )
-        
-        # Add to data list
-        data_list.append(entry)
-
-
-    # Create DataFrame directly from Rows
-    df = session.createDataFrame(data_list, schema)
-    return df
-
diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py
index 6f2043c93..d22470778 100644
--- a/src/gentropy/datasource/ontologies/utils.py
+++ b/src/gentropy/datasource/ontologies/utils.py
@@ -1,10 +1,20 @@
+"""Utility functions for Biosample ontology processing."""
+import owlready2
+from pyspark.sql import Row, SparkSession
+from pyspark.sql.types import StructType, StringType, ArrayType
+from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce
+from pyspark.sql.window import Window
+from functools import reduce
+from gentropy.dataset.biosample_index import BiosampleIndex
+
 
 def extract_ontology_info(
     ontology : owlready2.namespace.Ontology,
-    session : Session,
+    spark : SparkSession,
     schema : StructType
 ) -> BiosampleIndex:
     """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
+    NOT IN USE
 
     Args:
         ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon.
@@ -87,5 +97,118 @@ def extract_ontology_info(
 
 
     # Create DataFrame directly from Rows
-    df = session.createDataFrame(data_list, schema)
+    df = spark.createDataFrame(data_list, schema)
     return df
+
+
+def extract_ontology_from_json(
+    ontology_json : str,
+    spark : SparkSession
+) -> BiosampleIndex:
+    """
+    Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology.
+
+    Args:
+        ontology_json (str): Path to the JSON file containing the ontology information.
+        spark (SparkSession): Spark session.
+
+    Returns:
+        BiosampleIndex: Parsed and annotated biosample index table.
+    """
+
+    def json_graph_traversal(df, node_col, link_col, traversal_type="ancestors"):
+        """
+        Traverse a graph represented in a DataFrame to find all ancestors or descendants.
+        """
+        # Collect graph data as a map
+        graph_map = df.select(node_col, link_col).rdd.collectAsMap()
+        broadcasted_graph = spark.sparkContext.broadcast(graph_map)
+
+        def get_relationships(node):
+            relationships = set()
+            stack = [node]
+            while stack:
+                current = stack.pop()
+                if current in broadcasted_graph.value:
+                    current_links = broadcasted_graph.value[current]
+                    stack.extend(current_links)
+                    relationships.update(current_links)
+            return list(relationships)
+
+        # Choose column name based on traversal type
+        result_col = "ancestors" if traversal_type == "ancestors" else "descendants"
+
+        # Register the UDF based on traversal type
+        relationship_udf = udf(get_relationships, ArrayType(StringType()))
+
+        # Apply the UDF to create the result column
+        return df.withColumn(result_col, relationship_udf(col(node_col)))
+
+    # Load the JSON file
+    df = spark.read.json(ontology_json, multiLine=True)
+
+    # Exploding the 'graphs' array to make individual records easier to access
+    df_graphs = df.select(explode_outer("graphs").alias("graph"))
+
+    # Exploding the 'nodes' array within each graph
+    df_nodes = df_graphs.select(
+        col("graph.id").alias("graph_id"),
+        explode_outer("graph.nodes").alias("node"))
+
+    # Exploding the 'edges' array within each graph for relationship data
+    df_edges = df_graphs.select(
+        col("graph.id").alias("graph_id"),
+        explode_outer("graph.edges").alias("edge")
+    ).select(
+        col("edge.sub").alias("subject"),
+        col("edge.pred").alias("predicate"),
+        col("edge.obj").alias("object")
+    )
+    df_edges = df_edges.withColumn("subject", regexp_replace(col("subject"), "http://purl.obolibrary.org/obo/", ""))
+    df_edges = df_edges.withColumn("object", regexp_replace(col("object"), "http://purl.obolibrary.org/obo/", ""))
+
+    # Extract the relevant information from the nodes
+    transformed_df = df_nodes.select(
+    regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"),
+    col("node.lbl").alias("biosampleName"),
+    col("node.meta.definition.val").alias("description"),
+    collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"),
+    collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"),
+    col("node.meta.deprecated").alias("deprecated"))
+    
+    # Extract the relationships from the edges
+    # Prepare relationship-specific DataFrames
+    df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent")
+    df_children = df_edges.filter(col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child")
+
+    # Aggregate relationships back to nodes
+    df_parents_grouped = df_parents.groupBy("subject").agg(array_distinct(collect_list("parent")).alias("parents"))
+    df_children_grouped = df_children.groupBy("object").agg(array_distinct(collect_list("child")).alias("children"))
+
+    # Get all ancestors
+    df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors")
+    # Get all descendants
+    df_with_descendants = json_graph_traversal(df_children_grouped, "object", "children", "descendants")
+
+    # Join the ancestor and descendant DataFrames
+    df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object")
+
+    # Join the original DataFrame with the relationship DataFrame
+    final_df = transformed_df.join(df_with_relationships, ['biosampleId'], "left")
+    
+    return final_df
+
+    def merge_biosample_indices(
+         biosample_indices: list[BiosampleIndex], 
+    ) -> BiosampleIndex:
+        """Merge a list of biosample indexes into a single biosample index.
+        Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken.
+
+        Args:
+            biosample_indexes (BiosampleIndex): Biosample indexes to merge.
+
+        Returns:
+            BiosampleIndex: Merged biosample index.
+        """
+        # Merge the DataFrames
+        merged_df = reduce(DataFrame.unionByName, biosample_indices)
\ No newline at end of file
diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py
index c680b4f19..f7d85e230 100644
--- a/tests/gentropy/dataset/test_biosample_index.py
+++ b/tests/gentropy/dataset/test_biosample_index.py
@@ -9,7 +9,8 @@
 from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType
 import json
 
-from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info
+from gentropy.dataset.biosample_index import BiosampleIndex
+from gentropy.datasource.ontologies.utils import extract_ontology_from_json
 
 
 def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
@@ -18,14 +19,13 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
 
 
 
-cell_ontology = owl.get_ontology("/home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology.owl").load()
 spark2 = SparkSession.builder \
     .master("local[*]") \
     .appName("LocalOntologyIndexing") \
     .getOrCreate()
 
-# Define the schema for the DataFrame
-schema_path = '/home/alegbe/repos/gentropy/src/gentropy/assets/schemas/biosample_index.json'
-schema = StructType.fromJson(json.load(open(schema_path)))
 
-df = extract_ontology_info(cell_ontology, spark2, schema)
\ No newline at end of file
+ontology_json = 'file:///home/alegbe/cl.json'
+# ontology_json = 'file:///home/alegbe/uberon.json'
+
+df = extract_ontology_from_json(ontology_json, spark2)
\ No newline at end of file

From 28e1f92749b000a424a92807c174320a8083718a Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 16 Sep 2024 13:31:16 +0000
Subject: [PATCH 09/22] Update biosample index

---
 docs/python_api/datasources/_datasources.md   |   7 +-
 .../assets/schemas/biosample_index.json       |  11 -
 src/gentropy/biosample_index.py               |  18 +-
 src/gentropy/config.py                        |  10 +
 src/gentropy/dataset/biosample_index.py       |   8 +-
 src/gentropy/dataset/study_index.py           |   5 +-
 .../{gwas_catalog => ontologies}/__init__.py  |   2 +-
 .../datasource/ontologies/cell_ontology.py    |  48 --
 src/gentropy/datasource/ontologies/uberon.py  |  49 --
 src/gentropy/datasource/ontologies/utils.py   | 147 +---
 src/gentropy/study_validation.py              |   4 +-
 .../data_samples/cell_ontology_sample.json    | 274 +++++++
 .../gentropy/data_samples/uberon_sample.json  | 675 ++++++++++++++++++
 .../gentropy/dataset/test_biosample_index.py  |  12 +-
 14 files changed, 1026 insertions(+), 244 deletions(-)
 rename src/gentropy/datasource/{gwas_catalog => ontologies}/__init__.py (50%)
 delete mode 100644 src/gentropy/datasource/ontologies/cell_ontology.py
 delete mode 100644 src/gentropy/datasource/ontologies/uberon.py
 create mode 100644 tests/gentropy/data_samples/cell_ontology_sample.json
 create mode 100644 tests/gentropy/data_samples/uberon_sample.json

diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
index e6e081b21..f79f8137b 100644
--- a/docs/python_api/datasources/_datasources.md
+++ b/docs/python_api/datasources/_datasources.md
@@ -26,7 +26,7 @@ This section contains information about the data source harmonisation tools avai
 2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
 3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
-## Linkage desiquilibrium
+## Linkage disequilibrium
 
 1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrixes (7 ancestries)
 
@@ -37,3 +37,8 @@ This section contains information about the data source harmonisation tools avai
 ## Gene annotation
 
 1. [Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl)
+
+## Biological samples
+
+1. [Uberon](ontologies/_uberon.md)
+2. [Cell Ontology](ontologies/_cell_ontology.md)
\ No newline at end of file
diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index 82ba5ae2b..7c28ec970 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -39,17 +39,6 @@
       "nullable": true,
       "metadata": {}
     },
-    {
-      "name": "deprecated",
-      "type": {
-        "type": "map",
-        "keyType": "string",
-        "valueType": "boolean",
-        "valueContainsNull": true
-      },
-      "nullable": true,
-      "metadata": {}
-    },
     {
       "name": "parents",
       "type": {
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index 43d1511bc..11274d789 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -2,10 +2,8 @@
 from __future__ import annotations
 
 from gentropy.common.session import Session
-from gentropy.datasource.open_targets.target import OpenTargetsTarget
 from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.cell_ontology.biosample_index import CellOntologyBiosampleIndex
-from gentropy.datasource.uberon.biosample_index import UberonBiosampleIndex
+from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
 
 
 class BiosampleIndexStep:
@@ -29,12 +27,10 @@ def __init__(
             uberon_input_path (str): Input uberon dataset path.
             biosample_index_output_path (str): Output gene index dataset path.
         """
-        cell_ontology_index = BiosampleIndex.extract_from_source(
-            session, cell_ontology_input_path
-        )
-        uberon_index = BiosampleIndex.extract_from_source(
-            session, uberon_input_path
-        )
-        biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index])
-        biosample_index.write_parquet(biosample_index_output_path)
+        cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark)
+        uberon_index = extract_ontology_from_json(uberon_input_path, session.spark)
+        
+        biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index])
+        
+        biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path)
         
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 9089dbecf..114913090 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -51,6 +51,16 @@ class GeneIndexConfig(StepConfig):
     _target_: str = "gentropy.gene_index.GeneIndexStep"
 
 
+@dataclass
+class BiosampleIndexConfig(StepConfig):
+    """Biosample index step configuration."""
+
+    cell_ontology_input_path: str = MISSING
+    uberon_input_path: str = MISSING
+    biosample_index_output_path: str = MISSING
+    _target_: str = "gentropy.biosample_index.BiosampleIndexStep"
+
+
 @dataclass
 class GWASCatalogStudyCurationConfig(StepConfig):
     """GWAS Catalog study curation step configuration."""
@@ -532,6 +542,7 @@ def register_config() -> None:
     cs.store(group="step", name="colocalisation", node=ColocalisationConfig)
     cs.store(group="step", name="eqtl_catalogue", node=EqtlCatalogueConfig)
     cs.store(group="step", name="gene_index", node=GeneIndexConfig)
+    cs.store(group="step", name="biosample_index", node=BiosampleIndexConfig)
     cs.store(
         group="step",
         name="gwas_catalog_study_curation",
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 49256fe69..b3735ca62 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -23,8 +23,6 @@
 if TYPE_CHECKING:
     from pyspark.sql.types import StructType
 
-import owlready2 as owl
-
 
 @dataclass
 class BiosampleIndex(Dataset):
@@ -40,8 +38,4 @@ def get_schema(cls: type[StudyIndex]) -> StructType:
         Returns:
             StructType: The schema of the BiosampleIndex dataset.
         """
-        return parse_spark_schema("biosample_index.json")
-
-
-        
-
+        return parse_spark_schema("biosample_index.json")
\ No newline at end of file
diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
index 43ca171ee..852f14d9e 100644
--- a/src/gentropy/dataset/study_index.py
+++ b/src/gentropy/dataset/study_index.py
@@ -20,6 +20,7 @@
     from pyspark.sql.types import StructType
 
     from gentropy.dataset.gene_index import GeneIndex
+    from gentropy.dataset.biosample_index import BiosampleIndex
 
 
 class StudyQualityCheck(Enum):
@@ -29,6 +30,7 @@ class StudyQualityCheck(Enum):
         UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target.
         UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease
         UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported.
+        UNKNOWN_BIOSAMPLE (str): Flagging if a biosample identifier is not found in the reference.
         DUPLICATED_STUDY (str): Flagging if a study identifier is not unique.
         NO_GENE_PROVIDED (str): Flagging QTL studies if the measured
     """
@@ -36,6 +38,7 @@ class StudyQualityCheck(Enum):
     UNRESOLVED_TARGET = "Target/gene identifier could not match to reference."
     UNRESOLVED_DISEASE = "No valid disease identifier found."
     UNKNOWN_STUDY_TYPE = "This type of study is not supported."
+    UNKNOWN_BIOSAMPLE = "Biosample identifier was not found in the reference."
     DUPLICATED_STUDY = "The identifier of this study is not unique."
     NO_GENE_PROVIDED = "QTL study doesn't have gene assigned."
 
@@ -434,7 +437,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu
                 StudyIndex.update_quality_flag(
                     f.col("qualityControls"),
                     ~f.col("isIdFound"),
-                    StudyQualityCheck.NO_GENE_PROVIDED,
+                    StudyQualityCheck.UNKNOWN_BIOSAMPLE,
                 ),
             )
             .drop("isIdFound")
diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/ontologies/__init__.py
similarity index 50%
rename from src/gentropy/datasource/gwas_catalog/__init__.py
rename to src/gentropy/datasource/ontologies/__init__.py
index 544779b18..d3fa6b416 100644
--- a/src/gentropy/datasource/gwas_catalog/__init__.py
+++ b/src/gentropy/datasource/ontologies/__init__.py
@@ -1,3 +1,3 @@
-"""GWAS Catalog Data Source."""
+"""Biosample index data source."""
 
 from __future__ import annotations
diff --git a/src/gentropy/datasource/ontologies/cell_ontology.py b/src/gentropy/datasource/ontologies/cell_ontology.py
deleted file mode 100644
index 3ec2d7be4..000000000
--- a/src/gentropy/datasource/ontologies/cell_ontology.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Biosample index for Cell Ontology data source."""
-
-from __future__ import annotations
-
-from itertools import chain
-from typing import TYPE_CHECKING
-
-import pandas as pd
-import pyspark.sql.functions as f
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType
-
-import owlready2 as owl
-
-from gentropy.common.session import Session
-from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info
-
-if TYPE_CHECKING:
-    from pyspark.sql import DataFrame
-    from pyspark.sql.column import Column
-
-class CellOntologyBiosampleIndex:
-    """Biosample index dataset from Cell Ontology.
-    
-    Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the biosample index dataset.
-    """
-
-    @classmethod
-    def extract_celltypes_from_source(
-        cls: type[CellOntologyStudyIndex],
-        session: Session,
-        ontology_path: str,
-    ) -> DataFrame:
-        """Ingests Cell Ontology owo file and extracts cell types.
-
-        Args:
-            session (Session): Spark session.
-            ontology_path (str): Path to the Cell ontology owo file.
-
-        Returns:
-            BiosampleIndex: Parsed and annotated Cell Ontology biosample index table.
-        """
-        ontology_data = owl.get_ontology(ontology_path).load()
-        df = extract_ontology_info(ontology_data, "CL_", session, BiosampleIndex.get_schema())
-        
-        return BiosampleIndex(
-            _df=df,
-            _schema=BiosampleIndex.get_schema()
-            )
\ No newline at end of file
diff --git a/src/gentropy/datasource/ontologies/uberon.py b/src/gentropy/datasource/ontologies/uberon.py
deleted file mode 100644
index a59ce2df4..000000000
--- a/src/gentropy/datasource/ontologies/uberon.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""Biosample index for Uberon data source."""
-
-from __future__ import annotations
-
-from itertools import chain
-from typing import TYPE_CHECKING
-
-import pandas as pd
-import pyspark.sql.functions as f
-from pyspark.sql.types import IntegerType, StringType, StructField, StructType
-
-import owlready2 as owl
-
-from gentropy.common.session import Session
-from gentropy.dataset.biosample_index import BiosampleIndex
-from grn
-
-if TYPE_CHECKING:
-    from pyspark.sql import DataFrame
-    from pyspark.sql.column import Column
-
-class UberonBiosampleIndex:
-    """Biosample index dataset from Uberon.
-    
-    Cell type data is extracted from the Uberon (UBERON) https://obophenotype.github.io/uberon/ and used to define the tissues in the biosample index dataset.
-    """
-
-    @classmethod
-    def extract_tissue_from_source(
-        cls: type[UberonStudyIndex],
-        session: Session,
-        ontology_path: str,
-    ) -> DataFrame:
-        """Ingests Uberon owo file and extracts tissues.
-
-        Args:
-            session (Session): Spark session.
-            ontology_path (str): Path to the Uberon owo file.
-
-        Returns:
-            BiosampleIndex: Parsed and annotated Uberon biosample index table.
-        """
-        ontology_data = owl.get_ontology(ontology_path).load()
-        df = extract_ontology_info(ontology_data, "UBERON_", session, BiosampleIndex.get_schema())
-        
-        return BiosampleIndex(
-            _df=df,
-            _schema=BiosampleIndex.get_schema()
-            )
diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py
index d22470778..adf38c4da 100644
--- a/src/gentropy/datasource/ontologies/utils.py
+++ b/src/gentropy/datasource/ontologies/utils.py
@@ -1,106 +1,11 @@
 """Utility functions for Biosample ontology processing."""
-import owlready2
-from pyspark.sql import Row, SparkSession
+from pyspark.sql import Row, SparkSession, DataFrame
 from pyspark.sql.types import StructType, StringType, ArrayType
-from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce
+from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce, first
 from pyspark.sql.window import Window
 from functools import reduce
 from gentropy.dataset.biosample_index import BiosampleIndex
 
-
-def extract_ontology_info(
-    ontology : owlready2.namespace.Ontology,
-    spark : SparkSession,
-    schema : StructType
-) -> BiosampleIndex:
-    """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object.
-    NOT IN USE
-
-    Args:
-        ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon.
-        prefix (str): Prefix for the desired ontology terms.
-        session (Session): Spark session.
-
-    Returns:
-        BiosampleIndex: Parsed and annotated biosample index table.
-    """
-    data_list = []
-
-    # Iterate over all classes in the ontology
-    for cls in ontology.classes():
-        # Basic class information
-        cls_id = cls.name
-        # cls_code = cls.iri
-        cls_name = cls.label[0] if cls.label else None
-
-        # Extract descriptions
-        description = None
-        if hasattr(cls, 'IAO_0000115'):
-            description = cls.IAO_0000115.first() if cls.IAO_0000115 else None
-
-        # Extract dbXRefs
-        dbXRefs = []
-        if hasattr(cls, 'hasDbXref'):
-            dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref]
-
-        # Parent classes
-        parents = []
-        for parent in cls.is_a:
-            if parent is owl.Thing: 
-                continue  # Skip owlready2 Thing class, which is a top-level class
-            elif hasattr(parent, 'name'):
-                parent_id = parent.name
-                parents.append(parent_id)
-            elif hasattr(parent, 'property'):  # For restrictions
-                continue  # We skip restrictions in this simplified list
-
-        # Synonyms
-        synonyms = set()
-        if hasattr(cls, 'hasExactSynonym'):
-            synonyms.update(cls.hasExactSynonym)
-        if hasattr(cls, 'hasBroadSynonym'):
-            synonyms.update(cls.hasBroadSynonym)
-        if hasattr(cls, 'hasNarrowSynonym'):
-            synonyms.update(cls.hasNarrowSynonym)
-        if hasattr(cls, 'hasRelatedSynonym'):
-            synonyms.update(cls.hasRelatedSynonym)
-
-        # Children classes
-        children = [child.name for child in cls.subclasses()]
-
-        # Ancestors and descendants with Thing class filtered out
-        ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing]
-        descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')]
-
-        # Check if the class is deprecated
-        is_deprecated = False
-        if hasattr(cls, 'deprecated') and cls.deprecated:
-            is_deprecated = True
-
-        # Compile all information into a Row
-        entry = Row(
-            id=cls_id,
-            # code=cls_code,
-            name=cls_name,  
-            dbXRefs=dbXRefs,
-            description=description,
-            parents=parents,
-            synonyms=list(synonyms),
-            ancestors=ancestors,
-            descendants=descendants,
-            children=children,
-            ontology={"is_obsolete": is_deprecated}
-        )
-        
-        # Add to data list
-        data_list.append(entry)
-
-
-    # Create DataFrame directly from Rows
-    df = spark.createDataFrame(data_list, schema)
-    return df
-
-
 def extract_ontology_from_json(
     ontology_json : str,
     spark : SparkSession
@@ -173,8 +78,9 @@ def get_relationships(node):
     col("node.lbl").alias("biosampleName"),
     col("node.meta.definition.val").alias("description"),
     collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"),
-    collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"),
-    col("node.meta.deprecated").alias("deprecated"))
+    # col("node.meta.deprecated").alias("deprecated"),
+    collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"))
+    
     
     # Extract the relationships from the edges
     # Prepare relationship-specific DataFrames
@@ -198,17 +104,59 @@ def get_relationships(node):
     
     return final_df
 
-    def merge_biosample_indices(
+def merge_biosample_indices(
          biosample_indices: list[BiosampleIndex], 
-    ) -> BiosampleIndex:
-        """Merge a list of biosample indexes into a single biosample index.
-        Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken.
+    ) -> DataFrame:
+    """Merge several biosample indices into a single table with one row per ``biosampleId``.
+
+    When a biosample occurs in more than one index, single-valued columns keep the first non-null value and string-array columns keep the union of all values.
 
-        Args:
-            biosample_indexes (BiosampleIndex): Biosample indexes to merge.
+    Args:
+        biosample_indices (list[BiosampleIndex]): Biosample indices to merge. Plain Spark DataFrames are accepted as well.
 
-        Returns:
-            BiosampleIndex: Merged biosample index.
-        """
-        # Merge the DataFrames
-        merged_df = reduce(DataFrame.unionByName, biosample_indices)
\ No newline at end of file
+    Returns:
+        DataFrame: Merged biosample table (a Spark DataFrame, not wrapped in a ``BiosampleIndex``).
+
+    Raises:
+        ValueError: If no biosample index is provided.
+    """
+    if not biosample_indices:
+        raise ValueError("At least one biosample index is required for merging.")
+
+    def merge_lists(lists):
+        """Flatten a list of lists into a sorted list of unique, non-null items."""
+        return sorted(
+            {
+                item
+                for sublist in lists
+                if sublist is not None
+                for item in sublist
+                if item is not None
+            }
+        )
+
+    # Spark UDF wrapping merge_lists; its declared return type is array<string>.
+    merge_lists_udf = udf(merge_lists, ArrayType(StringType()))
+
+    # Unwrap BiosampleIndex objects so the union always operates on Spark DataFrames.
+    dataframes = [
+        index.df if isinstance(index, BiosampleIndex) else index
+        for index in biosample_indices
+    ]
+    merged_df = reduce(DataFrame.unionAll, dataframes)
+
+    # Pick the aggregation from the column type rather than the column name, so every string-array column is merged.
+    aggregations = []
+    for field in merged_df.schema.fields:
+        if field.name == "biosampleId":
+            continue  # Grouping key, not aggregated.
+        is_string_array = isinstance(field.dataType, ArrayType) and isinstance(
+            field.dataType.elementType, StringType
+        )
+        if is_string_array:
+            aggregations.append(merge_lists_udf(collect_list(field.name)).alias(field.name))
+        else:
+            aggregations.append(first(field.name, ignorenulls=True).alias(field.name))
+
+    # agg() takes Column expressions as positional arguments; its dict form only accepts {column name: function name}.
+    return merged_df.groupBy("biosampleId").agg(*aggregations)
diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py
index 3e926078d..d19f012e4 100644
--- a/src/gentropy/study_validation.py
+++ b/src/gentropy/study_validation.py
@@ -7,6 +7,7 @@
 from gentropy.common.session import Session
 from gentropy.dataset.gene_index import GeneIndex
 from gentropy.dataset.study_index import StudyIndex
+from gentropy.dataset.biosample_index import BiosampleIndex
 
 
 class StudyValidationStep:
@@ -34,12 +35,14 @@ def __init__(
             study_index_path (list[str]): Path to study index file.
             target_index_path (str): Path to target index file.
             disease_index_path (str): Path to disease index file.
+            biosample_index_path (str): Path to biosample index file.
             valid_study_index_path (str): Path to write the valid records.
             invalid_study_index_path (str): Path to write the output file.
             invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyQualityCheck` (e.g. ['DUPLICATED_STUDY']).
         """
         # Reading datasets:
         target_index = GeneIndex.from_parquet(session, target_index_path)
+        biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path)
         # Reading disease index and pre-process.
         # This logic does not belong anywhere, but gentorpy has no disease dataset yet.
         disease_index = (
@@ -56,7 +59,6 @@ def __init__(
             .withColumn("efo", f.coalesce(f.col("efo"), f.col("diseaseId")))
         )
         study_index = StudyIndex.from_parquet(session, list(study_index_path))
-        biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path)
 
         # Running validation:
         study_index_with_qc = (
diff --git a/tests/gentropy/data_samples/cell_ontology_sample.json b/tests/gentropy/data_samples/cell_ontology_sample.json
new file mode 100644
index 000000000..5a774f473
--- /dev/null
+++ b/tests/gentropy/data_samples/cell_ontology_sample.json
@@ -0,0 +1,274 @@
+{
+  "graphs" : [ {
+    "id" : "http://purl.obolibrary.org/obo/cl.json",
+    "meta" : {
+      "basicPropertyValues" : [ {
+        "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
+        "val" : "http://purl.obolibrary.org/obo/CL_0000000"
+      }, {
+        "pred" : "http://purl.org/dc/elements/1.1/description",
+        "val" : "An ontology of cell types."
+      }, {
+        "pred" : "http://purl.org/dc/elements/1.1/title",
+        "val" : "Cell Ontology"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0001-5208-3432"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0001-9114-8737"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0001-9990-8331"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0002-2244-7917"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0002-6601-2165"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0002-7073-9172"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0002-8688-6599"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0002-9900-7880"
+      }, {
+        "pred" : "http://purl.org/dc/terms/contributor",
+        "val" : "https://orcid.org/0000-0003-1980-3228"
+      }, {
+        "pred" : "http://purl.org/dc/terms/license",
+        "val" : "http://creativecommons.org/licenses/by/4.0/"
+      }, {
+        "pred" : "http://www.w3.org/2000/01/rdf-schema#comment",
+        "val" : "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo."
+      }, {
+        "pred" : "http://www.w3.org/2002/07/owl#versionInfo",
+        "val" : "2024-08-16"
+      } ],
+      "version" : "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json"
+    },
+    "nodes" : [ {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000653",
+      "lbl" : "podocyte",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
+          "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ]
+        },
+        "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
+        "synonyms" : [ {
+          "pred" : "hasBroadSynonym",
+          "val" : "epithelial cell of visceral layer of glomerular capsule",
+          "xrefs" : [ "FMA:70967" ]
+        }, {
+          "pred" : "hasExactSynonym",
+          "val" : "glomerular podocyte",
+          "xrefs" : [ "FMA:70967" ]
+        }, {
+          "pred" : "hasExactSynonym",
+          "val" : "glomerular visceral epithelial cell"
+        }, {
+          "pred" : "hasExactSynonym",
+          "val" : "kidney podocyte"
+        }, {
+          "pred" : "hasExactSynonym",
+          "val" : "renal podocyte"
+        } ],
+        "xrefs" : [ {
+          "val" : "BTO:0002295"
+        }, {
+          "val" : "FMA:70967"
+        }, {
+          "val" : "ZFA:0009285"
+        } ],
+        "basicPropertyValues" : [ {
+          "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
+          "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+        }, {
+          "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso",
+          "val" : "https://github.com/obophenotype/cell-ontology/issues/1460"
+        } ]
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000654",
+      "lbl" : "primary oocyte",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "A primary oocyte is an oocyte that has not completed female meosis I.",
+          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        },
+        "subsets" : [ "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
+        "synonyms" : [ {
+          "pred" : "hasRelatedSynonym",
+          "val" : "primary oogonium"
+        } ],
+        "xrefs" : [ {
+          "val" : "BTO:0000512"
+        }, {
+          "val" : "FMA:18645"
+        } ],
+        "basicPropertyValues" : [ {
+          "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
+          "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+        } ]
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000655",
+      "lbl" : "secondary oocyte",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "A secondary oocyte is an oocyte that has not completed meiosis II.",
+          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        },
+        "synonyms" : [ {
+          "pred" : "hasRelatedSynonym",
+          "val" : "primary oogonium"
+        } ],
+        "xrefs" : [ {
+          "val" : "BTO:0003094"
+        }, {
+          "val" : "FMA:18646"
+        } ]
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000656",
+      "lbl" : "primary spermatocyte",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.",
+          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        },
+        "xrefs" : [ {
+          "val" : "BTO:0001115"
+        }, {
+          "val" : "CALOHA:TS-2194"
+        }, {
+          "val" : "FMA:72292"
+        } ]
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000657",
+      "lbl" : "secondary spermatocyte",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.",
+          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        },
+        "xrefs" : [ {
+          "val" : "BTO:0000709"
+        }, {
+          "val" : "CALOHA:TS-2195"
+        }, {
+          "val" : "FBbt:00004941"
+        }, {
+          "val" : "FMA:72293"
+        } ]
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000658",
+      "lbl" : "cuticle secreting cell",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "An epithelial cell that secretes cuticle.",
+          "xrefs" : [ "GOC:tfm" ]
+        }
+      }
+    }, {
+      "id" : "http://purl.obolibrary.org/obo/CL_0000659",
+      "lbl" : "eggshell secreting cell",
+      "type" : "CLASS",
+      "meta" : {
+        "definition" : {
+          "val" : "An extracellular matrix secreting cell that secretes eggshell.",
+          "xrefs" : [ "GOC:tfm" ]
+        }
+      }
+    } , {
+      "id" : "http://purl.obolibrary.org/obo/CL_1000451",
+      "lbl" : "obsolete epithelial cell of visceral layer of glomerular capsule",
+      "type" : "CLASS",
+      "meta" : {
+        "basicPropertyValues" : [ {
+          "pred" : "http://purl.obolibrary.org/obo/IAO_0100001",
+          "val" : "http://purl.obolibrary.org/obo/CL_0000653"
+        } ],
+        "deprecated" : true
+      }
+    } ],
+    "edges" : [
+      {
+        "sub" : "http://purl.obolibrary.org/obo/UBERON_0005751",
+        "pred" : "http://purl.obolibrary.org/obo/BFO_0000051",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },
+      {
+        "sub" : "http://purl.obolibrary.org/obo/GO_1903210",
+        "pred" : "http://purl.obolibrary.org/obo/BFO_0000066",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },
+      {
+        "sub" : "http://purl.obolibrary.org/obo/GO_0090521",
+        "pred" : "http://purl.obolibrary.org/obo/RO_0002565",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },
+      {
+        "sub" : "http://purl.obolibrary.org/obo/GO_0072015",
+        "pred" : "http://purl.obolibrary.org/obo/RO_0002296",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      }, 
+      {
+        "sub" : "http://purl.obolibrary.org/obo/CL_4030008",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },{
+        "sub" : "http://purl.obolibrary.org/obo/CL_0002525",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },{
+        "sub" : "http://purl.obolibrary.org/obo/CL_0002523",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
+      },{
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0002522"
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_1000450"
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+        "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751"
+      },
+      {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000023",
+        "meta" : {
+          "basicPropertyValues" : [ {
+            "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+            "val" : "true"
+          } ]
+        }
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
+        "pred" : "http://purl.obolibrary.org/obo/CL_4030044",
+        "obj" : "http://purl.obolibrary.org/obo/GO_0007147"
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
+        "pred" : "http://purl.obolibrary.org/obo/RO_0002202",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000654"
+      }
+    ]
+  }
+]}
diff --git a/tests/gentropy/data_samples/uberon_sample.json b/tests/gentropy/data_samples/uberon_sample.json
new file mode 100644
index 000000000..b06d652ef
--- /dev/null
+++ b/tests/gentropy/data_samples/uberon_sample.json
@@ -0,0 +1,675 @@
+{
+    "graphs" : [ {
+      "id" : "http://purl.obolibrary.org/obo/uberon.json",
+      "meta" : {
+        "basicPropertyValues" : [ {
+          "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
+          "val" : "http://purl.obolibrary.org/obo/UBERON_0000104"
+        }, {
+          "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
+          "val" : "http://purl.obolibrary.org/obo/UBERON_0001062"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0001-5839-6798"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0001-7972-3866"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0001-9114-8737"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0002-1810-9886"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0002-6601-2165"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0002-7356-1779"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0002-9611-1279"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0003-3162-7490"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/creator",
+          "val" : "https://orcid.org/0000-0003-3308-6245"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/description",
+          "val" : "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data."
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/publisher",
+          "val" : "http://uberon.org"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://dbpedia.org"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://palaeos.com"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://www.brain-map.org"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://braininfo.rprc.washington.edu/"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://en.wikipedia.org/wiki/"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/aao.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/aba.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/aeo.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/bila.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/bto.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/caro.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/cl.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/ehdaa2.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/emapa.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/fbbt.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/fma.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/go.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/hp.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/ma.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/mp.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/tao.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/vhog.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/vsao.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/wbbt.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/xao.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://purl.obolibrary.org/obo/zfa.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://uri.neuinfo.org/nif/nifstd"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://www.e-lico.eu/public/kupo/kupo.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://www.ebi.ac.uk/efo/efo.owl"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0030229073 Invertebrate Zoology, Barnes"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0073040584 Vertebrates, Kardong"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0443065837 Human embryology, Larsen"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:0683400088 Stedman's Medical Dictionary"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:9780120749034 The laboratory rat"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "ISBN:9780878932504 Developmental Biology"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "MESH"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "PMID:11433360 Placental development: lessons from mouse mutants"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "aggregates AAO from 13:04:2012"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "aggregates TAO from 09:08:2012"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "aggregates VSAO from 16:07:2012"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "http://wiki.phenotypercn.org/wiki/August_2012_Notes"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/source",
+          "val" : "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0"
+        }, {
+          "pred" : "http://purl.org/dc/elements/1.1/title",
+          "val" : "Uber-anatomy ontology"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://github.com/orgs/pato-ontology/teams/pato-community"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-5889-4463"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-7433-0086"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-7476-6306"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-7920-5321"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-7958-3701"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-8682-8754"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-9107-0714"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0001-9990-8331"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-0819-0473"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-0956-8634"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-1112-5832"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-1572-1316"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-1604-3078"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-1615-2899"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-2061-091X"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-2244-7917"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-3437-3329"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-3467-2636"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-3734-1859"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-5111-7263"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-6490-7723"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-7073-9172"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-8406-3871"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-8455-3213"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-8688-6599"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-9415-5104"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-9818-3030"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0002-9900-7880"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0003-1980-3228"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0003-2105-2283"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0003-2338-2550"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0003-3691-0324"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://orcid.org/0000-0003-4423-4370"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q11695472"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q23809253"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q4964264"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q54985720"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q6983890"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q7650732"
+        }, {
+          "pred" : "http://purl.org/dc/terms/contributor",
+          "val" : "https://www.wikidata.org/wiki/Q85793053"
+        }, {
+          "pred" : "http://purl.org/dc/terms/isReferencedBy",
+          "val" : "http://genomebiology.com/2012/13/1/R5"
+        }, {
+          "pred" : "http://purl.org/dc/terms/isReferencedBy",
+          "val" : "http://www.ncbi.nlm.nih.gov/pubmed/22293552"
+        }, {
+          "pred" : "http://purl.org/dc/terms/license",
+          "val" : "http://creativecommons.org/licenses/by/3.0/"
+        }, {
+          "pred" : "http://usefulinc.com/ns/doap#GitRepository",
+          "val" : "https://github.com/cmungall/uberon/"
+        }, {
+          "pred" : "http://usefulinc.com/ns/doap#SVNRepository",
+          "val" : "https://obo.svn.sourceforge.net/svnroot/obo/uberon/"
+        }, {
+          "pred" : "http://usefulinc.com/ns/doap#bug-database",
+          "val" : "https://github.com/obophenotype/uberon/issues/"
+        }, {
+          "pred" : "http://usefulinc.com/ns/doap#mailing-list",
+          "val" : "https://lists.sourceforge.net/lists/listinfo/obo-anatomy"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#default-namespace",
+          "val" : "uberon"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion",
+          "val" : "1.2"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "AEO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "BILA"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "BSPO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "CARO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "GO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "OG"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+          "val" : "VSAO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+          "val" : "EHDAA"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+          "val" : "EV"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+          "val" : "NCIT"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+          "val" : "OGES"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+          "val" : "SCTID"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
+          "val" : "BFO"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
+          "val" : "VHOG"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "AAO part_of NCBITaxon:8292"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "DHBA part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "EHDAA2 part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "EMAPA part_of NCBITaxon:10090"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "FBdv part_of NCBITaxon:7227"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "FMA part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "HAO part_of NCBITaxon:7399"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "HBA part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "HsapDv part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "KUPO part_of NCBITaxon:9606"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "MA part_of NCBITaxon:10090"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "MFO part_of NCBITaxon:8089"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "MmusDv part_of NCBITaxon:10090"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "OlatDv part_of NCBITaxon:8089"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "PBA part_of NCBITaxon:9443"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "SPD part_of NCBITaxon:6893"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "TADS part_of NCBITaxon:6939"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "TAO part_of NCBITaxon:32443"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "TGMA part_of NCBITaxon:44484"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "WBbt part_of NCBITaxon:6237"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "WBls part_of NCBITaxon:6237"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "XAO part_of NCBITaxon:8353"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "ZFA part_of NCBITaxon:7954"
+        }, {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+          "val" : "ZFS part_of NCBITaxon:7954"
+        }, {
+          "pred" : "http://www.w3.org/2000/01/rdf-schema#comment",
+          "val" : "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found."
+        }, {
+          "pred" : "http://www.w3.org/2002/07/owl#versionInfo",
+          "val" : "2024-09-03"
+        }, {
+          "pred" : "http://xmlns.com/foaf/0.1/homepage",
+          "val" : "http://uberon.org"
+        } ],
+        "version" : "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json"
+      },
+    "nodes" : [{
+        "id" : "http://purl.obolibrary.org/obo/CL_1001593",
+        "lbl" : "parathyroid glandular cell",
+        "type" : "CLASS",
+        "meta" : {
+          "definition" : {
+            "val" : "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.",
+            "xrefs" : [ "HPA:HPA", "NPX:PDR" ]
+          },
+          "synonyms" : [ {
+            "pred" : "hasRelatedSynonym",
+            "val" : "parathyroid gland glandular cell",
+            "xrefs" : [ "CALOHA:TS-1279" ]
+          }, {
+            "pred" : "hasRelatedSynonym",
+            "val" : "parathyroid gland glandular cells",
+            "xrefs" : [ "CALOHA:TS-1279" ]
+          } ],
+          "xrefs" : [ {
+            "val" : "CALOHA:TS-1279"
+          } ]
+        }
+      }, {
+        "id" : "http://purl.obolibrary.org/obo/CL_1001595",
+        "lbl" : "rectum glandular cell",
+        "type" : "CLASS",
+        "meta" : {
+          "definition" : {
+            "val" : "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.",
+            "xrefs" : [ "NPX:PDR" ]
+          },
+          "synonyms" : [ {
+            "pred" : "hasRelatedSynonym",
+            "val" : "rectal glandular cell",
+            "xrefs" : [ "CALOHA:TS-1281" ]
+          }, {
+            "pred" : "hasRelatedSynonym",
+            "val" : "rectum glandular cells",
+            "xrefs" : [ "CALOHA:TS-1281" ]
+          } ],
+          "xrefs" : [ {
+            "val" : "CALOHA:TS-1281"
+          } ]
+        }
+      }, {
+        "id" : "http://purl.obolibrary.org/obo/CL_1001596",
+        "lbl" : "salivary gland glandular cell",
+        "type" : "CLASS",
+        "meta" : {
+          "definition" : {
+            "val" : "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.",
+            "xrefs" : [ "HPA:HPA", "NPX:PDR" ]
+          },
+          "synonyms" : [ {
+            "pred" : "hasRelatedSynonym",
+            "val" : "salivary gland glandular cells",
+            "xrefs" : [ "CALOHA:TS-1282" ]
+          } ],
+          "xrefs" : [ {
+            "val" : "CALOHA:TS-1282"
+          } ]
+        }
+      },
+      {
+        "id" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "lbl" : "podocyte",
+        "type" : "CLASS",
+        "meta" : {
+          "definition" : {
+            "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
+            "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ]
+          },
+          "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
+          "synonyms" : [ {
+            "pred" : "hasBroadSynonym",
+            "val" : "epithelial cell of visceral layer of glomerular capsule",
+            "xrefs" : [ "FMA:70967" ]
+          }, {
+            "pred" : "hasExactSynonym",
+            "val" : "glomerular podocyte",
+            "xrefs" : [ "FMA:70967" ]
+          }, {
+            "pred" : "hasExactSynonym",
+            "val" : "glomerular visceral epithelial cell"
+          }, {
+            "pred" : "hasExactSynonym",
+            "val" : "kidney podocyte"
+          }, {
+            "pred" : "hasExactSynonym",
+            "val" : "renal podocyte"
+          } ],
+          "xrefs" : [ {
+            "val" : "BTO:0002295"
+          }, {
+            "val" : "FMA:70967"
+          } ],
+          "basicPropertyValues" : [ {
+            "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
+            "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+          }, {
+            "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso",
+            "val" : "https://github.com/obophenotype/cell-ontology/issues/1460"
+          } ]
+        }
+      }],
+      "edges" : [
+        {
+            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
+            "pred" : "is_a",
+            "obj" : "http://purl.obolibrary.org/obo/CL_0000150"
+          }, {
+            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
+            "pred" : "is_a",
+            "obj" : "http://purl.obolibrary.org/obo/CL_0000152"
+          }, {
+            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
+            "pred" : "is_a",
+            "obj" : "http://purl.obolibrary.org/obo/CL_0002251"
+          }, {
+            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
+            "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+            "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
+          }, {
+            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
+            "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+            "obj" : "http://purl.obolibrary.org/obo/UBERON_0004809"
+          }, {
+      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+      "pred" : "is_a",
+      "obj" : "http://purl.obolibrary.org/obo/CL_0000622",
+      "meta" : {
+        "basicPropertyValues" : [ {
+          "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+          "val" : "true"
+        } ]
+      }
+    }, {
+      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+      "pred" : "is_a",
+      "obj" : "http://purl.obolibrary.org/obo/CL_1001596"
+    }, {
+      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+      "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+      "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
+    },  {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_0000622",
+        "meta" : {
+          "basicPropertyValues" : [ {
+            "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+            "val" : "true"
+          } ]
+        }
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_1001596"
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
+        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+        "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
+      },
+      {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "pred" : "is_a",
+        "obj" : "http://purl.obolibrary.org/obo/CL_1000450"
+      }, {
+        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
+        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
+        "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751"
+      }
+    ]
+    }
+    ]
+}
diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py
index f7d85e230..a9221569e 100644
--- a/tests/gentropy/dataset/test_biosample_index.py
+++ b/tests/gentropy/dataset/test_biosample_index.py
@@ -10,7 +10,7 @@
 import json
 
 from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import extract_ontology_from_json
+from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
 
 
 def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
@@ -19,13 +19,15 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
 
 
 
-spark2 = SparkSession.builder \
+spark = SparkSession.builder \
     .master("local[*]") \
     .appName("LocalOntologyIndexing") \
     .getOrCreate()
 
+ontology_json1 = "file:////home/alegbe/repos/gentropy/tests/gentropy/data_samples/nephron-minimal.json"
+ontology_json2 = "file://///home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology_dummy.json"
 
-ontology_json = 'file:///home/alegbe/cl.json'
-# ontology_json = 'file:///home/alegbe/uberon.json'
+df1 = extract_ontology_from_json(ontology_json1, spark)
+df2 = extract_ontology_from_json(ontology_json2, spark)
 
-df = extract_ontology_from_json(ontology_json, spark2)
\ No newline at end of file
+df_merged = merge_biosample_indices([df1, df2])

From 33ebf58c713d84c5d0603ce0504fff419a624f2f Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 16 Sep 2024 13:31:58 +0000
Subject: [PATCH 10/22] Tests and docs

---
 .../datasources/ontologies/_cell_ontology.md       |  5 +++++
 docs/python_api/datasources/ontologies/_uberon.md  |  5 +++++
 docs/python_api/steps/biosample_index_step.md      |  5 +++++
 .../ontologies/test_biosample_ontology.py          | 14 ++++++++++++++
 4 files changed, 29 insertions(+)
 create mode 100644 docs/python_api/datasources/ontologies/_cell_ontology.md
 create mode 100644 docs/python_api/datasources/ontologies/_uberon.md
 create mode 100644 docs/python_api/steps/biosample_index_step.md
 create mode 100644 tests/gentropy/datasource/ontologies/test_biosample_ontology.py

diff --git a/docs/python_api/datasources/ontologies/_cell_ontology.md b/docs/python_api/datasources/ontologies/_cell_ontology.md
new file mode 100644
index 000000000..5798e032b
--- /dev/null
+++ b/docs/python_api/datasources/ontologies/_cell_ontology.md
@@ -0,0 +1,5 @@
+---
+title: Cell Ontology
+---
+
+The [Cell Ontology](http://www.obofoundry.org/ontology/cl.html) is a structured controlled vocabulary for cell types. It is used to annotate cell types in single-cell RNA-seq data and other omics data.
diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/ontologies/_uberon.md
new file mode 100644
index 000000000..62ef3e96f
--- /dev/null
+++ b/docs/python_api/datasources/ontologies/_uberon.md
@@ -0,0 +1,5 @@
+---
+title: Uberon
+---
+
+The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology. 
diff --git a/docs/python_api/steps/biosample_index_step.md b/docs/python_api/steps/biosample_index_step.md
new file mode 100644
index 000000000..d8f7abbb4
--- /dev/null
+++ b/docs/python_api/steps/biosample_index_step.md
@@ -0,0 +1,5 @@
+---
+title: biosample_index
+---
+
+::: gentropy.biosample_index.BiosampleIndexStep
diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
new file mode 100644
index 000000000..477272a5d
--- /dev/null
+++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
@@ -0,0 +1,14 @@
+"""Tests for study index dataset from FinnGen."""
+
+from __future__ import annotations
+
+from pyspark.sql import SparkSession
+from pyspark.sql import types as t
+
+from gentropy.dataset.study_index import BiosampleIndex
+from gentropy.datasource.ontologies.utils import extract_ontology_from_json
+
+
+def test_biosample_index_from_source(spark: SparkSession) -> None:
+    """Test biosample index from source."""
+    assert isinstance(extract_ontology_from_json(), BiosampleIndex)

From 26a429539b3c4d770b5bcb912bdc8ad6f1dc82ee Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 16 Sep 2024 14:07:07 +0000
Subject: [PATCH 11/22] Updating tests

---
 .../datasource/gwas_catalog/__init__.py       |  3 ++
 .../gentropy/dataset/test_biosample_index.py  | 14 -------
 .../ontologies/test_biosample_ontology.py     | 41 +++++++++++++++++--
 3 files changed, 41 insertions(+), 17 deletions(-)
 create mode 100644 src/gentropy/datasource/gwas_catalog/__init__.py

diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/gwas_catalog/__init__.py
new file mode 100644
index 000000000..d12240a6e
--- /dev/null
+++ b/src/gentropy/datasource/gwas_catalog/__init__.py
@@ -0,0 +1,3 @@
+"""GWAS Catalog index data source."""
+
+from __future__ import annotations
diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py
index a9221569e..60c89d703 100644
--- a/tests/gentropy/dataset/test_biosample_index.py
+++ b/tests/gentropy/dataset/test_biosample_index.py
@@ -17,17 +17,3 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
     """Test biosample index creation with mock biosample index."""
     assert isinstance(mock_biosample_index, BiosampleIndex)
 
-
-
-spark = SparkSession.builder \
-    .master("local[*]") \
-    .appName("LocalOntologyIndexing") \
-    .getOrCreate()
-
-ontology_json1 = "file:////home/alegbe/repos/gentropy/tests/gentropy/data_samples/nephron-minimal.json"
-ontology_json2 = "file://///home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology_dummy.json"
-
-df1 = extract_ontology_from_json(ontology_json1, spark)
-df2 = extract_ontology_from_json(ontology_json2, spark)
-
-df_merged = merge_biosample_indices([df1, df2])
diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
index 477272a5d..af7d9e405 100644
--- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
+++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
@@ -2,13 +2,48 @@
 
 from __future__ import annotations
 
-from pyspark.sql import SparkSession
-from pyspark.sql import types as t
+from typing import TYPE_CHECKING
+
+import pytest
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as f
+
 
 from gentropy.dataset.study_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import extract_ontology_from_json
+from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
 
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
 
 def test_biosample_index_from_source(spark: SparkSession) -> None:
     """Test biosample index from source."""
     assert isinstance(extract_ontology_from_json(), BiosampleIndex)
+
+class TestOntologyParger:
+    """ Testing ontology parser."""
+
+    SAMPLE_CELL_ONTOLOGY_PATH = "tests/gentropy/data_samples/cell_ontology_sample.json"
+    SAMPLE_UBERON_PATH = "tests/gentropy/data_samples/uberon_sample.json"
+
+    def test_cell_ontology_parser(self) -> None:
+        """Test cell ontology parser."""
+        cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH)
+        assert isinstance(
+            cell_ontology, BiosampleIndex
+            ), "Cell ontology subset is not parsed correctly to BiosampleIndex."
+
+    def test_uberon_parser(self) -> None:
+        """Test uberon parser."""
+        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH)
+        assert isinstance(
+            uberon, BiosampleIndex
+            ), "Uberon subset is not parsed correctly to BiosampleIndex."
+
+    def test_merge_biosample_indices(self) -> None:
+        """Test merging of biosample indices."""
+        cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH)
+        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH)
+        merged = merge_biosample_indices(cell_ontology, uberon)
+        assert isinstance(
+            merged, BiosampleIndex
+            ), "Merging of biosample indices is not correct."
\ No newline at end of file

From 1c507e61b3bb06a7617a3e8c4b33fe85d85d545a Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Mon, 16 Sep 2024 14:11:40 +0000
Subject: [PATCH 12/22] Revert GWAS catalog file

---
 src/gentropy/datasource/gwas_catalog/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/gwas_catalog/__init__.py
index d12240a6e..544779b18 100644
--- a/src/gentropy/datasource/gwas_catalog/__init__.py
+++ b/src/gentropy/datasource/gwas_catalog/__init__.py
@@ -1,3 +1,3 @@
-"""GWAS Catalog index data source."""
+"""GWAS Catalog Data Source."""
 
 from __future__ import annotations

From 567d8e10fa32d0a458912c603d8c486eef312c93 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 17 Sep 2024 09:28:55 +0000
Subject: [PATCH 13/22] fix(biosample index): update to match pre-commit
 standards

---
 docs/python_api/datasources/_datasources.md   |    2 +-
 .../datasources/ontologies/_uberon.md         |    2 +-
 src/gentropy/biosample_index.py               |   11 +-
 src/gentropy/dataset/biosample_index.py       |   16 +-
 src/gentropy/dataset/study_index.py           |    3 +-
 src/gentropy/datasource/ontologies/utils.py   |  104 +-
 src/gentropy/study_validation.py              |    2 +-
 .../data_samples/cell_ontology_sample.json    |  609 ++++---
 .../gentropy/data_samples/uberon_sample.json  | 1550 ++++++++++-------
 .../gentropy/dataset/test_biosample_index.py  |   11 -
 .../ontologies/test_biosample_ontology.py     |   49 +-
 11 files changed, 1338 insertions(+), 1021 deletions(-)

diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
index f79f8137b..58b4bcd2b 100644
--- a/docs/python_api/datasources/_datasources.md
+++ b/docs/python_api/datasources/_datasources.md
@@ -41,4 +41,4 @@ This section contains information about the data source harmonisation tools avai
 ## Biological samples
 
 1. [Uberon](ontologies/_uberon.md)
-2. [Cell Ontology](ontologies/_cell_ontology.md)
\ No newline at end of file
+2. [Cell Ontology](ontologies/_cell_ontology.md)
diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/ontologies/_uberon.md
index 62ef3e96f..4bb47305a 100644
--- a/docs/python_api/datasources/ontologies/_uberon.md
+++ b/docs/python_api/datasources/ontologies/_uberon.md
@@ -2,4 +2,4 @@
 title: Uberon
 ---
 
-The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology. 
+The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology.
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index 11274d789..a4080fba1 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -2,8 +2,10 @@
 from __future__ import annotations
 
 from gentropy.common.session import Session
-from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
+from gentropy.datasource.ontologies.utils import (
+    extract_ontology_from_json,
+    merge_biosample_indices,
+)
 
 
 class BiosampleIndexStep:
@@ -29,8 +31,7 @@ def __init__(
         """
         cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark)
         uberon_index = extract_ontology_from_json(uberon_input_path, session.spark)
-        
+
         biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index])
-        
+
         biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path)
-        
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index b3735ca62..20cff34e8 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -2,24 +2,12 @@
 
 from __future__ import annotations
 
-import importlib.resources as pkg_resources
-import json
 from dataclasses import dataclass
-from enum import Enum
-from itertools import chain
 from typing import TYPE_CHECKING
 
-from pyspark.sql import functions as f
-from pyspark.sql.window import Window
-from functools import reduce
-
-from gentropy.assets import data
 from gentropy.common.schemas import parse_spark_schema
 from gentropy.dataset.dataset import Dataset
 
-
-from pyspark.sql import Column, DataFrame, Row
-
 if TYPE_CHECKING:
     from pyspark.sql.types import StructType
 
@@ -32,10 +20,10 @@ class BiosampleIndex(Dataset):
     """
 
     @classmethod
-    def get_schema(cls: type[StudyIndex]) -> StructType:
+    def get_schema(cls: type[BiosampleIndex]) -> StructType:
         """Provide the schema for the BiosampleIndex dataset.
 
         Returns:
             StructType: The schema of the BiosampleIndex dataset.
         """
-        return parse_spark_schema("biosample_index.json")
\ No newline at end of file
+        return parse_spark_schema("biosample_index.json")
diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
index 852f14d9e..e6e4d4dc3 100644
--- a/src/gentropy/dataset/study_index.py
+++ b/src/gentropy/dataset/study_index.py
@@ -19,8 +19,8 @@
     from pyspark.sql import Column, DataFrame
     from pyspark.sql.types import StructType
 
-    from gentropy.dataset.gene_index import GeneIndex
     from gentropy.dataset.biosample_index import BiosampleIndex
+    from gentropy.dataset.gene_index import GeneIndex
 
 
 class StudyQualityCheck(Enum):
@@ -444,4 +444,3 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu
         )
 
         return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
-
diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py
index adf38c4da..0c4215d09 100644
--- a/src/gentropy/datasource/ontologies/utils.py
+++ b/src/gentropy/datasource/ontologies/utils.py
@@ -1,17 +1,29 @@
 """Utility functions for Biosample ontology processing."""
-from pyspark.sql import Row, SparkSession, DataFrame
-from pyspark.sql.types import StructType, StringType, ArrayType
-from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce, first
-from pyspark.sql.window import Window
 from functools import reduce
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import (
+    array_distinct,
+    coalesce,
+    col,
+    collect_list,
+    collect_set,
+    explode_outer,
+    first,
+    regexp_replace,
+    udf,
+)
+from pyspark.sql.types import ArrayType, StringType
+from pyspark.sql.window import Window
+
 from gentropy.dataset.biosample_index import BiosampleIndex
 
+
 def extract_ontology_from_json(
     ontology_json : str,
     spark : SparkSession
 ) -> BiosampleIndex:
-    """
-    Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology.
+    """Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology.
 
     Args:
         ontology_json (str): Path to the JSON file containing the ontology information.
@@ -21,15 +33,30 @@ def extract_ontology_from_json(
         BiosampleIndex: Parsed and annotated biosample index table.
     """
 
-    def json_graph_traversal(df, node_col, link_col, traversal_type="ancestors"):
-        """
-        Traverse a graph represented in a DataFrame to find all ancestors or descendants.
+    def json_graph_traversal(
+        df : DataFrame,
+        node_col : str,
+        link_col: str,
+        traversal_type: str
+    ) -> DataFrame:
+        """Traverse a graph represented in a DataFrame to find all ancestors or descendants.
+
+        Args:
+            df (DataFrame): DataFrame containing the graph data.
+            node_col (str): Column name for the node.
+            link_col (str): Column name for the link.
+            traversal_type (str): Type of traversal - "ancestors" or "descendants".
+
+        Returns:
+            DataFrame: DataFrame with the result column added.
         """
         # Collect graph data as a map
         graph_map = df.select(node_col, link_col).rdd.collectAsMap()
         broadcasted_graph = spark.sparkContext.broadcast(graph_map)
 
-        def get_relationships(node):
+        def get_relationships(
+            node : str
+            ) -> list[str]:
             relationships = set()
             stack = [node]
             while stack:
@@ -80,8 +107,8 @@ def get_relationships(node):
     collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"),
     # col("node.meta.deprecated").alias("deprecated"),
     collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"))
-    
-    
+
+
     # Extract the relationships from the edges
     # Prepare relationship-specific DataFrames
     df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent")
@@ -100,44 +127,63 @@ def get_relationships(node):
     df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object")
 
     # Join the original DataFrame with the relationship DataFrame
-    final_df = transformed_df.join(df_with_relationships, ['biosampleId'], "left")
-    
-    return final_df
+    final_df = transformed_df.join(df_with_relationships, ["biosampleId"], "left")
+
+    return BiosampleIndex(
+        _df=final_df,
+        _schema=BiosampleIndex.get_schema()
+        )
 
 def merge_biosample_indices(
-         biosample_indices: list[BiosampleIndex], 
+    biosample_indices : list[BiosampleIndex]
     ) -> BiosampleIndex:
-    """Merge a list of biosample indexes into a single biosample index.
+    """Merge a list of biosample indices into a single biosample index.
+
     Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken.
 
     Args:
-        biosample_indexes (BiosampleIndex): Biosample indexes to merge.
+        biosample_indices (list[BiosampleIndex]): Biosample indices to merge.
 
     Returns:
         BiosampleIndex: Merged biosample index.
     """
-    
-    def merge_lists(lists):
-        """Merge a list of lists into a single list."""
-        return list(set([item for sublist in lists if sublist is not None for item in sublist]))
-    
+
+    def merge_lists(
+        lists : list[list[str]]
+        ) -> list[str]:
+        """Merge a list of lists into a single list.
+
+        Args:
+            lists (list[list[str]]): List of lists to merge.
+
+        Returns:
+            list[str]: Merged list.
+        """
+        return list({item for sublist in lists if sublist is not None for item in sublist})
+
     # Make a spark udf (user defined function) to merge lists
     merge_lists_udf = udf(merge_lists, ArrayType(StringType()))
 
+    # Extract the DataFrames from the BiosampleIndex objects
+    biosample_dfs = [biosample_index.df for biosample_index in biosample_indices]
+
     # Merge the DataFrames
-    merged_df = reduce(DataFrame.unionAll, biosample_indices)
-    
+    merged_df = reduce(DataFrame.unionAll, biosample_dfs)
+
     # Define dictionary of columns and corresponding aggregation functions
     # Currently this will take the first value for single values and merge lists for list values
     agg_funcs = {}
     for column in merged_df.columns:
-        if column != 'biosampleId':
-            if 'list' in column:  # Assuming column names that have 'list' need list merging
+        if column != "biosampleId":
+            if "list" in column:  # Assuming column names that have 'list' need list merging
                 agg_funcs[column] = merge_lists_udf(collect_list(column)).alias(column)
             else:
                 agg_funcs[column] = first(column, ignorenulls=True).alias(column)
 
     # Group by biosampleId and aggregate the columns
-    merged_df = merged_df.groupBy('biosampleId').agg(agg_funcs)
+    merged_df = merged_df.groupBy("biosampleId").agg(agg_funcs)
 
-    return merged_df
\ No newline at end of file
+    return BiosampleIndex(
+        _df=merged_df,
+        _schema=BiosampleIndex.get_schema()
+        )
diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py
index d19f012e4..0e4c22e6b 100644
--- a/src/gentropy/study_validation.py
+++ b/src/gentropy/study_validation.py
@@ -5,9 +5,9 @@
 from pyspark.sql import functions as f
 
 from gentropy.common.session import Session
+from gentropy.dataset.biosample_index import BiosampleIndex
 from gentropy.dataset.gene_index import GeneIndex
 from gentropy.dataset.study_index import StudyIndex
-from gentropy.dataset.biosample_index import BiosampleIndex
 
 
 class StudyValidationStep:
diff --git a/tests/gentropy/data_samples/cell_ontology_sample.json b/tests/gentropy/data_samples/cell_ontology_sample.json
index 5a774f473..5e73bfdee 100644
--- a/tests/gentropy/data_samples/cell_ontology_sample.json
+++ b/tests/gentropy/data_samples/cell_ontology_sample.json
@@ -1,274 +1,351 @@
 {
-  "graphs" : [ {
-    "id" : "http://purl.obolibrary.org/obo/cl.json",
-    "meta" : {
-      "basicPropertyValues" : [ {
-        "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
-        "val" : "http://purl.obolibrary.org/obo/CL_0000000"
-      }, {
-        "pred" : "http://purl.org/dc/elements/1.1/description",
-        "val" : "An ontology of cell types."
-      }, {
-        "pred" : "http://purl.org/dc/elements/1.1/title",
-        "val" : "Cell Ontology"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0001-5208-3432"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0001-9114-8737"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0001-9990-8331"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0002-2244-7917"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0002-6601-2165"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0002-7073-9172"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0002-8688-6599"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0002-9900-7880"
-      }, {
-        "pred" : "http://purl.org/dc/terms/contributor",
-        "val" : "https://orcid.org/0000-0003-1980-3228"
-      }, {
-        "pred" : "http://purl.org/dc/terms/license",
-        "val" : "http://creativecommons.org/licenses/by/4.0/"
-      }, {
-        "pred" : "http://www.w3.org/2000/01/rdf-schema#comment",
-        "val" : "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo."
-      }, {
-        "pred" : "http://www.w3.org/2002/07/owl#versionInfo",
-        "val" : "2024-08-16"
-      } ],
-      "version" : "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json"
-    },
-    "nodes" : [ {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000653",
-      "lbl" : "podocyte",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
-          "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ]
+  "graphs": [
+    {
+      "id": "http://purl.obolibrary.org/obo/cl.json",
+      "meta": {
+        "basicPropertyValues": [
+          {
+            "pred": "http://purl.obolibrary.org/obo/IAO_0000700",
+            "val": "http://purl.obolibrary.org/obo/CL_0000000"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/description",
+            "val": "An ontology of cell types."
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/title",
+            "val": "Cell Ontology"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-5208-3432"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-9114-8737"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-9990-8331"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-2244-7917"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-6601-2165"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-7073-9172"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-8688-6599"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-9900-7880"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-1980-3228"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/license",
+            "val": "http://creativecommons.org/licenses/by/4.0/"
+          },
+          {
+            "pred": "http://www.w3.org/2000/01/rdf-schema#comment",
+            "val": "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo."
+          },
+          {
+            "pred": "http://www.w3.org/2002/07/owl#versionInfo",
+            "val": "2024-08-16"
+          }
+        ],
+        "version": "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json"
+      },
+      "nodes": [
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000653",
+          "lbl": "podocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
+              "xrefs": ["GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829"]
+            },
+            "subsets": [
+              "http://purl.obolibrary.org/obo/cl#cellxgene_subset",
+              "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas"
+            ],
+            "synonyms": [
+              {
+                "pred": "hasBroadSynonym",
+                "val": "epithelial cell of visceral layer of glomerular capsule",
+                "xrefs": ["FMA:70967"]
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "glomerular podocyte",
+                "xrefs": ["FMA:70967"]
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "glomerular visceral epithelial cell"
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "kidney podocyte"
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "renal podocyte"
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "BTO:0002295"
+              },
+              {
+                "val": "FMA:70967"
+              },
+              {
+                "val": "ZFA:0009285"
+              }
+            ],
+            "basicPropertyValues": [
+              {
+                "pred": "http://purl.obolibrary.org/obo/RO_0002175",
+                "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+              },
+              {
+                "pred": "http://www.w3.org/2000/01/rdf-schema#seeAlso",
+                "val": "https://github.com/obophenotype/cell-ontology/issues/1460"
+              }
+            ]
+          }
         },
-        "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
-        "synonyms" : [ {
-          "pred" : "hasBroadSynonym",
-          "val" : "epithelial cell of visceral layer of glomerular capsule",
-          "xrefs" : [ "FMA:70967" ]
-        }, {
-          "pred" : "hasExactSynonym",
-          "val" : "glomerular podocyte",
-          "xrefs" : [ "FMA:70967" ]
-        }, {
-          "pred" : "hasExactSynonym",
-          "val" : "glomerular visceral epithelial cell"
-        }, {
-          "pred" : "hasExactSynonym",
-          "val" : "kidney podocyte"
-        }, {
-          "pred" : "hasExactSynonym",
-          "val" : "renal podocyte"
-        } ],
-        "xrefs" : [ {
-          "val" : "BTO:0002295"
-        }, {
-          "val" : "FMA:70967"
-        }, {
-          "val" : "ZFA:0009285"
-        } ],
-        "basicPropertyValues" : [ {
-          "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
-          "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-        }, {
-          "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso",
-          "val" : "https://github.com/obophenotype/cell-ontology/issues/1460"
-        } ]
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000654",
-      "lbl" : "primary oocyte",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "A primary oocyte is an oocyte that has not completed female meosis I.",
-          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000654",
+          "lbl": "primary oocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "A primary oocyte is an oocyte that has not completed female meosis I.",
+              "xrefs": ["GOC:tfm", "ISBN:0721662544"]
+            },
+            "subsets": [
+              "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas"
+            ],
+            "synonyms": [
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "primary oogonium"
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "BTO:0000512"
+              },
+              {
+                "val": "FMA:18645"
+              }
+            ],
+            "basicPropertyValues": [
+              {
+                "pred": "http://purl.obolibrary.org/obo/RO_0002175",
+                "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+              }
+            ]
+          }
         },
-        "subsets" : [ "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
-        "synonyms" : [ {
-          "pred" : "hasRelatedSynonym",
-          "val" : "primary oogonium"
-        } ],
-        "xrefs" : [ {
-          "val" : "BTO:0000512"
-        }, {
-          "val" : "FMA:18645"
-        } ],
-        "basicPropertyValues" : [ {
-          "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
-          "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-        } ]
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000655",
-      "lbl" : "secondary oocyte",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "A secondary oocyte is an oocyte that has not completed meiosis II.",
-          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000655",
+          "lbl": "secondary oocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "A secondary oocyte is an oocyte that has not completed meiosis II.",
+              "xrefs": ["GOC:tfm", "ISBN:0721662544"]
+            },
+            "synonyms": [
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "primary oogonium"
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "BTO:0003094"
+              },
+              {
+                "val": "FMA:18646"
+              }
+            ]
+          }
         },
-        "synonyms" : [ {
-          "pred" : "hasRelatedSynonym",
-          "val" : "primary oogonium"
-        } ],
-        "xrefs" : [ {
-          "val" : "BTO:0003094"
-        }, {
-          "val" : "FMA:18646"
-        } ]
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000656",
-      "lbl" : "primary spermatocyte",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.",
-          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000656",
+          "lbl": "primary spermatocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.",
+              "xrefs": ["GOC:tfm", "ISBN:0721662544"]
+            },
+            "xrefs": [
+              {
+                "val": "BTO:0001115"
+              },
+              {
+                "val": "CALOHA:TS-2194"
+              },
+              {
+                "val": "FMA:72292"
+              }
+            ]
+          }
         },
-        "xrefs" : [ {
-          "val" : "BTO:0001115"
-        }, {
-          "val" : "CALOHA:TS-2194"
-        }, {
-          "val" : "FMA:72292"
-        } ]
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000657",
-      "lbl" : "secondary spermatocyte",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.",
-          "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ]
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000657",
+          "lbl": "secondary spermatocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.",
+              "xrefs": ["GOC:tfm", "ISBN:0721662544"]
+            },
+            "xrefs": [
+              {
+                "val": "BTO:0000709"
+              },
+              {
+                "val": "CALOHA:TS-2195"
+              },
+              {
+                "val": "FBbt:00004941"
+              },
+              {
+                "val": "FMA:72293"
+              }
+            ]
+          }
         },
-        "xrefs" : [ {
-          "val" : "BTO:0000709"
-        }, {
-          "val" : "CALOHA:TS-2195"
-        }, {
-          "val" : "FBbt:00004941"
-        }, {
-          "val" : "FMA:72293"
-        } ]
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000658",
-      "lbl" : "cuticle secreting cell",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "An epithelial cell that secretes cuticle.",
-          "xrefs" : [ "GOC:tfm" ]
-        }
-      }
-    }, {
-      "id" : "http://purl.obolibrary.org/obo/CL_0000659",
-      "lbl" : "eggshell secreting cell",
-      "type" : "CLASS",
-      "meta" : {
-        "definition" : {
-          "val" : "An extracellular matrix secreting cell that secretes eggshell.",
-          "xrefs" : [ "GOC:tfm" ]
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000658",
+          "lbl": "cuticle secreting cell",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "An epithelial cell that secretes cuticle.",
+              "xrefs": ["GOC:tfm"]
+            }
+          }
+        },
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000659",
+          "lbl": "eggshell secreting cell",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "An extracellular matrix secreting cell that secretes eggshell.",
+              "xrefs": ["GOC:tfm"]
+            }
+          }
+        },
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_1000451",
+          "lbl": "obsolete epithelial cell of visceral layer of glomerular capsule",
+          "type": "CLASS",
+          "meta": {
+            "basicPropertyValues": [
+              {
+                "pred": "http://purl.obolibrary.org/obo/IAO_0100001",
+                "val": "http://purl.obolibrary.org/obo/CL_0000653"
+              }
+            ],
+            "deprecated": true
+          }
         }
-      }
-    } , {
-      "id" : "http://purl.obolibrary.org/obo/CL_1000451",
-      "lbl" : "obsolete epithelial cell of visceral layer of glomerular capsule",
-      "type" : "CLASS",
-      "meta" : {
-        "basicPropertyValues" : [ {
-          "pred" : "http://purl.obolibrary.org/obo/IAO_0100001",
-          "val" : "http://purl.obolibrary.org/obo/CL_0000653"
-        } ],
-        "deprecated" : true
-      }
-    } ],
-    "edges" : [
-      {
-        "sub" : "http://purl.obolibrary.org/obo/UBERON_0005751",
-        "pred" : "http://purl.obolibrary.org/obo/BFO_0000051",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },
-      {
-        "sub" : "http://purl.obolibrary.org/obo/GO_1903210",
-        "pred" : "http://purl.obolibrary.org/obo/BFO_0000066",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },
-      {
-        "sub" : "http://purl.obolibrary.org/obo/GO_0090521",
-        "pred" : "http://purl.obolibrary.org/obo/RO_0002565",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },
-      {
-        "sub" : "http://purl.obolibrary.org/obo/GO_0072015",
-        "pred" : "http://purl.obolibrary.org/obo/RO_0002296",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      }, 
-      {
-        "sub" : "http://purl.obolibrary.org/obo/CL_4030008",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },{
-        "sub" : "http://purl.obolibrary.org/obo/CL_0002525",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },{
-        "sub" : "http://purl.obolibrary.org/obo/CL_0002523",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000653"
-      },{
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0002522"
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_1000450"
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-        "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751"
-      },
-      {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000023",
-        "meta" : {
-          "basicPropertyValues" : [ {
-            "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
-            "val" : "true"
-          } ]
+      ],
+      "edges": [
+        {
+          "sub": "http://purl.obolibrary.org/obo/UBERON_0005751",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000051",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/GO_1903210",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000066",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/GO_0090521",
+          "pred": "http://purl.obolibrary.org/obo/RO_0002565",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/GO_0072015",
+          "pred": "http://purl.obolibrary.org/obo/RO_0002296",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_4030008",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002525",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002523",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000653"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000653",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0002522"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000653",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_1000450"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000653",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0005751"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000655",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000023",
+          "meta": {
+            "basicPropertyValues": [
+              {
+                "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+                "val": "true"
+              }
+            ]
+          }
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000655",
+          "pred": "http://purl.obolibrary.org/obo/CL_4030044",
+          "obj": "http://purl.obolibrary.org/obo/GO_0007147"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000655",
+          "pred": "http://purl.obolibrary.org/obo/RO_0002202",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000654"
         }
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
-        "pred" : "http://purl.obolibrary.org/obo/CL_4030044",
-        "obj" : "http://purl.obolibrary.org/obo/GO_0007147"
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000655",
-        "pred" : "http://purl.obolibrary.org/obo/RO_0002202",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000654"
-      }
-    ]
-  }
-]}
+      ]
+    }
+  ]
+}
diff --git a/tests/gentropy/data_samples/uberon_sample.json b/tests/gentropy/data_samples/uberon_sample.json
index b06d652ef..7dedfa23c 100644
--- a/tests/gentropy/data_samples/uberon_sample.json
+++ b/tests/gentropy/data_samples/uberon_sample.json
@@ -1,675 +1,889 @@
 {
-    "graphs" : [ {
-      "id" : "http://purl.obolibrary.org/obo/uberon.json",
-      "meta" : {
-        "basicPropertyValues" : [ {
-          "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
-          "val" : "http://purl.obolibrary.org/obo/UBERON_0000104"
-        }, {
-          "pred" : "http://purl.obolibrary.org/obo/IAO_0000700",
-          "val" : "http://purl.obolibrary.org/obo/UBERON_0001062"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0001-5839-6798"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0001-7972-3866"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0001-9114-8737"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0002-1810-9886"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0002-6601-2165"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0002-7356-1779"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0002-9611-1279"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0003-3162-7490"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/creator",
-          "val" : "https://orcid.org/0000-0003-3308-6245"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/description",
-          "val" : "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data."
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/publisher",
-          "val" : "http://uberon.org"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://dbpedia.org"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://palaeos.com"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://www.brain-map.org"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://braininfo.rprc.washington.edu/"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://en.wikipedia.org/wiki/"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/aao.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/aba.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/aeo.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/bila.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/bto.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/caro.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/cl.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/ehdaa2.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/emapa.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/fbbt.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/fma.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/go.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/hp.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/ma.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/mp.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/tao.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/vhog.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/vsao.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/wbbt.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/xao.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://purl.obolibrary.org/obo/zfa.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://uri.neuinfo.org/nif/nifstd"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://www.e-lico.eu/public/kupo/kupo.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://www.ebi.ac.uk/efo/efo.owl"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0030229073 Invertebrate Zoology, Barnes"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0073040584 Vertebrates, Kardong"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0443065837 Human embryology, Larsen"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:0683400088 Stedman's Medical Dictionary"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:9780120749034 The laboratory rat"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "ISBN:9780878932504 Developmental Biology"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "MESH"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "PMID:11433360 Placental development: lessons from mouse mutants"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "aggregates AAO from 13:04:2012"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "aggregates TAO from 09:08:2012"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "aggregates VSAO from 16:07:2012"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "http://wiki.phenotypercn.org/wiki/August_2012_Notes"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/source",
-          "val" : "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0"
-        }, {
-          "pred" : "http://purl.org/dc/elements/1.1/title",
-          "val" : "Uber-anatomy ontology"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://github.com/orgs/pato-ontology/teams/pato-community"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-5889-4463"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-7433-0086"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-7476-6306"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-7920-5321"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-7958-3701"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-8682-8754"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-9107-0714"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0001-9990-8331"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-0819-0473"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-0956-8634"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-1112-5832"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-1572-1316"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-1604-3078"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-1615-2899"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-2061-091X"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-2244-7917"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-3437-3329"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-3467-2636"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-3734-1859"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-5111-7263"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-6490-7723"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-7073-9172"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-8406-3871"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-8455-3213"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-8688-6599"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-9415-5104"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-9818-3030"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0002-9900-7880"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0003-1980-3228"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0003-2105-2283"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0003-2338-2550"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0003-3691-0324"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://orcid.org/0000-0003-4423-4370"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q11695472"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q23809253"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q4964264"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q54985720"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q6983890"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q7650732"
-        }, {
-          "pred" : "http://purl.org/dc/terms/contributor",
-          "val" : "https://www.wikidata.org/wiki/Q85793053"
-        }, {
-          "pred" : "http://purl.org/dc/terms/isReferencedBy",
-          "val" : "http://genomebiology.com/2012/13/1/R5"
-        }, {
-          "pred" : "http://purl.org/dc/terms/isReferencedBy",
-          "val" : "http://www.ncbi.nlm.nih.gov/pubmed/22293552"
-        }, {
-          "pred" : "http://purl.org/dc/terms/license",
-          "val" : "http://creativecommons.org/licenses/by/3.0/"
-        }, {
-          "pred" : "http://usefulinc.com/ns/doap#GitRepository",
-          "val" : "https://github.com/cmungall/uberon/"
-        }, {
-          "pred" : "http://usefulinc.com/ns/doap#SVNRepository",
-          "val" : "https://obo.svn.sourceforge.net/svnroot/obo/uberon/"
-        }, {
-          "pred" : "http://usefulinc.com/ns/doap#bug-database",
-          "val" : "https://github.com/obophenotype/uberon/issues/"
-        }, {
-          "pred" : "http://usefulinc.com/ns/doap#mailing-list",
-          "val" : "https://lists.sourceforge.net/lists/listinfo/obo-anatomy"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#default-namespace",
-          "val" : "uberon"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion",
-          "val" : "1.2"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "AEO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "BILA"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "BSPO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "CARO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "GO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "OG"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
-          "val" : "VSAO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
-          "val" : "EHDAA"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
-          "val" : "EV"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
-          "val" : "NCIT"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
-          "val" : "OGES"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
-          "val" : "SCTID"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
-          "val" : "BFO"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
-          "val" : "VHOG"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "AAO part_of NCBITaxon:8292"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "DHBA part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "EHDAA2 part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "EMAPA part_of NCBITaxon:10090"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "FBdv part_of NCBITaxon:7227"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "FMA part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "HAO part_of NCBITaxon:7399"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "HBA part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "HsapDv part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "KUPO part_of NCBITaxon:9606"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "MA part_of NCBITaxon:10090"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "MFO part_of NCBITaxon:8089"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "MmusDv part_of NCBITaxon:10090"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "OlatDv part_of NCBITaxon:8089"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "PBA part_of NCBITaxon:9443"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "SPD part_of NCBITaxon:6893"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "TADS part_of NCBITaxon:6939"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "TAO part_of NCBITaxon:32443"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "TGMA part_of NCBITaxon:44484"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "WBbt part_of NCBITaxon:6237"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "WBls part_of NCBITaxon:6237"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "XAO part_of NCBITaxon:8353"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "ZFA part_of NCBITaxon:7954"
-        }, {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
-          "val" : "ZFS part_of NCBITaxon:7954"
-        }, {
-          "pred" : "http://www.w3.org/2000/01/rdf-schema#comment",
-          "val" : "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found."
-        }, {
-          "pred" : "http://www.w3.org/2002/07/owl#versionInfo",
-          "val" : "2024-09-03"
-        }, {
-          "pred" : "http://xmlns.com/foaf/0.1/homepage",
-          "val" : "http://uberon.org"
-        } ],
-        "version" : "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json"
-      },
-    "nodes" : [{
-        "id" : "http://purl.obolibrary.org/obo/CL_1001593",
-        "lbl" : "parathyroid glandular cell",
-        "type" : "CLASS",
-        "meta" : {
-          "definition" : {
-            "val" : "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.",
-            "xrefs" : [ "HPA:HPA", "NPX:PDR" ]
-          },
-          "synonyms" : [ {
-            "pred" : "hasRelatedSynonym",
-            "val" : "parathyroid gland glandular cell",
-            "xrefs" : [ "CALOHA:TS-1279" ]
-          }, {
-            "pred" : "hasRelatedSynonym",
-            "val" : "parathyroid gland glandular cells",
-            "xrefs" : [ "CALOHA:TS-1279" ]
-          } ],
-          "xrefs" : [ {
-            "val" : "CALOHA:TS-1279"
-          } ]
-        }
-      }, {
-        "id" : "http://purl.obolibrary.org/obo/CL_1001595",
-        "lbl" : "rectum glandular cell",
-        "type" : "CLASS",
-        "meta" : {
-          "definition" : {
-            "val" : "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.",
-            "xrefs" : [ "NPX:PDR" ]
-          },
-          "synonyms" : [ {
-            "pred" : "hasRelatedSynonym",
-            "val" : "rectal glandular cell",
-            "xrefs" : [ "CALOHA:TS-1281" ]
-          }, {
-            "pred" : "hasRelatedSynonym",
-            "val" : "rectum glandular cells",
-            "xrefs" : [ "CALOHA:TS-1281" ]
-          } ],
-          "xrefs" : [ {
-            "val" : "CALOHA:TS-1281"
-          } ]
-        }
-      }, {
-        "id" : "http://purl.obolibrary.org/obo/CL_1001596",
-        "lbl" : "salivary gland glandular cell",
-        "type" : "CLASS",
-        "meta" : {
-          "definition" : {
-            "val" : "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.",
-            "xrefs" : [ "HPA:HPA", "NPX:PDR" ]
-          },
-          "synonyms" : [ {
-            "pred" : "hasRelatedSynonym",
-            "val" : "salivary gland glandular cells",
-            "xrefs" : [ "CALOHA:TS-1282" ]
-          } ],
-          "xrefs" : [ {
-            "val" : "CALOHA:TS-1282"
-          } ]
-        }
+  "graphs": [
+    {
+      "id": "http://purl.obolibrary.org/obo/uberon.json",
+      "meta": {
+        "basicPropertyValues": [
+          {
+            "pred": "http://purl.obolibrary.org/obo/IAO_0000700",
+            "val": "http://purl.obolibrary.org/obo/UBERON_0000104"
+          },
+          {
+            "pred": "http://purl.obolibrary.org/obo/IAO_0000700",
+            "val": "http://purl.obolibrary.org/obo/UBERON_0001062"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0001-5839-6798"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0001-7972-3866"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0001-9114-8737"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0002-1810-9886"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0002-6601-2165"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0002-7356-1779"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0002-9611-1279"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0003-3162-7490"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/creator",
+            "val": "https://orcid.org/0000-0003-3308-6245"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/description",
+            "val": "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data."
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/publisher",
+            "val": "http://uberon.org"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://dbpedia.org"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://palaeos.com"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://www.brain-map.org"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://braininfo.rprc.washington.edu/"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://en.wikipedia.org/wiki/"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/aao.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/aba.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/aeo.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/bila.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/bto.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/caro.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/cl.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/ehdaa2.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/emapa.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/fbbt.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/fma.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/go.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/hp.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/ma.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/mp.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/tao.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/vhog.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/vsao.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/wbbt.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/xao.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://purl.obolibrary.org/obo/zfa.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://uri.neuinfo.org/nif/nifstd"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://www.e-lico.eu/public/kupo/kupo.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://www.ebi.ac.uk/efo/efo.owl"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0030229073 Invertebrate Zoology, Barnes"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0073040584 Vertebrates, Kardong"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0443065837 Human embryology, Larsen"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:0683400088 Stedman's Medical Dictionary"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:9780120749034 The laboratory rat"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "ISBN:9780878932504 Developmental Biology"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "MESH"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "PMID:11433360 Placental development: lessons from mouse mutants"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "aggregates AAO from 13:04:2012"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "aggregates TAO from 09:08:2012"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "aggregates VSAO from 16:07:2012"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "http://wiki.phenotypercn.org/wiki/August_2012_Notes"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/source",
+            "val": "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0"
+          },
+          {
+            "pred": "http://purl.org/dc/elements/1.1/title",
+            "val": "Uber-anatomy ontology"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://github.com/orgs/pato-ontology/teams/pato-community"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-5889-4463"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-7433-0086"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-7476-6306"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-7920-5321"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-7958-3701"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-8682-8754"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-9107-0714"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0001-9990-8331"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-0819-0473"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-0956-8634"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-1112-5832"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-1572-1316"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-1604-3078"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-1615-2899"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-2061-091X"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-2244-7917"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-3437-3329"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-3467-2636"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-3734-1859"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-5111-7263"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-6490-7723"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-7073-9172"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-8406-3871"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-8455-3213"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-8688-6599"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-9415-5104"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-9818-3030"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0002-9900-7880"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-1980-3228"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-2105-2283"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-2338-2550"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-3691-0324"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://orcid.org/0000-0003-4423-4370"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q11695472"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q23809253"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q4964264"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q54985720"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q6983890"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q7650732"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/contributor",
+            "val": "https://www.wikidata.org/wiki/Q85793053"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/isReferencedBy",
+            "val": "http://genomebiology.com/2012/13/1/R5"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/isReferencedBy",
+            "val": "http://www.ncbi.nlm.nih.gov/pubmed/22293552"
+          },
+          {
+            "pred": "http://purl.org/dc/terms/license",
+            "val": "http://creativecommons.org/licenses/by/3.0/"
+          },
+          {
+            "pred": "http://usefulinc.com/ns/doap#GitRepository",
+            "val": "https://github.com/cmungall/uberon/"
+          },
+          {
+            "pred": "http://usefulinc.com/ns/doap#SVNRepository",
+            "val": "https://obo.svn.sourceforge.net/svnroot/obo/uberon/"
+          },
+          {
+            "pred": "http://usefulinc.com/ns/doap#bug-database",
+            "val": "https://github.com/obophenotype/uberon/issues/"
+          },
+          {
+            "pred": "http://usefulinc.com/ns/doap#mailing-list",
+            "val": "https://lists.sourceforge.net/lists/listinfo/obo-anatomy"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#default-namespace",
+            "val": "uberon"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion",
+            "val": "1.2"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "AEO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "BILA"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "BSPO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "CARO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "GO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "OG"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent",
+            "val": "VSAO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+            "val": "EHDAA"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+            "val": "EV"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+            "val": "NCIT"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+            "val": "OGES"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass",
+            "val": "SCTID"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
+            "val": "BFO"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a",
+            "val": "VHOG"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "AAO part_of NCBITaxon:8292"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "DHBA part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "EHDAA2 part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "EMAPA part_of NCBITaxon:10090"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "FBdv part_of NCBITaxon:7227"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "FMA part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "HAO part_of NCBITaxon:7399"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "HBA part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "HsapDv part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "KUPO part_of NCBITaxon:9606"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "MA part_of NCBITaxon:10090"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "MFO part_of NCBITaxon:8089"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "MmusDv part_of NCBITaxon:10090"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "OlatDv part_of NCBITaxon:8089"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "PBA part_of NCBITaxon:9443"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "SPD part_of NCBITaxon:6893"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "TADS part_of NCBITaxon:6939"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "TAO part_of NCBITaxon:32443"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "TGMA part_of NCBITaxon:44484"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "WBbt part_of NCBITaxon:6237"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "WBls part_of NCBITaxon:6237"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "XAO part_of NCBITaxon:8353"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "ZFA part_of NCBITaxon:7954"
+          },
+          {
+            "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia",
+            "val": "ZFS part_of NCBITaxon:7954"
+          },
+          {
+            "pred": "http://www.w3.org/2000/01/rdf-schema#comment",
+            "val": "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found."
+          },
+          {
+            "pred": "http://www.w3.org/2002/07/owl#versionInfo",
+            "val": "2024-09-03"
+          },
+          {
+            "pred": "http://xmlns.com/foaf/0.1/homepage",
+            "val": "http://uberon.org"
+          }
+        ],
+        "version": "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json"
       },
-      {
-        "id" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "lbl" : "podocyte",
-        "type" : "CLASS",
-        "meta" : {
-          "definition" : {
-            "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
-            "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ]
-          },
-          "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ],
-          "synonyms" : [ {
-            "pred" : "hasBroadSynonym",
-            "val" : "epithelial cell of visceral layer of glomerular capsule",
-            "xrefs" : [ "FMA:70967" ]
-          }, {
-            "pred" : "hasExactSynonym",
-            "val" : "glomerular podocyte",
-            "xrefs" : [ "FMA:70967" ]
-          }, {
-            "pred" : "hasExactSynonym",
-            "val" : "glomerular visceral epithelial cell"
-          }, {
-            "pred" : "hasExactSynonym",
-            "val" : "kidney podocyte"
-          }, {
-            "pred" : "hasExactSynonym",
-            "val" : "renal podocyte"
-          } ],
-          "xrefs" : [ {
-            "val" : "BTO:0002295"
-          }, {
-            "val" : "FMA:70967"
-          } ],
-          "basicPropertyValues" : [ {
-            "pred" : "http://purl.obolibrary.org/obo/RO_0002175",
-            "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606"
-          }, {
-            "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso",
-            "val" : "https://github.com/obophenotype/cell-ontology/issues/1460"
-          } ]
+      "nodes": [
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_1001593",
+          "lbl": "parathyroid glandular cell",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.",
+              "xrefs": ["HPA:HPA", "NPX:PDR"]
+            },
+            "synonyms": [
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "parathyroid gland glandular cell",
+                "xrefs": ["CALOHA:TS-1279"]
+              },
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "parathyroid gland glandular cells",
+                "xrefs": ["CALOHA:TS-1279"]
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "CALOHA:TS-1279"
+              }
+            ]
+          }
+        },
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_1001595",
+          "lbl": "rectum glandular cell",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.",
+              "xrefs": ["NPX:PDR"]
+            },
+            "synonyms": [
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "rectal glandular cell",
+                "xrefs": ["CALOHA:TS-1281"]
+              },
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "rectum glandular cells",
+                "xrefs": ["CALOHA:TS-1281"]
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "CALOHA:TS-1281"
+              }
+            ]
+          }
+        },
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_1001596",
+          "lbl": "salivary gland glandular cell",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.",
+              "xrefs": ["HPA:HPA", "NPX:PDR"]
+            },
+            "synonyms": [
+              {
+                "pred": "hasRelatedSynonym",
+                "val": "salivary gland glandular cells",
+                "xrefs": ["CALOHA:TS-1282"]
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "CALOHA:TS-1282"
+              }
+            ]
+          }
+        },
+        {
+          "id": "http://purl.obolibrary.org/obo/CL_0000653",
+          "lbl": "podocyte",
+          "type": "CLASS",
+          "meta": {
+            "definition": {
+              "val": "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.",
+              "xrefs": ["GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829"]
+            },
+            "subsets": [
+              "http://purl.obolibrary.org/obo/cl#cellxgene_subset",
+              "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas"
+            ],
+            "synonyms": [
+              {
+                "pred": "hasBroadSynonym",
+                "val": "epithelial cell of visceral layer of glomerular capsule",
+                "xrefs": ["FMA:70967"]
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "glomerular podocyte",
+                "xrefs": ["FMA:70967"]
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "glomerular visceral epithelial cell"
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "kidney podocyte"
+              },
+              {
+                "pred": "hasExactSynonym",
+                "val": "renal podocyte"
+              }
+            ],
+            "xrefs": [
+              {
+                "val": "BTO:0002295"
+              },
+              {
+                "val": "FMA:70967"
+              }
+            ],
+            "basicPropertyValues": [
+              {
+                "pred": "http://purl.obolibrary.org/obo/RO_0002175",
+                "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+              },
+              {
+                "pred": "http://www.w3.org/2000/01/rdf-schema#seeAlso",
+                "val": "https://github.com/obophenotype/cell-ontology/issues/1460"
+              }
+            ]
+          }
         }
-      }],
-      "edges" : [
+      ],
+      "edges": [
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_1001596",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000150"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_1001596",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000152"
+        },
         {
-            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
-            "pred" : "is_a",
-            "obj" : "http://purl.obolibrary.org/obo/CL_0000150"
-          }, {
-            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
-            "pred" : "is_a",
-            "obj" : "http://purl.obolibrary.org/obo/CL_0000152"
-          }, {
-            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
-            "pred" : "is_a",
-            "obj" : "http://purl.obolibrary.org/obo/CL_0002251"
-          }, {
-            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
-            "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-            "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
-          }, {
-            "sub" : "http://purl.obolibrary.org/obo/CL_1001596",
-            "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-            "obj" : "http://purl.obolibrary.org/obo/UBERON_0004809"
-          }, {
-      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-      "pred" : "is_a",
-      "obj" : "http://purl.obolibrary.org/obo/CL_0000622",
-      "meta" : {
-        "basicPropertyValues" : [ {
-          "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
-          "val" : "true"
-        } ]
-      }
-    }, {
-      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-      "pred" : "is_a",
-      "obj" : "http://purl.obolibrary.org/obo/CL_1001596"
-    }, {
-      "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-      "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-      "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
-    },  {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_0000622",
-        "meta" : {
-          "basicPropertyValues" : [ {
-            "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred",
-            "val" : "true"
-          } ]
+          "sub": "http://purl.obolibrary.org/obo/CL_1001596",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0002251"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_1001596",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0001044"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_1001596",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0004809"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000622",
+          "meta": {
+            "basicPropertyValues": [
+              {
+                "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+                "val": "true"
+              }
+            ]
+          }
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_1001596"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0001044"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_0000622",
+          "meta": {
+            "basicPropertyValues": [
+              {
+                "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred",
+                "val": "true"
+              }
+            ]
+          }
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_1001596"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0002623",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0001044"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000653",
+          "pred": "is_a",
+          "obj": "http://purl.obolibrary.org/obo/CL_1000450"
+        },
+        {
+          "sub": "http://purl.obolibrary.org/obo/CL_0000653",
+          "pred": "http://purl.obolibrary.org/obo/BFO_0000050",
+          "obj": "http://purl.obolibrary.org/obo/UBERON_0005751"
         }
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_1001596"
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0002623",
-        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-        "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044"
-      },
-      {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "pred" : "is_a",
-        "obj" : "http://purl.obolibrary.org/obo/CL_1000450"
-      }, {
-        "sub" : "http://purl.obolibrary.org/obo/CL_0000653",
-        "pred" : "http://purl.obolibrary.org/obo/BFO_0000050",
-        "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751"
-      }, 
-    ]
+      ]
     }
-    ]
+  ]
 }
diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py
index 60c89d703..c647710d1 100644
--- a/tests/gentropy/dataset/test_biosample_index.py
+++ b/tests/gentropy/dataset/test_biosample_index.py
@@ -1,19 +1,8 @@
 """Tests on Biosample index."""
 
-import pandas as pd
-import numpy as np
-from pyspark.sql import SparkSession
-from pyspark.sql import Row
-import pyspark.sql.functions as F
-import owlready2 as owl
-from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType
-import json
-
 from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
 
 
 def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None:
     """Test biosample index creation with mock biosample index."""
     assert isinstance(mock_biosample_index, BiosampleIndex)
-
diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
index af7d9e405..9fb8ff92a 100644
--- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
+++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
@@ -1,49 +1,52 @@
-"""Tests for study index dataset from FinnGen."""
+"""Tests for biosample index dataset."""
 
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
-import pytest
-from pyspark.sql import DataFrame
-from pyspark.sql import functions as f
-
-
-from gentropy.dataset.study_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices
+from gentropy.dataset.biosample_index import BiosampleIndex
+from gentropy.datasource.ontologies.utils import (
+    extract_ontology_from_json,
+    merge_biosample_indices,
+)
 
 if TYPE_CHECKING:
     from pyspark.sql import SparkSession
 
-def test_biosample_index_from_source(spark: SparkSession) -> None:
-    """Test biosample index from source."""
-    assert isinstance(extract_ontology_from_json(), BiosampleIndex)
 
 class TestOntologyParger:
-    """ Testing ontology parser."""
+    """Testing ontology parser."""
 
     SAMPLE_CELL_ONTOLOGY_PATH = "tests/gentropy/data_samples/cell_ontology_sample.json"
     SAMPLE_UBERON_PATH = "tests/gentropy/data_samples/uberon_sample.json"
 
-    def test_cell_ontology_parser(self) -> None:
+    def test_cell_ontology_parser(
+        self: TestOntologyParger, spark: SparkSession
+    ) -> None:
         """Test cell ontology parser."""
-        cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH)
+        cell_ontology = extract_ontology_from_json(
+            self.SAMPLE_CELL_ONTOLOGY_PATH, spark
+        )
         assert isinstance(
             cell_ontology, BiosampleIndex
-            ), "Cell ontology subset is not parsed correctly to BiosampleIndex."
+        ), "Cell ontology subset is not parsed correctly to BiosampleIndex."
 
-    def test_uberon_parser(self) -> None:
+    def test_uberon_parser(self: TestOntologyParger, spark: SparkSession) -> None:
         """Test uberon parser."""
-        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH)
+        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark)
         assert isinstance(
             uberon, BiosampleIndex
-            ), "Uberon subset is not parsed correctly to BiosampleIndex."
+        ), "Uberon subset is not parsed correctly to BiosampleIndex."
 
-    def test_merge_biosample_indices(self) -> None:
+    def test_merge_biosample_indices(
+        self: TestOntologyParger, spark: SparkSession
+    ) -> None:
         """Test merging of biosample indices."""
-        cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH)
-        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH)
-        merged = merge_biosample_indices(cell_ontology, uberon)
+        cell_ontology = extract_ontology_from_json(
+            self.SAMPLE_CELL_ONTOLOGY_PATH, spark
+        )
+        uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark)
+        merged = merge_biosample_indices([cell_ontology, uberon])
         assert isinstance(
             merged, BiosampleIndex
-            ), "Merging of biosample indices is not correct."
\ No newline at end of file
+        ), "Merging of biosample indices is not correct."

From 12293d30da1bb6fd7a1e0ad990796d8e4d67545e Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 17 Sep 2024 12:54:15 +0000
Subject: [PATCH 14/22] fix(biosample index): merging indices fix

---
 src/gentropy/datasource/ontologies/utils.py | 30 +++++++++++++--------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py
index 0c4215d09..e38f81b11 100644
--- a/src/gentropy/datasource/ontologies/utils.py
+++ b/src/gentropy/datasource/ontologies/utils.py
@@ -56,7 +56,15 @@ def json_graph_traversal(
 
         def get_relationships(
             node : str
-            ) -> list[str]:
+        ) -> list[str]:
+            """Get all relationships for a given node.
+
+            Args:
+                node (str): Node ID.
+
+            Returns:
+                list[str]: List of relationships.
+            """
             relationships = set()
             stack = [node]
             while stack:
@@ -170,20 +178,20 @@ def merge_lists(
     # Merge the DataFrames
     merged_df = reduce(DataFrame.unionAll, biosample_dfs)
 
-    # Define dictionary of columns and corresponding aggregation functions
+    # Determine aggregation functions for each column
     # Currently this will take the first value for single values and merge lists for list values
-    agg_funcs = {}
-    for column in merged_df.columns:
-        if column != "biosampleId":
-            if "list" in column:  # Assuming column names that have 'list' need list merging
-                agg_funcs[column] = merge_lists_udf(collect_list(column)).alias(column)
+    agg_funcs = []
+    for field in merged_df.schema.fields:
+        if field.name != "biosampleId":  # Skip the grouping column
+            if field.dataType == ArrayType(StringType()):
+                agg_funcs.append(merge_lists_udf(collect_list(col(field.name))).alias(field.name))
             else:
-                agg_funcs[column] = first(column, ignorenulls=True).alias(column)
+                agg_funcs.append(first(col(field.name), ignorenulls=True).alias(field.name))
 
-    # Group by biosampleId and aggregate the columns
-    merged_df = merged_df.groupBy("biosampleId").agg(agg_funcs)
+    # Perform aggregation
+    aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs)
 
     return BiosampleIndex(
-        _df=merged_df,
+        _df=aggregated_df,
         _schema=BiosampleIndex.get_schema()
         )

From 850f91098d3b4ae1475dcee4cca078d9d4250f4e Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Tue, 17 Sep 2024 16:13:33 +0000
Subject: [PATCH 15/22] fix(biosample index): update study index qc logic

---
 src/gentropy/dataset/study_index.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
index e6e4d4dc3..b853e740a 100644
--- a/src/gentropy/dataset/study_index.py
+++ b/src/gentropy/dataset/study_index.py
@@ -424,7 +424,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu
         biosample_set = biosample_index.df.select("biosampleId", f.lit(True).alias("isIdFound"))
 
         validated_df = (
-            self.df.join(biosample_set, on="biosampleId", how="left")
+            self.df.join(biosample_set, self.df.biosampleFromSourceId == biosample_set.biosampleId, how="left")
             .withColumn(
                 "isIdFound",
                 f.when(
@@ -440,7 +440,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu
                     StudyQualityCheck.UNKNOWN_BIOSAMPLE,
                 ),
             )
-            .drop("isIdFound")
+            .drop("isIdFound").drop("biosampleId")
         )
 
         return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

From c42bdd6974d34e5fefe440222a3a34c597d6a966 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Wed, 18 Sep 2024 14:06:59 +0000
Subject: [PATCH 16/22] fix(biosample index): fix missing mock_biosample_index

---
 tests/gentropy/conftest.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
index 629f3a505..9051ce91e 100644
--- a/tests/gentropy/conftest.py
+++ b/tests/gentropy/conftest.py
@@ -13,6 +13,7 @@
 
 from gentropy.common.Liftover import LiftOverSpark
 from gentropy.common.session import Session
+from gentropy.dataset.biosample_index import BiosampleIndex
 from gentropy.dataset.colocalisation import Colocalisation
 from gentropy.dataset.gene_index import GeneIndex
 from gentropy.dataset.intervals import Intervals
@@ -559,6 +560,35 @@ def mock_gene_index(spark: SparkSession) -> GeneIndex:
     return GeneIndex(_df=data_spec.build(), _schema=gi_schema)
 
 
+@pytest.fixture()
+def mock_biosample_index(spark: SparkSession) -> BiosampleIndex:
+    """Mock biosample index dataset."""
+    bi_schema = BiosampleIndex.get_schema()
+
+    # Makes arrays of varying length (1 to 9 elements) with random integers between 0 and 99
+    array_expression = "transform(sequence(1, 1 + floor(rand() * 9)), x -> cast((rand() * 100) as int))"
+
+    data_spec = (
+        dg.DataGenerator(
+            spark,
+            rows=400,
+            partitions=4,
+            randomSeedMethod="hash_fieldname",
+        )
+        .withSchema(bi_schema)
+        .withColumnSpec("biosampleName", percentNulls=0.1)
+        .withColumnSpec("description", percentNulls=0.1)
+        .withColumnSpec("dbXrefs", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("synonyms", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("parents", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("ancestors", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("descendants", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("children", expr=array_expression, percentNulls=0.1)
+    )
+
+    return BiosampleIndex(_df=data_spec.build(), _schema=bi_schema)
+
+
 @pytest.fixture()
 def liftover_chain_37_to_38(spark: SparkSession) -> LiftOverSpark:
     """Sample liftover chain file."""

From 07daedc33eafc8b8f42f093924bb879ac16c3f74 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Wed, 18 Sep 2024 14:07:57 +0000
Subject: [PATCH 17/22] chore(biosample index): change datasource name from
 ontologies

---
 docs/python_api/datasources/_datasources.md                   | 4 ++--
 .../{ontologies => biosample_ontologies}/_cell_ontology.md    | 0
 .../{ontologies => biosample_ontologies}/_uberon.md           | 0
 src/gentropy/biosample_index.py                               | 2 +-
 .../{ontologies => biosample_ontologies}/__init__.py          | 0
 .../datasource/{ontologies => biosample_ontologies}/utils.py  | 0
 .../test_biosample_ontology.py                                | 2 +-
 7 files changed, 4 insertions(+), 4 deletions(-)
 rename docs/python_api/datasources/{ontologies => biosample_ontologies}/_cell_ontology.md (100%)
 rename docs/python_api/datasources/{ontologies => biosample_ontologies}/_uberon.md (100%)
 rename src/gentropy/datasource/{ontologies => biosample_ontologies}/__init__.py (100%)
 rename src/gentropy/datasource/{ontologies => biosample_ontologies}/utils.py (100%)
 rename tests/gentropy/datasource/{ontologies => biosample_ontologies}/test_biosample_ontology.py (96%)

diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
index 58b4bcd2b..43b212e50 100644
--- a/docs/python_api/datasources/_datasources.md
+++ b/docs/python_api/datasources/_datasources.md
@@ -40,5 +40,5 @@ This section contains information about the data source harmonisation tools avai
 
 ## Biological samples
 
-1. [Uberon](ontologies/_uberon.md)
-2. [Cell Ontology](ontologies/_cell_ontology.md)
+1. [Uberon](biosample_ontologies/_uberon.md)
+2. [Cell Ontology](biosample_ontologies/_cell_ontology.md)
diff --git a/docs/python_api/datasources/ontologies/_cell_ontology.md b/docs/python_api/datasources/biosample_ontologies/_cell_ontology.md
similarity index 100%
rename from docs/python_api/datasources/ontologies/_cell_ontology.md
rename to docs/python_api/datasources/biosample_ontologies/_cell_ontology.md
diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/biosample_ontologies/_uberon.md
similarity index 100%
rename from docs/python_api/datasources/ontologies/_uberon.md
rename to docs/python_api/datasources/biosample_ontologies/_uberon.md
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index a4080fba1..671309ae5 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from gentropy.common.session import Session
-from gentropy.datasource.ontologies.utils import (
+from gentropy.datasource.biosample_ontologies.utils import (
     extract_ontology_from_json,
     merge_biosample_indices,
 )
diff --git a/src/gentropy/datasource/ontologies/__init__.py b/src/gentropy/datasource/biosample_ontologies/__init__.py
similarity index 100%
rename from src/gentropy/datasource/ontologies/__init__.py
rename to src/gentropy/datasource/biosample_ontologies/__init__.py
diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py
similarity index 100%
rename from src/gentropy/datasource/ontologies/utils.py
rename to src/gentropy/datasource/biosample_ontologies/utils.py
diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
similarity index 96%
rename from tests/gentropy/datasource/ontologies/test_biosample_ontology.py
rename to tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
index 9fb8ff92a..0f16f8115 100644
--- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py
+++ b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING
 
 from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.ontologies.utils import (
+from gentropy.datasource.biosample_ontologies.utils import (
     extract_ontology_from_json,
     merge_biosample_indices,
 )

From b150122f4082379553e8e7471a5059c1c41c67d1 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Wed, 18 Sep 2024 16:30:21 +0100
Subject: [PATCH 18/22] fix(biosample index): add dataset doc

---
 docs/python_api/datasets/biosample_index.md | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 docs/python_api/datasets/biosample_index.md

diff --git a/docs/python_api/datasets/biosample_index.md b/docs/python_api/datasets/biosample_index.md
new file mode 100644
index 000000000..d3e4ee2c8
--- /dev/null
+++ b/docs/python_api/datasets/biosample_index.md
@@ -0,0 +1,9 @@
+---
+title: Biosample index
+---
+
+::: gentropy.dataset.biosample_index.BiosampleIndex
+
+## Schema
+
+--8<-- "assets/schemas/biosample_index.md"

From 978f6367a6bff915bc2ac9aa6d889abec8941aa4 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Thu, 19 Sep 2024 09:43:54 +0000
Subject: [PATCH 19/22] fix(biosample index): change dbXrefs to xrefs

---
 src/gentropy/assets/schemas/biosample_index.json      | 2 +-
 src/gentropy/datasource/biosample_ontologies/utils.py | 2 +-
 tests/gentropy/conftest.py                            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
index 7c28ec970..1d68762ac 100644
--- a/src/gentropy/assets/schemas/biosample_index.json
+++ b/src/gentropy/assets/schemas/biosample_index.json
@@ -20,7 +20,7 @@
       "metadata": {}
     },
     {
-      "name": "dbXrefs",
+      "name": "xrefs",
       "type": {
         "type": "array",
         "elementType": "string",
diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py
index e38f81b11..5cc4cfaf9 100644
--- a/src/gentropy/datasource/biosample_ontologies/utils.py
+++ b/src/gentropy/datasource/biosample_ontologies/utils.py
@@ -112,7 +112,7 @@ def get_relationships(
     regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"),
     col("node.lbl").alias("biosampleName"),
     col("node.meta.definition.val").alias("description"),
-    collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"),
+    collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"),
     # col("node.meta.deprecated").alias("deprecated"),
     collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"))
 
diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py
index 9051ce91e..35fcc6e24 100644
--- a/tests/gentropy/conftest.py
+++ b/tests/gentropy/conftest.py
@@ -578,7 +578,7 @@ def mock_biosample_index(spark: SparkSession) -> BiosampleIndex:
         .withSchema(bi_schema)
         .withColumnSpec("biosampleName", percentNulls=0.1)
         .withColumnSpec("description", percentNulls=0.1)
-        .withColumnSpec("dbXrefs", expr=array_expression, percentNulls=0.1)
+        .withColumnSpec("xrefs", expr=array_expression, percentNulls=0.1)
         .withColumnSpec("synonyms", expr=array_expression, percentNulls=0.1)
         .withColumnSpec("parents", expr=array_expression, percentNulls=0.1)
         .withColumnSpec("ancestors", expr=array_expression, percentNulls=0.1)

From ec4edf37d1798f2179c0a4968952f788545f4ad2 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <obba2@cam.ac.uk>
Date: Sat, 21 Sep 2024 00:18:33 +0100
Subject: [PATCH 20/22] chore (biosample index): better commenting

Co-authored-by: Daniel Suveges <daniel.suveges@protonmail.com>
---
 src/gentropy/study_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py
index 0e4c22e6b..6f905f89b 100644
--- a/src/gentropy/study_validation.py
+++ b/src/gentropy/study_validation.py
@@ -67,7 +67,7 @@ def __init__(
             .validate_study_type()  # Flagging non-supported study types.
             .validate_target(target_index)  # Flagging QTL studies with invalid targets
             .validate_disease(disease_index)  # Flagging invalid EFOs
-            .validate_biosample(biosample_index)  # Flagging invalid biosamples
+            .validate_biosample(biosample_index)  # Flagging studies with invalid biosamples
         ).persist()  # we will need this for 2 types of outputs
 
         study_index_with_qc.valid_rows(

From cf005042c2b99ca35ebc5cc9dbc39eb6952e05f1 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Sat, 21 Sep 2024 07:41:59 +0000
Subject: [PATCH 21/22] fix(biosample index): various minor tweaks to biosample
 index

---
 poetry.lock                                   |   3 +-
 src/gentropy/biosample_index.py               |  13 +-
 src/gentropy/config.py                        |   4 +-
 src/gentropy/dataset/biosample_index.py       |  44 +++++++
 .../datasource/biosample_ontologies/utils.py  | 113 ++++--------------
 .../test_biosample_ontology.py                |   8 +-
 6 files changed, 80 insertions(+), 105 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 226311a8b..296f07145 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.0 and should not be changed by hand.
 
 [[package]]
 name = "aiodns"
@@ -3952,6 +3952,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
index 671309ae5..e85c2e135 100644
--- a/src/gentropy/biosample_index.py
+++ b/src/gentropy/biosample_index.py
@@ -2,10 +2,7 @@
 from __future__ import annotations
 
 from gentropy.common.session import Session
-from gentropy.datasource.biosample_ontologies.utils import (
-    extract_ontology_from_json,
-    merge_biosample_indices,
-)
+from gentropy.datasource.biosample_ontologies.utils import extract_ontology_from_json
 
 
 class BiosampleIndexStep:
@@ -19,7 +16,7 @@ def __init__(
         session: Session,
         cell_ontology_input_path: str,
         uberon_input_path: str,
-        biosample_index_output_path: str,
+        biosample_index_path: str,
     ) -> None:
         """Run Biosample index generation step.
 
@@ -27,11 +24,11 @@ def __init__(
             session (Session): Session object.
             cell_ontology_input_path (str): Input cell ontology dataset path.
             uberon_input_path (str): Input uberon dataset path.
-            biosample_index_output_path (str): Output gene index dataset path.
+            biosample_index_path (str): Output biosample index dataset path.
         """
         cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark)
         uberon_index = extract_ontology_from_json(uberon_input_path, session.spark)
 
-        biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index])
+        biosample_index = cell_ontology_index.merge_indices([uberon_index])
 
-        biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path)
+        biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_path)
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 82bff532c..a1d0cdfc6 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -55,7 +55,8 @@ class GeneIndexConfig(StepConfig):
 class BiosampleIndexConfig(StepConfig):
     """Biosample index step configuration."""
 
-    target_path: str = MISSING
+    cell_ontology_input_path: str = MISSING
+    uberon_input_path: str = MISSING
     biosample_index_path: str = MISSING
     _target_: str = "gentropy.biosample_index.BiosampleIndexStep"
 
@@ -514,6 +515,7 @@ class StudyValidationStepConfig(StepConfig):
     study_index_path: list[str] = MISSING
     target_index_path: str = MISSING
     disease_index_path: str = MISSING
+    biosample_index_path: str = MISSING
     valid_study_index_path: str = MISSING
     invalid_study_index_path: str = MISSING
     invalid_qc_reasons: list[str] = MISSING
diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
index 20cff34e8..35c65a491 100644
--- a/src/gentropy/dataset/biosample_index.py
+++ b/src/gentropy/dataset/biosample_index.py
@@ -3,8 +3,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from functools import reduce
 from typing import TYPE_CHECKING
 
+import pyspark.sql.functions as f
+from pyspark.sql import DataFrame
+from pyspark.sql.types import ArrayType, StringType
+
 from gentropy.common.schemas import parse_spark_schema
 from gentropy.dataset.dataset import Dataset
 
@@ -27,3 +32,42 @@ def get_schema(cls: type[BiosampleIndex]) -> StructType:
             StructType: The schema of the BiosampleIndex dataset.
         """
         return parse_spark_schema("biosample_index.json")
+
+    def merge_indices(
+        self: BiosampleIndex,
+        biosample_indices: list[BiosampleIndex],
+    ) -> BiosampleIndex:
+        """Merge this biosample index with a list of other biosample indices.
+
+        Rows sharing a `biosampleId` are collapsed: single-valued columns keep the first non-null value, array columns keep the distinct union of all values.
+
+        Args:
+            biosample_indices (list[BiosampleIndex]): Biosample indices to merge with this one.
+
+        Returns:
+            BiosampleIndex: Merged biosample index.
+        """
+        # Extract the DataFrames from the BiosampleIndex objects; `self.df` needs an instance, not the class
+        biosample_dfs = [biosample_index.df for biosample_index in biosample_indices] + [self.df]
+
+        # Stack the rows of all indices (positional union, so column order must match)
+        merged_df = reduce(DataFrame.unionAll, biosample_dfs)
+
+        # Determine aggregation functions for each column
+        # Currently this will take the first value for single values and merge lists for list values
+        agg_funcs = []
+        for field in merged_df.schema.fields:
+            if field.name != "biosampleId":  # Skip the grouping column
+                if field.dataType == ArrayType(StringType()):
+                    # collect_list gathers the per-row arrays of the group so flatten receives an array of arrays
+                    agg_funcs.append(f.array_distinct(f.flatten(f.collect_list(f.col(field.name)))).alias(field.name))
+                else:
+                    agg_funcs.append(f.first(f.col(field.name), ignorenulls=True).alias(field.name))
+
+        # Perform aggregation
+        aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs)
+
+        return BiosampleIndex(
+            _df=aggregated_df,
+            _schema=BiosampleIndex.get_schema()
+            )
diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py
index 5cc4cfaf9..6a90a7bab 100644
--- a/src/gentropy/datasource/biosample_ontologies/utils.py
+++ b/src/gentropy/datasource/biosample_ontologies/utils.py
@@ -1,18 +1,6 @@
 """Utility functions for Biosample ontology processing."""
-from functools import reduce
-
 from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.functions import (
-    array_distinct,
-    coalesce,
-    col,
-    collect_list,
-    collect_set,
-    explode_outer,
-    first,
-    regexp_replace,
-    udf,
-)
+from pyspark.sql import functions as f
 from pyspark.sql.types import ArrayType, StringType
 from pyspark.sql.window import Window
 
@@ -79,52 +67,51 @@ def get_relationships(
         result_col = "ancestors" if traversal_type == "ancestors" else "descendants"
 
         # Register the UDF based on traversal type
-        relationship_udf = udf(get_relationships, ArrayType(StringType()))
+        relationship_udf = f.udf(get_relationships, ArrayType(StringType()))
 
         # Apply the UDF to create the result column
-        return df.withColumn(result_col, relationship_udf(col(node_col)))
+        return df.withColumn(result_col, relationship_udf(f.col(node_col)))
 
     # Load the JSON file
     df = spark.read.json(ontology_json, multiLine=True)
 
     # Exploding the 'graphs' array to make individual records easier to access
-    df_graphs = df.select(explode_outer("graphs").alias("graph"))
+    df_graphs = df.select(f.explode_outer("graphs").alias("graph"))
 
     # Exploding the 'nodes' array within each graph
     df_nodes = df_graphs.select(
-        col("graph.id").alias("graph_id"),
-        explode_outer("graph.nodes").alias("node"))
+        f.col("graph.id").alias("graph_id"),
+        f.explode_outer("graph.nodes").alias("node"))
 
     # Exploding the 'edges' array within each graph for relationship data
     df_edges = df_graphs.select(
-        col("graph.id").alias("graph_id"),
-        explode_outer("graph.edges").alias("edge")
+        f.col("graph.id").alias("graph_id"),
+        f.explode_outer("graph.edges").alias("edge")
     ).select(
-        col("edge.sub").alias("subject"),
-        col("edge.pred").alias("predicate"),
-        col("edge.obj").alias("object")
+        f.col("edge.sub").alias("subject"),
+        f.col("edge.pred").alias("predicate"),
+        f.col("edge.obj").alias("object")
     )
-    df_edges = df_edges.withColumn("subject", regexp_replace(col("subject"), "http://purl.obolibrary.org/obo/", ""))
-    df_edges = df_edges.withColumn("object", regexp_replace(col("object"), "http://purl.obolibrary.org/obo/", ""))
+    df_edges = df_edges.withColumn("subject", f.regexp_replace(f.col("subject"), "http://purl.obolibrary.org/obo/", ""))
+    df_edges = df_edges.withColumn("object", f.regexp_replace(f.col("object"), "http://purl.obolibrary.org/obo/", ""))
 
     # Extract the relevant information from the nodes
     transformed_df = df_nodes.select(
-    regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"),
-    col("node.lbl").alias("biosampleName"),
-    col("node.meta.definition.val").alias("description"),
-    collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"),
-    # col("node.meta.deprecated").alias("deprecated"),
-    collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"))
+    f.regexp_replace(f.col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"),
+    f.col("node.lbl").alias("biosampleName"),
+    f.col("node.meta.definition.val").alias("description"),
+    f.collect_set(f.col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"),
+    f.collect_set(f.col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"))
 
 
     # Extract the relationships from the edges
     # Prepare relationship-specific DataFrames
-    df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent")
-    df_children = df_edges.filter(col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child")
+    df_parents = df_edges.filter(f.col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent")
+    df_children = df_edges.filter(f.col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child")
 
     # Aggregate relationships back to nodes
-    df_parents_grouped = df_parents.groupBy("subject").agg(array_distinct(collect_list("parent")).alias("parents"))
-    df_children_grouped = df_children.groupBy("object").agg(array_distinct(collect_list("child")).alias("children"))
+    df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent"))).alias("parents")
+    df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child"))).alias("children")
 
     # Get all ancestors
     df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors")
@@ -132,7 +119,7 @@ def get_relationships(
     df_with_descendants = json_graph_traversal(df_children_grouped, "object", "children", "descendants")
 
     # Join the ancestor and descendant DataFrames
-    df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object")
+    df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", f.coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object")
 
     # Join the original DataFrame with the relationship DataFrame
     final_df = transformed_df.join(df_with_relationships, ["biosampleId"], "left")
@@ -141,57 +128,3 @@ def get_relationships(
         _df=final_df,
         _schema=BiosampleIndex.get_schema()
         )
-
-def merge_biosample_indices(
-    biosample_indices : list[BiosampleIndex]
-    ) -> BiosampleIndex:
-    """Merge a list of biosample indices into a single biosample index.
-
-    Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken.
-
-    Args:
-        biosample_indices (list[BiosampleIndex]): Biosample indices to merge.
-
-    Returns:
-        BiosampleIndex: Merged biosample index.
-    """
-
-    def merge_lists(
-        lists : list[list[str]]
-        ) -> list[str]:
-        """Merge a list of lists into a single list.
-
-        Args:
-            lists (list[list[str]]): List of lists to merge.
-
-        Returns:
-            list[str]: Merged list.
-        """
-        return list({item for sublist in lists if sublist is not None for item in sublist})
-
-    # Make a spark udf (user defined function) to merge lists
-    merge_lists_udf = udf(merge_lists, ArrayType(StringType()))
-
-    # Extract the DataFrames from the BiosampleIndex objects
-    biosample_dfs = [biosample_index.df for biosample_index in biosample_indices]
-
-    # Merge the DataFrames
-    merged_df = reduce(DataFrame.unionAll, biosample_dfs)
-
-    # Determine aggregation functions for each column
-    # Currently this will take the first value for single values and merge lists for list values
-    agg_funcs = []
-    for field in merged_df.schema.fields:
-        if field.name != "biosampleId":  # Skip the grouping column
-            if field.dataType == ArrayType(StringType()):
-                agg_funcs.append(merge_lists_udf(collect_list(col(field.name))).alias(field.name))
-            else:
-                agg_funcs.append(first(col(field.name), ignorenulls=True).alias(field.name))
-
-    # Perform aggregation
-    aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs)
-
-    return BiosampleIndex(
-        _df=aggregated_df,
-        _schema=BiosampleIndex.get_schema()
-        )
diff --git a/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
index 0f16f8115..b88623b0d 100644
--- a/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
+++ b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py
@@ -5,10 +5,7 @@
 from typing import TYPE_CHECKING
 
 from gentropy.dataset.biosample_index import BiosampleIndex
-from gentropy.datasource.biosample_ontologies.utils import (
-    extract_ontology_from_json,
-    merge_biosample_indices,
-)
+from gentropy.datasource.biosample_ontologies.utils import extract_ontology_from_json
 
 if TYPE_CHECKING:
     from pyspark.sql import SparkSession
@@ -46,7 +43,8 @@ def test_merge_biosample_indices(
             self.SAMPLE_CELL_ONTOLOGY_PATH, spark
         )
         uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark)
-        merged = merge_biosample_indices([cell_ontology, uberon])
+
+        merged = cell_ontology.merge_indices([uberon])
         assert isinstance(
             merged, BiosampleIndex
         ), "Merging of biosample indices is not correct."

From 729f492b74e52eb9fe69f8db5169d84dcfab2a94 Mon Sep 17 00:00:00 2001
From: Tobi Alegbe <alegbe@ebi.ac.uk>
Date: Sat, 21 Sep 2024 08:10:05 +0000
Subject: [PATCH 22/22] fix(biosample index): minor bug

---
 src/gentropy/datasource/biosample_ontologies/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py
index 6a90a7bab..e02c82cb7 100644
--- a/src/gentropy/datasource/biosample_ontologies/utils.py
+++ b/src/gentropy/datasource/biosample_ontologies/utils.py
@@ -110,8 +110,8 @@ def get_relationships(
     df_children = df_edges.filter(f.col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child")
 
     # Aggregate relationships back to nodes
-    df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent"))).alias("parents")
-    df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child"))).alias("children")
+    df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent")).alias("parents"))
+    df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child")).alias("children"))
 
     # Get all ancestors
     df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors")