From 6fabc98fba54027a96ca79f295f64aa620e02320 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 9 Sep 2024 10:50:32 +0100 Subject: [PATCH 01/22] Initial commit of biosample index --- .../assets/schemas/biosample_index.json | 94 ++++ src/gentropy/dataset/biosample_index.py | 406 ++++++++++++++++++ .../datasource/cell_ontology/__init__.py | 3 + .../cell_ontology/biosample_index.py | 65 +++ 4 files changed, 568 insertions(+) create mode 100644 src/gentropy/assets/schemas/biosample_index.json create mode 100644 src/gentropy/dataset/biosample_index.py create mode 100644 src/gentropy/datasource/cell_ontology/__init__.py create mode 100644 src/gentropy/datasource/cell_ontology/biosample_index.py diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json new file mode 100644 index 000000000..5ef3f02c3 --- /dev/null +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -0,0 +1,94 @@ +{ + "type": "struct", + "fields": [ + { + "name": "id", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "dbXRefs", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "description", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "parents", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "synonyms", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "ancestors", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "descendants", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "children", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "ontology", + "type": { + "type": "map", + "keyType": "string", + "valueType": "boolean", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] +} diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py new file mode 100644 index 000000000..e8c787ed6 --- /dev/null +++ b/src/gentropy/dataset/biosample_index.py @@ -0,0 +1,406 @@ +"""Study index dataset.""" + +from __future__ import annotations + +import importlib.resources as pkg_resources +import json +from dataclasses import dataclass +from enum import Enum +from itertools import chain +from typing import TYPE_CHECKING + +from pyspark.sql import functions as f +from pyspark.sql.window import Window + +from gentropy.assets import data +from gentropy.common.schemas import parse_spark_schema +from gentropy.dataset.dataset import Dataset + +if TYPE_CHECKING: + from pyspark.sql import Column, DataFrame + from pyspark.sql.types import StructType + + from gentropy.dataset.gene_index import GeneIndex + + +class StudyQualityCheck(Enum): + """Study quality control options listing concerns on the quality of the study. + + Attributes: + UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target. + UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease + UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported. + DUPLICATED_STUDY (str): Flagging if a study identifier is not unique. + NO_GENE_PROVIDED (str): Flagging QTL studies if the measured + """ + + UNRESOLVED_TARGET = "Target/gene identifier could not match to reference." + UNRESOLVED_DISEASE = "No valid disease identifier found." + UNKNOWN_STUDY_TYPE = "This type of study is not supported." + DUPLICATED_STUDY = "The identifier of this study is not unique." + NO_GENE_PROVIDED = "QTL study doesn't have gene assigned." + + +@dataclass +class StudyIndex(Dataset): + """Study index dataset. + + A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL. + """ + + @staticmethod + def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column: + """Aggregate sample counts by ancestry in a list of struct colmns. + + Args: + merged (Column): A column representing merged data (list of structs). + ancestry (Column): The `ancestry` parameter is a column that represents the ancestry of each + sample. (a struct) + + Returns: + Column: the modified "merged" column after aggregating the samples by ancestry. + """ + # Iterating over the list of ancestries and adding the sample size if label matches: + return f.transform( + merged, + lambda a: f.when( + a.ancestry == ancestry.ancestry, + f.struct( + a.ancestry.alias("ancestry"), + (a.sampleSize + ancestry.sampleSize).alias("sampleSize"), + ), + ).otherwise(a), + ) + + @staticmethod + def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column: + """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map. + + This function assumes all possible ancestry categories have a corresponding + LD panel in the LD index. It is very important to have the ancestry labels + moved to the LD panel map. + + Args: + gwas_ancestry_label (Column): A struct column with ancestry label like Finnish, + European, African etc. and the corresponding sample size. + + Returns: + Column: Struct column with the mapped LD population label and the sample size. + """ + # Loading ancestry label to LD population label: + json_dict = json.loads( + pkg_resources.read_text( + data, "gwas_population_2_LD_panel_map.json", encoding="utf-8" + ) + ) + map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())]) + + return f.struct( + map_expr[gwas_ancestry_label.ancestry].alias("ancestry"), + gwas_ancestry_label.sampleSize.alias("sampleSize"), + ) + + @classmethod + def get_schema(cls: type[StudyIndex]) -> StructType: + """Provide the schema for the StudyIndex dataset. + + Returns: + StructType: The schema of the StudyIndex dataset. + """ + return parse_spark_schema("study_index.json") + + @classmethod + def aggregate_and_map_ancestries( + cls: type[StudyIndex], discovery_samples: Column + ) -> Column: + """Map ancestries to populations in the LD reference and calculate relative sample size. + + Args: + discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns + + Returns: + Column: A list of struct with mapped LD population and their relative sample size. + """ + # Map ancestry categories to population labels of the LD index: + mapped_ancestries = f.transform( + discovery_samples, cls._map_ancestries_to_ld_population + ) + + # Aggregate sample sizes belonging to the same LD population: + aggregated_counts = f.aggregate( + mapped_ancestries, + f.array_distinct( + f.transform( + mapped_ancestries, + lambda x: f.struct( + x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize") + ), + ) + ), + cls._aggregate_samples_by_ancestry, + ) + # Getting total sample count: + total_sample_count = f.aggregate( + aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize + ).alias("sampleSize") + + # Calculating relative sample size for each LD population: + return f.transform( + aggregated_counts, + lambda ld_population: f.struct( + ld_population.ancestry.alias("ldPopulation"), + (ld_population.sampleSize / total_sample_count).alias( + "relativeSampleSize" + ), + ), + ) + + def study_type_lut(self: StudyIndex) -> DataFrame: + """Return a lookup table of study type. + + Returns: + DataFrame: A dataframe containing `studyId` and `studyType` columns. + """ + return self.df.select("studyId", "studyType") + + def is_qtl(self: StudyIndex) -> Column: + """Return a boolean column with true values for QTL studies. + + Returns: + Column: True if the study is a QTL study. + """ + return self.df.studyType.endswith("qtl") + + def is_gwas(self: StudyIndex) -> Column: + """Return a boolean column with true values for GWAS studies. + + Returns: + Column: True if the study is a GWAS study. + """ + return self.df.studyType == "gwas" + + def has_mapped_trait(self: StudyIndex) -> Column: + """Return a boolean column indicating if a study has mapped disease. + + Returns: + Column: True if the study has mapped disease. + """ + return f.size(self.df.traitFromSourceMappedIds) > 0 + + def is_quality_flagged(self: StudyIndex) -> Column: + """Return a boolean column indicating if a study is flagged due to quality issues. + + Returns: + Column: True if the study is flagged. + """ + # Testing for the presence of the qualityControls column: + if "qualityControls" not in self.df.columns: + return f.lit(False) + else: + return f.size(self.df.qualityControls) != 0 + + def has_summarystats(self: StudyIndex) -> Column: + """Return a boolean column indicating if a study has harmonized summary statistics. + + Returns: + Column: True if the study has harmonized summary statistics. + """ + return self.df.hasSumstats + + def validate_unique_study_id(self: StudyIndex) -> StudyIndex: + """Validating the uniqueness of study identifiers and flagging duplicated studies. + + Returns: + StudyIndex: with flagged duplicated studies. + """ + validated_df = ( + self.df.withColumn( + "isDuplicated", + f.when( + f.count("studyType").over( + Window.partitionBy("studyId").rowsBetween( + Window.unboundedPreceding, Window.unboundedFollowing + ) + ) + > 1, + True, + ).otherwise(False), + ) + .withColumn( + "qualityControls", + StudyIndex.update_quality_flag( + f.col("qualityControls"), + f.col("isDuplicated"), + StudyQualityCheck.DUPLICATED_STUDY, + ), + ) + .drop("isDuplicated") + ) + return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) + + def _normalise_disease( + self: StudyIndex, + source_disease_column_name: str, + disease_column_name: str, + disease_map: DataFrame, + ) -> DataFrame: + """Normalising diseases in the study index. + + Given a reference disease map (containing all potential EFO ids with the corresponding reference disease ids), + this function maps all EFO ids in the study index to the reference disease ids. + + Args: + source_disease_column_name (str): The column name of the disease column to validate. + disease_column_name (str): The resulting disease column name that contains the validated ids. + disease_map (DataFrame): Reference dataframe with diseases + + Returns: + DataFrame: where the newly added diseaseIds column will contain the validated EFO identifiers. + """ + return ( + self.df + # Only validating studies with diseases: + .filter(f.size(f.col(source_disease_column_name)) > 0) + # Explode disease column: + .select( + "studyId", + "studyType", + f.explode_outer(source_disease_column_name).alias("efo"), + ) + # Join disease map: + .join(disease_map, on="efo", how="left") + .groupBy("studyId") + .agg( + f.collect_set(f.col("diseaseId")).alias(disease_column_name), + ) + ) + + def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex: + """Validate diseases in the study index dataset. + + Args: + disease_map (DataFrame): a dataframe with two columns (efo, diseaseId). + + Returns: + StudyIndex: where gwas studies are flagged where no valid disease id could be found. + """ + # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present: + if ("traitFromSourceMappedIds" not in self.df.columns) or ( + "backgroundTraitFromSourceMappedIds" not in self.df.columns + ): + return self + + # Disease Column names: + foreground_disease_column = "diseaseIds" + background_disease_column = "backgroundDiseaseIds" + + # If diseaseId in schema, we need to drop it: + drop_columns = [ + column + for column in self.df.columns + if column in [foreground_disease_column, background_disease_column] + ] + + if len(drop_columns) > 0: + self.df = self.df.drop(*drop_columns) + + # Normalise disease: + normalised_disease = self._normalise_disease( + "traitFromSourceMappedIds", foreground_disease_column, disease_map + ) + normalised_background_disease = self._normalise_disease( + "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map + ) + + return StudyIndex( + _df=( + self.df.join(normalised_disease, on="studyId", how="left") + .join(normalised_background_disease, on="studyId", how="left") + # Updating disease columns: + .withColumn( + foreground_disease_column, + f.when( + f.col(foreground_disease_column).isNull(), f.array() + ).otherwise(f.col(foreground_disease_column)), + ) + .withColumn( + background_disease_column, + f.when( + f.col(background_disease_column).isNull(), f.array() + ).otherwise(f.col(background_disease_column)), + ) + # Flagging gwas studies where no valid disease is avilable: + .withColumn( + "qualityControls", + StudyIndex.update_quality_flag( + f.col("qualityControls"), + # Flagging all gwas studies with no normalised disease: + (f.size(f.col(foreground_disease_column)) == 0) + & (f.col("studyType") == "gwas"), + StudyQualityCheck.UNRESOLVED_DISEASE, + ), + ) + ), + _schema=StudyIndex.get_schema(), + ) + + def validate_study_type(self: StudyIndex) -> StudyIndex: + """Validating study type and flag unsupported types. + + Returns: + StudyIndex: with flagged studies with unsupported type. + """ + validated_df = ( + self.df + # Flagging unsupported study types: + .withColumn( + "qualityControls", + StudyIndex.update_quality_flag( + f.col("qualityControls"), + f.when( + (f.col("studyType") == "gwas") + | f.col("studyType").endswith("qtl"), + False, + ).otherwise(True), + StudyQualityCheck.UNKNOWN_STUDY_TYPE, + ), + ) + ) + return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) + + def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex: + """Validating gene identifiers in the study index against the provided target index. + + Args: + target_index (GeneIndex): gene index containing the reference gene identifiers (Ensembl gene identifiers). + + Returns: + StudyIndex: with flagged studies if geneId could not be validated. + """ + gene_set = target_index.df.select("geneId", f.lit(True).alias("isIdFound")) + + # As the geneId is not a mandatory field of study index, we return if the column is not there: + if "geneId" not in self.df.columns: + return self + + validated_df = ( + self.df.join(gene_set, on="geneId", how="left") + .withColumn( + "isIdFound", + f.when( + (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(), + f.lit(False), + ).otherwise(f.lit(True)), + ) + .withColumn( + "qualityControls", + StudyIndex.update_quality_flag( + f.col("qualityControls"), + ~f.col("isIdFound"), + StudyQualityCheck.UNRESOLVED_TARGET, + ), + ) + .drop("isIdFound") + ) + + return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) diff --git a/src/gentropy/datasource/cell_ontology/__init__.py b/src/gentropy/datasource/cell_ontology/__init__.py new file mode 100644 index 000000000..c9f3e2075 --- /dev/null +++ b/src/gentropy/datasource/cell_ontology/__init__.py @@ -0,0 +1,3 @@ +"""Cell ontology datasource classes.""" + +from __future__ import annotations diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/cell_ontology/biosample_index.py new file mode 100644 index 000000000..c96bf89a1 --- /dev/null +++ b/src/gentropy/datasource/cell_ontology/biosample_index.py @@ -0,0 +1,65 @@ +"""Biosample index for Cell Ontology data source.""" + +from __future__ import annotations + +from itertools import chain +from typing import TYPE_CHECKING + +import pandas as pd +import pyspark.sql.functions as f +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +import owlready2 as owl + +from gentropy.common.session import Session +from gentropy.dataset.study_index import StudyIndex + +if TYPE_CHECKING: + from pyspark.sql import DataFrame + from pyspark.sql.column import Column + +class CellOntologyStudyIndex: + """Study index dataset from Cell Ontology. + + Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the study index dataset. + + """" + + # Define the schema explicitly for the DataFrame + raw_biosample_schema: StructType = StructType( + [ + StructField("id", StringType(), True), + StructField("code", StringType(), True), + StructField("name", StringType(), True), + StructField("dbXRefs", ArrayType(StringType()), True), + StructField("description", StringType(), True), + StructField("parents", ArrayType(StringType()), True), + StructField("synonyms", ArrayType(StringType()), True), + StructField("ancestors", ArrayType(StringType()), True), + StructField("descendants", ArrayType(StringType()), True), + StructField("children", ArrayType(StringType()), True), + StructField("ontology", MapType(StringType(), BooleanType()), True) + ] + ) + raw_biosample_path = "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl.owl" # Dummy path for now + + @classmethod + def extract_celltypes_from_source( + cls: type[CellOntologyStudyIndex], + session: Session, + mqtl_quantification_methods_blacklist: list[str], + ) -> DataFrame: + """Read raw studies metadata from eQTL Catalogue. + + Args: + session (Session): Spark session. + mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv + + Returns: + DataFrame: raw studies metadata. + """ + pd.DataFrame.iteritems = pd.DataFrame.items + return session.spark.createDataFrame( + pd.read_csv(cls.raw_studies_metadata_path, sep="\t"), + schema=cls.raw_studies_metadata_schema, + ).filter(~(f.col("quant_method").isin(mqtl_quantification_methods_blacklist))) From e8a3775fe38c60646065886bfc7cceae3c490d1c Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 9 Sep 2024 11:08:48 +0100 Subject: [PATCH 02/22] Make minimal class --- src/gentropy/dataset/biosample_index.py | 374 +----------------------- 1 file changed, 3 insertions(+), 371 deletions(-) diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index e8c787ed6..2e445e843 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -20,26 +20,6 @@ from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType - from gentropy.dataset.gene_index import GeneIndex - - -class StudyQualityCheck(Enum): - """Study quality control options listing concerns on the quality of the study. - - Attributes: - UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target. - UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease - UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported. - DUPLICATED_STUDY (str): Flagging if a study identifier is not unique. - NO_GENE_PROVIDED (str): Flagging QTL studies if the measured - """ - - UNRESOLVED_TARGET = "Target/gene identifier could not match to reference." - UNRESOLVED_DISEASE = "No valid disease identifier found." - UNKNOWN_STUDY_TYPE = "This type of study is not supported." - DUPLICATED_STUDY = "The identifier of this study is not unique." - NO_GENE_PROVIDED = "QTL study doesn't have gene assigned." - @dataclass class StudyIndex(Dataset): @@ -48,359 +28,11 @@ class StudyIndex(Dataset): A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL. """ - @staticmethod - def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column: - """Aggregate sample counts by ancestry in a list of struct colmns. - - Args: - merged (Column): A column representing merged data (list of structs). - ancestry (Column): The `ancestry` parameter is a column that represents the ancestry of each - sample. (a struct) - - Returns: - Column: the modified "merged" column after aggregating the samples by ancestry. - """ - # Iterating over the list of ancestries and adding the sample size if label matches: - return f.transform( - merged, - lambda a: f.when( - a.ancestry == ancestry.ancestry, - f.struct( - a.ancestry.alias("ancestry"), - (a.sampleSize + ancestry.sampleSize).alias("sampleSize"), - ), - ).otherwise(a), - ) - - @staticmethod - def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column: - """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map. - - This function assumes all possible ancestry categories have a corresponding - LD panel in the LD index. It is very important to have the ancestry labels - moved to the LD panel map. - - Args: - gwas_ancestry_label (Column): A struct column with ancestry label like Finnish, - European, African etc. and the corresponding sample size. - - Returns: - Column: Struct column with the mapped LD population label and the sample size. - """ - # Loading ancestry label to LD population label: - json_dict = json.loads( - pkg_resources.read_text( - data, "gwas_population_2_LD_panel_map.json", encoding="utf-8" - ) - ) - map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())]) - - return f.struct( - map_expr[gwas_ancestry_label.ancestry].alias("ancestry"), - gwas_ancestry_label.sampleSize.alias("sampleSize"), - ) - @classmethod def get_schema(cls: type[StudyIndex]) -> StructType: - """Provide the schema for the StudyIndex dataset. - - Returns: - StructType: The schema of the StudyIndex dataset. - """ - return parse_spark_schema("study_index.json") - - @classmethod - def aggregate_and_map_ancestries( - cls: type[StudyIndex], discovery_samples: Column - ) -> Column: - """Map ancestries to populations in the LD reference and calculate relative sample size. - - Args: - discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns - - Returns: - Column: A list of struct with mapped LD population and their relative sample size. - """ - # Map ancestry categories to population labels of the LD index: - mapped_ancestries = f.transform( - discovery_samples, cls._map_ancestries_to_ld_population - ) - - # Aggregate sample sizes belonging to the same LD population: - aggregated_counts = f.aggregate( - mapped_ancestries, - f.array_distinct( - f.transform( - mapped_ancestries, - lambda x: f.struct( - x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize") - ), - ) - ), - cls._aggregate_samples_by_ancestry, - ) - # Getting total sample count: - total_sample_count = f.aggregate( - aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize - ).alias("sampleSize") - - # Calculating relative sample size for each LD population: - return f.transform( - aggregated_counts, - lambda ld_population: f.struct( - ld_population.ancestry.alias("ldPopulation"), - (ld_population.sampleSize / total_sample_count).alias( - "relativeSampleSize" - ), - ), - ) - - def study_type_lut(self: StudyIndex) -> DataFrame: - """Return a lookup table of study type. - - Returns: - DataFrame: A dataframe containing `studyId` and `studyType` columns. - """ - return self.df.select("studyId", "studyType") - - def is_qtl(self: StudyIndex) -> Column: - """Return a boolean column with true values for QTL studies. - - Returns: - Column: True if the study is a QTL study. - """ - return self.df.studyType.endswith("qtl") - - def is_gwas(self: StudyIndex) -> Column: - """Return a boolean column with true values for GWAS studies. - - Returns: - Column: True if the study is a GWAS study. - """ - return self.df.studyType == "gwas" - - def has_mapped_trait(self: StudyIndex) -> Column: - """Return a boolean column indicating if a study has mapped disease. - - Returns: - Column: True if the study has mapped disease. - """ - return f.size(self.df.traitFromSourceMappedIds) > 0 - - def is_quality_flagged(self: StudyIndex) -> Column: - """Return a boolean column indicating if a study is flagged due to quality issues. + """Provide the schema for the BioSampleIndex dataset. Returns: - Column: True if the study is flagged. + StructType: The schema of the BioSampleIndex dataset. """ - # Testing for the presence of the qualityControls column: - if "qualityControls" not in self.df.columns: - return f.lit(False) - else: - return f.size(self.df.qualityControls) != 0 - - def has_summarystats(self: StudyIndex) -> Column: - """Return a boolean column indicating if a study has harmonized summary statistics. - - Returns: - Column: True if the study has harmonized summary statistics. - """ - return self.df.hasSumstats - - def validate_unique_study_id(self: StudyIndex) -> StudyIndex: - """Validating the uniqueness of study identifiers and flagging duplicated studies. - - Returns: - StudyIndex: with flagged duplicated studies. - """ - validated_df = ( - self.df.withColumn( - "isDuplicated", - f.when( - f.count("studyType").over( - Window.partitionBy("studyId").rowsBetween( - Window.unboundedPreceding, Window.unboundedFollowing - ) - ) - > 1, - True, - ).otherwise(False), - ) - .withColumn( - "qualityControls", - StudyIndex.update_quality_flag( - f.col("qualityControls"), - f.col("isDuplicated"), - StudyQualityCheck.DUPLICATED_STUDY, - ), - ) - .drop("isDuplicated") - ) - return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) - - def _normalise_disease( - self: StudyIndex, - source_disease_column_name: str, - disease_column_name: str, - disease_map: DataFrame, - ) -> DataFrame: - """Normalising diseases in the study index. - - Given a reference disease map (containing all potential EFO ids with the corresponding reference disease ids), - this function maps all EFO ids in the study index to the reference disease ids. - - Args: - source_disease_column_name (str): The column name of the disease column to validate. - disease_column_name (str): The resulting disease column name that contains the validated ids. - disease_map (DataFrame): Reference dataframe with diseases - - Returns: - DataFrame: where the newly added diseaseIds column will contain the validated EFO identifiers. - """ - return ( - self.df - # Only validating studies with diseases: - .filter(f.size(f.col(source_disease_column_name)) > 0) - # Explode disease column: - .select( - "studyId", - "studyType", - f.explode_outer(source_disease_column_name).alias("efo"), - ) - # Join disease map: - .join(disease_map, on="efo", how="left") - .groupBy("studyId") - .agg( - f.collect_set(f.col("diseaseId")).alias(disease_column_name), - ) - ) - - def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex: - """Validate diseases in the study index dataset. - - Args: - disease_map (DataFrame): a dataframe with two columns (efo, diseaseId). - - Returns: - StudyIndex: where gwas studies are flagged where no valid disease id could be found. - """ - # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present: - if ("traitFromSourceMappedIds" not in self.df.columns) or ( - "backgroundTraitFromSourceMappedIds" not in self.df.columns - ): - return self - - # Disease Column names: - foreground_disease_column = "diseaseIds" - background_disease_column = "backgroundDiseaseIds" - - # If diseaseId in schema, we need to drop it: - drop_columns = [ - column - for column in self.df.columns - if column in [foreground_disease_column, background_disease_column] - ] - - if len(drop_columns) > 0: - self.df = self.df.drop(*drop_columns) - - # Normalise disease: - normalised_disease = self._normalise_disease( - "traitFromSourceMappedIds", foreground_disease_column, disease_map - ) - normalised_background_disease = self._normalise_disease( - "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map - ) - - return StudyIndex( - _df=( - self.df.join(normalised_disease, on="studyId", how="left") - .join(normalised_background_disease, on="studyId", how="left") - # Updating disease columns: - .withColumn( - foreground_disease_column, - f.when( - f.col(foreground_disease_column).isNull(), f.array() - ).otherwise(f.col(foreground_disease_column)), - ) - .withColumn( - background_disease_column, - f.when( - f.col(background_disease_column).isNull(), f.array() - ).otherwise(f.col(background_disease_column)), - ) - # Flagging gwas studies where no valid disease is avilable: - .withColumn( - "qualityControls", - StudyIndex.update_quality_flag( - f.col("qualityControls"), - # Flagging all gwas studies with no normalised disease: - (f.size(f.col(foreground_disease_column)) == 0) - & (f.col("studyType") == "gwas"), - StudyQualityCheck.UNRESOLVED_DISEASE, - ), - ) - ), - _schema=StudyIndex.get_schema(), - ) - - def validate_study_type(self: StudyIndex) -> StudyIndex: - """Validating study type and flag unsupported types. - - Returns: - StudyIndex: with flagged studies with unsupported type. - """ - validated_df = ( - self.df - # Flagging unsupported study types: - .withColumn( - "qualityControls", - StudyIndex.update_quality_flag( - f.col("qualityControls"), - f.when( - (f.col("studyType") == "gwas") - | f.col("studyType").endswith("qtl"), - False, - ).otherwise(True), - StudyQualityCheck.UNKNOWN_STUDY_TYPE, - ), - ) - ) - return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) - - def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex: - """Validating gene identifiers in the study index against the provided target index. - - Args: - target_index (GeneIndex): gene index containing the reference gene identifiers (Ensembl gene identifiers). - - Returns: - StudyIndex: with flagged studies if geneId could not be validated. - """ - gene_set = target_index.df.select("geneId", f.lit(True).alias("isIdFound")) - - # As the geneId is not a mandatory field of study index, we return if the column is not there: - if "geneId" not in self.df.columns: - return self - - validated_df = ( - self.df.join(gene_set, on="geneId", how="left") - .withColumn( - "isIdFound", - f.when( - (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(), - f.lit(False), - ).otherwise(f.lit(True)), - ) - .withColumn( - "qualityControls", - StudyIndex.update_quality_flag( - f.col("qualityControls"), - ~f.col("isIdFound"), - StudyQualityCheck.UNRESOLVED_TARGET, - ), - ) - .drop("isIdFound") - ) - - return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) + return parse_spark_schema("biosample_index.json") From c4d6d5feac2e0f21deaca89199c34a4eb736f537 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 9 Sep 2024 15:07:36 +0100 Subject: [PATCH 03/22] Tidy up first draft of adding biosample index --- .../assets/schemas/biosample_index.json | 4 +- src/gentropy/dataset/biosample_index.py | 103 +++++++++++++++++- .../cell_ontology/biosample_index.py | 49 +++------ src/gentropy/datasource/uberon/__init__.py | 3 + .../datasource/uberon/biosample_index.py | 48 ++++++++ 5 files changed, 166 insertions(+), 41 deletions(-) create mode 100644 src/gentropy/datasource/uberon/__init__.py create mode 100644 src/gentropy/datasource/uberon/biosample_index.py diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index 5ef3f02c3..27c4a508f 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -2,13 +2,13 @@ "type": "struct", "fields": [ { - "name": "id", + "name": "biosampleIndex", "type": "string", "nullable": true, "metadata": {} }, { - "name": "name", + "name": "biosampleName", "type": "string", "nullable": true, "metadata": {} diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 2e445e843..4646a7774 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -1,4 +1,4 @@ -"""Study index dataset.""" +"""Biosample index dataset.""" from __future__ import annotations @@ -22,17 +22,108 @@ @dataclass -class StudyIndex(Dataset): - """Study index dataset. +class BiosampleIndex(Dataset): + """Biosample index dataset. - A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL. + A Biosample index dataset captures the metadata of the biosamples (e.g. tissues, cell types, cell lines, etc) such as alternate names and relationships with other biosamples. """ @classmethod def get_schema(cls: type[StudyIndex]) -> StructType: - """Provide the schema for the BioSampleIndex dataset. + """Provide the schema for the BiosampleIndex dataset. Returns: - StructType: The schema of the BioSampleIndex dataset. + StructType: The schema of the BiosampleIndex dataset. """ return parse_spark_schema("biosample_index.json") + + +def extract_ontology_info( + ontology : owlready2.namespace.Ontology, + prefix : str, + session : Session, + schema : StructType = BiosampleIndex.get_schema(), +) -> BiosampleIndex: + """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. + + Args: + ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon. + prefix (str): Prefix for the desired ontology terms. + session (Session): Spark session. + + Returns: + BiosampleIndex: Parsed and annotated biosample index table. + """ + + # Iterate over all classes in the ontology + for cls in ont.classes(): + if cls.name.startswith(prefix): + # Basic class information + cls_id = cls.name + # cls_code = cls.iri + cls_name = cls.label[0] if cls.label else None + + # Extract descriptions + description = None + if hasattr(cls, 'IAO_0000115'): + description = cls.IAO_0000115.first() if cls.IAO_0000115 else None + + # Extract dbXRefs + dbXRefs = [x for x in cls.hasDbXref] if hasattr(cls, 'hasDbXref') else [] + + # Parent classes + parents = [] + for parent in cls.is_a: + if parent is owl.Thing: + continue # Skip owlready2 Thing class, which is a top-level class + elif hasattr(parent, 'name'): + parent_id = parent.name + parents.append(parent_id) + elif hasattr(parent, 'property'): # For restrictions + continue # We skip restrictions in this simplified list + + # Synonyms + synonyms = set() + if hasattr(cls, 'hasExactSynonym'): + synonyms.update(cls.hasExactSynonym) + if hasattr(cls, 'hasBroadSynonym'): + synonyms.update(cls.hasBroadSynonym) + if hasattr(cls, 'hasNarrowSynonym'): + synonyms.update(cls.hasNarrowSynonym) + if hasattr(cls, 'hasRelatedSynonym'): + synonyms.update(cls.hasRelatedSynonym) + + # Children classes + children = [child.name for child in cls.subclasses()] + + # Ancestors and descendants with Thing class filtered out + ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] + descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] + + # Check if the class is deprecated + is_deprecated = False + if hasattr(cls, 'deprecated') and cls.deprecated: + is_deprecated = True + + # Compile all information into a Row + entry = Row( + id=cls_id, + # code=cls_code, + name=cls_name, + dbXRefs=dbXRefs, + description=description, + parents=parents, + synonyms=list(synonyms), + ancestors=ancestors, + descendants=descendants, + children=children, + ontology={"is_obsolete": is_deprecated} + ) + + # Add to data list + data.append(entry) + + + # Create DataFrame directly from Rows + df = spark2.createDataFrame(data, schema) + return df diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/cell_ontology/biosample_index.py index c96bf89a1..3ec2d7be4 100644 --- a/src/gentropy/datasource/cell_ontology/biosample_index.py +++ b/src/gentropy/datasource/cell_ontology/biosample_index.py @@ -12,54 +12,37 @@ import owlready2 as owl from gentropy.common.session import Session -from gentropy.dataset.study_index import StudyIndex +from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info if TYPE_CHECKING: from pyspark.sql import DataFrame from pyspark.sql.column import Column -class CellOntologyStudyIndex: - """Study index dataset from Cell Ontology. +class CellOntologyBiosampleIndex: + """Biosample index dataset from Cell Ontology. - Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the study index dataset. - - """" - - # Define the schema explicitly for the DataFrame - raw_biosample_schema: StructType = StructType( - [ - StructField("id", StringType(), True), - StructField("code", StringType(), True), - StructField("name", StringType(), True), - StructField("dbXRefs", ArrayType(StringType()), True), - StructField("description", StringType(), True), - StructField("parents", ArrayType(StringType()), True), - StructField("synonyms", ArrayType(StringType()), True), - StructField("ancestors", ArrayType(StringType()), True), - StructField("descendants", ArrayType(StringType()), True), - StructField("children", ArrayType(StringType()), True), - StructField("ontology", MapType(StringType(), BooleanType()), True) - ] - ) - raw_biosample_path = "https://raw.githubusercontent.com/obophenotype/cell-ontology/master/cl.owl" # Dummy path for now + Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the biosample index dataset. + """ @classmethod def extract_celltypes_from_source( cls: type[CellOntologyStudyIndex], session: Session, - mqtl_quantification_methods_blacklist: list[str], + ontology_path: str, ) -> DataFrame: - """Read raw studies metadata from eQTL Catalogue. + """Ingests Cell Ontology owo file and extracts cell types. Args: session (Session): Spark session. - mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv + ontology_path (str): Path to the Cell ontology owo file. Returns: - DataFrame: raw studies metadata. + BiosampleIndex: Parsed and annotated Cell Ontology biosample index table. """ - pd.DataFrame.iteritems = pd.DataFrame.items - return session.spark.createDataFrame( - pd.read_csv(cls.raw_studies_metadata_path, sep="\t"), - schema=cls.raw_studies_metadata_schema, - ).filter(~(f.col("quant_method").isin(mqtl_quantification_methods_blacklist))) + ontology_data = owl.get_ontology(ontology_path).load() + df = extract_ontology_info(ontology_data, "CL_", session, BiosampleIndex.get_schema()) + + return BiosampleIndex( + _df=df, + _schema=BiosampleIndex.get_schema() + ) \ No newline at end of file diff --git a/src/gentropy/datasource/uberon/__init__.py b/src/gentropy/datasource/uberon/__init__.py new file mode 100644 index 000000000..11899e25b --- /dev/null +++ b/src/gentropy/datasource/uberon/__init__.py @@ -0,0 +1,3 @@ +"""Uberon datasource classes.""" + +from __future__ import annotations diff --git a/src/gentropy/datasource/uberon/biosample_index.py b/src/gentropy/datasource/uberon/biosample_index.py new file mode 100644 index 000000000..d07248b5e --- /dev/null +++ b/src/gentropy/datasource/uberon/biosample_index.py @@ -0,0 +1,48 @@ +"""Biosample index for Uberon data source.""" + +from __future__ import annotations + +from itertools import chain +from typing import TYPE_CHECKING + +import pandas as pd +import pyspark.sql.functions as f +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + +import owlready2 as owl + +from gentropy.common.session import Session +from gentropy.dataset.biosample_index import BiosampleIndex + +if TYPE_CHECKING: + from pyspark.sql import DataFrame + from pyspark.sql.column import Column + +class UberonBiosampleIndex: + """Biosample index dataset from Uberon. + + Cell type data is extracted from the Uberon (UBERON) https://obophenotype.github.io/uberon/ and used to define the tissues in the biosample index dataset. + """ + + @classmethod + def extract_tissue_from_source( + cls: type[UberonStudyIndex], + session: Session, + ontology_path: str, + ) -> DataFrame: + """Ingests Uberon owo file and extracts tissues. + + Args: + session (Session): Spark session. + ontology_path (str): Path to the Uberon owo file. + + Returns: + BiosampleIndex: Parsed and annotated Uberon biosample index table. + """ + ontology_data = owl.get_ontology(ontology_path).load() + df = extract_ontology_info(ontology_data, "UBERON_", session, BiosampleIndex.get_schema()) + + return BiosampleIndex( + _df=df, + _schema=BiosampleIndex.get_schema() + ) From 186e77313413fb21bd45703890457d2afa19cfa1 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 10 Sep 2024 09:42:09 +0100 Subject: [PATCH 04/22] Add beginning of logic for checking if biosample from a studyindex is in biosample index --- .../assets/schemas/biosample_index.json | 2 +- src/gentropy/dataset/study_index.py | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index 27c4a508f..df8f82188 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -2,7 +2,7 @@ "type": "struct", "fields": [ { - "name": "biosampleIndex", + "name": "biosampleId", "type": "string", "nullable": true, "metadata": {} diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index e8c787ed6..cc4dabae5 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -404,3 +404,37 @@ def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex: ) return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) + + def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> StudyIndex: + """Validating biosample identifiers in the study index against the provided biosample index. + + Args: + biosample_index (BiosampleIndex): Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc. + + Returns: + StudyIndex: with flagged studies if biosampleIndex could not be validated. + """ + biosample_set = biosample_index.df.select("biosampleId", f.lit(True).alias("isIdFound")) + + validated_df = ( + self.df.join(biosample_set, on="biosampleId", how="left") + .withColumn( + "isIdFound", + f.when( + f.col("isIdFound").isNull(), + f.lit(False), + ).otherwise(f.lit(True)), + ) + .withColumn( + "qualityControls", + StudyIndex.update_quality_flag( + f.col("qualityControls"), + ~f.col("isIdFound"), + StudyQualityCheck.NO_GENE_PROVIDED, + ), + ) + .drop("isIdFound") + ) + + return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) + From 6f0a2e2711c367b5331d7cb52ac63c7118f8259b Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 10 Sep 2024 09:42:39 +0100 Subject: [PATCH 05/22] Make early file for merging multiple biosample indices into one --- src/gentropy/biosample_index.py | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/gentropy/biosample_index.py diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py new file mode 100644 index 000000000..26beb0ea5 --- /dev/null +++ b/src/gentropy/biosample_index.py @@ -0,0 +1,37 @@ +"""Step to generate biosample index dataset.""" +from __future__ import annotations + +from gentropy.common.session import Session +from gentropy.datasource.open_targets.target import OpenTargetsTarget +from gentropy.dataset.biosample_index import BiosampleIndex +from gentropy.datasource.cell_ontology.biosample_index import CellOntologyBiosampleIndex +from gentropy.datasource.uberon.biosample_index import UberonBiosampleIndex + + +class BiosampleIndexStep: + """Biosample index step. + + This step generates a Biosample index dataset from the various ontology sources. Currently Cell Ontology and Uberon are supported. + """ + + def __init__( + self, + session: Session, + cell_ontology_input_path: str, + uberon_input_path: str, + biosample_index_output_path: str, + ) -> None: + """Run Biosample index generation step. + + Args: + session (Session): Session object. + cell_ontology_input_path (str): Input cell ontology dataset path. + uberon_input_path (str): Input uberon dataset path. + biosample_index_output_path (str): Output gene index dataset path. + """ + cell_ontology_index = CellOntologyBiosampleIndex.extract_celltypes_from_source( + session, cell_ontology_input_path + ) + uberon_index = UberonBiosampleIndex.extract_tissue_from_source( + session, uberon_input_path + ) From 55e2baf90cfdd305d5fb649afa394240f36cbaae Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 10 Sep 2024 11:59:45 +0100 Subject: [PATCH 06/22] Finish adding basic iteration of biosample index, needs debugging --- src/gentropy/biosample_index.py | 3 +++ src/gentropy/dataset/biosample_index.py | 19 +++++++++++++++++++ src/gentropy/study_validation.py | 3 +++ 3 files changed, 25 insertions(+) diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index 26beb0ea5..b5864a5ba 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -35,3 +35,6 @@ def __init__( uberon_index = UberonBiosampleIndex.extract_tissue_from_source( session, uberon_input_path ) + biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index]) + biosample_index.write_parquet(biosample_index_output_path) + diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 4646a7774..2dc547cd8 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -11,6 +11,7 @@ from pyspark.sql import functions as f from pyspark.sql.window import Window +from functools import reduce from gentropy.assets import data from gentropy.common.schemas import parse_spark_schema @@ -37,6 +38,23 @@ def get_schema(cls: type[StudyIndex]) -> StructType: """ return parse_spark_schema("biosample_index.json") + @classmethod + def merge( + cls: type[BiosampleIndex], + biosample_indexes: list[BiosampleIndex], + ) -> BiosampleIndex: + """Merge a list of biosample indexes into a single biosample index. + + Args: + biosample_indexes (BiosampleIndex): Biosample indexes to merge. + + Returns: + BiosampleIndex: Merged biosample index. + """ + df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes]) + return BiosampleIndex(_df=df, _schema=BiosampleIndex.get_schema()) + + def extract_ontology_info( ontology : owlready2.namespace.Ontology, @@ -127,3 +145,4 @@ def extract_ontology_info( # Create DataFrame directly from Rows df = spark2.createDataFrame(data, schema) return df + diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py index 5bfb83fe0..3e926078d 100644 --- a/src/gentropy/study_validation.py +++ b/src/gentropy/study_validation.py @@ -22,6 +22,7 @@ def __init__( study_index_path: list[str], target_index_path: str, disease_index_path: str, + biosample_index_path: str, valid_study_index_path: str, invalid_study_index_path: str, invalid_qc_reasons: list[str] = [], @@ -55,6 +56,7 @@ def __init__( .withColumn("efo", f.coalesce(f.col("efo"), f.col("diseaseId"))) ) study_index = StudyIndex.from_parquet(session, list(study_index_path)) + biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path) # Running validation: study_index_with_qc = ( @@ -63,6 +65,7 @@ def __init__( .validate_study_type() # Flagging non-supported study types. .validate_target(target_index) # Flagging QTL studies with invalid targets .validate_disease(disease_index) # Flagging invalid EFOs + .validate_biosample(biosample_index) # Flagging invalid biosamples ).persist() # we will need this for 2 types of outputs study_index_with_qc.valid_rows( From 692732b3c2c84dac74f250355f581155267f6b78 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Fri, 13 Sep 2024 12:05:34 +0000 Subject: [PATCH 07/22] Tweak slightly --- .../assets/schemas/biosample_index.json | 32 +++- src/gentropy/biosample_index.py | 4 +- src/gentropy/dataset/biosample_index.py | 148 +++++++++--------- .../datasource/cell_ontology/__init__.py | 3 - .../cell_ontology.py} | 0 .../uberon.py} | 1 + src/gentropy/datasource/ontologies/utils.py | 91 +++++++++++ src/gentropy/datasource/uberon/__init__.py | 3 - .../gentropy/dataset/test_biosample_index.py | 31 ++++ 9 files changed, 225 insertions(+), 88 deletions(-) delete mode 100644 src/gentropy/datasource/cell_ontology/__init__.py rename src/gentropy/datasource/{cell_ontology/biosample_index.py => ontologies/cell_ontology.py} (100%) rename src/gentropy/datasource/{uberon/biosample_index.py => ontologies/uberon.py} (99%) create mode 100644 src/gentropy/datasource/ontologies/utils.py delete mode 100644 src/gentropy/datasource/uberon/__init__.py create mode 100644 tests/gentropy/dataset/test_biosample_index.py diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index df8f82188..cd91f090b 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -4,7 +4,7 @@ { "name": "biosampleId", "type": "string", - "nullable": true, + "nullable": false, "metadata": {} }, { @@ -14,14 +14,30 @@ "metadata": {} }, { - "name": "dbXRefs", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true - }, + "metadata": {}, + "name": "dbXrefs", "nullable": true, - "metadata": {} + "type": { + "containsNull": true, + "elementType": { + "fields": [ + { + "metadata": {}, + "name": "id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "source", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "type": "array" + } }, { "name": "description", diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index b5864a5ba..43d1511bc 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -29,10 +29,10 @@ def __init__( uberon_input_path (str): Input uberon dataset path. biosample_index_output_path (str): Output gene index dataset path. """ - cell_ontology_index = CellOntologyBiosampleIndex.extract_celltypes_from_source( + cell_ontology_index = BiosampleIndex.extract_from_source( session, cell_ontology_input_path ) - uberon_index = UberonBiosampleIndex.extract_tissue_from_source( + uberon_index = BiosampleIndex.extract_from_source( session, uberon_input_path ) biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index]) diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 2dc547cd8..9905afd91 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -17,10 +17,14 @@ from gentropy.common.schemas import parse_spark_schema from gentropy.dataset.dataset import Dataset + +from pyspark.sql import Column, DataFrame, Row + if TYPE_CHECKING: - from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType +import owlready2 as owl + @dataclass class BiosampleIndex(Dataset): @@ -52,15 +56,13 @@ def merge( BiosampleIndex: Merged biosample index. """ df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes]) - return BiosampleIndex(_df=df, _schema=BiosampleIndex.get_schema()) + return BiosampleIndex(_df=df, _schema=cls.get_schema()) - def extract_ontology_info( ontology : owlready2.namespace.Ontology, - prefix : str, session : Session, - schema : StructType = BiosampleIndex.get_schema(), + schema : StructType ) -> BiosampleIndex: """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. @@ -72,77 +74,79 @@ def extract_ontology_info( Returns: BiosampleIndex: Parsed and annotated biosample index table. """ + data_list = [] # Iterate over all classes in the ontology - for cls in ont.classes(): - if cls.name.startswith(prefix): - # Basic class information - cls_id = cls.name - # cls_code = cls.iri - cls_name = cls.label[0] if cls.label else None - - # Extract descriptions - description = None - if hasattr(cls, 'IAO_0000115'): - description = cls.IAO_0000115.first() if cls.IAO_0000115 else None - - # Extract dbXRefs - dbXRefs = [x for x in cls.hasDbXref] if hasattr(cls, 'hasDbXref') else [] - - # Parent classes - parents = [] - for parent in cls.is_a: - if parent is owl.Thing: - continue # Skip owlready2 Thing class, which is a top-level class - elif hasattr(parent, 'name'): - parent_id = parent.name - parents.append(parent_id) - elif hasattr(parent, 'property'): # For restrictions - continue # We skip restrictions in this simplified list - - # Synonyms - synonyms = set() - if hasattr(cls, 'hasExactSynonym'): - synonyms.update(cls.hasExactSynonym) - if hasattr(cls, 'hasBroadSynonym'): - synonyms.update(cls.hasBroadSynonym) - if hasattr(cls, 'hasNarrowSynonym'): - synonyms.update(cls.hasNarrowSynonym) - if hasattr(cls, 'hasRelatedSynonym'): - synonyms.update(cls.hasRelatedSynonym) - - # Children classes - children = [child.name for child in cls.subclasses()] - - # Ancestors and descendants with Thing class filtered out - ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] - descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] - - # Check if the class is deprecated - is_deprecated = False - if hasattr(cls, 'deprecated') and cls.deprecated: - is_deprecated = True - - # Compile all information into a Row - entry = Row( - id=cls_id, - # code=cls_code, - name=cls_name, - dbXRefs=dbXRefs, - description=description, - parents=parents, - synonyms=list(synonyms), - ancestors=ancestors, - descendants=descendants, - children=children, - ontology={"is_obsolete": is_deprecated} - ) - - # Add to data list - data.append(entry) + for cls in ontology.classes(): + # Basic class information + cls_id = cls.name + # cls_code = cls.iri + cls_name = cls.label[0] if cls.label else None + + # Extract descriptions + description = None + if hasattr(cls, 'IAO_0000115'): + description = cls.IAO_0000115.first() if cls.IAO_0000115 else None + + # Extract dbXRefs + dbXRefs = [] + if hasattr(cls, 'hasDbXref'): + dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref] + + # Parent classes + parents = [] + for parent in cls.is_a: + if parent is owl.Thing: + continue # Skip owlready2 Thing class, which is a top-level class + elif hasattr(parent, 'name'): + parent_id = parent.name + parents.append(parent_id) + elif hasattr(parent, 'property'): # For restrictions + continue # We skip restrictions in this simplified list + + # Synonyms + synonyms = set() + if hasattr(cls, 'hasExactSynonym'): + synonyms.update(cls.hasExactSynonym) + if hasattr(cls, 'hasBroadSynonym'): + synonyms.update(cls.hasBroadSynonym) + if hasattr(cls, 'hasNarrowSynonym'): + synonyms.update(cls.hasNarrowSynonym) + if hasattr(cls, 'hasRelatedSynonym'): + synonyms.update(cls.hasRelatedSynonym) + + # Children classes + children = [child.name for child in cls.subclasses()] + + # Ancestors and descendants with Thing class filtered out + ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] + descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] + + # Check if the class is deprecated + is_deprecated = False + if hasattr(cls, 'deprecated') and cls.deprecated: + is_deprecated = True + + # Compile all information into a Row + entry = Row( + id=cls_id, + # code=cls_code, + name=cls_name, + dbXRefs=dbXRefs, + description=description, + parents=parents, + synonyms=list(synonyms), + ancestors=ancestors, + descendants=descendants, + children=children, + ontology={"is_obsolete": is_deprecated} + ) + + # Add to data list + data_list.append(entry) # Create DataFrame directly from Rows - df = spark2.createDataFrame(data, schema) + df = session.createDataFrame(data_list, schema) return df diff --git a/src/gentropy/datasource/cell_ontology/__init__.py b/src/gentropy/datasource/cell_ontology/__init__.py deleted file mode 100644 index c9f3e2075..000000000 --- a/src/gentropy/datasource/cell_ontology/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Cell ontology datasource classes.""" - -from __future__ import annotations diff --git a/src/gentropy/datasource/cell_ontology/biosample_index.py b/src/gentropy/datasource/ontologies/cell_ontology.py similarity index 100% rename from src/gentropy/datasource/cell_ontology/biosample_index.py rename to src/gentropy/datasource/ontologies/cell_ontology.py diff --git a/src/gentropy/datasource/uberon/biosample_index.py b/src/gentropy/datasource/ontologies/uberon.py similarity index 99% rename from src/gentropy/datasource/uberon/biosample_index.py rename to src/gentropy/datasource/ontologies/uberon.py index d07248b5e..a59ce2df4 100644 --- a/src/gentropy/datasource/uberon/biosample_index.py +++ b/src/gentropy/datasource/ontologies/uberon.py @@ -13,6 +13,7 @@ from gentropy.common.session import Session from gentropy.dataset.biosample_index import BiosampleIndex +from grn if TYPE_CHECKING: from pyspark.sql import DataFrame diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py new file mode 100644 index 000000000..6f2043c93 --- /dev/null +++ b/src/gentropy/datasource/ontologies/utils.py @@ -0,0 +1,91 @@ + +def extract_ontology_info( + ontology : owlready2.namespace.Ontology, + session : Session, + schema : StructType +) -> BiosampleIndex: + """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. + + Args: + ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon. + prefix (str): Prefix for the desired ontology terms. + session (Session): Spark session. + + Returns: + BiosampleIndex: Parsed and annotated biosample index table. + """ + data_list = [] + + # Iterate over all classes in the ontology + for cls in ontology.classes(): + # Basic class information + cls_id = cls.name + # cls_code = cls.iri + cls_name = cls.label[0] if cls.label else None + + # Extract descriptions + description = None + if hasattr(cls, 'IAO_0000115'): + description = cls.IAO_0000115.first() if cls.IAO_0000115 else None + + # Extract dbXRefs + dbXRefs = [] + if hasattr(cls, 'hasDbXref'): + dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref] + + # Parent classes + parents = [] + for parent in cls.is_a: + if parent is owl.Thing: + continue # Skip owlready2 Thing class, which is a top-level class + elif hasattr(parent, 'name'): + parent_id = parent.name + parents.append(parent_id) + elif hasattr(parent, 'property'): # For restrictions + continue # We skip restrictions in this simplified list + + # Synonyms + synonyms = set() + if hasattr(cls, 'hasExactSynonym'): + synonyms.update(cls.hasExactSynonym) + if hasattr(cls, 'hasBroadSynonym'): + synonyms.update(cls.hasBroadSynonym) + if hasattr(cls, 'hasNarrowSynonym'): + synonyms.update(cls.hasNarrowSynonym) + if hasattr(cls, 'hasRelatedSynonym'): + synonyms.update(cls.hasRelatedSynonym) + + # Children classes + children = [child.name for child in cls.subclasses()] + + # Ancestors and descendants with Thing class filtered out + ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] + descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] + + # Check if the class is deprecated + is_deprecated = False + if hasattr(cls, 'deprecated') and cls.deprecated: + is_deprecated = True + + # Compile all information into a Row + entry = Row( + id=cls_id, + # code=cls_code, + name=cls_name, + dbXRefs=dbXRefs, + description=description, + parents=parents, + synonyms=list(synonyms), + ancestors=ancestors, + descendants=descendants, + children=children, + ontology={"is_obsolete": is_deprecated} + ) + + # Add to data list + data_list.append(entry) + + + # Create DataFrame directly from Rows + df = session.createDataFrame(data_list, schema) + return df diff --git a/src/gentropy/datasource/uberon/__init__.py b/src/gentropy/datasource/uberon/__init__.py deleted file mode 100644 index 11899e25b..000000000 --- a/src/gentropy/datasource/uberon/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Uberon datasource classes.""" - -from __future__ import annotations diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py new file mode 100644 index 000000000..c680b4f19 --- /dev/null +++ b/tests/gentropy/dataset/test_biosample_index.py @@ -0,0 +1,31 @@ +"""Tests on Biosample index.""" + +import pandas as pd +import numpy as np +from pyspark.sql import SparkSession +from pyspark.sql import Row +import pyspark.sql.functions as F +import owlready2 as owl +from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType +import json + +from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info + + +def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: + """Test biosample index creation with mock biosample index.""" + assert isinstance(mock_biosample_index, BiosampleIndex) + + + +cell_ontology = owl.get_ontology("/home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology.owl").load() +spark2 = SparkSession.builder \ + .master("local[*]") \ + .appName("LocalOntologyIndexing") \ + .getOrCreate() + +# Define the schema for the DataFrame +schema_path = '/home/alegbe/repos/gentropy/src/gentropy/assets/schemas/biosample_index.json' +schema = StructType.fromJson(json.load(open(schema_path))) + +df = extract_ontology_info(cell_ontology, spark2, schema) \ No newline at end of file From 30dc23fe0cbc50d5c4854f158ff9eaeec8c124e1 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Fri, 13 Sep 2024 14:53:30 +0000 Subject: [PATCH 08/22] Modified the parser to accept JSON files --- .../assets/schemas/biosample_index.json | 54 +++----- src/gentropy/dataset/biosample_index.py | 105 --------------- src/gentropy/datasource/ontologies/utils.py | 127 +++++++++++++++++- .../gentropy/dataset/test_biosample_index.py | 12 +- 4 files changed, 150 insertions(+), 148 deletions(-) diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index cd91f090b..82ba5ae2b 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -13,32 +13,6 @@ "nullable": true, "metadata": {} }, - { - "metadata": {}, - "name": "dbXrefs", - "nullable": true, - "type": { - "containsNull": true, - "elementType": { - "fields": [ - { - "metadata": {}, - "name": "id", - "nullable": true, - "type": "string" - }, - { - "metadata": {}, - "name": "source", - "nullable": true, - "type": "string" - } - ], - "type": "struct" - }, - "type": "array" - } - }, { "name": "description", "type": "string", @@ -46,7 +20,7 @@ "metadata": {} }, { - "name": "parents", + "name": "dbXrefs", "type": { "type": "array", "elementType": "string", @@ -66,7 +40,18 @@ "metadata": {} }, { - "name": "ancestors", + "name": "deprecated", + "type": { + "type": "map", + "keyType": "string", + "valueType": "boolean", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "parents", "type": { "type": "array", "elementType": "string", @@ -76,7 +61,7 @@ "metadata": {} }, { - "name": "descendants", + "name": "ancestors", "type": { "type": "array", "elementType": "string", @@ -86,7 +71,7 @@ "metadata": {} }, { - "name": "children", + "name": "descendants", "type": { "type": "array", "elementType": "string", @@ -96,12 +81,11 @@ "metadata": {} }, { - "name": "ontology", + "name": "children", "type": { - "type": "map", - "keyType": "string", - "valueType": "boolean", - "valueContainsNull": true + "type": "array", + "elementType": "string", + "containsNull": true }, "nullable": true, "metadata": {} diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 9905afd91..49256fe69 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -42,111 +42,6 @@ def get_schema(cls: type[StudyIndex]) -> StructType: """ return parse_spark_schema("biosample_index.json") - @classmethod - def merge( - cls: type[BiosampleIndex], - biosample_indexes: list[BiosampleIndex], - ) -> BiosampleIndex: - """Merge a list of biosample indexes into a single biosample index. - - Args: - biosample_indexes (BiosampleIndex): Biosample indexes to merge. - Returns: - BiosampleIndex: Merged biosample index. - """ - df = reduct(DataFrame.unionAll, [biosample_index._df for biosample_index in biosample_indexes]) - return BiosampleIndex(_df=df, _schema=cls.get_schema()) -def extract_ontology_info( - ontology : owlready2.namespace.Ontology, - session : Session, - schema : StructType -) -> BiosampleIndex: - """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. - - Args: - ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon. - prefix (str): Prefix for the desired ontology terms. - session (Session): Spark session. - - Returns: - BiosampleIndex: Parsed and annotated biosample index table. - """ - data_list = [] - - # Iterate over all classes in the ontology - for cls in ontology.classes(): - # Basic class information - cls_id = cls.name - # cls_code = cls.iri - cls_name = cls.label[0] if cls.label else None - - # Extract descriptions - description = None - if hasattr(cls, 'IAO_0000115'): - description = cls.IAO_0000115.first() if cls.IAO_0000115 else None - - # Extract dbXRefs - dbXRefs = [] - if hasattr(cls, 'hasDbXref'): - dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref] - - # Parent classes - parents = [] - for parent in cls.is_a: - if parent is owl.Thing: - continue # Skip owlready2 Thing class, which is a top-level class - elif hasattr(parent, 'name'): - parent_id = parent.name - parents.append(parent_id) - elif hasattr(parent, 'property'): # For restrictions - continue # We skip restrictions in this simplified list - - # Synonyms - synonyms = set() - if hasattr(cls, 'hasExactSynonym'): - synonyms.update(cls.hasExactSynonym) - if hasattr(cls, 'hasBroadSynonym'): - synonyms.update(cls.hasBroadSynonym) - if hasattr(cls, 'hasNarrowSynonym'): - synonyms.update(cls.hasNarrowSynonym) - if hasattr(cls, 'hasRelatedSynonym'): - synonyms.update(cls.hasRelatedSynonym) - - # Children classes - children = [child.name for child in cls.subclasses()] - - # Ancestors and descendants with Thing class filtered out - ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] - descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] - - # Check if the class is deprecated - is_deprecated = False - if hasattr(cls, 'deprecated') and cls.deprecated: - is_deprecated = True - - # Compile all information into a Row - entry = Row( - id=cls_id, - # code=cls_code, - name=cls_name, - dbXRefs=dbXRefs, - description=description, - parents=parents, - synonyms=list(synonyms), - ancestors=ancestors, - descendants=descendants, - children=children, - ontology={"is_obsolete": is_deprecated} - ) - - # Add to data list - data_list.append(entry) - - - # Create DataFrame directly from Rows - df = session.createDataFrame(data_list, schema) - return df - diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py index 6f2043c93..d22470778 100644 --- a/src/gentropy/datasource/ontologies/utils.py +++ b/src/gentropy/datasource/ontologies/utils.py @@ -1,10 +1,20 @@ +"""Utility functions for Biosample ontology processing.""" +import owlready2 +from pyspark.sql import Row, SparkSession +from pyspark.sql.types import StructType, StringType, ArrayType +from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce +from pyspark.sql.window import Window +from functools import reduce +from gentropy.dataset.biosample_index import BiosampleIndex + def extract_ontology_info( ontology : owlready2.namespace.Ontology, - session : Session, + spark : SparkSession, schema : StructType ) -> BiosampleIndex: """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. + NOT IN USE Args: ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon. @@ -87,5 +97,118 @@ def extract_ontology_info( # Create DataFrame directly from Rows - df = session.createDataFrame(data_list, schema) + df = spark.createDataFrame(data_list, schema) return df + + +def extract_ontology_from_json( + ontology_json : str, + spark : SparkSession +) -> BiosampleIndex: + """ + Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology. + + Args: + ontology_json (str): Path to the JSON file containing the ontology information. + spark (SparkSession): Spark session. + + Returns: + BiosampleIndex: Parsed and annotated biosample index table. + """ + + def json_graph_traversal(df, node_col, link_col, traversal_type="ancestors"): + """ + Traverse a graph represented in a DataFrame to find all ancestors or descendants. + """ + # Collect graph data as a map + graph_map = df.select(node_col, link_col).rdd.collectAsMap() + broadcasted_graph = spark.sparkContext.broadcast(graph_map) + + def get_relationships(node): + relationships = set() + stack = [node] + while stack: + current = stack.pop() + if current in broadcasted_graph.value: + current_links = broadcasted_graph.value[current] + stack.extend(current_links) + relationships.update(current_links) + return list(relationships) + + # Choose column name based on traversal type + result_col = "ancestors" if traversal_type == "ancestors" else "descendants" + + # Register the UDF based on traversal type + relationship_udf = udf(get_relationships, ArrayType(StringType())) + + # Apply the UDF to create the result column + return df.withColumn(result_col, relationship_udf(col(node_col))) + + # Load the JSON file + df = spark.read.json(ontology_json, multiLine=True) + + # Exploding the 'graphs' array to make individual records easier to access + df_graphs = df.select(explode_outer("graphs").alias("graph")) + + # Exploding the 'nodes' array within each graph + df_nodes = df_graphs.select( + col("graph.id").alias("graph_id"), + explode_outer("graph.nodes").alias("node")) + + # Exploding the 'edges' array within each graph for relationship data + df_edges = df_graphs.select( + col("graph.id").alias("graph_id"), + explode_outer("graph.edges").alias("edge") + ).select( + col("edge.sub").alias("subject"), + col("edge.pred").alias("predicate"), + col("edge.obj").alias("object") + ) + df_edges = df_edges.withColumn("subject", regexp_replace(col("subject"), "http://purl.obolibrary.org/obo/", "")) + df_edges = df_edges.withColumn("object", regexp_replace(col("object"), "http://purl.obolibrary.org/obo/", "")) + + # Extract the relevant information from the nodes + transformed_df = df_nodes.select( + regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"), + col("node.lbl").alias("biosampleName"), + col("node.meta.definition.val").alias("description"), + collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"), + collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"), + col("node.meta.deprecated").alias("deprecated")) + + # Extract the relationships from the edges + # Prepare relationship-specific DataFrames + df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent") + df_children = df_edges.filter(col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child") + + # Aggregate relationships back to nodes + df_parents_grouped = df_parents.groupBy("subject").agg(array_distinct(collect_list("parent")).alias("parents")) + df_children_grouped = df_children.groupBy("object").agg(array_distinct(collect_list("child")).alias("children")) + + # Get all ancestors + df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors") + # Get all descendants + df_with_descendants = json_graph_traversal(df_children_grouped, "object", "children", "descendants") + + # Join the ancestor and descendant DataFrames + df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object") + + # Join the original DataFrame with the relationship DataFrame + final_df = transformed_df.join(df_with_relationships, ['biosampleId'], "left") + + return final_df + + def merge_biosample_indices( + biosample_indices: list[BiosampleIndex], + ) -> BiosampleIndex: + """Merge a list of biosample indexes into a single biosample index. + Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. + + Args: + biosample_indexes (BiosampleIndex): Biosample indexes to merge. + + Returns: + BiosampleIndex: Merged biosample index. + """ + # Merge the DataFrames + merged_df = reduce(DataFrame.unionByName, biosample_indices) \ No newline at end of file diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py index c680b4f19..f7d85e230 100644 --- a/tests/gentropy/dataset/test_biosample_index.py +++ b/tests/gentropy/dataset/test_biosample_index.py @@ -9,7 +9,8 @@ from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType import json -from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info +from gentropy.dataset.biosample_index import BiosampleIndex +from gentropy.datasource.ontologies.utils import extract_ontology_from_json def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: @@ -18,14 +19,13 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: -cell_ontology = owl.get_ontology("/home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology.owl").load() spark2 = SparkSession.builder \ .master("local[*]") \ .appName("LocalOntologyIndexing") \ .getOrCreate() -# Define the schema for the DataFrame -schema_path = '/home/alegbe/repos/gentropy/src/gentropy/assets/schemas/biosample_index.json' -schema = StructType.fromJson(json.load(open(schema_path))) -df = extract_ontology_info(cell_ontology, spark2, schema) \ No newline at end of file +ontology_json = 'file:///home/alegbe/cl.json' +# ontology_json = 'file:///home/alegbe/uberon.json' + +df = extract_ontology_from_json(ontology_json, spark2) \ No newline at end of file From 28e1f92749b000a424a92807c174320a8083718a Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 16 Sep 2024 13:31:16 +0000 Subject: [PATCH 09/22] Update biosample index --- docs/python_api/datasources/_datasources.md | 7 +- .../assets/schemas/biosample_index.json | 11 - src/gentropy/biosample_index.py | 18 +- src/gentropy/config.py | 10 + src/gentropy/dataset/biosample_index.py | 8 +- src/gentropy/dataset/study_index.py | 5 +- .../{gwas_catalog => ontologies}/__init__.py | 2 +- .../datasource/ontologies/cell_ontology.py | 48 -- src/gentropy/datasource/ontologies/uberon.py | 49 -- src/gentropy/datasource/ontologies/utils.py | 147 +--- src/gentropy/study_validation.py | 4 +- .../data_samples/cell_ontology_sample.json | 274 +++++++ .../gentropy/data_samples/uberon_sample.json | 675 ++++++++++++++++++ .../gentropy/dataset/test_biosample_index.py | 12 +- 14 files changed, 1026 insertions(+), 244 deletions(-) rename src/gentropy/datasource/{gwas_catalog => ontologies}/__init__.py (50%) delete mode 100644 src/gentropy/datasource/ontologies/cell_ontology.py delete mode 100644 src/gentropy/datasource/ontologies/uberon.py create mode 100644 tests/gentropy/data_samples/cell_ontology_sample.json create mode 100644 tests/gentropy/data_samples/uberon_sample.json diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md index e6e081b21..f79f8137b 100644 --- a/docs/python_api/datasources/_datasources.md +++ b/docs/python_api/datasources/_datasources.md @@ -26,7 +26,7 @@ This section contains information about the data source harmonisation tools avai 2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data) 3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html) -## Linkage desiquilibrium +## Linkage disequilibrium 1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrixes (7 ancestries) @@ -37,3 +37,8 @@ This section contains information about the data source harmonisation tools avai ## Gene annotation 1. [Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl) + +## Biological samples + +1. [Uberon](ontologies/_uberon.md) +2. [Cell Ontology](ontologies/_cell_ontology.md) \ No newline at end of file diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index 82ba5ae2b..7c28ec970 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -39,17 +39,6 @@ "nullable": true, "metadata": {} }, - { - "name": "deprecated", - "type": { - "type": "map", - "keyType": "string", - "valueType": "boolean", - "valueContainsNull": true - }, - "nullable": true, - "metadata": {} - }, { "name": "parents", "type": { diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index 43d1511bc..11274d789 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -2,10 +2,8 @@ from __future__ import annotations from gentropy.common.session import Session -from gentropy.datasource.open_targets.target import OpenTargetsTarget from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.cell_ontology.biosample_index import CellOntologyBiosampleIndex -from gentropy.datasource.uberon.biosample_index import UberonBiosampleIndex +from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices class BiosampleIndexStep: @@ -29,12 +27,10 @@ def __init__( uberon_input_path (str): Input uberon dataset path. biosample_index_output_path (str): Output gene index dataset path. """ - cell_ontology_index = BiosampleIndex.extract_from_source( - session, cell_ontology_input_path - ) - uberon_index = BiosampleIndex.extract_from_source( - session, uberon_input_path - ) - biosample_index = BiosampleIndex.merge([cell_ontology_index, uberon_index]) - biosample_index.write_parquet(biosample_index_output_path) + cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark) + uberon_index = extract_ontology_from_json(uberon_input_path, session.spark) + + biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index]) + + biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 9089dbecf..114913090 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -51,6 +51,15 @@ class GeneIndexConfig(StepConfig): _target_: str = "gentropy.gene_index.GeneIndexStep" +@dataclass +class BiosampleIndexConfig(StepConfig): + """Biosample index step configuration.""" + + target_path: str = MISSING + biosample_index_path: str = MISSING + _target_: str = "gentropy.biosample_index.BiosampleIndexStep" + + @dataclass class GWASCatalogStudyCurationConfig(StepConfig): """GWAS Catalog study curation step configuration.""" @@ -532,6 +541,7 @@ def register_config() -> None: cs.store(group="step", name="colocalisation", node=ColocalisationConfig) cs.store(group="step", name="eqtl_catalogue", node=EqtlCatalogueConfig) cs.store(group="step", name="gene_index", node=GeneIndexConfig) + cs.store(group="step", name="biosample_index", node=BiosampleIndexConfig) cs.store( group="step", name="gwas_catalog_study_curation", diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 49256fe69..b3735ca62 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -23,8 +23,6 @@ if TYPE_CHECKING: from pyspark.sql.types import StructType -import owlready2 as owl - @dataclass class BiosampleIndex(Dataset): @@ -40,8 +38,4 @@ def get_schema(cls: type[StudyIndex]) -> StructType: Returns: StructType: The schema of the BiosampleIndex dataset. """ - return parse_spark_schema("biosample_index.json") - - - - + return parse_spark_schema("biosample_index.json") \ No newline at end of file diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 43ca171ee..852f14d9e 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -20,6 +20,7 @@ from pyspark.sql.types import StructType from gentropy.dataset.gene_index import GeneIndex + from gentropy.dataset.biosample_index import BiosampleIndex class StudyQualityCheck(Enum): @@ -29,6 +30,7 @@ class StudyQualityCheck(Enum): UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target. UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported. + UNKNOWN_BIOSAMPLE (str): Flagging if a biosample identifier is not found in the reference. DUPLICATED_STUDY (str): Flagging if a study identifier is not unique. NO_GENE_PROVIDED (str): Flagging QTL studies if the measured """ @@ -36,6 +38,7 @@ class StudyQualityCheck(Enum): UNRESOLVED_TARGET = "Target/gene identifier could not match to reference." UNRESOLVED_DISEASE = "No valid disease identifier found." UNKNOWN_STUDY_TYPE = "This type of study is not supported." + UNKNOWN_BIOSAMPLE = "Biosample identifier was not found in the reference." DUPLICATED_STUDY = "The identifier of this study is not unique." NO_GENE_PROVIDED = "QTL study doesn't have gene assigned." @@ -434,7 +437,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu StudyIndex.update_quality_flag( f.col("qualityControls"), ~f.col("isIdFound"), - StudyQualityCheck.NO_GENE_PROVIDED, + StudyQualityCheck.UNKNOWN_BIOSAMPLE, ), ) .drop("isIdFound") diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/ontologies/__init__.py similarity index 50% rename from src/gentropy/datasource/gwas_catalog/__init__.py rename to src/gentropy/datasource/ontologies/__init__.py index 544779b18..d3fa6b416 100644 --- a/src/gentropy/datasource/gwas_catalog/__init__.py +++ b/src/gentropy/datasource/ontologies/__init__.py @@ -1,3 +1,3 @@ -"""GWAS Catalog Data Source.""" +"""Biosample index data source.""" from __future__ import annotations diff --git a/src/gentropy/datasource/ontologies/cell_ontology.py b/src/gentropy/datasource/ontologies/cell_ontology.py deleted file mode 100644 index 3ec2d7be4..000000000 --- a/src/gentropy/datasource/ontologies/cell_ontology.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Biosample index for Cell Ontology data source.""" - -from __future__ import annotations - -from itertools import chain -from typing import TYPE_CHECKING - -import pandas as pd -import pyspark.sql.functions as f -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - -import owlready2 as owl - -from gentropy.common.session import Session -from gentropy.dataset.biosample_index import BiosampleIndex, extract_ontology_info - -if TYPE_CHECKING: - from pyspark.sql import DataFrame - from pyspark.sql.column import Column - -class CellOntologyBiosampleIndex: - """Biosample index dataset from Cell Ontology. - - Cell type data is extracted from the Cell Ontology (CL) https://obophenotype.github.io/cell-ontology/ and used to define the cell types in the biosample index dataset. - """ - - @classmethod - def extract_celltypes_from_source( - cls: type[CellOntologyStudyIndex], - session: Session, - ontology_path: str, - ) -> DataFrame: - """Ingests Cell Ontology owo file and extracts cell types. - - Args: - session (Session): Spark session. - ontology_path (str): Path to the Cell ontology owo file. - - Returns: - BiosampleIndex: Parsed and annotated Cell Ontology biosample index table. - """ - ontology_data = owl.get_ontology(ontology_path).load() - df = extract_ontology_info(ontology_data, "CL_", session, BiosampleIndex.get_schema()) - - return BiosampleIndex( - _df=df, - _schema=BiosampleIndex.get_schema() - ) \ No newline at end of file diff --git a/src/gentropy/datasource/ontologies/uberon.py b/src/gentropy/datasource/ontologies/uberon.py deleted file mode 100644 index a59ce2df4..000000000 --- a/src/gentropy/datasource/ontologies/uberon.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Biosample index for Uberon data source.""" - -from __future__ import annotations - -from itertools import chain -from typing import TYPE_CHECKING - -import pandas as pd -import pyspark.sql.functions as f -from pyspark.sql.types import IntegerType, StringType, StructField, StructType - -import owlready2 as owl - -from gentropy.common.session import Session -from gentropy.dataset.biosample_index import BiosampleIndex -from grn - -if TYPE_CHECKING: - from pyspark.sql import DataFrame - from pyspark.sql.column import Column - -class UberonBiosampleIndex: - """Biosample index dataset from Uberon. - - Cell type data is extracted from the Uberon (UBERON) https://obophenotype.github.io/uberon/ and used to define the tissues in the biosample index dataset. - """ - - @classmethod - def extract_tissue_from_source( - cls: type[UberonStudyIndex], - session: Session, - ontology_path: str, - ) -> DataFrame: - """Ingests Uberon owo file and extracts tissues. - - Args: - session (Session): Spark session. - ontology_path (str): Path to the Uberon owo file. - - Returns: - BiosampleIndex: Parsed and annotated Uberon biosample index table. - """ - ontology_data = owl.get_ontology(ontology_path).load() - df = extract_ontology_info(ontology_data, "UBERON_", session, BiosampleIndex.get_schema()) - - return BiosampleIndex( - _df=df, - _schema=BiosampleIndex.get_schema() - ) diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py index d22470778..adf38c4da 100644 --- a/src/gentropy/datasource/ontologies/utils.py +++ b/src/gentropy/datasource/ontologies/utils.py @@ -1,106 +1,11 @@ """Utility functions for Biosample ontology processing.""" -import owlready2 -from pyspark.sql import Row, SparkSession +from pyspark.sql import Row, SparkSession, DataFrame from pyspark.sql.types import StructType, StringType, ArrayType -from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce +from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce, first from pyspark.sql.window import Window from functools import reduce from gentropy.dataset.biosample_index import BiosampleIndex - -def extract_ontology_info( - ontology : owlready2.namespace.Ontology, - spark : SparkSession, - schema : StructType -) -> BiosampleIndex: - """Extracts the ontology information from Uberon or Cell Ontology owo owlready2 ontology object. - NOT IN USE - - Args: - ontology (owlready2.namespace.Ontology): An owlready2 ontology object. Must be either from Cell Ontology or Uberon. - prefix (str): Prefix for the desired ontology terms. - session (Session): Spark session. - - Returns: - BiosampleIndex: Parsed and annotated biosample index table. - """ - data_list = [] - - # Iterate over all classes in the ontology - for cls in ontology.classes(): - # Basic class information - cls_id = cls.name - # cls_code = cls.iri - cls_name = cls.label[0] if cls.label else None - - # Extract descriptions - description = None - if hasattr(cls, 'IAO_0000115'): - description = cls.IAO_0000115.first() if cls.IAO_0000115 else None - - # Extract dbXRefs - dbXRefs = [] - if hasattr(cls, 'hasDbXref'): - dbXRefs = [Row(id=x, source=x.split(':')[0]) for x in cls.hasDbXref] - - # Parent classes - parents = [] - for parent in cls.is_a: - if parent is owl.Thing: - continue # Skip owlready2 Thing class, which is a top-level class - elif hasattr(parent, 'name'): - parent_id = parent.name - parents.append(parent_id) - elif hasattr(parent, 'property'): # For restrictions - continue # We skip restrictions in this simplified list - - # Synonyms - synonyms = set() - if hasattr(cls, 'hasExactSynonym'): - synonyms.update(cls.hasExactSynonym) - if hasattr(cls, 'hasBroadSynonym'): - synonyms.update(cls.hasBroadSynonym) - if hasattr(cls, 'hasNarrowSynonym'): - synonyms.update(cls.hasNarrowSynonym) - if hasattr(cls, 'hasRelatedSynonym'): - synonyms.update(cls.hasRelatedSynonym) - - # Children classes - children = [child.name for child in cls.subclasses()] - - # Ancestors and descendants with Thing class filtered out - ancestors = [anc.name for anc in cls.ancestors() if hasattr(anc, 'name') and anc is not owl.Thing] - descendants = [desc.name for desc in cls.descendants() if hasattr(desc, 'name')] - - # Check if the class is deprecated - is_deprecated = False - if hasattr(cls, 'deprecated') and cls.deprecated: - is_deprecated = True - - # Compile all information into a Row - entry = Row( - id=cls_id, - # code=cls_code, - name=cls_name, - dbXRefs=dbXRefs, - description=description, - parents=parents, - synonyms=list(synonyms), - ancestors=ancestors, - descendants=descendants, - children=children, - ontology={"is_obsolete": is_deprecated} - ) - - # Add to data list - data_list.append(entry) - - - # Create DataFrame directly from Rows - df = spark.createDataFrame(data_list, schema) - return df - - def extract_ontology_from_json( ontology_json : str, spark : SparkSession @@ -173,8 +78,9 @@ def get_relationships(node): col("node.lbl").alias("biosampleName"), col("node.meta.definition.val").alias("description"), collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"), - collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms"), - col("node.meta.deprecated").alias("deprecated")) + # col("node.meta.deprecated").alias("deprecated"), + collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms")) + # Extract the relationships from the edges # Prepare relationship-specific DataFrames @@ -198,17 +104,40 @@ def get_relationships(node): return final_df - def merge_biosample_indices( +def merge_biosample_indices( biosample_indices: list[BiosampleIndex], ) -> BiosampleIndex: - """Merge a list of biosample indexes into a single biosample index. - Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. + """Merge a list of biosample indexes into a single biosample index. + Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. - Args: - biosample_indexes (BiosampleIndex): Biosample indexes to merge. + Args: + biosample_indexes (BiosampleIndex): Biosample indexes to merge. - Returns: - BiosampleIndex: Merged biosample index. - """ - # Merge the DataFrames - merged_df = reduce(DataFrame.unionByName, biosample_indices) \ No newline at end of file + Returns: + BiosampleIndex: Merged biosample index. + """ + + def merge_lists(lists): + """Merge a list of lists into a single list.""" + return list(set([item for sublist in lists if sublist is not None for item in sublist])) + + # Make a spark udf (user defined function) to merge lists + merge_lists_udf = udf(merge_lists, ArrayType(StringType())) + + # Merge the DataFrames + merged_df = reduce(DataFrame.unionAll, biosample_indices) + + # Define dictionary of columns and corresponding aggregation functions + # Currently this will take the first value for single values and merge lists for list values + agg_funcs = {} + for column in merged_df.columns: + if column != 'biosampleId': + if 'list' in column: # Assuming column names that have 'list' need list merging + agg_funcs[column] = merge_lists_udf(collect_list(column)).alias(column) + else: + agg_funcs[column] = first(column, ignorenulls=True).alias(column) + + # Group by biosampleId and aggregate the columns + merged_df = merged_df.groupBy('biosampleId').agg(agg_funcs) + + return merged_df \ No newline at end of file diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py index 3e926078d..d19f012e4 100644 --- a/src/gentropy/study_validation.py +++ b/src/gentropy/study_validation.py @@ -7,6 +7,7 @@ from gentropy.common.session import Session from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.study_index import StudyIndex +from gentropy.dataset.biosample_index import BiosampleIndex class StudyValidationStep: @@ -34,12 +35,14 @@ def __init__( study_index_path (list[str]): Path to study index file. target_index_path (str): Path to target index file. disease_index_path (str): Path to disease index file. + biosample_index_path (str): Path to biosample index file. valid_study_index_path (str): Path to write the valid records. invalid_study_index_path (str): Path to write the output file. invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyQualityCheck` (e.g. ['DUPLICATED_STUDY']). """ # Reading datasets: target_index = GeneIndex.from_parquet(session, target_index_path) + biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path) # Reading disease index and pre-process. # This logic does not belong anywhere, but gentorpy has no disease dataset yet. disease_index = ( @@ -56,7 +59,6 @@ def __init__( .withColumn("efo", f.coalesce(f.col("efo"), f.col("diseaseId"))) ) study_index = StudyIndex.from_parquet(session, list(study_index_path)) - biosample_index = BiosampleIndex.from_parquet(session, biosample_index_path) # Running validation: study_index_with_qc = ( diff --git a/tests/gentropy/data_samples/cell_ontology_sample.json b/tests/gentropy/data_samples/cell_ontology_sample.json new file mode 100644 index 000000000..5a774f473 --- /dev/null +++ b/tests/gentropy/data_samples/cell_ontology_sample.json @@ -0,0 +1,274 @@ +{ + "graphs" : [ { + "id" : "http://purl.obolibrary.org/obo/cl.json", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", + "val" : "http://purl.obolibrary.org/obo/CL_0000000" + }, { + "pred" : "http://purl.org/dc/elements/1.1/description", + "val" : "An ontology of cell types." + }, { + "pred" : "http://purl.org/dc/elements/1.1/title", + "val" : "Cell Ontology" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-5208-3432" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-9114-8737" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-9990-8331" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-2244-7917" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-6601-2165" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-7073-9172" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-8688-6599" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-9900-7880" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-1980-3228" + }, { + "pred" : "http://purl.org/dc/terms/license", + "val" : "http://creativecommons.org/licenses/by/4.0/" + }, { + "pred" : "http://www.w3.org/2000/01/rdf-schema#comment", + "val" : "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo." + }, { + "pred" : "http://www.w3.org/2002/07/owl#versionInfo", + "val" : "2024-08-16" + } ], + "version" : "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json" + }, + "nodes" : [ { + "id" : "http://purl.obolibrary.org/obo/CL_0000653", + "lbl" : "podocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", + "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ] + }, + "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], + "synonyms" : [ { + "pred" : "hasBroadSynonym", + "val" : "epithelial cell of visceral layer of glomerular capsule", + "xrefs" : [ "FMA:70967" ] + }, { + "pred" : "hasExactSynonym", + "val" : "glomerular podocyte", + "xrefs" : [ "FMA:70967" ] + }, { + "pred" : "hasExactSynonym", + "val" : "glomerular visceral epithelial cell" + }, { + "pred" : "hasExactSynonym", + "val" : "kidney podocyte" + }, { + "pred" : "hasExactSynonym", + "val" : "renal podocyte" + } ], + "xrefs" : [ { + "val" : "BTO:0002295" + }, { + "val" : "FMA:70967" + }, { + "val" : "ZFA:0009285" + } ], + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/RO_0002175", + "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, { + "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso", + "val" : "https://github.com/obophenotype/cell-ontology/issues/1460" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000654", + "lbl" : "primary oocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "A primary oocyte is an oocyte that has not completed female meosis I.", + "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + }, + "subsets" : [ "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "primary oogonium" + } ], + "xrefs" : [ { + "val" : "BTO:0000512" + }, { + "val" : "FMA:18645" + } ], + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/RO_0002175", + "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000655", + "lbl" : "secondary oocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "A secondary oocyte is an oocyte that has not completed meiosis II.", + "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + }, + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "primary oogonium" + } ], + "xrefs" : [ { + "val" : "BTO:0003094" + }, { + "val" : "FMA:18646" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000656", + "lbl" : "primary spermatocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.", + "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + }, + "xrefs" : [ { + "val" : "BTO:0001115" + }, { + "val" : "CALOHA:TS-2194" + }, { + "val" : "FMA:72292" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000657", + "lbl" : "secondary spermatocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.", + "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + }, + "xrefs" : [ { + "val" : "BTO:0000709" + }, { + "val" : "CALOHA:TS-2195" + }, { + "val" : "FBbt:00004941" + }, { + "val" : "FMA:72293" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000658", + "lbl" : "cuticle secreting cell", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "An epithelial cell that secretes cuticle.", + "xrefs" : [ "GOC:tfm" ] + } + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_0000659", + "lbl" : "eggshell secreting cell", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "An extracellular matrix secreting cell that secretes eggshell.", + "xrefs" : [ "GOC:tfm" ] + } + } + } , { + "id" : "http://purl.obolibrary.org/obo/CL_1000451", + "lbl" : "obsolete epithelial cell of visceral layer of glomerular capsule", + "type" : "CLASS", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/IAO_0100001", + "val" : "http://purl.obolibrary.org/obo/CL_0000653" + } ], + "deprecated" : true + } + } ], + "edges" : [ + { + "sub" : "http://purl.obolibrary.org/obo/UBERON_0005751", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000051", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub" : "http://purl.obolibrary.org/obo/GO_1903210", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000066", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub" : "http://purl.obolibrary.org/obo/GO_0090521", + "pred" : "http://purl.obolibrary.org/obo/RO_0002565", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub" : "http://purl.obolibrary.org/obo/GO_0072015", + "pred" : "http://purl.obolibrary.org/obo/RO_0002296", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub" : "http://purl.obolibrary.org/obo/CL_4030008", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + },{ + "sub" : "http://purl.obolibrary.org/obo/CL_0002525", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + },{ + "sub" : "http://purl.obolibrary.org/obo/CL_0002523", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000653" + },{ + "sub" : "http://purl.obolibrary.org/obo/CL_0000653", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0002522" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0000653", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_1000450" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0000653", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751" + }, + { + "sub" : "http://purl.obolibrary.org/obo/CL_0000655", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000023", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val" : "true" + } ] + } + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0000655", + "pred" : "http://purl.obolibrary.org/obo/CL_4030044", + "obj" : "http://purl.obolibrary.org/obo/GO_0007147" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0000655", + "pred" : "http://purl.obolibrary.org/obo/RO_0002202", + "obj" : "http://purl.obolibrary.org/obo/CL_0000654" + } + ] + } +]} diff --git a/tests/gentropy/data_samples/uberon_sample.json b/tests/gentropy/data_samples/uberon_sample.json new file mode 100644 index 000000000..b06d652ef --- /dev/null +++ b/tests/gentropy/data_samples/uberon_sample.json @@ -0,0 +1,675 @@ +{ + "graphs" : [ { + "id" : "http://purl.obolibrary.org/obo/uberon.json", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", + "val" : "http://purl.obolibrary.org/obo/UBERON_0000104" + }, { + "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", + "val" : "http://purl.obolibrary.org/obo/UBERON_0001062" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0001-5839-6798" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0001-7972-3866" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0001-9114-8737" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0002-1810-9886" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0002-6601-2165" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0002-7356-1779" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0002-9611-1279" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0003-3162-7490" + }, { + "pred" : "http://purl.org/dc/elements/1.1/creator", + "val" : "https://orcid.org/0000-0003-3308-6245" + }, { + "pred" : "http://purl.org/dc/elements/1.1/description", + "val" : "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data." + }, { + "pred" : "http://purl.org/dc/elements/1.1/publisher", + "val" : "http://uberon.org" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://dbpedia.org" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://palaeos.com" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://www.brain-map.org" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://braininfo.rprc.washington.edu/" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://en.wikipedia.org/wiki/" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/aao.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/aba.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/aeo.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/bila.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/bto.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/caro.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/cl.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/ehdaa2.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/emapa.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/fbbt.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/fma.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/go.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/hp.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/ma.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/mp.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/tao.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/vhog.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/vsao.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/wbbt.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/xao.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://purl.obolibrary.org/obo/zfa.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://uri.neuinfo.org/nif/nifstd" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://www.e-lico.eu/public/kupo/kupo.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://www.ebi.ac.uk/efo/efo.owl" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0030229073 Invertebrate Zoology, Barnes" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0073040584 Vertebrates, Kardong" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0443065837 Human embryology, Larsen" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:0683400088 Stedman's Medical Dictionary" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:9780120749034 The laboratory rat" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "ISBN:9780878932504 Developmental Biology" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "MESH" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "PMID:11433360 Placental development: lessons from mouse mutants" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "aggregates AAO from 13:04:2012" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "aggregates TAO from 09:08:2012" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "aggregates VSAO from 16:07:2012" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "http://wiki.phenotypercn.org/wiki/August_2012_Notes" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#" + }, { + "pred" : "http://purl.org/dc/elements/1.1/source", + "val" : "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0" + }, { + "pred" : "http://purl.org/dc/elements/1.1/title", + "val" : "Uber-anatomy ontology" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://github.com/orgs/pato-ontology/teams/pato-community" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-5889-4463" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-7433-0086" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-7476-6306" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-7920-5321" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-7958-3701" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-8682-8754" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-9107-0714" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0001-9990-8331" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-0819-0473" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-0956-8634" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-1112-5832" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-1572-1316" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-1604-3078" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-1615-2899" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-2061-091X" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-2244-7917" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-3437-3329" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-3467-2636" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-3734-1859" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-5111-7263" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-6490-7723" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-7073-9172" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-8406-3871" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-8455-3213" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-8688-6599" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-9415-5104" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-9818-3030" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0002-9900-7880" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-1980-3228" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-2105-2283" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-2338-2550" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-3691-0324" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://orcid.org/0000-0003-4423-4370" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q11695472" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q23809253" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q4964264" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q54985720" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q6983890" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q7650732" + }, { + "pred" : "http://purl.org/dc/terms/contributor", + "val" : "https://www.wikidata.org/wiki/Q85793053" + }, { + "pred" : "http://purl.org/dc/terms/isReferencedBy", + "val" : "http://genomebiology.com/2012/13/1/R5" + }, { + "pred" : "http://purl.org/dc/terms/isReferencedBy", + "val" : "http://www.ncbi.nlm.nih.gov/pubmed/22293552" + }, { + "pred" : "http://purl.org/dc/terms/license", + "val" : "http://creativecommons.org/licenses/by/3.0/" + }, { + "pred" : "http://usefulinc.com/ns/doap#GitRepository", + "val" : "https://github.com/cmungall/uberon/" + }, { + "pred" : "http://usefulinc.com/ns/doap#SVNRepository", + "val" : "https://obo.svn.sourceforge.net/svnroot/obo/uberon/" + }, { + "pred" : "http://usefulinc.com/ns/doap#bug-database", + "val" : "https://github.com/obophenotype/uberon/issues/" + }, { + "pred" : "http://usefulinc.com/ns/doap#mailing-list", + "val" : "https://lists.sourceforge.net/lists/listinfo/obo-anatomy" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#default-namespace", + "val" : "uberon" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion", + "val" : "1.2" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "AEO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "BILA" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "BSPO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "CARO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "GO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "OG" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val" : "VSAO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val" : "EHDAA" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val" : "EV" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val" : "NCIT" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val" : "OGES" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val" : "SCTID" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", + "val" : "BFO" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", + "val" : "VHOG" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "AAO part_of NCBITaxon:8292" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "DHBA part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "EHDAA2 part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "EMAPA part_of NCBITaxon:10090" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "FBdv part_of NCBITaxon:7227" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "FMA part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "HAO part_of NCBITaxon:7399" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "HBA part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "HsapDv part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "KUPO part_of NCBITaxon:9606" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "MA part_of NCBITaxon:10090" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "MFO part_of NCBITaxon:8089" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "MmusDv part_of NCBITaxon:10090" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "OlatDv part_of NCBITaxon:8089" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "PBA part_of NCBITaxon:9443" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "SPD part_of NCBITaxon:6893" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "TADS part_of NCBITaxon:6939" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "TAO part_of NCBITaxon:32443" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "TGMA part_of NCBITaxon:44484" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "WBbt part_of NCBITaxon:6237" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "WBls part_of NCBITaxon:6237" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "XAO part_of NCBITaxon:8353" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "ZFA part_of NCBITaxon:7954" + }, { + "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val" : "ZFS part_of NCBITaxon:7954" + }, { + "pred" : "http://www.w3.org/2000/01/rdf-schema#comment", + "val" : "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found." + }, { + "pred" : "http://www.w3.org/2002/07/owl#versionInfo", + "val" : "2024-09-03" + }, { + "pred" : "http://xmlns.com/foaf/0.1/homepage", + "val" : "http://uberon.org" + } ], + "version" : "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json" + }, + "nodes" : [{ + "id" : "http://purl.obolibrary.org/obo/CL_1001593", + "lbl" : "parathyroid glandular cell", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.", + "xrefs" : [ "HPA:HPA", "NPX:PDR" ] + }, + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "parathyroid gland glandular cell", + "xrefs" : [ "CALOHA:TS-1279" ] + }, { + "pred" : "hasRelatedSynonym", + "val" : "parathyroid gland glandular cells", + "xrefs" : [ "CALOHA:TS-1279" ] + } ], + "xrefs" : [ { + "val" : "CALOHA:TS-1279" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_1001595", + "lbl" : "rectum glandular cell", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.", + "xrefs" : [ "NPX:PDR" ] + }, + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "rectal glandular cell", + "xrefs" : [ "CALOHA:TS-1281" ] + }, { + "pred" : "hasRelatedSynonym", + "val" : "rectum glandular cells", + "xrefs" : [ "CALOHA:TS-1281" ] + } ], + "xrefs" : [ { + "val" : "CALOHA:TS-1281" + } ] + } + }, { + "id" : "http://purl.obolibrary.org/obo/CL_1001596", + "lbl" : "salivary gland glandular cell", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.", + "xrefs" : [ "HPA:HPA", "NPX:PDR" ] + }, + "synonyms" : [ { + "pred" : "hasRelatedSynonym", + "val" : "salivary gland glandular cells", + "xrefs" : [ "CALOHA:TS-1282" ] + } ], + "xrefs" : [ { + "val" : "CALOHA:TS-1282" + } ] + } + }, + { + "id" : "http://purl.obolibrary.org/obo/CL_0000653", + "lbl" : "podocyte", + "type" : "CLASS", + "meta" : { + "definition" : { + "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", + "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ] + }, + "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], + "synonyms" : [ { + "pred" : "hasBroadSynonym", + "val" : "epithelial cell of visceral layer of glomerular capsule", + "xrefs" : [ "FMA:70967" ] + }, { + "pred" : "hasExactSynonym", + "val" : "glomerular podocyte", + "xrefs" : [ "FMA:70967" ] + }, { + "pred" : "hasExactSynonym", + "val" : "glomerular visceral epithelial cell" + }, { + "pred" : "hasExactSynonym", + "val" : "kidney podocyte" + }, { + "pred" : "hasExactSynonym", + "val" : "renal podocyte" + } ], + "xrefs" : [ { + "val" : "BTO:0002295" + }, { + "val" : "FMA:70967" + } ], + "basicPropertyValues" : [ { + "pred" : "http://purl.obolibrary.org/obo/RO_0002175", + "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, { + "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso", + "val" : "https://github.com/obophenotype/cell-ontology/issues/1460" + } ] + } + }], + "edges" : [ + { + "sub" : "http://purl.obolibrary.org/obo/CL_1001596", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000150" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_1001596", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000152" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_1001596", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0002251" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_1001596", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_1001596", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0004809" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000622", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val" : "true" + } ] + } + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_1001596" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_0000622", + "meta" : { + "basicPropertyValues" : [ { + "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val" : "true" + } ] + } + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_1001596" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0002623", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" + }, + { + "sub" : "http://purl.obolibrary.org/obo/CL_0000653", + "pred" : "is_a", + "obj" : "http://purl.obolibrary.org/obo/CL_1000450" + }, { + "sub" : "http://purl.obolibrary.org/obo/CL_0000653", + "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", + "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751" + }, + ] + } + ] +} diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py index f7d85e230..a9221569e 100644 --- a/tests/gentropy/dataset/test_biosample_index.py +++ b/tests/gentropy/dataset/test_biosample_index.py @@ -10,7 +10,7 @@ import json from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import extract_ontology_from_json +from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: @@ -19,13 +19,15 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: -spark2 = SparkSession.builder \ +spark = SparkSession.builder \ .master("local[*]") \ .appName("LocalOntologyIndexing") \ .getOrCreate() +ontology_json1 = "file:////home/alegbe/repos/gentropy/tests/gentropy/data_samples/nephron-minimal.json" +ontology_json2 = "file://///home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology_dummy.json" -ontology_json = 'file:///home/alegbe/cl.json' -# ontology_json = 'file:///home/alegbe/uberon.json' +df1 = extract_ontology_from_json(ontology_json1, spark) +df2 = extract_ontology_from_json(ontology_json2, spark) -df = extract_ontology_from_json(ontology_json, spark2) \ No newline at end of file +df_merged = merge_biosample_indices([df1, df2]) From 33ebf58c713d84c5d0603ce0504fff419a624f2f Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 16 Sep 2024 13:31:58 +0000 Subject: [PATCH 10/22] Tests and docs --- .../datasources/ontologies/_cell_ontology.md | 5 +++++ docs/python_api/datasources/ontologies/_uberon.md | 5 +++++ docs/python_api/steps/biosample_index_step.md | 5 +++++ .../ontologies/test_biosample_ontology.py | 14 ++++++++++++++ 4 files changed, 29 insertions(+) create mode 100644 docs/python_api/datasources/ontologies/_cell_ontology.md create mode 100644 docs/python_api/datasources/ontologies/_uberon.md create mode 100644 docs/python_api/steps/biosample_index_step.md create mode 100644 tests/gentropy/datasource/ontologies/test_biosample_ontology.py diff --git a/docs/python_api/datasources/ontologies/_cell_ontology.md b/docs/python_api/datasources/ontologies/_cell_ontology.md new file mode 100644 index 000000000..5798e032b --- /dev/null +++ b/docs/python_api/datasources/ontologies/_cell_ontology.md @@ -0,0 +1,5 @@ +--- +title: Cell Ontology +--- + +The [Cell Ontology](http://www.obofoundry.org/ontology/cl.html) is a structured controlled vocabulary for cell types. It is used to annotate cell types in single-cell RNA-seq data and other omics data. diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/ontologies/_uberon.md new file mode 100644 index 000000000..62ef3e96f --- /dev/null +++ b/docs/python_api/datasources/ontologies/_uberon.md @@ -0,0 +1,5 @@ +--- +title: Uberon +--- + +The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology. diff --git a/docs/python_api/steps/biosample_index_step.md b/docs/python_api/steps/biosample_index_step.md new file mode 100644 index 000000000..d8f7abbb4 --- /dev/null +++ b/docs/python_api/steps/biosample_index_step.md @@ -0,0 +1,5 @@ +--- +title: biosample_index +--- + +::: gentropy.biosample_index.BiosampleIndexStep diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py new file mode 100644 index 000000000..477272a5d --- /dev/null +++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py @@ -0,0 +1,14 @@ +"""Tests for study index dataset from FinnGen.""" + +from __future__ import annotations + +from pyspark.sql import SparkSession +from pyspark.sql import types as t + +from gentropy.dataset.study_index import BiosampleIndex +from gentropy.datasource.ontologies.utils import extract_ontology_from_json + + +def test_biosample_index_from_source(spark: SparkSession) -> None: + """Test biosample index from source.""" + assert isinstance(extract_ontology_from_json(), BiosampleIndex) From 26a429539b3c4d770b5bcb912bdc8ad6f1dc82ee Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 16 Sep 2024 14:07:07 +0000 Subject: [PATCH 11/22] Updating tests --- .../datasource/gwas_catalog/__init__.py | 3 ++ .../gentropy/dataset/test_biosample_index.py | 14 ------- .../ontologies/test_biosample_ontology.py | 41 +++++++++++++++++-- 3 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 src/gentropy/datasource/gwas_catalog/__init__.py diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/gwas_catalog/__init__.py new file mode 100644 index 000000000..d12240a6e --- /dev/null +++ b/src/gentropy/datasource/gwas_catalog/__init__.py @@ -0,0 +1,3 @@ +"""GWAS Catalog index data source.""" + +from __future__ import annotations diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py index a9221569e..60c89d703 100644 --- a/tests/gentropy/dataset/test_biosample_index.py +++ b/tests/gentropy/dataset/test_biosample_index.py @@ -17,17 +17,3 @@ def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: """Test biosample index creation with mock biosample index.""" assert isinstance(mock_biosample_index, BiosampleIndex) - - -spark = SparkSession.builder \ - .master("local[*]") \ - .appName("LocalOntologyIndexing") \ - .getOrCreate() - -ontology_json1 = "file:////home/alegbe/repos/gentropy/tests/gentropy/data_samples/nephron-minimal.json" -ontology_json2 = "file://///home/alegbe/repos/gentropy/tests/gentropy/data_samples/cell_ontology_dummy.json" - -df1 = extract_ontology_from_json(ontology_json1, spark) -df2 = extract_ontology_from_json(ontology_json2, spark) - -df_merged = merge_biosample_indices([df1, df2]) diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py index 477272a5d..af7d9e405 100644 --- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py +++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py @@ -2,13 +2,48 @@ from __future__ import annotations -from pyspark.sql import SparkSession -from pyspark.sql import types as t +from typing import TYPE_CHECKING + +import pytest +from pyspark.sql import DataFrame +from pyspark.sql import functions as f + from gentropy.dataset.study_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import extract_ontology_from_json +from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices +if TYPE_CHECKING: + from pyspark.sql import SparkSession def test_biosample_index_from_source(spark: SparkSession) -> None: """Test biosample index from source.""" assert isinstance(extract_ontology_from_json(), BiosampleIndex) + +class TestOntologyParger: + """ Testing ontology parser.""" + + SAMPLE_CELL_ONTOLOGY_PATH = "tests/gentropy/data_samples/cell_ontology_sample.json" + SAMPLE_UBERON_PATH = "tests/gentropy/data_samples/uberon_sample.json" + + def test_cell_ontology_parser(self) -> None: + """Test cell ontology parser.""" + cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH) + assert isinstance( + cell_ontology, BiosampleIndex + ), "Cell ontology subset is not parsed correctly to BiosampleIndex." + + def test_uberon_parser(self) -> None: + """Test uberon parser.""" + uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH) + assert isinstance( + uberon, BiosampleIndex + ), "Uberon subset is not parsed correctly to BiosampleIndex." + + def test_merge_biosample_indices(self) -> None: + """Test merging of biosample indices.""" + cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH) + uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH) + merged = merge_biosample_indices(cell_ontology, uberon) + assert isinstance( + merged, BiosampleIndex + ), "Merging of biosample indices is not correct." \ No newline at end of file From 1c507e61b3bb06a7617a3e8c4b33fe85d85d545a Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Mon, 16 Sep 2024 14:11:40 +0000 Subject: [PATCH 12/22] Revert GWAS catalog file --- src/gentropy/datasource/gwas_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gentropy/datasource/gwas_catalog/__init__.py b/src/gentropy/datasource/gwas_catalog/__init__.py index d12240a6e..544779b18 100644 --- a/src/gentropy/datasource/gwas_catalog/__init__.py +++ b/src/gentropy/datasource/gwas_catalog/__init__.py @@ -1,3 +1,3 @@ -"""GWAS Catalog index data source.""" +"""GWAS Catalog Data Source.""" from __future__ import annotations From 567d8e10fa32d0a458912c603d8c486eef312c93 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 17 Sep 2024 09:28:55 +0000 Subject: [PATCH 13/22] fix(biosample index): update to match pre-commit standards --- docs/python_api/datasources/_datasources.md | 2 +- .../datasources/ontologies/_uberon.md | 2 +- src/gentropy/biosample_index.py | 11 +- src/gentropy/dataset/biosample_index.py | 16 +- src/gentropy/dataset/study_index.py | 3 +- src/gentropy/datasource/ontologies/utils.py | 104 +- src/gentropy/study_validation.py | 2 +- .../data_samples/cell_ontology_sample.json | 609 ++++--- .../gentropy/data_samples/uberon_sample.json | 1550 ++++++++++------- .../gentropy/dataset/test_biosample_index.py | 11 - .../ontologies/test_biosample_ontology.py | 49 +- 11 files changed, 1338 insertions(+), 1021 deletions(-) diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md index f79f8137b..58b4bcd2b 100644 --- a/docs/python_api/datasources/_datasources.md +++ b/docs/python_api/datasources/_datasources.md @@ -41,4 +41,4 @@ This section contains information about the data source harmonisation tools avai ## Biological samples 1. [Uberon](ontologies/_uberon.md) -2. [Cell Ontology](ontologies/_cell_ontology.md) \ No newline at end of file +2. [Cell Ontology](ontologies/_cell_ontology.md) diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/ontologies/_uberon.md index 62ef3e96f..4bb47305a 100644 --- a/docs/python_api/datasources/ontologies/_uberon.md +++ b/docs/python_api/datasources/ontologies/_uberon.md @@ -2,4 +2,4 @@ title: Uberon --- -The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology. +The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates cross-species ontologies into a single ontology. diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index 11274d789..a4080fba1 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -2,8 +2,10 @@ from __future__ import annotations from gentropy.common.session import Session -from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices +from gentropy.datasource.ontologies.utils import ( + extract_ontology_from_json, + merge_biosample_indices, +) class BiosampleIndexStep: @@ -29,8 +31,7 @@ def __init__( """ cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark) uberon_index = extract_ontology_from_json(uberon_input_path, session.spark) - + biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index]) - + biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path) - diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index b3735ca62..20cff34e8 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -2,24 +2,12 @@ from __future__ import annotations -import importlib.resources as pkg_resources -import json from dataclasses import dataclass -from enum import Enum -from itertools import chain from typing import TYPE_CHECKING -from pyspark.sql import functions as f -from pyspark.sql.window import Window -from functools import reduce - -from gentropy.assets import data from gentropy.common.schemas import parse_spark_schema from gentropy.dataset.dataset import Dataset - -from pyspark.sql import Column, DataFrame, Row - if TYPE_CHECKING: from pyspark.sql.types import StructType @@ -32,10 +20,10 @@ class BiosampleIndex(Dataset): """ @classmethod - def get_schema(cls: type[StudyIndex]) -> StructType: + def get_schema(cls: type[BiosampleIndex]) -> StructType: """Provide the schema for the BiosampleIndex dataset. Returns: StructType: The schema of the BiosampleIndex dataset. """ - return parse_spark_schema("biosample_index.json") \ No newline at end of file + return parse_spark_schema("biosample_index.json") diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 852f14d9e..e6e4d4dc3 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -19,8 +19,8 @@ from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType - from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.biosample_index import BiosampleIndex + from gentropy.dataset.gene_index import GeneIndex class StudyQualityCheck(Enum): @@ -444,4 +444,3 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu ) return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) - diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py index adf38c4da..0c4215d09 100644 --- a/src/gentropy/datasource/ontologies/utils.py +++ b/src/gentropy/datasource/ontologies/utils.py @@ -1,17 +1,29 @@ """Utility functions for Biosample ontology processing.""" -from pyspark.sql import Row, SparkSession, DataFrame -from pyspark.sql.types import StructType, StringType, ArrayType -from pyspark.sql.functions import col, explode_outer, collect_set, collect_list, array_distinct, regexp_replace, udf, coalesce, first -from pyspark.sql.window import Window from functools import reduce + +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.functions import ( + array_distinct, + coalesce, + col, + collect_list, + collect_set, + explode_outer, + first, + regexp_replace, + udf, +) +from pyspark.sql.types import ArrayType, StringType +from pyspark.sql.window import Window + from gentropy.dataset.biosample_index import BiosampleIndex + def extract_ontology_from_json( ontology_json : str, spark : SparkSession ) -> BiosampleIndex: - """ - Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology. + """Extracts the ontology information from a JSON file. Currently only supports Uberon and Cell Ontology. Args: ontology_json (str): Path to the JSON file containing the ontology information. @@ -21,15 +33,30 @@ def extract_ontology_from_json( BiosampleIndex: Parsed and annotated biosample index table. """ - def json_graph_traversal(df, node_col, link_col, traversal_type="ancestors"): - """ - Traverse a graph represented in a DataFrame to find all ancestors or descendants. + def json_graph_traversal( + df : DataFrame, + node_col : str, + link_col: str, + traversal_type: str + ) -> DataFrame: + """Traverse a graph represented in a DataFrame to find all ancestors or descendants. + + Args: + df (DataFrame): DataFrame containing the graph data. + node_col (str): Column name for the node. + link_col (str): Column name for the link. + traversal_type (str): Type of traversal - "ancestors" or "descendants". + + Returns: + DataFrame: DataFrame with the result column added. """ # Collect graph data as a map graph_map = df.select(node_col, link_col).rdd.collectAsMap() broadcasted_graph = spark.sparkContext.broadcast(graph_map) - def get_relationships(node): + def get_relationships( + node : str + ) -> list[str]: relationships = set() stack = [node] while stack: @@ -80,8 +107,8 @@ def get_relationships(node): collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"), # col("node.meta.deprecated").alias("deprecated"), collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms")) - - + + # Extract the relationships from the edges # Prepare relationship-specific DataFrames df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent") @@ -100,44 +127,63 @@ def get_relationships(node): df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object") # Join the original DataFrame with the relationship DataFrame - final_df = transformed_df.join(df_with_relationships, ['biosampleId'], "left") - - return final_df + final_df = transformed_df.join(df_with_relationships, ["biosampleId"], "left") + + return BiosampleIndex( + _df=final_df, + _schema=BiosampleIndex.get_schema() + ) def merge_biosample_indices( - biosample_indices: list[BiosampleIndex], + biosample_indices : list[BiosampleIndex] ) -> BiosampleIndex: - """Merge a list of biosample indexes into a single biosample index. + """Merge a list of biosample indices into a single biosample index. + Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. Args: - biosample_indexes (BiosampleIndex): Biosample indexes to merge. + biosample_indices (list[BiosampleIndex]): Biosample indices to merge. Returns: BiosampleIndex: Merged biosample index. """ - - def merge_lists(lists): - """Merge a list of lists into a single list.""" - return list(set([item for sublist in lists if sublist is not None for item in sublist])) - + + def merge_lists( + lists : list[list[str]] + ) -> list[str]: + """Merge a list of lists into a single list. + + Args: + lists (list[list[str]]): List of lists to merge. + + Returns: + list[str]: Merged list. + """ + return list({item for sublist in lists if sublist is not None for item in sublist}) + # Make a spark udf (user defined function) to merge lists merge_lists_udf = udf(merge_lists, ArrayType(StringType())) + # Extract the DataFrames from the BiosampleIndex objects + biosample_dfs = [biosample_index.df for biosample_index in biosample_indices] + # Merge the DataFrames - merged_df = reduce(DataFrame.unionAll, biosample_indices) - + merged_df = reduce(DataFrame.unionAll, biosample_dfs) + # Define dictionary of columns and corresponding aggregation functions # Currently this will take the first value for single values and merge lists for list values agg_funcs = {} for column in merged_df.columns: - if column != 'biosampleId': - if 'list' in column: # Assuming column names that have 'list' need list merging + if column != "biosampleId": + if "list" in column: # Assuming column names that have 'list' need list merging agg_funcs[column] = merge_lists_udf(collect_list(column)).alias(column) else: agg_funcs[column] = first(column, ignorenulls=True).alias(column) # Group by biosampleId and aggregate the columns - merged_df = merged_df.groupBy('biosampleId').agg(agg_funcs) + merged_df = merged_df.groupBy("biosampleId").agg(agg_funcs) - return merged_df \ No newline at end of file + return BiosampleIndex( + _df=merged_df, + _schema=BiosampleIndex.get_schema() + ) diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py index d19f012e4..0e4c22e6b 100644 --- a/src/gentropy/study_validation.py +++ b/src/gentropy/study_validation.py @@ -5,9 +5,9 @@ from pyspark.sql import functions as f from gentropy.common.session import Session +from gentropy.dataset.biosample_index import BiosampleIndex from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.study_index import StudyIndex -from gentropy.dataset.biosample_index import BiosampleIndex class StudyValidationStep: diff --git a/tests/gentropy/data_samples/cell_ontology_sample.json b/tests/gentropy/data_samples/cell_ontology_sample.json index 5a774f473..5e73bfdee 100644 --- a/tests/gentropy/data_samples/cell_ontology_sample.json +++ b/tests/gentropy/data_samples/cell_ontology_sample.json @@ -1,274 +1,351 @@ { - "graphs" : [ { - "id" : "http://purl.obolibrary.org/obo/cl.json", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", - "val" : "http://purl.obolibrary.org/obo/CL_0000000" - }, { - "pred" : "http://purl.org/dc/elements/1.1/description", - "val" : "An ontology of cell types." - }, { - "pred" : "http://purl.org/dc/elements/1.1/title", - "val" : "Cell Ontology" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-5208-3432" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-9114-8737" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-9990-8331" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-2244-7917" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-6601-2165" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-7073-9172" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-8688-6599" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-9900-7880" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-1980-3228" - }, { - "pred" : "http://purl.org/dc/terms/license", - "val" : "http://creativecommons.org/licenses/by/4.0/" - }, { - "pred" : "http://www.w3.org/2000/01/rdf-schema#comment", - "val" : "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo." - }, { - "pred" : "http://www.w3.org/2002/07/owl#versionInfo", - "val" : "2024-08-16" - } ], - "version" : "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json" - }, - "nodes" : [ { - "id" : "http://purl.obolibrary.org/obo/CL_0000653", - "lbl" : "podocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", - "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ] + "graphs": [ + { + "id": "http://purl.obolibrary.org/obo/cl.json", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/IAO_0000700", + "val": "http://purl.obolibrary.org/obo/CL_0000000" + }, + { + "pred": "http://purl.org/dc/elements/1.1/description", + "val": "An ontology of cell types." + }, + { + "pred": "http://purl.org/dc/elements/1.1/title", + "val": "Cell Ontology" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-5208-3432" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-9114-8737" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-9990-8331" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-2244-7917" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-6601-2165" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-7073-9172" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-8688-6599" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-9900-7880" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-1980-3228" + }, + { + "pred": "http://purl.org/dc/terms/license", + "val": "http://creativecommons.org/licenses/by/4.0/" + }, + { + "pred": "http://www.w3.org/2000/01/rdf-schema#comment", + "val": "See PMID:15693950, PMID:12799354, PMID:20123131, PMID:21208450; Contact Alexander Diehl, addiehl@buffalo.edu, university at buffalo." + }, + { + "pred": "http://www.w3.org/2002/07/owl#versionInfo", + "val": "2024-08-16" + } + ], + "version": "http://purl.obolibrary.org/obo/cl/releases/2024-08-16/cl.json" + }, + "nodes": [ + { + "id": "http://purl.obolibrary.org/obo/CL_0000653", + "lbl": "podocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", + "xrefs": ["GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829"] + }, + "subsets": [ + "http://purl.obolibrary.org/obo/cl#cellxgene_subset", + "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" + ], + "synonyms": [ + { + "pred": "hasBroadSynonym", + "val": "epithelial cell of visceral layer of glomerular capsule", + "xrefs": ["FMA:70967"] + }, + { + "pred": "hasExactSynonym", + "val": "glomerular podocyte", + "xrefs": ["FMA:70967"] + }, + { + "pred": "hasExactSynonym", + "val": "glomerular visceral epithelial cell" + }, + { + "pred": "hasExactSynonym", + "val": "kidney podocyte" + }, + { + "pred": "hasExactSynonym", + "val": "renal podocyte" + } + ], + "xrefs": [ + { + "val": "BTO:0002295" + }, + { + "val": "FMA:70967" + }, + { + "val": "ZFA:0009285" + } + ], + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/RO_0002175", + "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + { + "pred": "http://www.w3.org/2000/01/rdf-schema#seeAlso", + "val": "https://github.com/obophenotype/cell-ontology/issues/1460" + } + ] + } }, - "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], - "synonyms" : [ { - "pred" : "hasBroadSynonym", - "val" : "epithelial cell of visceral layer of glomerular capsule", - "xrefs" : [ "FMA:70967" ] - }, { - "pred" : "hasExactSynonym", - "val" : "glomerular podocyte", - "xrefs" : [ "FMA:70967" ] - }, { - "pred" : "hasExactSynonym", - "val" : "glomerular visceral epithelial cell" - }, { - "pred" : "hasExactSynonym", - "val" : "kidney podocyte" - }, { - "pred" : "hasExactSynonym", - "val" : "renal podocyte" - } ], - "xrefs" : [ { - "val" : "BTO:0002295" - }, { - "val" : "FMA:70967" - }, { - "val" : "ZFA:0009285" - } ], - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/RO_0002175", - "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" - }, { - "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso", - "val" : "https://github.com/obophenotype/cell-ontology/issues/1460" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000654", - "lbl" : "primary oocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "A primary oocyte is an oocyte that has not completed female meosis I.", - "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + { + "id": "http://purl.obolibrary.org/obo/CL_0000654", + "lbl": "primary oocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "A primary oocyte is an oocyte that has not completed female meosis I.", + "xrefs": ["GOC:tfm", "ISBN:0721662544"] + }, + "subsets": [ + "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" + ], + "synonyms": [ + { + "pred": "hasRelatedSynonym", + "val": "primary oogonium" + } + ], + "xrefs": [ + { + "val": "BTO:0000512" + }, + { + "val": "FMA:18645" + } + ], + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/RO_0002175", + "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + } + ] + } }, - "subsets" : [ "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], - "synonyms" : [ { - "pred" : "hasRelatedSynonym", - "val" : "primary oogonium" - } ], - "xrefs" : [ { - "val" : "BTO:0000512" - }, { - "val" : "FMA:18645" - } ], - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/RO_0002175", - "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000655", - "lbl" : "secondary oocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "A secondary oocyte is an oocyte that has not completed meiosis II.", - "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + { + "id": "http://purl.obolibrary.org/obo/CL_0000655", + "lbl": "secondary oocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "A secondary oocyte is an oocyte that has not completed meiosis II.", + "xrefs": ["GOC:tfm", "ISBN:0721662544"] + }, + "synonyms": [ + { + "pred": "hasRelatedSynonym", + "val": "primary oogonium" + } + ], + "xrefs": [ + { + "val": "BTO:0003094" + }, + { + "val": "FMA:18646" + } + ] + } }, - "synonyms" : [ { - "pred" : "hasRelatedSynonym", - "val" : "primary oogonium" - } ], - "xrefs" : [ { - "val" : "BTO:0003094" - }, { - "val" : "FMA:18646" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000656", - "lbl" : "primary spermatocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.", - "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + { + "id": "http://purl.obolibrary.org/obo/CL_0000656", + "lbl": "primary spermatocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "A diploid cell that has derived from a spermatogonium and can subsequently begin meiosis and divide into two haploid secondary spermatocytes.", + "xrefs": ["GOC:tfm", "ISBN:0721662544"] + }, + "xrefs": [ + { + "val": "BTO:0001115" + }, + { + "val": "CALOHA:TS-2194" + }, + { + "val": "FMA:72292" + } + ] + } }, - "xrefs" : [ { - "val" : "BTO:0001115" - }, { - "val" : "CALOHA:TS-2194" - }, { - "val" : "FMA:72292" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000657", - "lbl" : "secondary spermatocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.", - "xrefs" : [ "GOC:tfm", "ISBN:0721662544" ] + { + "id": "http://purl.obolibrary.org/obo/CL_0000657", + "lbl": "secondary spermatocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "One of the two haploid cells into which a primary spermatocyte divides, and which in turn gives origin to spermatids.", + "xrefs": ["GOC:tfm", "ISBN:0721662544"] + }, + "xrefs": [ + { + "val": "BTO:0000709" + }, + { + "val": "CALOHA:TS-2195" + }, + { + "val": "FBbt:00004941" + }, + { + "val": "FMA:72293" + } + ] + } }, - "xrefs" : [ { - "val" : "BTO:0000709" - }, { - "val" : "CALOHA:TS-2195" - }, { - "val" : "FBbt:00004941" - }, { - "val" : "FMA:72293" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000658", - "lbl" : "cuticle secreting cell", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "An epithelial cell that secretes cuticle.", - "xrefs" : [ "GOC:tfm" ] - } - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_0000659", - "lbl" : "eggshell secreting cell", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "An extracellular matrix secreting cell that secretes eggshell.", - "xrefs" : [ "GOC:tfm" ] + { + "id": "http://purl.obolibrary.org/obo/CL_0000658", + "lbl": "cuticle secreting cell", + "type": "CLASS", + "meta": { + "definition": { + "val": "An epithelial cell that secretes cuticle.", + "xrefs": ["GOC:tfm"] + } + } + }, + { + "id": "http://purl.obolibrary.org/obo/CL_0000659", + "lbl": "eggshell secreting cell", + "type": "CLASS", + "meta": { + "definition": { + "val": "An extracellular matrix secreting cell that secretes eggshell.", + "xrefs": ["GOC:tfm"] + } + } + }, + { + "id": "http://purl.obolibrary.org/obo/CL_1000451", + "lbl": "obsolete epithelial cell of visceral layer of glomerular capsule", + "type": "CLASS", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/IAO_0100001", + "val": "http://purl.obolibrary.org/obo/CL_0000653" + } + ], + "deprecated": true + } } - } - } , { - "id" : "http://purl.obolibrary.org/obo/CL_1000451", - "lbl" : "obsolete epithelial cell of visceral layer of glomerular capsule", - "type" : "CLASS", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/IAO_0100001", - "val" : "http://purl.obolibrary.org/obo/CL_0000653" - } ], - "deprecated" : true - } - } ], - "edges" : [ - { - "sub" : "http://purl.obolibrary.org/obo/UBERON_0005751", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000051", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - }, - { - "sub" : "http://purl.obolibrary.org/obo/GO_1903210", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000066", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - }, - { - "sub" : "http://purl.obolibrary.org/obo/GO_0090521", - "pred" : "http://purl.obolibrary.org/obo/RO_0002565", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - }, - { - "sub" : "http://purl.obolibrary.org/obo/GO_0072015", - "pred" : "http://purl.obolibrary.org/obo/RO_0002296", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - }, - { - "sub" : "http://purl.obolibrary.org/obo/CL_4030008", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - },{ - "sub" : "http://purl.obolibrary.org/obo/CL_0002525", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - },{ - "sub" : "http://purl.obolibrary.org/obo/CL_0002523", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000653" - },{ - "sub" : "http://purl.obolibrary.org/obo/CL_0000653", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0002522" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0000653", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_1000450" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0000653", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751" - }, - { - "sub" : "http://purl.obolibrary.org/obo/CL_0000655", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000023", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", - "val" : "true" - } ] + ], + "edges": [ + { + "sub": "http://purl.obolibrary.org/obo/UBERON_0005751", + "pred": "http://purl.obolibrary.org/obo/BFO_0000051", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/GO_1903210", + "pred": "http://purl.obolibrary.org/obo/BFO_0000066", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/GO_0090521", + "pred": "http://purl.obolibrary.org/obo/RO_0002565", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/GO_0072015", + "pred": "http://purl.obolibrary.org/obo/RO_0002296", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_4030008", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002525", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002523", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000653" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000653", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0002522" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000653", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_1000450" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000653", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0005751" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000655", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000023", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val": "true" + } + ] + } + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000655", + "pred": "http://purl.obolibrary.org/obo/CL_4030044", + "obj": "http://purl.obolibrary.org/obo/GO_0007147" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000655", + "pred": "http://purl.obolibrary.org/obo/RO_0002202", + "obj": "http://purl.obolibrary.org/obo/CL_0000654" } - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0000655", - "pred" : "http://purl.obolibrary.org/obo/CL_4030044", - "obj" : "http://purl.obolibrary.org/obo/GO_0007147" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0000655", - "pred" : "http://purl.obolibrary.org/obo/RO_0002202", - "obj" : "http://purl.obolibrary.org/obo/CL_0000654" - } - ] - } -]} + ] + } + ] +} diff --git a/tests/gentropy/data_samples/uberon_sample.json b/tests/gentropy/data_samples/uberon_sample.json index b06d652ef..7dedfa23c 100644 --- a/tests/gentropy/data_samples/uberon_sample.json +++ b/tests/gentropy/data_samples/uberon_sample.json @@ -1,675 +1,889 @@ { - "graphs" : [ { - "id" : "http://purl.obolibrary.org/obo/uberon.json", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", - "val" : "http://purl.obolibrary.org/obo/UBERON_0000104" - }, { - "pred" : "http://purl.obolibrary.org/obo/IAO_0000700", - "val" : "http://purl.obolibrary.org/obo/UBERON_0001062" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0001-5839-6798" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0001-7972-3866" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0001-9114-8737" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0002-1810-9886" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0002-6601-2165" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0002-7356-1779" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0002-9611-1279" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0003-3162-7490" - }, { - "pred" : "http://purl.org/dc/elements/1.1/creator", - "val" : "https://orcid.org/0000-0003-3308-6245" - }, { - "pred" : "http://purl.org/dc/elements/1.1/description", - "val" : "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data." - }, { - "pred" : "http://purl.org/dc/elements/1.1/publisher", - "val" : "http://uberon.org" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://dbpedia.org" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://palaeos.com" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://www.brain-map.org" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://braininfo.rprc.washington.edu/" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://en.wikipedia.org/wiki/" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/aao.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/aba.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/aeo.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/bila.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/bto.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/caro.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/cl.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/ehdaa2.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/emapa.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/fbbt.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/fma.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/go.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/hp.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/ma.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/mp.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/tao.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/vhog.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/vsao.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/wbbt.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/xao.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://purl.obolibrary.org/obo/zfa.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://uri.neuinfo.org/nif/nifstd" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://www.e-lico.eu/public/kupo/kupo.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://www.ebi.ac.uk/efo/efo.owl" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0030229073 Invertebrate Zoology, Barnes" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0073040584 Vertebrates, Kardong" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0443065837 Human embryology, Larsen" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:0683400088 Stedman's Medical Dictionary" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:9780120749034 The laboratory rat" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "ISBN:9780878932504 Developmental Biology" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "MESH" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "PMID:11433360 Placental development: lessons from mouse mutants" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "aggregates AAO from 13:04:2012" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "aggregates TAO from 09:08:2012" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "aggregates VSAO from 16:07:2012" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "http://wiki.phenotypercn.org/wiki/August_2012_Notes" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#" - }, { - "pred" : "http://purl.org/dc/elements/1.1/source", - "val" : "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0" - }, { - "pred" : "http://purl.org/dc/elements/1.1/title", - "val" : "Uber-anatomy ontology" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://github.com/orgs/pato-ontology/teams/pato-community" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-5889-4463" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-7433-0086" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-7476-6306" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-7920-5321" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-7958-3701" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-8682-8754" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-9107-0714" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0001-9990-8331" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-0819-0473" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-0956-8634" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-1112-5832" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-1572-1316" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-1604-3078" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-1615-2899" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-2061-091X" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-2244-7917" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-3437-3329" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-3467-2636" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-3734-1859" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-5111-7263" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-6490-7723" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-7073-9172" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-8406-3871" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-8455-3213" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-8688-6599" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-9415-5104" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-9818-3030" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0002-9900-7880" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-1980-3228" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-2105-2283" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-2338-2550" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-3691-0324" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://orcid.org/0000-0003-4423-4370" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q11695472" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q23809253" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q4964264" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q54985720" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q6983890" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q7650732" - }, { - "pred" : "http://purl.org/dc/terms/contributor", - "val" : "https://www.wikidata.org/wiki/Q85793053" - }, { - "pred" : "http://purl.org/dc/terms/isReferencedBy", - "val" : "http://genomebiology.com/2012/13/1/R5" - }, { - "pred" : "http://purl.org/dc/terms/isReferencedBy", - "val" : "http://www.ncbi.nlm.nih.gov/pubmed/22293552" - }, { - "pred" : "http://purl.org/dc/terms/license", - "val" : "http://creativecommons.org/licenses/by/3.0/" - }, { - "pred" : "http://usefulinc.com/ns/doap#GitRepository", - "val" : "https://github.com/cmungall/uberon/" - }, { - "pred" : "http://usefulinc.com/ns/doap#SVNRepository", - "val" : "https://obo.svn.sourceforge.net/svnroot/obo/uberon/" - }, { - "pred" : "http://usefulinc.com/ns/doap#bug-database", - "val" : "https://github.com/obophenotype/uberon/issues/" - }, { - "pred" : "http://usefulinc.com/ns/doap#mailing-list", - "val" : "https://lists.sourceforge.net/lists/listinfo/obo-anatomy" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#default-namespace", - "val" : "uberon" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion", - "val" : "1.2" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "AEO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "BILA" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "BSPO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "CARO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "GO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "OG" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", - "val" : "VSAO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", - "val" : "EHDAA" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", - "val" : "EV" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", - "val" : "NCIT" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", - "val" : "OGES" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", - "val" : "SCTID" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", - "val" : "BFO" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", - "val" : "VHOG" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "AAO part_of NCBITaxon:8292" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "DHBA part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "EHDAA2 part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "EMAPA part_of NCBITaxon:10090" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "FBdv part_of NCBITaxon:7227" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "FMA part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "HAO part_of NCBITaxon:7399" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "HBA part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "HsapDv part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "KUPO part_of NCBITaxon:9606" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "MA part_of NCBITaxon:10090" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "MFO part_of NCBITaxon:8089" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "MmusDv part_of NCBITaxon:10090" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "OlatDv part_of NCBITaxon:8089" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "PBA part_of NCBITaxon:9443" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "SPD part_of NCBITaxon:6893" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "TADS part_of NCBITaxon:6939" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "TAO part_of NCBITaxon:32443" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "TGMA part_of NCBITaxon:44484" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "WBbt part_of NCBITaxon:6237" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "WBls part_of NCBITaxon:6237" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "XAO part_of NCBITaxon:8353" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "ZFA part_of NCBITaxon:7954" - }, { - "pred" : "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", - "val" : "ZFS part_of NCBITaxon:7954" - }, { - "pred" : "http://www.w3.org/2000/01/rdf-schema#comment", - "val" : "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found." - }, { - "pred" : "http://www.w3.org/2002/07/owl#versionInfo", - "val" : "2024-09-03" - }, { - "pred" : "http://xmlns.com/foaf/0.1/homepage", - "val" : "http://uberon.org" - } ], - "version" : "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json" - }, - "nodes" : [{ - "id" : "http://purl.obolibrary.org/obo/CL_1001593", - "lbl" : "parathyroid glandular cell", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.", - "xrefs" : [ "HPA:HPA", "NPX:PDR" ] - }, - "synonyms" : [ { - "pred" : "hasRelatedSynonym", - "val" : "parathyroid gland glandular cell", - "xrefs" : [ "CALOHA:TS-1279" ] - }, { - "pred" : "hasRelatedSynonym", - "val" : "parathyroid gland glandular cells", - "xrefs" : [ "CALOHA:TS-1279" ] - } ], - "xrefs" : [ { - "val" : "CALOHA:TS-1279" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_1001595", - "lbl" : "rectum glandular cell", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.", - "xrefs" : [ "NPX:PDR" ] - }, - "synonyms" : [ { - "pred" : "hasRelatedSynonym", - "val" : "rectal glandular cell", - "xrefs" : [ "CALOHA:TS-1281" ] - }, { - "pred" : "hasRelatedSynonym", - "val" : "rectum glandular cells", - "xrefs" : [ "CALOHA:TS-1281" ] - } ], - "xrefs" : [ { - "val" : "CALOHA:TS-1281" - } ] - } - }, { - "id" : "http://purl.obolibrary.org/obo/CL_1001596", - "lbl" : "salivary gland glandular cell", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.", - "xrefs" : [ "HPA:HPA", "NPX:PDR" ] - }, - "synonyms" : [ { - "pred" : "hasRelatedSynonym", - "val" : "salivary gland glandular cells", - "xrefs" : [ "CALOHA:TS-1282" ] - } ], - "xrefs" : [ { - "val" : "CALOHA:TS-1282" - } ] - } + "graphs": [ + { + "id": "http://purl.obolibrary.org/obo/uberon.json", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/IAO_0000700", + "val": "http://purl.obolibrary.org/obo/UBERON_0000104" + }, + { + "pred": "http://purl.obolibrary.org/obo/IAO_0000700", + "val": "http://purl.obolibrary.org/obo/UBERON_0001062" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0001-5839-6798" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0001-7972-3866" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0001-9114-8737" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0002-1810-9886" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0002-6601-2165" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0002-7356-1779" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0002-9611-1279" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0003-3162-7490" + }, + { + "pred": "http://purl.org/dc/elements/1.1/creator", + "val": "https://orcid.org/0000-0003-3308-6245" + }, + { + "pred": "http://purl.org/dc/elements/1.1/description", + "val": "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data." + }, + { + "pred": "http://purl.org/dc/elements/1.1/publisher", + "val": "http://uberon.org" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://dbpedia.org" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://palaeos.com" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://www.brain-map.org" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://braininfo.rprc.washington.edu/" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://en.wikipedia.org/wiki/" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://ontology.neuinfo.org/NIF/BiomaterialEntities/NIF-GrossAnatomy.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://pons.incf.org/wiki/Common_Upper_Mammalian_Brain_Ontology_%28Cumbo%29" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/aao.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/aba.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/aeo.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/bila.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/bto.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/caro.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/cl.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/ehdaa2.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/emapa.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/fbbt.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/fma.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/go.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/hp.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/ma.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/mp.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/tao.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/vhog.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/vsao.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/wbbt.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/xao.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://purl.obolibrary.org/obo/zfa.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://uri.neuinfo.org/nif/nifstd" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://www.e-lico.eu/public/kupo/kupo.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://www.ebi.ac.uk/efo/efo.owl" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0030229073 Invertebrate Zoology, Barnes" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0073040584 Vertebrates, Kardong" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0123813611 Comparative Anatomy and Histology: A Mouse and Human Atlas, Treuting and Dintzis" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0226313379 Fins into Limbs: Evolution, Development, and Transformation, Hall" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0443065837 Human embryology, Larsen" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0471888893 Comparative Vertebrate Neuroanatomy: Evolution and Adaptation by Butler and Hodos" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:0683400088 Stedman's Medical Dictionary" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:1588900649 Color Atlas and Textbook of Human Anatomy: Nervous system and sensory organs By Werner Kahle, Michael Frotscher" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:1588903958 Principles and practice of pediatric neurosurgery By A. Leland Albright, P. David Adelson, Ian F. Pollack" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:1607950324 Craniofacial Embryogenetics & Development, 2nd edition, Sperber" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:978-0-12-369548-2 Principles of Developmental Genetics, Sally A Moody" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:9780120749034 The laboratory rat" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:9780397517251 Surgical anatomy of the hand and upper extremity. By James R. Doyle and Michael J. Botte" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:9780674021839 The Tree of Life - Guillaume Lecointre, Herve Le Guyader" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "ISBN:9780878932504 Developmental Biology" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "MESH" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "PMID:11433360 Placental development: lessons from mouse mutants" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "PMID:16417468 Forgotten and novel aspects in pancreas development, Pieler and Chen" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "aggregates AAO from 13:04:2012" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "aggregates TAO from 09:08:2012" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "aggregates VSAO from 16:07:2012" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://wiki.phenotypercn.org/wg/phenotypercn/index.php?title=Neural_Crest_Workshop" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "http://wiki.phenotypercn.org/wiki/August_2012_Notes" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "https://docs.google.com/document/d/16JZOuH9sh_a8uIXA4cqg0Q1H6MV5yCj3-rhuKsZoV_U/edit" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "https://docs.google.com/document/d/1MnUgispgGfNQoezYzWzzGTnkAnI0gzRnJIwdip6MMtw/edit" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "https://docs.google.com/document/d/1cPWBqrl_Qy7XHEWFqtR_PgQX61yRkgGuLaiDpnEXxkE/edit" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "https://docs.google.com/document/d/1r9kNPpFYGdu0SpJDLyFAVQczBlG0wAZCBMd18gG3Ot8/edit#" + }, + { + "pred": "http://purl.org/dc/elements/1.1/source", + "val": "https://docs.google.com/spreadsheet/ccc?key=0Aj8NJdyb-leqdDM0R3hTVTRHRExDVjRCSkZEbDc5N1E#gid=0" + }, + { + "pred": "http://purl.org/dc/elements/1.1/title", + "val": "Uber-anatomy ontology" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://github.com/orgs/pato-ontology/teams/pato-community" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-5889-4463" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-7433-0086" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-7476-6306" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-7920-5321" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-7958-3701" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-8682-8754" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-9107-0714" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0001-9990-8331" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-0819-0473" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-0956-8634" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-1112-5832" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-1572-1316" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-1604-3078" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-1615-2899" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-2061-091X" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-2244-7917" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-3437-3329" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-3467-2636" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-3734-1859" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-5111-7263" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-6490-7723" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-7073-9172" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-8406-3871" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-8455-3213" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-8688-6599" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-9415-5104" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-9818-3030" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0002-9900-7880" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-1980-3228" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-2105-2283" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-2338-2550" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-3691-0324" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://orcid.org/0000-0003-4423-4370" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q11695472" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q23809253" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q4964264" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q54985720" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q6983890" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q7650732" + }, + { + "pred": "http://purl.org/dc/terms/contributor", + "val": "https://www.wikidata.org/wiki/Q85793053" + }, + { + "pred": "http://purl.org/dc/terms/isReferencedBy", + "val": "http://genomebiology.com/2012/13/1/R5" + }, + { + "pred": "http://purl.org/dc/terms/isReferencedBy", + "val": "http://www.ncbi.nlm.nih.gov/pubmed/22293552" + }, + { + "pred": "http://purl.org/dc/terms/license", + "val": "http://creativecommons.org/licenses/by/3.0/" + }, + { + "pred": "http://usefulinc.com/ns/doap#GitRepository", + "val": "https://github.com/cmungall/uberon/" + }, + { + "pred": "http://usefulinc.com/ns/doap#SVNRepository", + "val": "https://obo.svn.sourceforge.net/svnroot/obo/uberon/" + }, + { + "pred": "http://usefulinc.com/ns/doap#bug-database", + "val": "https://github.com/obophenotype/uberon/issues/" + }, + { + "pred": "http://usefulinc.com/ns/doap#mailing-list", + "val": "https://lists.sourceforge.net/lists/listinfo/obo-anatomy" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#default-namespace", + "val": "uberon" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion", + "val": "1.2" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "AEO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "BILA" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "BSPO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "CARO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "GO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "OG" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-equivalent", + "val": "VSAO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val": "EHDAA" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val": "EV" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val": "NCIT" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val": "OGES" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-has-subclass", + "val": "SCTID" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", + "val": "BFO" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-is_a", + "val": "VHOG" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "AAO part_of NCBITaxon:8292" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "DHBA part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "EHDAA2 part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "EMAPA part_of NCBITaxon:10090" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "FBdv part_of NCBITaxon:7227" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "FMA part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "HAO part_of NCBITaxon:7399" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "HBA part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "HsapDv part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "KUPO part_of NCBITaxon:9606" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "MA part_of NCBITaxon:10090" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "MFO part_of NCBITaxon:8089" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "MmusDv part_of NCBITaxon:10090" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "OlatDv part_of NCBITaxon:8089" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "PBA part_of NCBITaxon:9443" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "SPD part_of NCBITaxon:6893" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "TADS part_of NCBITaxon:6939" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "TAO part_of NCBITaxon:32443" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "TGMA part_of NCBITaxon:44484" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "WBbt part_of NCBITaxon:6237" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "WBls part_of NCBITaxon:6237" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "XAO part_of NCBITaxon:8353" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "ZFA part_of NCBITaxon:7954" + }, + { + "pred": "http://www.geneontology.org/formats/oboInOwl#treat-xrefs-as-reverse-genus-differentia", + "val": "ZFS part_of NCBITaxon:7954" + }, + { + "pred": "http://www.w3.org/2000/01/rdf-schema#comment", + "val": "Aurelie Comte, Bill Bug, Catherine Leroy, Duncan Davidson and Trish Whetzel are also contributors. However their ORCIDs were not found." + }, + { + "pred": "http://www.w3.org/2002/07/owl#versionInfo", + "val": "2024-09-03" + }, + { + "pred": "http://xmlns.com/foaf/0.1/homepage", + "val": "http://uberon.org" + } + ], + "version": "http://purl.obolibrary.org/obo/uberon/releases/2024-09-03/uberon.json" }, - { - "id" : "http://purl.obolibrary.org/obo/CL_0000653", - "lbl" : "podocyte", - "type" : "CLASS", - "meta" : { - "definition" : { - "val" : "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", - "xrefs" : [ "GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829" ] - }, - "subsets" : [ "http://purl.obolibrary.org/obo/cl#cellxgene_subset", "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" ], - "synonyms" : [ { - "pred" : "hasBroadSynonym", - "val" : "epithelial cell of visceral layer of glomerular capsule", - "xrefs" : [ "FMA:70967" ] - }, { - "pred" : "hasExactSynonym", - "val" : "glomerular podocyte", - "xrefs" : [ "FMA:70967" ] - }, { - "pred" : "hasExactSynonym", - "val" : "glomerular visceral epithelial cell" - }, { - "pred" : "hasExactSynonym", - "val" : "kidney podocyte" - }, { - "pred" : "hasExactSynonym", - "val" : "renal podocyte" - } ], - "xrefs" : [ { - "val" : "BTO:0002295" - }, { - "val" : "FMA:70967" - } ], - "basicPropertyValues" : [ { - "pred" : "http://purl.obolibrary.org/obo/RO_0002175", - "val" : "http://purl.obolibrary.org/obo/NCBITaxon_9606" - }, { - "pred" : "http://www.w3.org/2000/01/rdf-schema#seeAlso", - "val" : "https://github.com/obophenotype/cell-ontology/issues/1460" - } ] + "nodes": [ + { + "id": "http://purl.obolibrary.org/obo/CL_1001593", + "lbl": "parathyroid glandular cell", + "type": "CLASS", + "meta": { + "definition": { + "val": "Glandular cell of parathyroid epithelium. Example: Parathyroid chief cell and parathyroid oxyphil cells.", + "xrefs": ["HPA:HPA", "NPX:PDR"] + }, + "synonyms": [ + { + "pred": "hasRelatedSynonym", + "val": "parathyroid gland glandular cell", + "xrefs": ["CALOHA:TS-1279"] + }, + { + "pred": "hasRelatedSynonym", + "val": "parathyroid gland glandular cells", + "xrefs": ["CALOHA:TS-1279"] + } + ], + "xrefs": [ + { + "val": "CALOHA:TS-1279" + } + ] + } + }, + { + "id": "http://purl.obolibrary.org/obo/CL_1001595", + "lbl": "rectum glandular cell", + "type": "CLASS", + "meta": { + "definition": { + "val": "Glandular cell of rectal epithelium. Example: Goblet cell; enterocytes or absorptive cells; enteroendocrine and M cells.", + "xrefs": ["NPX:PDR"] + }, + "synonyms": [ + { + "pred": "hasRelatedSynonym", + "val": "rectal glandular cell", + "xrefs": ["CALOHA:TS-1281"] + }, + { + "pred": "hasRelatedSynonym", + "val": "rectum glandular cells", + "xrefs": ["CALOHA:TS-1281"] + } + ], + "xrefs": [ + { + "val": "CALOHA:TS-1281" + } + ] + } + }, + { + "id": "http://purl.obolibrary.org/obo/CL_1001596", + "lbl": "salivary gland glandular cell", + "type": "CLASS", + "meta": { + "definition": { + "val": "Glandular cell of salivary gland. Example: Serous cells, mucous cells, cuboidal epithelial cells of the intercalated ducts, simple cuboidal epithelium of the striated ducts, epithelial cells of excretory ducts.", + "xrefs": ["HPA:HPA", "NPX:PDR"] + }, + "synonyms": [ + { + "pred": "hasRelatedSynonym", + "val": "salivary gland glandular cells", + "xrefs": ["CALOHA:TS-1282"] + } + ], + "xrefs": [ + { + "val": "CALOHA:TS-1282" + } + ] + } + }, + { + "id": "http://purl.obolibrary.org/obo/CL_0000653", + "lbl": "podocyte", + "type": "CLASS", + "meta": { + "definition": { + "val": "A specialized kidney epithelial cell, contained within a glomerulus, that contains \"feet\" that interdigitate with the \"feet\" of other podocytes.", + "xrefs": ["GOC:tfm", "https://doi.org/10.1101/2021.10.10.463829"] + }, + "subsets": [ + "http://purl.obolibrary.org/obo/cl#cellxgene_subset", + "http://purl.obolibrary.org/obo/uberon/core#human_reference_atlas" + ], + "synonyms": [ + { + "pred": "hasBroadSynonym", + "val": "epithelial cell of visceral layer of glomerular capsule", + "xrefs": ["FMA:70967"] + }, + { + "pred": "hasExactSynonym", + "val": "glomerular podocyte", + "xrefs": ["FMA:70967"] + }, + { + "pred": "hasExactSynonym", + "val": "glomerular visceral epithelial cell" + }, + { + "pred": "hasExactSynonym", + "val": "kidney podocyte" + }, + { + "pred": "hasExactSynonym", + "val": "renal podocyte" + } + ], + "xrefs": [ + { + "val": "BTO:0002295" + }, + { + "val": "FMA:70967" + } + ], + "basicPropertyValues": [ + { + "pred": "http://purl.obolibrary.org/obo/RO_0002175", + "val": "http://purl.obolibrary.org/obo/NCBITaxon_9606" + }, + { + "pred": "http://www.w3.org/2000/01/rdf-schema#seeAlso", + "val": "https://github.com/obophenotype/cell-ontology/issues/1460" + } + ] + } } - }], - "edges" : [ + ], + "edges": [ + { + "sub": "http://purl.obolibrary.org/obo/CL_1001596", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000150" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_1001596", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000152" + }, { - "sub" : "http://purl.obolibrary.org/obo/CL_1001596", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000150" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_1001596", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000152" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_1001596", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0002251" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_1001596", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_1001596", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0004809" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000622", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", - "val" : "true" - } ] - } - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_1001596" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_0000622", - "meta" : { - "basicPropertyValues" : [ { - "pred" : "http://www.geneontology.org/formats/oboInOwl#is_inferred", - "val" : "true" - } ] + "sub": "http://purl.obolibrary.org/obo/CL_1001596", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0002251" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_1001596", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0001044" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_1001596", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0004809" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000622", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val": "true" + } + ] + } + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_1001596" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0001044" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_0000622", + "meta": { + "basicPropertyValues": [ + { + "pred": "http://www.geneontology.org/formats/oboInOwl#is_inferred", + "val": "true" + } + ] + } + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_1001596" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0002623", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0001044" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000653", + "pred": "is_a", + "obj": "http://purl.obolibrary.org/obo/CL_1000450" + }, + { + "sub": "http://purl.obolibrary.org/obo/CL_0000653", + "pred": "http://purl.obolibrary.org/obo/BFO_0000050", + "obj": "http://purl.obolibrary.org/obo/UBERON_0005751" } - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_1001596" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0002623", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0001044" - }, - { - "sub" : "http://purl.obolibrary.org/obo/CL_0000653", - "pred" : "is_a", - "obj" : "http://purl.obolibrary.org/obo/CL_1000450" - }, { - "sub" : "http://purl.obolibrary.org/obo/CL_0000653", - "pred" : "http://purl.obolibrary.org/obo/BFO_0000050", - "obj" : "http://purl.obolibrary.org/obo/UBERON_0005751" - }, - ] + ] } - ] + ] } diff --git a/tests/gentropy/dataset/test_biosample_index.py b/tests/gentropy/dataset/test_biosample_index.py index 60c89d703..c647710d1 100644 --- a/tests/gentropy/dataset/test_biosample_index.py +++ b/tests/gentropy/dataset/test_biosample_index.py @@ -1,19 +1,8 @@ """Tests on Biosample index.""" -import pandas as pd -import numpy as np -from pyspark.sql import SparkSession -from pyspark.sql import Row -import pyspark.sql.functions as F -import owlready2 as owl -from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, BooleanType -import json - from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices def test_biosample_index_creation(mock_biosample_index: BiosampleIndex) -> None: """Test biosample index creation with mock biosample index.""" assert isinstance(mock_biosample_index, BiosampleIndex) - diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py index af7d9e405..9fb8ff92a 100644 --- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py +++ b/tests/gentropy/datasource/ontologies/test_biosample_ontology.py @@ -1,49 +1,52 @@ -"""Tests for study index dataset from FinnGen.""" +"""Tests for biosample index dataset.""" from __future__ import annotations from typing import TYPE_CHECKING -import pytest -from pyspark.sql import DataFrame -from pyspark.sql import functions as f - - -from gentropy.dataset.study_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import extract_ontology_from_json, merge_biosample_indices +from gentropy.dataset.biosample_index import BiosampleIndex +from gentropy.datasource.ontologies.utils import ( + extract_ontology_from_json, + merge_biosample_indices, +) if TYPE_CHECKING: from pyspark.sql import SparkSession -def test_biosample_index_from_source(spark: SparkSession) -> None: - """Test biosample index from source.""" - assert isinstance(extract_ontology_from_json(), BiosampleIndex) class TestOntologyParger: - """ Testing ontology parser.""" + """Testing ontology parser.""" SAMPLE_CELL_ONTOLOGY_PATH = "tests/gentropy/data_samples/cell_ontology_sample.json" SAMPLE_UBERON_PATH = "tests/gentropy/data_samples/uberon_sample.json" - def test_cell_ontology_parser(self) -> None: + def test_cell_ontology_parser( + self: TestOntologyParger, spark: SparkSession + ) -> None: """Test cell ontology parser.""" - cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH) + cell_ontology = extract_ontology_from_json( + self.SAMPLE_CELL_ONTOLOGY_PATH, spark + ) assert isinstance( cell_ontology, BiosampleIndex - ), "Cell ontology subset is not parsed correctly to BiosampleIndex." + ), "Cell ontology subset is not parsed correctly to BiosampleIndex." - def test_uberon_parser(self) -> None: + def test_uberon_parser(self: TestOntologyParger, spark: SparkSession) -> None: """Test uberon parser.""" - uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH) + uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark) assert isinstance( uberon, BiosampleIndex - ), "Uberon subset is not parsed correctly to BiosampleIndex." + ), "Uberon subset is not parsed correctly to BiosampleIndex." - def test_merge_biosample_indices(self) -> None: + def test_merge_biosample_indices( + self: TestOntologyParger, spark: SparkSession + ) -> None: """Test merging of biosample indices.""" - cell_ontology = extract_ontology_from_json(self.SAMPLE_CELL_ONTOLOGY_PATH) - uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH) - merged = merge_biosample_indices(cell_ontology, uberon) + cell_ontology = extract_ontology_from_json( + self.SAMPLE_CELL_ONTOLOGY_PATH, spark + ) + uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark) + merged = merge_biosample_indices([cell_ontology, uberon]) assert isinstance( merged, BiosampleIndex - ), "Merging of biosample indices is not correct." \ No newline at end of file + ), "Merging of biosample indices is not correct." From 12293d30da1bb6fd7a1e0ad990796d8e4d67545e Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 17 Sep 2024 12:54:15 +0000 Subject: [PATCH 14/22] fix(biosample index): merging indices fix --- src/gentropy/datasource/ontologies/utils.py | 30 +++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/ontologies/utils.py index 0c4215d09..e38f81b11 100644 --- a/src/gentropy/datasource/ontologies/utils.py +++ b/src/gentropy/datasource/ontologies/utils.py @@ -56,7 +56,15 @@ def json_graph_traversal( def get_relationships( node : str - ) -> list[str]: + ) -> list[str]: + """Get all relationships for a given node. + + Args: + node (str): Node ID. + + Returns: + list[str]: List of relationships. + """ relationships = set() stack = [node] while stack: @@ -170,20 +178,20 @@ def merge_lists( # Merge the DataFrames merged_df = reduce(DataFrame.unionAll, biosample_dfs) - # Define dictionary of columns and corresponding aggregation functions + # Determine aggregation functions for each column # Currently this will take the first value for single values and merge lists for list values - agg_funcs = {} - for column in merged_df.columns: - if column != "biosampleId": - if "list" in column: # Assuming column names that have 'list' need list merging - agg_funcs[column] = merge_lists_udf(collect_list(column)).alias(column) + agg_funcs = [] + for field in merged_df.schema.fields: + if field.name != "biosampleId": # Skip the grouping column + if field.dataType == ArrayType(StringType()): + agg_funcs.append(merge_lists_udf(collect_list(col(field.name))).alias(field.name)) else: - agg_funcs[column] = first(column, ignorenulls=True).alias(column) + agg_funcs.append(first(col(field.name), ignorenulls=True).alias(field.name)) - # Group by biosampleId and aggregate the columns - merged_df = merged_df.groupBy("biosampleId").agg(agg_funcs) + # Perform aggregation + aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs) return BiosampleIndex( - _df=merged_df, + _df=aggregated_df, _schema=BiosampleIndex.get_schema() ) From 850f91098d3b4ae1475dcee4cca078d9d4250f4e Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Tue, 17 Sep 2024 16:13:33 +0000 Subject: [PATCH 15/22] fix(biosample index): update study index qc logic --- src/gentropy/dataset/study_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index e6e4d4dc3..b853e740a 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -424,7 +424,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu biosample_set = biosample_index.df.select("biosampleId", f.lit(True).alias("isIdFound")) validated_df = ( - self.df.join(biosample_set, on="biosampleId", how="left") + self.df.join(biosample_set, self.df.biosampleFromSourceId == biosample_set.biosampleId, how="left") .withColumn( "isIdFound", f.when( @@ -440,7 +440,7 @@ def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> Stu StudyQualityCheck.UNKNOWN_BIOSAMPLE, ), ) - .drop("isIdFound") + .drop("isIdFound").drop("biosampleId") ) return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema()) From c42bdd6974d34e5fefe440222a3a34c597d6a966 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Wed, 18 Sep 2024 14:06:59 +0000 Subject: [PATCH 16/22] fix(biosample index): fix missing mock_biosample_index --- tests/gentropy/conftest.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index 629f3a505..9051ce91e 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -13,6 +13,7 @@ from gentropy.common.Liftover import LiftOverSpark from gentropy.common.session import Session +from gentropy.dataset.biosample_index import BiosampleIndex from gentropy.dataset.colocalisation import Colocalisation from gentropy.dataset.gene_index import GeneIndex from gentropy.dataset.intervals import Intervals @@ -559,6 +560,35 @@ def mock_gene_index(spark: SparkSession) -> GeneIndex: return GeneIndex(_df=data_spec.build(), _schema=gi_schema) +@pytest.fixture() +def mock_biosample_index(spark: SparkSession) -> BiosampleIndex: + """Mock biosample index dataset.""" + bi_schema = BiosampleIndex.get_schema() + + # Makes arrays of varying length with random integers between 1 and 100 + array_expression = "transform(sequence(1, 1 + floor(rand() * 9)), x -> cast((rand() * 100) as int))" + + data_spec = ( + dg.DataGenerator( + spark, + rows=400, + partitions=4, + randomSeedMethod="hash_fieldname", + ) + .withSchema(bi_schema) + .withColumnSpec("biosampleName", percentNulls=0.1) + .withColumnSpec("description", percentNulls=0.1) + .withColumnSpec("dbXrefs", expr=array_expression, percentNulls=0.1) + .withColumnSpec("synonyms", expr=array_expression, percentNulls=0.1) + .withColumnSpec("parents", expr=array_expression, percentNulls=0.1) + .withColumnSpec("ancestors", expr=array_expression, percentNulls=0.1) + .withColumnSpec("descendants", expr=array_expression, percentNulls=0.1) + .withColumnSpec("children", expr=array_expression, percentNulls=0.1) + ) + + return BiosampleIndex(_df=data_spec.build(), _schema=bi_schema) + + @pytest.fixture() def liftover_chain_37_to_38(spark: SparkSession) -> LiftOverSpark: """Sample liftover chain file.""" From 07daedc33eafc8b8f42f093924bb879ac16c3f74 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Wed, 18 Sep 2024 14:07:57 +0000 Subject: [PATCH 17/22] chore(biosample index): change datasource name from ontologies --- docs/python_api/datasources/_datasources.md | 4 ++-- .../{ontologies => biosample_ontologies}/_cell_ontology.md | 0 .../{ontologies => biosample_ontologies}/_uberon.md | 0 src/gentropy/biosample_index.py | 2 +- .../{ontologies => biosample_ontologies}/__init__.py | 0 .../datasource/{ontologies => biosample_ontologies}/utils.py | 0 .../test_biosample_ontology.py | 2 +- 7 files changed, 4 insertions(+), 4 deletions(-) rename docs/python_api/datasources/{ontologies => biosample_ontologies}/_cell_ontology.md (100%) rename docs/python_api/datasources/{ontologies => biosample_ontologies}/_uberon.md (100%) rename src/gentropy/datasource/{ontologies => biosample_ontologies}/__init__.py (100%) rename src/gentropy/datasource/{ontologies => biosample_ontologies}/utils.py (100%) rename tests/gentropy/datasource/{ontologies => biosample_ontologies}/test_biosample_ontology.py (96%) diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md index 58b4bcd2b..43b212e50 100644 --- a/docs/python_api/datasources/_datasources.md +++ b/docs/python_api/datasources/_datasources.md @@ -40,5 +40,5 @@ This section contains information about the data source harmonisation tools avai ## Biological samples -1. [Uberon](ontologies/_uberon.md) -2. [Cell Ontology](ontologies/_cell_ontology.md) +1. [Uberon](biosample_ontologies/_uberon.md) +2. [Cell Ontology](biosample_ontologies/_cell_ontology.md) diff --git a/docs/python_api/datasources/ontologies/_cell_ontology.md b/docs/python_api/datasources/biosample_ontologies/_cell_ontology.md similarity index 100% rename from docs/python_api/datasources/ontologies/_cell_ontology.md rename to docs/python_api/datasources/biosample_ontologies/_cell_ontology.md diff --git a/docs/python_api/datasources/ontologies/_uberon.md b/docs/python_api/datasources/biosample_ontologies/_uberon.md similarity index 100% rename from docs/python_api/datasources/ontologies/_uberon.md rename to docs/python_api/datasources/biosample_ontologies/_uberon.md diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index a4080fba1..671309ae5 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -2,7 +2,7 @@ from __future__ import annotations from gentropy.common.session import Session -from gentropy.datasource.ontologies.utils import ( +from gentropy.datasource.biosample_ontologies.utils import ( extract_ontology_from_json, merge_biosample_indices, ) diff --git a/src/gentropy/datasource/ontologies/__init__.py b/src/gentropy/datasource/biosample_ontologies/__init__.py similarity index 100% rename from src/gentropy/datasource/ontologies/__init__.py rename to src/gentropy/datasource/biosample_ontologies/__init__.py diff --git a/src/gentropy/datasource/ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py similarity index 100% rename from src/gentropy/datasource/ontologies/utils.py rename to src/gentropy/datasource/biosample_ontologies/utils.py diff --git a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py similarity index 96% rename from tests/gentropy/datasource/ontologies/test_biosample_ontology.py rename to tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py index 9fb8ff92a..0f16f8115 100644 --- a/tests/gentropy/datasource/ontologies/test_biosample_ontology.py +++ b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.ontologies.utils import ( +from gentropy.datasource.biosample_ontologies.utils import ( extract_ontology_from_json, merge_biosample_indices, ) From b150122f4082379553e8e7471a5059c1c41c67d1 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Wed, 18 Sep 2024 16:30:21 +0100 Subject: [PATCH 18/22] fix(biosample index): add dataset doc --- docs/python_api/datasets/biosample_index.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/python_api/datasets/biosample_index.md diff --git a/docs/python_api/datasets/biosample_index.md b/docs/python_api/datasets/biosample_index.md new file mode 100644 index 000000000..d3e4ee2c8 --- /dev/null +++ b/docs/python_api/datasets/biosample_index.md @@ -0,0 +1,9 @@ +--- +title: Biosample index +--- + +::: gentropy.dataset.biosample_index.BiosampleIndex + +## Schema + +--8<-- "assets/schemas/biosample_index.md" From 978f6367a6bff915bc2ac9aa6d889abec8941aa4 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Thu, 19 Sep 2024 09:43:54 +0000 Subject: [PATCH 19/22] fix(biosample index): change dbXrefs to xrefs --- src/gentropy/assets/schemas/biosample_index.json | 2 +- src/gentropy/datasource/biosample_ontologies/utils.py | 2 +- tests/gentropy/conftest.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json index 7c28ec970..1d68762ac 100644 --- a/src/gentropy/assets/schemas/biosample_index.json +++ b/src/gentropy/assets/schemas/biosample_index.json @@ -20,7 +20,7 @@ "metadata": {} }, { - "name": "dbXrefs", + "name": "xrefs", "type": { "type": "array", "elementType": "string", diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py index e38f81b11..5cc4cfaf9 100644 --- a/src/gentropy/datasource/biosample_ontologies/utils.py +++ b/src/gentropy/datasource/biosample_ontologies/utils.py @@ -112,7 +112,7 @@ def get_relationships( regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"), col("node.lbl").alias("biosampleName"), col("node.meta.definition.val").alias("description"), - collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("dbXrefs"), + collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"), # col("node.meta.deprecated").alias("deprecated"), collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms")) diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index 9051ce91e..35fcc6e24 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -578,7 +578,7 @@ def mock_biosample_index(spark: SparkSession) -> BiosampleIndex: .withSchema(bi_schema) .withColumnSpec("biosampleName", percentNulls=0.1) .withColumnSpec("description", percentNulls=0.1) - .withColumnSpec("dbXrefs", expr=array_expression, percentNulls=0.1) + .withColumnSpec("xrefs", expr=array_expression, percentNulls=0.1) .withColumnSpec("synonyms", expr=array_expression, percentNulls=0.1) .withColumnSpec("parents", expr=array_expression, percentNulls=0.1) .withColumnSpec("ancestors", expr=array_expression, percentNulls=0.1) From ec4edf37d1798f2179c0a4968952f788545f4ad2 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Sat, 21 Sep 2024 00:18:33 +0100 Subject: [PATCH 20/22] chore (biosample index): better commenting Co-authored-by: Daniel Suveges --- src/gentropy/study_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gentropy/study_validation.py b/src/gentropy/study_validation.py index 0e4c22e6b..6f905f89b 100644 --- a/src/gentropy/study_validation.py +++ b/src/gentropy/study_validation.py @@ -67,7 +67,7 @@ def __init__( .validate_study_type() # Flagging non-supported study types. .validate_target(target_index) # Flagging QTL studies with invalid targets .validate_disease(disease_index) # Flagging invalid EFOs - .validate_biosample(biosample_index) # Flagging invalid biosamples + .validate_biosample(biosample_index) # Flagging studies with invalid biosamples ).persist() # we will need this for 2 types of outputs study_index_with_qc.valid_rows( From cf005042c2b99ca35ebc5cc9dbc39eb6952e05f1 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Sat, 21 Sep 2024 07:41:59 +0000 Subject: [PATCH 21/22] fix(biosample index): various minor tweaks to biosample index --- poetry.lock | 3 +- src/gentropy/biosample_index.py | 13 +- src/gentropy/config.py | 4 +- src/gentropy/dataset/biosample_index.py | 44 +++++++ .../datasource/biosample_ontologies/utils.py | 113 ++++-------------- .../test_biosample_ontology.py | 8 +- 6 files changed, 80 insertions(+), 105 deletions(-) diff --git a/poetry.lock b/poetry.lock index 226311a8b..296f07145 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.0 and should not be changed by hand. [[package]] name = "aiodns" @@ -3952,6 +3952,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py index 671309ae5..e85c2e135 100644 --- a/src/gentropy/biosample_index.py +++ b/src/gentropy/biosample_index.py @@ -2,10 +2,7 @@ from __future__ import annotations from gentropy.common.session import Session -from gentropy.datasource.biosample_ontologies.utils import ( - extract_ontology_from_json, - merge_biosample_indices, -) +from gentropy.datasource.biosample_ontologies.utils import extract_ontology_from_json class BiosampleIndexStep: @@ -19,7 +16,7 @@ def __init__( session: Session, cell_ontology_input_path: str, uberon_input_path: str, - biosample_index_output_path: str, + biosample_index_path: str, ) -> None: """Run Biosample index generation step. @@ -27,11 +24,11 @@ def __init__( session (Session): Session object. cell_ontology_input_path (str): Input cell ontology dataset path. uberon_input_path (str): Input uberon dataset path. - biosample_index_output_path (str): Output gene index dataset path. + biosample_index_path (str): Output gene index dataset path. """ cell_ontology_index = extract_ontology_from_json(cell_ontology_input_path, session.spark) uberon_index = extract_ontology_from_json(uberon_input_path, session.spark) - biosample_index = merge_biosample_indices([cell_ontology_index, uberon_index]) + biosample_index = cell_ontology_index.merge_indices([uberon_index]) - biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_output_path) + biosample_index.df.write.mode(session.write_mode).parquet(biosample_index_path) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 82bff532c..a1d0cdfc6 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -55,7 +55,8 @@ class GeneIndexConfig(StepConfig): class BiosampleIndexConfig(StepConfig): """Biosample index step configuration.""" - target_path: str = MISSING + cell_ontology_input_path: str = MISSING + uberon_input_path: str = MISSING biosample_index_path: str = MISSING _target_: str = "gentropy.biosample_index.BiosampleIndexStep" @@ -514,6 +515,7 @@ class StudyValidationStepConfig(StepConfig): study_index_path: list[str] = MISSING target_index_path: str = MISSING disease_index_path: str = MISSING + biosample_index_path: str = MISSING valid_study_index_path: str = MISSING invalid_study_index_path: str = MISSING invalid_qc_reasons: list[str] = MISSING diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py index 20cff34e8..35c65a491 100644 --- a/src/gentropy/dataset/biosample_index.py +++ b/src/gentropy/dataset/biosample_index.py @@ -3,8 +3,13 @@ from __future__ import annotations from dataclasses import dataclass +from functools import reduce from typing import TYPE_CHECKING +import pyspark.sql.functions as f +from pyspark.sql import DataFrame +from pyspark.sql.types import ArrayType, StringType + from gentropy.common.schemas import parse_spark_schema from gentropy.dataset.dataset import Dataset @@ -27,3 +32,42 @@ def get_schema(cls: type[BiosampleIndex]) -> StructType: StructType: The schema of the BiosampleIndex dataset. """ return parse_spark_schema("biosample_index.json") + + @classmethod + def merge_indices( + cls: type[BiosampleIndex], + biosample_indices : list[BiosampleIndex] + ) -> BiosampleIndex: + """Merge a list of biosample indices into a single biosample index. + + Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. + + Args: + biosample_indices (list[BiosampleIndex]): Biosample indices to merge. + + Returns: + BiosampleIndex: Merged biosample index. + """ + # Extract the DataFrames from the BiosampleIndex objects + biosample_dfs = [biosample_index.df for biosample_index in biosample_indices] + [cls.df] + + # Merge the DataFrames + merged_df = reduce(DataFrame.unionAll, biosample_dfs) + + # Determine aggregation functions for each column + # Currently this will take the first value for single values and merge lists for list values + agg_funcs = [] + for field in merged_df.schema.fields: + if field.name != "biosampleId": # Skip the grouping column + if field.dataType == ArrayType(StringType()): + agg_funcs.append(f.array_distinct(f.flatten(f.col(field.name))).alias(field.name)) + else: + agg_funcs.append(f.first(f.col(field.name), ignorenulls=True).alias(field.name)) + + # Perform aggregation + aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs) + + return BiosampleIndex( + _df=aggregated_df, + _schema=BiosampleIndex.get_schema() + ) diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py index 5cc4cfaf9..6a90a7bab 100644 --- a/src/gentropy/datasource/biosample_ontologies/utils.py +++ b/src/gentropy/datasource/biosample_ontologies/utils.py @@ -1,18 +1,6 @@ """Utility functions for Biosample ontology processing.""" -from functools import reduce - from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import ( - array_distinct, - coalesce, - col, - collect_list, - collect_set, - explode_outer, - first, - regexp_replace, - udf, -) +from pyspark.sql import functions as f from pyspark.sql.types import ArrayType, StringType from pyspark.sql.window import Window @@ -79,52 +67,51 @@ def get_relationships( result_col = "ancestors" if traversal_type == "ancestors" else "descendants" # Register the UDF based on traversal type - relationship_udf = udf(get_relationships, ArrayType(StringType())) + relationship_udf = f.udf(get_relationships, ArrayType(StringType())) # Apply the UDF to create the result column - return df.withColumn(result_col, relationship_udf(col(node_col))) + return df.withColumn(result_col, relationship_udf(f.col(node_col))) # Load the JSON file df = spark.read.json(ontology_json, multiLine=True) # Exploding the 'graphs' array to make individual records easier to access - df_graphs = df.select(explode_outer("graphs").alias("graph")) + df_graphs = df.select(f.explode_outer("graphs").alias("graph")) # Exploding the 'nodes' array within each graph df_nodes = df_graphs.select( - col("graph.id").alias("graph_id"), - explode_outer("graph.nodes").alias("node")) + f.col("graph.id").alias("graph_id"), + f.explode_outer("graph.nodes").alias("node")) # Exploding the 'edges' array within each graph for relationship data df_edges = df_graphs.select( - col("graph.id").alias("graph_id"), - explode_outer("graph.edges").alias("edge") + f.col("graph.id").alias("graph_id"), + f.explode_outer("graph.edges").alias("edge") ).select( - col("edge.sub").alias("subject"), - col("edge.pred").alias("predicate"), - col("edge.obj").alias("object") + f.col("edge.sub").alias("subject"), + f.col("edge.pred").alias("predicate"), + f.col("edge.obj").alias("object") ) - df_edges = df_edges.withColumn("subject", regexp_replace(col("subject"), "http://purl.obolibrary.org/obo/", "")) - df_edges = df_edges.withColumn("object", regexp_replace(col("object"), "http://purl.obolibrary.org/obo/", "")) + df_edges = df_edges.withColumn("subject", f.regexp_replace(f.col("subject"), "http://purl.obolibrary.org/obo/", "")) + df_edges = df_edges.withColumn("object", f.regexp_replace(f.col("object"), "http://purl.obolibrary.org/obo/", "")) # Extract the relevant information from the nodes transformed_df = df_nodes.select( - regexp_replace(col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"), - col("node.lbl").alias("biosampleName"), - col("node.meta.definition.val").alias("description"), - collect_set(col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"), - # col("node.meta.deprecated").alias("deprecated"), - collect_set(col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms")) + f.regexp_replace(f.col("node.id"), "http://purl.obolibrary.org/obo/", "").alias("biosampleId"), + f.col("node.lbl").alias("biosampleName"), + f.col("node.meta.definition.val").alias("description"), + f.collect_set(f.col("node.meta.xrefs.val")).over(Window.partitionBy("node.id")).getItem(0).alias("xrefs"), + f.collect_set(f.col("node.meta.synonyms.val")).over(Window.partitionBy("node.id")).getItem(0).alias("synonyms")) # Extract the relationships from the edges # Prepare relationship-specific DataFrames - df_parents = df_edges.filter(col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent") - df_children = df_edges.filter(col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child") + df_parents = df_edges.filter(f.col("predicate") == "is_a").select("subject", "object").withColumnRenamed("object", "parent") + df_children = df_edges.filter(f.col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child") # Aggregate relationships back to nodes - df_parents_grouped = df_parents.groupBy("subject").agg(array_distinct(collect_list("parent")).alias("parents")) - df_children_grouped = df_children.groupBy("object").agg(array_distinct(collect_list("child")).alias("children")) + df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent"))).alias("parents") + df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child"))).alias("children") # Get all ancestors df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors") @@ -132,7 +119,7 @@ def get_relationships( df_with_descendants = json_graph_traversal(df_children_grouped, "object", "children", "descendants") # Join the ancestor and descendant DataFrames - df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object") + df_with_relationships = df_with_ancestors.join(df_with_descendants, df_with_ancestors.subject == df_with_descendants.object, "full_outer").withColumn("biosampleId", f.coalesce(df_with_ancestors.subject, df_with_descendants.object)).drop("subject", "object") # Join the original DataFrame with the relationship DataFrame final_df = transformed_df.join(df_with_relationships, ["biosampleId"], "left") @@ -141,57 +128,3 @@ def get_relationships( _df=final_df, _schema=BiosampleIndex.get_schema() ) - -def merge_biosample_indices( - biosample_indices : list[BiosampleIndex] - ) -> BiosampleIndex: - """Merge a list of biosample indices into a single biosample index. - - Where there are conflicts, in single values - the first value is taken. In list values, the union of all values is taken. - - Args: - biosample_indices (list[BiosampleIndex]): Biosample indices to merge. - - Returns: - BiosampleIndex: Merged biosample index. - """ - - def merge_lists( - lists : list[list[str]] - ) -> list[str]: - """Merge a list of lists into a single list. - - Args: - lists (list[list[str]]): List of lists to merge. - - Returns: - list[str]: Merged list. - """ - return list({item for sublist in lists if sublist is not None for item in sublist}) - - # Make a spark udf (user defined function) to merge lists - merge_lists_udf = udf(merge_lists, ArrayType(StringType())) - - # Extract the DataFrames from the BiosampleIndex objects - biosample_dfs = [biosample_index.df for biosample_index in biosample_indices] - - # Merge the DataFrames - merged_df = reduce(DataFrame.unionAll, biosample_dfs) - - # Determine aggregation functions for each column - # Currently this will take the first value for single values and merge lists for list values - agg_funcs = [] - for field in merged_df.schema.fields: - if field.name != "biosampleId": # Skip the grouping column - if field.dataType == ArrayType(StringType()): - agg_funcs.append(merge_lists_udf(collect_list(col(field.name))).alias(field.name)) - else: - agg_funcs.append(first(col(field.name), ignorenulls=True).alias(field.name)) - - # Perform aggregation - aggregated_df = merged_df.groupBy("biosampleId").agg(*agg_funcs) - - return BiosampleIndex( - _df=aggregated_df, - _schema=BiosampleIndex.get_schema() - ) diff --git a/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py index 0f16f8115..b88623b0d 100644 --- a/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py +++ b/tests/gentropy/datasource/biosample_ontologies/test_biosample_ontology.py @@ -5,10 +5,7 @@ from typing import TYPE_CHECKING from gentropy.dataset.biosample_index import BiosampleIndex -from gentropy.datasource.biosample_ontologies.utils import ( - extract_ontology_from_json, - merge_biosample_indices, -) +from gentropy.datasource.biosample_ontologies.utils import extract_ontology_from_json if TYPE_CHECKING: from pyspark.sql import SparkSession @@ -46,7 +43,8 @@ def test_merge_biosample_indices( self.SAMPLE_CELL_ONTOLOGY_PATH, spark ) uberon = extract_ontology_from_json(self.SAMPLE_UBERON_PATH, spark) - merged = merge_biosample_indices([cell_ontology, uberon]) + + merged = cell_ontology.merge_indices([uberon]) assert isinstance( merged, BiosampleIndex ), "Merging of biosample indices is not correct." From 729f492b74e52eb9fe69f8db5169d84dcfab2a94 Mon Sep 17 00:00:00 2001 From: Tobi Alegbe Date: Sat, 21 Sep 2024 08:10:05 +0000 Subject: [PATCH 22/22] fix(biosample index): minor bug --- src/gentropy/datasource/biosample_ontologies/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/datasource/biosample_ontologies/utils.py b/src/gentropy/datasource/biosample_ontologies/utils.py index 6a90a7bab..e02c82cb7 100644 --- a/src/gentropy/datasource/biosample_ontologies/utils.py +++ b/src/gentropy/datasource/biosample_ontologies/utils.py @@ -110,8 +110,8 @@ def get_relationships( df_children = df_edges.filter(f.col("predicate") == "is_a").select("object", "subject").withColumnRenamed("subject", "child") # Aggregate relationships back to nodes - df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent"))).alias("parents") - df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child"))).alias("children") + df_parents_grouped = df_parents.groupBy("subject").agg(f.array_distinct(f.collect_list("parent")).alias("parents")) + df_children_grouped = df_children.groupBy("object").agg(f.array_distinct(f.collect_list("child")).alias("children")) # Get all ancestors df_with_ancestors = json_graph_traversal(df_parents_grouped, "subject", "parents", "ancestors")