Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions quantmsutils/mzml/mzml_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,21 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100

# Get allowed columns from the schema
allowed_columns = {
"Id": "Id",
"MsMsType": "CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END",
"NumPeaks": "NumPeaks",
"MaxIntensity": "MaxIntensity",
"SummedIntensities": "SummedIntensities",
"Time": "Time",
"Charge": "Charge",
"MonoisotopicMz": "MonoisotopicMz",
"Id": ("Id", SCAN),
"MsMsType": ("CASE WHEN MsMsType IN (8, 9) THEN 2 WHEN MsMsType = 0 THEN 1 ELSE NULL END",MS_LEVEL),
"NumPeaks": ("NumPeaks",NUM_PEAKS),
"MaxIntensity": ("MaxIntensity",BASE_PEAK_INTENSITY),
"SummedIntensities": ("SummedIntensities",SUMMED_PEAK_INTENSITY),
"Time": ("Time", RETENTION_TIME),
"Charge": ("Charge", CHARGE),
"MonoisotopicMz": ("MonoisotopicMz", EXPERIMENTAL_MASS_TO_CHARGE),
}

# Construct safe column list
safe_columns = []
for schema_col_name, sql_expr in allowed_columns.items():
if schema_col_name in columns or schema_col_name == "Id":
safe_columns.append(sql_expr)
safe_columns.append(sql_expr[0])

# Construct the query using safe columns
query = f"""SELECT {', '.join(safe_columns)} FROM frames"""
Expand All @@ -125,7 +125,10 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
) as parquet_writer:
# Stream data in batches
for chunk in pd.read_sql_query(query, conn, chunksize=batch_size):
chunk["AcquisitionDateTime"] = acquisition_date_time
chunk[ACQUISITION_DATETIME] = acquisition_date_time
# Change column names to match the schema using allowed columns mapping
chunk.rename(columns={v[0]: v[1] for v in allowed_columns.values()}, inplace=True)
chunk[SCAN] = chunk[SCAN].astype(str)
for col in schema.names:
if col not in chunk.columns:
chunk[col] = None
Expand Down
12 changes: 12 additions & 0 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,4 +145,16 @@ def test_mzml_statistics_local(self):
assert os.path.exists(TEST_DATA_DIR / "RD139_Narrow_UPS1_0_1fmol_inj1_ms_info.parquet")

output_table = pd.read_parquet(TEST_DATA_DIR / "RD139_Narrow_UPS1_0_1fmol_inj1_ms_info.parquet")
assert len(output_table) > 0, "Output table is empty"

@pytest.mark.skip("Test to be run locally, with bruker file")
def test_mzml_statistics_bruker(self):
    """Test mzML statistics on Bruker sample"""
    # Run the mzmlstats CLI against a local Bruker .d directory.
    bruker_dir = TEST_DATA_DIR / "hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733.d"
    result = run_cli_command("mzmlstats", ["--ms2_file", "--ms_path", str(bruker_dir)])

    assert result.exit_code == 0

    # The command writes a <stem>_ms_info.parquet next to the input data.
    parquet_path = TEST_DATA_DIR / "hMICAL1_coiPAnP-N2-200_3Murea-1Mthiourea-200mMtcep_14733_ms_info.parquet"
    assert os.path.exists(parquet_path)

    output_table = pd.read_parquet(parquet_path)
    assert len(output_table) > 0, "Output table is empty"
Loading