From 2c5197bc3b1cef3876bfba7358516299a76d897a Mon Sep 17 00:00:00 2001
From: Eugene M
Date: Fri, 31 May 2024 16:21:22 -0400
Subject: [PATCH 01/15] ENH: add spidr rollover

---
 requirements.txt              |  10 +-
 tpx3awkward/__init__.py       |   1 +
 tpx3awkward/_utils.py         | 210 ++++++++++++++++++++++++++++++++--
 tpx3awkward/tests/test_toa.py |  67 +++++++++++
 4 files changed, 272 insertions(+), 16 deletions(-)
 create mode 100644 tpx3awkward/tests/test_toa.py

diff --git a/requirements.txt b/requirements.txt
index d7479d5..683f27b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,6 @@
 # List required packages in this file, one per line.
 numpy
 numba
-os
 pandas
-scipy.spatial
-concurrent.futures
-multiprocessing
-time
-tdqm
-gc
-pyCHX
+scipy
+# pyCHX
diff --git a/tpx3awkward/__init__.py b/tpx3awkward/__init__.py
index 628ae79..c56bf1b 100644
--- a/tpx3awkward/__init__.py
+++ b/tpx3awkward/__init__.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from . import _version
+from ._utils import ingest_from_files


 def _get_version():
diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py
index ffdef7d..ab47d17 100644
--- a/tpx3awkward/_utils.py
+++ b/tpx3awkward/_utils.py
@@ -2,16 +2,18 @@
 import numpy as np
 from pathlib import Path
 from numpy.typing import NDArray
-from typing import TypeVar, Union, Dict, Set, List
+from typing import TypeVar, Union, Dict, Set, List, Iterable
 import numba
 import pandas as pd
 from scipy.spatial import KDTree
-import concurrent.futures
 import multiprocessing
-import time
 from tqdm import tqdm
-from pyCHX.chx_packages import db, get_sid_filenames
+import warnings
 import gc
+try:
+    from pyCHX.chx_packages import db, get_sid_filenames
+except ImportError:
+    warnings.warn("Could not import pyCHX package. Proceeding without it...")

 IA = NDArray[np.uint64]
 UnSigned = TypeVar("UnSigned", IA, np.uint64)
@@ -27,17 +29,24 @@ def raw_as_numpy(fpath: Union[str, Path]) -> IA:
     ----------

     """
-    with open(fpath, "rb") as fin:
-        return np.frombuffer(fin.read(), dtype="<u8")
+    return np.fromfile(fpath, dtype="<u8")


 def get_block(v: UnSigned, width: int, shift: int) -> UnSigned:
+    return v >> np.uint64(shift) & np.uint64(2**width - 1)


+@numba.jit(nopython=True)
+def matches_nibble(data, nibble) -> numba.boolean:
+    return (int(data) >> 60) == nibble
+
+
 @numba.jit(nopython=True)
 def is_packet_header(v: UnSigned) -> UnSigned:
+    """Identify packet headers by magic number (TPX3 as ASCII on the lowest 32 bits)"""
     return get_block(v, 32, 0) == 861425748

@@ -55,7 +64,6 @@ def classify_array(data: IA) -> NDArray[np.uint8]:
         6: frame driven data (id'd via 0xA upper nibble) (??)
""" output = np.zeros_like(data, dtype="> np.uint(44)) & np.uint(0xFFFF) + # This is laid out 16ibts which are 2 interleaved 8 bit unsigned ints + # CCCCCCCRRRRRRCRR + # |dcol ||spix|^|| + # | 7 || 6 |1|2 + # + # The high 7 bits of the column + # '1111111000000000' + dcol = (l_pix_addr & np.uint(0xFE00)) >> np.uint(8) + # The high 6 bits of the row + # '0000000111111000' + spix = (l_pix_addr & np.uint(0x01F8)) >> np.uint(1) + rowcol = _shift_xy( + chip, + # add the low 2 bits of the row + # '0000000000000011' + spix + (l_pix_addr & np.uint(0x3)), + # add the low 1 bit of the column + # '0000000000000100' + dcol + ((l_pix_addr & np.uint(0x4)) >> np.uint(2)), + ) + return rowcol[1], rowcol[0] + + +@numba.jit(nopython=True) +def get_spidr(msg): + return msg & np.uint(0xFFFF) + + +@numba.jit(nopython=True) +def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): + """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) + + Parameters + ---------- + msg (uint64): tpx3 binary package + chip (uint8): chip ID, 0..3 + spidr_epoch: SPIDR epoch at the start of the file; the counter increments every time the SPIDR clock resets + spidr_cutoff: if a file spans two SPIDR epochs, this parameter defines a midpoint between the lowest SPIDR + id in the previous epoch and the highest SPIDR id in the following epoch; i.e. all packages with + SPIDR > spidr_cutoff are guaranteed to belong to the next epoch, and vice versa. + + Returns + ---------- + Arrays of pixel coordinates and timestamps. + """ + x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code + # ToA is 14 bits + ToA = (msg >> np.uint(30)) & np.uint(0x3FFF) + # ToT is 10 bits; report in ns + ToT = ((msg >> np.uint(20)) & np.uint(0x3FF)) * 25 + # FToA is 4 bits + FToA = (msg >> np.uint(16)) & np.uint(0xF) + # SPIDR time is 16 bits + SPIDR = np.uint64(get_spidr(msg)) + # Apply the spidr_epoch rollover + SPIDR += np.uint64((spidr_epoch + (SPIDR < spidr_cutoff)) * (2**16)) + # Counter value, in multiples of 1.5625 ns + count = (((SPIDR << np.uint(14)) + ToA) << np.uint(4)) - FToA + # Phase correction + phase = np.uint((x / 2) % 16) or np.uint(16) + # Calculate the timestamp + ts = np.uint64(count + phase) + + return x, y, ToT, ts + + +@numba.jit(nopython=True) +def check_spidr_overflow(data, last_spidr=0): + """Detect if the SPIDR counter has overflowed in the current data chunk (file) + + Assumes that only single SPIDR overflow can occur within one file, i.e. the duration of the file is <27 sec. + + Arguments: + ---------- + data : NDArray[np.unint64] + The stream of raw data from the timepix3 detector + last_spidr : int + The last SPIDR value at the tail of the previous tpx3 file + + Returns: + ------- + spidr_midpoint : int + A value of the spidr counter such that any datapoints in the file with SPIDR < midpoint should be assigned + to the next SPIDR epoch (kept track of and incremented outside the function), and vice versa. + If spidr_midpoint=0, all data points in the file belong to the previous SPIDR epoch. + last_spidr : int + The highest SPIDR counter value from the new epoch observed at the end of file. 
+ """ + + head_min, head_max = 65535, 0 + tail_min, tail_max = 65535, 0 + for i in range(64): + if not is_packet_header(data[i]) and matches_nibble(data[i], 0xB): + head_min = min(get_spidr(data[i]), head_min) + head_max = max(get_spidr(data[i]), head_max) + if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): + tail_min = min(get_spidr(data[-i - 1]), tail_min) + tail_max = max(get_spidr(data[-i - 1]), tail_max) + if head_min > tail_max: + # Transition somewhere in the middle of the file + midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 + elif (head_max - head_min) > 32768: + # Large drop in the head of the file + midpoint = (65535 + np.uint32(tail_max)) // 2 # Assume the smallest SPIDR from the last epoch at the head is 65535 + elif (tail_max - tail_min) > 32768: + # Large drop in the tail of the file; find tail_max from the new epoch + tail_max = 0 + for i in range(64): + if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): + tail_spidr = get_spidr(data[-i - 1]) + if tail_spidr < 32768: + tail_max = max(tail_spidr, tail_max) + midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 + elif last_spidr > head_min: + # Transition occured between the files + midpoint = 65535 # Everything in this file is in the new SPIDR epoch + else: + midpoint = 0 + + return np.uint16(midpoint), np.uint16(tail_max) + + +@numba.jit(nopython=True) +def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): + + chips = np.zeros_like(data, dtype=np.uint8) + x = np.zeros_like(data, dtype="u2") + y = np.zeros_like(data, dtype="u2") + tot = np.zeros_like(data, dtype="u4") + ts = np.zeros_like(data, dtype="u8") + + chip_indx = 0 + spidr_cutoff, last_spidr = check_spidr_overflow(data, last_spidr=last_spidr) + i = 0 + for msg in data: + if is_packet_header(msg): + chip_indx = np.uint8(get_block(msg, 8, 32)) + elif matches_nibble(msg, 0xB): + chips[i] = chip_indx + _x, _y, _tot, _ts = decode_message(msg, chip_indx, spidr_epoch=spidr_epoch, spidr_cutoff=spidr_cutoff) + x[i] = _x + y[i] = _y + tot[i] = _tot + ts[i] = _ts + i += 1 + + # Sort the timestamps + indx = np.argsort(ts[:i], kind="mergesort") + chips = chips[indx] + x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + + if spidr_cutoff > 0: + spidr_epoch += 1 + + return x, y, tot, ts, chips, spidr_epoch, last_spidr + + @numba.jit(nopython=True) def _ingest_raw_data(data: IA): types = np.zeros_like(data, dtype=" Iterable[Dict[str, NDArray]]: + """Parse values out of a sequence of timepix3 files with rollover of SPIDR counter. + + Parameters + ---------- + fpaths : A sorted sequence of tpx3 filepaths. + + Returns + ------- + An iterable over the parsing results, each encapsulated in a dictionary + """ + + spidr_epoch, last_spidr = 0, 0 + for fpath in fpaths: + data = raw_as_numpy(fpath) + x, y, tot, ts, chips, spidr_epoch, last_spidr = _ingest_raw_data_rollover(data, spidr_epoch, last_spidr=last_spidr) + + yield { + k.strip(): v + for k, v in zip( + "x, y, tot, timestamp, chips".split(","), + (x, y, tot, ts, chips), + ) + } + + def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. 
diff --git a/tpx3awkward/tests/test_toa.py b/tpx3awkward/tests/test_toa.py new file mode 100644 index 0000000..cb590e8 --- /dev/null +++ b/tpx3awkward/tests/test_toa.py @@ -0,0 +1,67 @@ +from tpx3awkward._utils import get_block, matches_nibble, get_spidr, check_spidr_overflow + +import numpy as np +import pytest + + +@pytest.fixture(scope="function") +def data(n=10): + data = np.zeros(n, dtype=np.uint64) + return data + + +@pytest.fixture(scope="function") +def header_msg(chip=3): + return (np.uint8(chip) << np.uint(32)) + np.uint64(861425748) + + +@pytest.fixture(scope="function") +def empty_msg(): + return np.uint64(0xb) << np.uint(60) + + +def test_get_block(header_msg): + assert np.uint8(get_block(header_msg, 8, 32)) == 3 + + +def test_matches_nibble(): + msg = np.uint64(0xb) << np.uint(60) + assert matches_nibble(msg, 0xb) + + +def test_get_spidr(empty_msg): + msg1 = empty_msg + np.uint32(32768) + msg2 = empty_msg + np.uint32(65535) + assert get_spidr(msg1) == 32768 + assert get_spidr(msg2) == 65535 + +def test_check_spidr_overflow(empty_msg): + spidr_arr = [0, 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10] + spidr_arr.extend(range(10, 65535, 3)) + spidr_arr.extend([65534, 65534, 65535, 65534, 65534, 65535, 65535, 65534]) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 0 + assert last_spidr == 65535 + + spidr_arr = list(range(20000, 65535)) + spidr_arr.extend(range(0, 10000)) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 14999 + assert last_spidr == 9999 + + spidr_arr = [65534, 65534, 65535, 65534, 65534, 65535, 0, 65535, 1, 65534, 0, 0, 0, 1, 1, 2, 3, 4, 5] + spidr_arr.extend(range(5, 2000)) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 33767 + assert last_spidr == 1999 + + spidr_arr = list(range(20000, 65535)) + spidr_arr.extend([65534, 65534, 65535, 65534, 65534, 65535, 0, 65535, 1, 65534, 0, 0, 1, 1, 2, 3, 4, 5]) + + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 10002 + assert last_spidr == 5 \ No newline at end of file From aaf82f64eb6d4f19eafca40b083add5b2d448bef Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 2 Jul 2024 09:46:22 -0400 Subject: [PATCH 02/15] ENH: use heartbeat_time instead of SPIDR_epochs --- tpx3awkward/_utils.py | 288 ++++++++---------------------------------- 1 file changed, 52 insertions(+), 236 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index ab47d17..4baad5a 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -133,21 +133,35 @@ def get_spidr(msg): @numba.jit(nopython=True) -def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): +def decode_message(msg, chip, last_ts: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) Parameters ---------- - msg (uint64): tpx3 binary package + msg (uint64): tpx3 binary message chip (uint8): chip ID, 0..3 - spidr_epoch: SPIDR epoch at the start of the file; the counter increments every time the SPIDR clock resets - spidr_cutoff: if a file spans two SPIDR epochs, this parameter defines a midpoint between the lowest SPIDR - id in the previous epoch and the highest SPIDR id in the following epoch; i.e. 
all packages with - SPIDR > spidr_cutoff are guaranteed to belong to the next epoch, and vice versa. + last_ts (uint64): last timestamp from the previous message; defines the SPIDR rollover + + # bit position : ... 44 40 36 32 28 24 20 16 12 8 7 4 3 0 + # 0xFFFFC0000000 : 1111 1111 1111 1111 1100 0000 0000 0000 0000 0000 0000 0000 + # 0x3FFFFFFF : 0000 0000 0000 0000 0011 1111 1111 1111 1111 1111 1111 1111 + # SPIDR : ssss ssss ssss ssss ssss + # ToA : tt tttt tttt + # ToA_coarse : ss ssss ssss ssss ssss sstt tttt tttt + # pixel_bits : ^^ + # FToA : ffff + # count : ss ssss ssss ssss ssss sstt tttt tttt ffff (FToA is subtracted) + # phase : pppp + # 0x10000000 : 1 0000 0000 0000 0000 0000 0000 0000 + # heartbeat_time : hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh + # heartbeat_bits : ^^ + # global_time : hhhh hhhh hhhh hhss ssss ssss ssss ssss sstt tttt tttt ffff + + # count = (ToA_coarse << np.uint(4)) - FToA # Counter value, in multiples of 1.5625 ns Returns ---------- - Arrays of pixel coordinates and timestamps. + Arrays of pixel coordinates, ToT, and timestamps. """ x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code # ToA is 14 bits @@ -158,76 +172,34 @@ def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): FToA = (msg >> np.uint(16)) & np.uint(0xF) # SPIDR time is 16 bits SPIDR = np.uint64(get_spidr(msg)) - # Apply the spidr_epoch rollover - SPIDR += np.uint64((spidr_epoch + (SPIDR < spidr_cutoff)) * (2**16)) - # Counter value, in multiples of 1.5625 ns - count = (((SPIDR << np.uint(14)) + ToA) << np.uint(4)) - FToA + + ToA_coarse = (SPIDR << np.uint(14)) | ToA + # pixel_bits are the two highest bits of the SPIDR (i.e. the pixelbits range covers 262143 spidr cycles) + pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) + # heart_bits are the bits at the same positions in the heartbeat_time + heartbeat_time = np.uint64(last_ts) + heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) + # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits + diff = heart_bits - pixel_bits + # diff +/-1 occur when pixelbits step up + # diff +/-3 occur when spidr counter overfills + # diff can also be 0 -- then nothing happens -- but never +/-2 + if diff == 1 or diff == -3: + heartbeat_time -= np.uint(0x10000000) + elif diff == -1 or diff == 3: + heartbeat_time += np.uint(0x10000000) + # Construct globaltime + global_time = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) - # Calculate the timestamp - ts = np.uint64(count + phase) + # Construct timestamp with phase correction + ts = (global_time << np.uint(4)) - FToA + phase return x, y, ToT, ts @numba.jit(nopython=True) -def check_spidr_overflow(data, last_spidr=0): - """Detect if the SPIDR counter has overflowed in the current data chunk (file) - - Assumes that only single SPIDR overflow can occur within one file, i.e. the duration of the file is <27 sec. - - Arguments: - ---------- - data : NDArray[np.unint64] - The stream of raw data from the timepix3 detector - last_spidr : int - The last SPIDR value at the tail of the previous tpx3 file - - Returns: - ------- - spidr_midpoint : int - A value of the spidr counter such that any datapoints in the file with SPIDR < midpoint should be assigned - to the next SPIDR epoch (kept track of and incremented outside the function), and vice versa. 
- If spidr_midpoint=0, all data points in the file belong to the previous SPIDR epoch. - last_spidr : int - The highest SPIDR counter value from the new epoch observed at the end of file. - """ - - head_min, head_max = 65535, 0 - tail_min, tail_max = 65535, 0 - for i in range(64): - if not is_packet_header(data[i]) and matches_nibble(data[i], 0xB): - head_min = min(get_spidr(data[i]), head_min) - head_max = max(get_spidr(data[i]), head_max) - if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): - tail_min = min(get_spidr(data[-i - 1]), tail_min) - tail_max = max(get_spidr(data[-i - 1]), tail_max) - if head_min > tail_max: - # Transition somewhere in the middle of the file - midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 - elif (head_max - head_min) > 32768: - # Large drop in the head of the file - midpoint = (65535 + np.uint32(tail_max)) // 2 # Assume the smallest SPIDR from the last epoch at the head is 65535 - elif (tail_max - tail_min) > 32768: - # Large drop in the tail of the file; find tail_max from the new epoch - tail_max = 0 - for i in range(64): - if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): - tail_spidr = get_spidr(data[-i - 1]) - if tail_spidr < 32768: - tail_max = max(tail_spidr, tail_max) - midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 - elif last_spidr > head_min: - # Transition occured between the files - midpoint = 65535 # Everything in this file is in the new SPIDR epoch - else: - midpoint = 0 - - return np.uint16(midpoint), np.uint16(tail_max) - - -@numba.jit(nopython=True) -def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): +def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): chips = np.zeros_like(data, dtype=np.uint8) x = np.zeros_like(data, dtype="u2") @@ -235,19 +207,17 @@ def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - chip_indx = 0 - spidr_cutoff, last_spidr = check_spidr_overflow(data, last_spidr=last_spidr) - i = 0 + i, chip_indx = 0, 0 for msg in data: if is_packet_header(msg): chip_indx = np.uint8(get_block(msg, 8, 32)) elif matches_nibble(msg, 0xB): chips[i] = chip_indx - _x, _y, _tot, _ts = decode_message(msg, chip_indx, spidr_epoch=spidr_epoch, spidr_cutoff=spidr_cutoff) + _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) x[i] = _x y[i] = _y tot[i] = _tot - ts[i] = _ts + ts[i] = last_ts = _ts i += 1 # Sort the timestamps @@ -255,158 +225,7 @@ def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] - if spidr_cutoff > 0: - spidr_epoch += 1 - - return x, y, tot, ts, chips, spidr_epoch, last_spidr - - -@numba.jit(nopython=True) -def _ingest_raw_data(data: IA): - types = np.zeros_like(data, dtype="> np.uint(60) - # probably a better way to do this, but brute force! 
- types[~is_header & (nibble == 0xB)] = 2 - types[~is_header & (nibble == 0x6)] = 3 - types[~is_header & (nibble == 0x4)] = 4 - types[~is_header & (nibble == 0x7)] = 5 - - # sort out how many photons we have - total_photons = np.sum(types == 2) - - # allocate the return arrays - x = np.zeros(total_photons, dtype="u2") - y = np.zeros(total_photons, dtype="u2") - pix_addr = np.zeros(total_photons, dtype="u2") - ToA = np.zeros(total_photons, dtype="u2") - ToT = np.zeros(total_photons, dtype="u4") - FToA = np.zeros(total_photons, dtype="u2") - SPIDR = np.zeros(total_photons, dtype="u2") - chip_number = np.zeros(total_photons, dtype="u1") - basetime = np.zeros(total_photons, dtype="u8") - timestamp = np.zeros(total_photons, dtype="u8") - - photon_offset = 0 - chip = np.uint16(0) - expected_msg_count = np.uint16(0) - msg_run_count = np.uint(0) - - heartbeat_lsb = np.uint64(0) - heartbeat_msb = np.uint64(0) - heartbeat_time = np.uint64(0) - # loop over the packet headers (can not vectorize this with numpy) - for j in range(len(data)): - msg = data[j] - typ = types[j] - if typ == 1: - # 1: packet header (id'd via TPX3 magic number) - if expected_msg_count != msg_run_count: - print("missing messages!", msg) - # extract scalar information from the header - - # "number of pixels in chunk" is given in bytes not words - # and means all words in the chunk, not just "photons" - expected_msg_count = get_block(msg, 16, 48) // 8 - # what chip we are on - chip = np.uint8(get_block(msg, 8, 32)) - msg_run_count = 0 - elif typ == 2 or typ == 6: - # 2: photon event (id'd via 0xB upper nibble) - # 6: frame driven data (id'd via 0xA upper nibble) (??) - - # | - - # pixAddr is 16 bits - # these names and math are adapted from c++ code - l_pix_addr = pix_addr[photon_offset] = (msg >> np.uint(44)) & np.uint(0xFFFF) - # This is laid out 16ibts which are 2 interleaved 8 bit unsigned ints - # CCCCCCCRRRRRRCRR - # |dcol ||spix|^|| - # | 7 || 6 |1|2 - # - # The high 7 bits of the column - # '1111111000000000' - dcol = (l_pix_addr & np.uint(0xFE00)) >> np.uint(8) - # The high 6 bits of the row - # '0000000111111000' - spix = (l_pix_addr & np.uint(0x01F8)) >> np.uint(1) - rowcol = _shift_xy( - chip, - # add the low 2 bits of the row - # '0000000000000011' - spix + (l_pix_addr & np.uint(0x3)), - # add the low 1 bit of the column - # '0000000000000100' - dcol + ((l_pix_addr & np.uint(0x4)) >> np.uint(2)), - ) - col = x[photon_offset] = rowcol[1] - y[photon_offset] = rowcol[0] - # ToA is 14 bits - ToA[photon_offset] = (msg >> np.uint(30)) & np.uint(0x3FFF) - # ToT is 10 bits - # report in ns - ToT[photon_offset] = ((msg >> np.uint(20)) & np.uint(0x3FF)) * 25 - # FToA is 4 bits - l_FToA = FToA[photon_offset] = (msg >> np.uint(16)) & np.uint(0xF) - # SPIDR time is 16 bits - SPIDR[photon_offset] = msg & np.uint(0xFFFF) - # chip number (this is a constant) - chip_number[photon_offset] = chip - # heartbeat time - basetime[photon_offset] = heartbeat_time - - ToA_coarse = (SPIDR[photon_offset] << np.uint(14)) | ToA[photon_offset] - pixelbits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) - heartbeat_time_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) - diff = heartbeat_time_bits - pixelbits - if diff == 1 or diff == -3: - heartbeat_time -= np.uint(0x10000000) - elif diff == -1 or diff == 3: - heartbeat_time += np.uint(0x10000000) - globaltime = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) - - timestamp[photon_offset] = (globaltime << np.uint(12)) - (l_FToA << np.uint(8)) - # correct 
for phase shift - phase = np.uint((col / 2) % 16) - if phase == 0: - timestamp[photon_offset] += 16 << 8 - else: - timestamp[photon_offset] += phase << 8 - - photon_offset += 1 - msg_run_count += 1 - elif typ == 3: - # 3: TDC timstamp (id'd via 0x6 upper nibble) - # TODO: handle these! - msg_run_count += 1 - elif typ == 4: - # 4: global timestap (id'd via 0x4 upper nibble) - subheader = (msg >> np.uint(56)) & np.uint(0x0F) - if subheader == 0x4: - # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) - elif subheader == 0x5: - # timer msb - - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - # TODO the c++ code has large jump detection, do not understand why - heartbeat_time = heartbeat_msb | heartbeat_lsb - else: - raise Exception("unknown header") - - msg_run_count += 1 - elif typ == 5: - # 5: "command" data (id'd via 0x7 upper nibble) - # TODO handle this! - msg_run_count += 1 - else: - raise Exception("Not supported") - - return x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number, basetime, timestamp + return x, y, tot, ts, chips def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -421,11 +240,11 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr An iterable over the parsing results, each encapsulated in a dictionary """ - spidr_epoch, last_spidr = 0, 0 + last_ts = 0 for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips, spidr_epoch, last_spidr = _ingest_raw_data_rollover(data, spidr_epoch, last_spidr=last_spidr) - + x, y, tot, ts, chips = _ingest_raw_data(data, last_ts = last_ts) + last_ts = ts[-1] yield { k.strip(): v for k, v in zip( @@ -435,7 +254,7 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr } -def ingest_raw_data(data: IA) -> Dict[str, NDArray]: +def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -447,14 +266,11 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: Returns ------- Dict[str, NDArray] - Keys of x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number + Keys of x, y, ToT, chip_number """ return { k.strip(): v - for k, v in zip( - "x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number, basetime, timestamp".split(","), - _ingest_raw_data(data), - ) + for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data, last_ts=last_ts)) } # ^-- tom wrote From d01bf0c397a7c3f92d930be36df2816920604fd5 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 19 Jul 2024 13:43:48 -0400 Subject: [PATCH 03/15] Register all message types --- tpx3awkward/_utils.py | 75 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 4baad5a..c1ba420 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -198,6 +198,34 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): return x, y, ToT, ts +@numba.jit(nopython=True) +def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): + """Decode a heartbeat time message (idetified by its 0x4 upper nibble) + + Parameters + ---------- + msg (uint64): tpx3 binary message with upper_nibble == 0x4 + + Returns + ---------- + Integer value of the heartbeat time. 
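+
+    Example (illustrative): a subheader-0x4 packet only caches the 32-bit LSB of the timer; the next
+    subheader-0x5 packet shifts its 16-bit payload left by 32 bits and ORs it with the cached LSB to
+    form the full heartbeat time.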
+ """ + subheader = (msg >> np.uint(56)) & np.uint(0x0F) + if subheader == 0x4: + # timer lsb, 32 bits of time + heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + elif subheader == 0x5: + # timer msb + time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + heartbeat_msb = time_msg << np.uint(32) + # TODO the c++ code has large jump detection, do not understand why + heartbeat_time = heartbeat_msb | heartbeat_lsb + else: + raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") + + return heartbeat_lsb, heartbeat_time + + @numba.jit(nopython=True) def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): @@ -207,21 +235,52 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - i, chip_indx = 0, 0 + photon_count, chip_indx, msg_run_count = 0, 0, 0 for msg in data: if is_packet_header(msg): + # Type 1: packet header (id'd via TPX3 magic number) + if expected_msg_count != msg_run_count: + print("Missing messages!", msg) + + # extract the chip number for the following photon events chip_indx = np.uint8(get_block(msg, 8, 32)) + + # "number of pixels in chunk" is given in bytes not words and means all words in the chunk, not just "photons" + expected_msg_count = get_block(msg, 16, 48) // 8 + msg_run_count = 0 + elif matches_nibble(msg, 0xB): - chips[i] = chip_indx + # Type 2: photon event (id'd via 0xB upper nibble) + chips[photon_count] = chip_indx _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) - x[i] = _x - y[i] = _y - tot[i] = _tot - ts[i] = last_ts = _ts - i += 1 + x[photon_count] = _x + y[photon_count] = _y + tot[photon_count] = _tot + ts[photon_count] = last_ts = _ts + photon_count += 1 + msg_run_count += 1 + + elif matches_nibble(msg, 0x6): + # Type 3: TDC timstamp (id'd via 0x6 upper nibble) + # TODO: handle these! + msg_run_count += 1 + + elif matches_nibble(msg, 0x4): + # Type 4: global timestap (id'd via 0x4 upper nibble) + heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, heartbeat_lsb) + msg_run_count += 1 + + elif matches_nibble(msg, 0x7): + # Type 5: "command" data (id'd via 0x7 upper nibble) + # TODO handle this! 
+ msg_run_count += 1 + + else: + raise Exception(f"Not supported: {msg}") + # Sort the timestamps - indx = np.argsort(ts[:i], kind="mergesort") + indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] From 08d896c568f182affbef3bd5cdc3f0577186895d Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 19 Jul 2024 16:12:49 -0400 Subject: [PATCH 04/15] FIX: missing initializations and dtypes --- tpx3awkward/_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index c1ba420..dbcf8bc 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -199,7 +199,7 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): @numba.jit(nopython=True) -def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): +def decode_heartbeat(msg: np.uint64, heartbeat_lsb: np.uint64 = np.uint64(0)): """Decode a heartbeat time message (idetified by its 0x4 upper nibble) Parameters @@ -214,10 +214,12 @@ def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): if subheader == 0x4: # timer lsb, 32 bits of time heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + heartbeat_time = None elif subheader == 0x5: # timer msb time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) heartbeat_msb = time_msg << np.uint(32) + print(type(heartbeat_msb), type(heartbeat_lsb)) # TODO the c++ code has large jump detection, do not understand why heartbeat_time = heartbeat_msb | heartbeat_lsb else: @@ -234,8 +236,9 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") + heartbeat_lsb=np.uint64(0) - photon_count, chip_indx, msg_run_count = 0, 0, 0 + photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) @@ -267,7 +270,7 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, heartbeat_lsb) + heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, np.uint64(heartbeat_lsb)) msg_run_count += 1 elif matches_nibble(msg, 0x7): From 6ffe1534ff16fa1abf134af54278a093306544a0 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 24 Jul 2024 18:36:49 -0400 Subject: [PATCH 05/15] ENH: check for spidr rollover with global timestamp --- tpx3awkward/_utils.py | 93 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index dbcf8bc..42668fc 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -133,14 +133,14 @@ def get_spidr(msg): @numba.jit(nopython=True) -def decode_message(msg, chip, last_ts: np.uint64 = 0): +def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) Parameters ---------- msg (uint64): tpx3 binary message chip (uint8): chip ID, 0..3 - last_ts (uint64): last timestamp from the previous message; defines the SPIDR rollover + heartbeat_time (uint64): # bit position : ... 44 40 36 32 28 24 20 16 12 8 7 4 3 0 # 0xFFFFC0000000 : 1111 1111 1111 1111 1100 0000 0000 0000 0000 0000 0000 0000 @@ -177,7 +177,6 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): # pixel_bits are the two highest bits of the SPIDR (i.e. 
the pixelbits range covers 262143 spidr cycles) pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) # heart_bits are the bits at the same positions in the heartbeat_time - heartbeat_time = np.uint64(last_ts) heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits diff = heart_bits - pixel_bits @@ -189,7 +188,7 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): elif diff == -1 or diff == 3: heartbeat_time += np.uint(0x10000000) # Construct globaltime - global_time = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) + global_time = (np.uint64(heartbeat_time) & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) # Construct timestamp with phase correction @@ -198,45 +197,18 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): return x, y, ToT, ts -@numba.jit(nopython=True) -def decode_heartbeat(msg: np.uint64, heartbeat_lsb: np.uint64 = np.uint64(0)): - """Decode a heartbeat time message (idetified by its 0x4 upper nibble) - - Parameters - ---------- - msg (uint64): tpx3 binary message with upper_nibble == 0x4 - - Returns - ---------- - Integer value of the heartbeat time. - """ - subheader = (msg >> np.uint(56)) & np.uint(0x0F) - if subheader == 0x4: - # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) - heartbeat_time = None - elif subheader == 0x5: - # timer msb - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - print(type(heartbeat_msb), type(heartbeat_lsb)) - # TODO the c++ code has large jump detection, do not understand why - heartbeat_time = heartbeat_msb | heartbeat_lsb - else: - raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") - - return heartbeat_lsb, heartbeat_time - - -@numba.jit(nopython=True) -def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): +# @numba.jit(nopython=True) +def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) x = np.zeros_like(data, dtype="u2") y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - heartbeat_lsb=np.uint64(0) + heartbeat_lsb = np.uint64(0) + heartbeat_msb = np.uint64(0) + heartbeat_time = np.uint64(0) + hb = np.zeros_like(data, dtype="u8") photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: @@ -255,11 +227,23 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx - _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) + _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y tot[photon_count] = _tot - ts[photon_count] = last_ts = _ts + ts[photon_count] = _ts + + if (photon_count > 0) and (_ts - ts[photon_count-1] > 2**30): + prev_ts = ts[:photon_count] # This portion needs to be adjusted + # Find what the current timestamp would be without global heartbeat + _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) + # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received + if int(max(prev_ts[:10])) - int(min(prev_ts[-10:])) > 2**32: + prev_ts[prev_ts < 2**33] += np.uint64(2**34) + _ts_0 += 2**34 + # Ajust the existing timestamps + 
ts[:photon_count] = prev_ts + (_ts - _ts_0) + photon_count += 1 msg_run_count += 1 @@ -270,8 +254,22 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, np.uint64(heartbeat_lsb)) + subheader = (msg >> np.uint(56)) & np.uint(0x0F) + if subheader == 0x4: + # timer lsb, 32 bits of time + heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + elif subheader == 0x5: + # timer msb + time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + heartbeat_msb = time_msg << np.uint(32) + heartbeat_time = (heartbeat_msb | heartbeat_lsb) # << np.uint(4) + # TODO the c++ code has large jump detection, do not understand why + else: + raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") + pass + msg_run_count += 1 + hb[photon_count] = heartbeat_time elif matches_nibble(msg, 0x7): # Type 5: "command" data (id'd via 0x7 upper nibble) @@ -286,8 +284,9 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + hb = hb[indx] - return x, y, tot, ts, chips + return x, y, tot, ts, chips, hb def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -302,21 +301,19 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr An iterable over the parsing results, each encapsulated in a dictionary """ - last_ts = 0 for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips = _ingest_raw_data(data, last_ts = last_ts) - last_ts = ts[-1] + x, y, tot, ts, chips, hb = _ingest_raw_data(data) yield { k.strip(): v for k, v in zip( - "x, y, tot, timestamp, chips".split(","), - (x, y, tot, ts, chips), + "x, y, tot, timestamp, chips, hb".split(","), + (x, y, tot, ts, chips, hb), ) } -def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: +def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -332,7 +329,7 @@ def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data, last_ts=last_ts)) + for k, v in zip("x, y, ToT, ts, chip_number, hb".split(","), _ingest_raw_data(data)) } # ^-- tom wrote From b125687bf7322a2fdd69e492cc890e0ab93036a6 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 24 Jul 2024 19:57:28 -0400 Subject: [PATCH 06/15] FIX: int overflow errors, force uint64 dtype --- tpx3awkward/_utils.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 42668fc..6de53c5 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -163,6 +163,7 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): ---------- Arrays of pixel coordinates, ToT, and timestamps. """ + msg, heartbeat_time = np.uint64(msg), np.uint64(heartbeat_time) # Force types x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code # ToA is 14 bits ToA = (msg >> np.uint(30)) & np.uint(0x3FFF) @@ -175,20 +176,20 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): ToA_coarse = (SPIDR << np.uint(14)) | ToA # pixel_bits are the two highest bits of the SPIDR (i.e. 
the pixelbits range covers 262143 spidr cycles) - pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) + pixel_bits = np.int8((ToA_coarse >> np.uint(28)) & np.uint(0x3)) # heart_bits are the bits at the same positions in the heartbeat_time - heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) + heart_bits = np.int8((heartbeat_time >> np.uint(28)) & np.uint(0x3)) # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits diff = heart_bits - pixel_bits # diff +/-1 occur when pixelbits step up # diff +/-3 occur when spidr counter overfills # diff can also be 0 -- then nothing happens -- but never +/-2 - if diff == 1 or diff == -3: + if (diff == 1 or diff == -3) and (heartbeat_time > np.uint(0x10000000)): heartbeat_time -= np.uint(0x10000000) elif diff == -1 or diff == 3: heartbeat_time += np.uint(0x10000000) # Construct globaltime - global_time = (np.uint64(heartbeat_time) & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) + global_time = (heartbeat_time & np.uint(0xFFFFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) # Construct timestamp with phase correction @@ -196,8 +197,7 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): return x, y, ToT, ts - -# @numba.jit(nopython=True) +@numba.jit(nopython=True) def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) @@ -208,7 +208,6 @@ def _ingest_raw_data(data): heartbeat_lsb = np.uint64(0) heartbeat_msb = np.uint64(0) heartbeat_time = np.uint64(0) - hb = np.zeros_like(data, dtype="u8") photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: @@ -227,21 +226,24 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx + # heartbeat_time = np.uint64(0) _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y tot[photon_count] = _tot ts[photon_count] = _ts - if (photon_count > 0) and (_ts - ts[photon_count-1] > 2**30): + if (photon_count > 0) and (_ts > ts[photon_count-1]) and (_ts - ts[photon_count-1] > 2**30): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received - if int(max(prev_ts[:10])) - int(min(prev_ts[-10:])) > 2**32: + head_max = max(prev_ts[:10]) + tail_min = min(prev_ts[-10:]) + if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += 2**34 - # Ajust the existing timestamps + # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) photon_count += 1 @@ -254,13 +256,13 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - subheader = (msg >> np.uint(56)) & np.uint(0x0F) + subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: # timer msb - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + time_msg = (msg >> np.uint(16)) & np.uint64(0xFFFF) heartbeat_msb = time_msg << np.uint(32) heartbeat_time = (heartbeat_msb | 
heartbeat_lsb) # << np.uint(4) # TODO the c++ code has large jump detection, do not understand why @@ -269,7 +271,6 @@ def _ingest_raw_data(data): pass msg_run_count += 1 - hb[photon_count] = heartbeat_time elif matches_nibble(msg, 0x7): # Type 5: "command" data (id'd via 0x7 upper nibble) @@ -284,9 +285,8 @@ def _ingest_raw_data(data): indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] - hb = hb[indx] - return x, y, tot, ts, chips, hb + return x, y, tot, ts, chips def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -303,12 +303,12 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips, hb = _ingest_raw_data(data) + x, y, tot, ts, chips = _ingest_raw_data(data) yield { k.strip(): v for k, v in zip( - "x, y, tot, timestamp, chips, hb".split(","), - (x, y, tot, ts, chips, hb), + "x, y, tot, timestamp, chips".split(","), + (x, y, tot, ts, chips), ) } @@ -329,7 +329,7 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number, hb".split(","), _ingest_raw_data(data)) + for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data)) } # ^-- tom wrote From 0a942ae45c9bcfd1613aa4ef278bc9da236b2037 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 25 Jul 2024 09:58:14 -0400 Subject: [PATCH 07/15] FIX: timer ensure timer LSB is initialized --- tpx3awkward/_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 6de53c5..8e656f0 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -205,8 +205,7 @@ def _ingest_raw_data(data): y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - heartbeat_lsb = np.uint64(0) - heartbeat_msb = np.uint64(0) + heartbeat_lsb = None #np.uint64(0) heartbeat_time = np.uint64(0) photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 @@ -226,7 +225,6 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx - # heartbeat_time = np.uint64(0) _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y @@ -240,9 +238,10 @@ def _ingest_raw_data(data): # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) + # Compare the difference with some big number (e.g. 
1/4 of SPIDR) if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) - _ts_0 += 2**34 + _ts_0 += np.uint64(2**34) # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) @@ -258,14 +257,14 @@ def _ingest_raw_data(data): # Type 4: global timestap (id'd via 0x4 upper nibble) subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: - # timer lsb, 32 bits of time + # timer LSB, 32 bits of time heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: - # timer msb - time_msg = (msg >> np.uint(16)) & np.uint64(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - heartbeat_time = (heartbeat_msb | heartbeat_lsb) # << np.uint(4) - # TODO the c++ code has large jump detection, do not understand why + # timer MSB -- only matters if LSB has been received already + if heartbeat_lsb is not None: + heartbeat_msb = ((msg >> np.uint(16)) & np.uint64(0xFFFF)) << np.uint(32) + heartbeat_time = (heartbeat_msb | heartbeat_lsb) + # TODO the c++ code has large jump detection, do not understand why else: raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") pass From 7677757d7f68dcf57579f74733b15890505dc7ca Mon Sep 17 00:00:00 2001 From: Justin Goodrich Date: Mon, 29 Jul 2024 23:13:51 -0400 Subject: [PATCH 08/15] Miscallenous updates to tpx3awkward. Various clean-ups, optimized handling of KDTree neighbor list, updated conversion functions, and removed file-finding logic specific to CHX's current ophyd object. --- tpx3awkward/_utils.py | 611 ++++++++--------------------- tpx3awkward/examples/example.ipynb | 351 +++++++++++++++++ 2 files changed, 508 insertions(+), 454 deletions(-) create mode 100644 tpx3awkward/examples/example.ipynb diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 8e656f0..7f6498a 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -1,19 +1,15 @@ import os -import numpy as np from pathlib import Path +from typing import TypeVar, Union, Dict, List, Tuple +import numpy as np from numpy.typing import NDArray -from typing import TypeVar, Union, Dict, Set, List, Iterable import numba import pandas as pd from scipy.spatial import KDTree import multiprocessing from tqdm import tqdm import warnings -import gc -try: - from pyCHX.chx_packages import db, get_sid_filenames -except ImportError: - warnings.warn("Could not import pyCHX package. Proceeding without it...") + IA = NDArray[np.uint64] UnSigned = TypeVar("UnSigned", IA, np.uint64) @@ -29,28 +25,26 @@ def raw_as_numpy(fpath: Union[str, Path]) -> IA: ---------- """ - return np.fromfile(fpath, dtype=" UnSigned: - return v >> np.uint64(shift) & np.uint64(2**width - 1) -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def matches_nibble(data, nibble) -> numba.boolean: return (int(data) >> 60) == nibble -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def is_packet_header(v: UnSigned) -> UnSigned: """Identify packet headers by magic number (TPX3 as ascii on lowest 8 bytes]""" return get_block(v, 32, 0) == 861425748 -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def classify_array(data: IA) -> NDArray[np.uint8]: """ Create an array the same size as the data classifying 64bit uint by type. 
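A quick, illustrative sanity check of the packet-header helpers above (not part of the patch): the magic
number compared against in is_packet_header is simply "TPX3" read as a little-endian 32-bit integer.

    import numpy as np
    from tpx3awkward._utils import is_packet_header

    assert int.from_bytes(b"TPX3", "little") == 861425748
    assert is_packet_header(np.uint64(861425748))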
@@ -78,7 +72,7 @@ def classify_array(data: IA) -> NDArray[np.uint8]: return output -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def _shift_xy(chip, row, col): # TODO sort out if this needs to be paremeterized out = np.zeros(2, "u4") @@ -100,7 +94,7 @@ def _shift_xy(chip, row, col): return out -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def decode_xy(msg, chip): # these names and math are adapted from c++ code l_pix_addr = (msg >> np.uint(44)) & np.uint(0xFFFF) @@ -127,12 +121,12 @@ def decode_xy(msg, chip): return rowcol[1], rowcol[0] -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def get_spidr(msg): return msg & np.uint(0xFFFF) -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) @@ -197,7 +191,8 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): return x, y, ToT, ts -@numba.jit(nopython=True) + +@numba.jit(nopython=True, cache=True) def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) @@ -281,37 +276,12 @@ def _ingest_raw_data(data): # Sort the timestamps - indx = np.argsort(ts[:photon_count], kind="mergesort") - chips = chips[indx] - x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + indx = np.argsort(ts[:photon_count], kind="mergesort") # is mergesort the best here? wondering if this could be optimized + x, y, tot, ts, chips = x[indx], y[indx], tot[indx], ts[indx], chips[indx] return x, y, tot, ts, chips -def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: - """Parse values out of a sequence of timepix3 files with rollover of SPIDR counter. - - Parameters - ---------- - fpaths : A sorted sequence of tpx3 filepaths. - - Returns - ------- - An iterable over the parsing results, each encapsulated in a dictionary - """ - - for fpath in fpaths: - data = raw_as_numpy(fpath) - x, y, tot, ts, chips = _ingest_raw_data(data) - yield { - k.strip(): v - for k, v in zip( - "x, y, tot, timestamp, chips".split(","), - (x, y, tot, ts, chips), - ) - } - - def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -328,17 +298,14 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data)) + for k, v in zip("x, y, ToT, t, chip".split(","), _ingest_raw_data(data)) } -# ^-- tom wrote -# v-- justin wrote + """ Some basic functions that help take the initial output of ingest_raw_data and finish the processing. """ - - -def raw_to_sorted_df(fpath: Union[str, Path]) -> pd.DataFrame: +def tpx_to_raw_df(fpath: Union[str, Path]) -> pd.DataFrame: """ Parses a .tpx3 file and returns the raw data after timesorting. @@ -353,28 +320,7 @@ def raw_to_sorted_df(fpath: Union[str, Path]) -> pd.DataFrame: DataFrame of raw events from the .tpx3 file. """ raw_df = pd.DataFrame(ingest_raw_data(raw_as_numpy(fpath))) - return raw_df.sort_values("timestamp").reset_index(drop=True) - - -def condense_raw_df(df: pd.DataFrame) -> pd.DataFrame: - """ - Condenses the raw dataframe with only key information necesary for the analysis. Returns a dataframe with timestamp (renamed to t), x, y, and ToT. - - Parameters - ---------- - df : pd.DataFrame - DataFrame generated using raw_to_sorted_df(). 
- - Returns - ------- - pd.DataFrame - Dataframe condensed to only contain pertinent information for analysis. - """ - cdf = df[["timestamp", "x", "y", "ToT"]] - cdf = cdf.rename( - columns={"timestamp": "t"} - ) # obviously not necessary, just easier to type 't' a lot than 'timestamp' - return cdf + return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point, it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: @@ -391,26 +337,25 @@ def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame df with only the events with ToT > 0 """ - fdf = df[df["ToT"] > 0] - return fdf + return df[df["ToT"] > 0] """ Functions to help perform clustering and centroiding on raw data. """ -TIMESTAMP_VALUE = ((1e-9) / 4096) * 25 -MICROSECOND = 10 ** (-6) +TIMESTAMP_VALUE = 1.5625*1e-9 # each raw timestamp is 1.5625 seconds +MICROSECOND = 1e-6 # We have had decent success with these values, but do not know for sure if they are optimal. -DEFAULT_CLUSTER_RADIUS = 2 -DEFAULT_CLUSTER_TW_MICROSECONDS = 0.5 +DEFAULT_CLUSTER_RADIUS = 3 +DEFAULT_CLUSTER_TW_MICROSECONDS = 0.3 DEFAULT_CLUSTER_TW = int(DEFAULT_CLUSTER_TW_MICROSECONDS * MICROSECOND / TIMESTAMP_VALUE) -def neighbor_set_from_df( +def cluster_df( df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS -) -> tuple[np.ndarray, Set[tuple[int]]]: +) -> tuple[np.ndarray, np.ndarray]: """ Uses scipy.spatial's KDTree to cluster raw input data. Requires a time window for clustering adjacent pixels and the total search radius. @@ -431,90 +376,41 @@ def neighbor_set_from_df( An set of tuples of the indices of the clustered events. The outer set is each cluster, and the inner tuples are the events in each cluster. """ events = np.array( - df[["t", "x", "y", "ToT", "t"]].values + df[["t", "x", "y", "ToT", "t"]].values # raw data stored in dataframe. duplicate timestamp column as the first instance is windowed ) # first three columns are for search radius of KDTree events[:, 0] = np.floor_divide(events[:, 0], tw) # bin by the time window - tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates + tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates (t/timewindow, x, y) neighbors = tree.query_ball_tree( tree, radius ) # compare tree against itself to find neighbors within the search radius - clusters = set(tuple(n) for n in neighbors) # turn the list of lists into a set of tuples - return events, clusters - - -def cluster_stats( - clusters: Set[tuple[int]] -) -> tuple[int]: - """ - Determines basic information about cluster information, such as the number of clusters and size of the largest cluster. 
- - Parameters - ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - - Returns - ------- - int - The total number of clusters - int - The number of events in the largest cluster - """ - num_clusters = len(clusters) - max_cluster = max(map(len, clusters)) - return num_clusters, max_cluster + if len(neighbors) >= 2147483647: # performance is marginally better if can use int32 for the indices, so check for that + dtype = np.int64 + else: + dtype = np.int32 + return pd.DataFrame(neighbors).fillna(-1).astype(dtype).drop_duplicates().values, events[:, 1:] # a bit weird, but faster than using sets and list operators to unpack neighbors -def create_cluster_arr( - clusters: Set[tuple[int]], num_clusters: int, max_cluster: int -) -> np.ndarray: # is there a better way to do this? - """ - Converts the clusters from a set of tuples of indices to an 2D numpy array format which can be efficiently iterated through with Numba. - - Parameters - ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest cluster - - Returns - ------- - np.ndarray - The cluster data now in a 2D numpy array. - """ - cluster_arr = np.full( - (num_clusters, max_cluster), -1, dtype=np.int64 - ) # fill with -1; these will be passed later - for cluster_num, cluster in enumerate(clusters): - for event_num, event in enumerate(cluster): - cluster_arr[cluster_num, event_num] = event - return cluster_arr - - -@numba.jit(nopython=True) -def cluster_arr_to_cent( - cluster_arr: np.ndarray, events: np.ndarray, num_clusters: int, max_cluster: int -) -> tuple[np.ndarray]: +@numba.jit(nopython=True, cache=True) +def centroid_clusters( + cluster_arr: np.ndarray, events: np.ndarray +) -> tuple[np.ndarray]: """ Performs the centroiding of a group of clusters using Numba. Note I originally attempted to unpack the clusters using list comprehensions, but this approach is significantly faster. Parameters ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest clust + clusters : nd.array + The numpy representation of the clusters' event indices. + events : nd.array + The numpy represetation of the event data. Returns ------- tuple[np.ndarray] t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. 
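
    Example (illustrative): a cluster holding events at x = 10 and x = 11 with ToT = 75 ns and 25 ns
    accumulates xc = 10*75 + 11*25 = 1025 and ToT_sum = 100 (a ToT-weighted centroid of 10.25 once
    normalised), with ToT_max = 75 and t taken from the ToT = 75 event.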
""" + num_clusters = cluster_arr.shape[0] + max_cluster = cluster_arr.shape[1] t = np.zeros(num_clusters, dtype="uint64") xc = np.zeros(num_clusters, dtype="float32") yc = np.zeros(num_clusters, dtype="float32") @@ -527,13 +423,13 @@ def cluster_arr_to_cent( for event_num in range(max_cluster): event = cluster_arr[cluster_id, event_num] if event > -1: # if we have an event here - if events[event, 3] > _ToT_max: # find the max ToT, assign, use that time - _ToT_max = events[event, 3] - t[cluster_id] = events[event, 4] + if events[event, 2] > _ToT_max: # find the max ToT, assign, use that time + _ToT_max = events[event, 2] + t[cluster_id] = events[event, 3] ToT_max[cluster_id] = _ToT_max - xc[cluster_id] += events[event, 1] * events[event, 3] # x and y centroids by time over threshold - yc[cluster_id] += events[event, 2] * events[event, 3] - ToT_sum[cluster_id] += events[event, 3] # calcuate sum + xc[cluster_id] += events[event, 0] * events[event, 2] # x and y centroids by time over threshold + yc[cluster_id] += events[event, 1] * events[event, 2] + ToT_sum[cluster_id] += events[event, 2] # calcuate sum n[cluster_id] += np.ubyte(1) # number of events in cluster else: break @@ -568,60 +464,15 @@ def ingest_cent_data( } -def cent_to_numpy( - cluster_arr: np.ndarray, events: int, num_clusters: int, max_cluster: int -) -> Dict[str, np.ndarray]: - """ - Wrapper function to perform ingest_cent_data(cluster_arr_to_cent()) - - Parameters - ---------- - cluster_arr : np.ndarray - The array of cluster events from create_cluster_arr() - events : int - Number of photon events - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest clust - - Returns - ------- - Dict[str, np.ndarray] - Keys of t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. - """ - return ingest_cent_data(cluster_arr_to_cent(cluster_arr, events, num_clusters, max_cluster)) - - -def cent_to_df( - cd_np: Dict[str, np.ndarray] -) -> pd.DataFrame: - """ - Returns the centroided dataframe from the zipped inputs. - - Parameters - ---------- - cd_np : Dict[str, np.ndarray] - Dictionary of the clustered data. - - Returns - ------- - pd.DataFrame - Time sorted dataframe of the centroids. - """ - cent_df = pd.DataFrame(cd_np) - return cent_df.sort_values("t").reset_index(drop=True) - - -def raw_df_to_cluster_df( - raw_df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS +def raw_df_to_cent_df( + df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS ) -> pd.DataFrame: """ Uses functions defined herein to take Dataframe of raw data and return dataframe of clustered data. Parameters ---------- - raw_df : pd.DataFrame + df : pd.DataFrame Pandas DataFrame of the raw data tw : int The time window to be considered "coincident" for clustering purposes @@ -633,11 +484,10 @@ def raw_df_to_cluster_df( pd.DataFrame Pandas DataFrame of the centroided data. 
""" - filt_cond_raw_df = drop_zero_tot(condense_raw_df(raw_df)) - events, clusters = neighbor_set_from_df(filt_cond_raw_df, tw, radius) - num_clusters, max_cluster = cluster_stats(clusters) - cluster_arr = create_cluster_arr(clusters, num_clusters, max_cluster) - return cent_to_df(cent_to_numpy(cluster_arr, events, num_clusters, max_cluster)) + fdf = drop_zero_tot(df) + cluster_arr, events = cluster_df(fdf, tw, radius) + data = centroid_clusters(cluster_arr, events) + return pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) # should we specify the sort type here? this should be *almost* completely sorted already def add_centroid_cols( @@ -659,107 +509,31 @@ def add_centroid_cols( Originally dataframe with new columns x, y, and t_ns added. """ if gap: - df.loc[df['xc'] >= 255.5, 'xc'] += 2 - df.loc[df['yc'] >= 255.5, 'yc'] += 2 - df["x"] = np.round(df["xc"]).astype(np.uint16) + df.loc[df["xc"] >= 255.5, "xc"] += 2 + df.loc[df["yc"] >= 255.5, "yc"] += 2 + df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel df["y"] = np.round(df["yc"]).astype(np.uint16) - df["t_ns"] = df["t"] / 4096 * 25 + df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? return df """ -A bunch of functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. +Functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. """ RAW_H5_SUFFIX = "" CENT_H5_SUFFIX = "_cent" CONCAT_H5_SUFFIX = "_cent" -def extract_fpaths_from_sid( - sid: int -) -> List[str]: - """ - Extract file paths from a given sid. - - Parameters - ---------- - sid : int - Short ID of a BlueSky scan - - Returns - ------- - List[str] - Filepaths of the written .tpx3, as recorded in Tiled - """ - return list(db[sid].table()["tpx3_files_raw_filepaths"].to_numpy()[0]) - - -def extract_uid_from_fpaths( - fpaths: List[str] -) -> str: - """ - Extract scan unique ID from file paths. - - Parameters - ---------- - fpaths : List[str] - List of the filepaths. - - Returns - ------- - str - String of the first file's unique ID. - - """ - return os.path.basename(fpaths[0])[:23] - - -def extract_dir_from_fpaths( - fpaths: List[str] -) -> str: - """ - Extract directory from file paths. - - Parameters - ---------- - fpaths : List[str] - List of the filepaths. - - Returns - ------- - str - String of the first file's directory. - - """ - return os.path.dirname(fpaths[0]) - - -def extract_uid_from_sid( - sid: int -) -> str: - """ - Extract user ID from a given sid. - - Parameters - ---------- - sid : int - - Returns - ------- - str - String of the short ID's corresponding unique ID. - - """ - return extract_uid_from_fpaths(extract_fpaths_from_sid(sid)) - - -def convert_file( - fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False +def convert_tpx_file( + fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False, overwrite: bool = True ): """ Convert a .tpx3 file into raw and centroided Pandas dataframes, which are stored in .h5 files. + TO DO: Args to specify output directory (default will be same directory as .tpx3 file as is now). 
+ Parameters ---------- fpath : Union[str, Path] @@ -770,193 +544,122 @@ def convert_file( The radius, in pixels, to perform centroiding print_details : bool = False Boolean toggle about whether to print detailed data. + overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. + """ fname, ext = os.path.splitext(fpath) dfname = "{}{}.h5".format(fname, RAW_H5_SUFFIX) dfcname = "{}{}.h5".format(fname, CONCAT_H5_SUFFIX) if ext == ".tpx3" and os.path.exists(fpath): - file_size = os.path.getsize(fpath) - have_df = os.path.exists(dfname) - have_dfc = os.path.exists(dfcname) - - if have_df and have_dfc: - print("-> {} exists, skipping.".format(dfname)) - else: - print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) - time_window = time_window_microsecond * 1e-6 - time_stamp_conversion = 6.1e-12 - timedif = int(time_window / time_stamp_conversion) - - if print_details: - print("Loading {} data into dataframe...".format(fpath)) - df = raw_to_sorted_df(fpath) - num_events = df.shape[0] - - if print_details: - print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) - df.to_hdf(dfname, key='df', mode='w') - - if print_details: - print("Saving {} complete. Beginning clustering...".format(dfname)) - df_c = raw_df_to_cluster_df(df, timedif, radius) - num_clusters = df_c.shape[0] + + try: + file_size = os.path.getsize(fpath) + have_df = os.path.exists(dfname) + have_dfc = os.path.exists(dfcname) + + if have_df and have_dfc and not overwrite: + + print("-> {} exists, skipping.".format(dfname)) + + else: + + if print_details: + print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) + + time_window = time_window_microsecond * 1e-6 + time_stamp_conversion = 6.1e-12 + timedif = int(time_window / time_stamp_conversion) + + if print_details: + print("Loading {} data into dataframe...".format(fpath)) + + df = tpx_to_raw_df(fpath) + num_events = df.shape[0] + + if print_details: + print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) + + df.to_hdf(dfname, key="df", format="table", mode="w") + + if print_details: + print("Saving {} complete. Beginning clustering...".format(dfname)) + + df_c = raw_df_to_cent_df(df, timedif, radius) + + num_clusters = df_c.shape[0] + + if print_details: + print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) + + df_c.to_hdf(dfcname, key="df", format="table", mode="w") + + if print_details: + print("Saving {} complete. Moving onto next file.".format(dfcname)) + + except Exception as e: + if print_details: - print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) - df_c.to_hdf(dfcname, key='df', mode='w') - print("Saving {} complete. Moving onto next file.".format(dfcname)) + print("Conversion of {} failed due to {}, moving on.".format(fpath,e)) + else: - print("File not found. Moving onto next file.") - - -def convert_tpx3_parallel( - fpaths: Union[str, Path], num_workers: int = None -): + if print_details: + print("File not found. Moving onto next file.") + + +def convert_tpx3_files_parallel(fpaths: Union[List[str], List[Path]], num_workers: int = None): """ - Convert a list of .tpx3 files in a parallel processing pool. + Convert a list .tpx3 files in parallel using multiprocessing and convert_tpx_file(). + + TO DO: Accept more arguments for convert_tpx_file. 
Parameters ---------- - fpaths : Union[str, Path] - .tpx3 file paths to convert in a parallel processing pool. - num_workers : int = None - Number of parallel workers to employ. + fpath : Union[str, Path] + .tpx3 file path + time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in microseconds, to perform centroiding + radius : int = DEFAULT_CLUSTER_RADIUS + The radius, in pixels, to perform centroiding + print_details : bool = False + Boolean toggle about whether to print detailed data. + overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. + """ - if num_workers == None: - num_cores = multiprocessing.cpu_count() - max_workers = num_cores-1 + if num_workers is None: + max_workers = multiprocessing.cpu_count() else: max_workers = num_workers - - with multiprocessing.Pool(processes=max_workers) as pool: - pool.map(convert_file, fpaths) - - print("Parallel conversion complete") - - -def convert_tpx3_st(fpaths: Union[str, Path]): - """ - Convert a list of .tpx3 files in a single thread. - - Parameters - ---------- - fpaths : Union[str, Path] - .tpx3 file paths to convert in a single thread. - """ - for file in fpaths: - convert_file(file) - -def get_cent_files( - uid: str, dir_name: Union[str, Path] -) -> List[str]: - """ - Gets a list of the centroided .h5 files from a given uid, sorted by sequence number. - - Parameters - ---------- - uid : str - The unique ID of the scan of which we want to get the files. + with multiprocessing.Pool(processes=max_workers) as pool: + for _ in tqdm(pool.imap_unordered(convert_tpx_file, fpaths), total=len(fpaths)): + pass - dir_name : Union[str, path] - Directory to look in for the files. - Returns - ------- - List[str] - List of the centroided file paths. +def convert_tpx3_files(fpaths: Union[List[str], List[Path]], print_details = True): """ - cent_files = [ - os.path.join(dir_name, file) - for file in os.listdir(dir_name) - if file.endswith("{}.h5".format(CENT_H5_SUFFIX)) and str(uid) in file and len(os.path.basename(file)) == 44 - ] - - cent_files.sort(key=lambda f: int(os.path.splitext(os.path.basename(f))[0].split("_")[-2])) - return cent_files - - -def concat_cent_files( - cfpaths: List[Union[str, Path]] -): - """ - Concatenates several centroided files together. + Convert a list .tpx3 files in a single process using convert_tpx_file(). - Parameters - ---------- - cfpaths : List[str, Path] - List of the centroided .h5 files to concatenate together. - """ - dir_name = os.path.dirname(cfpaths[0]) - uid = extract_uid_from_fpaths(cfpaths) - - dfs = [pd.read_hdf(fpath, key='df') for fpath in tqdm(cfpaths)] - combined_df = pd.concat(dfs).reset_index(drop=True) - - save_path = os.path.join(dir_name, "{}{}.h5".format(uid, CONCAT_H5_SUFFIX)) - combined_df.to_hdf(save_path, key='df', mode='w') - - print("-> Saving complete.") - - -def get_con_cent_file( - sid: int -) -> str: - """ - Gets the location of the concatenated centroid files of a given sid. + TO DO: Accept more arguments for convert_tpx_file. Parameters ---------- - sid : int - Short ID of whichto get the centroided file path + fpath : Union[str, Path] + .tpx3 file path + time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in microseconds, to perform centroiding + radius : int = DEFAULT_CLUSTER_RADIUS + The radius, in pixels, to perform centroiding + print_details : bool = False + Boolean toggle about whether to print detailed data. 
+ overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. - Returns - ------- - str - Path of the centroided file. """ - fpaths = extract_fpaths_from_sid(sid) - uid = extract_uid_from_fpaths(fpaths) - dir_name = extract_dir_from_fpaths(fpaths) - cfpath = os.path.join(dir_name, "{}{}.h5".format(uid, CONCAT_H5_SUFFIX)) - - if os.path.exists(cfpath): - return cfpath - else: - print("-> Warning: {} does not exist".format(cfpath)) - return None - - -def convert_sids( - sids: List[int] -): - """ - Convert given sids by converting each .tpx3 file and then concatenating results together into a master dataframe. - - Parameters - ---------- - sids : List[int] - List of BlueSky scans' short IDs to convert. - """ - - for sid in sids: - print("\n\n---> Beginning sid: {} <---\n".format(sid)) + for file in fpaths: + convert_tpx_file(file, print_details = print_details) - tpx3fpaths = extract_fpaths_from_sid(sid) - dir_name = extract_dir_from_fpaths(tpx3fpaths) - num_tpx = len(tpx3fpaths) - uid = extract_uid_from_fpaths(tpx3fpaths) - - convert_tpx3_parallel(tpx3fpaths, num_workers=16) - centfpaths = get_cent_files(uid, dir_name) - num_cent = len(centfpaths) - - if num_tpx == num_cent: - print("---> Conversion numbers match") - concat_cent_files(centfpaths) - else: - print("---> Warning: conversion mismatch: tpx3={}, cent={}".format(num_tpx, num_cent)) - - print("---> Done with {}!".format(sid)) - gc.collect() + \ No newline at end of file diff --git a/tpx3awkward/examples/example.ipynb b/tpx3awkward/examples/example.ipynb new file mode 100644 index 0000000..6eeabb1 --- /dev/null +++ b/tpx3awkward/examples/example.ipynb @@ -0,0 +1,351 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "76010898-2082-49cb-9bff-8173fd79ad6e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import _utils as tpx\n", + "import pandas as pd\n", + "from tiled.client import from_uri\n", + "db = from_uri('https://tiled.nsls2.bnl.gov', 'dask')['chx']['raw']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "59bb0e40-1686-413c-8595-c38655e2d0d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sid = 143200 # example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1b629cf7-1d46-49b9-a41b-43f3b9972f4e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw dataframe: \n", + " x y ToT t chip\n", + "0 502 452 475 11301032 1\n", + "1 405 272 275 11302904 1\n", + "2 406 272 350 11302904 1\n", + "3 295 135 525 11303039 0\n", + "4 358 509 300 11303780 1\n", + "... ... ... ... ... ...\n", + "971254 200 46 550 651295354 3\n", + "971255 489 104 450 651296021 0\n", + "971256 389 21 300 651296305 0\n", + "971257 390 21 300 651296306 0\n", + "971258 37 325 75 651299518 2\n", + "\n", + "[971259 rows x 5 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n\n", + "0 11301032 502.000000 452.0 475 475 1\n", + "1 11302904 405.559998 272.0 350 625 2\n", + "2 11303039 295.000000 135.0 525 525 1\n", + "3 11303780 358.478271 509.0 300 575 2\n", + "4 11303973 412.409088 60.0 325 550 2\n", + "... ... ... ... ... ... 
..\n", + "659582 651294809 326.000000 379.0 475 475 1\n", + "659583 651295354 200.000000 46.0 550 550 1\n", + "659584 651296021 489.000000 104.0 450 450 1\n", + "659585 651296305 389.500000 21.0 300 600 2\n", + "659586 651299518 37.000000 325.0 75 75 1\n", + "\n", + "[659587 rows x 6 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n x y \\\n", + "0 11301032 504.000000 454.0 475 475 1 504 454 \n", + "1 11302904 407.559998 274.0 350 625 2 408 274 \n", + "2 11303039 297.000000 135.0 525 525 1 297 135 \n", + "3 11303780 360.478271 511.0 300 575 2 360 511 \n", + "4 11303973 414.409088 60.0 325 550 2 414 60 \n", + "... ... ... ... ... ... .. ... ... \n", + "659582 651294809 328.000000 381.0 475 475 1 328 381 \n", + "659583 651295354 200.000000 46.0 550 550 1 200 46 \n", + "659584 651296021 491.000000 104.0 450 450 1 491 104 \n", + "659585 651296305 391.500000 21.0 300 600 2 392 21 \n", + "659586 651299518 37.000000 327.0 75 75 1 37 327 \n", + "\n", + " t_ns \n", + "0 1.765786e+07 \n", + "1 1.766079e+07 \n", + "2 1.766100e+07 \n", + "3 1.766216e+07 \n", + "4 1.766246e+07 \n", + "... ... \n", + "659582 1.017648e+09 \n", + "659583 1.017649e+09 \n", + "659584 1.017650e+09 \n", + "659585 1.017650e+09 \n", + "659586 1.017655e+09 \n", + "\n", + "[659587 rows x 9 columns]\n", + "Partition duration: 0.999997634375 sec.\n" + ] + } + ], + "source": [ + "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + "\n", + "file = files[0][5:] # cut off 'file:' from beginning\n", + "\n", + "df = tpx.tpx_to_raw_df(file)\n", + "cdf = tpx.raw_df_to_cent_df(df)\n", + "\n", + "print('Raw dataframe: ')\n", + "print(df)\n", + "\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "cdf = tpx.add_centroid_cols(cdf)\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", + "print(f'Partition duration: {duration} sec.')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "48254ae7-8d0c-4766-b1d1-2b7f4732934c", + "metadata": {}, + "outputs": [], + "source": [ + "sid = 143201 # new example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2adae433-a68c-436e-9640-d4a9745af969", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3, size: 10.5 MB\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 data into dataframe...\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 975938 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5 complete. Beginning clustering...\n", + "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 659537 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5 complete. 
Moving onto next file.\n", + "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3, size: 10.5 MB\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 data into dataframe...\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 972867 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5 complete. Beginning clustering...\n", + "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 657491 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5 complete. Moving onto next file.\n" + ] + } + ], + "source": [ + "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + "files = [f.replace('file:', '') for f in files]\n", + "tpx.convert_tpx3_files(files[0:2])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "05332e5b-17eb-4074-ad92-51ef1273e9f4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw dataframe: \n", + " x y ToT t chip\n", + "0 210 132 575 11294477 3\n", + "1 106 227 375 11294540 3\n", + "2 107 227 150 11294544 3\n", + "3 381 346 475 11294882 1\n", + "4 268 444 450 11295743 1\n", + "... ... ... ... ... ...\n", + "975933 103 441 600 651291179 2\n", + "975934 193 383 450 651294115 2\n", + "975935 121 280 400 651294141 2\n", + "975936 256 373 400 651294145 1\n", + "975937 255 373 50 651294166 2\n", + "\n", + "[975938 rows x 5 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n\n", + "0 11294477 210.000000 132.0 575 575 1\n", + "1 11294540 106.285713 227.0 375 525 2\n", + "2 11294882 381.000000 346.0 475 475 1\n", + "3 11295743 268.000000 444.0 450 450 1\n", + "4 11298466 420.000000 52.0 450 450 1\n", + "... ... ... ... ... ... ..\n", + "659532 651290601 101.000000 436.0 575 575 1\n", + "659533 651291179 103.000000 441.0 600 600 1\n", + "659534 651294115 193.000000 383.0 450 450 1\n", + "659535 651294141 121.000000 280.0 400 400 1\n", + "659536 651294145 255.888885 373.0 400 450 2\n", + "\n", + "[659537 rows x 6 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n x y \\\n", + "0 11294477 210.000000 132.0 575 575 1 210 132 \n", + "1 11294540 106.285713 227.0 375 525 2 106 227 \n", + "2 11294882 383.000000 348.0 475 475 1 383 348 \n", + "3 11295743 270.000000 446.0 450 450 1 270 446 \n", + "4 11298466 422.000000 52.0 450 450 1 422 52 \n", + "... ... ... ... ... ... .. ... ... \n", + "659532 651290601 101.000000 438.0 575 575 1 101 438 \n", + "659533 651291179 103.000000 443.0 600 600 1 103 443 \n", + "659534 651294115 193.000000 385.0 450 450 1 193 385 \n", + "659535 651294141 121.000000 282.0 400 400 1 121 282 \n", + "659536 651294145 257.888885 375.0 400 450 2 258 375 \n", + "\n", + " t_ns \n", + "0 1.764762e+07 \n", + "1 1.764772e+07 \n", + "2 1.764825e+07 \n", + "3 1.764960e+07 \n", + "4 1.765385e+07 \n", + "... ... 
\n", + "659532 1.017642e+09 \n", + "659533 1.017642e+09 \n", + "659534 1.017647e+09 \n", + "659535 1.017647e+09 \n", + "659536 1.017647e+09 \n", + "\n", + "[659537 rows x 9 columns]\n", + "Partition duration: 0.99999948125 sec.\n" + ] + } + ], + "source": [ + "df = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5')\n", + "cdf = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5')\n", + "\n", + "print('Raw dataframe: ')\n", + "print(df)\n", + "\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "cdf = tpx.add_centroid_cols(cdf)\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", + "print(f'Partition duration: {duration} sec.')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c981be40-e631-4f73-bbab-4fadda8b9285", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1800/1800 [07:51<00:00, 3.82it/s]\n" + ] + } + ], + "source": [ + "try:\n", + " files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + " files = [f.replace('file:', '') for f in files]\n", + " tpx.convert_tpx3_files_parallel(files)\n", + " \n", + "except Exception as e:\n", + " print(f'Could not finish {sid} due to {e}') " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "946fa74f-8c8c-4f36-bc0c-9f203e023f78", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All raw .h5 files exist!\n", + "All centroided raw .h5 files exist!\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "raw_h5_files = [os.path.exists(f.replace('.tpx3', '.h5')) for f in files]\n", + "cent_h5_files = [os.path.exists(f.replace('.tpx3', '_cent.h5')) for f in files]\n", + "\n", + "if all(raw_h5_files):\n", + " print('All raw .h5 files exist!')\n", + "else:\n", + " print('Missing .h5 files!')\n", + " \n", + "if all(cent_h5_files):\n", + " print('All centroided raw .h5 files exist!')\n", + "else:\n", + " print('Missing centroided .h5 files!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "705c639d-dac9-4517-b5ff-ce44365ed4df", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 065d6075e07875332c8bc9e35b76fae1531709b9 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:00:02 -0400 Subject: [PATCH 09/15] FIX: adjusting timestamps before heartbeat --- tpx3awkward/_utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 8e656f0..1ea71d8 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -206,9 +206,12 @@ def _ingest_raw_data(data): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") heartbeat_lsb = None #np.uint64(0) + heartbeat_msb = None #np.uint64(0) heartbeat_time = np.uint64(0) + hb_init_flag = False # Indicate when the heartbeat was set for the first time 
photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 + for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) @@ -230,21 +233,21 @@ def _ingest_raw_data(data): y[photon_count] = _y tot[photon_count] = _tot ts[photon_count] = _ts - - if (photon_count > 0) and (_ts > ts[photon_count-1]) and (_ts - ts[photon_count-1] > 2**30): + + # Adjust timestamps that were set before the first heartbeat was received + if hb_init_flag and (photon_count > 0): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) - # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received + # Check if there is a SPIDR rollover in the beginning of the file but before the received head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) - # Compare the difference with some big number (e.g. 1/4 of SPIDR) if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += np.uint64(2**34) - # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) + hb_init_flag = False photon_count += 1 msg_run_count += 1 @@ -257,11 +260,13 @@ def _ingest_raw_data(data): # Type 4: global timestap (id'd via 0x4 upper nibble) subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: - # timer LSB, 32 bits of time + # timer LSB, 32 bits of time -- needs to be received first, before MSB heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: # timer MSB -- only matters if LSB has been received already if heartbeat_lsb is not None: + if heartbeat_msb is None: + hb_init_flag = True heartbeat_msb = ((msg >> np.uint(16)) & np.uint64(0xFFFF)) << np.uint(32) heartbeat_time = (heartbeat_msb | heartbeat_lsb) # TODO the c++ code has large jump detection, do not understand why @@ -279,7 +284,6 @@ def _ingest_raw_data(data): else: raise Exception(f"Not supported: {msg}") - # Sort the timestamps indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] From ba3d5528b087decd1037d81b33d631790ce0bf44 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:07:10 -0400 Subject: [PATCH 10/15] FIX: adjusting timestamps before heartbeat --- tpx3awkward/_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 1ea71d8..31a393d 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -246,6 +246,8 @@ def _ingest_raw_data(data): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += np.uint64(2**34) ts[:photon_count] = prev_ts + (_ts - _ts_0) + + # TODO: Do we want to adjust timestamps after SPIDR rollower even if no heartbeat was received at all? 
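# Illustrative aside (not part of the patch): the wrap check above compares the largest of the
# first few pre-heartbeat timestamps with the smallest of the last few; a 16-bit SPIDR rollover
# shows up as the head exceeding the tail by far more than any plausible exposure, e.g.
#
#     prev_ts = np.array([17179869000, 17179869100, 25, 150, 900], dtype=np.uint64)
#     head_max, tail_min = prev_ts[:2].max(), prev_ts[-2:].min()
#     head_max > tail_min and head_max - tail_min > 2**32      # True -> rollover detected
#     prev_ts[prev_ts < 2**33] += np.uint64(2**34)             # re-base the post-wrap events
#
# 2**34 is one full SPIDR period in 1.5625 ns ticks (16-bit SPIDR counter << 14 ToA bits << 4 FToA bits).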
hb_init_flag = False photon_count += 1 From 7f2b57b01aa6fcdfcc6514fd3b39a01c90147687 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:23:29 -0400 Subject: [PATCH 11/15] ENH: apply SPIDR rollover with single partition if no heartbeat is received --- tpx3awkward/_utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 31a393d..1ad253e 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -243,11 +243,9 @@ def _ingest_raw_data(data): head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): - prev_ts[prev_ts < 2**33] += np.uint64(2**34) + prev_ts[prev_ts < (tail_min+head_max)/2] += np.uint64(2**34) _ts_0 += np.uint64(2**34) ts[:photon_count] = prev_ts + (_ts - _ts_0) - - # TODO: Do we want to adjust timestamps after SPIDR rollower even if no heartbeat was received at all? hb_init_flag = False photon_count += 1 @@ -286,6 +284,13 @@ def _ingest_raw_data(data): else: raise Exception(f"Not supported: {msg}") + # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers + if heartbeat_msb is None: + head_max = max(ts[:10]) + tail_min = min(ts[-10:]) + if (head_max > tail_min) and (head_max - tail_min > 2**32): + ts[ts < (tail_min+head_max)/2] += np.uint64(2**34) + # Sort the timestamps indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] From 4dcf196476cff95e9c6f7ec0c51a44bca332ece0 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:27:32 -0400 Subject: [PATCH 12/15] ENH: apply SPIDR rollover with single partition if no heartbeat is received --- tpx3awkward/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 1ad253e..73f6d71 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -239,7 +239,7 @@ def _ingest_raw_data(data): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) - # Check if there is a SPIDR rollover in the beginning of the file but before the received + # Check if there is a SPIDR rollover in the beginning of the file before the heartbeat head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): From 4be0cee224a28ad8e6e261b1ced3adc65d2e24fe Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Sep 2024 17:59:45 -0400 Subject: [PATCH 13/15] ENH: raise warning when no heartbeat messages received in a file --- tpx3awkward/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 73f6d71..dcd1d16 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -286,6 +286,7 @@ def _ingest_raw_data(data): # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers if heartbeat_msb is None: + warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") head_max = max(ts[:10]) tail_min = min(ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): From c9f9d92f05a42b531ac7e27862a706e018cb7ada Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Sep 2024 18:12:58 -0400 Subject: [PATCH 14/15] FIX: errors in flake8 config --- .flake8 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 90019b0..10a446f 
100644 --- a/.flake8 +++ b/.flake8 @@ -8,4 +8,7 @@ exclude = tpx3awkward/_version.py, docs/source/conf.py max-line-length = 115 -ignore = E203, W503 # Ignore some style 'errors' produced while formatting by 'black' +ignore = + # Ignore some style 'errors' produced while formatting by 'black' + E203, + W503 From b2330182b727658f24010cb470da2a2f0caf669a Mon Sep 17 00:00:00 2001 From: Justin Goodrich Date: Tue, 10 Jun 2025 22:08:22 -0400 Subject: [PATCH 15/15] Updated to current production version. --- AUTHORS.rst | 2 + requirements.txt | 11 +- tpx3awkward/_utils.py | 619 +++++++++++++++++++++-------- tpx3awkward/examples/example.ipynb | 351 ---------------- 4 files changed, 460 insertions(+), 523 deletions(-) delete mode 100644 tpx3awkward/examples/example.ipynb diff --git a/AUTHORS.rst b/AUTHORS.rst index 7a93849..1187ddc 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,6 @@ Maintainer Contributors ------------ +* Thomas Caswell * Justin Goodrich +* Eugene Matviychuk diff --git a/requirements.txt b/requirements.txt index 683f27b..30caa21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,13 @@ # List required packages in this file, one per line. +os +pathlib +typing numpy numba pandas -scipy -# pyCHX +multiprocessing +functools +warnings +glob +tqdm +gc diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 7a337cb..c4d03f5 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -5,10 +5,12 @@ from numpy.typing import NDArray import numba import pandas as pd -from scipy.spatial import KDTree import multiprocessing -from tqdm import tqdm +from functools import partial import warnings +import glob +from tqdm import tqdm +import gc IA = NDArray[np.uint64] @@ -210,8 +212,8 @@ def _ingest_raw_data(data): for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) - if expected_msg_count != msg_run_count: - print("Missing messages!", msg) + #if expected_msg_count != msg_run_count: + #print("Missing messages!", msg) # extract the chip number for the following photon events chip_indx = np.uint8(get_block(msg, 8, 32)) @@ -276,12 +278,14 @@ def _ingest_raw_data(data): # TODO handle this! msg_run_count += 1 - else: - raise Exception(f"Not supported: {msg}") + #else: + #print(f"Exception 'Not supported: {msg}'") + #raise Exception(f"Not supported: {msg}") # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers if heartbeat_msb is None: - warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") + #warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") + #print("No heartbeat messages received; decoded timestamps may be inaccurate.") head_max = max(ts[:10]) tail_min = min(ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): @@ -332,7 +336,7 @@ def tpx_to_raw_df(fpath: Union[str, Path]) -> pd.DataFrame: DataFrame of raw events from the .tpx3 file. """ raw_df = pd.DataFrame(ingest_raw_data(raw_as_numpy(fpath))) - return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point, it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? + return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point? it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? 
def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: @@ -365,69 +369,88 @@ def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: DEFAULT_CLUSTER_TW = int(DEFAULT_CLUSTER_TW_MICROSECONDS * MICROSECOND / TIMESTAMP_VALUE) -def cluster_df( - df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS -) -> tuple[np.ndarray, np.ndarray]: +def cluster_df_optimized(df, tw = DEFAULT_CLUSTER_TW, radius = DEFAULT_CLUSTER_RADIUS): + events = df[["t", "x", "y", "ToT", "t"]].to_numpy() + events[:, 0] = np.floor_divide(events[:, 0], tw) # Bin timestamps into time windows + + labels = cluster_df(events, radius, tw) + + return labels, events[:, 1:] + + +@numba.jit(nopython=True, cache=True) +def cluster_df(events, radius = DEFAULT_CLUSTER_TW, tw = DEFAULT_CLUSTER_RADIUS): + n = len(events) + labels = np.full(n, -1, dtype=np.int64) + cluster_id = 0 + + max_time = radius * tw # maximum time difference allowed for clustering + radius_sq = radius ** 2 + + for i in range(n): + if labels[i] == -1: # if event is unclustered + labels[i] = cluster_id + for j in range(i + 1, n): # scan forward only + if events[j, 4] - events[i, 4] > max_time: # early exit based on time + break + # Compute squared Euclidean distance + dx = events[i, 0] - events[j, 0] + dy = events[i, 1] - events[j, 1] + dt = events[i, 2] - events[j, 2] + distance_sq = dx * dx + dy * dy + dt * dt + + if distance_sq <= radius_sq: + labels[j] = cluster_id + cluster_id += 1 + + return labels + + +@numba.jit(nopython=True, cache=True) +def group_indices(labels): """ - Uses scipy.spatial's KDTree to cluster raw input data. Requires a time window for clustering adjacent pixels and the total search radius. + Group indices by cluster ID using pre-allocated arrays in a Numba-optimized way. Parameters ---------- - df : pd.DataFrame - DataFrame with the raw data (after timesorting and ToT filtering). - tw : int - The time window to be considered "coincident" for clustering purposes - radius : int - The search radius, using Euclidean distance of x, y, timestamp/tw + labels : np.ndarray + Array of cluster labels for each event. + num_clusters : int + Number of unique clusters. + max_cluster_size : int + Maximum number of events in a single cluster. Returns ------- np.ndarray - Numpy representation of the raw events being used in the clustering. - Set[tuple[int]] - An set of tuples of the indices of the clustered events. The outer set is each cluster, and the inner tuples are the events in each cluster. - """ - events = np.array( - df[["t", "x", "y", "ToT", "t"]].values # raw data stored in dataframe. duplicate timestamp column as the first instance is windowed - ) # first three columns are for search radius of KDTree - events[:, 0] = np.floor_divide(events[:, 0], tw) # bin by the time window - tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates (t/timewindow, x, y) - neighbors = tree.query_ball_tree( - tree, radius - ) # compare tree against itself to find neighbors within the search radius - if len(neighbors) >= 2147483647: # performance is marginally better if can use int32 for the indices, so check for that - dtype = np.int64 - else: - dtype = np.int32 - return pd.DataFrame(neighbors).fillna(-1).astype(dtype).drop_duplicates().values, events[:, 1:] # a bit weird, but faster than using sets and list operators to unpack neighbors + A 2D NumPy array of shape (num_clusters, max_cluster_size), where each row corresponds to a cluster + and contains event indices padded with -1 for unused slots. 
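    Illustrative example (an aside, not part of the original docstring): for
    labels = np.array([0, 0, 1, 2, 1]) the function returns

        array([[ 0,  1],
               [ 2,  4],
               [ 3, -1]], dtype=int32)

    i.e. cluster 0 holds events 0 and 1, cluster 1 holds events 2 and 4, and cluster 2
    holds only event 3, with unused slots padded by -1.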
+ """ + num_clusters = np.max(labels) + 1 # Assume no noise, all labels are valid clusters + max_cluster_size = np.bincount(labels).max() + cluster_array = -1 * np.ones((num_clusters, max_cluster_size), dtype=np.int32) + cluster_counts = np.zeros(num_clusters, dtype=np.int32) + + for idx in range(labels.shape[0]): + cluster_idx = labels[idx] # Label is directly the cluster ID + cluster_array[cluster_idx, cluster_counts[cluster_idx]] = idx + cluster_counts[cluster_idx] += 1 + + return cluster_array @numba.jit(nopython=True, cache=True) def centroid_clusters( cluster_arr: np.ndarray, events: np.ndarray ) -> tuple[np.ndarray]: - """ - Performs the centroiding of a group of clusters using Numba. Note I originally attempted to unpack the clusters using list comprehensions, but this approach is significantly faster. - Parameters - ---------- - clusters : nd.array - The numpy representation of the clusters' event indices. - events : nd.array - The numpy represetation of the event data. - - Returns - ------- - tuple[np.ndarray] - t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. - """ num_clusters = cluster_arr.shape[0] max_cluster = cluster_arr.shape[1] t = np.zeros(num_clusters, dtype="uint64") xc = np.zeros(num_clusters, dtype="float32") yc = np.zeros(num_clusters, dtype="float32") - ToT_max = np.zeros(num_clusters, dtype="uint16") - ToT_sum = np.zeros(num_clusters, dtype="uint16") + ToT_max = np.zeros(num_clusters, dtype="uint32") + ToT_sum = np.zeros(num_clusters, dtype="uint32") n = np.zeros(num_clusters, dtype="ubyte") for cluster_id in range(num_clusters): @@ -476,70 +499,261 @@ def ingest_cent_data( } -def raw_df_to_cent_df( - df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS +def add_centroid_cols( + df: pd.DataFrame, gap: bool = True ) -> pd.DataFrame: """ - Uses functions defined herein to take Dataframe of raw data and return dataframe of clustered data. + Calculates centroid positions to the nearest pixel and the timestamp in nanoseconds. Parameters ---------- df : pd.DataFrame - Pandas DataFrame of the raw data - tw : int - The time window to be considered "coincident" for clustering purposes - radius : int - The search radius, using Euclidean distance of x, y, timestamp/tw + Input centroided dataframe + gap : bool = True + Determines whether to implement large gap correction by adding 2 empty pixels offsets Returns ------- pd.DataFrame - Pandas DataFrame of the centroided data. + Originally dataframe with new columns x, y, and t_ns added. """ - fdf = drop_zero_tot(df) - cluster_arr, events = cluster_df(fdf, tw, radius) - data = centroid_clusters(cluster_arr, events) - return pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) # should we specify the sort type here? this should be *almost* completely sorted already + if gap: + df.loc[df["xc"] >= 255.5, "xc"] += 2 + df.loc[df["yc"] >= 255.5, "yc"] += 2 + df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel + df["y"] = np.round(df["yc"]).astype(np.uint16) + df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? + return df -def add_centroid_cols( - df: pd.DataFrame, gap: bool = True -) -> pd.DataFrame: + +def add_centroid_cols_dask( + df, gap: bool = True +): """ Calculates centroid positions to the nearest pixel and the timestamp in nanoseconds. 
Parameters ---------- - df : pd.DataFrame + df : dd.DataFrame Input centroided dataframe gap : bool = True Determines whether to implement large gap correction by adding 2 empty pixels offsets Returns ------- - pd.DataFrame + dd.DataFrame Originally dataframe with new columns x, y, and t_ns added. """ if gap: - df.loc[df["xc"] >= 255.5, "xc"] += 2 - df.loc[df["yc"] >= 255.5, "yc"] += 2 - df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel - df["y"] = np.round(df["yc"]).astype(np.uint16) - df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? + df['xc'] = df['xc'].mask(cond=df['xc'] >= 255.5, other= df['xc'] + 2) + df['yc'] = df['yc'].mask(cond=df['yc'] >= 255.5, other= df['yc'] + 2) + df["x"] = dd.DataFrame.round(df["xc"]).astype(np.uint16) + df["y"] = dd.DataFrame.round(df["yc"]).astype(np.uint16) + df["t_ns"] = df["t"] * 1.5625 + #df.compute() + return df + + +def trim_corr_file(mask_fpath: str = "/nsls2/users/jgoodrich/proposals/2025-1/qmicroscope/jgoodrich/new_clustering/bool_mask_total.csv"): + """ + Load a boolean mask from a file, supporting .npy and .csv formats. + + Parameters: + ----------- + mask_fpath : str, optional + Path to the mask file. Supports `.npy` (NumPy binary) and `.csv` (comma-separated values). + Defaults to a predefined file path. + + Returns: + -------- + np.ndarray or None + A NumPy boolean array if the file is successfully loaded. + Returns `None` if the file format is unsupported. + + Notes: + ------ + - If the file is `.npy`, it is loaded using `np.load()` and converted to `bool`. + - If the file is `.csv`, it is loaded using `np.loadtxt()` with `delimiter=','` and converted to `bool`. + - Prints a message and returns `None` for unsupported file formats. + """ + if mask_fpath is None: + return None + + mask_fpath = Path(mask_fpath) + + if mask_fpath.suffix == '.npy': + return np.load(mask_fpath).astype(bool) + elif mask_fpath.suffix == '.csv': + return np.loadtxt(mask_fpath, delimiter=',').astype(bool) + else: + print("Unsupported file format. Use .npy or .csv. Returning None.") + return None + + +def trim_corr(df: pd.DataFrame, total_mask: np.ndarray) -> None: + """ + Modify df in place by subtracting 16 from column 't' where mask is True. + + Parameters + ---------- + df : pd.DataFrame + Dataframe containing 'x' and 'y' columns. + total_mask : np.ndarray + Boolean mask array indexed by (x, y). + """ + if total_mask is None: + print("Trim correction mask is None. No changes applied.") + return + + # Apply mask condition using direct NumPy indexing + df_mask = total_mask[df['x'].to_numpy(), df['y'].to_numpy()] + + # Use `.loc[]` with a boolean mask of the same length as df + df.loc[df_mask, 't'] -= 16 + return df + + +@numba.njit(cache=True, fastmath=True) +def timewalk_corr_exp(ToT, b = 167.0, c = -0.016): + return np.rint(b * np.exp(c * ToT) / 1.5625).astype(np.uint64) + + +def timewalk_corr(df: pd.DataFrame, b = 167.0, c = -0.016) -> None: + """Applies timewalk correction in place.""" + df.loc[:, 't'] -= timewalk_corr_exp(df['ToT'].to_numpy(), b, c) return df """ Functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. -""" -RAW_H5_SUFFIX = "" -CENT_H5_SUFFIX = "_cent" -CONCAT_H5_SUFFIX = "_cent" +""" +def empty_raw_df() -> pd.DataFrame: + """ + Create an empty DataFrame with the expected columns from ingest_raw_data() + and the specified data types. 
+ + Returns + ------- + pd.DataFrame + Empty DataFrame with columns: + ['x', 'y', 'ToT', 't', 'chip', 'cluster_id'] and appropriate dtypes. + """ + return pd.DataFrame({ + "x": np.array([], dtype="u2"), # uint16 + "y": np.array([], dtype="u2"), # uint16 + "ToT": np.array([], dtype="u4"), # uint32 + "t": np.array([], dtype="u8"), # uint64 + "chip": np.array([], dtype="u1"), # uint8 + "cluster_id": np.array([], dtype="u8") # uint64 + }) + + +def empty_cent_df() -> pd.DataFrame: + """ + Create an empty DataFrame with the expected columns from ingest_cent_data() + and the specified data types. + + Returns + ------- + pd.DataFrame + Empty DataFrame with columns: + ['t', 'xc', 'yc', 'ToT_max', 'ToT_sum', 'n'] and appropriate dtypes. + """ + return pd.DataFrame({ + "t": np.array([], dtype="uint64"), # uint64 + "xc": np.array([], dtype="float32"), # float32 + "yc": np.array([], dtype="float32"), # float32 + "ToT_max": np.array([], dtype="uint32"), # uint32 + "ToT_sum": np.array([], dtype="uint32"), # uint32 + "n": np.array([], dtype="u1") # uint8 (ubyte) + }) + + +def find_unmatched_tpx3_files(directory_list, reprocess = False): + + #Finds .tpx3 files in the given directories that do not have corresponding _cent.h5 files. + #Returns a list of Path objects. + + unmatched_files = [] + all_tpx3_files = [] + + for tpx3_dir in directory_list: + # Get all .tpx3 files + tpx3_files = list(Path(tpx3_dir).glob("*.tpx3")) + + if reprocess == True: + all_tpx3_files.extend(tpx3_files) + continue + + # Generate corresponding _cent.h5 file paths + h5_cent_files = [converted_path(tpx3_file, cent=True) for tpx3_file in tpx3_files] + + if not h5_cent_files: + continue + + # Find h5_dir from the first _cent.h5 file + h5_dir = h5_cent_files[0].parent + + # Get all existing _cent.h5 files in that directory + existing_h5_files = [p for p in h5_dir.glob("*_cent.h5")] + + # Check which _cent.h5 files are missingl + unmatched_files.extend(tpx3_file for tpx3_file, h5_cent_file in zip(tpx3_files, h5_cent_files) if h5_cent_file not in existing_h5_files) + + if reprocess: + return all_tpx3_files + else: + return unmatched_files + + +def converted_path(filepath, cent=False): + """ + Converts .tpx3 file path(s) to corresponding .h5 file path(s). + Handles individual strings, Path objects, lists, or numpy arrays. + + This is specific to CHX beamline pre and post data security. Is there a better way or place to store this? + + Returns Path objects. + """ + if isinstance(filepath, (list, np.ndarray)): + return [converted_path(fp, cent) for fp in filepath] + + filepath = Path(str(filepath).replace("file:", "")) + + if "/nsls2/data/chx/proposals/" in str(filepath): + h5_path = Path(str(filepath).replace("/assets/", "/Compressed_Data/").replace(".tpx3", "_cent.h5" if cent else ".h5")) + elif "/nsls2/data/chx/legacy/" in str(filepath): + h5_path = Path(str(filepath).replace(".tpx3", "_cent.h5" if cent else ".h5")) + else: + raise ValueError(f"Unknown path format: {filepath}") + + return h5_path + + +def save_df(df: pd.DataFrame, fpath: Union[str, Path]): + """ + Save a Pandas DataFrame to an HDF5 file, ensuring that all necessary directories exist. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to be saved. + fpath : Union[str, Path] + The full path to the output HDF5 file. 
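    Illustrative usage (a sketch, not part of the original docstring; the path is
    hypothetical):

        save_df(cent_df, "/tmp/example/scan_000000_cent.h5")
        # creates /tmp/example/ if needed, then writes the frame to key "df" in
        # "table" format, overwriting any existing file (mode="w")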
+ """ + fpath = Path(fpath) # Ensure fpath is a Path object + + # Create parent directories if they do not exist + fpath.parent.mkdir(parents=True, exist_ok=True) + + # Save DataFrame + df.to_hdf(fpath, key="df", format="table", mode="w") def convert_tpx_file( - fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False, overwrite: bool = True + tpx3_fpath: Union[str, Path], tw: float = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS, trim_correct: bool = None, timewalk_correct: bool = False, print_details: bool = False, overwrite: bool = True ): """ Convert a .tpx3 file into raw and centroided Pandas dataframes, which are stored in .h5 files. @@ -548,130 +762,195 @@ def convert_tpx_file( Parameters ---------- - fpath : Union[str, Path] + tpx3_fpath : Union[str, Path] .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding + tw : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in Timepix timestamp units, to perform centroiding radius : int = DEFAULT_CLUSTER_RADIUS The radius, in pixels, to perform centroiding + trim_correct : bool = False + Whether to apply trim correction + timewalk_correct : bool = False + Whether to apply timewalk correction print_details : bool = False Boolean toggle about whether to print detailed data. overwrite : bool = True Boolean toggle about whether to overwrite pre-existing data. """ - fname, ext = os.path.splitext(fpath) - dfname = "{}{}.h5".format(fname, RAW_H5_SUFFIX) - dfcname = "{}{}.h5".format(fname, CONCAT_H5_SUFFIX) - - if ext == ".tpx3" and os.path.exists(fpath): - - try: - - file_size = os.path.getsize(fpath) - have_df = os.path.exists(dfname) - have_dfc = os.path.exists(dfcname) + if isinstance(tpx3_fpath, str): + tpx3_fpath = Path(tpx3_fpath) - if have_df and have_dfc and not overwrite: - - print("-> {} exists, skipping.".format(dfname)) - - else: - - if print_details: - print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) - - time_window = time_window_microsecond * 1e-6 - time_stamp_conversion = 6.1e-12 - timedif = int(time_window / time_stamp_conversion) + if tpx3_fpath.exists(): + if tpx3_fpath.suffix == ".tpx3": - if print_details: - print("Loading {} data into dataframe...".format(fpath)) + h5_fpath = converted_path(tpx3_fpath, cent=False) + cent_h5_fpath = converted_path(tpx3_fpath, cent=True) + + try: + + tpx3_fpath_size = tpx3_fpath.stat().st_size # Get file size + have_df = h5_fpath.exists() # Check if dfname exists + have_dfc = cent_h5_fpath.exists() # Check if dfcname exists + + if have_df and have_dfc and not overwrite: - df = tpx_to_raw_df(fpath) - num_events = df.shape[0] - - if print_details: - print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) + print("-> {} already processed, skipping.".format(tpx3_fpath.name)) + return False - df.to_hdf(dfname, key="df", format="table", mode="w") + else: - if print_details: - print("Saving {} complete. 
Beginning clustering...".format(dfname)) + if print_details: + print("-> Processing {}, size: {:.1f} MB".format(tpx3_fpath.name, tpx3_fpath_size/(1024*1024))) + + if tpx3_fpath_size == 0: + num_events = 0 + else: + df = drop_zero_tot(tpx_to_raw_df(tpx3_fpath)) + num_events = df.shape[0] + + if num_events > 0: - df_c = raw_df_to_cent_df(df, timedif, radius) - - num_clusters = df_c.shape[0] + if print_details: + print("Loading {} complete. {} events found.".format(tpx3_fpath.name, num_events)) + + if trim_correct is not None: + if print_details: + print("Performing trim correction on {}".format(tpx3_fpath.name)) + df = trim_corr(df, trim_correct) + + if timewalk_correct: + if print_details: + print("Performing timewalk correction on {}".format(tpx3_fpath.name)) + df = timewalk_corr(df) + + cluster_labels, events = cluster_df_optimized(df, tw, radius) + df['cluster_id'] = cluster_labels + if print_details: + print("Clustering {} complete. {} clusters found. Saving {}...".format(tpx3_fpath.name, cluster_labels.max()+1, h5_fpath.name)) + + save_df(df, h5_fpath) + if print_details: + print("Saving {} complete. Centroiding...".format(h5_fpath.name)) + + cluster_array = group_indices(cluster_labels) + data = centroid_clusters(cluster_array, events) + + cdf = pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) + if print_details: + print("Centroiding complete. Saving to {}...".format(cent_h5_fpath.name)) + # save cdf + + save_df(cdf, cent_h5_fpath) + if print_details: + print("Saving {} complete. Checking file existence...".format(cent_h5_fpath.name)) + + if cent_h5_fpath.exists(): + if print_details: + print("Confirmed {} exists!".format(cent_h5_fpath.name)) + to_return = True + else: + if print_details: + print("WARNING: {} doesn't exist but it should?!".format(cent_h5_fpath.name)) + to_return = False + + if print_details: + print("Moving onto next file...") + + df, cdf, cluster_labels, events, cluster_array, data = None, None, None, None, None, None + gc.collect() + return to_return - if print_details: - print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) + else: + + if print_details: + print("No events found! Saving empty dataframes.") + save_df(empty_raw_df(), h5_fpath) + save_df(empty_cent_df(), cent_h5_fpath) + + gc.collect() + + return True + + except Exception as e: - df_c.to_hdf(dfcname, key="df", format="table", mode="w") - if print_details: - print("Saving {} complete. Moving onto next file.".format(dfcname)) - - except Exception as e: - + print(f"Conversion of {tpx3_fpath.name} failed due to {e.__class__.__name__}: {e}, moving on.") + return False + + else: if print_details: - print("Conversion of {} failed due to {}, moving on.".format(fpath,e)) - + print("File was not a .tpx3 file. Moving onto next file.") + return False + else: if print_details: - print("File not found. Moving onto next file.") + print("File does not exist. Moving onto next file.") + return False -def convert_tpx3_files_parallel(fpaths: Union[List[str], List[Path]], num_workers: int = None): +def convert_tpx3_files_parallel( + fpaths: Union[List[str], List[Path]], num_workers: int = None, trim_correct: Union[str, Path] = None, **kwargs +): """ - Convert a list .tpx3 files in parallel using multiprocessing and convert_tpx_file(). - - TO DO: Accept more arguments for convert_tpx_file. + Convert a list of .tpx3 files in parallel using multiprocessing and convert_tpx_file(). 
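    The body follows the usual bind-then-map pattern -- sketched here with hypothetical
    names, not as a substitute for the code below:

        worker = partial(convert_tpx_file, trim_correct=mask, **kwargs)   # bind shared arguments once
        with multiprocessing.Pool(processes=max_workers) as pool:
            results = list(tqdm(pool.imap_unordered(worker, fpaths), total=len(fpaths)))

    imap_unordered yields results as workers finish, so the progress bar advances even when
    individual files take very different times to convert.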
Parameters ---------- - fpath : Union[str, Path] - .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding - radius : int = DEFAULT_CLUSTER_RADIUS - The radius, in pixels, to perform centroiding - print_details : bool = False - Boolean toggle about whether to print detailed data. - overwrite : bool = True - Boolean toggle about whether to overwrite pre-existing data. - + fpaths : Union[List[str], List[Path]] + List of .tpx3 file paths to process. + num_workers : int, optional + Number of worker processes to use. Defaults to (CPU count - 4) to leave room for other tasks. + trim_mask_fpath : str, optional + Path to the trim correction mask. If None, no correction is applied. + **kwargs : dict + Additional keyword arguments passed to `convert_tpx_file()`. """ - if num_workers is None: - max_workers = multiprocessing.cpu_count() + if len(fpaths) > 0: + if num_workers is None: + max_workers = min(multiprocessing.cpu_count() - 4, len(fpaths)) # Leave 4 cores free + else: + max_workers = min(num_workers, len(fpaths)) # Don't use more workers than files + + # Load the mask once + trim_mask = trim_corr_file(trim_correct) + + # Pass the preloaded mask to all workers + worker_func = partial(convert_tpx_file, trim_correct=trim_mask, **kwargs) + + with multiprocessing.Pool(processes=max_workers) as pool: + results = list(tqdm(pool.imap_unordered(worker_func, fpaths), total=len(fpaths), desc="Processing files")) + + # Count successes + num_true = sum(results) else: - max_workers = num_workers - - with multiprocessing.Pool(processes=max_workers) as pool: - for _ in tqdm(pool.imap_unordered(convert_tpx_file, fpaths), total=len(fpaths)): - pass - + num_true = 0 -def convert_tpx3_files(fpaths: Union[List[str], List[Path]], print_details = True): + print(f"Successfully converted {num_true} out of {len(fpaths)}!") + + +def convert_tpx3_files( + fpaths: Union[List[str], List[Path]], trim_correct: Union[str, Path] = None, print_details: bool = True, **kwargs +): """ - Convert a list .tpx3 files in a single process using convert_tpx_file(). - - TO DO: Accept more arguments for convert_tpx_file. + Convert a list of .tpx3 files in a single process using convert_tpx_file(). Parameters ---------- - fpath : Union[str, Path] - .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding - radius : int = DEFAULT_CLUSTER_RADIUS - The radius, in pixels, to perform centroiding - print_details : bool = False - Boolean toggle about whether to print detailed data. - overwrite : bool = True - Boolean toggle about whether to overwrite pre-existing data. - + fpaths : Union[List[str], List[Path]] + List of .tpx3 file paths to process. + trim_mask_fpath : str, optional + Path to the trim correction mask. If None, no correction is applied. + print_details : bool, optional + Boolean toggle about whether to print detailed data. Default is True. + **kwargs : dict + Additional keyword arguments passed to `convert_tpx_file()`. 
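    Illustrative usage (a sketch; the directory is hypothetical and the trim mask is optional):

        todo = find_unmatched_tpx3_files(["/nsls2/data/chx/legacy/data/2024/02/05"])
        convert_tpx3_files(todo, trim_correct=None, print_details=False)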
""" - for file in fpaths: - convert_tpx_file(file, print_details = print_details) + # Load the mask once (only if provided) + trim_mask = trim_corr_file(trim_correct) + + # Process files sequentially with tqdm progress bar + for file in tqdm(fpaths, desc="Processing files"): + convert_tpx_file(file, trim_correct=trim_mask, print_details=print_details, **kwargs) + - \ No newline at end of file diff --git a/tpx3awkward/examples/example.ipynb b/tpx3awkward/examples/example.ipynb deleted file mode 100644 index 6eeabb1..0000000 --- a/tpx3awkward/examples/example.ipynb +++ /dev/null @@ -1,351 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "76010898-2082-49cb-9bff-8173fd79ad6e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import _utils as tpx\n", - "import pandas as pd\n", - "from tiled.client import from_uri\n", - "db = from_uri('https://tiled.nsls2.bnl.gov', 'dask')['chx']['raw']" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "59bb0e40-1686-413c-8595-c38655e2d0d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "sid = 143200 # example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1b629cf7-1d46-49b9-a41b-43f3b9972f4e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Raw dataframe: \n", - " x y ToT t chip\n", - "0 502 452 475 11301032 1\n", - "1 405 272 275 11302904 1\n", - "2 406 272 350 11302904 1\n", - "3 295 135 525 11303039 0\n", - "4 358 509 300 11303780 1\n", - "... ... ... ... ... ...\n", - "971254 200 46 550 651295354 3\n", - "971255 489 104 450 651296021 0\n", - "971256 389 21 300 651296305 0\n", - "971257 390 21 300 651296306 0\n", - "971258 37 325 75 651299518 2\n", - "\n", - "[971259 rows x 5 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n\n", - "0 11301032 502.000000 452.0 475 475 1\n", - "1 11302904 405.559998 272.0 350 625 2\n", - "2 11303039 295.000000 135.0 525 525 1\n", - "3 11303780 358.478271 509.0 300 575 2\n", - "4 11303973 412.409088 60.0 325 550 2\n", - "... ... ... ... ... ... ..\n", - "659582 651294809 326.000000 379.0 475 475 1\n", - "659583 651295354 200.000000 46.0 550 550 1\n", - "659584 651296021 489.000000 104.0 450 450 1\n", - "659585 651296305 389.500000 21.0 300 600 2\n", - "659586 651299518 37.000000 325.0 75 75 1\n", - "\n", - "[659587 rows x 6 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n x y \\\n", - "0 11301032 504.000000 454.0 475 475 1 504 454 \n", - "1 11302904 407.559998 274.0 350 625 2 408 274 \n", - "2 11303039 297.000000 135.0 525 525 1 297 135 \n", - "3 11303780 360.478271 511.0 300 575 2 360 511 \n", - "4 11303973 414.409088 60.0 325 550 2 414 60 \n", - "... ... ... ... ... ... .. ... ... \n", - "659582 651294809 328.000000 381.0 475 475 1 328 381 \n", - "659583 651295354 200.000000 46.0 550 550 1 200 46 \n", - "659584 651296021 491.000000 104.0 450 450 1 491 104 \n", - "659585 651296305 391.500000 21.0 300 600 2 392 21 \n", - "659586 651299518 37.000000 327.0 75 75 1 37 327 \n", - "\n", - " t_ns \n", - "0 1.765786e+07 \n", - "1 1.766079e+07 \n", - "2 1.766100e+07 \n", - "3 1.766216e+07 \n", - "4 1.766246e+07 \n", - "... ... 
\n", - "659582 1.017648e+09 \n", - "659583 1.017649e+09 \n", - "659584 1.017650e+09 \n", - "659585 1.017650e+09 \n", - "659586 1.017655e+09 \n", - "\n", - "[659587 rows x 9 columns]\n", - "Partition duration: 0.999997634375 sec.\n" - ] - } - ], - "source": [ - "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - "\n", - "file = files[0][5:] # cut off 'file:' from beginning\n", - "\n", - "df = tpx.tpx_to_raw_df(file)\n", - "cdf = tpx.raw_df_to_cent_df(df)\n", - "\n", - "print('Raw dataframe: ')\n", - "print(df)\n", - "\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "cdf = tpx.add_centroid_cols(cdf)\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", - "print(f'Partition duration: {duration} sec.')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "48254ae7-8d0c-4766-b1d1-2b7f4732934c", - "metadata": {}, - "outputs": [], - "source": [ - "sid = 143201 # new example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2adae433-a68c-436e-9640-d4a9745af969", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3, size: 10.5 MB\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 data into dataframe...\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 975938 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5 complete. Beginning clustering...\n", - "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 659537 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5 complete. Moving onto next file.\n", - "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3, size: 10.5 MB\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 data into dataframe...\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 972867 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5 complete. Beginning clustering...\n", - "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 657491 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5 complete. 
Moving onto next file.\n" - ] - } - ], - "source": [ - "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - "files = [f.replace('file:', '') for f in files]\n", - "tpx.convert_tpx3_files(files[0:2])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "05332e5b-17eb-4074-ad92-51ef1273e9f4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Raw dataframe: \n", - " x y ToT t chip\n", - "0 210 132 575 11294477 3\n", - "1 106 227 375 11294540 3\n", - "2 107 227 150 11294544 3\n", - "3 381 346 475 11294882 1\n", - "4 268 444 450 11295743 1\n", - "... ... ... ... ... ...\n", - "975933 103 441 600 651291179 2\n", - "975934 193 383 450 651294115 2\n", - "975935 121 280 400 651294141 2\n", - "975936 256 373 400 651294145 1\n", - "975937 255 373 50 651294166 2\n", - "\n", - "[975938 rows x 5 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n\n", - "0 11294477 210.000000 132.0 575 575 1\n", - "1 11294540 106.285713 227.0 375 525 2\n", - "2 11294882 381.000000 346.0 475 475 1\n", - "3 11295743 268.000000 444.0 450 450 1\n", - "4 11298466 420.000000 52.0 450 450 1\n", - "... ... ... ... ... ... ..\n", - "659532 651290601 101.000000 436.0 575 575 1\n", - "659533 651291179 103.000000 441.0 600 600 1\n", - "659534 651294115 193.000000 383.0 450 450 1\n", - "659535 651294141 121.000000 280.0 400 400 1\n", - "659536 651294145 255.888885 373.0 400 450 2\n", - "\n", - "[659537 rows x 6 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n x y \\\n", - "0 11294477 210.000000 132.0 575 575 1 210 132 \n", - "1 11294540 106.285713 227.0 375 525 2 106 227 \n", - "2 11294882 383.000000 348.0 475 475 1 383 348 \n", - "3 11295743 270.000000 446.0 450 450 1 270 446 \n", - "4 11298466 422.000000 52.0 450 450 1 422 52 \n", - "... ... ... ... ... ... .. ... ... \n", - "659532 651290601 101.000000 438.0 575 575 1 101 438 \n", - "659533 651291179 103.000000 443.0 600 600 1 103 443 \n", - "659534 651294115 193.000000 385.0 450 450 1 193 385 \n", - "659535 651294141 121.000000 282.0 400 400 1 121 282 \n", - "659536 651294145 257.888885 375.0 400 450 2 258 375 \n", - "\n", - " t_ns \n", - "0 1.764762e+07 \n", - "1 1.764772e+07 \n", - "2 1.764825e+07 \n", - "3 1.764960e+07 \n", - "4 1.765385e+07 \n", - "... ... 
\n", - "659532 1.017642e+09 \n", - "659533 1.017642e+09 \n", - "659534 1.017647e+09 \n", - "659535 1.017647e+09 \n", - "659536 1.017647e+09 \n", - "\n", - "[659537 rows x 9 columns]\n", - "Partition duration: 0.99999948125 sec.\n" - ] - } - ], - "source": [ - "df = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5')\n", - "cdf = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5')\n", - "\n", - "print('Raw dataframe: ')\n", - "print(df)\n", - "\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "cdf = tpx.add_centroid_cols(cdf)\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", - "print(f'Partition duration: {duration} sec.')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c981be40-e631-4f73-bbab-4fadda8b9285", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1800/1800 [07:51<00:00, 3.82it/s]\n" - ] - } - ], - "source": [ - "try:\n", - " files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - " files = [f.replace('file:', '') for f in files]\n", - " tpx.convert_tpx3_files_parallel(files)\n", - " \n", - "except Exception as e:\n", - " print(f'Could not finish {sid} due to {e}') " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "946fa74f-8c8c-4f36-bc0c-9f203e023f78", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All raw .h5 files exist!\n", - "All centroided raw .h5 files exist!\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "raw_h5_files = [os.path.exists(f.replace('.tpx3', '.h5')) for f in files]\n", - "cent_h5_files = [os.path.exists(f.replace('.tpx3', '_cent.h5')) for f in files]\n", - "\n", - "if all(raw_h5_files):\n", - " print('All raw .h5 files exist!')\n", - "else:\n", - " print('Missing .h5 files!')\n", - " \n", - "if all(cent_h5_files):\n", - " print('All centroided raw .h5 files exist!')\n", - "else:\n", - " print('Missing centroided .h5 files!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "705c639d-dac9-4517-b5ff-ce44365ed4df", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}
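
With the example notebook removed above, a short usage sketch may help reviewers see how the refactored entry points fit together. This is a minimal sketch and not part of the patch: the data directory and trim-mask path are placeholders, and it assumes `convert_tpx3_files_parallel` remains importable from `tpx3awkward._utils` (the deleted notebook imported that module directly). The `.h5` / `_cent.h5` output naming and the centroid columns (`t`, `xc`, `yc`, `ToT_max`, `ToT_sum`, `n`) follow the behaviour shown in the diff.

# Hypothetical driver script -- paths below are placeholders, not part of the patch.
from pathlib import Path

import pandas as pd

from tpx3awkward._utils import convert_tpx3_files_parallel  # assumed import location

tpx3_files = sorted(Path("/path/to/scan").glob("*.tpx3"))

# Parallel conversion: each worker writes <name>.h5 (raw events) and <name>_cent.h5 (centroids).
convert_tpx3_files_parallel(
    tpx3_files,
    num_workers=8,         # optional; capped at the number of files
    trim_correct=None,     # or a path to a trim mask file to enable the correction
    print_details=False,   # forwarded to convert_tpx_file() via **kwargs
)

# Inspect one centroided output; columns follow the diff: t, xc, yc, ToT_max, ToT_sum, n.
first = tpx3_files[0]
cdf = pd.read_hdf(first.with_name(first.stem + "_cent.h5"))
print(cdf.head())

The sequential variant, `convert_tpx3_files(tpx3_files, trim_correct=None)`, walks the same list in a single process with a tqdm progress bar, which can be handy when debugging one problematic partition.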