From 2c5197bc3b1cef3876bfba7358516299a76d897a Mon Sep 17 00:00:00 2001
From: Eugene M
Date: Fri, 31 May 2024 16:21:22 -0400
Subject: [PATCH 01/15] ENH: add spidr rollover

---
 requirements.txt              |  10 +-
 tpx3awkward/__init__.py       |   1 +
 tpx3awkward/_utils.py         | 210 ++++++++++++++++++++++++++++++++--
 tpx3awkward/tests/test_toa.py |  67 +++++++++++
 4 files changed, 272 insertions(+), 16 deletions(-)
 create mode 100644 tpx3awkward/tests/test_toa.py

diff --git a/requirements.txt b/requirements.txt
index d7479d5..683f27b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,6 @@
 # List required packages in this file, one per line.
 numpy
 numba
-os
 pandas
-scipy.spatial
-concurrent.futures
-multiprocessing
-time
-tdqm
-gc
-pyCHX
+scipy
+# pyCHX
diff --git a/tpx3awkward/__init__.py b/tpx3awkward/__init__.py
index 628ae79..c56bf1b 100644
--- a/tpx3awkward/__init__.py
+++ b/tpx3awkward/__init__.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from . import _version
+from ._utils import ingest_from_files


 def _get_version():
diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py
index ffdef7d..ab47d17 100644
--- a/tpx3awkward/_utils.py
+++ b/tpx3awkward/_utils.py
@@ -2,16 +2,18 @@
 import numpy as np
 from pathlib import Path
 from numpy.typing import NDArray
-from typing import TypeVar, Union, Dict, Set, List
+from typing import TypeVar, Union, Dict, Set, List, Iterable
 import numba
 import pandas as pd
 from scipy.spatial import KDTree
-import concurrent.futures
 import multiprocessing
-import time
 from tqdm import tqdm
-from pyCHX.chx_packages import db, get_sid_filenames
+import warnings
 import gc
+try:
+    from pyCHX.chx_packages import db, get_sid_filenames
+except ImportError:
+    warnings.warn("Could not import pyCHX package. Proceeding without it...")

 IA = NDArray[np.uint64]
 UnSigned = TypeVar("UnSigned", IA, np.uint64)
@@ -27,17 +29,24 @@ def raw_as_numpy(fpath: Union[str, Path]) -> IA:
     ----------

     """
-    with open(fpath, "rb") as fin:
-        return np.frombuffer(fin.read(), dtype="<u8")
+    return np.fromfile(fpath, dtype="<u8")


 def get_block(v: UnSigned, width: int, shift: int) -> UnSigned:
+    return v >> np.uint64(shift) & np.uint64(2**width - 1)


+@numba.jit(nopython=True)
+def matches_nibble(data, nibble) -> numba.boolean:
+    return (int(data) >> 60) == nibble
+
+
 @numba.jit(nopython=True)
 def is_packet_header(v: UnSigned) -> UnSigned:
+    """Identify packet headers by magic number (TPX3 as ASCII on the lowest 32 bits)"""
     return get_block(v, 32, 0) == 861425748

@@ -55,7 +64,6 @@ def classify_array(data: IA) -> NDArray[np.uint8]:
         6: frame driven data (id'd via 0xA upper nibble) (??)
""" output = np.zeros_like(data, dtype="> np.uint(44)) & np.uint(0xFFFF) + # This is laid out 16ibts which are 2 interleaved 8 bit unsigned ints + # CCCCCCCRRRRRRCRR + # |dcol ||spix|^|| + # | 7 || 6 |1|2 + # + # The high 7 bits of the column + # '1111111000000000' + dcol = (l_pix_addr & np.uint(0xFE00)) >> np.uint(8) + # The high 6 bits of the row + # '0000000111111000' + spix = (l_pix_addr & np.uint(0x01F8)) >> np.uint(1) + rowcol = _shift_xy( + chip, + # add the low 2 bits of the row + # '0000000000000011' + spix + (l_pix_addr & np.uint(0x3)), + # add the low 1 bit of the column + # '0000000000000100' + dcol + ((l_pix_addr & np.uint(0x4)) >> np.uint(2)), + ) + return rowcol[1], rowcol[0] + + +@numba.jit(nopython=True) +def get_spidr(msg): + return msg & np.uint(0xFFFF) + + +@numba.jit(nopython=True) +def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): + """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) + + Parameters + ---------- + msg (uint64): tpx3 binary package + chip (uint8): chip ID, 0..3 + spidr_epoch: SPIDR epoch at the start of the file; the counter increments every time the SPIDR clock resets + spidr_cutoff: if a file spans two SPIDR epochs, this parameter defines a midpoint between the lowest SPIDR + id in the previous epoch and the highest SPIDR id in the following epoch; i.e. all packages with + SPIDR > spidr_cutoff are guaranteed to belong to the next epoch, and vice versa. + + Returns + ---------- + Arrays of pixel coordinates and timestamps. + """ + x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code + # ToA is 14 bits + ToA = (msg >> np.uint(30)) & np.uint(0x3FFF) + # ToT is 10 bits; report in ns + ToT = ((msg >> np.uint(20)) & np.uint(0x3FF)) * 25 + # FToA is 4 bits + FToA = (msg >> np.uint(16)) & np.uint(0xF) + # SPIDR time is 16 bits + SPIDR = np.uint64(get_spidr(msg)) + # Apply the spidr_epoch rollover + SPIDR += np.uint64((spidr_epoch + (SPIDR < spidr_cutoff)) * (2**16)) + # Counter value, in multiples of 1.5625 ns + count = (((SPIDR << np.uint(14)) + ToA) << np.uint(4)) - FToA + # Phase correction + phase = np.uint((x / 2) % 16) or np.uint(16) + # Calculate the timestamp + ts = np.uint64(count + phase) + + return x, y, ToT, ts + + +@numba.jit(nopython=True) +def check_spidr_overflow(data, last_spidr=0): + """Detect if the SPIDR counter has overflowed in the current data chunk (file) + + Assumes that only single SPIDR overflow can occur within one file, i.e. the duration of the file is <27 sec. + + Arguments: + ---------- + data : NDArray[np.unint64] + The stream of raw data from the timepix3 detector + last_spidr : int + The last SPIDR value at the tail of the previous tpx3 file + + Returns: + ------- + spidr_midpoint : int + A value of the spidr counter such that any datapoints in the file with SPIDR < midpoint should be assigned + to the next SPIDR epoch (kept track of and incremented outside the function), and vice versa. + If spidr_midpoint=0, all data points in the file belong to the previous SPIDR epoch. + last_spidr : int + The highest SPIDR counter value from the new epoch observed at the end of file. 
+ """ + + head_min, head_max = 65535, 0 + tail_min, tail_max = 65535, 0 + for i in range(64): + if not is_packet_header(data[i]) and matches_nibble(data[i], 0xB): + head_min = min(get_spidr(data[i]), head_min) + head_max = max(get_spidr(data[i]), head_max) + if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): + tail_min = min(get_spidr(data[-i - 1]), tail_min) + tail_max = max(get_spidr(data[-i - 1]), tail_max) + if head_min > tail_max: + # Transition somewhere in the middle of the file + midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 + elif (head_max - head_min) > 32768: + # Large drop in the head of the file + midpoint = (65535 + np.uint32(tail_max)) // 2 # Assume the smallest SPIDR from the last epoch at the head is 65535 + elif (tail_max - tail_min) > 32768: + # Large drop in the tail of the file; find tail_max from the new epoch + tail_max = 0 + for i in range(64): + if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): + tail_spidr = get_spidr(data[-i - 1]) + if tail_spidr < 32768: + tail_max = max(tail_spidr, tail_max) + midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 + elif last_spidr > head_min: + # Transition occured between the files + midpoint = 65535 # Everything in this file is in the new SPIDR epoch + else: + midpoint = 0 + + return np.uint16(midpoint), np.uint16(tail_max) + + +@numba.jit(nopython=True) +def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): + + chips = np.zeros_like(data, dtype=np.uint8) + x = np.zeros_like(data, dtype="u2") + y = np.zeros_like(data, dtype="u2") + tot = np.zeros_like(data, dtype="u4") + ts = np.zeros_like(data, dtype="u8") + + chip_indx = 0 + spidr_cutoff, last_spidr = check_spidr_overflow(data, last_spidr=last_spidr) + i = 0 + for msg in data: + if is_packet_header(msg): + chip_indx = np.uint8(get_block(msg, 8, 32)) + elif matches_nibble(msg, 0xB): + chips[i] = chip_indx + _x, _y, _tot, _ts = decode_message(msg, chip_indx, spidr_epoch=spidr_epoch, spidr_cutoff=spidr_cutoff) + x[i] = _x + y[i] = _y + tot[i] = _tot + ts[i] = _ts + i += 1 + + # Sort the timestamps + indx = np.argsort(ts[:i], kind="mergesort") + chips = chips[indx] + x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + + if spidr_cutoff > 0: + spidr_epoch += 1 + + return x, y, tot, ts, chips, spidr_epoch, last_spidr + + @numba.jit(nopython=True) def _ingest_raw_data(data: IA): types = np.zeros_like(data, dtype=" Iterable[Dict[str, NDArray]]: + """Parse values out of a sequence of timepix3 files with rollover of SPIDR counter. + + Parameters + ---------- + fpaths : A sorted sequence of tpx3 filepaths. + + Returns + ------- + An iterable over the parsing results, each encapsulated in a dictionary + """ + + spidr_epoch, last_spidr = 0, 0 + for fpath in fpaths: + data = raw_as_numpy(fpath) + x, y, tot, ts, chips, spidr_epoch, last_spidr = _ingest_raw_data_rollover(data, spidr_epoch, last_spidr=last_spidr) + + yield { + k.strip(): v + for k, v in zip( + "x, y, tot, timestamp, chips".split(","), + (x, y, tot, ts, chips), + ) + } + + def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. 
diff --git a/tpx3awkward/tests/test_toa.py b/tpx3awkward/tests/test_toa.py new file mode 100644 index 0000000..cb590e8 --- /dev/null +++ b/tpx3awkward/tests/test_toa.py @@ -0,0 +1,67 @@ +from tpx3awkward._utils import get_block, matches_nibble, get_spidr, check_spidr_overflow + +import numpy as np +import pytest + + +@pytest.fixture(scope="function") +def data(n=10): + data = np.zeros(n, dtype=np.uint64) + return data + + +@pytest.fixture(scope="function") +def header_msg(chip=3): + return (np.uint8(chip) << np.uint(32)) + np.uint64(861425748) + + +@pytest.fixture(scope="function") +def empty_msg(): + return np.uint64(0xb) << np.uint(60) + + +def test_get_block(header_msg): + assert np.uint8(get_block(header_msg, 8, 32)) == 3 + + +def test_matches_nibble(): + msg = np.uint64(0xb) << np.uint(60) + assert matches_nibble(msg, 0xb) + + +def test_get_spidr(empty_msg): + msg1 = empty_msg + np.uint32(32768) + msg2 = empty_msg + np.uint32(65535) + assert get_spidr(msg1) == 32768 + assert get_spidr(msg2) == 65535 + +def test_check_spidr_overflow(empty_msg): + spidr_arr = [0, 0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10] + spidr_arr.extend(range(10, 65535, 3)) + spidr_arr.extend([65534, 65534, 65535, 65534, 65534, 65535, 65535, 65534]) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 0 + assert last_spidr == 65535 + + spidr_arr = list(range(20000, 65535)) + spidr_arr.extend(range(0, 10000)) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 14999 + assert last_spidr == 9999 + + spidr_arr = [65534, 65534, 65535, 65534, 65534, 65535, 0, 65535, 1, 65534, 0, 0, 0, 1, 1, 2, 3, 4, 5] + spidr_arr.extend(range(5, 2000)) + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 33767 + assert last_spidr == 1999 + + spidr_arr = list(range(20000, 65535)) + spidr_arr.extend([65534, 65534, 65535, 65534, 65534, 65535, 0, 65535, 1, 65534, 0, 0, 1, 1, 2, 3, 4, 5]) + + data = [empty_msg + np.uint32(s) for s in spidr_arr] + midpoint, last_spidr = check_spidr_overflow(data, 0) + assert midpoint == 10002 + assert last_spidr == 5 \ No newline at end of file From aaf82f64eb6d4f19eafca40b083add5b2d448bef Mon Sep 17 00:00:00 2001 From: Eugene M Date: Tue, 2 Jul 2024 09:46:22 -0400 Subject: [PATCH 02/15] ENH: use heartbeat_time instead of SPIDR_epochs --- tpx3awkward/_utils.py | 288 ++++++++---------------------------------- 1 file changed, 52 insertions(+), 236 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index ab47d17..4baad5a 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -133,21 +133,35 @@ def get_spidr(msg): @numba.jit(nopython=True) -def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): +def decode_message(msg, chip, last_ts: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) Parameters ---------- - msg (uint64): tpx3 binary package + msg (uint64): tpx3 binary message chip (uint8): chip ID, 0..3 - spidr_epoch: SPIDR epoch at the start of the file; the counter increments every time the SPIDR clock resets - spidr_cutoff: if a file spans two SPIDR epochs, this parameter defines a midpoint between the lowest SPIDR - id in the previous epoch and the highest SPIDR id in the following epoch; i.e. 
all packages with - SPIDR > spidr_cutoff are guaranteed to belong to the next epoch, and vice versa. + last_ts (uint64): last timestamp from the previous message; defines the SPIDR rollover + + # bit position : ... 44 40 36 32 28 24 20 16 12 8 7 4 3 0 + # 0xFFFFC0000000 : 1111 1111 1111 1111 1100 0000 0000 0000 0000 0000 0000 0000 + # 0x3FFFFFFF : 0000 0000 0000 0000 0011 1111 1111 1111 1111 1111 1111 1111 + # SPIDR : ssss ssss ssss ssss ssss + # ToA : tt tttt tttt + # ToA_coarse : ss ssss ssss ssss ssss sstt tttt tttt + # pixel_bits : ^^ + # FToA : ffff + # count : ss ssss ssss ssss ssss sstt tttt tttt ffff (FToA is subtracted) + # phase : pppp + # 0x10000000 : 1 0000 0000 0000 0000 0000 0000 0000 + # heartbeat_time : hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh hhhh + # heartbeat_bits : ^^ + # global_time : hhhh hhhh hhhh hhss ssss ssss ssss ssss sstt tttt tttt ffff + + # count = (ToA_coarse << np.uint(4)) - FToA # Counter value, in multiples of 1.5625 ns Returns ---------- - Arrays of pixel coordinates and timestamps. + Arrays of pixel coordinates, ToT, and timestamps. """ x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code # ToA is 14 bits @@ -158,76 +172,34 @@ def decode_message(msg, chip, spidr_epoch=0, spidr_cutoff=0): FToA = (msg >> np.uint(16)) & np.uint(0xF) # SPIDR time is 16 bits SPIDR = np.uint64(get_spidr(msg)) - # Apply the spidr_epoch rollover - SPIDR += np.uint64((spidr_epoch + (SPIDR < spidr_cutoff)) * (2**16)) - # Counter value, in multiples of 1.5625 ns - count = (((SPIDR << np.uint(14)) + ToA) << np.uint(4)) - FToA + + ToA_coarse = (SPIDR << np.uint(14)) | ToA + # pixel_bits are the two highest bits of the SPIDR (i.e. the pixelbits range covers 262143 spidr cycles) + pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) + # heart_bits are the bits at the same positions in the heartbeat_time + heartbeat_time = np.uint64(last_ts) + heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) + # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits + diff = heart_bits - pixel_bits + # diff +/-1 occur when pixelbits step up + # diff +/-3 occur when spidr counter overfills + # diff can also be 0 -- then nothing happens -- but never +/-2 + if diff == 1 or diff == -3: + heartbeat_time -= np.uint(0x10000000) + elif diff == -1 or diff == 3: + heartbeat_time += np.uint(0x10000000) + # Construct globaltime + global_time = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) - # Calculate the timestamp - ts = np.uint64(count + phase) + # Construct timestamp with phase correction + ts = (global_time << np.uint(4)) - FToA + phase return x, y, ToT, ts @numba.jit(nopython=True) -def check_spidr_overflow(data, last_spidr=0): - """Detect if the SPIDR counter has overflowed in the current data chunk (file) - - Assumes that only single SPIDR overflow can occur within one file, i.e. the duration of the file is <27 sec. - - Arguments: - ---------- - data : NDArray[np.unint64] - The stream of raw data from the timepix3 detector - last_spidr : int - The last SPIDR value at the tail of the previous tpx3 file - - Returns: - ------- - spidr_midpoint : int - A value of the spidr counter such that any datapoints in the file with SPIDR < midpoint should be assigned - to the next SPIDR epoch (kept track of and incremented outside the function), and vice versa. 
- If spidr_midpoint=0, all data points in the file belong to the previous SPIDR epoch. - last_spidr : int - The highest SPIDR counter value from the new epoch observed at the end of file. - """ - - head_min, head_max = 65535, 0 - tail_min, tail_max = 65535, 0 - for i in range(64): - if not is_packet_header(data[i]) and matches_nibble(data[i], 0xB): - head_min = min(get_spidr(data[i]), head_min) - head_max = max(get_spidr(data[i]), head_max) - if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): - tail_min = min(get_spidr(data[-i - 1]), tail_min) - tail_max = max(get_spidr(data[-i - 1]), tail_max) - if head_min > tail_max: - # Transition somewhere in the middle of the file - midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 - elif (head_max - head_min) > 32768: - # Large drop in the head of the file - midpoint = (65535 + np.uint32(tail_max)) // 2 # Assume the smallest SPIDR from the last epoch at the head is 65535 - elif (tail_max - tail_min) > 32768: - # Large drop in the tail of the file; find tail_max from the new epoch - tail_max = 0 - for i in range(64): - if not is_packet_header(data[-i - 1]) and matches_nibble(data[-i - 1], 0xB): - tail_spidr = get_spidr(data[-i - 1]) - if tail_spidr < 32768: - tail_max = max(tail_spidr, tail_max) - midpoint = (np.uint32(head_min) + np.uint32(tail_max)) // 2 - elif last_spidr > head_min: - # Transition occured between the files - midpoint = 65535 # Everything in this file is in the new SPIDR epoch - else: - midpoint = 0 - - return np.uint16(midpoint), np.uint16(tail_max) - - -@numba.jit(nopython=True) -def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): +def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): chips = np.zeros_like(data, dtype=np.uint8) x = np.zeros_like(data, dtype="u2") @@ -235,19 +207,17 @@ def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - chip_indx = 0 - spidr_cutoff, last_spidr = check_spidr_overflow(data, last_spidr=last_spidr) - i = 0 + i, chip_indx = 0, 0 for msg in data: if is_packet_header(msg): chip_indx = np.uint8(get_block(msg, 8, 32)) elif matches_nibble(msg, 0xB): chips[i] = chip_indx - _x, _y, _tot, _ts = decode_message(msg, chip_indx, spidr_epoch=spidr_epoch, spidr_cutoff=spidr_cutoff) + _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) x[i] = _x y[i] = _y tot[i] = _tot - ts[i] = _ts + ts[i] = last_ts = _ts i += 1 # Sort the timestamps @@ -255,158 +225,7 @@ def _ingest_raw_data_rollover(data: IA, spidr_epoch=0, last_spidr=0): chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] - if spidr_cutoff > 0: - spidr_epoch += 1 - - return x, y, tot, ts, chips, spidr_epoch, last_spidr - - -@numba.jit(nopython=True) -def _ingest_raw_data(data: IA): - types = np.zeros_like(data, dtype="> np.uint(60) - # probably a better way to do this, but brute force! 
- types[~is_header & (nibble == 0xB)] = 2 - types[~is_header & (nibble == 0x6)] = 3 - types[~is_header & (nibble == 0x4)] = 4 - types[~is_header & (nibble == 0x7)] = 5 - - # sort out how many photons we have - total_photons = np.sum(types == 2) - - # allocate the return arrays - x = np.zeros(total_photons, dtype="u2") - y = np.zeros(total_photons, dtype="u2") - pix_addr = np.zeros(total_photons, dtype="u2") - ToA = np.zeros(total_photons, dtype="u2") - ToT = np.zeros(total_photons, dtype="u4") - FToA = np.zeros(total_photons, dtype="u2") - SPIDR = np.zeros(total_photons, dtype="u2") - chip_number = np.zeros(total_photons, dtype="u1") - basetime = np.zeros(total_photons, dtype="u8") - timestamp = np.zeros(total_photons, dtype="u8") - - photon_offset = 0 - chip = np.uint16(0) - expected_msg_count = np.uint16(0) - msg_run_count = np.uint(0) - - heartbeat_lsb = np.uint64(0) - heartbeat_msb = np.uint64(0) - heartbeat_time = np.uint64(0) - # loop over the packet headers (can not vectorize this with numpy) - for j in range(len(data)): - msg = data[j] - typ = types[j] - if typ == 1: - # 1: packet header (id'd via TPX3 magic number) - if expected_msg_count != msg_run_count: - print("missing messages!", msg) - # extract scalar information from the header - - # "number of pixels in chunk" is given in bytes not words - # and means all words in the chunk, not just "photons" - expected_msg_count = get_block(msg, 16, 48) // 8 - # what chip we are on - chip = np.uint8(get_block(msg, 8, 32)) - msg_run_count = 0 - elif typ == 2 or typ == 6: - # 2: photon event (id'd via 0xB upper nibble) - # 6: frame driven data (id'd via 0xA upper nibble) (??) - - # | - - # pixAddr is 16 bits - # these names and math are adapted from c++ code - l_pix_addr = pix_addr[photon_offset] = (msg >> np.uint(44)) & np.uint(0xFFFF) - # This is laid out 16ibts which are 2 interleaved 8 bit unsigned ints - # CCCCCCCRRRRRRCRR - # |dcol ||spix|^|| - # | 7 || 6 |1|2 - # - # The high 7 bits of the column - # '1111111000000000' - dcol = (l_pix_addr & np.uint(0xFE00)) >> np.uint(8) - # The high 6 bits of the row - # '0000000111111000' - spix = (l_pix_addr & np.uint(0x01F8)) >> np.uint(1) - rowcol = _shift_xy( - chip, - # add the low 2 bits of the row - # '0000000000000011' - spix + (l_pix_addr & np.uint(0x3)), - # add the low 1 bit of the column - # '0000000000000100' - dcol + ((l_pix_addr & np.uint(0x4)) >> np.uint(2)), - ) - col = x[photon_offset] = rowcol[1] - y[photon_offset] = rowcol[0] - # ToA is 14 bits - ToA[photon_offset] = (msg >> np.uint(30)) & np.uint(0x3FFF) - # ToT is 10 bits - # report in ns - ToT[photon_offset] = ((msg >> np.uint(20)) & np.uint(0x3FF)) * 25 - # FToA is 4 bits - l_FToA = FToA[photon_offset] = (msg >> np.uint(16)) & np.uint(0xF) - # SPIDR time is 16 bits - SPIDR[photon_offset] = msg & np.uint(0xFFFF) - # chip number (this is a constant) - chip_number[photon_offset] = chip - # heartbeat time - basetime[photon_offset] = heartbeat_time - - ToA_coarse = (SPIDR[photon_offset] << np.uint(14)) | ToA[photon_offset] - pixelbits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) - heartbeat_time_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) - diff = heartbeat_time_bits - pixelbits - if diff == 1 or diff == -3: - heartbeat_time -= np.uint(0x10000000) - elif diff == -1 or diff == 3: - heartbeat_time += np.uint(0x10000000) - globaltime = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) - - timestamp[photon_offset] = (globaltime << np.uint(12)) - (l_FToA << np.uint(8)) - # correct 
for phase shift - phase = np.uint((col / 2) % 16) - if phase == 0: - timestamp[photon_offset] += 16 << 8 - else: - timestamp[photon_offset] += phase << 8 - - photon_offset += 1 - msg_run_count += 1 - elif typ == 3: - # 3: TDC timstamp (id'd via 0x6 upper nibble) - # TODO: handle these! - msg_run_count += 1 - elif typ == 4: - # 4: global timestap (id'd via 0x4 upper nibble) - subheader = (msg >> np.uint(56)) & np.uint(0x0F) - if subheader == 0x4: - # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) - elif subheader == 0x5: - # timer msb - - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - # TODO the c++ code has large jump detection, do not understand why - heartbeat_time = heartbeat_msb | heartbeat_lsb - else: - raise Exception("unknown header") - - msg_run_count += 1 - elif typ == 5: - # 5: "command" data (id'd via 0x7 upper nibble) - # TODO handle this! - msg_run_count += 1 - else: - raise Exception("Not supported") - - return x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number, basetime, timestamp + return x, y, tot, ts, chips def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -421,11 +240,11 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr An iterable over the parsing results, each encapsulated in a dictionary """ - spidr_epoch, last_spidr = 0, 0 + last_ts = 0 for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips, spidr_epoch, last_spidr = _ingest_raw_data_rollover(data, spidr_epoch, last_spidr=last_spidr) - + x, y, tot, ts, chips = _ingest_raw_data(data, last_ts = last_ts) + last_ts = ts[-1] yield { k.strip(): v for k, v in zip( @@ -435,7 +254,7 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr } -def ingest_raw_data(data: IA) -> Dict[str, NDArray]: +def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -447,14 +266,11 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: Returns ------- Dict[str, NDArray] - Keys of x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number + Keys of x, y, ToT, chip_number """ return { k.strip(): v - for k, v in zip( - "x, y, pix_addr, ToA, ToT, FToA, SPIDR, chip_number, basetime, timestamp".split(","), - _ingest_raw_data(data), - ) + for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data, last_ts=last_ts)) } # ^-- tom wrote From d01bf0c397a7c3f92d930be36df2816920604fd5 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 19 Jul 2024 13:43:48 -0400 Subject: [PATCH 03/15] Register all message types --- tpx3awkward/_utils.py | 75 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 4baad5a..c1ba420 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -198,6 +198,34 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): return x, y, ToT, ts +@numba.jit(nopython=True) +def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): + """Decode a heartbeat time message (idetified by its 0x4 upper nibble) + + Parameters + ---------- + msg (uint64): tpx3 binary message with upper_nibble == 0x4 + + Returns + ---------- + Integer value of the heartbeat time. 
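+
+    Example (illustrative): a subheader-0x4 packet only caches the 32-bit LSB of the timer; the next
+    subheader-0x5 packet shifts its 16-bit payload left by 32 bits and ORs it with the cached LSB to
+    form the full heartbeat time.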
+ """ + subheader = (msg >> np.uint(56)) & np.uint(0x0F) + if subheader == 0x4: + # timer lsb, 32 bits of time + heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + elif subheader == 0x5: + # timer msb + time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + heartbeat_msb = time_msg << np.uint(32) + # TODO the c++ code has large jump detection, do not understand why + heartbeat_time = heartbeat_msb | heartbeat_lsb + else: + raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") + + return heartbeat_lsb, heartbeat_time + + @numba.jit(nopython=True) def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): @@ -207,21 +235,52 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - i, chip_indx = 0, 0 + photon_count, chip_indx, msg_run_count = 0, 0, 0 for msg in data: if is_packet_header(msg): + # Type 1: packet header (id'd via TPX3 magic number) + if expected_msg_count != msg_run_count: + print("Missing messages!", msg) + + # extract the chip number for the following photon events chip_indx = np.uint8(get_block(msg, 8, 32)) + + # "number of pixels in chunk" is given in bytes not words and means all words in the chunk, not just "photons" + expected_msg_count = get_block(msg, 16, 48) // 8 + msg_run_count = 0 + elif matches_nibble(msg, 0xB): - chips[i] = chip_indx + # Type 2: photon event (id'd via 0xB upper nibble) + chips[photon_count] = chip_indx _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) - x[i] = _x - y[i] = _y - tot[i] = _tot - ts[i] = last_ts = _ts - i += 1 + x[photon_count] = _x + y[photon_count] = _y + tot[photon_count] = _tot + ts[photon_count] = last_ts = _ts + photon_count += 1 + msg_run_count += 1 + + elif matches_nibble(msg, 0x6): + # Type 3: TDC timstamp (id'd via 0x6 upper nibble) + # TODO: handle these! + msg_run_count += 1 + + elif matches_nibble(msg, 0x4): + # Type 4: global timestap (id'd via 0x4 upper nibble) + heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, heartbeat_lsb) + msg_run_count += 1 + + elif matches_nibble(msg, 0x7): + # Type 5: "command" data (id'd via 0x7 upper nibble) + # TODO handle this! 
+ msg_run_count += 1 + + else: + raise Exception(f"Not supported: {msg}") + # Sort the timestamps - indx = np.argsort(ts[:i], kind="mergesort") + indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] From 08d896c568f182affbef3bd5cdc3f0577186895d Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 19 Jul 2024 16:12:49 -0400 Subject: [PATCH 04/15] FIX: missing initializations and dtypes --- tpx3awkward/_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index c1ba420..dbcf8bc 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -199,7 +199,7 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): @numba.jit(nopython=True) -def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): +def decode_heartbeat(msg: np.uint64, heartbeat_lsb: np.uint64 = np.uint64(0)): """Decode a heartbeat time message (idetified by its 0x4 upper nibble) Parameters @@ -214,10 +214,12 @@ def decode_heartbeat(msg, heartbeat_lsb=np.uint64(0)): if subheader == 0x4: # timer lsb, 32 bits of time heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + heartbeat_time = None elif subheader == 0x5: # timer msb time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) heartbeat_msb = time_msg << np.uint(32) + print(type(heartbeat_msb), type(heartbeat_lsb)) # TODO the c++ code has large jump detection, do not understand why heartbeat_time = heartbeat_msb | heartbeat_lsb else: @@ -234,8 +236,9 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") + heartbeat_lsb=np.uint64(0) - photon_count, chip_indx, msg_run_count = 0, 0, 0 + photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) @@ -267,7 +270,7 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, heartbeat_lsb) + heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, np.uint64(heartbeat_lsb)) msg_run_count += 1 elif matches_nibble(msg, 0x7): From 6ffe1534ff16fa1abf134af54278a093306544a0 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 24 Jul 2024 18:36:49 -0400 Subject: [PATCH 05/15] ENH: check for spidr rollover with global timestamp --- tpx3awkward/_utils.py | 93 +++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 48 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index dbcf8bc..42668fc 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -133,14 +133,14 @@ def get_spidr(msg): @numba.jit(nopython=True) -def decode_message(msg, chip, last_ts: np.uint64 = 0): +def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) Parameters ---------- msg (uint64): tpx3 binary message chip (uint8): chip ID, 0..3 - last_ts (uint64): last timestamp from the previous message; defines the SPIDR rollover + heartbeat_time (uint64): # bit position : ... 44 40 36 32 28 24 20 16 12 8 7 4 3 0 # 0xFFFFC0000000 : 1111 1111 1111 1111 1100 0000 0000 0000 0000 0000 0000 0000 @@ -177,7 +177,6 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): # pixel_bits are the two highest bits of the SPIDR (i.e. 
the pixelbits range covers 262143 spidr cycles) pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) # heart_bits are the bits at the same positions in the heartbeat_time - heartbeat_time = np.uint64(last_ts) heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits diff = heart_bits - pixel_bits @@ -189,7 +188,7 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): elif diff == -1 or diff == 3: heartbeat_time += np.uint(0x10000000) # Construct globaltime - global_time = (heartbeat_time & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) + global_time = (np.uint64(heartbeat_time) & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) # Construct timestamp with phase correction @@ -198,45 +197,18 @@ def decode_message(msg, chip, last_ts: np.uint64 = 0): return x, y, ToT, ts -@numba.jit(nopython=True) -def decode_heartbeat(msg: np.uint64, heartbeat_lsb: np.uint64 = np.uint64(0)): - """Decode a heartbeat time message (idetified by its 0x4 upper nibble) - - Parameters - ---------- - msg (uint64): tpx3 binary message with upper_nibble == 0x4 - - Returns - ---------- - Integer value of the heartbeat time. - """ - subheader = (msg >> np.uint(56)) & np.uint(0x0F) - if subheader == 0x4: - # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) - heartbeat_time = None - elif subheader == 0x5: - # timer msb - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - print(type(heartbeat_msb), type(heartbeat_lsb)) - # TODO the c++ code has large jump detection, do not understand why - heartbeat_time = heartbeat_msb | heartbeat_lsb - else: - raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") - - return heartbeat_lsb, heartbeat_time - - -@numba.jit(nopython=True) -def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): +# @numba.jit(nopython=True) +def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) x = np.zeros_like(data, dtype="u2") y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - heartbeat_lsb=np.uint64(0) + heartbeat_lsb = np.uint64(0) + heartbeat_msb = np.uint64(0) + heartbeat_time = np.uint64(0) + hb = np.zeros_like(data, dtype="u8") photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: @@ -255,11 +227,23 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx - _x, _y, _tot, _ts = decode_message(msg, chip_indx, last_ts = last_ts) + _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y tot[photon_count] = _tot - ts[photon_count] = last_ts = _ts + ts[photon_count] = _ts + + if (photon_count > 0) and (_ts - ts[photon_count-1] > 2**30): + prev_ts = ts[:photon_count] # This portion needs to be adjusted + # Find what the current timestamp would be without global heartbeat + _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) + # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received + if int(max(prev_ts[:10])) - int(min(prev_ts[-10:])) > 2**32: + prev_ts[prev_ts < 2**33] += np.uint64(2**34) + _ts_0 += 2**34 + # Ajust the existing timestamps + 
ts[:photon_count] = prev_ts + (_ts - _ts_0) + photon_count += 1 msg_run_count += 1 @@ -270,8 +254,22 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - heartbeat_lsb, heartbeat_time = decode_heartbeat(msg, np.uint64(heartbeat_lsb)) + subheader = (msg >> np.uint(56)) & np.uint(0x0F) + if subheader == 0x4: + # timer lsb, 32 bits of time + heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + elif subheader == 0x5: + # timer msb + time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + heartbeat_msb = time_msg << np.uint(32) + heartbeat_time = (heartbeat_msb | heartbeat_lsb) # << np.uint(4) + # TODO the c++ code has large jump detection, do not understand why + else: + raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") + pass + msg_run_count += 1 + hb[photon_count] = heartbeat_time elif matches_nibble(msg, 0x7): # Type 5: "command" data (id'd via 0x7 upper nibble) @@ -286,8 +284,9 @@ def _ingest_raw_data(data: IA, last_ts: np.uint64 = 0): indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + hb = hb[indx] - return x, y, tot, ts, chips + return x, y, tot, ts, chips, hb def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -302,21 +301,19 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr An iterable over the parsing results, each encapsulated in a dictionary """ - last_ts = 0 for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips = _ingest_raw_data(data, last_ts = last_ts) - last_ts = ts[-1] + x, y, tot, ts, chips, hb = _ingest_raw_data(data) yield { k.strip(): v for k, v in zip( - "x, y, tot, timestamp, chips".split(","), - (x, y, tot, ts, chips), + "x, y, tot, timestamp, chips, hb".split(","), + (x, y, tot, ts, chips, hb), ) } -def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: +def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -332,7 +329,7 @@ def ingest_raw_data(data: IA, last_ts: np.uint64 = 0) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data, last_ts=last_ts)) + for k, v in zip("x, y, ToT, ts, chip_number, hb".split(","), _ingest_raw_data(data)) } # ^-- tom wrote From b125687bf7322a2fdd69e492cc890e0ab93036a6 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Wed, 24 Jul 2024 19:57:28 -0400 Subject: [PATCH 06/15] FIX: int overflow errors, force uint64 dtype --- tpx3awkward/_utils.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 42668fc..6de53c5 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -163,6 +163,7 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): ---------- Arrays of pixel coordinates, ToT, and timestamps. """ + msg, heartbeat_time = np.uint64(msg), np.uint64(heartbeat_time) # Force types x, y = decode_xy(msg, chip) # or use x1, y1 = calculateXY(msg, chip) from the Vendor's code # ToA is 14 bits ToA = (msg >> np.uint(30)) & np.uint(0x3FFF) @@ -175,20 +176,20 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): ToA_coarse = (SPIDR << np.uint(14)) | ToA # pixel_bits are the two highest bits of the SPIDR (i.e. 
the pixelbits range covers 262143 spidr cycles) - pixel_bits = int((ToA_coarse >> np.uint(28)) & np.uint(0x3)) + pixel_bits = np.int8((ToA_coarse >> np.uint(28)) & np.uint(0x3)) # heart_bits are the bits at the same positions in the heartbeat_time - heart_bits = int((heartbeat_time >> np.uint(28)) & np.uint(0x3)) + heart_bits = np.int8((heartbeat_time >> np.uint(28)) & np.uint(0x3)) # Adjust heartbeat_time based on the difference between heart_bits and pixel_bits diff = heart_bits - pixel_bits # diff +/-1 occur when pixelbits step up # diff +/-3 occur when spidr counter overfills # diff can also be 0 -- then nothing happens -- but never +/-2 - if diff == 1 or diff == -3: + if (diff == 1 or diff == -3) and (heartbeat_time > np.uint(0x10000000)): heartbeat_time -= np.uint(0x10000000) elif diff == -1 or diff == 3: heartbeat_time += np.uint(0x10000000) # Construct globaltime - global_time = (np.uint64(heartbeat_time) & np.uint(0xFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) + global_time = (heartbeat_time & np.uint(0xFFFFFFFC0000000)) | (ToA_coarse & np.uint(0x3FFFFFFF)) # Phase correction phase = np.uint((x / 2) % 16) or np.uint(16) # Construct timestamp with phase correction @@ -196,8 +197,7 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): return x, y, ToT, ts - -# @numba.jit(nopython=True) +@numba.jit(nopython=True) def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) @@ -208,7 +208,6 @@ def _ingest_raw_data(data): heartbeat_lsb = np.uint64(0) heartbeat_msb = np.uint64(0) heartbeat_time = np.uint64(0) - hb = np.zeros_like(data, dtype="u8") photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 for msg in data: @@ -227,21 +226,24 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx + # heartbeat_time = np.uint64(0) _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y tot[photon_count] = _tot ts[photon_count] = _ts - if (photon_count > 0) and (_ts - ts[photon_count-1] > 2**30): + if (photon_count > 0) and (_ts > ts[photon_count-1]) and (_ts - ts[photon_count-1] > 2**30): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received - if int(max(prev_ts[:10])) - int(min(prev_ts[-10:])) > 2**32: + head_max = max(prev_ts[:10]) + tail_min = min(prev_ts[-10:]) + if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += 2**34 - # Ajust the existing timestamps + # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) photon_count += 1 @@ -254,13 +256,13 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0x4): # Type 4: global timestap (id'd via 0x4 upper nibble) - subheader = (msg >> np.uint(56)) & np.uint(0x0F) + subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: # timer lsb, 32 bits of time - heartbeat_lsb = (msg >> np.uint(16)) & np.uint(0xFFFFFFFF) + heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: # timer msb - time_msg = (msg >> np.uint(16)) & np.uint(0xFFFF) + time_msg = (msg >> np.uint(16)) & np.uint64(0xFFFF) heartbeat_msb = time_msg << np.uint(32) heartbeat_time = (heartbeat_msb | 
heartbeat_lsb) # << np.uint(4) # TODO the c++ code has large jump detection, do not understand why @@ -269,7 +271,6 @@ def _ingest_raw_data(data): pass msg_run_count += 1 - hb[photon_count] = heartbeat_time elif matches_nibble(msg, 0x7): # Type 5: "command" data (id'd via 0x7 upper nibble) @@ -284,9 +285,8 @@ def _ingest_raw_data(data): indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] - hb = hb[indx] - return x, y, tot, ts, chips, hb + return x, y, tot, ts, chips def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: @@ -303,12 +303,12 @@ def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDAr for fpath in fpaths: data = raw_as_numpy(fpath) - x, y, tot, ts, chips, hb = _ingest_raw_data(data) + x, y, tot, ts, chips = _ingest_raw_data(data) yield { k.strip(): v for k, v in zip( - "x, y, tot, timestamp, chips, hb".split(","), - (x, y, tot, ts, chips, hb), + "x, y, tot, timestamp, chips".split(","), + (x, y, tot, ts, chips), ) } @@ -329,7 +329,7 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number, hb".split(","), _ingest_raw_data(data)) + for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data)) } # ^-- tom wrote From 0a942ae45c9bcfd1613aa4ef278bc9da236b2037 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 25 Jul 2024 09:58:14 -0400 Subject: [PATCH 07/15] FIX: timer ensure timer LSB is initialized --- tpx3awkward/_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 6de53c5..8e656f0 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -205,8 +205,7 @@ def _ingest_raw_data(data): y = np.zeros_like(data, dtype="u2") tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") - heartbeat_lsb = np.uint64(0) - heartbeat_msb = np.uint64(0) + heartbeat_lsb = None #np.uint64(0) heartbeat_time = np.uint64(0) photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 @@ -226,7 +225,6 @@ def _ingest_raw_data(data): elif matches_nibble(msg, 0xB): # Type 2: photon event (id'd via 0xB upper nibble) chips[photon_count] = chip_indx - # heartbeat_time = np.uint64(0) _x, _y, _tot, _ts = decode_message(msg, chip_indx, heartbeat_time=heartbeat_time) x[photon_count] = _x y[photon_count] = _y @@ -240,9 +238,10 @@ def _ingest_raw_data(data): # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) + # Compare the difference with some big number (e.g. 
1/4 of SPIDR) if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) - _ts_0 += 2**34 + _ts_0 += np.uint64(2**34) # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) @@ -258,14 +257,14 @@ def _ingest_raw_data(data): # Type 4: global timestap (id'd via 0x4 upper nibble) subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: - # timer lsb, 32 bits of time + # timer LSB, 32 bits of time heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: - # timer msb - time_msg = (msg >> np.uint(16)) & np.uint64(0xFFFF) - heartbeat_msb = time_msg << np.uint(32) - heartbeat_time = (heartbeat_msb | heartbeat_lsb) # << np.uint(4) - # TODO the c++ code has large jump detection, do not understand why + # timer MSB -- only matters if LSB has been received already + if heartbeat_lsb is not None: + heartbeat_msb = ((msg >> np.uint(16)) & np.uint64(0xFFFF)) << np.uint(32) + heartbeat_time = (heartbeat_msb | heartbeat_lsb) + # TODO the c++ code has large jump detection, do not understand why else: raise Exception(f"Unknown subheader {subheader} in the Global Timestamp message") pass From 7677757d7f68dcf57579f74733b15890505dc7ca Mon Sep 17 00:00:00 2001 From: Justin Goodrich Date: Mon, 29 Jul 2024 23:13:51 -0400 Subject: [PATCH 08/15] Miscallenous updates to tpx3awkward. Various clean-ups, optimized handling of KDTree neighbor list, updated conversion functions, and removed file-finding logic specific to CHX's current ophyd object. --- tpx3awkward/_utils.py | 611 ++++++++--------------------- tpx3awkward/examples/example.ipynb | 351 +++++++++++++++++ 2 files changed, 508 insertions(+), 454 deletions(-) create mode 100644 tpx3awkward/examples/example.ipynb diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 8e656f0..7f6498a 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -1,19 +1,15 @@ import os -import numpy as np from pathlib import Path +from typing import TypeVar, Union, Dict, List, Tuple +import numpy as np from numpy.typing import NDArray -from typing import TypeVar, Union, Dict, Set, List, Iterable import numba import pandas as pd from scipy.spatial import KDTree import multiprocessing from tqdm import tqdm import warnings -import gc -try: - from pyCHX.chx_packages import db, get_sid_filenames -except ImportError: - warnings.warn("Could not import pyCHX package. Proceeding without it...") + IA = NDArray[np.uint64] UnSigned = TypeVar("UnSigned", IA, np.uint64) @@ -29,28 +25,26 @@ def raw_as_numpy(fpath: Union[str, Path]) -> IA: ---------- """ - return np.fromfile(fpath, dtype=" UnSigned: - return v >> np.uint64(shift) & np.uint64(2**width - 1) -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def matches_nibble(data, nibble) -> numba.boolean: return (int(data) >> 60) == nibble -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def is_packet_header(v: UnSigned) -> UnSigned: """Identify packet headers by magic number (TPX3 as ascii on lowest 8 bytes]""" return get_block(v, 32, 0) == 861425748 -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def classify_array(data: IA) -> NDArray[np.uint8]: """ Create an array the same size as the data classifying 64bit uint by type. 
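A quick, illustrative sanity check of the packet-header helpers above (not part of the patch): the magic
number compared against in is_packet_header is simply "TPX3" read as a little-endian 32-bit integer.

    import numpy as np
    from tpx3awkward._utils import is_packet_header

    assert int.from_bytes(b"TPX3", "little") == 861425748
    assert is_packet_header(np.uint64(861425748))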
@@ -78,7 +72,7 @@ def classify_array(data: IA) -> NDArray[np.uint8]: return output -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def _shift_xy(chip, row, col): # TODO sort out if this needs to be paremeterized out = np.zeros(2, "u4") @@ -100,7 +94,7 @@ def _shift_xy(chip, row, col): return out -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def decode_xy(msg, chip): # these names and math are adapted from c++ code l_pix_addr = (msg >> np.uint(44)) & np.uint(0xFFFF) @@ -127,12 +121,12 @@ def decode_xy(msg, chip): return rowcol[1], rowcol[0] -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def get_spidr(msg): return msg & np.uint(0xFFFF) -@numba.jit(nopython=True) +@numba.jit(nopython=True, cache=True) def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): """Decode TPX3 packages of the second type corresponding to photon events (id'd via 0xB upper nibble) @@ -197,7 +191,8 @@ def decode_message(msg, chip, heartbeat_time: np.uint64 = 0): return x, y, ToT, ts -@numba.jit(nopython=True) + +@numba.jit(nopython=True, cache=True) def _ingest_raw_data(data): chips = np.zeros_like(data, dtype=np.uint8) @@ -281,37 +276,12 @@ def _ingest_raw_data(data): # Sort the timestamps - indx = np.argsort(ts[:photon_count], kind="mergesort") - chips = chips[indx] - x, y, tot, ts = x[indx], y[indx], tot[indx], ts[indx] + indx = np.argsort(ts[:photon_count], kind="mergesort") # is mergesort the best here? wondering if this could be optimized + x, y, tot, ts, chips = x[indx], y[indx], tot[indx], ts[indx], chips[indx] return x, y, tot, ts, chips -def ingest_from_files(fpaths: List[Union[str, Path]]) -> Iterable[Dict[str, NDArray]]: - """Parse values out of a sequence of timepix3 files with rollover of SPIDR counter. - - Parameters - ---------- - fpaths : A sorted sequence of tpx3 filepaths. - - Returns - ------- - An iterable over the parsing results, each encapsulated in a dictionary - """ - - for fpath in fpaths: - data = raw_as_numpy(fpath) - x, y, tot, ts, chips = _ingest_raw_data(data) - yield { - k.strip(): v - for k, v in zip( - "x, y, tot, timestamp, chips".split(","), - (x, y, tot, ts, chips), - ) - } - - def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ Parse values out of raw timepix3 data stream. @@ -328,17 +298,14 @@ def ingest_raw_data(data: IA) -> Dict[str, NDArray]: """ return { k.strip(): v - for k, v in zip("x, y, ToT, ts, chip_number".split(","), _ingest_raw_data(data)) + for k, v in zip("x, y, ToT, t, chip".split(","), _ingest_raw_data(data)) } -# ^-- tom wrote -# v-- justin wrote + """ Some basic functions that help take the initial output of ingest_raw_data and finish the processing. """ - - -def raw_to_sorted_df(fpath: Union[str, Path]) -> pd.DataFrame: +def tpx_to_raw_df(fpath: Union[str, Path]) -> pd.DataFrame: """ Parses a .tpx3 file and returns the raw data after timesorting. @@ -353,28 +320,7 @@ def raw_to_sorted_df(fpath: Union[str, Path]) -> pd.DataFrame: DataFrame of raw events from the .tpx3 file. """ raw_df = pd.DataFrame(ingest_raw_data(raw_as_numpy(fpath))) - return raw_df.sort_values("timestamp").reset_index(drop=True) - - -def condense_raw_df(df: pd.DataFrame) -> pd.DataFrame: - """ - Condenses the raw dataframe with only key information necesary for the analysis. Returns a dataframe with timestamp (renamed to t), x, y, and ToT. - - Parameters - ---------- - df : pd.DataFrame - DataFrame generated using raw_to_sorted_df(). 
- - Returns - ------- - pd.DataFrame - Dataframe condensed to only contain pertinent information for analysis. - """ - cdf = df[["timestamp", "x", "y", "ToT"]] - cdf = cdf.rename( - columns={"timestamp": "t"} - ) # obviously not necessary, just easier to type 't' a lot than 'timestamp' - return cdf + return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point, it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: @@ -391,26 +337,25 @@ def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame df with only the events with ToT > 0 """ - fdf = df[df["ToT"] > 0] - return fdf + return df[df["ToT"] > 0] """ Functions to help perform clustering and centroiding on raw data. """ -TIMESTAMP_VALUE = ((1e-9) / 4096) * 25 -MICROSECOND = 10 ** (-6) +TIMESTAMP_VALUE = 1.5625*1e-9 # each raw timestamp is 1.5625 seconds +MICROSECOND = 1e-6 # We have had decent success with these values, but do not know for sure if they are optimal. -DEFAULT_CLUSTER_RADIUS = 2 -DEFAULT_CLUSTER_TW_MICROSECONDS = 0.5 +DEFAULT_CLUSTER_RADIUS = 3 +DEFAULT_CLUSTER_TW_MICROSECONDS = 0.3 DEFAULT_CLUSTER_TW = int(DEFAULT_CLUSTER_TW_MICROSECONDS * MICROSECOND / TIMESTAMP_VALUE) -def neighbor_set_from_df( +def cluster_df( df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS -) -> tuple[np.ndarray, Set[tuple[int]]]: +) -> tuple[np.ndarray, np.ndarray]: """ Uses scipy.spatial's KDTree to cluster raw input data. Requires a time window for clustering adjacent pixels and the total search radius. @@ -431,90 +376,41 @@ def neighbor_set_from_df( An set of tuples of the indices of the clustered events. The outer set is each cluster, and the inner tuples are the events in each cluster. """ events = np.array( - df[["t", "x", "y", "ToT", "t"]].values + df[["t", "x", "y", "ToT", "t"]].values # raw data stored in dataframe. duplicate timestamp column as the first instance is windowed ) # first three columns are for search radius of KDTree events[:, 0] = np.floor_divide(events[:, 0], tw) # bin by the time window - tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates + tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates (t/timewindow, x, y) neighbors = tree.query_ball_tree( tree, radius ) # compare tree against itself to find neighbors within the search radius - clusters = set(tuple(n) for n in neighbors) # turn the list of lists into a set of tuples - return events, clusters - - -def cluster_stats( - clusters: Set[tuple[int]] -) -> tuple[int]: - """ - Determines basic information about cluster information, such as the number of clusters and size of the largest cluster. 
- - Parameters - ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - - Returns - ------- - int - The total number of clusters - int - The number of events in the largest cluster - """ - num_clusters = len(clusters) - max_cluster = max(map(len, clusters)) - return num_clusters, max_cluster + if len(neighbors) >= 2147483647: # performance is marginally better if can use int32 for the indices, so check for that + dtype = np.int64 + else: + dtype = np.int32 + return pd.DataFrame(neighbors).fillna(-1).astype(dtype).drop_duplicates().values, events[:, 1:] # a bit weird, but faster than using sets and list operators to unpack neighbors -def create_cluster_arr( - clusters: Set[tuple[int]], num_clusters: int, max_cluster: int -) -> np.ndarray: # is there a better way to do this? - """ - Converts the clusters from a set of tuples of indices to an 2D numpy array format which can be efficiently iterated through with Numba. - - Parameters - ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest cluster - - Returns - ------- - np.ndarray - The cluster data now in a 2D numpy array. - """ - cluster_arr = np.full( - (num_clusters, max_cluster), -1, dtype=np.int64 - ) # fill with -1; these will be passed later - for cluster_num, cluster in enumerate(clusters): - for event_num, event in enumerate(cluster): - cluster_arr[cluster_num, event_num] = event - return cluster_arr - - -@numba.jit(nopython=True) -def cluster_arr_to_cent( - cluster_arr: np.ndarray, events: np.ndarray, num_clusters: int, max_cluster: int -) -> tuple[np.ndarray]: +@numba.jit(nopython=True, cache=True) +def centroid_clusters( + cluster_arr: np.ndarray, events: np.ndarray +) -> tuple[np.ndarray]: """ Performs the centroiding of a group of clusters using Numba. Note I originally attempted to unpack the clusters using list comprehensions, but this approach is significantly faster. Parameters ---------- - clusters : Set[tuple[int]] - The set of tuples of clusters from neighbor_set_from_df() - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest clust + clusters : nd.array + The numpy representation of the clusters' event indices. + events : nd.array + The numpy represetation of the event data. Returns ------- tuple[np.ndarray] t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. 
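
    Example (illustrative): a cluster holding events at x = 10 and x = 11 with ToT = 75 ns and 25 ns
    accumulates xc = 10*75 + 11*25 = 1025 and ToT_sum = 100 (a ToT-weighted centroid of 10.25 once
    normalised), with ToT_max = 75 and t taken from the ToT = 75 event.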
""" + num_clusters = cluster_arr.shape[0] + max_cluster = cluster_arr.shape[1] t = np.zeros(num_clusters, dtype="uint64") xc = np.zeros(num_clusters, dtype="float32") yc = np.zeros(num_clusters, dtype="float32") @@ -527,13 +423,13 @@ def cluster_arr_to_cent( for event_num in range(max_cluster): event = cluster_arr[cluster_id, event_num] if event > -1: # if we have an event here - if events[event, 3] > _ToT_max: # find the max ToT, assign, use that time - _ToT_max = events[event, 3] - t[cluster_id] = events[event, 4] + if events[event, 2] > _ToT_max: # find the max ToT, assign, use that time + _ToT_max = events[event, 2] + t[cluster_id] = events[event, 3] ToT_max[cluster_id] = _ToT_max - xc[cluster_id] += events[event, 1] * events[event, 3] # x and y centroids by time over threshold - yc[cluster_id] += events[event, 2] * events[event, 3] - ToT_sum[cluster_id] += events[event, 3] # calcuate sum + xc[cluster_id] += events[event, 0] * events[event, 2] # x and y centroids by time over threshold + yc[cluster_id] += events[event, 1] * events[event, 2] + ToT_sum[cluster_id] += events[event, 2] # calcuate sum n[cluster_id] += np.ubyte(1) # number of events in cluster else: break @@ -568,60 +464,15 @@ def ingest_cent_data( } -def cent_to_numpy( - cluster_arr: np.ndarray, events: int, num_clusters: int, max_cluster: int -) -> Dict[str, np.ndarray]: - """ - Wrapper function to perform ingest_cent_data(cluster_arr_to_cent()) - - Parameters - ---------- - cluster_arr : np.ndarray - The array of cluster events from create_cluster_arr() - events : int - Number of photon events - num_clusters : int - The total number of clusters - max_cluster : int - The number of events in the largest clust - - Returns - ------- - Dict[str, np.ndarray] - Keys of t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. - """ - return ingest_cent_data(cluster_arr_to_cent(cluster_arr, events, num_clusters, max_cluster)) - - -def cent_to_df( - cd_np: Dict[str, np.ndarray] -) -> pd.DataFrame: - """ - Returns the centroided dataframe from the zipped inputs. - - Parameters - ---------- - cd_np : Dict[str, np.ndarray] - Dictionary of the clustered data. - - Returns - ------- - pd.DataFrame - Time sorted dataframe of the centroids. - """ - cent_df = pd.DataFrame(cd_np) - return cent_df.sort_values("t").reset_index(drop=True) - - -def raw_df_to_cluster_df( - raw_df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS +def raw_df_to_cent_df( + df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS ) -> pd.DataFrame: """ Uses functions defined herein to take Dataframe of raw data and return dataframe of clustered data. Parameters ---------- - raw_df : pd.DataFrame + df : pd.DataFrame Pandas DataFrame of the raw data tw : int The time window to be considered "coincident" for clustering purposes @@ -633,11 +484,10 @@ def raw_df_to_cluster_df( pd.DataFrame Pandas DataFrame of the centroided data. 
""" - filt_cond_raw_df = drop_zero_tot(condense_raw_df(raw_df)) - events, clusters = neighbor_set_from_df(filt_cond_raw_df, tw, radius) - num_clusters, max_cluster = cluster_stats(clusters) - cluster_arr = create_cluster_arr(clusters, num_clusters, max_cluster) - return cent_to_df(cent_to_numpy(cluster_arr, events, num_clusters, max_cluster)) + fdf = drop_zero_tot(df) + cluster_arr, events = cluster_df(fdf, tw, radius) + data = centroid_clusters(cluster_arr, events) + return pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) # should we specify the sort type here? this should be *almost* completely sorted already def add_centroid_cols( @@ -659,107 +509,31 @@ def add_centroid_cols( Originally dataframe with new columns x, y, and t_ns added. """ if gap: - df.loc[df['xc'] >= 255.5, 'xc'] += 2 - df.loc[df['yc'] >= 255.5, 'yc'] += 2 - df["x"] = np.round(df["xc"]).astype(np.uint16) + df.loc[df["xc"] >= 255.5, "xc"] += 2 + df.loc[df["yc"] >= 255.5, "yc"] += 2 + df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel df["y"] = np.round(df["yc"]).astype(np.uint16) - df["t_ns"] = df["t"] / 4096 * 25 + df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? return df """ -A bunch of functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. +Functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. """ RAW_H5_SUFFIX = "" CENT_H5_SUFFIX = "_cent" CONCAT_H5_SUFFIX = "_cent" -def extract_fpaths_from_sid( - sid: int -) -> List[str]: - """ - Extract file paths from a given sid. - - Parameters - ---------- - sid : int - Short ID of a BlueSky scan - - Returns - ------- - List[str] - Filepaths of the written .tpx3, as recorded in Tiled - """ - return list(db[sid].table()["tpx3_files_raw_filepaths"].to_numpy()[0]) - - -def extract_uid_from_fpaths( - fpaths: List[str] -) -> str: - """ - Extract scan unique ID from file paths. - - Parameters - ---------- - fpaths : List[str] - List of the filepaths. - - Returns - ------- - str - String of the first file's unique ID. - - """ - return os.path.basename(fpaths[0])[:23] - - -def extract_dir_from_fpaths( - fpaths: List[str] -) -> str: - """ - Extract directory from file paths. - - Parameters - ---------- - fpaths : List[str] - List of the filepaths. - - Returns - ------- - str - String of the first file's directory. - - """ - return os.path.dirname(fpaths[0]) - - -def extract_uid_from_sid( - sid: int -) -> str: - """ - Extract user ID from a given sid. - - Parameters - ---------- - sid : int - - Returns - ------- - str - String of the short ID's corresponding unique ID. - - """ - return extract_uid_from_fpaths(extract_fpaths_from_sid(sid)) - - -def convert_file( - fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False +def convert_tpx_file( + fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False, overwrite: bool = True ): """ Convert a .tpx3 file into raw and centroided Pandas dataframes, which are stored in .h5 files. + TO DO: Args to specify output directory (default will be same directory as .tpx3 file as is now). 
+ Parameters ---------- fpath : Union[str, Path] @@ -770,193 +544,122 @@ def convert_file( The radius, in pixels, to perform centroiding print_details : bool = False Boolean toggle about whether to print detailed data. + overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. + """ fname, ext = os.path.splitext(fpath) dfname = "{}{}.h5".format(fname, RAW_H5_SUFFIX) dfcname = "{}{}.h5".format(fname, CONCAT_H5_SUFFIX) if ext == ".tpx3" and os.path.exists(fpath): - file_size = os.path.getsize(fpath) - have_df = os.path.exists(dfname) - have_dfc = os.path.exists(dfcname) - - if have_df and have_dfc: - print("-> {} exists, skipping.".format(dfname)) - else: - print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) - time_window = time_window_microsecond * 1e-6 - time_stamp_conversion = 6.1e-12 - timedif = int(time_window / time_stamp_conversion) - - if print_details: - print("Loading {} data into dataframe...".format(fpath)) - df = raw_to_sorted_df(fpath) - num_events = df.shape[0] - - if print_details: - print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) - df.to_hdf(dfname, key='df', mode='w') - - if print_details: - print("Saving {} complete. Beginning clustering...".format(dfname)) - df_c = raw_df_to_cluster_df(df, timedif, radius) - num_clusters = df_c.shape[0] + + try: + file_size = os.path.getsize(fpath) + have_df = os.path.exists(dfname) + have_dfc = os.path.exists(dfcname) + + if have_df and have_dfc and not overwrite: + + print("-> {} exists, skipping.".format(dfname)) + + else: + + if print_details: + print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) + + time_window = time_window_microsecond * 1e-6 + time_stamp_conversion = 6.1e-12 + timedif = int(time_window / time_stamp_conversion) + + if print_details: + print("Loading {} data into dataframe...".format(fpath)) + + df = tpx_to_raw_df(fpath) + num_events = df.shape[0] + + if print_details: + print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) + + df.to_hdf(dfname, key="df", format="table", mode="w") + + if print_details: + print("Saving {} complete. Beginning clustering...".format(dfname)) + + df_c = raw_df_to_cent_df(df, timedif, radius) + + num_clusters = df_c.shape[0] + + if print_details: + print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) + + df_c.to_hdf(dfcname, key="df", format="table", mode="w") + + if print_details: + print("Saving {} complete. Moving onto next file.".format(dfcname)) + + except Exception as e: + if print_details: - print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) - df_c.to_hdf(dfcname, key='df', mode='w') - print("Saving {} complete. Moving onto next file.".format(dfcname)) + print("Conversion of {} failed due to {}, moving on.".format(fpath,e)) + else: - print("File not found. Moving onto next file.") - - -def convert_tpx3_parallel( - fpaths: Union[str, Path], num_workers: int = None -): + if print_details: + print("File not found. Moving onto next file.") + + +def convert_tpx3_files_parallel(fpaths: Union[List[str], List[Path]], num_workers: int = None): """ - Convert a list of .tpx3 files in a parallel processing pool. + Convert a list .tpx3 files in parallel using multiprocessing and convert_tpx_file(). + + TO DO: Accept more arguments for convert_tpx_file. 
Parameters ---------- - fpaths : Union[str, Path] - .tpx3 file paths to convert in a parallel processing pool. - num_workers : int = None - Number of parallel workers to employ. + fpath : Union[str, Path] + .tpx3 file path + time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in microseconds, to perform centroiding + radius : int = DEFAULT_CLUSTER_RADIUS + The radius, in pixels, to perform centroiding + print_details : bool = False + Boolean toggle about whether to print detailed data. + overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. + """ - if num_workers == None: - num_cores = multiprocessing.cpu_count() - max_workers = num_cores-1 + if num_workers is None: + max_workers = multiprocessing.cpu_count() else: max_workers = num_workers - - with multiprocessing.Pool(processes=max_workers) as pool: - pool.map(convert_file, fpaths) - - print("Parallel conversion complete") - - -def convert_tpx3_st(fpaths: Union[str, Path]): - """ - Convert a list of .tpx3 files in a single thread. - - Parameters - ---------- - fpaths : Union[str, Path] - .tpx3 file paths to convert in a single thread. - """ - for file in fpaths: - convert_file(file) - -def get_cent_files( - uid: str, dir_name: Union[str, Path] -) -> List[str]: - """ - Gets a list of the centroided .h5 files from a given uid, sorted by sequence number. - - Parameters - ---------- - uid : str - The unique ID of the scan of which we want to get the files. + with multiprocessing.Pool(processes=max_workers) as pool: + for _ in tqdm(pool.imap_unordered(convert_tpx_file, fpaths), total=len(fpaths)): + pass - dir_name : Union[str, path] - Directory to look in for the files. - Returns - ------- - List[str] - List of the centroided file paths. +def convert_tpx3_files(fpaths: Union[List[str], List[Path]], print_details = True): """ - cent_files = [ - os.path.join(dir_name, file) - for file in os.listdir(dir_name) - if file.endswith("{}.h5".format(CENT_H5_SUFFIX)) and str(uid) in file and len(os.path.basename(file)) == 44 - ] - - cent_files.sort(key=lambda f: int(os.path.splitext(os.path.basename(f))[0].split("_")[-2])) - return cent_files - - -def concat_cent_files( - cfpaths: List[Union[str, Path]] -): - """ - Concatenates several centroided files together. + Convert a list .tpx3 files in a single process using convert_tpx_file(). - Parameters - ---------- - cfpaths : List[str, Path] - List of the centroided .h5 files to concatenate together. - """ - dir_name = os.path.dirname(cfpaths[0]) - uid = extract_uid_from_fpaths(cfpaths) - - dfs = [pd.read_hdf(fpath, key='df') for fpath in tqdm(cfpaths)] - combined_df = pd.concat(dfs).reset_index(drop=True) - - save_path = os.path.join(dir_name, "{}{}.h5".format(uid, CONCAT_H5_SUFFIX)) - combined_df.to_hdf(save_path, key='df', mode='w') - - print("-> Saving complete.") - - -def get_con_cent_file( - sid: int -) -> str: - """ - Gets the location of the concatenated centroid files of a given sid. + TO DO: Accept more arguments for convert_tpx_file. Parameters ---------- - sid : int - Short ID of whichto get the centroided file path + fpath : Union[str, Path] + .tpx3 file path + time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in microseconds, to perform centroiding + radius : int = DEFAULT_CLUSTER_RADIUS + The radius, in pixels, to perform centroiding + print_details : bool = False + Boolean toggle about whether to print detailed data. 
+ overwrite : bool = True + Boolean toggle about whether to overwrite pre-existing data. - Returns - ------- - str - Path of the centroided file. """ - fpaths = extract_fpaths_from_sid(sid) - uid = extract_uid_from_fpaths(fpaths) - dir_name = extract_dir_from_fpaths(fpaths) - cfpath = os.path.join(dir_name, "{}{}.h5".format(uid, CONCAT_H5_SUFFIX)) - - if os.path.exists(cfpath): - return cfpath - else: - print("-> Warning: {} does not exist".format(cfpath)) - return None - - -def convert_sids( - sids: List[int] -): - """ - Convert given sids by converting each .tpx3 file and then concatenating results together into a master dataframe. - - Parameters - ---------- - sids : List[int] - List of BlueSky scans' short IDs to convert. - """ - - for sid in sids: - print("\n\n---> Beginning sid: {} <---\n".format(sid)) + for file in fpaths: + convert_tpx_file(file, print_details = print_details) - tpx3fpaths = extract_fpaths_from_sid(sid) - dir_name = extract_dir_from_fpaths(tpx3fpaths) - num_tpx = len(tpx3fpaths) - uid = extract_uid_from_fpaths(tpx3fpaths) - - convert_tpx3_parallel(tpx3fpaths, num_workers=16) - centfpaths = get_cent_files(uid, dir_name) - num_cent = len(centfpaths) - - if num_tpx == num_cent: - print("---> Conversion numbers match") - concat_cent_files(centfpaths) - else: - print("---> Warning: conversion mismatch: tpx3={}, cent={}".format(num_tpx, num_cent)) - - print("---> Done with {}!".format(sid)) - gc.collect() + \ No newline at end of file diff --git a/tpx3awkward/examples/example.ipynb b/tpx3awkward/examples/example.ipynb new file mode 100644 index 0000000..6eeabb1 --- /dev/null +++ b/tpx3awkward/examples/example.ipynb @@ -0,0 +1,351 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "76010898-2082-49cb-9bff-8173fd79ad6e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import _utils as tpx\n", + "import pandas as pd\n", + "from tiled.client import from_uri\n", + "db = from_uri('https://tiled.nsls2.bnl.gov', 'dask')['chx']['raw']" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "59bb0e40-1686-413c-8595-c38655e2d0d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sid = 143200 # example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1b629cf7-1d46-49b9-a41b-43f3b9972f4e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw dataframe: \n", + " x y ToT t chip\n", + "0 502 452 475 11301032 1\n", + "1 405 272 275 11302904 1\n", + "2 406 272 350 11302904 1\n", + "3 295 135 525 11303039 0\n", + "4 358 509 300 11303780 1\n", + "... ... ... ... ... ...\n", + "971254 200 46 550 651295354 3\n", + "971255 489 104 450 651296021 0\n", + "971256 389 21 300 651296305 0\n", + "971257 390 21 300 651296306 0\n", + "971258 37 325 75 651299518 2\n", + "\n", + "[971259 rows x 5 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n\n", + "0 11301032 502.000000 452.0 475 475 1\n", + "1 11302904 405.559998 272.0 350 625 2\n", + "2 11303039 295.000000 135.0 525 525 1\n", + "3 11303780 358.478271 509.0 300 575 2\n", + "4 11303973 412.409088 60.0 325 550 2\n", + "... ... ... ... ... ... 
..\n", + "659582 651294809 326.000000 379.0 475 475 1\n", + "659583 651295354 200.000000 46.0 550 550 1\n", + "659584 651296021 489.000000 104.0 450 450 1\n", + "659585 651296305 389.500000 21.0 300 600 2\n", + "659586 651299518 37.000000 325.0 75 75 1\n", + "\n", + "[659587 rows x 6 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n x y \\\n", + "0 11301032 504.000000 454.0 475 475 1 504 454 \n", + "1 11302904 407.559998 274.0 350 625 2 408 274 \n", + "2 11303039 297.000000 135.0 525 525 1 297 135 \n", + "3 11303780 360.478271 511.0 300 575 2 360 511 \n", + "4 11303973 414.409088 60.0 325 550 2 414 60 \n", + "... ... ... ... ... ... .. ... ... \n", + "659582 651294809 328.000000 381.0 475 475 1 328 381 \n", + "659583 651295354 200.000000 46.0 550 550 1 200 46 \n", + "659584 651296021 491.000000 104.0 450 450 1 491 104 \n", + "659585 651296305 391.500000 21.0 300 600 2 392 21 \n", + "659586 651299518 37.000000 327.0 75 75 1 37 327 \n", + "\n", + " t_ns \n", + "0 1.765786e+07 \n", + "1 1.766079e+07 \n", + "2 1.766100e+07 \n", + "3 1.766216e+07 \n", + "4 1.766246e+07 \n", + "... ... \n", + "659582 1.017648e+09 \n", + "659583 1.017649e+09 \n", + "659584 1.017650e+09 \n", + "659585 1.017650e+09 \n", + "659586 1.017655e+09 \n", + "\n", + "[659587 rows x 9 columns]\n", + "Partition duration: 0.999997634375 sec.\n" + ] + } + ], + "source": [ + "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + "\n", + "file = files[0][5:] # cut off 'file:' from beginning\n", + "\n", + "df = tpx.tpx_to_raw_df(file)\n", + "cdf = tpx.raw_df_to_cent_df(df)\n", + "\n", + "print('Raw dataframe: ')\n", + "print(df)\n", + "\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "cdf = tpx.add_centroid_cols(cdf)\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", + "print(f'Partition duration: {duration} sec.')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "48254ae7-8d0c-4766-b1d1-2b7f4732934c", + "metadata": {}, + "outputs": [], + "source": [ + "sid = 143201 # new example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2adae433-a68c-436e-9640-d4a9745af969", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3, size: 10.5 MB\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 data into dataframe...\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 975938 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5 complete. Beginning clustering...\n", + "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 659537 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5 complete. 
Moving onto next file.\n", + "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3, size: 10.5 MB\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 data into dataframe...\n", + "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 972867 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5 complete. Beginning clustering...\n", + "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 657491 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5\n", + "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5 complete. Moving onto next file.\n" + ] + } + ], + "source": [ + "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + "files = [f.replace('file:', '') for f in files]\n", + "tpx.convert_tpx3_files(files[0:2])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "05332e5b-17eb-4074-ad92-51ef1273e9f4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw dataframe: \n", + " x y ToT t chip\n", + "0 210 132 575 11294477 3\n", + "1 106 227 375 11294540 3\n", + "2 107 227 150 11294544 3\n", + "3 381 346 475 11294882 1\n", + "4 268 444 450 11295743 1\n", + "... ... ... ... ... ...\n", + "975933 103 441 600 651291179 2\n", + "975934 193 383 450 651294115 2\n", + "975935 121 280 400 651294141 2\n", + "975936 256 373 400 651294145 1\n", + "975937 255 373 50 651294166 2\n", + "\n", + "[975938 rows x 5 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n\n", + "0 11294477 210.000000 132.0 575 575 1\n", + "1 11294540 106.285713 227.0 375 525 2\n", + "2 11294882 381.000000 346.0 475 475 1\n", + "3 11295743 268.000000 444.0 450 450 1\n", + "4 11298466 420.000000 52.0 450 450 1\n", + "... ... ... ... ... ... ..\n", + "659532 651290601 101.000000 436.0 575 575 1\n", + "659533 651291179 103.000000 441.0 600 600 1\n", + "659534 651294115 193.000000 383.0 450 450 1\n", + "659535 651294141 121.000000 280.0 400 400 1\n", + "659536 651294145 255.888885 373.0 400 450 2\n", + "\n", + "[659537 rows x 6 columns]\n", + "Centroided dataframe: \n", + " t xc yc ToT_max ToT_sum n x y \\\n", + "0 11294477 210.000000 132.0 575 575 1 210 132 \n", + "1 11294540 106.285713 227.0 375 525 2 106 227 \n", + "2 11294882 383.000000 348.0 475 475 1 383 348 \n", + "3 11295743 270.000000 446.0 450 450 1 270 446 \n", + "4 11298466 422.000000 52.0 450 450 1 422 52 \n", + "... ... ... ... ... ... .. ... ... \n", + "659532 651290601 101.000000 438.0 575 575 1 101 438 \n", + "659533 651291179 103.000000 443.0 600 600 1 103 443 \n", + "659534 651294115 193.000000 385.0 450 450 1 193 385 \n", + "659535 651294141 121.000000 282.0 400 400 1 121 282 \n", + "659536 651294145 257.888885 375.0 400 450 2 258 375 \n", + "\n", + " t_ns \n", + "0 1.764762e+07 \n", + "1 1.764772e+07 \n", + "2 1.764825e+07 \n", + "3 1.764960e+07 \n", + "4 1.765385e+07 \n", + "... ... 
\n", + "659532 1.017642e+09 \n", + "659533 1.017642e+09 \n", + "659534 1.017647e+09 \n", + "659535 1.017647e+09 \n", + "659536 1.017647e+09 \n", + "\n", + "[659537 rows x 9 columns]\n", + "Partition duration: 0.99999948125 sec.\n" + ] + } + ], + "source": [ + "df = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5')\n", + "cdf = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5')\n", + "\n", + "print('Raw dataframe: ')\n", + "print(df)\n", + "\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "cdf = tpx.add_centroid_cols(cdf)\n", + "print('Centroided dataframe: ')\n", + "print(cdf)\n", + "\n", + "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", + "print(f'Partition duration: {duration} sec.')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c981be40-e631-4f73-bbab-4fadda8b9285", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1800/1800 [07:51<00:00, 3.82it/s]\n" + ] + } + ], + "source": [ + "try:\n", + " files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", + " files = [f.replace('file:', '') for f in files]\n", + " tpx.convert_tpx3_files_parallel(files)\n", + " \n", + "except Exception as e:\n", + " print(f'Could not finish {sid} due to {e}') " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "946fa74f-8c8c-4f36-bc0c-9f203e023f78", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All raw .h5 files exist!\n", + "All centroided raw .h5 files exist!\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "raw_h5_files = [os.path.exists(f.replace('.tpx3', '.h5')) for f in files]\n", + "cent_h5_files = [os.path.exists(f.replace('.tpx3', '_cent.h5')) for f in files]\n", + "\n", + "if all(raw_h5_files):\n", + " print('All raw .h5 files exist!')\n", + "else:\n", + " print('Missing .h5 files!')\n", + " \n", + "if all(cent_h5_files):\n", + " print('All centroided raw .h5 files exist!')\n", + "else:\n", + " print('Missing centroided .h5 files!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "705c639d-dac9-4517-b5ff-ce44365ed4df", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 065d6075e07875332c8bc9e35b76fae1531709b9 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:00:02 -0400 Subject: [PATCH 09/15] FIX: adjusting timestamps before heartbeat --- tpx3awkward/_utils.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 8e656f0..1ea71d8 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -206,9 +206,12 @@ def _ingest_raw_data(data): tot = np.zeros_like(data, dtype="u4") ts = np.zeros_like(data, dtype="u8") heartbeat_lsb = None #np.uint64(0) + heartbeat_msb = None #np.uint64(0) heartbeat_time = np.uint64(0) + hb_init_flag = False # Indicate when the heartbeat was set for the first time 
photon_count, chip_indx, msg_run_count, expected_msg_count = 0, 0, 0, 0 + for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) @@ -230,21 +233,21 @@ def _ingest_raw_data(data): y[photon_count] = _y tot[photon_count] = _tot ts[photon_count] = _ts - - if (photon_count > 0) and (_ts > ts[photon_count-1]) and (_ts - ts[photon_count-1] > 2**30): + + # Adjust timestamps that were set before the first heartbeat was received + if hb_init_flag and (photon_count > 0): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) - # Check if there is a SPIDR rollover in the beginning of the file but before heartbeat was received + # Check if there is a SPIDR rollover in the beginning of the file but before the received head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) - # Compare the difference with some big number (e.g. 1/4 of SPIDR) if (head_max > tail_min) and (head_max - tail_min > 2**32): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += np.uint64(2**34) - # Ajust already processed timestamps ts[:photon_count] = prev_ts + (_ts - _ts_0) + hb_init_flag = False photon_count += 1 msg_run_count += 1 @@ -257,11 +260,13 @@ def _ingest_raw_data(data): # Type 4: global timestap (id'd via 0x4 upper nibble) subheader = (msg >> np.uint(56)) & np.uint64(0x0F) if subheader == 0x4: - # timer LSB, 32 bits of time + # timer LSB, 32 bits of time -- needs to be received first, before MSB heartbeat_lsb = (msg >> np.uint(16)) & np.uint64(0xFFFFFFFF) elif subheader == 0x5: # timer MSB -- only matters if LSB has been received already if heartbeat_lsb is not None: + if heartbeat_msb is None: + hb_init_flag = True heartbeat_msb = ((msg >> np.uint(16)) & np.uint64(0xFFFF)) << np.uint(32) heartbeat_time = (heartbeat_msb | heartbeat_lsb) # TODO the c++ code has large jump detection, do not understand why @@ -279,7 +284,6 @@ def _ingest_raw_data(data): else: raise Exception(f"Not supported: {msg}") - # Sort the timestamps indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] From ba3d5528b087decd1037d81b33d631790ce0bf44 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:07:10 -0400 Subject: [PATCH 10/15] FIX: adjusting timestamps before heartbeat --- tpx3awkward/_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 1ea71d8..31a393d 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -246,6 +246,8 @@ def _ingest_raw_data(data): prev_ts[prev_ts < 2**33] += np.uint64(2**34) _ts_0 += np.uint64(2**34) ts[:photon_count] = prev_ts + (_ts - _ts_0) + + # TODO: Do we want to adjust timestamps after SPIDR rollower even if no heartbeat was received at all? 
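# Illustrative aside (not part of the patch): the wrap check above compares the largest of the
# first few pre-heartbeat timestamps with the smallest of the last few; a 16-bit SPIDR rollover
# shows up as the head exceeding the tail by far more than any plausible exposure, e.g.
#
#     prev_ts = np.array([17179869000, 17179869100, 25, 150, 900], dtype=np.uint64)
#     head_max, tail_min = prev_ts[:2].max(), prev_ts[-2:].min()
#     head_max > tail_min and head_max - tail_min > 2**32      # True -> rollover detected
#     prev_ts[prev_ts < 2**33] += np.uint64(2**34)             # re-base the post-wrap events
#
# 2**34 is one full SPIDR period in 1.5625 ns ticks (16-bit SPIDR counter << 14 ToA bits << 4 FToA bits).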
hb_init_flag = False photon_count += 1 From 7f2b57b01aa6fcdfcc6514fd3b39a01c90147687 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:23:29 -0400 Subject: [PATCH 11/15] ENH: apply SPIDR rollover with single partition if no heartbeat is received --- tpx3awkward/_utils.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 31a393d..1ad253e 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -243,11 +243,9 @@ def _ingest_raw_data(data): head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): - prev_ts[prev_ts < 2**33] += np.uint64(2**34) + prev_ts[prev_ts < (tail_min+head_max)/2] += np.uint64(2**34) _ts_0 += np.uint64(2**34) ts[:photon_count] = prev_ts + (_ts - _ts_0) - - # TODO: Do we want to adjust timestamps after SPIDR rollower even if no heartbeat was received at all? hb_init_flag = False photon_count += 1 @@ -286,6 +284,13 @@ def _ingest_raw_data(data): else: raise Exception(f"Not supported: {msg}") + # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers + if heartbeat_msb is None: + head_max = max(ts[:10]) + tail_min = min(ts[-10:]) + if (head_max > tail_min) and (head_max - tail_min > 2**32): + ts[ts < (tail_min+head_max)/2] += np.uint64(2**34) + # Sort the timestamps indx = np.argsort(ts[:photon_count], kind="mergesort") chips = chips[indx] From 4dcf196476cff95e9c6f7ec0c51a44bca332ece0 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Mon, 19 Aug 2024 17:27:32 -0400 Subject: [PATCH 12/15] ENH: apply SPIDR rollover with single partition if no heartbeat is received --- tpx3awkward/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 1ad253e..73f6d71 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -239,7 +239,7 @@ def _ingest_raw_data(data): prev_ts = ts[:photon_count] # This portion needs to be adjusted # Find what the current timestamp would be without global heartbeat _, _, _, _ts_0 = decode_message(msg, chip_indx, heartbeat_time=np.uint64(0)) - # Check if there is a SPIDR rollover in the beginning of the file but before the received + # Check if there is a SPIDR rollover in the beginning of the file before the heartbeat head_max = max(prev_ts[:10]) tail_min = min(prev_ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): From 4be0cee224a28ad8e6e261b1ced3adc65d2e24fe Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Sep 2024 17:59:45 -0400 Subject: [PATCH 13/15] ENH: raise warning when no heartbeat messages received in a file --- tpx3awkward/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 73f6d71..dcd1d16 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -286,6 +286,7 @@ def _ingest_raw_data(data): # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers if heartbeat_msb is None: + warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") head_max = max(ts[:10]) tail_min = min(ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): From c9f9d92f05a42b531ac7e27862a706e018cb7ada Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Sep 2024 18:12:58 -0400 Subject: [PATCH 14/15] FIX: errors in flake8 config --- .flake8 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 90019b0..10a446f 
100644 --- a/.flake8 +++ b/.flake8 @@ -8,4 +8,7 @@ exclude = tpx3awkward/_version.py, docs/source/conf.py max-line-length = 115 -ignore = E203, W503 # Ignore some style 'errors' produced while formatting by 'black' +ignore = + # Ignore some style 'errors' produced while formatting by 'black' + E203, + W503 From b2330182b727658f24010cb470da2a2f0caf669a Mon Sep 17 00:00:00 2001 From: Justin Goodrich Date: Tue, 10 Jun 2025 22:08:22 -0400 Subject: [PATCH 15/15] Updated to current production version. --- AUTHORS.rst | 2 + requirements.txt | 11 +- tpx3awkward/_utils.py | 619 +++++++++++++++++++++-------- tpx3awkward/examples/example.ipynb | 351 ---------------- 4 files changed, 460 insertions(+), 523 deletions(-) delete mode 100644 tpx3awkward/examples/example.ipynb diff --git a/AUTHORS.rst b/AUTHORS.rst index 7a93849..1187ddc 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -10,4 +10,6 @@ Maintainer Contributors ------------ +* Thomas Caswell * Justin Goodrich +* Eugene Matviychuk diff --git a/requirements.txt b/requirements.txt index 683f27b..30caa21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,13 @@ # List required packages in this file, one per line. +os +pathlib +typing numpy numba pandas -scipy -# pyCHX +multiprocessing +functools +warnings +glob +tqdm +gc diff --git a/tpx3awkward/_utils.py b/tpx3awkward/_utils.py index 7a337cb..c4d03f5 100644 --- a/tpx3awkward/_utils.py +++ b/tpx3awkward/_utils.py @@ -5,10 +5,12 @@ from numpy.typing import NDArray import numba import pandas as pd -from scipy.spatial import KDTree import multiprocessing -from tqdm import tqdm +from functools import partial import warnings +import glob +from tqdm import tqdm +import gc IA = NDArray[np.uint64] @@ -210,8 +212,8 @@ def _ingest_raw_data(data): for msg in data: if is_packet_header(msg): # Type 1: packet header (id'd via TPX3 magic number) - if expected_msg_count != msg_run_count: - print("Missing messages!", msg) + #if expected_msg_count != msg_run_count: + #print("Missing messages!", msg) # extract the chip number for the following photon events chip_indx = np.uint8(get_block(msg, 8, 32)) @@ -276,12 +278,14 @@ def _ingest_raw_data(data): # TODO handle this! msg_run_count += 1 - else: - raise Exception(f"Not supported: {msg}") + #else: + #print(f"Exception 'Not supported: {msg}'") + #raise Exception(f"Not supported: {msg}") # Check if there were no heartbeat messages and adjust for potential SPIDR rollovers if heartbeat_msb is None: - warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") + #warnings.warn("No heartbeat messages received; decoded timestamps may be inaccurate.") + #print("No heartbeat messages received; decoded timestamps may be inaccurate.") head_max = max(ts[:10]) tail_min = min(ts[-10:]) if (head_max > tail_min) and (head_max - tail_min > 2**32): @@ -332,7 +336,7 @@ def tpx_to_raw_df(fpath: Union[str, Path]) -> pd.DataFrame: DataFrame of raw events from the .tpx3 file. """ raw_df = pd.DataFrame(ingest_raw_data(raw_as_numpy(fpath))) - return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point, it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? + return raw_df.sort_values("t").reset_index(drop=True) # should we specify the sorting algorithm? at this point? it should be sorted anyway, but I think dataframes need to be explicitly sorted for use in e.g. merge_asof? 
def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: @@ -365,69 +369,88 @@ def drop_zero_tot(df: pd.DataFrame) -> pd.DataFrame: DEFAULT_CLUSTER_TW = int(DEFAULT_CLUSTER_TW_MICROSECONDS * MICROSECOND / TIMESTAMP_VALUE) -def cluster_df( - df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS -) -> tuple[np.ndarray, np.ndarray]: +def cluster_df_optimized(df, tw = DEFAULT_CLUSTER_TW, radius = DEFAULT_CLUSTER_RADIUS): + events = df[["t", "x", "y", "ToT", "t"]].to_numpy() + events[:, 0] = np.floor_divide(events[:, 0], tw) # Bin timestamps into time windows + + labels = cluster_df(events, radius, tw) + + return labels, events[:, 1:] + + +@numba.jit(nopython=True, cache=True) +def cluster_df(events, radius = DEFAULT_CLUSTER_TW, tw = DEFAULT_CLUSTER_RADIUS): + n = len(events) + labels = np.full(n, -1, dtype=np.int64) + cluster_id = 0 + + max_time = radius * tw # maximum time difference allowed for clustering + radius_sq = radius ** 2 + + for i in range(n): + if labels[i] == -1: # if event is unclustered + labels[i] = cluster_id + for j in range(i + 1, n): # scan forward only + if events[j, 4] - events[i, 4] > max_time: # early exit based on time + break + # Compute squared Euclidean distance + dx = events[i, 0] - events[j, 0] + dy = events[i, 1] - events[j, 1] + dt = events[i, 2] - events[j, 2] + distance_sq = dx * dx + dy * dy + dt * dt + + if distance_sq <= radius_sq: + labels[j] = cluster_id + cluster_id += 1 + + return labels + + +@numba.jit(nopython=True, cache=True) +def group_indices(labels): """ - Uses scipy.spatial's KDTree to cluster raw input data. Requires a time window for clustering adjacent pixels and the total search radius. + Group indices by cluster ID using pre-allocated arrays in a Numba-optimized way. Parameters ---------- - df : pd.DataFrame - DataFrame with the raw data (after timesorting and ToT filtering). - tw : int - The time window to be considered "coincident" for clustering purposes - radius : int - The search radius, using Euclidean distance of x, y, timestamp/tw + labels : np.ndarray + Array of cluster labels for each event. + num_clusters : int + Number of unique clusters. + max_cluster_size : int + Maximum number of events in a single cluster. Returns ------- np.ndarray - Numpy representation of the raw events being used in the clustering. - Set[tuple[int]] - An set of tuples of the indices of the clustered events. The outer set is each cluster, and the inner tuples are the events in each cluster. - """ - events = np.array( - df[["t", "x", "y", "ToT", "t"]].values # raw data stored in dataframe. duplicate timestamp column as the first instance is windowed - ) # first three columns are for search radius of KDTree - events[:, 0] = np.floor_divide(events[:, 0], tw) # bin by the time window - tree = KDTree(events[:, :3]) # generate KDTree based off the coordinates (t/timewindow, x, y) - neighbors = tree.query_ball_tree( - tree, radius - ) # compare tree against itself to find neighbors within the search radius - if len(neighbors) >= 2147483647: # performance is marginally better if can use int32 for the indices, so check for that - dtype = np.int64 - else: - dtype = np.int32 - return pd.DataFrame(neighbors).fillna(-1).astype(dtype).drop_duplicates().values, events[:, 1:] # a bit weird, but faster than using sets and list operators to unpack neighbors + A 2D NumPy array of shape (num_clusters, max_cluster_size), where each row corresponds to a cluster + and contains event indices padded with -1 for unused slots. 
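    Illustrative example (an aside, not part of the original docstring): for
    labels = np.array([0, 0, 1, 2, 1]) the function returns

        array([[ 0,  1],
               [ 2,  4],
               [ 3, -1]], dtype=int32)

    i.e. cluster 0 holds events 0 and 1, cluster 1 holds events 2 and 4, and cluster 2
    holds only event 3, with unused slots padded by -1.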
+ """ + num_clusters = np.max(labels) + 1 # Assume no noise, all labels are valid clusters + max_cluster_size = np.bincount(labels).max() + cluster_array = -1 * np.ones((num_clusters, max_cluster_size), dtype=np.int32) + cluster_counts = np.zeros(num_clusters, dtype=np.int32) + + for idx in range(labels.shape[0]): + cluster_idx = labels[idx] # Label is directly the cluster ID + cluster_array[cluster_idx, cluster_counts[cluster_idx]] = idx + cluster_counts[cluster_idx] += 1 + + return cluster_array @numba.jit(nopython=True, cache=True) def centroid_clusters( cluster_arr: np.ndarray, events: np.ndarray ) -> tuple[np.ndarray]: - """ - Performs the centroiding of a group of clusters using Numba. Note I originally attempted to unpack the clusters using list comprehensions, but this approach is significantly faster. - Parameters - ---------- - clusters : nd.array - The numpy representation of the clusters' event indices. - events : nd.array - The numpy represetation of the event data. - - Returns - ------- - tuple[np.ndarray] - t, xc, yc, ToT_max, ToT_sum, and n (number of events) in each cluster. - """ num_clusters = cluster_arr.shape[0] max_cluster = cluster_arr.shape[1] t = np.zeros(num_clusters, dtype="uint64") xc = np.zeros(num_clusters, dtype="float32") yc = np.zeros(num_clusters, dtype="float32") - ToT_max = np.zeros(num_clusters, dtype="uint16") - ToT_sum = np.zeros(num_clusters, dtype="uint16") + ToT_max = np.zeros(num_clusters, dtype="uint32") + ToT_sum = np.zeros(num_clusters, dtype="uint32") n = np.zeros(num_clusters, dtype="ubyte") for cluster_id in range(num_clusters): @@ -476,70 +499,261 @@ def ingest_cent_data( } -def raw_df_to_cent_df( - df: pd.DataFrame, tw: int = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS +def add_centroid_cols( + df: pd.DataFrame, gap: bool = True ) -> pd.DataFrame: """ - Uses functions defined herein to take Dataframe of raw data and return dataframe of clustered data. + Calculates centroid positions to the nearest pixel and the timestamp in nanoseconds. Parameters ---------- df : pd.DataFrame - Pandas DataFrame of the raw data - tw : int - The time window to be considered "coincident" for clustering purposes - radius : int - The search radius, using Euclidean distance of x, y, timestamp/tw + Input centroided dataframe + gap : bool = True + Determines whether to implement large gap correction by adding 2 empty pixels offsets Returns ------- pd.DataFrame - Pandas DataFrame of the centroided data. + Originally dataframe with new columns x, y, and t_ns added. """ - fdf = drop_zero_tot(df) - cluster_arr, events = cluster_df(fdf, tw, radius) - data = centroid_clusters(cluster_arr, events) - return pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) # should we specify the sort type here? this should be *almost* completely sorted already + if gap: + df.loc[df["xc"] >= 255.5, "xc"] += 2 + df.loc[df["yc"] >= 255.5, "yc"] += 2 + df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel + df["y"] = np.round(df["yc"]).astype(np.uint16) + df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? + return df -def add_centroid_cols( - df: pd.DataFrame, gap: bool = True -) -> pd.DataFrame: + +def add_centroid_cols_dask( + df, gap: bool = True +): """ Calculates centroid positions to the nearest pixel and the timestamp in nanoseconds. 
Parameters ---------- - df : pd.DataFrame + df : dd.DataFrame Input centroided dataframe gap : bool = True Determines whether to implement large gap correction by adding 2 empty pixels offsets Returns ------- - pd.DataFrame + dd.DataFrame Originally dataframe with new columns x, y, and t_ns added. """ if gap: - df.loc[df["xc"] >= 255.5, "xc"] += 2 - df.loc[df["yc"] >= 255.5, "yc"] += 2 - df["x"] = np.round(df["xc"]).astype(np.uint16) # sometimes you just want to know the closest pixel - df["y"] = np.round(df["yc"]).astype(np.uint16) - df["t_ns"] = (df["t"].astype(np.float64) * 1.5625) # better way to convert to ns while maintaining precision? + df['xc'] = df['xc'].mask(cond=df['xc'] >= 255.5, other= df['xc'] + 2) + df['yc'] = df['yc'].mask(cond=df['yc'] >= 255.5, other= df['yc'] + 2) + df["x"] = dd.DataFrame.round(df["xc"]).astype(np.uint16) + df["y"] = dd.DataFrame.round(df["yc"]).astype(np.uint16) + df["t_ns"] = df["t"] * 1.5625 + #df.compute() + return df + + +def trim_corr_file(mask_fpath: str = "/nsls2/users/jgoodrich/proposals/2025-1/qmicroscope/jgoodrich/new_clustering/bool_mask_total.csv"): + """ + Load a boolean mask from a file, supporting .npy and .csv formats. + + Parameters: + ----------- + mask_fpath : str, optional + Path to the mask file. Supports `.npy` (NumPy binary) and `.csv` (comma-separated values). + Defaults to a predefined file path. + + Returns: + -------- + np.ndarray or None + A NumPy boolean array if the file is successfully loaded. + Returns `None` if the file format is unsupported. + + Notes: + ------ + - If the file is `.npy`, it is loaded using `np.load()` and converted to `bool`. + - If the file is `.csv`, it is loaded using `np.loadtxt()` with `delimiter=','` and converted to `bool`. + - Prints a message and returns `None` for unsupported file formats. + """ + if mask_fpath is None: + return None + + mask_fpath = Path(mask_fpath) + + if mask_fpath.suffix == '.npy': + return np.load(mask_fpath).astype(bool) + elif mask_fpath.suffix == '.csv': + return np.loadtxt(mask_fpath, delimiter=',').astype(bool) + else: + print("Unsupported file format. Use .npy or .csv. Returning None.") + return None + + +def trim_corr(df: pd.DataFrame, total_mask: np.ndarray) -> None: + """ + Modify df in place by subtracting 16 from column 't' where mask is True. + + Parameters + ---------- + df : pd.DataFrame + Dataframe containing 'x' and 'y' columns. + total_mask : np.ndarray + Boolean mask array indexed by (x, y). + """ + if total_mask is None: + print("Trim correction mask is None. No changes applied.") + return + + # Apply mask condition using direct NumPy indexing + df_mask = total_mask[df['x'].to_numpy(), df['y'].to_numpy()] + + # Use `.loc[]` with a boolean mask of the same length as df + df.loc[df_mask, 't'] -= 16 + return df + + +@numba.njit(cache=True, fastmath=True) +def timewalk_corr_exp(ToT, b = 167.0, c = -0.016): + return np.rint(b * np.exp(c * ToT) / 1.5625).astype(np.uint64) + + +def timewalk_corr(df: pd.DataFrame, b = 167.0, c = -0.016) -> None: + """Applies timewalk correction in place.""" + df.loc[:, 't'] -= timewalk_corr_exp(df['ToT'].to_numpy(), b, c) return df """ Functions to help process multiple related .tpx3 files into Pandas dataframes stored in .h5 files. -""" -RAW_H5_SUFFIX = "" -CENT_H5_SUFFIX = "_cent" -CONCAT_H5_SUFFIX = "_cent" +""" +def empty_raw_df() -> pd.DataFrame: + """ + Create an empty DataFrame with the expected columns from ingest_raw_data() + and the specified data types. 
+ + Returns + ------- + pd.DataFrame + Empty DataFrame with columns: + ['x', 'y', 'ToT', 't', 'chip', 'cluster_id'] and appropriate dtypes. + """ + return pd.DataFrame({ + "x": np.array([], dtype="u2"), # uint16 + "y": np.array([], dtype="u2"), # uint16 + "ToT": np.array([], dtype="u4"), # uint32 + "t": np.array([], dtype="u8"), # uint64 + "chip": np.array([], dtype="u1"), # uint8 + "cluster_id": np.array([], dtype="u8") # uint64 + }) + + +def empty_cent_df() -> pd.DataFrame: + """ + Create an empty DataFrame with the expected columns from ingest_cent_data() + and the specified data types. + + Returns + ------- + pd.DataFrame + Empty DataFrame with columns: + ['t', 'xc', 'yc', 'ToT_max', 'ToT_sum', 'n'] and appropriate dtypes. + """ + return pd.DataFrame({ + "t": np.array([], dtype="uint64"), # uint64 + "xc": np.array([], dtype="float32"), # float32 + "yc": np.array([], dtype="float32"), # float32 + "ToT_max": np.array([], dtype="uint32"), # uint32 + "ToT_sum": np.array([], dtype="uint32"), # uint32 + "n": np.array([], dtype="u1") # uint8 (ubyte) + }) + + +def find_unmatched_tpx3_files(directory_list, reprocess = False): + + #Finds .tpx3 files in the given directories that do not have corresponding _cent.h5 files. + #Returns a list of Path objects. + + unmatched_files = [] + all_tpx3_files = [] + + for tpx3_dir in directory_list: + # Get all .tpx3 files + tpx3_files = list(Path(tpx3_dir).glob("*.tpx3")) + + if reprocess == True: + all_tpx3_files.extend(tpx3_files) + continue + + # Generate corresponding _cent.h5 file paths + h5_cent_files = [converted_path(tpx3_file, cent=True) for tpx3_file in tpx3_files] + + if not h5_cent_files: + continue + + # Find h5_dir from the first _cent.h5 file + h5_dir = h5_cent_files[0].parent + + # Get all existing _cent.h5 files in that directory + existing_h5_files = [p for p in h5_dir.glob("*_cent.h5")] + + # Check which _cent.h5 files are missingl + unmatched_files.extend(tpx3_file for tpx3_file, h5_cent_file in zip(tpx3_files, h5_cent_files) if h5_cent_file not in existing_h5_files) + + if reprocess: + return all_tpx3_files + else: + return unmatched_files + + +def converted_path(filepath, cent=False): + """ + Converts .tpx3 file path(s) to corresponding .h5 file path(s). + Handles individual strings, Path objects, lists, or numpy arrays. + + This is specific to CHX beamline pre and post data security. Is there a better way or place to store this? + + Returns Path objects. + """ + if isinstance(filepath, (list, np.ndarray)): + return [converted_path(fp, cent) for fp in filepath] + + filepath = Path(str(filepath).replace("file:", "")) + + if "/nsls2/data/chx/proposals/" in str(filepath): + h5_path = Path(str(filepath).replace("/assets/", "/Compressed_Data/").replace(".tpx3", "_cent.h5" if cent else ".h5")) + elif "/nsls2/data/chx/legacy/" in str(filepath): + h5_path = Path(str(filepath).replace(".tpx3", "_cent.h5" if cent else ".h5")) + else: + raise ValueError(f"Unknown path format: {filepath}") + + return h5_path + + +def save_df(df: pd.DataFrame, fpath: Union[str, Path]): + """ + Save a Pandas DataFrame to an HDF5 file, ensuring that all necessary directories exist. + + Parameters + ---------- + df : pd.DataFrame + The DataFrame to be saved. + fpath : Union[str, Path] + The full path to the output HDF5 file. 
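    Illustrative usage (a sketch, not part of the original docstring; the path is
    hypothetical):

        save_df(cent_df, "/tmp/example/scan_000000_cent.h5")
        # creates /tmp/example/ if needed, then writes the frame to key "df" in
        # "table" format, overwriting any existing file (mode="w")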
+ """ + fpath = Path(fpath) # Ensure fpath is a Path object + + # Create parent directories if they do not exist + fpath.parent.mkdir(parents=True, exist_ok=True) + + # Save DataFrame + df.to_hdf(fpath, key="df", format="table", mode="w") def convert_tpx_file( - fpath: Union[str, Path], time_window_microsecond: float = DEFAULT_CLUSTER_TW_MICROSECONDS, radius: int = DEFAULT_CLUSTER_RADIUS, print_details: bool = False, overwrite: bool = True + tpx3_fpath: Union[str, Path], tw: float = DEFAULT_CLUSTER_TW, radius: int = DEFAULT_CLUSTER_RADIUS, trim_correct: bool = None, timewalk_correct: bool = False, print_details: bool = False, overwrite: bool = True ): """ Convert a .tpx3 file into raw and centroided Pandas dataframes, which are stored in .h5 files. @@ -548,130 +762,195 @@ def convert_tpx_file( Parameters ---------- - fpath : Union[str, Path] + tpx3_fpath : Union[str, Path] .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding + tw : float = DEFAULT_CLUSTER_TW_MICROSECONDS + The time window, in Timepix timestamp units, to perform centroiding radius : int = DEFAULT_CLUSTER_RADIUS The radius, in pixels, to perform centroiding + trim_correct : bool = False + Whether to apply trim correction + timewalk_correct : bool = False + Whether to apply timewalk correction print_details : bool = False Boolean toggle about whether to print detailed data. overwrite : bool = True Boolean toggle about whether to overwrite pre-existing data. """ - fname, ext = os.path.splitext(fpath) - dfname = "{}{}.h5".format(fname, RAW_H5_SUFFIX) - dfcname = "{}{}.h5".format(fname, CONCAT_H5_SUFFIX) - - if ext == ".tpx3" and os.path.exists(fpath): - - try: - - file_size = os.path.getsize(fpath) - have_df = os.path.exists(dfname) - have_dfc = os.path.exists(dfcname) + if isinstance(tpx3_fpath, str): + tpx3_fpath = Path(tpx3_fpath) - if have_df and have_dfc and not overwrite: - - print("-> {} exists, skipping.".format(dfname)) - - else: - - if print_details: - print("-> Processing {}, size: {:.1f} MB".format(fpath, file_size/1000000)) - - time_window = time_window_microsecond * 1e-6 - time_stamp_conversion = 6.1e-12 - timedif = int(time_window / time_stamp_conversion) + if tpx3_fpath.exists(): + if tpx3_fpath.suffix == ".tpx3": - if print_details: - print("Loading {} data into dataframe...".format(fpath)) + h5_fpath = converted_path(tpx3_fpath, cent=False) + cent_h5_fpath = converted_path(tpx3_fpath, cent=True) + + try: + + tpx3_fpath_size = tpx3_fpath.stat().st_size # Get file size + have_df = h5_fpath.exists() # Check if dfname exists + have_dfc = cent_h5_fpath.exists() # Check if dfcname exists + + if have_df and have_dfc and not overwrite: - df = tpx_to_raw_df(fpath) - num_events = df.shape[0] - - if print_details: - print("Loading {} complete. {} events found. Saving to: {}".format(fpath, num_events, dfname)) + print("-> {} already processed, skipping.".format(tpx3_fpath.name)) + return False - df.to_hdf(dfname, key="df", format="table", mode="w") + else: - if print_details: - print("Saving {} complete. 
Beginning clustering...".format(dfname)) + if print_details: + print("-> Processing {}, size: {:.1f} MB".format(tpx3_fpath.name, tpx3_fpath_size/(1024*1024))) + + if tpx3_fpath_size == 0: + num_events = 0 + else: + df = drop_zero_tot(tpx_to_raw_df(tpx3_fpath)) + num_events = df.shape[0] + + if num_events > 0: - df_c = raw_df_to_cent_df(df, timedif, radius) - - num_clusters = df_c.shape[0] + if print_details: + print("Loading {} complete. {} events found.".format(tpx3_fpath.name, num_events)) + + if trim_correct is not None: + if print_details: + print("Performing trim correction on {}".format(tpx3_fpath.name)) + df = trim_corr(df, trim_correct) + + if timewalk_correct: + if print_details: + print("Performing timewalk correction on {}".format(tpx3_fpath.name)) + df = timewalk_corr(df) + + cluster_labels, events = cluster_df_optimized(df, tw, radius) + df['cluster_id'] = cluster_labels + if print_details: + print("Clustering {} complete. {} clusters found. Saving {}...".format(tpx3_fpath.name, cluster_labels.max()+1, h5_fpath.name)) + + save_df(df, h5_fpath) + if print_details: + print("Saving {} complete. Centroiding...".format(h5_fpath.name)) + + cluster_array = group_indices(cluster_labels) + data = centroid_clusters(cluster_array, events) + + cdf = pd.DataFrame(ingest_cent_data(data)).sort_values("t").reset_index(drop=True) + if print_details: + print("Centroiding complete. Saving to {}...".format(cent_h5_fpath.name)) + # save cdf + + save_df(cdf, cent_h5_fpath) + if print_details: + print("Saving {} complete. Checking file existence...".format(cent_h5_fpath.name)) + + if cent_h5_fpath.exists(): + if print_details: + print("Confirmed {} exists!".format(cent_h5_fpath.name)) + to_return = True + else: + if print_details: + print("WARNING: {} doesn't exist but it should?!".format(cent_h5_fpath.name)) + to_return = False + + if print_details: + print("Moving onto next file...") + + df, cdf, cluster_labels, events, cluster_array, data = None, None, None, None, None, None + gc.collect() + return to_return - if print_details: - print("Clustering {} complete. {} clusters found. Saving to {}".format(fpath, num_clusters, dfcname)) + else: + + if print_details: + print("No events found! Saving empty dataframes.") + save_df(empty_raw_df(), h5_fpath) + save_df(empty_cent_df(), cent_h5_fpath) + + gc.collect() + + return True + + except Exception as e: - df_c.to_hdf(dfcname, key="df", format="table", mode="w") - if print_details: - print("Saving {} complete. Moving onto next file.".format(dfcname)) - - except Exception as e: - + print(f"Conversion of {tpx3_fpath.name} failed due to {e.__class__.__name__}: {e}, moving on.") + return False + + else: if print_details: - print("Conversion of {} failed due to {}, moving on.".format(fpath,e)) - + print("File was not a .tpx3 file. Moving onto next file.") + return False + else: if print_details: - print("File not found. Moving onto next file.") + print("File does not exist. Moving onto next file.") + return False -def convert_tpx3_files_parallel(fpaths: Union[List[str], List[Path]], num_workers: int = None): +def convert_tpx3_files_parallel( + fpaths: Union[List[str], List[Path]], num_workers: int = None, trim_correct: Union[str, Path] = None, **kwargs +): """ - Convert a list .tpx3 files in parallel using multiprocessing and convert_tpx_file(). - - TO DO: Accept more arguments for convert_tpx_file. + Convert a list of .tpx3 files in parallel using multiprocessing and convert_tpx_file(). 
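    The body follows the usual bind-then-map pattern -- sketched here with hypothetical
    names, not as a substitute for the code below:

        worker = partial(convert_tpx_file, trim_correct=mask, **kwargs)   # bind shared arguments once
        with multiprocessing.Pool(processes=max_workers) as pool:
            results = list(tqdm(pool.imap_unordered(worker, fpaths), total=len(fpaths)))

    imap_unordered yields results as workers finish, so the progress bar advances even when
    individual files take very different times to convert.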
Parameters ---------- - fpath : Union[str, Path] - .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding - radius : int = DEFAULT_CLUSTER_RADIUS - The radius, in pixels, to perform centroiding - print_details : bool = False - Boolean toggle about whether to print detailed data. - overwrite : bool = True - Boolean toggle about whether to overwrite pre-existing data. - + fpaths : Union[List[str], List[Path]] + List of .tpx3 file paths to process. + num_workers : int, optional + Number of worker processes to use. Defaults to (CPU count - 4) to leave room for other tasks. + trim_mask_fpath : str, optional + Path to the trim correction mask. If None, no correction is applied. + **kwargs : dict + Additional keyword arguments passed to `convert_tpx_file()`. """ - if num_workers is None: - max_workers = multiprocessing.cpu_count() + if len(fpaths) > 0: + if num_workers is None: + max_workers = min(multiprocessing.cpu_count() - 4, len(fpaths)) # Leave 4 cores free + else: + max_workers = min(num_workers, len(fpaths)) # Don't use more workers than files + + # Load the mask once + trim_mask = trim_corr_file(trim_correct) + + # Pass the preloaded mask to all workers + worker_func = partial(convert_tpx_file, trim_correct=trim_mask, **kwargs) + + with multiprocessing.Pool(processes=max_workers) as pool: + results = list(tqdm(pool.imap_unordered(worker_func, fpaths), total=len(fpaths), desc="Processing files")) + + # Count successes + num_true = sum(results) else: - max_workers = num_workers - - with multiprocessing.Pool(processes=max_workers) as pool: - for _ in tqdm(pool.imap_unordered(convert_tpx_file, fpaths), total=len(fpaths)): - pass - + num_true = 0 -def convert_tpx3_files(fpaths: Union[List[str], List[Path]], print_details = True): + print(f"Successfully converted {num_true} out of {len(fpaths)}!") + + +def convert_tpx3_files( + fpaths: Union[List[str], List[Path]], trim_correct: Union[str, Path] = None, print_details: bool = True, **kwargs +): """ - Convert a list .tpx3 files in a single process using convert_tpx_file(). - - TO DO: Accept more arguments for convert_tpx_file. + Convert a list of .tpx3 files in a single process using convert_tpx_file(). Parameters ---------- - fpath : Union[str, Path] - .tpx3 file path - time_window_microsecond : float = DEFAULT_CLUSTER_TW_MICROSECONDS - The time window, in microseconds, to perform centroiding - radius : int = DEFAULT_CLUSTER_RADIUS - The radius, in pixels, to perform centroiding - print_details : bool = False - Boolean toggle about whether to print detailed data. - overwrite : bool = True - Boolean toggle about whether to overwrite pre-existing data. - + fpaths : Union[List[str], List[Path]] + List of .tpx3 file paths to process. + trim_mask_fpath : str, optional + Path to the trim correction mask. If None, no correction is applied. + print_details : bool, optional + Boolean toggle about whether to print detailed data. Default is True. + **kwargs : dict + Additional keyword arguments passed to `convert_tpx_file()`. 
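    Illustrative usage (a sketch; the directory is hypothetical and the trim mask is optional):

        todo = find_unmatched_tpx3_files(["/nsls2/data/chx/legacy/data/2024/02/05"])
        convert_tpx3_files(todo, trim_correct=None, print_details=False)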
""" - for file in fpaths: - convert_tpx_file(file, print_details = print_details) + # Load the mask once (only if provided) + trim_mask = trim_corr_file(trim_correct) + + # Process files sequentially with tqdm progress bar + for file in tqdm(fpaths, desc="Processing files"): + convert_tpx_file(file, trim_correct=trim_mask, print_details=print_details, **kwargs) + - \ No newline at end of file diff --git a/tpx3awkward/examples/example.ipynb b/tpx3awkward/examples/example.ipynb deleted file mode 100644 index 6eeabb1..0000000 --- a/tpx3awkward/examples/example.ipynb +++ /dev/null @@ -1,351 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "76010898-2082-49cb-9bff-8173fd79ad6e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import _utils as tpx\n", - "import pandas as pd\n", - "from tiled.client import from_uri\n", - "db = from_uri('https://tiled.nsls2.bnl.gov', 'dask')['chx']['raw']" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "59bb0e40-1686-413c-8595-c38655e2d0d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "sid = 143200 # example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1b629cf7-1d46-49b9-a41b-43f3b9972f4e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Raw dataframe: \n", - " x y ToT t chip\n", - "0 502 452 475 11301032 1\n", - "1 405 272 275 11302904 1\n", - "2 406 272 350 11302904 1\n", - "3 295 135 525 11303039 0\n", - "4 358 509 300 11303780 1\n", - "... ... ... ... ... ...\n", - "971254 200 46 550 651295354 3\n", - "971255 489 104 450 651296021 0\n", - "971256 389 21 300 651296305 0\n", - "971257 390 21 300 651296306 0\n", - "971258 37 325 75 651299518 2\n", - "\n", - "[971259 rows x 5 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n\n", - "0 11301032 502.000000 452.0 475 475 1\n", - "1 11302904 405.559998 272.0 350 625 2\n", - "2 11303039 295.000000 135.0 525 525 1\n", - "3 11303780 358.478271 509.0 300 575 2\n", - "4 11303973 412.409088 60.0 325 550 2\n", - "... ... ... ... ... ... ..\n", - "659582 651294809 326.000000 379.0 475 475 1\n", - "659583 651295354 200.000000 46.0 550 550 1\n", - "659584 651296021 489.000000 104.0 450 450 1\n", - "659585 651296305 389.500000 21.0 300 600 2\n", - "659586 651299518 37.000000 325.0 75 75 1\n", - "\n", - "[659587 rows x 6 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n x y \\\n", - "0 11301032 504.000000 454.0 475 475 1 504 454 \n", - "1 11302904 407.559998 274.0 350 625 2 408 274 \n", - "2 11303039 297.000000 135.0 525 525 1 297 135 \n", - "3 11303780 360.478271 511.0 300 575 2 360 511 \n", - "4 11303973 414.409088 60.0 325 550 2 414 60 \n", - "... ... ... ... ... ... .. ... ... \n", - "659582 651294809 328.000000 381.0 475 475 1 328 381 \n", - "659583 651295354 200.000000 46.0 550 550 1 200 46 \n", - "659584 651296021 491.000000 104.0 450 450 1 491 104 \n", - "659585 651296305 391.500000 21.0 300 600 2 392 21 \n", - "659586 651299518 37.000000 327.0 75 75 1 37 327 \n", - "\n", - " t_ns \n", - "0 1.765786e+07 \n", - "1 1.766079e+07 \n", - "2 1.766100e+07 \n", - "3 1.766216e+07 \n", - "4 1.766246e+07 \n", - "... ... 
\n", - "659582 1.017648e+09 \n", - "659583 1.017649e+09 \n", - "659584 1.017650e+09 \n", - "659585 1.017650e+09 \n", - "659586 1.017655e+09 \n", - "\n", - "[659587 rows x 9 columns]\n", - "Partition duration: 0.999997634375 sec.\n" - ] - } - ], - "source": [ - "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - "\n", - "file = files[0][5:] # cut off 'file:' from beginning\n", - "\n", - "df = tpx.tpx_to_raw_df(file)\n", - "cdf = tpx.raw_df_to_cent_df(df)\n", - "\n", - "print('Raw dataframe: ')\n", - "print(df)\n", - "\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "cdf = tpx.add_centroid_cols(cdf)\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", - "print(f'Partition duration: {duration} sec.')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "48254ae7-8d0c-4766-b1d1-2b7f4732934c", - "metadata": {}, - "outputs": [], - "source": [ - "sid = 143201 # new example SID (1800 partitions at 1 sec/partition from NSLS-II quantum microscope project)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2adae433-a68c-436e-9640-d4a9745af969", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3, size: 10.5 MB\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 data into dataframe...\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 975938 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5 complete. Beginning clustering...\n", - "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.tpx3 complete. 659537 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5 complete. Moving onto next file.\n", - "-> Processing /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3, size: 10.5 MB\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 data into dataframe...\n", - "Loading /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 972867 events found. Saving to: /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.h5 complete. Beginning clustering...\n", - "Clustering /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001.tpx3 complete. 657491 clusters found. Saving to /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5\n", - "Saving /nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000001_cent.h5 complete. 
Moving onto next file.\n" - ] - } - ], - "source": [ - "files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - "files = [f.replace('file:', '') for f in files]\n", - "tpx.convert_tpx3_files(files[0:2])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "05332e5b-17eb-4074-ad92-51ef1273e9f4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Raw dataframe: \n", - " x y ToT t chip\n", - "0 210 132 575 11294477 3\n", - "1 106 227 375 11294540 3\n", - "2 107 227 150 11294544 3\n", - "3 381 346 475 11294882 1\n", - "4 268 444 450 11295743 1\n", - "... ... ... ... ... ...\n", - "975933 103 441 600 651291179 2\n", - "975934 193 383 450 651294115 2\n", - "975935 121 280 400 651294141 2\n", - "975936 256 373 400 651294145 1\n", - "975937 255 373 50 651294166 2\n", - "\n", - "[975938 rows x 5 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n\n", - "0 11294477 210.000000 132.0 575 575 1\n", - "1 11294540 106.285713 227.0 375 525 2\n", - "2 11294882 381.000000 346.0 475 475 1\n", - "3 11295743 268.000000 444.0 450 450 1\n", - "4 11298466 420.000000 52.0 450 450 1\n", - "... ... ... ... ... ... ..\n", - "659532 651290601 101.000000 436.0 575 575 1\n", - "659533 651291179 103.000000 441.0 600 600 1\n", - "659534 651294115 193.000000 383.0 450 450 1\n", - "659535 651294141 121.000000 280.0 400 400 1\n", - "659536 651294145 255.888885 373.0 400 450 2\n", - "\n", - "[659537 rows x 6 columns]\n", - "Centroided dataframe: \n", - " t xc yc ToT_max ToT_sum n x y \\\n", - "0 11294477 210.000000 132.0 575 575 1 210 132 \n", - "1 11294540 106.285713 227.0 375 525 2 106 227 \n", - "2 11294882 383.000000 348.0 475 475 1 383 348 \n", - "3 11295743 270.000000 446.0 450 450 1 270 446 \n", - "4 11298466 422.000000 52.0 450 450 1 422 52 \n", - "... ... ... ... ... ... .. ... ... \n", - "659532 651290601 101.000000 438.0 575 575 1 101 438 \n", - "659533 651291179 103.000000 443.0 600 600 1 103 443 \n", - "659534 651294115 193.000000 385.0 450 450 1 193 385 \n", - "659535 651294141 121.000000 282.0 400 400 1 121 282 \n", - "659536 651294145 257.888885 375.0 400 450 2 258 375 \n", - "\n", - " t_ns \n", - "0 1.764762e+07 \n", - "1 1.764772e+07 \n", - "2 1.764825e+07 \n", - "3 1.764960e+07 \n", - "4 1.765385e+07 \n", - "... ... 
\n", - "659532 1.017642e+09 \n", - "659533 1.017642e+09 \n", - "659534 1.017647e+09 \n", - "659535 1.017647e+09 \n", - "659536 1.017647e+09 \n", - "\n", - "[659537 rows x 9 columns]\n", - "Partition duration: 0.99999948125 sec.\n" - ] - } - ], - "source": [ - "df = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000.h5')\n", - "cdf = pd.read_hdf('/nsls2/data/chx/legacy/data/2024/02/05/33008fb5-d9f5-4f46-8b5d_00000_000000_cent.h5')\n", - "\n", - "print('Raw dataframe: ')\n", - "print(df)\n", - "\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "cdf = tpx.add_centroid_cols(cdf)\n", - "print('Centroided dataframe: ')\n", - "print(cdf)\n", - "\n", - "duration = (cdf.iloc[-1]['t_ns'] - cdf.iloc[0]['t_ns'])/1e9\n", - "print(f'Partition duration: {duration} sec.')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c981be40-e631-4f73-bbab-4fadda8b9285", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1800/1800 [07:51<00:00, 3.82it/s]\n" - ] - } - ], - "source": [ - "try:\n", - " files = db[sid]['primary']['data']['tpx3_files_raw_filepaths'][0].compute()\n", - " files = [f.replace('file:', '') for f in files]\n", - " tpx.convert_tpx3_files_parallel(files)\n", - " \n", - "except Exception as e:\n", - " print(f'Could not finish {sid} due to {e}') " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "946fa74f-8c8c-4f36-bc0c-9f203e023f78", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "All raw .h5 files exist!\n", - "All centroided raw .h5 files exist!\n" - ] - } - ], - "source": [ - "import os\n", - "\n", - "raw_h5_files = [os.path.exists(f.replace('.tpx3', '.h5')) for f in files]\n", - "cent_h5_files = [os.path.exists(f.replace('.tpx3', '_cent.h5')) for f in files]\n", - "\n", - "if all(raw_h5_files):\n", - " print('All raw .h5 files exist!')\n", - "else:\n", - " print('Missing .h5 files!')\n", - " \n", - "if all(cent_h5_files):\n", - " print('All centroided raw .h5 files exist!')\n", - "else:\n", - " print('Missing centroided .h5 files!')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "705c639d-dac9-4517-b5ff-ce44365ed4df", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}
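
With the example notebook removed above, a short usage sketch may help reviewers see how the refactored entry points fit together. This is a minimal sketch and not part of the patch: the data directory and trim-mask path are placeholders, and it assumes `convert_tpx3_files_parallel` remains importable from `tpx3awkward._utils` (the deleted notebook imported that module directly). The `.h5` / `_cent.h5` output naming and the centroid columns (`t`, `xc`, `yc`, `ToT_max`, `ToT_sum`, `n`) follow the behaviour shown in the diff.

# Hypothetical driver script -- paths below are placeholders, not part of the patch.
from pathlib import Path

import pandas as pd

from tpx3awkward._utils import convert_tpx3_files_parallel  # assumed import location

tpx3_files = sorted(Path("/path/to/scan").glob("*.tpx3"))

# Parallel conversion: each worker writes <name>.h5 (raw events) and <name>_cent.h5 (centroids).
convert_tpx3_files_parallel(
    tpx3_files,
    num_workers=8,         # optional; capped at the number of files
    trim_correct=None,     # or a path to a trim mask file to enable the correction
    print_details=False,   # forwarded to convert_tpx_file() via **kwargs
)

# Inspect one centroided output; columns follow the diff: t, xc, yc, ToT_max, ToT_sum, n.
first = tpx3_files[0]
cdf = pd.read_hdf(first.with_name(first.stem + "_cent.h5"))
print(cdf.head())

The sequential variant, `convert_tpx3_files(tpx3_files, trim_correct=None)`, walks the same list in a single process with a tqdm progress bar, which can be handy when debugging one problematic partition.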