diff --git a/dev/poc/dispatch_regression_tests.py b/dev/poc/dispatch_regression_tests.py
new file mode 100644
index 00000000..bfd93984
--- /dev/null
+++ b/dev/poc/dispatch_regression_tests.py
@@ -0,0 +1,68 @@
+from cibuildwheel.oci_container import OCIContainer
+from cibuildwheel.oci_container import OCIPlatform
+import ubelt as ub
+
+python_versions = [
+    '3.9',
+    '3.10',
+    '3.11',
+    '3.12',
+    '3.13',
+]
+
+for python_version in python_versions:
+    current_pyver = 'cp' + python_version.replace('.', '')
+    image = 'python:' + python_version
+    container = OCIContainer(image=image, oci_platform=OCIPlatform.AMD64)
+
+    container.__enter__()  # not using with for ipython dev
+
+    script_path = ub.Path('~/code/line_profiler/dev/poc/regression_test.py').expand()
+    container_script_path = ub.Path('regression_test.py')
+    container.copy_into(script_path, container_script_path)
+
+    container.call(['pip', 'install', 'uv'])
+    container.call(['uv', 'pip', 'install', '--system', 'kwutil', 'ubelt', 'scriptconfig', 'psutil', 'ruamel.yaml', 'py-cpuinfo'])
+
+    line_profiler_versions = [
+        # '4.0.0',
+        '4.1.2',
+        '4.2.0',
+        '5.0.0',
+    ]
+
+    for line_profiler_version in line_profiler_versions:
+        container.call(['uv', 'pip', 'uninstall', '--system', 'line_profiler'])
+        container.call(['uv', 'pip', 'install', '--system', f'line_profiler=={line_profiler_version}'])
+        for _ in range(5):
+            container.call(['python', container_script_path])
+
+    # Test the latest wheels (requires these are built beforehand)
+    local_wheels = ub.Path('/home/joncrall/code/line_profiler/wheelhouse')
+    container.copy_into(local_wheels, ub.Path('wheelhouse'))
+    line_profiler_version = '5.0.1'
+    found = list(local_wheels.glob('*' + current_pyver + '-manylinux*'))
+    assert len(found) == 1
+    wheel_name = found[0].name
+    container.call(['uv', 'pip', 'uninstall', '--system', 'line_profiler'])
+    container.call(['uv', 'pip', 'install', '--system', 'wheelhouse/' + str(wheel_name)])
+    for _ in range(5):
+        container.call(['python', container_script_path])
+
+    container.copy_out(ub.Path('/root/.cache/line_profiler/benchmarks/'), ub.Path('bench_results'))
+
+    container.__exit__(None, None, None)
+
+# FIXME: robustness
+import kwutil
+result_paths = kwutil.util_path.coerce_patterned_paths('bench_results', expected_extension='.yaml')
+import sys, ubelt
+sys.path.append(ubelt.expandpath('~/code/line_profiler/dev/poc'))
+from gather_regression_tests import accumulate_results
+from gather_regression_tests import plot_results
+
+
+df = accumulate_results(result_paths)
+df['context.machine.py_version_simple'] = df['context.machine.py_version'].apply(lambda x: x.split(' ')[0])
+df = df.sort_values('params.line_profiler_version')
+plot_results(df)
diff --git a/dev/poc/gather_regression_tests.py b/dev/poc/gather_regression_tests.py
new file mode 100644
index 00000000..26aadf5e
--- /dev/null
+++ b/dev/poc/gather_regression_tests.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+import scriptconfig as scfg
+import kwutil
+import ubelt as ub
+
+
+def nested_to_dotdict(data):
+    """
+    Construct this flat representation from a nested one
+
+    Args:
+        data (Dict):
+            nested data
+
+    Example:
+        >>> data = {
+        >>>     'type': 'process',
+        >>>     'properties': {
+        >>>         'machine': {
+        >>>             'os_name': 'Linux',
+        >>>             'arch': 'x86_64',
+        >>>             'py_impl': 'CPython',
+        >>>         }}}
+        >>> flat = nested_to_dotdict(data['properties'])
+        >>> print(f'flat = {ub.urepr(flat, nl=2)}')
+        flat = {
+            'machine.os_name': 'Linux',
+            'machine.arch': 'x86_64',
+            'machine.py_impl': 'CPython',
+        }
+    """
+    flat = dict()
+    walker = ub.IndexableWalker(data, list_cls=tuple())
+    for path, value in walker:
+        if not isinstance(value, dict):
+            spath = list(map(str, path))
+            key = '.'.join(spath)
+            flat[key] = value
+    return flat
+
+
+def insert_prefix(self, prefix, index=0):
+    """
+    Adds a prefix to all items
+
+    Args:
+        prefix (str): prefix to insert
+        index (int): the depth to insert the new param
+
+    Example:
+        >>> self = dict({
+        >>>     'proc1.param1': 1,
+        >>>     'proc2.param1': 3,
+        >>>     'proc4.part2.param2': 10,
+        >>> })
+        >>> new = insert_prefix(self, 'foo', index=1)
+        >>> print('new = {}'.format(ub.urepr(new, nl=1)))
+        new = {
+            'foo.proc1.param1': 1,
+            'foo.proc2.param1': 3,
+            'foo.proc4.part2.param2': 10,
+        }
+    """
+    def _generate_new_items():
+        sep = '.'
+        for k, v in self.items():
+            path = k.split(sep)
+            path.insert(index, prefix)
+            k2 = sep.join(path)
+            yield k2, v
+    new = self.__class__(_generate_new_items())
+    return new
+
+
+class GatherRegressionTestsCLI(scfg.DataConfig):
+    """
+    """
+    paths = scfg.Value(None, help='output results from profiling codes')
+
+    @classmethod
+    def main(cls, argv=1, **kwargs):
+        """
+        Example:
+            >>> # xdoctest: +SKIP
+            >>> from gather_regression_tests import *  # NOQA
+            >>> argv = 0
+            >>> kwargs = dict()
+            >>> cls = GatherRegressionTestsCLI
+            >>> config = cls(**kwargs)
+            >>> cls.main(argv=argv, **config)
+        """
+        config = cls.cli(argv=argv, data=kwargs, strict=True, verbose='auto')
+        result_paths = kwutil.util_path.coerce_patterned_paths(config.paths, expected_extension='.yaml')
+
+        df = accumulate_results(result_paths)
+        plot_results(df)
+
+
+def plot_results(df):
+    import kwplot
+    sns = kwplot.autosns()
+    figman = kwplot.FigureManager(dpath='.')
+    fig = figman.figure()
+    fig.clf()
+    ax = fig.gca()
+    sns.lineplot(data=df, x='params.line_profiler_version', y='record.duration', style='record.line_profiler.enabled', ax=ax)
+    figman.finalize('regression_plot.png')
+
+    fig.clf()
+    ax = fig.gca()
+    sns.lineplot(data=df, x='params.line_profiler_version', y='record.duration', style='record.line_profiler.enabled', ax=ax, hue='context.machine.py_version_simple')
+    figman.finalize('regression_plot_pyversions.png')
+
+
+def accumulate_results(result_paths):
+    import pandas as pd
+    records_accum = []
+    for fpath in result_paths:
+        results = kwutil.Yaml.coerce(fpath)
+        records = results['records']
+        context = results['context']['properties']
+        _flat_context = nested_to_dotdict(context)
+        flat_context = insert_prefix(_flat_context, 'context')
+
+        flat_params = insert_prefix(nested_to_dotdict(
+            results['params']), 'params')
+        for record in records:
+            _flat_record = nested_to_dotdict(record)
+            flat_record = insert_prefix(_flat_record, 'record')
+            flat_record.update(flat_context)
+            flat_record.update(flat_params)
+            records_accum.append(flat_record)
+    df = pd.DataFrame(records_accum)
+    df['context.machine.py_version_simple'] = df['context.machine.py_version'].apply(lambda x: x.split(' ')[0])
+    return df
+
+__cli__ = GatherRegressionTestsCLI
+
+if __name__ == '__main__':
+    """
+
+    CommandLine:
+        python ~/code/line_profiler/dev/poc/gather_regression_tests.py
+        python -m gather_regression_tests
+    """
+    __cli__.main()
diff --git a/dev/poc/regression_test.py b/dev/poc/regression_test.py
new file mode 100644
index 00000000..6bbb7106
--- /dev/null
+++ b/dev/poc/regression_test.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+import ubelt as ub
+import sys
+import scriptconfig as scfg
+
+
+# TODO: more than one type of benchmark would be a good idea
+
+BENCHMARK_CODE = ub.codeblock(
+    '''
+    if 'profile' not in globals():
+        try:
+            from line_profiler import profile
+        except ImportError:
+            profile = lambda x: x
+
+    @profile
+    def main(profiled=False):
+        import time
+        start = time.perf_counter()
+        for x in range(2000000): #2000000
+            y = x
+        elapsed_ms = round((time.perf_counter()-start)*1000, 2)
+        if profiled:
+            print(f"Time Elapsed profiling: {elapsed_ms}ms")
+        else:
+            print(f"Time Elapsed without profiling: {elapsed_ms}ms")
+        return elapsed_ms
+
+    if __name__ == '__main__':
+        main()
+    ''')
+
+
+class RegressionTestCLI(scfg.DataConfig):
+    out_fpath = scfg.Value('auto', help='place to write results')
+
+    @classmethod
+    def main(cls, argv=1, **kwargs):
+        """
+        Example:
+            >>> # xdoctest: +SKIP
+            >>> from regression_test import *  # NOQA
+            >>> argv = 0
+            >>> kwargs = dict()
+            >>> cls = RegressionTestCLI
+            >>> config = cls(**kwargs)
+            >>> cls.main(argv=argv, **config)
+        """
+        config = cls.cli(argv=argv, data=kwargs, strict=True, verbose='auto')
+
+        import kwutil
+        import line_profiler
+        context = kwutil.ProcessContext().start()
+
+        dpath = ub.Path.appdir('line_profiler/benchmarks').ensuredir()
+
+        code = BENCHMARK_CODE
+        code_id = ub.hash_data(code)[0:16]
+        fpath = dpath / f'{code_id}.py'
+        fpath.write_text(code)
+
+        # TODO: can we tag the hash of the wheel we used to install?
+        # We need to be able to differentiate dev versions.
+
+        results = {
+            'context': context.obj,
+            'params': {
+                'line_profiler_version': line_profiler.__version__,
+                'code_id': code_id,
+            },
+            'records': [],
+            'line_records': [],
+        }
+
+        # result_path = fpath.augment(stemsuffix='_' + ub.timestamp(), ext='.lprof')
+        result_path = fpath.augment(ext='.py.lprof')
+
+        with ub.Timer(ns=True) as noprof_timer:
+            res = ub.cmd([sys.executable, fpath], verbose=3, cwd=fpath.parent)
+            res.check_returncode()
+        results['records'].append({
+            'line_profiler.enabled': False,
+            'duration': noprof_timer.elapsed,
+        })
+
+        with ub.Timer(ns=True) as prof_timer:
+            res = ub.cmd([sys.executable, '-m', 'kernprof', '-lv', fpath], verbose=3, cwd=fpath.parent)
+            res.check_returncode()
+        results['records'].append({
+            'line_profiler.enabled': True,
+            'duration': prof_timer.elapsed,
+        })
+
+        # res2 = ub.cmd([sys.executable, '-m', 'line_profiler', '-rmt', result_path], verbose=3)
+        stats = line_profiler.load_stats(result_path)
+
+        for key, timings in stats.timings.items():
+            _path, _line, _name = key
+            for lineno, hits, time in timings:
+                results['line_records'].append({
+                    'time': time,
+                    'lineno': lineno,
+                })
+
+        context.stop()
+        print(f'results = {ub.urepr(results, nl=2)}')
+
+        out_text = kwutil.Yaml.dumps(results)
+        if config.out_fpath == 'auto':
+            out_fpath = fpath.augment('regression_test_' + ub.timestamp(), ext='.yaml')
+        else:
+            out_fpath = ub.Path(config.out_fpath)
+        out_fpath.write_text(out_text)
+
+__cli__ = RegressionTestCLI
+
+if __name__ == '__main__':
+    """
+
+    CommandLine:
+        python ~/code/line_profiler/dev/poc/regression_test.py
+        python -m regression_test
+    """
+    __cli__.main()
diff --git a/line_profiler/_line_profiler.pyx b/line_profiler/_line_profiler.pyx
index ff806962..3c9f3148 100644
--- a/line_profiler/_line_profiler.pyx
+++ b/line_profiler/_line_profiler.pyx
@@ -18,6 +18,9 @@
 from functools import wraps
 from sys import byteorder
 import sys
 cimport cython
+from cython.operator cimport dereference as deref
+from cpython.object cimport PyObject_Hash
+from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
 from cpython.version cimport PY_VERSION_HEX
 from libc.stdint cimport int64_t
@@ -66,9 +69,26 @@ ctypedef unsigned long long int uint64
 ctypedef long long int int64
 
 cdef extern from "Python_wrapper.h":
+    """
+    #if PY_VERSION_HEX >= 0x030d00a1 && PY_VERSION_HEX < 0x030e00a1
+    /* Unfortunately, _Py_HashBytes() is not exposed in Python
+     * 3.13, so just fall back to the slower Py_ObjectHash()
+     */
+    # define hash_bytecode(bytestring, bytes, len) PyObject_Hash(bytestring)
+    #else
+    # if PY_VERSION_HEX >= 0x030e00a1  // 3.14.0.a1
+    #  define hash_bytes(b, n) Py_HashBuffer(b, n)
+    # else
+    #  include "pyhash.h"
+    #  define hash_bytes(b, n) _Py_HashBytes(b, n)
+    # endif
+    # define hash_bytecode(bytestring, bytes, len) hash_bytes(bytes, len)
+    #endif
+    """
     ctypedef struct PyObject
     ctypedef struct PyCodeObject
     ctypedef struct PyFrameObject
+    ctypedef Py_ssize_t Py_hash_t
     ctypedef long long PY_LONG_LONG
     ctypedef int (*Py_tracefunc)(
         object self, PyFrameObject *py_frame, int what, PyObject *arg)
@@ -92,6 +112,13 @@ cdef extern from "Python_wrapper.h":
     cdef int PyFrame_GetLineNumber(PyFrameObject *frame)
     cdef void Py_XDECREF(PyObject *o)
+
+    # it's actually an unsigned long, but we want to avoid truncation on windows
+    cdef unsigned long long PyThread_get_thread_ident()
+    cdef Py_hash_t hash_bytecode(PyObject *bytestring, const char* bytes,
+                                 Py_ssize_t len)
+
+ctypedef PyCodeObject *PyCodeObjectPtr
 
 cdef extern from "c_trace_callbacks.c":
     # Legacy tracing
     ctypedef unsigned long long Py_uintptr_t
@@ -550,7 +577,7 @@ sys.monitoring.html#monitoring-event-RERAISE
     """
     cdef TraceCallback *legacy_callback
     cdef _SysMonitoringState mon_state
-    cdef public object active_instances  # type: set[LineProfiler]
+    cdef public set active_instances  # type: set[LineProfiler]
     cdef int _wrap_trace
     cdef int _set_frame_local_trace
     cdef int recursion_guard
@@ -1239,7 +1266,7 @@ datamodel.html#user-defined-functions
 
     property _manager:
         def __get__(self):
-            thread_id = threading.get_ident()
+            thread_id = PyThread_get_thread_ident()
             try:
                 return self._managers[thread_id]
             except KeyError:
@@ -1303,7 +1330,7 @@ datamodel.html#user-defined-functions
                 If no profiling data is available on the current thread.
         """
         try:
-            return (self._c_last_time)[threading.get_ident()]
+            return (self._c_last_time)[PyThread_get_thread_ident()]
         except KeyError as e:
             # We haven't actually profiled anything yet
             raise (KeyError('No profiling data on the current thread '
@@ -1349,7 +1376,7 @@ datamodel.html#user-defined-functions
         return py_last_time
 
     cpdef disable(self):
-        self._c_last_time[threading.get_ident()].clear()
+        self._c_last_time[PyThread_get_thread_ident()].clear()
         self._manager._handle_disable_event(self)
 
     def get_stats(self):
@@ -1388,55 +1415,74 @@ datamodel.html#user-defined-functions
 @cython.boundscheck(False)
 @cython.wraparound(False)
 cdef inline inner_trace_callback(
-        int is_line_event, object instances, object code, int lineno):
+        int is_line_event, set instances, object code, int lineno):
     """
     The basic building block for the trace callbacks.
     """
-    cdef object prof_
-    cdef object bytecode = code.co_code
+    cdef LineProfiler prof_
     cdef LineProfiler prof
     cdef LastTime old
     cdef int key
-    cdef PY_LONG_LONG time
-    cdef int has_time = 0
+    cdef PY_LONG_LONG time = 0
+    cdef bint has_time = False
     cdef int64 code_hash
-    cdef int64 block_hash
-    cdef unordered_map[int64, LineTime] line_entries
-
-    if any(bytecode):
-        block_hash = hash(bytecode)
-    else:  # Cython functions have empty/zero bytecodes
+    cdef object py_bytes_obj = code.co_code
+    cdef char* data = PyBytes_AS_STRING(py_bytes_obj)
+    cdef Py_ssize_t size = PyBytes_GET_SIZE(py_bytes_obj)
+    cdef unsigned long ident
+    cdef Py_hash_t block_hash
+    cdef LineTime *entry
+    cdef unordered_map[int64, LineTime] *line_entries
+    cdef unordered_map[int64, LastTime] *last_map
+
+    # Cython functions have empty/zero bytecodes
+    # FIXME: `hash_bytecode()` is supposed to switch to
+    # `Py_HashBuffer()` or `_Py_HashBytes()` where available, but
+    # in tests it only seems to add overhead...
+    for i in range(size):
+        if data[i]:
+            # block_hash = hash_bytecode(py_bytes_obj, data, size)
+            block_hash = hash(py_bytes_obj)
+            break
+    else:  # fallback for Cython functions
         block_hash = hash(code)
+
     code_hash = compute_line_hash(block_hash, lineno)
     for prof_ in instances:
+        # for some reason, doing this is much faster than just combining it into the above
+        # like doing "for prof in instances:" is far slower
        prof = prof_
         if not prof._c_code_map.count(code_hash):
             continue
         if not has_time:
             time = hpTimer()
-            has_time = 1
-        ident = threading.get_ident()
-        if prof._c_last_time[ident].count(block_hash):
-            old = prof._c_last_time[ident][block_hash]
-            line_entries = prof._c_code_map[code_hash]
+            has_time = True
+        ident = PyThread_get_thread_ident()
+        last_map = &(prof._c_last_time[ident])
+        if deref(last_map).count(block_hash):
+            old = deref(last_map)[block_hash]
+            line_entries = &(prof._c_code_map[code_hash])
             key = old.f_lineno
-            if not line_entries.count(key):
-                prof._c_code_map[code_hash][key] = LineTime(
-                    code_hash, key, 0, 0)
-            prof._c_code_map[code_hash][key].nhits += 1
-            prof._c_code_map[code_hash][key].total_time += time - old.time
+            if not deref(line_entries).count(key):
+                deref(line_entries)[key] = LineTime(code_hash, key, 0, 0)
+            entry = &(deref(line_entries)[key])
+            # Note: explicitly `deref()`-ing here causes the new values
+            # to be assigned to a temp var;
+            # meanwhile, directly dot-accessing a pointer causes Cython
+            # to correctly write `ptr->attr = (ptr->attr + incr)`
+            entry.nhits += 1
+            entry.total_time += time - old.time
         if is_line_event:
             # Get the time again. This way, we don't record much time
            # wasted in this function.
-            prof._c_last_time[ident][block_hash] = LastTime(lineno, hpTimer())
-        elif prof._c_last_time[ident].count(block_hash):
+            deref(last_map)[block_hash] = LastTime(lineno, hpTimer())
+        elif deref(last_map).count(block_hash):
             # We are returning from a function, not executing a line.
             # Delete the last_time record. It may have already been
             # deleted if we are profiling a generator that is being
             # pumped past its end.
-            prof._c_last_time[ident].erase(
-                prof._c_last_time[ident].find(block_hash))
+            deref(last_map).erase(deref(last_map).find(block_hash))
 
 
 cdef extern int legacy_trace_callback(