From c5d4bd79fc1997af27a9e4a68c53003fdc9ba85c Mon Sep 17 00:00:00 2001
From: William Grant
Date: Fri, 17 Feb 2023 16:55:57 -0500
Subject: [PATCH] Benchmarking with airspeed velocity.

Adds a benchmarks/ folder with speed and memory tests which we can run
under airspeed velocity (asv) to track performance over time.
---
 benchmarks/.gitignore                         |   1 +
 benchmarks/asv.conf.json                      |  19 ++
 benchmarks/benchmarks/__init__.py             |   0
 benchmarks/benchmarks/cached_speed.py         |  49 ++++
 benchmarks/benchmarks/memory.py               |  39 ++++
 .../benchmarks/uncached_absolute_speed.py     | 215 ++++++++++++++++++
 .../benchmarks/uncached_relative_speed.py     | 132 +++++++++++
 7 files changed, 455 insertions(+)
 create mode 100644 benchmarks/.gitignore
 create mode 100644 benchmarks/asv.conf.json
 create mode 100644 benchmarks/benchmarks/__init__.py
 create mode 100644 benchmarks/benchmarks/cached_speed.py
 create mode 100644 benchmarks/benchmarks/memory.py
 create mode 100644 benchmarks/benchmarks/uncached_absolute_speed.py
 create mode 100644 benchmarks/benchmarks/uncached_relative_speed.py

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 000000000..80922a4dd
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1 @@
+.asv/
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 000000000..fa673cd07
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,19 @@
+{
+    "version": 1,
+    "project": "typed_python",
+    "project_url": "typed-python.readthedocs.io",
+    "repo": "..",
+    "branches": ["dev"],
+    "dvcs": "git",
+    "show_commit_url": "https://github.com/APrioriInvestments/typed_python/commit/",
+    "benchmark_dir": "benchmarks",
+    "build_cache_size": 8,
+    "build_command": ["pip install numpy",
+                      "python setup.py build",
+                      "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    ],
+    "environment_type": "conda",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html"
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 000000000..e69de29bb
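The config above tells asv to build the repo (repo: "..") into conda environments, discover suites under benchmark_dir, and keep results and HTML inside .asv/. The usual workflow, run from the benchmarks/ directory with asv installed (e.g. pip install asv), is standard asv usage rather than anything this patch adds:

    asv run        # build the project and benchmark the configured branches ("dev")
    asv publish    # render accumulated results into static HTML in .asv/html
    asv preview    # serve the generated report locally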
diff --git a/benchmarks/benchmarks/cached_speed.py b/benchmarks/benchmarks/cached_speed.py
new file mode 100644
index 000000000..99accdeb7
--- /dev/null
+++ b/benchmarks/benchmarks/cached_speed.py
@@ -0,0 +1,49 @@
+"""
+Benchmarks:
+- are organised into suites (one class per suite)
+- can have setup and teardown methods
+- have method names starting with time_, mem_, track_, or peakmem_.
+
+The timing tests fall into four classes along two axes:
+- Cached/Uncached - i.e. is the compiler cache turned on.
+- Absolute/Relative - i.e. do we measure the absolute time taken (with time_*)
+  or the relative time taken (with track_*).
+
+TODO:
+    - Extend the suite to better track real usage
+    - Add tests comparing the performance of already-cached code (rather than cold start)
+"""
+import tempfile
+
+from typed_python.test_util import evaluateExprInFreshProcess
+
+
+class TimeSuiteCached:
+    def time_cache_handles_changed_types(self):
+        xmodule1 = "\n".join([
+            "@Entrypoint",
+            "def f(x):",
+            "    return x",
+            "aList=[]",
+            "@Entrypoint",
+            "def g1(x):",
+            "    return len(aList) + f(x)",
+        ])
+
+        xmodule2 = "\n".join([
+            "@Entrypoint",
+            "def f(x):",
+            "    return x",
+            "@Entrypoint",
+            "def g2(x):",
+            "    return f(x)",
+        ])
+
+        VERSION1 = {'x.py': xmodule1}
+        VERSION2 = {'x.py': xmodule2}
+
+        # three fresh processes: populate the cache, swap in a changed module,
+        # then load the original module again through the cache
+        with tempfile.TemporaryDirectory() as compilerCacheDir:
+            assert evaluateExprInFreshProcess(VERSION1, 'x.g1(1)', compilerCacheDir) == 1
+            assert evaluateExprInFreshProcess(VERSION2, 'x.f(1)', compilerCacheDir) == 1
+            assert evaluateExprInFreshProcess(VERSION1, 'x.g1(1)', compilerCacheDir) == 1
diff --git a/benchmarks/benchmarks/memory.py b/benchmarks/benchmarks/memory.py
new file mode 100644
index 000000000..02153528b
--- /dev/null
+++ b/benchmarks/benchmarks/memory.py
@@ -0,0 +1,39 @@
+"""
+Benchmarks:
+- are organised into suites (one class per suite)
+- can have setup and teardown methods
+- have method names starting with time_, mem_, track_, or peakmem_.
+
+The timing tests fall into four classes along two axes:
+- Cached/Uncached - i.e. is the compiler cache turned on.
+- Absolute/Relative - i.e. do we measure the absolute time taken (with time_*)
+  or the relative time taken (with track_*).
+
+TODO:
+    - Extend the suite to better track real usage
+    - Add tests comparing the performance of already-cached code (rather than cold start)
+"""
+from typed_python import Class, Member, Held, Final
+
+
+@Held
+class H(Class, Final):
+    x = Member(int, nonempty=True)
+    y = Member(float, nonempty=True)
+
+    def f(self):
+        return self.x + self.y
+
+    def addToX(self, y):
+        return self.x + y
+
+    def increment(self):
+        self.x += 1
+        self.y += 1
+
+
+class MemSuite:
+    def peakmem_held_class_on_heap(self):
+        for _ in range(100000):
+            H()
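asv's peakmem_ benchmarks report the peak resident memory of the benchmark process, so MemSuite tracks the high-water mark of allocating 100000 held-class instances. If we later want to sweep the allocation count rather than hard-coding it, asv's params machinery supports that; a minimal sketch (hypothetical, not part of this patch; reuses H from memory.py above):

    class MemSuiteParameterized:
        # asv repeats the benchmark once per value in `params`,
        # passing the value in as an argument
        params = [1000, 100000]
        param_names = ['allocations']

        def peakmem_held_class_on_heap(self, allocations):
            for _ in range(allocations):
                H()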
diff --git a/benchmarks/benchmarks/uncached_absolute_speed.py b/benchmarks/benchmarks/uncached_absolute_speed.py
new file mode 100644
index 000000000..c1f13a1a0
--- /dev/null
+++ b/benchmarks/benchmarks/uncached_absolute_speed.py
@@ -0,0 +1,215 @@
+"""
+Benchmarks:
+- are organised into suites (one class per suite)
+- can have setup and teardown methods
+- have method names starting with time_, mem_, track_, or peakmem_.
+
+The timing tests fall into four classes along two axes:
+- Cached/Uncached - i.e. is the compiler cache turned on.
+- Absolute/Relative - i.e. do we measure the absolute time taken (with time_*)
+  or the relative time taken (with track_*).
+
+TODO:
+    - Extend the suite to better track real usage
+    - Add tests comparing the performance of already-cached code (rather than cold start)
+"""
+import typed_python.compiler.python_ast_analysis as python_ast_analysis
+import typed_python.python_ast as python_ast
+
+from typed_python import Entrypoint, Class, Member, TupleOf, Function, ListOf
+from typed_python.test_util import CodeEvaluator
+
+
+class AClass(Class):
+    x = Member(int)
+    y = Member(float)
+    z = Member(TupleOf(int))
+
+    def f(self) -> float:
+        return self.x + self.y
+
+    def f(self, arg) -> float:  # noqa
+        return self.x + self.y + arg
+
+    def g(self) -> float:
+        return 100
+
+    def add(self, x) -> float:
+        return 100 + x
+
+    def loop(self, count: int) -> float:
+        i = 0
+        res = self.y
+        while i < count:
+            res = res + self.y
+            i = i + 1
+
+        return res
+
+
+class AChildClass(AClass):
+    def g(self) -> float:
+        return 1234
+
+    def add(self, x) -> float:
+        if isinstance(x, int):
+            return 0.2
+
+        return 1234 + x
+
+
+class TimeSuiteAbsoluteUncached:
+    """The typed_python perf tests, recast as asv benchmarks."""
+
+    def time_variables_read(self):
+        count = 200
+        evaluator = CodeEvaluator()
+
+        def makeF(cCount):
+            CODE = (
+                "def f():\n"
+                "    class C:\n"
+            ) + (
+                "        class B:\n"
+                "            pass\n"
+            ) * cCount
+            moduleDict = {}
+            evaluator.evaluateInto(CODE, moduleDict)
+            return moduleDict.get('f')
+
+        pyast = python_ast.convertFunctionToAlgebraicPyAst(makeF(count))
+        python_ast_analysis.computeVariablesReadByClosures(pyast.body)
+
+    def time_bytes_add(self):
+        @Entrypoint
+        def bytesAdd(x: bytes):
+            i = 0
+            res = 0
+            while i < len(x):
+                j = 0
+                while j < len(x):
+                    res = res + x[i] + x[j]
+                    j = j + 1
+                i = i + 1
+            return res
+
+        bytesAdd(b" " * 1)  # once to compile
+        bytesAdd(b" " * 2000)
+
+    def time_bytes_split(self):
+        @Entrypoint
+        def splitAndCount(s: bytes, sep: bytes, times: int):
+            res = 0
+            for i in range(times):
+                res += len(s.split(sep))
+            return res
+
+        splitAndCount(b"a," * 100, b",", 10)  # once to compile
+        splitAndCount(b"a," * 100, b",", 10_000)
+
+    def time_call_method_dispatch(self):
+        @Entrypoint
+        def addCaller(c: AClass, count: int):
+            res = c.add(1) + c.add(2.5)
+
+            for i in range(count - 1):
+                res += c.add(1) + c.add(2.5)
+
+            return res
+
+        c = AClass()
+        c2 = AChildClass()
+
+        addCaller(c, 200 * 1)  # once to compile
+        addCaller(c, 200 * 10000)
+        addCaller(c2, 200 * 10000)
+
+    def time_call_closure(self):
+        ct = 1000000
+        aList1 = ListOf(int)([])
+
+        def makeAppender(l):
+            @Function
+            def append(y):
+                l.append(y)
+            return append
+
+        @Entrypoint
+        def callManyTimes(c1, ct):
+            for i in range(ct):
+                c1(i)
+
+        callManyTimes(makeAppender(aList1), 1)  # once to compile
+        aList1.clear()
+
+        callManyTimes(makeAppender(aList1), ct)
+
+    def time_assign_functions_with_closure(self):
+        @Entrypoint
+        def callIt(x):
+            y = 10.0
+
+            if x % 2:
+                def f(a):
+                    return a + y + 1.0
+            else:
+                def f(a):
+                    return a + y + 2.0
+
+            res = 0.0
+            for i in range(x):
+                x = x + 1
+                res += f(i)
+
+            return res
+
+        callIt(1)  # once to compile
+        callIt(1000000)
+
+    def time_mtrgs(self):
+        """Time a mutually recursive group of functions through the compiler."""
+        def q(x):
+            return x - 1
+
+        def z(x):
+            return q(x) + 1
+
+        def f(x):
+            return z(g(x - 1)) + z(g(x - 2)) + z(x)
+
+        @Entrypoint
+        def g(x):
+            if x > 0:
+                return z(f(x - 1)) * z(2) + f(x - 2)
+            return 1
+
+        for input in [18, 18.0]:
+            for _ in range(1000):
+                g(input)
+
+    def time_inlining(self):
+        """Absolute timing only; the meaningful comparison is track_inlining_ratio in the relative suite."""
+        def f1(x):
+            return f2(x)
+
+        def f2(x):
+            return f3(x)
+
+        def f3(x):
+            return f4(x)
+
+        def f4(x: int):
+            return x
+
+        @Entrypoint
+        def callsF1(times: int):
+            res = 0.0
+            for i in range(times):
+                res += f1(i)
+            return res
+
+        @Entrypoint
+        def callsF4(times: int):
+            res = 0.0
+            for i in range(times):
+                res += f4(i)
+            return res
+
+        # prime the compilation
+        callsF4(1)
+        callsF1(1)
+
+        callsF1(10000000)
+        callsF4(10000000)
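Note that every time_* method above defines and compiles its @Entrypoint functions inside the timed region, so each measurement includes one compilation plus the "once to compile" priming call. The docstring TODO about already-compiled code could be addressed with asv's setup hook, which runs before each measurement and is excluded from it; a minimal sketch (hypothetical, not part of this patch):

    from typed_python import Entrypoint

    class TimeSuiteHotPath:
        def setup(self):
            # compile outside the timed region so time_* sees only the hot path
            @Entrypoint
            def bytesAdd(x: bytes):
                res = 0
                i = 0
                while i < len(x):
                    res = res + x[i]
                    i = i + 1
                return res

            bytesAdd(b" ")  # prime the compiler
            self.bytesAdd = bytesAdd

        def time_bytes_add_hot(self):
            self.bytesAdd(b" " * 2000)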
diff --git a/benchmarks/benchmarks/uncached_relative_speed.py b/benchmarks/benchmarks/uncached_relative_speed.py
new file mode 100644
index 000000000..a4de19c04
--- /dev/null
+++ b/benchmarks/benchmarks/uncached_relative_speed.py
@@ -0,0 +1,132 @@
+"""
+Benchmarks:
+- are organised into suites (one class per suite)
+- can have setup and teardown methods
+- have method names starting with time_, mem_, track_, or peakmem_.
+
+The timing tests fall into four classes along two axes:
+- Cached/Uncached - i.e. is the compiler cache turned on.
+- Absolute/Relative - i.e. do we measure the absolute time taken (with time_*)
+  or the relative time taken (with track_*).
+
+TODO:
+    - Extend the suite to better track real usage
+    - Add tests comparing the performance of already-cached code (rather than cold start)
+"""
+import threading
+import time
+
+from typed_python import Entrypoint, Dict
+
+
+class TimeSuiteRelativeUncached:
+    unit = 'ratio'
+
+    def track_inlining_ratio(self):
+        def f1(x):
+            return f2(x)
+
+        def f2(x):
+            return f3(x)
+
+        def f3(x):
+            return f4(x)
+
+        def f4(x: int):
+            return x
+
+        @Entrypoint
+        def callsF1(times: int):
+            res = 0.0
+            for i in range(times):
+                res += f1(i)
+            return res
+
+        @Entrypoint
+        def callsF4(times: int):
+            res = 0.0
+            for i in range(times):
+                res += f4(i)
+            return res
+
+        # prime the compilation
+        callsF4(1)
+        callsF1(1)
+
+        t0 = time.time()
+        callsF1(10000000)
+        t1 = time.time()
+        callsF4(10000000)
+        t2 = time.time()
+
+        callsDeeply = t1 - t0
+        callsShallowly = t2 - t1
+
+        return callsDeeply / callsShallowly
+
+    def track_star_kwarg_intermediate_ratio(self):
+        def f(x, y):
+            return x + y
+
+        def g(**kwargs):
+            return f(**kwargs)
+
+        @Entrypoint
+        def sumUsingG(a: int):
+            res = 0.0
+            for i in range(a):
+                res += g(x=2, y=i)
+            return res
+
+        @Entrypoint
+        def sumUsingF(a: int):
+            res = 0.0
+            for i in range(a):
+                res += f(x=2, y=i)
+            return res
+
+        sumUsingF(10)  # once to compile
+        sumUsingG(10)  # once to compile
+
+        t0 = time.time()
+        sumUsingG(1000000)
+        elapsedG = time.time() - t0
+
+        t0 = time.time()
+        sumUsingF(1000000)
+        elapsedF = time.time() - t0
+
+        return elapsedF / elapsedG
+
+    def track_dict_read_write_multicore_ratio(self):
+        @Entrypoint
+        def dict_setmany(d, count, passes):
+            for _ in range(passes):
+                for i in range(count):
+                    if i in d:
+                        d[i] += i
+                    else:
+                        d[i] = i
+
+        # make sure we compile this immediately
+        aDictToForceCompilation = Dict(int, int)()
+        dict_setmany(aDictToForceCompilation, 1, 1)
+
+        # test it with one core
+        t0 = time.time()
+        aDict = Dict(int, int)()
+        dict_setmany(aDict, 10000, 100)
+        t1 = time.time()
+
+        # test it with 2 cores
+        threads = [
+            threading.Thread(target=dict_setmany, args=(Dict(int, int)(), 10000, 100))
+            for _ in range(2)
+        ]
+        t2 = time.time()
+        for t in threads:
+            t.start()
+        for t in threads:
+            t.join()
+        t3 = time.time()
+
+        slowdownRatio = (t3 - t2) / (t1 - t0)
+
+        return slowdownRatio
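All three track_ benchmarks hand-roll the same time-two-callables-and-divide pattern on top of time.time, which is neither monotonic nor the highest-resolution clock Python offers. A shared helper built on time.perf_counter would tighten this up; a hypothetical refactor, not part of this patch:

    import time

    def timedRatio(numerator, denominator):
        # time two zero-argument callables and return the ratio of their elapsed
        # times; perf_counter is monotonic and finer-grained than time.time
        t0 = time.perf_counter()
        numerator()
        t1 = time.perf_counter()
        denominator()
        t2 = time.perf_counter()
        return (t1 - t0) / (t2 - t1)

track_inlining_ratio, for example, would then end with:

    return timedRatio(lambda: callsF1(10000000), lambda: callsF4(10000000))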