diff --git a/.gitignore b/.gitignore
index f1cee27..783f035 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,34 +1,6 @@
-data/crankseg_2.mtx
-data/pwtk.mtx
-data/ship_001.mtx
-data/web-Google.mtx
-data/af_shell9.mtx
-data/Fault_639.mtx
-data/kkt_power.mtx
-data/kron_g500-logn18.mtx
-data/mouse_gene.mtx
-data/nd24k.mtx
-data/TSOPF_FS_b300_c3.mtx
-data/mycielskian16.mtx
-data/12month1.mtx
-data/amazon0601.mtx
-data/bibd_22_8.mtx
-data/cage14.mtx
-data/cant.mtx
-data/cari.mtx
-data/cit-Patents.mtx
-data/CoupCons3D.mtx
-data/dielFilterV2real.mtx
-data/fem_hifreq_circuit.mtx
-data/Hardesty2.mtx
-data/hugetrace-00010.mtx
-data/human_gene2.mtx
-data/IMDB.mtx
-data/ldoor.mtx
-data/mycielskian14.mtx
-data/nemsemm1.mtx
-output3/*
-output/*
+data/**
+output3/**
+output/**
 *.[Oo]
 *.out
 *.exe
@@ -36,4 +8,5 @@ output/*
 nohup.out
 hyte
 scache
-.vscode/
\ No newline at end of file
+.vscode/**
+scripts/__pycache__/**
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8ab4959
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,95 @@
+###################################################
+# Constants
+###################################################
+TARGET := scache
+
+SRC_DIRS := ./src
+BUILD_DIR := ./build
+DEBUG_DIR := $(BUILD_DIR)/debug
+OUTPUT_DIR := ./output
+
+# Find the source files, extract the filenames and map each one to an
+# object in the build dir. Object names are flattened with $(notdir), so
+# two sources sharing a basename in different sub-directories would collide.
+SRC := $(shell find $(SRC_DIRS) -name '*.cpp')
+FILENAMES := $(basename $(notdir $(SRC)))
+OBJS := $(FILENAMES:%=$(BUILD_DIR)/%.o)
+DEBUG_OBJS := $(FILENAMES:%=$(DEBUG_DIR)/%.o)
+
+# C++ compiler; GNU make's built-in default is g++, overridable with `make CXX=...`
+CXX ?= g++
+
+# Compiler flags (release and debug)
+CPPFLAGS := -O3 -Wall -Wextra -Werror
+DEBUG_CPPFLAGS := $(CPPFLAGS) -g
+
+# Have the compiler write a .d file next to every object listing the headers
+# it included, so that editing a header rebuilds the objects that use it.
+DEPFLAGS := -MMD -MP
+
+# Phony targets (do not represent a file)
+.PHONY: all remake debug clean
+
+# Delete a target whose recipe failed so a partial file never looks up to date
+.DELETE_ON_ERROR:
+
+###################################################
+# Targets
+###################################################
+# Default goal: build the optimized binary and copy it to the repo root.
+# "Order only" prereqs make sure the dirs exist; `all` is phony so the
+# copy always runs and ./$(TARGET) is always the most recently requested build.
+all: $(BUILD_DIR)/$(TARGET) | $(BUILD_DIR) $(OUTPUT_DIR)
+	cp $< ./$(TARGET)
+
+# Redundant target for ease of use
+$(TARGET): all
+
+# Directory targets, silent
+$(BUILD_DIR) $(DEBUG_DIR) $(OUTPUT_DIR):
+	@mkdir -p $@
+
+# Link the optimized executable
+$(BUILD_DIR)/$(TARGET): $(OBJS) | $(BUILD_DIR)
+	$(CXX) $(CPPFLAGS) $^ -o $@
+
+# Construct a unique compilation step for each src->object
+# to minimize re-building. Depends on the src file
+# and order-only on the directory.
+#   $(1) = object directory, $(2) = source file, $(3) = compiler flags
+# Written as plain `define NAME` (no trailing `=`): GNU make 3.81 does not
+# accept an assignment operator here and would leave the template undefined.
+define OBJ_COMP_TEMPLATE
+$(1)/$(basename $(notdir $(2))).o: $(2) | $(1)
+	$(CXX) -c $(DEPFLAGS) $(3) $(2) -o $$@
+endef
+
+# Instantiate & evaluate each of the object build steps for f in $(SRC)
+$(foreach f,$(SRC),$(eval $(call OBJ_COMP_TEMPLATE,$(BUILD_DIR),$(f),$(CPPFLAGS))))
+
+#--------------------------------------------------
+# Debug targets
+#--------------------------------------------------
+debug: $(DEBUG_DIR)/$(TARGET) | $(DEBUG_DIR) $(OUTPUT_DIR)
+	cp $< ./$(TARGET)
+
+$(DEBUG_DIR)/$(TARGET): $(DEBUG_OBJS) | $(DEBUG_DIR)
+	$(CXX) $(DEBUG_CPPFLAGS) $^ -o $@
+
+$(foreach f,$(SRC),$(eval $(call OBJ_COMP_TEMPLATE,$(DEBUG_DIR),$(f),$(DEBUG_CPPFLAGS))))
+
+# Pull in the generated header dependencies (absent on the first build)
+-include $(OBJS:.o=.d) $(DEBUG_OBJS:.o=.d)
+
+#--------------------------------------------------
+# Helpful phonies
+#--------------------------------------------------
+# Run clean and all as sequential sub-makes: as plain prerequisites they
+# could run concurrently under `make -j` and clean could delete fresh objects.
+remake:
+	$(MAKE) clean
+	$(MAKE) all
+
+# Removes all build products AND $(OUTPUT_DIR), including any results in it.
+clean:
+	rm -rf $(DEBUG_DIR) $(BUILD_DIR) $(OUTPUT_DIR) $(TARGET)
diff --git a/README.md b/README.md
index a07c08c..9a01c2b 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,19 @@
-# SeaCache-sim
+# UM EECS570 WN26 Project Repository
+This is the repository for our EECS570 project. Our report will be added eventually. This project is forked from SeaCache-sim, and the README below largely follows from theirs.
+
+## SeaCache-sim
 This is the source code repository of the MICRO'25 paper *SeaCache: Efficient and Adaptive Caching for Sparse Accelerators*.
 
-## Build
+### Build
 
 ```bash
-$ g++ -O3 -march=native  src/config.cpp src/data.cpp src/estimation.cpp src/parameters.cpp src/util.cpp src/statistics.cpp src/cache.cpp src/dynamic.cpp src/simulator.cpp src/main.cpp -o scache
+$ make
 ```
 
-## Workload
+### Workload
 The scheduler and simulator accept sparse matrices from MatrixMarket (.mtx). The folder containing these matrices is under `data`.
 
-## Run
+### Run
 The following command simulates multiplication of `matrix1` and `matrix2` with the configuration specified in `config/config.json`:
 ```bash
 $ ./scache matrix1 matrix2 config/config.json
@@ -32,14 +35,21 @@ Here is a sample json configuration:
 - "condensedOP": When set to true, it uses the condensed OP dataflow instead of the default Gustavson's dataflow.
 - "tileDir": Represents the directory containing the tiling selection for each matrix.
 
-## Code description
+
+### scripts
+1. Check that the inputs exist (`scache`, tile files, `.mtx` files).
+2. Generate config JSON files for a sweep.
+3. Run `./scache matrix matrix config.json` for each matrix and config.
+4. Parse the output `.txt` files into one CSV.
+
+### Code description
 
 The code shares the same base simulator as the previous work, [HYTE](https://github.com/tsinghua-ideal/HYTE-sim ""). However, this work shifts the focus from tile selection to cache optimization, with the pre-defined tiling selection located in the "tileDir" directory. The modifications primarily involve various cache schemes and prefetching techniques.
 
 The changes are mainly found in the `cache.cpp` and `simulator.cpp` files. The proposed mapping scheme from Section 4.1 of the paper, along with the baseline mapping schemes, are implemented within different branches of the `cacheAccessFiber()` function in `cache.cpp`. The corresponding replacement policies, as described in Section 4.2, are invoked by the different cache schemes. For the guided replacement policies, the prefetch logic and maintenance of the prefetched metadata are implemented in the `prefetchRow()` function, which is iterated during simulation in `simulator.cpp`. The adaptive prefetch size introduced in Section 4.3 is also implemented in `simulator.cpp` and is called during the calculation process.
 
 
-## Reference
+### Reference
 
 If you use this tool in your research, please kindly cite the following paper.
 
diff --git a/config/config.json b/config/config.json
index 7d2a5b3..3bcd195 100644
--- a/config/config.json
+++ b/config/config.json
@@ -4,8 +4,8 @@
     "memorybandwidth": 68,
     "PEcnt": 32,
     "srambank": 32,
-    "baselinetest": 0,
+    "baselinetest": 1,
     "condensedOP": false,
     "tileDir": "./tiles/",
     "outputDir": "./output/"
-}
\ No newline at end of file
+}
diff --git a/generate_trace.py b/generate_trace.py
new file mode 100644
index 0000000..66d129f
--- /dev/null
+++ b/generate_trace.py
@@ -0,0 +1,191 @@
+from itertools import islice
+import argparse
+import os
+
+import numpy as np
+from scipy.io import mmread
+from scipy.sparse import csr_matrix
+
+
+def load_matrix(mtx_path):
+    """Read a MatrixMarket file and return it as a SciPy CSR matrix.
+
+    ``mmread`` yields a sparse COO matrix for "coordinate" files but a dense
+    ``ndarray`` for "array" files, which has no ``tocsr`` method; building a
+    ``csr_matrix`` from the result accepts both.
+    """
+    print(f"[INFO] loading matrix from: {mtx_path}")
+    A = csr_matrix(mmread(mtx_path))
+    print("[INFO] load done")
+    return A
+
+
+def validate_matrix(A):
+    """Collect shape and per-row non-zero statistics of the CSR matrix ``A``.
+
+    Returns a dict with the keys ``rows``, ``cols``, ``nnz``, ``min_row_nnz``,
+    ``max_row_nnz``, ``avg_row_nnz`` and ``empty_rows``.
+    """
+    rows, cols = A.shape
+    # Consecutive differences of the CSR row pointer are the per-row nnz counts.
+    row_nnz = np.diff(A.indptr)
+    has_rows = len(row_nnz) > 0  # min/max/mean raise or warn on an empty array
+
+    return {
+        "rows": rows,
+        "cols": cols,
+        "nnz": A.nnz,
+        "min_row_nnz": int(row_nnz.min()) if has_rows else 0,
+        "max_row_nnz": int(row_nnz.max()) if has_rows else 0,
+        "avg_row_nnz": float(row_nnz.mean()) if has_rows else 0.0,
+        "empty_rows": int(np.sum(row_nnz == 0)),
+    }
+
+
+def print_stats(stats):
+    """Print the statistics dict produced by ``validate_matrix``."""
+    print("\n[VALIDATE] matrix statistics")
+    print(f"shape: ({stats['rows']}, {stats['cols']})")
+    print(f"nonzeros: {stats['nnz']}")
+    print(f"min row nnz: {stats['min_row_nnz']}")
+    print(f"max row nnz: {stats['max_row_nnz']}")
+    print(f"avg row nnz: {stats['avg_row_nnz']:.4f}")
+    print(f"empty rows: {stats['empty_rows']}")
+
+
+def generate_fiber_trace(A, out_path, include_write=True):
+    """Write a teaching/demo fiber-level trace for the CSR matrix ``A``.
+
+    Emitted records:
+      ROW <i>
+      READ_B_FIBER <k>
+      WRITE_C_ROW <i>
+
+    Meaning: for row i of A with non-zero column indices k1, k2, ..., a
+    Gustavson-style dataflow accesses the matching fibers B[k1], B[k2], ...
+    of B; with ``include_write`` one WRITE_C_ROW record follows each row.
+
+    Returns ``(total_reads, total_writes)``.
+    """
+    total_reads = 0
+    total_writes = 0
+
+    with open(out_path, "w") as f:
+        f.write("# Teaching/demo fiber trace for Gustavson-style sparse processing\n")
+        f.write("# Format:\n")
+        f.write("#   ROW <i>\n")
+        f.write("#   READ_B_FIBER <k>\n")
+        f.write("#   WRITE_C_ROW <i>\n\n")
+
+        for i in range(A.shape[0]):
+            # Column indices of the non-zeros stored in row i.
+            ks = A.indices[A.indptr[i]:A.indptr[i + 1]]
+
+            f.write(f"ROW {i}\n")
+            for k in ks:
+                f.write(f"READ_B_FIBER {int(k)}\n")
+            total_reads += len(ks)
+
+            if include_write:
+                f.write(f"WRITE_C_ROW {i}\n")
+                total_writes += 1
+
+    return total_reads, total_writes
+
+
+def generate_address_trace(A, out_path, elem_bytes=8, include_write=True):
+    """Write a conventional-cache-style ``R/W <hex_addr>`` trace for ``A``.
+
+    B fibers and C rows are mapped to pseudo-addresses: fiber k of B becomes
+    ``base_B + k * elem_bytes`` and row i of C becomes ``base_C + i * elem_bytes``.
+
+    Note: this is only meant for learning and comparison; it is not the
+    official, exact SeaCache address format.
+
+    Returns ``(total_reads, total_writes)``.
+    """
+    # Virtual base addresses that give B fibers and C rows separate regions.
+    base_B = 0x10000000
+    base_C = 0x20000000
+
+    total_reads = 0
+    total_writes = 0
+
+    with open(out_path, "w") as f:
+        f.write("# Teaching/demo address trace\n")
+        f.write("# Format: R/W <hex_addr>\n\n")
+
+        for i in range(A.shape[0]):
+            # Column indices of the non-zeros stored in row i.
+            ks = A.indices[A.indptr[i]:A.indptr[i + 1]]
+
+            for k in ks:
+                f.write(f"R {hex(base_B + int(k) * elem_bytes)}\n")
+            total_reads += len(ks)
+
+            if include_write:
+                f.write(f"W {hex(base_C + int(i) * elem_bytes)}\n")
+                total_writes += 1
+
+    return total_reads, total_writes
+
+
+def main():
+    """Command-line entry point: load a matrix, print its stats, write a trace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--mtx", required=True, help="path to input .mtx file")
+    parser.add_argument("--out", required=True, help="output trace path")
+    parser.add_argument(
+        "--mode",
+        choices=["fiber", "addr"],
+        default="fiber",
+        help="fiber: READ_B_FIBER / WRITE_C_ROW; addr: R/W hex_addr",
+    )
+    parser.add_argument(
+        "--no-write",
+        action="store_true",
+        help="do not emit WRITE_C_ROW / W lines",
+    )
+    parser.add_argument(
+        "--elem-bytes",
+        type=int,
+        default=8,
+        help="element size in bytes for addr mode",
+    )
+    args = parser.parse_args()
+
+    if args.elem_bytes <= 0:
+        # A non-positive stride would map distinct fibers to the same or
+        # descending addresses.
+        parser.error("--elem-bytes must be a positive integer")
+    if not os.path.exists(args.mtx):
+        raise FileNotFoundError(f"input mtx not found: {args.mtx}")
+
+    A = load_matrix(args.mtx)
+
+    stats = validate_matrix(A)
+    print_stats(stats)
+
+    include_write = not args.no_write
+
+    print(f"\n[INFO] generating trace in mode = {args.mode}")
+    if args.mode == "fiber":
+        reads, writes = generate_fiber_trace(A, args.out, include_write=include_write)
+    else:
+        reads, writes = generate_address_trace(
+            A, args.out, elem_bytes=args.elem_bytes, include_write=include_write
+        )
+
+    print(f"[INFO] trace saved to: {args.out}")
+    print(f"[INFO] read events: {reads}")
+    print(f"[INFO] write events: {writes}")
+    print(f"[INFO] total events: {reads + writes}")
+
+    print("\n[INFO] first 10 lines of trace:")
+    with open(args.out, "r") as f:
+        for line in islice(f, 10):
+            print(line.rstrip())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/data_collection.py b/scripts/data_collection.py
new file mode 100755
index 0000000..0d40ac2
--- /dev/null
+++ b/scripts/data_collection.py
@@ -0,0 +1,474 @@
+#!/usr/bin/env python3
+"""Data collection helper for SeaCache experiments.
+
+Tasks:
+1) Validate matrices/tile files exist.
+2) Generate config JSON files for an experiment sweep.
+3) Run scache for each (matrix, config) pair.
+4) Parse simulator output text into a CSV table.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import itertools
+import json
+import re
+import subprocess
+import sys
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+
+# One regex per metric searched for in a simulator result file; the single
+# capture group holds the numeric value. The keys become CSV column names.
+METRIC_PATTERNS = {
+    "total_cycle": re.compile(r"total cycle =\s*(\d+)"),
+    "load_cycle": re.compile(r"load cycle =\s*(\d+)"),
+    "multiply_cycle": re.compile(r"multiply cycle =\s*(\d+)"),
+    "merge_writeback_cycle": re.compile(r"merge and writeback cycle =\s*(\d+)"),
+    "total_sram_cycle": re.compile(r"total SRAM cycle =\s*(\d+)"),
+    "total_dram_cycle": re.compile(r"total DRAM cycle =\s*(\d+)"),
+    "total_pe_cycle": re.compile(r"total PE cycle =\s*(\d+)"),
+    "total_dram_access_a": re.compile(r"total DRAM access A =\s*(\d+)"),
+    "total_dram_access_b": re.compile(r"total DRAM access B =\s*(\d+)"),
+    "total_dram_access_c": re.compile(r"total DRAM access C =\s*(\d+)"),
+    "hitrate": re.compile(r"hitrate =\s*([0-9eE+\-.]+)"),
+}
+
+
+@dataclass(frozen=True)
+class ExperimentConfig:
+    """One simulator configuration of the sweep.
+
+    Frozen (hence hashable) so instances can key the config -> JSON path map.
+    Field names are the lower-cased keys of the simulator's JSON config.
+    """
+
+    transpose: int
+    cachesize: float
+    memorybandwidth: float
+    pecnt: int
+    srambank: int
+    baselinetest: int
+    condensedop: bool
+
+    def json_obj(self, tile_dir: str, output_dir: str) -> Dict[str, object]:
+        """Return the dict that is serialized as this run's JSON config file."""
+        return {
+            "transpose": self.transpose,
+            "cachesize": self.cachesize,
+            "memorybandwidth": self.memorybandwidth,
+            "PEcnt": self.pecnt,
+            "srambank": self.srambank,
+            "baselinetest": self.baselinetest,
+            "condensedOP": self.condensedop,
+            "tileDir": tile_dir,
+            "outputDir": output_dir,
+        }
+
+    def tag(self) -> str:
+        """Return a short identifier built from all fields (the config file stem)."""
+        condensed = 1 if self.condensedop else 0
+        return (
+            f"t{self.transpose}_c{self.cachesize:g}_bw{self.memorybandwidth:g}"
+            f"_pe{self.pecnt}_sb{self.srambank}_b{self.baselinetest}_co{condensed}"
+        )
+
+
+def parse_csv_numbers(raw: str, conv):
+    """Split ``raw`` on commas and apply ``conv`` to every non-blank item."""
+    return [conv(x.strip()) for x in raw.split(",") if x.strip()]
+
+
+def discover_matrices(tile_dir: Path) -> List[str]:
+    """Return the sorted names of the non-hidden files directly inside ``tile_dir``.
+
+    A missing (or non-directory) ``tile_dir`` yields an empty list instead of
+    raising, so the caller can report "no matrices" cleanly.
+    """
+    if not tile_dir.is_dir():
+        return []
+    return sorted(p.name for p in tile_dir.iterdir() if p.is_file() and not p.name.startswith("."))
+
+
+def matrix_file_path(matrix: str, roots: Sequence[Path]) -> Optional[Path]:
+    """Return the first existing ``.mtx`` file for ``matrix`` under ``roots``, else ``None``.
+
+    A root named ``largedata`` is searched as ``<root>/<matrix>/<matrix>.mtx``;
+    every other root as ``<root>/<matrix>.mtx``.
+    """
+    for root in roots:
+        if root.name == "largedata":
+            candidate = root / matrix / f"{matrix}.mtx"
+        else:
+            candidate = root / f"{matrix}.mtx"
+        if candidate.exists():
+            return candidate
+    return None
+
+
+def validate_inputs(
+    matrices: Sequence[str],
+    tile_dir: Path,
+    matrix_roots: Sequence[Path],
+    scache_path: Path,
+) -> Tuple[List[str], List[Tuple[str, str]]]:
+    """Check that the simulator binary and each matrix's input files exist.
+
+    Returns ``(valid_matrices, problems)``; ``problems`` holds ``(item, message)``
+    pairs where ``item`` is ``"scache"`` or a matrix name.
+    """
+    problems: List[Tuple[str, str]] = []
+
+    if not scache_path.exists():
+        problems.append(("scache", f"binary missing at {scache_path}"))
+    elif not scache_path.is_file():
+        problems.append(("scache", f"not a file: {scache_path}"))
+
+    valid_matrices: List[str] = []
+    for matrix in matrices:
+        tile_file = tile_dir / matrix
+        matrix_file = matrix_file_path(matrix, matrix_roots)
+        missing = []
+        if not tile_file.exists():
+            missing.append(f"tile file missing: {tile_file}")
+        if matrix_file is None:
+            looked = ", ".join(str(p) for p in matrix_roots)
+            missing.append(f"matrix .mtx missing in roots: {looked}")
+
+        if missing:
+            problems.append((matrix, "; ".join(missing)))
+        else:
+            valid_matrices.append(matrix)
+
+    return valid_matrices, problems
+
+
+def build_experiment_grid(args: argparse.Namespace) -> List[ExperimentConfig]:
+    """Return the configurations selected by ``args.profile``.
+
+    ``quick`` and ``balanced`` are fixed lists; ``full`` is the cartesian
+    product of the ``--*-values`` options. Raises ``ValueError`` otherwise.
+    """
+    if args.profile == "quick":
+        return [
+            ExperimentConfig(0, 1.0, 34.0, 16, 16, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 32, 32, 0, False),
+            ExperimentConfig(0, 4.0, 136.0, 64, 32, 0, False),
+        ]
+
+    if args.profile == "balanced":
+        return [
+            ExperimentConfig(0, 1.0, 68.0, 32, 32, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 32, 32, 0, False),
+            ExperimentConfig(0, 4.0, 68.0, 32, 32, 0, False),
+            ExperimentConfig(0, 2.0, 34.0, 32, 32, 0, False),
+            ExperimentConfig(0, 2.0, 136.0, 32, 32, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 16, 32, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 64, 32, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 32, 16, 0, False),
+            ExperimentConfig(0, 2.0, 68.0, 32, 64, 0, False),
+        ]
+
+    if args.profile == "full":
+        return [
+            ExperimentConfig(t, c, bw, pe, sb, b, co)
+            for t, c, bw, pe, sb, b, co in itertools.product(
+                parse_csv_numbers(args.transpose_values, int),
+                parse_csv_numbers(args.cachesize_values, float),
+                parse_csv_numbers(args.bandwidth_values, float),
+                parse_csv_numbers(args.pecnt_values, int),
+                parse_csv_numbers(args.srambank_values, int),
+                parse_csv_numbers(args.baseline_values, int),
+                [x != 0 for x in parse_csv_numbers(args.condensed_values, int)],
+            )
+        ]
+
+    raise ValueError(f"Unknown profile: {args.profile}")
+
+
+def write_config_files(
+    configs: Sequence[ExperimentConfig],
+    config_dir: Path,
+    tile_dir_for_json: str,
+    output_dir_for_json: str,
+) -> Dict[ExperimentConfig, Path]:
+    """Write one ``<tag>.json`` per config into ``config_dir``; return config -> path."""
+    config_dir.mkdir(parents=True, exist_ok=True)
+    mapping: Dict[ExperimentConfig, Path] = {}
+    for cfg in configs:
+        path = config_dir / f"{cfg.tag()}.json"
+        with path.open("w", encoding="utf-8") as f:
+            json.dump(cfg.json_obj(tile_dir_for_json, output_dir_for_json), f, indent=2)
+            f.write("\n")
+        mapping[cfg] = path
+    return mapping
+
+
+def expected_output_filename(matrix: str, cfg: ExperimentConfig) -> str:
+    """Build the result-file name the simulator is expected to write for this run.
+
+    NOTE(review): the pattern presumably mirrors the naming code inside scache,
+    which is not visible here; ``run_one`` falls back to an mtime scan if it is off.
+    """
+    prefix = "Base_" if cfg.baselinetest else "SeaCache_"
+    return (
+        f"CGust{prefix}{cfg.cachesize:.6f}MB_{cfg.memorybandwidth:.6f}GBs_"
+        f"{cfg.pecnt}PEs_{cfg.srambank}sbanks__{matrix}_{matrix}_RR_{cfg.transpose}.txt"
+    )
+
+
+def extract_metrics(output_text: str) -> Dict[str, object]:
+    """Extract every ``METRIC_PATTERNS`` value from ``output_text``.
+
+    ``hitrate`` is parsed as ``float``, everything else as ``int``. A metric that
+    is absent, or whose matched text is not a valid number, maps to ``None``.
+    """
+    metrics: Dict[str, object] = {}
+    for key, pattern in METRIC_PATTERNS.items():
+        match = pattern.search(output_text)
+        if not match:
+            metrics[key] = None
+            continue
+        value = match.group(1)
+        try:
+            metrics[key] = float(value) if key == "hitrate" else int(value)
+        except ValueError:
+            # The hitrate pattern also matches fragments such as "." or "-".
+            metrics[key] = None
+    return metrics
+
+
+def run_one(
+    scache_path: Path,
+    matrix: str,
+    cfg: ExperimentConfig,
+    cfg_path: Path,
+    output_dir: Path,
+    timeout_s: int,
+    dry_run: bool,
+    cwd: Optional[Path] = None,
+) -> Tuple[str, int, Optional[Path], str]:
+    """Run the simulator once on ``matrix`` x ``matrix`` and locate its result file.
+
+    Returns ``(status, returncode, output_path, command)``. ``status`` is
+    ``"dry_run"`` (nothing executed), ``"ok"``/``"failed"`` (zero/non-zero exit),
+    ``"timeout"`` (killed after ``timeout_s`` seconds) or ``"error"`` (the binary
+    could not be started); the last two report ``returncode`` -1 and no output
+    path, so one bad run does not abort the sweep.
+
+    ``cwd`` is the working directory for the simulator; ``None`` inherits the
+    caller's.
+    """
+    cmd = [str(scache_path), matrix, matrix, str(cfg_path)]
+    cmd_text = " ".join(cmd)
+
+    if dry_run:
+        return ("dry_run", 0, None, cmd_text)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Snapshot mtimes so that files created or rewritten by this run can be spotted.
+    before_mtime = {p: p.stat().st_mtime for p in output_dir.glob("*.txt") if p.is_file()}
+
+    try:
+        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_s, cwd=cwd)
+    except subprocess.TimeoutExpired:
+        return ("timeout", -1, None, cmd_text)
+    except OSError as exc:
+        print(f"could not start {cmd_text}: {exc}", file=sys.stderr)
+        return ("error", -1, None, cmd_text)
+    status = "ok" if proc.returncode == 0 else "failed"
+
+    expected = output_dir / expected_output_filename(matrix, cfg)
+    if expected.exists():
+        return (status, proc.returncode, expected, cmd_text)
+
+    # Fallback: take the most recently modified .txt that is new or was touched.
+    after = [p for p in output_dir.glob("*.txt") if p.is_file()]
+    changed = []
+    for p in after:
+        old = before_mtime.get(p)
+        new = p.stat().st_mtime
+        if old is None or new > old:
+            changed.append((new, p))
+    changed.sort(key=lambda x: x[0], reverse=True)
+    candidate = changed[0][1] if changed else None
+
+    return (status, proc.returncode, candidate, cmd_text)
+
+
+def write_csv(rows: List[Dict[str, object]], csv_path: Path) -> None:
+    """Write ``rows`` to ``csv_path`` with the first row's keys as the header.
+
+    With no rows an empty file is written.
+    """
+    csv_path.parent.mkdir(parents=True, exist_ok=True)
+    if not rows:
+        csv_path.write_text("", encoding="utf-8")
+        return
+
+    fields = list(rows[0].keys())
+    with csv_path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=fields)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def parse_args(argv: Sequence[str]) -> argparse.Namespace:
+    """Parse the command-line arguments in ``argv`` (without the program name)."""
+    p = argparse.ArgumentParser(description="SeaCache data collection script.")
+    p.add_argument("--repo-root", default=".", help="SeaCache repo root")
+    p.add_argument("--scache", default="./scache", help="Path to scache binary")
+    p.add_argument("--tile-dir", default="./tiles", help="Directory of tile files")
+    p.add_argument("--output-dir", default="./output", help="Directory where scache writes .txt results")
+    p.add_argument("--config-dir", default="./generated_configs", help="Directory to write generated config JSON")
+    p.add_argument("--results-csv", default="./output/collected_results.csv", help="CSV file for extracted metrics")
+    p.add_argument("--matrices", default="", help="Comma-separated matrix names (default: discover from tiles)")
+    p.add_argument("--max-matrices", type=int, default=0, help="Limit number of matrices (0 means no limit)")
+    p.add_argument("--profile", choices=["quick", "balanced", "full"], default="balanced")
+    p.add_argument("--timeout", type=int, default=3600, help="Timeout per run in seconds")
+    p.add_argument("--dry-run", action="store_true", help="Only validate + generate configs, skip simulator runs")
+
+    # Used only when --profile full
+    p.add_argument("--transpose-values", default="0")
+    p.add_argument("--cachesize-values", default="1,2,4")
+    p.add_argument("--bandwidth-values", default="34,68,136")
+    p.add_argument("--pecnt-values", default="16,32,64")
+    p.add_argument("--srambank-values", default="16,32,64")
+    p.add_argument("--baseline-values", default="0")
+    p.add_argument("--condensed-values", default="0")
+
+    return p.parse_args(argv)
+
+
+def main(argv: Sequence[str]) -> int:
+    """Validate inputs, generate configs, run the sweep and write the CSV.
+
+    Returns the process exit code: 0 on completion, 1 when there is nothing
+    runnable (no valid matrix, or no simulator binary outside ``--dry-run``).
+    """
+    args = parse_args(argv)
+
+    repo_root = Path(args.repo_root).resolve()
+    scache_path = (repo_root / args.scache).resolve()
+    tile_dir = (repo_root / args.tile_dir).resolve()
+    output_dir = (repo_root / args.output_dir).resolve()
+    config_dir = (repo_root / args.config_dir).resolve()
+    results_csv = (repo_root / args.results_csv).resolve()
+
+    matrix_roots = [
+        repo_root / "data",
+        repo_root / "largedata",
+        repo_root / "dense",
+        repo_root / "bfs",
+    ]
+
+    if args.matrices.strip():
+        matrices = [m.strip() for m in args.matrices.split(",") if m.strip()]
+    else:
+        if not tile_dir.is_dir():
+            print(f"tile directory not found: {tile_dir}", file=sys.stderr)
+        matrices = discover_matrices(tile_dir)
+
+    if args.max_matrices > 0:
+        matrices = matrices[: args.max_matrices]
+
+    valid_matrices, problems = validate_inputs(matrices, tile_dir, matrix_roots, scache_path)
+
+    print(f"[{datetime.now().isoformat(timespec='seconds')}] Validation summary")
+    print(f"  requested matrices: {len(matrices)}")
+    print(f"  valid matrices:     {len(valid_matrices)}")
+    print(f"  validation issues:  {len(problems)}")
+    for item, problem in problems:
+        print(f"  - {item}: {problem}")
+
+    if not valid_matrices:
+        print("No valid matrices found. Exiting.")
+        return 1
+
+    if not args.dry_run and not scache_path.is_file():
+        # Without this the first subprocess launch would fail for every job.
+        print("scache binary not available; build it or pass --dry-run. Exiting.")
+        return 1
+
+    configs = build_experiment_grid(args)
+    cfg_map = write_config_files(
+        configs,
+        config_dir,
+        tile_dir_for_json=args.tile_dir if args.tile_dir.endswith("/") else args.tile_dir + "/",
+        output_dir_for_json=args.output_dir if args.output_dir.endswith("/") else args.output_dir + "/",
+    )
+
+    print(f"[{datetime.now().isoformat(timespec='seconds')}] Generated {len(configs)} configs in {config_dir}")
+
+    rows: List[Dict[str, object]] = []
+    total_jobs = len(valid_matrices) * len(configs)
+    done_jobs = 0
+
+    for matrix in valid_matrices:
+        mtx_path = matrix_file_path(matrix, matrix_roots)
+        assert mtx_path is not None
+
+        for cfg in configs:
+            done_jobs += 1
+            cfg_path = cfg_map[cfg]
+            print(
+                f"[{done_jobs}/{total_jobs}] matrix={matrix} cfg={cfg.tag()} -> running"
+                if not args.dry_run
+                else f"[{done_jobs}/{total_jobs}] matrix={matrix} cfg={cfg.tag()} -> dry run"
+            )
+
+            status, returncode, out_path, cmd = run_one(
+                scache_path=scache_path,
+                matrix=matrix,
+                cfg=cfg,
+                cfg_path=cfg_path,
+                output_dir=output_dir,
+                timeout_s=args.timeout,
+                dry_run=args.dry_run,
+                # The generated JSON holds tileDir/outputDir as given on the command
+                # line; run from repo_root so relative ones resolve as validated above.
+                cwd=repo_root,
+            )
+
+            row: Dict[str, object] = {
+                "matrix": matrix,
+                "matrix_file": str(mtx_path),
+                "config_file": str(cfg_path),
+                "status": status,
+                "returncode": returncode,
+                "command": cmd,
+                "output_file": str(out_path) if out_path else "",
+                "transpose": cfg.transpose,
+                "cachesize": cfg.cachesize,
+                "memorybandwidth": cfg.memorybandwidth,
+                "PEcnt": cfg.pecnt,
+                "srambank": cfg.srambank,
+                "baselinetest": cfg.baselinetest,
+                "condensedOP": int(cfg.condensedop),
+            }
+
+            if out_path and out_path.exists() and not args.dry_run:
+                text = out_path.read_text(encoding="utf-8", errors="ignore")
+                row.update(extract_metrics(text))
+            else:
+                for key in METRIC_PATTERNS:
+                    row[key] = None
+
+            rows.append(row)
+
+    write_csv(rows, results_csv)
+    print(f"[{datetime.now().isoformat(timespec='seconds')}] Wrote {len(rows)} rows to {results_csv}")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main(sys.argv[1:]))
+    except subprocess.TimeoutExpired as exc:
+        print(f"Timeout while running command: {exc.cmd}", file=sys.stderr)
+        raise SystemExit(2)
diff --git a/src/cache.cpp b/src/cache.cpp
index e9721b7..f8f803b 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -19,20 +19,22 @@ int inputcachesize;
 int SET = cachesize / (CACHEBLOCK * SETASSOC);
 int SETLOG = getlog(SET);
 
+// Params: cachesize, cacheblock
 void setSET() {
   SET = (cachesize) / (CACHEBLOCK * SETASSOC);
   SETLOG = getlog(SET);
+  initialize_cache();
 }
 
-bool Valid[MAXSET][SETASSOC];
-int Tag[MAXSET][SETASSOC];
-int lrubit[MAXSET][SETASSOC];
+bool *Valid = nullptr;
+int *Tag = nullptr;
+int *lrubit = nullptr;
 
-int lfubit[MAXSET][SETASSOC];
+int *lfubit = nullptr;
 
-bool virtualValid[MAXSET][VIRTUALSETASSOC];
-int virtualTag[MAXSET][VIRTUALSETASSOC];
-int virtuallfubit[MAXSET][VIRTUALSETASSOC];
+bool *virtualValid = nullptr;
+int *virtualTag = nullptr;
+int *virtuallfubit = nullptr;
 
 bool Valid4[256];
 int Tag4[256];
@@ -40,19 +42,19 @@ int lrubit4[256];
 
 int LFUbit = 4;
 int LFUmax = (1 << LFUbit) - 1;
-int LFUtag[MAXN];
+int *LFUtag = nullptr;
 
 // split into 4 parts.  witin 16: 0000, 0001, 0010,,,,  1111
-short partialValid[MAXSET][SETASSOC];
+// short *partialValid = nullptr;
 
 // for the pack&split
 const int N_TAG_L_BITS = 0; // Tag-L bits
 
-unsigned char Cnt[MAXSET][SETASSOC];
-bool Next[MAXSET][SETASSOC];
-unsigned short PosOrig[MAXSET][SETASSOC];
+unsigned char *Cnt = nullptr;
+bool *Next = nullptr;
+unsigned short *PosOrig = nullptr;
 
-unsigned short vPosOrig[MAXSET][VIRTUALSETASSOC];
+unsigned short *vPosOrig = nullptr;
 
 // use to record the lru.
 // higher is better (accessed recently)
@@ -124,44 +126,44 @@ unsigned short getOrig(long long addr) {
 // = 1 when use virtual tag
 bool useVirtualTag = 1;
 
-int getLRU(int _set, int _index) { return lrubit[_set][_index]; }
-int getlfubit(int _set, int _index) { return lfubit[_set][_index]; }
+int getLRU(int _set, int _index) { return lrubit[_set * SETASSOC + _index]; }
+int getlfubit(int _set, int _index) { return lfubit[_set * SETASSOC + _index]; }
 
 void updateLRU(int _set, int _index) {
   cachecycle++;
-  lrubit[_set][_index] = cachecycle;
+  lrubit[_set * SETASSOC + _index] = cachecycle;
 }
 
 void updateLRUOPT(int _set, int _index, int nextpos) {
-  lrubit[_set][_index] = nextpos;
+  lrubit[_set * SETASSOC + _index] = nextpos;
 }
 
 void updateLRUOPTLFU(int _set, int _index, int lfutime) {
-  lrubit[_set][_index] = lfutime;
+  lrubit[_set * SETASSOC + _index] = lfutime;
 }
 
 void updatePracticalLFU(int _set, int _index) {
-  if (lfubit[_set][_index]) {
-    lfubit[_set][_index]--;
+  if (lfubit[_set * SETASSOC + _index]) {
+    lfubit[_set * SETASSOC + _index]--;
   }
 }
 
 void initLRU(int _set, int _index) {
   // play the same as updateLRU in LRU policy
   cachecycle++;
-  lrubit[_set][_index] = cachecycle;
+  lrubit[_set * SETASSOC + _index] = cachecycle;
 }
 
 void initLRUOPT(int _set, int _index, int nextpos) {
-  lrubit[_set][_index] = nextpos;
+  lrubit[_set * SETASSOC + _index] = nextpos;
 }
 
 void initLRUOPTLFU(int _set, int _index, int LFUtime) {
-  lrubit[_set][_index] = LFUtime;
+  lrubit[_set * SETASSOC + _index] = LFUtime;
 }
 
 void initPracticalLFU(int _set, int _index, int LFUtime) {
-  lfubit[_set][_index] = LFUtime;
+  lfubit[_set * SETASSOC + _index] = LFUtime;
 }
 
 bool cacheHit(long long addr) {
@@ -169,7 +171,7 @@ bool cacheHit(long long addr) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (Valid[_set][i] && (Tag[_set][i] == _tag)) {
+    if (Valid[_set * SETASSOC + i] && (Tag[_set * SETASSOC + i] == _tag)) {
       // hit !!
 
       // update lru bit
@@ -188,7 +190,7 @@ bool cacheHitOPT(long long addr, int nextpos) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (Valid[_set][i] && (Tag[_set][i] == _tag)) {
+    if (Valid[_set * SETASSOC + i] && (Tag[_set * SETASSOC + i] == _tag)) {
       // hit !!
       updateLRUOPT(_set, i, nextpos);
       return 1;
@@ -204,7 +206,7 @@ bool cacheHitOPTLFU(long long addr, int lfutime) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (Valid[_set][i] && (Tag[_set][i] == _tag)) {
+    if (Valid[_set * SETASSOC + i] && (Tag[_set * SETASSOC + i] == _tag)) {
       // hit !!
       updateLRUOPTLFU(_set, i, lfutime);
       return 1;
@@ -219,19 +221,19 @@ bool cacheHitPracticalLFU(long long addr, bool isfirst, long long firstaddr) {
   int _tag = getTag2(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (Valid[_set][i]) {
+    if (Valid[_set * SETASSOC + i]) {
       // fuzzy compare
-      if ((Tag[_set][i] <= _tag) && (_tag < Tag[_set][i] + Cnt[_set][i] + 1)) {
+      if ((Tag[_set * SETASSOC + i] <= _tag) && (_tag < Tag[_set * SETASSOC + i] + Cnt[_set * SETASSOC + i] + 1)) {
 
         if (!isfirst) {
           // need to check orig
-          if (PosOrig[_set][i] != getOrig(firstaddr)) {
+          if (PosOrig[_set * SETASSOC + i] != getOrig(firstaddr)) {
             // not the same orig
             continue;
           }
         } else {
           // first
-          if (PosOrig[_set][i] != 0) {
+          if (PosOrig[_set * SETASSOC + i] != 0) {
             continue;
           }
         }
@@ -257,7 +259,7 @@ void cacheReplace(long long addr) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (!Valid[_set][i]) {
+    if (!Valid[_set * SETASSOC + i]) {
       // if has invalid slot, use it
       replacelru = -1;
       replaceindex = i;
@@ -274,8 +276,8 @@ void cacheReplace(long long addr) {
     }
   }
 
-  Valid[_set][replaceindex] = 1;
-  Tag[_set][replaceindex] = _tag;
+  Valid[_set * SETASSOC + replaceindex] = 1;
+  Tag[_set * SETASSOC + replaceindex] = _tag;
 
   initLRU(_set, replaceindex);
 }
@@ -291,7 +293,7 @@ void cacheReplaceOPT(long long addr, int nextpos) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (!Valid[_set][i]) {
+    if (!Valid[_set * SETASSOC + i]) {
       // if has invalid slot, use it
       replacelru = -1;
       replaceindex = i;
@@ -308,8 +310,8 @@ void cacheReplaceOPT(long long addr, int nextpos) {
     }
   }
 
-  Valid[_set][replaceindex] = 1;
-  Tag[_set][replaceindex] = _tag;
+  Valid[_set * SETASSOC + replaceindex] = 1;
+  Tag[_set * SETASSOC + replaceindex] = _tag;
 
   initLRUOPT(_set, replaceindex, nextpos);
 }
@@ -324,7 +326,7 @@ void cacheReplaceOPTLFU(long long addr, int LFUtime) {
   int _tag = getTag(addr);
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (!Valid[_set][i]) {
+    if (!Valid[_set * SETASSOC + i]) {
       // if has invalid slot, use it
       replacelru = -1;
       replaceindex = i;
@@ -341,8 +343,8 @@ void cacheReplaceOPTLFU(long long addr, int LFUtime) {
     }
   }
 
-  Valid[_set][replaceindex] = 1;
-  Tag[_set][replaceindex] = _tag;
+  Valid[_set * SETASSOC + replaceindex] = 1;
+  Tag[_set * SETASSOC + replaceindex] = _tag;
 
   initLRUOPTLFU(_set, replaceindex, LFUtime);
 }
@@ -380,22 +382,22 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
   // cache miss. if use virtual tag, check whether in virtual tag.
   if (useVirtualTag) {
     for (int i = 0; i < VIRTUALSETASSOC; i++) {
-      if (virtualValid[_set][i]) {
-        if (virtualTag[_set][i] == _tag) {
+      if (virtualValid[_set * VIRTUALSETASSOC + i]) {
+        if (virtualTag[_set * VIRTUALSETASSOC + i] == _tag) {
           // in virtual tag, then first update the virtual tag flfu (-1)
           // then check whether in cache has invalid or flfu less than this
           // if has, then put this into cache. if the replaced one is not
           // invalid, then put it into virtual tag.
           invirtualtag = 1;
           virtualindex = i;
-          virtuallfubit[_set][i]--;
+          virtuallfubit[_set * VIRTUALSETASSOC + i]--;
         }
       }
     }
   }
 
   for (int i = 0; i < SETASSOC; i++) {
-    if (!Valid[_set][i]) {
+    if (!Valid[_set * SETASSOC + i]) {
       // if has invalid slot, use it without other considerations
       replacelfu = -1;
       replaceindex = i;
@@ -413,13 +415,13 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
   if (!useVirtualTag) {
     // has invalid slot, fill
     if (replacelfu == -1) {
-      Valid[_set][replaceindex] = 1;
-      Tag[_set][replaceindex] = _tag;
-      Cnt[_set][replaceindex] = fibercnt - 1;
+      Valid[_set * SETASSOC + replaceindex] = 1;
+      Tag[_set * SETASSOC + replaceindex] = _tag;
+      Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
       if (!isfirst) {
-        PosOrig[_set][replaceindex] = getOrig(firstaddr);
+        PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
       } else {
-        PosOrig[_set][replaceindex] = 0;
+        PosOrig[_set * SETASSOC + replaceindex] = 0;
       }
       initPracticalLFU(_set, replaceindex, 0);
       return;
@@ -427,13 +429,13 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
 
     // has 0 slot, replace
     if (replacelfu == 0) {
-      Valid[_set][replaceindex] = 1;
-      Tag[_set][replaceindex] = _tag;
-      Cnt[_set][replaceindex] = fibercnt - 1;
+      Valid[_set * SETASSOC + replaceindex] = 1;
+      Tag[_set * SETASSOC + replaceindex] = _tag;
+      Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
       if (!isfirst) {
-        PosOrig[_set][replaceindex] = getOrig(firstaddr);
+        PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
       } else {
-        PosOrig[_set][replaceindex] = 0;
+        PosOrig[_set * SETASSOC + replaceindex] = 0;
       }
       initPracticalLFU(_set, replaceindex, 0);
       return;
@@ -447,53 +449,53 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
       // has invalid slot, fill, put the virtual tag slot to invalid
       if (replacelfu == -1) {
         // put current slot into cache
-        Valid[_set][replaceindex] = 1;
-        Tag[_set][replaceindex] = _tag;
-        Cnt[_set][replaceindex] = fibercnt - 1;
+        Valid[_set * SETASSOC + replaceindex] = 1;
+        Tag[_set * SETASSOC + replaceindex] = _tag;
+        Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
         if (!isfirst) {
-          PosOrig[_set][replaceindex] = getOrig(firstaddr);
+          PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
         } else {
-          PosOrig[_set][replaceindex] = 0;
+          PosOrig[_set * SETASSOC + replaceindex] = 0;
         }
-        initPracticalLFU(_set, replaceindex, virtuallfubit[_set][virtualindex]);
+        initPracticalLFU(_set, replaceindex, virtuallfubit[_set * VIRTUALSETASSOC + virtualindex]);
 
         // put current virtual tag to invalid
-        virtualValid[_set][virtualindex] = 0;
-        vPosOrig[_set][virtualindex] = 0;
+        virtualValid[_set * VIRTUALSETASSOC + virtualindex] = 0;
+        vPosOrig[_set * SETASSOC + virtualindex] = 0;
         return;
       }
 
       // a slot in cache has lfu less then this in virtual. replace.
-      if (replacelfu < virtuallfubit[_set][virtualindex]) {
+      if (replacelfu < virtuallfubit[_set * VIRTUALSETASSOC + virtualindex]) {
         // update metadata in cache (config to the current access)
-        Valid[_set][replaceindex] = 1;
-        int oldtag = Tag[_set][replaceindex];
-        Tag[_set][replaceindex] = _tag;
-        Cnt[_set][replaceindex] = fibercnt - 1;
-        int oldorig = PosOrig[_set][replaceindex];
+        Valid[_set * SETASSOC + replaceindex] = 1;
+        int oldtag = Tag[_set * SETASSOC + replaceindex];
+        Tag[_set * SETASSOC + replaceindex] = _tag;
+        Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
+        // int oldorig = PosOrig[_set * SETASSOC + replaceindex];
         if (!isfirst) {
-          PosOrig[_set][replaceindex] = getOrig(firstaddr);
+          PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
         } else {
-          PosOrig[_set][replaceindex] = 0;
+          PosOrig[_set * SETASSOC + replaceindex] = 0;
         }
-        initPracticalLFU(_set, replaceindex, virtuallfubit[_set][virtualindex]);
+        initPracticalLFU(_set, replaceindex, virtuallfubit[_set * VIRTUALSETASSOC + virtualindex]);
 
         // update metadata in virtual tag (config to the old slot in cache)
-        virtualValid[_set][virtualindex] = 1;
-        virtualTag[_set][virtualindex] = oldtag;
-        virtuallfubit[_set][virtualindex] = replacelfu;
+        virtualValid[_set * VIRTUALSETASSOC + virtualindex] = 1;
+        virtualTag[_set * VIRTUALSETASSOC + virtualindex] = oldtag;
+        virtuallfubit[_set * VIRTUALSETASSOC + virtualindex] = replacelfu;
       }
     } else { // not in cache; not in virtual tag
 
       // has invalid slot, fill
       if (replacelfu == -1) {
-        Valid[_set][replaceindex] = 1;
-        Tag[_set][replaceindex] = _tag;
-        Cnt[_set][replaceindex] = fibercnt - 1;
+        Valid[_set * SETASSOC + replaceindex] = 1;
+        Tag[_set * SETASSOC + replaceindex] = _tag;
+        Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
         if (!isfirst) {
-          PosOrig[_set][replaceindex] = getOrig(firstaddr);
+          PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
         } else {
-          PosOrig[_set][replaceindex] = 0;
+          PosOrig[_set * SETASSOC + replaceindex] = 0;
         }
         initPracticalLFU(_set, replaceindex, 0);
         return;
@@ -501,13 +503,13 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
 
       // has 0 slot, replace
       if (replacelfu == 0) {
-        Valid[_set][replaceindex] = 1;
-        Tag[_set][replaceindex] = _tag;
-        Cnt[_set][replaceindex] = fibercnt - 1;
+        Valid[_set * SETASSOC + replaceindex] = 1;
+        Tag[_set * SETASSOC + replaceindex] = _tag;
+        Cnt[_set * SETASSOC + replaceindex] = fibercnt - 1;
         if (!isfirst) {
-          PosOrig[_set][replaceindex] = getOrig(firstaddr);
+          PosOrig[_set * SETASSOC + replaceindex] = getOrig(firstaddr);
         } else {
-          PosOrig[_set][replaceindex] = 0;
+          PosOrig[_set * SETASSOC + replaceindex] = 0;
         }
 
         initPracticalLFU(_set, replaceindex, 0);
@@ -518,26 +520,26 @@ void cacheReplacePracticalLFU(long long addr, bool isfirst,
       // first put into invalid slot, if there is no invalid slot, then put into
       // lfu=0 slot, if there is no lfu=0 slot, then do nothing
       for (int i = 0; i < VIRTUALSETASSOC; i++) {
-        if (!virtualValid[_set][i]) {
+        if (!virtualValid[_set * VIRTUALSETASSOC + i]) {
           // has an invalid slot, put here and return (don't need to check other
           // slots)
-          virtualValid[_set][i] = 1;
-          virtualTag[_set][i] = _tag;
-          virtuallfubit[_set][i] = 0;
+          virtualValid[_set * VIRTUALSETASSOC + i] = 1;
+          virtualTag[_set * VIRTUALSETASSOC + i] = _tag;
+          virtuallfubit[_set * VIRTUALSETASSOC + i] = 0;
           return;
         } else {
         }
       }
       for (int i = 0; i < VIRTUALSETASSOC; i++) {
-        if (!virtualValid[_set][i]) {
+        if (!virtualValid[_set * VIRTUALSETASSOC + i]) {
         } else {
           // valid
-          if (virtuallfubit[_set][i] == 0) {
+          if (virtuallfubit[_set * VIRTUALSETASSOC + i] == 0) {
             // if the flfu bit is 0, replace it. (according to lru, the current
             // is better)
-            virtualValid[_set][i] = 1;
-            virtualTag[_set][i] = _tag;
-            virtuallfubit[_set][i] = 0;
+            virtualValid[_set * VIRTUALSETASSOC + i] = 1;
+            virtualTag[_set * VIRTUALSETASSOC + i] = _tag;
+            virtuallfubit[_set * VIRTUALSETASSOC + i] = 0;
 
             return;
           }
@@ -645,8 +647,8 @@ void cacheEvict(long long addr) {
 
     // set the valid to 0
     for (int i = 0; i < SETASSOC; i++) {
-      if (Valid[_set][i] && (Tag[_set][i] == _tag)) {
-        Valid[_set][i] = 0;
+      if (Valid[_set * SETASSOC + i] && (Tag[_set * SETASSOC + i] == _tag)) {
+        Valid[_set * SETASSOC + i] = 0;
       }
     }
   }
@@ -661,7 +663,7 @@ void cacheEvict(long long addr) {
 
 // need a queue for each row to track all the nexposes in the prefetch window.
 // the number track in this queue is the I number of A
-queue<int> nextposvector[MAXN];
+queue<int> *nextposvector = nullptr;
 
 int getNextpos(int rowid, int ii) {
 
@@ -680,7 +682,7 @@ int getNextpos(int rowid, int ii) {
   return ReplaceMAX;
 }
 
-int getLFU(int rowid, int ii) {
+int getLFU(int rowid, int /* ii */) {
   int retlfu = LFUtag[rowid];
   LFUtag[rowid]--;
   return retlfu;
@@ -767,12 +769,12 @@ bool cacheReadPracticalLFU(long long addr, bool isfirst, long long firstaddr) {
 }
 
 void initializeCacheValid() {
-  memset(Valid, 0, sizeof(Valid));
+  memset(Valid, 0, sizeof(*Valid) * SET * SETASSOC);
   if (useVirtualTag) {
-    memset(virtualValid, 0, sizeof(virtualValid));
+    memset(virtualValid, 0, sizeof(*virtualValid) * SET * VIRTUALSETASSOC);
   }
-  memset(PosOrig, 0, sizeof(PosOrig));
-  memset(vPosOrig, 0, sizeof(vPosOrig));
+  memset(PosOrig, 0, sizeof(*PosOrig) * SET * SETASSOC);
+  memset(vPosOrig, 0, sizeof(*vPosOrig) * SET * SETASSOC);
 }
 
 // ii here means the now access position for OPT policy
@@ -780,7 +782,7 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
 
   // fiber + cut + whole
   // only cache the part within a cacheline (x-cache)
-  if (cacheScheme == 0) {
+  if (cacheScheme == CACHE_SCHEME_BASE) {
     // if the whole size exceed the cacheline, then the rest part miss
     long long tmpaddr = getCacheAddr(jj, 0);
 
@@ -811,7 +813,7 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
 
   // fiber + split + whole
   // split to multiple consective cachelines when exceed cacheline size
-  if (cacheScheme == 1) {
+  if (cacheScheme == CACHE_SCHEME_MAPPING) {
     // for each BLOCK segment of the B fiber
 
     // will be set to 1 if any cacheblock is miss
@@ -924,11 +926,11 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
 
   // InnerSP
   // scheme0 + static OPT
-  if (cacheScheme == 11100) {
+  if (cacheScheme == CACHE_SCHEME_INNER_SP) {
     int nextpos = getNextpos(jj, ii);
     // access the head pointer
     computeSramAccess += sramWriteBandwidth(CACHEBLOCK);
-    bool anymiss = 0;
+    // bool anymiss = 0;
 
     long long tmpaddr = getCacheAddr(jj, 0);
     bool tmphit = cacheReadOPT(tmpaddr, nextpos);
@@ -957,11 +959,11 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
 
   // Sparch
   // scheme0 + dynamic OPT
-  if (cacheScheme == 11101) {
+  if (cacheScheme == CACHE_SCHEME_SPARCH) {
     int nextpos = getNextpos(jj, ii);
     // access the head pointer
     computeSramAccess += sramWriteBandwidth(CACHEBLOCK);
-    bool anymiss = 0;
+    // bool anymiss = 0;
 
     long long tmpaddr = getCacheAddr(jj, 0);
     bool tmphit = cacheReadOPT(tmpaddr, nextpos);
@@ -1014,7 +1016,7 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
   // 88 refers to the practical FLFU (enabling 4-bit, virtual tag)  (virtual
   // tag can be configured or not (baseline)) the flu information is no longer
   // kept in the LFUtag, but the extra lfubit
-  if (cacheScheme == 88) {
+  if (cacheScheme == CACHE_SCHEME_FLFU) {
     bool anymiss = 0;
     fibersize = currsizeB[jj] * 3;
     for (int tmpcurr = 0; tmpcurr < fibersize; tmpcurr += CACHEBLOCK) {
@@ -1033,3 +1035,65 @@ __attribute__((noinline)) void cacheAccessFiber(int jj, int fibersize, int ii) {
     }
   }
 }
+
+// (re-)allocate memory dynamically
+int last_cache_set = 0;
+// Allocates zero-initialised cache arrays sized for the current SET; exits with status 1 on bad_alloc.
+void initialize_cache() {
+  // Always free the previous arrays: when SET was unchanged they used to be overwritten and leaked.
+  deinitialize_cache();
+  last_cache_set = SET;
+  try {
+    Valid = new bool[SET * SETASSOC]();
+    Tag = new int[SET * SETASSOC]();
+    lrubit = new int[SET * SETASSOC]();
+    lfubit = new int[SET * SETASSOC]();
+
+    virtualValid = new bool[SET * VIRTUALSETASSOC]();
+    virtualTag = new int[SET * VIRTUALSETASSOC]();
+    virtuallfubit = new int[SET * VIRTUALSETASSOC]();
+
+    PosOrig = new unsigned short[SET * SETASSOC]();
+    vPosOrig = new unsigned short[SET * SETASSOC]();  // NOTE(review): SETASSOC-sized, same as the memset in initializeCacheValid
+
+    Cnt = new unsigned char[SET * SETASSOC]();
+    Next = new bool[SET * SETASSOC]();
+  } catch (const std::bad_alloc &e) {
+    std::cerr << "Error allocating memory for " << e.what() << std::endl;
+    std::exit(1);
+  }
+}
+
+// Frees every array allocated by initialize_cache() and resets the
+// matching pointer to nullptr. Because delete[] on a null pointer is a
+// no-op, no explicit null checks are needed, and calling this more than
+// once (initialize_cache() calls it, and main registers it with atexit)
+// is harmless.
+void deinitialize_cache() {
+    delete[] Valid;
+    Valid = nullptr;
+    delete[] Tag;
+    Tag = nullptr;
+    delete[] lrubit;
+    lrubit = nullptr;
+    delete[] lfubit;
+    lfubit = nullptr;
+
+    delete[] virtualValid;
+    virtualValid = nullptr;
+    delete[] virtualTag;
+    virtualTag = nullptr;
+    delete[] virtuallfubit;
+    virtuallfubit = nullptr;
+
+    delete[] PosOrig;
+    PosOrig = nullptr;
+    delete[] vPosOrig;
+    vPosOrig = nullptr;
+
+    delete[] Cnt;
+    Cnt = nullptr;
+    delete[] Next;
+    Next = nullptr;
+}
+
diff --git a/src/cache.h b/src/cache.h
index 5b05e05..e5d547e 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -8,7 +8,7 @@
 #define VIRTUALSETASSOC 4
 #define VIRTUALSETASSOCLOG 2
 
-#define MAXSET 1000005
+// #define MAXSET 1000005
 
 #define BIAS 23
 
@@ -23,23 +23,23 @@ extern bool useVirtualTag;
 extern int inputcachesize;
 extern long long elements_processed_since_last_adjustment;
 
-extern queue<int> nextposvector[MAXN];
+extern queue<int> *nextposvector;
 
 extern int LFUmax;
-extern int LFUtag[MAXN];
+extern int *LFUtag;
 
-extern bool Valid[MAXSET][SETASSOC];
-extern int Tag[MAXSET][SETASSOC];
-extern int lrubit[MAXSET][SETASSOC];
+extern bool *Valid;
+extern int *Tag;
+extern int *lrubit;
 
-extern int lfubit[MAXSET][SETASSOC];
+extern int *lfubit;
 
-extern bool virtualValid[MAXSET][VIRTUALSETASSOC];
-extern int virtualTag[MAXSET][VIRTUALSETASSOC];
-extern int virtuallfubit[MAXSET][VIRTUALSETASSOC];
+extern bool *virtualValid;
+extern int *virtualTag;
+extern int *virtuallfubit;
 
-extern unsigned short PosOrig[MAXSET][SETASSOC];
-extern unsigned short vPosOrig[MAXSET][VIRTUALSETASSOC];
+extern unsigned short *PosOrig;
+extern unsigned short *vPosOrig;
 
 extern long long prefetch_discards;
 extern long long prefetch_increments;
@@ -86,4 +86,7 @@ unsigned short getOrig(long long addr);
 
 void setSET();
 
-#endif
\ No newline at end of file
+void initialize_cache();
+void deinitialize_cache();
+
+#endif
diff --git a/src/config.h b/src/config.h
index 24429a0..b91b676 100644
--- a/src/config.h
+++ b/src/config.h
@@ -5,6 +5,17 @@ Keep all hardware configurations.
 #ifndef CONFIG_H
 #define CONFIG_H
 
+// NOTE(ejs): Only the cacheScheme codes used/described in main.cpp were turned into enumerators.
+// Some raw, undocumented numeric codes still remain in simulator.cpp. CACHE_SCHEME_BASE is
+// set to a high number so these enumerators (hopefully) do not collide with those raw codes.
+enum cache_scheme {
+    CACHE_SCHEME_BASE=1000000,  // formerly magic 0
+    CACHE_SCHEME_MAPPING,       // formerly magic 1
+    CACHE_SCHEME_FLFU,          // formerly magic 88
+    CACHE_SCHEME_INNER_SP,      // formerly magic 11100
+    CACHE_SCHEME_SPARCH         // formerly magic 11101
+};
+
 // global bandwdith & SRAM configuration
 extern double HBMbandwidth;
 extern int PEcnt, mergecnt;
diff --git a/src/data.cpp b/src/data.cpp
index 5dec8c4..7f98614 100644
--- a/src/data.cpp
+++ b/src/data.cpp
@@ -1,23 +1,85 @@
 #include "data.h"
 #include <fstream>
 #include <sstream>
+#include <iostream>
 #include <string>
 
-int N, M, nzA, nzB;
+#include "headers.h"
 
-std::vector<int> A[MAXN], Ac[MAXN], B[MAXN], Bc[MAXN];
+int M;  // num rows in A
+int N;  // num cols in A (and num rows in B)
+int nzA;
+int nzB;
 
-std::vector<int> sparchA[MAXN], sparchAi[MAXN];
+// Let (r,c) be a valid non-zero entry in A. Then:
+std::vector<int>
+    *A  = nullptr,  // A[r]  = c
+    *Ac = nullptr,  // Ac[c] = r
+    *B  = nullptr,
+    *Bc = nullptr;
 
-int offsetarrayA[MAXN], offsetarrayAc[MAXN];
-int offsetarrayB[MAXN], offsetarrayBc[MAXN];
+std::vector<int> *sparchA = nullptr, *sparchAi = nullptr;
 
-int SI, SK;
-std::vector<int> SA[MAXN];
-std::vector<int> SAc[MAXN];
-std::vector<int> SBc[MAXN];
-std::vector<int> SB[MAXN];
-int SAindex[MAXN], SBcindex[MAXN];
+int *offsetarrayA = nullptr, *offsetarrayAc = nullptr;
+int *offsetarrayB = nullptr, *offsetarrayBc = nullptr;
+
+// Let row r be sampled from A, and it is the (r')th sampled row.
+// Then for all indices [(r,c_1), (r,c_2), ..., (r,c_u)] in the row:
+std::vector<int> *SA = nullptr;     //  SA[r']  = c
+std::vector<int> *SAc = nullptr;    //  SAc[c]  = [hash1(r, ...) x u] (list of u copies of same hash)
+
+// Let col c be sampled from B, and it is the (c')th sampled row.
+// Then for all indices [(r_1,c), (r_2,c), ..., (r_v,c)] in the col:
+std::vector<int> *SBc = nullptr;    // SBc[c']  = r
+std::vector<int> *SB = nullptr;     // SB[r]    = [hash2(c, ...) x v] (list of v copies of same hash)
+int *SAindex = nullptr, *SBcindex = nullptr;
 
 extern double ha1, hb1;
 extern double ha2, hb2;
+
+void initialize_data_A() {  // allocates each A-side array that is still null; sizes are read from I and J
+  try {
+    if (!A) A = new std::vector<int>[I]();
+    if (!Ac) Ac = new std::vector<int>[J]();
+    if (!SA) SA = new std::vector<int>[I]();
+    if (!SAc) SAc = new std::vector<int>[J]();
+    if (!offsetarrayA) offsetarrayA = new int[I]();
+    if (!offsetarrayAc) offsetarrayAc = new int[J]();
+    if (!SAindex) SAindex = new int[I]();
+  } catch (const std::bad_alloc &alloc_error) {
+    std::cerr << "Error allocating memory for " << alloc_error.what() << std::endl;
+    std::exit(1);  // allocation failure is treated as fatal
+  }
+}
+
+void initialize_data_B() {  // allocates each B-side array that is still null; sizes are read from J and K
+  try {
+    if(B == nullptr) B = new std::vector<int>[J]();      // J entries like offsetarrayB (was I, copied from the A variant)
+    if(Bc == nullptr) Bc = new std::vector<int>[K]();    // sampleB reads Bc[k] for k < K (was J)
+    if(SB == nullptr) SB = new std::vector<int>[J]();    // getParameterSample reads SB[j] for j < J (was I)
+    if(SBc == nullptr) SBc = new std::vector<int>[K]();  // sampleB fills SBc[SKcnt], at most K entries (was J)
+    if(offsetarrayB == nullptr) offsetarrayB = new int[J]();
+    if(offsetarrayBc == nullptr) offsetarrayBc = new int[K]();
+    if(SBcindex == nullptr) SBcindex = new int[K]();
+  } catch (const std::bad_alloc &e) {
+    std::cerr << "Error allocating memory for " << e.what() << std::endl;
+    std::exit(1);
+  }
+}
+
+void deinitialize_data() {  // frees all arrays from initialize_data_A/B; pointers are nulled so repeat calls are safe
+  delete[] A; A = nullptr;
+  delete[] Ac; Ac = nullptr;
+  delete[] SA; SA = nullptr;
+  delete[] SAc; SAc = nullptr;
+  delete[] offsetarrayA; offsetarrayA = nullptr;
+  delete[] offsetarrayAc; offsetarrayAc = nullptr;
+  delete[] SAindex; SAindex = nullptr;  // was never freed before
+  delete[] B; B = nullptr;
+  delete[] Bc; Bc = nullptr;
+  delete[] SB; SB = nullptr;
+  delete[] SBc; SBc = nullptr;
+  delete[] offsetarrayB; offsetarrayB = nullptr;
+  delete[] offsetarrayBc; offsetarrayBc = nullptr;
+  delete[] SBcindex; SBcindex = nullptr;  // was never freed before
+}
diff --git a/src/data.h b/src/data.h
index 41ddf21..dc6978e 100644
--- a/src/data.h
+++ b/src/data.h
@@ -12,23 +12,24 @@ static const int MAXN = 3000000;
 extern int N, M, nzA, nzB;
 
 // sparse matrices A, B and their transposes Ac, Bc
-extern std::vector<int> A[MAXN], Ac[MAXN];
-extern std::vector<int> sparchA[MAXN], sparchAi[MAXN];
-extern std::vector<int> B[MAXN], Bc[MAXN];
+extern std::vector<int> *A, *Ac;
+extern std::vector<int> *sparchA, *sparchAi;
+extern std::vector<int> *B, *Bc;
 
 // store the offsets for A, Ac, B, Bc
-extern int offsetarrayA[MAXN], offsetarrayAc[MAXN];
-extern int offsetarrayB[MAXN], offsetarrayBc[MAXN];
+extern int *offsetarrayA, *offsetarrayAc;
+extern int *offsetarrayB, *offsetarrayBc;
 
 // sample matrix
 extern int SI, SK;
-extern std::vector<int> SA[MAXN];
-extern std::vector<int> SAc[MAXN];
-extern std::vector<int> SBc[MAXN];
-extern std::vector<int> SB[MAXN];
-extern int SAindex[MAXN], SBcindex[MAXN];
+extern std::vector<int> *SA, *SAc, *SBc, *SB;
+extern int *SAindex, *SBcindex;
 
 // Read input matrices A and B from files
 void readInputMatrices(const char *fileA, const char *fileB);
 
+void initialize_data_A();
+void initialize_data_B();
+void deinitialize_data();
+
 #endif // DATA_H
diff --git a/src/dynamic.cpp b/src/dynamic.cpp
index edb01ee..4af4afe 100644
--- a/src/dynamic.cpp
+++ b/src/dynamic.cpp
@@ -76,11 +76,23 @@ void updateDynamicTile(int _tj, int t0, int t1) {
 // use _iii/_jjj/_kkk represent the tile after finetune.
 void update_T() {
 
-  int oldiii = iii, oldjjj = jjj, oldkkk = kkk;
-  int oldtti = tti, oldttj = ttj, oldttk = ttk;
-  int iii2 = (iii + 1) / 2, jjj2 = (jjj + 1) / 2, kkk2 = (kkk + 1) / 2;
-  int tti2 = tti * 2, ttj2 = ttj * 2, ttk2 = ttk * 2;
-  int estsum = 0;
+  int oldiii = iii;
+  int oldjjj = jjj;
+  int oldkkk = kkk;
+
+  int oldtti = tti;
+  int oldttj = ttj;
+  int oldttk = ttk;
+
+  // int iii2 = (iii + 1) / 2;
+  int jjj2 = (jjj + 1) / 2;
+  int kkk2 = (kkk + 1) / 2;
+
+  // int tti2 = tti * 2;
+  int ttj2 = ttj * 2;
+  int ttk2 = ttk * 2;
+
+  // int estsum = 0;
 
   // jjj+kkk type 0
   // Tcnt need to clear and recalculate at each round, sizejksum and tilesum not
@@ -158,9 +170,9 @@ void update_T() {
     mintype = 3;
   }
 
-  // only can increase when *2 <= I/J/K, otherwise overflow
+  // only can increase when *2 < I/J/K, otherwise overflow
   // jjj*2 + kkk type4
-  if (oldjjj * 2 <= J) {
+  if (oldjjj * 2 < J) {
     sizejk = (Tcnt[0][0] + Tcnt[0][1] + Tcnt[1][0] + Tcnt[1][1]) * 3 * 2 +
              oldjjj * 2;
     sizejksum[4] += min(sizejk, Bsize);
@@ -179,7 +191,7 @@ void update_T() {
   }
 
   // jjj + kkk*2 type5
-  if (oldkkk * 2 <= K) {
+  if (oldkkk * 2 < K) {
     sizejk =
         (Tcnt[0][0] + Tcnt[0][1] + Tcnt[1][0] + Tcnt[1][1]) * 3 * 2 + oldjjj;
     sizejksum[5] += min(sizejk, Bsize);
@@ -198,7 +210,7 @@ void update_T() {
   }
 
   // jjj*2 + kkk*2 type6
-  if ((oldjjj * 2 <= J) && (oldkkk * 2 <= K)) {
+  if ((oldjjj * 2 < J) && (oldkkk * 2 < K)) {
     sizejk = (Tcnt[0][0] + Tcnt[0][1] + Tcnt[1][0] + Tcnt[1][1]) * 3 * 4 +
              oldjjj * 2;
     sizejksum[6] += min(sizejk, Bsize);
@@ -217,7 +229,7 @@ void update_T() {
   }
 
   // jjj/2 + kkk*2  type7
-  if (oldkkk * 2 <= K) {
+  if (oldkkk * 2 < K) {
     sizejk = (Tcnt[0][0] + Tcnt[0][1]) * 3 * 2 + jjj2;
     sizejksum[7] += min(sizejk, Bsize);
 
@@ -239,7 +251,7 @@ void update_T() {
   }
 
   // jjj*2 + kkk/2 type8
-  if (oldjjj * 2 <= J) {
+  if (oldjjj * 2 < J) {
     sizejk = (Tcnt[0][0] + Tcnt[1][0]) * 3 * 2 + jjj;
     sizejksum[8] += min(sizejk, Bsize);
 
@@ -342,4 +354,4 @@ void update_T() {
   tti = oldtti;
   ttj = oldttj;
   ttk = oldttk;
-}
\ No newline at end of file
+}
diff --git a/src/estimation.cpp b/src/estimation.cpp
index 5a6a796..4296404 100644
--- a/src/estimation.cpp
+++ b/src/estimation.cpp
@@ -1,7 +1,12 @@
 #include "headers.h"
 #include "util.h"
 
-int SIcnt, SKcnt, SAnnz, SBnnz;
+#include <memory>
+
+int SIcnt; // num sampled rows in A
+int SKcnt; // num sampled cols in B
+int SAnnz;
+int SBnnz;
 
 const int pmod = 1000000007; // A large prime number
 
@@ -15,10 +20,10 @@ void initsample() {
 
 void sampleA() {
   // work on the sample
-  for (int i = 0; i <= I; i++) {
+  for (int i = 0; i < I; i++) {
     // Be the sampled row in probability p
     if (sampleP()) {
-      for (int j = 0; j < A[i].size(); j++) {
+      for (std::size_t j = 0; j < A[i].size(); j++) {
         SA[SIcnt].push_back(A[i][j]);
         // directly push back the h1(x)!
         SAc[A[i][j]].push_back(hash1(i, ha1, hb1, pmod));
@@ -32,10 +37,10 @@ void sampleA() {
 
 void sampleB() {
   // work on the sample
-  for (int k = 0; k <= K; k++) {
-    // Be the sampled row in probability p
+  for (int k = 0; k < K; k++) {
+    // Be the sampled col in probability p
     if (sampleP()) {
-      for (int j = 0; j < Bc[k].size(); j++) {
+      for (std::size_t j = 0; j < Bc[k].size(); j++) {
         SBc[SKcnt].push_back(Bc[k][j]);
         SB[Bc[k][j]].push_back(hash2(k, ha2, hb2, pmod));
         SBnnz++;
@@ -104,7 +109,7 @@ double tkans[128];
 
 int OutputKTJ(int lastk, int nowk, double maxh) {
 
-  int ret = 0;
+  // int ret = 0;
 
   double ans = 0;
 
@@ -269,7 +274,7 @@ void getParameterSample() {
 
   auto time0 = std::chrono::high_resolution_clock::now();
 
-  for (int j = 0; j <= J; j++) {
+  for (int j = 0; j < J; j++) {
 
     int tmpsizea = SAc[j].size();
     int tmpsizeb = SB[j].size();
@@ -360,7 +365,7 @@ void getParameterSample() {
     }
   }
 
-  double ansp = combineSF();
+  // double ansp = combineSF();
   // printf("SF!!  %lf %lf\n", ansp, samplek/ansp);
 
   auto time20 = std::chrono::high_resolution_clock::now();
@@ -526,13 +531,21 @@ void getParameterSample() {
   printf("nnzC = %lld, nnzCTk[7] = %lld\n", nnzCTk[0], nnzCTk[7]);
 }
 
-map<int, bool> estC[MAXN];
 
-int startA[MAXN];
-int endA[MAXN];
-int endB[MAXN];
 
 void getParameter() {
+   // These only live in this function
+   std::unique_ptr<map<int, bool>[]> estC;
+   std::unique_ptr<int[]> startA, endA, endB;
+   try {
+     estC = std::make_unique<map<int, bool>[]>(std::max(I, J));
+     startA = std::make_unique<int[]>(std::max(I, J));
+     endA = std::make_unique<int[]>(I);
+     endB = std::make_unique<int[]>(J);
+   } catch (const std::bad_alloc& e) {
+     std::cerr << "Allocation failed: " << e.what() << std::endl;
+     exit(1);
+   }
 
   // get parameters in force
 
@@ -620,9 +633,11 @@ void getParameter() {
   }
 }
 
-double getvarianceBJ(int ttj) { return 1; }
+// Stub: ignores its argument (previously named ttj) and always returns 1.
+double getvarianceBJ(int) { return 1; }
 
+// Stub: ignores its argument (previously named ttk) and always returns 1.
+double getvarianceBK(int) { return 1; }
+double getvarianceBK(int) { return 1; }
 
 long long getnnzC(int jj) { return nnzCTk[getlog(jj)]; }
 
@@ -773,20 +788,20 @@ long long gustest(int estsum) {
 
   esttotal += max(estpostDram / PEcnt, estpostSram / sramBank);
 
-  int estsquareiii, estsquarejjj, estsquarekkk;
-  int estsquaretti, estsquarettj, estsquarettk;
-
-  if (jjj == kkk) {
-    if (esttotal < estsquaremin) {
-      estsquaremin = esttotal;
-      estsquareiii = iii;
-      estsquarejjj = jjj;
-      estsquarekkk = kkk;
-      estsquaretti = tti;
-      estsquarettj = ttj;
-      estsquarettk = ttk;
-    }
-  }
+  // int estsquareiii, estsquarejjj, estsquarekkk;
+  // int estsquaretti, estsquarettj, estsquarettk;
+
+  // if (jjj == kkk) {
+  //   if (esttotal < estsquaremin) {
+  //     estsquaremin = esttotal;
+  //     estsquareiii = iii;
+  //     estsquarejjj = jjj;
+  //     estsquarekkk = kkk;
+  //     estsquaretti = tti;
+  //     estsquarettj = ttj;
+  //     estsquarettk = ttk;
+  //   }
+  // }
 
   if (esttotal < estmin) {
 
@@ -821,8 +836,12 @@ void postEstAdjust() {
 
   memset(tilecnt, 0, sizeof(tilecnt));
 
-  int iii2 = (estiii + 1) / 2, jjj2 = (estjjj + 1) / 2, kkk2 = (estkkk + 1) / 2;
-  int tti2 = esttti * 2, ttj2 = estttj * 2, ttk2 = estttk * 2;
+  // int iii2 = (estiii + 1) / 2;
+  int jjj2 = (estjjj + 1) / 2;
+  int kkk2 = (estkkk + 1) / 2;
+  // int tti2 = esttti * 2;
+  int ttj2 = estttj * 2;
+  int ttk2 = estttk * 2;
 
   for (int k = 0; k < SKcnt; k++) {
     int tmpszb = SBc[k].size();
@@ -838,7 +857,7 @@ void postEstAdjust() {
   // calculate 4 versions
 
   // J+K type0
-  int outtile = 0;
+  // int outtile = 0;
   long long estsum = 0;
   for (int tj = 0; tj < ttj; tj++) {
     for (int tk = 0; tk < ttk; tk++) {
@@ -868,7 +887,7 @@ void postEstAdjust() {
 
   // J/2+K type1
 
-  outtile = 0;
+  // outtile = 0;
   estsum = 0;
   for (int tj = 0; tj < ttj2; tj++) {
     for (int tk = 0; tk < ttk; tk++) {
@@ -898,7 +917,7 @@ void postEstAdjust() {
 
   // J+K/2 type2
 
-  outtile = 0;
+  // outtile = 0;
   estsum = 0;
   for (int tj = 0; tj < ttj; tj++) {
     for (int tk = 0; tk < ttk2; tk++) {
@@ -928,7 +947,7 @@ void postEstAdjust() {
 
   // J/2+K/2  type3
 
-  outtile = 0;
+  // outtile = 0;
   estsum = 0;
   for (int tj = 0; tj < ttj2; tj++) {
     for (int tk = 0; tk < ttk2; tk++) {
diff --git a/src/main.cpp b/src/main.cpp
index 49dbcb7..282fa44 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -4,10 +4,35 @@
 #include "json.hpp"
 #include "simulator.h"
 #include <fstream>
+#include <cstdlib>
 
 using json = nlohmann::json;
 
 int main(int argc, char *argv[]) {
+  // Clean up memory at exit
+  if(std::atexit(deinitialize_data)) {
+    std::cout << "Error registering deinitialize_data in atexit" << std::endl;
+    return 1;
+  }
+  if(std::atexit(deinitialize_simulator)) {
+    std::cout << "Error registering deinitialize_simulator in atexit" << std::endl;
+    return 1;
+  }
+  if(std::atexit(deinitialize_cache)) {
+    std::cout << "Error registering deinitialize_cache in atexit" << std::endl;
+    return 1;
+  }
+
+  if(argc != 4) {
+    std::cerr << "Usage: " << argv[0] << " matrix1 matrix2 config/file/path\n"
+        << "\nSearch locations for matrix1 and matrix2, in order:\n"
+        << "  ./largedata/matrix1/matrix1.mtx\n"
+        << "  ./data/matrix1.mtx\n"
+        << "  ./dense/matrix1.mtx\n"
+        << "  ./bfs/matrix1.mtx\n"
+        << "config_file is a fully qualified path to the .json config for the run (likely config/config.json).\n" << std::endl;
+    return 1;
+  }
 
   std::string matrix_name1 = argv[1];
   std::string matrix_name2 = argv[2];
@@ -57,7 +82,7 @@ int main(int argc, char *argv[]) {
   }
 
   if (!freopen((output_dir + (ISCACHE ? "C" : "_") + printDataFlow[dataflow] +
-                (baselinetest ? "Base_" : "SeaCache_") +
+                (baselinetest ? "Base_" : "570Cache_") +
                 std::to_string(tmpsram) + "MB_" + std::to_string(tmpbandw) +
                 "GBs_" + std::to_string(tmpPE) + "PEs_" +
                 std::to_string(tmpbank) + "sbanks_" + "_" + matrix_name1 + "_" +
@@ -69,7 +94,8 @@ int main(int argc, char *argv[]) {
     return 1;
   }
 
-  const int BUFFERSIZE = 512;
+  // Lines limited to 1024 characters by spec
+  const std::size_t BUFFERSIZE = 1024;
   char readbuffer[BUFFERSIZE];
 
   // read and ignore annotation '%' lines
@@ -94,6 +120,8 @@ int main(int argc, char *argv[]) {
   I = N;
   J = M;
 
+  initialize_data_A();
+
   string input;
 
   fflush(stdout);
@@ -106,6 +134,7 @@ int main(int argc, char *argv[]) {
     std::vector<std::string> tokens;
     std::string token;
 
+    // Splits the file input on whitespace
     while (iss >> token) {
       tokens.push_back(token);
     }
@@ -113,23 +142,29 @@ int main(int argc, char *argv[]) {
     int xx, yy;
     double zz, lala;
 
-    if (tokens.size() == 2) {
+    /* NOTE(ejs): Each mtx line after the header is one matrix entry.
+    xx, yy [zz, lala]
+        - xx  = row index
+        - yy  = col index
+        - zz  = real component of value? (parsed, then ignored)
+        - lala= imag. component of value? (parsed, then ignored)
+    */
+    if (tokens.size() == 2) { // pattern (nonzero values omitted)
 
       std::istringstream(tokens[0]) >> xx;
       std::istringstream(tokens[1]) >> yy;
       // std::cout << "values: " << xx << ", " << yy << std::endl;
-    } else if (tokens.size() == 3) {
+    } else if (tokens.size() == 3) { // real or integer matrix
 
       std::istringstream(tokens[0]) >> xx;
       std::istringstream(tokens[1]) >> yy;
       std::istringstream(tokens[2]) >> zz;
       // std::cout << "values: " << xx << ", " << yy << ", " << zz << std::endl;
-    } else if (tokens.size() == 4) {
-
+    } else if (tokens.size() == 4) { // complex matrix (we only take the real part, unfortunately)
       std::istringstream(tokens[0]) >> xx;
       std::istringstream(tokens[1]) >> yy;
       std::istringstream(tokens[2]) >> zz;
-      std::istringstream(tokens[2]) >> lala;
+      std::istringstream(tokens[3]) >> lala;
       // std::cout << "values: " << xx << ", " << yy << ", " << zz << std::endl;
     } else {
 
@@ -139,29 +174,40 @@ int main(int argc, char *argv[]) {
       return 0;
     }
 
+    // WARNING(ejs): mtx indices are stored 1-based; they are converted here to the 0-based representation used in code
     if (transpose) {
-      Ac[xx].push_back(yy);
-      A[yy].push_back(xx);
+      Ac[xx - 1].push_back(yy - 1);
+      A[yy - 1].push_back(xx - 1);
     } else {
-      A[xx].push_back(yy);
-      Ac[yy].push_back(xx);
+      A[xx - 1].push_back(yy - 1);
+      Ac[yy - 1].push_back(xx - 1);
     }
   }
 
-  for (int i = 0; i <= I; i++) {
+  for (int i = 0; i < I; i++) {
     sort(A[i].begin(), A[i].end());
   }
-  for (int j = 0; j <= J; j++) {
+  for (int j = 0; j < J; j++) {
     sort(Ac[j].begin(), Ac[j].end());
   }
 
   if (condensedOP) {
+    // memory management for sparchA, sparchAi: J value-initialized (empty) vectors each, freed after the repack below
+    sparchA = new std::vector<int>[J]();
+    sparchAi = new std::vector<int>[J]();
+    if(sparchA == nullptr || sparchAi == nullptr) { // NOTE(review): plain new[] throws std::bad_alloc instead of returning nullptr, so this branch cannot fire
+      if(sparchA != nullptr) delete[] sparchA;
+      if(sparchAi != nullptr) delete[] sparchAi;
+      std::cerr << "Error allocating memory for sparchA or sparchAi" << std::endl;
+      std::exit(1);
+    }
+
     // if use the condensed OP dataflow, need to preprocess the A matrix into
     // the condensed format first. first put the data into sparchA[], then put
     // it back to A[], and call gust dataflow
     for (int j = 0; j < J; j++) {
       for (int i = 0; i < I; i++) {
-        if (A[i].size() > j) {
+        if (static_cast<int>(A[i].size()) > j) {
           sparchA[j].push_back(A[i][j]);
           sparchAi[j].push_back(i);
         }
@@ -170,10 +216,13 @@ int main(int argc, char *argv[]) {
 
     for (int j = 0; j < J; j++) {
       A[j].clear();
-      for (int i = 0; i < sparchA[j].size(); i++) {
+      for (int i = 0; i < static_cast<int>(sparchA[j].size()); i++) {
         A[j].push_back(sparchA[j][i]);
       }
     }
+
+    delete[] sparchA;
+    delete[] sparchAi;
   }
 
   long long totalempty = 0;
@@ -183,7 +232,7 @@ int main(int argc, char *argv[]) {
   long long totaltagmatch48 = 0;
   long long totaltagmatch16 = 0;
 
-  for (int i = 1; i <= I + 2; i++) {
+  for (int i = 1; i <= I; i++) { // must reach i == I: row I-1's length is read as offsetarrayA[I] - offsetarrayA[I-1]
     offsetarrayA[i] = offsetarrayA[i - 1] + A[i - 1].size();
     if (A[i - 1].size() < 48) {
       totalempty += (48 - A[i - 1].size());
@@ -200,7 +249,7 @@ int main(int argc, char *argv[]) {
   printf("** ratio tag access 48 %lf\n", I / ((double)I + totaltagmatch48));
   printf("** ratio tag access 16 %lf\n", I / ((double)I + totaltagmatch16));
 
-  for (int i = 1; i <= J + 2; i++) {
+  for (int i = 1; i < J + 2; i++) {
     offsetarrayAc[i] = offsetarrayAc[i - 1] + Ac[i - 1].size();
   }
 
@@ -239,6 +288,9 @@ int main(int argc, char *argv[]) {
   printf("Matrix B: %d x %d, number of non-zeros = %d\n", N, M, nzB);
   fflush(stdout);
 
+  // FIXME(ejs): This is extremely confusing. The code should not automatically toggle transposition of the matrix
+  // if (and only if !!?) it is non-square. The user should be responsible for storing a separate transposed
+  // version of the matrix (or some intermediate helper should perform the transposition).
   if (N != M)
     transpose ^= 1; // when transposeA = 0 -> transposeB = 1; when tranposeA=
                     // 1-> transposeB = 0
@@ -256,6 +308,8 @@ int main(int argc, char *argv[]) {
 
   K = M;
 
+  initialize_data_B();
+
   // std::getline(std::cin, input);
 
   for (int i = 1; i <= nzB; i++) {
@@ -299,31 +353,31 @@ int main(int argc, char *argv[]) {
     }
 
     if (transpose) {
-      Bc[xx].push_back(yy);
-      B[yy].push_back(xx);
+      Bc[xx - 1].push_back(yy - 1);
+      B[yy - 1].push_back(xx - 1);
     } else {
-      B[xx].push_back(yy);
-      Bc[yy].push_back(xx);
+      B[xx - 1].push_back(yy - 1);
+      Bc[yy - 1].push_back(xx - 1);
     }
   }
 
   // cout << N << endl<<M <<endl<< nz << endl << nz/N <<endl;
 
-  for (int j = 0; j <= J; j++) {
+  for (int j = 0; j < J; j++) {
     sort(B[j].begin(), B[j].end());
   }
-  for (int k = 0; k <= K; k++) {
+  for (int k = 0; k < K; k++) {
     sort(Bc[k].begin(), Bc[k].end());
   }
 
-  for (int j = 1; j <= J + 2; j++) {
+  for (int j = 1; j <= J; j++) { // must reach j == J: row J-1's length is read as offsetarrayB[J] - offsetarrayB[J-1]
     int tmplen = B[j - 1].size();
     offsetarrayB[j] = offsetarrayB[j - 1] + tmplen;
 
     // the actual access size
-    tmplen = tmplen * 3;
+    tmplen = tmplen * 3; // NOTE(ejs): the 3 is undocumented; presumably 3 words per nonzero (2 data + 1 coordinate), as in prefetchrow's FLFU sizing -- TODO confirm
 
-    int freqj = (offsetarrayAc[j + 1] - offsetarrayAc[j]);
+    // int freqj = (offsetarrayAc[j + 1] - offsetarrayAc[j]);
   }
   // two problem:
   // 1) this calculate way just calculate the minimum
@@ -334,18 +388,17 @@ int main(int argc, char *argv[]) {
   // move this to above for the weights (1 -> )
   // shortpart += J/(CACHEBLOCKSHORT);
 
-  for (int k = 1; k <= K + 2; k++) {
+  for (int k = 1; k <= K; k++) { // must reach k == K: column K-1's length is read as offsetarrayBc[K] - offsetarrayBc[K-1]
     offsetarrayBc[k] = offsetarrayBc[k - 1] + Bc[k - 1].size();
   }
 
-  if (ISCACHE == 1) {
-
-    SET = cachesize / (CACHEBLOCK * SETASSOC);
-    SETLOG = getlog(SET);
-  }
 
   sampleB();
 
+  if(ISCACHE==1) {
+    setSET();
+  }
+
   /******************Config************************************/
 
   // notation of J and K in the code is swapped as in the paper
@@ -374,22 +427,30 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  long long SmallestTile = ((long long)pbound) * J;
+  // long long SmallestTile = ((long long)pbound) * J;
 
-  int kbound = getkbound();
-  int jbound = getjbound();
-  int ibound = getibound();
+  // int kbound = getkbound();
+  // int jbound = getjbound();
+  // int ibound = getibound();
 
   int usesearchedtile = 1;
-  if (usesearchedtile) {
+  if (usesearchedtile) { // NOTE(ejs): this branch is **useless** (it is always taken)
 
     ISDYNAMICJ = 0;
     ISDYNAMICK = 0;
     ISDYNAMICI = 0;
 
-    freopen((tile_dir + matrix_name1).c_str(), "r", stdin);
     int t_i, t_j, t_k;
-    scanf("%d%d%d", &t_i, &t_j, &t_k);
+
+    if(!freopen((tile_dir + matrix_name1).c_str(), "r", stdin)) {
+      std::cerr << "Error opening " << (tile_dir + matrix_name1) << std::endl;
+      return 1;
+    }
+
+    if(std::scanf("%d%d%d", &t_i, &t_j, &t_k) != 3) {
+      std::cerr << "Error reading " << (tile_dir + matrix_name1) << ", expected three integers." << std::endl;
+      return 1;
+    }
     fclose(stdin);
 
     iii = t_i;
@@ -399,23 +460,39 @@ int main(int argc, char *argv[]) {
     ttj = (J + jjj - 1) / jjj;
     ttk = (K + kkk - 1) / kkk;
 
+    initialize_simulator();
+
     /////////////// Baseline configurations
 
     if (baselinetest) {
+      // EWH
+      // Incorporate SeaCache into baseline
+      puts("***************** SeaCache *******************");
+      printf("nnzB:%d  K:%d  J/TJ:%d  nzlB:%d\n", nzB, K, (J + jjj - 1) / jjj,
+             nzB / (K * ((J + jjj - 1) / jjj)));
+
+      adaptive_prefetch = 1;
+      useVirtualTag = 1;
+      cacheScheme = CACHE_SCHEME_FLFU;
+      cachesize = inputcachesize;
+
+      runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
+
+      adaptive_prefetch = 0;
+      useVirtualTag = 0;
 
       adaptive_prefetch = 0;
 
       ////////////  InnserSP
       // static FLRU + 16 words scheme0
-      puts("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   test InnerSP   "
+      puts("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   test InnerSP   "
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
       prefetchSize = inputcachesize / 6;
-      cacheScheme = 11100;
+      cacheScheme = CACHE_SCHEME_INNER_SP;
       cachesize = inputcachesize;
       CACHEBLOCK = 16;
       CACHEBLOCKLOG = 4;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
 
       fflush(stdout);
@@ -425,37 +502,32 @@ int main(int argc, char *argv[]) {
       puts("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   test Sparch   "
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
       ISCACHE = 1;
-      cacheScheme = 11101;
+      cacheScheme = CACHE_SCHEME_SPARCH;
       prefetchSize = inputcachesize / 6;
       cachesize = inputcachesize - prefetchSize;
       CACHEBLOCK = 144;
       CACHEBLOCKLOG = 8;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
       // calculate metadata overhead.
       // if metadata overflow, choose smaller tile
-      int keepkkk = kkk;
-      int keepttk = ttk;
+      int newkkk = kkk;
+      int newttk = ttk;
       // if can keep, just use current kkk
       if (cachesize > kkk * 2) {
         cachesize -= kkk * 2;
       } else {
         // if can't keep, use smaller kkk
         // (make kkk*2 to be half cachesize)
-        kkk = cachesize / 4;
-        ttk = (K + kkk - 1) / kkk;
-        cachesize -= kkk * 2;
+        newkkk = cachesize / 4;
+        newttk = (K + newkkk - 1) / newkkk; // tile count must follow the shrunken tile, not the original kkk
+        cachesize -= newkkk * 2; // reserve metadata for the tile actually run (half of cachesize)
       }
-      runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
-      // return to the selected tile size.
-      kkk = keepkkk;
-      ttk = keepttk;
+      runTile(0, iii, jjj, newkkk, tti, newttk, ttj, 0);
       // return to the default setting
       CACHEBLOCK = 16;
       CACHEBLOCKLOG = 4;
       cachesize = inputcachesize;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
 
       fflush(stdout);
 
@@ -465,18 +537,17 @@ int main(int argc, char *argv[]) {
       // LRU + 4 words scheme0
       // just same as using scheme0 with cacheline = 4
       ISCACHE = 1;
-      cacheScheme = 0;
+      cacheScheme = CACHE_SCHEME_BASE;
       cachesize = inputcachesize;
       CACHEBLOCK = 4;
       CACHEBLOCKLOG = 2;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
+
       // return to the default setting
       CACHEBLOCK = 16;
       CACHEBLOCKLOG = 4;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
 
       fflush(stdout);
 
@@ -489,27 +560,21 @@ int main(int argc, char *argv[]) {
 
       run();
 
-      return 0;
     }
 
-    bool testseacache = 1;
-    if (testseacache) {
-
-      puts("\n***************** SeaCache *******************");
-
+    if (!baselinetest) {
+      puts("\n!!!!!!!!!!!!!!!!!!!! EECS570 !!!!!!!!!!!!!!!!!!!!");
+            
+      /*****************************************
       adaptive_prefetch = 1;
-
-      printf("nnzB:%d  K:%d  J/TJ:%d  nzlB:%d\n", nzB, K, (J + jjj - 1) / jjj,
-             nzB / (K * ((J + jjj - 1) / jjj)));
-
-      useVirtualTag = 1;
-      cacheScheme = 88;
+      useVirtualTag = 2;
+      cacheScheme;
       cachesize = inputcachesize;
 
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
-
       adaptive_prefetch = 0;
       useVirtualTag = 0;
+      *****************************************/
     }
 
     bool ablationtest = 0;
@@ -523,10 +588,9 @@ int main(int argc, char *argv[]) {
            "!!!!!!!!!!!!!!!!!!!!!!!!");
       puts("CacheScheme 0");
       ISCACHE = 1;
-      cacheScheme = 0;
+      cacheScheme = CACHE_SCHEME_BASE;
       cachesize = inputcachesize;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
 
       puts("\n!!!!!!!!!!!!!!!!!!!!!!!!!! scheme1 (mapping)   "
@@ -534,17 +598,16 @@ int main(int argc, char *argv[]) {
 
       puts("CacheScheme 1");
       ISCACHE = 1;
-      cacheScheme = 1;
+      cacheScheme = CACHE_SCHEME_MAPPING;
       cachesize = inputcachesize;
-      SET = cachesize / (CACHEBLOCK * SETASSOC);
-      SETLOG = getlog(SET);
+      setSET();
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
 
       puts("\n!!!!!!!!!!!!!!!!!!!!!!!!!! scheme88 without virtue   "
            "!!!!!!!!!!!!!!!!!!!!!!!!");
 
       useVirtualTag = 0;
-      cacheScheme = 88;
+      cacheScheme = CACHE_SCHEME_FLFU;
       cachesize = inputcachesize;
       prefetchSize = cachesize / 6;
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
@@ -554,7 +617,7 @@ int main(int argc, char *argv[]) {
 
       puts("CacheScheme 88 practical FLFU  with virtual tag 1/6");
       useVirtualTag = 1;
-      cacheScheme = 88;
+      cacheScheme = CACHE_SCHEME_FLFU;
       cachesize = inputcachesize;
       prefetchSize = cachesize / 6;
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
@@ -562,7 +625,7 @@ int main(int argc, char *argv[]) {
 
       puts("CacheScheme 88 practical FLFU  with virtual tag 1/16");
       useVirtualTag = 1;
-      cacheScheme = 88;
+      cacheScheme = CACHE_SCHEME_FLFU;
       cachesize = inputcachesize;
       prefetchSize = cachesize / 16;
       runTile(0, iii, jjj, kkk, tti, ttk, ttj, 0);
diff --git a/src/parameters.cpp b/src/parameters.cpp
index da07757..68335e3 100644
--- a/src/parameters.cpp
+++ b/src/parameters.cpp
@@ -1,11 +1,13 @@
 #include "parameters.h"
 
 // smaple parameter p
-double samplep = 0.01;
+double samplep = 0.01; // sampling probability
 // sample parameter k
 double samplek = 100;
 
-int I, J, K;
+int I; // num rows in A
+int J; // num rows in A, equiv. num cols in B
+int K; // num cols in B
 
 int tti = 1, ttk = 1, ttj = 1;
 // block size
@@ -17,10 +19,10 @@ InterOrder interorder;
 DataFlow dataflow;
 Format format;
 
-string printInterOrder[10] = {"IJK", "IKJ", "JKI", "JIK", "KIJ", "KJI"};
-string printDataFlow[10] = {"Inner", "Outer", "Gust"};
-string printFormat[10] = {"RR", "RC", "CR", "CC", "BB"};
+string printInterOrder[] = {"IJK", "IKJ", "JKI", "JIK", "KIJ", "KJI"};
+string printDataFlow[] = {"Inner", "Outer", "Gust"};
+string printFormat[] = {"RR", "RC", "CR", "CC", "BB"};
 
 int Asize;
 int Bsize;
-int Csize;
\ No newline at end of file
+int Csize;
diff --git a/src/simulator.cpp b/src/simulator.cpp
index 5340e33..baaff96 100644
--- a/src/simulator.cpp
+++ b/src/simulator.cpp
@@ -5,20 +5,20 @@
 #include "statistics.h"
 
 // store all the buffered C now
-set<int> bufferedC[MAXN];
+set<int> *bufferedC = nullptr;
 // record length of buffered C
 // equals bufferedC[i].size()
-int bufferedClen[MAXN];
+int *bufferedClen = nullptr;
 
 int BLOCKSIZE = 16;
 
-int beginA[MAXN];
-int beginB[MAXN];
+int *beginA = nullptr;
+int *beginB = nullptr;
 
-int beginAc[MAXN];
-int beginBc[MAXN];
+int *beginAc = nullptr;
+int *beginBc = nullptr;
 
-int begin[MAXN];
+// int *begin = nullptr;
 
 /*
 The current fiber size of each array
@@ -28,11 +28,11 @@ stored according to the dataflow order
 update currsize each time the block of the array changes
 (when inter-iterate)
 */
-int currsizeA[MAXN];
-int currsizeAc[MAXN];
-int currsizeB[MAXN];
-int currsizeBc[MAXN];
-int currsizeC[MAXN];
+int *currsizeA = nullptr;
+int *currsizeAc = nullptr;
+int *currsizeB = nullptr;
+int *currsizeBc = nullptr;
+// int *currsizeC = nullptr;
 
 /*
 The currently buffered size of each array
@@ -40,11 +40,11 @@ The currently buffered size of each array
 
 update bufferedsize each time
 */
-int bufferedsizeA[MAXN];
-int bufferedsizeB[MAXN];
-int bufferedsizeC[MAXN];
+int *bufferedsizeA = nullptr;
+int *bufferedsizeB = nullptr;
+// int *bufferedsizeC = nullptr;
 
-int tmpC[MAXN];
+int *tmpC = nullptr;
 
 // start of current block
 int TI, TJ, TK;
@@ -114,8 +114,8 @@ void updateBlockA() {
       if (ti > I)
         break;
 
-      int startj = beginA[ti], tmpj = beginA[ti],
-          maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
+      int startj = beginA[ti], tmpj = beginA[ti];
+      int maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
 
       // jjj -> ((ISDYNAMICJ)?dynj:jjj)
 
@@ -139,8 +139,8 @@ void updateBlockA() {
       if (tj > J)
         break;
 
-      int starti = beginAc[tj], tmpi = beginAc[tj],
-          maxi = offsetarrayAc[tj + 1] = offsetarrayAc[tj];
+      int starti = beginAc[tj], tmpi = beginAc[tj];
+      int maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj]; // fiber length of column tj; was '=' which overwrote the offset table
 
       while (tmpi < maxi && Ac[tj][tmpi] < iii + TI) {
         tmpi++;
@@ -199,7 +199,9 @@ void updateBlockC() {
 void forcebeginA() {
   for (int i = 0; i < I; i++) {
 
-    int startj = 0, tmpj = 0, maxj = offsetarrayA[i + 1] - offsetarrayA[i];
+    // int startj = 0;
+    int tmpj = 0;
+    int maxj = offsetarrayA[i + 1] - offsetarrayA[i];
 
     // here is TJ because TJ have added jjj before call the func
     while (tmpj < maxj && A[i][tmpj] < TJ) {
@@ -210,7 +212,9 @@ void forcebeginA() {
   }
 
   for (int tj = 0; tj < J; tj++) {
-    int starti = 0, tmpi = 0, maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
+    // int starti = 0;
+    int tmpi = 0;
+    int maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
 
     while (tmpi < maxi && Ac[tj][tmpi] < TI) {
       tmpi++;
@@ -223,7 +227,9 @@ void forcebeginB() {
 
   for (int tj = 0; tj < J; tj++) {
 
-    int startk = 0, tmpk = 0, maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
+    // int startk = 0;
+    int tmpk = 0;
+    int maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
 
     while (tmpk < maxk && B[tj][tmpk] < TK) {
       tmpk++;
@@ -234,7 +240,9 @@ void forcebeginB() {
 
   for (int tk = 0; tk < K; tk++) {
 
-    int startj = 0, tmpj = 0, maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
+    // int startj = 0;
+    int tmpj = 0;
+    int maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
 
     while (tmpj < maxj && Bc[tk][tmpj] < TJ) {
       tmpj++;
@@ -250,8 +258,9 @@ void updateBeginA() {
     if (ti > I)
       break;
 
-    int startj = beginA[ti], tmpj = beginA[ti],
-        maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
+    // int startj = beginA[ti];
+    int tmpj = beginA[ti];
+    int maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
 
     // here is TJ because TJ have added jjj before call the func
     while (tmpj < maxj && A[ti][tmpj] < TJ) {
@@ -267,8 +276,9 @@ void ALLupdateBeginAc() {
     if (tj > J)
       break;
 
-    int starti = beginAc[tj], tmpi = beginAc[tj],
-        maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
+    // int starti = beginAc[tj];
+    int tmpi = beginAc[tj];
+    int maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
 
     while (tmpi < maxi && Ac[tj][tmpi] < TI) {
       tmpi++;
@@ -283,8 +293,9 @@ void AllupdateBeginA() {
     if (ti > I)
       break;
 
-    int startj = beginA[ti], tmpj = beginA[ti],
-        maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
+    // int startj = beginA[ti];
+    int tmpj = beginA[ti];
+    int maxj = offsetarrayA[ti + 1] - offsetarrayA[ti];
     while (tmpj < maxj && A[ti][tmpj] < TJ) {
       tmpj++;
     }
@@ -300,8 +311,9 @@ void updateBeginAc() {
     if (tj > J)
       break;
 
-    int starti = beginAc[tj], tmpi = beginAc[tj],
-        maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
+    // int starti = beginAc[tj];
+    int tmpi = beginAc[tj];
+    int maxi = offsetarrayAc[tj + 1] - offsetarrayAc[tj];
 
     while (tmpi < maxi && Ac[tj][tmpi] < TI) {
       tmpi++;
@@ -318,8 +330,9 @@ void AllupdateBeginB() {
     if (tj > J)
       break;
 
-    int startk = beginB[tj], tmpk = beginB[tj],
-        maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
+    // int startk = beginB[tj];
+    int tmpk = beginB[tj];
+    int maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
 
     while (tmpk < maxk && B[tj][tmpk] < TK) {
       tmpk++;
@@ -336,8 +349,9 @@ void AllupdateBeginBc() {
     if (tk > K)
       break;
 
-    int startj = beginBc[tk], tmpj = beginBc[tk],
-        maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
+    // int startj = beginBc[tk]; 
+    int tmpj = beginBc[tk];
+    int maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
 
     while (tmpj < maxj && Bc[tk][tmpj] < TJ) {
       tmpj++;
@@ -355,8 +369,9 @@ void updateBeginB() {
     if (tj > J)
       break;
 
-    int startk = beginB[tj], tmpk = beginB[tj],
-        maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
+    // int startk = beginB[tj];
+    int tmpk = beginB[tj];
+    int maxk = offsetarrayB[tj + 1] - offsetarrayB[tj];
 
     while (tmpk < maxk && B[tj][tmpk] < TK) {
       tmpk++;
@@ -373,8 +388,9 @@ void updateBeginBc() {
     if (tk > K)
       break;
 
-    int startj = beginBc[tk], tmpj = beginBc[tk],
-        maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
+    // int startj = beginBc[tk];
+    int tmpj = beginBc[tk];
+    int maxj = offsetarrayBc[tk + 1] - offsetarrayBc[tk];
 
     while (tmpj < maxj && Bc[tk][tmpj] < TJ) {
       tmpj++;
@@ -1086,7 +1102,8 @@ void pre_load_B() {
 
       // equals to 0(when jjj in the first half) or 1(when jjj in the second
       // half);
-      int _TJ, _TK;
+      int _TJ;
+      // int _TK;
 
       for (tj = TJ; tj < TJ + jjj; tj++) {
         if (tj > J)
@@ -1663,7 +1680,7 @@ void get_A_fiber_col(int jj) {
       computeSramAccess += sramReadBandwidth(currsizeAc[jj] * 3 + 2) +
                            sramWriteBandwidth(currsizeAc[jj] * 3 + 2);
 
-      if (cacheScheme == 11100) {
+      if (cacheScheme == CACHE_SCHEME_INNER_SP) {
         // double A access in static FLRU scheme
         computeDramAccess += memoryBandwidthPE(currsizeAc[jj] * 3 + 2);
         computeA += memoryBandwidthPE(currsizeAc[jj] * 3 + 2);
@@ -1699,7 +1716,7 @@ void get_A_fiber(int ii) {
       // hit
       computeSramAccess += sramReadBandwidth(currsizeA[ii] * 3 + 2);
 
-      if (cacheScheme == 11100) {
+      if (cacheScheme == CACHE_SCHEME_INNER_SP) {
         // double A access in static FLRU scheme
         computeSramAccess += sramReadBandwidth(currsizeA[ii] * 3 + 2);
       }
@@ -1711,7 +1728,7 @@ void get_A_fiber(int ii) {
       computeSramAccess += sramReadBandwidth(currsizeA[ii] * 3 + 2) +
                            sramWriteBandwidth(currsizeA[ii] * 3 + 2);
 
-      if (cacheScheme == 11100) {
+      if (cacheScheme == CACHE_SCHEME_INNER_SP) {
         // double A access in static FLRU scheme
         computeDramAccess += memoryBandwidthPE(currsizeA[ii] * 3 + 2);
         computeA += memoryBandwidthPE(currsizeA[ii] * 3 + 2);
@@ -1755,7 +1772,7 @@ void updateCAccess(int ii) {
     // check the delta buffer: how many new C elements (indicate how many
     // increase)
     int deltaC = 0;
-    int oldsize = bufferedClen[ii];
+    // int oldsize = bufferedClen[ii];
     for (int k1 = TK; k1 < TK + ((ISDYNAMICK) ? dynk : kkk); k1++) {
       if (tmpC[k1]) {
         // the k1 is a new element
@@ -1863,7 +1880,7 @@ void get_B_fibers(int ii) {
         if (fulltagA == 0 || ii < fullA) {
           // hit
           computeSramAccess += sramReadBandwidth((tmpj - beginA[ii]) * 3);
-          if (cacheScheme == 11100) {
+          if (cacheScheme == CACHE_SCHEME_INNER_SP) {
             computeSramAccess += sramReadBandwidth((tmpj - beginA[ii]) * 3);
           }
         } else {
@@ -1874,7 +1891,7 @@ void get_B_fibers(int ii) {
           computeSramAccess += sramReadBandwidth((tmpj - beginA[ii]) * 3) +
                                sramWriteBandwidth((tmpj - beginA[ii]) * 3);
 
-          if (cacheScheme == 11100) {
+          if (cacheScheme == CACHE_SCHEME_INNER_SP) {
             computeDramAccess += memoryBandwidthPE((tmpj - beginA[ii]) * 3);
             computeA += memoryBandwidthPE((tmpj - beginA[ii]) * 3);
             computeSramAccess += sramReadBandwidth((tmpj - beginA[ii]) * 3) +
@@ -1889,7 +1906,7 @@ void get_B_fibers(int ii) {
         computeSramAccess += sramReadBandwidth((tmpj - beginA[ii]) * 3) +
                              sramWriteBandwidth((tmpj - beginA[ii]) * 3);
 
-        if (cacheScheme == 11100) {
+        if (cacheScheme == CACHE_SCHEME_INNER_SP) {
           computeDramAccess += memoryBandwidthPE((tmpj - beginA[ii]) * 3);
           computeA += memoryBandwidthPE((tmpj - beginA[ii]) * 3);
 
@@ -1929,13 +1946,13 @@ int prefetchRowNow = 0;
 
 bool prefetchrow(int ii) {
 
-  int needsize;
+  int needsize = 0;
   // FLRU mode; need 2data+1coord+1next pointer (*4)
   if (cacheScheme == 6 || cacheScheme == 7) {
     needsize = currsizeA[ii] * 4 + 1;
   }
   // FLFU mode; don't need next pointer (*3)
-  if (cacheScheme == 66 || cacheScheme == 88) {
+  else if (cacheScheme == 66 || cacheScheme == CACHE_SCHEME_FLFU) {
     needsize = currsizeA[ii] * 3;
   }
 
@@ -1954,8 +1971,8 @@ bool prefetchrow(int ii) {
     // in this prefetch: push the next access queue of jj a ii
     int jj = A[ii][tmpj];
 
-    if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 11100 ||
-        cacheScheme == 11101) {
+    if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == CACHE_SCHEME_INNER_SP ||
+        cacheScheme == CACHE_SCHEME_SPARCH) {
       nextposvector[jj].push(-ii);
     }
     if (cacheScheme == 66) {
@@ -1963,7 +1980,7 @@ bool prefetchrow(int ii) {
     }
 
     // practical flfu. update in the flubit
-    if (cacheScheme == 88) {
+    if (cacheScheme == CACHE_SCHEME_FLFU) {
 
       long long firstaddr = getCacheAddr(jj, 0);
       int fibersize = currsizeB[jj] * 3;
@@ -1979,24 +1996,24 @@ bool prefetchrow(int ii) {
         prefetch_increments++;
 
         for (int i = 0; i < SETASSOC; i++) {
-          if (Valid[_set][i] && (Tag[_set][i] == _tag)) {
+          if (Valid[_set * SETASSOC + i] && (Tag[_set * SETASSOC + i] == _tag)) {
 
             // not the first, need to check orig
             if (tmpcurr != 0) {
-              if (PosOrig[_set][i] != getOrig(firstaddr)) {
+              if (PosOrig[_set * SETASSOC + i] != getOrig(firstaddr)) {
                 // not the same orig
                 continue;
               }
             } else {
-              if (PosOrig[_set][i] != 0) {
+              if (PosOrig[_set * SETASSOC + i] != 0) {
                 continue;
               }
             }
             // hit
             incache = 1;
-            lfubit[_set][i]++;
+            lfubit[_set * SETASSOC + i]++;
             // if the updated flfu bit overflow
-            if (lfubit[_set][i] > LFUmax) {
+            if (lfubit[_set * SETASSOC + i] > LFUmax) {
               needhalf = 1;
             }
             break;
@@ -2008,12 +2025,12 @@ bool prefetchrow(int ii) {
           if (!incache) {
             bool invirtualtag = 0;
             for (int i = 0; i < VIRTUALSETASSOC; i++) {
-              if (virtualValid[_set][i]) {
-                if (virtualTag[_set][i] == _tag) {
+              if (virtualValid[_set * VIRTUALSETASSOC + i]) {
+                if (virtualTag[_set * VIRTUALSETASSOC + i] == _tag) {
                   // in virtual
                   invirtualtag = 1;
-                  virtuallfubit[_set][i]++;
-                  if (virtuallfubit[_set][i] > LFUmax) {
+                  virtuallfubit[_set * VIRTUALSETASSOC + i]++;
+                  if (virtuallfubit[_set * VIRTUALSETASSOC + i] > LFUmax) {
                     needhalf = 1;
                   }
                   // if find a matched, don't need to check others
@@ -2027,13 +2044,13 @@ bool prefetchrow(int ii) {
 
               bool hasinvalid = 0;
               for (int i = 0; i < VIRTUALSETASSOC; i++) {
-                if (virtualValid[_set][i] == 0) {
+                if (virtualValid[_set * VIRTUALSETASSOC + i] == 0) {
                   // has invalide!
                   hasinvalid = 1;
                   // put the slot here
-                  virtualValid[_set][i] = 1;
-                  virtualTag[_set][i] = _tag;
-                  virtuallfubit[_set][i] = 1;
+                  virtualValid[_set * VIRTUALSETASSOC + i] = 1;
+                  virtualTag[_set * VIRTUALSETASSOC + i] = _tag;
+                  virtuallfubit[_set * VIRTUALSETASSOC + i] = 1;
                   break;
                 }
               }
@@ -2041,12 +2058,12 @@ bool prefetchrow(int ii) {
               if (!hasinvalid) {
 
                 for (int i = 0; i < VIRTUALSETASSOC; i++) {
-                  if (virtuallfubit[_set][i] == 0) {
+                  if (virtuallfubit[_set * VIRTUALSETASSOC + i] == 0) {
                     // find a slot = 0, replace it to the current fiber
                     haszero = 1;
-                    virtualValid[_set][i] = 1;
-                    virtualTag[_set][i] = _tag;
-                    virtuallfubit[_set][i] = 1;
+                    virtualValid[_set * VIRTUALSETASSOC + i] = 1;
+                    virtualTag[_set * VIRTUALSETASSOC + i] = _tag;
+                    virtuallfubit[_set * VIRTUALSETASSOC + i] = 1;
 
                     break;
                   }
@@ -2064,8 +2081,8 @@ bool prefetchrow(int ii) {
         // both update in cache or virtual tag will cause the half
         if (needhalf) {
           for (int i = 0; i < SETASSOC; i++) {
-            if (Valid[_set][i]) {
-              lfubit[_set][i] /= 2;
+            if (Valid[_set * SETASSOC + i]) {
+              lfubit[_set * SETASSOC + i] /= 2;
             }
           }
 
@@ -2074,8 +2091,8 @@ bool prefetchrow(int ii) {
           // but is 0 now, then will be replace, but actually better
           if (useVirtualTag) {
             for (int i = 0; i < VIRTUALSETASSOC; i++) {
-              if (virtualValid[_set][i]) {
-                virtuallfubit[_set][i] /= 2;
+              if (virtualValid[_set * VIRTUALSETASSOC + i]) {
+                virtuallfubit[_set * VIRTUALSETASSOC + i] /= 2;
               }
             }
           }
@@ -2093,15 +2110,16 @@ bool prefetchrow(int ii) {
   return 1;
 }
 
-void initialize_adaptive_prefetch(long long nnzA, long long nnzB, int K, int J,
-                                  int T_J) {
+// void initialize_adaptive_prefetch(long long nnzA, long long nnzB, int K, int J,
+//                                   int T_J) {
+void initialize_adaptive_prefetch(long long, long long, int, int, int) {
   // --- Offline Phase ---
-  double avg_nonzero_length_B;
-  if (K > 0 && T_J > 0) {
-    avg_nonzero_length_B = static_cast<double>(nnzB) / K;
-  } else {
-    avg_nonzero_length_B = 1.0;
-  }
+  // double avg_nonzero_length_B;
+  // if (K > 0 && T_J > 0) {
+  //   avg_nonzero_length_B = static_cast<double>(nnzB) / K;
+  // } else {
+  //   avg_nonzero_length_B = 1.0;
+  // }
 
   current_prefetch_size = 1.0 / 128.0;
 
@@ -2169,8 +2187,7 @@ void update_prefetch_size() {
     } else {
       current_discard_rate = ((double)prefetch_discards) / prefetch_increments;
     }
-    double current_no_counter_miss_rate =
-        ((double)data_access_misses) / data_access_total;
+    // double current_no_counter_miss_rate = static_cast<double>(data_access_misses) / data_access_total;
     // printf("Discard Rate: %lf, %d %d \n", current_discard_rate,
     //        prefetch_discards, prefetch_increments);
 
@@ -2266,8 +2283,7 @@ void update_prefetch_size() {
   } else {
     current_discard_rate = ((double)prefetch_discards) / prefetch_increments;
   }
-  double current_no_counter_miss_rate =
-      ((double)data_access_misses) / data_access_total;
+  // double current_no_counter_miss_rate = static_cast<double>(data_access_misses) / data_access_total;
   // printf("Discard Rate: %lf, %d %d \n", current_discard_rate,
   // prefetch_discards,
   //       prefetch_increments);
@@ -2335,10 +2351,10 @@ void calculate() {
 
       // all prefetch scheme
       if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 66 ||
-          cacheScheme == 88 || cacheScheme == 11100 || cacheScheme == 11101) {
+          cacheScheme == CACHE_SCHEME_FLFU || cacheScheme == CACHE_SCHEME_INNER_SP || cacheScheme == CACHE_SCHEME_SPARCH) {
         // reinitialize the next pointer for FLRU
-        if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 11100 ||
-            cacheScheme == 11101) {
+        if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == CACHE_SCHEME_INNER_SP ||
+            cacheScheme == CACHE_SCHEME_SPARCH) {
           for (int j1 = TJ; j1 < TJ + jjj; j1++) {
             if (j1 > J)
               break;
@@ -2378,15 +2394,15 @@ void calculate() {
         // update the prefetch window after each row
         // don't need to update prefetch window in static flru
         if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 66 ||
-            cacheScheme == 88 || cacheScheme == 11100 || cacheScheme == 11101) {
+            cacheScheme == CACHE_SCHEME_FLFU || cacheScheme == CACHE_SCHEME_INNER_SP || cacheScheme == CACHE_SCHEME_SPARCH) {
 
           // first minus this row's overhead
           int needsize = 0;
-          if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 11101) {
+          if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == CACHE_SCHEME_SPARCH) {
             needsize = currsizeA[TI + ii] * 4 + 1;
           }
           // FLFU mode; don't need next pointer (*3)
-          if (cacheScheme == 66 || cacheScheme == 88) {
+          if (cacheScheme == 66 || cacheScheme == CACHE_SCHEME_FLFU) {
             needsize = currsizeA[TI + ii] * 3;
           }
 
@@ -2424,7 +2440,8 @@ void calculate() {
 
       for (int ii = 0; ii < iii; ii++) {
 
-        int cnew = 0, cnow = 0;
+        // int cnew = 0;
+  int cnow = 0;
 
         // update A
         // get A
@@ -2613,8 +2630,65 @@ void configPartial(float partialA, float partialB, float partialC) {
   }
 }
 
-void reinitialize() {
+void initialize_simulator() {
+  // Lazily allocate the simulator's zero-initialised work arrays; any
+  // pointer that is already non-null is left as it is.
+  try {
+    if (!bufferedC) { bufferedC = new set<int>[I](); }
+    if (!bufferedClen) { bufferedClen = new int[I](); }
+    if (!beginA) { beginA = new int[I](); }
+    if (!beginB) { beginB = new int[J](); }
+
+    if (!beginAc) { beginAc = new int[J](); }
+    if (!beginBc) { beginBc = new int[K](); }
+
+    // `begin`, `currsizeC` and `bufferedsizeC` are not allocated here.
+
+    if (!currsizeA) { currsizeA = new int[I](); }
+    if (!currsizeAc) { currsizeAc = new int[J](); }
+    if (!currsizeB) { currsizeB = new int[J](); }
+    // NOTE(review): beginBc above has K entries but currsizeBc has J — confirm.
+    if (!currsizeBc) { currsizeBc = new int[J](); }
+
+    if (!bufferedsizeA) { bufferedsizeA = new int[I](); }
+    if (!bufferedsizeB) { bufferedsizeB = new int[J](); }
+
+    if (!tmpC) { tmpC = new int[K](); }
 
+    if (!LFUtag) { LFUtag = new int[J](); }
+    if (!nextposvector) { nextposvector = new queue<int>[J](); }
+  } catch (const std::bad_alloc &alloc_error) {
+    // Allocation failure is fatal: report it and terminate with status 1.
+    std::cerr << "Error allocating memory for " << alloc_error.what() << std::endl;
+    std::exit(1);
+  }
+}
+
+void deinitialize_simulator() {
+  // Frees every array allocated by initialize_simulator() and resets each
+  // pointer to nullptr, so a repeated call is a no-op and a later
+  // initialize_simulator() reallocates instead of reusing freed memory.
+  // delete[] on a null pointer is a no-op, so no null checks are needed.
+  delete[] bufferedC;      bufferedC = nullptr;
+  delete[] bufferedClen;   bufferedClen = nullptr;
+  delete[] beginA;         beginA = nullptr;
+  delete[] beginB;         beginB = nullptr;
+  delete[] beginAc;        beginAc = nullptr;
+  delete[] beginBc;        beginBc = nullptr;
+  delete[] currsizeA;      currsizeA = nullptr;
+  delete[] currsizeAc;     currsizeAc = nullptr;
+  delete[] currsizeB;      currsizeB = nullptr;
+  delete[] currsizeBc;     currsizeBc = nullptr;
+  delete[] bufferedsizeA;  bufferedsizeA = nullptr;
+  delete[] bufferedsizeB;  bufferedsizeB = nullptr;
+  delete[] tmpC;           tmpC = nullptr;
+  // LFUtag and nextposvector are allocated in initialize_simulator() too;
+  // NOTE(review): confirm no other code path frees them.
+  delete[] LFUtag;         LFUtag = nullptr;
+  delete[] nextposvector;  nextposvector = nullptr;
+}
+
+void reinitialize() {
   // reinitialize statistics
   totalCycle = 0;
   preCycle = calCycle = postCycle = 0;
@@ -2636,11 +2710,11 @@ void reinitialize() {
     initializeCacheValid();
 
     if (useVirtualTag) {
-      memset(virtualValid, 0, sizeof(virtualValid));
+      memset(virtualValid, 0, sizeof(bool) * SET * VIRTUALSETASSOC);
     }
 
-    memset(PosOrig, 0, sizeof(PosOrig));
-    memset(vPosOrig, 0, sizeof(vPosOrig));
+    memset(PosOrig, 0, sizeof(short) * SET * SETASSOC);
+    memset(vPosOrig, 0, sizeof(short) * SET * SETASSOC);
   }
 
   // reinitialize buffer c
@@ -2808,7 +2882,7 @@ void run() {
   analyze_statistics();
 }
 
-void runTile(bool isest, int iii, int jjj, int kkk, long long tti,
+void runTile(bool isest, int /* iii */, int jjj, int kkk, long long tti,
              long long ttj, long long ttk, long long SmallestTile) {
 
   // only prunning in the estimation mode
@@ -2822,8 +2896,8 @@ void runTile(bool isest, int iii, int jjj, int kkk, long long tti,
     prefetchNow = 0;
     prefetchRowNow = 0;
 
-    if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == 11100 ||
-        cacheScheme == 11101) {
+    if (cacheScheme == 6 || cacheScheme == 7 || cacheScheme == CACHE_SCHEME_INNER_SP ||
+        cacheScheme == CACHE_SCHEME_SPARCH) {
       for (int j = 0; j < J; j++) {
         while (!nextposvector[j].empty()) {
           nextposvector[j].pop();
@@ -2836,7 +2910,7 @@ void runTile(bool isest, int iii, int jjj, int kkk, long long tti,
       }
     }
 
-    // prunning
+    // pruning
     if (((long long)jjj * kkk) * 4 < SmallestTile) {
       return;
     }
@@ -2873,7 +2947,7 @@ void runTile(bool isest, int iii, int jjj, int kkk, long long tti,
     setSET();
   }
 
-  if (ISCACHE && (cacheScheme == 88)) {
+  if (ISCACHE && (cacheScheme == CACHE_SCHEME_FLFU)) {
 
     cachesize = inputcachesize - prefetchSize;
 
diff --git a/src/simulator.h b/src/simulator.h
index 2a43af6..5b236a0 100644
--- a/src/simulator.h
+++ b/src/simulator.h
@@ -22,10 +22,13 @@ int getkbound();
 int getjbound();
 int getibound();
 
-extern int currsizeB[MAXN];
-extern int currsizeBc[MAXN];
-extern int beginB[MAXN];
+extern int *currsizeB;
+extern int *currsizeBc;
+extern int *beginB;
 
 extern int TI, TJ, TK;
 
+void initialize_simulator();
+void deinitialize_simulator();
+
 #endif